├── .gitignore ├── LICENSE ├── README.md ├── centroid.go ├── centroid_test.go ├── go.mod ├── go.sum ├── tdigest.go ├── tdigest_test.go └── test ├── README.md ├── gen └── main.go ├── main.cpp ├── main.go ├── tdigest.h ├── test.sh └── validate └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | /test/*.dat* 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 InfluxData Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tdigest 2 | 3 | This is an implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest/) in Go. 4 | 5 | The implementation is based off [Derrick Burns' C++ implementation](https://github.com/derrickburns/tdigest). 
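In addition to the basic usage shown in the example below, digests that are built independently (for example, one per shard or per time window) can be combined with `Merge` before querying; a minimal sketch:

```go
package main

import (
	"log"

	"github.com/influxdata/tdigest"
)

func main() {
	// Build two digests independently, e.g. one per shard.
	a := tdigest.NewWithCompression(1000)
	b := tdigest.NewWithCompression(1000)
	for i := 0; i < 1000; i++ {
		a.Add(float64(i), 1)
		b.Add(float64(i)+1000, 1)
	}

	// Fold b into a, then query the combined distribution.
	a.Merge(b)
	log.Println("combined 50th:", a.Quantile(0.5))
}
```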
6 | 7 | ## Example 8 | 9 | ```go 10 | package main 11 | 12 | import ( 13 | "log" 14 | 15 | "github.com/influxdata/tdigest" 16 | ) 17 | 18 | func main() { 19 | td := tdigest.NewWithCompression(1000) 20 | for _, x := range []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} { 21 | td.Add(x, 1) 22 | } 23 | 24 | // Compute Quantiles 25 | log.Println("50th", td.Quantile(0.5)) 26 | log.Println("75th", td.Quantile(0.75)) 27 | log.Println("90th", td.Quantile(0.9)) 28 | log.Println("99th", td.Quantile(0.99)) 29 | 30 | // Compute CDFs 31 | log.Println("CDF(1) = ", td.CDF(1)) 32 | log.Println("CDF(2) = ", td.CDF(2)) 33 | log.Println("CDF(3) = ", td.CDF(3)) 34 | log.Println("CDF(4) = ", td.CDF(4)) 35 | log.Println("CDF(5) = ", td.CDF(5)) 36 | } 37 | ``` 38 | -------------------------------------------------------------------------------- /centroid.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | ) 7 | 8 | // ErrWeightLessThanZero is used when the weight is not able to be processed. 9 | const ErrWeightLessThanZero = Error("centroid weight cannot be less than zero") 10 | 11 | // Error is a domain error encountered while processing tdigests 12 | type Error string 13 | 14 | func (e Error) Error() string { 15 | return string(e) 16 | } 17 | 18 | // Centroid average position of all points in a shape 19 | type Centroid struct { 20 | Mean float64 21 | Weight float64 22 | } 23 | 24 | func (c *Centroid) String() string { 25 | return fmt.Sprintf("{mean: %f weight: %f}", c.Mean, c.Weight) 26 | } 27 | 28 | // Add averages the two centroids together and update this centroid 29 | func (c *Centroid) Add(r Centroid) error { 30 | if r.Weight < 0 { 31 | return ErrWeightLessThanZero 32 | } 33 | if c.Weight != 0 { 34 | c.Weight += r.Weight 35 | c.Mean += r.Weight * (r.Mean - c.Mean) / c.Weight 36 | } else { 37 | c.Weight = r.Weight 38 | c.Mean = r.Mean 39 | } 40 | return nil 41 | } 42 | 43 | // CentroidList is sorted by the Mean of the centroid, ascending. 44 | type CentroidList []Centroid 45 | 46 | // Clear clears the list. 
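// The backing array is retained, so the list can be reused without reallocating.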
47 | func (l *CentroidList) Clear() { 48 | *l = (*l)[:0] 49 | } 50 | 51 | func (l CentroidList) Len() int { return len(l) } 52 | func (l CentroidList) Less(i, j int) bool { return l[i].Mean < l[j].Mean } 53 | func (l CentroidList) Swap(i, j int) { l[i], l[j] = l[j], l[i] } 54 | 55 | // NewCentroidList creates a priority queue for the centroids 56 | func NewCentroidList(centroids []Centroid) CentroidList { 57 | l := CentroidList(centroids) 58 | sort.Sort(l) 59 | return l 60 | } 61 | -------------------------------------------------------------------------------- /centroid_test.go: -------------------------------------------------------------------------------- 1 | package tdigest_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/google/go-cmp/cmp" 7 | "github.com/influxdata/tdigest" 8 | ) 9 | 10 | func TestCentroid_Add(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | c tdigest.Centroid 14 | r tdigest.Centroid 15 | want tdigest.Centroid 16 | wantErr bool 17 | errStr string 18 | }{ 19 | { 20 | name: "error when weight is zero", 21 | r: tdigest.Centroid{ 22 | Weight: -1.0, 23 | }, 24 | wantErr: true, 25 | errStr: "centroid weight cannot be less than zero", 26 | }, 27 | { 28 | name: "zero weight", 29 | c: tdigest.Centroid{ 30 | Weight: 0.0, 31 | Mean: 1.0, 32 | }, 33 | r: tdigest.Centroid{ 34 | Weight: 1.0, 35 | Mean: 2.0, 36 | }, 37 | want: tdigest.Centroid{ 38 | Weight: 1.0, 39 | Mean: 2.0, 40 | }, 41 | }, 42 | { 43 | name: "weight order of magnitude", 44 | c: tdigest.Centroid{ 45 | Weight: 1, 46 | Mean: 1, 47 | }, 48 | r: tdigest.Centroid{ 49 | Weight: 10, 50 | Mean: 10, 51 | }, 52 | want: tdigest.Centroid{ 53 | Weight: 11, 54 | Mean: 9.181818181818182, 55 | }, 56 | }, 57 | } 58 | for _, tt := range tests { 59 | t.Run(tt.name, func(t *testing.T) { 60 | c := &tt.c 61 | if err := c.Add(tt.r); (err != nil) != tt.wantErr { 62 | t.Errorf("Centroid.Add() error = %v, wantErr %v", err, tt.wantErr) 63 | } else if tt.wantErr && err.Error() != tt.errStr { 64 | t.Errorf("Centroid.Add() error.Error() = %s, errStr %v", err.Error(), tt.errStr) 65 | } 66 | if !cmp.Equal(tt.c, tt.want) { 67 | t.Errorf("unexprected centroid -want/+got\n%s", cmp.Diff(tt.want, tt.c)) 68 | } 69 | }) 70 | } 71 | } 72 | 73 | func TestNewCentroidList(t *testing.T) { 74 | tests := []struct { 75 | name string 76 | centroids []tdigest.Centroid 77 | want tdigest.CentroidList 78 | }{ 79 | { 80 | name: "empty list", 81 | }, 82 | { 83 | name: "priority should be by mean ascending", 84 | centroids: []tdigest.Centroid{ 85 | { 86 | Mean: 2.0, 87 | }, 88 | { 89 | Mean: 1.0, 90 | }, 91 | }, 92 | want: tdigest.CentroidList{ 93 | { 94 | Mean: 1.0, 95 | }, 96 | { 97 | Mean: 2.0, 98 | }, 99 | }, 100 | }, 101 | { 102 | name: "single element should be identity", 103 | centroids: []tdigest.Centroid{ 104 | { 105 | Mean: 1.0, 106 | }, 107 | }, 108 | want: tdigest.CentroidList{ 109 | { 110 | Mean: 1.0, 111 | }, 112 | }, 113 | }, 114 | } 115 | for _, tt := range tests { 116 | t.Run(tt.name, func(t *testing.T) { 117 | if got := tdigest.NewCentroidList(tt.centroids); !cmp.Equal(tt.want, got) { 118 | t.Errorf("NewCentroidList() = -want/+got %s", cmp.Diff(tt.want, got)) 119 | } 120 | }) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/influxdata/tdigest 2 | 3 | require ( 4 | github.com/google/go-cmp v0.2.0 5 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de 6 | 
gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca 7 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 // indirect 8 | ) 9 | 10 | go 1.13 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= 2 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 3 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de h1:xSjD6HQTqT0H/k60N5yYBtnN1OEkVy7WIo/DYyxKRO0= 4 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 5 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 6 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca h1:PupagGYwj8+I4ubCxcmcBRk3VlUWtTg5huQpZR9flmE= 7 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= 8 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 h1:4WsZyVtkthqrHTbDCJfiTs8IWNYE4uvsSDgaV6xpp+o= 9 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 10 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | ) 7 | 8 | // TDigest is a data structure for accurate on-line accumulation of 9 | // rank-based statistics such as quantiles and trimmed means. 10 | type TDigest struct { 11 | Compression float64 12 | 13 | maxProcessed int 14 | maxUnprocessed int 15 | processed CentroidList 16 | unprocessed CentroidList 17 | cumulative []float64 18 | processedWeight float64 19 | unprocessedWeight float64 20 | min float64 21 | max float64 22 | } 23 | 24 | // New initializes a new distribution with a default compression. 25 | func New() *TDigest { 26 | return NewWithCompression(1000) 27 | } 28 | 29 | // NewWithCompression initializes a new distribution with custom compression. 30 | func NewWithCompression(c float64) *TDigest { 31 | t := &TDigest{ 32 | Compression: c, 33 | } 34 | t.maxProcessed = processedSize(0, t.Compression) 35 | t.maxUnprocessed = unprocessedSize(0, t.Compression) 36 | t.processed = make(CentroidList, 0, t.maxProcessed) 37 | t.unprocessed = make(CentroidList, 0, t.maxUnprocessed+1) 38 | t.Reset() 39 | return t 40 | } 41 | 42 | // Calculate number of bytes needed for a tdigest of size c, 43 | // where c is the compression value 44 | func ByteSizeForCompression(comp float64) int { 45 | c := int(comp) 46 | // // A centroid is 2 float64s, so we need 16 bytes for each centroid 47 | // float_size := 8 48 | // centroid_size := 2 * float_size 49 | 50 | // // Unprocessed and processed can grow up to length c 51 | // unprocessed_size := centroid_size * c 52 | // processed_size := unprocessed_size 53 | 54 | // // the cumulative field can also be of length c, but each item is a single float64 55 | // cumulative_size := float_size * c // <- this could also be unprocessed_size / 2 56 | 57 | // return unprocessed_size + processed_size + cumulative_size 58 | 59 | // // or, more succinctly: 60 | // return float_size * c * 5 61 | 62 | // or even more succinctly 63 | return c * 40 64 | } 65 | 66 | // Reset resets the distribution to its initial state. 
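// Previously allocated centroid buffers are kept for reuse, so subsequent Adds do not reallocate.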
67 | func (t *TDigest) Reset() { 68 | t.processed = t.processed[:0] 69 | t.unprocessed = t.unprocessed[:0] 70 | t.cumulative = t.cumulative[:0] 71 | t.processedWeight = 0 72 | t.unprocessedWeight = 0 73 | t.min = math.MaxFloat64 74 | t.max = -math.MaxFloat64 75 | } 76 | 77 | // Add adds a value x with a weight w to the distribution. 78 | func (t *TDigest) Add(x, w float64) { 79 | t.AddCentroid(Centroid{Mean: x, Weight: w}) 80 | } 81 | 82 | // AddCentroidList can quickly add multiple centroids. 83 | func (t *TDigest) AddCentroidList(c CentroidList) { 84 | // It's possible to optimize this by bulk-copying the slice, but this 85 | // yields just a 1-2% speedup (most time is in process()), so not worth 86 | // the complexity. 87 | for i := range c { 88 | t.AddCentroid(c[i]) 89 | } 90 | } 91 | 92 | // AddCentroid adds a single centroid. 93 | // Weights which are not a number or are <= 0 are ignored, as are NaN means. 94 | func (t *TDigest) AddCentroid(c Centroid) { 95 | if math.IsNaN(c.Mean) || c.Weight <= 0 || math.IsNaN(c.Weight) || math.IsInf(c.Weight, 1) { 96 | return 97 | } 98 | 99 | t.unprocessed = append(t.unprocessed, c) 100 | t.unprocessedWeight += c.Weight 101 | 102 | if t.processed.Len() > t.maxProcessed || 103 | t.unprocessed.Len() > t.maxUnprocessed { 104 | t.process() 105 | } 106 | } 107 | 108 | // Merges the supplied digest into this digest. Functionally equivalent to 109 | // calling t.AddCentroidList(t2.Centroids(nil)), but avoids making an extra 110 | // copy of the CentroidList. 111 | func (t *TDigest) Merge(t2 *TDigest) { 112 | t2.process() 113 | t.AddCentroidList(t2.processed) 114 | } 115 | 116 | func (t *TDigest) process() { 117 | if t.unprocessed.Len() > 0 || 118 | t.processed.Len() > t.maxProcessed { 119 | 120 | // Append all processed centroids to the unprocessed list and sort 121 | t.unprocessed = append(t.unprocessed, t.processed...) 122 | sort.Sort(&t.unprocessed) 123 | 124 | // Reset processed list with first centroid 125 | t.processed.Clear() 126 | t.processed = append(t.processed, t.unprocessed[0]) 127 | 128 | t.processedWeight += t.unprocessedWeight 129 | t.unprocessedWeight = 0 130 | soFar := t.unprocessed[0].Weight 131 | limit := t.processedWeight * t.integratedQ(1.0) 132 | for _, centroid := range t.unprocessed[1:] { 133 | projected := soFar + centroid.Weight 134 | if projected <= limit { 135 | soFar = projected 136 | (&t.processed[t.processed.Len()-1]).Add(centroid) 137 | } else { 138 | k1 := t.integratedLocation(soFar / t.processedWeight) 139 | limit = t.processedWeight * t.integratedQ(k1+1.0) 140 | soFar += centroid.Weight 141 | t.processed = append(t.processed, centroid) 142 | } 143 | } 144 | t.min = math.Min(t.min, t.processed[0].Mean) 145 | t.max = math.Max(t.max, t.processed[t.processed.Len()-1].Mean) 146 | t.unprocessed.Clear() 147 | } 148 | } 149 | 150 | // Centroids returns a copy of processed centroids. 151 | // Useful when aggregating multiple t-digests. 152 | // 153 | // Centroids are appended to the passed CentroidList; if you're re-using a 154 | // buffer, be sure to pass cl[:0]. 155 | func (t *TDigest) Centroids(cl CentroidList) CentroidList { 156 | t.process() 157 | return append(cl, t.processed...) 158 | } 159 | 160 | func (t *TDigest) Count() float64 { 161 | t.process() 162 | 163 | // t.process always updates t.processedWeight to the total count of all 164 | // centroids, so we don't need to re-count here. 
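// Values rejected by AddCentroid (NaN means, or NaN, non-positive, or infinite weights) never contribute to this total.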
165 | return t.processedWeight 166 | } 167 | 168 | func (t *TDigest) updateCumulative() { 169 | // Weight can only increase, so the final cumulative value will always be 170 | // either equal to, or less than, the total weight. If they are the same, 171 | // then nothing has changed since the last update. 172 | if len(t.cumulative) > 0 && t.cumulative[len(t.cumulative)-1] == t.processedWeight { 173 | return 174 | } 175 | 176 | if n := t.processed.Len() + 1; n <= cap(t.cumulative) { 177 | t.cumulative = t.cumulative[:n] 178 | } else { 179 | t.cumulative = make([]float64, n) 180 | } 181 | 182 | prev := 0.0 183 | for i, centroid := range t.processed { 184 | cur := centroid.Weight 185 | t.cumulative[i] = prev + cur/2.0 186 | prev = prev + cur 187 | } 188 | t.cumulative[t.processed.Len()] = prev 189 | } 190 | 191 | // Quantile returns the (approximate) quantile of 192 | // the distribution. Accepted values for q are between 0.0 and 1.0. 193 | // Returns NaN if Count is zero or bad inputs. 194 | func (t *TDigest) Quantile(q float64) float64 { 195 | t.process() 196 | t.updateCumulative() 197 | if q < 0 || q > 1 || t.processed.Len() == 0 { 198 | return math.NaN() 199 | } 200 | if t.processed.Len() == 1 { 201 | return t.processed[0].Mean 202 | } 203 | index := q * t.processedWeight 204 | if index <= t.processed[0].Weight/2.0 { 205 | return t.min + 2.0*index/t.processed[0].Weight*(t.processed[0].Mean-t.min) 206 | } 207 | 208 | lower := sort.Search(len(t.cumulative), func(i int) bool { 209 | return t.cumulative[i] >= index 210 | }) 211 | 212 | if lower+1 != len(t.cumulative) { 213 | z1 := index - t.cumulative[lower-1] 214 | z2 := t.cumulative[lower] - index 215 | return weightedAverage(t.processed[lower-1].Mean, z2, t.processed[lower].Mean, z1) 216 | } 217 | 218 | z1 := index - t.processedWeight - t.processed[lower-1].Weight/2.0 219 | z2 := (t.processed[lower-1].Weight / 2.0) - z1 220 | return weightedAverage(t.processed[t.processed.Len()-1].Mean, z1, t.max, z2) 221 | } 222 | 223 | // CDF returns the cumulative distribution function for a given value x. 
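// The result is the approximate fraction of the total added weight that lies at or below x, in the range [0, 1].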
224 | func (t *TDigest) CDF(x float64) float64 { 225 | t.process() 226 | t.updateCumulative() 227 | switch t.processed.Len() { 228 | case 0: 229 | return 0.0 230 | case 1: 231 | width := t.max - t.min 232 | if x <= t.min { 233 | return 0.0 234 | } 235 | if x >= t.max { 236 | return 1.0 237 | } 238 | if (x - t.min) <= width { 239 | // min and max are too close together to do any viable interpolation 240 | return 0.5 241 | } 242 | return (x - t.min) / width 243 | } 244 | 245 | if x <= t.min { 246 | return 0.0 247 | } 248 | if x >= t.max { 249 | return 1.0 250 | } 251 | m0 := t.processed[0].Mean 252 | // Left Tail 253 | if x <= m0 { 254 | if m0-t.min > 0 { 255 | return (x - t.min) / (m0 - t.min) * t.processed[0].Weight / t.processedWeight / 2.0 256 | } 257 | return 0.0 258 | } 259 | // Right Tail 260 | mn := t.processed[t.processed.Len()-1].Mean 261 | if x >= mn { 262 | if t.max-mn > 0.0 { 263 | return 1.0 - (t.max-x)/(t.max-mn)*t.processed[t.processed.Len()-1].Weight/t.processedWeight/2.0 264 | } 265 | return 1.0 266 | } 267 | 268 | upper := sort.Search(t.processed.Len(), func(i int) bool { 269 | return t.processed[i].Mean > x 270 | }) 271 | 272 | z1 := x - t.processed[upper-1].Mean 273 | z2 := t.processed[upper].Mean - x 274 | return weightedAverage(t.cumulative[upper-1], z2, t.cumulative[upper], z1) / t.processedWeight 275 | } 276 | 277 | func (t *TDigest) integratedQ(k float64) float64 { 278 | return (math.Sin(math.Min(k, t.Compression)*math.Pi/t.Compression-math.Pi/2.0) + 1.0) / 2.0 279 | } 280 | 281 | func (t *TDigest) integratedLocation(q float64) float64 { 282 | return t.Compression * (math.Asin(2.0*q-1.0) + math.Pi/2.0) / math.Pi 283 | } 284 | 285 | func weightedAverage(x1, w1, x2, w2 float64) float64 { 286 | if x1 <= x2 { 287 | return weightedAverageSorted(x1, w1, x2, w2) 288 | } 289 | return weightedAverageSorted(x2, w2, x1, w1) 290 | } 291 | 292 | func weightedAverageSorted(x1, w1, x2, w2 float64) float64 { 293 | x := (x1*w1 + x2*w2) / (w1 + w2) 294 | return math.Max(x1, math.Min(x, x2)) 295 | } 296 | 297 | func processedSize(size int, compression float64) int { 298 | if size == 0 { 299 | return int(2 * math.Ceil(compression)) 300 | } 301 | return size 302 | } 303 | 304 | func unprocessedSize(size int, compression float64) int { 305 | if size == 0 { 306 | return int(8 * math.Ceil(compression)) 307 | } 308 | return size 309 | } 310 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest_test 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "reflect" 7 | "testing" 8 | 9 | "github.com/influxdata/tdigest" 10 | "golang.org/x/exp/rand" 11 | "gonum.org/v1/gonum/stat/distuv" 12 | ) 13 | 14 | const ( 15 | N = 1e6 16 | Mu = 10 17 | Sigma = 3 18 | 19 | seed = 42 20 | ) 21 | 22 | // NormalData is a slice of N random values that are normaly distributed with mean Mu and standard deviation Sigma. 
23 | var NormalData []float64 24 | var UniformData []float64 25 | 26 | var NormalDigest *tdigest.TDigest 27 | var UniformDigest *tdigest.TDigest 28 | 29 | func init() { 30 | dist := distuv.Normal{ 31 | Mu: Mu, 32 | Sigma: Sigma, 33 | Src: rand.New(rand.NewSource(seed)), 34 | } 35 | uniform := rand.New(rand.NewSource(seed)) 36 | 37 | UniformData = make([]float64, N) 38 | UniformDigest = tdigest.NewWithCompression(1000) 39 | 40 | NormalData = make([]float64, N) 41 | NormalDigest = tdigest.NewWithCompression(1000) 42 | 43 | for i := range NormalData { 44 | NormalData[i] = dist.Rand() 45 | NormalDigest.Add(NormalData[i], 1) 46 | 47 | UniformData[i] = uniform.Float64() * 100 48 | UniformDigest.Add(UniformData[i], 1) 49 | } 50 | } 51 | 52 | // Compares the quantile results of two digests, and fails if the 53 | // fractional err exceeds maxErr. 54 | // Always fails if the total count differs. 55 | func compareQuantiles(td1, td2 *tdigest.TDigest, maxErr float64) error { 56 | if td1.Count() != td2.Count() { 57 | return fmt.Errorf("counts are not equal, %d vs %d", int64(td1.Count()), int64(td2.Count())) 58 | } 59 | for q := 0.05; q < 1; q += 0.05 { 60 | if math.Abs(td1.Quantile(q)-td2.Quantile(q))/td1.Quantile(q) > maxErr { 61 | return fmt.Errorf("quantile %g differs, %g vs %g", q, td1.Quantile(q), td2.Quantile(q)) 62 | } 63 | } 64 | return nil 65 | } 66 | 67 | // All Add methods should yield equivalent results. 68 | func TestTdigest_AddFuncs(t *testing.T) { 69 | centroids := NormalDigest.Centroids(nil) 70 | 71 | addDigest := tdigest.NewWithCompression(100) 72 | addCentroidDigest := tdigest.NewWithCompression(100) 73 | addCentroidListDigest := tdigest.NewWithCompression(100) 74 | 75 | for _, c := range centroids { 76 | addDigest.Add(c.Mean, c.Weight) 77 | addCentroidDigest.AddCentroid(c) 78 | } 79 | addCentroidListDigest.AddCentroidList(centroids) 80 | 81 | if err := compareQuantiles(addDigest, addCentroidDigest, 0.01); err != nil { 82 | t.Errorf("AddCentroid() differs from from Add(): %s", err.Error()) 83 | } 84 | if err := compareQuantiles(addDigest, addCentroidListDigest, 0.01); err != nil { 85 | t.Errorf("AddCentroidList() differs from from Add(): %s", err.Error()) 86 | } 87 | } 88 | 89 | func TestTdigest_Count(t *testing.T) { 90 | tests := []struct { 91 | name string 92 | data []float64 93 | digest *tdigest.TDigest 94 | want float64 95 | }{ 96 | { 97 | name: "empty", 98 | data: []float64{}, 99 | want: 0, 100 | }, 101 | { 102 | name: "not empty", 103 | data: []float64{5, 4}, 104 | want: 2, 105 | }, 106 | } 107 | 108 | for _, tt := range tests { 109 | t.Run(tt.name, func(t *testing.T) { 110 | td := tt.digest 111 | if td == nil { 112 | td = tdigest.NewWithCompression(1000) 113 | for _, x := range tt.data { 114 | td.Add(x, 1) 115 | } 116 | } 117 | got := td.Count() 118 | if got != tt.want { 119 | t.Errorf("unexpected count, got %g want %g", got, tt.want) 120 | } 121 | }) 122 | } 123 | 124 | got := NormalDigest.Count() 125 | want := float64(len(NormalData)) 126 | if got != want { 127 | t.Errorf("unexpected count for NormalDigest, got %g want %g", got, want) 128 | } 129 | 130 | got = UniformDigest.Count() 131 | want = float64(len(UniformData)) 132 | if got != want { 133 | t.Errorf("unexpected count for UniformDigest, got %g want %g", got, want) 134 | } 135 | } 136 | 137 | func TestTdigest_Quantile(t *testing.T) { 138 | tests := []struct { 139 | name string 140 | data []float64 141 | digest *tdigest.TDigest 142 | quantile float64 143 | want float64 144 | }{ 145 | { 146 | name: "increasing", 147 | 
quantile: 0.5, 148 | data: []float64{1, 2, 3, 4, 5}, 149 | want: 3, 150 | }, 151 | { 152 | name: "data in decreasing order", 153 | quantile: 0.25, 154 | data: []float64{555.349107, 432.842597}, 155 | want: 432.842597, 156 | }, 157 | { 158 | name: "small", 159 | quantile: 0.5, 160 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 161 | want: 3, 162 | }, 163 | { 164 | name: "small 99 (max)", 165 | quantile: 0.99, 166 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 167 | want: 5, 168 | }, 169 | { 170 | name: "normal 50", 171 | quantile: 0.5, 172 | digest: NormalDigest, 173 | want: 10.000673533707138, 174 | }, 175 | { 176 | name: "normal 90", 177 | quantile: 0.9, 178 | digest: NormalDigest, 179 | want: 13.842132136909889, 180 | }, 181 | { 182 | name: "uniform 50", 183 | quantile: 0.5, 184 | digest: UniformDigest, 185 | want: 49.992502345843555, 186 | }, 187 | { 188 | name: "uniform 90", 189 | quantile: 0.9, 190 | digest: UniformDigest, 191 | want: 89.98281777095822, 192 | }, 193 | { 194 | name: "uniform 99", 195 | quantile: 0.99, 196 | digest: UniformDigest, 197 | want: 98.98503400959562, 198 | }, 199 | { 200 | name: "uniform 99.9", 201 | quantile: 0.999, 202 | digest: UniformDigest, 203 | want: 99.90103781043621, 204 | }, 205 | } 206 | for _, tt := range tests { 207 | t.Run(tt.name, func(t *testing.T) { 208 | td := tt.digest 209 | if td == nil { 210 | td = tdigest.NewWithCompression(1000) 211 | for _, x := range tt.data { 212 | td.Add(x, 1) 213 | } 214 | } 215 | got := td.Quantile(tt.quantile) 216 | if got != tt.want { 217 | t.Errorf("unexpected quantile %f, got %g want %g", tt.quantile, got, tt.want) 218 | } 219 | }) 220 | } 221 | } 222 | 223 | func TestTdigest_CDFs(t *testing.T) { 224 | tests := []struct { 225 | name string 226 | data []float64 227 | digest *tdigest.TDigest 228 | cdf float64 229 | want float64 230 | }{ 231 | { 232 | name: "increasing", 233 | cdf: 3, 234 | data: []float64{1, 2, 3, 4, 5}, 235 | want: 0.5, 236 | }, 237 | { 238 | name: "small", 239 | cdf: 4, 240 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 241 | want: 0.75, 242 | }, 243 | { 244 | name: "small max", 245 | cdf: 5, 246 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 247 | want: 1, 248 | }, 249 | { 250 | name: "normal mean", 251 | cdf: 10, 252 | data: NormalData, 253 | want: 0.4999156505250766, 254 | }, 255 | { 256 | name: "normal high", 257 | cdf: -100, 258 | data: NormalData, 259 | want: 0, 260 | }, 261 | { 262 | name: "normal low", 263 | cdf: 110, 264 | data: NormalData, 265 | want: 1, 266 | }, 267 | { 268 | name: "uniform 50", 269 | cdf: 50, 270 | data: UniformData, 271 | want: 0.5000756133965755, 272 | }, 273 | { 274 | name: "uniform min", 275 | cdf: 0, 276 | data: UniformData, 277 | want: 0, 278 | }, 279 | { 280 | name: "uniform max", 281 | cdf: 100, 282 | data: UniformData, 283 | want: 1, 284 | }, 285 | { 286 | name: "uniform 10", 287 | cdf: 10, 288 | data: UniformData, 289 | want: 0.09987932577650871, 290 | }, 291 | { 292 | name: "uniform 90", 293 | cdf: 90, 294 | data: UniformData, 295 | want: 0.9001667885256108, 296 | }, 297 | } 298 | for _, tt := range tests { 299 | t.Run(tt.name, func(t *testing.T) { 300 | td := tt.digest 301 | if td == nil { 302 | td = tdigest.NewWithCompression(1000) 303 | for _, x := range tt.data { 304 | td.Add(x, 1) 305 | } 306 | } 307 | got := td.CDF(tt.cdf) 308 | if got != tt.want { 309 | t.Errorf("unexpected CDF %f, got %g want %g", tt.cdf, got, tt.want) 310 | } 311 | }) 312 | } 313 | } 314 | 315 | func TestTdigest_Reset(t *testing.T) { 316 | td := tdigest.New() 317 | for 
_, x := range NormalData { 318 | td.Add(x, 1) 319 | } 320 | q1 := td.Quantile(0.9) 321 | 322 | td.Reset() 323 | for _, x := range NormalData { 324 | td.Add(x, 1) 325 | } 326 | if q2 := td.Quantile(0.9); q2 != q1 { 327 | t.Errorf("unexpected quantile, got %g want %g", q2, q1) 328 | } 329 | } 330 | 331 | func TestTdigest_OddInputs(t *testing.T) { 332 | td := tdigest.New() 333 | td.Add(math.NaN(), 1) 334 | td.Add(1, math.NaN()) 335 | td.Add(1, 0) 336 | td.Add(1, -1000) 337 | if td.Count() != 0 { 338 | t.Error("invalid value was allowed to be added") 339 | } 340 | 341 | // Infinite values are allowed. 342 | td.Add(1, 1) 343 | td.Add(2, 1) 344 | td.Add(math.Inf(1), 1) 345 | if q := td.Quantile(0.5); q != 2 { 346 | t.Errorf("expected median value 2, got %f", q) 347 | } 348 | if q := td.Quantile(0.9); !math.IsInf(q, 1) { 349 | t.Errorf("expected +Inf 0.9 quantile, got %f", q) 350 | } 351 | } 352 | 353 | func TestTdigest_Merge(t *testing.T) { 354 | // Repeat merges enough times to ensure we call compress() 355 | numRepeats := 20 356 | addDigest := tdigest.New() 357 | for i := 0; i < numRepeats; i++ { 358 | for _, c := range NormalDigest.Centroids(nil) { 359 | addDigest.AddCentroid(c) 360 | } 361 | for _, c := range UniformDigest.Centroids(nil) { 362 | addDigest.AddCentroid(c) 363 | } 364 | } 365 | 366 | mergeDigest := tdigest.New() 367 | for i := 0; i < numRepeats; i++ { 368 | mergeDigest.Merge(NormalDigest) 369 | mergeDigest.Merge(UniformDigest) 370 | } 371 | 372 | if err := compareQuantiles(addDigest, mergeDigest, 0.001); err != nil { 373 | t.Errorf("AddCentroid() differs from Merge(): %s", err.Error()) 374 | } 375 | 376 | // Empty merge does nothing and has no effect on underlying centroids. 377 | c1 := addDigest.Centroids(nil) 378 | addDigest.Merge(tdigest.New()) 379 | c2 := addDigest.Centroids(nil) 380 | if !reflect.DeepEqual(c1, c2) { 381 | t.Error("Merging an empty digest altered data") 382 | } 383 | } 384 | 385 | var quantiles = []float64{0.1, 0.5, 0.9, 0.99, 0.999} 386 | 387 | func BenchmarkTDigest_Add(b *testing.B) { 388 | for n := 0; n < b.N; n++ { 389 | td := tdigest.NewWithCompression(1000) 390 | for _, x := range NormalData { 391 | td.Add(x, 1) 392 | } 393 | } 394 | } 395 | 396 | func BenchmarkTDigest_AddCentroid(b *testing.B) { 397 | centroids := make(tdigest.CentroidList, len(NormalData)) 398 | for i := range centroids { 399 | centroids[i].Mean = NormalData[i] 400 | centroids[i].Weight = 1 401 | } 402 | 403 | b.ResetTimer() 404 | for n := 0; n < b.N; n++ { 405 | td := tdigest.NewWithCompression(1000) 406 | for i := range centroids { 407 | td.AddCentroid(centroids[i]) 408 | } 409 | } 410 | } 411 | 412 | func BenchmarkTDigest_AddCentroidList(b *testing.B) { 413 | centroids := make(tdigest.CentroidList, len(NormalData)) 414 | for i := range centroids { 415 | centroids[i].Mean = NormalData[i] 416 | centroids[i].Weight = 1 417 | } 418 | 419 | b.ResetTimer() 420 | for n := 0; n < b.N; n++ { 421 | td := tdigest.NewWithCompression(1000) 422 | td.AddCentroidList(centroids) 423 | } 424 | } 425 | 426 | func BenchmarkTDigest_Merge(b *testing.B) { 427 | b.Run("AddCentroid", func(b *testing.B) { 428 | var cl tdigest.CentroidList 429 | td := tdigest.New() 430 | for n := 0; n < b.N; n++ { 431 | cl = NormalDigest.Centroids(cl[:0]) 432 | for i := range cl { 433 | td.AddCentroid(cl[i]) 434 | } 435 | } 436 | }) 437 | b.Run("Merge", func(b *testing.B) { 438 | td := tdigest.New() 439 | for n := 0; n < b.N; n++ { 440 | td.Merge(NormalDigest) 441 | } 442 | }) 443 | } 444 | 445 | func
BenchmarkTDigest_Quantile(b *testing.B) { 446 | td := tdigest.NewWithCompression(1000) 447 | for _, x := range NormalData { 448 | td.Add(x, 1) 449 | } 450 | b.ResetTimer() 451 | var x float64 452 | for n := 0; n < b.N; n++ { 453 | for _, q := range quantiles { 454 | x += td.Quantile(q) 455 | } 456 | } 457 | } 458 | 459 | func TestTdigest_Centroids(t *testing.T) { 460 | tests := []struct { 461 | name string 462 | data []float64 463 | digest *tdigest.TDigest 464 | want tdigest.CentroidList 465 | }{ 466 | { 467 | name: "increasing", 468 | data: []float64{1, 2, 3, 4, 5}, 469 | want: tdigest.CentroidList{ 470 | tdigest.Centroid{ 471 | Mean: 1.0, 472 | Weight: 1.0, 473 | }, 474 | 475 | tdigest.Centroid{ 476 | Mean: 2.5, 477 | Weight: 2.0, 478 | }, 479 | 480 | tdigest.Centroid{ 481 | Mean: 4.0, 482 | Weight: 1.0, 483 | }, 484 | 485 | tdigest.Centroid{ 486 | Mean: 5.0, 487 | Weight: 1.0, 488 | }, 489 | }, 490 | }, 491 | } 492 | 493 | for _, tt := range tests { 494 | t.Run(tt.name, func(t *testing.T) { 495 | var got tdigest.CentroidList 496 | td := tt.digest 497 | if td == nil { 498 | td = tdigest.NewWithCompression(3) 499 | for _, x := range tt.data { 500 | td.Add(x, 1) 501 | } 502 | } 503 | got = td.Centroids(got[:0]) 504 | if !reflect.DeepEqual(got, tt.want) { 505 | t.Errorf("unexpected list got %g want %g", got, tt.want) 506 | } 507 | }) 508 | } 509 | } 510 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | This directory contains two programs, `main.go` and `main.cpp`, which both read three input files, compute various quantiles and CDF values, and write out their results. 4 | The purpose of these programs is to show that the Go implementation is accurate when compared to the C++ implementation. 5 | 6 | The tests can be run using `test.sh`.
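Both programs write one result per line, so corresponding output files can be compared directly. Purely as an illustration of that comparison (this is not the repository's own `validate` program), a standalone checker might look like the following sketch, where the tolerance is an arbitrary choice:

```go
package main

import (
	"bufio"
	"fmt"
	"math"
	"os"
	"strconv"
)

// readFloats reads one float64 per line, the format written by both test programs.
func readFloats(name string) []float64 {
	f, err := os.Open(name)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	var vals []float64
	s := bufio.NewScanner(f)
	for s.Scan() {
		x, err := strconv.ParseFloat(s.Text(), 64)
		if err != nil {
			panic(err)
		}
		vals = append(vals, x)
	}
	return vals
}

func main() {
	// Usage: checker <go results file> <cpp results file>
	a, b := readFloats(os.Args[1]), readFloats(os.Args[2])
	if len(a) != len(b) {
		panic("result files have different lengths")
	}
	for i := range a {
		if math.Abs(a[i]-b[i]) > 1e-9 {
			fmt.Printf("line %d differs: %g vs %g\n", i+1, a[i], b[i])
		}
	}
}
```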
7 | 8 | -------------------------------------------------------------------------------- /test/gen/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "strconv" 6 | 7 | "golang.org/x/exp/rand" 8 | "gonum.org/v1/gonum/stat/distuv" 9 | ) 10 | 11 | const ( 12 | N = 1e6 13 | Mu = 10 14 | Sigma = 3 15 | 16 | seed = 42 17 | ) 18 | 19 | func main() { 20 | // Generate uniform and normal data 21 | uniform := rand.New(rand.NewSource(seed)) 22 | dist := distuv.Normal{ 23 | Mu: Mu, 24 | Sigma: Sigma, 25 | Src: rand.New(rand.NewSource(seed)), 26 | } 27 | 28 | uniformData := make([]float64, N) 29 | normalData := make([]float64, N) 30 | for i := range normalData { 31 | normalData[i] = dist.Rand() 32 | uniformData[i] = uniform.Float64() * 100 33 | } 34 | 35 | smallData := []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} 36 | 37 | writeData("uniform.dat", uniformData) 38 | writeData("normal.dat", normalData) 39 | writeData("small.dat", smallData) 40 | } 41 | 42 | func writeData(name string, data []float64) { 43 | f, err := os.Create(name) 44 | if err != nil { 45 | panic(err) 46 | } 47 | defer f.Close() 48 | 49 | buf := make([]byte, 0, 64) 50 | for _, x := range data { 51 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) 52 | _, err := f.Write(buf) 53 | if err != nil { 54 | panic(err) 55 | } 56 | _, err = f.Write([]byte{'\n'}) 57 | if err != nil { 58 | panic(err) 59 | } 60 | buf = buf[0:0] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | #include "tdigest.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace tdigest; 12 | 13 | double quantiles[7] = { 14 | 0.1, 15 | 0.2, 16 | 0.5, 17 | 0.75, 18 | 0.9, 19 | 0.99, 20 | 0.999, 21 | }; 22 | 23 | 24 | std::string dataFiles[3] = {"small.dat", "uniform.dat", "normal.dat"}; 25 | double cdfs[3][5] = { 26 | // small.dat 27 | {0, 1, 4, 5, 6}, 28 | // uniform.dat 29 | {-1, 0, 50, 100, 101}, 30 | // normal.dat 31 | {-100, 7, 10, 13, 110}, 32 | }; 33 | 34 | 35 | std::vector loadData(std::string name) { 36 | std::ifstream f (name); 37 | std::vector data; 38 | 39 | f >> std::setprecision(std::numeric_limits::digits10 + 1); 40 | double x; 41 | while (f >> x) { 42 | data.push_back(x); 43 | } 44 | return data; 45 | } 46 | 47 | TDigest* createTDigest(std::vector data){ 48 | TDigest* td = new TDigest(1000); 49 | for (auto x : data) { 50 | td->add(x); 51 | } 52 | return td; 53 | } 54 | 55 | std::vector computeQuantiles(TDigest* td){ 56 | std::vector results; 57 | for (int i = 0; i < 7; i++) { 58 | double q = td->quantile(quantiles[i]); 59 | results.push_back(q); 60 | } 61 | return results; 62 | } 63 | 64 | std::vector computeCDFs(TDigest* td, double cdfs[5]) { 65 | std::vector results; 66 | for (int i = 0; i < 5; i++) { 67 | double p = td->cdf(cdfs[i]); 68 | results.push_back(p); 69 | } 70 | 71 | return results; 72 | } 73 | 74 | void writeResults(std::string name, std::vector results){ 75 | std::ofstream f (name); 76 | 77 | f << std::setprecision(std::numeric_limits::digits10 + 1); 78 | for (auto x : results) { 79 | f << x << std::endl; 80 | } 81 | } 82 | 83 | int main() { 84 | for (int i = 0; i < 3; i++) { 85 | std::vector data = loadData(dataFiles[i]); 86 | TDigest* td = createTDigest(data); 87 | auto results = computeQuantiles(td); 88 | writeResults(dataFiles[i] + 
".cpp.quantiles", results); 89 | results = computeCDFs(td, cdfs[i]); 90 | writeResults(dataFiles[i] + ".cpp.cdfs", results); 91 | } 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /test/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strconv" 7 | 8 | "github.com/influxdata/tdigest" 9 | ) 10 | 11 | var quantiles = []float64{ 12 | 0.1, 13 | 0.2, 14 | 0.5, 15 | 0.75, 16 | 0.9, 17 | 0.99, 18 | 0.999, 19 | } 20 | 21 | var cdfs = map[string][]float64{ 22 | "small.dat": []float64{0, 1, 4, 5, 6}, 23 | "uniform.dat": []float64{-1, 0, 50, 100, 101}, 24 | "normal.dat": []float64{-100, 7, 10, 13, 110}, 25 | } 26 | 27 | var dataFiles = []string{ 28 | "small.dat", 29 | "uniform.dat", 30 | "normal.dat", 31 | } 32 | 33 | func main() { 34 | for _, f := range dataFiles { 35 | data := loadData(f) 36 | td := createTdigest(data) 37 | results := computeQuantiles(td, quantiles) 38 | writeResults(f+".go.quantiles", results) 39 | results = computeCDFs(td, cdfs[f]) 40 | writeResults(f+".go.cdfs", results) 41 | } 42 | } 43 | 44 | func loadData(name string) []float64 { 45 | f, err := os.Open(name) 46 | if err != nil { 47 | panic(err) 48 | } 49 | defer f.Close() 50 | s := bufio.NewScanner(f) 51 | var data []float64 52 | for s.Scan() { 53 | x, err := strconv.ParseFloat(s.Text(), 64) 54 | if err != nil { 55 | panic(err) 56 | } 57 | data = append(data, x) 58 | } 59 | return data 60 | } 61 | 62 | func createTdigest(data []float64) *tdigest.TDigest { 63 | td := tdigest.NewWithCompression(1000) 64 | for _, x := range data { 65 | td.Add(x, 1) 66 | } 67 | return td 68 | } 69 | 70 | func computeQuantiles(td *tdigest.TDigest, quantiles []float64) (r []float64) { 71 | for _, q := range quantiles { 72 | r = append(r, td.Quantile(q)) 73 | } 74 | return 75 | } 76 | 77 | func computeCDFs(td *tdigest.TDigest, cdfs []float64) (r []float64) { 78 | for _, x := range cdfs { 79 | r = append(r, td.CDF(x)) 80 | } 81 | return 82 | } 83 | 84 | func writeResults(name string, results []float64) { 85 | f, err := os.Create(name) 86 | if err != nil { 87 | panic(err) 88 | } 89 | defer f.Close() 90 | buf := make([]byte, 0, 64) 91 | for _, x := range results { 92 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) 93 | _, err := f.Write(buf) 94 | if err != nil { 95 | panic(err) 96 | } 97 | _, err = f.Write([]byte{'\n'}) 98 | if err != nil { 99 | panic(err) 100 | } 101 | buf = buf[0:0] 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /test/tdigest.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Derrick R. Burns under one or more 3 | * contributor license agreements. See the NOTICES file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #ifndef TDIGEST2_TDIGEST_H_ 19 | #define TDIGEST2_TDIGEST_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // Modifed from original to remove all external depedencies. 30 | #define DLOG(l) std::cerr 31 | #define LOG(l) std::cerr 32 | 33 | #define CHECK_LE(x1, x2) 34 | #define CHECK_GT(x1, x2) 35 | #define CHECK_GE(x1, x2) 36 | 37 | namespace tdigest { 38 | 39 | using Value = double; 40 | using Weight = double; 41 | using Index = size_t; 42 | 43 | const size_t kHighWater = 40000; 44 | 45 | class Centroid { 46 | public: 47 | Centroid() : Centroid(0.0, 0.0) {} 48 | 49 | Centroid(Value mean, Weight weight) : mean_(mean), weight_(weight) {} 50 | 51 | inline Value mean() const noexcept { return mean_; } 52 | 53 | inline Weight weight() const noexcept { return weight_; } 54 | 55 | inline void add(const Centroid& c) { 56 | CHECK_GT(c.weight_, 0); 57 | if( weight_ != 0.0 ) { 58 | weight_ += c.weight_; 59 | mean_ += c.weight_ * (c.mean_ - mean_) / weight_; 60 | } else { 61 | weight_ = c.weight_; 62 | mean_ = c.mean_; 63 | } 64 | } 65 | 66 | private: 67 | Value mean_ = 0; 68 | Weight weight_ = 0; 69 | }; 70 | 71 | struct CentroidList { 72 | CentroidList(const std::vector& s) : iter(s.cbegin()), end(s.cend()) {} 73 | std::vector::const_iterator iter; 74 | std::vector::const_iterator end; 75 | 76 | bool advance() { return ++iter != end; } 77 | }; 78 | 79 | class CentroidListComparator { 80 | public: 81 | CentroidListComparator() {} 82 | 83 | bool operator()(const CentroidList& left, const CentroidList& right) const { 84 | return left.iter->mean() > right.iter->mean(); 85 | } 86 | }; 87 | 88 | using CentroidListQueue = std::priority_queue, CentroidListComparator>; 89 | 90 | struct CentroidComparator { 91 | bool operator()(const Centroid& a, const Centroid& b) const { return a.mean() < b.mean(); } 92 | }; 93 | 94 | class TDigest { 95 | class TDigestComparator { 96 | public: 97 | TDigestComparator() {} 98 | 99 | bool operator()(const TDigest* left, const TDigest* right) const { return left->totalSize() > right->totalSize(); } 100 | }; 101 | 102 | using TDigestQueue = std::priority_queue, TDigestComparator>; 103 | 104 | public: 105 | TDigest() : TDigest(1000) {} 106 | 107 | explicit TDigest(Value compression) : TDigest(compression, 0) {} 108 | 109 | TDigest(Value compression, Index bufferSize) : TDigest(compression, bufferSize, 0) {} 110 | 111 | TDigest(Value compression, Index unmergedSize, Index mergedSize) 112 | : compression_(compression), 113 | maxProcessed_(processedSize(mergedSize, compression)), 114 | maxUnprocessed_(unprocessedSize(unmergedSize, compression)) { 115 | processed_.reserve(maxProcessed_); 116 | unprocessed_.reserve(maxUnprocessed_ + 1); 117 | } 118 | 119 | TDigest(std::vector&& processed, std::vector&& unprocessed, Value compression, 120 | Index unmergedSize, Index mergedSize) 121 | : TDigest(compression, unmergedSize, mergedSize) { 122 | processed_ = std::move(processed); 123 | unprocessed_ = std::move(unprocessed); 124 | 125 | processedWeight_ = weight(processed_); 126 | unprocessedWeight_ = weight(unprocessed_); 127 | if( processed_.size() > 0 ) { 128 | min_ = std::min(min_, processed_[0].mean()); 129 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 130 | } 131 | updateCumulative(); 132 | } 133 | 134 | static Weight weight(std::vector& centroids) noexcept { 135 | Weight w = 0.0; 136 | for (auto centroid : centroids) { 137 | w += centroid.weight(); 138 | } 139 | return w; 140 | } 141 | 142 
| TDigest& operator=(TDigest&& o) { 143 | compression_ = o.compression_; 144 | maxProcessed_ = o.maxProcessed_; 145 | maxUnprocessed_ = o.maxUnprocessed_; 146 | processedWeight_ = o.processedWeight_; 147 | unprocessedWeight_ = o.unprocessedWeight_; 148 | processed_ = std::move(o.processed_); 149 | unprocessed_ = std::move(o.unprocessed_); 150 | cumulative_ = std::move(o.cumulative_); 151 | min_ = o.min_; 152 | max_ = o.max_; 153 | return *this; 154 | } 155 | 156 | TDigest(TDigest&& o) 157 | : TDigest(std::move(o.processed_), std::move(o.unprocessed_), o.compression_, o.maxUnprocessed_, 158 | o.maxProcessed_) {} 159 | 160 | static inline Index processedSize(Index size, Value compression) noexcept { 161 | return (size == 0) ? static_cast(2 * std::ceil(compression)) : size; 162 | } 163 | 164 | static inline Index unprocessedSize(Index size, Value compression) noexcept { 165 | return (size == 0) ? static_cast(8 * std::ceil(compression)) : size; 166 | } 167 | 168 | // merge in another t-digest 169 | inline void merge(const TDigest* other) { 170 | std::vector others{other}; 171 | add(others.cbegin(), others.cend()); 172 | } 173 | 174 | const std::vector& processed() const { return processed_; } 175 | 176 | const std::vector& unprocessed() const { return unprocessed_; } 177 | 178 | Index maxUnprocessed() const { return maxUnprocessed_; } 179 | 180 | Index maxProcessed() const { return maxProcessed_; } 181 | 182 | inline void add(std::vector digests) { add(digests.cbegin(), digests.cend()); } 183 | 184 | // merge in a vector of tdigests in the most efficient manner possible 185 | // in constant space 186 | // works for any value of kHighWater 187 | void add(std::vector::const_iterator iter, std::vector::const_iterator end) { 188 | if (iter != end) { 189 | auto size = std::distance(iter, end); 190 | TDigestQueue pq(TDigestComparator{}); 191 | for (; iter != end; iter++) { 192 | pq.push((*iter)); 193 | } 194 | std::vector batch; 195 | batch.reserve(size); 196 | 197 | size_t totalSize = 0; 198 | while (!pq.empty()) { 199 | auto td = pq.top(); 200 | batch.push_back(td); 201 | pq.pop(); 202 | totalSize += td->totalSize(); 203 | if (totalSize >= kHighWater || pq.empty()) { 204 | mergeProcessed(batch); 205 | mergeUnprocessed(batch); 206 | processIfNecessary(); 207 | batch.clear(); 208 | totalSize = 0; 209 | } 210 | } 211 | updateCumulative(); 212 | } 213 | } 214 | 215 | Weight processedWeight() const { return processedWeight_; } 216 | 217 | Weight unprocessedWeight() const { return unprocessedWeight_; } 218 | 219 | bool haveUnprocessed() const { return unprocessed_.size() > 0; } 220 | 221 | size_t totalSize() const { return processed_.size() + unprocessed_.size(); } 222 | 223 | long totalWeight() const { return static_cast(processedWeight_ + unprocessedWeight_); } 224 | 225 | // return the cdf on the t-digest 226 | Value cdf(Value x) { 227 | if (haveUnprocessed() || isDirty()) process(); 228 | return cdfProcessed(x); 229 | } 230 | 231 | bool isDirty() { return processed_.size() > maxProcessed_ || unprocessed_.size() > maxUnprocessed_; } 232 | 233 | // return the cdf on the processed values 234 | Value cdfProcessed(Value x) const { 235 | DLOG(INFO) << "cdf value " << x; 236 | DLOG(INFO) << "processed size " << processed_.size(); 237 | if (processed_.size() == 0) { 238 | // no data to examin_e 239 | DLOG(INFO) << "no processed values"; 240 | 241 | return 0.0; 242 | } else if (processed_.size() == 1) { 243 | DLOG(INFO) << "one processed value " 244 | << " min_ " << min_ << " max_ " << max_; 245 | // exactly 
one centroid, should have max_==min_ 246 | auto width = max_ - min_; 247 | if (x < min_) { 248 | return 0.0; 249 | } else if (x > max_) { 250 | return 1.0; 251 | } else if (x - min_ <= width) { 252 | // min_ and max_ are too close together to do any viable interpolation 253 | return 0.5; 254 | } else { 255 | // interpolate if somehow we have weight > 0 and max_ != min_ 256 | return (x - min_) / (max_ - min_); 257 | } 258 | } else { 259 | auto n = processed_.size(); 260 | if (x <= min_) { 261 | DLOG(INFO) << "below min_ " 262 | << " min_ " << min_ << " x " << x; 263 | return 0; 264 | } 265 | 266 | if (x >= max_) { 267 | DLOG(INFO) << "above max_ " 268 | << " max_ " << max_ << " x " << x; 269 | return 1; 270 | } 271 | 272 | // check for the left tail 273 | if (x <= mean(0)) { 274 | DLOG(INFO) << "left tail " 275 | << " min_ " << min_ << " mean(0) " << mean(0) << " x " << x; 276 | 277 | // note that this is different than mean(0) > min_ ... this guarantees interpolation works 278 | if (mean(0) - min_ > 0) { 279 | return (x - min_) / (mean(0) - min_) * weight(0) / processedWeight_ / 2.0; 280 | } else { 281 | return 0; 282 | } 283 | } 284 | 285 | // and the right tail 286 | if (x >= mean(n - 1)) { 287 | DLOG(INFO) << "right tail" 288 | << " max_ " << max_ << " mean(n - 1) " << mean(n - 1) << " x " << x; 289 | 290 | if (max_ - mean(n - 1) > 0) { 291 | return 1.0 - (max_ - x) / (max_ - mean(n - 1)) * weight(n - 1) / processedWeight_ / 2.0; 292 | } else { 293 | return 1; 294 | } 295 | } 296 | 297 | CentroidComparator cc; 298 | auto iter = std::upper_bound(processed_.cbegin(), processed_.cend(), Centroid(x, 0), cc); 299 | 300 | auto i = std::distance(processed_.cbegin(), iter); 301 | auto z1 = x - (iter - 1)->mean(); 302 | auto z2 = (iter)->mean() - x; 303 | CHECK_LE(0.0, z1); 304 | CHECK_LE(0.0, z2); 305 | DLOG(INFO) << "middle " 306 | << " z1 " << z1 << " z2 " << z2 << " x " << x; 307 | 308 | return weightedAverage(cumulative_[i - 1], z2, cumulative_[i], z1) / processedWeight_; 309 | } 310 | } 311 | 312 | // this returns a quantile on the t-digest 313 | Value quantile(Value q) { 314 | if (haveUnprocessed() || isDirty()) process(); 315 | return quantileProcessed(q); 316 | } 317 | 318 | // this returns a quantile on the currently processed values without changing the t-digest 319 | // the value will not represent the unprocessed values 320 | Value quantileProcessed(Value q) const { 321 | if (q < 0 || q > 1) { 322 | LOG(ERROR) << "q should be in [0,1], got " << q; 323 | return NAN; 324 | } 325 | 326 | if (processed_.size() == 0) { 327 | // no sorted means no data, no way to get a quantile 328 | return NAN; 329 | } else if (processed_.size() == 1) { 330 | // with one data point, all quantiles lead to Rome 331 | 332 | return mean(0); 333 | } 334 | 335 | // we know that there are at least two sorted now 336 | auto n = processed_.size(); 337 | 338 | // if values were stored in a sorted array, index would be the offset we are Weighterested in 339 | const auto index = q * processedWeight_; 340 | 341 | // at the boundaries, we return min_ or max_ 342 | if (index < weight(0) / 2.0) { 343 | CHECK_GT(weight(0), 0); 344 | return min_ + 2.0 * index / weight(0) * (mean(0) - min_); 345 | } 346 | 347 | auto iter = std::lower_bound(cumulative_.cbegin(), cumulative_.cend(), index); 348 | 349 | if (iter + 1 != cumulative_.cend()) { 350 | auto i = std::distance(cumulative_.cbegin(), iter); 351 | auto z1 = index - *(iter - 1); 352 | auto z2 = *(iter)-index; 353 | // LOG(INFO) << "z2 " << z2 << " index " << index << " 
z1 " << z1; 354 | return weightedAverage(mean(i - 1), z2, mean(i), z1); 355 | } 356 | 357 | CHECK_LE(index, processedWeight_); 358 | CHECK_GE(index, processedWeight_ - weight(n - 1) / 2.0); 359 | 360 | auto z1 = index - processedWeight_ - weight(n - 1) / 2.0; 361 | auto z2 = weight(n - 1) / 2 - z1; 362 | return weightedAverage(mean(n - 1), z1, max_, z2); 363 | } 364 | 365 | Value compression() const { return compression_; } 366 | 367 | void add(Value x) { add(x, 1); } 368 | 369 | inline void compress() { process(); } 370 | 371 | // add a single centroid to the unprocessed vector, processing previously unprocessed centroids if our limit has 372 | // been reached. 373 | inline bool add(Value x, Weight w) { 374 | if (std::isnan(x)) { 375 | return false; 376 | } 377 | unprocessed_.push_back(Centroid(x, w)); 378 | unprocessedWeight_ += w; 379 | processIfNecessary(); 380 | return true; 381 | } 382 | 383 | inline void add(std::vector<Centroid>::const_iterator iter, std::vector<Centroid>::const_iterator end) { 384 | while (iter != end) { 385 | const size_t diff = std::distance(iter, end); 386 | const size_t room = maxUnprocessed_ - unprocessed_.size(); 387 | auto mid = iter + std::min(diff, room); 388 | while (iter != mid) unprocessed_.push_back(*(iter++)); 389 | if (unprocessed_.size() >= maxUnprocessed_) { 390 | process(); 391 | } 392 | } 393 | } 394 | 395 | private: 396 | Value compression_; 397 | 398 | Value min_ = std::numeric_limits<Value>::max(); 399 | 400 | Value max_ = std::numeric_limits<Value>::min(); 401 | 402 | Index maxProcessed_; 403 | 404 | Index maxUnprocessed_; 405 | 406 | Value processedWeight_ = 0.0; 407 | 408 | Value unprocessedWeight_ = 0.0; 409 | 410 | std::vector<Centroid> processed_; 411 | 412 | std::vector<Centroid> unprocessed_; 413 | 414 | std::vector<Weight> cumulative_; 415 | 416 | // return mean of i-th centroid 417 | inline Value mean(int i) const noexcept { return processed_[i].mean(); } 418 | 419 | // return weight of i-th centroid 420 | inline Weight weight(int i) const noexcept { return processed_[i].weight(); } 421 | 422 | // append all unprocessed centroids into current unprocessed vector 423 | void mergeUnprocessed(const std::vector<const TDigest*>& tdigests) { 424 | if (tdigests.size() == 0) return; 425 | 426 | size_t total = unprocessed_.size(); 427 | for (auto& td : tdigests) { 428 | total += td->unprocessed_.size(); 429 | } 430 | 431 | unprocessed_.reserve(total); 432 | for (auto& td : tdigests) { 433 | unprocessed_.insert(unprocessed_.end(), td->unprocessed_.cbegin(), td->unprocessed_.cend()); 434 | unprocessedWeight_ += td->unprocessedWeight_; 435 | } 436 | } 437 | 438 | // merge all processed centroids together into a single sorted vector 439 | void mergeProcessed(const std::vector<const TDigest*>& tdigests) { 440 | if (tdigests.size() == 0) return; 441 | 442 | size_t total = 0; 443 | CentroidListQueue pq(CentroidListComparator{}); 444 | for (auto& td : tdigests) { 445 | auto& sorted = td->processed_; 446 | auto size = sorted.size(); 447 | if (size > 0) { 448 | pq.push(CentroidList(sorted)); 449 | total += size; 450 | processedWeight_ += td->processedWeight_; 451 | } 452 | } 453 | if (total == 0) return; 454 | 455 | if (processed_.size() > 0) { 456 | pq.push(CentroidList(processed_)); 457 | total += processed_.size(); 458 | } 459 | 460 | std::vector<Centroid> sorted; 461 | LOG(INFO) << "total " << total; 462 | sorted.reserve(total); 463 | 464 | while (!pq.empty()) { 465 | auto best = pq.top(); 466 | pq.pop(); 467 | sorted.push_back(*(best.iter)); 468 | if (best.advance()) pq.push(best); 469 | } 470 | processed_ = std::move(sorted); 471 | if(
processed_.size() > 0 ) { 472 | min_ = std::min(min_, processed_[0].mean()); 473 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 474 | } 475 | } 476 | 477 | inline void processIfNecessary() { 478 | if (isDirty()) { 479 | process(); 480 | } 481 | } 482 | 483 | void updateCumulative() { 484 | const auto n = processed_.size(); 485 | cumulative_.clear(); 486 | cumulative_.reserve(n + 1); 487 | auto previous = 0.0; 488 | for (Index i = 0; i < n; i++) { 489 | auto current = weight(i); 490 | auto halfCurrent = current / 2.0; 491 | cumulative_.push_back(previous + halfCurrent); 492 | previous = previous + current; 493 | } 494 | cumulative_.push_back(previous); 495 | } 496 | 497 | // merges unprocessed_ centroids and processed_ centroids together and processes them 498 | // when complete, unprocessed_ will be empty and processed_ will have at most maxProcessed_ centroids 499 | inline void process() { 500 | CentroidComparator cc; 501 | std::sort(unprocessed_.begin(), unprocessed_.end(), cc); 502 | auto count = unprocessed_.size(); 503 | unprocessed_.insert(unprocessed_.end(), processed_.cbegin(), processed_.cend()); 504 | std::inplace_merge(unprocessed_.begin(), unprocessed_.begin() + count, unprocessed_.end(), cc); 505 | 506 | processedWeight_ += unprocessedWeight_; 507 | unprocessedWeight_ = 0; 508 | processed_.clear(); 509 | 510 | processed_.push_back(unprocessed_[0]); 511 | Weight wSoFar = unprocessed_[0].weight(); 512 | Weight wLimit = processedWeight_ * integratedQ(1.0); 513 | 514 | auto end = unprocessed_.end(); 515 | for (auto iter = unprocessed_.cbegin() + 1; iter < end; iter++) { 516 | auto& centroid = *iter; 517 | Weight projectedW = wSoFar + centroid.weight(); 518 | if (projectedW <= wLimit) { 519 | wSoFar = projectedW; 520 | (processed_.end() - 1)->add(centroid); 521 | } else { 522 | auto k1 = integratedLocation(wSoFar / processedWeight_); 523 | wLimit = processedWeight_ * integratedQ(k1 + 1.0); 524 | wSoFar += centroid.weight(); 525 | processed_.emplace_back(centroid); 526 | } 527 | } 528 | unprocessed_.clear(); 529 | min_ = std::min(min_, processed_[0].mean()); 530 | DLOG(INFO) << "new min_ " << min_; 531 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 532 | DLOG(INFO) << "new max_ " << max_; 533 | updateCumulative(); 534 | } 535 | 536 | inline int checkWeights() { return checkWeights(processed_, processedWeight_); } 537 | 538 | size_t checkWeights(const std::vector<Centroid>& sorted, Value total) { 539 | size_t badWeight = 0; 540 | auto k1 = 0.0; 541 | auto q = 0.0; 542 | for (auto iter = sorted.cbegin(); iter != sorted.cend(); iter++) { 543 | auto w = iter->weight(); 544 | auto dq = w / total; 545 | auto k2 = integratedLocation(q + dq); 546 | if (k2 - k1 > 1 && w != 1) { 547 | LOG(WARNING) << "Oversize centroid at " << std::distance(sorted.cbegin(), iter) << " k1 " << k1 << " k2 " << k2 548 | << " dk " << (k2 - k1) << " w " << w << " q " << q; 549 | badWeight++; 550 | } 551 | if (k2 - k1 > 1.5 && w != 1) { 552 | LOG(ERROR) << "Egregiously Oversize centroid at " << std::distance(sorted.cbegin(), iter) << " k1 " << k1 553 | << " k2 " << k2 << " dk " << (k2 - k1) << " w " << w << " q " << q; 554 | badWeight++; 555 | } 556 | q += dq; 557 | k1 = k2; 558 | } 559 | 560 | return badWeight; 561 | } 562 | 563 | /** 564 | * Converts a quantile into a centroid scale value. The centroid scale is nominally 565 | * the number k of the centroid that a quantile point q should belong to.
Due to 566 | * round-offs, however, we can't align things perfectly without splitting points 567 | * and centroids. We don't want to do that, so we have to allow for offsets. 568 | * In the end, the criterion is that any quantile range that spans a centroid 569 | * scale range more than one should be split across more than one centroid if 570 | * possible. This won't be possible if the quantile range refers to a single point 571 | * or an already existing centroid. 572 | * 573 | * This mapping is steep near q=0 or q=1 so each centroid there will correspond to 574 | * less q range. Near q=0.5, the mapping is flatter so that centroids there will 575 | * represent a larger chunk of quantiles. 576 | * 577 | * @param q The quantile scale value to be mapped. 578 | * @return The centroid scale value corresponding to q. 579 | */ 580 | inline Value integratedLocation(Value q) const { 581 | return compression_ * (std::asin(2.0 * q - 1.0) + M_PI / 2) / M_PI; 582 | } 583 | 584 | inline Value integratedQ(Value k) const { 585 | return (std::sin(std::min(k, compression_) * M_PI / compression_ - M_PI / 2) + 1) / 2; 586 | } 587 | 588 | /** 589 | * Same as {@link #weightedAverageSorted(Value, Value, Value, Value)} but flips 590 | * the order of the variables if x2 is greater than 591 | * x1. 592 | */ 593 | static Value weightedAverage(Value x1, Value w1, Value x2, Value w2) { 594 | return (x1 <= x2) ? weightedAverageSorted(x1, w1, x2, w2) : weightedAverageSorted(x2, w2, x1, w1); 595 | } 596 | 597 | /** 598 | * Compute the weighted average between x1 with a weight of 599 | * w1 and x2 with a weight of w2. 600 | * This expects x1 to be less than or equal to x2 601 | * and is guaranteed to return a number between x1 and 602 | * x2. 603 | */ 604 | static Value weightedAverageSorted(Value x1, Value w1, Value x2, Value w2) { 605 | CHECK_LE(x1, x2); 606 | const Value x = (x1 * w1 + x2 * w2) / (w1 + w2); 607 | return std::max(x1, std::min(x, x2)); 608 | } 609 | 610 | static Value interpolate(Value x, Value x0, Value x1) { return (x - x0) / (x1 - x0); } 611 | 612 | /** 613 | * Computes an interpolated value of a quantile that is between two centroids. 614 | * 615 | * Index is the quantile desired multiplied by the total number of samples - 1. 616 | * 617 | * @param index Denormalized quantile desired 618 | * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. 619 | * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. 620 | * @param previousMean The mean of the previous centroid. 621 | * @param nextMean The mean of the following centroid. 622 | * @return The interpolated mean.
623 | */ 624 | static Value quantile(Value index, Value previousIndex, Value nextIndex, Value previousMean, Value nextMean) { 625 | const auto delta = nextIndex - previousIndex; 626 | const auto previousWeight = (nextIndex - index) / delta; 627 | const auto nextWeight = (index - previousIndex) / delta; 628 | return previousMean * previousWeight + nextMean * nextWeight; 629 | } 630 | }; 631 | 632 | } // namespace tdigest2 633 | 634 | #endif // TDIGEST2_TDIGEST_H_ 635 | 636 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 6 | cd "$DIR" 7 | 8 | go run gen/main.go 9 | go run main.go 10 | g++ -std=c++11 -o cpp.test main.cpp 11 | ./cpp.test 2>/dev/null 12 | rm cpp.test 13 | 14 | go run validate/main.go 15 | -------------------------------------------------------------------------------- /test/validate/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "log" 6 | "math" 7 | "os" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | var dataFiles = []string{ 13 | "small.dat", 14 | "uniform.dat", 15 | "normal.dat", 16 | } 17 | 18 | const ( 19 | cppQExt = ".cpp.quantiles" 20 | goQExt = ".go.quantiles" 21 | 22 | cppCDFExt = ".cpp.cdfs" 23 | goCDFExt = ".go.cdfs" 24 | 25 | epsilon = 1e-6 26 | ) 27 | 28 | func main() { 29 | for _, f := range dataFiles { 30 | // Validate Quantiles 31 | cppQuantiles := loadResults(f + cppQExt) 32 | goQuantiles := loadResults(f + goQExt) 33 | if len(cppQuantiles) != len(goQuantiles) { 34 | log.Fatal("differing number of quantiles results") 35 | } 36 | 37 | for i := range cppQuantiles { 38 | if math.Abs(cppQuantiles[i]-goQuantiles[i]) > epsilon { 39 | log.Fatalf("differing quantile result go: %f cpp: %f", goQuantiles[i], cppQuantiles[i]) 40 | } 41 | } 42 | 43 | // Validate CDFs 44 | cppCDFs := loadResults(f + cppCDFExt) 45 | goCDFs := loadResults(f + goCDFExt) 46 | if len(cppCDFs) != len(goCDFs) { 47 | log.Fatal("differing number of CDFs results") 48 | } 49 | 50 | for i := range cppCDFs { 51 | if math.Abs(cppCDFs[i]-goCDFs[i]) > epsilon { 52 | log.Fatalf("differing CDF result go: %f cpp: %f", goCDFs[i], cppCDFs[i]) 53 | } 54 | } 55 | } 56 | } 57 | 58 | func loadResults(name string) []float64 { 59 | f, err := os.Open(name) 60 | if err != nil { 61 | panic(err) 62 | } 63 | defer f.Close() 64 | s := bufio.NewScanner(f) 65 | var data []float64 66 | for s.Scan() { 67 | parts := strings.SplitN(s.Text(), " ", 2) 68 | x, err := strconv.ParseFloat(parts[0], 64) 69 | if err != nil { 70 | panic(err) 71 | } 72 | data = append(data, x) 73 | } 74 | return data 75 | } 76 | --------------------------------------------------------------------------------
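
A note on the scale functions in the reference header above: integratedLocation and integratedQ are what cap centroid sizes during process(). A centroid may only keep absorbing weight while its quantile range stays within one unit on the k scale. The following self-contained Go sketch (Go is the language of the library itself, but these function names are illustrative only and not part of the package API) mirrors those two helpers and prints, for a few quantiles, the corresponding k value and the quantile at which the next centroid boundary would fall.

package main

import (
	"fmt"
	"math"
)

// integratedLocation mirrors the C++ helper above: it maps a quantile q in [0,1]
// to a centroid scale value k in [0, compression]. The arcsine makes the mapping
// steep near q=0 and q=1 (small centroids in the tails) and flat near q=0.5
// (large centroids around the median).
func integratedLocation(compression, q float64) float64 {
	return compression * (math.Asin(2*q-1) + math.Pi/2) / math.Pi
}

// integratedQ is the inverse mapping from a scale value k back to a quantile.
func integratedQ(compression, k float64) float64 {
	return (math.Sin(math.Min(k, compression)*math.Pi/compression-math.Pi/2) + 1) / 2
}

func main() {
	const compression = 100.0
	for _, q := range []float64{0.001, 0.01, 0.5, 0.99, 0.999} {
		k := integratedLocation(compression, q)
		// integratedQ(compression, k+1) is where the current unit-wide k range ends;
		// process() starts a new centroid once cumulative weight would pass this quantile.
		fmt.Printf("q=%.3f  k=%7.3f  next centroid boundary at q=%.5f\n",
			q, k, integratedQ(compression, k+1))
	}
}

Because the arcsine is steep near q=0 and q=1, a unit-wide k range covers very little quantile mass in the tails, which keeps extreme quantile estimates tight while allowing large centroids near the median.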
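Similarly, quantileProcessed in the header turns q into a weight offset (index = q * processedWeight_) and interpolates between the two centroid means whose cumulative mid-weights bracket that offset; cumulative_[i] holds the total weight up to the midpoint of centroid i, which is why z1 and z2 can serve directly as interpolation weights. Below is a minimal Go sketch of that middle branch; the centroid type and the quantileSketch name are hypothetical, and the min_/max_ tail cases are omitted for brevity.

package main

import (
	"fmt"
	"sort"
)

type centroid struct{ mean, weight float64 }

// quantileSketch interpolates a quantile from already-merged centroids, mirroring
// the middle branch of quantileProcessed: cumulative[i] holds the weight seen up to
// the midpoint of centroid i, and the answer is a weighted average of the two
// surrounding centroid means.
func quantileSketch(cs []centroid, q float64) float64 {
	var total float64
	for _, c := range cs {
		total += c.weight
	}
	cumulative := make([]float64, len(cs))
	prev := 0.0
	for i, c := range cs {
		cumulative[i] = prev + c.weight/2
		prev += c.weight
	}
	index := q * total
	i := sort.SearchFloat64s(cumulative, index) // first midpoint >= index
	if i == 0 || i == len(cs) {
		panic("sketch does not handle the tails below the first or above the last midpoint")
	}
	z1 := index - cumulative[i-1]
	z2 := cumulative[i] - index
	return (cs[i-1].mean*z2 + cs[i].mean*z1) / (z1 + z2)
}

func main() {
	cs := []centroid{{1, 1}, {2, 1}, {3, 1}, {4, 1}}
	fmt.Println(quantileSketch(cs, 0.5)) // 2.5: halfway between the 2nd and 3rd centroid means
}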