├── .gitignore ├── LICENSE ├── README.md ├── centroid.go ├── centroid_test.go ├── go.mod ├── go.sum ├── tdigest.go ├── tdigest_test.go └── test ├── README.md ├── gen └── main.go ├── main.cpp ├── main.go ├── tdigest.h ├── test.sh └── validate └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | /test/*.dat* 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2018 InfluxData Inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tdigest 2 | 3 | This is an implementation of Ted Dunning's [t-digest](https://github.com/tdunning/t-digest/) in Go. 4 | 5 | The implementation is based off [Derrick Burns' C++ implementation](https://github.com/derrickburns/tdigest). 
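In addition to the basic usage shown in the example below, digests that are built independently (for example, one per shard or per time window) can be combined with `Merge` before querying; a minimal sketch:

```go
package main

import (
	"log"

	"github.com/influxdata/tdigest"
)

func main() {
	// Build two digests independently, e.g. one per shard.
	a := tdigest.NewWithCompression(1000)
	b := tdigest.NewWithCompression(1000)
	for i := 0; i < 1000; i++ {
		a.Add(float64(i), 1)
		b.Add(float64(i)+1000, 1)
	}

	// Fold b into a, then query the combined distribution.
	a.Merge(b)
	log.Println("combined 50th:", a.Quantile(0.5))
}
```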
6 | 7 | ## Example 8 | 9 | ```go 10 | package main 11 | 12 | import ( 13 | "log" 14 | 15 | "github.com/influxdata/tdigest" 16 | ) 17 | 18 | func main() { 19 | td := tdigest.NewWithCompression(1000) 20 | for _, x := range []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} { 21 | td.Add(x, 1) 22 | } 23 | 24 | // Compute Quantiles 25 | log.Println("50th", td.Quantile(0.5)) 26 | log.Println("75th", td.Quantile(0.75)) 27 | log.Println("90th", td.Quantile(0.9)) 28 | log.Println("99th", td.Quantile(0.99)) 29 | 30 | // Compute CDFs 31 | log.Println("CDF(1) = ", td.CDF(1)) 32 | log.Println("CDF(2) = ", td.CDF(2)) 33 | log.Println("CDF(3) = ", td.CDF(3)) 34 | log.Println("CDF(4) = ", td.CDF(4)) 35 | log.Println("CDF(5) = ", td.CDF(5)) 36 | } 37 | ``` 38 | -------------------------------------------------------------------------------- /centroid.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "fmt" 5 | "sort" 6 | ) 7 | 8 | // ErrWeightLessThanZero is used when the weight is not able to be processed. 9 | const ErrWeightLessThanZero = Error("centroid weight cannot be less than zero") 10 | 11 | // Error is a domain error encountered while processing tdigests 12 | type Error string 13 | 14 | func (e Error) Error() string { 15 | return string(e) 16 | } 17 | 18 | // Centroid average position of all points in a shape 19 | type Centroid struct { 20 | Mean float64 21 | Weight float64 22 | } 23 | 24 | func (c *Centroid) String() string { 25 | return fmt.Sprintf("{mean: %f weight: %f}", c.Mean, c.Weight) 26 | } 27 | 28 | // Add averages the two centroids together and update this centroid 29 | func (c *Centroid) Add(r Centroid) error { 30 | if r.Weight < 0 { 31 | return ErrWeightLessThanZero 32 | } 33 | if c.Weight != 0 { 34 | c.Weight += r.Weight 35 | c.Mean += r.Weight * (r.Mean - c.Mean) / c.Weight 36 | } else { 37 | c.Weight = r.Weight 38 | c.Mean = r.Mean 39 | } 40 | return nil 41 | } 42 | 43 | // CentroidList is sorted by the Mean of the centroid, ascending. 44 | type CentroidList []Centroid 45 | 46 | // Clear clears the list. 
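// The backing array is retained, so the list can be reused without reallocating.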
47 | func (l *CentroidList) Clear() { 48 | *l = (*l)[:0] 49 | } 50 | 51 | func (l CentroidList) Len() int { return len(l) } 52 | func (l CentroidList) Less(i, j int) bool { return l[i].Mean < l[j].Mean } 53 | func (l CentroidList) Swap(i, j int) { l[i], l[j] = l[j], l[i] } 54 | 55 | // NewCentroidList creates a priority queue for the centroids 56 | func NewCentroidList(centroids []Centroid) CentroidList { 57 | l := CentroidList(centroids) 58 | sort.Sort(l) 59 | return l 60 | } 61 | -------------------------------------------------------------------------------- /centroid_test.go: -------------------------------------------------------------------------------- 1 | package tdigest_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/google/go-cmp/cmp" 7 | "github.com/influxdata/tdigest" 8 | ) 9 | 10 | func TestCentroid_Add(t *testing.T) { 11 | tests := []struct { 12 | name string 13 | c tdigest.Centroid 14 | r tdigest.Centroid 15 | want tdigest.Centroid 16 | wantErr bool 17 | errStr string 18 | }{ 19 | { 20 | name: "error when weight is zero", 21 | r: tdigest.Centroid{ 22 | Weight: -1.0, 23 | }, 24 | wantErr: true, 25 | errStr: "centroid weight cannot be less than zero", 26 | }, 27 | { 28 | name: "zero weight", 29 | c: tdigest.Centroid{ 30 | Weight: 0.0, 31 | Mean: 1.0, 32 | }, 33 | r: tdigest.Centroid{ 34 | Weight: 1.0, 35 | Mean: 2.0, 36 | }, 37 | want: tdigest.Centroid{ 38 | Weight: 1.0, 39 | Mean: 2.0, 40 | }, 41 | }, 42 | { 43 | name: "weight order of magnitude", 44 | c: tdigest.Centroid{ 45 | Weight: 1, 46 | Mean: 1, 47 | }, 48 | r: tdigest.Centroid{ 49 | Weight: 10, 50 | Mean: 10, 51 | }, 52 | want: tdigest.Centroid{ 53 | Weight: 11, 54 | Mean: 9.181818181818182, 55 | }, 56 | }, 57 | } 58 | for _, tt := range tests { 59 | t.Run(tt.name, func(t *testing.T) { 60 | c := &tt.c 61 | if err := c.Add(tt.r); (err != nil) != tt.wantErr { 62 | t.Errorf("Centroid.Add() error = %v, wantErr %v", err, tt.wantErr) 63 | } else if tt.wantErr && err.Error() != tt.errStr { 64 | t.Errorf("Centroid.Add() error.Error() = %s, errStr %v", err.Error(), tt.errStr) 65 | } 66 | if !cmp.Equal(tt.c, tt.want) { 67 | t.Errorf("unexprected centroid -want/+got\n%s", cmp.Diff(tt.want, tt.c)) 68 | } 69 | }) 70 | } 71 | } 72 | 73 | func TestNewCentroidList(t *testing.T) { 74 | tests := []struct { 75 | name string 76 | centroids []tdigest.Centroid 77 | want tdigest.CentroidList 78 | }{ 79 | { 80 | name: "empty list", 81 | }, 82 | { 83 | name: "priority should be by mean ascending", 84 | centroids: []tdigest.Centroid{ 85 | { 86 | Mean: 2.0, 87 | }, 88 | { 89 | Mean: 1.0, 90 | }, 91 | }, 92 | want: tdigest.CentroidList{ 93 | { 94 | Mean: 1.0, 95 | }, 96 | { 97 | Mean: 2.0, 98 | }, 99 | }, 100 | }, 101 | { 102 | name: "single element should be identity", 103 | centroids: []tdigest.Centroid{ 104 | { 105 | Mean: 1.0, 106 | }, 107 | }, 108 | want: tdigest.CentroidList{ 109 | { 110 | Mean: 1.0, 111 | }, 112 | }, 113 | }, 114 | } 115 | for _, tt := range tests { 116 | t.Run(tt.name, func(t *testing.T) { 117 | if got := tdigest.NewCentroidList(tt.centroids); !cmp.Equal(tt.want, got) { 118 | t.Errorf("NewCentroidList() = -want/+got %s", cmp.Diff(tt.want, got)) 119 | } 120 | }) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/influxdata/tdigest 2 | 3 | require ( 4 | github.com/google/go-cmp v0.2.0 5 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de 6 | 
gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca 7 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 // indirect 8 | ) 9 | 10 | go 1.13 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= 2 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 3 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de h1:xSjD6HQTqT0H/k60N5yYBtnN1OEkVy7WIo/DYyxKRO0= 4 | golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 5 | golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 6 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca h1:PupagGYwj8+I4ubCxcmcBRk3VlUWtTg5huQpZR9flmE= 7 | gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= 8 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6 h1:4WsZyVtkthqrHTbDCJfiTs8IWNYE4uvsSDgaV6xpp+o= 9 | gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= 10 | -------------------------------------------------------------------------------- /tdigest.go: -------------------------------------------------------------------------------- 1 | package tdigest 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | ) 7 | 8 | // TDigest is a data structure for accurate on-line accumulation of 9 | // rank-based statistics such as quantiles and trimmed means. 10 | type TDigest struct { 11 | Compression float64 12 | 13 | maxProcessed int 14 | maxUnprocessed int 15 | processed CentroidList 16 | unprocessed CentroidList 17 | cumulative []float64 18 | processedWeight float64 19 | unprocessedWeight float64 20 | min float64 21 | max float64 22 | } 23 | 24 | // New initializes a new distribution with a default compression. 25 | func New() *TDigest { 26 | return NewWithCompression(1000) 27 | } 28 | 29 | // NewWithCompression initializes a new distribution with custom compression. 30 | func NewWithCompression(c float64) *TDigest { 31 | t := &TDigest{ 32 | Compression: c, 33 | } 34 | t.maxProcessed = processedSize(0, t.Compression) 35 | t.maxUnprocessed = unprocessedSize(0, t.Compression) 36 | t.processed = make(CentroidList, 0, t.maxProcessed) 37 | t.unprocessed = make(CentroidList, 0, t.maxUnprocessed+1) 38 | t.Reset() 39 | return t 40 | } 41 | 42 | // Calculate number of bytes needed for a tdigest of size c, 43 | // where c is the compression value 44 | func ByteSizeForCompression(comp float64) int { 45 | c := int(comp) 46 | // // A centroid is 2 float64s, so we need 16 bytes for each centroid 47 | // float_size := 8 48 | // centroid_size := 2 * float_size 49 | 50 | // // Unprocessed and processed can grow up to length c 51 | // unprocessed_size := centroid_size * c 52 | // processed_size := unprocessed_size 53 | 54 | // // the cumulative field can also be of length c, but each item is a single float64 55 | // cumulative_size := float_size * c // <- this could also be unprocessed_size / 2 56 | 57 | // return unprocessed_size + processed_size + cumulative_size 58 | 59 | // // or, more succinctly: 60 | // return float_size * c * 5 61 | 62 | // or even more succinctly 63 | return c * 40 64 | } 65 | 66 | // Reset resets the distribution to its initial state. 
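// Previously allocated centroid buffers are kept for reuse, so subsequent Adds do not reallocate.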
67 | func (t *TDigest) Reset() { 68 | t.processed = t.processed[:0] 69 | t.unprocessed = t.unprocessed[:0] 70 | t.cumulative = t.cumulative[:0] 71 | t.processedWeight = 0 72 | t.unprocessedWeight = 0 73 | t.min = math.MaxFloat64 74 | t.max = -math.MaxFloat64 75 | } 76 | 77 | // Add adds a value x with a weight w to the distribution. 78 | func (t *TDigest) Add(x, w float64) { 79 | t.AddCentroid(Centroid{Mean: x, Weight: w}) 80 | } 81 | 82 | // AddCentroidList can quickly add multiple centroids. 83 | func (t *TDigest) AddCentroidList(c CentroidList) { 84 | // It's possible to optimize this by bulk-copying the slice, but this 85 | // yields just a 1-2% speedup (most time is in process()), so not worth 86 | // the complexity. 87 | for i := range c { 88 | t.AddCentroid(c[i]) 89 | } 90 | } 91 | 92 | // AddCentroid adds a single centroid. 93 | // Weights which are not a number or are <= 0 are ignored, as are NaN means. 94 | func (t *TDigest) AddCentroid(c Centroid) { 95 | if math.IsNaN(c.Mean) || c.Weight <= 0 || math.IsNaN(c.Weight) || math.IsInf(c.Weight, 1) { 96 | return 97 | } 98 | 99 | t.unprocessed = append(t.unprocessed, c) 100 | t.unprocessedWeight += c.Weight 101 | 102 | if t.processed.Len() > t.maxProcessed || 103 | t.unprocessed.Len() > t.maxUnprocessed { 104 | t.process() 105 | } 106 | } 107 | 108 | // Merges the supplied digest into this digest. Functionally equivalent to 109 | // calling t.AddCentroidList(t2.Centroids(nil)), but avoids making an extra 110 | // copy of the CentroidList. 111 | func (t *TDigest) Merge(t2 *TDigest) { 112 | t2.process() 113 | t.AddCentroidList(t2.processed) 114 | } 115 | 116 | func (t *TDigest) process() { 117 | if t.unprocessed.Len() > 0 || 118 | t.processed.Len() > t.maxProcessed { 119 | 120 | // Append all processed centroids to the unprocessed list and sort 121 | t.unprocessed = append(t.unprocessed, t.processed...) 122 | sort.Sort(&t.unprocessed) 123 | 124 | // Reset processed list with first centroid 125 | t.processed.Clear() 126 | t.processed = append(t.processed, t.unprocessed[0]) 127 | 128 | t.processedWeight += t.unprocessedWeight 129 | t.unprocessedWeight = 0 130 | soFar := t.unprocessed[0].Weight 131 | limit := t.processedWeight * t.integratedQ(1.0) 132 | for _, centroid := range t.unprocessed[1:] { 133 | projected := soFar + centroid.Weight 134 | if projected <= limit { 135 | soFar = projected 136 | (&t.processed[t.processed.Len()-1]).Add(centroid) 137 | } else { 138 | k1 := t.integratedLocation(soFar / t.processedWeight) 139 | limit = t.processedWeight * t.integratedQ(k1+1.0) 140 | soFar += centroid.Weight 141 | t.processed = append(t.processed, centroid) 142 | } 143 | } 144 | t.min = math.Min(t.min, t.processed[0].Mean) 145 | t.max = math.Max(t.max, t.processed[t.processed.Len()-1].Mean) 146 | t.unprocessed.Clear() 147 | } 148 | } 149 | 150 | // Centroids returns a copy of processed centroids. 151 | // Useful when aggregating multiple t-digests. 152 | // 153 | // Centroids are appended to the passed CentroidList; if you're re-using a 154 | // buffer, be sure to pass cl[:0]. 155 | func (t *TDigest) Centroids(cl CentroidList) CentroidList { 156 | t.process() 157 | return append(cl, t.processed...) 158 | } 159 | 160 | func (t *TDigest) Count() float64 { 161 | t.process() 162 | 163 | // t.process always updates t.processedWeight to the total count of all 164 | // centroids, so we don't need to re-count here. 
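// Values rejected by AddCentroid (NaN means, or NaN, non-positive, or infinite weights) never contribute to this total.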
165 | return t.processedWeight 166 | } 167 | 168 | func (t *TDigest) updateCumulative() { 169 | // Weight can only increase, so the final cumulative value will always be 170 | // either equal to, or less than, the total weight. If they are the same, 171 | // then nothing has changed since the last update. 172 | if len(t.cumulative) > 0 && t.cumulative[len(t.cumulative)-1] == t.processedWeight { 173 | return 174 | } 175 | 176 | if n := t.processed.Len() + 1; n <= cap(t.cumulative) { 177 | t.cumulative = t.cumulative[:n] 178 | } else { 179 | t.cumulative = make([]float64, n) 180 | } 181 | 182 | prev := 0.0 183 | for i, centroid := range t.processed { 184 | cur := centroid.Weight 185 | t.cumulative[i] = prev + cur/2.0 186 | prev = prev + cur 187 | } 188 | t.cumulative[t.processed.Len()] = prev 189 | } 190 | 191 | // Quantile returns the (approximate) quantile of 192 | // the distribution. Accepted values for q are between 0.0 and 1.0. 193 | // Returns NaN if Count is zero or bad inputs. 194 | func (t *TDigest) Quantile(q float64) float64 { 195 | t.process() 196 | t.updateCumulative() 197 | if q < 0 || q > 1 || t.processed.Len() == 0 { 198 | return math.NaN() 199 | } 200 | if t.processed.Len() == 1 { 201 | return t.processed[0].Mean 202 | } 203 | index := q * t.processedWeight 204 | if index <= t.processed[0].Weight/2.0 { 205 | return t.min + 2.0*index/t.processed[0].Weight*(t.processed[0].Mean-t.min) 206 | } 207 | 208 | lower := sort.Search(len(t.cumulative), func(i int) bool { 209 | return t.cumulative[i] >= index 210 | }) 211 | 212 | if lower+1 != len(t.cumulative) { 213 | z1 := index - t.cumulative[lower-1] 214 | z2 := t.cumulative[lower] - index 215 | return weightedAverage(t.processed[lower-1].Mean, z2, t.processed[lower].Mean, z1) 216 | } 217 | 218 | z1 := index - t.processedWeight - t.processed[lower-1].Weight/2.0 219 | z2 := (t.processed[lower-1].Weight / 2.0) - z1 220 | return weightedAverage(t.processed[t.processed.Len()-1].Mean, z1, t.max, z2) 221 | } 222 | 223 | // CDF returns the cumulative distribution function for a given value x. 
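// The result is the approximate fraction of the total added weight that lies at or below x, in the range [0, 1].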
224 | func (t *TDigest) CDF(x float64) float64 { 225 | t.process() 226 | t.updateCumulative() 227 | switch t.processed.Len() { 228 | case 0: 229 | return 0.0 230 | case 1: 231 | width := t.max - t.min 232 | if x <= t.min { 233 | return 0.0 234 | } 235 | if x >= t.max { 236 | return 1.0 237 | } 238 | if (x - t.min) <= width { 239 | // min and max are too close together to do any viable interpolation 240 | return 0.5 241 | } 242 | return (x - t.min) / width 243 | } 244 | 245 | if x <= t.min { 246 | return 0.0 247 | } 248 | if x >= t.max { 249 | return 1.0 250 | } 251 | m0 := t.processed[0].Mean 252 | // Left Tail 253 | if x <= m0 { 254 | if m0-t.min > 0 { 255 | return (x - t.min) / (m0 - t.min) * t.processed[0].Weight / t.processedWeight / 2.0 256 | } 257 | return 0.0 258 | } 259 | // Right Tail 260 | mn := t.processed[t.processed.Len()-1].Mean 261 | if x >= mn { 262 | if t.max-mn > 0.0 { 263 | return 1.0 - (t.max-x)/(t.max-mn)*t.processed[t.processed.Len()-1].Weight/t.processedWeight/2.0 264 | } 265 | return 1.0 266 | } 267 | 268 | upper := sort.Search(t.processed.Len(), func(i int) bool { 269 | return t.processed[i].Mean > x 270 | }) 271 | 272 | z1 := x - t.processed[upper-1].Mean 273 | z2 := t.processed[upper].Mean - x 274 | return weightedAverage(t.cumulative[upper-1], z2, t.cumulative[upper], z1) / t.processedWeight 275 | } 276 | 277 | func (t *TDigest) integratedQ(k float64) float64 { 278 | return (math.Sin(math.Min(k, t.Compression)*math.Pi/t.Compression-math.Pi/2.0) + 1.0) / 2.0 279 | } 280 | 281 | func (t *TDigest) integratedLocation(q float64) float64 { 282 | return t.Compression * (math.Asin(2.0*q-1.0) + math.Pi/2.0) / math.Pi 283 | } 284 | 285 | func weightedAverage(x1, w1, x2, w2 float64) float64 { 286 | if x1 <= x2 { 287 | return weightedAverageSorted(x1, w1, x2, w2) 288 | } 289 | return weightedAverageSorted(x2, w2, x1, w1) 290 | } 291 | 292 | func weightedAverageSorted(x1, w1, x2, w2 float64) float64 { 293 | x := (x1*w1 + x2*w2) / (w1 + w2) 294 | return math.Max(x1, math.Min(x, x2)) 295 | } 296 | 297 | func processedSize(size int, compression float64) int { 298 | if size == 0 { 299 | return int(2 * math.Ceil(compression)) 300 | } 301 | return size 302 | } 303 | 304 | func unprocessedSize(size int, compression float64) int { 305 | if size == 0 { 306 | return int(8 * math.Ceil(compression)) 307 | } 308 | return size 309 | } 310 | -------------------------------------------------------------------------------- /tdigest_test.go: -------------------------------------------------------------------------------- 1 | package tdigest_test 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | "reflect" 7 | "testing" 8 | 9 | "github.com/influxdata/tdigest" 10 | "golang.org/x/exp/rand" 11 | "gonum.org/v1/gonum/stat/distuv" 12 | ) 13 | 14 | const ( 15 | N = 1e6 16 | Mu = 10 17 | Sigma = 3 18 | 19 | seed = 42 20 | ) 21 | 22 | // NormalData is a slice of N random values that are normaly distributed with mean Mu and standard deviation Sigma. 
23 | var NormalData []float64 24 | var UniformData []float64 25 | 26 | var NormalDigest *tdigest.TDigest 27 | var UniformDigest *tdigest.TDigest 28 | 29 | func init() { 30 | dist := distuv.Normal{ 31 | Mu: Mu, 32 | Sigma: Sigma, 33 | Src: rand.New(rand.NewSource(seed)), 34 | } 35 | uniform := rand.New(rand.NewSource(seed)) 36 | 37 | UniformData = make([]float64, N) 38 | UniformDigest = tdigest.NewWithCompression(1000) 39 | 40 | NormalData = make([]float64, N) 41 | NormalDigest = tdigest.NewWithCompression(1000) 42 | 43 | for i := range NormalData { 44 | NormalData[i] = dist.Rand() 45 | NormalDigest.Add(NormalData[i], 1) 46 | 47 | UniformData[i] = uniform.Float64() * 100 48 | UniformDigest.Add(UniformData[i], 1) 49 | } 50 | } 51 | 52 | // Compares the quantile results of two digests, and fails if the 53 | // fractional err exceeds maxErr. 54 | // Always fails if the total count differs. 55 | func compareQuantiles(td1, td2 *tdigest.TDigest, maxErr float64) error { 56 | if td1.Count() != td2.Count() { 57 | return fmt.Errorf("counts are not equal, %d vs %d", int64(td1.Count()), int64(td2.Count())) 58 | } 59 | for q := 0.05; q < 1; q += 0.05 { 60 | if math.Abs(td1.Quantile(q)-td2.Quantile(q))/td1.Quantile(q) > maxErr { 61 | return fmt.Errorf("quantile %g differs, %g vs %g", q, td1.Quantile(q), td2.Quantile(q)) 62 | } 63 | } 64 | return nil 65 | } 66 | 67 | // All Add methods should yield equivalent results. 68 | func TestTdigest_AddFuncs(t *testing.T) { 69 | centroids := NormalDigest.Centroids(nil) 70 | 71 | addDigest := tdigest.NewWithCompression(100) 72 | addCentroidDigest := tdigest.NewWithCompression(100) 73 | addCentroidListDigest := tdigest.NewWithCompression(100) 74 | 75 | for _, c := range centroids { 76 | addDigest.Add(c.Mean, c.Weight) 77 | addCentroidDigest.AddCentroid(c) 78 | } 79 | addCentroidListDigest.AddCentroidList(centroids) 80 | 81 | if err := compareQuantiles(addDigest, addCentroidDigest, 0.01); err != nil { 82 | t.Errorf("AddCentroid() differs from from Add(): %s", err.Error()) 83 | } 84 | if err := compareQuantiles(addDigest, addCentroidListDigest, 0.01); err != nil { 85 | t.Errorf("AddCentroidList() differs from from Add(): %s", err.Error()) 86 | } 87 | } 88 | 89 | func TestTdigest_Count(t *testing.T) { 90 | tests := []struct { 91 | name string 92 | data []float64 93 | digest *tdigest.TDigest 94 | want float64 95 | }{ 96 | { 97 | name: "empty", 98 | data: []float64{}, 99 | want: 0, 100 | }, 101 | { 102 | name: "not empty", 103 | data: []float64{5, 4}, 104 | want: 2, 105 | }, 106 | } 107 | 108 | for _, tt := range tests { 109 | t.Run(tt.name, func(t *testing.T) { 110 | td := tt.digest 111 | if td == nil { 112 | td = tdigest.NewWithCompression(1000) 113 | for _, x := range tt.data { 114 | td.Add(x, 1) 115 | } 116 | } 117 | got := td.Count() 118 | if got != tt.want { 119 | t.Errorf("unexpected count, got %g want %g", got, tt.want) 120 | } 121 | }) 122 | } 123 | 124 | got := NormalDigest.Count() 125 | want := float64(len(NormalData)) 126 | if got != want { 127 | t.Errorf("unexpected count for NormalDigest, got %g want %g", got, want) 128 | } 129 | 130 | got = UniformDigest.Count() 131 | want = float64(len(UniformData)) 132 | if got != want { 133 | t.Errorf("unexpected count for UniformDigest, got %g want %g", got, want) 134 | } 135 | } 136 | 137 | func TestTdigest_Quantile(t *testing.T) { 138 | tests := []struct { 139 | name string 140 | data []float64 141 | digest *tdigest.TDigest 142 | quantile float64 143 | want float64 144 | }{ 145 | { 146 | name: "increasing", 147 | 
quantile: 0.5, 148 | data: []float64{1, 2, 3, 4, 5}, 149 | want: 3, 150 | }, 151 | { 152 | name: "data in decreasing order", 153 | quantile: 0.25, 154 | data: []float64{555.349107, 432.842597}, 155 | want: 432.842597, 156 | }, 157 | { 158 | name: "small", 159 | quantile: 0.5, 160 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 161 | want: 3, 162 | }, 163 | { 164 | name: "small 99 (max)", 165 | quantile: 0.99, 166 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 167 | want: 5, 168 | }, 169 | { 170 | name: "normal 50", 171 | quantile: 0.5, 172 | digest: NormalDigest, 173 | want: 10.000673533707138, 174 | }, 175 | { 176 | name: "normal 90", 177 | quantile: 0.9, 178 | digest: NormalDigest, 179 | want: 13.842132136909889, 180 | }, 181 | { 182 | name: "uniform 50", 183 | quantile: 0.5, 184 | digest: UniformDigest, 185 | want: 49.992502345843555, 186 | }, 187 | { 188 | name: "uniform 90", 189 | quantile: 0.9, 190 | digest: UniformDigest, 191 | want: 89.98281777095822, 192 | }, 193 | { 194 | name: "uniform 99", 195 | quantile: 0.99, 196 | digest: UniformDigest, 197 | want: 98.98503400959562, 198 | }, 199 | { 200 | name: "uniform 99.9", 201 | quantile: 0.999, 202 | digest: UniformDigest, 203 | want: 99.90103781043621, 204 | }, 205 | } 206 | for _, tt := range tests { 207 | t.Run(tt.name, func(t *testing.T) { 208 | td := tt.digest 209 | if td == nil { 210 | td = tdigest.NewWithCompression(1000) 211 | for _, x := range tt.data { 212 | td.Add(x, 1) 213 | } 214 | } 215 | got := td.Quantile(tt.quantile) 216 | if got != tt.want { 217 | t.Errorf("unexpected quantile %f, got %g want %g", tt.quantile, got, tt.want) 218 | } 219 | }) 220 | } 221 | } 222 | 223 | func TestTdigest_CDFs(t *testing.T) { 224 | tests := []struct { 225 | name string 226 | data []float64 227 | digest *tdigest.TDigest 228 | cdf float64 229 | want float64 230 | }{ 231 | { 232 | name: "increasing", 233 | cdf: 3, 234 | data: []float64{1, 2, 3, 4, 5}, 235 | want: 0.5, 236 | }, 237 | { 238 | name: "small", 239 | cdf: 4, 240 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 241 | want: 0.75, 242 | }, 243 | { 244 | name: "small max", 245 | cdf: 5, 246 | data: []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1}, 247 | want: 1, 248 | }, 249 | { 250 | name: "normal mean", 251 | cdf: 10, 252 | data: NormalData, 253 | want: 0.4999156505250766, 254 | }, 255 | { 256 | name: "normal high", 257 | cdf: -100, 258 | data: NormalData, 259 | want: 0, 260 | }, 261 | { 262 | name: "normal low", 263 | cdf: 110, 264 | data: NormalData, 265 | want: 1, 266 | }, 267 | { 268 | name: "uniform 50", 269 | cdf: 50, 270 | data: UniformData, 271 | want: 0.5000756133965755, 272 | }, 273 | { 274 | name: "uniform min", 275 | cdf: 0, 276 | data: UniformData, 277 | want: 0, 278 | }, 279 | { 280 | name: "uniform max", 281 | cdf: 100, 282 | data: UniformData, 283 | want: 1, 284 | }, 285 | { 286 | name: "uniform 10", 287 | cdf: 10, 288 | data: UniformData, 289 | want: 0.09987932577650871, 290 | }, 291 | { 292 | name: "uniform 90", 293 | cdf: 90, 294 | data: UniformData, 295 | want: 0.9001667885256108, 296 | }, 297 | } 298 | for _, tt := range tests { 299 | t.Run(tt.name, func(t *testing.T) { 300 | td := tt.digest 301 | if td == nil { 302 | td = tdigest.NewWithCompression(1000) 303 | for _, x := range tt.data { 304 | td.Add(x, 1) 305 | } 306 | } 307 | got := td.CDF(tt.cdf) 308 | if got != tt.want { 309 | t.Errorf("unexpected CDF %f, got %g want %g", tt.cdf, got, tt.want) 310 | } 311 | }) 312 | } 313 | } 314 | 315 | func TestTdigest_Reset(t *testing.T) { 316 | td := tdigest.New() 317 | for 
_, x := range NormalData { 318 | td.Add(x, 1) 319 | } 320 | q1 := td.Quantile(0.9) 321 | 322 | td.Reset() 323 | for _, x := range NormalData { 324 | td.Add(x, 1) 325 | } 326 | if q2 := td.Quantile(0.9); q2 != q1 { 327 | t.Errorf("unexpected quantile, got %g want %g", q2, q1) 328 | } 329 | } 330 | 331 | func TestTdigest_OddInputs(t *testing.T) { 332 | td := tdigest.New() 333 | td.Add(math.NaN(), 1) 334 | td.Add(1, math.NaN()) 335 | td.Add(1, 0) 336 | td.Add(1, -1000) 337 | if td.Count() != 0 { 338 | t.Error("invalid value was allowed to be added") 339 | } 340 | 341 | // Infinite values are allowed. 342 | td.Add(1, 1) 343 | td.Add(2, 1) 344 | td.Add(math.Inf(1), 1) 345 | if q := td.Quantile(0.5); q != 2 { 346 | t.Errorf("expected median value 2, got %f", q) 347 | } 348 | if q := td.Quantile(0.9); !math.IsInf(q, 1) { 349 | t.Errorf("expected +Inf 0.9 quantile, got %f", q) 350 | } 351 | } 352 | 353 | func TestTdigest_Merge(t *testing.T) { 354 | // Repeat merges enough times to ensure we call compress() 355 | numRepeats := 20 356 | addDigest := tdigest.New() 357 | for i := 0; i < numRepeats; i++ { 358 | for _, c := range NormalDigest.Centroids(nil) { 359 | addDigest.AddCentroid(c) 360 | } 361 | for _, c := range UniformDigest.Centroids(nil) { 362 | addDigest.AddCentroid(c) 363 | } 364 | } 365 | 366 | mergeDigest := tdigest.New() 367 | for i := 0; i < numRepeats; i++ { 368 | mergeDigest.Merge(NormalDigest) 369 | mergeDigest.Merge(UniformDigest) 370 | } 371 | 372 | if err := compareQuantiles(addDigest, mergeDigest, 0.001); err != nil { 373 | t.Errorf("AddCentroid() differs from Merge(): %s", err.Error()) 374 | } 375 | 376 | // Empty merge does nothing and has no effect on underlying centroids. 377 | c1 := addDigest.Centroids(nil) 378 | addDigest.Merge(tdigest.New()) 379 | c2 := addDigest.Centroids(nil) 380 | if !reflect.DeepEqual(c1, c2) { 381 | t.Error("Merging an empty digest altered data") 382 | } 383 | } 384 | 385 | var quantiles = []float64{0.1, 0.5, 0.9, 0.99, 0.999} 386 | 387 | func BenchmarkTDigest_Add(b *testing.B) { 388 | for n := 0; n < b.N; n++ { 389 | td := tdigest.NewWithCompression(1000) 390 | for _, x := range NormalData { 391 | td.Add(x, 1) 392 | } 393 | } 394 | } 395 | 396 | func BenchmarkTDigest_AddCentroid(b *testing.B) { 397 | centroids := make(tdigest.CentroidList, len(NormalData)) 398 | for i := range centroids { 399 | centroids[i].Mean = NormalData[i] 400 | centroids[i].Weight = 1 401 | } 402 | 403 | b.ResetTimer() 404 | for n := 0; n < b.N; n++ { 405 | td := tdigest.NewWithCompression(1000) 406 | for i := range centroids { 407 | td.AddCentroid(centroids[i]) 408 | } 409 | } 410 | } 411 | 412 | func BenchmarkTDigest_AddCentroidList(b *testing.B) { 413 | centroids := make(tdigest.CentroidList, len(NormalData)) 414 | for i := range centroids { 415 | centroids[i].Mean = NormalData[i] 416 | centroids[i].Weight = 1 417 | } 418 | 419 | b.ResetTimer() 420 | for n := 0; n < b.N; n++ { 421 | td := tdigest.NewWithCompression(1000) 422 | td.AddCentroidList(centroids) 423 | } 424 | } 425 | 426 | func BenchmarkTDigest_Merge(b *testing.B) { 427 | b.Run("AddCentroid", func(b *testing.B) { 428 | var cl tdigest.CentroidList 429 | td := tdigest.New() 430 | for n := 0; n < b.N; n++ { 431 | cl = NormalDigest.Centroids(cl[:0]) 432 | for i := range cl { 433 | td.AddCentroid(cl[i]) 434 | } 435 | } 436 | }) 437 | b.Run("Merge", func(b *testing.B) { 438 | td := tdigest.New() 439 | for n := 0; n < b.N; n++ { 440 | td.Merge(NormalDigest) 441 | } 442 | }) 443 | } 444 | 445 | func
BenchmarkTDigest_Quantile(b *testing.B) { 446 | td := tdigest.NewWithCompression(1000) 447 | for _, x := range NormalData { 448 | td.Add(x, 1) 449 | } 450 | b.ResetTimer() 451 | var x float64 452 | for n := 0; n < b.N; n++ { 453 | for _, q := range quantiles { 454 | x += td.Quantile(q) 455 | } 456 | } 457 | } 458 | 459 | func TestTdigest_Centroids(t *testing.T) { 460 | tests := []struct { 461 | name string 462 | data []float64 463 | digest *tdigest.TDigest 464 | want tdigest.CentroidList 465 | }{ 466 | { 467 | name: "increasing", 468 | data: []float64{1, 2, 3, 4, 5}, 469 | want: tdigest.CentroidList{ 470 | tdigest.Centroid{ 471 | Mean: 1.0, 472 | Weight: 1.0, 473 | }, 474 | 475 | tdigest.Centroid{ 476 | Mean: 2.5, 477 | Weight: 2.0, 478 | }, 479 | 480 | tdigest.Centroid{ 481 | Mean: 4.0, 482 | Weight: 1.0, 483 | }, 484 | 485 | tdigest.Centroid{ 486 | Mean: 5.0, 487 | Weight: 1.0, 488 | }, 489 | }, 490 | }, 491 | } 492 | 493 | for _, tt := range tests { 494 | t.Run(tt.name, func(t *testing.T) { 495 | var got tdigest.CentroidList 496 | td := tt.digest 497 | if td == nil { 498 | td = tdigest.NewWithCompression(3) 499 | for _, x := range tt.data { 500 | td.Add(x, 1) 501 | } 502 | } 503 | got = td.Centroids(got[:0]) 504 | if !reflect.DeepEqual(got, tt.want) { 505 | t.Errorf("unexpected list got %g want %g", got, tt.want) 506 | } 507 | }) 508 | } 509 | } 510 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | This directory contains two programs, `main.go` and `main.cpp`, which both read three input files, compute various quantiles and CDF values, and write out their results. 4 | The purpose of these programs is to show that the Go implementation is accurate when compared to the C++ implementation. 5 | 6 | The tests can be run using `test.sh`.
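Both programs write one result per line, so corresponding output files can be compared directly. Purely as an illustration of that comparison (this is not the repository's own `validate` program), a standalone checker might look like the following sketch, where the tolerance is an arbitrary choice:

```go
package main

import (
	"bufio"
	"fmt"
	"math"
	"os"
	"strconv"
)

// readFloats reads one float64 per line, the format written by both test programs.
func readFloats(name string) []float64 {
	f, err := os.Open(name)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	var vals []float64
	s := bufio.NewScanner(f)
	for s.Scan() {
		x, err := strconv.ParseFloat(s.Text(), 64)
		if err != nil {
			panic(err)
		}
		vals = append(vals, x)
	}
	return vals
}

func main() {
	// Usage: checker <go results file> <cpp results file>
	a, b := readFloats(os.Args[1]), readFloats(os.Args[2])
	if len(a) != len(b) {
		panic("result files have different lengths")
	}
	for i := range a {
		if math.Abs(a[i]-b[i]) > 1e-9 {
			fmt.Printf("line %d differs: %g vs %g\n", i+1, a[i], b[i])
		}
	}
}
```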
7 | 8 | -------------------------------------------------------------------------------- /test/gen/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "os" 5 | "strconv" 6 | 7 | "golang.org/x/exp/rand" 8 | "gonum.org/v1/gonum/stat/distuv" 9 | ) 10 | 11 | const ( 12 | N = 1e6 13 | Mu = 10 14 | Sigma = 3 15 | 16 | seed = 42 17 | ) 18 | 19 | func main() { 20 | // Generate uniform and normal data 21 | uniform := rand.New(rand.NewSource(seed)) 22 | dist := distuv.Normal{ 23 | Mu: Mu, 24 | Sigma: Sigma, 25 | Src: rand.New(rand.NewSource(seed)), 26 | } 27 | 28 | uniformData := make([]float64, N) 29 | normalData := make([]float64, N) 30 | for i := range normalData { 31 | normalData[i] = dist.Rand() 32 | uniformData[i] = uniform.Float64() * 100 33 | } 34 | 35 | smallData := []float64{1, 2, 3, 4, 5, 5, 4, 3, 2, 1} 36 | 37 | writeData("uniform.dat", uniformData) 38 | writeData("normal.dat", normalData) 39 | writeData("small.dat", smallData) 40 | } 41 | 42 | func writeData(name string, data []float64) { 43 | f, err := os.Create(name) 44 | if err != nil { 45 | panic(err) 46 | } 47 | defer f.Close() 48 | 49 | buf := make([]byte, 0, 64) 50 | for _, x := range data { 51 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) 52 | _, err := f.Write(buf) 53 | if err != nil { 54 | panic(err) 55 | } 56 | _, err = f.Write([]byte{'\n'}) 57 | if err != nil { 58 | panic(err) 59 | } 60 | buf = buf[0:0] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | #include "tdigest.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace tdigest; 12 | 13 | double quantiles[7] = { 14 | 0.1, 15 | 0.2, 16 | 0.5, 17 | 0.75, 18 | 0.9, 19 | 0.99, 20 | 0.999, 21 | }; 22 | 23 | 24 | std::string dataFiles[3] = {"small.dat", "uniform.dat", "normal.dat"}; 25 | double cdfs[3][5] = { 26 | // small.dat 27 | {0, 1, 4, 5, 6}, 28 | // uniform.dat 29 | {-1, 0, 50, 100, 101}, 30 | // normal.dat 31 | {-100, 7, 10, 13, 110}, 32 | }; 33 | 34 | 35 | std::vector loadData(std::string name) { 36 | std::ifstream f (name); 37 | std::vector data; 38 | 39 | f >> std::setprecision(std::numeric_limits::digits10 + 1); 40 | double x; 41 | while (f >> x) { 42 | data.push_back(x); 43 | } 44 | return data; 45 | } 46 | 47 | TDigest* createTDigest(std::vector data){ 48 | TDigest* td = new TDigest(1000); 49 | for (auto x : data) { 50 | td->add(x); 51 | } 52 | return td; 53 | } 54 | 55 | std::vector computeQuantiles(TDigest* td){ 56 | std::vector results; 57 | for (int i = 0; i < 7; i++) { 58 | double q = td->quantile(quantiles[i]); 59 | results.push_back(q); 60 | } 61 | return results; 62 | } 63 | 64 | std::vector computeCDFs(TDigest* td, double cdfs[5]) { 65 | std::vector results; 66 | for (int i = 0; i < 5; i++) { 67 | double p = td->cdf(cdfs[i]); 68 | results.push_back(p); 69 | } 70 | 71 | return results; 72 | } 73 | 74 | void writeResults(std::string name, std::vector results){ 75 | std::ofstream f (name); 76 | 77 | f << std::setprecision(std::numeric_limits::digits10 + 1); 78 | for (auto x : results) { 79 | f << x << std::endl; 80 | } 81 | } 82 | 83 | int main() { 84 | for (int i = 0; i < 3; i++) { 85 | std::vector data = loadData(dataFiles[i]); 86 | TDigest* td = createTDigest(data); 87 | auto results = computeQuantiles(td); 88 | writeResults(dataFiles[i] + 
".cpp.quantiles", results); 89 | results = computeCDFs(td, cdfs[i]); 90 | writeResults(dataFiles[i] + ".cpp.cdfs", results); 91 | } 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /test/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "os" 6 | "strconv" 7 | 8 | "github.com/influxdata/tdigest" 9 | ) 10 | 11 | var quantiles = []float64{ 12 | 0.1, 13 | 0.2, 14 | 0.5, 15 | 0.75, 16 | 0.9, 17 | 0.99, 18 | 0.999, 19 | } 20 | 21 | var cdfs = map[string][]float64{ 22 | "small.dat": []float64{0, 1, 4, 5, 6}, 23 | "uniform.dat": []float64{-1, 0, 50, 100, 101}, 24 | "normal.dat": []float64{-100, 7, 10, 13, 110}, 25 | } 26 | 27 | var dataFiles = []string{ 28 | "small.dat", 29 | "uniform.dat", 30 | "normal.dat", 31 | } 32 | 33 | func main() { 34 | for _, f := range dataFiles { 35 | data := loadData(f) 36 | td := createTdigest(data) 37 | results := computeQuantiles(td, quantiles) 38 | writeResults(f+".go.quantiles", results) 39 | results = computeCDFs(td, cdfs[f]) 40 | writeResults(f+".go.cdfs", results) 41 | } 42 | } 43 | 44 | func loadData(name string) []float64 { 45 | f, err := os.Open(name) 46 | if err != nil { 47 | panic(err) 48 | } 49 | defer f.Close() 50 | s := bufio.NewScanner(f) 51 | var data []float64 52 | for s.Scan() { 53 | x, err := strconv.ParseFloat(s.Text(), 64) 54 | if err != nil { 55 | panic(err) 56 | } 57 | data = append(data, x) 58 | } 59 | return data 60 | } 61 | 62 | func createTdigest(data []float64) *tdigest.TDigest { 63 | td := tdigest.NewWithCompression(1000) 64 | for _, x := range data { 65 | td.Add(x, 1) 66 | } 67 | return td 68 | } 69 | 70 | func computeQuantiles(td *tdigest.TDigest, quantiles []float64) (r []float64) { 71 | for _, q := range quantiles { 72 | r = append(r, td.Quantile(q)) 73 | } 74 | return 75 | } 76 | 77 | func computeCDFs(td *tdigest.TDigest, cdfs []float64) (r []float64) { 78 | for _, x := range cdfs { 79 | r = append(r, td.CDF(x)) 80 | } 81 | return 82 | } 83 | 84 | func writeResults(name string, results []float64) { 85 | f, err := os.Create(name) 86 | if err != nil { 87 | panic(err) 88 | } 89 | defer f.Close() 90 | buf := make([]byte, 0, 64) 91 | for _, x := range results { 92 | buf = strconv.AppendFloat(buf, x, 'f', -1, 64) 93 | _, err := f.Write(buf) 94 | if err != nil { 95 | panic(err) 96 | } 97 | _, err = f.Write([]byte{'\n'}) 98 | if err != nil { 99 | panic(err) 100 | } 101 | buf = buf[0:0] 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /test/tdigest.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Derrick R. Burns under one or more 3 | * contributor license agreements. See the NOTICES file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | #ifndef TDIGEST2_TDIGEST_H_ 19 | #define TDIGEST2_TDIGEST_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // Modifed from original to remove all external depedencies. 30 | #define DLOG(l) std::cerr 31 | #define LOG(l) std::cerr 32 | 33 | #define CHECK_LE(x1, x2) 34 | #define CHECK_GT(x1, x2) 35 | #define CHECK_GE(x1, x2) 36 | 37 | namespace tdigest { 38 | 39 | using Value = double; 40 | using Weight = double; 41 | using Index = size_t; 42 | 43 | const size_t kHighWater = 40000; 44 | 45 | class Centroid { 46 | public: 47 | Centroid() : Centroid(0.0, 0.0) {} 48 | 49 | Centroid(Value mean, Weight weight) : mean_(mean), weight_(weight) {} 50 | 51 | inline Value mean() const noexcept { return mean_; } 52 | 53 | inline Weight weight() const noexcept { return weight_; } 54 | 55 | inline void add(const Centroid& c) { 56 | CHECK_GT(c.weight_, 0); 57 | if( weight_ != 0.0 ) { 58 | weight_ += c.weight_; 59 | mean_ += c.weight_ * (c.mean_ - mean_) / weight_; 60 | } else { 61 | weight_ = c.weight_; 62 | mean_ = c.mean_; 63 | } 64 | } 65 | 66 | private: 67 | Value mean_ = 0; 68 | Weight weight_ = 0; 69 | }; 70 | 71 | struct CentroidList { 72 | CentroidList(const std::vector& s) : iter(s.cbegin()), end(s.cend()) {} 73 | std::vector::const_iterator iter; 74 | std::vector::const_iterator end; 75 | 76 | bool advance() { return ++iter != end; } 77 | }; 78 | 79 | class CentroidListComparator { 80 | public: 81 | CentroidListComparator() {} 82 | 83 | bool operator()(const CentroidList& left, const CentroidList& right) const { 84 | return left.iter->mean() > right.iter->mean(); 85 | } 86 | }; 87 | 88 | using CentroidListQueue = std::priority_queue, CentroidListComparator>; 89 | 90 | struct CentroidComparator { 91 | bool operator()(const Centroid& a, const Centroid& b) const { return a.mean() < b.mean(); } 92 | }; 93 | 94 | class TDigest { 95 | class TDigestComparator { 96 | public: 97 | TDigestComparator() {} 98 | 99 | bool operator()(const TDigest* left, const TDigest* right) const { return left->totalSize() > right->totalSize(); } 100 | }; 101 | 102 | using TDigestQueue = std::priority_queue, TDigestComparator>; 103 | 104 | public: 105 | TDigest() : TDigest(1000) {} 106 | 107 | explicit TDigest(Value compression) : TDigest(compression, 0) {} 108 | 109 | TDigest(Value compression, Index bufferSize) : TDigest(compression, bufferSize, 0) {} 110 | 111 | TDigest(Value compression, Index unmergedSize, Index mergedSize) 112 | : compression_(compression), 113 | maxProcessed_(processedSize(mergedSize, compression)), 114 | maxUnprocessed_(unprocessedSize(unmergedSize, compression)) { 115 | processed_.reserve(maxProcessed_); 116 | unprocessed_.reserve(maxUnprocessed_ + 1); 117 | } 118 | 119 | TDigest(std::vector&& processed, std::vector&& unprocessed, Value compression, 120 | Index unmergedSize, Index mergedSize) 121 | : TDigest(compression, unmergedSize, mergedSize) { 122 | processed_ = std::move(processed); 123 | unprocessed_ = std::move(unprocessed); 124 | 125 | processedWeight_ = weight(processed_); 126 | unprocessedWeight_ = weight(unprocessed_); 127 | if( processed_.size() > 0 ) { 128 | min_ = std::min(min_, processed_[0].mean()); 129 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 130 | } 131 | updateCumulative(); 132 | } 133 | 134 | static Weight weight(std::vector& centroids) noexcept { 135 | Weight w = 0.0; 136 | for (auto centroid : centroids) { 137 | w += centroid.weight(); 138 | } 139 | return w; 140 | } 141 | 142 
| TDigest& operator=(TDigest&& o) { 143 | compression_ = o.compression_; 144 | maxProcessed_ = o.maxProcessed_; 145 | maxUnprocessed_ = o.maxUnprocessed_; 146 | processedWeight_ = o.processedWeight_; 147 | unprocessedWeight_ = o.unprocessedWeight_; 148 | processed_ = std::move(o.processed_); 149 | unprocessed_ = std::move(o.unprocessed_); 150 | cumulative_ = std::move(o.cumulative_); 151 | min_ = o.min_; 152 | max_ = o.max_; 153 | return *this; 154 | } 155 | 156 | TDigest(TDigest&& o) 157 | : TDigest(std::move(o.processed_), std::move(o.unprocessed_), o.compression_, o.maxUnprocessed_, 158 | o.maxProcessed_) {} 159 | 160 | static inline Index processedSize(Index size, Value compression) noexcept { 161 | return (size == 0) ? static_cast(2 * std::ceil(compression)) : size; 162 | } 163 | 164 | static inline Index unprocessedSize(Index size, Value compression) noexcept { 165 | return (size == 0) ? static_cast(8 * std::ceil(compression)) : size; 166 | } 167 | 168 | // merge in another t-digest 169 | inline void merge(const TDigest* other) { 170 | std::vector others{other}; 171 | add(others.cbegin(), others.cend()); 172 | } 173 | 174 | const std::vector& processed() const { return processed_; } 175 | 176 | const std::vector& unprocessed() const { return unprocessed_; } 177 | 178 | Index maxUnprocessed() const { return maxUnprocessed_; } 179 | 180 | Index maxProcessed() const { return maxProcessed_; } 181 | 182 | inline void add(std::vector digests) { add(digests.cbegin(), digests.cend()); } 183 | 184 | // merge in a vector of tdigests in the most efficient manner possible 185 | // in constant space 186 | // works for any value of kHighWater 187 | void add(std::vector::const_iterator iter, std::vector::const_iterator end) { 188 | if (iter != end) { 189 | auto size = std::distance(iter, end); 190 | TDigestQueue pq(TDigestComparator{}); 191 | for (; iter != end; iter++) { 192 | pq.push((*iter)); 193 | } 194 | std::vector batch; 195 | batch.reserve(size); 196 | 197 | size_t totalSize = 0; 198 | while (!pq.empty()) { 199 | auto td = pq.top(); 200 | batch.push_back(td); 201 | pq.pop(); 202 | totalSize += td->totalSize(); 203 | if (totalSize >= kHighWater || pq.empty()) { 204 | mergeProcessed(batch); 205 | mergeUnprocessed(batch); 206 | processIfNecessary(); 207 | batch.clear(); 208 | totalSize = 0; 209 | } 210 | } 211 | updateCumulative(); 212 | } 213 | } 214 | 215 | Weight processedWeight() const { return processedWeight_; } 216 | 217 | Weight unprocessedWeight() const { return unprocessedWeight_; } 218 | 219 | bool haveUnprocessed() const { return unprocessed_.size() > 0; } 220 | 221 | size_t totalSize() const { return processed_.size() + unprocessed_.size(); } 222 | 223 | long totalWeight() const { return static_cast(processedWeight_ + unprocessedWeight_); } 224 | 225 | // return the cdf on the t-digest 226 | Value cdf(Value x) { 227 | if (haveUnprocessed() || isDirty()) process(); 228 | return cdfProcessed(x); 229 | } 230 | 231 | bool isDirty() { return processed_.size() > maxProcessed_ || unprocessed_.size() > maxUnprocessed_; } 232 | 233 | // return the cdf on the processed values 234 | Value cdfProcessed(Value x) const { 235 | DLOG(INFO) << "cdf value " << x; 236 | DLOG(INFO) << "processed size " << processed_.size(); 237 | if (processed_.size() == 0) { 238 | // no data to examin_e 239 | DLOG(INFO) << "no processed values"; 240 | 241 | return 0.0; 242 | } else if (processed_.size() == 1) { 243 | DLOG(INFO) << "one processed value " 244 | << " min_ " << min_ << " max_ " << max_; 245 | // exactly 
one centroid, should have max_==min_ 246 | auto width = max_ - min_; 247 | if (x < min_) { 248 | return 0.0; 249 | } else if (x > max_) { 250 | return 1.0; 251 | } else if (x - min_ <= width) { 252 | // min_ and max_ are too close together to do any viable interpolation 253 | return 0.5; 254 | } else { 255 | // interpolate if somehow we have weight > 0 and max_ != min_ 256 | return (x - min_) / (max_ - min_); 257 | } 258 | } else { 259 | auto n = processed_.size(); 260 | if (x <= min_) { 261 | DLOG(INFO) << "below min_ " 262 | << " min_ " << min_ << " x " << x; 263 | return 0; 264 | } 265 | 266 | if (x >= max_) { 267 | DLOG(INFO) << "above max_ " 268 | << " max_ " << max_ << " x " << x; 269 | return 1; 270 | } 271 | 272 | // check for the left tail 273 | if (x <= mean(0)) { 274 | DLOG(INFO) << "left tail " 275 | << " min_ " << min_ << " mean(0) " << mean(0) << " x " << x; 276 | 277 | // note that this is different than mean(0) > min_ ... this guarantees interpolation works 278 | if (mean(0) - min_ > 0) { 279 | return (x - min_) / (mean(0) - min_) * weight(0) / processedWeight_ / 2.0; 280 | } else { 281 | return 0; 282 | } 283 | } 284 | 285 | // and the right tail 286 | if (x >= mean(n - 1)) { 287 | DLOG(INFO) << "right tail" 288 | << " max_ " << max_ << " mean(n - 1) " << mean(n - 1) << " x " << x; 289 | 290 | if (max_ - mean(n - 1) > 0) { 291 | return 1.0 - (max_ - x) / (max_ - mean(n - 1)) * weight(n - 1) / processedWeight_ / 2.0; 292 | } else { 293 | return 1; 294 | } 295 | } 296 | 297 | CentroidComparator cc; 298 | auto iter = std::upper_bound(processed_.cbegin(), processed_.cend(), Centroid(x, 0), cc); 299 | 300 | auto i = std::distance(processed_.cbegin(), iter); 301 | auto z1 = x - (iter - 1)->mean(); 302 | auto z2 = (iter)->mean() - x; 303 | CHECK_LE(0.0, z1); 304 | CHECK_LE(0.0, z2); 305 | DLOG(INFO) << "middle " 306 | << " z1 " << z1 << " z2 " << z2 << " x " << x; 307 | 308 | return weightedAverage(cumulative_[i - 1], z2, cumulative_[i], z1) / processedWeight_; 309 | } 310 | } 311 | 312 | // this returns a quantile on the t-digest 313 | Value quantile(Value q) { 314 | if (haveUnprocessed() || isDirty()) process(); 315 | return quantileProcessed(q); 316 | } 317 | 318 | // this returns a quantile on the currently processed values without changing the t-digest 319 | // the value will not represent the unprocessed values 320 | Value quantileProcessed(Value q) const { 321 | if (q < 0 || q > 1) { 322 | LOG(ERROR) << "q should be in [0,1], got " << q; 323 | return NAN; 324 | } 325 | 326 | if (processed_.size() == 0) { 327 | // no sorted means no data, no way to get a quantile 328 | return NAN; 329 | } else if (processed_.size() == 1) { 330 | // with one data point, all quantiles lead to Rome 331 | 332 | return mean(0); 333 | } 334 | 335 | // we know that there are at least two sorted now 336 | auto n = processed_.size(); 337 | 338 | // if values were stored in a sorted array, index would be the offset we are Weighterested in 339 | const auto index = q * processedWeight_; 340 | 341 | // at the boundaries, we return min_ or max_ 342 | if (index < weight(0) / 2.0) { 343 | CHECK_GT(weight(0), 0); 344 | return min_ + 2.0 * index / weight(0) * (mean(0) - min_); 345 | } 346 | 347 | auto iter = std::lower_bound(cumulative_.cbegin(), cumulative_.cend(), index); 348 | 349 | if (iter + 1 != cumulative_.cend()) { 350 | auto i = std::distance(cumulative_.cbegin(), iter); 351 | auto z1 = index - *(iter - 1); 352 | auto z2 = *(iter)-index; 353 | // LOG(INFO) << "z2 " << z2 << " index " << index << " 
z1 " << z1; 354 | return weightedAverage(mean(i - 1), z2, mean(i), z1); 355 | } 356 | 357 | CHECK_LE(index, processedWeight_); 358 | CHECK_GE(index, processedWeight_ - weight(n - 1) / 2.0); 359 | 360 | auto z1 = index - processedWeight_ - weight(n - 1) / 2.0; 361 | auto z2 = weight(n - 1) / 2 - z1; 362 | return weightedAverage(mean(n - 1), z1, max_, z2); 363 | } 364 | 365 | Value compression() const { return compression_; } 366 | 367 | void add(Value x) { add(x, 1); } 368 | 369 | inline void compress() { process(); } 370 | 371 | // add a single centroid to the unprocessed vector, processing previously unprocessed centroids if our limit has 372 | // been reached. 373 | inline bool add(Value x, Weight w) { 374 | if (std::isnan(x)) { 375 | return false; 376 | } 377 | unprocessed_.push_back(Centroid(x, w)); 378 | unprocessedWeight_ += w; 379 | processIfNecessary(); 380 | return true; 381 | } 382 | 383 | inline void add(std::vector<Centroid>::const_iterator iter, std::vector<Centroid>::const_iterator end) { 384 | while (iter != end) { 385 | const size_t diff = std::distance(iter, end); 386 | const size_t room = maxUnprocessed_ - unprocessed_.size(); 387 | auto mid = iter + std::min(diff, room); 388 | while (iter != mid) unprocessed_.push_back(*(iter++)); 389 | if (unprocessed_.size() >= maxUnprocessed_) { 390 | process(); 391 | } 392 | } 393 | } 394 | 395 | private: 396 | Value compression_; 397 | 398 | Value min_ = std::numeric_limits<Value>::max(); 399 | 400 | Value max_ = std::numeric_limits<Value>::min(); 401 | 402 | Index maxProcessed_; 403 | 404 | Index maxUnprocessed_; 405 | 406 | Value processedWeight_ = 0.0; 407 | 408 | Value unprocessedWeight_ = 0.0; 409 | 410 | std::vector<Centroid> processed_; 411 | 412 | std::vector<Centroid> unprocessed_; 413 | 414 | std::vector<Weight> cumulative_; 415 | 416 | // return mean of i-th centroid 417 | inline Value mean(int i) const noexcept { return processed_[i].mean(); } 418 | 419 | // return weight of i-th centroid 420 | inline Weight weight(int i) const noexcept { return processed_[i].weight(); } 421 | 422 | // append all unprocessed centroids into current unprocessed vector 423 | void mergeUnprocessed(const std::vector<const TDigest*>& tdigests) { 424 | if (tdigests.size() == 0) return; 425 | 426 | size_t total = unprocessed_.size(); 427 | for (auto& td : tdigests) { 428 | total += td->unprocessed_.size(); 429 | } 430 | 431 | unprocessed_.reserve(total); 432 | for (auto& td : tdigests) { 433 | unprocessed_.insert(unprocessed_.end(), td->unprocessed_.cbegin(), td->unprocessed_.cend()); 434 | unprocessedWeight_ += td->unprocessedWeight_; 435 | } 436 | } 437 | 438 | // merge all processed centroids together into a single sorted vector 439 | void mergeProcessed(const std::vector<const TDigest*>& tdigests) { 440 | if (tdigests.size() == 0) return; 441 | 442 | size_t total = 0; 443 | CentroidListQueue pq(CentroidListComparator{}); 444 | for (auto& td : tdigests) { 445 | auto& sorted = td->processed_; 446 | auto size = sorted.size(); 447 | if (size > 0) { 448 | pq.push(CentroidList(sorted)); 449 | total += size; 450 | processedWeight_ += td->processedWeight_; 451 | } 452 | } 453 | if (total == 0) return; 454 | 455 | if (processed_.size() > 0) { 456 | pq.push(CentroidList(processed_)); 457 | total += processed_.size(); 458 | } 459 | 460 | std::vector<Centroid> sorted; 461 | LOG(INFO) << "total " << total; 462 | sorted.reserve(total); 463 | 464 | while (!pq.empty()) { 465 | auto best = pq.top(); 466 | pq.pop(); 467 | sorted.push_back(*(best.iter)); 468 | if (best.advance()) pq.push(best); 469 | } 470 | processed_ = std::move(sorted); 471 | if(
processed_.size() > 0 ) { 472 | min_ = std::min(min_, processed_[0].mean()); 473 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 474 | } 475 | } 476 | 477 | inline void processIfNecessary() { 478 | if (isDirty()) { 479 | process(); 480 | } 481 | } 482 | 483 | void updateCumulative() { 484 | const auto n = processed_.size(); 485 | cumulative_.clear(); 486 | cumulative_.reserve(n + 1); 487 | auto previous = 0.0; 488 | for (Index i = 0; i < n; i++) { 489 | auto current = weight(i); 490 | auto halfCurrent = current / 2.0; 491 | cumulative_.push_back(previous + halfCurrent); 492 | previous = previous + current; 493 | } 494 | cumulative_.push_back(previous); 495 | } 496 | 497 | // merges unprocessed_ centroids and processed_ centroids together and processes them 498 | // when complete, unprocessed_ will be empty and processed_ will have at most maxProcessed_ centroids 499 | inline void process() { 500 | CentroidComparator cc; 501 | std::sort(unprocessed_.begin(), unprocessed_.end(), cc); 502 | auto count = unprocessed_.size(); 503 | unprocessed_.insert(unprocessed_.end(), processed_.cbegin(), processed_.cend()); 504 | std::inplace_merge(unprocessed_.begin(), unprocessed_.begin() + count, unprocessed_.end(), cc); 505 | 506 | processedWeight_ += unprocessedWeight_; 507 | unprocessedWeight_ = 0; 508 | processed_.clear(); 509 | 510 | processed_.push_back(unprocessed_[0]); 511 | Weight wSoFar = unprocessed_[0].weight(); 512 | Weight wLimit = processedWeight_ * integratedQ(1.0); 513 | 514 | auto end = unprocessed_.end(); 515 | for (auto iter = unprocessed_.cbegin() + 1; iter < end; iter++) { 516 | auto& centroid = *iter; 517 | Weight projectedW = wSoFar + centroid.weight(); 518 | if (projectedW <= wLimit) { 519 | wSoFar = projectedW; 520 | (processed_.end() - 1)->add(centroid); 521 | } else { 522 | auto k1 = integratedLocation(wSoFar / processedWeight_); 523 | wLimit = processedWeight_ * integratedQ(k1 + 1.0); 524 | wSoFar += centroid.weight(); 525 | processed_.emplace_back(centroid); 526 | } 527 | } 528 | unprocessed_.clear(); 529 | min_ = std::min(min_, processed_[0].mean()); 530 | DLOG(INFO) << "new min_ " << min_; 531 | max_ = std::max(max_, (processed_.cend() - 1)->mean()); 532 | DLOG(INFO) << "new max_ " << max_; 533 | updateCumulative(); 534 | } 535 | 536 | inline int checkWeights() { return checkWeights(processed_, processedWeight_); } 537 | 538 | size_t checkWeights(const std::vector<Centroid>& sorted, Value total) { 539 | size_t badWeight = 0; 540 | auto k1 = 0.0; 541 | auto q = 0.0; 542 | for (auto iter = sorted.cbegin(); iter != sorted.cend(); iter++) { 543 | auto w = iter->weight(); 544 | auto dq = w / total; 545 | auto k2 = integratedLocation(q + dq); 546 | if (k2 - k1 > 1 && w != 1) { 547 | LOG(WARNING) << "Oversize centroid at " << std::distance(sorted.cbegin(), iter) << " k1 " << k1 << " k2 " << k2 548 | << " dk " << (k2 - k1) << " w " << w << " q " << q; 549 | badWeight++; 550 | } 551 | if (k2 - k1 > 1.5 && w != 1) { 552 | LOG(ERROR) << "Egregiously Oversize centroid at " << std::distance(sorted.cbegin(), iter) << " k1 " << k1 553 | << " k2 " << k2 << " dk " << (k2 - k1) << " w " << w << " q " << q; 554 | badWeight++; 555 | } 556 | q += dq; 557 | k1 = k2; 558 | } 559 | 560 | return badWeight; 561 | } 562 | 563 | /** 564 | * Converts a quantile into a centroid scale value. The centroid scale is nominally 565 | * the number k of the centroid that a quantile point q should belong to.
Due to 566 | * round-offs, however, we can't align things perfectly without splitting points 567 | * and centroids. We don't want to do that, so we have to allow for offsets. 568 | * In the end, the criterion is that any quantile range that spans a centroid 569 | * scale range more than one should be split across more than one centroid if 570 | * possible. This won't be possible if the quantile range refers to a single point 571 | * or an already existing centroid. 572 | * 573 | * This mapping is steep near q=0 or q=1 so each centroid there will correspond to 574 | * less q range. Near q=0.5, the mapping is flatter so that centroids there will 575 | * represent a larger chunk of quantiles. 576 | * 577 | * @param q The quantile scale value to be mapped. 578 | * @return The centroid scale value corresponding to q. 579 | */ 580 | inline Value integratedLocation(Value q) const { 581 | return compression_ * (std::asin(2.0 * q - 1.0) + M_PI / 2) / M_PI; 582 | } 583 | 584 | inline Value integratedQ(Value k) const { 585 | return (std::sin(std::min(k, compression_) * M_PI / compression_ - M_PI / 2) + 1) / 2; 586 | } 587 | 588 | /** 589 | * Same as {@link #weightedAverageSorted(Value, Value, Value, Value)} but flips 590 | * the order of the variables if x2 is greater than 591 | * x1. 592 | */ 593 | static Value weightedAverage(Value x1, Value w1, Value x2, Value w2) { 594 | return (x1 <= x2) ? weightedAverageSorted(x1, w1, x2, w2) : weightedAverageSorted(x2, w2, x1, w1); 595 | } 596 | 597 | /** 598 | * Compute the weighted average between x1 with a weight of 599 | * w1 and x2 with a weight of w2. 600 | * This expects x1 to be less than or equal to x2 601 | * and is guaranteed to return a number between x1 and 602 | * x2. 603 | */ 604 | static Value weightedAverageSorted(Value x1, Value w1, Value x2, Value w2) { 605 | CHECK_LE(x1, x2); 606 | const Value x = (x1 * w1 + x2 * w2) / (w1 + w2); 607 | return std::max(x1, std::min(x, x2)); 608 | } 609 | 610 | static Value interpolate(Value x, Value x0, Value x1) { return (x - x0) / (x1 - x0); } 611 | 612 | /** 613 | * Computes an interpolated value of a quantile that is between two centroids. 614 | * 615 | * Index is the quantile desired multiplied by the total number of samples - 1. 616 | * 617 | * @param index Denormalized quantile desired 618 | * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. 619 | * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. 620 | * @param previousMean The mean of the previous centroid. 621 | * @param nextMean The mean of the following centroid. 622 | * @return The interpolated mean.
623 | */ 624 | static Value quantile(Value index, Value previousIndex, Value nextIndex, Value previousMean, Value nextMean) { 625 | const auto delta = nextIndex - previousIndex; 626 | const auto previousWeight = (nextIndex - index) / delta; 627 | const auto nextWeight = (index - previousIndex) / delta; 628 | return previousMean * previousWeight + nextMean * nextWeight; 629 | } 630 | }; 631 | 632 | } // namespace tdigest2 633 | 634 | #endif // TDIGEST2_TDIGEST_H_ 635 | 636 | -------------------------------------------------------------------------------- /test/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 6 | cd "$DIR" 7 | 8 | go run gen/main.go 9 | go run main.go 10 | g++ -std=c++11 -o cpp.test main.cpp 11 | ./cpp.test 2>/dev/null 12 | rm cpp.test 13 | 14 | go run validate/main.go 15 | -------------------------------------------------------------------------------- /test/validate/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "log" 6 | "math" 7 | "os" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | var dataFiles = []string{ 13 | "small.dat", 14 | "uniform.dat", 15 | "normal.dat", 16 | } 17 | 18 | const ( 19 | cppQExt = ".cpp.quantiles" 20 | goQExt = ".go.quantiles" 21 | 22 | cppCDFExt = ".cpp.cdfs" 23 | goCDFExt = ".go.cdfs" 24 | 25 | epsilon = 1e-6 26 | ) 27 | 28 | func main() { 29 | for _, f := range dataFiles { 30 | // Validate Quantiles 31 | cppQuantiles := loadResults(f + cppQExt) 32 | goQuantiles := loadResults(f + goQExt) 33 | if len(cppQuantiles) != len(goQuantiles) { 34 | log.Fatal("differing number of quantiles results") 35 | } 36 | 37 | for i := range cppQuantiles { 38 | if math.Abs(cppQuantiles[i]-goQuantiles[i]) > epsilon { 39 | log.Fatalf("differing quantile result go: %f cpp: %f", goQuantiles[i], cppQuantiles[i]) 40 | } 41 | } 42 | 43 | // Validate CDFs 44 | cppCDFs := loadResults(f + cppCDFExt) 45 | goCDFs := loadResults(f + goCDFExt) 46 | if len(cppCDFs) != len(goCDFs) { 47 | log.Fatal("differing number of CDFs results") 48 | } 49 | 50 | for i := range cppCDFs { 51 | if math.Abs(cppCDFs[i]-goCDFs[i]) > epsilon { 52 | log.Fatalf("differing CDF result go: %f cpp: %f", goCDFs[i], cppCDFs[i]) 53 | } 54 | } 55 | } 56 | } 57 | 58 | func loadResults(name string) []float64 { 59 | f, err := os.Open(name) 60 | if err != nil { 61 | panic(err) 62 | } 63 | defer f.Close() 64 | s := bufio.NewScanner(f) 65 | var data []float64 66 | for s.Scan() { 67 | parts := strings.SplitN(s.Text(), " ", 2) 68 | x, err := strconv.ParseFloat(parts[0], 64) 69 | if err != nil { 70 | panic(err) 71 | } 72 | data = append(data, x) 73 | } 74 | return data 75 | } 76 | --------------------------------------------------------------------------------
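
A note on the scale functions in the reference header above: integratedLocation and integratedQ are what cap centroid sizes during process(). A centroid may only keep absorbing weight while its quantile range stays within one unit on the k scale. The following self-contained Go sketch (Go is the language of the library itself, but these function names are illustrative only and not part of the package API) mirrors those two helpers and prints, for a few quantiles, the corresponding k value and the quantile at which the next centroid boundary would fall.

package main

import (
	"fmt"
	"math"
)

// integratedLocation mirrors the C++ helper above: it maps a quantile q in [0,1]
// to a centroid scale value k in [0, compression]. The arcsine makes the mapping
// steep near q=0 and q=1 (small centroids in the tails) and flat near q=0.5
// (large centroids around the median).
func integratedLocation(compression, q float64) float64 {
	return compression * (math.Asin(2*q-1) + math.Pi/2) / math.Pi
}

// integratedQ is the inverse mapping from a scale value k back to a quantile.
func integratedQ(compression, k float64) float64 {
	return (math.Sin(math.Min(k, compression)*math.Pi/compression-math.Pi/2) + 1) / 2
}

func main() {
	const compression = 100.0
	for _, q := range []float64{0.001, 0.01, 0.5, 0.99, 0.999} {
		k := integratedLocation(compression, q)
		// integratedQ(compression, k+1) is where the current unit-wide k range ends;
		// process() starts a new centroid once cumulative weight would pass this quantile.
		fmt.Printf("q=%.3f  k=%7.3f  next centroid boundary at q=%.5f\n",
			q, k, integratedQ(compression, k+1))
	}
}

Because the arcsine is steep near q=0 and q=1, a unit-wide k range covers very little quantile mass in the tails, which keeps extreme quantile estimates tight while allowing large centroids near the median.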
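Similarly, quantileProcessed in the header turns q into a weight offset (index = q * processedWeight_) and interpolates between the two centroid means whose cumulative mid-weights bracket that offset; cumulative_[i] holds the total weight up to the midpoint of centroid i, which is why z1 and z2 can serve directly as interpolation weights. Below is a minimal Go sketch of that middle branch; the centroid type and the quantileSketch name are hypothetical, and the min_/max_ tail cases are omitted for brevity.

package main

import (
	"fmt"
	"sort"
)

type centroid struct{ mean, weight float64 }

// quantileSketch interpolates a quantile from already-merged centroids, mirroring
// the middle branch of quantileProcessed: cumulative[i] holds the weight seen up to
// the midpoint of centroid i, and the answer is a weighted average of the two
// surrounding centroid means.
func quantileSketch(cs []centroid, q float64) float64 {
	var total float64
	for _, c := range cs {
		total += c.weight
	}
	cumulative := make([]float64, len(cs))
	prev := 0.0
	for i, c := range cs {
		cumulative[i] = prev + c.weight/2
		prev += c.weight
	}
	index := q * total
	i := sort.SearchFloat64s(cumulative, index) // first midpoint >= index
	if i == 0 || i == len(cs) {
		panic("sketch does not handle the tails below the first or above the last midpoint")
	}
	z1 := index - cumulative[i-1]
	z2 := cumulative[i] - index
	return (cs[i-1].mean*z2 + cs[i].mean*z1) / (z1 + z2)
}

func main() {
	cs := []centroid{{1, 1}, {2, 1}, {3, 1}, {4, 1}}
	fmt.Println(quantileSketch(cs, 0.5)) // 2.5: halfway between the 2nd and 3rd centroid means
}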