├── .github
└── workflows
│ └── maven.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── NOTICES
├── README.md
├── RELEASE-NOTES.md
├── benchmark
├── pom.xml
├── src
│ └── main
│ │ └── java
│ │ └── com
│ │ └── tdunning
│ │ ├── ApproxLogBench.java
│ │ ├── Benchmark.java
│ │ ├── FloatHistogramBench.java
│ │ ├── MergeBench.java
│ │ ├── SortBench.java
│ │ └── TDigestBench.java
└── x.r
├── core
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── tdunning
│ │ │ └── math
│ │ │ └── stats
│ │ │ ├── AVLGroupTree.java
│ │ │ ├── AVLTreeDigest.java
│ │ │ ├── AbstractTDigest.java
│ │ │ ├── Centroid.java
│ │ │ ├── Comparison.java
│ │ │ ├── Dist.java
│ │ │ ├── FloatHistogram.java
│ │ │ ├── Histogram.java
│ │ │ ├── IntAVLTree.java
│ │ │ ├── LogHistogram.java
│ │ │ ├── MergingDigest.java
│ │ │ ├── ScaleFunction.java
│ │ │ ├── Simple64.java
│ │ │ ├── Sort.java
│ │ │ └── TDigest.java
│ └── r
│ │ └── asin-approx.r
│ └── test
│ └── java
│ └── com
│ └── tdunning
│ ├── math
│ └── stats
│ │ ├── AVLGroupTreeTest.java
│ │ ├── AVLTreeDigestTest.java
│ │ ├── AbstractTest.java
│ │ ├── AlternativeMergeTest.java
│ │ ├── BigCount.java
│ │ ├── BigCountMergingDigest.java
│ │ ├── BigCountTreeDigest.java
│ │ ├── ComparisonTest.java
│ │ ├── DigestFactory.java
│ │ ├── FloatHistogramTest.java
│ │ ├── HistogramTestCases.java
│ │ ├── IntAVLTreeTest.java
│ │ ├── LogHistogramTest.java
│ │ ├── MegaMergeTest.java
│ │ ├── MergingDigestTest.java
│ │ ├── ReproTest.java
│ │ ├── ReproduceInfoPrinterRunListener.java
│ │ ├── ScaleFunctionTests.java
│ │ ├── SerializationTest.java
│ │ ├── SortTest.java
│ │ ├── TDigestSerializationTest.java
│ │ ├── TDigestTest.java
│ │ └── TDigestUtilTest.java
│ └── scale
│ └── ScaleTest.java
├── docs
├── error-uniform-delta=100.png
├── error-uniform-delta=200.png
├── error-uniform-delta=50.png
├── error-uniform-delta=500.png
├── error-vs-compression.png
├── interpolation-figure.png
├── max-error-uniform.png
├── proofs
│ ├── invariant-preservation.pdf
│ ├── invariant-preservation.tex
│ ├── refs.bib
│ ├── sizing.pdf
│ └── sizing.tex
├── quantiles
│ ├── quantiles.pdf
│ └── quantiles.tex
├── r-sim-diagrams
│ ├── figs.r
│ ├── shifts.r
│ └── sim.r
├── simpa
│ ├── declaration-of-competing-interests.docx
│ ├── figures
│ │ ├── adaptive-threshold.pdf
│ │ ├── change-point.pdf
│ │ ├── detection.r
│ │ ├── error-vs-compression.pdf
│ │ ├── windows.graffle
│ │ └── windows.pdf
│ ├── highlights.pdf
│ ├── highlights.tex
│ ├── main.tex
│ └── refs.bib
├── software-paper
│ └── figures
│ │ ├── cluster-spread.pdf
│ │ ├── endpoint.pdf
│ │ ├── interpolation.pdf
│ │ ├── k-q-plot.pdf
│ │ ├── linear-interpolation.pdf
│ │ ├── merge.pdf
│ │ ├── qd-sizes.pdf
│ │ ├── relative-error.pdf
│ │ └── singleton.pdf
├── t-digest-paper
│ ├── build-figures.py
│ ├── cell-deviation.tsv
│ ├── comparison.r
│ ├── comparison.tsv
│ ├── error-scaling.tsv
│ ├── errors-old.tsv
│ ├── errors.csv
│ ├── errors.r
│ ├── figure-doc.pdf
│ ├── figure-doc.tex
│ ├── figures
│ │ ├── cluster-spread.pdf
│ │ ├── endpoint.pdf
│ │ ├── error-vs-compression.pdf
│ │ ├── interpolation.pdf
│ │ ├── k-q-plot.pdf
│ │ ├── linear-interpolation.pdf
│ │ ├── merge.pdf
│ │ ├── qd-sizes.pdf
│ │ ├── relative-error.pdf
│ │ └── singleton.pdf
│ ├── gamma-deviation.tsv
│ ├── histo.pdf
│ ├── histo.tex
│ ├── k-q-diagram.graffle
│ ├── k-q-diagram
│ │ ├── k-q-limits.pdf
│ │ └── slope-limiting.pdf
│ ├── k-q-plot.r
│ ├── linear-interpolation.r
│ ├── merge.eps
│ ├── merge.tsv
│ ├── natbib.sty
│ ├── quantile-figures.graffle
│ │ ├── data.plist
│ │ ├── image1.pdf
│ │ ├── image10.pdf
│ │ ├── image11.pdf
│ │ ├── image2.pdf
│ │ ├── image21.pdf
│ │ ├── image22.pdf
│ │ ├── image23.pdf
│ │ ├── image24.pdf
│ │ ├── image26.pdf
│ │ ├── image27.pdf
│ │ ├── image28.pdf
│ │ ├── image29.pdf
│ │ ├── image3.pdf
│ │ ├── image31.pdf
│ │ ├── image32.pdf
│ │ ├── image33.pdf
│ │ ├── image34.pdf
│ │ ├── image35.pdf
│ │ ├── image41.pdf
│ │ ├── image44.pdf
│ │ ├── image46.pdf
│ │ ├── image47.pdf
│ │ ├── image48.pdf
│ │ ├── image49.pdf
│ │ ├── image50.pdf
│ │ ├── image51.pdf
│ │ ├── image59.pdf
│ │ ├── image63.pdf
│ │ ├── image7.pdf
│ │ └── image8.pdf
│ ├── quantile-figures
│ │ ├── combined.pdf
│ │ ├── endpoint.pdf
│ │ ├── interpolation.pdf
│ │ └── singleton.pdf
│ ├── refs.bib
│ ├── scaling.r
│ ├── scaling.tsv
│ ├── sizes.csv
│ ├── sizes.r
│ └── statsoc.cls
└── vldb
│ ├── figures
│ ├── cluster-spread.pdf
│ ├── combined.pdf
│ ├── endpoint.pdf
│ ├── error-vs-compression-small.pdf
│ ├── error-vs-compression.pdf
│ ├── interpolation.pdf
│ ├── k-q-plot.pdf
│ ├── linear-interpolation.pdf
│ ├── merge.pdf
│ ├── qd-sizes-small.pdf
│ ├── qd-sizes.pdf
│ ├── relative-error-one-panel.pdf
│ ├── relative-error.pdf
│ └── singleton.pdf
│ ├── refs.bib
│ ├── short.pdf
│ ├── short.tex
│ └── vldb.cls
├── pom.xml
├── quality
├── README.md
├── accuracy.r
├── comparison.r
├── fh.r
├── kll-comparison.pdf
├── merge.r
├── pom.xml
├── src
│ └── test
│ │ └── java
│ │ └── com
│ │ └── tdunning
│ │ ├── math
│ │ └── stats
│ │ │ └── QuantileEstimator.java
│ │ └── tdigest
│ │ └── quality
│ │ ├── AccuracyTest.java
│ │ ├── BinFill.java
│ │ ├── CompareKllTest.java
│ │ ├── ComparisonTest.java
│ │ ├── Git.java
│ │ ├── ScalingTest.java
│ │ ├── SinglePassTest.java
│ │ └── Util.java
└── x.r
└── size-studies.r
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
1 | name: Java CI
2 |
3 | on:
4 | push:
5 | pull_request:
6 |
7 | permissions:
8 | contents: read
9 |
10 | jobs:
11 | build:
12 | runs-on: ${{ matrix.os }}
13 | continue-on-error: ${{ matrix.experimental }}
14 | strategy:
15 | matrix:
16 | os: [ ubuntu-latest, windows-latest ]
17 | java: [ 8, 11 ]
18 | experimental: [ false ]
19 | # include:
20 | # - java: 18-ea
21 | # os: ubuntu-latest
22 | # experimental: true
23 |
24 | steps:
25 | - uses: actions/checkout@v3.3.0
26 | with:
27 | persist-credentials: false
28 | - name: Set up JDK ${{ matrix.java }}
29 | uses: actions/setup-java@v3.10.0
30 | with:
31 | distribution: 'temurin'
32 | java-version: ${{ matrix.java }}
33 | - name: Build with Maven
34 | run: mvn -V --no-transfer-progress clean test
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .RData
2 | .Rapp.history
3 | .DS_Store
4 | *.class
5 | target/
6 | .Rhistory
7 | docs/r-sim-diagrams/*.pdf
8 | histo.synctex.gz
9 | log
10 |
11 | # Package Files #
12 | *.jar
13 | *.war
14 | *.ear
15 |
16 | # IDEA related files
17 | .idea/
18 | *.iml
19 |
20 | # Eclipse related files
21 | .classpath
22 | .project
23 | .settings/
24 |
25 | # Latex and R related files
26 | _region_.prv/
27 | _region_*
28 | *.log
29 | *.aux
30 | *.dvi
31 | *.blg
32 | *.bbl
33 | *.synctex.gz
34 | Rplots.pdf
35 | .Rhistory
36 | *.ini
37 |
38 | # diagnostic outputs
39 | *.csv
40 | *.tsv
41 |
42 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Dunning"
5 | given-names: "Ted"
6 | orcid: "https://orcid.org/0000-0000-0000-0000" # NOTE(review): all-zeros placeholder — replace with the author's real ORCID iD
7 | title: "t-digest"
8 | version: 3.2
9 | date-released: 2017-08-06
10 | url: "https://github.com/tdunning/t-digest"
11 |
--------------------------------------------------------------------------------
/NOTICES:
--------------------------------------------------------------------------------
1 | The code for the t-digest was originally authored by Ted Dunning
2 |
3 | Adrien Grand contributed the heart of the AVLTreeDigest (https://github.com/jpountz)
4 |
5 |
--------------------------------------------------------------------------------
/RELEASE-NOTES.md:
--------------------------------------------------------------------------------
1 | Release 3.2
2 | ===========
3 | In release 3.2, the goal is to produce an update to the code given the large number of improvements since the previous release.
4 |
5 | There are a few bugs that will survive this release, most notably in the AVLTreeDigest. These have to do with large numbers of repeated data points and are not new bugs.
6 |
7 | There is also a lot of work going on with serialization. I need to hear from people about what they are doing with serialization so that we can build some test cases to allow an appropriate migration strategy to future serialization.
8 |
9 | The paper continues to be updated. The algorithmic descriptions are getting reasonably clear, but the speed and accuracy sections need a complete revamp with current implementations.
10 |
11 |
12 | Bugs, fixed and known
13 | ----
14 |
15 | #### Fixed
16 | The following important issues are fixed in this release
17 |
18 | [Issue #90](https://github.com/tdunning/t-digest/issues/90) Serialization for MergingDigest
19 |
20 | [Issue #92](https://github.com/tdunning/t-digest/issues/92) Serialization for AVLTreeDigest
21 |
22 | #### Maybe fixed
23 | This issue has substantial progress, but lacks a definitive test to determine whether it should be closed.
24 |
25 | [Issue #78](https://github.com/tdunning/t-digest/issues/78) Stability under merging.
26 |
27 | #### Pushed
28 | The following issues are pushed beyond this release
29 |
30 | [Issue #87](https://github.com/tdunning/t-digest/issues/87) Future proof and extensible serialization
31 |
32 | [Issue #89](https://github.com/tdunning/t-digest/issues/89) Bad handling for duplicate values in AVLTreeDigest
33 |
34 | #### All fixed issues
35 | Here is a complete list of issues resolved in this release:
36 |
37 | [Issue #55](https://github.com/tdunning/t-digest/issues/55) Add time
38 | decay to t-digest
39 |
40 | [Issue #52](https://github.com/tdunning/t-digest/issues/52) General
41 | factory method for "fromBytes"
42 |
43 | [Issue #90](https://github.com/tdunning/t-digest/issues/90)
44 | Deserialization of MergingDigest BufferUnderflowException in 3.1
45 |
46 | [Issue #92](https://github.com/tdunning/t-digest/issues/92) Error in
47 | AVLTreeDigest.fromBytes
48 |
49 | [Issue #93](https://github.com/tdunning/t-digest/issues/93) high
50 | centroid frequency causes overflow - giving incorrect results
51 |
52 | [Issue #67](https://github.com/tdunning/t-digest/issues/67) Release of
53 | version 3.2
54 |
55 | [Issue #81](https://github.com/tdunning/t-digest/issues/81)
56 | AVLTreeDigest with a lot of datas : integer overflow
57 |
58 | [Issue #75](https://github.com/tdunning/t-digest/issues/75) Adjusting
59 | the centroid threshold values to obtain better accuracy at interesting
60 | values
61 |
62 | [Issue #74](https://github.com/tdunning/t-digest/issues/74) underlying
63 | distribution : powerlaw
64 |
65 | [Issue #72](https://github.com/tdunning/t-digest/issues/72) Inverse
66 | quantile algorithm is non-contiguous
67 |
68 | [Issue #65](https://github.com/tdunning/t-digest/issues/65)
69 | totalDigest add spark dataframe column / array
70 |
71 | [Issue #60](https://github.com/tdunning/t-digest/issues/60) Getting
72 | IllegalArgumentException when adding digests
73 |
74 | [Issue #53](https://github.com/tdunning/t-digest/issues/53)
75 | smallByteSize methods are very trappy in many classes -- should be
76 | changed or have warnings in javadocs
77 |
78 | [Issue #82](https://github.com/tdunning/t-digest/issues/82) TDigest
79 | class does not implement Serializable interface in last release.
80 |
81 | [Issue #42](https://github.com/tdunning/t-digest/issues/42) Histogram
82 |
83 | [Issue #40](https://github.com/tdunning/t-digest/issues/40) Improved
84 | constraint on centroid sizes
85 |
86 | [Issue #37](https://github.com/tdunning/t-digest/issues/37) Allow
87 | arbitrary scaling laws for centroid sizes
88 |
89 | [Issue #29](https://github.com/tdunning/t-digest/issues/29) Test
90 | method testScaling() always adds values in ascending order
91 |
92 | [Issue #84](https://github.com/tdunning/t-digest/issues/84) Remove
93 | deprecated kinds of t-digest
94 |
95 | [Issue #76](https://github.com/tdunning/t-digest/issues/76) Add
96 | serializability
97 |
98 | [Issue #77](https://github.com/tdunning/t-digest/issues/77) Question:
99 | Proof of bounds on merging digest size
100 |
101 | [Issue #71](https://github.com/tdunning/t-digest/issues/71) Simple
102 | alternate algorithm using maxima, ranks and fixed cumulative weighting
103 |
104 | [Issue #61](https://github.com/tdunning/t-digest/issues/61) Possible
105 | improvement to the speed of the algorithm
106 |
107 | [Issue #58](https://github.com/tdunning/t-digest/issues/58) jdk8
108 | doclint incompatibility
109 |
110 | [Issue #48](https://github.com/tdunning/t-digest/issues/48) Build is
111 | unstable under some circumstances
112 |
113 | [Issue #63](https://github.com/tdunning/t-digest/issues/63) Which
114 | TDigest do you recommend?
115 |
116 | [Issue #62](https://github.com/tdunning/t-digest/issues/62) Very slow
117 | performance; what am I missing?
118 |
119 | [Issue #47](https://github.com/tdunning/t-digest/issues/47) Make
120 | TDigest serializable
121 |
122 | [Issue #49](https://github.com/tdunning/t-digest/issues/49)
123 | MergingDigest.centroids is wrong on an empty digest
124 |
125 |
--------------------------------------------------------------------------------
/benchmark/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | com.tdunning
9 | t-digest-parent
10 | 3.4-SNAPSHOT
11 | ../pom.xml
12 |
13 | t-digest-benchmark
14 |
15 |
16 |
17 | com.tdunning
18 | t-digest
19 | 3.4-SNAPSHOT
20 |
21 |
22 | org.openjdk.jmh
23 | jmh-core
24 | 1.17.3
25 |
26 |
27 | org.openjdk.jmh
28 | jmh-generator-annprocess
29 | 1.17.3
30 | provided
31 |
32 |
33 | org.apache.mahout
34 | mahout-math
35 | 0.9
36 |
37 |
38 |
39 |
40 |
41 |
42 | org.codehaus.mojo
43 | exec-maven-plugin
44 | 1.1
45 |
46 | com.tdunning.Benchmark
47 |
48 |
49 |
50 | org.apache.maven.plugins
51 | maven-compiler-plugin
52 | 3.3
53 |
54 | true
55 | 1.8
56 | 1.7
57 | 1.7
58 |
59 |
60 |
61 |
62 |
63 | org.apache.maven.plugins
64 | maven-shade-plugin
65 | 2.0
66 |
67 |
68 | package
69 |
70 | shade
71 |
72 |
73 | microbenchmarks
74 |
75 |
76 | org.openjdk.jmh.Main
77 |
78 |
79 |
80 |
81 | *:*
82 |
83 | META-INF/services/javax.annotation.processing.Processor
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/ApproxLogBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.LogHistogram;
21 | import com.tdunning.math.stats.MergingDigest;
22 | import com.tdunning.math.stats.ScaleFunction;
23 |
24 | import org.openjdk.jmh.annotations.*;
25 | import org.openjdk.jmh.annotations.Benchmark;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | /**
36 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
37 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
38 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
39 | * before sorting and merging them into the digest.
40 | */
41 | @BenchmarkMode(Mode.AverageTime)
42 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
43 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
44 | @Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
45 | @Fork(1)
46 | @Threads(1)
47 | @State(Scope.Thread)
48 | public class ApproxLogBench {
49 | private static final double LOG_2 = Math.log(2);
50 | private Random gen = new Random();
51 | private double[] data;
52 |
53 | @Setup
54 | public void setup() {
55 | data = new double[10000000];
56 | for (int i = 0; i < data.length; i++) {
57 | data[i] = gen.nextDouble();
58 | }
59 | }
60 |
61 | @State(Scope.Thread)
62 | public static class ThreadState {
63 | int index = 0;
64 | }
65 |
66 | @Benchmark
67 | @BenchmarkMode(Mode.AverageTime)
68 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
69 | public void addApprox(ThreadState state) {
70 | if (state.index >= data.length) {
71 | state.index = 0;
72 | }
73 | double sum = 0;
74 | for (int i = 0; i < 1000; i++) {
75 | sum += LogHistogram.approxLog2(data[state.index++]);
76 | }
77 | }
78 |
79 | @Benchmark
80 | @BenchmarkMode(Mode.AverageTime)
81 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
82 | public void addLog(ThreadState state) {
83 | if (state.index >= data.length) {
84 | state.index = 0;
85 | }
86 | double sum = 0;
87 | for (int i = 0; i < 1000; i++) {
88 | sum += Math.log(data[state.index++])/LOG_2;
89 | }
90 |
91 | }
92 |
93 | public static void main(String[] args) throws RunnerException {
94 | Options opt = new OptionsBuilder()
95 | .include(ApproxLogBench.class.getSimpleName())
96 | .warmupIterations(5)
97 | .measurementIterations(5)
98 | .forks(1)
99 | .resultFormat(ResultFormatType.CSV)
100 | .build();
101 |
102 | new Runner(opt).run();
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/Benchmark.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.AVLTreeDigest;
21 | import com.tdunning.math.stats.MergingDigest;
22 | import com.tdunning.math.stats.TDigest;
23 | import org.openjdk.jmh.annotations.*;
24 | import org.openjdk.jmh.profile.GCProfiler;
25 | import org.openjdk.jmh.profile.StackProfiler;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | @BenchmarkMode(Mode.AverageTime)
36 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
37 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
38 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
39 | @Fork(1)
40 | @Threads(1)
41 | @State(Scope.Thread)
42 | public class Benchmark {
43 | private Random gen = new Random();
44 | private double[] data;
45 |
46 | @Param({"merge", "tree"})
47 | public String method;
48 |
49 | @Param({"20", "50", "100", "200", "500"})
50 | public int compression;
51 |
52 | private TDigest td;
53 |
54 | @Setup
55 | public void setup() {
56 | data = new double[10000000];
57 | for (int i = 0; i < data.length; i++) {
58 | data[i] = gen.nextDouble();
59 | }
60 | if (method.equals("tree")) {
61 | td = new AVLTreeDigest(compression);
62 | } else {
63 | td = new MergingDigest(500);
64 | }
65 |
66 | // First values are very cheap to add, we are more interested in the steady state,
67 | // when the summary is full. Summaries are expected to contain about 5*compression
68 | // centroids, hence the 5 factor
69 | for (int i = 0; i < 5 * compression; ++i) {
70 | td.add(gen.nextDouble());
71 | }
72 | }
73 |
74 | @State(Scope.Thread)
75 | public static class ThreadState {
76 | int index = 0;
77 | }
78 |
79 | @org.openjdk.jmh.annotations.Benchmark
80 | public void add(ThreadState state) {
81 | if (state.index >= data.length) {
82 | state.index = 0;
83 | }
84 | td.add(data[state.index++]);
85 | }
86 |
87 | public static void main(String[] args) throws RunnerException {
88 | Options opt = new OptionsBuilder()
89 | .include(".*" + Benchmark.class.getSimpleName() + ".*")
90 | .resultFormat(ResultFormatType.CSV)
91 | .result("results.csv")
92 | .addProfiler(GCProfiler.class)
93 | .addProfiler(StackProfiler.class)
94 | .build();
95 |
96 | new Runner(opt).run();
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/FloatHistogramBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.FloatHistogram;
21 | import org.openjdk.jmh.annotations.*;
22 | import org.openjdk.jmh.annotations.Benchmark;
23 | import org.openjdk.jmh.profile.GCProfiler;
24 | import org.openjdk.jmh.profile.StackProfiler;
25 | import org.openjdk.jmh.results.format.ResultFormatType;
26 | import org.openjdk.jmh.runner.Runner;
27 | import org.openjdk.jmh.runner.RunnerException;
28 | import org.openjdk.jmh.runner.options.Options;
29 | import org.openjdk.jmh.runner.options.OptionsBuilder;
30 |
31 | import java.util.Random;
32 | import java.util.concurrent.TimeUnit;
33 |
34 | /**
35 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
36 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
37 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
38 | * before sorting and merging them into the digest.
39 | */
40 | @BenchmarkMode(Mode.AverageTime)
41 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
42 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
43 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
44 | @Fork(1)
45 | @Threads(1)
46 | @State(Scope.Thread)
47 | public class FloatHistogramBench {
48 | private Random gen = new Random();
49 | private double[] data;
50 |
51 | @Param({"20", "50", "100"})
52 | public int binsPerDecade;
53 |
54 | private FloatHistogram fh;
55 |
56 | @Setup
57 | public void setup() {
58 | data = new double[10000000];
59 | for (int i = 0; i < data.length; i++) {
60 | data[i] = gen.nextDouble();
61 | }
62 | fh = new FloatHistogram(0.1, 10000, binsPerDecade);
63 |
64 | for (int i = 0; i < 10000; ++i) {
65 | fh.add(gen.nextDouble());
66 | }
67 | }
68 |
69 | @State(Scope.Thread)
70 | public static class ThreadState {
71 | int index = 0;
72 | }
73 |
74 | @Benchmark
75 | public void add(ThreadState state) {
76 | if (state.index >= data.length) {
77 | state.index = 0;
78 | }
79 | fh.add(data[state.index++]);
80 | }
81 |
82 | public static void main(String[] args) throws RunnerException {
83 | Options opt = new OptionsBuilder()
84 | .include(".*" + FloatHistogramBench.class.getSimpleName() + ".*")
85 | .resultFormat(ResultFormatType.CSV)
86 | .result("overall-results.csv")
87 | .addProfiler(StackProfiler.class)
88 | .addProfiler(GCProfiler.class)
89 | .build();
90 |
91 | new Runner(opt).run();
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/MergeBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.MergingDigest;
21 | import com.tdunning.math.stats.ScaleFunction;
22 |
23 | import org.openjdk.jmh.annotations.*;
24 | import org.openjdk.jmh.annotations.Benchmark;
25 | import org.openjdk.jmh.profile.StackProfiler;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | /**
36 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
37 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
38 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
39 | * before sorting and merging them into the digest.
40 | */
41 | @BenchmarkMode(Mode.AverageTime)
42 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
43 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
44 | @Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
45 | @Fork(1)
46 | @Threads(1)
47 | @State(Scope.Thread)
48 | public class MergeBench {
49 | private Random gen = new Random();
50 | private double[] data;
51 |
52 | // @Param({"20", "50", "100", "200", "500"})
53 | @Param({"50", "100"})
54 | public int compression;
55 |
56 | // @Param({"1", "2", "5", "10"})
57 | @Param({"2", "5", "10"})
58 | public int factor;
59 |
60 | // @Param({"K_1", "K_2", "K_3"})
61 | @Param({"K_2"})
62 | public String scaleFunction;
63 |
64 | private MergingDigest td;
65 |
66 | @Setup
67 | public void setup() {
68 | data = new double[10000000];
69 | for (int i = 0; i < data.length; i++) {
70 | data[i] = gen.nextDouble();
71 | }
72 | td = new MergingDigest(compression, (factor + 1) * compression, compression);
73 | td.setScaleFunction(ScaleFunction.valueOf(scaleFunction));
74 |
75 | // First values are very cheap to add, we are more interested in the steady state,
76 | // when the summary is full. Summaries are expected to contain about 0.6*compression
77 | // centroids, hence the 5 * compression * (factor+1)
78 | for (int i = 0; i < 5 * compression * (factor + 1); ++i) {
79 | td.add(gen.nextDouble());
80 | }
81 | }
82 |
83 | @State(Scope.Thread)
84 | public static class ThreadState {
85 | int index = 0;
86 | }
87 |
88 | @Benchmark
89 | @BenchmarkMode(Mode.AverageTime)
90 | @OutputTimeUnit(TimeUnit.MICROSECONDS)
91 | public void add(ThreadState state) {
92 | if (state.index >= data.length) {
93 | state.index = 0;
94 | }
95 | td.add(data[state.index++]);
96 | }
97 |
98 | public static void main(String[] args) throws RunnerException {
99 | Options opt = new OptionsBuilder()
100 | .include(MergeBench.class.getSimpleName())
101 | .warmupIterations(5)
102 | .measurementIterations(5)
103 | .forks(1)
104 | .resultFormat(ResultFormatType.CSV)
105 | .addProfiler(StackProfiler.class)
106 | .build();
107 |
108 | new Runner(opt).run();
109 | }
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/SortBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.Sort;
21 | import org.openjdk.jmh.annotations.Benchmark;
22 | import org.openjdk.jmh.annotations.*;
23 |
24 | import java.util.Arrays;
25 | import java.util.Random;
26 | import java.util.concurrent.TimeUnit;
27 |
28 | /** Explores the performance of Sort on pathological input data. */
29 | @BenchmarkMode(Mode.AverageTime)
30 | @OutputTimeUnit(TimeUnit.MILLISECONDS)
31 | @Warmup(iterations = 10, time = 3, timeUnit = TimeUnit.SECONDS)
32 | @Measurement(iterations = 20, time = 2, timeUnit = TimeUnit.SECONDS)
33 | @Fork(1)
34 | @Threads(1)
35 | @State(Scope.Thread)
36 | public class SortBench {
37 | private final int size = 100000;
38 | private final double[] values = new double[size];
39 |
40 | @Param({"0", "1", "-1"})
41 | public int sortDirection;
42 |
43 | @Setup
44 | public void setup() {
45 | Random prng = new Random(999983);
46 | for (int i = 0; i < size; i++) {
47 | values[i] = prng.nextDouble();
48 | }
49 | if (sortDirection > 0) {
50 | Arrays.sort(values);
51 | } else if (sortDirection < 0) {
52 | Arrays.sort(values);
53 | Sort.reverse(values, 0, values.length);
54 | }
55 | }
56 |
57 | @Benchmark
58 | public void quicksort() {
59 | int[] order = new int[size];
60 | for (int i = 0; i < size; i++) {
61 | order[i] = i;
62 | }
63 | Sort.sort(order, values, null, values.length);
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/TDigestBench.java:
--------------------------------------------------------------------------------
1 | package com.tdunning;
2 |
3 | import com.tdunning.math.stats.AVLTreeDigest;
4 | import com.tdunning.math.stats.MergingDigest;
5 | import com.tdunning.math.stats.TDigest;
6 | import org.apache.mahout.math.jet.random.*;
7 | import org.openjdk.jmh.annotations.*;
8 | import org.openjdk.jmh.annotations.Benchmark;
9 | import org.openjdk.jmh.profile.GCProfiler;
10 | import org.openjdk.jmh.profile.StackProfiler;
11 | import org.openjdk.jmh.results.format.ResultFormatType;
12 | import org.openjdk.jmh.runner.Runner;
13 | import org.openjdk.jmh.runner.RunnerException;
14 | import org.openjdk.jmh.runner.options.Options;
15 | import org.openjdk.jmh.runner.options.OptionsBuilder;
16 |
17 | import java.util.Random;
18 | import java.util.concurrent.ThreadLocalRandom;
19 | import java.util.concurrent.TimeUnit;
20 |
21 | @BenchmarkMode(Mode.AverageTime)
22 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
23 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
24 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
25 | @Fork(1)
26 | @Threads(1)
27 | @State(Scope.Thread)
28 | public class TDigestBench {
29 |
30 | public enum TDigestFactory {
31 | MERGE {
32 | @Override
33 | TDigest create(double compression) {
34 | return new MergingDigest(compression, (int) (10 * compression));
35 | }
36 |
37 | @Override
38 | TDigest create() {
39 | return create(100);
40 | }
41 | },
42 | AVL_TREE {
43 | @Override
44 | TDigest create(double compression) {
45 | return new AVLTreeDigest(compression);
46 | }
47 |
48 | @Override
49 | TDigest create() {
50 | return create(20);
51 | }
52 | };
53 |
54 | abstract TDigest create(double compression);
55 | abstract TDigest create();
56 | }
57 |
58 | public enum DistributionFactory {
59 | UNIFORM {
60 | @Override
61 | AbstractDistribution create(Random random) {
62 | return new Uniform(0, 1, random);
63 | }
64 | },
65 | SEQUENTIAL {
66 | @Override
67 | AbstractDistribution create(Random random) {
68 | return new AbstractContinousDistribution() {
69 | double base = 0;
70 |
71 | @Override
72 | public double nextDouble() {
73 | base += Math.PI * 1e-5;
74 | return base;
75 | }
76 | };
77 | }
78 | },
79 | REPEATED {
80 | @Override
81 | AbstractDistribution create(final Random random) {
82 | return new AbstractContinousDistribution() {
83 | @Override
84 | public double nextDouble() {
85 | return random.nextInt(10);
86 | }
87 | };
88 | }
89 | },
90 | GAMMA {
91 | @Override
92 | AbstractDistribution create(Random random) {
93 | return new Gamma(0.1, 0.1, random);
94 | }
95 | },
96 | NORMAL {
97 | @Override
98 | AbstractDistribution create(Random random) {
99 | return new Normal(0.1, 0.1, random);
100 | }
101 | };
102 |
103 | abstract AbstractDistribution create(Random random);
104 | }
105 |
106 | @Param({"100", "300"})
107 | double compression;
108 |
109 | @Param({"MERGE", "AVL_TREE"})
110 | TDigestFactory tdigestFactory;
111 |
112 | @Param({"NORMAL", "GAMMA"})
113 | DistributionFactory distributionFactory;
114 |
115 | Random random;
116 | TDigest tdigest;
117 | AbstractDistribution distribution;
118 |
119 | double[] data = new double[1000000];
120 |
121 | @Setup
122 | public void setUp() {
123 | random = ThreadLocalRandom.current();
124 | tdigest = tdigestFactory.create(compression);
125 | distribution = distributionFactory.create(random);
126 | // first values are cheap to add, so pre-fill the t-digest to have more realistic results
127 | for (int i = 0; i < 10000; ++i) {
128 | tdigest.add(distribution.nextDouble());
129 | }
130 |
131 | for (int i = 0; i < data.length; ++i) {
132 | data[i] = distribution.nextDouble();
133 | }
134 | }
135 |
136 | @State(Scope.Thread)
137 | public static class ThreadState {
138 | int index = 0;
139 | }
140 |
141 | @Benchmark
142 | public void timeAdd(MergeBench.ThreadState state) {
143 | if (state.index >= data.length) {
144 | state.index = 0;
145 | }
146 | tdigest.add(data[state.index++]);
147 | }
148 |
149 | public static void main(String[] args) throws RunnerException {
150 | Options opt = new OptionsBuilder()
151 | .include(".*" + TDigestBench.class.getSimpleName() + ".*")
152 | .resultFormat(ResultFormatType.CSV)
153 | .result("overall-results.csv")
154 | .addProfiler(GCProfiler.class)
155 | .addProfiler(StackProfiler.class)
156 | .build();
157 |
158 | new Runner(opt).run();
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/benchmark/x.r:
--------------------------------------------------------------------------------
## Load the bin-fill benchmark output for interactive analysis.
data <- read.csv("bin-fill.csv")
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/AbstractTDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.nio.ByteBuffer;
21 | import java.util.ArrayList;
22 | import java.util.List;
23 |
24 | public abstract class AbstractTDigest extends TDigest {
25 | boolean recordAllData = false;
26 |
27 | /**
28 | * Same as {@link #weightedAverageSorted(double, double, double, double)} but flips
29 | * the order of the variables if x2 is greater than
30 | * x1.
31 | */
32 | static double weightedAverage(double x1, double w1, double x2, double w2) {
33 | if (x1 <= x2) {
34 | return weightedAverageSorted(x1, w1, x2, w2);
35 | } else {
36 | return weightedAverageSorted(x2, w2, x1, w1);
37 | }
38 | }
39 |
40 | /**
41 | * Compute the weighted average between x1 with a weight of
42 | * w1 and x2 with a weight of w2.
43 | * This expects x1 to be less than or equal to x2
44 | * and is guaranteed to return a number in [x1, x2]. An
45 | * explicit check is required since this isn't guaranteed with floating-point
46 | * numbers.
47 | */
48 | private static double weightedAverageSorted(double x1, double w1, double x2, double w2) {
49 | assert x1 <= x2;
50 | final double x = (x1 * w1 + x2 * w2) / (w1 + w2);
51 | return Math.max(x1, Math.min(x, x2));
52 | }
53 |
54 | static double interpolate(double x, double x0, double x1) {
55 | return (x - x0) / (x1 - x0);
56 | }
57 |
58 | static void encode(ByteBuffer buf, int n) {
59 | int k = 0;
60 | while (n < 0 || n > 0x7f) {
61 | byte b = (byte) (0x80 | (0x7f & n));
62 | buf.put(b);
63 | n = n >>> 7;
64 | k++;
65 | if (k >= 6) {
66 | throw new IllegalStateException("Size is implausibly large");
67 | }
68 | }
69 | buf.put((byte) n);
70 | }
71 |
72 | static int decode(ByteBuffer buf) {
73 | int v = buf.get();
74 | int z = 0x7f & v;
75 | int shift = 7;
76 | while ((v & 0x80) != 0) {
77 | if (shift > 28) {
78 | throw new IllegalStateException("Shift too large in decode");
79 | }
80 | v = buf.get();
81 | z += (v & 0x7f) << shift;
82 | shift += 7;
83 | }
84 | return z;
85 | }
86 |
87 | abstract void add(double x, int w, Centroid base);
88 |
89 | /**
90 | * Computes an interpolated value of a quantile that is between two centroids.
91 | *
92 | * Index is the quantile desired multiplied by the total number of samples - 1.
93 | *
94 | * @param index Denormalized quantile desired
95 | * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid.
96 | * @param nextIndex The denormalized quantile corresponding to the center of the following centroid.
97 | * @param previousMean The mean of the previous centroid.
98 | * @param nextMean The mean of the following centroid.
99 | * @return The interpolated mean.
100 | */
101 | static double quantile(double index, double previousIndex, double nextIndex, double previousMean, double nextMean) {
102 | final double delta = nextIndex - previousIndex;
103 | final double previousWeight = (nextIndex - index) / delta;
104 | final double nextWeight = (index - previousIndex) / delta;
105 | return previousMean * previousWeight + nextMean * nextWeight;
106 | }
107 |
108 | /**
109 | * Sets up so that all centroids will record all data assigned to them. For testing only, really.
110 | */
111 | @Override
112 | public TDigest recordAllData() {
113 | recordAllData = true;
114 | return this;
115 | }
116 |
117 | @Override
118 | public boolean isRecording() {
119 | return recordAllData;
120 | }
121 |
122 | /**
123 | * Adds a sample to a histogram.
124 | *
125 | * @param x The value to add.
126 | */
127 | @Override
128 | public void add(double x) {
129 | add(x, 1);
130 | }
131 |
132 | @Override
133 | public void add(TDigest other) {
134 | for (Centroid centroid : other.centroids()) {
135 | add(centroid.mean(), centroid.count(), centroid);
136 | }
137 | }
138 |
139 | protected Centroid createCentroid(double mean, int id) {
140 | return new Centroid(mean, id, recordAllData);
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Centroid.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.ObjectInputStream;
22 | import java.io.Serializable;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.concurrent.atomic.AtomicInteger;
26 |
27 | /**
28 | * A single centroid which represents a number of data points.
29 | */
30 | public class Centroid implements Comparable, Serializable {
31 | private static final AtomicInteger uniqueCount = new AtomicInteger(1);
32 |
33 | private double centroid = 0;
34 | private int count = 0;
35 |
36 | // The ID is transient because it must be unique within a given JVM. A new
37 | // ID should be generated from uniqueCount when a Centroid is deserialized.
38 | private transient int id;
39 |
40 | private List actualData = null;
41 |
42 | private Centroid(boolean record) {
43 | id = uniqueCount.getAndIncrement();
44 | if (record) {
45 | actualData = new ArrayList<>();
46 | }
47 | }
48 |
49 | public Centroid(double x) {
50 | this(false);
51 | start(x, 1, uniqueCount.getAndIncrement());
52 | }
53 |
54 | public Centroid(double x, int w) {
55 | this(false);
56 | start(x, w, uniqueCount.getAndIncrement());
57 | }
58 |
59 | public Centroid(double x, int w, int id) {
60 | this(false);
61 | start(x, w, id);
62 | }
63 |
64 | public Centroid(double x, int id, boolean record) {
65 | this(record);
66 | start(x, 1, id);
67 | }
68 |
69 | Centroid(double x, int w, List data) {
70 | this(x, w);
71 | actualData = data;
72 | }
73 |
74 | private void start(double x, int w, int id) {
75 | this.id = id;
76 | add(x, w);
77 | }
78 |
79 | public void add(double x, int w) {
80 | if (actualData != null) {
81 | actualData.add(x);
82 | }
83 | count += w;
84 | centroid += w * (x - centroid) / count;
85 | }
86 |
87 | public double mean() {
88 | return centroid;
89 | }
90 |
91 | public int count() {
92 | return count;
93 | }
94 |
95 | public int id() {
96 | return id;
97 | }
98 |
99 | @Override
100 | public String toString() {
101 | return "Centroid{" +
102 | "centroid=" + centroid +
103 | ", count=" + count +
104 | '}';
105 | }
106 |
107 | @Override
108 | public int hashCode() {
109 | return id;
110 | }
111 |
112 | @Override
113 | public int compareTo(@SuppressWarnings("NullableProblems") Centroid o) {
114 | int r = Double.compare(centroid, o.centroid);
115 | if (r == 0) {
116 | r = id - o.id;
117 | }
118 | return r;
119 | }
120 |
121 | public List data() {
122 | return actualData;
123 | }
124 |
125 | @SuppressWarnings("WeakerAccess")
126 | public void insertData(double x) {
127 | if (actualData == null) {
128 | actualData = new ArrayList<>();
129 | }
130 | actualData.add(x);
131 | }
132 |
133 | public static Centroid createWeighted(double x, int w, Iterable extends Double> data) {
134 | Centroid r = new Centroid(data != null);
135 | r.add(x, w, data);
136 | return r;
137 | }
138 |
139 | public void add(double x, int w, Iterable extends Double> data) {
140 | if (actualData != null) {
141 | if (data != null) {
142 | for (Double old : data) {
143 | actualData.add(old);
144 | }
145 | } else {
146 | actualData.add(x);
147 | }
148 | }
149 | centroid = AbstractTDigest.weightedAverage(centroid, count, x, w);
150 | count += w;
151 | }
152 |
153 | private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException {
154 | in.defaultReadObject();
155 | id = uniqueCount.getAndIncrement();
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Dist.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.util.Collection;
21 | import java.util.List;
22 |
23 | /**
24 | * Reference implementations for cdf and quantile if we have all data.
25 | */
26 | public class Dist {
27 | public static double cdf(final double x, double[] data) {
28 | return cdf(x, data, 0.5);
29 | }
30 |
31 | public static double cdf(final double x, double[] data, double w) {
32 | int n1 = 0;
33 | int n2 = 0;
34 | for (Double v : data) {
35 | n1 += (v < x) ? 1 : 0;
36 | n2 += (v == x) ? 1 : 0;
37 | }
38 | return (n1 + w * n2) / data.length;
39 | }
40 |
41 | public static double cdf(final double x, Collection data) {
42 | return cdf(x, data, 0.5);
43 | }
44 |
45 | public static double cdf(final double x, Collection data, double w) {
46 | int n1 = 0;
47 | int n2 = 0;
48 | for (Double v : data) {
49 | n1 += (v < x) ? 1 : 0;
50 | n2 += (v == x) ? 1 : 0;
51 | }
52 | return (n1 + w * n2) / data.size();
53 | }
54 |
55 | public static double quantile(final double q, double[] data) {
56 | int n = data.length;
57 | if (n == 0) {
58 | return Double.NaN;
59 | }
60 | double index = q * n;
61 | if (index < 0) {
62 | index = 0;
63 | }
64 | if (index > n - 1) {
65 | index = n - 1;
66 | }
67 | return data[(int) Math.floor(index)];
68 | }
69 |
70 | public static double quantile(final double q, List data) {
71 | int n = data.size();
72 | if (n == 0) {
73 | return Double.NaN;
74 | }
75 | double index = q * n;
76 | if (index < 0) {
77 | index = 0;
78 | }
79 | if (index > n - 1) {
80 | index = n - 1;
81 | }
82 | return data.get((int) Math.floor(index));
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/FloatHistogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.InvalidObjectException;
22 | import java.io.ObjectStreamException;
23 | import java.io.Serializable;
24 | import java.nio.ByteBuffer;
25 | import java.nio.LongBuffer;
26 |
27 | /**
28 | * Maintains histogram buckets that are constant width
29 | * in base-2 floating point representation space. This is close
30 | * to exponential binning, but should be much faster.
31 | */
32 | public class FloatHistogram extends Histogram {
33 | private int bitsOfPrecision;
34 | private int shift;
35 | private int offset;
36 |
37 |
38 | @SuppressWarnings("WeakerAccess")
39 | public FloatHistogram(double min, double max) {
40 | this(min, max, 50);
41 | }
42 |
43 | @SuppressWarnings("WeakerAccess")
44 | public FloatHistogram(double min, double max, double binsPerDecade) {
45 | super(min, max);
46 | if (max <= 2 * min) {
47 | throw new IllegalArgumentException(String.format("Illegal/nonsensical min, max (%.2f, %.2g)", min, max));
48 | }
49 | if (min <= 0 || max <= 0) {
50 | throw new IllegalArgumentException("Min and max must be positive");
51 | }
52 | if (binsPerDecade < 5 || binsPerDecade > 10000) {
53 | throw new IllegalArgumentException(
54 | String.format("Unreasonable number of bins per decade %.2g. Expected value in range [5,10000]",
55 | binsPerDecade));
56 | }
57 |
58 | // convert binsPerDecade into bins per octave, then figure out how many bits that takes
59 | bitsOfPrecision = (int) Math.ceil(Math.log(binsPerDecade * Math.log10(2)) / Math.log(2));
60 | // we keep just the required amount of the mantissa
61 | shift = 52 - bitsOfPrecision;
62 | // The exponent in a floating point number is offset
63 | offset = 0x3ff << bitsOfPrecision;
64 |
65 | setupBins(min, max);
66 | }
67 |
68 | @Override
69 | protected int bucketIndex(double x) {
70 | x = x / min;
71 | long floatBits = Double.doubleToLongBits(x);
72 | return (int) (floatBits >>> shift) - offset;
73 | }
74 |
75 | // exposed for testing
76 | @Override
77 | double lowerBound(int k) {
78 | return min * Double.longBitsToDouble((k + (0x3ffL << bitsOfPrecision)) << (52 - bitsOfPrecision)) /* / fuzz */;
79 | }
80 |
81 | @Override
82 | @SuppressWarnings("WeakerAccess")
83 | public long[] getCompressedCounts() {
84 | LongBuffer buf = LongBuffer.allocate(counts.length);
85 | Simple64.compress(buf, counts, 0, counts.length);
86 | long[] r = new long[buf.position()];
87 | buf.flip();
88 | buf.get(r);
89 | return r;
90 | }
91 |
92 | @Override
93 | @SuppressWarnings("WeakerAccess")
94 | public void writeObject(java.io.ObjectOutputStream out) throws IOException {
95 | out.writeDouble(min);
96 | out.writeDouble(max);
97 | out.writeByte(bitsOfPrecision);
98 | out.writeByte(shift);
99 |
100 | ByteBuffer buf = ByteBuffer.allocate(8 * counts.length);
101 | LongBuffer longBuffer = buf.asLongBuffer();
102 | Simple64.compress(longBuffer, counts, 0, counts.length);
103 | buf.position(8 * longBuffer.position());
104 | byte[] r = new byte[buf.position()];
105 | out.writeShort(buf.position());
106 | buf.flip();
107 | buf.get(r);
108 | out.write(r);
109 | }
110 |
111 | @Override
112 | @SuppressWarnings("WeakerAccess")
113 | public void readObject(java.io.ObjectInputStream in) throws IOException {
114 | min = in.readDouble();
115 | max = in.readDouble();
116 | bitsOfPrecision = in.readByte();
117 | shift = in.readByte();
118 | offset = 0x3ff << bitsOfPrecision;
119 |
120 | int n = in.readShort();
121 | ByteBuffer buf = ByteBuffer.allocate(n);
122 | in.readFully(buf.array(), 0, n);
123 | int binCount = bucketIndex(max) + 1;
124 | if (binCount > 10000) {
125 | throw new IllegalArgumentException(
126 | String.format("Excessive number of bins %d during deserialization = %.2g, %.2g",
127 | binCount, min, max));
128 |
129 | }
130 | counts = new long[binCount];
131 | Simple64.decompress(buf.asLongBuffer(), counts);
132 | }
133 |
134 | private void readObjectNoData() throws ObjectStreamException {
135 | throw new InvalidObjectException("Stream data required");
136 | }
137 |
138 | @Override
139 | void add(Iterable others) {
140 | for (Histogram other : others) {
141 | if (!this.getClass().equals(other.getClass())) {
142 | throw new IllegalArgumentException(String.format("Cannot add %s to FloatHistogram", others.getClass()));
143 | }
144 | FloatHistogram actual = (FloatHistogram) other;
145 | if (actual.min != min || actual.max != max || actual.counts.length != counts.length) {
146 | throw new IllegalArgumentException("Can only merge histograms with identical bounds and precision");
147 | }
148 | for (int i = 0; i < counts.length; i++) {
149 | counts[i] += other.counts[i];
150 | }
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Histogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.Serializable;
22 |
23 | /**
24 | * A Histogram is a histogram with cleverly chosen, but fixed, bin widths.
25 | *
26 | * Different implementations may provide better or worse speed or space complexity,
27 | * but each is attuned to a particular distribution or error metric.
28 | */
29 | @SuppressWarnings("WeakerAccess")
public abstract class Histogram implements Serializable {
    protected long[] counts;
    protected double min;
    protected double max;
    // NOTE(review): these two appear unused by this class, and LogHistogram
    // declares private fields of the same names that shadow them — confirm
    // whether they are still needed here.
    protected double logFactor;
    protected double logOffset;

    public Histogram(double min, double max) {
        this.min = min;
        this.max = max;
    }

    /**
     * Allocates the count array once the bucket mapping is known. Subclasses
     * call this at the end of their constructors, after bucketIndex() is usable.
     */
    protected void setupBins(double min, double max) {
        int binCount = bucketIndex(max) + 1;
        if (binCount > 10000) {
            throw new IllegalArgumentException(
                    String.format("Excessive number of bins %d resulting from min,max = %.2g, %.2g",
                            binCount, min, max));

        }
        counts = new long[binCount];
    }

    /** Increments the count of the bucket containing v. */
    public void add(double v) {
        counts[bucket(v)]++;
    }

    /** @return the lower bound of every bucket, in bucket order */
    @SuppressWarnings("WeakerAccess")
    public double[] getBounds() {
        double[] r = new double[counts.length];
        for (int i = 0; i < r.length; i++) {
            r[i] = lowerBound(i);
        }
        return r;
    }

    public long[] getCounts() {
        return counts;
    }

    // exposed for testing; clamps out-of-range values into the end buckets
    int bucket(double x) {
        if (x <= min) {
            return 0;
        } else if (x >= max) {
            return counts.length - 1;
        } else {
            return bucketIndex(x);
        }
    }

    /** Maps an in-range value to its bucket number. */
    protected abstract int bucketIndex(double x);

    // exposed for testing
    abstract double lowerBound(int k);

    @SuppressWarnings("WeakerAccess")
    abstract long[] getCompressedCounts();

    @SuppressWarnings("WeakerAccess")
    abstract void writeObject(java.io.ObjectOutputStream out) throws IOException;

    @SuppressWarnings("WeakerAccess")
    abstract void readObject(java.io.ObjectInputStream in) throws IOException;

    // Typed Iterable restored for consistency with the subclass overrides,
    // which iterate the elements as Histogram.
    abstract void add(Iterable<Histogram> others);
}
97 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/LogHistogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.ObjectInputStream;
22 | import java.io.ObjectOutputStream;
23 |
24 | import static java.lang.Math.sqrt;
25 |
26 | /**
27 | * Non-linear histogram that uses floating point representation plus a quadratic correction to
28 | * bin width to achieve tighter fit to the ideal log2 sizing.
29 | */
30 | public class LogHistogram extends Histogram {
31 | private double logFactor;
32 | private double logOffset;
33 |
34 | @SuppressWarnings("WeakerAccess")
35 | public LogHistogram(double min, double max) {
36 | this(min, max, 0.1);
37 | }
38 |
39 | @SuppressWarnings("WeakerAccess")
40 | public LogHistogram(double min, double max, double epsilonFactor) {
41 | super(min, max);
42 | logFactor = Math.log(2) / Math.log(1 + epsilonFactor);
43 | logOffset = LogHistogram.approxLog2(min) * logFactor;
44 |
45 | if (max <= 2 * min) {
46 | throw new IllegalArgumentException(String.format("Illegal/nonsensical min, max (%.2f, %.2g)", min, max));
47 | }
48 | if (min <= 0 || max <= 0) {
49 | throw new IllegalArgumentException("Min and max must be positive");
50 | }
51 | if (epsilonFactor < 1e-6 || epsilonFactor > 0.5) {
52 | throw new IllegalArgumentException(
53 | String.format("Unreasonable number of bins per decade %.2g. Expected value in range [1e-6,0.5]",
54 | epsilonFactor));
55 | }
56 |
57 | setupBins(min, max);
58 | }
59 |
60 | /**
61 | * Approximates log_2(value) by abusing floating point hardware. The floating point exponent
62 | * is used to get the integer part of the log. The mantissa is then adjusted with a second order
63 | * polynomial to get a better approximation. The error is bounded to be less than ±0.01 and is
64 | * zero at every power of two (which also implies the approximation is continuous).
65 | *
66 | * @param value The argument of the log
67 | * @return log_2(value) (within an error of about ± 0.01)
68 | */
69 | @SuppressWarnings("WeakerAccess")
70 | public static double approxLog2(double value) {
71 | final long valueBits = Double.doubleToRawLongBits(value);
72 | final long exponent = ((valueBits & 0x7ff0_0000_0000_0000L) >>> 52) - 1024;
73 | final double m = Double.longBitsToDouble((valueBits & 0x800fffffffffffffL) | 0x3ff0000000000000L);
74 | return (m * (2 - (1.0 / 3) * m) + exponent - (2.0 / 3.0));
75 | }
76 |
77 | /**
78 | * Computes an approximate value of 2^x. This is done as an exact inverse of #approxLog2 so
79 | * that bin boundaries can be computed exactly.
80 | *
81 | * @param x The power of 2 desired.
82 | * @return 2^x approximately.
83 | */
84 | @SuppressWarnings("WeakerAccess")
85 | public static double pow2(double x) {
86 | final double exponent = Math.floor(x) - 1;
87 | x = x - exponent;
88 | double m = 3 - sqrt(7 - 3 * x);
89 | return Math.pow(2, exponent + 1) * m;
90 | }
91 |
92 | @Override
93 | protected int bucketIndex(double x) {
94 | return (int) (LogHistogram.approxLog2(x) * logFactor - logOffset);
95 | }
96 |
97 | @Override
98 | double lowerBound(int k) {
99 | return LogHistogram.pow2((k + logOffset) / logFactor);
100 | }
101 |
102 | @Override
103 | long[] getCompressedCounts() {
104 | return new long[0];
105 | }
106 |
107 | @Override
108 | void writeObject(ObjectOutputStream out) throws IOException {
109 |
110 | }
111 |
112 | @Override
113 | void readObject(ObjectInputStream in) throws IOException {
114 |
115 | }
116 |
117 | @Override
118 | void add(Iterable others) {
119 | for (Histogram other : others) {
120 | if (!this.getClass().equals(other.getClass())) {
121 | throw new IllegalArgumentException(String.format("Cannot add %s to LogHistogram", others.getClass()));
122 | }
123 | LogHistogram actual = (LogHistogram) other;
124 | if (actual.min != min || actual.max != max || actual.counts.length != counts.length) {
125 | throw new IllegalArgumentException("Can only merge histograms with identical bounds and precision");
126 | }
127 | for (int i = 0; i < counts.length; i++) {
128 | counts[i] += other.counts[i];
129 | }
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/core/src/main/r/asin-approx.r:
--------------------------------------------------------------------------------
1 | ### We want a piece-wise approximation of asin(x)
2 | ### But we want to have the following constraints:
3 | ### 1) each should be completely well behaved in its range
4 | ### 2) adjacent pieces will be blended using linear approximation so their regions should overlap
5 | ### 3) the blended result should have continuity
### 4) symmetry will be handled outside this approximation
### 5) the overall range handled should start at 0 and end before 1
### 6) the overall range should be as large as possible, but need not reach 1
9 |
fit = function(param) {
    ## Fits the four overlapping pieces of a piece-wise asin(x) approximation.
    ## param holds the range break points, in order:
    ## c0.high, c1.high, c2.low, c2.high, c3.low, c3.high, c4.low
    c0.high = param[1]
    c1.high = param[2]
    c2.low = param[3]
    c2.high = param[4]
    c3.low = param[5]
    c3.high = param[6]
    c4.low = param[7]

    ## Fits one piece over [low, high]: regress asin(x) on x, x^2, x^3,
    ## 1/(1-x) and 1/(1-x)^2. (The four copies of this fit were previously
    ## duplicated inline; factored into a single helper.)
    fit.range = function(low, high) {
        x = seq(low, high, by=0.01)
        d = data.frame(y=asin(x), x=x, x2=x*x, x3=x*x*x, i1=1/(1-x), i2=1/(1-x)/(1-x))
        glm(y ~ x + x2 + x3 + i1 + i2, d, family='gaussian')
    }

    ## Note the first piece is fit symmetrically around zero.
    list(m0=fit.range(-c0.high, c0.high),
         m1=fit.range(0, c1.high),
         m2=fit.range(c2.low, c2.high),
         m3=fit.range(c3.low, c3.high),
         c0.high=c0.high, c1.high=c1.high, c2.low=c2.low, c2.high=c2.high,
         c3.low=c3.low, c3.high=c3.high, c4.low=c4.low)
}
39 |
## Evaluate the blended piece-wise approximation on [0, c3.high] and return a
## data frame with x, the blended estimate yhat, and the exact asin(x) for
## comparison.
follow = function(models) {
    x = seq(0, models$c3.high, by=0.01)
    data = data.frame(x=x, x2=x*x, x3=x*x*x, i1=1/(1-x), i2=1/(1-x)/(1-x))
    ## predictions from each fitted piece, with exact asin as the final "piece"
    raw = data.frame(
        y0=predict(models$m0, newdata=data),
        y1=predict(models$m1, newdata=data),
        y2=predict(models$m2, newdata=data),
        y3=predict(models$m3, newdata=data),
        y4=asin(x)
    )

    ## Blend weights. Each xK ramps linearly from 1 down to 0 across the overlap
    ## between piece K and piece K+1; bound() clamps the ramps to [0, 1], so the
    ## five columns of mix sum to 1 row-wise and hand off smoothly between pieces.
    ## Ramp bounds used: c0.high, (c1.high, c2.low), (c2.high, c3.low), (c3.high, c4.low)
    mix = with(models, {
        mix = matrix(0, nrow=dim(raw)[1], ncol=5)
        x0 = bound((c0.high - x) / c0.high)
        x1 = bound((c1.high - x) / (c1.high - c2.low));
        x2 = bound((c2.high - x) / (c2.high - c3.low));
        x3 = bound((c3.high - x) / (c3.high - c4.low));

        mix[, 1] = x0
        mix[, 2] = (1-x0) * x1
        mix[, 3] = (1-x1) * x2
        mix[, 4] = (1-x2) * x3
        mix[, 5] = 1-x3
        mix
    })

    ## blended estimate is the weight-averaged prediction of the five pieces
    data.frame(x=x, yhat=rowSums(raw * mix), y=asin(x))
}
69 |
## Clamp every element of v to the unit interval [0, 1].
bound = function(v) {
    pmin(pmax(v, 0), 1)
}
75 |
76 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AVLGroupTreeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.apache.mahout.common.RandomUtils;
21 | import org.junit.Before;
22 | import org.junit.Test;
23 |
public class AVLGroupTreeTest extends AbstractTest {

    @Before
    public void setUp() {
        // Fixed seed so randomized runs are reproducible.
        RandomUtils.useTestSeed();
    }

    // Bookkeeping on an empty tree and after a couple of adds: size counts
    // centroids, sum counts total weight.
    @Test
    public void testSimpleAdds() {
        AVLGroupTree x = new AVLGroupTree(false);
        // an empty tree has no floor/first/last node and zero size/sum
        assertEquals(IntAVLTree.NIL, x.floor(34));
        assertEquals(IntAVLTree.NIL, x.first());
        assertEquals(IntAVLTree.NIL, x.last());
        assertEquals(0, x.size());
        assertEquals(0, x.sum());

        x.add(new Centroid(1));
        assertEquals(1, x.sum());
        // a centroid holding three points brings the total weight to 4
        Centroid centroid = new Centroid(2);
        centroid.add(3, 1);
        centroid.add(4, 1);
        x.add(centroid);

        assertEquals(2, x.size());
        assertEquals(4, x.sum());
    }

    // Inserting in sorted order forces rotations; the internal invariants are
    // then checked explicitly.
    @Test
    public void testBalancing() {
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 101; i++) {
            x.add(new Centroid(i));
        }

        assertEquals(101, x.size());
        assertEquals(101, x.sum());

        x.checkBalance();
        x.checkAggregates();
    }

    @Test
    public void testFloor() {
        // mostly tested in other tests
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 101; i++) {
            // duplicate means (i / 2 truncates) exercise floor with repeats
            x.add(new Centroid(i / 2));
        }

        assertEquals(IntAVLTree.NIL, x.floor(-30));

        for (Centroid centroid : x) {
            // floor of (mean + 0.1) must land back on a node with that mean
            assertEquals(centroid.mean(), x.mean(x.floor(centroid.mean() + 0.1)), 0);
        }
    }

    // headSum(node) must equal the total count of all nodes preceding it in
    // mean order.
    @Test
    public void testHeadSum() {
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 1000; ++i) {
            x.add(randomDouble(), randomIntBetween(1, 10), null);
        }
        long sum = 0;
        long last = -1;
        for (int node = x.first(); node != IntAVLTree.NIL; node = x.next(node)) {
            assertEquals(sum, x.headSum(node));
            sum += x.count(node);
            // remember the count of the most recently visited node
            last = x.count(node);
        }
        // the last node visited in order must be x.last()
        assertEquals(last, x.count(x.last()));
    }

    // floorSum(i) must return the last node whose headSum does not exceed i.
    @Test
    public void testFloorSum() {
        AVLGroupTree x = new AVLGroupTree(false);
        int total = 0;
        for (int i = 0; i < 1000; ++i) {
            int count = randomIntBetween(1, 10);
            x.add(randomDouble(), count, null);
            total += count;
        }
        assertEquals(IntAVLTree.NIL, x.floorSum(-1));
        for (long i = 0; i < total + 10; ++i) {
            final int floorNode = x.floorSum(i);
            assertTrue(x.headSum(floorNode) <= i);
            // the next node (if any) must already be past i
            final int next = x.next(floorNode);
            assertTrue(next == IntAVLTree.NIL || x.headSum(next) > i);
        }
    }

}
115 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.BeforeClass;
21 |
22 | import java.io.IOException;
23 | import java.nio.ByteBuffer;
24 |
// Runs the shared TDigestTest suite against the AVLTreeDigest implementation.
public class AVLTreeDigestTest extends TDigestTest {
    @BeforeClass
    public static void setup() throws IOException {
        // tags output files produced by the shared suite with "avl-tree"
        TDigestTest.setup("avl-tree");
    }

    // Supplies AVLTreeDigest instances to the shared tests.
    protected DigestFactory factory(final double compression) {
        return new DigestFactory() {
            @Override
            public TDigest create() {
                return new AVLTreeDigest(compression);
            }
        };
    }

    @Override
    protected TDigest fromBytes(ByteBuffer bytes) {
        return AVLTreeDigest.fromBytes(bytes);
    }

    // The following overrides disable shared tests that AVLTreeDigest does not
    // currently pass.
    @Override
    public void testRepeatedValues() {
        // disabled for AVLTreeDigest for now
    }

    @Override
    public void testSingletonInACrowd() {
        // disabled for AVLTreeDigest for now
    }

    @Override
    public void singleSingleRange() {
        // disabled for AVLTreeDigest for now
    }
}
60 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AbstractTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Ignore;
21 | import org.junit.runner.RunWith;
22 |
23 | import com.carrotsearch.randomizedtesting.JUnit3MethodProvider;
24 | import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
25 | import com.carrotsearch.randomizedtesting.RandomizedTest;
26 | import com.carrotsearch.randomizedtesting.annotations.Listeners;
27 | import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
28 |
29 | @Ignore
30 | @Listeners({
31 | ReproduceInfoPrinterRunListener.class
32 | })
33 | @TestMethodProviders({
34 | JUnit3MethodProvider.class, // test names starting with test*
35 | JUnit4MethodProvider.class // test methods annotated with @Test
36 | })
37 | @RunWith(value = com.carrotsearch.randomizedtesting.RandomizedRunner.class)
38 | /**
39 | * Base test case, all other test cases must inherit this one.
40 | */
41 | public abstract class AbstractTest extends RandomizedTest {
42 |
43 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AlternativeMergeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.io.FileNotFoundException;
23 | import java.io.PrintStream;
24 | import java.io.PrintWriter;
25 | import java.io.UnsupportedEncodingException;
26 | import java.util.*;
27 |
28 | import static junit.framework.Assert.assertEquals;
29 |
30 | public class AlternativeMergeTest {
31 | /**
32 | * Computes size using the alternative scaling limit for both an idealized merge and for
33 | * a MergingDigest.
34 | *
35 | * This test does some sanity checking, but the real purpose is to create data files
36 | * sizes.csv and counts.csv
37 | * @throws FileNotFoundException If output files can't be created.
38 | */
39 | @Test
40 | public void testMerges() throws FileNotFoundException {
41 | try (PrintWriter sizes = new PrintWriter("sizes.csv");
42 | PrintWriter out = new PrintWriter("counts.csv")) {
43 | sizes.printf("algo, counts, digest, compression, n\n");
44 | out.printf("algo, compression, n, q, count\n");
45 | for (int n : new int[]{100, 1000, 10000, 100000}) {
46 | for (double compression : new double[]{50, 100, 200, 400}) {
47 | MergingDigest digest1 = new MergingDigest(compression);
48 | AVLTreeDigest digest2 = new AVLTreeDigest(compression);
49 | List data = new ArrayList<>();
50 | Random gen = new Random();
51 | for (int i = 0; i < n; i++) {
52 | double x = gen.nextDouble();
53 | data.add(x);
54 | digest1.add(x);
55 | digest2.add(x);
56 | }
57 | Collections.sort(data);
58 | List counts = new ArrayList<>();
59 | double soFar = 0;
60 | double current = 0;
61 | for (Double x : data) {
62 | double q = (soFar + (current + 1.0) / 2) / n;
63 | if (current == 0 || current + 1 < n * Math.PI / compression * Math.sqrt(q * (1 - q))) {
64 | current += 1;
65 | } else {
66 | counts.add(current);
67 | soFar += current;
68 | current = 1;
69 | }
70 | }
71 | if (current > 0) {
72 | counts.add(current);
73 | }
74 | sizes.printf("%s, %d, %d, %.0f, %d\n", "merge", counts.size(), digest1.centroids().size(), compression, n);
75 | sizes.printf("%s, %d, %d, %.0f, %d\n", "tree", counts.size(), digest2.centroids().size(), compression, n);
76 | sizes.printf("%s, %d, %d, %.0f, %d\n", "ideal", counts.size(), counts.size(), compression, n);
77 | soFar = 0;
78 | for (Double count : counts) {
79 | out.printf("%s, %.0f, %d, %.3f, %.0f\n", "ideal", compression, n, (soFar + count / 2) / n, count);
80 | soFar += count;
81 | }
82 | assertEquals(n, soFar, 0);
83 | soFar = 0;
84 | for (Centroid c : digest1.centroids()) {
85 | out.printf("%s, %.0f, %d, %.3f, %d\n", "merge", compression, n, (soFar + c.count() / 2) / n, c.count());
86 | soFar += c.count();
87 | }
88 | assertEquals(n, soFar, 0);
89 | soFar = 0;
90 | for (Centroid c : digest2.centroids()) {
91 | out.printf("%s, %.0f, %d, %.3f, %d\n", "tree", compression, n, (soFar + c.count() / 2) / n, c.count());
92 | soFar += c.count();
93 | }
94 | assertEquals(n, soFar, 0);
95 | }
96 | }
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCount.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.math.stats;
2 |
3 | import org.junit.Test;
4 |
// Shared test that checks digests whose total weight is in the billions still
// report a sensible median after repeated merging. Subclasses supply the
// concrete digest implementation via createDigest().
public abstract class BigCount extends AbstractTest {

    @Test
    public void testBigMerge() {
        TDigest digest = createDigest();
        for (int i = 0; i < 5; i++) {
            digest.add(getDigest());
            // the synthetic data in addData puts the median at 3000
            double actual = digest.quantile(0.5);
            assertEquals("Count = " + digest.size(), 3000,
                    actual, 0.001);
        }
    }

    // Builds one fully-populated digest using the synthetic data below.
    private TDigest getDigest() {
        TDigest digest = createDigest();
        addData(digest);
        return digest;
    }

    // Overridden by subclasses to choose the digest implementation under test.
    public TDigest createDigest() {
        throw new IllegalStateException("Should have over-ridden createDigest");
    }

    // Loads n = 5 * 300M + 200 = 1,500,000,200 points: five blocks of 300M
    // points at values 10..5000 plus 200 points at a huge outlier value.
    private static void addData(TDigest digest) {
        double n = 300_000_000 * 5 + 200;

        addFakeCentroids(digest, n, 300_000_000, 10);
        addFakeCentroids(digest, n, 300_000_000, 200);
        addFakeCentroids(digest, n, 300_000_000, 3000);
        addFakeCentroids(digest, n, 300_000_000, 4000);
        addFakeCentroids(digest, n, 300_000_000, 5000);
        addFakeCentroids(digest, n, 200, 47883554);

        assertEquals(n, digest.size(), 0);
    }

    // Adds `points` copies of the value x, sliced into chunks whose sizes follow
    // the digest's own scale function (k/q round trip) so the resulting centroids
    // look like the ones a real digest of n points would contain.
    private static void addFakeCentroids(TDigest digest, double n, int points, int x) {
        long base = digest.size();
        double q0 = base / n;
        long added = 0;
        while (added < points) {
            // step one unit in k-space to find the allowed chunk in q-space
            double k0 = digest.scale.k(q0, digest.compression(), n);
            double q1 = digest.scale.q(k0 + 1, digest.compression(), n);
            q1 = Math.min(q1, (base + points) / n);
            // at least one point per chunk, never more than remain
            int m = (int) Math.min(points - added, Math.max(1, Math.rint((q1 - q0) * n)));
            added += m;
            digest.add(x, m);
            q0 = q1;
        }
    }
}
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCountMergingDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
// Runs the BigCount suite against MergingDigest with compression 100.
public class BigCountMergingDigest extends BigCount {
    @Override
    public TDigest createDigest() {
        return new MergingDigest(100);
    }
}
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCountTreeDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
// Runs the BigCount suite against AVLTreeDigest with compression 100.
public class BigCountTreeDigest extends BigCount {
    @Override
    public TDigest createDigest() {
        return new AVLTreeDigest(100);
    }
}
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/DigestFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | /**
21 | * A DigestFactory is used in tests to abstract what kind of digest is being tested.
22 | */
23 | public interface DigestFactory {
24 | TDigest getDigest(double compression);
25 | }
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/FloatHistogramTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Before;
21 | import org.junit.Test;
22 |
23 | import java.io.FileNotFoundException;
24 |
// Runs the shared histogram test cases against FloatHistogram.
public class FloatHistogramTest extends HistogramTestCases {
    @Before
    public void setup() {
        // FloatHistogram uses linear bucket spacing within the shared tests
        useLinearBuckets = true;
        factory = new HistogramFactory() {
            @Override
            public Histogram create(double min, double max) {
                return new FloatHistogram(min, max);
            }
        };
    }

    // Checks bucket layout for a histogram spanning [10e-6, 5] with 20 sub-bins.
    // NOTE(review): 79/141 are expectations consumed by the shared
    // HistogramTestCases.testBinSizes — see that method for their meaning.
    @Test
    public void testBins() {
        super.testBinSizes(79, 141, new FloatHistogram(10e-6, 5, 20));
    }

    // NOTE(review): 165.4/18/212 are expectations consumed by the shared
    // HistogramTestCases.doLinear — see that method for their meaning.
    @Test
    public void testLinear() throws FileNotFoundException {
        super.doLinear(165.4, 18, 212);
    }
}
47 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/IntAVLTreeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.util.*;
23 |
24 |
25 | public class IntAVLTreeTest extends AbstractTest {
26 |
27 | static class IntBag extends IntAVLTree {
28 |
29 | int value;
30 | int[] values;
31 | int[] counts;
32 |
33 | IntBag() {
34 | values = new int[capacity()];
35 | counts = new int[capacity()];
36 | }
37 |
38 | @SuppressWarnings("WeakerAccess")
39 | public boolean addValue(int value) {
40 | this.value = value;
41 | return super.add();
42 | }
43 |
44 | @SuppressWarnings("WeakerAccess")
45 | public boolean removeValue(int value) {
46 | this.value = value;
47 | final int node = find();
48 | if (node == NIL) {
49 | return false;
50 | } else {
51 | super.remove(node);
52 | return true;
53 | }
54 | }
55 |
56 | @Override
57 | protected void resize(int newCapacity) {
58 | super.resize(newCapacity);
59 | values = Arrays.copyOf(values, newCapacity);
60 | counts = Arrays.copyOf(counts, newCapacity);
61 | }
62 |
63 | @Override
64 | protected int compare(int node) {
65 | return value - values[node];
66 | }
67 |
68 | @Override
69 | protected void copy(int node) {
70 | values[node] = value;
71 | counts[node] = 1;
72 | }
73 |
74 | @Override
75 | protected void merge(int node) {
76 | values[node] = value;
77 | counts[node]++;
78 | }
79 |
80 | }
81 |
82 | @Test
83 | public void dualAdd() {
84 | Random r = new Random(0);
85 | TreeMap map = new TreeMap<>();
86 | IntBag bag = new IntBag();
87 | for (int i = 0; i < 100000; ++i) {
88 | final int v = r.nextInt(100000);
89 | if (map.containsKey(v)) {
90 | map.put(v, map.get(v) + 1);
91 | assertFalse(bag.addValue(v));
92 | } else {
93 | map.put(v, 1);
94 | assertTrue(bag.addValue(v));
95 | }
96 | }
97 | Iterator> it = map.entrySet().iterator();
98 | for (int node = bag.first(bag.root()); node != IntAVLTree.NIL; node = bag.next(node)) {
99 | final Map.Entry next = it.next();
100 | assertEquals(next.getKey().intValue(), bag.values[node]);
101 | assertEquals(next.getValue().intValue(), bag.counts[node]);
102 | }
103 | assertFalse(it.hasNext());
104 | }
105 |
106 | @Test
107 | public void dualAddRemove() {
108 | Random r = new Random(0);
109 | TreeMap map = new TreeMap<>();
110 | IntBag bag = new IntBag();
111 | for (int i = 0; i < 100000; ++i) {
112 | final int v = r.nextInt(1000);
113 | if (r.nextBoolean()) {
114 | // add
115 | if (map.containsKey(v)) {
116 | map.put(v, map.get(v) + 1);
117 | assertFalse(bag.addValue(v));
118 | } else {
119 | map.put(v, 1);
120 | assertTrue(bag.addValue(v));
121 | }
122 | } else {
123 | // remove
124 | assertEquals(map.remove(v) != null, bag.removeValue(v));
125 | }
126 | }
127 | Iterator> it = map.entrySet().iterator();
128 | for (int node = bag.first(bag.root()); node != IntAVLTree.NIL; node = bag.next(node)) {
129 | final Map.Entry next = it.next();
130 | assertEquals(next.getKey().intValue(), bag.values[node]);
131 | assertEquals(next.getValue().intValue(), bag.counts[node]);
132 | }
133 | assertFalse(it.hasNext());
134 | }
135 |
136 | }
137 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/LogHistogramTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Before;
21 | import org.junit.Test;
22 |
23 | import java.io.FileNotFoundException;
24 | import java.io.IOException;
25 |
26 | import static org.junit.Assert.assertEquals;
27 | import static org.junit.Assert.assertTrue;
28 |
// Runs the shared histogram test cases against LogHistogram, plus tests for
// its fast approximate log2/pow2 helpers.
public class LogHistogramTest extends HistogramTestCases {
    @Before
    public void setup() {
        // LogHistogram buckets are logarithmic, not linear
        useLinearBuckets = false;
        factory = new HistogramFactory() {
            @Override
            public Histogram create(double min, double max) {
                return new LogHistogram(min, max, 0.05);
            }
        };
    }


    // approxLog2 must stay within 0.01 of true log2 across roughly 1e-6..1e6
    // (x is multiplied by 1 + pi/100 each step to avoid hitting exact powers).
    @Test
    public void testApproxLog() {
        double x = 1e-6;
        for (int i = 0; i < 1000; i++) {
            assertEquals(Math.log(x) / Math.log(2), LogHistogram.approxLog2(x), 0.01);
            x *= 1.0 + Math.PI / 100.0;
        }
        assertTrue("Insufficient range", x > 1e6);
    }

    // pow2 must invert approxLog2 to near machine precision.
    @Test
    public void testInverse() {
        for (double x = 0.001; x <= 100; x += 1e-3) {
            double log = LogHistogram.approxLog2(x);
            double roundTrip = LogHistogram.pow2(log);
            assertEquals(x, roundTrip, 1e-13);
        }

    }

    // NOTE(review): 72/129 are expectations consumed by the shared
    // HistogramTestCases.testBinSizes — see that method for their meaning.
    @Test
    public void testBins() {
        super.testBinSizes(72, 129, new LogHistogram(10e-6, 5, 0.1));
    }

    @Test
    public void testLinear() throws FileNotFoundException {
        super.doLinear(146, 17, 189);
    }

    // The following shared tests are disabled for LogHistogram.
    @Override
    public void testCompression() {
        //ignore
    }

    @Override
    public void testSerialization() {
        //ignore
    }
}
82 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/MegaMergeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import com.google.common.collect.Lists;
21 | import org.junit.Test;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.Random;
26 | import java.util.concurrent.*;
27 |
28 | import static org.junit.Assume.assumeTrue;
29 |
30 | public class MegaMergeTest {
31 |
32 | private static final int DAY = 280;
33 | private static final int WIDTH = 1000;
34 | private static final int DATA_STRIDE = 23;
35 |
36 | @Test
37 | public void testLargeMerge() throws InterruptedException, ExecutionException {
38 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
39 | // prove we can summarize a days worth of data at 5 minute intervals. Each interval has
40 | // 1000 samples each with 1500 data points
41 | double t0 = System.nanoTime() * 1e-9;
42 | // we cheat by only having 23 samples that we rotate into the data summaries
43 | // the raw data
44 | double[][] data = new double[DATA_STRIDE][1500];
45 | Random gen = new Random();
46 | for (int i = 0; i < DATA_STRIDE; i++) {
47 | for (int j = 0; j < 1500; j++) {
48 | data[i][j] = gen.nextGaussian();
49 | }
50 | }
51 | double t1 = System.nanoTime() * 1e-9;
52 | System.out.printf("Data has been generated\n");
53 | // record the basic summaries
54 | final MergingDigest[][] td = new MergingDigest[DAY][WIDTH];
55 | int m = 0;
56 | for (int i = 0; i < DAY; i++) {
57 | if (i % 10 == 0) {
58 | System.out.printf("%d\n", i);
59 | }
60 | for (int j = 0; j < WIDTH; j++) {
61 | td[i][j] = new MergingDigest(100);
62 | for (int k = 0; k < 1500; k++) {
63 | td[i][j].add(data[m][k]);
64 | }
65 | m = (m + 1) % DATA_STRIDE;
66 | }
67 | }
68 | double t2 = System.nanoTime() * 1e-9;
69 | System.out.printf("Micro-summaries filled\n");
70 | System.out.printf("%.3f,%.3f\n", t1 - t0, t2 - t1);
71 | int cores = Runtime.getRuntime().availableProcessors();
72 | System.out.printf("using %d cores\n", cores);
73 | for (int threads = 1; threads < 2 * cores; threads++) {
74 | t2 = System.nanoTime() * 1e-9;
75 | // pull the summaries together into 288 reasonably high resolution t-digests
76 | List> tasks = new ArrayList<>();
77 | for (int i = 0; i < DAY; i++) {
78 | final MergingDigest[] elements = td[i];
79 | tasks.add(new Callable() {
80 | @Override
81 | public MergingDigest call() {
82 | MergingDigest rx = new MergingDigest(100);
83 | rx.add(Lists.newArrayList(elements));
84 | return rx;
85 | }
86 | });
87 | }
88 | ExecutorService pool = Executors.newFixedThreadPool(threads);
89 | List> results = pool.invokeAll(tasks);
90 | final MergingDigest[] r = new MergingDigest[DAY];
91 | try {
92 | int i = 0;
93 | for (Future result : results) {
94 | r[i++] = result.get();
95 | }
96 | } finally {
97 | pool.shutdown();
98 | pool.awaitTermination(2, TimeUnit.SECONDS);
99 | }
100 | double t3 = System.nanoTime() * 1e-9;
101 | System.out.printf("%.3f,%.3f,%.3f,%.3f\n",
102 | r[0].quantile(0.99), r[100].quantile(0.99),
103 | r[150].quantile(0.99), r[250].quantile(0.99));
104 | System.out.printf("%d,%.3f\n", threads, t3 - t2);
105 | }
106 | }
107 |
108 | @Test
109 | public void megaMerge() {
110 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
111 | final int SUMMARIES = 1000;
112 | final int POINTS = 1000000;
113 | double t0 = System.nanoTime() * 1e-9;
114 | double[] data = new double[10013];
115 | Random gen = new Random();
116 | for (int i = 0; i < data.length; i++) {
117 | data[i] = gen.nextGaussian();
118 | }
119 | double t1 = System.nanoTime() * 1e-9;
120 | System.out.printf("Data has been generated\n");
121 |
122 | // record the basic summaries
123 | final MergingDigest[] td = new MergingDigest[SUMMARIES];
124 | int k = 0;
125 | for (int i = 0; i < SUMMARIES; i++) {
126 | if (i % 100 == 0) {
127 | System.out.printf("%d\n", i);
128 | }
129 | td[i] = new MergingDigest(200);
130 | for (int j = 0; j < POINTS; j++) {
131 | td[i].add(data[k]);
132 | k = (k + 1) % data.length;
133 | }
134 | }
135 | System.out.printf("Partials built\n");
136 | double t2 = System.nanoTime() * 1e-9;
137 |
138 | MergingDigest tAll = new MergingDigest(200);
139 | tAll.add(Lists.newArrayList(td));
140 | double t3 = System.nanoTime() * 1e-9;
141 | System.out.printf("%.3f, %.3f, %.3f\n", t1 - t0, t2 - t1, t3 - t2);
142 | }
143 | }
144 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/ReproTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.math.stats;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.Random;
7 |
8 | import static com.tdunning.math.stats.ScaleFunction.*;
9 |
10 | public class ReproTest {
11 |
12 | @Test
13 | public void testRepro() {
14 | Random gen = new Random();
15 | gen.setSeed(1);
16 | double[] data = new double[10000];
17 | for (int i = 0; i < data.length; i++) {
18 | // these samples are truncated and thus have lots of duplicates
19 | // this can wreak havoc with the t-digest invariants
20 | data[i] = Math.floor(gen.nextDouble() * 10);
21 | }
22 |
23 | for (ScaleFunction sf : ScaleFunction.values()) {
24 | if (sf.toString().contains("NO_NORM")) {
25 | continue;
26 | }
27 | TDigest distLow = new MergingDigest(100);
28 | TDigest distMedian = new MergingDigest(100);
29 | TDigest distHigh = new MergingDigest(100);
30 | for (int i = 0; i < 500; i++) {
31 | MergingDigest d1 = new MergingDigest(100);
32 | d1.setScaleFunction(K_2);
33 | for (double x : data) {
34 | d1.add(x);
35 | }
36 | d1.compress();
37 | distLow.add(d1.quantile(0.001));
38 | distMedian.add(d1.quantile(0.5));
39 | distHigh.add(d1.quantile(0.999));
40 | }
41 | Assert.assertEquals(0, distLow.quantile(0.0), 0);
42 | Assert.assertEquals(0, distLow.quantile(0.5), 0);
43 | Assert.assertEquals(0, distLow.quantile(1.0), 0);
44 | Assert.assertEquals(9, distHigh.quantile(0.0), 0);
45 | Assert.assertEquals(9, distHigh.quantile(0.5), 0);
46 | Assert.assertEquals(9, distHigh.quantile(1.0), 0);
47 | System.out.printf("%s,%.3f,%.5f,%.5f,%.5f\n",
48 | sf, 0.5,
49 | distMedian.quantile(0.01), distMedian.quantile(0.5), distMedian.quantile(0.99));
50 | }
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/ReproduceInfoPrinterRunListener.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.runner.Result;
21 | import org.junit.runner.notification.Failure;
22 | import org.junit.runner.notification.RunListener;
23 |
24 | import com.carrotsearch.randomizedtesting.RandomizedContext;
25 |
26 | public final class ReproduceInfoPrinterRunListener extends RunListener {
27 |
28 | private boolean failed = false;
29 |
30 | @Override
31 | public void testFailure(Failure failure) {
32 | failed = true;
33 | }
34 |
35 | @Override
36 | public void testRunFinished(Result result) {
37 | if (failed) {
38 | printReproLine();
39 | }
40 | failed = false;
41 | }
42 |
43 | private void printReproLine() {
44 | final StringBuilder b = new StringBuilder();
45 | b.append("NOTE: reproduce with: mvn test -Dtests.seed=").append(RandomizedContext.current().getRunnerSeedAsString());
46 | if (System.getProperty("runSlowTests") != null) {
47 | b.append(" -DrunSlowTests=").append(System.getProperty("runSlowTests"));
48 | }
49 | b.append(" -Dtests.class=").append(RandomizedContext.current().getTargetClass().getName());
50 | System.out.println(b.toString());
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/SerializationTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.nio.BufferUnderflowException;
23 | import java.nio.ByteBuffer;
24 | import java.util.Iterator;
25 |
26 | import static org.junit.Assert.assertEquals;
27 | import static org.junit.Assert.assertFalse;
28 | import static org.junit.Assert.assertTrue;
29 |
30 | public class SerializationTest {
31 | @Test
32 | public void mergingDigestSerDes() {
33 | final TDigest out = MergingDigest.createDigest(100);
34 | out.add(42.5);
35 | out.add(1);
36 | out.add(24.0);
37 |
38 | final ByteBuffer output = ByteBuffer.allocate(out.smallByteSize());
39 | out.asSmallBytes(output);
40 |
41 | ByteBuffer input = ByteBuffer.wrap(output.array());
42 | try {
43 | TDigest m = MergingDigest.fromBytes(input);
44 | for (double q = 0; q <= 1; q+=0.001) {
45 | assertEquals(m.quantile(q), out.quantile(q), 0);
46 | }
47 | Iterator ix = m.centroids().iterator();
48 | for (Centroid centroid : out.centroids()) {
49 | assertTrue(ix.hasNext());
50 | Centroid c = ix.next();
51 | assertEquals(centroid.mean(), c.mean(), 0);
52 | assertEquals(centroid.count(), c.count(), 0);
53 | }
54 | assertFalse(ix.hasNext());
55 | } catch (BufferUnderflowException e) {
56 | System.out.println("WTF?");
57 | }
58 |
59 | input = ByteBuffer.wrap(output.array());
60 | final TDigest in = MergingDigest.fromBytes(input);
61 | assertEquals(out.quantile(0.95), in.quantile(0.95), 0.001);
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/TDigestSerializationTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.io.ByteArrayInputStream;
23 | import java.io.ByteArrayOutputStream;
24 | import java.io.IOException;
25 | import java.io.ObjectInputStream;
26 | import java.io.ObjectOutputStream;
27 | import java.io.Serializable;
28 | import java.util.Iterator;
29 | import java.util.Random;
30 |
31 | import static org.junit.Assert.assertEquals;
32 | import static org.junit.Assert.assertFalse;
33 | import static org.junit.Assert.assertNotNull;
34 |
35 | /**
36 | * Verifies that the various TDigest implementations can be serialized.
37 | *
38 | * Serializability is important, for example, if we want to use t-digests with Spark.
39 | */
40 | public class TDigestSerializationTest {
41 | @Test
42 | public void testMergingDigest() throws IOException {
43 | assertSerializesAndDeserializes(new MergingDigest(100));
44 | }
45 |
46 | @Test
47 | public void testAVLTreeDigest() throws IOException {
48 | assertSerializesAndDeserializes(new AVLTreeDigest(100));
49 | }
50 |
51 | private void assertSerializesAndDeserializes(T tdigest) throws IOException {
52 | assertNotNull(deserialize(serialize(tdigest)));
53 |
54 | final Random gen = new Random();
55 | for (int i = 0; i < 100000; i++) {
56 | tdigest.add(gen.nextDouble());
57 | }
58 | T roundTrip = deserialize(serialize(tdigest));
59 |
60 | assertTDigestEquals(tdigest, roundTrip);
61 | }
62 |
63 | private static byte[] serialize(Serializable obj) throws IOException {
64 | ByteArrayOutputStream baos = new ByteArrayOutputStream(5120);
65 | try (ObjectOutputStream out = new ObjectOutputStream(baos)){
66 | out.writeObject(obj);
67 | return baos.toByteArray();
68 | }
69 | }
70 |
71 | private static T deserialize(byte[] objectData) throws IOException {
72 | try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(objectData))) {
73 | //noinspection unchecked
74 | return (T) in.readObject();
75 | } catch (ClassCastException | ClassNotFoundException | IOException e) {
76 | throw new IOException(e);
77 | }
78 | }
79 |
80 | private void assertTDigestEquals(TDigest t1, TDigest t2) {
81 | assertEquals(t1.getMin(), t2.getMin(), 0);
82 | assertEquals(t1.getMax(), t2.getMax(), 0);
83 | Iterator cx = t2.centroids().iterator();
84 | for (Centroid c1 : t1.centroids()) {
85 | Centroid c2 = cx.next();
86 | assertEquals(c1.count(), c2.count());
87 | assertEquals(c1.mean(), c2.mean(), 1e-10);
88 | }
89 | assertFalse(cx.hasNext());
90 | assertNotNull(t2);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/TDigestUtilTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.nio.ByteBuffer;
21 | import java.util.List;
22 | import java.util.Random;
23 |
24 | import org.junit.Test;
25 |
26 | import com.google.common.collect.Lists;
27 |
28 | public class TDigestUtilTest extends AbstractTest {
29 |
30 | @Test
31 | public void testIntEncoding() {
32 | Random gen = getRandom();
33 | ByteBuffer buf = ByteBuffer.allocate(10000);
34 | List ref = Lists.newArrayList();
35 | for (int i = 0; i < 3000; i++) {
36 | int n = gen.nextInt();
37 | n = n >>> (i / 100);
38 | ref.add(n);
39 | AbstractTDigest.encode(buf, n);
40 | }
41 |
42 | buf.flip();
43 |
44 | for (int i = 0; i < 3000; i++) {
45 | int n = AbstractTDigest.decode(buf);
46 | assertEquals(String.format("%d:", i), ref.get(i).intValue(), n);
47 | }
48 | }
49 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/scale/ScaleTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.scale;
19 |
20 | import com.google.common.collect.Lists;
21 | import org.junit.Test;
22 |
23 | import java.util.List;
24 |
25 | import static org.junit.Assume.assumeTrue;
26 |
27 | /**
28 | * Tests scaling properties of t-digest variants
29 | */
30 | public class ScaleTest {
31 | @Test
32 | public void testGrowth() {
33 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
34 | for (Limit limit : new Limit[]{
35 | new RootLinearLimit(), new RootLimit(),
36 | new StandardLimit(), new LinearLimit(), new PiecewiseLinearLimit(0.05),
37 | new PiecewiseLinearLimit(0.1), new PiecewiseLinearLimit(0.2),
38 | }) {
39 | for (long n : new long[]{1000, 10000, 100000, 1000000L, 10000000L, 100000000L, 1000000000L}) {
40 | List r = size(n, 200.0, limit);
41 | int nonTrivial = 0;
42 | for (Centroid centroid : r) {
43 | if (centroid.count > 1) {
44 | nonTrivial++;
45 | }
46 | }
47 | System.out.printf("%s\t%d\t%d\t%d\n", limit.getClass().getSimpleName(), n, r.size(), nonTrivial);
48 | }
49 | }
50 | }
51 |
52 | @SuppressWarnings("WeakerAccess")
53 | public List size(long n, @SuppressWarnings("SameParameterValue") double compression, Limit limit) {
54 | if (compression <= 0) {
55 | compression = 50;
56 | }
57 |
58 | if (limit == null) {
59 | limit = new StandardLimit();
60 | }
61 |
62 | double total = 0;
63 | long i = 0;
64 | List r = Lists.newArrayList();
65 | while (i < n) {
66 | double mean = i;
67 | int count = 1;
68 | i++;
69 | double qx = total / n;
70 |
71 | while (i < n && count + 1 <= Math.max(1, limit.limit(n, qx) / compression)) {
72 | count++;
73 | mean += (i - mean) / count;
74 | qx = (total + count / 2) / n;
75 | i++;
76 | }
77 | total += count;
78 | r.add(new Centroid(mean, count));
79 | }
80 | return r;
81 | }
82 |
83 | public static class Centroid {
84 | final double mean;
85 | final int count;
86 |
87 | @SuppressWarnings("WeakerAccess")
88 | public Centroid(double mean, int count) {
89 | this.mean = mean;
90 | this.count = count;
91 | }
92 | }
93 |
94 | public interface Limit {
95 | double limit(long n, double q);
96 | }
97 |
98 | public static class StandardLimit implements Limit {
99 | @Override
100 | public double limit(long n, double q) {
101 | return 4 * n * q * (1 - q);
102 | }
103 | }
104 |
105 | public static class RootLimit implements Limit {
106 | @Override
107 | public double limit(long n, double q) {
108 | return 2 * n * Math.sqrt(q * (1 - q));
109 | }
110 | }
111 |
112 | public static class LinearLimit implements Limit {
113 | @Override
114 | public double limit(long n, double q) {
115 | return 2 * n * Math.min(q, 1 - q);
116 | }
117 | }
118 |
119 | public static class RootLinearLimit implements Limit {
120 | @Override
121 | public double limit(long n, double q) {
122 | return n * Math.sqrt(2 * Math.min(q, 1 - q));
123 | }
124 | }
125 |
126 | public static class PowerLinearLimit implements Limit {
127 | private final double exp;
128 |
129 | public PowerLinearLimit(double exp) {
130 | this.exp = exp;
131 | }
132 |
133 | @Override
134 | public double limit(long n, double q) {
135 | return n * Math.pow(2 * Math.min(q, 1 - q), exp);
136 | }
137 | }
138 |
139 | private class PiecewiseLinearLimit implements Limit {
140 | private final double cut;
141 |
142 | PiecewiseLinearLimit(double cut) {
143 | this.cut = cut;
144 | }
145 |
146 | @Override
147 | public double limit(long n, double q) {
148 | if (q < cut) {
149 | return n * q / cut;
150 | } else if (1 - q < cut) {
151 | return limit(n, 1 - q);
152 | } else {
153 | return n;
154 | }
155 |
156 | }
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
/docs/error-uniform-delta=100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=100.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=200.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=50.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=500.png
--------------------------------------------------------------------------------
/docs/error-vs-compression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-vs-compression.png
--------------------------------------------------------------------------------
/docs/interpolation-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/interpolation-figure.png
--------------------------------------------------------------------------------
/docs/max-error-uniform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/max-error-uniform.png
--------------------------------------------------------------------------------
/docs/proofs/invariant-preservation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/proofs/invariant-preservation.pdf
--------------------------------------------------------------------------------
/docs/proofs/refs.bib:
--------------------------------------------------------------------------------
1 | @article{Gan:2018:MQS:3236187.3269475,
2 | author = {Gan, Edward and Ding, Jialin and Tai, Kai Sheng and Sharan, Vatsal and Bailis, Peter},
3 | title = {Moment-based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
4 | journal = {Proc. VLDB Endow.},
5 | issue_date = {July 2018},
6 | volume = {11},
7 | number = {11},
8 | month = jul,
9 | year = {2018},
10 | issn = {2150-8097},
11 | pages = {1647--1660},
12 | numpages = {14},
13 | url = {https://doi.org/10.14778/3236187.3236212},
14 | doi = {10.14778/3236187.3236212},
15 | acmid = {3269475},
16 | publisher = {VLDB Endowment},
17 | }
18 |
19 | @INPROCEEDINGS{Chen2000,
20 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
21 | title = {Incremental Quantile Estimation for Massive Tracking},
22 | booktitle = {In Proceedings of KDD},
23 | year = {2000},
24 | pages = {516--522}
25 | }
26 |
27 | @article{one-dimensional-k-means,
28 | id= "pmid:27942416",
29 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
30 | author= {Haizhou Wang and Mingzhou Song},
31 | journal= "The R journal",
32 | ISSN= "2073-4859",
33 | year = {2011},
34 | month={12},
35 | pages= {29-33},
36 | volume=3,
37 | issue=2,
38 | PMID = "27942416",
39 | PMCID= "PMC5148156"
40 | }
41 |
42 | @inproceedings{Greenwald-space-efficient-online-quantiles,
43 | author = {Greenwald, Michael and Khanna, Sanjeev},
44 | title = {Space-efficient Online Computation of Quantile Summaries},
45 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
46 | series = {SIGMOD '01},
47 | year = {2001},
48 | isbn = {1-58113-332-4},
49 | location = {Santa Barbara, California, USA},
50 | pages = {58--66},
51 | numpages = {9},
52 | url = {http://doi.acm.org/10.1145/375663.375670},
53 | doi = {10.1145/375663.375670},
54 | acmid = {375670},
55 | publisher = {ACM},
56 | address = {New York, NY, USA}
57 | }
58 |
59 | @article{sawzall,
60 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
61 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
62 | year = {2005},
63 | journal = {Scientific Programming Journal},
64 | pages = {277--298},
65 | volume = {13}
66 | }
67 |
68 | @article{munro1980,
69 | author = "J.I. Munro and M.S. Paterson",
70 | year = "1980",
71 | title = "Selection and sorting with limited storage",
72 | journal = "Theoretical Computer Science",
73 | volume = "12",
74 | number = "3",
75 | pages = "315 - 323",
76 | issn = "0304-3975",
77 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
78 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
79 | }
80 | @inproceedings{qdigest,
81 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
82 | year = {2004},
83 | month = {09},
84 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
85 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
86 | doi = {10.1145/1031495.1031524}
87 | }
88 |
89 | @misc{wiki_welford,
90 | title = "Algorithms for Calculating Variance, Online Algorithm",
  author =	 "The Wikimedia Foundation",
92 | year = 2018,
93 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
94 | note = "[Online; accessed 19-October-2013]"
95 | }
96 |
97 | @misc{datafu,
98 | title = "The Apache Datafu Project",
99 | author = "DFU",
100 | year = 2019,
101 | publisher = "Apache Software Foundation",
102 | howpublished = "\url{https://datafu.apache.org/}",
103 | note = "[Online; accessed 23-January-2018]"
104 | }
105 |
106 | @misc{t-digest-project,
107 | title = "The t-digest Library",
108 | author = "Ted Dunning",
109 | year = 2018,
110 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
111 | note = "[Online; accessed 23-January-2018]"
112 | }
113 |
114 | @misc{t-digest-arxiv,
115 | title = {Computing Extremely Accurate Quantiles Using t-Digests},
116 | author = "Ted Dunning and Otmar Ertl",
117 | year = 2018,
118 | howpublished = "\url{https://arxiv.org/abs/1902.04023}",
119 | note="arXiv:1902.04023 [stat.CO]"
120 | }
121 |
122 | @misc{moment-sketch-arxiv,
123 | title = {Moment-Based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
124 | author = {Edward Gan and Jialin Ding and Kai Sheng Tai and Vatsal Sharan and Peter Bailis},
125 | year = 2018,
126 | howpublished = "\url{https://arxiv.org/abs/1803.01969v1}",
127 | note = "arXiv:1803.01969v1"
128 | }
129 |
130 | @misc{github:stream,
131 | title = "Stream summarizer and cardinality estimator",
132 | author = "StreamLib",
133 | year = 2019,
134 | howpublished = "\url{https://github.com/addthis/stream-lib}",
135 | note = "[Online; accessed 11-February-2019]"
136 | }
137 |
138 | @inbook{knuth2welford,
139 | author={Donald E. Knuth},
140 | year=1998,
141 | edition=3,
142 | pages=232,
143 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
144 | publisher= {Addison-Wesley},
145 | address= {Boston}
146 | }
147 |
148 | @ARTICLE{welford62,
149 | author = {B. P. Welford },
150 | year = {1962},
151 | title = {Note on a method for calculating corrected sums of squares and products},
152 | journal = {Technometrics},
153 | pages = {419--420}
154 | }
155 |
--------------------------------------------------------------------------------
/docs/proofs/sizing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/proofs/sizing.pdf
--------------------------------------------------------------------------------
/docs/quantiles/quantiles.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/quantiles/quantiles.pdf
--------------------------------------------------------------------------------
/docs/quantiles/quantiles.tex:
--------------------------------------------------------------------------------
1 | \documentclass[11pt]{amsart}
2 | \usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
3 | \geometry{letterpaper} % ... or a4paper or a5paper or ...
4 | %\geometry{landscape} % Activate for for rotated page geometry
5 | %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
6 | \usepackage{graphicx}
7 | \usepackage{amssymb}
8 | \usepackage{epstopdf}
9 | \DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
10 |
11 | \title{Brief Article}
12 | \author{The Author}
13 | %\date{} % Activate to display a given date or no date
14 |
15 | \begin{document}
16 | \maketitle
17 | \section{Assumptions}
18 | \begin{enumerate}
19 | \item The samples for each centroid are evenly divided on each side of the centroid
20 | \item The samples between centroids are uniformly distributed
21 | \item If a centroid has $n = 2k + 1$ samples, then there are $k$ samples on each side and one at the centroid
\item If a centroid has $n = 2k$ samples, then there are $k-1/2$ samples on each side
23 | \item the first and last centroid will have only one sample
24 | \end{enumerate}
25 | \section{Equal Spacing Model}
26 | Take two centroids separated by $x$ with $n_\mathtt{ left}$ and $n_\mathtt{ right}$ samples respectively. We know the following about the samples between these centroids
27 | \begin{enumerate}
28 | \item the first and last centroids represent the minimum and maximum samples for the entire datasets
29 | \item if $n_\mathtt{left}=1$ or $n_\mathtt{right}=1$ then the unique sample for the corresponding centroid is at the centroid
30 | \item there will be $\lfloor n_\mathtt{left} / 2 \rfloor + \lfloor n_\mathtt{right} / 2 \rfloor$ samples between the centroids
31 | \item samples will be spaced $\Delta x = 2x / ( n_\mathtt {left} + n_\mathtt{ right})$ apart
32 | \item the left-most sample is at $((n_\mathtt{left} \mod 2) + 1)\Delta x / 2$ from the left centroid
\item the right-most sample is at $((n_\mathtt{right} \mod 2) + 1)\Delta x / 2$ from the right centroid
34 | \end{enumerate}
35 | %\subsection{}
36 |
37 |
38 |
39 | \end{document}
--------------------------------------------------------------------------------
/docs/r-sim-diagrams/figs.r:
--------------------------------------------------------------------------------
# Generates a sequence of numbered PDF diagrams (plot-001.pdf, plot-002.pdf, ...)
# illustrating the arcsine scale function and the resulting cluster size limits.

fig.no = 1
pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# This figure shows the mapping between q and k and how variable size clusters result.
par(cex.lab=1.5)
par(cex.axis=1.5)
scale = 30
# 16 q positions chosen so the marks are equally spaced in k, i.e. the inverse
# of k(q) = scale * (asin(2q-1)/pi + 1/2)
q.marks = (sin(seq(-pi/2+0.01,pi/2-0.01,length.out=16))+1)/2
plot(q.marks, scale*asin(2*q.marks-1)/pi+scale/2, xlim=c(0,1.05), ylim=c(-3,scale),
     lwd=2, cex=0.7,
     type='b', ylab='k', xlab='q')
# drop a gray guide line from just below each mark down past the x axis
for (i in 1:(length(q.marks))) {
  q = q.marks[i]
  lines(c(q,q), c(-3, scale*asin(2*q-1)/pi + scale/2 -1), lwd=2, col='gray')
}
dev.off()

pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# this shows the old and sqrt limits
par(cex.lab=1.5)
par(cex.axis=1.5)
q = seq(0, 1, by=0.001)
# solid: 6*q*(1-q) (old limit); dashed: (8/pi)*sqrt(q*(1-q)) (sqrt limit)
plot(q, 6*q*(1-q), type='l', lwd=2, ylab="Cluster Size")
lines(q, 8/pi*sqrt(q*(1-q)), lwd=2, lty=2)

dev.off()

pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# k(q) for the arcsine scale function scaled to a maximum of 100
par(cex.lab=1.5)
par(cex.axis=1.5)
plot(q, 100*(asin(2*q-1)/pi+0.5), type='l', lwd=2,
     ylab="k")

dev.off()
--------------------------------------------------------------------------------
/docs/r-sim-diagrams/shifts.r:
--------------------------------------------------------------------------------
# Draws a step plot of Gaussian noise around a piecewise-constant mean shift
# with a brief 5-sample spike near t = 1600; arrows mark the change points.

# draw a downward arrow marking a change point at time x
mark = function(x) {arrows(x,9, x,5, length=0.15, angle=10, lwd=2)}

z = rnorm(2000)
shift = c(rep(0, 400), rep(1.5, 600), rep(-2, 300), rep(-1, 700))
# BUG FIX: the tail was rep(395, 0) -- repeating 395 zero times -- which left
# spike 395 elements short of the 2000 needed to line up with z and shift
# (triggering length-recycling in the sum below). rep(0, 395) pads it correctly.
spike = c(rep(0,1600), rep(9, 5), rep(0, 395))
plot(z+shift+spike, type='s', xlab='Time', ylab='Shift + Noise', ylim=c(-10,10))
mark(400)
mark(1000)
mark(1300)



--------------------------------------------------------------------------------
/docs/r-sim-diagrams/sim.r:
--------------------------------------------------------------------------------
# Step-by-step simulation of naive t-digest cluster growth on normal samples,
# writing a snapshot figure (fig-001.pdf, fig-002.pdf, ...) after each batch.
# State (centroids, counts, offset, plot.offset) lives in globals updated via <<-.

x = seq(-5,5,by=0.01)
# size limit 4*N*q*(1-q)/delta with delta = 50, where q = pnorm(x)
# NOTE(review): the n argument is ignored; the total weight comes from the
# global `counts` via sum(counts) -- confirm this is intentional
sizeLimit = function(n,x) {q=pnorm(x);4*sum(counts)*q*(1-q)/50}
# add one sample to the nearest centroid if it has room under the size limit,
# otherwise start a new singleton centroid (mutates the globals)
addPoint = function(p) {
  dist = abs(centroids - p)
  k = which(min(dist) == dist)[1]
  if (counts[k] < sizeLimit(N, p)) {
    counts[k] <<- counts[k]+1
    # incremental mean update for the receiving centroid
    centroids[k] <<- centroids[k] + (p-centroids[k]) / counts[k]
  } else {
    centroids <<- c(centroids, p)
    counts <<- c(counts, 1)
  }
}

# feed the next n samples into the digest, advancing the global offset
offset = 100
step = function(n=100) {
  for (i in 1:n) {
    addPoint(samples[i+offset])
  }
  offset <<- offset + n
  counts
}

plot(x, sizeLimit(40, x), type='l')

N = 10e6
samples = rnorm(N)
# seed the digest with the first two samples...
centroids = samples[1:2]
# ...each with count 1 (the comparison yields TRUE/TRUE; +0 coerces to numeric 1s)
counts = (centroids != 1000) + 0

plot.offset = 0
# run ten batches of stepSize points, plotting counts vs. quantile (pnorm of
# the centroid position) after each batch, with the size limit overlaid
plot.stuff = function(stepSize=1000) {
  for (i in 1:10) {
    step(stepSize)
    pdf(sprintf("fig-%03d.pdf", i + plot.offset),
        width=5, height=5, pointsize=10)
    plot(pnorm(centroids[order(centroids)]), counts[order(centroids)],
         type='p', ylim=c(0, 2*max(counts)),
         pch=21, bg=rgb(0,0,0,alpha=0.1), col=rgb(0,0,0,alpha=0.1), cex=0.6);
    centers = centroids[order(centroids)]
    limits = sizeLimit(sum(counts), centers)
    lines(pnorm(centers), limits, type='l')
    dev.off()
  }
  plot.offset <<- plot.offset + 10
}

# successively larger batches: 1e3, 1e4, 1e5, 1e6 points total per call
plot.stuff(100)
plot.stuff(1000)
plot.stuff(10000)
plot.stuff(100000)
--------------------------------------------------------------------------------
/docs/simpa/declaration-of-competing-interests.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/declaration-of-competing-interests.docx
--------------------------------------------------------------------------------
/docs/simpa/figures/adaptive-threshold.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/adaptive-threshold.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/change-point.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/change-point.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/detection.r:
--------------------------------------------------------------------------------
1 | ### Draws a figure illustrating change detection in the distribution of synthetic data.
2 | ### Each dot represents a single time period with 1000 samples. Before the change,
3 | ### the data is sampled from a unit normal distribution. After the change, 20 samples
4 | ### in each time period are taken from N(3,1). Comparing counts with a chi^2 test that
5 | ### is robust to small expected counts reliably detects this shift.
6 | 
7 | ### log-likelihood ratio (G^2) test for multinomial count data, computed from entropies
8 | llr = function(k) {
9 | 2 * sum(k) * (H(k) - H(rowSums(k)) - H(colSums(k)))
10 | }
11 | H = function(k) { # sum of p*log(p) over all cells (negative Shannon entropy)
12 | N = sum(k) ;
13 | return (sum(k/N * log(k/N + (k==0)))) # (k==0) guard makes empty cells contribute log(1) = 0
14 | }
15 | 
16 | ### compare recent samples to historical by comparing counts in a range of interest
17 | analyze = function(historical, recent, cuts) {
18 | counts = data.frame(
19 | a=hist(recent, breaks=cuts, plot=F)$counts,
20 | b=hist(historical, breaks=cuts, plot=F)$counts)
21 | llr(counts) # G^2 score comparing the two count columns
22 | }
23 | 
24 | ### use fixed seed for stability of the pictures
25 | set.seed(3)
26 | ### lots of reference data
27 | historical = rnorm(100000)
28 | 
29 | ### set cuts based on historical data
30 | ### in practical systems, this step would be implemented with a t-digest
31 | cuts = c(-10, quantile(historical, probs=c(0.99, 0.999)), 20)
32 | 
33 | ### 1000 samples per time period, 2% perturbation after change
34 | n = 1000
35 | epsilon = 0.02
36 | 
37 | ### sample 60 scores without perturbation
38 | scores = rep(0,100) # 60 before-change periods + 40 after-change periods
39 | for (i in 1:60) {
40 | scores[i] = analyze(historical, c(rnorm(n)), cuts)
41 | }
42 | 
43 | ### sample 40 scores with perturbation
44 | for (i in 1:40) {
45 | scores[i + 60] = analyze(historical, c(rnorm(n * (1-epsilon)), rnorm(n * epsilon, 3)), cuts)
46 | }
47 | 
48 | ### plot the data
49 | pdf("change-point.pdf", width=5, height=4, pointsize=10)
50 | old = par('mgp')
51 | par(mgp=c(3,0.6,0))
52 | colors = c(rep(rgb(0,0,0,alpha=0.8),60), rep(rgb(1,0,0,alpha=0.8),40))
53 | plot(scores, xaxt='n', xlab=NA, ylab=NA, ylim=c(0,60), cex=1.3, pch=21, bg=colors, col=NA)
54 | abline(v=60.5, lwd=3, col=rgb(0,0,0, alpha=0.1))
55 | ### white panel holding the in-figure legend
56 | polygon(c(-1,55,55,-1,-1), c(60, 60,36,36,60), col='white')
57 | points(c(1.5, 1.5), c(55, 45), pch=21, bg=c('black', 'red'), col=NA)
58 | text(5, c(55,45), adj=0, labels=c(
59 | expression(x %~% symbol(N)(mu == 0)),
60 | expression(x %~% bgroup("[", atop(symbol(N)(mu==0) , symbol(N)(mu==3)),""))))
61 | text(30, c(55,48,41.5), c("1000 samples", "980 samples", "20 samples"), adj=0)
62 | 
63 | 
64 | mtext(expression(llr(counts)), side=2, padj=-1.3, cex=1.4)
65 | mtext("Before change", at=25, side=1, padj=1, cex=1.5)
66 | mtext("After change", at=75, side=1, padj=1, cex=1.5)
67 | par(mgp=old)
68 | dev.off()
69 | 
70 | 
71 | 
--------------------------------------------------------------------------------
/docs/simpa/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/windows.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/windows.graffle
--------------------------------------------------------------------------------
/docs/simpa/figures/windows.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/windows.pdf
--------------------------------------------------------------------------------
/docs/simpa/highlights.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/highlights.pdf
--------------------------------------------------------------------------------
/docs/simpa/highlights.tex:
--------------------------------------------------------------------------------
1 | \documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
2 | \usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
3 | \geometry{letterpaper} % ... or a4paper or a5paper or ...
4 | %\geometry{landscape} % Activate for rotated page geometry
5 | %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
6 | \usepackage{graphicx}				% Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
7 | % TeX will automatically convert eps --> pdf in pdflatex
8 | \usepackage{amssymb}
9 |
10 | %SetFonts
11 |
12 | %SetFonts
13 |
14 |
15 | \title{Highlights for: The $t$-digest: Efficient Estimates of Distributions}
16 | \author{Ted Dunning}
17 | %\date{} % Activate to display a given date or no date
18 |
19 | \begin{document}
20 | \maketitle
21 | %\section{}
22 | %\subsection{}
23 | \begin{itemize}
24 | \item The t-digest is an algorithm for accurately estimating quantiles from a compact
25 | sketch
26 |
27 | \item The t-digest is available as a library as well as embedded in popular query systems
28 |
29 | \item The t-digest allows accurate quantile estimates for data with arbitrary distributions
30 |
31 | \item The t-digest library has a simple API and no runtime dependencies, available on GitHub
32 | \end{itemize}
33 |
34 |
35 | \end{document}
36 |
37 |
--------------------------------------------------------------------------------
/docs/software-paper/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/build-figures.py:
--------------------------------------------------------------------------------
1 | import time
2 | import re
3 | import sys
4 | from glob import glob
5 | from subprocess import check_output
6 |
7 | logFile = open("figure.log", "a")
8 |
9 | def log(s):
10 | now = time.strftime("%v %T %Z")
11 | logLine = "\t".join([now, s, '\n'])
12 | logFile.write(logLine)
13 | sys.stdout.write(logLine)
14 |
15 | log('Starting figure run')
16 | for script in glob('*.r'):
17 | t0 = time.time()
18 | output = check_output(["Rscript", script])
19 | t1 = time.time()
20 | log('%(script)s took %(delta).1fs <%(output)s>' % {
21 | "script":script,
22 | "delta": (t1-t0),
23 | "output": re.sub("null device\s*\n\s+1\s?\n", "", output)})
24 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/comparison.r:
--------------------------------------------------------------------------------
1 | data = read.delim("comparison.tsv")
2 | keep = function(data, tag) { # subset: one distribution, compression 50, extreme and median quantiles only
3 | data[data$dist == tag & data$compression == 50 & data$q !=0.3 & data$q != 0.7 & data$q != 0.2 & data$q != 0.8, ]
4 | }
5 | ### three panels: digest size scatter, then error boxplots for uniform and gamma data
6 | png("qd-sizes.png", width=1800, height=700, pointsize=28)
7 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1,1)) # NOTE(review): widths has 2 entries for 3 columns — confirm recycling is intended
8 | 
9 | gray = rgb(0,0,0,0.05)
10 | 
11 | old = par(mar=c(4.5,5,3,0.5))
12 | plot(s2~s1, data, log='xy', pch=21, col=gray, bg=gray, cex=0.4,
13 | xlab="t-digest (bytes)", ylab="Q-digest (bytes)",
14 | xlim=c(100, 120000),
15 | cex.lab=1.5, xaxt='n', yaxt='n')
16 | box(lwd=3)
17 | 
18 | axis(at=c(100, 300, 1000, 3000, 10000, 30000, 100000), labels=c(100, 300, "1K", "3K", "10K", "30K", "100K"), side=1)
19 | axis(at=c(100, 300, 1000, 3000, 10000, 30000, 100000), labels=c(100, 300, "1K", "3K", "10K", "30K", "100K"), side=2)
20 | 
21 | steps = exp(seq(log(100), log(200000), by=log(2)))
22 | lines(steps, steps, col='lightgrey')
23 | lines(steps, steps/2, lty=2, col='lightgrey')
24 | lines(steps,steps*2, lty=2, col='lightgrey')
25 | ### label each cloud of points with its compression (1/delta) value
26 | for (compression in c(2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000)) {
27 | x = mean(data[data$compression == compression,]$s1) * 1.8
28 | y = mean(data[data$compression == compression,]$s2)
29 | text(x,y,compression)
30 | }
31 | text(10000, 1000, expression(1/delta), cex=1.5)
32 | 
33 | par(old)
34 | 
35 | old = par(mar=c(4.5,5,3,0.5))
36 | boxplot(1e6*e2 ~ q, keep(data, 'uniform'), at=1:7 - 0.13, boxwex=0.3, xaxt='n', yaxt='n',
37 | ylab="Quantile error (ppm)", xlab="Quantile",
38 | ylim=c(-10000, 20000), cex.lab=1.5, col=rgb(0.95, 0.95, 0.95))
39 | boxplot(1e6*e1 ~ q, keep(data, 'uniform'), col=rgb(0.4, 0.4, 0.4), at=1:7 + 0.13, add=T, boxwex=0.3, xaxt='n', yaxt='n')
40 | axis(at=1:7, labels=c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999), side=1)
41 | axis(side=2, cex.axis=2) # was cex.label, which is not a graphical parameter; cex.axis sizes the tick labels
42 | abline(h=0, lwd=2, col='gray')
43 | for (i in 1:7) {
44 | abline(v=i, lwd=1, col='lightgray', lty=2)
45 | }
46 | legend(5.5, 20000, c("Q-digest", "t-digest"), fill = c(rgb(0.95, 0.95, 0.95), rgb(0.4, 0.4, 0.4)))
47 | text(6.5, 14000, "Uniform", cex=1.5)
48 | text(6.5, 12000, expression(1/delta == 50), cex=1.5)
49 | box(lwd=3)
50 | par(old)
51 | 
52 | old = par(mar=c(4.5,5,3,0.5))
53 | boxplot(1e6*e2 ~ q, keep(data, 'gamma'), col=rgb(0.95, 0.95, 0.95), at=1:7 - 0.13, boxwex=0.3, xaxt='n',
54 | ylab="Quantile error (ppm)", xlab="Quantile",
55 | cex.lab=1.5)
56 | boxplot(1e6*e1 ~ q, keep(data, 'gamma'), col=rgb(0.4, 0.4, 0.4), at=1:7 + 0.13, add=T, boxwex=0.3, xaxt='n')
57 | axis(at=1:7, labels=c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999), side=1)
58 | abline(h=0, lwd=2, col='gray')
59 | for (i in 1:7) {
60 | abline(v=i, lwd=1, col='lightgray', lty=2)
61 | }
62 | legend(5.5, 88000, c("Q-digest", "t-digest"), fill = c(rgb(0.95, 0.95, 0.95), rgb(0.4, 0.4, 0.4)))
63 | text(6.5, 68000, expression(Gamma(0.1, 0.1)), cex=1.5)
64 | text(6.5, 62000, expression(1/delta == 50), cex=1.5)
65 | box(lwd=3)
66 | par(old)
67 | 
68 | dev.off()
69 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/errors.csv:
--------------------------------------------------------------------------------
1 | dist tag x Q error
2 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/errors.r:
--------------------------------------------------------------------------------
1 | errorData = read.delim("errors.csv")
2 | ### plotError: boxplot of quantile error (ppm) vs cumulative distribution for one input distribution; dashed lines mark +/- 1000 ppm
3 | plotError = function(dist, ylim=c(-2000, 2000), yaxt='s', ylab) {
4 | boxplot(1e6*error ~ Q, errorData[errorData$dist==dist,], ylim=ylim, lwd=4, xlab="Cumulative Distribution", ylab=ylab, pars=list(lwd.ticks=4), yaxt=yaxt)
5 | box(lwd=8)
6 | abline(h=1000, lty=2, lwd=4)
7 | abline(h=-1000, lty=2, lwd=4)
8 | }
9 | ### side-by-side panels for gamma and uniform data in a single PNG
10 | png("error.png", width=2400, height=1200, pointsize=36)
11 | layout(matrix(c(1,2), 1, 2, byrow=T), heights=c(1200, 1200), widths=c(1300,1100))
12 | #plotError('mixture', 'mixture-error.png')
13 | plotError('gamma', ylab="Error (ppm)")
14 | old = par(mar=c(5.1,0,4.1,2)) # drop the left margin so the second panel sits flush against the first
15 | plotError('uniform', yaxt='n', ylab=NA)
16 | par(old)
17 | dev.off()
18 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/figure-doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figure-doc.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/histo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/histo.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-diagram/k-q-limits.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/k-q-diagram/k-q-limits.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-diagram/slope-limiting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/k-q-diagram/slope-limiting.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-plot.r:
--------------------------------------------------------------------------------
1 | ### Draws the k vs q scale-function figure: grid lines show how equal
2 | ### steps in k map onto variable-width bins in q.
3 | n = 10
4 | fade = 'darkgray'
5 | pdf("k-q-plot.pdf", width=2.5, height=2.2, pointsize=8, family='serif')
6 | 
7 | par(mar=c(3.,3,1,1))
8 | 
9 | ### k(q) centered on zero, ranging over [-compression/2, compression/2].
10 | ### The original asin(2*q-1)/pi/2 form was half the inverse of k.to.q
11 | ### below, so the curve spanned only [-2.5, 2.5] and missed the
12 | ### unit-spaced horizontal grid lines drawn at h = i - 5.
13 | q.to.k = function(q,compression) {
14 | compression * (asin(2*q-1)/pi)
15 | }
16 | 
17 | ### inverse mapping: q at cluster boundary k, for k in 0..compression
18 | k.to.q = function(k,compression) {
19 | sin(k/compression*pi - pi/2)/2 + 0.5
20 | }
21 | 
22 | q = seq(0,1,by=0.001)
23 | 
24 | plot(q, q.to.k(q, compression=n), type='l', lwd=2, xlab=NA, ylab=NA, xaxt='n', yaxt='n')
25 | axis(side=1, at=(0:5)/5, mgp=c(1,0.5,0))
26 | title(xlab=expression(italic(q)), line=1.3, cex.lab=1.5)
27 | axis(side=2, at=seq(-5,5,by=1), mgp=c(1,0.6,0))
28 | title(ylab=expression(italic(k)), line=1.5, cex.lab=1.5)
29 | 
30 | ### unit grid in k with the corresponding variable-width grid in q;
31 | ### the lines intersect exactly on the curve at (k.to.q(i), i - 5)
32 | for (i in 0:n) {
33 | abline(h=i-5, col=fade)
34 | abline(v=k.to.q(i, compression=n), col=fade)
35 | }
36 | lines(q, q.to.k(q, compression=n), type='l', lwd=2)
37 | dev.off()
38 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/linear-interpolation.r:
--------------------------------------------------------------------------------
1 | ### Illustrates the piece-wise linear approximation of the cumulative distribution using constant size bins
2 | fade = rgb(0,0,0,alpha=0.5)
3 | dot.size = 0.7
4 | n = 10000
5 | set.seed(5)
6 | ### left panel: a single 100-sample bin gives a coarse linear segment in the tail
7 | pdf("linear-interpolation.pdf", width=6, height=2.7, pointsize=10, family='serif')
8 | layout(matrix(c(1,2),byrow=T, ncol=2), widths=c(1.1,1))
9 | u = sort(runif(n))
10 | x = log(1-u) # inverse-CDF sampling: x is distributed as -Exponential(1)
11 | x = sort(x)
12 | F = ((0:(n-1))+0.5)/n # midpoint empirical CDF
13 | par(mar=c(2.5,3,0.5,1))
14 | plot(x, F, cex=dot.size, pch=21, bg=fade, col=NA, type='b', xlim=c(-9,-4.5), ylim=c(0,0.01), xaxt='n', ylab=NA, mgp=c(1,0.5,0), xlab=NA)
15 | 
16 | axis(side=1, at=-10:-1, labels=NA)
17 | title(xlab=expression(italic(x)), line=0.8, cex.lab=1.5)
18 | title(ylab=expression(italic(q)), line=1.5, cex.lab=1.5)
19 | ### synthetic left endpoint one inter-sample gap below the smallest sample
20 | left.end = x[1] - (x[2]-x[1])
21 | 
22 | lines(c(left.end, x[100]), c(0, 0.01), lwd=2) # one straight segment spanning the first 100 samples
23 | lines(c(left.end, left.end), c(-0.0005, 0.0005), lt=1, col='black', lwd=0.5)
24 | lines(c(x[100], x[100]), c(0.0085, 0.015), lt=1, col='black', lwd=0.5)
25 | text(-7, 0.006, "100")
26 | 
27 | ###text(-5, 0.4, adj=0, "Constant size bins result in large")
28 | ###text(-5, 0.35, adj=0, "errors at extreme quantiles")
29 | 
30 | par(mar=c(2.5,1.5,0.5,1))
31 | ### right panel: same data with variable-size bins that shrink toward the tail
32 | plot(x, F, cex=dot.size, pch=21, bg=fade, col=NA, type='b', xlim=c(-9,-4.5), ylim=c(0,0.01), yaxt='n', xaxt='n')
33 | axis(side=1, at=-10:-1, labels=NA)
34 | axis(side=2, at=(0:6)/10, labels=NA)
35 | title(xlab=expression(italic(x)), line=0.8, cex.lab=1.5)
36 | title(ylab=expression(italic(q)), line=2, cex.lab=1.5)
37 | 
38 | q.to.k = function(q) {
39 | (asin(2*q-1)/pi + 1/2)
40 | }
41 | ### NOTE(review): q.to.k and k.to.q are not referenced below — confirm before removing
42 | k.to.q = function(k,compression) {
43 | sin(k/compression*pi - pi/2)/2 + 0.5
44 | }
45 | ### hand-picked bin sizes growing away from the tail (variable-size bins)
46 | weights = c(2, 8, 19, 35, 56, 81, 111)
47 | q.bin = cumsum(c(0, weights)/n)
48 | 
49 | i.bin = c(1, cumsum(weights)+1) # index of the first sample in each bin
50 | i.right = i.bin-1 # one before each bin start = last sample of the previous bin
51 | i.right = i.right[i.right > 0]
52 | m = length(i.right)
53 | i.bin = i.bin[1:m]
54 | ### bin boundaries at midpoints between adjacent bins' edge samples
55 | x.bin = c(left.end, (x[i.right[1:(m-1)]] + x[i.bin[2:m]])/2)
56 | F.bin = (i.bin-1) / n
57 | lines(x.bin, F.bin, lwd=2)
58 | dy = 0.0005 # half-height of the small boundary tick marks
59 | for (i in 1:m) {
60 | x.text = (x[i.bin[i]] + x[i.right[i]])/2
61 | y.text = (F.bin[i] + F.bin[i+1])/2
62 | x.offset = 0.3 * y.text
63 | y.offset = dy * (1 + 500*y.text)
64 | x.pos = x.text - x.offset
65 | y.pos = y.text + y.offset
66 | lines(c(x.bin[i],x.bin[i]), c(F.bin[i]-dy+0.000, F.bin[i]+dy+y.offset*0.6-0.0005), lt=1, lwd=0.5, col='black')
67 | text(x.text - x.offset, y.text + y.offset, i.right[i]-i.bin[i]+1)
68 | }
69 | ###text(-5, 0.35, adj=0, "Variable size bins keep errors")
70 | ###text(-5, 0.3, adj=0, "small at extreme quantiles")
71 | 
72 | dev.off()
73 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/data.plist:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/data.plist
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image1.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image10.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image11.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image2.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image21.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image21.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image22.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image22.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image23.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image24.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image24.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image26.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image26.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image27.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image27.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image28.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image28.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image29.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image29.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image3.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image31.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image32.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image32.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image33.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image33.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image34.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image34.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image35.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image35.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image41.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image41.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image44.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image44.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image46.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image46.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image47.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image47.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image48.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image48.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image49.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image49.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image50.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image50.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image51.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image51.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image59.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image59.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image63.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image63.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image7.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image8.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/combined.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/combined.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/refs.bib:
--------------------------------------------------------------------------------
1 | @INPROCEEDINGS{Chen2000,
2 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
3 | title = {Incremental Quantile Estimation for Massive Tracking},
4 |   booktitle = {Proceedings of KDD},
5 | year = {2000},
6 | pages = {516--522}
7 | }
8 |
9 | @article{one-dimensional-k-means,
10 | id= "pmid:27942416",
11 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
12 | author= {Haizhou Wang and Mingzhou Song},
13 |   journal= "The R Journal",
14 | ISSN= "2073-4859",
15 | year = {2011},
16 | month={12},
17 | pages= {29-33},
18 | volume=3,
19 | issue=2,
20 | PMID = "27942416",
21 | PMCID= "PMC5148156"
22 | }
23 |
24 | @inproceedings{Greenwald-space-efficient-online-quantiles,
25 | author = {Greenwald, Michael and Khanna, Sanjeev},
26 | title = {Space-efficient Online Computation of Quantile Summaries},
27 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
28 | series = {SIGMOD '01},
29 | year = {2001},
30 | isbn = {1-58113-332-4},
31 | location = {Santa Barbara, California, USA},
32 | pages = {58--66},
33 | numpages = {9},
34 | url = {http://doi.acm.org/10.1145/375663.375670},
35 | doi = {10.1145/375663.375670},
36 | acmid = {375670},
37 | publisher = {ACM},
38 | address = {New York, NY, USA}
39 | }
40 |
41 | @article{sawzall,
42 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
43 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
44 | year = {2005},
45 | journal = {Scientific Programming Journal},
46 | pages = {277--298},
47 | volume = {13}
48 | }
49 |
50 | @article{munro1980,
51 | author = "J.I. Munro and M.S. Paterson",
52 | year = "1980",
53 | title = "Selection and sorting with limited storage",
54 | journal = "Theoretical Computer Science",
55 | volume = "12",
56 | number = "3",
57 | pages = "315 - 323",
58 | issn = "0304-3975",
59 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
60 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
61 | }
62 | @inproceedings{qdigest,
63 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
64 | year = {2004},
65 | month = {09},
66 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
67 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
68 | doi = {10.1145/1031495.1031524}
69 | }
70 |
71 | @misc{wiki_welford,
72 | title = "Algorithms for Calculating Variance, Online Algorithm",
73 |   author = "The Wikimedia Foundation",
74 | year = 2018,
75 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
76 | note = "[Online; accessed 19-October-2013]"
77 | }
78 |
79 | @misc{datafu,
80 |   title = "The Apache DataFu Project",
81 | author = "DFU",
82 | year = 2019,
83 | publisher = "Apache Software Foundation",
84 | howpublished = "\url{https://datafu.apache.org/}",
85 | note = "[Online; accessed 23-January-2018]"
86 | }
87 |
88 | @misc{t-digest-project,
89 | title = "The t-digest Library",
90 | author = "Ted Dunning",
91 | year = 2018,
92 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
93 | note = "[Online; accessed 23-January-2018]"
94 | }
95 |
96 | @misc{github:stream,
97 | title = "Stream summarizer and cardinality estimator",
98 | author = "StreamLib",
99 | year = 2019,
100 | howpublished = "\url{https://github.com/addthis/stream-lib}",
101 | note = "[Online; accessed 11-February-2019]"
102 | }
103 |
104 | @inbook{knuth2welford,
105 | author={Donald E. Knuth},
106 | year=1998,
107 | edition=3,
108 | pages=232,
109 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
110 | publisher= {Addison-Wesley},
111 | address= {Boston}
112 | }
113 |
114 | @ARTICLE{welford62,
115 | author = {B. P. Welford },
116 | year = {1962},
117 | title = {Note on a method for calculating corrected sums of squares and products},
118 | journal = {Technometrics},
119 | pages = {419--420}
120 | }
--------------------------------------------------------------------------------
/docs/t-digest-paper/scaling.r:
--------------------------------------------------------------------------------
1 | data = read.delim("scaling.tsv")
2 | errors = read.delim("error-scaling.tsv")
3 | errors$kb = errors$size/1000
4 |
5 | png("scaling.png", width=1800, height=700, pointsize=28)
6 | layout(matrix(c(1,2), 1, 2, byrow=T), widths=c(1,1))
7 |
8 | old = par(mar=c(5.1,2,2.1,2))
9 | plot(size1 ~ compression, data[data$samples==10000,], log='xy',
10 | xlab=expression(1/delta), ylab="Size (kB)", ylim=c(100,100000),
11 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21,
12 | yaxt='n')
13 | axis(at=c(100,1000,10000,100000), labels=c("100 B", "1 kB", "10 kB", "100 kB"), side=2)
14 | box(lwd=3)
15 |
16 | #points(size1 ~ compression, data[data$samples==100,], log='xy',
17 | # xlab=expression(1/delta), ylab="Size (kB)",
18 | # cex=0.5, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21)
19 | #
20 | points(size1 ~ compression, data[data$samples==10,],
21 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=22)
22 |
23 | legend(2, 9e4, c("10M samples", "10k samples"), pch=c(21,22))
24 |
25 | c = seq(2,1100,by=1)
26 | m10 = lm(log(size1) ~ log(compression), data[data$samples==10,])
27 | m100 = lm(log(size1) ~ log(compression), data[data$samples==100,])
28 | m10000 = lm(log(size1) ~ log(compression), data[data$samples==10000,])
29 | lines(c, exp(predict(m10, newdata=data.frame(compression=c))), lty=2, col='lightgray')
30 | #lines(c, exp(predict(m100, newdata=data.frame(compression=c))), lty=2, col='lightgray')
31 | lines(c, exp(predict(m10000, newdata=data.frame(compression=c))), lty=2, col='lightgray')
32 | par(old)
33 |
34 | old = par(mar=c(5.1,4.2,2.1,2))
35 | plot(size2/1000 ~ samples, data[data$compression==100,], log='x', ylim=c(0,22),
36 | xlab="Samples (x1000)", ylab="Size (kB)", xaxt='n',
37 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21)
38 | axis(at=c(10,100,1000,10000), labels=c(10,100,1000,"10,000"), side=1)
39 | points(size1/1000 ~ samples, data[data$compression==100,],
40 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=22)
41 | ms2 = lm(size2/1000 ~ log(samples), data[data$compression==100,])
42 | s = seq(10, 10000, by=10)
43 | lines(s, predict(ms2, newdata=data.frame(samples=s)), lty=2, col='lightgray')
44 |
45 | ms1 = lm(size1/1000 ~ log(samples), data[data$compression==100,])
46 | lines(s, predict(ms1, newdata=data.frame(samples=s)), lty=2, col='lightgray')
47 | box(lwd=3)
48 |
49 | legend(10, 21.5, c("uncompressed", "compressed"), pch=c(21,22))
50 |
51 | par(old)
52 |
53 | dev.off()
54 |
55 | png("error-scaling.png", width=1800, height=700, pointsize=28)
56 |
57 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.18,1,1))
58 |
59 | for (q in c(0.5, 0.01, 0.001)) {
60 | if (q == 0.5) {
61 | old = par(mar=c(5.1,4.5,2.1,2))
62 | } else {
63 | old = par(mar=c(5.1,0,2.1,2))
64 | }
65 | plot(error ~ kb, errors[errors$q == q,],
66 | ylim=c(-0.05, 0.05),
67 | pch=21, bg=rgb(0,0,0,.1), col=rgb(0,0,0,.1), log='x',
68 | xlab="t-digest size (kB)", ylab="Error in q", cex.lab=1.5,
69 | yaxt='n')
70 | abline(h=0, col='lightgray', lwd=2)
71 | abline(h=0.01, col='lightgray', lwd=2, lty=2)
72 | abline(h=-0.01, col='lightgray', lwd=2, lty=2)
73 |
74 | box(lwd=3)
75 | if (q == 0.5) {
76 | axis(side=2)
77 | }
78 | text(20, 0.09, paste("q =",q))
79 | par(old)
80 | }
81 |
82 | dev.off()
83 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/scaling.tsv:
--------------------------------------------------------------------------------
1 | k samples compression size1 size2
2 | 0 10 2 172 364
3 | 0 10 5 415 928
4 | 0 10 10 573 1288
5 | 0 10 20 1027 2380
6 | 0 10 50 2117 5056
7 | 0 10 100 3846 9208
8 | 0 10 200 6666 15976
9 | 0 10 500 13781 33052
10 | 0 10 1000 22691 54436
11 | 0 100 2 190 388
12 | 0 100 5 430 928
13 | 0 100 10 765 1660
14 | 0 100 20 1506 3316
15 | 0 100 50 3076 6892
16 | 0 100 100 5650 12844
17 | 0 100 200 10008 23320
18 | 0 100 500 21419 51364
19 | 0 100 1000 38441 92236
20 | 0 1000 2 279 556
21 | 0 1000 5 598 1228
22 | 0 1000 10 1014 2104
23 | 0 1000 20 1945 4180
24 | 0 1000 50 4199 9136
25 | 0 1000 100 7467 16456
26 | 0 1000 200 14014 31144
27 | 0 1000 500 30923 69472
28 | 0 1000 1000 56465 128560
29 | 0 10000 2 314 592
30 | 0 10000 5 695 1360
31 | 0 10000 10 1238 2500
32 | 0 10000 20 2270 4576
33 | 0 10000 50 5052 10468
34 | 0 10000 100 9355 19684
35 | 0 10000 200 17839 38392
36 | 0 10000 500 40074 87664
37 | 0 10000 1000 74873 164896
38 | 1 10 2 184 388
39 | 1 10 5 376 820
40 | 1 10 10 550 1228
41 | 1 10 20 1027 2368
42 | 1 10 50 2277 5440
43 | 1 10 100 3951 9460
44 | 1 10 200 6701 16060
45 | 1 10 500 13696 32848
46 | 1 10 1000 22861 54844
47 | 1 100 2 215 448
48 | 1 100 5 511 1096
49 | 1 100 10 719 1564
50 | 1 100 20 1413 3136
51 | 1 100 50 3044 6820
52 | 1 100 100 5875 13408
53 | 1 100 200 9916 23044
54 | 1 100 500 21770 52180
55 | 1 100 1000 38301 91900
56 | 1 1000 2 309 616
57 | 1 1000 5 693 1444
58 | 1 1000 10 1004 2104
59 | 1 1000 20 1852 3988
60 | 1 1000 50 4077 8908
61 | 1 1000 100 7645 16828
62 | 1 1000 200 13876 30820
63 | 1 1000 500 31030 69736
64 | 1 1000 1000 56824 129484
65 | 1 10000 2 361 700
66 | 1 10000 5 663 1324
67 | 1 10000 10 1295 2596
68 | 1 10000 20 2420 4960
69 | 1 10000 50 5378 11152
70 | 1 10000 100 9472 19936
71 | 1 10000 200 18051 38872
72 | 1 10000 500 39994 87388
73 | 1 10000 1000 74796 164596
74 | 2 10 2 169 352
75 | 2 10 5 402 880
76 | 2 10 10 608 1360
77 | 2 10 20 1043 2392
78 | 2 10 50 2089 4984
79 | 2 10 100 3766 9016
80 | 2 10 200 6701 16060
81 | 2 10 500 13781 33052
82 | 2 10 1000 22446 53848
83 | 2 100 2 209 436
84 | 2 100 5 492 1060
85 | 2 100 10 806 1732
86 | 2 100 20 1501 3304
87 | 2 100 50 3053 6844
88 | 2 100 100 5751 13048
89 | 2 100 200 10099 23500
90 | 2 100 500 21567 51700
91 | 2 100 1000 38601 92620
92 | 2 1000 2 234 448
93 | 2 1000 5 580 1192
94 | 2 1000 10 1049 2188
95 | 2 1000 20 1806 3856
96 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/sizes.csv:
--------------------------------------------------------------------------------
1 | tag i q k actual
2 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/sizes.r:
--------------------------------------------------------------------------------
1 | data = read.delim("sizes.csv")
2 |
3 | plotGraph = function(tag, title='', showY=T) {
4 | n = max(data[data$tag == tag, ]$i)
5 | i = 1:n
6 | n2 = n/2
7 |
8 | if (showY) {
9 | yaxt = 's'
10 | } else {
11 | yaxt = 'n'
12 | }
13 |
14 | plot(actual~q, data[data$tag == tag,], cex=0.2, xaxt='n', xlim=c(0,1), ylim=c(0,1050), xlab='Quantile',
15 | ylab='Centroid size', yaxt=yaxt)
16 | title(title)
17 | box(lwd=3)
18 | axis(side=1, at=c(0,0.25, 0.5, 0.75, 1), labels=c(0.0,0.25,0.5,0.75, 1.0), lwd=3)
19 |
20 | q = seq(0,1,by=0.01)
21 | lines(q, 1000*4*q*(1-q), lwd=3, col='gray')
22 | }
23 |
24 | png("sizes.png", width=1800, height=700, pointsize=36)
25 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.21,1,1))
26 | plotGraph("uniform", title="Uniform Distribution", T)
27 | old = par(mar=c(5.1,0,4.1,2))
28 | plotGraph("gamma", title="Gamma(0.1, 0.1) Distribution", F)
29 | plotGraph("sequential", title="Sequential Distribution", F)
30 | par(old)
31 | dev.off()
32 | #plotGraph("mixture", title="Mixture Distribution")
33 |
--------------------------------------------------------------------------------
/docs/vldb/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/combined.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/combined.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/error-vs-compression-small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/error-vs-compression-small.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/qd-sizes-small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/qd-sizes-small.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/relative-error-one-panel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/relative-error-one-panel.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/vldb/refs.bib:
--------------------------------------------------------------------------------
1 | @article{Gan:2018:MQS:3236187.3269475,
2 | author = {Gan, Edward and Ding, Jialin and Tai, Kai Sheng and Sharan, Vatsal and Bailis, Peter},
3 | title = {Moment-based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
4 | journal = {Proc. VLDB Endow.},
5 | issue_date = {July 2018},
6 | volume = {11},
7 | number = {11},
8 | month = jul,
9 | year = {2018},
10 | issn = {2150-8097},
11 | pages = {1647--1660},
12 | numpages = {14},
13 | url = {https://doi.org/10.14778/3236187.3236212},
14 | doi = {10.14778/3236187.3236212},
15 | acmid = {3269475},
16 | publisher = {VLDB Endowment},
17 | }
18 |
19 | @INPROCEEDINGS{Chen2000,
20 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
21 | title = {Incremental Quantile Estimation for Massive Tracking},
22 |   booktitle = {Proceedings of KDD},
23 | year = {2000},
24 | pages = {516--522}
25 | }
26 |
27 | @article{one-dimensional-k-means,
28 | id= "pmid:27942416",
29 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
30 | author= {Haizhou Wang and Mingzhou Song},
31 |   journal= "The R Journal",
32 | ISSN= "2073-4859",
33 | year = {2011},
34 | month={12},
35 | pages= {29-33},
36 | volume=3,
37 | issue=2,
38 | PMID = "27942416",
39 | PMCID= "PMC5148156"
40 | }
41 |
42 | @inproceedings{Greenwald-space-efficient-online-quantiles,
43 | author = {Greenwald, Michael and Khanna, Sanjeev},
44 | title = {Space-efficient Online Computation of Quantile Summaries},
45 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
46 | series = {SIGMOD '01},
47 | year = {2001},
48 | isbn = {1-58113-332-4},
49 | location = {Santa Barbara, California, USA},
50 | pages = {58--66},
51 | numpages = {9},
52 | url = {http://doi.acm.org/10.1145/375663.375670},
53 | doi = {10.1145/375663.375670},
54 | acmid = {375670},
55 | publisher = {ACM},
56 | address = {New York, NY, USA}
57 | }
58 |
59 | @article{sawzall,
60 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
61 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
62 | year = {2005},
63 | journal = {Scientific Programming Journal},
64 | pages = {277--298},
65 | volume = {13}
66 | }
67 |
68 | @article{munro1980,
69 | author = "J.I. Munro and M.S. Paterson",
70 | year = "1980",
71 | title = "Selection and sorting with limited storage",
72 | journal = "Theoretical Computer Science",
73 | volume = "12",
74 | number = "3",
75 | pages = "315 - 323",
76 | issn = "0304-3975",
77 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
78 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
79 | }
80 | @inproceedings{qdigest,
81 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
82 | year = {2004},
83 | month = {09},
84 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
85 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
86 | doi = {10.1145/1031495.1031524}
87 | }
88 |
89 | @misc{wiki_welford,
90 | title = "Algorithms for Calculating Variance, Online Algorithm",
91 |   author = "The Wikimedia Foundation",
92 | year = 2018,
93 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
94 | note = "[Online; accessed 19-October-2013]"
95 | }
96 |
97 | @misc{datafu,
98 |   title = "The Apache DataFu Project",
99 | author = "DFU",
100 | year = 2019,
101 | publisher = "Apache Software Foundation",
102 | howpublished = "\url{https://datafu.apache.org/}",
103 | note = "[Online; accessed 23-January-2018]"
104 | }
105 |
106 | @misc{t-digest-project,
107 | title = "The t-digest Library",
108 | author = "Ted Dunning",
109 | year = 2018,
110 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
111 | note = "[Online; accessed 23-January-2018]"
112 | }
113 |
114 | @misc{t-digest-arxiv,
115 | title = {Computing Extremely Accurate Quantiles Using t-Digests},
116 | author = "Ted Dunning and Otmar Ertl",
117 | year = 2018,
118 | howpublished = "\url{https://arxiv.org/abs/1902.04023}",
119 | note="arXiv:1902.04023 [stat.CO]"
120 | }
121 |
122 | @misc{moment-sketch-arxiv,
123 | title = {Moment-Based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
124 | author = {Edward Gan and Jialin Ding and Kai Sheng Tai and Vatsal Sharan and Peter Bailis},
125 | year = 2018,
126 | howpublished = "\url{https://arxiv.org/abs/1803.01969v1}",
127 | note = "arXiv:1803.01969v1"
128 | }
129 |
130 | @misc{github:stream,
131 | title = "Stream summarizer and cardinality estimator",
132 | author = "StreamLib",
133 | year = 2019,
134 | howpublished = "\url{https://github.com/addthis/stream-lib}",
135 | note = "[Online; accessed 11-February-2019]"
136 | }
137 |
138 | @inbook{knuth2welford,
139 | author={Donald E. Knuth},
140 | year=1998,
141 | edition=3,
142 | pages=232,
143 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
144 | publisher= {Addison-Wesley},
145 | address= {Boston}
146 | }
147 |
148 | @ARTICLE{welford62,
149 | author = {B. P. Welford },
150 | year = {1962},
151 | title = {Note on a method for calculating corrected sums of squares and products},
152 | journal = {Technometrics},
153 | pages = {419--420}
154 | }
155 |
--------------------------------------------------------------------------------
/docs/vldb/short.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/short.pdf
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | com.tdunning
5 | t-digest-parent
6 | 3.4-SNAPSHOT
7 | T-digest Parent
8 | pom
9 |
10 |
11 | scm:git:git@github.com:tdunning/t-digest.git
12 | scm:git:git@github.com:tdunning/t-digest.git
13 | https://github.com/tdunning/t-digest
14 | HEAD
15 |
16 |
17 |
18 |
19 | tdunning
20 | Ted
21 | ted.dunning@gmail.com
22 | https://github.com/tdunning/t-digest
23 |
24 | developer
25 |
26 | -8
27 |
28 | @ted_dunning
29 |
30 |
31 |
32 |
33 |
34 |
35 | The Apache Software License, Version 2.0
36 | http://www.apache.org/licenses/LICENSE-2.0.txt
37 | repo
38 |
39 |
40 |
41 |
42 | 1.8
43 | 1.8
44 |
45 |
46 |
47 |
48 | all
49 |
50 | core
51 | quality
52 | benchmark
53 |
54 |
55 |
56 | core-only
57 |
58 | true
59 |
60 |
61 | core
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/quality/README.md:
--------------------------------------------------------------------------------
1 | Quality Testing
2 | ===============
3 |
4 | This module contains a number of programs that assess the accuracy of t-digest implementations.
5 | In the process, bounds can be set on the quality of the t-digest idea itself.
6 |
7 | The implementation of a t-digest can have a variety of subtle flaws that do not
8 | affect operation except to compromise accuracy or to increase the size of
9 | a digest. The tests in this module aim to highlight how and where an
10 | implementation may be going wrong.
11 |
12 | Cluster Overlap
13 | --------------
14 |
15 | Run `com.tdunning.tdigest.quality.AccuracyTest#testBucketFill` to
16 | generate test data and then `accuracy.r` to generate
17 | `cluster-spread.pdf` which shows how clusters don't overlap.
18 |
19 | Basic Accuracy
20 | --------------
21 |
22 | Accuracy versus Size
23 | -----
24 | ```
25 | for algorithm in algorithms:
26 | for compression in [10,20,50,100,200,500,1000]:
27 | for distribution in ["gamma", "flip-gamma", "uniform"]:
28 | add data
29 | for q in [0.0001, 0.001,0.01,0.1,0.5,0.9,0.99,0.999,0.9999]:
30 | record algorithm, compression, distribution, q, x_data, x_digest, q_data
31 | ```
32 | Bin Distribution
33 | -----
34 | ```
35 | for each algorithm:
36 | compression = 100
37 | for distribution in ["gamma", "uniform"]:
38 | add data
39 | for bin in [0.0001, 0.001,0.01,0.5]:
40 | for sample in bin:
41 | record algorithm, compression, distribution, q_bin, x
42 | ```
43 |
44 | Comparison to KLL Algorithms
45 | ==========
46 |
47 | General Considerations
48 | =======
49 |
50 | Input Distribution
51 | -------
52 |
53 | Compression Factor
54 | -------
55 |
56 | Data Size
57 | -------
58 |
59 |
60 |
--------------------------------------------------------------------------------
/quality/comparison.r:
--------------------------------------------------------------------------------
1 | left.panel = function() {
2 |
3 | comp = read.csv("qd-tree-comparison.csv")
4 | filtered = comp %>% filter(tag == "uniform")
5 | plot(q.size ~ compression, filtered, pch=21, col=NA, bg=rgb(0,0,0,alpha=0.01), log='xy', yaxt='n', xlab=expression(delta), ylab="Digest Size (bytes)", ylim=c(70,6e3), cex=0.4, xlim=c(7,1100))
6 | points(t.size ~ compression, filtered, pch=21, col=NA, bg=rgb(0,0,0,alpha=0.01), cex=0.4)
7 | lines(q.size ~ compression, filtered %>% group_by(compression) %>% summarise(q.size=mean(q.size)), type='c')
8 | lines(t.size ~ compression, filtered %>% group_by(compression) %>% summarise(t.size=mean(t.size)), type='c')
9 |
10 | axis(side=2, at=c(100, 200, 500, 1000,2e3,5e3), labels=c("100", "200", "500", "1kB","2kB","5kB"))
11 | lines(c(7,16), c(954,954), col='grey')
12 | lines(c(25,160), c(954,954), col='grey')
13 | lines(c(250,1100), c(954,954), col='grey')
14 |
15 | text(25, 2500, expression(Q-digest), cex=0.7)
16 | text(400, 4000, expression(italic(t)-digest), cex=0.7)
17 |
18 | lines(c(20, 20), c(750, 270), col='grey')
19 | lines(c(20, 20), c(180, 110), col='grey')
20 | text(20, 85, expression(delta[italic(q)]==20), cex=0.7)
21 |
22 | lines(c(200, 200), c(750, 110), col='grey')
23 | text(200, 85, expression(delta[italic(t)]==200), cex=0.7)
24 | }
25 |
# Right panel: side-by-side boxplots of absolute quantile error for Q-digest
# (compression 20, grey) and t-digest (compression 200, white) on uniform data.
# `small` adjusts legend placement/size for the stand-alone small figure.
# NOTE(review): reads `comp` from the global environment; left.panel() loads
# qd-tree-comparison.csv into a local variable only, so `comp` must already
# be loaded globally before calling this -- confirm.
right.panel = function(small=F) {
  # Q-digest errors (e2 column) drawn slightly left of each quantile position
  boxplot(e2 ~ q, comp %>% filter(tag == "uniform", compression==20 ),
      boxwex=0.3, at=1:11-0.17, xaxt='n', col='grey',
      ylab="Absolute Error", xlab=expression(italic(q)),
      ylim=c(0,0.091), lwd=0.7, cex=0.5, cex.axis=0.9)

  # t-digest errors (e1 column) drawn slightly right, overlaid on the same axes
  boxplot(e1 ~ q, comp %>% filter(tag == "uniform", compression==200 ),
      boxwex=0.3, at=1:11+0.17, xaxt='n', add=T, lwd=0.7, cex=0.5, yaxt='n')

  # quantile labels; extremes rendered in scientific notation
  axis(side=1, at=1:11,
      labels=c(expression(10^-5), expression(10^-4), expression(10^-3),
          "0.01", "0.1", "0.5",
          "0.9", "0.99", "0.999", "0.9999", "0.99999"),
      las=2, cex.axis=0.9)

  abline(h=0, col=rgb(0,0,0,alpha=0.2), lwd=1)
  # legend position/size differ between the combined and small figures
  if (small) {
    x.legend = 5.5
    cex.legend = 0.9
  } else {
    x.legend = 5.1
    cex.legend = 0.65
  }
  legend(x.legend, 0.092,
      legend=expression(italic(t)-"digest " (delta[italic(t)]==200),
          Q-"digest "(delta[italic(q)]==20)),
      fill=c('white', 'grey'), cex=cex.legend)
}
54 |
require(dplyr)
# Figure 1: two-panel comparison (size vs compression; error boxplots)
pdf(file="qd-sizes.pdf", width=4.5, height=2, pointsize=9, family='serif')
layout(matrix(c(1,2), nrow=1))

# shared graphics parameters for both panels
par(cex.axis=0.8)
par(cex.lab=1.1)
par(mar=c(3.2, 3.3, 0.2, 0.2))
par(mgp=c(2.0, 0.5, 0))
par(tcl=-0.3)
par(las=2)

left.panel()
right.panel()
dev.off()

# Figure 2: stand-alone, slightly larger rendering of the error panel only
pdf(file="qd-sizes-small.pdf", width=3, height=2.2, pointsize=9, family='serif')
par(cex.axis=0.8)
par(cex.lab=1.3)
par(mar=c(3.2, 3.3, 0.2, 0.2))
par(mgp=c(2.0, 0.5, 0))
par(tcl=-0.3)
par(las=2)

right.panel(small=T)
dev.off()
80 |
--------------------------------------------------------------------------------
/quality/fh.r:
--------------------------------------------------------------------------------
# Extract an exponent-like value from the IEEE 754 bit pattern of a double.
# Vectorized over x by recursion.
expt = function(x) {
  if (length(x) > 1) {
    sapply(x, expt)
  } else {
    # reinterpret the 8 bytes of the double as two 32-bit integers
    # NOTE(review): assumes little-endian layout so data[2] is the
    # sign/exponent word -- confirm on the target platform
    data = readBin(writeBin(x, raw(8)), "int", n=2)
    # keep only the low 10 bits of the 11-bit exponent field, shifted down
    r = bitwShiftR(bitwAnd(data[2], 0x3ff00000), 20)
    if (r >= 512) {
      # re-center the truncated 10-bit field as a signed value
      r = r-1024
    }
    return(r+1)
  }
}
13 |
# Extract the mantissa of a double by forcing its exponent to that of 1.0,
# yielding a value in [1, 2). Vectorized over x by recursion.
mantissa = function(x) {
  if (length(x) > 1) {
    sapply(x, mantissa)
  } else {
    # reinterpret the double's bytes as two 32-bit integers (see expt)
    data = readBin(writeBin(x, raw(8)), "int", n=2)
    # clear the exponent bits (-0x7ff00001 is the mask 0x800FFFFF) and
    # install the exponent pattern for 1.0 (0x3ff00000)
    data[2] = bitwOr(bitwAnd(data[2], -0x7ff00001), 0x3ff00000)
    # reinterpret the patched bytes as a double again
    readBin(writeBin(data, raw(8)), "double")
  }
}
23 |
# Approximate log2(x) as the integer exponent plus a quadratic polynomial
# correction evaluated on the mantissa (which lies in [1, 2)).
approxLog2 = function(x) {
  frac = mantissa(x)
  correction = ((6 * frac - frac * frac) - 5)/3
  expt(x) + correction
}
28 |
--------------------------------------------------------------------------------
/quality/kll-comparison.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/quality/kll-comparison.pdf
--------------------------------------------------------------------------------
/quality/merge.r:
--------------------------------------------------------------------------------
1 | require(dplyr)
2 | data = read.csv("merge.csv")
3 |
# Draw one panel of merge-error boxplots for digests built from `n` parts.
# Reads the global `data` frame (loaded from merge.csv at the top of the
# script). `yaxt='n'` suppresses the y axis for interior panels.
plotMerge = function(n, yaxt = 's') {
  if (yaxt == 'n') {
    # interior panel: no y label, narrow left margin
    ylab = NA
    old = par(mar=c(3.5, 0.5, 2, 0.2))
  } else {
    ylab = "Absolute error (ppm)"
    old = par(mar=c(3.5, 4.0, 2, 0.2))
  }
  par(las=1)
  par(lwd=0.5)
  par(cex.lab=1.3)
  par(cex.axis=0.9)
  par(mgp=c(2.4, 0.5, 0))
  par(tcl=-0.3)

  our.data = data %>% filter(type == "quantile", parts == n)
  # direct digest errors (e1), offset left; errors scaled to parts per million
  boxplot(e1*1e6 ~ q, at=(1:6)-0.23, xaxt='n', boxwex=0.19, our.data,
      ylim=c(-3000, 3000), cex=0.5, yaxt = yaxt,
      col=rgb(0.95, 0.95, 0.95),
      xlab=NA, ylab=NA)
  title(xlab=expression('Quantile '(italic(q))), mgp=c(2.2, 0.5, 0))
  title(ylab=ylab, mgp=c(2.8, 0.0, 0))
  # stratified merge errors (e3), centered
  boxplot(e3*1e6 ~ q, at=1:6, xaxt='n', boxwex=0.19, add=T, our.data,
      col=rgb(0.7, 0.7, 0.7), cex=0.5, yaxt = yaxt)
  # flat merge errors (e2), offset right
  boxplot(e2*1e6 ~ q, at=1:6+0.23, xaxt='n', boxwex=0.19, add=T, our.data,
      col=rgb(0.4, 0.4, 0.4), cex=0.5, yaxt = yaxt)
  axis(side=1, at=1:6,
      labels=c(expression(10^-3), expression(10^-2), 0.1, 0.2, 0.3, 0.5),
      )
  legend(0.13, -1300,
      expression("Direct "(delta==100),
          "Stratified merge "(delta==200,100),
          "Flat merge "(delta==100,100)),
      fill = c(rgb(0.95, 0.95, 0.95), rgb(0.7, 0.7, 0.7), rgb(0.4, 0.4, 0.4)),
      cex=0.75)
  abline(h=0, col=rgb(0.4, 0.4, 0.4))
  title(paste(n, " parts"), cex.main=1.3)
  box()
  # restore margins changed at entry
  par(old)
}
44 |
#setEPS()
# Three panels side by side: 5, 20 and 100 merged parts. The first panel is
# wider to make room for the shared y axis.
pdf("merge.pdf", width=6, height=2.4, pointsize=9, family='serif')
layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.285,1,1))
par(cex=1)

plotMerge(5, 's')
plotMerge(20, 'n')
plotMerge(100, 'n')

dev.off()
55 |
--------------------------------------------------------------------------------
/quality/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | com.tdunning
9 | t-digest-parent
10 | 3.4-SNAPSHOT
11 | ../pom.xml
12 |
13 | t-digest-quality
14 |
15 |
16 |
17 | junit
18 | junit
19 | 4.13.1
20 | test
21 |
22 |
23 | org.apache.mahout
24 | mahout-math
25 | 0.9
26 | test
27 |
28 |
29 | com.google.guava
30 | guava
31 | 32.0.0-jre
32 | test
33 |
34 |
35 | com.clearspring.analytics
36 | stream
37 | 2.5.2
38 | test
39 |
40 |
41 | org.apache.datasketches
42 | datasketches-java
43 | 2.0.0
44 | test
45 |
46 |
47 | com.tdunning
48 | t-digest
49 | ${project.parent.version}
50 |
51 |
52 |
53 |
54 |
55 |
56 | org.apache.maven.plugins
57 | maven-compiler-plugin
58 | 3.3
59 |
60 | true
61 | 1.7
62 | 1.8
63 | 1.8
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/BinFill.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.google.common.collect.Lists;
21 | import com.tdunning.math.stats.Centroid;
22 | import com.tdunning.math.stats.Dist;
23 | import com.tdunning.math.stats.MergingDigest;
24 | import com.tdunning.math.stats.ScaleFunction;
25 | import com.tdunning.math.stats.TDigest;
26 | import org.apache.mahout.math.jet.random.AbstractDistribution;
27 | import org.junit.Test;
28 |
29 | import java.io.FileNotFoundException;
30 | import java.io.PrintWriter;
31 | import java.util.Random;
32 |
33 | /**
34 | * Plots the size of each bin for various distributions and parameters.
35 | *
36 | * The bin-fill.r program can run in the same directory as this program to get some
37 | * visualization about how well clusters are filled.
38 | */
public class BinFill {
    /**
     * Builds three small merging digests from the same fixed data, one per
     * scale function K_1..K_3, and prints each digest's centroids as CSV to
     * stdout, for two values of the compression parameter delta.
     */
    @Test
    public void sampleFill() {
        System.out.printf("scale,delta,centroid,mean,count\n");
        for (double delta : new double[]{5, 10}) {
            double[] data = {0, 0, 3, 4, 1, 6, 0, 5, 2, 0, 3, 3, 2, 3, 0, 2, 5, 0, 3, 1};

            MergingDigest t1 = new MergingDigest(delta);
            t1.setScaleFunction(ScaleFunction.K_1);

            MergingDigest t2 = new MergingDigest(delta);
            t2.setScaleFunction(ScaleFunction.K_2);

            MergingDigest t3 = new MergingDigest(delta);
            t3.setScaleFunction(ScaleFunction.K_3);
            for (double x : data) {
                t1.add(x);
                t2.add(x);
                t3.add(x);
            }


            int i = 1;
            for (MergingDigest t : Lists.newArrayList(t1, t2, t3)) {
                // header line per digest: estimated vs exact 65th percentile
                System.out.printf("> %d, %.0f, %.5f, %.5f\n", i, delta, t.quantile(0.65), Dist.quantile(0.65, data));
                int j = 0;
                for (Centroid centroid : t.centroids()) {
                    System.out.printf("%d,%.0f,%d,%.5f,%d\n", i, delta, j, centroid.mean(), centroid.count());
                    j++;
                }
                i++;
            }
        }
    }

    // number of samples drawn per digest in main(); double so that ratio
    // arithmetic below stays in floating point
    private static final double N = 100000;

    /**
     * Writes bin-fill.csv describing centroid fill for every combination of
     * scale function, digest factory, input distribution and 10 passes.
     * Consumed by the bin-fill.r visualization script.
     *
     * @param args unused
     * @throws FileNotFoundException if bin-fill.csv cannot be created
     */
    public static void main(String[] args) throws FileNotFoundException {
        try (PrintWriter out = new PrintWriter("bin-fill.csv")) {
            out.printf("iteration,dist,algo,scale,q,x,k0,k1,dk,q0,q1,count,max0,max1\n");

            // for all scale functions except the non-normalized ones
            for (ScaleFunction f : ScaleFunction.values()) {
                if (f.toString().contains("NO_NORM")) {
                    continue;
                }
                System.out.printf("%s\n", f);

                // for all kinds of t-digests
                for (Util.Factory factory : Util.Factory.values()) {
                    // for different distributions of values
                    for (Util.Distribution distribution : Util.Distribution.values()) {
                        AbstractDistribution gen = distribution.create(new Random());
                        // do multiple passes
                        for (int i = 0; i < 10; i++) {
                            TDigest dist = factory.create();
                            if (dist instanceof MergingDigest) {
                                // can only set scale function on merging digest right now ...
                                // ability for TreeDigest coming soon
                                dist.setScaleFunction(f);
                            }
                            for (int j = 0; j < N; j++) {
                                dist.add(gen.nextDouble());
                            }

                            // now dump stats for the centroids
                            // q0/q1 bracket each centroid's share of total mass;
                            // k0/k1 are the corresponding scale-function values
                            double q0 = 0;
                            double k0 = 0;
                            for (Centroid c : dist.centroids()) {
                                double q1 = q0 + (double) c.count() / N;
                                double k1 = f.k(q1, dist.compression(), dist.size());
                                out.printf("%d,%s,%s,%s,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%d,%.1f,%.1f\n",
                                        i, distribution, factory, f, (q0 + q1) / 2, c.mean(),
                                        k0, k1, k1 - k0, q0, q1, c.count(),
                                        dist.size() * f.max(q0, dist.compression(), dist.size()),
                                        dist.size() * f.max(q1, dist.compression(), dist.size())
                                );
                                q0 = q1;
                                k0 = k1;
                            }
                        }
                    }
                }
            }
        }
    }
}
126 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/ComparisonTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.clearspring.analytics.stream.quantile.QDigest;
21 | import com.tdunning.math.stats.Dist;
22 | import com.tdunning.math.stats.MergingDigest;
23 | import com.tdunning.math.stats.QuantileEstimator;
24 | import com.tdunning.math.stats.TDigest;
25 | import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
26 | import org.apache.mahout.math.jet.random.Gamma;
27 | import org.apache.mahout.math.jet.random.Uniform;
28 | import org.junit.Test;
29 |
30 | import java.io.FileNotFoundException;
31 | import java.io.FileOutputStream;
32 | import java.io.PrintWriter;
33 | import java.util.Arrays;
34 | import java.util.List;
35 | import java.util.Random;
36 |
37 | /**
38 | * Compares t-digest to q-digest and traditional streaming quantile algorithms.
39 | */
40 | public class ComparisonTest {
41 | private static double M = 20;
42 |
43 | @Test
44 | public void compareToQDigest() throws FileNotFoundException {
45 | Random rand = new Random();
46 | try (PrintWriter out = new PrintWriter(new FileOutputStream("qd-tree-comparison.csv"))) {
47 | out.printf("tag,compression,q,e1,e2,t.size,q.size\n");
48 |
49 | for (int i = 0; i < M; i++) {
50 | compareQD(out, new Gamma(0.1, 0.1, rand), "gamma", 1L << 48);
51 | // the bounds for the uniform distribution are varied to avoid round off effects
52 | compareQD(out, new Uniform(0, rand.nextDouble() * 0.05 + 1.01, rand), "uniform", 1L << 48);
53 | }
54 | }
55 | }
56 |
57 | private void compareQD(PrintWriter out, AbstractContinousDistribution gen, String tag, long scale) {
58 | for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
59 | QDigest qd = new QDigest(compression);
60 | TDigest dist = new MergingDigest(compression);
61 | double[] data = new double[100000];
62 | for (int i = 0; i < 100000; i++) {
63 | double x = gen.nextDouble();
64 | dist.add(x);
65 | qd.offer((long) (x * scale));
66 | data[i] = x;
67 | }
68 | dist.compress();
69 | Arrays.sort(data);
70 |
71 | for (double q : new double[]{1e-5, 1e-4, 0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999}) {
72 | double x1 = dist.quantile(q);
73 | double x2 = (double) qd.getQuantile(q) / scale;
74 | double e1 = Dist.cdf(x1, data) - q;
75 | double e2 = Dist.cdf(x2, data) - q;
76 | out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n", tag, compression, q, e1, e2, dist.smallByteSize(), QDigest.serialize(qd).length);
77 | }
78 | }
79 | }
80 |
81 | @Test
82 | public void compareToStreamingQuantile() throws FileNotFoundException {
83 | Random rand = new Random();
84 |
85 | try (PrintWriter out = new PrintWriter(new FileOutputStream("sq-tree-comparison.csv"))) {
86 | out.printf("tag,compression,q,e1,e2,t.size,q.size\n");
87 | for (int i = 0; i < M; i++) {
88 | compareSQ(out, new Gamma(0.1, 0.1, rand), "gamma");
89 | compareSQ(out, new Uniform(0, 1, rand), "uniform");
90 | }
91 | }
92 | }
93 |
94 | private void compareSQ(PrintWriter out, AbstractContinousDistribution gen, String tag) {
95 | double[] quantiles = {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999};
96 | for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
97 | QuantileEstimator sq = new QuantileEstimator(1001);
98 | TDigest dist = new MergingDigest(compression);
99 | double[] data = new double[100000];
100 | for (int i = 0; i < 100000; i++) {
101 | double x = gen.nextDouble();
102 | dist.add(x);
103 | sq.add(x);
104 | data[i] = x;
105 | }
106 | dist.compress();
107 | Arrays.sort(data);
108 |
109 | List qz = sq.getQuantiles();
110 | for (double q : quantiles) {
111 | double x1 = dist.quantile(q);
112 | double x2 = qz.get((int) (q * 1000 + 0.5));
113 | double e1 = Dist.cdf(x1, data) - q;
114 | double e2 = Dist.cdf(x2, data) - q;
115 | out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n",
116 | tag, compression, q, e1, e2, dist.smallByteSize(), sq.serializedSize());
117 |
118 | }
119 | }
120 | }
121 |
122 | }
123 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/Git.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import java.io.BufferedReader;
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.io.InputStreamReader;
24 |
25 | /**
26 | * Functions for probing Git. Handy for marking test results against git hashes.
27 | */
/**
 * Functions for probing Git. Handy for marking test results against git hashes.
 */
class Git {
    /**
     * Returns true if the working tree has no uncommitted changes according to
     * {@code git diff-index --quiet HEAD --}, false on any error.
     */
    private static boolean isGitClean() {
        try {
            Process p = new ProcessBuilder("git", "diff-index", "--quiet", "HEAD", "--")
                    .redirectOutput(new File("/dev/null"))
                    .start();
            // fix: exitValue() throws IllegalThreadStateException unless the
            // process has already terminated; wait for completion instead
            return p.waitFor() == 0;
        } catch (IOException e) {
            return false;
        } catch (InterruptedException e) {
            // restore the interrupt flag and report the tree as not-clean
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Returns the hash of the current HEAD commit.
     *
     * @param force if true, return the hash even when there are uncommitted changes
     * @throws IOException if git cannot be run, its output is unexpected, or the
     *                     tree is dirty and {@code force} is false
     */
    static String getHash(boolean force) throws IOException {
        if (force || isGitClean()) {
            Process p = new ProcessBuilder("git", "log", "-1")
                    .start();
            try (BufferedReader stdout = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
                // output should look like "commit 01ea144ca865361be6786fd502bb554c75105e3c"
                String line = stdout.readLine();
                if (line == null || !line.startsWith("commit ")) {
                    throw new IOException("Unexpected output from git log");
                }
                return line.substring(7);
            }
        } else {
            throw new IOException("Source directory has changes that need to be committed");
        }
    }
}
52 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/ScalingTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.tdigest.quality;
2 |
3 | /**
4 | * Measurement size of t-digests versus data size and compression
5 | */
6 | public class ScalingTest {
7 | }
8 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/SinglePassTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.tdunning.math.stats.Centroid;
21 | import com.tdunning.math.stats.Dist;
22 | import com.tdunning.math.stats.MergingDigest;
23 | import com.tdunning.math.stats.ScaleFunction;
24 | import org.junit.Test;
25 |
26 | import java.io.FileNotFoundException;
27 | import java.io.FileOutputStream;
28 | import java.io.PrintWriter;
29 | import java.util.*;
30 |
31 | /**
32 | * By setting the buffer size on the MergingDigest to larger than the number of data points,
33 | * we get to see the theoretical performance of a t-digest.
34 | */
public class SinglePassTest {
    // number of samples per digest; the merge buffer is sized at 2 * N so the
    // whole data set is merged in a single sorted batch
    private static final int N = 200000;

    /**
     * This test builds t-digests in a single pass with such a large buffer that all of the data is
     * sorted in one batch. This avoids questions about the accuracy of the merging strategy and tests
     * the basic error rates from the idea of the t-digest itself.
     *
     * This test produces two data files that describe the results of the test.
     *
     * The first file is called limit-errors.csv. It contains data about the accuracy of the t-digest
     * at values of q that are evenly spaced in logit space (i.e. even spacing of log10(q/1-q)). This
     * results in points that are closely spaced near q=0 and near q=1. At each point, the value of q,
     * the corresponding quantile estimate x1=F^{-1}(q), the actual value x1 (from the data samples),
     * the round-trip quantile q1=F(x) as estimated by the t-digest and the actual round-trip quantile
     * q2 as computed from the original data are given.
     *
     * The second file is called limit-sizes.csv and gives the centroid weights and locations in terms
     * of x and q for each centroid in the t-digest. In addition, q2=F(x) and x2=F^{-1}(q) are given as
     * estimated from the original data.
     *
     * All of these tests are done under a variety of parameter settings including compression from 10 to
     * 500, centroid merging strategy and such.
     *
     * @throws FileNotFoundException If output files can't be opened.
     */
    @Test
    public void testConservativeBuild() throws FileNotFoundException {
        try (PrintWriter errors = new PrintWriter(new FileOutputStream("limit-errors.csv"));
             PrintWriter buckets = new PrintWriter(new FileOutputStream("limit-sizes.csv"))) {
            errors.printf("pass,x1,x2,q,q1,q2,error,compression,conservative\n");
            buckets.printf("pass,compression,conservative,i,q,mean,count,q2,x2\n");

            Random gen = new Random();
            for (int pass = 0; pass < 50; pass++) {
                // progress indicator; each pass sweeps all scale functions
                System.out.printf("%d\n", pass);
                for (ScaleFunction scale : ScaleFunction.values()) {
                    // skip non-normalized scale functions
                    if (scale.toString().endsWith("NO_NORM")) {
                        continue;
                    }
                    for (double compression : new double[]{20, 50, 100, 200, 300, 500}) {
                        double[] data = new double[N];
                        // buffer of 2 * N forces a single-batch merge
                        MergingDigest digest = new MergingDigest(compression, 2 * N);
                        digest.setScaleFunction(scale);

                        for (int i = 0; i < N; i++) {
                            double x = gen.nextDouble();
                            data[i] = x;
                            digest.add(x);
                        }

                        // sorted copy serves as the empirical ground truth
                        Arrays.sort(data);
                        int i = 0;
                        double sum = 0;
                        for (Centroid centroid : digest.centroids()) {
                            // q is the mid-point of the mass covered by this centroid
                            double q = (sum + centroid.count() / 2.0) / digest.size();
                            sum += centroid.count();
                            buckets.printf("%d,%.1f,%s,%d,%.12f,%.12f,%d,%.12f,%.12f\n",
                                    pass, compression, scale, i++, q, centroid.mean(), centroid.count(),
                                    Dist.cdf(centroid.mean(), data), Dist.quantile(q, data));
                        }
                        // sanity check: centroid counts should account for every sample
                        if (sum != digest.size()) {
                            System.out.printf("Oops ... total mismatch %.5f != %5d\n", sum, digest.size());
                        }

                        // probe q values evenly spaced in logit space, dense near 0 and 1
                        for (double lq = -6; lq < 6.01; lq += 0.25) {
                            double q = 1 / (1 + Math.pow(10, -lq));
                            double x1 = Dist.quantile(q, data);
                            double x2 = digest.quantile(q);
                            double q1 = digest.cdf(x1);
                            double q2 = Dist.cdf(x1, data);
                            errors.printf("%d,%.12f,%.12f,%.12f,%.12f,%.12f,%.12f,%.0f,%s\n",
                                    pass, x1, x2, q, q1, q2, Math.abs(q1 - q2) / q1, compression, scale);
                        }
                    }
                }
            }
        }
    }
}
115 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/Util.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.tdigest.quality;
2 |
3 | import com.tdunning.math.stats.AVLTreeDigest;
4 | import com.tdunning.math.stats.MergingDigest;
5 | import com.tdunning.math.stats.TDigest;
6 | import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
7 | import org.apache.mahout.math.jet.random.Gamma;
8 | import org.apache.mahout.math.jet.random.Uniform;
9 |
10 | import java.io.*;
11 | import java.util.Random;
12 |
13 | /**
14 | * Handy routings for computing cdf and quantile from a list of numbers
15 | */
/**
 * Shared factories for the digest variants and input distributions used by
 * the quality tests.
 */
class Util {
    // Each factory builds a particular flavor of t-digest. The no-argument
    // create() supplies that flavor's default compression.
    enum Factory {
        // MergingDigest with the current (alternating sort + two-level
        // compression) merge strategy
        MERGE {
            TDigest create(double compression) {
                // buffer sized at 10x compression
                MergingDigest digest = new MergingDigest(compression, (int) (10 * compression));
                digest.useAlternatingSort = true;
                digest.useTwoLevelCompression = true;
                return digest;
            }

            TDigest create(double compression, int bufferSize) {
                MergingDigest digest = new MergingDigest(compression, bufferSize);
                digest.useAlternatingSort = true;
                digest.useTwoLevelCompression = true;
                return digest;
            }
            TDigest create() {
                return create(100);
            }
        },

        // MergingDigest with the historical merge strategy, for comparison
        MERGE_OLD_STYLE {
            TDigest create(double compression) {
                MergingDigest digest = new MergingDigest(compression, (int) (10 * compression));
                digest.useAlternatingSort = false;
                digest.useTwoLevelCompression = false;
                return digest;
            }

            TDigest create(double compression, int bufferSize) {
                MergingDigest digest = new MergingDigest(compression, bufferSize);
                digest.useAlternatingSort = false;
                digest.useTwoLevelCompression = false;
                return digest;
            }
            TDigest create() {
                return create(100);
            }
        },

        // AVL-tree based digest; takes no buffer, so the bufferSize overload
        // falls through to the default implementation below
        TREE {
            TDigest create(double compression) {
                return new AVLTreeDigest(compression);
            }
            TDigest create() {
                return create(20);
            }
        };

        abstract TDigest create(double compression);
        abstract TDigest create();

        // default: ignore bufferSize for digests that have no merge buffer
        TDigest create(double compression, int bufferSize) {
            return create(compression);
        }
    }

    // Input distributions used to stress the digests: a benign uniform case
    // and a highly skewed gamma case.
    enum Distribution {
        UNIFORM {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Uniform(0, 1, gen);
            }
        },

        GAMMA {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Gamma(0.1, 0.1, gen);
            }
        };

        public abstract AbstractContinousDistribution create(Random gen);
    }
}
91 |
--------------------------------------------------------------------------------
/quality/x.r:
--------------------------------------------------------------------------------
# Experiment 1: average order statistics of small uniform samples.
# For sample sizes y = 2..40, average the deciles of runif(y) over many
# repetitions (more repetitions for smaller samples) and plot each decile's
# trajectory as sample size grows.
plot(c(), c(), ylim=c(0,40), xlim=c(0,1), yaxt='n')
axis(side=2, at=(0:4)*10, labels=(0:4)*50)
r = matrix(0, nrow = 40, ncol=11)
for (y in 2:40) {
  n = 5*y
  # scale repetitions so total work is roughly constant per sample size
  m = 50 * (10000/n)
  sum = rep(0, len=11)
  for (i in 1:m) {
    sum = sum + quantile(runif(n=y), seq(0,1,0.1))
  }
  r[y,] = sum/m
  print(y)
}
for (x in 1:11) {
  lines(r[2:40,x], 2:40, type='b')
}


# Experiment 2: behavior of extreme low quantiles (q <= 0.02) for larger
# samples, on a log x axis; each point is the mean over ~1e6 total draws.
plot(c(), c(), ylim=c(0,40), xlim=c(1e-6,0.03), yaxt='n', log='x')
axis(side=2, at=(0:4)*10, labels=(0:4)*1000)
for (i in 1:40) {
  n = i * 1000
  m = 1e6/n
  r = rep(0,4)
  for (j in 1:m) {
    r = r + quantile(runif(n), c(0, 0.001, 0.01, 0.02))
  }
  r = r/m
  points(r, rep(i, 4))
}
31 |
--------------------------------------------------------------------------------
/size-studies.r:
--------------------------------------------------------------------------------
1 | # Experiments with t-digest in R
2 |
# Classic t-digest centroid size bound: proportional to q(1-q), so centroids
# are small near the tails and largest at the median.
standard.size.bound = function(n, q) {
  spread = q * (1-q)
  return(4 * n * spread)
}
6 |
# Degenerate bound that ignores q entirely: every centroid may hold up to
# n / compression points.
constant.size.bound = function(n, q) {
  return(n)
}
10 |
# Square-root bound: tails shrink like sqrt(q(1-q)) rather than linearly.
root.size.bound = function(n, q) {
  spread = 4 * q * (1-q)
  return(n * sqrt(spread))
}
14 |
# Piecewise-linear bound: proportional to the distance from the nearer tail.
abs.size.bound = function(n, q) {
  tail.distance = min(q, 1-q)
  return(2 * n * tail.distance)
}
18 |
# Builds a t-digest-like summary from pre-sorted data by greedily filling
# centroids up to the size allowed by `size.bound` at the current quantile.
# `size.bound` may be a function or (via do.call) the name of one.
# Returns a data frame with one row per centroid: center (mean) and count.
sorted.t.digest = function (points, compression=50, size.bound = standard.size.bound) {
  points = sort(points)
  n = length(points)

  total = 0
  i = 1
  r = data.frame()
  while (i <= n) {
    # accumulate a centroid of max size
    mean = 0
    count = 0
    qx = total/n
    # fix: guard i <= n, otherwise a large bound lets the inner loop read
    # past the end of points and produce NA centroids
    while (i <= n && count + 1 <= max(1, do.call(size.bound,list(n=n, q=qx)) / compression)) {
      count = count+1
      # incremental mean update
      mean = mean + (points[i]-mean)/count
      qx = (total + count/2) / n
      i = i+1
    }
    total = total + count
    r = rbind(r, data.frame(center=c(mean), count=c(count)))
  }
  r
}
42 |
# Measure how the number of centroids grows with sample size under each of
# the candidate size bounds, using standard normal data.
size.growth = data.frame()
sample.size = c(100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000)
# bounds are passed by name; do.call() inside sorted.t.digest resolves them
bounds = c("standard.size.bound", "abs.size.bound", "root.size.bound", "constant.size.bound")
for (j in 1:length(bounds)) {
  bound = bounds[j]
  print(bound)
  for (i in 1:length(sample.size)) {
    n = sample.size[i]
    x = rnorm(n)
    cx = sorted.t.digest(x, size.bound=bound)
    # record bound name, sample size and resulting centroid count
    size.growth = rbind(size.growth, data.frame(f=bound, n=n, c=dim(cx)[1]))
    print(c(n, dim(cx)[1]))
  }
}

# Plot centroid count versus sample size; standard and abs bounds share a
# color, and abs (i == 2) is omitted from the plot loop below.
colors = rainbow(3)
colors = c(colors[1],colors[1],colors[2], colors[3])
plot(x=c(), y=c(), xlim=c(1e2,1e6), ylim=c(0,350), log="x", ylab="Centroids", xlab="Points")
for (i in c(1,3,4)) {
  # unclass(f) gives the factor level index matching the bounds ordering
  lines(c~n, size.growth[unclass(size.growth$f)==i,], col=colors[i], lwd=2, type='b', cex=0.6)
}
legend(1e2, y=350, legend=c("Standard", "Root", "Constant"), fill=rainbow(3))
65 |
# Like sorted.t.digest but with the standard q(1-q) bound inlined and without
# sorting the input first (callers are expected to pass ordered data).
# Returns a data frame with one row per centroid: center (mean) and count.
direct.t.digest = function (points, compression=50) {
  n = length(points)

  total = 0
  i = 1
  r = data.frame()
  while (i <= n) {
    # accumulate a centroid of max size
    mean = 0
    count = 0
    # fix: the quantile must be total/n, not the raw count `total`; the raw
    # count made q(1-q) negative after the first centroid, collapsing every
    # centroid to a single point
    qx = total / n
    # guard i <= n so the final centroid cannot read past the end of points
    while (i <= n && count + 1 <= max(1, 4 * n * (qx * (1-qx) / compression))) {
      count = count+1
      # incremental mean update
      mean = mean + (points[i]-mean)/count
      qx = (total + count/2) / n
      i = i+1
    }
    total = total + count
    r = rbind(r, data.frame(center=c(mean), count=c(count)))
  }
  r
}
88 |
89 |
90 |
--------------------------------------------------------------------------------