├── .github
└── workflows
│ └── maven.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── NOTICES
├── README.md
├── RELEASE-NOTES.md
├── benchmark
├── pom.xml
├── src
│ └── main
│ │ └── java
│ │ └── com
│ │ └── tdunning
│ │ ├── ApproxLogBench.java
│ │ ├── Benchmark.java
│ │ ├── FloatHistogramBench.java
│ │ ├── MergeBench.java
│ │ ├── SortBench.java
│ │ └── TDigestBench.java
└── x.r
├── core
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── tdunning
│ │ │ └── math
│ │ │ └── stats
│ │ │ ├── AVLGroupTree.java
│ │ │ ├── AVLTreeDigest.java
│ │ │ ├── AbstractTDigest.java
│ │ │ ├── Centroid.java
│ │ │ ├── Comparison.java
│ │ │ ├── Dist.java
│ │ │ ├── FloatHistogram.java
│ │ │ ├── Histogram.java
│ │ │ ├── IntAVLTree.java
│ │ │ ├── LogHistogram.java
│ │ │ ├── MergingDigest.java
│ │ │ ├── ScaleFunction.java
│ │ │ ├── Simple64.java
│ │ │ ├── Sort.java
│ │ │ └── TDigest.java
│ └── r
│ │ └── asin-approx.r
│ └── test
│ └── java
│ └── com
│ └── tdunning
│ ├── math
│ └── stats
│ │ ├── AVLGroupTreeTest.java
│ │ ├── AVLTreeDigestTest.java
│ │ ├── AbstractTest.java
│ │ ├── AlternativeMergeTest.java
│ │ ├── BigCount.java
│ │ ├── BigCountMergingDigest.java
│ │ ├── BigCountTreeDigest.java
│ │ ├── ComparisonTest.java
│ │ ├── DigestFactory.java
│ │ ├── FloatHistogramTest.java
│ │ ├── HistogramTestCases.java
│ │ ├── IntAVLTreeTest.java
│ │ ├── LogHistogramTest.java
│ │ ├── MegaMergeTest.java
│ │ ├── MergingDigestTest.java
│ │ ├── ReproTest.java
│ │ ├── ReproduceInfoPrinterRunListener.java
│ │ ├── ScaleFunctionTests.java
│ │ ├── SerializationTest.java
│ │ ├── SortTest.java
│ │ ├── TDigestSerializationTest.java
│ │ ├── TDigestTest.java
│ │ └── TDigestUtilTest.java
│ └── scale
│ └── ScaleTest.java
├── docs
├── error-uniform-delta=100.png
├── error-uniform-delta=200.png
├── error-uniform-delta=50.png
├── error-uniform-delta=500.png
├── error-vs-compression.png
├── interpolation-figure.png
├── max-error-uniform.png
├── proofs
│ ├── invariant-preservation.pdf
│ ├── invariant-preservation.tex
│ ├── refs.bib
│ ├── sizing.pdf
│ └── sizing.tex
├── quantiles
│ ├── quantiles.pdf
│ └── quantiles.tex
├── r-sim-diagrams
│ ├── figs.r
│ ├── shifts.r
│ └── sim.r
├── simpa
│ ├── declaration-of-competing-interests.docx
│ ├── figures
│ │ ├── adaptive-threshold.pdf
│ │ ├── change-point.pdf
│ │ ├── detection.r
│ │ ├── error-vs-compression.pdf
│ │ ├── windows.graffle
│ │ └── windows.pdf
│ ├── highlights.pdf
│ ├── highlights.tex
│ ├── main.tex
│ └── refs.bib
├── software-paper
│ └── figures
│ │ ├── cluster-spread.pdf
│ │ ├── endpoint.pdf
│ │ ├── interpolation.pdf
│ │ ├── k-q-plot.pdf
│ │ ├── linear-interpolation.pdf
│ │ ├── merge.pdf
│ │ ├── qd-sizes.pdf
│ │ ├── relative-error.pdf
│ │ └── singleton.pdf
├── t-digest-paper
│ ├── build-figures.py
│ ├── cell-deviation.tsv
│ ├── comparison.r
│ ├── comparison.tsv
│ ├── error-scaling.tsv
│ ├── errors-old.tsv
│ ├── errors.csv
│ ├── errors.r
│ ├── figure-doc.pdf
│ ├── figure-doc.tex
│ ├── figures
│ │ ├── cluster-spread.pdf
│ │ ├── endpoint.pdf
│ │ ├── error-vs-compression.pdf
│ │ ├── interpolation.pdf
│ │ ├── k-q-plot.pdf
│ │ ├── linear-interpolation.pdf
│ │ ├── merge.pdf
│ │ ├── qd-sizes.pdf
│ │ ├── relative-error.pdf
│ │ └── singleton.pdf
│ ├── gamma-deviation.tsv
│ ├── histo.pdf
│ ├── histo.tex
│ ├── k-q-diagram.graffle
│ ├── k-q-diagram
│ │ ├── k-q-limits.pdf
│ │ └── slope-limiting.pdf
│ ├── k-q-plot.r
│ ├── linear-interpolation.r
│ ├── merge.eps
│ ├── merge.tsv
│ ├── natbib.sty
│ ├── quantile-figures.graffle
│ │ ├── data.plist
│ │ ├── image1.pdf
│ │ ├── image10.pdf
│ │ ├── image11.pdf
│ │ ├── image2.pdf
│ │ ├── image21.pdf
│ │ ├── image22.pdf
│ │ ├── image23.pdf
│ │ ├── image24.pdf
│ │ ├── image26.pdf
│ │ ├── image27.pdf
│ │ ├── image28.pdf
│ │ ├── image29.pdf
│ │ ├── image3.pdf
│ │ ├── image31.pdf
│ │ ├── image32.pdf
│ │ ├── image33.pdf
│ │ ├── image34.pdf
│ │ ├── image35.pdf
│ │ ├── image41.pdf
│ │ ├── image44.pdf
│ │ ├── image46.pdf
│ │ ├── image47.pdf
│ │ ├── image48.pdf
│ │ ├── image49.pdf
│ │ ├── image50.pdf
│ │ ├── image51.pdf
│ │ ├── image59.pdf
│ │ ├── image63.pdf
│ │ ├── image7.pdf
│ │ └── image8.pdf
│ ├── quantile-figures
│ │ ├── combined.pdf
│ │ ├── endpoint.pdf
│ │ ├── interpolation.pdf
│ │ └── singleton.pdf
│ ├── refs.bib
│ ├── scaling.r
│ ├── scaling.tsv
│ ├── sizes.csv
│ ├── sizes.r
│ └── statsoc.cls
└── vldb
│ ├── figures
│ ├── cluster-spread.pdf
│ ├── combined.pdf
│ ├── endpoint.pdf
│ ├── error-vs-compression-small.pdf
│ ├── error-vs-compression.pdf
│ ├── interpolation.pdf
│ ├── k-q-plot.pdf
│ ├── linear-interpolation.pdf
│ ├── merge.pdf
│ ├── qd-sizes-small.pdf
│ ├── qd-sizes.pdf
│ ├── relative-error-one-panel.pdf
│ ├── relative-error.pdf
│ └── singleton.pdf
│ ├── refs.bib
│ ├── short.pdf
│ ├── short.tex
│ └── vldb.cls
├── pom.xml
├── quality
├── README.md
├── accuracy.r
├── comparison.r
├── fh.r
├── kll-comparison.pdf
├── merge.r
├── pom.xml
├── src
│ └── test
│ │ └── java
│ │ └── com
│ │ └── tdunning
│ │ ├── math
│ │ └── stats
│ │ │ └── QuantileEstimator.java
│ │ └── tdigest
│ │ └── quality
│ │ ├── AccuracyTest.java
│ │ ├── BinFill.java
│ │ ├── CompareKllTest.java
│ │ ├── ComparisonTest.java
│ │ ├── Git.java
│ │ ├── ScalingTest.java
│ │ ├── SinglePassTest.java
│ │ └── Util.java
└── x.r
└── size-studies.r
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
1 | name: Java CI
2 |
3 | on:
4 | push:
5 | pull_request:
6 |
7 | permissions:
8 | contents: read
9 |
10 | jobs:
11 | build:
12 | runs-on: ${{ matrix.os }}
13 | continue-on-error: ${{ matrix.experimental }}
14 | strategy:
15 | matrix:
16 | os: [ ubuntu-latest, windows-latest ]
17 | java: [ 8, 11 ]
18 | experimental: [ false ]
19 | # include:
20 | # - java: 18-ea
21 | # os: ubuntu-latest
22 | # experimental: true
23 |
24 | steps:
25 | - uses: actions/checkout@v3.3.0
26 | with:
27 | persist-credentials: false
28 | - name: Set up JDK ${{ matrix.java }}
29 | uses: actions/setup-java@v3.10.0
30 | with:
31 | distribution: 'temurin'
32 | java-version: ${{ matrix.java }}
33 | - name: Build with Maven
34 | run: mvn -V --no-transfer-progress clean test
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .RData
2 | .Rapp.history
3 | .DS_Store
4 | *.class
5 | target/
6 | .Rhistory
7 | docs/r-sim-diagrams/*.pdf
8 | histo.synctex.gz
9 | log
10 |
11 | # Package Files #
12 | *.jar
13 | *.war
14 | *.ear
15 |
16 | # IDEA related files
17 | .idea/
18 | *.iml
19 |
20 | # Eclipse related files
21 | .classpath
22 | .project
23 | .settings/
24 |
25 | # Latex and R related files
26 | _region_.prv/
27 | _region_*
28 | *.log
29 | *.aux
30 | *.dvi
31 | *.blg
32 | *.bbl
33 | *.synctex.gz
34 | Rplots.pdf
35 | .Rhistory
36 | *.ini
37 |
38 | # diagnostic outputs
39 | *.csv
40 | *.tsv
41 |
42 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Dunning"
5 | given-names: "Ted"
6 | orcid: "https://orcid.org/0000-0000-0000-0000" # NOTE(review): all-zeros placeholder — replace with the author's real ORCID iD
7 | title: "t-digest"
8 | version: 3.2
9 | date-released: 2017-08-06
10 | url: "https://github.com/tdunning/t-digest"
11 |
--------------------------------------------------------------------------------
/NOTICES:
--------------------------------------------------------------------------------
1 | The code for the t-digest was originally authored by Ted Dunning
2 |
3 | Adrien Grand contributed the heart of the AVLTreeDigest (https://github.com/jpountz)
4 |
5 |
--------------------------------------------------------------------------------
/RELEASE-NOTES.md:
--------------------------------------------------------------------------------
1 | Release 3.2
2 | ===========
3 | In release 3.2, the goal is to produce an update to the code given the large number of improvements since the previous release.
4 |
5 | There are a few bugs that will survive this release, most notably in the AVLTreeDigest. These have to do with large numbers of repeated data points and are not new bugs.
6 |
7 | There is also a lot of work going on with serialization. I need to hear from people about what they are doing with serialization so that we can build some test cases to allow an appropriate migration strategy to future serialization.
8 |
9 | The paper continues to be updated. The algorithmic descriptions are getting reasonably clear, but the speed and accuracy sections need a complete revamp with current implementations.
10 |
11 |
12 | Bugs, fixed and known
13 | ----
14 |
15 | #### Fixed
16 | The following important issues are fixed in this release
17 |
18 | [Issue #90](https://github.com/tdunning/t-digest/issues/90) Serialization for MergingDigest
19 |
20 | [Issue #92](https://github.com/tdunning/t-digest/issues/92) Serialization for AVLTreeDigest
21 |
22 | #### Maybe fixed
23 | This issue has substantial progress, but lacks a definitive test to determine whether it should be closed.
24 |
25 | [Issue #78](https://github.com/tdunning/t-digest/issues/78) Stability under merging.
26 |
27 | #### Pushed
28 | The following issues are pushed beyond this release
29 |
30 | [Issue #87](https://github.com/tdunning/t-digest/issues/87) Future proof and extensible serialization
31 |
32 | [Issue #89](https://github.com/tdunning/t-digest/issues/89) Bad handling for duplicate values in AVLTreeDigest
33 |
34 | #### All fixed issues
35 | Here is a complete list of issues resolved in this release:
36 |
37 | [Issue #55](https://github.com/tdunning/t-digest/issues/55) Add time
38 | decay to t-digest
39 |
40 | [Issue #52](https://github.com/tdunning/t-digest/issues/52) General
41 | factory method for "fromBytes"
42 |
43 | [Issue #90](https://github.com/tdunning/t-digest/issues/90)
44 | Deserialization of MergingDigest BufferUnderflowException in 3.1
45 |
46 | [Issue #92](https://github.com/tdunning/t-digest/issues/92) Error in
47 | AVLTreeDigest.fromBytes
48 |
49 | [Issue #93](https://github.com/tdunning/t-digest/issues/93) high
50 | centroid frequency causes overflow - giving incorrect results
51 |
52 | [Issue #67](https://github.com/tdunning/t-digest/issues/67) Release of
53 | version 3.2
54 |
55 | [Issue #81](https://github.com/tdunning/t-digest/issues/81)
56 | AVLTreeDigest with a lot of datas : integer overflow
57 |
58 | [Issue #75](https://github.com/tdunning/t-digest/issues/75) Adjusting
59 | the centroid threshold values to obtain better accuracy at interesting
60 | values
61 |
62 | [Issue #74](https://github.com/tdunning/t-digest/issues/74) underlying
63 | distribution : powerlaw
64 |
65 | [Issue #72](https://github.com/tdunning/t-digest/issues/72) Inverse
66 | quantile algorithm is non-contiguous
67 |
68 | [Issue #65](https://github.com/tdunning/t-digest/issues/65)
69 | totalDigest add spark dataframe column / array
70 |
71 | [Issue #60](https://github.com/tdunning/t-digest/issues/60) Getting
72 | IllegalArgumentException when adding digests
73 |
74 | [Issue #53](https://github.com/tdunning/t-digest/issues/53)
75 | smallByteSize methods are very trappy in many classes -- should be
76 | changed or have warnings in javadocs
77 |
78 | [Issue #82](https://github.com/tdunning/t-digest/issues/82) TDigest
79 | class does not implement Serializable interface in last release.
80 |
81 | [Issue #42](https://github.com/tdunning/t-digest/issues/42) Histogram
82 |
83 | [Issue #40](https://github.com/tdunning/t-digest/issues/40) Improved
84 | constraint on centroid sizes
85 |
86 | [Issue #37](https://github.com/tdunning/t-digest/issues/37) Allow
87 | arbitrary scaling laws for centroid sizes
88 |
89 | [Issue #29](https://github.com/tdunning/t-digest/issues/29) Test
90 | method testScaling() always adds values in ascending order
91 |
92 | [Issue #84](https://github.com/tdunning/t-digest/issues/84) Remove
93 | deprecated kinds of t-digest
94 |
95 | [Issue #76](https://github.com/tdunning/t-digest/issues/76) Add
96 | serializability
97 |
98 | [Issue #77](https://github.com/tdunning/t-digest/issues/77) Question:
99 | Proof of bounds on merging digest size
100 |
101 | [Issue #71](https://github.com/tdunning/t-digest/issues/71) Simple
102 | alternate algorithm using maxima, ranks and fixed cumulative weighting
103 |
104 | [Issue #61](https://github.com/tdunning/t-digest/issues/61) Possible
105 | improvement to the speed of the algorithm
106 |
107 | [Issue #58](https://github.com/tdunning/t-digest/issues/58) jdk8
108 | doclint incompatibility
109 |
110 | [Issue #48](https://github.com/tdunning/t-digest/issues/48) Build is
111 | unstable under some circumstances
112 |
113 | [Issue #63](https://github.com/tdunning/t-digest/issues/63) Which
114 | TDigest do you recommend?
115 |
116 | [Issue #62](https://github.com/tdunning/t-digest/issues/62) Very slow
117 | performance; what am I missing?
118 |
119 | [Issue #47](https://github.com/tdunning/t-digest/issues/47) Make
120 | TDigest serializable
121 |
122 | [Issue #49](https://github.com/tdunning/t-digest/issues/49)
123 | MergingDigest.centroids is wrong on an empty digest
124 |
125 |
--------------------------------------------------------------------------------
/benchmark/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | com.tdunning
9 | t-digest-parent
10 | 3.4-SNAPSHOT
11 | ../pom.xml
12 |
13 | t-digest-benchmark
14 |
15 |
16 |
17 | com.tdunning
18 | t-digest
19 | 3.4-SNAPSHOT
20 |
21 |
22 | org.openjdk.jmh
23 | jmh-core
24 | 1.17.3
25 |
26 |
27 | org.openjdk.jmh
28 | jmh-generator-annprocess
29 | 1.17.3
30 | provided
31 |
32 |
33 | org.apache.mahout
34 | mahout-math
35 | 0.9
36 |
37 |
38 |
39 |
40 |
41 |
42 | org.codehaus.mojo
43 | exec-maven-plugin
44 | 1.1
45 |
46 | com.tdunning.Benchmark
47 |
48 |
49 |
50 | org.apache.maven.plugins
51 | maven-compiler-plugin
52 | 3.3
53 |
54 | true
55 | 1.8
56 | 1.7
57 | 1.7
58 |
59 |
60 |
61 |
62 |
63 | org.apache.maven.plugins
64 | maven-shade-plugin
65 | 2.0
66 |
67 |
68 | package
69 |
70 | shade
71 |
72 |
73 | microbenchmarks
74 |
75 |
76 | org.openjdk.jmh.Main
77 |
78 |
79 |
80 |
81 | *:*
82 |
83 | META-INF/services/javax.annotation.processing.Processor
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/ApproxLogBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.LogHistogram;
21 | import com.tdunning.math.stats.MergingDigest;
22 | import com.tdunning.math.stats.ScaleFunction;
23 |
24 | import org.openjdk.jmh.annotations.*;
25 | import org.openjdk.jmh.annotations.Benchmark;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | /**
36 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
37 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
38 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
39 | * before sorting and merging them into the digest.
40 | */
41 | @BenchmarkMode(Mode.AverageTime)
42 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
43 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
44 | @Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
45 | @Fork(1)
46 | @Threads(1)
47 | @State(Scope.Thread)
48 | public class ApproxLogBench {
49 | private static final double LOG_2 = Math.log(2);
50 | private Random gen = new Random();
51 | private double[] data;
52 |
53 | @Setup
54 | public void setup() {
55 | data = new double[10000000];
56 | for (int i = 0; i < data.length; i++) {
57 | data[i] = gen.nextDouble();
58 | }
59 | }
60 |
61 | @State(Scope.Thread)
62 | public static class ThreadState {
63 | int index = 0;
64 | }
65 |
66 | @Benchmark
67 | @BenchmarkMode(Mode.AverageTime)
68 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
69 | public void addApprox(ThreadState state) {
70 | if (state.index >= data.length) {
71 | state.index = 0;
72 | }
73 | double sum = 0;
74 | for (int i = 0; i < 1000; i++) {
75 | sum += LogHistogram.approxLog2(data[state.index++]);
76 | }
77 | }
78 |
79 | @Benchmark
80 | @BenchmarkMode(Mode.AverageTime)
81 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
82 | public void addLog(ThreadState state) {
83 | if (state.index >= data.length) {
84 | state.index = 0;
85 | }
86 | double sum = 0;
87 | for (int i = 0; i < 1000; i++) {
88 | sum += Math.log(data[state.index++])/LOG_2;
89 | }
90 |
91 | }
92 |
93 | public static void main(String[] args) throws RunnerException {
94 | Options opt = new OptionsBuilder()
95 | .include(ApproxLogBench.class.getSimpleName())
96 | .warmupIterations(5)
97 | .measurementIterations(5)
98 | .forks(1)
99 | .resultFormat(ResultFormatType.CSV)
100 | .build();
101 |
102 | new Runner(opt).run();
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/Benchmark.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.AVLTreeDigest;
21 | import com.tdunning.math.stats.MergingDigest;
22 | import com.tdunning.math.stats.TDigest;
23 | import org.openjdk.jmh.annotations.*;
24 | import org.openjdk.jmh.profile.GCProfiler;
25 | import org.openjdk.jmh.profile.StackProfiler;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | @BenchmarkMode(Mode.AverageTime)
36 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
37 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
38 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
39 | @Fork(1)
40 | @Threads(1)
41 | @State(Scope.Thread)
42 | public class Benchmark {
43 | private Random gen = new Random();
44 | private double[] data;
45 |
46 | @Param({"merge", "tree"})
47 | public String method;
48 |
49 | @Param({"20", "50", "100", "200", "500"})
50 | public int compression;
51 |
52 | private TDigest td;
53 |
54 | @Setup
55 | public void setup() {
56 | data = new double[10000000];
57 | for (int i = 0; i < data.length; i++) {
58 | data[i] = gen.nextDouble();
59 | }
60 | if (method.equals("tree")) {
61 | td = new AVLTreeDigest(compression);
62 | } else {
63 | td = new MergingDigest(500);
64 | }
65 |
66 | // First values are very cheap to add, we are more interested in the steady state,
67 | // when the summary is full. Summaries are expected to contain about 5*compression
68 | // centroids, hence the 5 factor
69 | for (int i = 0; i < 5 * compression; ++i) {
70 | td.add(gen.nextDouble());
71 | }
72 | }
73 |
74 | @State(Scope.Thread)
75 | public static class ThreadState {
76 | int index = 0;
77 | }
78 |
79 | @org.openjdk.jmh.annotations.Benchmark
80 | public void add(ThreadState state) {
81 | if (state.index >= data.length) {
82 | state.index = 0;
83 | }
84 | td.add(data[state.index++]);
85 | }
86 |
87 | public static void main(String[] args) throws RunnerException {
88 | Options opt = new OptionsBuilder()
89 | .include(".*" + Benchmark.class.getSimpleName() + ".*")
90 | .resultFormat(ResultFormatType.CSV)
91 | .result("results.csv")
92 | .addProfiler(GCProfiler.class)
93 | .addProfiler(StackProfiler.class)
94 | .build();
95 |
96 | new Runner(opt).run();
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/FloatHistogramBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.FloatHistogram;
21 | import org.openjdk.jmh.annotations.*;
22 | import org.openjdk.jmh.annotations.Benchmark;
23 | import org.openjdk.jmh.profile.GCProfiler;
24 | import org.openjdk.jmh.profile.StackProfiler;
25 | import org.openjdk.jmh.results.format.ResultFormatType;
26 | import org.openjdk.jmh.runner.Runner;
27 | import org.openjdk.jmh.runner.RunnerException;
28 | import org.openjdk.jmh.runner.options.Options;
29 | import org.openjdk.jmh.runner.options.OptionsBuilder;
30 |
31 | import java.util.Random;
32 | import java.util.concurrent.TimeUnit;
33 |
34 | /**
35 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
36 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
37 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
38 | * before sorting and merging them into the digest.
39 | */
40 | @BenchmarkMode(Mode.AverageTime)
41 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
42 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
43 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
44 | @Fork(1)
45 | @Threads(1)
46 | @State(Scope.Thread)
47 | public class FloatHistogramBench {
48 | private Random gen = new Random();
49 | private double[] data;
50 |
51 | @Param({"20", "50", "100"})
52 | public int binsPerDecade;
53 |
54 | private FloatHistogram fh;
55 |
56 | @Setup
57 | public void setup() {
58 | data = new double[10000000];
59 | for (int i = 0; i < data.length; i++) {
60 | data[i] = gen.nextDouble();
61 | }
62 | fh = new FloatHistogram(0.1, 10000, binsPerDecade);
63 |
64 | for (int i = 0; i < 10000; ++i) {
65 | fh.add(gen.nextDouble());
66 | }
67 | }
68 |
69 | @State(Scope.Thread)
70 | public static class ThreadState {
71 | int index = 0;
72 | }
73 |
74 | @Benchmark
75 | public void add(ThreadState state) {
76 | if (state.index >= data.length) {
77 | state.index = 0;
78 | }
79 | fh.add(data[state.index++]);
80 | }
81 |
82 | public static void main(String[] args) throws RunnerException {
83 | Options opt = new OptionsBuilder()
84 | .include(".*" + FloatHistogramBench.class.getSimpleName() + ".*")
85 | .resultFormat(ResultFormatType.CSV)
86 | .result("overall-results.csv")
87 | .addProfiler(StackProfiler.class)
88 | .addProfiler(GCProfiler.class)
89 | .build();
90 |
91 | new Runner(opt).run();
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/MergeBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.MergingDigest;
21 | import com.tdunning.math.stats.ScaleFunction;
22 |
23 | import org.openjdk.jmh.annotations.*;
24 | import org.openjdk.jmh.annotations.Benchmark;
25 | import org.openjdk.jmh.profile.StackProfiler;
26 | import org.openjdk.jmh.results.format.ResultFormatType;
27 | import org.openjdk.jmh.runner.Runner;
28 | import org.openjdk.jmh.runner.RunnerException;
29 | import org.openjdk.jmh.runner.options.Options;
30 | import org.openjdk.jmh.runner.options.OptionsBuilder;
31 |
32 | import java.util.Random;
33 | import java.util.concurrent.TimeUnit;
34 |
35 | /**
36 | * Explores the value of using a large buffer for the MergingDigest. The rationale is that the internal
37 | * sort is extremely fast while the merging function in the t-digest can be quite slow, if only because
38 | * computing the asin function involved in the merge is expensive. This argues for collecting more samples
39 | * before sorting and merging them into the digest.
40 | */
41 | @BenchmarkMode(Mode.AverageTime)
42 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
43 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
44 | @Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS)
45 | @Fork(1)
46 | @Threads(1)
47 | @State(Scope.Thread)
48 | public class MergeBench {
49 | private Random gen = new Random();
50 | private double[] data;
51 |
52 | // @Param({"20", "50", "100", "200", "500"})
53 | @Param({"50", "100"})
54 | public int compression;
55 |
56 | // @Param({"1", "2", "5", "10"})
57 | @Param({"2", "5", "10"})
58 | public int factor;
59 |
60 | // @Param({"K_1", "K_2", "K_3"})
61 | @Param({"K_2"})
62 | public String scaleFunction;
63 |
64 | private MergingDigest td;
65 |
66 | @Setup
67 | public void setup() {
68 | data = new double[10000000];
69 | for (int i = 0; i < data.length; i++) {
70 | data[i] = gen.nextDouble();
71 | }
72 | td = new MergingDigest(compression, (factor + 1) * compression, compression);
73 | td.setScaleFunction(ScaleFunction.valueOf(scaleFunction));
74 |
75 | // First values are very cheap to add, we are more interested in the steady state,
76 | // when the summary is full. Summaries are expected to contain about 0.6*compression
77 | // centroids, hence the 5 * compression * (factor+1)
78 | for (int i = 0; i < 5 * compression * (factor + 1); ++i) {
79 | td.add(gen.nextDouble());
80 | }
81 | }
82 |
83 | @State(Scope.Thread)
84 | public static class ThreadState {
85 | int index = 0;
86 | }
87 |
88 | @Benchmark
89 | @BenchmarkMode(Mode.AverageTime)
90 | @OutputTimeUnit(TimeUnit.MICROSECONDS)
91 | public void add(ThreadState state) {
92 | if (state.index >= data.length) {
93 | state.index = 0;
94 | }
95 | td.add(data[state.index++]);
96 | }
97 |
98 | public static void main(String[] args) throws RunnerException {
99 | Options opt = new OptionsBuilder()
100 | .include(MergeBench.class.getSimpleName())
101 | .warmupIterations(5)
102 | .measurementIterations(5)
103 | .forks(1)
104 | .resultFormat(ResultFormatType.CSV)
105 | .addProfiler(StackProfiler.class)
106 | .build();
107 |
108 | new Runner(opt).run();
109 | }
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/SortBench.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning;
19 |
20 | import com.tdunning.math.stats.Sort;
21 | import org.openjdk.jmh.annotations.Benchmark;
22 | import org.openjdk.jmh.annotations.*;
23 |
24 | import java.util.Arrays;
25 | import java.util.Random;
26 | import java.util.concurrent.TimeUnit;
27 |
28 | /** Explores the performance of Sort on pathological input data. */
29 | @BenchmarkMode(Mode.AverageTime)
30 | @OutputTimeUnit(TimeUnit.MILLISECONDS)
31 | @Warmup(iterations = 10, time = 3, timeUnit = TimeUnit.SECONDS)
32 | @Measurement(iterations = 20, time = 2, timeUnit = TimeUnit.SECONDS)
33 | @Fork(1)
34 | @Threads(1)
35 | @State(Scope.Thread)
36 | public class SortBench {
37 | private final int size = 100000;
38 | private final double[] values = new double[size];
39 |
40 | @Param({"0", "1", "-1"})
41 | public int sortDirection;
42 |
43 | @Setup
44 | public void setup() {
45 | Random prng = new Random(999983);
46 | for (int i = 0; i < size; i++) {
47 | values[i] = prng.nextDouble();
48 | }
49 | if (sortDirection > 0) {
50 | Arrays.sort(values);
51 | } else if (sortDirection < 0) {
52 | Arrays.sort(values);
53 | Sort.reverse(values, 0, values.length);
54 | }
55 | }
56 |
57 | @Benchmark
58 | public void quicksort() {
59 | int[] order = new int[size];
60 | for (int i = 0; i < size; i++) {
61 | order[i] = i;
62 | }
63 | Sort.sort(order, values, null, values.length);
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/benchmark/src/main/java/com/tdunning/TDigestBench.java:
--------------------------------------------------------------------------------
1 | package com.tdunning;
2 |
3 | import com.tdunning.math.stats.AVLTreeDigest;
4 | import com.tdunning.math.stats.MergingDigest;
5 | import com.tdunning.math.stats.TDigest;
6 | import org.apache.mahout.math.jet.random.*;
7 | import org.openjdk.jmh.annotations.*;
8 | import org.openjdk.jmh.annotations.Benchmark;
9 | import org.openjdk.jmh.profile.GCProfiler;
10 | import org.openjdk.jmh.profile.StackProfiler;
11 | import org.openjdk.jmh.results.format.ResultFormatType;
12 | import org.openjdk.jmh.runner.Runner;
13 | import org.openjdk.jmh.runner.RunnerException;
14 | import org.openjdk.jmh.runner.options.Options;
15 | import org.openjdk.jmh.runner.options.OptionsBuilder;
16 |
17 | import java.util.Random;
18 | import java.util.concurrent.ThreadLocalRandom;
19 | import java.util.concurrent.TimeUnit;
20 |
21 | @BenchmarkMode(Mode.AverageTime)
22 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
23 | @Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS)
24 | @Measurement(iterations = 5, time = 2, timeUnit = TimeUnit.SECONDS)
25 | @Fork(1)
26 | @Threads(1)
27 | @State(Scope.Thread)
28 | public class TDigestBench {
29 |
30 | public enum TDigestFactory {
31 | MERGE {
32 | @Override
33 | TDigest create(double compression) {
34 | return new MergingDigest(compression, (int) (10 * compression));
35 | }
36 |
37 | @Override
38 | TDigest create() {
39 | return create(100);
40 | }
41 | },
42 | AVL_TREE {
43 | @Override
44 | TDigest create(double compression) {
45 | return new AVLTreeDigest(compression);
46 | }
47 |
48 | @Override
49 | TDigest create() {
50 | return create(20);
51 | }
52 | };
53 |
54 | abstract TDigest create(double compression);
55 | abstract TDigest create();
56 | }
57 |
58 | public enum DistributionFactory {
59 | UNIFORM {
60 | @Override
61 | AbstractDistribution create(Random random) {
62 | return new Uniform(0, 1, random);
63 | }
64 | },
65 | SEQUENTIAL {
66 | @Override
67 | AbstractDistribution create(Random random) {
68 | return new AbstractContinousDistribution() {
69 | double base = 0;
70 |
71 | @Override
72 | public double nextDouble() {
73 | base += Math.PI * 1e-5;
74 | return base;
75 | }
76 | };
77 | }
78 | },
79 | REPEATED {
80 | @Override
81 | AbstractDistribution create(final Random random) {
82 | return new AbstractContinousDistribution() {
83 | @Override
84 | public double nextDouble() {
85 | return random.nextInt(10);
86 | }
87 | };
88 | }
89 | },
90 | GAMMA {
91 | @Override
92 | AbstractDistribution create(Random random) {
93 | return new Gamma(0.1, 0.1, random);
94 | }
95 | },
96 | NORMAL {
97 | @Override
98 | AbstractDistribution create(Random random) {
99 | return new Normal(0.1, 0.1, random);
100 | }
101 | };
102 |
103 | abstract AbstractDistribution create(Random random);
104 | }
105 |
106 | @Param({"100", "300"})
107 | double compression;
108 |
109 | @Param({"MERGE", "AVL_TREE"})
110 | TDigestFactory tdigestFactory;
111 |
112 | @Param({"NORMAL", "GAMMA"})
113 | DistributionFactory distributionFactory;
114 |
115 | Random random;
116 | TDigest tdigest;
117 | AbstractDistribution distribution;
118 |
119 | double[] data = new double[1000000];
120 |
121 | @Setup
122 | public void setUp() {
123 | random = ThreadLocalRandom.current();
124 | tdigest = tdigestFactory.create(compression);
125 | distribution = distributionFactory.create(random);
126 | // first values are cheap to add, so pre-fill the t-digest to have more realistic results
127 | for (int i = 0; i < 10000; ++i) {
128 | tdigest.add(distribution.nextDouble());
129 | }
130 |
131 | for (int i = 0; i < data.length; ++i) {
132 | data[i] = distribution.nextDouble();
133 | }
134 | }
135 |
136 | @State(Scope.Thread)
137 | public static class ThreadState {
138 | int index = 0;
139 | }
140 |
141 | @Benchmark
142 | public void timeAdd(MergeBench.ThreadState state) {
143 | if (state.index >= data.length) {
144 | state.index = 0;
145 | }
146 | tdigest.add(data[state.index++]);
147 | }
148 |
149 | public static void main(String[] args) throws RunnerException {
150 | Options opt = new OptionsBuilder()
151 | .include(".*" + TDigestBench.class.getSimpleName() + ".*")
152 | .resultFormat(ResultFormatType.CSV)
153 | .result("overall-results.csv")
154 | .addProfiler(GCProfiler.class)
155 | .addProfiler(StackProfiler.class)
156 | .build();
157 |
158 | new Runner(opt).run();
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/benchmark/x.r:
--------------------------------------------------------------------------------
## Load the bin-fill benchmark output for interactive analysis.
data <- read.csv("bin-fill.csv")
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/AbstractTDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.nio.ByteBuffer;
21 | import java.util.ArrayList;
22 | import java.util.List;
23 |
24 | public abstract class AbstractTDigest extends TDigest {
25 | boolean recordAllData = false;
26 |
27 | /**
28 | * Same as {@link #weightedAverageSorted(double, double, double, double)} but flips
29 | * the order of the variables if x2 is greater than
30 | * x1.
31 | */
32 | static double weightedAverage(double x1, double w1, double x2, double w2) {
33 | if (x1 <= x2) {
34 | return weightedAverageSorted(x1, w1, x2, w2);
35 | } else {
36 | return weightedAverageSorted(x2, w2, x1, w1);
37 | }
38 | }
39 |
40 | /**
41 | * Compute the weighted average between x1 with a weight of
42 | * w1 and x2 with a weight of w2.
43 | * This expects x1 to be less than or equal to x2
44 | * and is guaranteed to return a number in [x1, x2]. An
45 | * explicit check is required since this isn't guaranteed with floating-point
46 | * numbers.
47 | */
48 | private static double weightedAverageSorted(double x1, double w1, double x2, double w2) {
49 | assert x1 <= x2;
50 | final double x = (x1 * w1 + x2 * w2) / (w1 + w2);
51 | return Math.max(x1, Math.min(x, x2));
52 | }
53 |
54 | static double interpolate(double x, double x0, double x1) {
55 | return (x - x0) / (x1 - x0);
56 | }
57 |
58 | static void encode(ByteBuffer buf, int n) {
59 | int k = 0;
60 | while (n < 0 || n > 0x7f) {
61 | byte b = (byte) (0x80 | (0x7f & n));
62 | buf.put(b);
63 | n = n >>> 7;
64 | k++;
65 | if (k >= 6) {
66 | throw new IllegalStateException("Size is implausibly large");
67 | }
68 | }
69 | buf.put((byte) n);
70 | }
71 |
72 | static int decode(ByteBuffer buf) {
73 | int v = buf.get();
74 | int z = 0x7f & v;
75 | int shift = 7;
76 | while ((v & 0x80) != 0) {
77 | if (shift > 28) {
78 | throw new IllegalStateException("Shift too large in decode");
79 | }
80 | v = buf.get();
81 | z += (v & 0x7f) << shift;
82 | shift += 7;
83 | }
84 | return z;
85 | }
86 |
87 | abstract void add(double x, int w, Centroid base);
88 |
89 | /**
90 | * Computes an interpolated value of a quantile that is between two centroids.
91 | *
92 | * Index is the quantile desired multiplied by the total number of samples - 1.
93 | *
94 | * @param index Denormalized quantile desired
95 | * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid.
96 | * @param nextIndex The denormalized quantile corresponding to the center of the following centroid.
97 | * @param previousMean The mean of the previous centroid.
98 | * @param nextMean The mean of the following centroid.
99 | * @return The interpolated mean.
100 | */
101 | static double quantile(double index, double previousIndex, double nextIndex, double previousMean, double nextMean) {
102 | final double delta = nextIndex - previousIndex;
103 | final double previousWeight = (nextIndex - index) / delta;
104 | final double nextWeight = (index - previousIndex) / delta;
105 | return previousMean * previousWeight + nextMean * nextWeight;
106 | }
107 |
108 | /**
109 | * Sets up so that all centroids will record all data assigned to them. For testing only, really.
110 | */
111 | @Override
112 | public TDigest recordAllData() {
113 | recordAllData = true;
114 | return this;
115 | }
116 |
117 | @Override
118 | public boolean isRecording() {
119 | return recordAllData;
120 | }
121 |
122 | /**
123 | * Adds a sample to a histogram.
124 | *
125 | * @param x The value to add.
126 | */
127 | @Override
128 | public void add(double x) {
129 | add(x, 1);
130 | }
131 |
132 | @Override
133 | public void add(TDigest other) {
134 | for (Centroid centroid : other.centroids()) {
135 | add(centroid.mean(), centroid.count(), centroid);
136 | }
137 | }
138 |
139 | protected Centroid createCentroid(double mean, int id) {
140 | return new Centroid(mean, id, recordAllData);
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Centroid.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.ObjectInputStream;
22 | import java.io.Serializable;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.concurrent.atomic.AtomicInteger;
26 |
27 | /**
28 | * A single centroid which represents a number of data points.
29 | */
30 | public class Centroid implements Comparable, Serializable {
31 | private static final AtomicInteger uniqueCount = new AtomicInteger(1);
32 |
33 | private double centroid = 0;
34 | private int count = 0;
35 |
36 | // The ID is transient because it must be unique within a given JVM. A new
37 | // ID should be generated from uniqueCount when a Centroid is deserialized.
38 | private transient int id;
39 |
40 | private List actualData = null;
41 |
42 | private Centroid(boolean record) {
43 | id = uniqueCount.getAndIncrement();
44 | if (record) {
45 | actualData = new ArrayList<>();
46 | }
47 | }
48 |
49 | public Centroid(double x) {
50 | this(false);
51 | start(x, 1, uniqueCount.getAndIncrement());
52 | }
53 |
54 | public Centroid(double x, int w) {
55 | this(false);
56 | start(x, w, uniqueCount.getAndIncrement());
57 | }
58 |
59 | public Centroid(double x, int w, int id) {
60 | this(false);
61 | start(x, w, id);
62 | }
63 |
64 | public Centroid(double x, int id, boolean record) {
65 | this(record);
66 | start(x, 1, id);
67 | }
68 |
69 | Centroid(double x, int w, List data) {
70 | this(x, w);
71 | actualData = data;
72 | }
73 |
74 | private void start(double x, int w, int id) {
75 | this.id = id;
76 | add(x, w);
77 | }
78 |
79 | public void add(double x, int w) {
80 | if (actualData != null) {
81 | actualData.add(x);
82 | }
83 | count += w;
84 | centroid += w * (x - centroid) / count;
85 | }
86 |
87 | public double mean() {
88 | return centroid;
89 | }
90 |
91 | public int count() {
92 | return count;
93 | }
94 |
95 | public int id() {
96 | return id;
97 | }
98 |
99 | @Override
100 | public String toString() {
101 | return "Centroid{" +
102 | "centroid=" + centroid +
103 | ", count=" + count +
104 | '}';
105 | }
106 |
107 | @Override
108 | public int hashCode() {
109 | return id;
110 | }
111 |
112 | @Override
113 | public int compareTo(@SuppressWarnings("NullableProblems") Centroid o) {
114 | int r = Double.compare(centroid, o.centroid);
115 | if (r == 0) {
116 | r = id - o.id;
117 | }
118 | return r;
119 | }
120 |
121 | public List data() {
122 | return actualData;
123 | }
124 |
125 | @SuppressWarnings("WeakerAccess")
126 | public void insertData(double x) {
127 | if (actualData == null) {
128 | actualData = new ArrayList<>();
129 | }
130 | actualData.add(x);
131 | }
132 |
133 | public static Centroid createWeighted(double x, int w, Iterable extends Double> data) {
134 | Centroid r = new Centroid(data != null);
135 | r.add(x, w, data);
136 | return r;
137 | }
138 |
139 | public void add(double x, int w, Iterable extends Double> data) {
140 | if (actualData != null) {
141 | if (data != null) {
142 | for (Double old : data) {
143 | actualData.add(old);
144 | }
145 | } else {
146 | actualData.add(x);
147 | }
148 | }
149 | centroid = AbstractTDigest.weightedAverage(centroid, count, x, w);
150 | count += w;
151 | }
152 |
153 | private void readObject(ObjectInputStream in) throws ClassNotFoundException, IOException {
154 | in.defaultReadObject();
155 | id = uniqueCount.getAndIncrement();
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Dist.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.util.Collection;
21 | import java.util.List;
22 |
23 | /**
24 | * Reference implementations for cdf and quantile if we have all data.
25 | */
26 | public class Dist {
27 | public static double cdf(final double x, double[] data) {
28 | return cdf(x, data, 0.5);
29 | }
30 |
31 | public static double cdf(final double x, double[] data, double w) {
32 | int n1 = 0;
33 | int n2 = 0;
34 | for (Double v : data) {
35 | n1 += (v < x) ? 1 : 0;
36 | n2 += (v == x) ? 1 : 0;
37 | }
38 | return (n1 + w * n2) / data.length;
39 | }
40 |
41 | public static double cdf(final double x, Collection data) {
42 | return cdf(x, data, 0.5);
43 | }
44 |
45 | public static double cdf(final double x, Collection data, double w) {
46 | int n1 = 0;
47 | int n2 = 0;
48 | for (Double v : data) {
49 | n1 += (v < x) ? 1 : 0;
50 | n2 += (v == x) ? 1 : 0;
51 | }
52 | return (n1 + w * n2) / data.size();
53 | }
54 |
55 | public static double quantile(final double q, double[] data) {
56 | int n = data.length;
57 | if (n == 0) {
58 | return Double.NaN;
59 | }
60 | double index = q * n;
61 | if (index < 0) {
62 | index = 0;
63 | }
64 | if (index > n - 1) {
65 | index = n - 1;
66 | }
67 | return data[(int) Math.floor(index)];
68 | }
69 |
70 | public static double quantile(final double q, List data) {
71 | int n = data.size();
72 | if (n == 0) {
73 | return Double.NaN;
74 | }
75 | double index = q * n;
76 | if (index < 0) {
77 | index = 0;
78 | }
79 | if (index > n - 1) {
80 | index = n - 1;
81 | }
82 | return data.get((int) Math.floor(index));
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/FloatHistogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.InvalidObjectException;
22 | import java.io.ObjectStreamException;
23 | import java.io.Serializable;
24 | import java.nio.ByteBuffer;
25 | import java.nio.LongBuffer;
26 |
27 | /**
28 | * Maintains histogram buckets that are constant width
29 | * in base-2 floating point representation space. This is close
30 | * to exponential binning, but should be much faster.
31 | */
32 | public class FloatHistogram extends Histogram {
33 | private int bitsOfPrecision;
34 | private int shift;
35 | private int offset;
36 |
37 |
38 | @SuppressWarnings("WeakerAccess")
39 | public FloatHistogram(double min, double max) {
40 | this(min, max, 50);
41 | }
42 |
43 | @SuppressWarnings("WeakerAccess")
44 | public FloatHistogram(double min, double max, double binsPerDecade) {
45 | super(min, max);
46 | if (max <= 2 * min) {
47 | throw new IllegalArgumentException(String.format("Illegal/nonsensical min, max (%.2f, %.2g)", min, max));
48 | }
49 | if (min <= 0 || max <= 0) {
50 | throw new IllegalArgumentException("Min and max must be positive");
51 | }
52 | if (binsPerDecade < 5 || binsPerDecade > 10000) {
53 | throw new IllegalArgumentException(
54 | String.format("Unreasonable number of bins per decade %.2g. Expected value in range [5,10000]",
55 | binsPerDecade));
56 | }
57 |
58 | // convert binsPerDecade into bins per octave, then figure out how many bits that takes
59 | bitsOfPrecision = (int) Math.ceil(Math.log(binsPerDecade * Math.log10(2)) / Math.log(2));
60 | // we keep just the required amount of the mantissa
61 | shift = 52 - bitsOfPrecision;
62 | // The exponent in a floating point number is offset
63 | offset = 0x3ff << bitsOfPrecision;
64 |
65 | setupBins(min, max);
66 | }
67 |
68 | @Override
69 | protected int bucketIndex(double x) {
70 | x = x / min;
71 | long floatBits = Double.doubleToLongBits(x);
72 | return (int) (floatBits >>> shift) - offset;
73 | }
74 |
75 | // exposed for testing
76 | @Override
77 | double lowerBound(int k) {
78 | return min * Double.longBitsToDouble((k + (0x3ffL << bitsOfPrecision)) << (52 - bitsOfPrecision)) /* / fuzz */;
79 | }
80 |
81 | @Override
82 | @SuppressWarnings("WeakerAccess")
83 | public long[] getCompressedCounts() {
84 | LongBuffer buf = LongBuffer.allocate(counts.length);
85 | Simple64.compress(buf, counts, 0, counts.length);
86 | long[] r = new long[buf.position()];
87 | buf.flip();
88 | buf.get(r);
89 | return r;
90 | }
91 |
92 | @Override
93 | @SuppressWarnings("WeakerAccess")
94 | public void writeObject(java.io.ObjectOutputStream out) throws IOException {
95 | out.writeDouble(min);
96 | out.writeDouble(max);
97 | out.writeByte(bitsOfPrecision);
98 | out.writeByte(shift);
99 |
100 | ByteBuffer buf = ByteBuffer.allocate(8 * counts.length);
101 | LongBuffer longBuffer = buf.asLongBuffer();
102 | Simple64.compress(longBuffer, counts, 0, counts.length);
103 | buf.position(8 * longBuffer.position());
104 | byte[] r = new byte[buf.position()];
105 | out.writeShort(buf.position());
106 | buf.flip();
107 | buf.get(r);
108 | out.write(r);
109 | }
110 |
111 | @Override
112 | @SuppressWarnings("WeakerAccess")
113 | public void readObject(java.io.ObjectInputStream in) throws IOException {
114 | min = in.readDouble();
115 | max = in.readDouble();
116 | bitsOfPrecision = in.readByte();
117 | shift = in.readByte();
118 | offset = 0x3ff << bitsOfPrecision;
119 |
120 | int n = in.readShort();
121 | ByteBuffer buf = ByteBuffer.allocate(n);
122 | in.readFully(buf.array(), 0, n);
123 | int binCount = bucketIndex(max) + 1;
124 | if (binCount > 10000) {
125 | throw new IllegalArgumentException(
126 | String.format("Excessive number of bins %d during deserialization = %.2g, %.2g",
127 | binCount, min, max));
128 |
129 | }
130 | counts = new long[binCount];
131 | Simple64.decompress(buf.asLongBuffer(), counts);
132 | }
133 |
134 | private void readObjectNoData() throws ObjectStreamException {
135 | throw new InvalidObjectException("Stream data required");
136 | }
137 |
138 | @Override
139 | void add(Iterable others) {
140 | for (Histogram other : others) {
141 | if (!this.getClass().equals(other.getClass())) {
142 | throw new IllegalArgumentException(String.format("Cannot add %s to FloatHistogram", others.getClass()));
143 | }
144 | FloatHistogram actual = (FloatHistogram) other;
145 | if (actual.min != min || actual.max != max || actual.counts.length != counts.length) {
146 | throw new IllegalArgumentException("Can only merge histograms with identical bounds and precision");
147 | }
148 | for (int i = 0; i < counts.length; i++) {
149 | counts[i] += other.counts[i];
150 | }
151 | }
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/Histogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.Serializable;
22 |
23 | /**
24 | * A Histogram is a histogram with cleverly chosen, but fixed, bin widths.
25 | *
26 | * Different implementations may provide better or worse speed or space complexity,
27 | * but each is attuned to a particular distribution or error metric.
28 | */
29 | @SuppressWarnings("WeakerAccess")
public abstract class Histogram implements Serializable {
    protected long[] counts;
    protected double min;
    protected double max;
    // NOTE(review): these two appear unused by this class, and LogHistogram
    // declares private fields of the same names that shadow them — confirm
    // whether they are still needed here.
    protected double logFactor;
    protected double logOffset;

    public Histogram(double min, double max) {
        this.min = min;
        this.max = max;
    }

    /**
     * Allocates the count array once the bucket mapping is known. Subclasses
     * call this at the end of their constructors, after bucketIndex() is usable.
     */
    protected void setupBins(double min, double max) {
        int binCount = bucketIndex(max) + 1;
        if (binCount > 10000) {
            throw new IllegalArgumentException(
                    String.format("Excessive number of bins %d resulting from min,max = %.2g, %.2g",
                            binCount, min, max));

        }
        counts = new long[binCount];
    }

    /** Increments the count of the bucket containing v. */
    public void add(double v) {
        counts[bucket(v)]++;
    }

    /** @return the lower bound of every bucket, in bucket order */
    @SuppressWarnings("WeakerAccess")
    public double[] getBounds() {
        double[] r = new double[counts.length];
        for (int i = 0; i < r.length; i++) {
            r[i] = lowerBound(i);
        }
        return r;
    }

    public long[] getCounts() {
        return counts;
    }

    // exposed for testing; clamps out-of-range values into the end buckets
    int bucket(double x) {
        if (x <= min) {
            return 0;
        } else if (x >= max) {
            return counts.length - 1;
        } else {
            return bucketIndex(x);
        }
    }

    /** Maps an in-range value to its bucket number. */
    protected abstract int bucketIndex(double x);

    // exposed for testing
    abstract double lowerBound(int k);

    @SuppressWarnings("WeakerAccess")
    abstract long[] getCompressedCounts();

    @SuppressWarnings("WeakerAccess")
    abstract void writeObject(java.io.ObjectOutputStream out) throws IOException;

    @SuppressWarnings("WeakerAccess")
    abstract void readObject(java.io.ObjectInputStream in) throws IOException;

    // Typed Iterable restored for consistency with the subclass overrides,
    // which iterate the elements as Histogram.
    abstract void add(Iterable<Histogram> others);
}
97 |
--------------------------------------------------------------------------------
/core/src/main/java/com/tdunning/math/stats/LogHistogram.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.io.IOException;
21 | import java.io.ObjectInputStream;
22 | import java.io.ObjectOutputStream;
23 |
24 | import static java.lang.Math.sqrt;
25 |
26 | /**
27 | * Non-linear histogram that uses floating point representation plus a quadratic correction to
28 | * bin width to achieve tighter fit to the ideal log2 sizing.
29 | */
30 | public class LogHistogram extends Histogram {
31 | private double logFactor;
32 | private double logOffset;
33 |
34 | @SuppressWarnings("WeakerAccess")
35 | public LogHistogram(double min, double max) {
36 | this(min, max, 0.1);
37 | }
38 |
39 | @SuppressWarnings("WeakerAccess")
40 | public LogHistogram(double min, double max, double epsilonFactor) {
41 | super(min, max);
42 | logFactor = Math.log(2) / Math.log(1 + epsilonFactor);
43 | logOffset = LogHistogram.approxLog2(min) * logFactor;
44 |
45 | if (max <= 2 * min) {
46 | throw new IllegalArgumentException(String.format("Illegal/nonsensical min, max (%.2f, %.2g)", min, max));
47 | }
48 | if (min <= 0 || max <= 0) {
49 | throw new IllegalArgumentException("Min and max must be positive");
50 | }
51 | if (epsilonFactor < 1e-6 || epsilonFactor > 0.5) {
52 | throw new IllegalArgumentException(
53 | String.format("Unreasonable number of bins per decade %.2g. Expected value in range [1e-6,0.5]",
54 | epsilonFactor));
55 | }
56 |
57 | setupBins(min, max);
58 | }
59 |
60 | /**
61 | * Approximates log_2(value) by abusing floating point hardware. The floating point exponent
62 | * is used to get the integer part of the log. The mantissa is then adjusted with a second order
63 | * polynomial to get a better approximation. The error is bounded to be less than ±0.01 and is
64 | * zero at every power of two (which also implies the approximation is continuous).
65 | *
66 | * @param value The argument of the log
67 | * @return log_2(value) (within an error of about ± 0.01)
68 | */
69 | @SuppressWarnings("WeakerAccess")
70 | public static double approxLog2(double value) {
71 | final long valueBits = Double.doubleToRawLongBits(value);
72 | final long exponent = ((valueBits & 0x7ff0_0000_0000_0000L) >>> 52) - 1024;
73 | final double m = Double.longBitsToDouble((valueBits & 0x800fffffffffffffL) | 0x3ff0000000000000L);
74 | return (m * (2 - (1.0 / 3) * m) + exponent - (2.0 / 3.0));
75 | }
76 |
77 | /**
78 | * Computes an approximate value of 2^x. This is done as an exact inverse of #approxLog2 so
79 | * that bin boundaries can be computed exactly.
80 | *
81 | * @param x The power of 2 desired.
82 | * @return 2^x approximately.
83 | */
84 | @SuppressWarnings("WeakerAccess")
85 | public static double pow2(double x) {
86 | final double exponent = Math.floor(x) - 1;
87 | x = x - exponent;
88 | double m = 3 - sqrt(7 - 3 * x);
89 | return Math.pow(2, exponent + 1) * m;
90 | }
91 |
92 | @Override
93 | protected int bucketIndex(double x) {
94 | return (int) (LogHistogram.approxLog2(x) * logFactor - logOffset);
95 | }
96 |
97 | @Override
98 | double lowerBound(int k) {
99 | return LogHistogram.pow2((k + logOffset) / logFactor);
100 | }
101 |
102 | @Override
103 | long[] getCompressedCounts() {
104 | return new long[0];
105 | }
106 |
107 | @Override
108 | void writeObject(ObjectOutputStream out) throws IOException {
109 |
110 | }
111 |
112 | @Override
113 | void readObject(ObjectInputStream in) throws IOException {
114 |
115 | }
116 |
117 | @Override
118 | void add(Iterable others) {
119 | for (Histogram other : others) {
120 | if (!this.getClass().equals(other.getClass())) {
121 | throw new IllegalArgumentException(String.format("Cannot add %s to LogHistogram", others.getClass()));
122 | }
123 | LogHistogram actual = (LogHistogram) other;
124 | if (actual.min != min || actual.max != max || actual.counts.length != counts.length) {
125 | throw new IllegalArgumentException("Can only merge histograms with identical bounds and precision");
126 | }
127 | for (int i = 0; i < counts.length; i++) {
128 | counts[i] += other.counts[i];
129 | }
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/core/src/main/r/asin-approx.r:
--------------------------------------------------------------------------------
1 | ### We want a piece-wise approximation of asin(x)
2 | ### But we want to have the following constraints:
3 | ### 1) each should be completely well behaved in its range
4 | ### 2) adjacent pieces will be blended using linear approximation so their regions should overlap
5 | ### 3) the blended result should have continuity
### 4) symmetry will be handled outside this approximation
### 5) the overall range handled should start at 0 and end before 1
### 6) the overall range should be as large as possible, but need not reach 1
9 |
fit = function(param) {
    ## Fits the four overlapping pieces of a piece-wise asin(x) approximation.
    ## param holds the range break points, in order:
    ## c0.high, c1.high, c2.low, c2.high, c3.low, c3.high, c4.low
    c0.high = param[1]
    c1.high = param[2]
    c2.low = param[3]
    c2.high = param[4]
    c3.low = param[5]
    c3.high = param[6]
    c4.low = param[7]

    ## Fits one piece over [low, high]: regress asin(x) on x, x^2, x^3,
    ## 1/(1-x) and 1/(1-x)^2. (The four copies of this fit were previously
    ## duplicated inline; factored into a single helper.)
    fit.range = function(low, high) {
        x = seq(low, high, by=0.01)
        d = data.frame(y=asin(x), x=x, x2=x*x, x3=x*x*x, i1=1/(1-x), i2=1/(1-x)/(1-x))
        glm(y ~ x + x2 + x3 + i1 + i2, d, family='gaussian')
    }

    ## Note the first piece is fit symmetrically around zero.
    list(m0=fit.range(-c0.high, c0.high),
         m1=fit.range(0, c1.high),
         m2=fit.range(c2.low, c2.high),
         m3=fit.range(c3.low, c3.high),
         c0.high=c0.high, c1.high=c1.high, c2.low=c2.low, c2.high=c2.high,
         c3.low=c3.low, c3.high=c3.high, c4.low=c4.low)
}
39 |
## Evaluate the blended piece-wise approximation on [0, c3.high] and return a
## data frame with x, the blended estimate yhat, and the exact asin(x) for
## comparison.
follow = function(models) {
    x = seq(0, models$c3.high, by=0.01)
    data = data.frame(x=x, x2=x*x, x3=x*x*x, i1=1/(1-x), i2=1/(1-x)/(1-x))
    ## predictions from each fitted piece, with exact asin as the final "piece"
    raw = data.frame(
        y0=predict(models$m0, newdata=data),
        y1=predict(models$m1, newdata=data),
        y2=predict(models$m2, newdata=data),
        y3=predict(models$m3, newdata=data),
        y4=asin(x)
    )

    ## Blend weights. Each xK ramps linearly from 1 down to 0 across the overlap
    ## between piece K and piece K+1; bound() clamps the ramps to [0, 1], so the
    ## five columns of mix sum to 1 row-wise and hand off smoothly between pieces.
    ## Ramp bounds used: c0.high, (c1.high, c2.low), (c2.high, c3.low), (c3.high, c4.low)
    mix = with(models, {
        mix = matrix(0, nrow=dim(raw)[1], ncol=5)
        x0 = bound((c0.high - x) / c0.high)
        x1 = bound((c1.high - x) / (c1.high - c2.low));
        x2 = bound((c2.high - x) / (c2.high - c3.low));
        x3 = bound((c3.high - x) / (c3.high - c4.low));

        mix[, 1] = x0
        mix[, 2] = (1-x0) * x1
        mix[, 3] = (1-x1) * x2
        mix[, 4] = (1-x2) * x3
        mix[, 5] = 1-x3
        mix
    })

    ## blended estimate is the weight-averaged prediction of the five pieces
    data.frame(x=x, yhat=rowSums(raw * mix), y=asin(x))
}
69 |
## Clamp every element of v to the unit interval [0, 1].
bound = function(v) {
    pmin(pmax(v, 0), 1)
}
75 |
76 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AVLGroupTreeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.apache.mahout.common.RandomUtils;
21 | import org.junit.Before;
22 | import org.junit.Test;
23 |
public class AVLGroupTreeTest extends AbstractTest {

    @Before
    public void setUp() {
        // Fixed seed so randomized runs are reproducible.
        RandomUtils.useTestSeed();
    }

    // Bookkeeping on an empty tree and after a couple of adds: size counts
    // centroids, sum counts total weight.
    @Test
    public void testSimpleAdds() {
        AVLGroupTree x = new AVLGroupTree(false);
        // an empty tree has no floor/first/last node and zero size/sum
        assertEquals(IntAVLTree.NIL, x.floor(34));
        assertEquals(IntAVLTree.NIL, x.first());
        assertEquals(IntAVLTree.NIL, x.last());
        assertEquals(0, x.size());
        assertEquals(0, x.sum());

        x.add(new Centroid(1));
        assertEquals(1, x.sum());
        // a centroid holding three points brings the total weight to 4
        Centroid centroid = new Centroid(2);
        centroid.add(3, 1);
        centroid.add(4, 1);
        x.add(centroid);

        assertEquals(2, x.size());
        assertEquals(4, x.sum());
    }

    // Inserting in sorted order forces rotations; the internal invariants are
    // then checked explicitly.
    @Test
    public void testBalancing() {
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 101; i++) {
            x.add(new Centroid(i));
        }

        assertEquals(101, x.size());
        assertEquals(101, x.sum());

        x.checkBalance();
        x.checkAggregates();
    }

    @Test
    public void testFloor() {
        // mostly tested in other tests
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 101; i++) {
            // duplicate means (i / 2 truncates) exercise floor with repeats
            x.add(new Centroid(i / 2));
        }

        assertEquals(IntAVLTree.NIL, x.floor(-30));

        for (Centroid centroid : x) {
            // floor of (mean + 0.1) must land back on a node with that mean
            assertEquals(centroid.mean(), x.mean(x.floor(centroid.mean() + 0.1)), 0);
        }
    }

    // headSum(node) must equal the total count of all nodes preceding it in
    // mean order.
    @Test
    public void testHeadSum() {
        AVLGroupTree x = new AVLGroupTree(false);
        for (int i = 0; i < 1000; ++i) {
            x.add(randomDouble(), randomIntBetween(1, 10), null);
        }
        long sum = 0;
        long last = -1;
        for (int node = x.first(); node != IntAVLTree.NIL; node = x.next(node)) {
            assertEquals(sum, x.headSum(node));
            sum += x.count(node);
            // remember the count of the most recently visited node
            last = x.count(node);
        }
        // the last node visited in order must be x.last()
        assertEquals(last, x.count(x.last()));
    }

    // floorSum(i) must return the last node whose headSum does not exceed i.
    @Test
    public void testFloorSum() {
        AVLGroupTree x = new AVLGroupTree(false);
        int total = 0;
        for (int i = 0; i < 1000; ++i) {
            int count = randomIntBetween(1, 10);
            x.add(randomDouble(), count, null);
            total += count;
        }
        assertEquals(IntAVLTree.NIL, x.floorSum(-1));
        for (long i = 0; i < total + 10; ++i) {
            final int floorNode = x.floorSum(i);
            assertTrue(x.headSum(floorNode) <= i);
            // the next node (if any) must already be past i
            final int next = x.next(floorNode);
            assertTrue(next == IntAVLTree.NIL || x.headSum(next) > i);
        }
    }

}
115 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.BeforeClass;
21 |
22 | import java.io.IOException;
23 | import java.nio.ByteBuffer;
24 |
// Runs the shared TDigestTest suite against the AVLTreeDigest implementation.
public class AVLTreeDigestTest extends TDigestTest {
    @BeforeClass
    public static void setup() throws IOException {
        // tags output files produced by the shared suite with "avl-tree"
        TDigestTest.setup("avl-tree");
    }

    // Supplies AVLTreeDigest instances to the shared tests.
    protected DigestFactory factory(final double compression) {
        return new DigestFactory() {
            @Override
            public TDigest create() {
                return new AVLTreeDigest(compression);
            }
        };
    }

    @Override
    protected TDigest fromBytes(ByteBuffer bytes) {
        return AVLTreeDigest.fromBytes(bytes);
    }

    // The following overrides disable shared tests that AVLTreeDigest does not
    // currently pass.
    @Override
    public void testRepeatedValues() {
        // disabled for AVLTreeDigest for now
    }

    @Override
    public void testSingletonInACrowd() {
        // disabled for AVLTreeDigest for now
    }

    @Override
    public void singleSingleRange() {
        // disabled for AVLTreeDigest for now
    }
}
60 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AbstractTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Ignore;
21 | import org.junit.runner.RunWith;
22 |
23 | import com.carrotsearch.randomizedtesting.JUnit3MethodProvider;
24 | import com.carrotsearch.randomizedtesting.JUnit4MethodProvider;
25 | import com.carrotsearch.randomizedtesting.RandomizedTest;
26 | import com.carrotsearch.randomizedtesting.annotations.Listeners;
27 | import com.carrotsearch.randomizedtesting.annotations.TestMethodProviders;
28 |
29 | @Ignore
30 | @Listeners({
31 | ReproduceInfoPrinterRunListener.class
32 | })
33 | @TestMethodProviders({
34 | JUnit3MethodProvider.class, // test names starting with test*
35 | JUnit4MethodProvider.class // test methods annotated with @Test
36 | })
37 | @RunWith(value = com.carrotsearch.randomizedtesting.RandomizedRunner.class)
38 | /**
39 | * Base test case, all other test cases must inherit this one.
40 | */
41 | public abstract class AbstractTest extends RandomizedTest {
42 |
43 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/AlternativeMergeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.io.FileNotFoundException;
23 | import java.io.PrintStream;
24 | import java.io.PrintWriter;
25 | import java.io.UnsupportedEncodingException;
26 | import java.util.*;
27 |
28 | import static junit.framework.Assert.assertEquals;
29 |
30 | public class AlternativeMergeTest {
31 | /**
32 | * Computes size using the alternative scaling limit for both an idealized merge and for
33 | * a MergingDigest.
34 | *
35 | * This test does some sanity checking, but the real purpose is to create data files
36 | * sizes.csv and counts.csv
37 | * @throws FileNotFoundException If output files can't be created.
38 | */
39 | @Test
40 | public void testMerges() throws FileNotFoundException {
41 | try (PrintWriter sizes = new PrintWriter("sizes.csv");
42 | PrintWriter out = new PrintWriter("counts.csv")) {
43 | sizes.printf("algo, counts, digest, compression, n\n");
44 | out.printf("algo, compression, n, q, count\n");
45 | for (int n : new int[]{100, 1000, 10000, 100000}) {
46 | for (double compression : new double[]{50, 100, 200, 400}) {
47 | MergingDigest digest1 = new MergingDigest(compression);
48 | AVLTreeDigest digest2 = new AVLTreeDigest(compression);
49 | List data = new ArrayList<>();
50 | Random gen = new Random();
51 | for (int i = 0; i < n; i++) {
52 | double x = gen.nextDouble();
53 | data.add(x);
54 | digest1.add(x);
55 | digest2.add(x);
56 | }
57 | Collections.sort(data);
58 | List counts = new ArrayList<>();
59 | double soFar = 0;
60 | double current = 0;
61 | for (Double x : data) {
62 | double q = (soFar + (current + 1.0) / 2) / n;
63 | if (current == 0 || current + 1 < n * Math.PI / compression * Math.sqrt(q * (1 - q))) {
64 | current += 1;
65 | } else {
66 | counts.add(current);
67 | soFar += current;
68 | current = 1;
69 | }
70 | }
71 | if (current > 0) {
72 | counts.add(current);
73 | }
74 | sizes.printf("%s, %d, %d, %.0f, %d\n", "merge", counts.size(), digest1.centroids().size(), compression, n);
75 | sizes.printf("%s, %d, %d, %.0f, %d\n", "tree", counts.size(), digest2.centroids().size(), compression, n);
76 | sizes.printf("%s, %d, %d, %.0f, %d\n", "ideal", counts.size(), counts.size(), compression, n);
77 | soFar = 0;
78 | for (Double count : counts) {
79 | out.printf("%s, %.0f, %d, %.3f, %.0f\n", "ideal", compression, n, (soFar + count / 2) / n, count);
80 | soFar += count;
81 | }
82 | assertEquals(n, soFar, 0);
83 | soFar = 0;
84 | for (Centroid c : digest1.centroids()) {
85 | out.printf("%s, %.0f, %d, %.3f, %d\n", "merge", compression, n, (soFar + c.count() / 2) / n, c.count());
86 | soFar += c.count();
87 | }
88 | assertEquals(n, soFar, 0);
89 | soFar = 0;
90 | for (Centroid c : digest2.centroids()) {
91 | out.printf("%s, %.0f, %d, %.3f, %d\n", "tree", compression, n, (soFar + c.count() / 2) / n, c.count());
92 | soFar += c.count();
93 | }
94 | assertEquals(n, soFar, 0);
95 | }
96 | }
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCount.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.math.stats;
2 |
3 | import org.junit.Test;
4 |
// Shared test that checks digests whose total weight is in the billions still
// report a sensible median after repeated merging. Subclasses supply the
// concrete digest implementation via createDigest().
public abstract class BigCount extends AbstractTest {

    @Test
    public void testBigMerge() {
        TDigest digest = createDigest();
        for (int i = 0; i < 5; i++) {
            digest.add(getDigest());
            // the synthetic data in addData puts the median at 3000
            double actual = digest.quantile(0.5);
            assertEquals("Count = " + digest.size(), 3000,
                    actual, 0.001);
        }
    }

    // Builds one fully-populated digest using the synthetic data below.
    private TDigest getDigest() {
        TDigest digest = createDigest();
        addData(digest);
        return digest;
    }

    // Overridden by subclasses to choose the digest implementation under test.
    public TDigest createDigest() {
        throw new IllegalStateException("Should have over-ridden createDigest");
    }

    // Loads n = 5 * 300M + 200 = 1,500,000,200 points: five blocks of 300M
    // points at values 10..5000 plus 200 points at a huge outlier value.
    private static void addData(TDigest digest) {
        double n = 300_000_000 * 5 + 200;

        addFakeCentroids(digest, n, 300_000_000, 10);
        addFakeCentroids(digest, n, 300_000_000, 200);
        addFakeCentroids(digest, n, 300_000_000, 3000);
        addFakeCentroids(digest, n, 300_000_000, 4000);
        addFakeCentroids(digest, n, 300_000_000, 5000);
        addFakeCentroids(digest, n, 200, 47883554);

        assertEquals(n, digest.size(), 0);
    }

    // Adds `points` copies of the value x, sliced into chunks whose sizes follow
    // the digest's own scale function (k/q round trip) so the resulting centroids
    // look like the ones a real digest of n points would contain.
    private static void addFakeCentroids(TDigest digest, double n, int points, int x) {
        long base = digest.size();
        double q0 = base / n;
        long added = 0;
        while (added < points) {
            // step one unit in k-space to find the allowed chunk in q-space
            double k0 = digest.scale.k(q0, digest.compression(), n);
            double q1 = digest.scale.q(k0 + 1, digest.compression(), n);
            q1 = Math.min(q1, (base + points) / n);
            // at least one point per chunk, never more than remain
            int m = (int) Math.min(points - added, Math.max(1, Math.rint((q1 - q0) * n)));
            added += m;
            digest.add(x, m);
            q0 = q1;
        }
    }
}
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCountMergingDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
// Runs the BigCount suite against MergingDigest with compression 100.
public class BigCountMergingDigest extends BigCount {
    @Override
    public TDigest createDigest() {
        return new MergingDigest(100);
    }
}
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/BigCountTreeDigest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
// Runs the BigCount suite against AVLTreeDigest with compression 100.
public class BigCountTreeDigest extends BigCount {
    @Override
    public TDigest createDigest() {
        return new AVLTreeDigest(100);
    }
}
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/DigestFactory.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | /**
21 | * A DigestFactory is used in tests to abstract what kind of digest is being tested.
22 | */
23 | public interface DigestFactory {
24 | TDigest getDigest(double compression);
25 | }
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/FloatHistogramTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Before;
21 | import org.junit.Test;
22 |
23 | import java.io.FileNotFoundException;
24 |
// Runs the shared histogram test cases against FloatHistogram.
public class FloatHistogramTest extends HistogramTestCases {
    @Before
    public void setup() {
        // FloatHistogram uses linear bucket spacing within the shared tests
        useLinearBuckets = true;
        factory = new HistogramFactory() {
            @Override
            public Histogram create(double min, double max) {
                return new FloatHistogram(min, max);
            }
        };
    }

    // Checks bucket layout for a histogram spanning [10e-6, 5] with 20 sub-bins.
    // NOTE(review): 79/141 are expectations consumed by the shared
    // HistogramTestCases.testBinSizes — see that method for their meaning.
    @Test
    public void testBins() {
        super.testBinSizes(79, 141, new FloatHistogram(10e-6, 5, 20));
    }

    // NOTE(review): 165.4/18/212 are expectations consumed by the shared
    // HistogramTestCases.doLinear — see that method for their meaning.
    @Test
    public void testLinear() throws FileNotFoundException {
        super.doLinear(165.4, 18, 212);
    }
}
47 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/IntAVLTreeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.util.*;
23 |
24 |
25 | public class IntAVLTreeTest extends AbstractTest {
26 |
27 | static class IntBag extends IntAVLTree {
28 |
29 | int value;
30 | int[] values;
31 | int[] counts;
32 |
33 | IntBag() {
34 | values = new int[capacity()];
35 | counts = new int[capacity()];
36 | }
37 |
38 | @SuppressWarnings("WeakerAccess")
39 | public boolean addValue(int value) {
40 | this.value = value;
41 | return super.add();
42 | }
43 |
44 | @SuppressWarnings("WeakerAccess")
45 | public boolean removeValue(int value) {
46 | this.value = value;
47 | final int node = find();
48 | if (node == NIL) {
49 | return false;
50 | } else {
51 | super.remove(node);
52 | return true;
53 | }
54 | }
55 |
56 | @Override
57 | protected void resize(int newCapacity) {
58 | super.resize(newCapacity);
59 | values = Arrays.copyOf(values, newCapacity);
60 | counts = Arrays.copyOf(counts, newCapacity);
61 | }
62 |
63 | @Override
64 | protected int compare(int node) {
65 | return value - values[node];
66 | }
67 |
68 | @Override
69 | protected void copy(int node) {
70 | values[node] = value;
71 | counts[node] = 1;
72 | }
73 |
74 | @Override
75 | protected void merge(int node) {
76 | values[node] = value;
77 | counts[node]++;
78 | }
79 |
80 | }
81 |
82 | @Test
83 | public void dualAdd() {
84 | Random r = new Random(0);
85 | TreeMap map = new TreeMap<>();
86 | IntBag bag = new IntBag();
87 | for (int i = 0; i < 100000; ++i) {
88 | final int v = r.nextInt(100000);
89 | if (map.containsKey(v)) {
90 | map.put(v, map.get(v) + 1);
91 | assertFalse(bag.addValue(v));
92 | } else {
93 | map.put(v, 1);
94 | assertTrue(bag.addValue(v));
95 | }
96 | }
97 | Iterator> it = map.entrySet().iterator();
98 | for (int node = bag.first(bag.root()); node != IntAVLTree.NIL; node = bag.next(node)) {
99 | final Map.Entry next = it.next();
100 | assertEquals(next.getKey().intValue(), bag.values[node]);
101 | assertEquals(next.getValue().intValue(), bag.counts[node]);
102 | }
103 | assertFalse(it.hasNext());
104 | }
105 |
106 | @Test
107 | public void dualAddRemove() {
108 | Random r = new Random(0);
109 | TreeMap map = new TreeMap<>();
110 | IntBag bag = new IntBag();
111 | for (int i = 0; i < 100000; ++i) {
112 | final int v = r.nextInt(1000);
113 | if (r.nextBoolean()) {
114 | // add
115 | if (map.containsKey(v)) {
116 | map.put(v, map.get(v) + 1);
117 | assertFalse(bag.addValue(v));
118 | } else {
119 | map.put(v, 1);
120 | assertTrue(bag.addValue(v));
121 | }
122 | } else {
123 | // remove
124 | assertEquals(map.remove(v) != null, bag.removeValue(v));
125 | }
126 | }
127 | Iterator> it = map.entrySet().iterator();
128 | for (int node = bag.first(bag.root()); node != IntAVLTree.NIL; node = bag.next(node)) {
129 | final Map.Entry next = it.next();
130 | assertEquals(next.getKey().intValue(), bag.values[node]);
131 | assertEquals(next.getValue().intValue(), bag.counts[node]);
132 | }
133 | assertFalse(it.hasNext());
134 | }
135 |
136 | }
137 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/LogHistogramTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Before;
21 | import org.junit.Test;
22 |
23 | import java.io.FileNotFoundException;
24 | import java.io.IOException;
25 |
26 | import static org.junit.Assert.assertEquals;
27 | import static org.junit.Assert.assertTrue;
28 |
// Runs the shared histogram test cases against LogHistogram, plus tests for
// its fast approximate log2/pow2 helpers.
public class LogHistogramTest extends HistogramTestCases {
    @Before
    public void setup() {
        // LogHistogram buckets are logarithmic, not linear
        useLinearBuckets = false;
        factory = new HistogramFactory() {
            @Override
            public Histogram create(double min, double max) {
                return new LogHistogram(min, max, 0.05);
            }
        };
    }


    // approxLog2 must stay within 0.01 of true log2 across roughly 1e-6..1e6
    // (x is multiplied by 1 + pi/100 each step to avoid hitting exact powers).
    @Test
    public void testApproxLog() {
        double x = 1e-6;
        for (int i = 0; i < 1000; i++) {
            assertEquals(Math.log(x) / Math.log(2), LogHistogram.approxLog2(x), 0.01);
            x *= 1.0 + Math.PI / 100.0;
        }
        assertTrue("Insufficient range", x > 1e6);
    }

    // pow2 must invert approxLog2 to near machine precision.
    @Test
    public void testInverse() {
        for (double x = 0.001; x <= 100; x += 1e-3) {
            double log = LogHistogram.approxLog2(x);
            double roundTrip = LogHistogram.pow2(log);
            assertEquals(x, roundTrip, 1e-13);
        }

    }

    // NOTE(review): 72/129 are expectations consumed by the shared
    // HistogramTestCases.testBinSizes — see that method for their meaning.
    @Test
    public void testBins() {
        super.testBinSizes(72, 129, new LogHistogram(10e-6, 5, 0.1));
    }

    @Test
    public void testLinear() throws FileNotFoundException {
        super.doLinear(146, 17, 189);
    }

    // The following shared tests are disabled for LogHistogram.
    @Override
    public void testCompression() {
        //ignore
    }

    @Override
    public void testSerialization() {
        //ignore
    }
}
82 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/MegaMergeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import com.google.common.collect.Lists;
21 | import org.junit.Test;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.Random;
26 | import java.util.concurrent.*;
27 |
28 | import static org.junit.Assume.assumeTrue;
29 |
30 | public class MegaMergeTest {
31 |
32 | private static final int DAY = 280;
33 | private static final int WIDTH = 1000;
34 | private static final int DATA_STRIDE = 23;
35 |
36 | @Test
37 | public void testLargeMerge() throws InterruptedException, ExecutionException {
38 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
39 | // prove we can summarize a days worth of data at 5 minute intervals. Each interval has
40 | // 1000 samples each with 1500 data points
41 | double t0 = System.nanoTime() * 1e-9;
42 | // we cheat by only having 23 samples that we rotate into the data summaries
43 | // the raw data
44 | double[][] data = new double[DATA_STRIDE][1500];
45 | Random gen = new Random();
46 | for (int i = 0; i < DATA_STRIDE; i++) {
47 | for (int j = 0; j < 1500; j++) {
48 | data[i][j] = gen.nextGaussian();
49 | }
50 | }
51 | double t1 = System.nanoTime() * 1e-9;
52 | System.out.printf("Data has been generated\n");
53 | // record the basic summaries
54 | final MergingDigest[][] td = new MergingDigest[DAY][WIDTH];
55 | int m = 0;
56 | for (int i = 0; i < DAY; i++) {
57 | if (i % 10 == 0) {
58 | System.out.printf("%d\n", i);
59 | }
60 | for (int j = 0; j < WIDTH; j++) {
61 | td[i][j] = new MergingDigest(100);
62 | for (int k = 0; k < 1500; k++) {
63 | td[i][j].add(data[m][k]);
64 | }
65 | m = (m + 1) % DATA_STRIDE;
66 | }
67 | }
68 | double t2 = System.nanoTime() * 1e-9;
69 | System.out.printf("Micro-summaries filled\n");
70 | System.out.printf("%.3f,%.3f\n", t1 - t0, t2 - t1);
71 | int cores = Runtime.getRuntime().availableProcessors();
72 | System.out.printf("using %d cores\n", cores);
73 | for (int threads = 1; threads < 2 * cores; threads++) {
74 | t2 = System.nanoTime() * 1e-9;
75 | // pull the summaries together into 288 reasonably high resolution t-digests
76 | List> tasks = new ArrayList<>();
77 | for (int i = 0; i < DAY; i++) {
78 | final MergingDigest[] elements = td[i];
79 | tasks.add(new Callable() {
80 | @Override
81 | public MergingDigest call() {
82 | MergingDigest rx = new MergingDigest(100);
83 | rx.add(Lists.newArrayList(elements));
84 | return rx;
85 | }
86 | });
87 | }
88 | ExecutorService pool = Executors.newFixedThreadPool(threads);
89 | List> results = pool.invokeAll(tasks);
90 | final MergingDigest[] r = new MergingDigest[DAY];
91 | try {
92 | int i = 0;
93 | for (Future result : results) {
94 | r[i++] = result.get();
95 | }
96 | } finally {
97 | pool.shutdown();
98 | pool.awaitTermination(2, TimeUnit.SECONDS);
99 | }
100 | double t3 = System.nanoTime() * 1e-9;
101 | System.out.printf("%.3f,%.3f,%.3f,%.3f\n",
102 | r[0].quantile(0.99), r[100].quantile(0.99),
103 | r[150].quantile(0.99), r[250].quantile(0.99));
104 | System.out.printf("%d,%.3f\n", threads, t3 - t2);
105 | }
106 | }
107 |
108 | @Test
109 | public void megaMerge() {
110 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
111 | final int SUMMARIES = 1000;
112 | final int POINTS = 1000000;
113 | double t0 = System.nanoTime() * 1e-9;
114 | double[] data = new double[10013];
115 | Random gen = new Random();
116 | for (int i = 0; i < data.length; i++) {
117 | data[i] = gen.nextGaussian();
118 | }
119 | double t1 = System.nanoTime() * 1e-9;
120 | System.out.printf("Data has been generated\n");
121 |
122 | // record the basic summaries
123 | final MergingDigest[] td = new MergingDigest[SUMMARIES];
124 | int k = 0;
125 | for (int i = 0; i < SUMMARIES; i++) {
126 | if (i % 100 == 0) {
127 | System.out.printf("%d\n", i);
128 | }
129 | td[i] = new MergingDigest(200);
130 | for (int j = 0; j < POINTS; j++) {
131 | td[i].add(data[k]);
132 | k = (k + 1) % data.length;
133 | }
134 | }
135 | System.out.printf("Partials built\n");
136 | double t2 = System.nanoTime() * 1e-9;
137 |
138 | MergingDigest tAll = new MergingDigest(200);
139 | tAll.add(Lists.newArrayList(td));
140 | double t3 = System.nanoTime() * 1e-9;
141 | System.out.printf("%.3f, %.3f, %.3f\n", t1 - t0, t2 - t1, t3 - t2);
142 | }
143 | }
144 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/ReproTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.math.stats;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.Random;
7 |
8 | import static com.tdunning.math.stats.ScaleFunction.*;
9 |
10 | public class ReproTest {
11 |
12 | @Test
13 | public void testRepro() {
14 | Random gen = new Random();
15 | gen.setSeed(1);
16 | double[] data = new double[10000];
17 | for (int i = 0; i < data.length; i++) {
18 | // these samples are truncated and thus have lots of duplicates
19 | // this can wreak havoc with the t-digest invariants
20 | data[i] = Math.floor(gen.nextDouble() * 10);
21 | }
22 |
23 | for (ScaleFunction sf : ScaleFunction.values()) {
24 | if (sf.toString().contains("NO_NORM")) {
25 | continue;
26 | }
27 | TDigest distLow = new MergingDigest(100);
28 | TDigest distMedian = new MergingDigest(100);
29 | TDigest distHigh = new MergingDigest(100);
30 | for (int i = 0; i < 500; i++) {
31 | MergingDigest d1 = new MergingDigest(100);
32 | d1.setScaleFunction(K_2);
33 | for (double x : data) {
34 | d1.add(x);
35 | }
36 | d1.compress();
37 | distLow.add(d1.quantile(0.001));
38 | distMedian.add(d1.quantile(0.5));
39 | distHigh.add(d1.quantile(0.999));
40 | }
41 | Assert.assertEquals(0, distLow.quantile(0.0), 0);
42 | Assert.assertEquals(0, distLow.quantile(0.5), 0);
43 | Assert.assertEquals(0, distLow.quantile(1.0), 0);
44 | Assert.assertEquals(9, distHigh.quantile(0.0), 0);
45 | Assert.assertEquals(9, distHigh.quantile(0.5), 0);
46 | Assert.assertEquals(9, distHigh.quantile(1.0), 0);
47 | System.out.printf("%s,%.3f,%.5f,%.5f,%.5f\n",
48 | sf, 0.5,
49 | distMedian.quantile(0.01), distMedian.quantile(0.5), distMedian.quantile(0.99));
50 | }
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/ReproduceInfoPrinterRunListener.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.runner.Result;
21 | import org.junit.runner.notification.Failure;
22 | import org.junit.runner.notification.RunListener;
23 |
24 | import com.carrotsearch.randomizedtesting.RandomizedContext;
25 |
26 | public final class ReproduceInfoPrinterRunListener extends RunListener {
27 |
28 | private boolean failed = false;
29 |
30 | @Override
31 | public void testFailure(Failure failure) {
32 | failed = true;
33 | }
34 |
35 | @Override
36 | public void testRunFinished(Result result) {
37 | if (failed) {
38 | printReproLine();
39 | }
40 | failed = false;
41 | }
42 |
43 | private void printReproLine() {
44 | final StringBuilder b = new StringBuilder();
45 | b.append("NOTE: reproduce with: mvn test -Dtests.seed=").append(RandomizedContext.current().getRunnerSeedAsString());
46 | if (System.getProperty("runSlowTests") != null) {
47 | b.append(" -DrunSlowTests=").append(System.getProperty("runSlowTests"));
48 | }
49 | b.append(" -Dtests.class=").append(RandomizedContext.current().getTargetClass().getName());
50 | System.out.println(b.toString());
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/SerializationTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.nio.BufferUnderflowException;
23 | import java.nio.ByteBuffer;
24 | import java.util.Iterator;
25 |
26 | import static org.junit.Assert.assertEquals;
27 | import static org.junit.Assert.assertFalse;
28 | import static org.junit.Assert.assertTrue;
29 |
30 | public class SerializationTest {
31 | @Test
32 | public void mergingDigestSerDes() {
33 | final TDigest out = MergingDigest.createDigest(100);
34 | out.add(42.5);
35 | out.add(1);
36 | out.add(24.0);
37 |
38 | final ByteBuffer output = ByteBuffer.allocate(out.smallByteSize());
39 | out.asSmallBytes(output);
40 |
41 | ByteBuffer input = ByteBuffer.wrap(output.array());
42 | try {
43 | TDigest m = MergingDigest.fromBytes(input);
44 | for (double q = 0; q <= 1; q+=0.001) {
45 | assertEquals(m.quantile(q), out.quantile(q), 0);
46 | }
47 | Iterator ix = m.centroids().iterator();
48 | for (Centroid centroid : out.centroids()) {
49 | assertTrue(ix.hasNext());
50 | Centroid c = ix.next();
51 | assertEquals(centroid.mean(), c.mean(), 0);
52 | assertEquals(centroid.count(), c.count(), 0);
53 | }
54 | assertFalse(ix.hasNext());
55 | } catch (BufferUnderflowException e) {
56 | System.out.println("WTF?");
57 | }
58 |
59 | input = ByteBuffer.wrap(output.array());
60 | final TDigest in = MergingDigest.fromBytes(input);
61 | assertEquals(out.quantile(0.95), in.quantile(0.95), 0.001);
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/TDigestSerializationTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import org.junit.Test;
21 |
22 | import java.io.ByteArrayInputStream;
23 | import java.io.ByteArrayOutputStream;
24 | import java.io.IOException;
25 | import java.io.ObjectInputStream;
26 | import java.io.ObjectOutputStream;
27 | import java.io.Serializable;
28 | import java.util.Iterator;
29 | import java.util.Random;
30 |
31 | import static org.junit.Assert.assertEquals;
32 | import static org.junit.Assert.assertFalse;
33 | import static org.junit.Assert.assertNotNull;
34 |
35 | /**
36 | * Verifies that the various TDigest implementations can be serialized.
37 | *
38 | * Serializability is important, for example, if we want to use t-digests with Spark.
39 | */
40 | public class TDigestSerializationTest {
41 | @Test
42 | public void testMergingDigest() throws IOException {
43 | assertSerializesAndDeserializes(new MergingDigest(100));
44 | }
45 |
46 | @Test
47 | public void testAVLTreeDigest() throws IOException {
48 | assertSerializesAndDeserializes(new AVLTreeDigest(100));
49 | }
50 |
51 | private void assertSerializesAndDeserializes(T tdigest) throws IOException {
52 | assertNotNull(deserialize(serialize(tdigest)));
53 |
54 | final Random gen = new Random();
55 | for (int i = 0; i < 100000; i++) {
56 | tdigest.add(gen.nextDouble());
57 | }
58 | T roundTrip = deserialize(serialize(tdigest));
59 |
60 | assertTDigestEquals(tdigest, roundTrip);
61 | }
62 |
63 | private static byte[] serialize(Serializable obj) throws IOException {
64 | ByteArrayOutputStream baos = new ByteArrayOutputStream(5120);
65 | try (ObjectOutputStream out = new ObjectOutputStream(baos)){
66 | out.writeObject(obj);
67 | return baos.toByteArray();
68 | }
69 | }
70 |
71 | private static T deserialize(byte[] objectData) throws IOException {
72 | try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(objectData))) {
73 | //noinspection unchecked
74 | return (T) in.readObject();
75 | } catch (ClassCastException | ClassNotFoundException | IOException e) {
76 | throw new IOException(e);
77 | }
78 | }
79 |
80 | private void assertTDigestEquals(TDigest t1, TDigest t2) {
81 | assertEquals(t1.getMin(), t2.getMin(), 0);
82 | assertEquals(t1.getMax(), t2.getMax(), 0);
83 | Iterator cx = t2.centroids().iterator();
84 | for (Centroid c1 : t1.centroids()) {
85 | Centroid c2 = cx.next();
86 | assertEquals(c1.count(), c2.count());
87 | assertEquals(c1.mean(), c2.mean(), 1e-10);
88 | }
89 | assertFalse(cx.hasNext());
90 | assertNotNull(t2);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/math/stats/TDigestUtilTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.math.stats;
19 |
20 | import java.nio.ByteBuffer;
21 | import java.util.List;
22 | import java.util.Random;
23 |
24 | import org.junit.Test;
25 |
26 | import com.google.common.collect.Lists;
27 |
28 | public class TDigestUtilTest extends AbstractTest {
29 |
30 | @Test
31 | public void testIntEncoding() {
32 | Random gen = getRandom();
33 | ByteBuffer buf = ByteBuffer.allocate(10000);
34 | List ref = Lists.newArrayList();
35 | for (int i = 0; i < 3000; i++) {
36 | int n = gen.nextInt();
37 | n = n >>> (i / 100);
38 | ref.add(n);
39 | AbstractTDigest.encode(buf, n);
40 | }
41 |
42 | buf.flip();
43 |
44 | for (int i = 0; i < 3000; i++) {
45 | int n = AbstractTDigest.decode(buf);
46 | assertEquals(String.format("%d:", i), ref.get(i).intValue(), n);
47 | }
48 | }
49 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/tdunning/scale/ScaleTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.scale;
19 |
20 | import com.google.common.collect.Lists;
21 | import org.junit.Test;
22 |
23 | import java.util.List;
24 |
25 | import static org.junit.Assume.assumeTrue;
26 |
27 | /**
28 | * Tests scaling properties of t-digest variants
29 | */
30 | public class ScaleTest {
31 | @Test
32 | public void testGrowth() {
33 | assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests")));
34 | for (Limit limit : new Limit[]{
35 | new RootLinearLimit(), new RootLimit(),
36 | new StandardLimit(), new LinearLimit(), new PiecewiseLinearLimit(0.05),
37 | new PiecewiseLinearLimit(0.1), new PiecewiseLinearLimit(0.2),
38 | }) {
39 | for (long n : new long[]{1000, 10000, 100000, 1000000L, 10000000L, 100000000L, 1000000000L}) {
40 | List r = size(n, 200.0, limit);
41 | int nonTrivial = 0;
42 | for (Centroid centroid : r) {
43 | if (centroid.count > 1) {
44 | nonTrivial++;
45 | }
46 | }
47 | System.out.printf("%s\t%d\t%d\t%d\n", limit.getClass().getSimpleName(), n, r.size(), nonTrivial);
48 | }
49 | }
50 | }
51 |
52 | @SuppressWarnings("WeakerAccess")
53 | public List size(long n, @SuppressWarnings("SameParameterValue") double compression, Limit limit) {
54 | if (compression <= 0) {
55 | compression = 50;
56 | }
57 |
58 | if (limit == null) {
59 | limit = new StandardLimit();
60 | }
61 |
62 | double total = 0;
63 | long i = 0;
64 | List r = Lists.newArrayList();
65 | while (i < n) {
66 | double mean = i;
67 | int count = 1;
68 | i++;
69 | double qx = total / n;
70 |
71 | while (i < n && count + 1 <= Math.max(1, limit.limit(n, qx) / compression)) {
72 | count++;
73 | mean += (i - mean) / count;
74 | qx = (total + count / 2) / n;
75 | i++;
76 | }
77 | total += count;
78 | r.add(new Centroid(mean, count));
79 | }
80 | return r;
81 | }
82 |
83 | public static class Centroid {
84 | final double mean;
85 | final int count;
86 |
87 | @SuppressWarnings("WeakerAccess")
88 | public Centroid(double mean, int count) {
89 | this.mean = mean;
90 | this.count = count;
91 | }
92 | }
93 |
94 | public interface Limit {
95 | double limit(long n, double q);
96 | }
97 |
98 | public static class StandardLimit implements Limit {
99 | @Override
100 | public double limit(long n, double q) {
101 | return 4 * n * q * (1 - q);
102 | }
103 | }
104 |
105 | public static class RootLimit implements Limit {
106 | @Override
107 | public double limit(long n, double q) {
108 | return 2 * n * Math.sqrt(q * (1 - q));
109 | }
110 | }
111 |
112 | public static class LinearLimit implements Limit {
113 | @Override
114 | public double limit(long n, double q) {
115 | return 2 * n * Math.min(q, 1 - q);
116 | }
117 | }
118 |
119 | public static class RootLinearLimit implements Limit {
120 | @Override
121 | public double limit(long n, double q) {
122 | return n * Math.sqrt(2 * Math.min(q, 1 - q));
123 | }
124 | }
125 |
126 | public static class PowerLinearLimit implements Limit {
127 | private final double exp;
128 |
129 | public PowerLinearLimit(double exp) {
130 | this.exp = exp;
131 | }
132 |
133 | @Override
134 | public double limit(long n, double q) {
135 | return n * Math.pow(2 * Math.min(q, 1 - q), exp);
136 | }
137 | }
138 |
139 | private class PiecewiseLinearLimit implements Limit {
140 | private final double cut;
141 |
142 | PiecewiseLinearLimit(double cut) {
143 | this.cut = cut;
144 | }
145 |
146 | @Override
147 | public double limit(long n, double q) {
148 | if (q < cut) {
149 | return n * q / cut;
150 | } else if (1 - q < cut) {
151 | return limit(n, 1 - q);
152 | } else {
153 | return n;
154 | }
155 |
156 | }
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
/docs/error-uniform-delta=100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=100.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=200.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=200.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=50.png
--------------------------------------------------------------------------------
/docs/error-uniform-delta=500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-uniform-delta=500.png
--------------------------------------------------------------------------------
/docs/error-vs-compression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/error-vs-compression.png
--------------------------------------------------------------------------------
/docs/interpolation-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/interpolation-figure.png
--------------------------------------------------------------------------------
/docs/max-error-uniform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/max-error-uniform.png
--------------------------------------------------------------------------------
/docs/proofs/invariant-preservation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/proofs/invariant-preservation.pdf
--------------------------------------------------------------------------------
/docs/proofs/refs.bib:
--------------------------------------------------------------------------------
1 | @article{Gan:2018:MQS:3236187.3269475,
2 | author = {Gan, Edward and Ding, Jialin and Tai, Kai Sheng and Sharan, Vatsal and Bailis, Peter},
3 | title = {Moment-based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
4 | journal = {Proc. VLDB Endow.},
5 | issue_date = {July 2018},
6 | volume = {11},
7 | number = {11},
8 | month = jul,
9 | year = {2018},
10 | issn = {2150-8097},
11 | pages = {1647--1660},
12 | numpages = {14},
13 | url = {https://doi.org/10.14778/3236187.3236212},
14 | doi = {10.14778/3236187.3236212},
15 | acmid = {3269475},
16 | publisher = {VLDB Endowment},
17 | }
18 |
19 | @INPROCEEDINGS{Chen2000,
20 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
21 | title = {Incremental Quantile Estimation for Massive Tracking},
22 | booktitle = {In Proceedings of KDD},
23 | year = {2000},
24 | pages = {516--522}
25 | }
26 |
27 | @article{one-dimensional-k-means,
28 | id= "pmid:27942416",
29 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
30 | author= {Haizhou Wang and Mingzhou Song},
31 | journal= "The R journal",
32 | ISSN= "2073-4859",
33 | year = {2011},
34 | month={12},
35 | pages= {29-33},
36 | volume=3,
37 | issue=2,
38 | PMID = "27942416",
39 | PMCID= "PMC5148156"
40 | }
41 |
42 | @inproceedings{Greenwald-space-efficient-online-quantiles,
43 | author = {Greenwald, Michael and Khanna, Sanjeev},
44 | title = {Space-efficient Online Computation of Quantile Summaries},
45 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
46 | series = {SIGMOD '01},
47 | year = {2001},
48 | isbn = {1-58113-332-4},
49 | location = {Santa Barbara, California, USA},
50 | pages = {58--66},
51 | numpages = {9},
52 | url = {http://doi.acm.org/10.1145/375663.375670},
53 | doi = {10.1145/375663.375670},
54 | acmid = {375670},
55 | publisher = {ACM},
56 | address = {New York, NY, USA}
57 | }
58 |
59 | @article{sawzall,
60 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
61 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
62 | year = {2005},
63 | journal = {Scientific Programming Journal},
64 | pages = {277--298},
65 | volume = {13}
66 | }
67 |
68 | @article{munro1980,
69 | author = "J.I. Munro and M.S. Paterson",
70 | year = "1980",
71 | title = "Selection and sorting with limited storage",
72 | journal = "Theoretical Computer Science",
73 | volume = "12",
74 | number = "3",
75 | pages = "315 - 323",
76 | issn = "0304-3975",
77 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
78 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
79 | }
80 | @inproceedings{qdigest,
81 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
82 | year = {2004},
83 | month = {09},
84 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
85 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
86 | doi = {10.1145/1031495.1031524}
87 | }
88 |
89 | @misc{wiki_welford,
90 | title = "Algorithms for Calculating Variance, Online Algorithm",
  author =	 "The Wikimedia Foundation",
92 | year = 2018,
93 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
94 | note = "[Online; accessed 19-October-2013]"
95 | }
96 |
97 | @misc{datafu,
98 | title = "The Apache Datafu Project",
99 | author = "DFU",
100 | year = 2019,
101 | publisher = "Apache Software Foundation",
102 | howpublished = "\url{https://datafu.apache.org/}",
103 | note = "[Online; accessed 23-January-2018]"
104 | }
105 |
106 | @misc{t-digest-project,
107 | title = "The t-digest Library",
108 | author = "Ted Dunning",
109 | year = 2018,
110 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
111 | note = "[Online; accessed 23-January-2018]"
112 | }
113 |
114 | @misc{t-digest-arxiv,
115 | title = {Computing Extremely Accurate Quantiles Using t-Digests},
116 | author = "Ted Dunning and Otmar Ertl",
117 | year = 2018,
118 | howpublished = "\url{https://arxiv.org/abs/1902.04023}",
119 | note="arXiv:1902.04023 [stat.CO]"
120 | }
121 |
122 | @misc{moment-sketch-arxiv,
123 | title = {Moment-Based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
124 | author = {Edward Gan and Jialin Ding and Kai Sheng Tai and Vatsal Sharan and Peter Bailis},
125 | year = 2018,
126 | howpublished = "\url{https://arxiv.org/abs/1803.01969v1}",
127 | note = "arXiv:1803.01969v1"
128 | }
129 |
130 | @misc{github:stream,
131 | title = "Stream summarizer and cardinality estimator",
132 | author = "StreamLib",
133 | year = 2019,
134 | howpublished = "\url{https://github.com/addthis/stream-lib}",
135 | note = "[Online; accessed 11-February-2019]"
136 | }
137 |
138 | @inbook{knuth2welford,
139 | author={Donald E. Knuth},
140 | year=1998,
141 | edition=3,
142 | pages=232,
143 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
144 | publisher= {Addison-Wesley},
145 | address= {Boston}
146 | }
147 |
148 | @ARTICLE{welford62,
149 | author = {B. P. Welford },
150 | year = {1962},
151 | title = {Note on a method for calculating corrected sums of squares and products},
152 | journal = {Technometrics},
153 | pages = {419--420}
154 | }
155 |
--------------------------------------------------------------------------------
/docs/proofs/sizing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/proofs/sizing.pdf
--------------------------------------------------------------------------------
/docs/quantiles/quantiles.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/quantiles/quantiles.pdf
--------------------------------------------------------------------------------
/docs/quantiles/quantiles.tex:
--------------------------------------------------------------------------------
1 | \documentclass[11pt]{amsart}
2 | \usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
3 | \geometry{letterpaper} % ... or a4paper or a5paper or ...
4 | %\geometry{landscape} % Activate for for rotated page geometry
5 | %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
6 | \usepackage{graphicx}
7 | \usepackage{amssymb}
8 | \usepackage{epstopdf}
9 | \DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}
10 |
11 | \title{Brief Article}
12 | \author{The Author}
13 | %\date{} % Activate to display a given date or no date
14 |
15 | \begin{document}
16 | \maketitle
17 | \section{Assumptions}
18 | \begin{enumerate}
19 | \item The samples for each centroid are evenly divided on each side of the centroid
20 | \item The samples between centroids are uniformly distributed
21 | \item If a centroid has $n = 2k + 1$ samples, then there are $k$ samples on each side and one at the centroid
\item If a centroid has $n = 2k$ samples, then there are $k-1/2$ samples on each side
23 | \item the first and last centroid will have only one sample
24 | \end{enumerate}
25 | \section{Equal Spacing Model}
26 | Take two centroids separated by $x$ with $n_\mathtt{ left}$ and $n_\mathtt{ right}$ samples respectively. We know the following about the samples between these centroids
27 | \begin{enumerate}
28 | \item the first and last centroids represent the minimum and maximum samples for the entire datasets
29 | \item if $n_\mathtt{left}=1$ or $n_\mathtt{right}=1$ then the unique sample for the corresponding centroid is at the centroid
30 | \item there will be $\lfloor n_\mathtt{left} / 2 \rfloor + \lfloor n_\mathtt{right} / 2 \rfloor$ samples between the centroids
31 | \item samples will be spaced $\Delta x = 2x / ( n_\mathtt {left} + n_\mathtt{ right})$ apart
32 | \item the left-most sample is at $((n_\mathtt{left} \mod 2) + 1)\Delta x / 2$ from the left centroid
\item the right-most sample is at $((n_\mathtt{right} \mod 2) + 1)\Delta x / 2$ from the right centroid
34 | \end{enumerate}
35 | %\subsection{}
36 |
37 |
38 |
39 | \end{document}
--------------------------------------------------------------------------------
/docs/r-sim-diagrams/figs.r:
--------------------------------------------------------------------------------
# Generates a sequence of numbered PDF diagrams (plot-001.pdf, plot-002.pdf, ...)
# illustrating the arcsine scale function and the resulting cluster size limits.

fig.no = 1
pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# This figure shows the mapping between q and k and how variable size clusters result.
par(cex.lab=1.5)
par(cex.axis=1.5)
scale = 30
# 16 q positions chosen so the marks are equally spaced in k, i.e. the inverse
# of k(q) = scale * (asin(2q-1)/pi + 1/2)
q.marks = (sin(seq(-pi/2+0.01,pi/2-0.01,length.out=16))+1)/2
plot(q.marks, scale*asin(2*q.marks-1)/pi+scale/2, xlim=c(0,1.05), ylim=c(-3,scale),
     lwd=2, cex=0.7,
     type='b', ylab='k', xlab='q')
# drop a gray guide line from just below each mark down past the x axis
for (i in 1:(length(q.marks))) {
  q = q.marks[i]
  lines(c(q,q), c(-3, scale*asin(2*q-1)/pi + scale/2 -1), lwd=2, col='gray')
}
dev.off()

pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# this shows the old and sqrt limits
par(cex.lab=1.5)
par(cex.axis=1.5)
q = seq(0, 1, by=0.001)
# solid: 6*q*(1-q) (old limit); dashed: (8/pi)*sqrt(q*(1-q)) (sqrt limit)
plot(q, 6*q*(1-q), type='l', lwd=2, ylab="Cluster Size")
lines(q, 8/pi*sqrt(q*(1-q)), lwd=2, lty=2)

dev.off()

pdf(sprintf("plot-%03d.pdf", fig.no),
    width=7, height=5, pointsize=10)
fig.no = fig.no + 1

# k(q) for the arcsine scale function scaled to a maximum of 100
par(cex.lab=1.5)
par(cex.axis=1.5)
plot(q, 100*(asin(2*q-1)/pi+0.5), type='l', lwd=2,
     ylab="k")

dev.off()
--------------------------------------------------------------------------------
/docs/r-sim-diagrams/shifts.r:
--------------------------------------------------------------------------------
# Draws a step plot of Gaussian noise around a piecewise-constant mean shift
# with a brief 5-sample spike near t = 1600; arrows mark the change points.

# draw a downward arrow marking a change point at time x
mark = function(x) {arrows(x,9, x,5, length=0.15, angle=10, lwd=2)}

z = rnorm(2000)
shift = c(rep(0, 400), rep(1.5, 600), rep(-2, 300), rep(-1, 700))
# BUG FIX: the tail was rep(395, 0) -- repeating 395 zero times -- which left
# spike 395 elements short of the 2000 needed to line up with z and shift
# (triggering length-recycling in the sum below). rep(0, 395) pads it correctly.
spike = c(rep(0,1600), rep(9, 5), rep(0, 395))
plot(z+shift+spike, type='s', xlab='Time', ylab='Shift + Noise', ylim=c(-10,10))
mark(400)
mark(1000)
mark(1300)



--------------------------------------------------------------------------------
/docs/r-sim-diagrams/sim.r:
--------------------------------------------------------------------------------
# Step-by-step simulation of naive t-digest cluster growth on normal samples,
# writing a snapshot figure (fig-001.pdf, fig-002.pdf, ...) after each batch.
# State (centroids, counts, offset, plot.offset) lives in globals updated via <<-.

x = seq(-5,5,by=0.01)
# size limit 4*N*q*(1-q)/delta with delta = 50, where q = pnorm(x)
# NOTE(review): the n argument is ignored; the total weight comes from the
# global `counts` via sum(counts) -- confirm this is intentional
sizeLimit = function(n,x) {q=pnorm(x);4*sum(counts)*q*(1-q)/50}
# add one sample to the nearest centroid if it has room under the size limit,
# otherwise start a new singleton centroid (mutates the globals)
addPoint = function(p) {
  dist = abs(centroids - p)
  k = which(min(dist) == dist)[1]
  if (counts[k] < sizeLimit(N, p)) {
    counts[k] <<- counts[k]+1
    # incremental mean update for the receiving centroid
    centroids[k] <<- centroids[k] + (p-centroids[k]) / counts[k]
  } else {
    centroids <<- c(centroids, p)
    counts <<- c(counts, 1)
  }
}

# feed the next n samples into the digest, advancing the global offset
offset = 100
step = function(n=100) {
  for (i in 1:n) {
    addPoint(samples[i+offset])
  }
  offset <<- offset + n
  counts
}

plot(x, sizeLimit(40, x), type='l')

N = 10e6
samples = rnorm(N)
# seed the digest with the first two samples...
centroids = samples[1:2]
# ...each with count 1 (the comparison yields TRUE/TRUE; +0 coerces to numeric 1s)
counts = (centroids != 1000) + 0

plot.offset = 0
# run ten batches of stepSize points, plotting counts vs. quantile (pnorm of
# the centroid position) after each batch, with the size limit overlaid
plot.stuff = function(stepSize=1000) {
  for (i in 1:10) {
    step(stepSize)
    pdf(sprintf("fig-%03d.pdf", i + plot.offset),
        width=5, height=5, pointsize=10)
    plot(pnorm(centroids[order(centroids)]), counts[order(centroids)],
         type='p', ylim=c(0, 2*max(counts)),
         pch=21, bg=rgb(0,0,0,alpha=0.1), col=rgb(0,0,0,alpha=0.1), cex=0.6);
    centers = centroids[order(centroids)]
    limits = sizeLimit(sum(counts), centers)
    lines(pnorm(centers), limits, type='l')
    dev.off()
  }
  plot.offset <<- plot.offset + 10
}

# successively larger batches: 1e3, 1e4, 1e5, 1e6 points total per call
plot.stuff(100)
plot.stuff(1000)
plot.stuff(10000)
plot.stuff(100000)
--------------------------------------------------------------------------------
/docs/simpa/declaration-of-competing-interests.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/declaration-of-competing-interests.docx
--------------------------------------------------------------------------------
/docs/simpa/figures/adaptive-threshold.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/adaptive-threshold.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/change-point.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/change-point.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/detection.r:
--------------------------------------------------------------------------------
1 | ### Draws a figure illustrating change detection in the distribution of synthetic data.
2 | ### Each dot represents a single time period with 1000 samples. Before the change,
3 | ### the data is sampled from a unit normal distribution. After the change, 20 samples
4 | ### in each time period are taken from N(3,1). Comparing counts with a chi^2 test that
5 | ### is robust to small expected counts reliably detects this shift.
6 | 
7 | ### log-likelihood ratio (G^2) test for multinomial count data, computed from entropies
8 | llr = function(k) {
9 | 2 * sum(k) * (H(k) - H(rowSums(k)) - H(colSums(k)))
10 | }
11 | H = function(k) { # sum of p*log(p) over all cells (negative Shannon entropy)
12 | N = sum(k) ;
13 | return (sum(k/N * log(k/N + (k==0)))) # (k==0) guard makes empty cells contribute log(1) = 0
14 | }
15 | 
16 | ### compare recent samples to historical by comparing counts in a range of interest
17 | analyze = function(historical, recent, cuts) {
18 | counts = data.frame(
19 | a=hist(recent, breaks=cuts, plot=F)$counts,
20 | b=hist(historical, breaks=cuts, plot=F)$counts)
21 | llr(counts) # G^2 score comparing the two count columns
22 | }
23 | 
24 | ### use fixed seed for stability of the pictures
25 | set.seed(3)
26 | ### lots of reference data
27 | historical = rnorm(100000)
28 | 
29 | ### set cuts based on historical data
30 | ### in practical systems, this step would be implemented with a t-digest
31 | cuts = c(-10, quantile(historical, probs=c(0.99, 0.999)), 20)
32 | 
33 | ### 1000 samples per time period, 2% perturbation after change
34 | n = 1000
35 | epsilon = 0.02
36 | 
37 | ### sample 60 scores without perturbation
38 | scores = rep(0,100) # 60 before-change periods + 40 after-change periods
39 | for (i in 1:60) {
40 | scores[i] = analyze(historical, c(rnorm(n)), cuts)
41 | }
42 | 
43 | ### sample 40 scores with perturbation
44 | for (i in 1:40) {
45 | scores[i + 60] = analyze(historical, c(rnorm(n * (1-epsilon)), rnorm(n * epsilon, 3)), cuts)
46 | }
47 | 
48 | ### plot the data
49 | pdf("change-point.pdf", width=5, height=4, pointsize=10)
50 | old = par('mgp')
51 | par(mgp=c(3,0.6,0))
52 | colors = c(rep(rgb(0,0,0,alpha=0.8),60), rep(rgb(1,0,0,alpha=0.8),40))
53 | plot(scores, xaxt='n', xlab=NA, ylab=NA, ylim=c(0,60), cex=1.3, pch=21, bg=colors, col=NA)
54 | abline(v=60.5, lwd=3, col=rgb(0,0,0, alpha=0.1))
55 | ### white panel holding the in-figure legend
56 | polygon(c(-1,55,55,-1,-1), c(60, 60,36,36,60), col='white')
57 | points(c(1.5, 1.5), c(55, 45), pch=21, bg=c('black', 'red'), col=NA)
58 | text(5, c(55,45), adj=0, labels=c(
59 | expression(x %~% symbol(N)(mu == 0)),
60 | expression(x %~% bgroup("[", atop(symbol(N)(mu==0) , symbol(N)(mu==3)),""))))
61 | text(30, c(55,48,41.5), c("1000 samples", "980 samples", "20 samples"), adj=0)
62 | 
63 | 
64 | mtext(expression(llr(counts)), side=2, padj=-1.3, cex=1.4)
65 | mtext("Before change", at=25, side=1, padj=1, cex=1.5)
66 | mtext("After change", at=75, side=1, padj=1, cex=1.5)
67 | par(mgp=old)
68 | dev.off()
69 | 
70 | 
71 | 
--------------------------------------------------------------------------------
/docs/simpa/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/simpa/figures/windows.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/windows.graffle
--------------------------------------------------------------------------------
/docs/simpa/figures/windows.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/figures/windows.pdf
--------------------------------------------------------------------------------
/docs/simpa/highlights.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/simpa/highlights.pdf
--------------------------------------------------------------------------------
/docs/simpa/highlights.tex:
--------------------------------------------------------------------------------
1 | \documentclass[11pt, oneside]{article} % use "amsart" instead of "article" for AMSLaTeX format
2 | \usepackage{geometry} % See geometry.pdf to learn the layout options. There are lots.
3 | \geometry{letterpaper} % ... or a4paper or a5paper or ...
4 | %\geometry{landscape} % Activate for rotated page geometry
5 | %\usepackage[parfill]{parskip} % Activate to begin paragraphs with an empty line rather than an indent
6 | \usepackage{graphicx}				% Use pdf, png, jpg, or eps with pdflatex; use eps in DVI mode
7 | % TeX will automatically convert eps --> pdf in pdflatex
8 | \usepackage{amssymb}
9 |
10 | %SetFonts
11 |
12 | %SetFonts
13 |
14 |
15 | \title{Highlights for: The $t$-digest: Efficient Estimates of Distributions}
16 | \author{Ted Dunning}
17 | %\date{} % Activate to display a given date or no date
18 |
19 | \begin{document}
20 | \maketitle
21 | %\section{}
22 | %\subsection{}
23 | \begin{itemize}
24 | \item The t-digest is an algorithm for accurately estimating quantiles from a compact
25 | sketch
26 |
27 | \item The t-digest is available as a library as well as embedded in popular query systems
28 |
29 | \item The t-digest allows accurate quantile estimates for data with arbitrary distributions
30 |
31 | \item The t-digest library has a simple API and no runtime dependencies, available on GitHub
32 | \end{itemize}
33 |
34 |
35 | \end{document}
36 |
37 |
--------------------------------------------------------------------------------
/docs/software-paper/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/software-paper/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/software-paper/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/build-figures.py:
--------------------------------------------------------------------------------
1 | import time
2 | import re
3 | import sys
4 | from glob import glob
5 | from subprocess import check_output
6 |
7 | logFile = open("figure.log", "a")
8 |
9 | def log(s):
10 | now = time.strftime("%v %T %Z")
11 | logLine = "\t".join([now, s, '\n'])
12 | logFile.write(logLine)
13 | sys.stdout.write(logLine)
14 |
15 | log('Starting figure run')
16 | for script in glob('*.r'):
17 | t0 = time.time()
18 | output = check_output(["Rscript", script])
19 | t1 = time.time()
20 | log('%(script)s took %(delta).1fs <%(output)s>' % {
21 | "script":script,
22 | "delta": (t1-t0),
23 | "output": re.sub("null device\s*\n\s+1\s?\n", "", output)})
24 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/comparison.r:
--------------------------------------------------------------------------------
1 | data = read.delim("comparison.tsv")
2 | keep = function(data, tag) { # subset: one distribution, compression 50, extreme and median quantiles only
3 | data[data$dist == tag & data$compression == 50 & data$q !=0.3 & data$q != 0.7 & data$q != 0.2 & data$q != 0.8, ]
4 | }
5 | ### three panels: digest size scatter, then error boxplots for uniform and gamma data
6 | png("qd-sizes.png", width=1800, height=700, pointsize=28)
7 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1,1)) # NOTE(review): widths has 2 entries for 3 columns — confirm recycling is intended
8 | 
9 | gray = rgb(0,0,0,0.05)
10 | 
11 | old = par(mar=c(4.5,5,3,0.5))
12 | plot(s2~s1, data, log='xy', pch=21, col=gray, bg=gray, cex=0.4,
13 | xlab="t-digest (bytes)", ylab="Q-digest (bytes)",
14 | xlim=c(100, 120000),
15 | cex.lab=1.5, xaxt='n', yaxt='n')
16 | box(lwd=3)
17 | 
18 | axis(at=c(100, 300, 1000, 3000, 10000, 30000, 100000), labels=c(100, 300, "1K", "3K", "10K", "30K", "100K"), side=1)
19 | axis(at=c(100, 300, 1000, 3000, 10000, 30000, 100000), labels=c(100, 300, "1K", "3K", "10K", "30K", "100K"), side=2)
20 | 
21 | steps = exp(seq(log(100), log(200000), by=log(2)))
22 | lines(steps, steps, col='lightgrey')
23 | lines(steps, steps/2, lty=2, col='lightgrey')
24 | lines(steps,steps*2, lty=2, col='lightgrey')
25 | ### label each cloud of points with its compression (1/delta) value
26 | for (compression in c(2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000)) {
27 | x = mean(data[data$compression == compression,]$s1) * 1.8
28 | y = mean(data[data$compression == compression,]$s2)
29 | text(x,y,compression)
30 | }
31 | text(10000, 1000, expression(1/delta), cex=1.5)
32 | 
33 | par(old)
34 | 
35 | old = par(mar=c(4.5,5,3,0.5))
36 | boxplot(1e6*e2 ~ q, keep(data, 'uniform'), at=1:7 - 0.13, boxwex=0.3, xaxt='n', yaxt='n',
37 | ylab="Quantile error (ppm)", xlab="Quantile",
38 | ylim=c(-10000, 20000), cex.lab=1.5, col=rgb(0.95, 0.95, 0.95))
39 | boxplot(1e6*e1 ~ q, keep(data, 'uniform'), col=rgb(0.4, 0.4, 0.4), at=1:7 + 0.13, add=T, boxwex=0.3, xaxt='n', yaxt='n')
40 | axis(at=1:7, labels=c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999), side=1)
41 | axis(side=2, cex.axis=2) # was cex.label, which is not a graphical parameter; cex.axis sizes the tick labels
42 | abline(h=0, lwd=2, col='gray')
43 | for (i in 1:7) {
44 | abline(v=i, lwd=1, col='lightgray', lty=2)
45 | }
46 | legend(5.5, 20000, c("Q-digest", "t-digest"), fill = c(rgb(0.95, 0.95, 0.95), rgb(0.4, 0.4, 0.4)))
47 | text(6.5, 14000, "Uniform", cex=1.5)
48 | text(6.5, 12000, expression(1/delta == 50), cex=1.5)
49 | box(lwd=3)
50 | par(old)
51 | 
52 | old = par(mar=c(4.5,5,3,0.5))
53 | boxplot(1e6*e2 ~ q, keep(data, 'gamma'), col=rgb(0.95, 0.95, 0.95), at=1:7 - 0.13, boxwex=0.3, xaxt='n',
54 | ylab="Quantile error (ppm)", xlab="Quantile",
55 | cex.lab=1.5)
56 | boxplot(1e6*e1 ~ q, keep(data, 'gamma'), col=rgb(0.4, 0.4, 0.4), at=1:7 + 0.13, add=T, boxwex=0.3, xaxt='n')
57 | axis(at=1:7, labels=c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999), side=1)
58 | abline(h=0, lwd=2, col='gray')
59 | for (i in 1:7) {
60 | abline(v=i, lwd=1, col='lightgray', lty=2)
61 | }
62 | legend(5.5, 88000, c("Q-digest", "t-digest"), fill = c(rgb(0.95, 0.95, 0.95), rgb(0.4, 0.4, 0.4)))
63 | text(6.5, 68000, expression(Gamma(0.1, 0.1)), cex=1.5)
64 | text(6.5, 62000, expression(1/delta == 50), cex=1.5)
65 | box(lwd=3)
66 | par(old)
67 | 
68 | dev.off()
69 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/errors.csv:
--------------------------------------------------------------------------------
1 | dist tag x Q error
2 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/errors.r:
--------------------------------------------------------------------------------
1 | errorData = read.delim("errors.csv")
2 | ### plotError: boxplot of quantile error (ppm) vs cumulative distribution for one input distribution; dashed lines mark +/- 1000 ppm
3 | plotError = function(dist, ylim=c(-2000, 2000), yaxt='s', ylab) {
4 | boxplot(1e6*error ~ Q, errorData[errorData$dist==dist,], ylim=ylim, lwd=4, xlab="Cumulative Distribution", ylab=ylab, pars=list(lwd.ticks=4), yaxt=yaxt)
5 | box(lwd=8)
6 | abline(h=1000, lty=2, lwd=4)
7 | abline(h=-1000, lty=2, lwd=4)
8 | }
9 | ### side-by-side panels for gamma and uniform data in a single PNG
10 | png("error.png", width=2400, height=1200, pointsize=36)
11 | layout(matrix(c(1,2), 1, 2, byrow=T), heights=c(1200, 1200), widths=c(1300,1100))
12 | #plotError('mixture', 'mixture-error.png')
13 | plotError('gamma', ylab="Error (ppm)")
14 | old = par(mar=c(5.1,0,4.1,2)) # drop the left margin so the second panel sits flush against the first
15 | plotError('uniform', yaxt='n', ylab=NA)
16 | par(old)
17 | dev.off()
18 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/figure-doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figure-doc.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/histo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/histo.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-diagram/k-q-limits.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/k-q-diagram/k-q-limits.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-diagram/slope-limiting.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/k-q-diagram/slope-limiting.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/k-q-plot.r:
--------------------------------------------------------------------------------
1 | ### Draws the k vs q scale-function figure: grid lines show how equal
2 | ### steps in k map onto variable-width bins in q.
3 | n = 10
4 | fade = 'darkgray'
5 | pdf("k-q-plot.pdf", width=2.5, height=2.2, pointsize=8, family='serif')
6 | 
7 | par(mar=c(3.,3,1,1))
8 | 
9 | ### k(q) centered on zero, ranging over [-compression/2, compression/2].
10 | ### The original asin(2*q-1)/pi/2 form was half the inverse of k.to.q
11 | ### below, so the curve spanned only [-2.5, 2.5] and missed the
12 | ### unit-spaced horizontal grid lines drawn at h = i - 5.
13 | q.to.k = function(q,compression) {
14 | compression * (asin(2*q-1)/pi)
15 | }
16 | 
17 | ### inverse mapping: q at cluster boundary k, for k in 0..compression
18 | k.to.q = function(k,compression) {
19 | sin(k/compression*pi - pi/2)/2 + 0.5
20 | }
21 | 
22 | q = seq(0,1,by=0.001)
23 | 
24 | plot(q, q.to.k(q, compression=n), type='l', lwd=2, xlab=NA, ylab=NA, xaxt='n', yaxt='n')
25 | axis(side=1, at=(0:5)/5, mgp=c(1,0.5,0))
26 | title(xlab=expression(italic(q)), line=1.3, cex.lab=1.5)
27 | axis(side=2, at=seq(-5,5,by=1), mgp=c(1,0.6,0))
28 | title(ylab=expression(italic(k)), line=1.5, cex.lab=1.5)
29 | 
30 | ### unit grid in k with the corresponding variable-width grid in q;
31 | ### the lines intersect exactly on the curve at (k.to.q(i), i - 5)
32 | for (i in 0:n) {
33 | abline(h=i-5, col=fade)
34 | abline(v=k.to.q(i, compression=n), col=fade)
35 | }
36 | lines(q, q.to.k(q, compression=n), type='l', lwd=2)
37 | dev.off()
38 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/linear-interpolation.r:
--------------------------------------------------------------------------------
1 | ### Illustrates the piece-wise linear approximation of the cumulative distribution using constant size bins
2 | fade = rgb(0,0,0,alpha=0.5)
3 | dot.size = 0.7
4 | n = 10000
5 | set.seed(5)
6 | ### left panel: a single 100-sample bin gives a coarse linear segment in the tail
7 | pdf("linear-interpolation.pdf", width=6, height=2.7, pointsize=10, family='serif')
8 | layout(matrix(c(1,2),byrow=T, ncol=2), widths=c(1.1,1))
9 | u = sort(runif(n))
10 | x = log(1-u) # inverse-CDF sampling: x is distributed as -Exponential(1)
11 | x = sort(x)
12 | F = ((0:(n-1))+0.5)/n # midpoint empirical CDF
13 | par(mar=c(2.5,3,0.5,1))
14 | plot(x, F, cex=dot.size, pch=21, bg=fade, col=NA, type='b', xlim=c(-9,-4.5), ylim=c(0,0.01), xaxt='n', ylab=NA, mgp=c(1,0.5,0), xlab=NA)
15 | 
16 | axis(side=1, at=-10:-1, labels=NA)
17 | title(xlab=expression(italic(x)), line=0.8, cex.lab=1.5)
18 | title(ylab=expression(italic(q)), line=1.5, cex.lab=1.5)
19 | ### synthetic left endpoint one inter-sample gap below the smallest sample
20 | left.end = x[1] - (x[2]-x[1])
21 | 
22 | lines(c(left.end, x[100]), c(0, 0.01), lwd=2) # one straight segment spanning the first 100 samples
23 | lines(c(left.end, left.end), c(-0.0005, 0.0005), lt=1, col='black', lwd=0.5)
24 | lines(c(x[100], x[100]), c(0.0085, 0.015), lt=1, col='black', lwd=0.5)
25 | text(-7, 0.006, "100")
26 | 
27 | ###text(-5, 0.4, adj=0, "Constant size bins result in large")
28 | ###text(-5, 0.35, adj=0, "errors at extreme quantiles")
29 | 
30 | par(mar=c(2.5,1.5,0.5,1))
31 | ### right panel: same data with variable-size bins that shrink toward the tail
32 | plot(x, F, cex=dot.size, pch=21, bg=fade, col=NA, type='b', xlim=c(-9,-4.5), ylim=c(0,0.01), yaxt='n', xaxt='n')
33 | axis(side=1, at=-10:-1, labels=NA)
34 | axis(side=2, at=(0:6)/10, labels=NA)
35 | title(xlab=expression(italic(x)), line=0.8, cex.lab=1.5)
36 | title(ylab=expression(italic(q)), line=2, cex.lab=1.5)
37 | 
38 | q.to.k = function(q) {
39 | (asin(2*q-1)/pi + 1/2)
40 | }
41 | ### NOTE(review): q.to.k and k.to.q are not referenced below — confirm before removing
42 | k.to.q = function(k,compression) {
43 | sin(k/compression*pi - pi/2)/2 + 0.5
44 | }
45 | ### hand-picked bin sizes growing away from the tail (variable-size bins)
46 | weights = c(2, 8, 19, 35, 56, 81, 111)
47 | q.bin = cumsum(c(0, weights)/n)
48 | 
49 | i.bin = c(1, cumsum(weights)+1) # index of the first sample in each bin
50 | i.right = i.bin-1 # one before each bin start = last sample of the previous bin
51 | i.right = i.right[i.right > 0]
52 | m = length(i.right)
53 | i.bin = i.bin[1:m]
54 | ### bin boundaries at midpoints between adjacent bins' edge samples
55 | x.bin = c(left.end, (x[i.right[1:(m-1)]] + x[i.bin[2:m]])/2)
56 | F.bin = (i.bin-1) / n
57 | lines(x.bin, F.bin, lwd=2)
58 | dy = 0.0005 # half-height of the small boundary tick marks
59 | for (i in 1:m) {
60 | x.text = (x[i.bin[i]] + x[i.right[i]])/2
61 | y.text = (F.bin[i] + F.bin[i+1])/2
62 | x.offset = 0.3 * y.text
63 | y.offset = dy * (1 + 500*y.text)
64 | x.pos = x.text - x.offset
65 | y.pos = y.text + y.offset
66 | lines(c(x.bin[i],x.bin[i]), c(F.bin[i]-dy+0.000, F.bin[i]+dy+y.offset*0.6-0.0005), lt=1, lwd=0.5, col='black')
67 | text(x.text - x.offset, y.text + y.offset, i.right[i]-i.bin[i]+1)
68 | }
69 | ###text(-5, 0.35, adj=0, "Variable size bins keep errors")
70 | ###text(-5, 0.3, adj=0, "small at extreme quantiles")
71 | 
72 | dev.off()
73 | 
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/data.plist:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/data.plist
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image1.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image10.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image11.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image2.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image21.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image21.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image22.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image22.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image23.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image24.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image24.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image26.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image26.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image27.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image27.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image28.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image28.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image29.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image29.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image3.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image31.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image31.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image32.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image32.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image33.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image33.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image34.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image34.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image35.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image35.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image41.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image41.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image44.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image44.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image46.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image46.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image47.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image47.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image48.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image48.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image49.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image49.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image50.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image50.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image51.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image51.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image59.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image59.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image63.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image63.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image7.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures.graffle/image8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures.graffle/image8.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/combined.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/combined.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/quantile-figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/t-digest-paper/quantile-figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/t-digest-paper/refs.bib:
--------------------------------------------------------------------------------
1 | @INPROCEEDINGS{Chen2000,
2 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
3 | title = {Incremental Quantile Estimation for Massive Tracking},
4 |   booktitle = {Proceedings of KDD},
5 | year = {2000},
6 | pages = {516--522}
7 | }
8 |
9 | @article{one-dimensional-k-means,
10 | id= "pmid:27942416",
11 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
12 | author= {Haizhou Wang and Mingzhou Song},
13 |   journal= "The R Journal",
14 | ISSN= "2073-4859",
15 | year = {2011},
16 | month={12},
17 | pages= {29-33},
18 | volume=3,
19 | issue=2,
20 | PMID = "27942416",
21 | PMCID= "PMC5148156"
22 | }
23 |
24 | @inproceedings{Greenwald-space-efficient-online-quantiles,
25 | author = {Greenwald, Michael and Khanna, Sanjeev},
26 | title = {Space-efficient Online Computation of Quantile Summaries},
27 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
28 | series = {SIGMOD '01},
29 | year = {2001},
30 | isbn = {1-58113-332-4},
31 | location = {Santa Barbara, California, USA},
32 | pages = {58--66},
33 | numpages = {9},
34 | url = {http://doi.acm.org/10.1145/375663.375670},
35 | doi = {10.1145/375663.375670},
36 | acmid = {375670},
37 | publisher = {ACM},
38 | address = {New York, NY, USA}
39 | }
40 |
41 | @article{sawzall,
42 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
43 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
44 | year = {2005},
45 | journal = {Scientific Programming Journal},
46 | pages = {277--298},
47 | volume = {13}
48 | }
49 |
50 | @article{munro1980,
51 | author = "J.I. Munro and M.S. Paterson",
52 | year = "1980",
53 | title = "Selection and sorting with limited storage",
54 | journal = "Theoretical Computer Science",
55 | volume = "12",
56 | number = "3",
57 | pages = "315 - 323",
58 | issn = "0304-3975",
59 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
60 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
61 | }
62 | @inproceedings{qdigest,
63 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
64 | year = {2004},
65 | month = {09},
66 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
67 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
68 | doi = {10.1145/1031495.1031524}
69 | }
70 |
71 | @misc{wiki_welford,
72 | title = "Algorithms for Calculating Variance, Online Algorithm",
73 |   author = "The Wikimedia Foundation",
74 | year = 2018,
75 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
76 | note = "[Online; accessed 19-October-2013]"
77 | }
78 |
79 | @misc{datafu,
80 |   title = "The Apache DataFu Project",
81 | author = "DFU",
82 | year = 2019,
83 | publisher = "Apache Software Foundation",
84 | howpublished = "\url{https://datafu.apache.org/}",
85 | note = "[Online; accessed 23-January-2018]"
86 | }
87 |
88 | @misc{t-digest-project,
89 | title = "The t-digest Library",
90 | author = "Ted Dunning",
91 | year = 2018,
92 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
93 | note = "[Online; accessed 23-January-2018]"
94 | }
95 |
96 | @misc{github:stream,
97 | title = "Stream summarizer and cardinality estimator",
98 | author = "StreamLib",
99 | year = 2019,
100 | howpublished = "\url{https://github.com/addthis/stream-lib}",
101 | note = "[Online; accessed 11-February-2019]"
102 | }
103 |
104 | @inbook{knuth2welford,
105 | author={Donald E. Knuth},
106 | year=1998,
107 | edition=3,
108 | pages=232,
109 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
110 | publisher= {Addison-Wesley},
111 | address= {Boston}
112 | }
113 |
114 | @ARTICLE{welford62,
115 | author = {B. P. Welford },
116 | year = {1962},
117 | title = {Note on a method for calculating corrected sums of squares and products},
118 | journal = {Technometrics},
119 | pages = {419--420}
120 | }
--------------------------------------------------------------------------------
/docs/t-digest-paper/scaling.r:
--------------------------------------------------------------------------------
1 | data = read.delim("scaling.tsv")
2 | errors = read.delim("error-scaling.tsv")
3 | errors$kb = errors$size/1000
4 |
5 | png("scaling.png", width=1800, height=700, pointsize=28)
6 | layout(matrix(c(1,2), 1, 2, byrow=T), widths=c(1,1))
7 |
8 | old = par(mar=c(5.1,2,2.1,2))
9 | plot(size1 ~ compression, data[data$samples==10000,], log='xy',
10 | xlab=expression(1/delta), ylab="Size (kB)", ylim=c(100,100000),
11 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21,
12 | yaxt='n')
13 | axis(at=c(100,1000,10000,100000), labels=c("100 B", "1 kB", "10 kB", "100 kB"), side=2)
14 | box(lwd=3)
15 |
16 | #points(size1 ~ compression, data[data$samples==100,], log='xy',
17 | # xlab=expression(1/delta), ylab="Size (kB)",
18 | # cex=0.5, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21)
19 | #
20 | points(size1 ~ compression, data[data$samples==10,],
21 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=22)
22 |
23 | legend(2, 9e4, c("10M samples", "10k samples"), pch=c(21,22))
24 |
25 | c = seq(2,1100,by=1)
26 | m10 = lm(log(size1) ~ log(compression), data[data$samples==10,])
27 | m100 = lm(log(size1) ~ log(compression), data[data$samples==100,])
28 | m10000 = lm(log(size1) ~ log(compression), data[data$samples==10000,])
29 | lines(c, exp(predict(m10, newdata=data.frame(compression=c))), lty=2, col='lightgray')
30 | #lines(c, exp(predict(m100, newdata=data.frame(compression=c))), lty=2, col='lightgray')
31 | lines(c, exp(predict(m10000, newdata=data.frame(compression=c))), lty=2, col='lightgray')
32 | par(old)
33 |
34 | old = par(mar=c(5.1,4.2,2.1,2))
35 | plot(size2/1000 ~ samples, data[data$compression==100,], log='x', ylim=c(0,22),
36 | xlab="Samples (x1000)", ylab="Size (kB)", xaxt='n',
37 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=21)
38 | axis(at=c(10,100,1000,10000), labels=c(10,100,1000,"10,000"), side=1)
39 | points(size1/1000 ~ samples, data[data$compression==100,],
40 | cex=1, bg=rgb(0,0,0,0.1), col=rgb(0,0,0,0.1), pch=22)
41 | ms2 = lm(size2/1000 ~ log(samples), data[data$compression==100,])
42 | s = seq(10, 10000, by=10)
43 | lines(s, predict(ms2, newdata=data.frame(samples=s)), lty=2, col='lightgray')
44 |
45 | ms1 = lm(size1/1000 ~ log(samples), data[data$compression==100,])
46 | lines(s, predict(ms1, newdata=data.frame(samples=s)), lty=2, col='lightgray')
47 | box(lwd=3)
48 |
49 | legend(10, 21.5, c("uncompressed", "compressed"), pch=c(21,22))
50 |
51 | par(old)
52 |
53 | dev.off()
54 |
55 | png("error-scaling.png", width=1800, height=700, pointsize=28)
56 |
57 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.18,1,1))
58 |
59 | for (q in c(0.5, 0.01, 0.001)) {
60 | if (q == 0.5) {
61 | old = par(mar=c(5.1,4.5,2.1,2))
62 | } else {
63 | old = par(mar=c(5.1,0,2.1,2))
64 | }
65 | plot(error ~ kb, errors[errors$q == q,],
66 | ylim=c(-0.05, 0.05),
67 | pch=21, bg=rgb(0,0,0,.1), col=rgb(0,0,0,.1), log='x',
68 | xlab="t-digest size (kB)", ylab="Error in q", cex.lab=1.5,
69 | yaxt='n')
70 | abline(h=0, col='lightgray', lwd=2)
71 | abline(h=0.01, col='lightgray', lwd=2, lty=2)
72 | abline(h=-0.01, col='lightgray', lwd=2, lty=2)
73 |
74 | box(lwd=3)
75 | if (q == 0.5) {
76 | axis(side=2)
77 | }
78 | text(20, 0.09, paste("q =",q))
79 | par(old)
80 | }
81 |
82 | dev.off()
83 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/scaling.tsv:
--------------------------------------------------------------------------------
1 | k samples compression size1 size2
2 | 0 10 2 172 364
3 | 0 10 5 415 928
4 | 0 10 10 573 1288
5 | 0 10 20 1027 2380
6 | 0 10 50 2117 5056
7 | 0 10 100 3846 9208
8 | 0 10 200 6666 15976
9 | 0 10 500 13781 33052
10 | 0 10 1000 22691 54436
11 | 0 100 2 190 388
12 | 0 100 5 430 928
13 | 0 100 10 765 1660
14 | 0 100 20 1506 3316
15 | 0 100 50 3076 6892
16 | 0 100 100 5650 12844
17 | 0 100 200 10008 23320
18 | 0 100 500 21419 51364
19 | 0 100 1000 38441 92236
20 | 0 1000 2 279 556
21 | 0 1000 5 598 1228
22 | 0 1000 10 1014 2104
23 | 0 1000 20 1945 4180
24 | 0 1000 50 4199 9136
25 | 0 1000 100 7467 16456
26 | 0 1000 200 14014 31144
27 | 0 1000 500 30923 69472
28 | 0 1000 1000 56465 128560
29 | 0 10000 2 314 592
30 | 0 10000 5 695 1360
31 | 0 10000 10 1238 2500
32 | 0 10000 20 2270 4576
33 | 0 10000 50 5052 10468
34 | 0 10000 100 9355 19684
35 | 0 10000 200 17839 38392
36 | 0 10000 500 40074 87664
37 | 0 10000 1000 74873 164896
38 | 1 10 2 184 388
39 | 1 10 5 376 820
40 | 1 10 10 550 1228
41 | 1 10 20 1027 2368
42 | 1 10 50 2277 5440
43 | 1 10 100 3951 9460
44 | 1 10 200 6701 16060
45 | 1 10 500 13696 32848
46 | 1 10 1000 22861 54844
47 | 1 100 2 215 448
48 | 1 100 5 511 1096
49 | 1 100 10 719 1564
50 | 1 100 20 1413 3136
51 | 1 100 50 3044 6820
52 | 1 100 100 5875 13408
53 | 1 100 200 9916 23044
54 | 1 100 500 21770 52180
55 | 1 100 1000 38301 91900
56 | 1 1000 2 309 616
57 | 1 1000 5 693 1444
58 | 1 1000 10 1004 2104
59 | 1 1000 20 1852 3988
60 | 1 1000 50 4077 8908
61 | 1 1000 100 7645 16828
62 | 1 1000 200 13876 30820
63 | 1 1000 500 31030 69736
64 | 1 1000 1000 56824 129484
65 | 1 10000 2 361 700
66 | 1 10000 5 663 1324
67 | 1 10000 10 1295 2596
68 | 1 10000 20 2420 4960
69 | 1 10000 50 5378 11152
70 | 1 10000 100 9472 19936
71 | 1 10000 200 18051 38872
72 | 1 10000 500 39994 87388
73 | 1 10000 1000 74796 164596
74 | 2 10 2 169 352
75 | 2 10 5 402 880
76 | 2 10 10 608 1360
77 | 2 10 20 1043 2392
78 | 2 10 50 2089 4984
79 | 2 10 100 3766 9016
80 | 2 10 200 6701 16060
81 | 2 10 500 13781 33052
82 | 2 10 1000 22446 53848
83 | 2 100 2 209 436
84 | 2 100 5 492 1060
85 | 2 100 10 806 1732
86 | 2 100 20 1501 3304
87 | 2 100 50 3053 6844
88 | 2 100 100 5751 13048
89 | 2 100 200 10099 23500
90 | 2 100 500 21567 51700
91 | 2 100 1000 38601 92620
92 | 2 1000 2 234 448
93 | 2 1000 5 580 1192
94 | 2 1000 10 1049 2188
95 | 2 1000 20 1806 3856
96 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/sizes.csv:
--------------------------------------------------------------------------------
1 | tag i q k actual
2 |
--------------------------------------------------------------------------------
/docs/t-digest-paper/sizes.r:
--------------------------------------------------------------------------------
1 | data = read.delim("sizes.csv")
2 |
3 | plotGraph = function(tag, title='', showY=T) {
4 | n = max(data[data$tag == tag, ]$i)
5 | i = 1:n
6 | n2 = n/2
7 |
8 | if (showY) {
9 | yaxt = 's'
10 | } else {
11 | yaxt = 'n'
12 | }
13 |
14 | plot(actual~q, data[data$tag == tag,], cex=0.2, xaxt='n', xlim=c(0,1), ylim=c(0,1050), xlab='Quantile',
15 | ylab='Centroid size', yaxt=yaxt)
16 | title(title)
17 | box(lwd=3)
18 | axis(side=1, at=c(0,0.25, 0.5, 0.75, 1), labels=c(0.0,0.25,0.5,0.75, 1.0), lwd=3)
19 |
20 | q = seq(0,1,by=0.01)
21 | lines(q, 1000*4*q*(1-q), lwd=3, col='gray')
22 | }
23 |
24 | png("sizes.png", width=1800, height=700, pointsize=36)
25 | layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.21,1,1))
26 | plotGraph("uniform", title="Uniform Distribution", T)
27 | old = par(mar=c(5.1,0,4.1,2))
28 | plotGraph("gamma", title="Gamma(0.1, 0.1) Distribution", F)
29 | plotGraph("sequential", title="Sequential Distribution", F)
30 | par(old)
31 | dev.off()
32 | #plotGraph("mixture", title="Mixture Distribution")
33 |
--------------------------------------------------------------------------------
/docs/vldb/figures/cluster-spread.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/cluster-spread.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/combined.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/combined.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/endpoint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/endpoint.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/error-vs-compression-small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/error-vs-compression-small.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/error-vs-compression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/error-vs-compression.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/interpolation.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/k-q-plot.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/k-q-plot.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/linear-interpolation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/linear-interpolation.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/merge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/merge.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/qd-sizes-small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/qd-sizes-small.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/qd-sizes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/qd-sizes.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/relative-error-one-panel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/relative-error-one-panel.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/relative-error.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/relative-error.pdf
--------------------------------------------------------------------------------
/docs/vldb/figures/singleton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/figures/singleton.pdf
--------------------------------------------------------------------------------
/docs/vldb/refs.bib:
--------------------------------------------------------------------------------
1 | @article{Gan:2018:MQS:3236187.3269475,
2 | author = {Gan, Edward and Ding, Jialin and Tai, Kai Sheng and Sharan, Vatsal and Bailis, Peter},
3 | title = {Moment-based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
4 | journal = {Proc. VLDB Endow.},
5 | issue_date = {July 2018},
6 | volume = {11},
7 | number = {11},
8 | month = jul,
9 | year = {2018},
10 | issn = {2150-8097},
11 | pages = {1647--1660},
12 | numpages = {14},
13 | url = {https://doi.org/10.14778/3236187.3236212},
14 | doi = {10.14778/3236187.3236212},
15 | acmid = {3269475},
16 | publisher = {VLDB Endowment},
17 | }
18 |
19 | @INPROCEEDINGS{Chen2000,
20 | author = {Fei Chen and Diane Lambert and Jos{\'e} C. Pinheiro},
21 | title = {Incremental Quantile Estimation for Massive Tracking},
22 |   booktitle = {Proceedings of KDD},
23 | year = {2000},
24 | pages = {516--522}
25 | }
26 |
27 | @article{one-dimensional-k-means,
28 | id= "pmid:27942416",
29 | title = {Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic Programming},
30 | author= {Haizhou Wang and Mingzhou Song},
31 |   journal= "The R Journal",
32 | ISSN= "2073-4859",
33 | year = {2011},
34 | month={12},
35 | pages= {29-33},
36 | volume=3,
37 | issue=2,
38 | PMID = "27942416",
39 | PMCID= "PMC5148156"
40 | }
41 |
42 | @inproceedings{Greenwald-space-efficient-online-quantiles,
43 | author = {Greenwald, Michael and Khanna, Sanjeev},
44 | title = {Space-efficient Online Computation of Quantile Summaries},
45 | booktitle = {Proceedings of the 2001 ACM SIGMOD International Conference on Management of Data},
46 | series = {SIGMOD '01},
47 | year = {2001},
48 | isbn = {1-58113-332-4},
49 | location = {Santa Barbara, California, USA},
50 | pages = {58--66},
51 | numpages = {9},
52 | url = {http://doi.acm.org/10.1145/375663.375670},
53 | doi = {10.1145/375663.375670},
54 | acmid = {375670},
55 | publisher = {ACM},
56 | address = {New York, NY, USA}
57 | }
58 |
59 | @article{sawzall,
60 | title = {Interpreting the Data: Parallel Analysis with Sawzall},
61 | author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan},
62 | year = {2005},
63 | journal = {Scientific Programming Journal},
64 | pages = {277--298},
65 | volume = {13}
66 | }
67 |
68 | @article{munro1980,
69 | author = "J.I. Munro and M.S. Paterson",
70 | year = "1980",
71 | title = "Selection and sorting with limited storage",
72 | journal = "Theoretical Computer Science",
73 | volume = "12",
74 | number = "3",
75 | pages = "315 - 323",
76 | issn = "0304-3975",
77 | doi = "https://doi.org/10.1016/0304-3975(80)90061-4",
78 | url = "http://www.sciencedirect.com/science/article/pii/0304397580900614"
79 | }
80 | @inproceedings{qdigest,
81 | author = {Shrivastava, Nisheeth and Buragohain, Chiranjeeb and Agrawal, Divyakant and Suri, Subhash},
82 | year = {2004},
83 | month = {09},
84 | title = {Medians and Beyond: New Aggregation Techniques for Sensor Networks},
85 | booktitle = {SenSys'04 - Proceedings of the Second International Conference on Embedded Networked Sensor Systems},
86 | doi = {10.1145/1031495.1031524}
87 | }
88 |
89 | @misc{wiki_welford,
90 | title = "Algorithms for Calculating Variance, Online Algorithm",
91 |   author = "The Wikimedia Foundation",
92 | year = 2018,
93 | howpublished = "\url{http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm}",
94 | note = "[Online; accessed 19-October-2013]"
95 | }
96 |
97 | @misc{datafu,
98 |   title = "The Apache DataFu Project",
99 | author = "DFU",
100 | year = 2019,
101 | publisher = "Apache Software Foundation",
102 | howpublished = "\url{https://datafu.apache.org/}",
103 | note = "[Online; accessed 23-January-2018]"
104 | }
105 |
106 | @misc{t-digest-project,
107 | title = "The t-digest Library",
108 | author = "Ted Dunning",
109 | year = 2018,
110 | howpublished = "\url{https://github.com/tdunning/t-digest/}",
111 | note = "[Online; accessed 23-January-2018]"
112 | }
113 |
114 | @misc{t-digest-arxiv,
115 | title = {Computing Extremely Accurate Quantiles Using t-Digests},
116 | author = "Ted Dunning and Otmar Ertl",
117 | year = 2018,
118 | howpublished = "\url{https://arxiv.org/abs/1902.04023}",
119 | note="arXiv:1902.04023 [stat.CO]"
120 | }
121 |
122 | @misc{moment-sketch-arxiv,
123 | title = {Moment-Based Quantile Sketches for Efficient High Cardinality Aggregation Queries},
124 | author = {Edward Gan and Jialin Ding and Kai Sheng Tai and Vatsal Sharan and Peter Bailis},
125 | year = 2018,
126 | howpublished = "\url{https://arxiv.org/abs/1803.01969v1}",
127 | note = "arXiv:1803.01969v1"
128 | }
129 |
130 | @misc{github:stream,
131 | title = "Stream summarizer and cardinality estimator",
132 | author = "StreamLib",
133 | year = 2019,
134 | howpublished = "\url{https://github.com/addthis/stream-lib}",
135 | note = "[Online; accessed 11-February-2019]"
136 | }
137 |
138 | @inbook{knuth2welford,
139 | author={Donald E. Knuth},
140 | year=1998,
141 | edition=3,
142 | pages=232,
143 | title= {The Art of Computer Programming, volume 2: Seminumerical Algorithms},
144 | publisher= {Addison-Wesley},
145 | address= {Boston}
146 | }
147 |
148 | @ARTICLE{welford62,
149 | author = {B. P. Welford },
150 | year = {1962},
151 | title = {Note on a method for calculating corrected sums of squares and products},
152 | journal = {Technometrics},
153 | pages = {419--420}
154 | }
155 |
--------------------------------------------------------------------------------
/docs/vldb/short.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/docs/vldb/short.pdf
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | com.tdunning
5 | t-digest-parent
6 | 3.4-SNAPSHOT
7 | T-digest Parent
8 | pom
9 |
10 |
11 | scm:git:git@github.com:tdunning/t-digest.git
12 | scm:git:git@github.com:tdunning/t-digest.git
13 | https://github.com/tdunning/t-digest
14 | HEAD
15 |
16 |
17 |
18 |
19 | tdunning
20 | Ted
21 | ted.dunning@gmail.com
22 | https://github.com/tdunning/t-digest
23 |
24 | developer
25 |
26 | -8
27 |
28 | @ted_dunning
29 |
30 |
31 |
32 |
33 |
34 |
35 | The Apache Software License, Version 2.0
36 | http://www.apache.org/licenses/LICENSE-2.0.txt
37 | repo
38 |
39 |
40 |
41 |
42 | 1.8
43 | 1.8
44 |
45 |
46 |
47 |
48 | all
49 |
50 | core
51 | quality
52 | benchmark
53 |
54 |
55 |
56 | core-only
57 |
58 | true
59 |
60 |
61 | core
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/quality/README.md:
--------------------------------------------------------------------------------
1 | Quality Testing
2 | ===============
3 |
4 | This module contains a number of programs that assess the accuracy of t-digest implementations.
5 | In the process, bounds can be set on the quality of the t-digest idea itself.
6 |
7 | The implementation of a t-digest can have a variety of subtle flaws that do not
8 | affect operation except to compromise accuracy or to increase the size of
9 | a digest. The tests in this module aim to highlight how and where an
10 | implementation may be going wrong.
11 |
12 | Cluster Overlap
13 | --------------
14 |
15 | Run `com.tdunning.tdigest.quality.AccuracyTest#testBucketFill` to
16 | generate test data and then `accuracy.r` to generate
17 | `cluster-spread.pdf` which shows how clusters don't overlap.
18 |
19 | Basic Accuracy
20 | --------------
21 |
22 | Accuracy versus Size
23 | -----
24 | ```
25 | for algorithm in algorithms:
26 | for compression in [10,20,50,100,200,500,1000]:
27 | for distribution in ["gamma", "flip-gamma", "uniform"]:
28 | add data
29 | for q in [0.0001, 0.001,0.01,0.1,0.5,0.9,0.99,0.999,0.9999]:
30 | record algorithm, compression, distribution, q, x_data, x_digest, q_data
31 | ```
32 | Bin Distribution
33 | -----
34 | ```
35 | for each algorithm:
36 | compression = 100
37 | for distribution in ["gamma", "uniform"]:
38 | add data
39 | for bin in [0.0001, 0.001,0.01,0.5]:
40 | for sample in bin:
41 | record algorithm, compression, distribution, q_bin, x
42 | ```
43 |
44 | Comparison to KLL Algorithms
45 | ==========
46 |
47 | General Considerations
48 | =======
49 |
50 | Input Distribution
51 | -------
52 |
53 | Compression Factor
54 | -------
55 |
56 | Data Size
57 | -------
58 |
59 |
60 |
--------------------------------------------------------------------------------
/quality/comparison.r:
--------------------------------------------------------------------------------
1 | left.panel = function() {
2 |
3 | comp = read.csv("qd-tree-comparison.csv")
4 | filtered = comp %>% filter(tag == "uniform")
5 | plot(q.size ~ compression, filtered, pch=21, col=NA, bg=rgb(0,0,0,alpha=0.01), log='xy', yaxt='n', xlab=expression(delta), ylab="Digest Size (bytes)", ylim=c(70,6e3), cex=0.4, xlim=c(7,1100))
6 | points(t.size ~ compression, filtered, pch=21, col=NA, bg=rgb(0,0,0,alpha=0.01), cex=0.4)
7 | lines(q.size ~ compression, filtered %>% group_by(compression) %>% summarise(q.size=mean(q.size)), type='c')
8 | lines(t.size ~ compression, filtered %>% group_by(compression) %>% summarise(t.size=mean(t.size)), type='c')
9 |
10 | axis(side=2, at=c(100, 200, 500, 1000,2e3,5e3), labels=c("100", "200", "500", "1kB","2kB","5kB"))
11 | lines(c(7,16), c(954,954), col='grey')
12 | lines(c(25,160), c(954,954), col='grey')
13 | lines(c(250,1100), c(954,954), col='grey')
14 |
15 | text(25, 2500, expression(Q-digest), cex=0.7)
16 | text(400, 4000, expression(italic(t)-digest), cex=0.7)
17 |
18 | lines(c(20, 20), c(750, 270), col='grey')
19 | lines(c(20, 20), c(180, 110), col='grey')
20 | text(20, 85, expression(delta[italic(q)]==20), cex=0.7)
21 |
22 | lines(c(200, 200), c(750, 110), col='grey')
23 | text(200, 85, expression(delta[italic(t)]==200), cex=0.7)
24 | }
25 |
# Right panel: side-by-side boxplots of absolute quantile error for Q-digest
# (compression 20, grey) and t-digest (compression 200, white) on uniform data.
# `small` adjusts legend placement/size for the stand-alone small figure.
# NOTE(review): reads `comp` from the global environment; left.panel() loads
# qd-tree-comparison.csv into a local variable only, so `comp` must already
# be loaded globally before calling this -- confirm.
right.panel = function(small=F) {
  # Q-digest errors (e2 column) drawn slightly left of each quantile position
  boxplot(e2 ~ q, comp %>% filter(tag == "uniform", compression==20 ),
      boxwex=0.3, at=1:11-0.17, xaxt='n', col='grey',
      ylab="Absolute Error", xlab=expression(italic(q)),
      ylim=c(0,0.091), lwd=0.7, cex=0.5, cex.axis=0.9)

  # t-digest errors (e1 column) drawn slightly right, overlaid on the same axes
  boxplot(e1 ~ q, comp %>% filter(tag == "uniform", compression==200 ),
      boxwex=0.3, at=1:11+0.17, xaxt='n', add=T, lwd=0.7, cex=0.5, yaxt='n')

  # quantile labels; extremes rendered in scientific notation
  axis(side=1, at=1:11,
      labels=c(expression(10^-5), expression(10^-4), expression(10^-3),
          "0.01", "0.1", "0.5",
          "0.9", "0.99", "0.999", "0.9999", "0.99999"),
      las=2, cex.axis=0.9)

  abline(h=0, col=rgb(0,0,0,alpha=0.2), lwd=1)
  # legend position/size differ between the combined and small figures
  if (small) {
    x.legend = 5.5
    cex.legend = 0.9
  } else {
    x.legend = 5.1
    cex.legend = 0.65
  }
  legend(x.legend, 0.092,
      legend=expression(italic(t)-"digest " (delta[italic(t)]==200),
          Q-"digest "(delta[italic(q)]==20)),
      fill=c('white', 'grey'), cex=cex.legend)
}
54 |
require(dplyr)
# Figure 1: two-panel comparison (size vs compression; error boxplots)
pdf(file="qd-sizes.pdf", width=4.5, height=2, pointsize=9, family='serif')
layout(matrix(c(1,2), nrow=1))

# shared graphics parameters for both panels
par(cex.axis=0.8)
par(cex.lab=1.1)
par(mar=c(3.2, 3.3, 0.2, 0.2))
par(mgp=c(2.0, 0.5, 0))
par(tcl=-0.3)
par(las=2)

left.panel()
right.panel()
dev.off()

# Figure 2: stand-alone, slightly larger rendering of the error panel only
pdf(file="qd-sizes-small.pdf", width=3, height=2.2, pointsize=9, family='serif')
par(cex.axis=0.8)
par(cex.lab=1.3)
par(mar=c(3.2, 3.3, 0.2, 0.2))
par(mgp=c(2.0, 0.5, 0))
par(tcl=-0.3)
par(las=2)

right.panel(small=T)
dev.off()
80 |
--------------------------------------------------------------------------------
/quality/fh.r:
--------------------------------------------------------------------------------
# Extract an exponent-like value from the IEEE 754 bit pattern of a double.
# Vectorized over x by recursion.
expt = function(x) {
  if (length(x) > 1) {
    sapply(x, expt)
  } else {
    # reinterpret the 8 bytes of the double as two 32-bit integers
    # NOTE(review): assumes little-endian layout so data[2] is the
    # sign/exponent word -- confirm on the target platform
    data = readBin(writeBin(x, raw(8)), "int", n=2)
    # keep only the low 10 bits of the 11-bit exponent field, shifted down
    r = bitwShiftR(bitwAnd(data[2], 0x3ff00000), 20)
    if (r >= 512) {
      # re-center the truncated 10-bit field as a signed value
      r = r-1024
    }
    return(r+1)
  }
}
13 |
# Extract the mantissa of a double by forcing its exponent to that of 1.0,
# yielding a value in [1, 2). Vectorized over x by recursion.
mantissa = function(x) {
  if (length(x) > 1) {
    sapply(x, mantissa)
  } else {
    # reinterpret the double's bytes as two 32-bit integers (see expt)
    data = readBin(writeBin(x, raw(8)), "int", n=2)
    # clear the exponent bits (-0x7ff00001 is the mask 0x800FFFFF) and
    # install the exponent pattern for 1.0 (0x3ff00000)
    data[2] = bitwOr(bitwAnd(data[2], -0x7ff00001), 0x3ff00000)
    # reinterpret the patched bytes as a double again
    readBin(writeBin(data, raw(8)), "double")
  }
}
23 |
# Approximate log2(x) as the integer exponent plus a quadratic polynomial
# correction evaluated on the mantissa (which lies in [1, 2)).
approxLog2 = function(x) {
  frac = mantissa(x)
  correction = ((6 * frac - frac * frac) - 5)/3
  expt(x) + correction
}
28 |
--------------------------------------------------------------------------------
/quality/kll-comparison.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tdunning/t-digest/7905f3d2ad18e7d7176811147d1316a3e23d7061/quality/kll-comparison.pdf
--------------------------------------------------------------------------------
/quality/merge.r:
--------------------------------------------------------------------------------
1 | require(dplyr)
2 | data = read.csv("merge.csv")
3 |
# Draw one panel of merge-error boxplots for digests built from `n` parts.
# Reads the global `data` frame (loaded from merge.csv at the top of the
# script). `yaxt='n'` suppresses the y axis for interior panels.
plotMerge = function(n, yaxt = 's') {
  if (yaxt == 'n') {
    # interior panel: no y label, narrow left margin
    ylab = NA
    old = par(mar=c(3.5, 0.5, 2, 0.2))
  } else {
    ylab = "Absolute error (ppm)"
    old = par(mar=c(3.5, 4.0, 2, 0.2))
  }
  par(las=1)
  par(lwd=0.5)
  par(cex.lab=1.3)
  par(cex.axis=0.9)
  par(mgp=c(2.4, 0.5, 0))
  par(tcl=-0.3)

  our.data = data %>% filter(type == "quantile", parts == n)
  # direct digest errors (e1), offset left; errors scaled to parts per million
  boxplot(e1*1e6 ~ q, at=(1:6)-0.23, xaxt='n', boxwex=0.19, our.data,
      ylim=c(-3000, 3000), cex=0.5, yaxt = yaxt,
      col=rgb(0.95, 0.95, 0.95),
      xlab=NA, ylab=NA)
  title(xlab=expression('Quantile '(italic(q))), mgp=c(2.2, 0.5, 0))
  title(ylab=ylab, mgp=c(2.8, 0.0, 0))
  # stratified merge errors (e3), centered
  boxplot(e3*1e6 ~ q, at=1:6, xaxt='n', boxwex=0.19, add=T, our.data,
      col=rgb(0.7, 0.7, 0.7), cex=0.5, yaxt = yaxt)
  # flat merge errors (e2), offset right
  boxplot(e2*1e6 ~ q, at=1:6+0.23, xaxt='n', boxwex=0.19, add=T, our.data,
      col=rgb(0.4, 0.4, 0.4), cex=0.5, yaxt = yaxt)
  axis(side=1, at=1:6,
      labels=c(expression(10^-3), expression(10^-2), 0.1, 0.2, 0.3, 0.5),
      )
  legend(0.13, -1300,
      expression("Direct "(delta==100),
          "Stratified merge "(delta==200,100),
          "Flat merge "(delta==100,100)),
      fill = c(rgb(0.95, 0.95, 0.95), rgb(0.7, 0.7, 0.7), rgb(0.4, 0.4, 0.4)),
      cex=0.75)
  abline(h=0, col=rgb(0.4, 0.4, 0.4))
  title(paste(n, " parts"), cex.main=1.3)
  box()
  # restore margins changed at entry
  par(old)
}
44 |
#setEPS()
# Three panels side by side: 5, 20 and 100 merged parts. The first panel is
# wider to make room for the shared y axis.
pdf("merge.pdf", width=6, height=2.4, pointsize=9, family='serif')
layout(matrix(c(1,2,3), 1, 3, byrow=T), widths=c(1.285,1,1))
par(cex=1)

plotMerge(5, 's')
plotMerge(20, 'n')
plotMerge(100, 'n')

dev.off()
55 |
--------------------------------------------------------------------------------
/quality/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 |
8 | com.tdunning
9 | t-digest-parent
10 | 3.4-SNAPSHOT
11 | ../pom.xml
12 |
13 | t-digest-quality
14 |
15 |
16 |
17 | junit
18 | junit
19 | 4.13.1
20 | test
21 |
22 |
23 | org.apache.mahout
24 | mahout-math
25 | 0.9
26 | test
27 |
28 |
29 | com.google.guava
30 | guava
31 | 32.0.0-jre
32 | test
33 |
34 |
35 | com.clearspring.analytics
36 | stream
37 | 2.5.2
38 | test
39 |
40 |
41 | org.apache.datasketches
42 | datasketches-java
43 | 2.0.0
44 | test
45 |
46 |
47 | com.tdunning
48 | t-digest
49 | ${project.parent.version}
50 |
51 |
52 |
53 |
54 |
55 |
56 | org.apache.maven.plugins
57 | maven-compiler-plugin
58 | 3.3
59 |
60 | true
61 | 1.7
62 | 1.8
63 | 1.8
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/BinFill.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.google.common.collect.Lists;
21 | import com.tdunning.math.stats.Centroid;
22 | import com.tdunning.math.stats.Dist;
23 | import com.tdunning.math.stats.MergingDigest;
24 | import com.tdunning.math.stats.ScaleFunction;
25 | import com.tdunning.math.stats.TDigest;
26 | import org.apache.mahout.math.jet.random.AbstractDistribution;
27 | import org.junit.Test;
28 |
29 | import java.io.FileNotFoundException;
30 | import java.io.PrintWriter;
31 | import java.util.Random;
32 |
33 | /**
34 | * Plots the size of each bin for various distributions and parameters.
35 | *
36 | * The bin-fill.r program can run in the same directory as this program to get some
37 | * visualization about how well clusters are filled.
38 | */
public class BinFill {
    /**
     * Builds three small merging digests from the same fixed data, one per
     * scale function K_1..K_3, and prints each digest's centroids as CSV to
     * stdout, for two values of the compression parameter delta.
     */
    @Test
    public void sampleFill() {
        System.out.printf("scale,delta,centroid,mean,count\n");
        for (double delta : new double[]{5, 10}) {
            double[] data = {0, 0, 3, 4, 1, 6, 0, 5, 2, 0, 3, 3, 2, 3, 0, 2, 5, 0, 3, 1};

            MergingDigest t1 = new MergingDigest(delta);
            t1.setScaleFunction(ScaleFunction.K_1);

            MergingDigest t2 = new MergingDigest(delta);
            t2.setScaleFunction(ScaleFunction.K_2);

            MergingDigest t3 = new MergingDigest(delta);
            t3.setScaleFunction(ScaleFunction.K_3);
            for (double x : data) {
                t1.add(x);
                t2.add(x);
                t3.add(x);
            }


            int i = 1;
            for (MergingDigest t : Lists.newArrayList(t1, t2, t3)) {
                // header line per digest: estimated vs exact 65th percentile
                System.out.printf("> %d, %.0f, %.5f, %.5f\n", i, delta, t.quantile(0.65), Dist.quantile(0.65, data));
                int j = 0;
                for (Centroid centroid : t.centroids()) {
                    System.out.printf("%d,%.0f,%d,%.5f,%d\n", i, delta, j, centroid.mean(), centroid.count());
                    j++;
                }
                i++;
            }
        }
    }

    // number of samples drawn per digest in main(); double so that ratio
    // arithmetic below stays in floating point
    private static final double N = 100000;

    /**
     * Writes bin-fill.csv describing centroid fill for every combination of
     * scale function, digest factory, input distribution and 10 passes.
     * Consumed by the bin-fill.r visualization script.
     *
     * @param args unused
     * @throws FileNotFoundException if bin-fill.csv cannot be created
     */
    public static void main(String[] args) throws FileNotFoundException {
        try (PrintWriter out = new PrintWriter("bin-fill.csv")) {
            out.printf("iteration,dist,algo,scale,q,x,k0,k1,dk,q0,q1,count,max0,max1\n");

            // for all scale functions except the non-normalized ones
            for (ScaleFunction f : ScaleFunction.values()) {
                if (f.toString().contains("NO_NORM")) {
                    continue;
                }
                System.out.printf("%s\n", f);

                // for all kinds of t-digests
                for (Util.Factory factory : Util.Factory.values()) {
                    // for different distributions of values
                    for (Util.Distribution distribution : Util.Distribution.values()) {
                        AbstractDistribution gen = distribution.create(new Random());
                        // do multiple passes
                        for (int i = 0; i < 10; i++) {
                            TDigest dist = factory.create();
                            if (dist instanceof MergingDigest) {
                                // can only set scale function on merging digest right now ...
                                // ability for TreeDigest coming soon
                                dist.setScaleFunction(f);
                            }
                            for (int j = 0; j < N; j++) {
                                dist.add(gen.nextDouble());
                            }

                            // now dump stats for the centroids
                            // q0/q1 bracket each centroid's share of total mass;
                            // k0/k1 are the corresponding scale-function values
                            double q0 = 0;
                            double k0 = 0;
                            for (Centroid c : dist.centroids()) {
                                double q1 = q0 + (double) c.count() / N;
                                double k1 = f.k(q1, dist.compression(), dist.size());
                                out.printf("%d,%s,%s,%s,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%d,%.1f,%.1f\n",
                                        i, distribution, factory, f, (q0 + q1) / 2, c.mean(),
                                        k0, k1, k1 - k0, q0, q1, c.count(),
                                        dist.size() * f.max(q0, dist.compression(), dist.size()),
                                        dist.size() * f.max(q1, dist.compression(), dist.size())
                                );
                                q0 = q1;
                                k0 = k1;
                            }
                        }
                    }
                }
            }
        }
    }
}
126 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/ComparisonTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.clearspring.analytics.stream.quantile.QDigest;
21 | import com.tdunning.math.stats.Dist;
22 | import com.tdunning.math.stats.MergingDigest;
23 | import com.tdunning.math.stats.QuantileEstimator;
24 | import com.tdunning.math.stats.TDigest;
25 | import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
26 | import org.apache.mahout.math.jet.random.Gamma;
27 | import org.apache.mahout.math.jet.random.Uniform;
28 | import org.junit.Test;
29 |
30 | import java.io.FileNotFoundException;
31 | import java.io.FileOutputStream;
32 | import java.io.PrintWriter;
33 | import java.util.Arrays;
34 | import java.util.List;
35 | import java.util.Random;
36 |
37 | /**
38 | * Compares t-digest to q-digest and traditional streaming quantile algorithms.
39 | */
40 | public class ComparisonTest {
41 | private static double M = 20;
42 |
43 | @Test
44 | public void compareToQDigest() throws FileNotFoundException {
45 | Random rand = new Random();
46 | try (PrintWriter out = new PrintWriter(new FileOutputStream("qd-tree-comparison.csv"))) {
47 | out.printf("tag,compression,q,e1,e2,t.size,q.size\n");
48 |
49 | for (int i = 0; i < M; i++) {
50 | compareQD(out, new Gamma(0.1, 0.1, rand), "gamma", 1L << 48);
51 | // the bounds for the uniform distribution are varied to avoid round off effects
52 | compareQD(out, new Uniform(0, rand.nextDouble() * 0.05 + 1.01, rand), "uniform", 1L << 48);
53 | }
54 | }
55 | }
56 |
57 | private void compareQD(PrintWriter out, AbstractContinousDistribution gen, String tag, long scale) {
58 | for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
59 | QDigest qd = new QDigest(compression);
60 | TDigest dist = new MergingDigest(compression);
61 | double[] data = new double[100000];
62 | for (int i = 0; i < 100000; i++) {
63 | double x = gen.nextDouble();
64 | dist.add(x);
65 | qd.offer((long) (x * scale));
66 | data[i] = x;
67 | }
68 | dist.compress();
69 | Arrays.sort(data);
70 |
71 | for (double q : new double[]{1e-5, 1e-4, 0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999}) {
72 | double x1 = dist.quantile(q);
73 | double x2 = (double) qd.getQuantile(q) / scale;
74 | double e1 = Dist.cdf(x1, data) - q;
75 | double e2 = Dist.cdf(x2, data) - q;
76 | out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n", tag, compression, q, e1, e2, dist.smallByteSize(), QDigest.serialize(qd).length);
77 | }
78 | }
79 | }
80 |
81 | @Test
82 | public void compareToStreamingQuantile() throws FileNotFoundException {
83 | Random rand = new Random();
84 |
85 | try (PrintWriter out = new PrintWriter(new FileOutputStream("sq-tree-comparison.csv"))) {
86 | out.printf("tag,compression,q,e1,e2,t.size,q.size\n");
87 | for (int i = 0; i < M; i++) {
88 | compareSQ(out, new Gamma(0.1, 0.1, rand), "gamma");
89 | compareSQ(out, new Uniform(0, 1, rand), "uniform");
90 | }
91 | }
92 | }
93 |
94 | private void compareSQ(PrintWriter out, AbstractContinousDistribution gen, String tag) {
95 | double[] quantiles = {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999};
96 | for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
97 | QuantileEstimator sq = new QuantileEstimator(1001);
98 | TDigest dist = new MergingDigest(compression);
99 | double[] data = new double[100000];
100 | for (int i = 0; i < 100000; i++) {
101 | double x = gen.nextDouble();
102 | dist.add(x);
103 | sq.add(x);
104 | data[i] = x;
105 | }
106 | dist.compress();
107 | Arrays.sort(data);
108 |
109 | List qz = sq.getQuantiles();
110 | for (double q : quantiles) {
111 | double x1 = dist.quantile(q);
112 | double x2 = qz.get((int) (q * 1000 + 0.5));
113 | double e1 = Dist.cdf(x1, data) - q;
114 | double e2 = Dist.cdf(x2, data) - q;
115 | out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n",
116 | tag, compression, q, e1, e2, dist.smallByteSize(), sq.serializedSize());
117 |
118 | }
119 | }
120 | }
121 |
122 | }
123 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/Git.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import java.io.BufferedReader;
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.io.InputStreamReader;
24 |
25 | /**
26 | * Functions for probing Git. Handy for marking test results against git hashes.
27 | */
/**
 * Functions for probing Git. Handy for marking test results against git hashes.
 */
class Git {
    /**
     * Returns true if the working tree has no uncommitted changes according to
     * {@code git diff-index --quiet HEAD --}, false on any error.
     */
    private static boolean isGitClean() {
        try {
            Process p = new ProcessBuilder("git", "diff-index", "--quiet", "HEAD", "--")
                    .redirectOutput(new File("/dev/null"))
                    .start();
            // fix: exitValue() throws IllegalThreadStateException unless the
            // process has already terminated; wait for completion instead
            return p.waitFor() == 0;
        } catch (IOException e) {
            return false;
        } catch (InterruptedException e) {
            // restore the interrupt flag and report the tree as not-clean
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Returns the hash of the current HEAD commit.
     *
     * @param force if true, return the hash even when there are uncommitted changes
     * @throws IOException if git cannot be run, its output is unexpected, or the
     *                     tree is dirty and {@code force} is false
     */
    static String getHash(boolean force) throws IOException {
        if (force || isGitClean()) {
            Process p = new ProcessBuilder("git", "log", "-1")
                    .start();
            try (BufferedReader stdout = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
                // output should look like "commit 01ea144ca865361be6786fd502bb554c75105e3c"
                String line = stdout.readLine();
                if (line == null || !line.startsWith("commit ")) {
                    throw new IOException("Unexpected output from git log");
                }
                return line.substring(7);
            }
        } else {
            throw new IOException("Source directory has changes that need to be committed");
        }
    }
}
52 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/ScalingTest.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.tdigest.quality;
2 |
3 | /**
4 | * Measurement size of t-digests versus data size and compression
5 | */
6 | public class ScalingTest {
7 | }
8 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/SinglePassTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Ted Dunning under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.tdunning.tdigest.quality;
19 |
20 | import com.tdunning.math.stats.Centroid;
21 | import com.tdunning.math.stats.Dist;
22 | import com.tdunning.math.stats.MergingDigest;
23 | import com.tdunning.math.stats.ScaleFunction;
24 | import org.junit.Test;
25 |
26 | import java.io.FileNotFoundException;
27 | import java.io.FileOutputStream;
28 | import java.io.PrintWriter;
29 | import java.util.*;
30 |
31 | /**
32 | * By setting the buffer size on the MergingDigest to larger than the number of data points,
33 | * we get to see the theoretical performance of a t-digest.
34 | */
public class SinglePassTest {
    // number of samples per digest; the merge buffer is sized at 2 * N so the
    // whole data set is merged in a single sorted batch
    private static final int N = 200000;

    /**
     * This test builds t-digests in a single pass with such a large buffer that all of the data is
     * sorted in one batch. This avoids questions about the accuracy of the merging strategy and tests
     * the basic error rates from the idea of the t-digest itself.
     *
     * This test produces two data files that describe the results of the test.
     *
     * The first file is called limit-errors.csv. It contains data about the accuracy of the t-digest
     * at values of q that are evenly spaced in logit space (i.e. even spacing of log10(q/1-q)). This
     * results in points that are closely spaced near q=0 and near q=1. At each point, the value of q,
     * the corresponding quantile estimate x1=F^{-1}(q), the actual value x1 (from the data samples),
     * the round-trip quantile q1=F(x) as estimated by the t-digest and the actual round-trip quantile
     * q2 as computed from the original data are given.
     *
     * The second file is called limit-sizes.csv and gives the centroid weights and locations in terms
     * of x and q for each centroid in the t-digest. In addition, q2=F(x) and x2=F^{-1}(q) are given as
     * estimated from the original data.
     *
     * All of these tests are done under a variety of parameter settings including compression from 10 to
     * 500, centroid merging strategy and such.
     *
     * @throws FileNotFoundException If output files can't be opened.
     */
    @Test
    public void testConservativeBuild() throws FileNotFoundException {
        try (PrintWriter errors = new PrintWriter(new FileOutputStream("limit-errors.csv"));
             PrintWriter buckets = new PrintWriter(new FileOutputStream("limit-sizes.csv"))) {
            errors.printf("pass,x1,x2,q,q1,q2,error,compression,conservative\n");
            buckets.printf("pass,compression,conservative,i,q,mean,count,q2,x2\n");

            Random gen = new Random();
            for (int pass = 0; pass < 50; pass++) {
                // progress indicator; each pass sweeps all scale functions
                System.out.printf("%d\n", pass);
                for (ScaleFunction scale : ScaleFunction.values()) {
                    // skip non-normalized scale functions
                    if (scale.toString().endsWith("NO_NORM")) {
                        continue;
                    }
                    for (double compression : new double[]{20, 50, 100, 200, 300, 500}) {
                        double[] data = new double[N];
                        // buffer of 2 * N forces a single-batch merge
                        MergingDigest digest = new MergingDigest(compression, 2 * N);
                        digest.setScaleFunction(scale);

                        for (int i = 0; i < N; i++) {
                            double x = gen.nextDouble();
                            data[i] = x;
                            digest.add(x);
                        }

                        // sorted copy serves as the empirical ground truth
                        Arrays.sort(data);
                        int i = 0;
                        double sum = 0;
                        for (Centroid centroid : digest.centroids()) {
                            // q is the mid-point of the mass covered by this centroid
                            double q = (sum + centroid.count() / 2.0) / digest.size();
                            sum += centroid.count();
                            buckets.printf("%d,%.1f,%s,%d,%.12f,%.12f,%d,%.12f,%.12f\n",
                                    pass, compression, scale, i++, q, centroid.mean(), centroid.count(),
                                    Dist.cdf(centroid.mean(), data), Dist.quantile(q, data));
                        }
                        // sanity check: centroid counts should account for every sample
                        if (sum != digest.size()) {
                            System.out.printf("Oops ... total mismatch %.5f != %5d\n", sum, digest.size());
                        }

                        // probe q values evenly spaced in logit space, dense near 0 and 1
                        for (double lq = -6; lq < 6.01; lq += 0.25) {
                            double q = 1 / (1 + Math.pow(10, -lq));
                            double x1 = Dist.quantile(q, data);
                            double x2 = digest.quantile(q);
                            double q1 = digest.cdf(x1);
                            double q2 = Dist.cdf(x1, data);
                            errors.printf("%d,%.12f,%.12f,%.12f,%.12f,%.12f,%.12f,%.0f,%s\n",
                                    pass, x1, x2, q, q1, q2, Math.abs(q1 - q2) / q1, compression, scale);
                        }
                    }
                }
            }
        }
    }
}
115 |
--------------------------------------------------------------------------------
/quality/src/test/java/com/tdunning/tdigest/quality/Util.java:
--------------------------------------------------------------------------------
1 | package com.tdunning.tdigest.quality;
2 |
3 | import com.tdunning.math.stats.AVLTreeDigest;
4 | import com.tdunning.math.stats.MergingDigest;
5 | import com.tdunning.math.stats.TDigest;
6 | import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
7 | import org.apache.mahout.math.jet.random.Gamma;
8 | import org.apache.mahout.math.jet.random.Uniform;
9 |
10 | import java.io.*;
11 | import java.util.Random;
12 |
13 | /**
14 | * Handy routings for computing cdf and quantile from a list of numbers
15 | */
/**
 * Shared factories for the digest variants and input distributions used by
 * the quality tests.
 */
class Util {
    // Each factory builds a particular flavor of t-digest. The no-argument
    // create() supplies that flavor's default compression.
    enum Factory {
        // MergingDigest with the current (alternating sort + two-level
        // compression) merge strategy
        MERGE {
            TDigest create(double compression) {
                // buffer sized at 10x compression
                MergingDigest digest = new MergingDigest(compression, (int) (10 * compression));
                digest.useAlternatingSort = true;
                digest.useTwoLevelCompression = true;
                return digest;
            }

            TDigest create(double compression, int bufferSize) {
                MergingDigest digest = new MergingDigest(compression, bufferSize);
                digest.useAlternatingSort = true;
                digest.useTwoLevelCompression = true;
                return digest;
            }
            TDigest create() {
                return create(100);
            }
        },

        // MergingDigest with the historical merge strategy, for comparison
        MERGE_OLD_STYLE {
            TDigest create(double compression) {
                MergingDigest digest = new MergingDigest(compression, (int) (10 * compression));
                digest.useAlternatingSort = false;
                digest.useTwoLevelCompression = false;
                return digest;
            }

            TDigest create(double compression, int bufferSize) {
                MergingDigest digest = new MergingDigest(compression, bufferSize);
                digest.useAlternatingSort = false;
                digest.useTwoLevelCompression = false;
                return digest;
            }
            TDigest create() {
                return create(100);
            }
        },

        // AVL-tree based digest; takes no buffer, so the bufferSize overload
        // falls through to the default implementation below
        TREE {
            TDigest create(double compression) {
                return new AVLTreeDigest(compression);
            }
            TDigest create() {
                return create(20);
            }
        };

        abstract TDigest create(double compression);
        abstract TDigest create();

        // default: ignore bufferSize for digests that have no merge buffer
        TDigest create(double compression, int bufferSize) {
            return create(compression);
        }
    }

    // Input distributions used to stress the digests: a benign uniform case
    // and a highly skewed gamma case.
    enum Distribution {
        UNIFORM {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Uniform(0, 1, gen);
            }
        },

        GAMMA {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Gamma(0.1, 0.1, gen);
            }
        };

        public abstract AbstractContinousDistribution create(Random gen);
    }
}
91 |
--------------------------------------------------------------------------------
/quality/x.r:
--------------------------------------------------------------------------------
# Experiment 1: average order statistics of small uniform samples.
# For sample sizes y = 2..40, average the deciles of runif(y) over many
# repetitions (more repetitions for smaller samples) and plot each decile's
# trajectory as sample size grows.
plot(c(), c(), ylim=c(0,40), xlim=c(0,1), yaxt='n')
axis(side=2, at=(0:4)*10, labels=(0:4)*50)
r = matrix(0, nrow = 40, ncol=11)
for (y in 2:40) {
  n = 5*y
  # scale repetitions so total work is roughly constant per sample size
  m = 50 * (10000/n)
  sum = rep(0, len=11)
  for (i in 1:m) {
    sum = sum + quantile(runif(n=y), seq(0,1,0.1))
  }
  r[y,] = sum/m
  print(y)
}
for (x in 1:11) {
  lines(r[2:40,x], 2:40, type='b')
}


# Experiment 2: behavior of extreme low quantiles (q <= 0.02) for larger
# samples, on a log x axis; each point is the mean over ~1e6 total draws.
plot(c(), c(), ylim=c(0,40), xlim=c(1e-6,0.03), yaxt='n', log='x')
axis(side=2, at=(0:4)*10, labels=(0:4)*1000)
for (i in 1:40) {
  n = i * 1000
  m = 1e6/n
  r = rep(0,4)
  for (j in 1:m) {
    r = r + quantile(runif(n), c(0, 0.001, 0.01, 0.02))
  }
  r = r/m
  points(r, rep(i, 4))
}
31 |
--------------------------------------------------------------------------------
/size-studies.r:
--------------------------------------------------------------------------------
1 | # Experiments with t-digest in R
2 |
# Classic t-digest centroid size bound: proportional to q(1-q), so centroids
# are small near the tails and largest at the median.
standard.size.bound = function(n, q) {
  spread = q * (1-q)
  return(4 * n * spread)
}
6 |
# Degenerate bound that ignores q entirely: every centroid may hold up to
# n / compression points.
constant.size.bound = function(n, q) {
  return(n)
}
10 |
# Square-root bound: tails shrink like sqrt(q(1-q)) rather than linearly.
root.size.bound = function(n, q) {
  spread = 4 * q * (1-q)
  return(n * sqrt(spread))
}
14 |
# Piecewise-linear bound: proportional to the distance from the nearer tail.
abs.size.bound = function(n, q) {
  tail.distance = min(q, 1-q)
  return(2 * n * tail.distance)
}
18 |
# Builds a t-digest-like summary from pre-sorted data by greedily filling
# centroids up to the size allowed by `size.bound` at the current quantile.
# `size.bound` may be a function or (via do.call) the name of one.
# Returns a data frame with one row per centroid: center (mean) and count.
sorted.t.digest = function (points, compression=50, size.bound = standard.size.bound) {
  points = sort(points)
  n = length(points)

  total = 0
  i = 1
  r = data.frame()
  while (i <= n) {
    # accumulate a centroid of max size
    mean = 0
    count = 0
    qx = total/n
    # fix: guard i <= n, otherwise a large bound lets the inner loop read
    # past the end of points and produce NA centroids
    while (i <= n && count + 1 <= max(1, do.call(size.bound,list(n=n, q=qx)) / compression)) {
      count = count+1
      # incremental mean update
      mean = mean + (points[i]-mean)/count
      qx = (total + count/2) / n
      i = i+1
    }
    total = total + count
    r = rbind(r, data.frame(center=c(mean), count=c(count)))
  }
  r
}
42 |
# Measure how the number of centroids grows with sample size under each of
# the candidate size bounds, using standard normal data.
size.growth = data.frame()
sample.size = c(100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000)
# bounds are passed by name; do.call() inside sorted.t.digest resolves them
bounds = c("standard.size.bound", "abs.size.bound", "root.size.bound", "constant.size.bound")
for (j in 1:length(bounds)) {
  bound = bounds[j]
  print(bound)
  for (i in 1:length(sample.size)) {
    n = sample.size[i]
    x = rnorm(n)
    cx = sorted.t.digest(x, size.bound=bound)
    # record bound name, sample size and resulting centroid count
    size.growth = rbind(size.growth, data.frame(f=bound, n=n, c=dim(cx)[1]))
    print(c(n, dim(cx)[1]))
  }
}

# Plot centroid count versus sample size; standard and abs bounds share a
# color, and abs (i == 2) is omitted from the plot loop below.
colors = rainbow(3)
colors = c(colors[1],colors[1],colors[2], colors[3])
plot(x=c(), y=c(), xlim=c(1e2,1e6), ylim=c(0,350), log="x", ylab="Centroids", xlab="Points")
for (i in c(1,3,4)) {
  # unclass(f) gives the factor level index matching the bounds ordering
  lines(c~n, size.growth[unclass(size.growth$f)==i,], col=colors[i], lwd=2, type='b', cex=0.6)
}
legend(1e2, y=350, legend=c("Standard", "Root", "Constant"), fill=rainbow(3))
65 |
# Like sorted.t.digest but with the standard q(1-q) bound inlined and without
# sorting the input first (callers are expected to pass ordered data).
# Returns a data frame with one row per centroid: center (mean) and count.
direct.t.digest = function (points, compression=50) {
  n = length(points)

  total = 0
  i = 1
  r = data.frame()
  while (i <= n) {
    # accumulate a centroid of max size
    mean = 0
    count = 0
    # fix: the quantile must be total/n, not the raw count `total`; the raw
    # count made q(1-q) negative after the first centroid, collapsing every
    # centroid to a single point
    qx = total / n
    # guard i <= n so the final centroid cannot read past the end of points
    while (i <= n && count + 1 <= max(1, 4 * n * (qx * (1-qx) / compression))) {
      count = count+1
      # incremental mean update
      mean = mean + (points[i]-mean)/count
      qx = (total + count/2) / n
      i = i+1
    }
    total = total + count
    r = rbind(r, data.frame(center=c(mean), count=c(count)))
  }
  r
}
88 |
89 |
90 |
--------------------------------------------------------------------------------