├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── isarn-sketches-java └── src │ └── main │ └── java │ └── org │ └── isarnproject │ └── sketches │ └── java │ └── TDigest.java ├── project ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── org │ └── isarnproject │ └── sketches │ ├── TDigest.scala │ └── tdmap │ └── TDigestMap.scala ├── site └── index.html └── test └── scala └── org └── isarnproject └── sketches ├── TDigestTest.scala └── java └── JavaTDigestTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # isarn-sketches 2 | Sketching data structures 3 | 4 | ### API documentation 5 | - https://isarn.github.io/isarn-sketches/scala/api/ 6 | - https://isarn.github.io/isarn-sketches/java/api/ 7 | 8 | ### Compatibility 9 | isarn-sketches can operate with [Algebird](https://twitter.github.io/algebird/) via the 10 | [isarn-sketches-algebird-api](https://github.com/isarn/isarn-sketches-algebird-api) 11 | 12 | isarn-sketches can also operate with [Apache Spark](https://github.com/apache/spark) via the [isarn-sketches-spark](https://github.com/isarn/isarn-sketches-spark) library 13 | 14 | ### How to use in your project 15 | 16 | ``` scala 17 | // isarn-sketches 18 | libraryDependencies += "org.isarnproject" %% "isarn-sketches" % "0.3.0" 19 | 20 | // isarn-sketches-java 21 | libraryDependencies += "org.isarnproject" % "isarn-sketches-java" % "0.3.0" 22 | ``` 23 | 24 | ### t-digest 25 | ``` scala 26 | scala> import org.isarnproject.sketches.TDigest 27 | import org.isarnproject.sketches.TDigest 28 | 29 | scala> val data = Vector.fill(10000) { scala.util.Random.nextGaussian() } 30 | data: scala.collection.immutable.Vector[Double] = Vector(1.6046163970051968, 0.44151418924289004, ... 31 | 32 | scala> val sketch = TDigest.sketch(data) 33 | sketch: org.isarnproject.sketches.TDigest = TDigest(0.5,0,74,TDigestMap(-3.819069044174932 -> (1.0, 1.0), ... 34 | 35 | scala> sketch.cdf(0) 36 | res0: Double = 0.4984362744530557 37 | 38 | scala> sketch.cdfInverse(0.5) 39 | res1: Double = 0.0038481195948969205 40 | ``` 41 | 42 | #### t-digest resources 43 | * Original paper: [Computing Extremely Accurate Quantiles Using t-Digests](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 44 | * Video Talk: [Sketching Data with T-Digest In Apache Spark](https://youtu.be/ETUYhEZRtWE) 45 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // sbt clean unidoc previewSite 18 | // sbt clean unidoc ghpagesPushSite 19 | // sbt +isarn_sketches/publish 20 | // publish isarn-sketches-java for exactly one scala version: 21 | // sbt isarn_sketches_java/publish 22 | 23 | scalaVersion := "2.12.8" 24 | 25 | crossScalaVersions := Seq("2.11.12", "2.12.8") 26 | 27 | // these do not "inherit" when defined at top level, so 28 | // define them here for inclusion in each subproject. 
29 | // This also worked: 'xxx in ThisProject := yyy', but you have to do it 30 | // for each setting below, so this seemed a bit cleaner 31 | def publishSettings = Seq( 32 | version := "0.3.1-SNAPSHOT", 33 | //isSnapshot := true, 34 | //publishConfiguration := publishConfiguration.value.withOverwrite(true), 35 | publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true), 36 | organization := "org.isarnproject", 37 | pomIncludeRepository := { _ => false }, 38 | publishMavenStyle := true, 39 | publishTo := { 40 | val nexus = "https://oss.sonatype.org/" 41 | if (isSnapshot.value) 42 | Some("snapshots" at nexus + "content/repositories/snapshots") 43 | else 44 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 45 | }, 46 | licenses += ("Apache-2.0", url("http://opensource.org/licenses/Apache-2.0")), 47 | homepage := Some(url("https://github.com/isarn/isarn-sketches")), 48 | scmInfo := Some( 49 | ScmInfo( 50 | url("https://github.com/isarn/isarn-sketches"), 51 | "scm:git@github.com:isarn/isarn-sketches.git" 52 | ) 53 | ), 54 | developers := List( 55 | Developer( 56 | id = "erikerlandson", 57 | name = "Erik Erlandson", 58 | email = "eje@redhat.com", 59 | url = url("https://erikerlandson.github.io/") 60 | ) 61 | ) 62 | ) 63 | 64 | compileOrder := CompileOrder.JavaThenScala 65 | 66 | javacOptions ++= Seq() 67 | 68 | scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature") 69 | 70 | scalacOptions in (Compile, doc) ++= Seq("-doc-root-content", baseDirectory.value+"/root-doc.txt") 71 | 72 | enablePlugins(ScalaUnidocPlugin, JavaUnidocPlugin, GhpagesPlugin) 73 | 74 | git.remoteRepo := "git@github.com:isarn/isarn-sketches.git" 75 | 76 | siteSubdirName in ScalaUnidoc := "scala/api" 77 | 78 | siteSubdirName in JavaUnidoc := "java/api" 79 | 80 | addMappingsToSiteDir(mappings in (ScalaUnidoc, packageDoc), siteSubdirName in ScalaUnidoc) 81 | 82 | addMappingsToSiteDir(mappings in (JavaUnidoc, packageDoc), siteSubdirName in JavaUnidoc) 83 | 84 | // tell unidoc to not do scala-doc for the isarn-sketches-java (javadoc will still get created) 85 | unidocProjectFilter in (ScalaUnidoc, unidoc) := inAnyProject -- inProjects(isarn_sketches_java) 86 | 87 | // this target needs to execute only once, at the top level 88 | // turn it off for any sub-projects 89 | def siteSubProjectSettings = Seq( 90 | previewSite := {} 91 | ) 92 | 93 | // browser insisted on caching some older generated site at the default (4000) 94 | previewFixedPort := Some(4444) 95 | 96 | lazy val isarn_sketches_java = (project in file("isarn-sketches-java")) 97 | .settings(name := "isarn-sketches-java") 98 | .enablePlugins(GenJavadocPlugin, PublishJavadocPlugin) 99 | .settings(siteSubProjectSettings :_*) 100 | .settings( 101 | crossPaths := false, // drop off Scala suffix from artifact names 102 | autoScalaLibrary := false // exclude scala-library from dependencies 103 | ) 104 | .settings(publishSettings :_*) 105 | 106 | lazy val isarn_sketches = (project in file(".")) 107 | .aggregate(isarn_sketches_java) 108 | .dependsOn(isarn_sketches_java) 109 | .settings(name := "isarn-sketches") 110 | .settings( 111 | // isarn_sketches_java needs to be published separately to work with 'crossPaths := false' 112 | aggregate in publish := false, 113 | libraryDependencies ++= Seq( 114 | "org.isarnproject" %% "isarn-algebra-api" % "0.0.3", 115 | "org.isarnproject" %% "isarn-collections" % "0.0.4", 116 | "org.isarnproject" %% "isarn-scalatest" % "0.0.3" % Test, 117 | "org.scalatest" %% "scalatest" % "3.0.5" % Test, 118 | 
"org.apache.commons" % "commons-math3" % "3.6.1" % Test) 119 | ) 120 | .settings(publishSettings :_*) 121 | -------------------------------------------------------------------------------- /isarn-sketches-java/src/main/java/org/isarnproject/sketches/java/TDigest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.isarnproject.sketches.java; 18 | 19 | import java.lang.System; 20 | import java.lang.StringBuilder; 21 | import java.util.Arrays; 22 | import java.util.Comparator; 23 | import java.io.Serializable; 24 | import java.util.concurrent.ThreadLocalRandom; 25 | import java.util.Random; 26 | 27 | /** 28 | * A t-digest sketch of sampled numeric data 29 | *
<pre> 30 |  * Computing Extremely Accurate Quantiles Using t-Digests,
 31 |  * Ted Dunning and Otmar Ertl,
 32 |  * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
 33 |  * </pre>
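 * A t-digest maintains a sparse set of weighted cluster centers; the mass
 * permitted in a cluster at quantile q is bounded by C * M * q * (1 - q),
 * which shrinks toward the tails, so resolution is highest near q = 0 and q = 1.
 * Example: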
 34 |  * 35 |  * <pre>
 36 |  * import org.isarnproject.sketches.java.TDigest;
 37 |  * double[] data = ...; // data that you would like to sketch
 38 |  * TDigest sketch = TDigest.sketch(data);
 39 |  * // the cumulative distribution function of the sketch; cdf(x) at x = 0
 40 |  * double cdf = sketch.cdf(0.0);
 41 |  * // inverse of the CDF, evaluated at q = 0.5
 42 |  * double cdfi = sketch.cdfInverse(0.5);
 43 |  * </pre>
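 *
 * A digest can also be built incrementally and merged with other digests;
 * a minimal sketch of that usage (reusing the hypothetical data array above):
 * <pre>
 * TDigest td = TDigest.empty();        // empty digest, default compression
 * for (double x: data) td.update(x);   // add one sample at a time
 * td.merge(TDigest.sketch(data));      // fold another sketch into td
 * double median = td.cdfInverse(0.5);  // estimated median of everything seen
 * </pre>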
 44 | */ 45 | public class TDigest implements Serializable { 46 | /** compression setting (delta in original paper) */ 47 | protected final double C; 48 | /** maximum number of unique discrete values to track */ 49 | protected final int maxDiscrete; 50 | /** current number of clusters */ 51 | protected int nclusters = 0; 52 | /** total mass of data sampled so far */ 53 | protected double M = 0.0; 54 | /** cluster centers */ 55 | protected double[] cent = null; 56 | /** cluster masses */ 57 | protected double[] mass = null; 58 | /** cumulative cluster masses, represented as a Fenwick Tree */ 59 | protected double[] ftre = null; 60 | 61 | /** A new t-digest sketching structure with default compression and maximum discrete tracking. */ 62 | public TDigest() { 63 | this(COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 64 | } 65 | 66 | /** Construct a t-digest with the given compression. 67 | * Maximum discrete tracking defaults to zero. 68 | * @param compression sketching compression setting. Higher = more compression. 69 | * Must be > 0. 70 | */ 71 | public TDigest(double compression) { 72 | this(compression, 0, INIT_SIZE_DEFAULT); 73 | } 74 | 75 | /** Construct a t-digest with the given compression and maximum discrete tracking. 76 | * @param compression sketching compression setting. Higher = more compression. 77 | * Must be > 0. 78 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 79 | * If this number of values is exceeded, the sketch will begin to operate in 80 | * normal continuous mode. 81 | */ 82 | public TDigest(double compression, int maxDiscrete) { 83 | this(compression, maxDiscrete, INIT_SIZE_DEFAULT); 84 | } 85 | 86 | /** Construct a t-digest with the given compression and maximum discrete tracking. 87 | * @param compression sketching compression setting. Higher = more compression. 88 | * Must be > 0. 89 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 90 | * If this number of values is exceeded, the sketch will begin to operate in 91 | * normal continuous mode. 92 | * @param sz initial capacity to use for internal arrays. Must be > 0. 93 | */ 94 | public TDigest(double compression, int maxDiscrete, int sz) { 95 | assert compression > 0.0; 96 | assert maxDiscrete >= 0; 97 | assert sz > 0; 98 | C = compression; 99 | this.maxDiscrete = maxDiscrete; 100 | cent = new double[sz]; 101 | mass = new double[sz]; 102 | ftre = new double[1 + sz]; 103 | // ftre is 1-based. set ftre[0] to zero just to be tidy 104 | ftre[0] = 0.0; 105 | } 106 | 107 | /** 108 | * Construct a t-digest from a list of cluster centers and masses. 109 | * Object deserialization is one of the intended use cases for this constructor. 110 | * NOTE: This constructor assumes the 'cent' and 'mass' arrays will be owned 111 | * by the new t-digest object. If 'cent' and 'mass' are both null then an empty t-digest 112 | * will be created. 113 | * @param compression sketching compression setting. Higher = more compression. 114 | * Must be > 0. 115 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 116 | * If this number of values is exceeded, the sketch will begin to operate in normal continuous mode. 117 | * @param cent the list of cluster centers. Assumed to be in sorted order. 118 | * This array is assumed to be owned by the t-digest object after construction. 119 | * @param mass a list of cluster masses. Assumed to be parallel to centers. 120 | * This array is assumed to be owned by the t-digest object after construction.
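 *
 * A minimal reconstruction sketch, using hypothetical center and mass arrays
 * (e.g. as recovered by a deserializer):
 * <pre>
 * double[] centers = {0.0, 1.0, 2.0};  // sorted cluster centers
 * double[] masses = {1.0, 3.0, 1.0};   // parallel cluster masses
 * TDigest td = new TDigest(0.5, 0, centers, masses);
 * </pre>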
 121 | */ 122 | public TDigest(double compression, int maxDiscrete, double cent[], double mass[]) { 123 | assert compression > 0.0; 124 | assert maxDiscrete >= 0; 125 | this.C = compression; 126 | this.maxDiscrete = maxDiscrete; 127 | assert (cent != null && mass != null) || (cent == null && mass == null); 128 | this.nclusters = (cent != null) ? cent.length : 0; 129 | int sz = nclusters; 130 | if (sz == 0) { 131 | // cent, mass and ftre cannot be zero length 132 | sz = INIT_SIZE_DEFAULT; 133 | this.cent = new double[sz]; 134 | this.mass = new double[sz]; 135 | } else { 136 | this.cent = cent; 137 | this.mass = mass; 138 | } 139 | assert this.cent != null && this.mass != null; 140 | assert this.cent.length == sz; 141 | assert this.cent.length == this.mass.length; 142 | assert this.cent.length > 0; 143 | this.ftre = new double[1 + sz]; 144 | Arrays.fill(ftre, 0, 1 + nclusters, 0.0); 145 | this.M = 0.0; 146 | for (int j = 0; j < nclusters; ++j) { 147 | M += mass[j]; 148 | ftInc(j, mass[j]); 149 | } 150 | } 151 | 152 | /** Construct a deep copy of another t-digest */ 153 | public TDigest(TDigest that) { 154 | C = that.C; 155 | maxDiscrete = that.maxDiscrete; 156 | nclusters = that.nclusters; 157 | M = that.M; 158 | cent = Arrays.copyOf(that.cent, nclusters); 159 | mass = Arrays.copyOf(that.mass, nclusters); 160 | ftre = Arrays.copyOf(that.ftre, 1 + nclusters); 161 | } 162 | 163 | /** Update the sketch with a new sampled value 164 | * @param x the new sampled value 165 | */ 166 | public final void update(double x) { 167 | update(x, 1.0); 168 | } 169 | 170 | /** Update the sketch with a new sampled value 171 | * @param x the new sampled value 172 | * @param w the weight (aka mass) associated with x 173 | */ 174 | public final void update(double x, double w) { 175 | updateLogic(x, w); 176 | if ((nclusters > maxDiscrete) && (nclusters > R())) recluster(); 177 | } 178 | 179 | private final void updateLogic(double x, double w) { 180 | if (nclusters == 0) { 181 | // clusters are empty, so (x,w) becomes the first cluster 182 | cent[0] = x; 183 | M = w; 184 | mass[0] = w; 185 | ftre[1] = w; 186 | nclusters += 1; 187 | return; 188 | } 189 | if (nclusters <= maxDiscrete) { 190 | // we are under the limit for discrete values to track 191 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 192 | if (j >= 0) { 193 | // landed on existing cluster: add its mass and we're done 194 | M += w; 195 | mass[j] += w; 196 | ftInc(j, w); 197 | } else { 198 | // a new x value: insert as a new discrete cluster 199 | newCluster(-(j + 1), x, w); 200 | } 201 | return; 202 | } 203 | // get the index of the cluster closest to x 204 | int j = closest(x); 205 | if (x == cent[j]) { 206 | // landed on existing cluster: add its mass and we're done 207 | M += w; 208 | mass[j] += w; 209 | ftInc(j, w); 210 | return; 211 | } 212 | double m = mass[j]; 213 | // q is the quantile of the closest cluster to x 214 | // (ftSum does the right thing (return 0) for j = 0) 215 | double q = (ftSum(j - 1) + (m / 2.0)) / M; 216 | // this is the upper-bound for the mass of closest cluster 217 | double ub = C * M * q * (1.0 - q); 218 | // dm is how much mass we're allowed to add to closest cluster 219 | double dm = Math.min(w, Math.max(0.0, ub - m)); 220 | // rm is the remainder of the mass 221 | double rm = w - dm; 222 | if (dm > 0.0) { 223 | // Add any allowable mass to closest cluster and update its center.
224 | // It is safe to update center this way because it will remain 225 | // between x and original center, and so cannot move out of its original 226 | // ordering relative to its neighbors, because x is by previous logic 227 | // closer to cent[j] than any other cluster. 228 | double dc = dm * (x - cent[j]) / (m + dm); 229 | cent[j] += dc; 230 | M += dm; 231 | mass[j] += dm; 232 | ftInc(j, dm); 233 | } 234 | // if there is remaining mass, it becomes a new cluster 235 | if (rm > 0.0) newCluster((x < cent[j]) ? j : j + 1, x, rm); 236 | } 237 | 238 | /** Merge another t-digest into this one. 239 | * @param that the t-digest to merge. This t-digest is unaltered. 240 | */ 241 | public final void merge(TDigest that) { 242 | Integer[] indexes = new Integer[that.nclusters]; 243 | for (int j = 0; j < that.nclusters; ++j) indexes[j] = j; 244 | // sort so that largest clusters are first. 245 | // inserting large to small yields stable distribution estimations 246 | Comparator cmp = new Comparator() { 247 | @Override 248 | public int compare(Integer a, Integer b) { 249 | return (int)Math.signum(that.mass[b] - that.mass[a]); 250 | } 251 | }; 252 | Arrays.sort(indexes, cmp); 253 | for (int j: indexes) update(that.cent[j], that.mass[j]); 254 | } 255 | 256 | /** Re-cluster this t-digest by reinserting its clusters in randomized order. */ 257 | public final void recluster() { 258 | // I suspect it may be possible to improve on this fully-randomized algorithm, 259 | // by leveraging the largest-first heuristic I use in cluster merging. See: 260 | // http://erikerlandson.github.io/blog/2016/12/19/converging-monoid-addition-for-t-digest/ 261 | int[] indexes = new int[nclusters]; 262 | for (int j = 0; j < nclusters; ++j) indexes[j] = j; 263 | intShuffle(indexes); 264 | int sz = cent.length; 265 | double[] oldCent = cent; 266 | double[] oldMass = mass; 267 | cent = new double[sz]; 268 | mass = new double[sz]; 269 | reset(); 270 | for (int j: indexes) updateLogic(oldCent[j], oldMass[j]); 271 | } 272 | 273 | /** Reset this t-digest to an empty state */ 274 | public final void reset() { 275 | nclusters = 0; 276 | M = 0.0; 277 | } 278 | 279 | private final void newCluster(int j, double x, double w) { 280 | double[] newCent = cent; 281 | double[] newMass = mass; 282 | double[] newFtre = ftre; 283 | int sz = cent.length; 284 | if (nclusters >= sz) { 285 | int szinc = (int)Math.ceil(0.1 * (double)sz); 286 | sz += szinc; 287 | newCent = new double[sz]; 288 | newMass = new double[sz]; 289 | newFtre = new double[1 + sz]; 290 | System.arraycopy(cent, 0, newCent, 0, j); 291 | System.arraycopy(mass, 0, newMass, 0, j); 292 | } 293 | // arraycopy can handle when cent == newCent 294 | System.arraycopy(cent, j, newCent, 1 + j, nclusters - j); 295 | System.arraycopy(mass, j, newMass, 1 + j, nclusters - j); 296 | // do this after copies above 297 | newCent[j] = x; 298 | newMass[j] = w; 299 | nclusters += 1; 300 | cent = newCent; 301 | mass = newMass; 302 | ftre = newFtre; 303 | Arrays.fill(ftre, 0, 1 + nclusters, 0.0); 304 | for (int k = 0; k < nclusters; ++k) ftInc(k, mass[k]); 305 | M += w; 306 | } 307 | 308 | private final int closest(double x) { 309 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 310 | // exact match, return its index: 311 | if (j >= 0) return j; 312 | // x is not a cluster center, get its insertion index: 313 | j = -(j + 1); 314 | // x is to left of left-most cluster: 315 | if (j == 0) return j; 316 | // x is to right of right-most cluster: 317 | if (j == nclusters) return j - 1; 318 | // x is 
between two clusters, return index of closest: 319 | double dL = x - cent[j - 1]; 320 | double dR = cent[j] - x; 321 | return (dL < dR) ? (j - 1) : j; 322 | } 323 | 324 | /** Obtain the number of clusters in this t-digest 325 | * @return the number of clusters in this t-digest 326 | */ 327 | public final int size() { 328 | return nclusters; 329 | } 330 | 331 | /** Obtain the total mass sampled by this t-digest 332 | * @return the total mass 333 | */ 334 | public final double mass() { 335 | return M; 336 | } 337 | 338 | /** Obtain the compression setting for this t-digest 339 | * @return the compression setting 340 | */ 341 | public final double getCompression() { 342 | return C; 343 | } 344 | 345 | /** Obtain the maximum discrete setting for this t-digest 346 | * @return the maximum discrete setting 347 | */ 348 | public final int getMaxDiscrete() { 349 | return maxDiscrete; 350 | } 351 | 352 | /** Obtain a reference to this t-digest's cluster center array. 353 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 354 | * @return a reference to the cluster center array 355 | */ 356 | public final double[] getCentUnsafe() { 357 | return cent; 358 | } 359 | 360 | /** Obtain a reference to this t-digest's cluster mass array. 361 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 362 | * @return a reference to the cluster mass array 363 | */ 364 | public final double[] getMassUnsafe() { 365 | return mass; 366 | } 367 | 368 | /** Obtain a reference to this t-digest's cumulative mass array. 369 | * This array stores the cumulative masses of clusters in Fenwick Tree format. 370 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 371 | * @return a reference to the cumulative mass array 372 | */ 373 | public final double[] getFTUnsafe() { 374 | return ftre; 375 | } 376 | 377 | /** Returns true if this t-digest is empty, false otherwise. */ 378 | public final boolean isEmpty() { 379 | return nclusters == 0; 380 | } 381 | 382 | @Override 383 | public String toString() { 384 | StringBuilder sb = new StringBuilder("TDigest("); 385 | for (int j = 0; j < nclusters; ++j) { 386 | if (j > 25) { 387 | sb.append(" ..."); 388 | break; 389 | } 390 | if (j > 0) sb.append(", "); 391 | sb.append(cent[j]) 392 | .append(" -> (") 393 | .append(mass[j]) 394 | .append(", ") 395 | .append(ftSum(j)) 396 | .append(")"); 397 | } 398 | sb.append(")"); 399 | return sb.toString(); 400 | } 401 | 402 | /** 403 | * Perform a random sampling from the distribution as sketched by this t-digest, in 404 | * "probability density" mode. 405 | * @return A random number sampled from the sketched distribution 406 | */ 407 | public final double samplePDF() { 408 | return samplePDF(ThreadLocalRandom.current()); 409 | } 410 | 411 | /** 412 | * Perform a random sampling from the distribution as sketched by this t-digest, in 413 | * "probability density" mode. 414 | * @param prng a (pseudo) random number generator to use for the random sampling 415 | * @return A random number sampled from the sketched distribution 416 | */ 417 | public final double samplePDF(Random prng) { 418 | return cdfInverse(prng.nextDouble()); 419 | } 420 | 421 | /** 422 | * Perform a random sampling from the distribution as sketched by this t-digest, in 423 | * "probability mass" (i.e. discrete) mode. 
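 * Note that in this mode every sampled value is one of the stored cluster
 * centers, since cdfDiscreteInverse returns cluster centers directly.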
424 | * @return A random number sampled from the sketched distribution 425 | */ 426 | public final double samplePMF() { 427 | return samplePMF(ThreadLocalRandom.current()); 428 | } 429 | 430 | /** 431 | * Perform a random sampling from the distribution as sketched by this t-digest, in 432 | * "probability mass" (i.e. discrete) mode. 433 | * @param prng a (pseudo) random number generator to use for the random sampling 434 | * @return A random number sampled from the sketched distribution 435 | */ 436 | public final double samplePMF(Random prng) { 437 | return cdfDiscreteInverse(prng.nextDouble()); 438 | } 439 | 440 | /** 441 | * Perform a random sampling from the distribution as sketched by this t-digest, 442 | * using "discrete" (PMF) mode if the number of clusters ≤ maxDiscrete setting, 443 | * and "density" (PDF) mode otherwise. 444 | * @return A random number sampled from the sketched distribution 445 | */ 446 | public final double sample() { 447 | return sample(ThreadLocalRandom.current()); 448 | } 449 | 450 | /** 451 | * Perform a random sampling from the distribution as sketched by this t-digest, 452 | * using "discrete" (PMF) mode if the number of clusters ≤ maxDiscrete setting, 453 | * and "density" (PDF) mode otherwise. 454 | * @param prng a (pseudo) random number generator to use for the random sampling 455 | * @return A random number sampled from the sketched distribution 456 | */ 457 | public final double sample(Random prng) { 458 | if (nclusters <= maxDiscrete) { 459 | return cdfDiscreteInverse(prng.nextDouble()); 460 | } else { 461 | return cdfInverse(prng.nextDouble()); 462 | } 463 | } 464 | 465 | /** 466 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 467 | * distribution represented by this t-digest sketch. 468 | * @param x a numeric value 469 | * @return the cumulative probability that a random sample from the distribution is ≤ x 470 | */ 471 | public final double cdf(double x) { 472 | int j1 = rcovj(x); 473 | if (j1 < 0) return 0.0; 474 | if (j1 >= nclusters - 1) return 1.0; 475 | int j2 = j1 + 1; 476 | double c1 = cent[j1]; 477 | double c2 = cent[j2]; 478 | double tm1 = mass[j1]; 479 | double tm2 = mass[j2]; 480 | double s = ftSum(j1 - 1); 481 | double d1 = (j1 == 0) ? 0.0 : tm1 / 2.0; 482 | double m1 = s + d1; 483 | double m2 = m1 + (tm1 - d1) + ((j2 == nclusters - 1) ? tm2 : tm2 / 2.0); 484 | double m = m1 + (x - c1) * (m2 - m1) / (c2 - c1); 485 | return Math.min(m2, Math.max(m1, m)) / M; 486 | } 487 | 488 | /** 489 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 490 | * distribution represented by this t-digest sketch, assuming sketch is "discrete" 491 | * (e.g. if number of clusters ≤ maxDiscrete setting) 492 | * @param x a numeric value 493 | * @return the cumulative probability that a random sample from the distribution is ≤ x 494 | */ 495 | public final double cdfDiscrete(double x) { 496 | int j = rcovj(x); 497 | return ftSum(j) / M; 498 | } 499 | 500 | /** 501 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 502 | * estimated probability distribution represented by this t-digest sketch. 503 | * @param q a quantile value. 
The value of q is expected to be on interval [0, 1] 504 | * @return the value x such that cdf(x) = q 505 | */ 506 | public final double cdfInverse(double q) { 507 | if (q < 0.0 || q > 1.0) return Double.NaN; 508 | if (nclusters == 0) return Double.NaN; 509 | if (nclusters == 1) return cent[0]; 510 | double m = q * M; 511 | int j1 = rmcovj(m); 512 | int j2 = j1 + 1; 513 | double c1 = cent[j1]; 514 | double c2 = cent[j2]; 515 | double tm1 = mass[j1]; 516 | double tm2 = mass[j2]; 517 | double s = ftSum(j1 - 1); 518 | double d1 = (j1 == 0) ? 0.0 : tm1 / 2.0; 519 | double m1 = s + d1; 520 | double m2 = m1 + (tm1 - d1) + ((j2 == nclusters - 1) ? tm2 : tm2 / 2.0); 521 | double x = c1 + (m - m1) * (c2 - c1) / (m2 - m1); 522 | return Math.min(c2, Math.max(c1, x)); 523 | } 524 | 525 | /** 526 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 527 | * estimated probability distribution represented by this t-digest sketch, 528 | * assuming the sketch is "discrete" (e.g. if number of clusters ≤ maxDiscrete setting) 529 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 530 | * @return the smallest value x such that q ≤ cdf(x) 531 | */ 532 | public final double cdfDiscreteInverse(double q) { 533 | if (q < 0.0 || q > 1.0) return Double.NaN; 534 | if (nclusters == 0) return Double.NaN; 535 | if (nclusters == 1) return cent[0]; 536 | double m = q * M; 537 | int j = lmcovj(m); 538 | return cent[j]; 539 | } 540 | 541 | // returns the index of a right mass cover 542 | // ftSum(j) <= m < ftSum(j+1) 543 | private final int rmcovj(double m) { 544 | assert nclusters >= 2; 545 | assert (m >= 0.0) && (m <= M); 546 | int beg = 0; 547 | double mbeg = 0.0; 548 | int end = nclusters - 1; 549 | double mend = M; 550 | while ((end - beg) > 1) { 551 | int mid = (beg + end) / 2; 552 | double mmid = ftSum(mid); 553 | if (m >= mmid) { 554 | beg = mid; 555 | mbeg = mmid; 556 | } else { 557 | end = mid; 558 | mend = mmid; 559 | } 560 | } 561 | return beg; 562 | } 563 | 564 | // returns the index of a left mass cover 565 | // ftSum(j-1) < m <= ftSum(j) 566 | private final int lmcovj(double m) { 567 | assert nclusters >= 2; 568 | assert (m >= 0.0) && (m <= M); 569 | int beg = -1; 570 | double mbeg = 0.0; 571 | int end = nclusters - 1; 572 | double mend = M; 573 | while ((end - beg) > 1) { 574 | int mid = (beg + end) / 2; 575 | double mmid = ftSum(mid); 576 | if (m <= mmid) { 577 | end = mid; 578 | mend = mmid; 579 | } else { 580 | beg = mid; 581 | mbeg = mmid; 582 | } 583 | } 584 | return end; 585 | } 586 | 587 | // returns the left index of a right-cover 588 | private final int rcovj(double x) { 589 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 590 | // exact match, return its index: 591 | if (j >= 0) return j; 592 | // x is not a cluster center, get its insertion index: 593 | j = -(j + 1); 594 | // x is to left of left-most cluster: 595 | if (j == 0) return -1; 596 | // return the index to the left of x: 597 | return j - 1; 598 | } 599 | 600 | // cumulative-sum algorithm for a Fenwick tree 601 | private final double ftSum(int j) { 602 | j += 1; 603 | double s = 0.0; 604 | while (j > 0) { 605 | s += ftre[j]; 606 | j -= j & (-j); // dec by least significant nonzero bit of j 607 | } 608 | return s; 609 | } 610 | 611 | // increment algorithm for a Fenwick tree 612 | private final void ftInc(int j, double w) { 613 | j += 1; 614 | while (j <= nclusters) { 615 | ftre[j] += w; 616 | j += j & (-j); // inc by least significant nonzero bit of j 617 | } 618 | 
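    // Worked example (hypothetical masses {2, 1, 4}, so ftre = {0, 2, 3, 4}):
    // ftSum(2) visits j = 3 (s += 4) then j = 2 (s += 3) and returns 7, the
    // total mass of clusters 0..2; ftInc updates the mirrored index chain, so
    // both operations run in O(log nclusters).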
} 619 | 620 | @Override 621 | public boolean equals(Object that) { 622 | if (!(that instanceof TDigest)) return false; 623 | if (this == that) return true; 624 | TDigest rhs = (TDigest)that; 625 | if (C != rhs.C) return false; 626 | if (maxDiscrete != rhs.maxDiscrete) return false; 627 | if (nclusters != rhs.nclusters) return false; 628 | if (M != rhs.M) return false; 629 | if (!equal(cent, rhs.cent, nclusters)) return false; 630 | if (!equal(mass, rhs.mass, nclusters)) return false; 631 | // if masses are equal, cumulative ftre had better also be equal 632 | return true; 633 | } 634 | 635 | // I can't believe java just added this to Arrays in java 9 636 | static final boolean equal(double[] lhs, double[] rhs, int n) { 637 | for (int j = 0; j < n; ++j) { 638 | if (lhs[j] != rhs[j]) return false; 639 | } 640 | return true; 641 | } 642 | 643 | @Override 644 | public int hashCode() { 645 | int h = nclusters; 646 | h ^= doubleHash(M); 647 | if (nclusters >= 1) { 648 | h ^= doubleHash(cent[0]); 649 | h ^= doubleHash(mass[0]); 650 | h ^= doubleHash(ftre[1]); 651 | } 652 | if (nclusters >= 2) { 653 | h ^= doubleHash(cent[nclusters - 1]); 654 | h ^= doubleHash(mass[nclusters - 1]); 655 | h ^= doubleHash(ftre[nclusters]); 656 | } 657 | if (nclusters >= 3) { 658 | int j = nclusters / 2; 659 | h ^= doubleHash(cent[j]); 660 | h ^= doubleHash(mass[j]); 661 | h ^= doubleHash(ftre[1 + j]); 662 | } 663 | return h; 664 | } 665 | 666 | // I can't believe Double doesn't provide a static method for this 667 | static final int doubleHash(double x) { 668 | long v = Double.doubleToLongBits(x); 669 | return (int)(v ^ (v >>> 32)); 670 | } 671 | 672 | protected final int R() { 673 | return (int)(K / C); 674 | } 675 | 676 | /** 677 | * The t-digest algorithm will re-cluster itself whenever its number of clusters exceeds 678 | * (K/delta). This value is set such that the threshold is about 10x the heuristically 679 | * expected number of clusters for the user-specified delta value. Generally the number of 680 | * clusters will only trigger the corresponding re-clustering threshold when data are being 681 | * presented in a non-random order. 682 | */ 683 | public static final double K = 10.0 * 50.0; 684 | 685 | /** 686 | * Default value for a t-digest compression (aka delta) parameter. 687 | * The number of clusters varies, roughly, as 688 | * about (50/delta), when data are presented in random order 689 | * (it may grow larger if data are not presented randomly). The default corresponds to 690 | * an expected number of clusters of about 100. 691 | */ 692 | public static final double COMPRESSION_DEFAULT = 50.0 / 100.0; 693 | 694 | /** Default for the initial cluster array capacity */ 695 | public static final int INIT_SIZE_DEFAULT = 5; 696 | 697 | /** Obtain an empty t-digest with default compression and maximum discrete tracking. 698 | * @return a new empty t-digest 699 | */ 700 | public static TDigest empty() { 701 | return new TDigest(COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 702 | } 703 | 704 | /** 705 | * Obtain an empty t-digest. 706 | * maxDiscrete defaults to zero. 707 | * @param compression sketching compression setting. Higher = more compression. 708 | * Must be > 0. 709 | * @return a new empty t-digest 710 | */ 711 | public static TDigest empty(double compression) { 712 | return new TDigest(compression, 0, INIT_SIZE_DEFAULT); 713 | } 714 | 715 | /** 716 | * Obtain an empty t-digest. 717 | * @param compression sketching compression setting. Higher = more compression. 718 | * Must be > 0. 
719 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 720 | * If this number of values is exceeded, the sketch will begin to operate in 721 | * normal continuous mode. 722 | * @return a new empty t-digest 723 | */ 724 | public static TDigest empty(double compression, int maxDiscrete) { 725 | return new TDigest(compression, maxDiscrete, INIT_SIZE_DEFAULT); 726 | } 727 | 728 | /** 729 | * Obtain an empty t-digest. 730 | * @param compression sketching compression setting. Higher = more compression. 731 | * Must be > 0. 732 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 733 | * If this number of values is exceeded, the sketch will begin to operate in 734 | * normal continuous mode. 735 | * @param sz initial capacity to use for internal arrays. Must be > 0. 736 | * @return a new empty t-digest 737 | */ 738 | public static TDigest empty(double compression, int maxDiscrete, int sz) { 739 | return new TDigest(compression, maxDiscrete, sz); 740 | } 741 | 742 | /** Merge the argument with smaller mass into the one with larger mass, and return 743 | * the larger as the result. 744 | * Note this means either (ltd) or (rtd) will be modified. 745 | * @param ltd a t-digest 746 | * @param rtd another t-digest 747 | * @return if ltd has larger mass, then returns
{@code ltd.merge(rtd)}, 748 |  * otherwise {@code rtd.merge(ltd)}
749 | */ 750 | public static TDigest merge(TDigest ltd, TDigest rtd) { 751 | if (ltd.size() < rtd.size()) return merge(rtd, ltd); 752 | if (rtd.size() == 0) return ltd; 753 | if (rtd.size() == 1) { 754 | ltd.update(rtd.cent[0], rtd.mass[0]); 755 | return ltd; 756 | } 757 | if (rtd.mass() < ltd.mass()) { 758 | ltd.merge(rtd); 759 | return ltd; 760 | } else { 761 | rtd.merge(ltd); 762 | return rtd; 763 | } 764 | } 765 | 766 | /** 767 | * Sketch data using a t-digest with default compression and maximum discrete tracking. 768 | * @param data the data to sketch 769 | * @return a t-digest sketch of the data 770 | */ 771 | public static TDigest sketch(double[] data) { 772 | return sketch(data, COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 773 | } 774 | 775 | /** 776 | * Sketch data using a t-digest. 777 | * maxDiscrete defaults to zero. 778 | * @param data the data to sketch 779 | * @param compression sketching compression setting. Higher = more compression. 780 | * Must be > 0. 781 | * @return a t-digest sketch of the data 782 | */ 783 | public static TDigest sketch(double[] data, double compression) { 784 | return sketch(data, compression, 0, INIT_SIZE_DEFAULT); 785 | } 786 | 787 | /** 788 | * Sketch data using a t-digest. 789 | * @param data the data to sketch 790 | * @param compression sketching compression setting. Higher = more compression. 791 | * Must be > 0. 792 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 793 | * If this number of values is exceeded, the sketch will begin to operate in 794 | * normal continuous mode. 795 | * @return a t-digest sketch of the data 796 | */ 797 | public static TDigest sketch(double[] data, double compression, int maxDiscrete) { 798 | return sketch(data, compression, maxDiscrete, INIT_SIZE_DEFAULT); 799 | } 800 | 801 | /** 802 | * Sketch data using a t-digest. 803 | * @param data the data to sketch 804 | * @param compression sketching compression setting. Higher = more compression. 805 | * Must be > 0. 806 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 807 | * If this number of values is exceeded, the sketch will begin to operate in 808 | * normal continuous mode. 809 | * @param sz initial capacity to use for internal arrays. Must be > 0. 
810 | * @return a t-digest sketch of the data 811 | */ 812 | public static TDigest sketch(double[] data, double compression, int maxDiscrete, int sz) { 813 | TDigest td = empty(compression, maxDiscrete, sz); 814 | for (double x: data) td.update(x, 1.0); 815 | if (td.size() > maxDiscrete) td.recluster(); 816 | return td; 817 | } 818 | 819 | static void intShuffle(int[] data) { 820 | intShuffle(data, 0, data.length); 821 | } 822 | 823 | static void intShuffle(int[] data, int end) { 824 | intShuffle(data, 0, end); 825 | } 826 | 827 | static void intShuffle(int[] data, int beg, int end) { 828 | ThreadLocalRandom rnd = ThreadLocalRandom.current(); 829 | end -= 1; 830 | while (end > beg) { 831 | int r = rnd.nextInt(beg, 1 + end); // bound must include end; an exclusive bound would bias the shuffle 832 | int d = data[end]; 833 | data[end] = data[r]; 834 | data[r] = d; 835 | end -= 1; 836 | } 837 | } 838 | } 839 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.12 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url( 2 | "bintray-sbt-plugin-releases", 3 | url("http://dl.bintray.com/content/sbt/sbt-plugin-releases"))( 4 | Resolver.ivyStylePatterns) 5 | 6 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 7 | 8 | resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven" 9 | 10 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") 11 | 12 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") 13 | 14 | addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1") 15 | 16 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2") 17 | 18 | // scoverage and coveralls deps are at old versions to avoid a bug in the current versions 19 | // update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73 20 | //addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.0.4") 21 | 22 | //addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0") 23 | 24 | //addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0") 25 | -------------------------------------------------------------------------------- /src/main/scala/org/isarnproject/sketches/TDigest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package org.isarnproject.sketches 18 | 19 | import scala.util.Random 20 | 21 | import tdmap.TDigestMap 22 | 23 | /** 24 | * A t-digest sketch of sampled numeric data, as described in: 25 | * Computing Extremely Accurate Quantiles Using t-Digests, 26 | * Ted Dunning and Otmar Ertl, 27 | * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf 28 | * 29 | * {{{ 30 | * import org.isarnproject.sketches.TDigest 31 | * val data = Vector.fill(10000) { scala.util.Random.nextGaussian() } 32 | * // sketch of some Gaussian data 33 | * val sketch = TDigest.sketch(data) 34 | * // the cumulative distribution function of the sketch; cdf(x) at x = 0 35 | * val cdf = sketch.cdf(0.0) 36 | * // inverse of the CDF, evaluated at q = 0.5 37 | * val cdfi = sketch.cdfInverse(0.5) 38 | * }}} 39 | */ 40 | case class TDigest( 41 | delta: Double, 42 | maxDiscrete: Int, 43 | nclusters: Int, 44 | clusters: TDigestMap) extends Serializable { 45 | 46 | // re-cluster when number of clusters exceeds this threshold 47 | @inline private def R = (TDigest.K / delta).toInt 48 | 49 | /** 50 | * Returns a new t-digest with value x included in its sketch; td + x is equivalent to 51 | * td + (x, 1). 52 | * @param x The numeric data value to include in the sketch 53 | * @return the updated sketch 54 | */ 55 | def +[N](x: N)(implicit num: Numeric[N]): TDigest = this.plus(num.toDouble(x), 1.0) 56 | 57 | /** 58 | * Returns a new t-digest with new pair (x, w) included in its sketch. 59 | * @param xw A pair (x, w) where x is the numeric value and w is its weight 60 | * @return the updated sketch 61 | * @note This implements 'algorithm 1' from: 62 | * Computing Extremely Accurate Quantiles Using t-Digests, 63 | * Ted Dunning and Otmar Ertl, 64 | * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf 65 | */ 66 | def +[N1, N2](xw: (N1, N2))(implicit num1: Numeric[N1], num2: Numeric[N2]): TDigest = 67 | this.plus(num1.toDouble(xw._1), num2.toDouble(xw._2)) 68 | 69 | private def plus(x: Double, w: Double): TDigest = { 70 | if (nclusters <= maxDiscrete) { 71 | clusters.getNode(x).fold { 72 | TDigest(delta, maxDiscrete, nclusters + 1, clusters + (x -> w)) 73 | } { xnode => 74 | TDigest(delta, maxDiscrete, nclusters, clusters.update(x, x, xnode.data.value + w)) 75 | } 76 | } else { 77 | val s = this.update(x, w) 78 | if (s.nclusters <= R) s 79 | else { 80 | // too many clusters: attempt to compress it by re-clustering 81 | val ds = TDigest.shuffle(s.clusters.toVector) 82 | ds.foldLeft(TDigest.empty(delta, maxDiscrete)) { case (d, (x, w)) => d.update(x, w) } 83 | } 84 | } 85 | } 86 | 87 | /** 88 | * Add this digest to another 89 | * @param that The right-hand t-digest operand 90 | * @return the result of combining left and right digests 91 | */ 92 | def ++(that: TDigest): TDigest = TDigest.combine(this, that, this.delta, this.maxDiscrete) 93 | 94 | // This is most of 'algorithm 1', except for re-clustering which is factored out to avoid 95 | // recursive calls during a reclustering phase 96 | private def update(x: Double, w: Double) = { 97 | require(w > 0.0, "data weight must be > 0") 98 | 99 | if (nclusters == 0) { 100 | // our map is empty, so insert this pair as the first cluster 101 | TDigest(delta, maxDiscrete, nclusters + 1, clusters + (x -> w)) 102 | } else { 103 | // Get the current cluster nearest to incoming (x) 104 | val (c, m, psum) = clusters.nearTD(x) 105 | if (x == c) { 106 | // data landed on an existing cluster: increment that cluster's mass directly 107 | 
TDigest(delta, maxDiscrete, nclusters, clusters.update(c, c, m + w)) 108 | } else { 109 | val M = clusters.sum 110 | val q = (psum + m / 2.0) / M 111 | val ub = M * delta * q * (1.0 - q) 112 | 113 | val dm = math.min(w, math.max(0.0, ub - m)) 114 | val rm = w - dm 115 | 116 | val tClust = if (dm > 0.0) { 117 | val nm = m + dm 118 | val dc = dm * (x - c) / nm 119 | clusters.update(c, c + dc, nm) 120 | } else clusters 121 | 122 | val uClust = if (rm > 0.0) tClust + (x -> rm) else tClust 123 | 124 | // return the updated t-digest 125 | TDigest(delta, maxDiscrete, nclusters + (if (rm > 0.0) 1 else 0), uClust) 126 | } 127 | } 128 | } 129 | 130 | /** 131 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 132 | * distribution represented by this t-digest sketch. 133 | * @param x a numeric value 134 | * @return the cumulative probability that a random sample from the distribution is <= x 135 | */ 136 | def cdf[N](x: N)(implicit num: Numeric[N]): Double = clusters.cdf(x) 137 | 138 | /** 139 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 140 | * estimated probability distribution represented by this t-digest sketch. 141 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 142 | * @return the value x such that cdf(x) = q 143 | */ 144 | def cdfInverse[N](q: N)(implicit num: Numeric[N]): Double = clusters.cdfInverse(q) 145 | 146 | /** 147 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 148 | * distribution represented by this t-digest sketch, assuming sketch is "discrete" 149 | * (e.g. if number of clusters <= maxDiscrete setting) 150 | * @param x a numeric value 151 | * @return the cumulative probability that a random sample from the distribution is <= x 152 | */ 153 | def cdfDiscrete[N](x: N)(implicit num: Numeric[N]): Double = 154 | clusters.cdfDiscrete(x) 155 | 156 | /** 157 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 158 | * estimated probability distribution represented by this t-digest sketch, 159 | * assuming the sketch is "discrete" (e.g. if number of clusters <= maxDiscrete setting) 160 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 161 | * @return the smallest value x such that q <= cdf(x) 162 | */ 163 | def cdfDiscreteInverse[N](q: N)(implicit num: Numeric[N]): Double = 164 | clusters.cdfDiscreteInverse(q) 165 | 166 | /** 167 | * Perform a random sampling from the distribution as sketched by this t-digest, in 168 | * "probability density" mode. 169 | * @return A random number sampled from the sketched distribution 170 | * @note uses the inverse transform sampling method 171 | */ 172 | def samplePDF: Double = clusters.cdfInverse(Random.nextDouble) 173 | 174 | /** 175 | * Perform a random sampling from the distribution as sketched by this t-digest, in 176 | * "probability mass" (i.e. discrete) mode. 177 | * @return A random number sampled from the sketched distribution 178 | * @note uses the inverse transform sampling method 179 | */ 180 | def samplePMF: Double = clusters.cdfDiscreteInverse(Random.nextDouble) 181 | 182 | /** 183 | * Perform a random sampling from the distribution as sketched by this t-digest, 184 | * using "discrete" (PMF) mode if the number of clusters <= maxDiscrete setting, 185 | * and "density" (PDF) mode otherwise. 
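 * {{{
 * // e.g., drawing one value from a sketch (data is hypothetical)
 * val x = TDigest.sketch(data).sample
 * }}}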
186 | * @return A random number sampled from the sketched distribution 187 | * @note uses the inverse transform sampling method 188 | */ 189 | def sample: Double = if (nclusters <= maxDiscrete) samplePMF else samplePDF 190 | } 191 | 192 | /** Factory functions for TDigest */ 193 | object TDigest { 194 | import scala.language.higherKinds 195 | import scala.collection.SeqLike 196 | import scala.collection.generic.CanBuildFrom 197 | 198 | /** 199 | * Default value for a t-digest delta parameter. The number of clusters varies roughly as 200 | * (50/delta) when data are presented in random order 201 | * (it may grow larger if data are not presented randomly). The default corresponds to 202 | * an expected number of clusters of about 100. 203 | */ 204 | val deltaDefault: Double = (50.0 / 100.0) // delta * E[clusters] ~ 50 205 | 206 | /** 207 | * The t-digest algorithm will re-cluster itself whenever its number of clusters exceeds 208 | * (K/delta). This value is set such that the threshold is about 10x the heuristically 209 | * expected number of clusters for the user-specified delta value. Generally the number of 210 | * clusters will only trigger the corresponding re-clustering threshold when data are being 211 | * presented in a non-random order. 212 | */ 213 | val K: Double = 10.0 * 50.0 214 | 215 | /** 216 | * Obtain an empty t-digest 217 | * @param delta a sketch resolution parameter. 218 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 219 | * unique values. Defaults to zero; normal continuous mode. 220 | * @note Smaller values of delta yield sketches with more clusters, and higher resolution 221 | * @note The expected number of clusters will vary (roughly) as (50/delta) 222 | */ 223 | def empty(delta: Double = deltaDefault, maxDiscrete: Int = 0): TDigest = { 224 | require(delta > 0.0, s"delta must be > 0: $delta") 225 | require(maxDiscrete >= 0, s"maxDiscrete must be >= 0: $maxDiscrete") 226 | TDigest(delta, maxDiscrete, 0, TDigestMap.empty) 227 | } 228 | 229 | /** 230 | * Sketch some data with a t-digest 231 | * @param data The data elements to sketch 232 | * @param delta The sketch resolution parameter. 233 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 234 | * unique values. Defaults to zero; normal continuous mode. 235 | * @return A t-digest sketch of the input data 236 | * @note Smaller values of delta yield sketches with more clusters, and higher resolution 237 | * @note The expected number of clusters will vary (roughly) as (50/delta) 238 | */ 239 | def sketch[N]( 240 | data: TraversableOnce[N], 241 | delta: Double = deltaDefault, 242 | maxDiscrete: Int = 0)(implicit num: Numeric[N]): TDigest = { 243 | require(delta > 0.0, s"delta must be > 0: $delta") 244 | require(maxDiscrete >= 0, s"maxDiscrete must be >= 0: $maxDiscrete") 245 | val td = data.foldLeft(empty(delta, maxDiscrete))((c, e) => c + e) 246 | TDigest.shuffle(td.clusters.toVector).foldLeft(empty(delta, maxDiscrete))((c, e) => c + e) 247 | } 248 | 249 | /** 250 | * Combine two t-digests to yield a new digest 251 | * @param ltd the left-hand t-digest operand 252 | * @param rtd the right-hand t-digest operand 253 | * @param delta a sketch resolution parameter. 254 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 255 | * unique values. Defaults to zero; normal continuous mode.
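 * An illustrative usage sketch (td1 and td2 are assumed to be existing digests):
 * {{{
 * val merged = TDigest.combine(td1, td2)
 * // the ++ operator is shorthand: td1 ++ td2 is TDigest.combine(td1, td2, td1.delta, td1.maxDiscrete)
 * }}}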
256 | * @return the sum of left and right digests, defined as their aggregation 257 | * @note This operation satisfies a Semigroup law, with the caveat 258 | * that it is only "statistically" associative: d1++(d2++d3) will be statistically 259 | * similar to (d1++d2)++d3, but rarely identical. 260 | */ 261 | def combine(ltd: TDigest, rtd: TDigest, 262 | delta: Double = deltaDefault, 263 | maxDiscrete: Int = 0): TDigest = { 264 | if (ltd.nclusters <= 1 && rtd.nclusters > 1) combine(rtd, ltd, delta, maxDiscrete) 265 | else if (rtd.nclusters == 0) ltd 266 | else if (rtd.nclusters == 1) { 267 | // handle the singleton RHS case specially to prevent quadratic catastrophe when 268 | // it is being used in the Aggregator use case 269 | val d = rtd.clusters.asInstanceOf[tdmap.tree.INodeTD].data 270 | ltd + ((d.key, d.value)) 271 | } else { 272 | // insert clusters from largest to smallest 273 | (ltd.clusters.toVector ++ rtd.clusters.toVector).sortWith((a, b) => a._2 > b._2) 274 | .foldLeft(empty(delta, maxDiscrete))((d, e) => d + e) 275 | } 276 | } 277 | 278 | // Shuffle a sequence in a referentially-transparent way: pseudo-randomly, but with a random 279 | // seed that is a function of the sequence argument. 280 | private[sketches] def shuffle[E, S[X] <: SeqLike[X, S[X]]]( 281 | seq: S[E])(implicit cbf: CanBuildFrom[S[E], E, S[E]]) = 282 | if (seq.length <= 1) seq 283 | else { 284 | val seed = scala.util.hashing.MurmurHash3.productHash((seq(0), seq(1), seq.length)) 285 | (new scala.util.Random(seed)).shuffle(seq) 286 | } 287 | } 288 | -------------------------------------------------------------------------------- /src/main/scala/org/isarnproject/sketches/tdmap/TDigestMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package org.isarnproject.sketches.tdmap 18 | 19 | import math.Numeric 20 | 21 | import scala.collection.SortedMap 22 | 23 | import org.isarnproject.algebraAPI.{ MonoidAPI => Monoid, AggregatorAPI => Aggregator } 24 | 25 | import org.isarnproject.collections.mixmaps.increment._ 26 | import org.isarnproject.collections.mixmaps.prefixsum._ 27 | import org.isarnproject.collections.mixmaps.nearest._ 28 | 29 | object tree { 30 | import org.isarnproject.collections.mixmaps.redblack.tree._ 31 | import org.isarnproject.collections.mixmaps.ordered._ 32 | import org.isarnproject.collections.mixmaps.ordered.tree.DataMap 33 | import org.isarnproject.collections.mixmaps.increment.tree._ 34 | import org.isarnproject.collections.mixmaps.prefixsum.tree._ 35 | import org.isarnproject.collections.mixmaps.nearest.tree._ 36 | 37 | /** Base class of tree node for a TDigestMap */ 38 | trait NodeTD extends NodePS[Double, Double, Double] 39 | with NodeInc[Double, Double] with NodeNearMap[Double, Double] { 40 | 41 | /** 42 | * Obtain a "mass cover": two adjacent nodes in the tree such that the cumulative mass 43 | * of the left node is <= (m) and the cumulative mass of the right node is > (m) 44 | * @param m The target mass to cover between two adjacent nodes 45 | * @return a Cover instance with the left and right covering tree nodes. If the (m) 46 | * was < the mass of the left-most tree node, the left cover value will be None. Similarly 47 | * if the mass was >= the cumulative mass of the right-most node (equivalent to sum of all 48 | * node masses in the tree), then the right cover value will be None. 49 | */ 50 | final def mCover(m: Double) = mcov(m, 0.0, Cover[INodeTD](None, None)) 51 | 52 | private[tree] def mcov(m: Double, psum: Double, cov: Cover[INodeTD]): Cover[INodeTD] 53 | 54 | // Find the cluster whose prefix sum is the least upper bound of mass 'm' 55 | // domain specific to t-digest algorithms 56 | private[sketches] final def keyPFSLUB(m: Double) = this match { 57 | case _: LNodeTD => Double.NaN 58 | case _ if (m < 0.0 || m > this.pfs) => Double.NaN 59 | case _ if (m == 0.0) => this.nodeMin.get.asInstanceOf[INodeTD].data.key 60 | case _ => this.kpl(m, 0.0) 61 | } 62 | 63 | // recursive implementation of keyPFSLUB 64 | private[tree] def kpl(m: Double, psum: Double): Double 65 | 66 | // obtains the nearest cluster to 'x'. 
Returns the cluster (location, mass, prefix-sum) 67 | private[sketches] final def nearTD(x: Double): (Double, Double, Double) = ntd(x, 0.0) 68 | 69 | // recursive implementation for nearTD 70 | private[tree] def ntd(x: Double, psum: Double): (Double, Double, Double) 71 | 72 | // recursive implementation of 'update' method 73 | private[tdmap] def upd(x0: Double, x: Double, m: Double): Node[Double] 74 | } 75 | 76 | trait LNodeTD extends NodeTD 77 | with LNodePS[Double, Double, Double] with LNodeInc[Double, Double] 78 | with LNodeNearMap[Double, Double] { 79 | final def mcov(m: Double, psum: Double, cov: Cover[INodeTD]) = cov 80 | final def kpl(m: Double, psum: Double) = Double.NaN 81 | final def ntd(x: Double, psum: Double) = (Double.NaN, Double.NaN, Double.NaN) 82 | final def upd(x0: Double, x: Double, m: Double) = 83 | throw new Exception("If this exception threw, there is a bug in this code") 84 | } 85 | 86 | trait INodeTD extends NodeTD 87 | with INodePS[Double, Double, Double] with INodeInc[Double, Double] 88 | with INodeNearMap[Double, Double] { 89 | val lsub: NodeTD 90 | val rsub: NodeTD 91 | 92 | final def mcov(m: Double, psum: Double, cov: Cover[INodeTD]) = { 93 | if (m < psum + lsub.pfs) { 94 | lsub match { 95 | case n: INodeTD => 96 | lsub.mcov(m, psum, cov.copy(r = Some(n.nodeMax.get.asInstanceOf[INodeTD]))) 97 | case _ => cov.copy(r = Some(this)) 98 | } 99 | } else { 100 | val t = psum + lsub.pfs + data.value 101 | if (m >= t) rsub.mcov(m, t, cov.copy(l = Some(this))) 102 | else { 103 | lsub match { 104 | case n: INodeTD => Cover(Some(lsub.nodeMax.get.asInstanceOf[INodeTD]), Some(this)) 105 | case _ => cov.copy(r = Some(this)) 106 | } 107 | } 108 | } 109 | } 110 | 111 | final def kpl(m: Double, psum: Double) = { 112 | val lb = psum + lsub.pfs 113 | val ub = lb + data.value 114 | if (m > ub) { 115 | rsub.kpl(m, ub) 116 | } else if (m > lb) { 117 | data.key 118 | } else { 119 | lsub.kpl(m, psum) 120 | } 121 | } 122 | 123 | final def ntd(x: Double, psum: Double) = { 124 | if (x < data.key) { 125 | lsub match { 126 | case ls: INodeTD => { 127 | if (x <= ls.kmax) ls.ntd(x, psum) 128 | else { 129 | val (dk, ldk) = (math.abs(x - data.key), math.abs(x - ls.kmax)) 130 | if (dk <= ldk) (data.key, data.value, psum + lsub.pfs) 131 | else { 132 | val n = ls.node(ls.kmax).get.asInstanceOf[INodeTD] 133 | (n.data.key, n.data.value, psum + lsub.pfs - n.data.value) 134 | } 135 | } 136 | } 137 | case _ => (data.key, data.value, psum + lsub.pfs) 138 | } 139 | } else if (x > data.key) { 140 | rsub match { 141 | case rs: INodeTD => { 142 | if (x >= rs.kmin) rs.ntd(x, psum + lsub.pfs + data.value) 143 | else { 144 | val (dk, rdk) = (math.abs(x - data.key), math.abs(x - rs.kmin)) 145 | if (dk <= rdk) (data.key, data.value, psum + lsub.pfs) 146 | else { 147 | val n = rs.node(rs.kmin).get.asInstanceOf[INodeTD] 148 | (n.data.key, n.data.value, psum + lsub.pfs + data.value) 149 | } 150 | } 151 | } 152 | case _ => (data.key, data.value, psum + lsub.pfs) 153 | } 154 | } else (data.key, data.value, psum + lsub.pfs) 155 | } 156 | 157 | final def upd(x0: Double, x: Double, m: Double) = 158 | if (color == R) { 159 | if (x0 < data.key) rNode(data, lsub.upd(x0, x, m), rsub) 160 | else if (x0 > data.key) rNode(data, lsub, rsub.upd(x0, x, m)) 161 | else { 162 | val d = new DataMap[Double, Double] { 163 | val key = x 164 | val value = m 165 | } 166 | rNode(d, lsub, rsub) 167 | } 168 | } else { 169 | // We know we are directly replacing a node, so no need to call balance() 170 | // in the case of black nodes. 
This is quite a bit faster. \o/ 171 | if (x0 < data.key) bNode(data, lsub.upd(x0, x, m), rsub) 172 | else if (x0 > data.key) bNode(data, lsub, rsub.upd(x0, x, m)) 173 | else { 174 | val d = new DataMap[Double, Double] { 175 | val key = x 176 | val value = m 177 | } 178 | bNode(d, lsub, rsub) 179 | } 180 | } 181 | } 182 | } 183 | 184 | import tree._ 185 | 186 | object infra { 187 | import org.isarnproject.collections.mixmaps.redblack.tree._ 188 | import org.isarnproject.collections.mixmaps.ordered.tree.DataMap 189 | 190 | object tdmapMonoid extends Monoid[Double] { 191 | def empty = 0.0 192 | def combine(x: Double, y: Double) = x + y 193 | def combineAll(as: TraversableOnce[Double]) = as.fold(0.0)(_ + _) 194 | def combineAllOption(as: TraversableOnce[Double]) = 195 | if (as.isEmpty) None else Some(combineAll(as)) 196 | } 197 | 198 | object tdmapAggregator extends Aggregator[Double, Double] { 199 | def monoid = tdmapMonoid 200 | def lff = (m: Double, d: Double) => m + d 201 | def mf = (d: Double) => d 202 | def aggregate(as: TraversableOnce[Double]) = as.fold(0.0)(_ + _) 203 | } 204 | 205 | /** Dependency injection class for TDigestMap */ 206 | class Inject extends Serializable { 207 | // Typeclasses corresponding to "regular real numbers": 208 | val keyOrdering = implicitly[Numeric[Double]] 209 | 210 | val valueMonoid = tdmapMonoid 211 | 212 | val prefixAggregator = tdmapAggregator 213 | 214 | def iNode(clr: Color, dat: Data[Double], ls: Node[Double], rs: Node[Double]) = 215 | new Inject with INodeTD with TDigestMap { 216 | // INode 217 | val color = clr 218 | val lsub = ls.asInstanceOf[NodeTD] 219 | val rsub = rs.asInstanceOf[NodeTD] 220 | val data = dat.asInstanceOf[DataMap[Double, Double]] 221 | // INodePS 222 | val prefix = prefixAggregator.lff( 223 | prefixAggregator.monoid.combine(lsub.pfs, rsub.pfs), data.value) 224 | // INodeNear 225 | val kmin = lsub match { 226 | case n: INodeTD => n.kmin 227 | case _ => data.key 228 | } 229 | val kmax = rsub match { 230 | case n: INodeTD => n.kmax 231 | case _ => data.key 232 | } 233 | } 234 | } 235 | 236 | } 237 | 238 | import infra._ 239 | 240 | /** 241 | * The tree-backed map object a TDigest uses to store and update its clusters. TDigestMap 242 | * inherits functionality for value increment, prefix-sum and nearest-neighbor queries. 243 | */ 244 | sealed trait TDigestMap extends SortedMap[Double, Double] with NodeTD 245 | with IncrementMapLike[Double, Double, INodeTD, TDigestMap] 246 | with PrefixSumMapLike[Double, Double, Double, INodeTD, TDigestMap] 247 | with NearestMapLike[Double, Double, INodeTD, TDigestMap] { 248 | 249 | override def empty = TDigestMap.empty 250 | 251 | private def m1m2(c1: Double, tm1: Double, c2: Double, tm2: Double) = { 252 | val s = this.prefixSum(c1, open = true) 253 | val d1 = if (c1 == this.keyMin.get) 0.0 else tm1 / 2.0 254 | val m1 = s + d1 255 | val m2 = m1 + (tm1 - d1) + (if (c2 == this.keyMax.get) tm2 else tm2 / 2.0) 256 | (m1, m2) 257 | } 258 | 259 | // This updates an existing cluster with a new location and mass. It does this 260 | // efficiently by taking advantage of the knowledge that (a) this kind of update 261 | // never changes the key ordering, and therefore that (b) this operation can 262 | // always directly replace an existing node, without otherwise changing the topology 263 | // of the tree. 
Clearly, this is a domain-dependent method, and not exposed to the 264 | // public API 265 | private[sketches] def update(x0: Double, x: Double, m: Double): TDigestMap = 266 | this.upd(x0, x, m).asInstanceOf[TDigestMap] 267 | 268 | /** Compute the CDF for a value, using piece-wise linear between clusters */ 269 | def cdf[N](xx: N)(implicit num: Numeric[N]) = { 270 | val x = num.toDouble(xx) 271 | this.coverR(x) match { 272 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => { 273 | val (m1, m2) = m1m2(c1, tm1, c2, tm2) 274 | val m = m1 + (x - c1) * (m2 - m1) / (c2 - c1) 275 | // Clipping to [m1,m2] corrects numeric precision errors that can cause non-monotonic quirks 276 | math.min(m2, math.max(m1, m)) / this.sum 277 | } 278 | case Cover(Some(_), None) => 1.0 279 | case _ => 0.0 280 | } 281 | } 282 | 283 | def cdfDiscrete[N](xx: N)(implicit num: Numeric[N]) = { 284 | if (this.isEmpty) 0.0 else { 285 | val x = num.toDouble(xx) 286 | this.prefixSum(x) / this.sum 287 | } 288 | } 289 | 290 | def cdfDiscreteInverse[N](qq: N)(implicit num: Numeric[N]) = { 291 | val q = num.toDouble(qq) 292 | keyPFSLUB(q * this.sum) 293 | } 294 | 295 | /** 296 | * Compute the inverse-CDF from a given quantile on interval [0, 1], using piecewise linear 297 | * interpolation between clusters 298 | */ 299 | def cdfInverse[N](qq: N)(implicit num: Numeric[N]) = { 300 | def cdfI(m: Double, c1: Double, tm1: Double, c2: Double, tm2: Double) = { 301 | val (m1, m2) = m1m2(c1, tm1, c2, tm2) 302 | val x = c1 + (m - m1) * (c2 - c1) / (m2 - m1) 303 | // Clipping to [c1,c2] corrects numeric precision errors that can cause non-monotonic quirks 304 | math.min(c2, math.max(c1, x)) 305 | } 306 | 307 | val q = num.toDouble(qq) 308 | if (q < 0.0 || q > 1.0) Double.NaN 309 | else { 310 | val m = q * this.sum 311 | this.mCover(m).map(n => (n.data.key, n.data.value)) match { 312 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => cdfI(m, c1, tm1, c2, tm2) 313 | case Cover(None, Some((c, _))) => this.coverR(c) match { 314 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => cdfI(m, c1, tm1, c2, tm2) 315 | case _ => Double.NaN 316 | } 317 | case Cover(Some((c, _)), None) => c 318 | case _ => Double.NaN 319 | } 320 | } 321 | } 322 | 323 | override def toString = 324 | "TDigestMap(" + 325 | iterator.zip(prefixSumsIterator()) 326 | .map(x => s"${x._1._1} -> (${x._1._2}, ${x._2})").mkString(", ") + 327 | ")" 328 | } 329 | 330 | /** factory functions for TDigestMap */ 331 | object TDigestMap { 332 | /** Obtain an empty TDigestMap instance */ 333 | def empty: TDigestMap = new Inject with LNodeTD with TDigestMap 334 | } 335 | -------------------------------------------------------------------------------- /src/site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Project Documentation - isarn-sketches 6 | 7 | 8 |

Project Documentation - isarn-sketches 9 | 10 | Scala API documentation 11 |
12 | Java API documentation 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/test/scala/org/isarnproject/sketches/TDigestTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.isarnproject.sketches 18 | 19 | import org.scalatest._ 20 | 21 | import org.isarnproject.scalatest.matchers.seq._ 22 | 23 | class TDigestTest extends FlatSpec with Matchers { 24 | import org.apache.commons.math3.distribution.RealDistribution 25 | import org.apache.commons.math3.distribution.IntegerDistribution 26 | 27 | val seed = 235711L 28 | scala.util.Random.setSeed(seed) 29 | 30 | val ss = 100000 31 | val delta = 50.0 / 1000 32 | 33 | val maxD = 0.05 34 | val maxDI = 0.1 35 | 36 | def testTDvsDist(td: TDigest, dist: RealDistribution, stdv: Double): Boolean = { 37 | val xmin = td.clusters.keyMin.get 38 | val xmax = td.clusters.keyMax.get 39 | val step = (xmax - xmin) / 1000 40 | val d = (xmin to xmax by step).iterator 41 | .map(x => math.abs(td.cdf(x) - dist.cumulativeProbability(x))).max 42 | 43 | val dInv = (0.01 to 0.99 by 0.01).iterator 44 | .map(x => math.abs(td.cdfInverse(x) - dist.inverseCumulativeProbability(x))).max / stdv 45 | 46 | val pass = d <= maxD && dInv <= maxDI 47 | if (!pass) Console.err.println(s"testTDvsDist failure: d= $d dInv= $dInv") 48 | pass 49 | } 50 | 51 | def testSamplingPDF(td: TDigest, dist: RealDistribution): Boolean = { 52 | val tdSamples = Array.fill(10000) { td.samplePDF } 53 | val distSamples = Array.fill(10000) { dist.sample } 54 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 55 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 56 | val pass = d <= maxD 57 | if (!pass) Console.err.println(s"testSamplingPDF failure: d= $d") 58 | pass 59 | } 60 | 61 | def testSamplingPMF(td: TDigest, dist: IntegerDistribution): Boolean = { 62 | td.nclusters should be <=(td.maxDiscrete) 63 | val tdSamples = Array.fill(10000) { td.samplePMF } 64 | val distSamples = Array.fill(10000) { dist.sample.toDouble } 65 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 66 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 67 | val pass = d <= maxD 68 | if (!pass) Console.err.println(s"testSamplingPMF failure: d= $d") 69 | pass 70 | } 71 | 72 | def testDistribution(dist: RealDistribution, stdv: Double): Boolean = { 73 | dist.reseedRandomGenerator(seed) 74 | 75 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 76 | 77 | testTDvsDist(td, dist, stdv) && testSamplingPDF(td, dist) 78 | } 79 | 80 | def testMonotoneCDF(dist: RealDistribution): Boolean = { 81 | dist.reseedRandomGenerator(seed) 82 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 83 | val (xmin, xmax) = (td.clusters.keyMin.get, td.clusters.keyMax.get) 84 | val step = (xmax - xmin) /
100000 85 | val t = (xmin to xmax by step).iterator.map(x => td.cdf(x)).sliding(2).map(w => w(1) - w(0)).min 86 | val pass = t >= 0.0 87 | if (!pass) Console.err.println(s"testMonotoneCDF failure: t= $t") 88 | pass 89 | } 90 | 91 | def testMonotoneCDFI(dist: RealDistribution): Boolean = { 92 | dist.reseedRandomGenerator(seed) 93 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 94 | val (xmin, xmax) = (0.0, 1.0) 95 | val step = (xmax - xmin) / 100000 96 | val t = (xmin to xmax by step).iterator.map(q => td.cdfInverse(q)).sliding(2).map(w => w(1) - w(0)).min 97 | val pass = t >= 0.0 98 | if (!pass) Console.err.println(s"testMonotoneCDFI failure: t= $t") 99 | pass 100 | } 101 | 102 | def testMonotone(dist: RealDistribution): Boolean = { 103 | testMonotoneCDF(dist) && testMonotoneCDFI(dist) 104 | } 105 | 106 | it should "sketch a uniform distribution" in { 107 | import org.apache.commons.math3.distribution.UniformRealDistribution 108 | val dist = new UniformRealDistribution() 109 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 110 | } 111 | 112 | it should "sketch a normal distribution" in { 113 | import org.apache.commons.math3.distribution.NormalDistribution 114 | val dist = new NormalDistribution() 115 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 116 | } 117 | 118 | it should "sketch an exponential distribution" in { 119 | import org.apache.commons.math3.distribution.ExponentialDistribution 120 | val dist = new ExponentialDistribution(1.0) 121 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 122 | } 123 | 124 | it should "aggregate with another t-digest using ++" in { 125 | import org.apache.commons.math3.distribution.NormalDistribution 126 | val dist = new NormalDistribution() 127 | dist.reseedRandomGenerator(seed) 128 | 129 | val td1 = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 130 | val td2 = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 131 | 132 | testTDvsDist(td1 ++ td2, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 133 | } 134 | 135 | it should "respect monotonic cdf and inverse" in { 136 | import org.apache.commons.math3.distribution.ExponentialDistribution 137 | import org.apache.commons.math3.distribution.NormalDistribution 138 | import org.apache.commons.math3.distribution.UniformRealDistribution 139 | 140 | testMonotone(new UniformRealDistribution()) should be (true) 141 | testMonotone(new ExponentialDistribution(1.0)) should be (true) 142 | testMonotone(new NormalDistribution(0.0, 0.1)) should be (true) 143 | } 144 | 145 | it should "respect maxDiscrete parameter" in { 146 | import org.apache.commons.math3.distribution.GeometricDistribution 147 | val gd = new GeometricDistribution(0.33) 148 | val data = gd.sample(1000000) 149 | val dataUniq = data.distinct.sorted 150 | val kt = dataUniq.map(_.toDouble).toSet 151 | val td = TDigest.sketch(data, maxDiscrete = 50) 152 | val clust = td.clusters 153 | clust.keys.toSet should be (kt) 154 | val D = clust.keys.map { x => td.cdfDiscrete(x) } 155 | .zip(dataUniq.map { k => gd.cumulativeProbability(k) }) 156 | .map { case (p1, p2) => math.abs(p1 - p2) } 157 | .max 158 | (D <= 0.01) should be (true) 159 | testSamplingPMF(td, gd) should be (true) 160 | } 161 | 162 | it should "respect maxDiscrete parameter over ++" in { 163 | import org.apache.commons.math3.distribution.GeometricDistribution 164 | val gd = new GeometricDistribution(0.33) 165 | val tdvec = 
Vector.fill(10) { TDigest.sketch(gd.sample(100000), maxDiscrete = 50) } 166 | val td = tdvec.reduce(_ ++ _) 167 | val clust = td.clusters 168 | clust.keys.map(_.toInt).map(_.toDouble) should beEqSeq(clust.keys) 169 | val D = clust.keys.map { x => td.cdfDiscrete(x) } 170 | .zip(clust.keys.map(_.toInt).map { k => gd.cumulativeProbability(k) }) 171 | .map { case (p1, p2) => math.abs(p1 - p2) } 172 | .max 173 | (D <= 0.01) should be (true) 174 | testSamplingPMF(td, gd) should be (true) 175 | } 176 | 177 | it should "serialize and deserialize" in { 178 | import org.apache.commons.math3.distribution.NormalDistribution 179 | 180 | import org.isarnproject.scalatest.serde.roundTripSerDe 181 | 182 | val dist = new NormalDistribution() 183 | dist.reseedRandomGenerator(seed) 184 | 185 | val tdo = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 186 | 187 | val tdi = roundTripSerDe(tdo) 188 | 189 | (tdi == tdo) should be (true) 190 | 191 | testTDvsDist(tdi, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/test/scala/org/isarnproject/sketches/java/JavaTDigestTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package org.isarnproject.sketches.java 18 | 19 | import org.scalatest._ 20 | 21 | import org.isarnproject.scalatest.matchers.seq._ 22 | 23 | class JavaTDigestTest extends FlatSpec with Matchers { 24 | import org.apache.commons.math3.distribution.RealDistribution 25 | import org.apache.commons.math3.distribution.IntegerDistribution 26 | 27 | val seed = 235711L 28 | scala.util.Random.setSeed(seed) 29 | 30 | val ss = 100000 31 | val delta = 50.0 / 1000 32 | 33 | val maxD = 0.05 34 | val maxDI = 0.1 35 | 36 | def testTDvsDist(td: TDigest, dist: RealDistribution, stdv: Double): Boolean = { 37 | val xmin = td.cent(0) 38 | val xmax = td.cent(td.nclusters - 1) 39 | val step = (xmax - xmin) / 1000 40 | val d = (xmin to xmax by step).iterator 41 | .map(x => math.abs(td.cdf(x) - dist.cumulativeProbability(x))).max 42 | 43 | val dInv = (0.01 to 0.99 by 0.01).iterator 44 | .map(x => math.abs(td.cdfInverse(x) - dist.inverseCumulativeProbability(x))).max / stdv 45 | 46 | val pass = d <= maxD && dInv <= maxDI 47 | if (!pass) Console.err.println(s"testTDvsDist failure: d= $d dInv= $dInv") 48 | pass 49 | } 50 | 51 | def testSamplingPDF(td: TDigest, dist: RealDistribution): Boolean = { 52 | val tdSamples = Array.fill(10000) { td.samplePDF } 53 | val distSamples = Array.fill(10000) { dist.sample } 54 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 55 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 56 | val pass = d <= maxD 57 | if (!pass) Console.err.println(s"testSamplingPDF failure: d= $d") 58 | pass 59 | } 60 | 61 | def testSamplingPMF(td: TDigest, dist: IntegerDistribution): Boolean = { 62 | td.nclusters should be <=(td.maxDiscrete) 63 | val tdSamples = Array.fill(10000) { td.samplePMF } 64 | val distSamples = Array.fill(10000) { dist.sample.toDouble } 65 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 66 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 67 | val pass = d <= maxD 68 | if (!pass) Console.err.println(s"testSamplingPMF failure: d= $d") 69 | pass 70 | } 71 | 72 | def testDistribution(dist: RealDistribution, stdv: Double): Boolean = { 73 | dist.reseedRandomGenerator(seed) 74 | 75 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 76 | 77 | testTDvsDist(td, dist, stdv) && testSamplingPDF(td, dist) 78 | } 79 | 80 | def testMonotoneCDF(dist: RealDistribution): Boolean = { 81 | dist.reseedRandomGenerator(seed) 82 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 83 | val (xmin, xmax) = (td.cent(0), td.cent(td.nclusters - 1)) 84 | val step = (xmax - xmin) / 100000 85 | val t = (xmin to xmax by step).iterator.map(x => td.cdf(x)).sliding(2).map(w => w(1) - w(0)).min 86 | val pass = t >= 0.0 87 | if (!pass) Console.err.println(s"testMonotoneCDF failure: t= $t") 88 | pass 89 | } 90 | 91 | def testMonotoneCDFI(dist: RealDistribution): Boolean = { 92 | dist.reseedRandomGenerator(seed) 93 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 94 | val (xmin, xmax) = (0.0, 1.0) 95 | val step = (xmax - xmin) / 100000 96 | val t = (xmin to xmax by step).iterator.map(q => td.cdfInverse(q)).sliding(2).map(w => w(1) - w(0)).min 97 | val pass = t >= 0.0 98 | if (!pass) Console.err.println(s"testMonotoneCDFI failure: t= $t") 99 | pass 100 | } 101 | 102 | def testMonotone(dist: RealDistribution): Boolean = { 103 | testMonotoneCDF(dist) && testMonotoneCDFI(dist) 104 | } 105 | 106 | it should "sketch a uniform distribution" in { 107 | import
org.apache.commons.math3.distribution.UniformRealDistribution 108 | val dist = new UniformRealDistribution() 109 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 110 | } 111 | 112 | it should "sketch a normal distribution" in { 113 | import org.apache.commons.math3.distribution.NormalDistribution 114 | val dist = new NormalDistribution() 115 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 116 | } 117 | 118 | it should "sketch an exponential distribution" in { 119 | import org.apache.commons.math3.distribution.ExponentialDistribution 120 | val dist = new ExponentialDistribution(1.0) 121 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 122 | } 123 | 124 | it should "aggregate with another t-digest using merge method" in { 125 | import org.apache.commons.math3.distribution.NormalDistribution 126 | val dist = new NormalDistribution() 127 | dist.reseedRandomGenerator(seed) 128 | 129 | val td1 = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 130 | val td2 = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 131 | 132 | testTDvsDist(TDigest.merge(td1, td2), dist, math.sqrt(dist.getNumericalVariance())) should be (true) 133 | } 134 | 135 | it should "respect monotonic cdf and inverse" in { 136 | import org.apache.commons.math3.distribution.ExponentialDistribution 137 | import org.apache.commons.math3.distribution.NormalDistribution 138 | import org.apache.commons.math3.distribution.UniformRealDistribution 139 | 140 | testMonotone(new UniformRealDistribution()) should be (true) 141 | testMonotone(new ExponentialDistribution(1.0)) should be (true) 142 | testMonotone(new NormalDistribution(0.0, 0.1)) should be (true) 143 | } 144 | 145 | it should "respect maxDiscrete parameter" in { 146 | import org.apache.commons.math3.distribution.GeometricDistribution 147 | val gd = new GeometricDistribution(0.33) 148 | val data = gd.sample(1000000).map(_.toDouble) 149 | val dataUniq = data.distinct.sorted 150 | val kt = dataUniq.map(_.toDouble).toSet 151 | val td = TDigest.sketch(data, delta, 50) 152 | val clust = td.cent 153 | clust.toSet should be (kt) 154 | val D = clust.map { x => td.cdfDiscrete(x) } 155 | .zip(dataUniq.map { k => gd.cumulativeProbability(k.toInt) }) 156 | .map { case (p1, p2) => math.abs(p1 - p2) } 157 | .max 158 | (D <= 0.01) should be (true) 159 | testSamplingPMF(td, gd) should be (true) 160 | } 161 | 162 | it should "respect maxDiscrete parameter over merge" in { 163 | import org.apache.commons.math3.distribution.GeometricDistribution 164 | val gd = new GeometricDistribution(0.33) 165 | val tdvec = Vector.fill(10) { TDigest.sketch(gd.sample(100000).map(_.toDouble), delta, 50) } 166 | val td = tdvec.reduce((a, b) => TDigest.merge(a, b)) 167 | val clust = td.cent 168 | clust.map(_.toInt).map(_.toDouble).toVector should beEqSeq(clust.toVector) 169 | val D = clust.map { x => td.cdfDiscrete(x) } 170 | .zip(clust.map(_.toInt).map { k => gd.cumulativeProbability(k) }) 171 | .map { case (p1, p2) => math.abs(p1 - p2) } 172 | .max 173 | (D <= 0.01) should be (true) 174 | testSamplingPMF(td, gd) should be (true) 175 | } 176 | 177 | it should "support copy constructor" in { 178 | import org.apache.commons.math3.distribution.NormalDistribution 179 | 180 | val dist = new NormalDistribution() 181 | dist.reseedRandomGenerator(seed) 182 | val data = Array.fill(ss) { dist.sample } 183 | val td1 = TDigest.sketch(data, delta) 184 | val td2 = new TDigest(td1) 185 | (td2.equals(td1)) should be (true) 
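// equals is expected to be symmetric, so check both directions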
186 | (td1.equals(td2)) should be (true) 187 | 188 | // add more data and re-check equality to ensure 189 | // that all state for future updates was correctly copied 190 | for { x <- data } { 191 | td1.update(x) 192 | td2.update(x) 193 | } 194 | (td2.equals(td1)) should be (true) 195 | (td1.equals(td2)) should be (true) 196 | } 197 | 198 | def testTDClose(td1: TDigest, td2: TDigest, eps: Double = 1e-6): Unit = { 199 | td1.getCompression() should be (td2.getCompression()) 200 | td1.getMaxDiscrete() should be (td2.getMaxDiscrete()) 201 | td1.size() should be (td2.size()) 202 | td1.mass() should be (td2.mass() +- eps) 203 | for { j <- 0 until td1.size() } { 204 | td1.getCentUnsafe()(j) should be (td2.getCentUnsafe()(j) +- eps) 205 | td1.getMassUnsafe()(j) should be (td2.getMassUnsafe()(j) +- eps) 206 | td1.getFTUnsafe()(1 + j) should be (td2.getFTUnsafe()(1 + j) +- eps) 207 | } 208 | } 209 | 210 | it should "support dser constructor" in { 211 | import java.util.Arrays; 212 | import org.apache.commons.math3.distribution.NormalDistribution 213 | 214 | val eps = 1e-9 215 | 216 | val dist = new NormalDistribution() 217 | dist.reseedRandomGenerator(seed) 218 | val data = Array.fill(ss) { dist.sample } 219 | 220 | // test constructing empty t-digests 221 | val td1 = new TDigest(0.5, 0, Array.empty[Double], Array.empty[Double]) 222 | val td2 = new TDigest( 223 | td1.getCompression(), 224 | td1.getMaxDiscrete(), 225 | Arrays.copyOf(td1.getCentUnsafe(), td1.size()), 226 | Arrays.copyOf(td1.getMassUnsafe(), td1.size()) 227 | ) 228 | testTDClose(td1, td2, eps) 229 | 230 | // test sketching from empty state 231 | for { x <- data } { 232 | td1.update(x) 233 | td2.update(x) 234 | } 235 | testTDClose(td1, td2, eps) 236 | 237 | // copy from non-empty state 238 | val td3 = new TDigest( 239 | td1.getCompression(), 240 | td1.getMaxDiscrete(), 241 | Arrays.copyOf(td1.getCentUnsafe(), td1.size()), 242 | Arrays.copyOf(td1.getMassUnsafe(), td1.size()) 243 | ) 244 | testTDClose(td1, td3, eps) 245 | 246 | // test from non-empty state 247 | for { x <- data } { 248 | td1.update(x) 249 | td3.update(x) 250 | } 251 | testTDClose(td1, td3, eps) 252 | } 253 | 254 | it should "serialize and deserialize" in { 255 | import org.apache.commons.math3.distribution.NormalDistribution 256 | 257 | import org.isarnproject.scalatest.serde.roundTripSerDe 258 | 259 | val dist = new NormalDistribution() 260 | dist.reseedRandomGenerator(seed) 261 | 262 | val tdo = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 263 | 264 | val tdi = roundTripSerDe(tdo) 265 | 266 | (tdi.equals(tdo)) should be (true) 267 | 268 | testTDvsDist(tdi, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 269 | } 270 | } 271 | --------------------------------------------------------------------------------