├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── isarn-sketches-java └── src │ └── main │ └── java │ └── org │ └── isarnproject │ └── sketches │ └── java │ └── TDigest.java ├── project ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── org │ └── isarnproject │ └── sketches │ ├── TDigest.scala │ └── tdmap │ └── TDigestMap.scala ├── site └── index.html └── test └── scala └── org └── isarnproject └── sketches ├── TDigestTest.scala └── java └── JavaTDigestTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # isarn-sketches 2 | Sketching data structures 3 | 4 | ### API documentation 5 | - https://isarn.github.io/isarn-sketches/scala/api/ 6 | - https://isarn.github.io/isarn-sketches/java/api/ 7 | 8 | ### Compatibility 9 | isarn-sketches can operate with [Algebird](https://twitter.github.io/algebird/) via the 10 | [isarn-sketches-algebird-api](https://github.com/isarn/isarn-sketches-algebird-api) 11 | 12 | isarn-sketches can also operate with [Apache Spark](https://github.com/apache/spark) via the [isarn-sketches-spark](https://github.com/isarn/isarn-sketches-spark) library 13 | 14 | ### How to use in your project 15 | 16 | ``` scala 17 | // isarn-sketches 18 | libraryDependencies += "org.isarnproject" %% "isarn-sketches" % "0.3.0" 19 | 20 | // isarn-sketches-java 21 | libraryDependencies += "org.isarnproject" % "isarn-sketches-java" % "0.3.0" 22 | ``` 23 | 24 | ### t-digest 25 | ``` scala 26 | scala> import org.isarnproject.sketches.TDigest 27 | import org.isarnproject.sketches.TDigest 28 | 29 | scala> val data = Vector.fill(10000) { scala.util.Random.nextGaussian() } 30 | data: scala.collection.immutable.Vector[Double] = Vector(1.6046163970051968, 0.44151418924289004, ... 31 | 32 | scala> val sketch = TDigest.sketch(data) 33 | sketch: org.isarnproject.sketches.TDigest = TDigest(0.5,0,74,TDigestMap(-3.819069044174932 -> (1.0, 1.0), ... 34 | 35 | scala> sketch.cdf(0) 36 | res0: Double = 0.4984362744530557 37 | 38 | scala> sketch.cdfInverse(0.5) 39 | res1: Double = 0.0038481195948969205 40 | ``` 41 | 42 | #### t-digest resources 43 | * Original paper: [Computing Extremely Accurate Quantiles Using t-Digests](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 44 | * Video Talk: [Sketching Data with T-Digest In Apache Spark](https://youtu.be/ETUYhEZRtWE) 45 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | // sbt clean unidoc previewSite 18 | // sbt clean unidoc ghpagesPushSite 19 | // sbt +isarn_sketches/publish 20 | // publish isarn-sketches-java for exactly one scala version: 21 | // sbt isarn_sketches_java/publish 22 | 23 | scalaVersion := "2.12.8" 24 | 25 | crossScalaVersions := Seq("2.11.12", "2.12.8") 26 | 27 | // these do not "inherit" when defined at top level, so 28 | // define them here for inclusion in each subproject. 
29 | // This also worked: 'xxx in ThisProject := yyy', but you have to do it 30 | // for each setting below, so this seemed a bit cleaner 31 | def publishSettings = Seq( 32 | version := "0.3.1-SNAPSHOT", 33 | //isSnapshot := true, 34 | //publishConfiguration := publishConfiguration.value.withOverwrite(true), 35 | publishLocalConfiguration := publishLocalConfiguration.value.withOverwrite(true), 36 | organization := "org.isarnproject", 37 | pomIncludeRepository := { _ => false }, 38 | publishMavenStyle := true, 39 | publishTo := { 40 | val nexus = "https://oss.sonatype.org/" 41 | if (isSnapshot.value) 42 | Some("snapshots" at nexus + "content/repositories/snapshots") 43 | else 44 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 45 | }, 46 | licenses += ("Apache-2.0", url("http://opensource.org/licenses/Apache-2.0")), 47 | homepage := Some(url("https://github.com/isarn/isarn-sketches")), 48 | scmInfo := Some( 49 | ScmInfo( 50 | url("https://github.com/isarn/isarn-sketches"), 51 | "scm:git@github.com:isarn/isarn-sketches.git" 52 | ) 53 | ), 54 | developers := List( 55 | Developer( 56 | id = "erikerlandson", 57 | name = "Erik Erlandson", 58 | email = "eje@redhat.com", 59 | url = url("https://erikerlandson.github.io/") 60 | ) 61 | ) 62 | ) 63 | 64 | compileOrder := CompileOrder.JavaThenScala 65 | 66 | javacOptions ++= Seq() 67 | 68 | scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature") 69 | 70 | scalacOptions in (Compile, doc) ++= Seq("-doc-root-content", baseDirectory.value+"/root-doc.txt") 71 | 72 | enablePlugins(ScalaUnidocPlugin, JavaUnidocPlugin, GhpagesPlugin) 73 | 74 | git.remoteRepo := "git@github.com:isarn/isarn-sketches.git" 75 | 76 | siteSubdirName in ScalaUnidoc := "scala/api" 77 | 78 | siteSubdirName in JavaUnidoc := "java/api" 79 | 80 | addMappingsToSiteDir(mappings in (ScalaUnidoc, packageDoc), siteSubdirName in ScalaUnidoc) 81 | 82 | addMappingsToSiteDir(mappings in (JavaUnidoc, packageDoc), siteSubdirName in JavaUnidoc) 83 | 84 | // tell unidoc to not do scala-doc for the isarn-sketches-java (javadoc will still get created) 85 | unidocProjectFilter in (ScalaUnidoc, unidoc) := inAnyProject -- inProjects(isarn_sketches_java) 86 | 87 | // this target needs to execute only once, at the top level 88 | // turn it off for any sub-projects 89 | def siteSubProjectSettings = Seq( 90 | previewSite := {} 91 | ) 92 | 93 | // browser insisted on caching some older generated site at the default (4000) 94 | previewFixedPort := Some(4444) 95 | 96 | lazy val isarn_sketches_java = (project in file("isarn-sketches-java")) 97 | .settings(name := "isarn-sketches-java") 98 | .enablePlugins(GenJavadocPlugin, PublishJavadocPlugin) 99 | .settings(siteSubProjectSettings :_*) 100 | .settings( 101 | crossPaths := false, // drop off Scala suffix from artifact names 102 | autoScalaLibrary := false // exclude scala-library from dependencies 103 | ) 104 | .settings(publishSettings :_*) 105 | 106 | lazy val isarn_sketches = (project in file(".")) 107 | .aggregate(isarn_sketches_java) 108 | .dependsOn(isarn_sketches_java) 109 | .settings(name := "isarn-sketches") 110 | .settings( 111 | // isarn_sketches_java needs to be published separately to work with 'crossPaths := false' 112 | aggregate in publish := false, 113 | libraryDependencies ++= Seq( 114 | "org.isarnproject" %% "isarn-algebra-api" % "0.0.3", 115 | "org.isarnproject" %% "isarn-collections" % "0.0.4", 116 | "org.isarnproject" %% "isarn-scalatest" % "0.0.3" % Test, 117 | "org.scalatest" %% "scalatest" % "3.0.5" % Test, 118 | 
"org.apache.commons" % "commons-math3" % "3.6.1" % Test) 119 | ) 120 | .settings(publishSettings :_*) 121 | -------------------------------------------------------------------------------- /isarn-sketches-java/src/main/java/org/isarnproject/sketches/java/TDigest.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.isarnproject.sketches.java; 18 | 19 | import java.lang.System; 20 | import java.lang.StringBuilder; 21 | import java.util.Arrays; 22 | import java.util.Comparator; 23 | import java.io.Serializable; 24 | import java.util.concurrent.ThreadLocalRandom; 25 | import java.util.Random; 26 | 27 | /** 28 | * A t-digest sketch of sampled numeric data 29 | *
<pre> 30 |  * Computing Extremely Accurate Quantiles Using t-Digests,
 31 |  * Ted Dunning and Otmar Ertl,
 32 |  * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf
 33 |  * </pre>
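 * A t-digest maintains a sparse set of weighted cluster centers; the mass
 * permitted in a cluster at quantile q is bounded by C * M * q * (1 - q),
 * which shrinks toward the tails, so resolution is highest near q = 0 and q = 1.
 * Example: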
 34 |  * 35 |  * <pre>
 36 |  * import org.isarnproject.sketches.java.TDigest;
 37 |  * double[] data = ...; // data that you would like to sketch
 38 |  * TDigest sketch = TDigest.sketch(data);
 39 |  * // the cumulative distribution function of the sketch; cdf(x) at x = 0
 40 |  * double cdf = sketch.cdf(0.0);
 41 |  * // inverse of the CDF, evaluated at q = 0.5
 42 |  * double cdfi = sketch.cdfInverse(0.5);
 43 |  * </pre>
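 *
 * A digest can also be built incrementally and merged with other digests;
 * a minimal sketch of that usage (reusing the hypothetical data array above):
 * <pre>
 * TDigest td = TDigest.empty();        // empty digest, default compression
 * for (double x: data) td.update(x);   // add one sample at a time
 * td.merge(TDigest.sketch(data));      // fold another sketch into td
 * double median = td.cdfInverse(0.5);  // estimated median of everything seen
 * </pre>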
 44 | */ 45 | public class TDigest implements Serializable { 46 | /** compression setting (delta in original paper) */ 47 | protected final double C; 48 | /** maximum number of unique discrete values to track */ 49 | protected final int maxDiscrete; 50 | /** current number of clusters */ 51 | protected int nclusters = 0; 52 | /** total mass of data sampled so far */ 53 | protected double M = 0.0; 54 | /** cluster centers */ 55 | protected double[] cent = null; 56 | /** cluster masses */ 57 | protected double[] mass = null; 58 | /** cumulative cluster masses, represented as a Fenwick Tree */ 59 | protected double[] ftre = null; 60 | 61 | /** A new t-digest sketching structure with default compression and maximum discrete tracking. */ 62 | public TDigest() { 63 | this(COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 64 | } 65 | 66 | /** Construct a t-digest with the given compression. 67 | * Maximum discrete tracking defaults to zero. 68 | * @param compression sketching compression setting. Higher = more compression. 69 | * Must be > 0. 70 | */ 71 | public TDigest(double compression) { 72 | this(compression, 0, INIT_SIZE_DEFAULT); 73 | } 74 | 75 | /** Construct a t-digest with the given compression and maximum discrete tracking. 76 | * @param compression sketching compression setting. Higher = more compression. 77 | * Must be > 0. 78 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 79 | * If this number of values is exceeded, the sketch will begin to operate in 80 | * normal continuous mode. 81 | */ 82 | public TDigest(double compression, int maxDiscrete) { 83 | this(compression, maxDiscrete, INIT_SIZE_DEFAULT); 84 | } 85 | 86 | /** Construct a t-digest with the given compression and maximum discrete tracking. 87 | * @param compression sketching compression setting. Higher = more compression. 88 | * Must be > 0. 89 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 90 | * If this number of values is exceeded, the sketch will begin to operate in 91 | * normal continuous mode. 92 | * @param sz initial capacity to use for internal arrays. Must be > 0. 93 | */ 94 | public TDigest(double compression, int maxDiscrete, int sz) { 95 | assert compression > 0.0; 96 | assert maxDiscrete >= 0; 97 | assert sz > 0; 98 | C = compression; 99 | this.maxDiscrete = maxDiscrete; 100 | cent = new double[sz]; 101 | mass = new double[sz]; 102 | ftre = new double[1 + sz]; 103 | // ftre is 1-based. set ftre[0] to zero just to be tidy 104 | ftre[0] = 0.0; 105 | } 106 | 107 | /** 108 | * Construct a t-digest from a list of cluster centers and masses. 109 | * Object deserialization is one of the intended use cases for this constructor. 110 | * NOTE: This constructor assumes the 'cent' and 'mass' arrays will be owned 111 | * by the new t-digest object. If 'cent' and 'mass' are both null then an empty t-digest 112 | * will be created. 113 | * @param compression sketching compression setting. Higher = more compression. 114 | * Must be > 0. 115 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 116 | * If this number of values is exceeded, the sketch will begin to operate in normal continuous mode. 117 | * @param cent the list of cluster centers. Assumed to be in sorted order. 118 | * This array is assumed to be owned by the t-digest object after construction. 119 | * @param mass a list of cluster masses. Assumed to be parallel to centers. 120 | * This array is assumed to be owned by the t-digest object after construction.
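 *
 * A minimal reconstruction sketch, using hypothetical center and mass arrays
 * (e.g. as recovered by a deserializer):
 * <pre>
 * double[] centers = {0.0, 1.0, 2.0};  // sorted cluster centers
 * double[] masses = {1.0, 3.0, 1.0};   // parallel cluster masses
 * TDigest td = new TDigest(0.5, 0, centers, masses);
 * </pre>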
 121 | */ 122 | public TDigest(double compression, int maxDiscrete, double cent[], double mass[]) { 123 | assert compression > 0.0; 124 | assert maxDiscrete >= 0; 125 | this.C = compression; 126 | this.maxDiscrete = maxDiscrete; 127 | assert (cent != null && mass != null) || (cent == null && mass == null); 128 | this.nclusters = (cent != null) ? cent.length : 0; 129 | int sz = nclusters; 130 | if (sz == 0) { 131 | // cent, mass and ftre cannot be zero length 132 | sz = INIT_SIZE_DEFAULT; 133 | this.cent = new double[sz]; 134 | this.mass = new double[sz]; 135 | } else { 136 | this.cent = cent; 137 | this.mass = mass; 138 | } 139 | assert this.cent != null && this.mass != null; 140 | assert this.cent.length == sz; 141 | assert this.cent.length == this.mass.length; 142 | assert this.cent.length > 0; 143 | this.ftre = new double[1 + sz]; 144 | Arrays.fill(ftre, 0, 1 + nclusters, 0.0); 145 | this.M = 0.0; 146 | for (int j = 0; j < nclusters; ++j) { 147 | M += mass[j]; 148 | ftInc(j, mass[j]); 149 | } 150 | } 151 | 152 | /** Construct a deep copy of another t-digest */ 153 | public TDigest(TDigest that) { 154 | C = that.C; 155 | maxDiscrete = that.maxDiscrete; 156 | nclusters = that.nclusters; 157 | M = that.M; 158 | cent = Arrays.copyOf(that.cent, nclusters); 159 | mass = Arrays.copyOf(that.mass, nclusters); 160 | ftre = Arrays.copyOf(that.ftre, 1 + nclusters); 161 | } 162 | 163 | /** Update the sketch with a new sampled value 164 | * @param x the new sampled value 165 | */ 166 | public final void update(double x) { 167 | update(x, 1.0); 168 | } 169 | 170 | /** Update the sketch with a new sampled value 171 | * @param x the new sampled value 172 | * @param w the weight (aka mass) associated with x 173 | */ 174 | public final void update(double x, double w) { 175 | updateLogic(x, w); 176 | if ((nclusters > maxDiscrete) && (nclusters > R())) recluster(); 177 | } 178 | 179 | private final void updateLogic(double x, double w) { 180 | if (nclusters == 0) { 181 | // clusters are empty, so (x,w) becomes the first cluster 182 | cent[0] = x; 183 | M = w; 184 | mass[0] = w; 185 | ftre[1] = w; 186 | nclusters += 1; 187 | return; 188 | } 189 | if (nclusters <= maxDiscrete) { 190 | // we are under the limit for discrete values to track 191 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 192 | if (j >= 0) { 193 | // landed on existing cluster: add its mass and we're done 194 | M += w; 195 | mass[j] += w; 196 | ftInc(j, w); 197 | } else { 198 | // a new x value: insert as a new discrete cluster 199 | newCluster(-(j + 1), x, w); 200 | } 201 | return; 202 | } 203 | // get the index of the cluster closest to x 204 | int j = closest(x); 205 | if (x == cent[j]) { 206 | // landed on existing cluster: add its mass and we're done 207 | M += w; 208 | mass[j] += w; 209 | ftInc(j, w); 210 | return; 211 | } 212 | double m = mass[j]; 213 | // q is the quantile of the closest cluster to x 214 | // (ftSum does the right thing (return 0) for j = 0) 215 | double q = (ftSum(j - 1) + (m / 2.0)) / M; 216 | // this is the upper-bound for the mass of closest cluster 217 | double ub = C * M * q * (1.0 - q); 218 | // dm is how much mass we're allowed to add to closest cluster 219 | double dm = Math.min(w, Math.max(0.0, ub - m)); 220 | // rm is the remainder of the mass 221 | double rm = w - dm; 222 | if (dm > 0.0) { 223 | // Add any allowable mass to closest cluster and update its center.
224 | // It is safe to update center this way because it will remain 225 | // between x and original center, and so cannot move out of its original 226 | // ordering relative to its neighbors, because x is by previous logic 227 | // closer to cent[j] than any other cluster. 228 | double dc = dm * (x - cent[j]) / (m + dm); 229 | cent[j] += dc; 230 | M += dm; 231 | mass[j] += dm; 232 | ftInc(j, dm); 233 | } 234 | // if there is remaining mass, it becomes a new cluster 235 | if (rm > 0.0) newCluster((x < cent[j]) ? j : j + 1, x, rm); 236 | } 237 | 238 | /** Merge another t-digest into this one. 239 | * @param that the t-digest to merge. This t-digest is unaltered. 240 | */ 241 | public final void merge(TDigest that) { 242 | Integer[] indexes = new Integer[that.nclusters]; 243 | for (int j = 0; j < that.nclusters; ++j) indexes[j] = j; 244 | // sort so that largest clusters are first. 245 | // inserting large to small yields stable distribution estimations 246 | Comparator cmp = new Comparator() { 247 | @Override 248 | public int compare(Integer a, Integer b) { 249 | return (int)Math.signum(that.mass[b] - that.mass[a]); 250 | } 251 | }; 252 | Arrays.sort(indexes, cmp); 253 | for (int j: indexes) update(that.cent[j], that.mass[j]); 254 | } 255 | 256 | /** Re-cluster this t-digest by reinserting its clusters in randomized order. */ 257 | public final void recluster() { 258 | // I suspect it may be possible to improve on this fully-randomized algorithm, 259 | // by leveraging the largest-first heuristic I use in cluster merging. See: 260 | // http://erikerlandson.github.io/blog/2016/12/19/converging-monoid-addition-for-t-digest/ 261 | int[] indexes = new int[nclusters]; 262 | for (int j = 0; j < nclusters; ++j) indexes[j] = j; 263 | intShuffle(indexes); 264 | int sz = cent.length; 265 | double[] oldCent = cent; 266 | double[] oldMass = mass; 267 | cent = new double[sz]; 268 | mass = new double[sz]; 269 | reset(); 270 | for (int j: indexes) updateLogic(oldCent[j], oldMass[j]); 271 | } 272 | 273 | /** Reset this t-digest to an empty state */ 274 | public final void reset() { 275 | nclusters = 0; 276 | M = 0.0; 277 | } 278 | 279 | private final void newCluster(int j, double x, double w) { 280 | double[] newCent = cent; 281 | double[] newMass = mass; 282 | double[] newFtre = ftre; 283 | int sz = cent.length; 284 | if (nclusters >= sz) { 285 | int szinc = (int)Math.ceil(0.1 * (double)sz); 286 | sz += szinc; 287 | newCent = new double[sz]; 288 | newMass = new double[sz]; 289 | newFtre = new double[1 + sz]; 290 | System.arraycopy(cent, 0, newCent, 0, j); 291 | System.arraycopy(mass, 0, newMass, 0, j); 292 | } 293 | // arraycopy can handle when cent == newCent 294 | System.arraycopy(cent, j, newCent, 1 + j, nclusters - j); 295 | System.arraycopy(mass, j, newMass, 1 + j, nclusters - j); 296 | // do this after copies above 297 | newCent[j] = x; 298 | newMass[j] = w; 299 | nclusters += 1; 300 | cent = newCent; 301 | mass = newMass; 302 | ftre = newFtre; 303 | Arrays.fill(ftre, 0, 1 + nclusters, 0.0); 304 | for (int k = 0; k < nclusters; ++k) ftInc(k, mass[k]); 305 | M += w; 306 | } 307 | 308 | private final int closest(double x) { 309 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 310 | // exact match, return its index: 311 | if (j >= 0) return j; 312 | // x is not a cluster center, get its insertion index: 313 | j = -(j + 1); 314 | // x is to left of left-most cluster: 315 | if (j == 0) return j; 316 | // x is to right of right-most cluster: 317 | if (j == nclusters) return j - 1; 318 | // x is 
between two clusters, return index of closest: 319 | double dL = x - cent[j - 1]; 320 | double dR = cent[j] - x; 321 | return (dL < dR) ? (j - 1) : j; 322 | } 323 | 324 | /** Obtain the number of clusters in this t-digest 325 | * @return the number of clusters in this t-digest 326 | */ 327 | public final int size() { 328 | return nclusters; 329 | } 330 | 331 | /** Obtain the total mass sampled by this t-digest 332 | * @return the total mass 333 | */ 334 | public final double mass() { 335 | return M; 336 | } 337 | 338 | /** Obtain the compression setting for this t-digest 339 | * @return the compression setting 340 | */ 341 | public final double getCompression() { 342 | return C; 343 | } 344 | 345 | /** Obtain the maximum discrete setting for this t-digest 346 | * @return the maximum discrete setting 347 | */ 348 | public final int getMaxDiscrete() { 349 | return maxDiscrete; 350 | } 351 | 352 | /** Obtain a reference to this t-digest's cluster center array. 353 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 354 | * @return a reference to the cluster center array 355 | */ 356 | public final double[] getCentUnsafe() { 357 | return cent; 358 | } 359 | 360 | /** Obtain a reference to this t-digest's cluster mass array. 361 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 362 | * @return a reference to the cluster mass array 363 | */ 364 | public final double[] getMassUnsafe() { 365 | return mass; 366 | } 367 | 368 | /** Obtain a reference to this t-digest's cumulative mass array. 369 | * This array stores the cumulative masses of clusters in Fenwick Tree format. 370 | * NOTE: this array is not safe to modify, and should be used only in "read-only" mode! 371 | * @return a reference to the cumulative mass array 372 | */ 373 | public final double[] getFTUnsafe() { 374 | return ftre; 375 | } 376 | 377 | /** Returns true if this t-digest is empty, false otherwise. */ 378 | public final boolean isEmpty() { 379 | return nclusters == 0; 380 | } 381 | 382 | @Override 383 | public String toString() { 384 | StringBuilder sb = new StringBuilder("TDigest("); 385 | for (int j = 0; j < nclusters; ++j) { 386 | if (j > 25) { 387 | sb.append(" ..."); 388 | break; 389 | } 390 | if (j > 0) sb.append(", "); 391 | sb.append(cent[j]) 392 | .append(" -> (") 393 | .append(mass[j]) 394 | .append(", ") 395 | .append(ftSum(j)) 396 | .append(")"); 397 | } 398 | sb.append(")"); 399 | return sb.toString(); 400 | } 401 | 402 | /** 403 | * Perform a random sampling from the distribution as sketched by this t-digest, in 404 | * "probability density" mode. 405 | * @return A random number sampled from the sketched distribution 406 | */ 407 | public final double samplePDF() { 408 | return samplePDF(ThreadLocalRandom.current()); 409 | } 410 | 411 | /** 412 | * Perform a random sampling from the distribution as sketched by this t-digest, in 413 | * "probability density" mode. 414 | * @param prng a (pseudo) random number generator to use for the random sampling 415 | * @return A random number sampled from the sketched distribution 416 | */ 417 | public final double samplePDF(Random prng) { 418 | return cdfInverse(prng.nextDouble()); 419 | } 420 | 421 | /** 422 | * Perform a random sampling from the distribution as sketched by this t-digest, in 423 | * "probability mass" (i.e. discrete) mode. 
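 * Note that in this mode every sampled value is one of the stored cluster
 * centers, since cdfDiscreteInverse returns cluster centers directly.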
424 | * @return A random number sampled from the sketched distribution 425 | */ 426 | public final double samplePMF() { 427 | return samplePMF(ThreadLocalRandom.current()); 428 | } 429 | 430 | /** 431 | * Perform a random sampling from the distribution as sketched by this t-digest, in 432 | * "probability mass" (i.e. discrete) mode. 433 | * @param prng a (pseudo) random number generator to use for the random sampling 434 | * @return A random number sampled from the sketched distribution 435 | */ 436 | public final double samplePMF(Random prng) { 437 | return cdfDiscreteInverse(prng.nextDouble()); 438 | } 439 | 440 | /** 441 | * Perform a random sampling from the distribution as sketched by this t-digest, 442 | * using "discrete" (PMF) mode if the number of clusters ≤ maxDiscrete setting, 443 | * and "density" (PDF) mode otherwise. 444 | * @return A random number sampled from the sketched distribution 445 | */ 446 | public final double sample() { 447 | return sample(ThreadLocalRandom.current()); 448 | } 449 | 450 | /** 451 | * Perform a random sampling from the distribution as sketched by this t-digest, 452 | * using "discrete" (PMF) mode if the number of clusters ≤ maxDiscrete setting, 453 | * and "density" (PDF) mode otherwise. 454 | * @param prng a (pseudo) random number generator to use for the random sampling 455 | * @return A random number sampled from the sketched distribution 456 | */ 457 | public final double sample(Random prng) { 458 | if (nclusters <= maxDiscrete) { 459 | return cdfDiscreteInverse(prng.nextDouble()); 460 | } else { 461 | return cdfInverse(prng.nextDouble()); 462 | } 463 | } 464 | 465 | /** 466 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 467 | * distribution represented by this t-digest sketch. 468 | * @param x a numeric value 469 | * @return the cumulative probability that a random sample from the distribution is ≤ x 470 | */ 471 | public final double cdf(double x) { 472 | int j1 = rcovj(x); 473 | if (j1 < 0) return 0.0; 474 | if (j1 >= nclusters - 1) return 1.0; 475 | int j2 = j1 + 1; 476 | double c1 = cent[j1]; 477 | double c2 = cent[j2]; 478 | double tm1 = mass[j1]; 479 | double tm2 = mass[j2]; 480 | double s = ftSum(j1 - 1); 481 | double d1 = (j1 == 0) ? 0.0 : tm1 / 2.0; 482 | double m1 = s + d1; 483 | double m2 = m1 + (tm1 - d1) + ((j2 == nclusters - 1) ? tm2 : tm2 / 2.0); 484 | double m = m1 + (x - c1) * (m2 - m1) / (c2 - c1); 485 | return Math.min(m2, Math.max(m1, m)) / M; 486 | } 487 | 488 | /** 489 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 490 | * distribution represented by this t-digest sketch, assuming sketch is "discrete" 491 | * (e.g. if number of clusters ≤ maxDiscrete setting) 492 | * @param x a numeric value 493 | * @return the cumulative probability that a random sample from the distribution is ≤ x 494 | */ 495 | public final double cdfDiscrete(double x) { 496 | int j = rcovj(x); 497 | return ftSum(j) / M; 498 | } 499 | 500 | /** 501 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 502 | * estimated probability distribution represented by this t-digest sketch. 503 | * @param q a quantile value. 
The value of q is expected to be on interval [0, 1] 504 | * @return the value x such that cdf(x) = q 505 | */ 506 | public final double cdfInverse(double q) { 507 | if (q < 0.0 || q > 1.0) return Double.NaN; 508 | if (nclusters == 0) return Double.NaN; 509 | if (nclusters == 1) return cent[0]; 510 | double m = q * M; 511 | int j1 = rmcovj(m); 512 | int j2 = j1 + 1; 513 | double c1 = cent[j1]; 514 | double c2 = cent[j2]; 515 | double tm1 = mass[j1]; 516 | double tm2 = mass[j2]; 517 | double s = ftSum(j1 - 1); 518 | double d1 = (j1 == 0) ? 0.0 : tm1 / 2.0; 519 | double m1 = s + d1; 520 | double m2 = m1 + (tm1 - d1) + ((j2 == nclusters - 1) ? tm2 : tm2 / 2.0); 521 | double x = c1 + (m - m1) * (c2 - c1) / (m2 - m1); 522 | return Math.min(c2, Math.max(c1, x)); 523 | } 524 | 525 | /** 526 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 527 | * estimated probability distribution represented by this t-digest sketch, 528 | * assuming the sketch is "discrete" (e.g. if number of clusters ≤ maxDiscrete setting) 529 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 530 | * @return the smallest value x such that q ≤ cdf(x) 531 | */ 532 | public final double cdfDiscreteInverse(double q) { 533 | if (q < 0.0 || q > 1.0) return Double.NaN; 534 | if (nclusters == 0) return Double.NaN; 535 | if (nclusters == 1) return cent[0]; 536 | double m = q * M; 537 | int j = lmcovj(m); 538 | return cent[j]; 539 | } 540 | 541 | // returns the index of a right mass cover 542 | // ftSum(j) <= m < ftSum(j+1) 543 | private final int rmcovj(double m) { 544 | assert nclusters >= 2; 545 | assert (m >= 0.0) && (m <= M); 546 | int beg = 0; 547 | double mbeg = 0.0; 548 | int end = nclusters - 1; 549 | double mend = M; 550 | while ((end - beg) > 1) { 551 | int mid = (beg + end) / 2; 552 | double mmid = ftSum(mid); 553 | if (m >= mmid) { 554 | beg = mid; 555 | mbeg = mmid; 556 | } else { 557 | end = mid; 558 | mend = mmid; 559 | } 560 | } 561 | return beg; 562 | } 563 | 564 | // returns the index of a left mass cover 565 | // ftSum(j-1) < m <= ftSum(j) 566 | private final int lmcovj(double m) { 567 | assert nclusters >= 2; 568 | assert (m >= 0.0) && (m <= M); 569 | int beg = -1; 570 | double mbeg = 0.0; 571 | int end = nclusters - 1; 572 | double mend = M; 573 | while ((end - beg) > 1) { 574 | int mid = (beg + end) / 2; 575 | double mmid = ftSum(mid); 576 | if (m <= mmid) { 577 | end = mid; 578 | mend = mmid; 579 | } else { 580 | beg = mid; 581 | mbeg = mmid; 582 | } 583 | } 584 | return end; 585 | } 586 | 587 | // returns the left index of a right-cover 588 | private final int rcovj(double x) { 589 | int j = Arrays.binarySearch(cent, 0, nclusters, x); 590 | // exact match, return its index: 591 | if (j >= 0) return j; 592 | // x is not a cluster center, get its insertion index: 593 | j = -(j + 1); 594 | // x is to left of left-most cluster: 595 | if (j == 0) return -1; 596 | // return the index to the left of x: 597 | return j - 1; 598 | } 599 | 600 | // cumulative-sum algorithm for a Fenwick tree 601 | private final double ftSum(int j) { 602 | j += 1; 603 | double s = 0.0; 604 | while (j > 0) { 605 | s += ftre[j]; 606 | j -= j & (-j); // dec by least significant nonzero bit of j 607 | } 608 | return s; 609 | } 610 | 611 | // increment algorithm for a Fenwick tree 612 | private final void ftInc(int j, double w) { 613 | j += 1; 614 | while (j <= nclusters) { 615 | ftre[j] += w; 616 | j += j & (-j); // inc by least significant nonzero bit of j 617 | } 618 | 
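    // Worked example (hypothetical masses {2, 1, 4}, so ftre = {0, 2, 3, 4}):
    // ftSum(2) visits j = 3 (s += 4) then j = 2 (s += 3) and returns 7, the
    // total mass of clusters 0..2; ftInc updates the mirrored index chain, so
    // both operations run in O(log nclusters).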
} 619 | 620 | @Override 621 | public boolean equals(Object that) { 622 | if (!(that instanceof TDigest)) return false; 623 | if (this == that) return true; 624 | TDigest rhs = (TDigest)that; 625 | if (C != rhs.C) return false; 626 | if (maxDiscrete != rhs.maxDiscrete) return false; 627 | if (nclusters != rhs.nclusters) return false; 628 | if (M != rhs.M) return false; 629 | if (!equal(cent, rhs.cent, nclusters)) return false; 630 | if (!equal(mass, rhs.mass, nclusters)) return false; 631 | // if masses are equal, cumulative ftre had better also be equal 632 | return true; 633 | } 634 | 635 | // I can't believe java just added this to Arrays in java 9 636 | static final boolean equal(double[] lhs, double[] rhs, int n) { 637 | for (int j = 0; j < n; ++j) { 638 | if (lhs[j] != rhs[j]) return false; 639 | } 640 | return true; 641 | } 642 | 643 | @Override 644 | public int hashCode() { 645 | int h = nclusters; 646 | h ^= doubleHash(M); 647 | if (nclusters >= 1) { 648 | h ^= doubleHash(cent[0]); 649 | h ^= doubleHash(mass[0]); 650 | h ^= doubleHash(ftre[1]); 651 | } 652 | if (nclusters >= 2) { 653 | h ^= doubleHash(cent[nclusters - 1]); 654 | h ^= doubleHash(mass[nclusters - 1]); 655 | h ^= doubleHash(ftre[nclusters]); 656 | } 657 | if (nclusters >= 3) { 658 | int j = nclusters / 2; 659 | h ^= doubleHash(cent[j]); 660 | h ^= doubleHash(mass[j]); 661 | h ^= doubleHash(ftre[1 + j]); 662 | } 663 | return h; 664 | } 665 | 666 | // I can't believe Double doesn't provide a static method for this 667 | static final int doubleHash(double x) { 668 | long v = Double.doubleToLongBits(x); 669 | return (int)(v ^ (v >>> 32)); 670 | } 671 | 672 | protected final int R() { 673 | return (int)(K / C); 674 | } 675 | 676 | /** 677 | * The t-digest algorithm will re-cluster itself whenever its number of clusters exceeds 678 | * (K/delta). This value is set such that the threshold is about 10x the heuristically 679 | * expected number of clusters for the user-specified delta value. Generally the number of 680 | * clusters will only trigger the corresponding re-clustering threshold when data are being 681 | * presented in a non-random order. 682 | */ 683 | public static final double K = 10.0 * 50.0; 684 | 685 | /** 686 | * Default value for a t-digest compression (aka delta) parameter. 687 | * The number of clusters varies, roughly, as 688 | * about (50/delta), when data are presented in random order 689 | * (it may grow larger if data are not presented randomly). The default corresponds to 690 | * an expected number of clusters of about 100. 691 | */ 692 | public static final double COMPRESSION_DEFAULT = 50.0 / 100.0; 693 | 694 | /** Default for the initial cluster array capacity */ 695 | public static final int INIT_SIZE_DEFAULT = 5; 696 | 697 | /** Obtain an empty t-digest with default compression and maximum discrete tracking. 698 | * @return a new empty t-digest 699 | */ 700 | public static TDigest empty() { 701 | return new TDigest(COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 702 | } 703 | 704 | /** 705 | * Obtain an empty t-digest. 706 | * maxDiscrete defaults to zero. 707 | * @param compression sketching compression setting. Higher = more compression. 708 | * Must be > 0. 709 | * @return a new empty t-digest 710 | */ 711 | public static TDigest empty(double compression) { 712 | return new TDigest(compression, 0, INIT_SIZE_DEFAULT); 713 | } 714 | 715 | /** 716 | * Obtain an empty t-digest. 717 | * @param compression sketching compression setting. Higher = more compression. 718 | * Must be > 0. 
719 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 720 | * If this number of values is exceeded, the sketch will begin to operate in 721 | * normal continuous mode. 722 | * @return a new empty t-digest 723 | */ 724 | public static TDigest empty(double compression, int maxDiscrete) { 725 | return new TDigest(compression, maxDiscrete, INIT_SIZE_DEFAULT); 726 | } 727 | 728 | /** 729 | * Obtain an empty t-digest. 730 | * @param compression sketching compression setting. Higher = more compression. 731 | * Must be > 0. 732 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 733 | * If this number of values is exceeded, the sketch will begin to operate in 734 | * normal continuous mode. 735 | * @param sz initial capacity to use for internal arrays. Must be > 0. 736 | * @return a new empty t-digest 737 | */ 738 | public static TDigest empty(double compression, int maxDiscrete, int sz) { 739 | return new TDigest(compression, maxDiscrete, sz); 740 | } 741 | 742 | /** Merge the argument with smaller mass into the one with larger mass, and return 743 | * the larger as the result. 744 | * Note this means either (ltd) or (rtd) will be modified. 745 | * @param ltd a t-digest 746 | * @param rtd another t-digest 747 | * @return if ltd has larger mass, then returns
{@code ltd.merge(rtd)}, 748 |  * otherwise {@code rtd.merge(ltd)}
749 | */ 750 | public static TDigest merge(TDigest ltd, TDigest rtd) { 751 | if (ltd.size() < rtd.size()) return merge(rtd, ltd); 752 | if (rtd.size() == 0) return ltd; 753 | if (rtd.size() == 1) { 754 | ltd.update(rtd.cent[0], rtd.mass[0]); 755 | return ltd; 756 | } 757 | if (rtd.mass() < ltd.mass()) { 758 | ltd.merge(rtd); 759 | return ltd; 760 | } else { 761 | rtd.merge(ltd); 762 | return rtd; 763 | } 764 | } 765 | 766 | /** 767 | * Sketch data using a t-digest with default compression and maximum discrete tracking. 768 | * @param data the data to sketch 769 | * @return a t-digest sketch of the data 770 | */ 771 | public static TDigest sketch(double[] data) { 772 | return sketch(data, COMPRESSION_DEFAULT, 0, INIT_SIZE_DEFAULT); 773 | } 774 | 775 | /** 776 | * Sketch data using a t-digest. 777 | * maxDiscrete defaults to zero. 778 | * @param data the data to sketch 779 | * @param compression sketching compression setting. Higher = more compression. 780 | * Must be > 0. 781 | * @return a t-digest sketch of the data 782 | */ 783 | public static TDigest sketch(double[] data, double compression) { 784 | return sketch(data, compression, 0, INIT_SIZE_DEFAULT); 785 | } 786 | 787 | /** 788 | * Sketch data using a t-digest. 789 | * @param data the data to sketch 790 | * @param compression sketching compression setting. Higher = more compression. 791 | * Must be > 0. 792 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 793 | * If this number of values is exceeded, the sketch will begin to operate in 794 | * normal continuous mode. 795 | * @return a t-digest sketch of the data 796 | */ 797 | public static TDigest sketch(double[] data, double compression, int maxDiscrete) { 798 | return sketch(data, compression, maxDiscrete, INIT_SIZE_DEFAULT); 799 | } 800 | 801 | /** 802 | * Sketch data using a t-digest. 803 | * @param data the data to sketch 804 | * @param compression sketching compression setting. Higher = more compression. 805 | * Must be > 0. 806 | * @param maxDiscrete maximum number of unique discrete values to track. Must be ≥ 0. 807 | * If this number of values is exceeded, the sketch will begin to operate in 808 | * normal continuous mode. 809 | * @param sz initial capacity to use for internal arrays. Must be > 0. 
810 | * @return a t-digest sketch of the data 811 | */ 812 | public static TDigest sketch(double[] data, double compression, int maxDiscrete, int sz) { 813 | TDigest td = empty(compression, maxDiscrete, sz); 814 | for (double x: data) td.update(x, 1.0); 815 | if (td.size() > maxDiscrete) td.recluster(); 816 | return td; 817 | } 818 | 819 | static void intShuffle(int[] data) { 820 | intShuffle(data, 0, data.length); 821 | } 822 | 823 | static void intShuffle(int[] data, int end) { 824 | intShuffle(data, 0, end); 825 | } 826 | 827 | static void intShuffle(int[] data, int beg, int end) { 828 | ThreadLocalRandom rnd = ThreadLocalRandom.current(); 829 | end -= 1; 830 | while (end > beg) { 831 | int r = rnd.nextInt(beg, 1 + end); // bound must include end; an exclusive bound would bias the shuffle 832 | int d = data[end]; 833 | data[end] = data[r]; 834 | data[r] = d; 835 | end -= 1; 836 | } 837 | } 838 | } 839 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.12 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url( 2 | "bintray-sbt-plugin-releases", 3 | url("http://dl.bintray.com/content/sbt/sbt-plugin-releases"))( 4 | Resolver.ivyStylePatterns) 5 | 6 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 7 | 8 | resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven" 9 | 10 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") 11 | 12 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") 13 | 14 | addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1") 15 | 16 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2") 17 | 18 | // scoverage and coveralls deps are at old versions to avoid a bug in the current versions 19 | // update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73 20 | //addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.0.4") 21 | 22 | //addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.0.0") 23 | 24 | //addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.6.0") 25 | -------------------------------------------------------------------------------- /src/main/scala/org/isarnproject/sketches/TDigest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package org.isarnproject.sketches 18 | 19 | import scala.util.Random 20 | 21 | import tdmap.TDigestMap 22 | 23 | /** 24 | * A t-digest sketch of sampled numeric data, as described in: 25 | * Computing Extremely Accurate Quantiles Using t-Digests, 26 | * Ted Dunning and Otmar Ertl, 27 | * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf 28 | * 29 | * {{{ 30 | * import org.isarnproject.sketches.TDigest 31 | * val data = Vector.fill(10000) { scala.util.Random.nextGaussian() } 32 | * // sketch of some Gaussian data 33 | * val sketch = TDigest.sketch(data) 34 | * // the cumulative distribution function of the sketch; cdf(x) at x = 0 35 | * val cdf = sketch.cdf(0.0) 36 | * // inverse of the CDF, evaluated at q = 0.5 37 | * val cdfi = sketch.cdfInverse(0.5) 38 | * }}} 39 | */ 40 | case class TDigest( 41 | delta: Double, 42 | maxDiscrete: Int, 43 | nclusters: Int, 44 | clusters: TDigestMap) extends Serializable { 45 | 46 | // re-cluster when number of clusters exceeds this threshold 47 | @inline private def R = (TDigest.K / delta).toInt 48 | 49 | /** 50 | * Returns a new t-digest with value x included in its sketch; td + x is equivalent to 51 | * td + (x, 1). 52 | * @param x The numeric data value to include in the sketch 53 | * @return the updated sketch 54 | */ 55 | def +[N](x: N)(implicit num: Numeric[N]): TDigest = this.plus(num.toDouble(x), 1.0) 56 | 57 | /** 58 | * Returns a new t-digest with new pair (x, w) included in its sketch. 59 | * @param xw A pair (x, w) where x is the numeric value and w is its weight 60 | * @return the updated sketch 61 | * @note This implements 'algorithm 1' from: 62 | * Computing Extremely Accurate Quantiles Using t-Digests, 63 | * Ted Dunning and Otmar Ertl, 64 | * https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf 65 | */ 66 | def +[N1, N2](xw: (N1, N2))(implicit num1: Numeric[N1], num2: Numeric[N2]): TDigest = 67 | this.plus(num1.toDouble(xw._1), num2.toDouble(xw._2)) 68 | 69 | private def plus(x: Double, w: Double): TDigest = { 70 | if (nclusters <= maxDiscrete) { 71 | clusters.getNode(x).fold { 72 | TDigest(delta, maxDiscrete, nclusters + 1, clusters + (x -> w)) 73 | } { xnode => 74 | TDigest(delta, maxDiscrete, nclusters, clusters.update(x, x, xnode.data.value + w)) 75 | } 76 | } else { 77 | val s = this.update(x, w) 78 | if (s.nclusters <= R) s 79 | else { 80 | // too many clusters: attempt to compress it by re-clustering 81 | val ds = TDigest.shuffle(s.clusters.toVector) 82 | ds.foldLeft(TDigest.empty(delta, maxDiscrete)) { case (d, (x, w)) => d.update(x, w) } 83 | } 84 | } 85 | } 86 | 87 | /** 88 | * Add this digest to another 89 | * @param that The right-hand t-digest operand 90 | * @return the result of combining left and right digests 91 | */ 92 | def ++(that: TDigest): TDigest = TDigest.combine(this, that, this.delta, this.maxDiscrete) 93 | 94 | // This is most of 'algorithm 1', except for re-clustering which is factored out to avoid 95 | // recursive calls during a reclustering phase 96 | private def update(x: Double, w: Double) = { 97 | require(w > 0.0, "data weight must be > 0") 98 | 99 | if (nclusters == 0) { 100 | // our map is empty, so insert this pair as the first cluster 101 | TDigest(delta, maxDiscrete, nclusters + 1, clusters + (x -> w)) 102 | } else { 103 | // Get the current cluster nearest to incoming (x) 104 | val (c, m, psum) = clusters.nearTD(x) 105 | if (x == c) { 106 | // data landed on an existing cluster: increment that cluster's mass directly 107 | 
TDigest(delta, maxDiscrete, nclusters, clusters.update(c, c, m + w)) 108 | } else { 109 | val M = clusters.sum 110 | val q = (psum + m / 2.0) / M 111 | val ub = M * delta * q * (1.0 - q) 112 | 113 | val dm = math.min(w, math.max(0.0, ub - m)) 114 | val rm = w - dm 115 | 116 | val tClust = if (dm > 0.0) { 117 | val nm = m + dm 118 | val dc = dm * (x - c) / nm 119 | clusters.update(c, c + dc, nm) 120 | } else clusters 121 | 122 | val uClust = if (rm > 0.0) tClust + (x -> rm) else tClust 123 | 124 | // return the updated t-digest 125 | TDigest(delta, maxDiscrete, nclusters + (if (rm > 0.0) 1 else 0), uClust) 126 | } 127 | } 128 | } 129 | 130 | /** 131 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 132 | * distribution represented by this t-digest sketch. 133 | * @param x a numeric value 134 | * @return the cumulative probability that a random sample from the distribution is <= x 135 | */ 136 | def cdf[N](x: N)(implicit num: Numeric[N]): Double = clusters.cdf(x) 137 | 138 | /** 139 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 140 | * estimated probability distribution represented by this t-digest sketch. 141 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 142 | * @return the value x such that cdf(x) = q 143 | */ 144 | def cdfInverse[N](q: N)(implicit num: Numeric[N]): Double = clusters.cdfInverse(q) 145 | 146 | /** 147 | * Compute a cumulative probability (CDF) for a numeric value, from the estimated probability 148 | * distribution represented by this t-digest sketch, assuming sketch is "discrete" 149 | * (e.g. if number of clusters <= maxDiscrete setting) 150 | * @param x a numeric value 151 | * @return the cumulative probability that a random sample from the distribution is <= x 152 | */ 153 | def cdfDiscrete[N](x: N)(implicit num: Numeric[N]): Double = 154 | clusters.cdfDiscrete(x) 155 | 156 | /** 157 | * Compute the inverse cumulative probability (inverse-CDF) for a quantile value, from the 158 | * estimated probability distribution represented by this t-digest sketch, 159 | * assuming the sketch is "discrete" (e.g. if number of clusters <= maxDiscrete setting) 160 | * @param q a quantile value. The value of q is expected to be on interval [0, 1] 161 | * @return the smallest value x such that q <= cdf(x) 162 | */ 163 | def cdfDiscreteInverse[N](q: N)(implicit num: Numeric[N]): Double = 164 | clusters.cdfDiscreteInverse(q) 165 | 166 | /** 167 | * Perform a random sampling from the distribution as sketched by this t-digest, in 168 | * "probability density" mode. 169 | * @return A random number sampled from the sketched distribution 170 | * @note uses the inverse transform sampling method 171 | */ 172 | def samplePDF: Double = clusters.cdfInverse(Random.nextDouble) 173 | 174 | /** 175 | * Perform a random sampling from the distribution as sketched by this t-digest, in 176 | * "probability mass" (i.e. discrete) mode. 177 | * @return A random number sampled from the sketched distribution 178 | * @note uses the inverse transform sampling method 179 | */ 180 | def samplePMF: Double = clusters.cdfDiscreteInverse(Random.nextDouble) 181 | 182 | /** 183 | * Perform a random sampling from the distribution as sketched by this t-digest, 184 | * using "discrete" (PMF) mode if the number of clusters <= maxDiscrete setting, 185 | * and "density" (PDF) mode otherwise. 
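 * {{{
 * // e.g., drawing one value from a sketch (data is hypothetical)
 * val x = TDigest.sketch(data).sample
 * }}}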
186 | * @return A random number sampled from the sketched distribution 187 | * @note uses the inverse transform sampling method 188 | */ 189 | def sample: Double = if (nclusters <= maxDiscrete) samplePMF else samplePDF 190 | } 191 | 192 | /** Factory functions for TDigest */ 193 | object TDigest { 194 | import scala.language.higherKinds 195 | import scala.collection.SeqLike 196 | import scala.collection.generic.CanBuildFrom 197 | 198 | /** 199 | * Default value for a t-digest delta parameter. The number of clusters varies roughly as 200 | * (50/delta) when data are presented in random order 201 | * (it may grow larger if data are not presented randomly). The default corresponds to 202 | * an expected number of clusters of about 100. 203 | */ 204 | val deltaDefault: Double = (50.0 / 100.0) // delta * E[clusters] ~ 50 205 | 206 | /** 207 | * The t-digest algorithm will re-cluster itself whenever its number of clusters exceeds 208 | * (K/delta). This value is set such that the threshold is about 10x the heuristically 209 | * expected number of clusters for the user-specified delta value. Generally the number of 210 | * clusters will only trigger the corresponding re-clustering threshold when data are being 211 | * presented in a non-random order. 212 | */ 213 | val K: Double = 10.0 * 50.0 214 | 215 | /** 216 | * Obtain an empty t-digest 217 | * @param delta a sketch resolution parameter. 218 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 219 | * unique values. Defaults to zero; normal continuous mode. 220 | * @note Smaller values of delta yield sketches with more clusters, and higher resolution 221 | * @note The expected number of clusters will vary (roughly) as (50/delta) 222 | */ 223 | def empty(delta: Double = deltaDefault, maxDiscrete: Int = 0): TDigest = { 224 | require(delta > 0.0, s"delta must be > 0: $delta") 225 | require(maxDiscrete >= 0, s"maxDiscrete must be >= 0: $maxDiscrete") 226 | TDigest(delta, maxDiscrete, 0, TDigestMap.empty) 227 | } 228 | 229 | /** 230 | * Sketch some data with a t-digest 231 | * @param data The data elements to sketch 232 | * @param delta The sketch resolution parameter. 233 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 234 | * unique values. Defaults to zero; normal continuous mode. 235 | * @return A t-digest sketch of the input data 236 | * @note Smaller values of delta yield sketches with more clusters, and higher resolution 237 | * @note The expected number of clusters will vary (roughly) as (50/delta) 238 | */ 239 | def sketch[N]( 240 | data: TraversableOnce[N], 241 | delta: Double = deltaDefault, 242 | maxDiscrete: Int = 0)(implicit num: Numeric[N]): TDigest = { 243 | require(delta > 0.0, s"delta must be > 0: $delta") 244 | require(maxDiscrete >= 0, s"maxDiscrete must be >= 0: $maxDiscrete") 245 | val td = data.foldLeft(empty(delta, maxDiscrete))((c, e) => c + e) 246 | TDigest.shuffle(td.clusters.toVector).foldLeft(empty(delta, maxDiscrete))((c, e) => c + e) 247 | } 248 | 249 | /** 250 | * Combine two t-digests to yield a new digest 251 | * @param ltd the left-hand t-digest operand 252 | * @param rtd the right-hand t-digest operand 253 | * @param delta a sketch resolution parameter. 254 | * @param maxDiscrete sketch in discrete distribution mode up to this number of 255 | * unique values. Defaults to zero; normal continuous mode.
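 * An illustrative usage sketch (td1 and td2 are assumed to be existing digests):
 * {{{
 * val merged = TDigest.combine(td1, td2)
 * // the ++ operator is shorthand: td1 ++ td2 is TDigest.combine(td1, td2, td1.delta, td1.maxDiscrete)
 * }}}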
256 | * @return the sum of left and right digests, defined as their aggregation 257 | * @note This operation satisfies a Semigroup law, with the caveat 258 | * that it is only "statistically" associative: d1++(d2++d3) will be statistically 259 | * similar to (d1++d2)++d3, but rarely identical. 260 | */ 261 | def combine(ltd: TDigest, rtd: TDigest, 262 | delta: Double = deltaDefault, 263 | maxDiscrete: Int = 0): TDigest = { 264 | if (ltd.nclusters <= 1 && rtd.nclusters > 1) combine(rtd, ltd, delta, maxDiscrete) 265 | else if (rtd.nclusters == 0) ltd 266 | else if (rtd.nclusters == 1) { 267 | // handle the singleton RHS case specially to prevent quadratic catastrophe when 268 | // it is being used in the Aggregator use case 269 | val d = rtd.clusters.asInstanceOf[tdmap.tree.INodeTD].data 270 | ltd + ((d.key, d.value)) 271 | } else { 272 | // insert clusters from largest to smallest 273 | (ltd.clusters.toVector ++ rtd.clusters.toVector).sortWith((a, b) => a._2 > b._2) 274 | .foldLeft(empty(delta, maxDiscrete))((d, e) => d + e) 275 | } 276 | } 277 | 278 | // Shuffle a sequence in a referentially-transparent way: pseudo-randomly, but with a random 279 | // seed that is a function of the sequence argument. 280 | private[sketches] def shuffle[E, S[X] <: SeqLike[X, S[X]]]( 281 | seq: S[E])(implicit cbf: CanBuildFrom[S[E], E, S[E]]) = 282 | if (seq.length <= 1) seq 283 | else { 284 | val seed = scala.util.hashing.MurmurHash3.productHash((seq(0), seq(1), seq.length)) 285 | (new scala.util.Random(seed)).shuffle(seq) 286 | } 287 | } 288 | -------------------------------------------------------------------------------- /src/main/scala/org/isarnproject/sketches/tdmap/TDigestMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package org.isarnproject.sketches.tdmap 18 | 19 | import math.Numeric 20 | 21 | import scala.collection.SortedMap 22 | 23 | import org.isarnproject.algebraAPI.{ MonoidAPI => Monoid, AggregatorAPI => Aggregator } 24 | 25 | import org.isarnproject.collections.mixmaps.increment._ 26 | import org.isarnproject.collections.mixmaps.prefixsum._ 27 | import org.isarnproject.collections.mixmaps.nearest._ 28 | 29 | object tree { 30 | import org.isarnproject.collections.mixmaps.redblack.tree._ 31 | import org.isarnproject.collections.mixmaps.ordered._ 32 | import org.isarnproject.collections.mixmaps.ordered.tree.DataMap 33 | import org.isarnproject.collections.mixmaps.increment.tree._ 34 | import org.isarnproject.collections.mixmaps.prefixsum.tree._ 35 | import org.isarnproject.collections.mixmaps.nearest.tree._ 36 | 37 | /** Base class of tree node for a TDigestMap */ 38 | trait NodeTD extends NodePS[Double, Double, Double] 39 | with NodeInc[Double, Double] with NodeNearMap[Double, Double] { 40 | 41 | /** 42 | * Obtain a "mass cover": two adjacent nodes in the tree such that the cumulative mass 43 | * of the left node is <= (m) and the cumulative mass of the right node is > (m) 44 | * @param m The target mass to cover between two adjacent nodes 45 | * @return a Cover instance with the left and right covering tree nodes. If the (m) 46 | * was < the mass of the left-most tree node, the left cover value will be None. Similarly 47 | * if the mass was >= the cumulative mass of the right-most node (equivalent to sum of all 48 | * node masses in the tree), then the right cover value will be None. 49 | */ 50 | final def mCover(m: Double) = mcov(m, 0.0, Cover[INodeTD](None, None)) 51 | 52 | private[tree] def mcov(m: Double, psum: Double, cov: Cover[INodeTD]): Cover[INodeTD] 53 | 54 | // Find the cluster whose prefix sum is the least upper bound of mass 'm' 55 | // domain specific to t-digest algorithms 56 | private[sketches] final def keyPFSLUB(m: Double) = this match { 57 | case _: LNodeTD => Double.NaN 58 | case _ if (m < 0.0 || m > this.pfs) => Double.NaN 59 | case _ if (m == 0.0) => this.nodeMin.get.asInstanceOf[INodeTD].data.key 60 | case _ => this.kpl(m, 0.0) 61 | } 62 | 63 | // recursive implementation of keyPFSLUB 64 | private[tree] def kpl(m: Double, psum: Double): Double 65 | 66 | // obtains the nearest cluster to 'x'. 
Returns the cluster (location, mass, prefix-sum) 67 | private[sketches] final def nearTD(x: Double): (Double, Double, Double) = ntd(x, 0.0) 68 | 69 | // recursive implementation for nearTD 70 | private[tree] def ntd(x: Double, psum: Double): (Double, Double, Double) 71 | 72 | // recursive implementation of 'update' method 73 | private[tdmap] def upd(x0: Double, x: Double, m: Double): Node[Double] 74 | } 75 | 76 | trait LNodeTD extends NodeTD 77 | with LNodePS[Double, Double, Double] with LNodeInc[Double, Double] 78 | with LNodeNearMap[Double, Double] { 79 | final def mcov(m: Double, psum: Double, cov: Cover[INodeTD]) = cov 80 | final def kpl(m: Double, psum: Double) = Double.NaN 81 | final def ntd(x: Double, psum: Double) = (Double.NaN, Double.NaN, Double.NaN) 82 | final def upd(x0: Double, x: Double, m: Double) = 83 | throw new Exception("If this exception threw, there is a bug in this code") 84 | } 85 | 86 | trait INodeTD extends NodeTD 87 | with INodePS[Double, Double, Double] with INodeInc[Double, Double] 88 | with INodeNearMap[Double, Double] { 89 | val lsub: NodeTD 90 | val rsub: NodeTD 91 | 92 | final def mcov(m: Double, psum: Double, cov: Cover[INodeTD]) = { 93 | if (m < psum + lsub.pfs) { 94 | lsub match { 95 | case n: INodeTD => 96 | lsub.mcov(m, psum, cov.copy(r = Some(n.nodeMax.get.asInstanceOf[INodeTD]))) 97 | case _ => cov.copy(r = Some(this)) 98 | } 99 | } else { 100 | val t = psum + lsub.pfs + data.value 101 | if (m >= t) rsub.mcov(m, t, cov.copy(l = Some(this))) 102 | else { 103 | lsub match { 104 | case n: INodeTD => Cover(Some(lsub.nodeMax.get.asInstanceOf[INodeTD]), Some(this)) 105 | case _ => cov.copy(r = Some(this)) 106 | } 107 | } 108 | } 109 | } 110 | 111 | final def kpl(m: Double, psum: Double) = { 112 | val lb = psum + lsub.pfs 113 | val ub = lb + data.value 114 | if (m > ub) { 115 | rsub.kpl(m, ub) 116 | } else if (m > lb) { 117 | data.key 118 | } else { 119 | lsub.kpl(m, psum) 120 | } 121 | } 122 | 123 | final def ntd(x: Double, psum: Double) = { 124 | if (x < data.key) { 125 | lsub match { 126 | case ls: INodeTD => { 127 | if (x <= ls.kmax) ls.ntd(x, psum) 128 | else { 129 | val (dk, ldk) = (math.abs(x - data.key), math.abs(x - ls.kmax)) 130 | if (dk <= ldk) (data.key, data.value, psum + lsub.pfs) 131 | else { 132 | val n = ls.node(ls.kmax).get.asInstanceOf[INodeTD] 133 | (n.data.key, n.data.value, psum + lsub.pfs - n.data.value) 134 | } 135 | } 136 | } 137 | case _ => (data.key, data.value, psum + lsub.pfs) 138 | } 139 | } else if (x > data.key) { 140 | rsub match { 141 | case rs: INodeTD => { 142 | if (x >= rs.kmin) rs.ntd(x, psum + lsub.pfs + data.value) 143 | else { 144 | val (dk, rdk) = (math.abs(x - data.key), math.abs(x - rs.kmin)) 145 | if (dk <= rdk) (data.key, data.value, psum + lsub.pfs) 146 | else { 147 | val n = rs.node(rs.kmin).get.asInstanceOf[INodeTD] 148 | (n.data.key, n.data.value, psum + lsub.pfs + data.value) 149 | } 150 | } 151 | } 152 | case _ => (data.key, data.value, psum + lsub.pfs) 153 | } 154 | } else (data.key, data.value, psum + lsub.pfs) 155 | } 156 | 157 | final def upd(x0: Double, x: Double, m: Double) = 158 | if (color == R) { 159 | if (x0 < data.key) rNode(data, lsub.upd(x0, x, m), rsub) 160 | else if (x0 > data.key) rNode(data, lsub, rsub.upd(x0, x, m)) 161 | else { 162 | val d = new DataMap[Double, Double] { 163 | val key = x 164 | val value = m 165 | } 166 | rNode(d, lsub, rsub) 167 | } 168 | } else { 169 | // We know we are directly replacing a node, so no need to call balance() 170 | // in the case of black nodes. 
This is quite a bit faster. \o/ 171 | if (x0 < data.key) bNode(data, lsub.upd(x0, x, m), rsub) 172 | else if (x0 > data.key) bNode(data, lsub, rsub.upd(x0, x, m)) 173 | else { 174 | val d = new DataMap[Double, Double] { 175 | val key = x 176 | val value = m 177 | } 178 | bNode(d, lsub, rsub) 179 | } 180 | } 181 | } 182 | } 183 | 184 | import tree._ 185 | 186 | object infra { 187 | import org.isarnproject.collections.mixmaps.redblack.tree._ 188 | import org.isarnproject.collections.mixmaps.ordered.tree.DataMap 189 | 190 | object tdmapMonoid extends Monoid[Double] { 191 | def empty = 0.0 192 | def combine(x: Double, y: Double) = x + y 193 | def combineAll(as: TraversableOnce[Double]) = as.fold(0.0)(_ + _) 194 | def combineAllOption(as: TraversableOnce[Double]) = 195 | if (as.isEmpty) None else Some(combineAll(as)) 196 | } 197 | 198 | object tdmapAggregator extends Aggregator[Double, Double] { 199 | def monoid = tdmapMonoid 200 | def lff = (m: Double, d: Double) => m + d 201 | def mf = (d: Double) => d 202 | def aggregate(as: TraversableOnce[Double]) = as.fold(0.0)(_ + _) 203 | } 204 | 205 | /** Dependency injection class for TDigestMap */ 206 | class Inject extends Serializable { 207 | // Typeclasses corresponding to "regular real numbers": 208 | val keyOrdering = implicitly[Numeric[Double]] 209 | 210 | val valueMonoid = tdmapMonoid 211 | 212 | val prefixAggregator = tdmapAggregator 213 | 214 | def iNode(clr: Color, dat: Data[Double], ls: Node[Double], rs: Node[Double]) = 215 | new Inject with INodeTD with TDigestMap { 216 | // INode 217 | val color = clr 218 | val lsub = ls.asInstanceOf[NodeTD] 219 | val rsub = rs.asInstanceOf[NodeTD] 220 | val data = dat.asInstanceOf[DataMap[Double, Double]] 221 | // INodePS 222 | val prefix = prefixAggregator.lff( 223 | prefixAggregator.monoid.combine(lsub.pfs, rsub.pfs), data.value) 224 | // INodeNear 225 | val kmin = lsub match { 226 | case n: INodeTD => n.kmin 227 | case _ => data.key 228 | } 229 | val kmax = rsub match { 230 | case n: INodeTD => n.kmax 231 | case _ => data.key 232 | } 233 | } 234 | } 235 | 236 | } 237 | 238 | import infra._ 239 | 240 | /** 241 | * The tree-backed map object a TDigest uses to store and update its clusters. TDigestMap 242 | * inherits functionality for value increment, prefix-sum and nearest-neighbor queries. 243 | */ 244 | sealed trait TDigestMap extends SortedMap[Double, Double] with NodeTD 245 | with IncrementMapLike[Double, Double, INodeTD, TDigestMap] 246 | with PrefixSumMapLike[Double, Double, Double, INodeTD, TDigestMap] 247 | with NearestMapLike[Double, Double, INodeTD, TDigestMap] { 248 | 249 | override def empty = TDigestMap.empty 250 | 251 | private def m1m2(c1: Double, tm1: Double, c2: Double, tm2: Double) = { 252 | val s = this.prefixSum(c1, open = true) 253 | val d1 = if (c1 == this.keyMin.get) 0.0 else tm1 / 2.0 254 | val m1 = s + d1 255 | val m2 = m1 + (tm1 - d1) + (if (c2 == this.keyMax.get) tm2 else tm2 / 2.0) 256 | (m1, m2) 257 | } 258 | 259 | // This updates an existing cluster with a new location and mass. It does this 260 | // efficiently by taking advantage of the knowledge that (a) this kind of update 261 | // never changes the key ordering, and therefore that (b) this operation can 262 | // always directly replace an existing node, without otherwise changing the topology 263 | // of the tree. 
Clearly, this is a domain-dependent method, and not exposed to the 264 | // public API 265 | private[sketches] def update(x0: Double, x: Double, m: Double): TDigestMap = 266 | this.upd(x0, x, m).asInstanceOf[TDigestMap] 267 | 268 | /** Compute the CDF for a value, using piece-wise linear between clusters */ 269 | def cdf[N](xx: N)(implicit num: Numeric[N]) = { 270 | val x = num.toDouble(xx) 271 | this.coverR(x) match { 272 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => { 273 | val (m1, m2) = m1m2(c1, tm1, c2, tm2) 274 | val m = m1 + (x - c1) * (m2 - m1) / (c2 - c1) 275 | // Clipping to [m1,m2] corrects numeric precision errors that can cause non-monotonic quirks 276 | math.min(m2, math.max(m1, m)) / this.sum 277 | } 278 | case Cover(Some(_), None) => 1.0 279 | case _ => 0.0 280 | } 281 | } 282 | 283 | def cdfDiscrete[N](xx: N)(implicit num: Numeric[N]) = { 284 | if (this.isEmpty) 0.0 else { 285 | val x = num.toDouble(xx) 286 | this.prefixSum(x) / this.sum 287 | } 288 | } 289 | 290 | def cdfDiscreteInverse[N](qq: N)(implicit num: Numeric[N]) = { 291 | val q = num.toDouble(qq) 292 | keyPFSLUB(q * this.sum) 293 | } 294 | 295 | /** 296 | * Compute the inverse-CDF from a given quantile on interval [0, 1], using piecewise linear 297 | * interpolation between clusters 298 | */ 299 | def cdfInverse[N](qq: N)(implicit num: Numeric[N]) = { 300 | def cdfI(m: Double, c1: Double, tm1: Double, c2: Double, tm2: Double) = { 301 | val (m1, m2) = m1m2(c1, tm1, c2, tm2) 302 | val x = c1 + (m - m1) * (c2 - c1) / (m2 - m1) 303 | // Clipping to [c1,c2] corrects numeric precision errors that can cause non-monotonic quirks 304 | math.min(c2, math.max(c1, x)) 305 | } 306 | 307 | val q = num.toDouble(qq) 308 | if (q < 0.0 || q > 1.0) Double.NaN 309 | else { 310 | val m = q * this.sum 311 | this.mCover(m).map(n => (n.data.key, n.data.value)) match { 312 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => cdfI(m, c1, tm1, c2, tm2) 313 | case Cover(None, Some((c, _))) => this.coverR(c) match { 314 | case Cover(Some((c1, tm1)), Some((c2, tm2))) => cdfI(m, c1, tm1, c2, tm2) 315 | case _ => Double.NaN 316 | } 317 | case Cover(Some((c, _)), None) => c 318 | case _ => Double.NaN 319 | } 320 | } 321 | } 322 | 323 | override def toString = 324 | "TDigestMap(" + 325 | iterator.zip(prefixSumsIterator()) 326 | .map(x => s"${x._1._1} -> (${x._1._2}, ${x._2})").mkString(", ") + 327 | ")" 328 | } 329 | 330 | /** factory functions for TDigestMap */ 331 | object TDigestMap { 332 | /** Obtain an empty TDigestMap instance */ 333 | def empty: TDigestMap = new Inject with LNodeTD with TDigestMap 334 | } 335 | -------------------------------------------------------------------------------- /src/site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Project Documentation - isarn-sketches 6 | 7 | 8 |

Project Documentation - isarn-sketches 9 | 10 | Scala API documentation 11 |
12 | Java API documentation 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/test/scala/org/isarnproject/sketches/TDigestTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.isarnproject.sketches 18 | 19 | import org.scalatest._ 20 | 21 | import org.isarnproject.scalatest.matchers.seq._ 22 | 23 | class TDigestTest extends FlatSpec with Matchers { 24 | import org.apache.commons.math3.distribution.RealDistribution 25 | import org.apache.commons.math3.distribution.IntegerDistribution 26 | 27 | val seed = 235711L 28 | scala.util.Random.setSeed(seed) 29 | 30 | val ss = 100000 31 | val delta = 50.0 / 1000 32 | 33 | val maxD = 0.05 34 | val maxDI = 0.1 35 | 36 | def testTDvsDist(td: TDigest, dist: RealDistribution, stdv: Double): Boolean = { 37 | val xmin = td.clusters.keyMin.get 38 | val xmax = td.clusters.keyMax.get 39 | val step = (xmax - xmin) / 1000 40 | val d = (xmin to xmax by step).iterator 41 | .map(x => math.abs(td.cdf(x) - dist.cumulativeProbability(x))).max 42 | 43 | val dInv = (0.01 to 0.99 by 0.01).iterator 44 | .map(x => math.abs(td.cdfInverse(x) - dist.inverseCumulativeProbability(x))).max / stdv 45 | 46 | val pass = d <= maxD && dInv <= maxDI 47 | if (!pass) Console.err.println(s"testTDvsDist failure: d= $d dInv= $dInv") 48 | pass 49 | } 50 | 51 | def testSamplingPDF(td: TDigest, dist: RealDistribution): Boolean = { 52 | val tdSamples = Array.fill(10000) { td.samplePDF } 53 | val distSamples = Array.fill(10000) { dist.sample } 54 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 55 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 56 | val pass = d <= maxD 57 | if (!pass) Console.err.println(s"testSamplingPDF failure: d= $d") 58 | pass 59 | } 60 | 61 | def testSamplingPMF(td: TDigest, dist: IntegerDistribution): Boolean = { 62 | td.nclusters should be <=(td.maxDiscrete) 63 | val tdSamples = Array.fill(10000) { td.samplePMF } 64 | val distSamples = Array.fill(10000) { dist.sample.toDouble } 65 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 66 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 67 | val pass = d <= maxD 68 | if (!pass) Console.err.println(s"testSamplingPMF failure: d= $d") 69 | pass 70 | } 71 | 72 | def testDistribution(dist: RealDistribution, stdv: Double): Boolean = { 73 | dist.reseedRandomGenerator(seed) 74 | 75 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 76 | 77 | testTDvsDist(td, dist, stdv) && testSamplingPDF(td, dist) 78 | } 79 | 80 | def testMonotoneCDF(dist: RealDistribution): Boolean = { 81 | dist.reseedRandomGenerator(seed) 82 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 83 | val (xmin, xmax) = (td.clusters.keyMin.get, td.clusters.keyMax.get) 84 | val step = (xmax - xmin) /
100000 85 | val t = (xmin to xmax by step).iterator.map(x => td.cdf(x)).sliding(2).map(w => w(1) - w(0)).min 86 | val pass = t >= 0.0 87 | if (!pass) Console.err.println(s"testMonotoneCDF failure: t= $t") 88 | pass 89 | } 90 | 91 | def testMonotoneCDFI(dist: RealDistribution): Boolean = { 92 | dist.reseedRandomGenerator(seed) 93 | val td = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 94 | val (xmin, xmax) = (0.0, 1.0) 95 | val step = (xmax - xmin) / 100000 96 | val t = (xmin to xmax by step).iterator.map(q => td.cdfInverse(q)).sliding(2).map(w => w(1) - w(0)).min 97 | val pass = t >= 0.0 98 | if (!pass) Console.err.println(s"testMonotoneCDFI failure: t= $t") 99 | pass 100 | } 101 | 102 | def testMonotone(dist: RealDistribution): Boolean = { 103 | testMonotoneCDF(dist) && testMonotoneCDFI(dist) 104 | } 105 | 106 | it should "sketch a uniform distribution" in { 107 | import org.apache.commons.math3.distribution.UniformRealDistribution 108 | val dist = new UniformRealDistribution() 109 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 110 | } 111 | 112 | it should "sketch a normal distribution" in { 113 | import org.apache.commons.math3.distribution.NormalDistribution 114 | val dist = new NormalDistribution() 115 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 116 | } 117 | 118 | it should "sketch an exponential distribution" in { 119 | import org.apache.commons.math3.distribution.ExponentialDistribution 120 | val dist = new ExponentialDistribution(1.0) 121 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 122 | } 123 | 124 | it should "aggregate with another t-digest using ++" in { 125 | import org.apache.commons.math3.distribution.NormalDistribution 126 | val dist = new NormalDistribution() 127 | dist.reseedRandomGenerator(seed) 128 | 129 | val td1 = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 130 | val td2 = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 131 | 132 | testTDvsDist(td1 ++ td2, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 133 | } 134 | 135 | it should "respect monotonic cdf and inverse" in { 136 | import org.apache.commons.math3.distribution.ExponentialDistribution 137 | import org.apache.commons.math3.distribution.NormalDistribution 138 | import org.apache.commons.math3.distribution.UniformRealDistribution 139 | 140 | testMonotone(new UniformRealDistribution()) should be (true) 141 | testMonotone(new ExponentialDistribution(1.0)) should be (true) 142 | testMonotone(new NormalDistribution(0.0, 0.1)) should be (true) 143 | } 144 | 145 | it should "respect maxDiscrete parameter" in { 146 | import org.apache.commons.math3.distribution.GeometricDistribution 147 | val gd = new GeometricDistribution(0.33) 148 | val data = gd.sample(1000000) 149 | val dataUniq = data.distinct.sorted 150 | val kt = dataUniq.map(_.toDouble).toSet 151 | val td = TDigest.sketch(data, maxDiscrete = 50) 152 | val clust = td.clusters 153 | clust.keys.toSet should be (kt) 154 | val D = clust.keys.map { x => td.cdfDiscrete(x) } 155 | .zip(dataUniq.map { k => gd.cumulativeProbability(k) }) 156 | .map { case (p1, p2) => math.abs(p1 - p2) } 157 | .max 158 | (D <= 0.01) should be (true) 159 | testSamplingPMF(td, gd) should be (true) 160 | } 161 | 162 | it should "respect maxDiscrete parameter over ++" in { 163 | import org.apache.commons.math3.distribution.GeometricDistribution 164 | val gd = new GeometricDistribution(0.33) 165 | val tdvec = 
Vector.fill(10) { TDigest.sketch(gd.sample(100000), maxDiscrete = 50) } 166 | val td = tdvec.reduce(_ ++ _) 167 | val clust = td.clusters 168 | clust.keys.map(_.toInt).map(_.toDouble) should beEqSeq(clust.keys) 169 | val D = clust.keys.map { x => td.cdfDiscrete(x) } 170 | .zip(clust.keys.map(_.toInt).map { k => gd.cumulativeProbability(k) }) 171 | .map { case (p1, p2) => math.abs(p1 - p2) } 172 | .max 173 | (D <= 0.01) should be (true) 174 | testSamplingPMF(td, gd) should be (true) 175 | } 176 | 177 | it should "serialize and deserialize" in { 178 | import org.apache.commons.math3.distribution.NormalDistribution 179 | 180 | import org.isarnproject.scalatest.serde.roundTripSerDe 181 | 182 | val dist = new NormalDistribution() 183 | dist.reseedRandomGenerator(seed) 184 | 185 | val tdo = TDigest.sketch(Iterator.fill(ss) { dist.sample }, delta = delta) 186 | 187 | val tdi = roundTripSerDe(tdo) 188 | 189 | (tdi == tdo) should be (true) 190 | 191 | testTDvsDist(tdi, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/test/scala/org/isarnproject/sketches/java/JavaTDigestTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016-2018 Erik Erlandson 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package org.isarnproject.sketches.java 18 | 19 | import org.scalatest._ 20 | 21 | import org.isarnproject.scalatest.matchers.seq._ 22 | 23 | class JavaTDigestTest extends FlatSpec with Matchers { 24 | import org.apache.commons.math3.distribution.RealDistribution 25 | import org.apache.commons.math3.distribution.IntegerDistribution 26 | 27 | val seed = 235711L 28 | scala.util.Random.setSeed(seed) 29 | 30 | val ss = 100000 31 | val delta = 50.0 / 1000 32 | 33 | val maxD = 0.05 34 | val maxDI = 0.1 35 | 36 | def testTDvsDist(td: TDigest, dist: RealDistribution, stdv: Double): Boolean = { 37 | val xmin = td.cent(0) 38 | val xmax = td.cent(td.nclusters - 1) 39 | val step = (xmax - xmin) / 1000 40 | val d = (xmin to xmax by step).iterator 41 | .map(x => math.abs(td.cdf(x) - dist.cumulativeProbability(x))).max 42 | 43 | val dInv = (0.01 to 0.99 by 0.01).iterator 44 | .map(x => math.abs(td.cdfInverse(x) - dist.inverseCumulativeProbability(x))).max / stdv 45 | 46 | val pass = d <= maxD && dInv <= maxDI 47 | if (!pass) Console.err.println(s"testTDvsDist failure: d= $d dInv= $dInv") 48 | pass 49 | } 50 | 51 | def testSamplingPDF(td: TDigest, dist: RealDistribution): Boolean = { 52 | val tdSamples = Array.fill(10000) { td.samplePDF } 53 | val distSamples = Array.fill(10000) { dist.sample } 54 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 55 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 56 | val pass = d <= maxD 57 | if (!pass) Console.err.println(s"testSamplingPDF failure: d= $d") 58 | pass 59 | } 60 | 61 | def testSamplingPMF(td: TDigest, dist: IntegerDistribution): Boolean = { 62 | td.nclusters should be <=(td.maxDiscrete) 63 | val tdSamples = Array.fill(10000) { td.samplePMF } 64 | val distSamples = Array.fill(10000) { dist.sample.toDouble } 65 | val kst = new org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest() 66 | val d = kst.kolmogorovSmirnovStatistic(tdSamples, distSamples) 67 | val pass = d <= maxD 68 | if (!pass) Console.err.println(s"testSamplingPMF failure: d= $d") 69 | pass 70 | } 71 | 72 | def testDistribution(dist: RealDistribution, stdv: Double): Boolean = { 73 | dist.reseedRandomGenerator(seed) 74 | 75 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 76 | 77 | testTDvsDist(td, dist, stdv) && testSamplingPDF(td, dist) 78 | } 79 | 80 | def testMonotoneCDF(dist: RealDistribution): Boolean = { 81 | dist.reseedRandomGenerator(seed) 82 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 83 | val (xmin, xmax) = (td.cent(0), td.cent(td.nclusters - 1)) 84 | val step = (xmax - xmin) / 100000 85 | val t = (xmin to xmax by step).iterator.map(x => td.cdf(x)).sliding(2).map(w => w(1) - w(0)).min 86 | val pass = t >= 0.0 87 | if (!pass) Console.err.println(s"testMonotoneCDF failure: t= $t") 88 | pass 89 | } 90 | 91 | def testMonotoneCDFI(dist: RealDistribution): Boolean = { 92 | dist.reseedRandomGenerator(seed) 93 | val td = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 94 | val (xmin, xmax) = (0.0, 1.0) 95 | val step = (xmax - xmin) / 100000 96 | val t = (xmin to xmax by step).iterator.map(q => td.cdfInverse(q)).sliding(2).map(w => w(1) - w(0)).min 97 | val pass = t >= 0.0 98 | if (!pass) Console.err.println(s"testMonotoneCDFI failure: t= $t") 99 | pass 100 | } 101 | 102 | def testMonotone(dist: RealDistribution): Boolean = { 103 | testMonotoneCDF(dist) && testMonotoneCDFI(dist) 104 | } 105 | 106 | it should "sketch a uniform distribution" in { 107 | import
org.apache.commons.math3.distribution.UniformRealDistribution 108 | val dist = new UniformRealDistribution() 109 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 110 | } 111 | 112 | it should "sketch a normal distribution" in { 113 | import org.apache.commons.math3.distribution.NormalDistribution 114 | val dist = new NormalDistribution() 115 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 116 | } 117 | 118 | it should "sketch an exponential distribution" in { 119 | import org.apache.commons.math3.distribution.ExponentialDistribution 120 | val dist = new ExponentialDistribution(1.0) 121 | testDistribution(dist, math.sqrt(dist.getNumericalVariance())) should be (true) 122 | } 123 | 124 | it should "aggregate with another t-digest using merge method" in { 125 | import org.apache.commons.math3.distribution.NormalDistribution 126 | val dist = new NormalDistribution() 127 | dist.reseedRandomGenerator(seed) 128 | 129 | val td1 = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 130 | val td2 = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 131 | 132 | testTDvsDist(TDigest.merge(td1, td2), dist, math.sqrt(dist.getNumericalVariance())) should be (true) 133 | } 134 | 135 | it should "respect monotonic cdf and inverse" in { 136 | import org.apache.commons.math3.distribution.ExponentialDistribution 137 | import org.apache.commons.math3.distribution.NormalDistribution 138 | import org.apache.commons.math3.distribution.UniformRealDistribution 139 | 140 | testMonotone(new UniformRealDistribution()) should be (true) 141 | testMonotone(new ExponentialDistribution(1.0)) should be (true) 142 | testMonotone(new NormalDistribution(0.0, 0.1)) should be (true) 143 | } 144 | 145 | it should "respect maxDiscrete parameter" in { 146 | import org.apache.commons.math3.distribution.GeometricDistribution 147 | val gd = new GeometricDistribution(0.33) 148 | val data = gd.sample(1000000).map(_.toDouble) 149 | val dataUniq = data.distinct.sorted 150 | val kt = dataUniq.map(_.toDouble).toSet 151 | val td = TDigest.sketch(data, delta, 50) 152 | val clust = td.cent 153 | clust.toSet should be (kt) 154 | val D = clust.map { x => td.cdfDiscrete(x) } 155 | .zip(dataUniq.map { k => gd.cumulativeProbability(k.toInt) }) 156 | .map { case (p1, p2) => math.abs(p1 - p2) } 157 | .max 158 | (D <= 0.01) should be (true) 159 | testSamplingPMF(td, gd) should be (true) 160 | } 161 | 162 | it should "respect maxDiscrete parameter over merge" in { 163 | import org.apache.commons.math3.distribution.GeometricDistribution 164 | val gd = new GeometricDistribution(0.33) 165 | val tdvec = Vector.fill(10) { TDigest.sketch(gd.sample(100000).map(_.toDouble), delta, 50) } 166 | val td = tdvec.reduce((a, b) => TDigest.merge(a, b)) 167 | val clust = td.cent 168 | clust.map(_.toInt).map(_.toDouble).toVector should beEqSeq(clust.toVector) 169 | val D = clust.map { x => td.cdfDiscrete(x) } 170 | .zip(clust.map(_.toInt).map { k => gd.cumulativeProbability(k) }) 171 | .map { case (p1, p2) => math.abs(p1 - p2) } 172 | .max 173 | (D <= 0.01) should be (true) 174 | testSamplingPMF(td, gd) should be (true) 175 | } 176 | 177 | it should "support copy constructor" in { 178 | import org.apache.commons.math3.distribution.NormalDistribution 179 | 180 | val dist = new NormalDistribution() 181 | dist.reseedRandomGenerator(seed) 182 | val data = Array.fill(ss) { dist.sample } 183 | val td1 = TDigest.sketch(data, delta) 184 | val td2 = new TDigest(td1) 185 | (td2.equals(td1)) should be (true) 
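// equals is expected to be symmetric, so check both directions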
186 | (td1.equals(td2)) should be (true) 187 | 188 | // add more data and re-check equality to ensure 189 | // that all state for future updates was correctly copied 190 | for { x <- data } { 191 | td1.update(x) 192 | td2.update(x) 193 | } 194 | (td2.equals(td1)) should be (true) 195 | (td1.equals(td2)) should be (true) 196 | } 197 | 198 | def testTDClose(td1: TDigest, td2: TDigest, eps: Double = 1e-6): Unit = { 199 | td1.getCompression() should be (td2.getCompression()) 200 | td1.getMaxDiscrete() should be (td2.getMaxDiscrete()) 201 | td1.size() should be (td2.size()) 202 | td1.mass() should be (td2.mass() +- eps) 203 | for { j <- 0 until td1.size() } { 204 | td1.getCentUnsafe()(j) should be (td2.getCentUnsafe()(j) +- eps) 205 | td1.getMassUnsafe()(j) should be (td2.getMassUnsafe()(j) +- eps) 206 | td1.getFTUnsafe()(1 + j) should be (td2.getFTUnsafe()(1 + j) +- eps) 207 | } 208 | } 209 | 210 | it should "support dser constructor" in { 211 | import java.util.Arrays; 212 | import org.apache.commons.math3.distribution.NormalDistribution 213 | 214 | val eps = 1e-9 215 | 216 | val dist = new NormalDistribution() 217 | dist.reseedRandomGenerator(seed) 218 | val data = Array.fill(ss) { dist.sample } 219 | 220 | // test constructing empty t-digests 221 | val td1 = new TDigest(0.5, 0, Array.empty[Double], Array.empty[Double]) 222 | val td2 = new TDigest( 223 | td1.getCompression(), 224 | td1.getMaxDiscrete(), 225 | Arrays.copyOf(td1.getCentUnsafe(), td1.size()), 226 | Arrays.copyOf(td1.getMassUnsafe(), td1.size()) 227 | ) 228 | testTDClose(td1, td2, eps) 229 | 230 | // test sketching from empty state 231 | for { x <- data } { 232 | td1.update(x) 233 | td2.update(x) 234 | } 235 | testTDClose(td1, td2, eps) 236 | 237 | // copy from non-empty state 238 | val td3 = new TDigest( 239 | td1.getCompression(), 240 | td1.getMaxDiscrete(), 241 | Arrays.copyOf(td1.getCentUnsafe(), td1.size()), 242 | Arrays.copyOf(td1.getMassUnsafe(), td1.size()) 243 | ) 244 | testTDClose(td1, td3, eps) 245 | 246 | // test from non-empty state 247 | for { x <- data } { 248 | td1.update(x) 249 | td3.update(x) 250 | } 251 | testTDClose(td1, td3, eps) 252 | } 253 | 254 | it should "serialize and deserialize" in { 255 | import org.apache.commons.math3.distribution.NormalDistribution 256 | 257 | import org.isarnproject.scalatest.serde.roundTripSerDe 258 | 259 | val dist = new NormalDistribution() 260 | dist.reseedRandomGenerator(seed) 261 | 262 | val tdo = TDigest.sketch(Array.fill(ss) { dist.sample }, delta) 263 | 264 | val tdi = roundTripSerDe(tdo) 265 | 266 | (tdi.equals(tdo)) should be (true) 267 | 268 | testTDvsDist(tdi, dist, math.sqrt(dist.getNumericalVariance())) should be (true) 269 | } 270 | } 271 | --------------------------------------------------------------------------------