├── .gitignore
├── .travis.yml
├── Histogram.png
├── LICENSE
├── README.md
├── build.sbt
├── no-test-sbt.sh
├── project
│   └── plugins.sbt
├── sbt-offline.sh
├── src
│   ├── main
│   │   └── scala
│   │       └── com
│   │           └── github
│   │               └── vicpara
│   │                   └── eda
│   │                       ├── AppLogger.scala
│   │                       └── stats
│   │                           ├── PercentileStats.scala
│   │                           └── SequenceStats.scala
│   └── test
│       └── scala
│           └── com
│               └── github
│                   └── vicpara
│                       └── eda
│                           ├── DescriptiveStatsJob.scala
│                           ├── Generators.scala
│                           ├── Stats.scala
│                           ├── StatsSpec.scala
│                           └── TestUtils.scala
└── version.sbt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | .project 24 | scala/projectFilesBackup/ 25 | scala/src/sandbox.sc 26 | 27 | 28 | # C extensions 29 | *.so 30 | 31 | .idea/* 32 | */.idea/* 33 | */.idea_modules/* 34 | project/project/* 35 | project/target/* 36 | */target/* 37 | 38 | 39 | # Notebook temp files 40 | *.ipynb_checkpoints/* 41 | *.dump 42 | # Data 43 | *.xls 44 | *.xlsx 45 | *.zip 46 | *.out 47 | 48 | 49 | # Distribution / packaging 50 | .Python 51 | env/ 52 | build/ 53 | develop-eggs/ 54 | dist/ 55 | downloads/ 56 | eggs/ 57 | .eggs/ 58 | lib/ 59 | lib64/ 60 | parts/ 61 | sdist/ 62 | var/ 63 | *.egg-info/ 64 | .installed.cfg 65 | *.egg 66 | 67 | 68 | # PyInstaller 69 | # Usually these files are written by a python script from a template 70 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 71 | *.manifest 72 | *.spec 73 | 74 | 75 | # Installer logs 76 | pip-log.txt 77 | pip-delete-this-directory.txt 78 | 79 | 80 | # Unit test / coverage reports 81 | htmlcov/ 82 | .tox/ 83 | .coverage 84 | .coverage.* 85 | .cache 86 | nosetests.xml 87 | coverage.xml 88 | *,cover 89 | 90 | 91 | # Translations 92 | *.mo 93 | *.pot 94 | 95 | # Django stuff: 96 | *.log 97 | 98 | 99 | # Sphinx documentation 100 | docs/_build/ 101 | 102 | # PyBuilder 103 | target/ 104 | .Rhistory 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # test execution script.
2 | 3 | language: scala 4 | jdk: 5 | - oraclejdk8 6 | 7 | sudo: false 8 | before_install: umask 0022 9 | scala: 10 | - 2.10.4 11 | - 2.11.7 12 | script: 13 | - "echo no op" 14 | 15 | # whitelist 16 | branches: 17 | only: 18 | - master 19 | - develop -------------------------------------------------------------------------------- /Histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vicpara/exploratory-data-analysis/e071c4cf92a6f2462cfa509855e59cbd17c4d466/Histogram.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Victor Paraschiv (https://github.com/vicpara/) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# Exploratory-Data-Analysis - EDA
[![Build Status](https://travis-ci.org/vicpara/exploratory-data-analysis.svg?branch=master)](https://travis-ci.org/vicpara/exploratory-data-analysis) [![Licence](https://img.shields.io/badge/licence-MIT-blue.svg)](https://tldrlegal.com/license/mit-license)

A Spark library for doing exploratory data analysis on your data set in a scalable way.


## Getting Exploratory Data Analysis

If you're using SBT, add the following line to your build file:

```scala
libraryDependencies += "com.github.vicpara" % "exploratory-data-analysis_2.10" % "1.0.0"
```

For Maven and other build tools, you can visit [search.maven.org](http://search.maven.org/#search%7Cga%7C1%7Cexploratory%20data%20analysis).
To get sample configurations, click on the version of the module you are interested in.

-----
### The problem

Before running any algorithm on a data set it is essential to understand what the data looks like and what the expected edge cases are.
Maybe you have some expectations about the distribution of the numbers. Are all the buckets that you expect to find in the dataset populated with numbers?
Very often a data set is provided which contains two types of columns:

* dimensions
* measurements

Dimensions are columns that help describe the data points, such as: Date, UniqueID, Gender, Postcode, Country, Categories.
Measurements are metrics that quantitatively characterize the described datum.

Depending on your analytical task, some sort of ordering or bucketing will be used in your model. Some examples:

* process the daily transactions of a customer
* aggregate the daily transactions of a merchant
* count the number of unique customers per merchant, postcode, sector or district
* count the number of transactions per day

Let's take as an example the metric: number of daily transactions per merchant.

It is made of two concepts: a key, which is (dayId, merchantId), and a value, count(transactions).
Depending on the richness of your dataset, there may be keys with a high number of transactions and keys with a very small number. This becomes a problem especially when you are dealing with a sample dataset which promises to capture the entire diversity of the original, complete dataset.

-----
### The library

For a collection / RDD[T], given:

* a function that maps an element of T to a *key*
* a function that maps an element of T to a *value* that corresponds to the *key*
* a function that aggregates two *values* in a monoidal fashion

the library can compute two types of statistics:

* the percentile values of the ordered sequence of key statistics
* the count of distinct values for each key

The results can be saved in three formats:

* object file
* pretty text format ready for inspection
* HTML files

We use the *pretty* format when examining the results in bash on remote servers or HDFS. *Pretty* looks like this:

```
BusinessId x Day - Distinct(CustomerId) DrillDownValue : Tuesday
[PercentileStats]: NumBinsInHistogram: 1188
(9191,2015-10-27) | 0| ####### | 44
(6305,2015-11-10) | 1| ###################### | 51
(6774,2015-11-03) | 2| ########################### | 53
(4278,2015-11-03) | 3| ################################# | 54
(9191,2015-11-03) | 4| ################################# | 54
(4687,2015-11-17) | 5| ################################### | 55
(380,2015-11-03) | 6| ###################################### | 56
(8114,2015-11-03) | 7| ############################################ | 57
(5629,2015-10-27) | 8| ############################################ | 57
(404,2015-11-03) | 9| ############################################### | 58
(7586,2015-11-10) | 10| ############################################### | 58
(3765,2015-11-10) | 11| ############################################### | 58
(8478,2015-11-17) | 12| ############################################### | 58
(3701,2015-10-27) | 13| ###################################################### | 60
```

Using `scala-bokeh`, the library can also output the results of the EDA as HTML files for easier examination.
![Histogram chart example](https://github.com/vicpara/exploratory-data-analysis/blob/master/Histogram.png)

You can find working examples for all of these capabilities in the *test* folder. A percentile statistic example is shown next, followed by a distinct statistic example.
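
#### Example percentile statistic

In addition to the key and value functions, the percentile statistic takes a `toStats` function that turns the aggregated value into a `Long` and a `reduceFunc` that merges two values belonging to the same key. The snippet below is copied from `Stats.scala` in the *test* folder; it counts the transactions of every (customerId, day) key and reports the percentiles of those counts. `Transaction` is the datum defined in the next example, and `dayAsString` and `AppLogger.logStage` are small helpers that live in the same test sources.

```scala
def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Customer x Day - Count(Tx)")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.percentile[Transaction, (Long, String), Long](
        data = AppLogger.logStage(transactions, n), // logs the stage name and returns the RDD unchanged
        toDrillDownKeyOption = None,                // no drill-down segmentation
        toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)),
        toVal = _ => 1l,                            // every transaction contributes 1
        toStats = identity,
        reduceFunc = _ + _,                         // summing the 1s yields the per-key transaction count
        numPercentiles = nPercentiles
      )
    ))
```

Because `toVal` emits `1l` and `reduceFunc` sums, the value attached to each (customerId, day) key is its transaction count; the library then sorts those counts and keeps `nPercentiles` evenly spaced ranks.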
#### Example distinct statistic

For a datum of type `Transaction`

```scala
case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) {
  def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep)
}
```

we define a distinct statistic over the number of unique businesses (distinct businessId) inside a *postcode* for a given *day*, for each unique value of the key *toDrillDownKeyOption*. The value of this field would normally be a categorical segmentation axis which is expected to have relatively small cardinality. Examples of fields that could be used for toDrillDownKeyOption are country / city / type of product / region.

```scala
def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Postcode x Day - Distinct(BusinessId)")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.distinct[Transaction, (String, String), Long](
        data = tx,
        toDrillDownKeyOption = None,
        toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)),
        toVal = tx => tx.businessId,
        numPercentiles = nPercentiles
      )
    ))
```

#### Example global distinct counter

The following statistic counts the distinct customers across the whole dataset by mapping every transaction to the same dimension key:

```scala
def globalUniqueCustomersCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Global distinct Customers")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.distinct[Transaction, Long, Long](data = AppLogger.logStage(transactions, n),
        toDrillDownKeyOption = None,
        toDimKey = t => 1l,
        toVal = tx => tx.customerId,
        numPercentiles = nPercentiles
      )
    ))
```
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtScalariform 2 | 3 | sonatypeProfileName := "com.github.vicpara" 4 | 5 | organization := "com.github.vicpara" 6 | 7 | name := "exploratory-data-analysis" 8 | 9 | releaseVersionFile := file("version.sbt") 10 | 11 | scalaVersion := "2.10.4" 12 | 13 | crossScalaVersions := Seq("2.11.7", "2.10.4") 14 | 15 | publishMavenStyle := true 16 | 17 | publishTo <<= version { (v: String) => 18 | val nexus = "https://oss.sonatype.org/" 19 | if (v.trim.endsWith("-SNAPSHOT")) 20 | Some("snapshots" at nexus + "content/repositories/snapshots") 21 | else 22 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 23 | } 24 | 25 | publishArtifact in Test := false 26 | 27 | libraryDependencies ++= Seq( 28 | "joda-time" % "joda-time" % "2.6" withSources() withJavadoc(), 29 | "org.joda" % "joda-convert" % "1.2" withSources() withJavadoc(), 30 | "org.apache.spark" % "spark-core_2.10" % "1.3.0-cdh5.4.4" withSources() withJavadoc(), 31 | "org.apache.commons" % "commons-csv" % "1.2" withSources() withJavadoc(), 32 | "com.github.scala-incubator.io" %% "scala-io-core" % "0.4.3" withSources() withJavadoc(), 33 | "com.github.scala-incubator.io" %% "scala-io-file" % "0.4.3" withSources() withJavadoc(), 34 | "com.rockymadden.stringmetric" %% "stringmetric-core" % "0.27.3" withSources() withJavadoc(), 35 | "org.scalaz" %% "scalaz-core" % "7.0.6" withSources() withJavadoc(), 36 | "org.rogach" %% "scallop" % "0.9.5" withSources() withJavadoc(), 37 | "org.scala-lang" % "scalap" % "2.10.4" withSources() withJavadoc(), 38 |
"org.scala-lang" % "scala-compiler" % "2.10.4" withSources() withJavadoc(), 39 | "com.github.tototoshi" %% "scala-csv" % "1.2.2" withSources() withJavadoc(), 40 | "org.specs2" %% "specs2-core" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 41 | "org.specs2" %% "specs2-scalacheck" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 42 | "io.spray" %% "spray-json" % "1.3.1" withSources() withJavadoc(), 43 | "org.scalaj" %% "scalaj-http" % "1.1.5" withSources() withJavadoc(), 44 | "io.continuum.bokeh" %% "bokeh" % "0.6" withSources() withJavadoc() 45 | ) 46 | 47 | resolvers ++= Seq( 48 | "mvnrepository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 49 | "Maven Central" at "https://repo1.maven.org/maven2/", 50 | "Sonatype OSS Releases" at "https://oss.sonatype.org/service/local/staging/deploy/maven2" 51 | ) 52 | 53 | assemblyMergeStrategy in assembly := { 54 | case el if el.contains("fasterxml.jackson.core") => MergeStrategy.first 55 | case el if el.contains("guava") => MergeStrategy.first 56 | 57 | case x if Assembly.isConfigFile(x) => MergeStrategy.concat 58 | case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => MergeStrategy.rename 59 | case PathList("META-INF", xs@_*) => (xs map { 60 | _.toLowerCase 61 | }) match { 62 | case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => MergeStrategy.discard 63 | case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => MergeStrategy.discard 64 | case "plexus" :: xs => MergeStrategy.discard 65 | case "services" :: xs => MergeStrategy.filterDistinctLines 66 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => MergeStrategy.filterDistinctLines 67 | case _ => MergeStrategy.first // Changed deduplicate to first 68 | } 69 | case PathList(_*) => MergeStrategy.first // added this line 70 | } 71 | 72 | pomIncludeRepository := { _ => false } 73 | 74 | pomExtra := ( 75 | http://github.com/vicpara/exploratory-data-analysis 76 | 77 | 78 | Apache License, Version 2.0 79 | http://www.apache.org/licenses/LICENSE-2.0.html 80 | repo 81 | 82 | 83 | 84 | git@github.com:vicpara/exploratory-data-analysis.git 85 | scm:git:git@github.com:vicpara/exploratory-data-analysis.git 86 | 87 | 88 | 89 | vicpara 90 | Victor Paraschiv 91 | http://github.com/vicpara 92 | 93 | ) 94 | 95 | scalariformSettings 96 | 97 | -------------------------------------------------------------------------------- /no-test-sbt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sbt "set test in assembly := {}" "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*" 3 | 4 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalacOptions += "-deprecation" 2 | 3 | addSbtPlugin("org.scalariform" % "sbt-scalariform" % "1.6.0") 4 | 5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 6 | 7 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.1") 8 | 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-osgi" % "0.7.0") 10 | 11 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.4.0") 12 | 13 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3") 14 | 15 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 16 | 17 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5") 18 | 19 | 
addSbtPlugin("com.github.tkawachi" % "sbt-doctest" % "0.3.2") 20 | 21 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1") 22 | 23 | fullResolvers ~= {_.filterNot(_.name == "jcenter")} 24 | -------------------------------------------------------------------------------- /sbt-offline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sbt "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*" 3 | 4 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/AppLogger.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.apache.log4j.{ Logger, PatternLayout, ConsoleAppender, Level } 4 | import org.apache.spark.rdd.RDD 5 | 6 | case object AppLogger { 7 | def getLogger(level: Level = Level.INFO): Logger = { 8 | val level = org.apache.log4j.Level.INFO 9 | val logger = org.apache.log4j.Logger.getLogger("nlp-topic-modelling") 10 | logger.setLevel(level) 11 | 12 | val capp = new ConsoleAppender() 13 | capp.setName(s"ConsoleAppender ${level.toString}") 14 | capp.setLayout(new PatternLayout("%d %m%n")) 15 | capp.setThreshold(level) 16 | capp.activateOptions() 17 | logger.addAppender(capp) 18 | logger 19 | } 20 | 21 | def logStage[T](data: RDD[T], message: String): RDD[T] = { 22 | infoLevel.info(message) 23 | data 24 | } 25 | 26 | def apply(): org.apache.log4j.Logger = infoLevel 27 | val infoLevel = getLogger() 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/stats/PercentileStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda.stats 2 | 3 | import io.continuum.bokeh._ 4 | 5 | import scalaz.Scalaz._ 6 | 7 | case class PercentileStats(points: List[(String, Long)], numBuckets: Long) { 8 | def isSingleValue = points match { 9 | case pts if pts.size == 1 => true 10 | case _ => false 11 | } 12 | } 13 | 14 | case class PercentileStatsWithFilterLevel(drillDownFilterValue: String, stats: PercentileStats) { 15 | def isSingleStat = drillDownFilterValue.equals(SequenceStats.drillDownKeyAll) 16 | def isSingleValue = stats.points.size == 1 17 | } 18 | 19 | case class PrettyPercentileStats(name: String, levels: List[PercentileStatsWithFilterLevel]) { 20 | def lineImage(xvals: Seq[Double], yvals: Seq[Double], name: String): Plot = { 21 | object source extends ColumnDataSource { 22 | val x = column(xvals) 23 | val y = column(yvals) 24 | } 25 | 26 | import source.{ x, y } 27 | 28 | val xdr = new DataRange1d() 29 | val ydr = new DataRange1d() 30 | 31 | val line = new Line().x(x).y(y).line_color("#666699").line_width(2) 32 | 33 | val line_renderer = new GlyphRenderer() 34 | .data_source(source) 35 | .glyph(line) 36 | 37 | val plot = new Plot() 38 | .title(name).title_text_font_size(FontSize.apply(10, FontUnits.PT)) 39 | .x_range(xdr).y_range(ydr) 40 | .width(500).height(500) 41 | .border_fill(Color.White) 42 | .background_fill("#FFE8C7") 43 | 44 | val xaxis = new LinearAxis().plot(plot) 45 | val yaxis = new LinearAxis().plot(plot) 46 | plot.below <<= (xaxis :: _) 47 | plot.left <<= (yaxis :: _) 48 | val xgrid = new Grid().plot(plot).axis(xaxis).dimension(0) 49 | val ygrid = new Grid().plot(plot).axis(yaxis).dimension(1) 50 | 51 | val pantool = new PanTool().plot(plot) 52 | val 
wheelzoomtool = new WheelZoomTool().plot(plot) 53 | 54 | plot.tools := List(pantool, wheelzoomtool) 55 | plot.renderers := List(xaxis, yaxis, xgrid, ygrid, line_renderer) 56 | 57 | plot 58 | } 59 | 60 | def toPlot: List[Plot] = 61 | levels.flatMap(stats => if (stats.stats.points.size > 1) 62 | Some(lineImage( 63 | xvals = stats.stats.points.indices.map(_.toDouble), 64 | yvals = stats.stats.points.map(_._2.toDouble), 65 | name = name + " #" + stats.drillDownFilterValue 66 | )) 67 | else None) 68 | 69 | def toHumanReadable: String = levels.map(l => s"\n${"_" * 148}\n$name \tDrillDownValue : ${l.drillDownFilterValue}\n\t" + 70 | s"${l.stats |> prettyContent}").mkString("\n") 71 | 72 | def prettyContent(stats: PercentileStats): String = stats.points match { 73 | case points if points.isEmpty => "[EmptyStatistic]" 74 | case points if points.size == 1 => s"[SingleStats]: Value: ${stats.points.head._2}\n" 75 | 76 | case points if points.size > 1 => 77 | val min +: _ :+ max = stats.points.map(_._2) 78 | 79 | val pointsPerChar: Double = (max - min) / 100.0 80 | val fillChar = "#" 81 | 82 | val maxLabelLength = math.min(100, stats.points.map(_._1.length).max) + 5 83 | 84 | val histPretty: String = 85 | stats.points 86 | .zipWithIndex.map { 87 | case ((key, value), index) => 88 | val strIndex = "%5s".format(index) 89 | val label = s"%${maxLabelLength}s".format(key) 90 | val bars = "%101s".format(fillChar * (1 + ((value - min) / pointsPerChar).toInt)).reverse 91 | s"$label |$strIndex| $bars| $value" 92 | } 93 | .mkString("\n") 94 | 95 | s"[PercentileStats]: NumBinsInHistogram: ${stats.numBuckets.toString}\n$histPretty" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/stats/SequenceStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda.stats 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | import scalaz.Scalaz._ 7 | 8 | case object SequenceStats { 9 | val drillDownKeyAll = "$None$" 10 | 11 | def percentile[T: ClassTag, DimKey: ClassTag, V: ClassTag]( 12 | data: RDD[T], 13 | toDrillDownKeyOption: Option[T => String], 14 | toDimKey: T => DimKey, 15 | toVal: T => V, 16 | toStats: V => Long, 17 | reduceFunc: (V, V) => V, 18 | numPercentiles: Int 19 | ): List[PercentileStatsWithFilterLevel] = 20 | 21 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t)) -> toVal(t)) 22 | .reduceByKey(reduceFunc) 23 | .mapValues(toStats) 24 | .sortBy(_._2) 25 | .zipWithIndex() |> 26 | percentileStats(numPercentiles) |> 27 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled)) 28 | 29 | def distinct[T: ClassTag, DimKey: ClassTag, V: ClassTag]( 30 | data: RDD[T], 31 | toDrillDownKeyOption: Option[T => String], 32 | toDimKey: T => DimKey, 33 | toVal: T => V, 34 | numPercentiles: Int 35 | ): List[PercentileStatsWithFilterLevel] = 36 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t), toVal(t))) 37 | .distinct() 38 | .map(e => (e._1, e._2) -> Set(e._3)) 39 | .reduceByKey(_ ++ _) 40 | .mapValues(_.size.toLong) 41 | .sortBy(_._2) 42 | .zipWithIndex() |> 43 | percentileStats(numPercentiles) |> 44 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled)) 45 | 46 | def percentileStats[DimKey](numPercentiles: Int = 1001)(data: RDD[(((String, DimKey), Long), Long)]): Map[String, 
PercentileStats] = { 47 | 48 | val numBuckets: Map[String, Int] = 49 | data.map { 50 | case (((drillDownKey, _), _), _) => drillDownKey -> 1 51 | } 52 | .reduceByKey(_ + _) 53 | .collect() 54 | .toMap 55 | 56 | val drillDownKeyToIndices: Map[String, Set[Long]] = 57 | numBuckets.mapValues(percentileIndices(_, numPercentiles)).map(identity) 58 | 59 | data.filter { case (((drillDownKey, _), _), index) => drillDownKeyToIndices(drillDownKey).contains(index) } 60 | .map { 61 | case (((drillDownKey, dimKey), stats), _) => drillDownKey -> (dimKey.toString -> stats) 62 | } 63 | .collect() 64 | .groupBy(_._1) 65 | .mapValues(_.map(_._2).toList.sortBy(_._2)) 66 | .map { 67 | case (key, points) => key -> PercentileStats(points, numBuckets(key)) 68 | } 69 | } 70 | 71 | def percentileIndices(numElems: Long, numPercentiles: Int): Set[Long] = { 72 | val delta: Double = (1.0 / math.max(1, numPercentiles - 1)) * (numElems - 1) 73 | (0 until numPercentiles).map(el => (el * delta).round).toSet 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/DescriptiveStatsJob.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.PrettyPercentileStats 4 | import io.continuum.bokeh.{ Document, Plot, GridPlot } 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.{ SparkConf, SparkContext } 7 | import org.rogach.scallop.ScallopConf 8 | 9 | case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) { 10 | def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep) 11 | } 12 | 13 | case object Transaction { 14 | def apply(sep: String)(line: String): Transaction = line.split(sep, -1).toList match { 15 | case List(timestamp, customerId, businessId, postcode) => Transaction(timestamp.toLong, customerId.toLong, 16 | businessId.toLong, Some(postcode)) 17 | } 18 | } 19 | 20 | case object DescriptiveStatsJob extends Generators { 21 | def main(args: Array[String]) { 22 | val conf = new ScallopConf(args) { 23 | val delim = opt[String](default = Some("\t"), descr = "The delimiter character") 24 | val tmpFolder = opt[String](descr = "Overrides the directory used in spark.local.dir") 25 | 26 | val outputPath = opt[String](required = true, descr = "The output path for the EDA stats") 27 | val outputPathHuman = opt[String](descr = "The output path [Human readable] for the EDA stats") 28 | } 29 | 30 | conf.printHelp() 31 | AppLogger().info(conf.summary) 32 | 33 | val sparkConf = new SparkConf() 34 | .set("spark.akka.frameSize", "128") 35 | .setMaster("local") 36 | .set("spark.hadoop.validateOutputSpecs", "false") 37 | .set("spark.io.compression.codec", "lz4") 38 | .setAppName("Local Exploratory Data Analysis") 39 | 40 | if (conf.tmpFolder.isDefined) sparkConf.set("spark.local.dir", conf.tmpFolder()) 41 | @transient val sc: SparkContext = new SparkContext(sparkConf) 42 | 43 | val transactions = sc.parallelize(randomTransactions(80000).sample.get) 44 | AppLogger.logStage(transactions, "Finished generating the data points") 45 | 46 | val results: RDD[PrettyPercentileStats] = sc.makeRDD(List( 47 | Stats.txCountPerCustomerNDayStats(transactions, 101), 48 | Stats.txCountPerBusinessIdNDayStats(transactions, 101), 49 | 50 | Stats.uniqueBusinessIdPerPostcodeNDayStats(transactions, 101), 51 | Stats.uniqueCustomersPerBusinessIdNDayStats(transactions, 101), 52 | 
Stats.uniqueCustomerIdPerPostcodeNDayStats(transactions, 101), 53 | 54 | Stats.globalUniqueCustomersCounterStats(transactions, 101), 55 | Stats.globalUniqueBusinessesCounterStats(transactions, 101), 56 | Stats.globalUniquePostcodesCounterStats(transactions, 101) 57 | ).flatten, numSlices = 1) 58 | 59 | results.saveAsObjectFile(conf.outputPath()) 60 | conf.outputPathHuman.foreach(results.map(_.toHumanReadable).saveAsTextFile) 61 | 62 | savePlotLists(results, "/tmp/eda/human/") 63 | } 64 | 65 | def savePlotLists(results: RDD[PrettyPercentileStats], outFolder: String) = { 66 | val plotGrid: List[List[Plot]] = results.collect() 67 | .flatMap(_.toPlot) 68 | .zipWithIndex 69 | .groupBy(_._2 / 2) 70 | .map(_._2.map(_._1).toList).toList 71 | 72 | val grid = new GridPlot().children(plotGrid) 73 | 74 | val document = new Document(grid) 75 | val html = document.save(outFolder + "/" + "EDA_Stats_Results.html") 76 | AppLogger.infoLevel.info(s"Wrote EDA stats charts in ${html.file}. Open ${html.url} in a web browser.") 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/Generators.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.joda.time.{ DateTime, DateTimeZone, Interval, LocalDateTime } 4 | import org.scalacheck.{ Prop, Arbitrary, Gen } 5 | 6 | import scala.collection.immutable 7 | import scalaz.Scalaz._ 8 | 9 | trait Generators { 10 | val nTransactions = 1000000 11 | val nCustomers = 800 12 | val nBusinesses = 300 13 | 14 | val nonNegativeLongGen: Gen[Long] = Gen.choose[Long](0l, Long.MaxValue - 1) 15 | val nonNegativeIntGen: Gen[Int] = Gen.choose(0, 100) 16 | val positiveIntGen: Gen[Int] = Gen.choose(1, 100) 17 | 18 | val nonEmptyAlphaStr = Gen.nonEmptyListOf(Gen.alphaChar).map(_.mkString).suchThat(_.forall(_.isLetter)) 19 | 20 | val noOfDays = 23 21 | val dates = DateTime.now() |> (t0 => (0 until noOfDays).map(t0.minusDays)) 22 | 23 | val availablePostcodes = Gen.listOfN(100, Gen.listOfN(6, Gen.alphaNumChar).flatMap(_.mkString("").toUpperCase())) 24 | .sample.get 25 | 26 | val businesses: List[Int] = Gen.listOfN(nBusinesses, Gen.choose(0, 10000)).sample.get 27 | val customers: List[Int] = Gen.listOfN(nCustomers, Gen.chooseNum(10001, 20000)).sample.get 28 | 29 | def sameDayTimestampGen(date: DateTime): Gen[DateTime] = for { 30 | hourOfDay <- Gen.choose(0, 23) 31 | minuteOfHour <- Gen.choose(0, 59) 32 | } yield { 33 | val dtz = DateTimeZone.forID("Europe/London") 34 | val ldt = new LocalDateTime(date.getMillis, dtz).withTime(hourOfDay, minuteOfHour, 0, 0) 35 | (if (dtz.isLocalDateTimeGap(ldt)) ldt.plusHours(2) else ldt).toDateTime(dtz) 36 | } 37 | 38 | def timestampGen(interval: Interval) = for { 39 | date: DateTime <- Gen.choose(interval.getStart.getMillis, interval.getEnd.getMillis).map(new DateTime(_)) 40 | timestamp <- sameDayTimestampGen(date) 41 | } yield timestamp 42 | 43 | val timestampGen: Gen[DateTime] = for { 44 | date: DateTime <- Gen.oneOf(dates) 45 | timestamp <- sameDayTimestampGen(date) 46 | } yield timestamp 47 | 48 | val todayTimestampGen: Gen[DateTime] = for { 49 | hourOfDay <- Gen.choose(0, 23) 50 | minuteOfHour <- Gen.choose(0, 59) 51 | date = DateTime.now() 52 | timestamp = date.withTime(hourOfDay, minuteOfHour, 0, 0) 53 | } yield timestamp 54 | 55 | val postcodeGen: Gen[String] = Gen.oneOf(availablePostcodes) 56 | 57 | def postcodeFromSectorGen(sector: String) = 58 | Gen.listOfN(2, 
Gen.alphaNumChar).map(_.mkString).map(sector + _.toUpperCase) 59 | 60 | def randomTransactionGen(): Gen[Transaction] = for { 61 | timestamp <- timestampGen.map(_.getMillis) 62 | postcode <- postcodeGen 63 | businessId <- Gen.oneOf(businesses) 64 | customerId <- Gen.oneOf(customers) 65 | } yield Transaction(timestamp, customerId, businessId, Some(postcode)) 66 | 67 | def randomTransactions(n: Int): Gen[List[Transaction]] = Gen.listOfN(nTransactions, randomTransactionGen()) 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/Stats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.{ SequenceStats, PrettyPercentileStats } 4 | import org.apache.spark.rdd.RDD 5 | import org.joda.time.DateTime 6 | 7 | case object Stats { 8 | def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 9 | Some("Customer x Day - Count(Tx)") 10 | .map(n => PrettyPercentileStats( 11 | name = n, 12 | levels = SequenceStats.percentile[Transaction, (Long, String), Long]( 13 | data = AppLogger.logStage(transactions, n), 14 | toDrillDownKeyOption = None, 15 | toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)), 16 | toVal = _ => 1l, 17 | toStats = identity, 18 | reduceFunc = _ + _, 19 | numPercentiles = nPercentiles 20 | ) 21 | )) 22 | 23 | def txCountPerBusinessIdNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 24 | Some("BusinessId x Day - Count(Tx)") 25 | .map(n => PrettyPercentileStats( 26 | name = n, 27 | levels = SequenceStats.percentile[Transaction, (Long, String), Long]( 28 | data = AppLogger.logStage(transactions, n), 29 | toDrillDownKeyOption = None, 30 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)), 31 | toVal = r => 1l, 32 | toStats = identity, 33 | reduceFunc = _ + _, 34 | numPercentiles = nPercentiles 35 | ) 36 | )) 37 | 38 | def globalUniqueBusinessesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 39 | Some("Global distinct Businesses") 40 | .map(n => PrettyPercentileStats( 41 | name = n, 42 | levels = SequenceStats.distinct[Transaction, Long, Long]( 43 | data = AppLogger.logStage(transactions, n), 44 | toDrillDownKeyOption = None, 45 | toDimKey = t => 1l, 46 | toVal = tx => tx.businessId, 47 | numPercentiles = nPercentiles 48 | ) 49 | )) 50 | 51 | def globalUniquePostcodesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 52 | Some("Global distinct Postcodes") 53 | .map(n => PrettyPercentileStats( 54 | name = n, 55 | levels = SequenceStats.distinct[Transaction, Long, String]( 56 | data = AppLogger.logStage(transactions, n), 57 | toDrillDownKeyOption = None, 58 | toDimKey = t => 1l, 59 | toVal = tx => tx.postcode.get, 60 | numPercentiles = nPercentiles 61 | ) 62 | )) 63 | 64 | def globalUniqueCustomersCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 65 | Some("Global distinct Customers") 66 | .map(n => PrettyPercentileStats( 67 | name = n, 68 | levels = SequenceStats.distinct[Transaction, Long, Long]( 69 | data = AppLogger.logStage(transactions, n), 70 | toDrillDownKeyOption = None, 71 | toDimKey = t => 1l, 72 | toVal = tx => tx.customerId, 73 | numPercentiles = nPercentiles 74 | ) 75 | )) 76 | 77 | def uniqueCustomerIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 78 | Some("Postcode x Day - Distinct(CustomerID)") 79 | .map(n => 
PrettyPercentileStats( 80 | name = n, 81 | levels = SequenceStats.distinct[Transaction, (String, String), Long]( 82 | data = AppLogger.logStage(tx, n), 83 | toDrillDownKeyOption = None, 84 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)), 85 | toVal = tx => tx.customerId, 86 | numPercentiles = nPercentiles 87 | ) 88 | )) 89 | 90 | def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 91 | Some("Postcode x Day - Distinct(BusinessId)") 92 | .map(n => PrettyPercentileStats( 93 | name = n, 94 | levels = SequenceStats.distinct[Transaction, (String, String), Long]( 95 | data = AppLogger.logStage(tx, n), 96 | toDrillDownKeyOption = None, 97 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)), 98 | toVal = tx => tx.businessId, 99 | numPercentiles = nPercentiles 100 | ) 101 | )) 102 | 103 | def uniqueCustomersPerBusinessIdNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 104 | Some("BusinessId x Day - Distinct(CustomerId)") 105 | .map(n => PrettyPercentileStats( 106 | name = n, 107 | levels = SequenceStats.distinct[Transaction, (Long, String), Long]( 108 | data = AppLogger.logStage(tx, n), 109 | toDrillDownKeyOption = Some(tx => dayOfWeek(tx.timestamp)), 110 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)), 111 | toVal = tx => tx.customerId, 112 | numPercentiles = nPercentiles 113 | ) 114 | )) 115 | 116 | def dayAsString(millis: Long): String = new DateTime(millis).toString("yyyy-MM-dd") 117 | def dayOfWeek(millis: Long): String = new DateTime(millis).dayOfWeek().getAsText 118 | } 119 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/StatsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.{ SequenceStats, PercentileStats, PercentileStatsWithFilterLevel, PrettyPercentileStats } 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.specs2.matcher.ScalaCheckMatchers 7 | import org.specs2.mutable.Specification 8 | 9 | import scalaz.Scalaz._ 10 | 11 | case object StatsSpec extends Specification with ScalaCheckMatchers with TestUtils { 12 | "ExploratoryDataAnalysis" should { 13 | 14 | "correctly compute percentiles statistics on monotonic increasing 21 sample dataset" in { 15 | val rawData = 16 | (0l until 21).map((" ", _)) 17 | .map { 18 | case (drillDownKey, dimKey) => (((drillDownKey, dimKey), dimKey), dimKey) 19 | } 20 | .toList 21 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 22 | 23 | val res: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 21)(dataRDD).head._2 24 | res must_=== PercentileStats(points = rawData.map(el => (el._2.toString, el._2)), numBuckets = 21l) 25 | } 26 | 27 | "correctly compute the right number of elements in the percentiles statistics on 101 constant samples dataset" in { 28 | val key = " " 29 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((key, e), e), e)) 30 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 31 | 32 | val intRes = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD) 33 | val res = 34 | List(1, 2, 3, 5, 6, 7, 9, 11, 20, 21, 99, 100) 35 | .map(r => (r, SequenceStats.percentileStats[Long](numPercentiles = r)(dataRDD).get(key).get.points.size)) 36 | .filter(e => e._1 != e._2) 37 | 38 | res.foreach(el 
=> el._1 must_=== el._2) 39 | res.size must_=== 0 40 | } 41 | 42 | "correctly compute number percentiles when Num Percentiles is larger than samples" in { 43 | val numSamplesList = List(2, 4, 5, 10, 11, 13, 17, 21, 39, 55, 101) 44 | val res = numSamplesList.flatMap(numSample => { 45 | val ires = 46 | List(102, 103, 104, 101, 130, 200, 300, 500, 1000) 47 | .map(v => (numSample, SequenceStats.percentileIndices(numSample, v).size)) 48 | .filter(v => v._1 != v._2) 49 | 50 | ires.forall(el => el._1 must_=== el._2) 51 | ires.size must_=== 0 52 | ires 53 | }) 54 | if (res.nonEmpty) 55 | println("Numer of percentile indices" + res.map(el => s"Expected:${el._1} but got ${el._2} ").mkString("\n")) 56 | res.size must_=== 0 57 | } 58 | 59 | "correctly compute percentiles statistics on 101 constant samples dataset" in { 60 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((" ", e), 2l), e)) 61 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 62 | 63 | val result = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD).head._2 64 | val expected = PercentileStats(points = (0 until 101 by 10).toList.zipWithIndex.map(e => (e._1.toString, 2l)), numBuckets = 101) 65 | result must_=== expected 66 | } 67 | 68 | "correctly compute percentiles statistics on smaller dataset than stats" in { 69 | val rawData = (1l until 6 by 1).map(e => (((" ", e), e), e - 1)) 70 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 71 | 72 | val result: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 10)(dataRDD).head._2 73 | val expected = PercentileStats(points = (1l until 6 by 1).map(e => (e.toString, e)).toList, numBuckets = 5l) 74 | 75 | result must_=== expected 76 | } 77 | 78 | "correctly computes the number of Percentile Indices for increasing number of percentiles" in { 79 | val res = 80 | List(1, 2, 3, 5, 6, 7, 9, 11, 17, 20, 21, 40, 50, 51, 99, 100, 101) 81 | .map(v => (v, SequenceStats.percentileIndices(101, v).size)) 82 | .filter(v => v._1 != v._2) 83 | 84 | res.foreach(el => el._1 must_=== el._2) 85 | res.size must_=== 0 86 | } 87 | 88 | "correctly generate 10 Percentile Indexes from 1 to 10" in { 89 | val result: Set[Long] = SequenceStats.percentileIndices(10, 10) 90 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet 91 | 92 | result must_=== expected 93 | } 94 | 95 | "correctly generate 10 Percentile Indexes from 1 to 10 when requested 20 for a smaller dataset" in { 96 | val result: Set[Long] = SequenceStats.percentileIndices(10, 20) 97 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet 98 | 99 | result must_=== expected 100 | } 101 | 102 | "correctly generate 10 Percentile Indexes from 0 to 1000 by 100 when requested 10 for a 1001 dataset" in { 103 | val result: Set[Long] = SequenceStats.percentileIndices(1001, 11) 104 | val expected: Set[Long] = (0 until 1001 by 100).toList.map(_.toLong).toSet 105 | 106 | result must_=== expected 107 | } 108 | 109 | "correctly pretty prints humanReadable" in { 110 | val setEmpty = PrettyPercentileStats( 111 | name = "xxx", 112 | levels = List(PercentileStatsWithFilterLevel( 113 | "NONE", 114 | stats = PercentileStats(points = Nil, numBuckets = 0) 115 | )) 116 | ) 117 | 118 | val set1 = PrettyPercentileStats( 119 | name = "xxx", 120 | levels = List(PercentileStatsWithFilterLevel( 121 | "NONE", 122 | stats = PercentileStats(points = List(("Key1", 2l)), numBuckets = 1) 123 | )) 124 | ) 125 | 126 | val set2 = 
PrettyPercentileStats( 127 | name = "xxx", 128 | levels = List(PercentileStatsWithFilterLevel( 129 | "NONE", 130 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l)), numBuckets = 2) 131 | )) 132 | ) 133 | 134 | val set3 = PrettyPercentileStats( 135 | name = "xxx", 136 | levels = List(PercentileStatsWithFilterLevel( 137 | "NONE", 138 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l), ("Key3", 3l)), numBuckets = 3) 139 | )) 140 | ) 141 | 142 | List(setEmpty, set1, set2, set3) 143 | .map(e => e.levels.head.stats.points.size -> e.toHumanReadable) 144 | .map(e => e._1 -> (e._1 == e._2.split("\n").size + 2)) 145 | .count(_._2) must_=== 0 146 | } 147 | 148 | "correctly pretty prints for humans bad samples" in { 149 | val data = List( 150 | PrettyPercentileStats( 151 | name = "BusinessId x Day - Count(Tx)", 152 | levels = List(PercentileStatsWithFilterLevel( 153 | drillDownFilterValue = "DrillDownKey.ALL", 154 | stats = PercentileStats(points = List(("1", 1830683l)), numBuckets = 1l) 155 | )) 156 | ), 157 | 158 | PrettyPercentileStats( 159 | name = "Postcode x Day - Count(RichTx)", 160 | levels = List(PercentileStatsWithFilterLevel( 161 | drillDownFilterValue = "DrillDownKey.ALL", 162 | PercentileStats(points = List( 163 | (("YO126EE", "2014-12-02").toString(), 1l), 164 | (("CH441BA", "2014-09-23").toString(), 1l), (("LS287BJ", "2014-10-24").toString(), 1l), 165 | (("G156RX", "2014-01-08").toString(), 1l) 166 | ), numBuckets = 4) 167 | )) 168 | ) 169 | ) 170 | 171 | val hr = data.map(_.toHumanReadable) 172 | hr.foreach(println) 173 | hr.nonEmpty must_=== true 174 | } 175 | 176 | "correctly runs end to end SequenceStats.percentile on constant dataset" in { 177 | case class DataP(k: String, v: Int) 178 | val dataRDD = sc.parallelize((0 until 101).map(el => DataP(k = el.toString, v = el))) 179 | 180 | @transient implicit lazy val isc: SparkContext = sc 181 | 182 | val res = SequenceStats.percentile[DataP, String, Long]( 183 | data = dataRDD, 184 | toDrillDownKeyOption = None, 185 | toDimKey = _.k, 186 | toVal = _ => 1l, 187 | toStats = identity, 188 | reduceFunc = _ |+| _, 189 | numPercentiles = 10 190 | ) 191 | 192 | println(PrettyPercentileStats(name = "Constant Dataset", levels = res).toHumanReadable) 193 | 194 | val expected = PercentileStats(points = (0 until 10).toList.map(e => (e.toString, 1l)), numBuckets = 101l) 195 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2) 196 | } 197 | 198 | "correctly runs end to end SequenceStats.percentile on increasing 10 bucket dataset" in { 199 | case class DataP(k: String, v: Int) 200 | val dataRDD = sc.parallelize((1 until 11).flatMap(el => (0 until el).map(num => DataP(k = el.toString, v = 1)))) 201 | 202 | @transient implicit lazy val isc: SparkContext = sc 203 | 204 | val res = SequenceStats.percentile[DataP, String, Long]( 205 | data = dataRDD, 206 | toDrillDownKeyOption = None, 207 | toDimKey = _.k, 208 | toVal = _ => 1l, 209 | toStats = identity, 210 | reduceFunc = _ + _, 211 | numPercentiles = 10 212 | ) 213 | 214 | println(PrettyPercentileStats(name = "", levels = res).toHumanReadable) 215 | 216 | val expected = PercentileStats( 217 | points = (1 until 11).toList.map(e => (e.toString, e.toLong)), 218 | numBuckets = 10l 219 | ) 220 | 221 | res.head.stats must_=== expected 222 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2) 223 | } 224 | } 225 | } 226 | -------------------------------------------------------------------------------- 
/src/test/scala/com/github/vicpara/eda/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | 5 | object StaticSparkContext { 6 | val staticSc = new SparkContext( 7 | new SparkConf().setMaster("local") 8 | .set("spark.ui.port", "14321") 9 | .set("spark.eventLog.dir", System.getProperty("java.io.tmpdir")) 10 | .set("spark.io.compression.codec", "lz4") 11 | .setAppName("Test Local Spark Context") 12 | ) 13 | } 14 | 15 | trait TestUtils { 16 | implicit class PimpedDouble(d: Double) { 17 | def roundDecimal = "%.2f".format(d).toDouble 18 | } 19 | 20 | @transient lazy val sc = StaticSparkContext.staticSc 21 | } 22 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.0.1-SNAPSHOT" --------------------------------------------------------------------------------