├── .gitignore
├── .travis.yml
├── Histogram.png
├── LICENSE
├── README.md
├── build.sbt
├── no-test-sbt.sh
├── project
│   └── plugins.sbt
├── sbt-offline.sh
├── src
│   ├── main
│   │   └── scala
│   │       └── com
│   │           └── github
│   │               └── vicpara
│   │                   └── eda
│   │                       ├── AppLogger.scala
│   │                       └── stats
│   │                           ├── PercentileStats.scala
│   │                           └── SequenceStats.scala
│   └── test
│       └── scala
│           └── com
│               └── github
│                   └── vicpara
│                       └── eda
│                           ├── DescriptiveStatsJob.scala
│                           ├── Generators.scala
│                           ├── Stats.scala
│                           ├── StatsSpec.scala
│                           └── TestUtils.scala
└── version.sbt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | .project 24 | scala/projectFilesBackup/ 25 | scala/src/sandbox.sc 26 | 27 | 28 | # C extensions 29 | *.so 30 | 31 | .idea/* 32 | */.idea/* 33 | */.idea_modules/* 34 | project/project/* 35 | project/target/* 36 | */target/* 37 | 38 | 39 | # Notebook temp files 40 | *.ipynb_checkpoints/* 41 | *.dump 42 | # Data 43 | *.xls 44 | *.xlsx 45 | *.zip 46 | *.out 47 | 48 | 49 | # Distribution / packaging 50 | .Python 51 | env/ 52 | build/ 53 | develop-eggs/ 54 | dist/ 55 | downloads/ 56 | eggs/ 57 | .eggs/ 58 | lib/ 59 | lib64/ 60 | parts/ 61 | sdist/ 62 | var/ 63 | *.egg-info/ 64 | .installed.cfg 65 | *.egg 66 | 67 | 68 | # PyInstaller 69 | # Usually these files are written by a python script from a template 70 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 71 | *.manifest 72 | *.spec 73 | 74 | 75 | # Installer logs 76 | pip-log.txt 77 | pip-delete-this-directory.txt 78 | 79 | 80 | # Unit test / coverage reports 81 | htmlcov/ 82 | .tox/ 83 | .coverage 84 | .coverage.* 85 | .cache 86 | nosetests.xml 87 | coverage.xml 88 | *,cover 89 | 90 | 91 | # Translations 92 | *.mo 93 | *.pot 94 | 95 | # Django stuff: 96 | *.log 97 | 98 | 99 | # Sphinx documentation 100 | docs/_build/ 101 | 102 | # PyBuilder 103 | target/ 104 | .Rhistory 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # test execution script.
2 | 3 | language: scala 4 | jdk: 5 | - oraclejdk8 6 | 7 | sudo: false 8 | before_install: umask 0022 9 | scala: 10 | - 2.10.4 11 | - 2.11.7 12 | script: 13 | - "echo no op" 14 | 15 | # whitelist 16 | branches: 17 | only: 18 | - master 19 | - develop -------------------------------------------------------------------------------- /Histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vicpara/exploratory-data-analysis/e071c4cf92a6f2462cfa509855e59cbd17c4d466/Histogram.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Victor Paraschiv (https://github.com/vicpara/) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# Exploratory-Data-Analysis - EDA
[![Build Status](https://travis-ci.org/vicpara/exploratory-data-analysis.svg?branch=master)](https://travis-ci.org/vicpara/exploratory-data-analysis) [![Licence](https://img.shields.io/badge/licence-MIT-blue.svg)](https://tldrlegal.com/license/mit-license)

A Spark library for doing exploratory data analysis on your data set in a scalable way.


## Getting Exploratory Data Analysis

If you're using SBT, add the following line to your build file:

```scala
libraryDependencies += "com.github.vicpara" % "exploratory-data-analysis_2.10" % "1.0.0"
```

For Maven and other build tools, you can visit [search.maven.org](http://search.maven.org/#search%7Cga%7C1%7Cexploratory%20data%20analysis).
To get sample configurations, click on the version of the module you are interested in.

-----
### The problem

Before running any algorithm on a data set it is essential to understand what the data looks like and what the expected edge cases are.
Maybe you have some expectations about the distribution of the numbers. Are all the buckets that you expect to find in the dataset populated with numbers?
Very often a data set is provided which contains two types of columns:

* dimensions
* measurements

Dimensions are columns that help describe the data points, such as: Date, UniqueID, Gender, Postcode, Country, Categories.
Measurements are metrics that quantitatively characterize the described datum.

Depending on your analytical task, some sort of ordering or bucketing will be used in your model. Some examples:

* process the daily transactions of a customer
* aggregate the daily transactions of a merchant
* count the number of unique customers per merchant, postcode, sector or district
* count the number of transactions per day

Let's take as an example the metric: number of daily transactions per merchant.

It is made of two concepts: a key, which is (dayId, merchantId), and a value, count(transactions).
Depending on the richness of your dataset, there may be keys with a high number of transactions and keys with a very small number. This becomes a problem especially when you are dealing with a sample dataset which promises to capture the entire diversity of the original, complete dataset.

-----
### The library

For a collection / RDD[T], given:

* a function that maps an element of T to a *key*
* a function that maps an element of T to a *value* that corresponds to the *key*
* a function that aggregates two *values* in a monoidal fashion

the library can compute two types of statistics:

* the percentile values of the ordered sequence of key statistics
* the count of distinct values for each key

The results can be saved in three formats:

* object file
* pretty text format ready for inspection
* HTML files

We use the *pretty* format when examining the results in bash on remote servers or HDFS. *Pretty* looks like this:

```
BusinessId x Day - Distinct(CustomerId) DrillDownValue : Tuesday
[PercentileStats]: NumBinsInHistogram: 1188
(9191,2015-10-27) | 0| ####### | 44
(6305,2015-11-10) | 1| ###################### | 51
(6774,2015-11-03) | 2| ########################### | 53
(4278,2015-11-03) | 3| ################################# | 54
(9191,2015-11-03) | 4| ################################# | 54
(4687,2015-11-17) | 5| ################################### | 55
(380,2015-11-03) | 6| ###################################### | 56
(8114,2015-11-03) | 7| ############################################ | 57
(5629,2015-10-27) | 8| ############################################ | 57
(404,2015-11-03) | 9| ############################################### | 58
(7586,2015-11-10) | 10| ############################################### | 58
(3765,2015-11-10) | 11| ############################################### | 58
(8478,2015-11-17) | 12| ############################################### | 58
(3701,2015-10-27) | 13| ###################################################### | 60
```

Using `scala-bokeh`, the library can also output the results of the EDA as HTML files for easier examination.
![Histogram chart example](https://github.com/vicpara/exploratory-data-analysis/blob/master/Histogram.png)

You can find working examples for all of these capabilities in the *test* folder. A percentile statistic example is shown next, followed by a distinct statistic example.
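
#### Example percentile statistic

In addition to the key and value functions, the percentile statistic takes a `toStats` function that turns the aggregated value into a `Long` and a `reduceFunc` that merges two values belonging to the same key. The snippet below is copied from `Stats.scala` in the *test* folder; it counts the transactions of every (customerId, day) key and reports the percentiles of those counts. `Transaction` is the datum defined in the next example, and `dayAsString` and `AppLogger.logStage` are small helpers that live in the same test sources.

```scala
def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Customer x Day - Count(Tx)")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.percentile[Transaction, (Long, String), Long](
        data = AppLogger.logStage(transactions, n), // logs the stage name and returns the RDD unchanged
        toDrillDownKeyOption = None,                // no drill-down segmentation
        toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)),
        toVal = _ => 1l,                            // every transaction contributes 1
        toStats = identity,
        reduceFunc = _ + _,                         // summing the 1s yields the per-key transaction count
        numPercentiles = nPercentiles
      )
    ))
```

Because `toVal` emits `1l` and `reduceFunc` sums, the value attached to each (customerId, day) key is its transaction count; the library then sorts those counts and keeps `nPercentiles` evenly spaced ranks.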
#### Example distinct statistic

For a datum of type `Transaction`

```scala
case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) {
  def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep)
}
```

we define a distinct statistic over the number of unique businesses (distinct businessId) inside a *postcode* for a given *day*, for each unique value of the key *toDrillDownKeyOption*. The value of this field would normally be a categorical segmentation axis which is expected to have relatively small cardinality. Examples of fields that could be used for toDrillDownKeyOption are country / city / type of product / region.

```scala
def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Postcode x Day - Distinct(BusinessId)")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.distinct[Transaction, (String, String), Long](
        data = tx,
        toDrillDownKeyOption = None,
        toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)),
        toVal = tx => tx.businessId,
        numPercentiles = nPercentiles
      )
    ))
```

#### Example global distinct counter

The following statistic counts the distinct customers across the whole dataset by mapping every transaction to the same dimension key:

```scala
def globalUniqueCustomersCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("Global distinct Customers")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.distinct[Transaction, Long, Long](data = AppLogger.logStage(transactions, n),
        toDrillDownKeyOption = None,
        toDimKey = t => 1l,
        toVal = tx => tx.customerId,
        numPercentiles = nPercentiles
      )
    ))
```
-------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.typesafe.sbt.SbtScalariform 2 | 3 | sonatypeProfileName := "com.github.vicpara" 4 | 5 | organization := "com.github.vicpara" 6 | 7 | name := "exploratory-data-analysis" 8 | 9 | releaseVersionFile := file("version.sbt") 10 | 11 | scalaVersion := "2.10.4" 12 | 13 | crossScalaVersions := Seq("2.11.7", "2.10.4") 14 | 15 | publishMavenStyle := true 16 | 17 | publishTo <<= version { (v: String) => 18 | val nexus = "https://oss.sonatype.org/" 19 | if (v.trim.endsWith("-SNAPSHOT")) 20 | Some("snapshots" at nexus + "content/repositories/snapshots") 21 | else 22 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 23 | } 24 | 25 | publishArtifact in Test := false 26 | 27 | libraryDependencies ++= Seq( 28 | "joda-time" % "joda-time" % "2.6" withSources() withJavadoc(), 29 | "org.joda" % "joda-convert" % "1.2" withSources() withJavadoc(), 30 | "org.apache.spark" % "spark-core_2.10" % "1.3.0-cdh5.4.4" withSources() withJavadoc(), 31 | "org.apache.commons" % "commons-csv" % "1.2" withSources() withJavadoc(), 32 | "com.github.scala-incubator.io" %% "scala-io-core" % "0.4.3" withSources() withJavadoc(), 33 | "com.github.scala-incubator.io" %% "scala-io-file" % "0.4.3" withSources() withJavadoc(), 34 | "com.rockymadden.stringmetric" %% "stringmetric-core" % "0.27.3" withSources() withJavadoc(), 35 | "org.scalaz" %% "scalaz-core" % "7.0.6" withSources() withJavadoc(), 36 | "org.rogach" %% "scallop" % "0.9.5" withSources() withJavadoc(), 37 | "org.scala-lang" % "scalap" % "2.10.4" withSources() withJavadoc(), 38 |
"org.scala-lang" % "scala-compiler" % "2.10.4" withSources() withJavadoc(), 39 | "com.github.tototoshi" %% "scala-csv" % "1.2.2" withSources() withJavadoc(), 40 | "org.specs2" %% "specs2-core" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 41 | "org.specs2" %% "specs2-scalacheck" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(), 42 | "io.spray" %% "spray-json" % "1.3.1" withSources() withJavadoc(), 43 | "org.scalaj" %% "scalaj-http" % "1.1.5" withSources() withJavadoc(), 44 | "io.continuum.bokeh" %% "bokeh" % "0.6" withSources() withJavadoc() 45 | ) 46 | 47 | resolvers ++= Seq( 48 | "mvnrepository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 49 | "Maven Central" at "https://repo1.maven.org/maven2/", 50 | "Sonatype OSS Releases" at "https://oss.sonatype.org/service/local/staging/deploy/maven2" 51 | ) 52 | 53 | assemblyMergeStrategy in assembly := { 54 | case el if el.contains("fasterxml.jackson.core") => MergeStrategy.first 55 | case el if el.contains("guava") => MergeStrategy.first 56 | 57 | case x if Assembly.isConfigFile(x) => MergeStrategy.concat 58 | case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => MergeStrategy.rename 59 | case PathList("META-INF", xs@_*) => (xs map { 60 | _.toLowerCase 61 | }) match { 62 | case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => MergeStrategy.discard 63 | case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => MergeStrategy.discard 64 | case "plexus" :: xs => MergeStrategy.discard 65 | case "services" :: xs => MergeStrategy.filterDistinctLines 66 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => MergeStrategy.filterDistinctLines 67 | case _ => MergeStrategy.first // Changed deduplicate to first 68 | } 69 | case PathList(_*) => MergeStrategy.first // added this line 70 | } 71 | 72 | pomIncludeRepository := { _ => false } 73 | 74 | pomExtra := ( 75 | http://github.com/vicpara/exploratory-data-analysis 76 | 77 | 78 | Apache License, Version 2.0 79 | http://www.apache.org/licenses/LICENSE-2.0.html 80 | repo 81 | 82 | 83 | 84 | git@github.com:vicpara/exploratory-data-analysis.git 85 | scm:git:git@github.com:vicpara/exploratory-data-analysis.git 86 | 87 | 88 | 89 | vicpara 90 | Victor Paraschiv 91 | http://github.com/vicpara 92 | 93 | ) 94 | 95 | scalariformSettings 96 | 97 | -------------------------------------------------------------------------------- /no-test-sbt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sbt "set test in assembly := {}" "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*" 3 | 4 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | scalacOptions += "-deprecation" 2 | 3 | addSbtPlugin("org.scalariform" % "sbt-scalariform" % "1.6.0") 4 | 5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0") 6 | 7 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.1") 8 | 9 | addSbtPlugin("com.typesafe.sbt" % "sbt-osgi" % "0.7.0") 10 | 11 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.4.0") 12 | 13 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3") 14 | 15 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") 16 | 17 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5") 18 | 19 | 
addSbtPlugin("com.github.tkawachi" % "sbt-doctest" % "0.3.2") 20 | 21 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1") 22 | 23 | fullResolvers ~= {_.filterNot(_.name == "jcenter")} 24 | -------------------------------------------------------------------------------- /sbt-offline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sbt "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*" 3 | 4 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/AppLogger.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.apache.log4j.{ Logger, PatternLayout, ConsoleAppender, Level } 4 | import org.apache.spark.rdd.RDD 5 | 6 | case object AppLogger { 7 | def getLogger(level: Level = Level.INFO): Logger = { 8 | val level = org.apache.log4j.Level.INFO 9 | val logger = org.apache.log4j.Logger.getLogger("nlp-topic-modelling") 10 | logger.setLevel(level) 11 | 12 | val capp = new ConsoleAppender() 13 | capp.setName(s"ConsoleAppender ${level.toString}") 14 | capp.setLayout(new PatternLayout("%d %m%n")) 15 | capp.setThreshold(level) 16 | capp.activateOptions() 17 | logger.addAppender(capp) 18 | logger 19 | } 20 | 21 | def logStage[T](data: RDD[T], message: String): RDD[T] = { 22 | infoLevel.info(message) 23 | data 24 | } 25 | 26 | def apply(): org.apache.log4j.Logger = infoLevel 27 | val infoLevel = getLogger() 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/stats/PercentileStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda.stats 2 | 3 | import io.continuum.bokeh._ 4 | 5 | import scalaz.Scalaz._ 6 | 7 | case class PercentileStats(points: List[(String, Long)], numBuckets: Long) { 8 | def isSingleValue = points match { 9 | case pts if pts.size == 1 => true 10 | case _ => false 11 | } 12 | } 13 | 14 | case class PercentileStatsWithFilterLevel(drillDownFilterValue: String, stats: PercentileStats) { 15 | def isSingleStat = drillDownFilterValue.equals(SequenceStats.drillDownKeyAll) 16 | def isSingleValue = stats.points.size == 1 17 | } 18 | 19 | case class PrettyPercentileStats(name: String, levels: List[PercentileStatsWithFilterLevel]) { 20 | def lineImage(xvals: Seq[Double], yvals: Seq[Double], name: String): Plot = { 21 | object source extends ColumnDataSource { 22 | val x = column(xvals) 23 | val y = column(yvals) 24 | } 25 | 26 | import source.{ x, y } 27 | 28 | val xdr = new DataRange1d() 29 | val ydr = new DataRange1d() 30 | 31 | val line = new Line().x(x).y(y).line_color("#666699").line_width(2) 32 | 33 | val line_renderer = new GlyphRenderer() 34 | .data_source(source) 35 | .glyph(line) 36 | 37 | val plot = new Plot() 38 | .title(name).title_text_font_size(FontSize.apply(10, FontUnits.PT)) 39 | .x_range(xdr).y_range(ydr) 40 | .width(500).height(500) 41 | .border_fill(Color.White) 42 | .background_fill("#FFE8C7") 43 | 44 | val xaxis = new LinearAxis().plot(plot) 45 | val yaxis = new LinearAxis().plot(plot) 46 | plot.below <<= (xaxis :: _) 47 | plot.left <<= (yaxis :: _) 48 | val xgrid = new Grid().plot(plot).axis(xaxis).dimension(0) 49 | val ygrid = new Grid().plot(plot).axis(yaxis).dimension(1) 50 | 51 | val pantool = new PanTool().plot(plot) 52 | val 
wheelzoomtool = new WheelZoomTool().plot(plot) 53 | 54 | plot.tools := List(pantool, wheelzoomtool) 55 | plot.renderers := List(xaxis, yaxis, xgrid, ygrid, line_renderer) 56 | 57 | plot 58 | } 59 | 60 | def toPlot: List[Plot] = 61 | levels.flatMap(stats => if (stats.stats.points.size > 1) 62 | Some(lineImage( 63 | xvals = stats.stats.points.indices.map(_.toDouble), 64 | yvals = stats.stats.points.map(_._2.toDouble), 65 | name = name + " #" + stats.drillDownFilterValue 66 | )) 67 | else None) 68 | 69 | def toHumanReadable: String = levels.map(l => s"\n${"_" * 148}\n$name \tDrillDownValue : ${l.drillDownFilterValue}\n\t" + 70 | s"${l.stats |> prettyContent}").mkString("\n") 71 | 72 | def prettyContent(stats: PercentileStats): String = stats.points match { 73 | case points if points.isEmpty => "[EmptyStatistic]" 74 | case points if points.size == 1 => s"[SingleStats]: Value: ${stats.points.head._2}\n" 75 | 76 | case points if points.size > 1 => 77 | val min +: _ :+ max = stats.points.map(_._2) 78 | 79 | val pointsPerChar: Double = (max - min) / 100.0 80 | val fillChar = "#" 81 | 82 | val maxLabelLength = math.min(100, stats.points.map(_._1.length).max) + 5 83 | 84 | val histPretty: String = 85 | stats.points 86 | .zipWithIndex.map { 87 | case ((key, value), index) => 88 | val strIndex = "%5s".format(index) 89 | val label = s"%${maxLabelLength}s".format(key) 90 | val bars = "%101s".format(fillChar * (1 + ((value - min) / pointsPerChar).toInt)).reverse 91 | s"$label |$strIndex| $bars| $value" 92 | } 93 | .mkString("\n") 94 | 95 | s"[PercentileStats]: NumBinsInHistogram: ${stats.numBuckets.toString}\n$histPretty" 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/com/github/vicpara/eda/stats/SequenceStats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda.stats 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.reflect.ClassTag 6 | import scalaz.Scalaz._ 7 | 8 | case object SequenceStats { 9 | val drillDownKeyAll = "$None$" 10 | 11 | def percentile[T: ClassTag, DimKey: ClassTag, V: ClassTag]( 12 | data: RDD[T], 13 | toDrillDownKeyOption: Option[T => String], 14 | toDimKey: T => DimKey, 15 | toVal: T => V, 16 | toStats: V => Long, 17 | reduceFunc: (V, V) => V, 18 | numPercentiles: Int 19 | ): List[PercentileStatsWithFilterLevel] = 20 | 21 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t)) -> toVal(t)) 22 | .reduceByKey(reduceFunc) 23 | .mapValues(toStats) 24 | .sortBy(_._2) 25 | .zipWithIndex() |> 26 | percentileStats(numPercentiles) |> 27 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled)) 28 | 29 | def distinct[T: ClassTag, DimKey: ClassTag, V: ClassTag]( 30 | data: RDD[T], 31 | toDrillDownKeyOption: Option[T => String], 32 | toDimKey: T => DimKey, 33 | toVal: T => V, 34 | numPercentiles: Int 35 | ): List[PercentileStatsWithFilterLevel] = 36 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t), toVal(t))) 37 | .distinct() 38 | .map(e => (e._1, e._2) -> Set(e._3)) 39 | .reduceByKey(_ ++ _) 40 | .mapValues(_.size.toLong) 41 | .sortBy(_._2) 42 | .zipWithIndex() |> 43 | percentileStats(numPercentiles) |> 44 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled)) 45 | 46 | def percentileStats[DimKey](numPercentiles: Int = 1001)(data: RDD[(((String, DimKey), Long), Long)]): Map[String, 
PercentileStats] = { 47 | 48 | val numBuckets: Map[String, Int] = 49 | data.map { 50 | case (((drillDownKey, _), _), _) => drillDownKey -> 1 51 | } 52 | .reduceByKey(_ + _) 53 | .collect() 54 | .toMap 55 | 56 | val drillDownKeyToIndices: Map[String, Set[Long]] = 57 | numBuckets.mapValues(percentileIndices(_, numPercentiles)).map(identity) 58 | 59 | data.filter { case (((drillDownKey, _), _), index) => drillDownKeyToIndices(drillDownKey).contains(index) } 60 | .map { 61 | case (((drillDownKey, dimKey), stats), _) => drillDownKey -> (dimKey.toString -> stats) 62 | } 63 | .collect() 64 | .groupBy(_._1) 65 | .mapValues(_.map(_._2).toList.sortBy(_._2)) 66 | .map { 67 | case (key, points) => key -> PercentileStats(points, numBuckets(key)) 68 | } 69 | } 70 | 71 | def percentileIndices(numElems: Long, numPercentiles: Int): Set[Long] = { 72 | val delta: Double = (1.0 / math.max(1, numPercentiles - 1)) * (numElems - 1) 73 | (0 until numPercentiles).map(el => (el * delta).round).toSet 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/DescriptiveStatsJob.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.PrettyPercentileStats 4 | import io.continuum.bokeh.{ Document, Plot, GridPlot } 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.{ SparkConf, SparkContext } 7 | import org.rogach.scallop.ScallopConf 8 | 9 | case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) { 10 | def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep) 11 | } 12 | 13 | case object Transaction { 14 | def apply(sep: String)(line: String): Transaction = line.split(sep, -1).toList match { 15 | case List(timestamp, customerId, businessId, postcode) => Transaction(timestamp.toLong, customerId.toLong, 16 | businessId.toLong, Some(postcode)) 17 | } 18 | } 19 | 20 | case object DescriptiveStatsJob extends Generators { 21 | def main(args: Array[String]) { 22 | val conf = new ScallopConf(args) { 23 | val delim = opt[String](default = Some("\t"), descr = "The delimiter character") 24 | val tmpFolder = opt[String](descr = "Overrides the directory used in spark.local.dir") 25 | 26 | val outputPath = opt[String](required = true, descr = "The output path for the EDA stats") 27 | val outputPathHuman = opt[String](descr = "The output path [Human readable] for the EDA stats") 28 | } 29 | 30 | conf.printHelp() 31 | AppLogger().info(conf.summary) 32 | 33 | val sparkConf = new SparkConf() 34 | .set("spark.akka.frameSize", "128") 35 | .setMaster("local") 36 | .set("spark.hadoop.validateOutputSpecs", "false") 37 | .set("spark.io.compression.codec", "lz4") 38 | .setAppName("Local Exploratory Data Analysis") 39 | 40 | if (conf.tmpFolder.isDefined) sparkConf.set("spark.local.dir", conf.tmpFolder()) 41 | @transient val sc: SparkContext = new SparkContext(sparkConf) 42 | 43 | val transactions = sc.parallelize(randomTransactions(80000).sample.get) 44 | AppLogger.logStage(transactions, "Finished generating the data points") 45 | 46 | val results: RDD[PrettyPercentileStats] = sc.makeRDD(List( 47 | Stats.txCountPerCustomerNDayStats(transactions, 101), 48 | Stats.txCountPerBusinessIdNDayStats(transactions, 101), 49 | 50 | Stats.uniqueBusinessIdPerPostcodeNDayStats(transactions, 101), 51 | Stats.uniqueCustomersPerBusinessIdNDayStats(transactions, 101), 52 | 
Stats.uniqueCustomerIdPerPostcodeNDayStats(transactions, 101), 53 | 54 | Stats.globalUniqueCustomersCounterStats(transactions, 101), 55 | Stats.globalUniqueBusinessesCounterStats(transactions, 101), 56 | Stats.globalUniquePostcodesCounterStats(transactions, 101) 57 | ).flatten, numSlices = 1) 58 | 59 | results.saveAsObjectFile(conf.outputPath()) 60 | conf.outputPathHuman.foreach(results.map(_.toHumanReadable).saveAsTextFile) 61 | 62 | savePlotLists(results, "/tmp/eda/human/") 63 | } 64 | 65 | def savePlotLists(results: RDD[PrettyPercentileStats], outFolder: String) = { 66 | val plotGrid: List[List[Plot]] = results.collect() 67 | .flatMap(_.toPlot) 68 | .zipWithIndex 69 | .groupBy(_._2 / 2) 70 | .map(_._2.map(_._1).toList).toList 71 | 72 | val grid = new GridPlot().children(plotGrid) 73 | 74 | val document = new Document(grid) 75 | val html = document.save(outFolder + "/" + "EDA_Stats_Results.html") 76 | AppLogger.infoLevel.info(s"Wrote EDA stats charts in ${html.file}. Open ${html.url} in a web browser.") 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/Generators.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.joda.time.{ DateTime, DateTimeZone, Interval, LocalDateTime } 4 | import org.scalacheck.{ Prop, Arbitrary, Gen } 5 | 6 | import scala.collection.immutable 7 | import scalaz.Scalaz._ 8 | 9 | trait Generators { 10 | val nTransactions = 1000000 11 | val nCustomers = 800 12 | val nBusinesses = 300 13 | 14 | val nonNegativeLongGen: Gen[Long] = Gen.choose[Long](0l, Long.MaxValue - 1) 15 | val nonNegativeIntGen: Gen[Int] = Gen.choose(0, 100) 16 | val positiveIntGen: Gen[Int] = Gen.choose(1, 100) 17 | 18 | val nonEmptyAlphaStr = Gen.nonEmptyListOf(Gen.alphaChar).map(_.mkString).suchThat(_.forall(_.isLetter)) 19 | 20 | val noOfDays = 23 21 | val dates = DateTime.now() |> (t0 => (0 until noOfDays).map(t0.minusDays)) 22 | 23 | val availablePostcodes = Gen.listOfN(100, Gen.listOfN(6, Gen.alphaNumChar).flatMap(_.mkString("").toUpperCase())) 24 | .sample.get 25 | 26 | val businesses: List[Int] = Gen.listOfN(nBusinesses, Gen.choose(0, 10000)).sample.get 27 | val customers: List[Int] = Gen.listOfN(nCustomers, Gen.chooseNum(10001, 20000)).sample.get 28 | 29 | def sameDayTimestampGen(date: DateTime): Gen[DateTime] = for { 30 | hourOfDay <- Gen.choose(0, 23) 31 | minuteOfHour <- Gen.choose(0, 59) 32 | } yield { 33 | val dtz = DateTimeZone.forID("Europe/London") 34 | val ldt = new LocalDateTime(date.getMillis, dtz).withTime(hourOfDay, minuteOfHour, 0, 0) 35 | (if (dtz.isLocalDateTimeGap(ldt)) ldt.plusHours(2) else ldt).toDateTime(dtz) 36 | } 37 | 38 | def timestampGen(interval: Interval) = for { 39 | date: DateTime <- Gen.choose(interval.getStart.getMillis, interval.getEnd.getMillis).map(new DateTime(_)) 40 | timestamp <- sameDayTimestampGen(date) 41 | } yield timestamp 42 | 43 | val timestampGen: Gen[DateTime] = for { 44 | date: DateTime <- Gen.oneOf(dates) 45 | timestamp <- sameDayTimestampGen(date) 46 | } yield timestamp 47 | 48 | val todayTimestampGen: Gen[DateTime] = for { 49 | hourOfDay <- Gen.choose(0, 23) 50 | minuteOfHour <- Gen.choose(0, 59) 51 | date = DateTime.now() 52 | timestamp = date.withTime(hourOfDay, minuteOfHour, 0, 0) 53 | } yield timestamp 54 | 55 | val postcodeGen: Gen[String] = Gen.oneOf(availablePostcodes) 56 | 57 | def postcodeFromSectorGen(sector: String) = 58 | Gen.listOfN(2, 
Gen.alphaNumChar).map(_.mkString).map(sector + _.toUpperCase) 59 | 60 | def randomTransactionGen(): Gen[Transaction] = for { 61 | timestamp <- timestampGen.map(_.getMillis) 62 | postcode <- postcodeGen 63 | businessId <- Gen.oneOf(businesses) 64 | customerId <- Gen.oneOf(customers) 65 | } yield Transaction(timestamp, customerId, businessId, Some(postcode)) 66 | 67 | def randomTransactions(n: Int): Gen[List[Transaction]] = Gen.listOfN(nTransactions, randomTransactionGen()) 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/Stats.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.{ SequenceStats, PrettyPercentileStats } 4 | import org.apache.spark.rdd.RDD 5 | import org.joda.time.DateTime 6 | 7 | case object Stats { 8 | def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 9 | Some("Customer x Day - Count(Tx)") 10 | .map(n => PrettyPercentileStats( 11 | name = n, 12 | levels = SequenceStats.percentile[Transaction, (Long, String), Long]( 13 | data = AppLogger.logStage(transactions, n), 14 | toDrillDownKeyOption = None, 15 | toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)), 16 | toVal = _ => 1l, 17 | toStats = identity, 18 | reduceFunc = _ + _, 19 | numPercentiles = nPercentiles 20 | ) 21 | )) 22 | 23 | def txCountPerBusinessIdNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 24 | Some("BusinessId x Day - Count(Tx)") 25 | .map(n => PrettyPercentileStats( 26 | name = n, 27 | levels = SequenceStats.percentile[Transaction, (Long, String), Long]( 28 | data = AppLogger.logStage(transactions, n), 29 | toDrillDownKeyOption = None, 30 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)), 31 | toVal = r => 1l, 32 | toStats = identity, 33 | reduceFunc = _ + _, 34 | numPercentiles = nPercentiles 35 | ) 36 | )) 37 | 38 | def globalUniqueBusinessesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 39 | Some("Global distinct Businesses") 40 | .map(n => PrettyPercentileStats( 41 | name = n, 42 | levels = SequenceStats.distinct[Transaction, Long, Long]( 43 | data = AppLogger.logStage(transactions, n), 44 | toDrillDownKeyOption = None, 45 | toDimKey = t => 1l, 46 | toVal = tx => tx.businessId, 47 | numPercentiles = nPercentiles 48 | ) 49 | )) 50 | 51 | def globalUniquePostcodesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 52 | Some("Global distinct Postcodes") 53 | .map(n => PrettyPercentileStats( 54 | name = n, 55 | levels = SequenceStats.distinct[Transaction, Long, String]( 56 | data = AppLogger.logStage(transactions, n), 57 | toDrillDownKeyOption = None, 58 | toDimKey = t => 1l, 59 | toVal = tx => tx.postcode.get, 60 | numPercentiles = nPercentiles 61 | ) 62 | )) 63 | 64 | def globalUniqueCustomersCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) = 65 | Some("Global distinct Customers") 66 | .map(n => PrettyPercentileStats( 67 | name = n, 68 | levels = SequenceStats.distinct[Transaction, Long, Long]( 69 | data = AppLogger.logStage(transactions, n), 70 | toDrillDownKeyOption = None, 71 | toDimKey = t => 1l, 72 | toVal = tx => tx.customerId, 73 | numPercentiles = nPercentiles 74 | ) 75 | )) 76 | 77 | def uniqueCustomerIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 78 | Some("Postcode x Day - Distinct(CustomerID)") 79 | .map(n => 
PrettyPercentileStats( 80 | name = n, 81 | levels = SequenceStats.distinct[Transaction, (String, String), Long]( 82 | data = AppLogger.logStage(tx, n), 83 | toDrillDownKeyOption = None, 84 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)), 85 | toVal = tx => tx.customerId, 86 | numPercentiles = nPercentiles 87 | ) 88 | )) 89 | 90 | def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 91 | Some("Postcode x Day - Distinct(BusinessId)") 92 | .map(n => PrettyPercentileStats( 93 | name = n, 94 | levels = SequenceStats.distinct[Transaction, (String, String), Long]( 95 | data = AppLogger.logStage(tx, n), 96 | toDrillDownKeyOption = None, 97 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)), 98 | toVal = tx => tx.businessId, 99 | numPercentiles = nPercentiles 100 | ) 101 | )) 102 | 103 | def uniqueCustomersPerBusinessIdNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) = 104 | Some("BusinessId x Day - Distinct(CustomerId)") 105 | .map(n => PrettyPercentileStats( 106 | name = n, 107 | levels = SequenceStats.distinct[Transaction, (Long, String), Long]( 108 | data = AppLogger.logStage(tx, n), 109 | toDrillDownKeyOption = Some(tx => dayOfWeek(tx.timestamp)), 110 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)), 111 | toVal = tx => tx.customerId, 112 | numPercentiles = nPercentiles 113 | ) 114 | )) 115 | 116 | def dayAsString(millis: Long): String = new DateTime(millis).toString("yyyy-MM-dd") 117 | def dayOfWeek(millis: Long): String = new DateTime(millis).dayOfWeek().getAsText 118 | } 119 | -------------------------------------------------------------------------------- /src/test/scala/com/github/vicpara/eda/StatsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import com.github.vicpara.eda.stats.{ SequenceStats, PercentileStats, PercentileStatsWithFilterLevel, PrettyPercentileStats } 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.specs2.matcher.ScalaCheckMatchers 7 | import org.specs2.mutable.Specification 8 | 9 | import scalaz.Scalaz._ 10 | 11 | case object StatsSpec extends Specification with ScalaCheckMatchers with TestUtils { 12 | "ExploratoryDataAnalysis" should { 13 | 14 | "correctly compute percentiles statistics on monotonic increasing 21 sample dataset" in { 15 | val rawData = 16 | (0l until 21).map((" ", _)) 17 | .map { 18 | case (drillDownKey, dimKey) => (((drillDownKey, dimKey), dimKey), dimKey) 19 | } 20 | .toList 21 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 22 | 23 | val res: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 21)(dataRDD).head._2 24 | res must_=== PercentileStats(points = rawData.map(el => (el._2.toString, el._2)), numBuckets = 21l) 25 | } 26 | 27 | "correctly compute the right number of elements in the percentiles statistics on 101 constant samples dataset" in { 28 | val key = " " 29 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((key, e), e), e)) 30 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 31 | 32 | val intRes = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD) 33 | val res = 34 | List(1, 2, 3, 5, 6, 7, 9, 11, 20, 21, 99, 100) 35 | .map(r => (r, SequenceStats.percentileStats[Long](numPercentiles = r)(dataRDD).get(key).get.points.size)) 36 | .filter(e => e._1 != e._2) 37 | 38 | res.foreach(el 
=> el._1 must_=== el._2) 39 | res.size must_=== 0 40 | } 41 | 42 | "correctly compute number percentiles when Num Percentiles is larger than samples" in { 43 | val numSamplesList = List(2, 4, 5, 10, 11, 13, 17, 21, 39, 55, 101) 44 | val res = numSamplesList.flatMap(numSample => { 45 | val ires = 46 | List(102, 103, 104, 101, 130, 200, 300, 500, 1000) 47 | .map(v => (numSample, SequenceStats.percentileIndices(numSample, v).size)) 48 | .filter(v => v._1 != v._2) 49 | 50 | ires.forall(el => el._1 must_=== el._2) 51 | ires.size must_=== 0 52 | ires 53 | }) 54 | if (res.nonEmpty) 55 | println("Numer of percentile indices" + res.map(el => s"Expected:${el._1} but got ${el._2} ").mkString("\n")) 56 | res.size must_=== 0 57 | } 58 | 59 | "correctly compute percentiles statistics on 101 constant samples dataset" in { 60 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((" ", e), 2l), e)) 61 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 62 | 63 | val result = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD).head._2 64 | val expected = PercentileStats(points = (0 until 101 by 10).toList.zipWithIndex.map(e => (e._1.toString, 2l)), numBuckets = 101) 65 | result must_=== expected 66 | } 67 | 68 | "correctly compute percentiles statistics on smaller dataset than stats" in { 69 | val rawData = (1l until 6 by 1).map(e => (((" ", e), e), e - 1)) 70 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData) 71 | 72 | val result: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 10)(dataRDD).head._2 73 | val expected = PercentileStats(points = (1l until 6 by 1).map(e => (e.toString, e)).toList, numBuckets = 5l) 74 | 75 | result must_=== expected 76 | } 77 | 78 | "correctly computes the number of Percentile Indices for increasing number of percentiles" in { 79 | val res = 80 | List(1, 2, 3, 5, 6, 7, 9, 11, 17, 20, 21, 40, 50, 51, 99, 100, 101) 81 | .map(v => (v, SequenceStats.percentileIndices(101, v).size)) 82 | .filter(v => v._1 != v._2) 83 | 84 | res.foreach(el => el._1 must_=== el._2) 85 | res.size must_=== 0 86 | } 87 | 88 | "correctly generate 10 Percentile Indexes from 1 to 10" in { 89 | val result: Set[Long] = SequenceStats.percentileIndices(10, 10) 90 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet 91 | 92 | result must_=== expected 93 | } 94 | 95 | "correctly generate 10 Percentile Indexes from 1 to 10 when requested 20 for a smaller dataset" in { 96 | val result: Set[Long] = SequenceStats.percentileIndices(10, 20) 97 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet 98 | 99 | result must_=== expected 100 | } 101 | 102 | "correctly generate 10 Percentile Indexes from 0 to 1000 by 100 when requested 10 for a 1001 dataset" in { 103 | val result: Set[Long] = SequenceStats.percentileIndices(1001, 11) 104 | val expected: Set[Long] = (0 until 1001 by 100).toList.map(_.toLong).toSet 105 | 106 | result must_=== expected 107 | } 108 | 109 | "correctly pretty prints humanReadable" in { 110 | val setEmpty = PrettyPercentileStats( 111 | name = "xxx", 112 | levels = List(PercentileStatsWithFilterLevel( 113 | "NONE", 114 | stats = PercentileStats(points = Nil, numBuckets = 0) 115 | )) 116 | ) 117 | 118 | val set1 = PrettyPercentileStats( 119 | name = "xxx", 120 | levels = List(PercentileStatsWithFilterLevel( 121 | "NONE", 122 | stats = PercentileStats(points = List(("Key1", 2l)), numBuckets = 1) 123 | )) 124 | ) 125 | 126 | val set2 = 
PrettyPercentileStats( 127 | name = "xxx", 128 | levels = List(PercentileStatsWithFilterLevel( 129 | "NONE", 130 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l)), numBuckets = 2) 131 | )) 132 | ) 133 | 134 | val set3 = PrettyPercentileStats( 135 | name = "xxx", 136 | levels = List(PercentileStatsWithFilterLevel( 137 | "NONE", 138 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l), ("Key3", 3l)), numBuckets = 3) 139 | )) 140 | ) 141 | 142 | List(setEmpty, set1, set2, set3) 143 | .map(e => e.levels.head.stats.points.size -> e.toHumanReadable) 144 | .map(e => e._1 -> (e._1 == e._2.split("\n").size + 2)) 145 | .count(_._2) must_=== 0 146 | } 147 | 148 | "correctly pretty prints for humans bad samples" in { 149 | val data = List( 150 | PrettyPercentileStats( 151 | name = "BusinessId x Day - Count(Tx)", 152 | levels = List(PercentileStatsWithFilterLevel( 153 | drillDownFilterValue = "DrillDownKey.ALL", 154 | stats = PercentileStats(points = List(("1", 1830683l)), numBuckets = 1l) 155 | )) 156 | ), 157 | 158 | PrettyPercentileStats( 159 | name = "Postcode x Day - Count(RichTx)", 160 | levels = List(PercentileStatsWithFilterLevel( 161 | drillDownFilterValue = "DrillDownKey.ALL", 162 | PercentileStats(points = List( 163 | (("YO126EE", "2014-12-02").toString(), 1l), 164 | (("CH441BA", "2014-09-23").toString(), 1l), (("LS287BJ", "2014-10-24").toString(), 1l), 165 | (("G156RX", "2014-01-08").toString(), 1l) 166 | ), numBuckets = 4) 167 | )) 168 | ) 169 | ) 170 | 171 | val hr = data.map(_.toHumanReadable) 172 | hr.foreach(println) 173 | hr.nonEmpty must_=== true 174 | } 175 | 176 | "correctly runs end to end SequenceStats.percentile on constant dataset" in { 177 | case class DataP(k: String, v: Int) 178 | val dataRDD = sc.parallelize((0 until 101).map(el => DataP(k = el.toString, v = el))) 179 | 180 | @transient implicit lazy val isc: SparkContext = sc 181 | 182 | val res = SequenceStats.percentile[DataP, String, Long]( 183 | data = dataRDD, 184 | toDrillDownKeyOption = None, 185 | toDimKey = _.k, 186 | toVal = _ => 1l, 187 | toStats = identity, 188 | reduceFunc = _ |+| _, 189 | numPercentiles = 10 190 | ) 191 | 192 | println(PrettyPercentileStats(name = "Constant Dataset", levels = res).toHumanReadable) 193 | 194 | val expected = PercentileStats(points = (0 until 10).toList.map(e => (e.toString, 1l)), numBuckets = 101l) 195 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2) 196 | } 197 | 198 | "correctly runs end to end SequenceStats.percentile on increasing 10 bucket dataset" in { 199 | case class DataP(k: String, v: Int) 200 | val dataRDD = sc.parallelize((1 until 11).flatMap(el => (0 until el).map(num => DataP(k = el.toString, v = 1)))) 201 | 202 | @transient implicit lazy val isc: SparkContext = sc 203 | 204 | val res = SequenceStats.percentile[DataP, String, Long]( 205 | data = dataRDD, 206 | toDrillDownKeyOption = None, 207 | toDimKey = _.k, 208 | toVal = _ => 1l, 209 | toStats = identity, 210 | reduceFunc = _ + _, 211 | numPercentiles = 10 212 | ) 213 | 214 | println(PrettyPercentileStats(name = "", levels = res).toHumanReadable) 215 | 216 | val expected = PercentileStats( 217 | points = (1 until 11).toList.map(e => (e.toString, e.toLong)), 218 | numBuckets = 10l 219 | ) 220 | 221 | res.head.stats must_=== expected 222 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2) 223 | } 224 | } 225 | } 226 | -------------------------------------------------------------------------------- 
/src/test/scala/com/github/vicpara/eda/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package com.github.vicpara.eda 2 | 3 | import org.apache.spark.{ SparkConf, SparkContext } 4 | 5 | object StaticSparkContext { 6 | val staticSc = new SparkContext( 7 | new SparkConf().setMaster("local") 8 | .set("spark.ui.port", "14321") 9 | .set("spark.eventLog.dir", System.getProperty("java.io.tmpdir")) 10 | .set("spark.io.compression.codec", "lz4") 11 | .setAppName("Test Local Spark Context") 12 | ) 13 | } 14 | 15 | trait TestUtils { 16 | implicit class PimpedDouble(d: Double) { 17 | def roundDecimal = "%.2f".format(d).toDouble 18 | } 19 | 20 | @transient lazy val sc = StaticSparkContext.staticSc 21 | } 22 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.0.1-SNAPSHOT" --------------------------------------------------------------------------------