├── .gitignore
├── .travis.yml
├── Histogram.png
├── LICENSE
├── README.md
├── build.sbt
├── no-test-sbt.sh
├── project
│   └── plugins.sbt
├── sbt-offline.sh
├── src
│   ├── main
│   │   └── scala
│   │       └── com
│   │           └── github
│   │               └── vicpara
│   │                   └── eda
│   │                       ├── AppLogger.scala
│   │                       └── stats
│   │                           ├── PercentileStats.scala
│   │                           └── SequenceStats.scala
│   └── test
│       └── scala
│           └── com
│               └── github
│                   └── vicpara
│                       └── eda
│                           ├── DescriptiveStatsJob.scala
│                           ├── Generators.scala
│                           ├── Stats.scala
│                           ├── StatsSpec.scala
│                           └── TestUtils.scala
└── version.sbt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 |
4 | # sbt specific
5 | .cache
6 | .history
7 | .lib/
8 | dist/*
9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 |
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 |
19 |
20 | # Byte-compiled / optimized / DLL files
21 | __pycache__/
22 | *.py[cod]
23 | .project
24 | scala/projectFilesBackup/
25 | scala/src/sandbox.sc
26 |
27 |
28 | # C extensions
29 | *.so
30 |
31 | .idea/*
32 | */.idea/*
33 | */.idea_modules/*
34 | project/project/*
35 | project/target/*
36 | */target/*
37 |
38 |
39 | #Notebook temp files
40 | *.ipynb_checkpoints/*
41 | *.dump
42 | # Data
43 | *.xls
44 | *.xlsx
45 | *.zip
46 | *.out
47 |
48 |
49 | # Distribution / packaging
50 | .Python
51 | env/
52 | build/
53 | develop-eggs/
54 | dist/
55 | downloads/
56 | eggs/
57 | .eggs/
58 | lib/
59 | lib64/
60 | parts/
61 | sdist/
62 | var/
63 | *.egg-info/
64 | .installed.cfg
65 | *.egg
66 |
67 |
68 | # PyInstaller
69 | # Usually these files are written by a python script from a template
70 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
71 | *.manifest
72 | *.spec
73 |
74 |
75 | # Installer logs
76 | pip-log.txt
77 | pip-delete-this-directory.txt
78 |
79 |
80 | # Unit test / coverage reports
81 | htmlcov/
82 | .tox/
83 | .coverage
84 | .coverage.*
85 | .cache
86 | nosetests.xml
87 | coverage.xml
88 | *,cover
89 |
90 |
91 | # Translations
92 | *.mo
93 | *.pot
94 |
95 | # Django stuff:
96 | *.log
97 |
98 |
99 | # Sphinx documentation
100 | docs/_build/
101 |
102 | # PyBuilder
103 | target/
104 | .Rhistory
105 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | #test execution script.
2 |
3 | language: scala
4 | jdk:
5 | - oraclejdk8
6 |
7 | sudo: false
8 | before_install: umask 0022
9 | scala:
10 | - 2.10.4
11 | - 2.11.7
12 | script:
13 | - "echo no op"
14 |
15 | # whitelist
16 | branches:
17 | only:
18 | - master
19 | - develop
--------------------------------------------------------------------------------
/Histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vicpara/exploratory-data-analysis/e071c4cf92a6f2462cfa509855e59cbd17c4d466/Histogram.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Victor Paraschiv (https://github.com/vicpara/)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Exploratory-Data-Analysis - EDA
2 | [Build Status](https://travis-ci.org/vicpara/exploratory-data-analysis) [MIT License](https://tldrlegal.com/license/mit-license)
3 |
4 | Spark library for doing exploratory data analysis on your data set in a scalable way.
5 |
6 |
7 | ## Getting Exploratory Data Analysis
8 |
9 | If you're using SBT, add the following line to your build file:
10 |
11 | ```scala
12 | libraryDependencies += "com.github.vicpara" % "exploratory-data-analysis_2.10" % "1.0.0"
13 | ```
14 |
15 | For Maven and other build tools, you can visit [search.maven.org](http://search.maven.org/#search%7Cga%7C1%7Cexploratory%20data%20analysis).
16 | To get sample configurations, click on the version of the module you are interested in.
17 |
18 | -----
19 | ### The problem
20 |
21 | Before running any algorithm on a data set, it is essential to understand what the data looks like and what the expected edge cases are.
22 | Maybe you have some expectations about the distribution of the values. Are all the buckets you expect to find in the dataset actually populated?
23 | Very often the data set you are given contains two types of columns:
24 |
25 | * dimensions
26 | * measurements
27 |
28 | Dimensions are columns that describe the data points, such as Date, UniqueID, Gender, Postcode, Country or Category.
29 | Measurements are metrics that quantitatively characterise each described datum.
30 |
31 | Depending on your analytical task, some sort of ordering or bucketing will be used in your model. Some examples:
32 |
33 | * process daily transactions of a customer
34 | * aggregate daily transactions of a merchant
35 | * count the number of unique customers per merchant or postcode, sector or district
36 | * count the number of transactions per day
37 |
38 | Let's take as an example the metric: number of daily transactions per merchant.
39 | 
40 | This is made of two concepts: a key, which is (dayId, merchantId), and a value, count(transactions).
41 | Depending on the richness of your dataset, there may be keys with a high number of transactions and keys with very few. This becomes a problem especially when you are dealing with a sample dataset that promises to capture the entire diversity of the original, complete dataset. A sketch of how this metric maps onto the library follows below.
42 |
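As a hedged sketch, this is how that metric maps onto the library's `SequenceStats.percentile` call, mirroring `txCountPerBusinessIdNDayStats` from the test sources (`Transaction`, `dayAsString` and the `transactions` RDD are assumed to be the ones defined there):

```scala
// Number of daily transactions per merchant: key = (merchantId, dayId), value = count(transactions).
val merchantDailyTxStats = SequenceStats.percentile[Transaction, (Long, String), Long](
  data = transactions,                                          // RDD[Transaction]
  toDrillDownKeyOption = None,                                  // no extra segmentation axis
  toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)),  // the (merchantId, dayId) key
  toVal = _ => 1l,                                              // each transaction counts once
  toStats = identity,                                           // the aggregated count is the statistic
  reduceFunc = _ + _,                                           // sum the counts per key
  numPercentiles = 101
)
```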
43 | -----
44 | ### The library
45 |
46 | For a collection / RDD[T], given:
47 |
48 | * a function that maps an element of T to a *key*
49 | * a function that maps an element of T to a *value* corresponding to that *key*
50 | * a function that aggregates two *values* in a monoidal fashion
51 |
52 |
53 | it can compute two types of statistics (see the signature sketch after this list):
54 |
55 | * the percentile values of the ordered sequence of (key, statistic) pairs
56 | * the count of distinct values for a specific key
57 |
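For reference, these correspond to the two entry points in `SequenceStats`, with signatures copied from the source (comments added here):

```scala
def percentile[T: ClassTag, DimKey: ClassTag, V: ClassTag](
    data: RDD[T],
    toDrillDownKeyOption: Option[T => String],  // optional categorical segmentation axis
    toDimKey: T => DimKey,                      // element -> key
    toVal: T => V,                              // element -> value
    toStats: V => Long,                         // aggregated value -> statistic
    reduceFunc: (V, V) => V,                    // monoidal aggregation of two values
    numPercentiles: Int
): List[PercentileStatsWithFilterLevel]

def distinct[T: ClassTag, DimKey: ClassTag, V: ClassTag](
    data: RDD[T],
    toDrillDownKeyOption: Option[T => String],
    toDimKey: T => DimKey,
    toVal: T => V,                              // the values whose distinct count is taken per key
    numPercentiles: Int
): List[PercentileStatsWithFilterLevel]
```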
58 |
59 | The results can be saved in three formats:
60 |
61 | * object file
62 | * pretty text format ready for inspection
63 | * html files
64 |
65 | We use the *pretty* format when examining the results in bash on remote servers or HDFS. *Pretty* looks like this:
66 |
67 | ```
68 | BusinessId x Day - Distinct(CustomerId) DrillDownValue : Tuesday
69 | [PercentileStats]: NumBinsInHistogram: 1188
70 | (9191,2015-10-27) | 0| ####### | 44
71 | (6305,2015-11-10) | 1| ###################### | 51
72 | (6774,2015-11-03) | 2| ########################### | 53
73 | (4278,2015-11-03) | 3| ################################# | 54
74 | (9191,2015-11-03) | 4| ################################# | 54
75 | (4687,2015-11-17) | 5| ################################### | 55
76 | (380,2015-11-03) | 6| ###################################### | 56
77 | (8114,2015-11-03) | 7| ############################################ | 57
78 | (5629,2015-10-27) | 8| ############################################ | 57
79 | (404,2015-11-03) | 9| ############################################### | 58
80 | (7586,2015-11-10) | 10| ############################################### | 58
81 | (3765,2015-11-10) | 11| ############################################### | 58
82 | (8478,2015-11-17) | 12| ############################################### | 58
83 | (3701,2015-10-27) | 13| ###################################################### | 60
84 |
85 | ```
86 |
87 | Using `scala-bokeh`, the library can also output the results of the EDA as HTML files for easier examination (a minimal sketch follows the image below).
88 | 
89 |
90 | #### Example distinct statistic
91 |
92 | You can find a working example of the capabilities of EDA in the *test* folder.
93 |
94 | For a datum of type `Transaction`
95 | ```scala
96 |
97 | case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) {
98 | def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep)
99 | }
100 |
101 | ```
102 |
103 | we define a distinct statistic over the number of unique businesses (distinct businessId) inside a *postcode* for a given *day*, computed separately for each unique value of the key *toDrillDownKeyOption*. This field would normally be a categorical segmentation axis with relatively small cardinality; examples of fields that could be used for toDrillDownKeyOption are country, city, type of product or region. The example below passes None (no drill-down); a variant with an actual drill-down key follows after it.
104 |
105 | ```scala
106 |
107 | def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
108 | Some("Postcode x Day - Distinct(BusinessId)")
109 | .map(n => PrettyPercentileStats(
110 | name = n,
111 | levels = SequenceStats.distinct[Transaction, (String, String), Long](
112 | data = tx,
113 | toDrillDownKeyOption = None,
114 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)),
115 | toVal = tx => tx.businessId,
116 | numPercentiles = nPercentiles
117 | )
118 | ))
119 |
120 | ```
121 |
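When a drill-down key is supplied, the same statistic is computed once per drill-down value. For instance, `uniqueCustomersPerBusinessIdNDayStats` from the test sources segments distinct customers per (businessId, day) by day of week (`dayOfWeek` and `dayAsString` are helpers defined there):

```scala
def uniqueCustomersPerBusinessIdNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
  Some("BusinessId x Day - Distinct(CustomerId)")
    .map(n => PrettyPercentileStats(
      name = n,
      levels = SequenceStats.distinct[Transaction, (Long, String), Long](
        data = AppLogger.logStage(tx, n),
        toDrillDownKeyOption = Some(tx => dayOfWeek(tx.timestamp)), // one statistic per day of week
        toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)),
        toVal = tx => tx.customerId,
        numPercentiles = nPercentiles
      )
    ))
```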
122 |
123 | #### Example Percentile statistic
124 | ```scala
125 |
126 | def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
127 |   Some("Customer x Day - Count(Tx)")
128 |     .map(n => PrettyPercentileStats(
129 |       name = n,
130 |       levels = SequenceStats.percentile[Transaction, (Long, String), Long](
131 |         data = AppLogger.logStage(transactions, n),
132 |         toDrillDownKeyOption = None,
133 |         toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)),
134 |         toVal = _ => 1l, toStats = identity, reduceFunc = _ + _,
135 |         numPercentiles = nPercentiles
136 |       )
137 |     ))
138 | ```
139 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import com.typesafe.sbt.SbtScalariform
2 |
3 | sonatypeProfileName := "com.github.vicpara"
4 |
5 | organization := "com.github.vicpara"
6 |
7 | name := "exploratory-data-analysis"
8 |
9 | releaseVersionFile := file("version.sbt")
10 |
11 | scalaVersion := "2.10.4"
12 |
13 | crossScalaVersions := Seq("2.11.7", "2.10.4")
14 |
15 | publishMavenStyle := true
16 |
17 | publishTo <<= version { (v: String) =>
18 | val nexus = "https://oss.sonatype.org/"
19 | if (v.trim.endsWith("-SNAPSHOT"))
20 | Some("snapshots" at nexus + "content/repositories/snapshots")
21 | else
22 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
23 | }
24 |
25 | publishArtifact in Test := false
26 |
27 | libraryDependencies ++= Seq(
28 | "joda-time" % "joda-time" % "2.6" withSources() withJavadoc(),
29 | "org.joda" % "joda-convert" % "1.2" withSources() withJavadoc(),
30 | "org.apache.spark" % "spark-core_2.10" % "1.3.0-cdh5.4.4" withSources() withJavadoc(),
31 | "org.apache.commons" % "commons-csv" % "1.2" withSources() withJavadoc(),
32 | "com.github.scala-incubator.io" %% "scala-io-core" % "0.4.3" withSources() withJavadoc(),
33 | "com.github.scala-incubator.io" %% "scala-io-file" % "0.4.3" withSources() withJavadoc(),
34 | "com.rockymadden.stringmetric" %% "stringmetric-core" % "0.27.3" withSources() withJavadoc(),
35 | "org.scalaz" %% "scalaz-core" % "7.0.6" withSources() withJavadoc(),
36 | "org.rogach" %% "scallop" % "0.9.5" withSources() withJavadoc(),
37 | "org.scala-lang" % "scalap" % "2.10.4" withSources() withJavadoc(),
38 | "org.scala-lang" % "scala-compiler" % "2.10.4" withSources() withJavadoc(),
39 | "com.github.tototoshi" %% "scala-csv" % "1.2.2" withSources() withJavadoc(),
40 | "org.specs2" %% "specs2-core" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(),
41 | "org.specs2" %% "specs2-scalacheck" % "2.4.9-scalaz-7.0.6" % "test" withSources() withJavadoc(),
42 | "io.spray" %% "spray-json" % "1.3.1" withSources() withJavadoc(),
43 | "org.scalaj" %% "scalaj-http" % "1.1.5" withSources() withJavadoc(),
44 | "io.continuum.bokeh" %% "bokeh" % "0.6" withSources() withJavadoc()
45 | )
46 |
47 | resolvers ++= Seq(
48 | "mvnrepository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
49 | "Maven Central" at "https://repo1.maven.org/maven2/",
50 | "Sonatype OSS Releases" at "https://oss.sonatype.org/service/local/staging/deploy/maven2"
51 | )
52 |
53 | assemblyMergeStrategy in assembly := {
54 | case el if el.contains("fasterxml.jackson.core") => MergeStrategy.first
55 | case el if el.contains("guava") => MergeStrategy.first
56 |
57 | case x if Assembly.isConfigFile(x) => MergeStrategy.concat
58 | case PathList(ps@_*) if Assembly.isReadme(ps.last) || Assembly.isLicenseFile(ps.last) => MergeStrategy.rename
59 | case PathList("META-INF", xs@_*) => (xs map {
60 | _.toLowerCase
61 | }) match {
62 | case ("manifest.mf" :: Nil) | ("index.list" :: Nil) | ("dependencies" :: Nil) => MergeStrategy.discard
63 | case ps@(x :: xs) if ps.last.endsWith(".sf") || ps.last.endsWith(".dsa") => MergeStrategy.discard
64 | case "plexus" :: xs => MergeStrategy.discard
65 | case "services" :: xs => MergeStrategy.filterDistinctLines
66 | case ("spring.schemas" :: Nil) | ("spring.handlers" :: Nil) => MergeStrategy.filterDistinctLines
67 | case _ => MergeStrategy.first // Changed deduplicate to first
68 | }
69 | case PathList(_*) => MergeStrategy.first // added this line
70 | }
71 |
72 | pomIncludeRepository := { _ => false }
73 |
74 | pomExtra := (
75 |   <url>http://github.com/vicpara/exploratory-data-analysis</url>
76 |   <licenses>
77 |     <license>
78 |       <name>Apache License, Version 2.0</name>
79 |       <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
80 |       <distribution>repo</distribution>
81 |     </license>
82 |   </licenses>
83 |   <scm>
84 |     <url>git@github.com:vicpara/exploratory-data-analysis.git</url>
85 |     <connection>scm:git:git@github.com:vicpara/exploratory-data-analysis.git</connection>
86 |   </scm>
87 |   <developers>
88 |     <developer>
89 |       <id>vicpara</id>
90 |       <name>Victor Paraschiv</name>
91 |       <url>http://github.com/vicpara</url>
92 |     </developer>
93 |   </developers>
94 | )
95 | scalariformSettings
96 |
97 |
--------------------------------------------------------------------------------
/no-test-sbt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sbt "set test in assembly := {}" "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*"
3 |
4 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | scalacOptions += "-deprecation"
2 |
3 | addSbtPlugin("org.scalariform" % "sbt-scalariform" % "1.6.0")
4 |
5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
6 |
7 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.1")
8 |
9 | addSbtPlugin("com.typesafe.sbt" % "sbt-osgi" % "0.7.0")
10 |
11 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.4.0")
12 |
13 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3")
14 |
15 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")
16 |
17 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5")
18 |
19 | addSbtPlugin("com.github.tkawachi" % "sbt-doctest" % "0.3.2")
20 |
21 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "1.1")
22 |
23 | fullResolvers ~= {_.filterNot(_.name == "jcenter")}
24 |
--------------------------------------------------------------------------------
/sbt-offline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sbt "set skip in update := true" "set offline := true" "set scalacOptions in ThisBuild ++= Seq(\"-unchecked\", \"-deprecation\")" "$*"
3 |
4 |
--------------------------------------------------------------------------------
/src/main/scala/com/github/vicpara/eda/AppLogger.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import org.apache.log4j.{ Logger, PatternLayout, ConsoleAppender, Level }
4 | import org.apache.spark.rdd.RDD
5 |
6 | case object AppLogger {
7 | def getLogger(level: Level = Level.INFO): Logger = {
8 |     // use the requested level rather than shadowing the parameter with a hard-coded one
9 | val logger = org.apache.log4j.Logger.getLogger("nlp-topic-modelling")
10 | logger.setLevel(level)
11 |
12 | val capp = new ConsoleAppender()
13 | capp.setName(s"ConsoleAppender ${level.toString}")
14 | capp.setLayout(new PatternLayout("%d %m%n"))
15 | capp.setThreshold(level)
16 | capp.activateOptions()
17 | logger.addAppender(capp)
18 | logger
19 | }
20 |
21 | def logStage[T](data: RDD[T], message: String): RDD[T] = {
22 | infoLevel.info(message)
23 | data
24 | }
25 |
26 | def apply(): org.apache.log4j.Logger = infoLevel
27 | val infoLevel = getLogger()
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/github/vicpara/eda/stats/PercentileStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda.stats
2 |
3 | import io.continuum.bokeh._
4 |
5 | import scalaz.Scalaz._
6 |
7 | case class PercentileStats(points: List[(String, Long)], numBuckets: Long) {
8 | def isSingleValue = points match {
9 | case pts if pts.size == 1 => true
10 | case _ => false
11 | }
12 | }
13 |
14 | case class PercentileStatsWithFilterLevel(drillDownFilterValue: String, stats: PercentileStats) {
15 | def isSingleStat = drillDownFilterValue.equals(SequenceStats.drillDownKeyAll)
16 | def isSingleValue = stats.points.size == 1
17 | }
18 |
19 | case class PrettyPercentileStats(name: String, levels: List[PercentileStatsWithFilterLevel]) {
20 | def lineImage(xvals: Seq[Double], yvals: Seq[Double], name: String): Plot = {
21 | object source extends ColumnDataSource {
22 | val x = column(xvals)
23 | val y = column(yvals)
24 | }
25 |
26 | import source.{ x, y }
27 |
28 | val xdr = new DataRange1d()
29 | val ydr = new DataRange1d()
30 |
31 | val line = new Line().x(x).y(y).line_color("#666699").line_width(2)
32 |
33 | val line_renderer = new GlyphRenderer()
34 | .data_source(source)
35 | .glyph(line)
36 |
37 | val plot = new Plot()
38 | .title(name).title_text_font_size(FontSize.apply(10, FontUnits.PT))
39 | .x_range(xdr).y_range(ydr)
40 | .width(500).height(500)
41 | .border_fill(Color.White)
42 | .background_fill("#FFE8C7")
43 |
44 | val xaxis = new LinearAxis().plot(plot)
45 | val yaxis = new LinearAxis().plot(plot)
46 | plot.below <<= (xaxis :: _)
47 | plot.left <<= (yaxis :: _)
48 | val xgrid = new Grid().plot(plot).axis(xaxis).dimension(0)
49 | val ygrid = new Grid().plot(plot).axis(yaxis).dimension(1)
50 |
51 | val pantool = new PanTool().plot(plot)
52 | val wheelzoomtool = new WheelZoomTool().plot(plot)
53 |
54 | plot.tools := List(pantool, wheelzoomtool)
55 | plot.renderers := List(xaxis, yaxis, xgrid, ygrid, line_renderer)
56 |
57 | plot
58 | }
59 |
60 | def toPlot: List[Plot] =
61 | levels.flatMap(stats => if (stats.stats.points.size > 1)
62 | Some(lineImage(
63 | xvals = stats.stats.points.indices.map(_.toDouble),
64 | yvals = stats.stats.points.map(_._2.toDouble),
65 | name = name + " #" + stats.drillDownFilterValue
66 | ))
67 | else None)
68 |
69 | def toHumanReadable: String = levels.map(l => s"\n${"_" * 148}\n$name \tDrillDownValue : ${l.drillDownFilterValue}\n\t" +
70 | s"${l.stats |> prettyContent}").mkString("\n")
71 |
72 | def prettyContent(stats: PercentileStats): String = stats.points match {
73 | case points if points.isEmpty => "[EmptyStatistic]"
74 | case points if points.size == 1 => s"[SingleStats]: Value: ${stats.points.head._2}\n"
75 |
76 | case points if points.size > 1 =>
77 | val min +: _ :+ max = stats.points.map(_._2)
78 |
79 | val pointsPerChar: Double = (max - min) / 100.0
80 | val fillChar = "#"
81 |
82 | val maxLabelLength = math.min(100, stats.points.map(_._1.length).max) + 5
83 |
84 | val histPretty: String =
85 | stats.points
86 | .zipWithIndex.map {
87 | case ((key, value), index) =>
88 | val strIndex = "%5s".format(index)
89 | val label = s"%${maxLabelLength}s".format(key)
90 | val bars = "%101s".format(fillChar * (1 + ((value - min) / pointsPerChar).toInt)).reverse
91 | s"$label |$strIndex| $bars| $value"
92 | }
93 | .mkString("\n")
94 |
95 | s"[PercentileStats]: NumBinsInHistogram: ${stats.numBuckets.toString}\n$histPretty"
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/main/scala/com/github/vicpara/eda/stats/SequenceStats.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda.stats
2 |
3 | import org.apache.spark.rdd.RDD
4 |
5 | import scala.reflect.ClassTag
6 | import scalaz.Scalaz._
7 |
8 | case object SequenceStats {
9 | val drillDownKeyAll = "$None$"
10 |
11 | def percentile[T: ClassTag, DimKey: ClassTag, V: ClassTag](
12 | data: RDD[T],
13 | toDrillDownKeyOption: Option[T => String],
14 | toDimKey: T => DimKey,
15 | toVal: T => V,
16 | toStats: V => Long,
17 | reduceFunc: (V, V) => V,
18 | numPercentiles: Int
19 | ): List[PercentileStatsWithFilterLevel] =
20 |
21 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t)) -> toVal(t))
22 | .reduceByKey(reduceFunc)
23 | .mapValues(toStats)
24 | .sortBy(_._2)
25 | .zipWithIndex() |>
26 | percentileStats(numPercentiles) |>
27 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled))
28 |
29 | def distinct[T: ClassTag, DimKey: ClassTag, V: ClassTag](
30 | data: RDD[T],
31 | toDrillDownKeyOption: Option[T => String],
32 | toDimKey: T => DimKey,
33 | toVal: T => V,
34 | numPercentiles: Int
35 | ): List[PercentileStatsWithFilterLevel] =
36 | data.map(t => (toDrillDownKeyOption.getOrElse((_: T) => drillDownKeyAll)(t), toDimKey(t), toVal(t)))
37 | .distinct()
38 | .map(e => (e._1, e._2) -> Set(e._3))
39 | .reduceByKey(_ ++ _)
40 | .mapValues(_.size.toLong)
41 | .sortBy(_._2)
42 | .zipWithIndex() |>
43 | percentileStats(numPercentiles) |>
44 | (percentileStats => percentileStats.toList.map((PercentileStatsWithFilterLevel.apply _).tupled))
45 |
46 | def percentileStats[DimKey](numPercentiles: Int = 1001)(data: RDD[(((String, DimKey), Long), Long)]): Map[String, PercentileStats] = {
47 |
48 | val numBuckets: Map[String, Int] =
49 | data.map {
50 | case (((drillDownKey, _), _), _) => drillDownKey -> 1
51 | }
52 | .reduceByKey(_ + _)
53 | .collect()
54 | .toMap
55 |
56 | val drillDownKeyToIndices: Map[String, Set[Long]] =
57 | numBuckets.mapValues(percentileIndices(_, numPercentiles)).map(identity)
58 |
59 | data.filter { case (((drillDownKey, _), _), index) => drillDownKeyToIndices(drillDownKey).contains(index) }
60 | .map {
61 | case (((drillDownKey, dimKey), stats), _) => drillDownKey -> (dimKey.toString -> stats)
62 | }
63 | .collect()
64 | .groupBy(_._1)
65 | .mapValues(_.map(_._2).toList.sortBy(_._2))
66 | .map {
67 | case (key, points) => key -> PercentileStats(points, numBuckets(key))
68 | }
69 | }
70 |
71 | def percentileIndices(numElems: Long, numPercentiles: Int): Set[Long] = {
72 | val delta: Double = (1.0 / math.max(1, numPercentiles - 1)) * (numElems - 1)
73 | (0 until numPercentiles).map(el => (el * delta).round).toSet
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/scala/com/github/vicpara/eda/DescriptiveStatsJob.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import com.github.vicpara.eda.stats.PrettyPercentileStats
4 | import io.continuum.bokeh.{ Document, Plot, GridPlot }
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.{ SparkConf, SparkContext }
7 | import org.rogach.scallop.ScallopConf
8 |
9 | case class Transaction(timestamp: Long, customerId: Long, businessId: Long, postcode: Option[String]) {
10 | def toSv(sep: String = "\t"): String = List(timestamp, customerId, businessId).mkString(sep)
11 | }
12 |
13 | case object Transaction {
14 | def apply(sep: String)(line: String): Transaction = line.split(sep, -1).toList match {
15 | case List(timestamp, customerId, businessId, postcode) => Transaction(timestamp.toLong, customerId.toLong,
16 | businessId.toLong, Some(postcode))
17 | }
18 | }
19 |
20 | case object DescriptiveStatsJob extends Generators {
21 | def main(args: Array[String]) {
22 | val conf = new ScallopConf(args) {
23 | val delim = opt[String](default = Some("\t"), descr = "The delimiter character")
24 | val tmpFolder = opt[String](descr = "Overrides the directory used in spark.local.dir")
25 |
26 | val outputPath = opt[String](required = true, descr = "The output path for the EDA stats")
27 | val outputPathHuman = opt[String](descr = "The output path [Human readable] for the EDA stats")
28 | }
29 |
30 | conf.printHelp()
31 | AppLogger().info(conf.summary)
32 |
33 | val sparkConf = new SparkConf()
34 | .set("spark.akka.frameSize", "128")
35 | .setMaster("local")
36 | .set("spark.hadoop.validateOutputSpecs", "false")
37 | .set("spark.io.compression.codec", "lz4")
38 | .setAppName("Local Exploratory Data Analysis")
39 |
40 | if (conf.tmpFolder.isDefined) sparkConf.set("spark.local.dir", conf.tmpFolder())
41 | @transient val sc: SparkContext = new SparkContext(sparkConf)
42 |
43 | val transactions = sc.parallelize(randomTransactions(80000).sample.get)
44 | AppLogger.logStage(transactions, "Finished generating the data points")
45 |
46 | val results: RDD[PrettyPercentileStats] = sc.makeRDD(List(
47 | Stats.txCountPerCustomerNDayStats(transactions, 101),
48 | Stats.txCountPerBusinessIdNDayStats(transactions, 101),
49 |
50 | Stats.uniqueBusinessIdPerPostcodeNDayStats(transactions, 101),
51 | Stats.uniqueCustomersPerBusinessIdNDayStats(transactions, 101),
52 | Stats.uniqueCustomerIdPerPostcodeNDayStats(transactions, 101),
53 |
54 | Stats.globalUniqueCustomersCounterStats(transactions, 101),
55 | Stats.globalUniqueBusinessesCounterStats(transactions, 101),
56 | Stats.globalUniquePostcodesCounterStats(transactions, 101)
57 | ).flatten, numSlices = 1)
58 |
59 | results.saveAsObjectFile(conf.outputPath())
60 | conf.outputPathHuman.foreach(results.map(_.toHumanReadable).saveAsTextFile)
61 |
62 | savePlotLists(results, "/tmp/eda/human/")
63 | }
64 |
65 | def savePlotLists(results: RDD[PrettyPercentileStats], outFolder: String) = {
66 | val plotGrid: List[List[Plot]] = results.collect()
67 | .flatMap(_.toPlot)
68 | .zipWithIndex
69 | .groupBy(_._2 / 2)
70 | .map(_._2.map(_._1).toList).toList
71 |
72 | val grid = new GridPlot().children(plotGrid)
73 |
74 | val document = new Document(grid)
75 | val html = document.save(outFolder + "/" + "EDA_Stats_Results.html")
76 | AppLogger.infoLevel.info(s"Wrote EDA stats charts in ${html.file}. Open ${html.url} in a web browser.")
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/test/scala/com/github/vicpara/eda/Generators.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import org.joda.time.{ DateTime, DateTimeZone, Interval, LocalDateTime }
4 | import org.scalacheck.{ Prop, Arbitrary, Gen }
5 |
6 | import scala.collection.immutable
7 | import scalaz.Scalaz._
8 |
9 | trait Generators {
10 | val nTransactions = 1000000
11 | val nCustomers = 800
12 | val nBusinesses = 300
13 |
14 | val nonNegativeLongGen: Gen[Long] = Gen.choose[Long](0l, Long.MaxValue - 1)
15 | val nonNegativeIntGen: Gen[Int] = Gen.choose(0, 100)
16 | val positiveIntGen: Gen[Int] = Gen.choose(1, 100)
17 |
18 | val nonEmptyAlphaStr = Gen.nonEmptyListOf(Gen.alphaChar).map(_.mkString).suchThat(_.forall(_.isLetter))
19 |
20 | val noOfDays = 23
21 | val dates = DateTime.now() |> (t0 => (0 until noOfDays).map(t0.minusDays))
22 |
23 |   val availablePostcodes = Gen.listOfN(100, Gen.listOfN(6, Gen.alphaNumChar).map(_.mkString("").toUpperCase()))
24 | .sample.get
25 |
26 | val businesses: List[Int] = Gen.listOfN(nBusinesses, Gen.choose(0, 10000)).sample.get
27 | val customers: List[Int] = Gen.listOfN(nCustomers, Gen.chooseNum(10001, 20000)).sample.get
28 |
29 | def sameDayTimestampGen(date: DateTime): Gen[DateTime] = for {
30 | hourOfDay <- Gen.choose(0, 23)
31 | minuteOfHour <- Gen.choose(0, 59)
32 | } yield {
33 | val dtz = DateTimeZone.forID("Europe/London")
34 | val ldt = new LocalDateTime(date.getMillis, dtz).withTime(hourOfDay, minuteOfHour, 0, 0)
35 | (if (dtz.isLocalDateTimeGap(ldt)) ldt.plusHours(2) else ldt).toDateTime(dtz)
36 | }
37 |
38 |   def timestampInIntervalGen(interval: Interval): Gen[DateTime] = for {
39 | date: DateTime <- Gen.choose(interval.getStart.getMillis, interval.getEnd.getMillis).map(new DateTime(_))
40 | timestamp <- sameDayTimestampGen(date)
41 | } yield timestamp
42 |
43 | val timestampGen: Gen[DateTime] = for {
44 | date: DateTime <- Gen.oneOf(dates)
45 | timestamp <- sameDayTimestampGen(date)
46 | } yield timestamp
47 |
48 | val todayTimestampGen: Gen[DateTime] = for {
49 | hourOfDay <- Gen.choose(0, 23)
50 | minuteOfHour <- Gen.choose(0, 59)
51 | date = DateTime.now()
52 | timestamp = date.withTime(hourOfDay, minuteOfHour, 0, 0)
53 | } yield timestamp
54 |
55 | val postcodeGen: Gen[String] = Gen.oneOf(availablePostcodes)
56 |
57 | def postcodeFromSectorGen(sector: String) =
58 | Gen.listOfN(2, Gen.alphaNumChar).map(_.mkString).map(sector + _.toUpperCase)
59 |
60 | def randomTransactionGen(): Gen[Transaction] = for {
61 | timestamp <- timestampGen.map(_.getMillis)
62 | postcode <- postcodeGen
63 | businessId <- Gen.oneOf(businesses)
64 | customerId <- Gen.oneOf(customers)
65 | } yield Transaction(timestamp, customerId, businessId, Some(postcode))
66 |
67 |   def randomTransactions(n: Int): Gen[List[Transaction]] = Gen.listOfN(n, randomTransactionGen())
68 | }
69 |
--------------------------------------------------------------------------------
/src/test/scala/com/github/vicpara/eda/Stats.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import com.github.vicpara.eda.stats.{ SequenceStats, PrettyPercentileStats }
4 | import org.apache.spark.rdd.RDD
5 | import org.joda.time.DateTime
6 |
7 | case object Stats {
8 | def txCountPerCustomerNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
9 | Some("Customer x Day - Count(Tx)")
10 | .map(n => PrettyPercentileStats(
11 | name = n,
12 | levels = SequenceStats.percentile[Transaction, (Long, String), Long](
13 | data = AppLogger.logStage(transactions, n),
14 | toDrillDownKeyOption = None,
15 | toDimKey = tx => (tx.customerId, dayAsString(tx.timestamp)),
16 | toVal = _ => 1l,
17 | toStats = identity,
18 | reduceFunc = _ + _,
19 | numPercentiles = nPercentiles
20 | )
21 | ))
22 |
23 | def txCountPerBusinessIdNDayStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
24 | Some("BusinessId x Day - Count(Tx)")
25 | .map(n => PrettyPercentileStats(
26 | name = n,
27 | levels = SequenceStats.percentile[Transaction, (Long, String), Long](
28 | data = AppLogger.logStage(transactions, n),
29 | toDrillDownKeyOption = None,
30 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)),
31 | toVal = r => 1l,
32 | toStats = identity,
33 | reduceFunc = _ + _,
34 | numPercentiles = nPercentiles
35 | )
36 | ))
37 |
38 | def globalUniqueBusinessesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
39 | Some("Global distinct Businesses")
40 | .map(n => PrettyPercentileStats(
41 | name = n,
42 | levels = SequenceStats.distinct[Transaction, Long, Long](
43 | data = AppLogger.logStage(transactions, n),
44 | toDrillDownKeyOption = None,
45 | toDimKey = t => 1l,
46 | toVal = tx => tx.businessId,
47 | numPercentiles = nPercentiles
48 | )
49 | ))
50 |
51 | def globalUniquePostcodesCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
52 | Some("Global distinct Postcodes")
53 | .map(n => PrettyPercentileStats(
54 | name = n,
55 | levels = SequenceStats.distinct[Transaction, Long, String](
56 | data = AppLogger.logStage(transactions, n),
57 | toDrillDownKeyOption = None,
58 | toDimKey = t => 1l,
59 | toVal = tx => tx.postcode.get,
60 | numPercentiles = nPercentiles
61 | )
62 | ))
63 |
64 | def globalUniqueCustomersCounterStats(transactions: RDD[Transaction], nPercentiles: Int = 1001) =
65 | Some("Global distinct Customers")
66 | .map(n => PrettyPercentileStats(
67 | name = n,
68 | levels = SequenceStats.distinct[Transaction, Long, Long](
69 | data = AppLogger.logStage(transactions, n),
70 | toDrillDownKeyOption = None,
71 | toDimKey = t => 1l,
72 | toVal = tx => tx.customerId,
73 | numPercentiles = nPercentiles
74 | )
75 | ))
76 |
77 | def uniqueCustomerIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
78 | Some("Postcode x Day - Distinct(CustomerID)")
79 | .map(n => PrettyPercentileStats(
80 | name = n,
81 | levels = SequenceStats.distinct[Transaction, (String, String), Long](
82 | data = AppLogger.logStage(tx, n),
83 | toDrillDownKeyOption = None,
84 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)),
85 | toVal = tx => tx.customerId,
86 | numPercentiles = nPercentiles
87 | )
88 | ))
89 |
90 | def uniqueBusinessIdPerPostcodeNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
91 | Some("Postcode x Day - Distinct(BusinessId)")
92 | .map(n => PrettyPercentileStats(
93 | name = n,
94 | levels = SequenceStats.distinct[Transaction, (String, String), Long](
95 | data = AppLogger.logStage(tx, n),
96 | toDrillDownKeyOption = None,
97 | toDimKey = t => (t.postcode.getOrElse(""), dayAsString(t.timestamp)),
98 | toVal = tx => tx.businessId,
99 | numPercentiles = nPercentiles
100 | )
101 | ))
102 |
103 | def uniqueCustomersPerBusinessIdNDayStats(tx: RDD[Transaction], nPercentiles: Int = 1001) =
104 | Some("BusinessId x Day - Distinct(CustomerId)")
105 | .map(n => PrettyPercentileStats(
106 | name = n,
107 | levels = SequenceStats.distinct[Transaction, (Long, String), Long](
108 | data = AppLogger.logStage(tx, n),
109 | toDrillDownKeyOption = Some(tx => dayOfWeek(tx.timestamp)),
110 | toDimKey = tx => (tx.businessId, dayAsString(tx.timestamp)),
111 | toVal = tx => tx.customerId,
112 | numPercentiles = nPercentiles
113 | )
114 | ))
115 |
116 | def dayAsString(millis: Long): String = new DateTime(millis).toString("yyyy-MM-dd")
117 | def dayOfWeek(millis: Long): String = new DateTime(millis).dayOfWeek().getAsText
118 | }
119 |
--------------------------------------------------------------------------------
/src/test/scala/com/github/vicpara/eda/StatsSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import com.github.vicpara.eda.stats.{ SequenceStats, PercentileStats, PercentileStatsWithFilterLevel, PrettyPercentileStats }
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 | import org.specs2.matcher.ScalaCheckMatchers
7 | import org.specs2.mutable.Specification
8 |
9 | import scalaz.Scalaz._
10 |
11 | case object StatsSpec extends Specification with ScalaCheckMatchers with TestUtils {
12 | "ExploratoryDataAnalysis" should {
13 |
14 | "correctly compute percentiles statistics on monotonic increasing 21 sample dataset" in {
15 | val rawData =
16 | (0l until 21).map((" ", _))
17 | .map {
18 | case (drillDownKey, dimKey) => (((drillDownKey, dimKey), dimKey), dimKey)
19 | }
20 | .toList
21 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData)
22 |
23 | val res: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 21)(dataRDD).head._2
24 | res must_=== PercentileStats(points = rawData.map(el => (el._2.toString, el._2)), numBuckets = 21l)
25 | }
26 |
27 | "correctly compute the right number of elements in the percentiles statistics on 101 constant samples dataset" in {
28 | val key = " "
29 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((key, e), e), e))
30 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData)
31 |
32 | val intRes = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD)
33 | val res =
34 | List(1, 2, 3, 5, 6, 7, 9, 11, 20, 21, 99, 100)
35 | .map(r => (r, SequenceStats.percentileStats[Long](numPercentiles = r)(dataRDD).get(key).get.points.size))
36 | .filter(e => e._1 != e._2)
37 |
38 | res.foreach(el => el._1 must_=== el._2)
39 | res.size must_=== 0
40 | }
41 |
42 | "correctly compute number percentiles when Num Percentiles is larger than samples" in {
43 | val numSamplesList = List(2, 4, 5, 10, 11, 13, 17, 21, 39, 55, 101)
44 | val res = numSamplesList.flatMap(numSample => {
45 | val ires =
46 | List(102, 103, 104, 101, 130, 200, 300, 500, 1000)
47 | .map(v => (numSample, SequenceStats.percentileIndices(numSample, v).size))
48 | .filter(v => v._1 != v._2)
49 |
50 | ires.forall(el => el._1 must_=== el._2)
51 | ires.size must_=== 0
52 | ires
53 | })
54 | if (res.nonEmpty)
55 | println("Numer of percentile indices" + res.map(el => s"Expected:${el._1} but got ${el._2} ").mkString("\n"))
56 | res.size must_=== 0
57 | }
58 |
59 | "correctly compute percentiles statistics on 101 constant samples dataset" in {
60 | val rawData: List[(((String, Long), Long), Long)] = (0l until 101 by 1).toList.map(e => (((" ", e), 2l), e))
61 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData)
62 |
63 | val result = SequenceStats.percentileStats[Long](numPercentiles = 11)(dataRDD).head._2
64 | val expected = PercentileStats(points = (0 until 101 by 10).toList.zipWithIndex.map(e => (e._1.toString, 2l)), numBuckets = 101)
65 | result must_=== expected
66 | }
67 |
68 | "correctly compute percentiles statistics on smaller dataset than stats" in {
69 | val rawData = (1l until 6 by 1).map(e => (((" ", e), e), e - 1))
70 | val dataRDD: RDD[(((String, Long), Long), Long)] = sc.parallelize(rawData)
71 |
72 | val result: PercentileStats = SequenceStats.percentileStats[Long](numPercentiles = 10)(dataRDD).head._2
73 | val expected = PercentileStats(points = (1l until 6 by 1).map(e => (e.toString, e)).toList, numBuckets = 5l)
74 |
75 | result must_=== expected
76 | }
77 |
78 | "correctly computes the number of Percentile Indices for increasing number of percentiles" in {
79 | val res =
80 | List(1, 2, 3, 5, 6, 7, 9, 11, 17, 20, 21, 40, 50, 51, 99, 100, 101)
81 | .map(v => (v, SequenceStats.percentileIndices(101, v).size))
82 | .filter(v => v._1 != v._2)
83 |
84 | res.foreach(el => el._1 must_=== el._2)
85 | res.size must_=== 0
86 | }
87 |
88 | "correctly generate 10 Percentile Indexes from 1 to 10" in {
89 | val result: Set[Long] = SequenceStats.percentileIndices(10, 10)
90 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet
91 |
92 | result must_=== expected
93 | }
94 |
95 | "correctly generate 10 Percentile Indexes from 1 to 10 when requested 20 for a smaller dataset" in {
96 | val result: Set[Long] = SequenceStats.percentileIndices(10, 20)
97 | val expected: Set[Long] = (0 until 10).toList.map(_.toLong).toSet
98 |
99 | result must_=== expected
100 | }
101 |
102 | "correctly generate 10 Percentile Indexes from 0 to 1000 by 100 when requested 10 for a 1001 dataset" in {
103 | val result: Set[Long] = SequenceStats.percentileIndices(1001, 11)
104 | val expected: Set[Long] = (0 until 1001 by 100).toList.map(_.toLong).toSet
105 |
106 | result must_=== expected
107 | }
108 |
109 | "correctly pretty prints humanReadable" in {
110 | val setEmpty = PrettyPercentileStats(
111 | name = "xxx",
112 | levels = List(PercentileStatsWithFilterLevel(
113 | "NONE",
114 | stats = PercentileStats(points = Nil, numBuckets = 0)
115 | ))
116 | )
117 |
118 | val set1 = PrettyPercentileStats(
119 | name = "xxx",
120 | levels = List(PercentileStatsWithFilterLevel(
121 | "NONE",
122 | stats = PercentileStats(points = List(("Key1", 2l)), numBuckets = 1)
123 | ))
124 | )
125 |
126 | val set2 = PrettyPercentileStats(
127 | name = "xxx",
128 | levels = List(PercentileStatsWithFilterLevel(
129 | "NONE",
130 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l)), numBuckets = 2)
131 | ))
132 | )
133 |
134 | val set3 = PrettyPercentileStats(
135 | name = "xxx",
136 | levels = List(PercentileStatsWithFilterLevel(
137 | "NONE",
138 | stats = PercentileStats(points = List(("Key1", 1l), ("Key2", 2l), ("Key3", 3l)), numBuckets = 3)
139 | ))
140 | )
141 |
142 | List(setEmpty, set1, set2, set3)
143 | .map(e => e.levels.head.stats.points.size -> e.toHumanReadable)
144 | .map(e => e._1 -> (e._1 == e._2.split("\n").size + 2))
145 | .count(_._2) must_=== 0
146 | }
147 |
148 | "correctly pretty prints for humans bad samples" in {
149 | val data = List(
150 | PrettyPercentileStats(
151 | name = "BusinessId x Day - Count(Tx)",
152 | levels = List(PercentileStatsWithFilterLevel(
153 | drillDownFilterValue = "DrillDownKey.ALL",
154 | stats = PercentileStats(points = List(("1", 1830683l)), numBuckets = 1l)
155 | ))
156 | ),
157 |
158 | PrettyPercentileStats(
159 | name = "Postcode x Day - Count(RichTx)",
160 | levels = List(PercentileStatsWithFilterLevel(
161 | drillDownFilterValue = "DrillDownKey.ALL",
162 | PercentileStats(points = List(
163 | (("YO126EE", "2014-12-02").toString(), 1l),
164 | (("CH441BA", "2014-09-23").toString(), 1l), (("LS287BJ", "2014-10-24").toString(), 1l),
165 | (("G156RX", "2014-01-08").toString(), 1l)
166 | ), numBuckets = 4)
167 | ))
168 | )
169 | )
170 |
171 | val hr = data.map(_.toHumanReadable)
172 | hr.foreach(println)
173 | hr.nonEmpty must_=== true
174 | }
175 |
176 | "correctly runs end to end SequenceStats.percentile on constant dataset" in {
177 | case class DataP(k: String, v: Int)
178 | val dataRDD = sc.parallelize((0 until 101).map(el => DataP(k = el.toString, v = el)))
179 |
180 | @transient implicit lazy val isc: SparkContext = sc
181 |
182 | val res = SequenceStats.percentile[DataP, String, Long](
183 | data = dataRDD,
184 | toDrillDownKeyOption = None,
185 | toDimKey = _.k,
186 | toVal = _ => 1l,
187 | toStats = identity,
188 | reduceFunc = _ |+| _,
189 | numPercentiles = 10
190 | )
191 |
192 | println(PrettyPercentileStats(name = "Constant Dataset", levels = res).toHumanReadable)
193 |
194 | val expected = PercentileStats(points = (0 until 10).toList.map(e => (e.toString, 1l)), numBuckets = 101l)
195 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2)
196 | }
197 |
198 | "correctly runs end to end SequenceStats.percentile on increasing 10 bucket dataset" in {
199 | case class DataP(k: String, v: Int)
200 | val dataRDD = sc.parallelize((1 until 11).flatMap(el => (0 until el).map(num => DataP(k = el.toString, v = 1))))
201 |
202 | @transient implicit lazy val isc: SparkContext = sc
203 |
204 | val res = SequenceStats.percentile[DataP, String, Long](
205 | data = dataRDD,
206 | toDrillDownKeyOption = None,
207 | toDimKey = _.k,
208 | toVal = _ => 1l,
209 | toStats = identity,
210 | reduceFunc = _ + _,
211 | numPercentiles = 10
212 | )
213 |
214 | println(PrettyPercentileStats(name = "", levels = res).toHumanReadable)
215 |
216 | val expected = PercentileStats(
217 | points = (1 until 11).toList.map(e => (e.toString, e.toLong)),
218 | numBuckets = 10l
219 | )
220 |
221 | res.head.stats must_=== expected
222 | res.head.stats.points.map(_._2) must_=== expected.points.map(_._2)
223 | }
224 | }
225 | }
226 |
--------------------------------------------------------------------------------
/src/test/scala/com/github/vicpara/eda/TestUtils.scala:
--------------------------------------------------------------------------------
1 | package com.github.vicpara.eda
2 |
3 | import org.apache.spark.{ SparkConf, SparkContext }
4 |
5 | object StaticSparkContext {
6 | val staticSc = new SparkContext(
7 | new SparkConf().setMaster("local")
8 | .set("spark.ui.port", "14321")
9 | .set("spark.eventLog.dir", System.getProperty("java.io.tmpdir"))
10 | .set("spark.io.compression.codec", "lz4")
11 | .setAppName("Test Local Spark Context")
12 | )
13 | }
14 |
15 | trait TestUtils {
16 | implicit class PimpedDouble(d: Double) {
17 | def roundDecimal = "%.2f".format(d).toDouble
18 | }
19 |
20 | @transient lazy val sc = StaticSparkContext.staticSc
21 | }
22 |
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "1.0.1-SNAPSHOT"
--------------------------------------------------------------------------------