├── project ├── build.properties ├── GitHelper.scala ├── VersionScheme.scala ├── Dependencies.scala ├── Dependency.scala ├── plugins.sbt ├── ShellPrompt.scala └── TestSettings.scala ├── sparkext-sql ├── build.sbt └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ ├── ext │ │ └── functions.scala │ │ └── catalyst │ │ └── expressions │ │ └── aggregates.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── ExtAggregatesSpec.scala ├── sparkext-test ├── build.sbt └── src │ └── test │ ├── resources │ ├── log4j-turned-off.properties │ └── log4j.properties │ └── scala │ └── com │ └── collective │ └── TestSparkContext.scala ├── sparkext-example ├── build.sbt └── src │ └── main │ ├── scala │ └── com │ │ └── collective │ │ └── sparkext │ │ └── example │ │ ├── package.scala │ │ ├── InMemorySparkContext.scala │ │ ├── DataGenerator.scala │ │ └── SparkMlExtExample.scala │ └── resources │ └── response.csv ├── sparkext-mllib ├── build.sbt └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── mllib │ │ └── evaluation │ │ │ ├── BinaryModelMetricComputer.scala │ │ │ └── BinaryModelMetrics.scala │ │ └── ml │ │ ├── feature │ │ ├── sharedParams.scala │ │ ├── S2CellTransformer.scala │ │ ├── StringToShortIndexer.scala │ │ ├── Gather.scala │ │ ├── Binning.scala │ │ └── GatherEncoder.scala │ │ ├── sampling │ │ └── Downsampling.scala │ │ └── classification │ │ └── LocalLogisticRegression.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ ├── ml │ ├── feature │ │ ├── StringToShortIndexerSpec.scala │ │ ├── S2CellTransformerSpec.scala │ │ ├── SplitOptimizerSpec.scala │ │ ├── GatherSpec.scala │ │ ├── BinningSpec.scala │ │ ├── GatherEncoderModelSpec.scala │ │ └── GatherEncoderSpec.scala │ ├── sampling │ │ └── DownsamplingSpec.scala │ ├── TestingUtils.scala │ └── classification │ │ └── LocalLogisticRegressionSpec.scala │ └── mllib │ └── evaluation │ ├── BinaryModelMetricComputerSpec.scala │ └── BinaryModelMetricsSpec.scala ├── version.sbt ├── .travis.yml ├── .gitignore ├── README.md ├── scalastyle-config.xml └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /sparkext-sql/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtSql 2 | -------------------------------------------------------------------------------- /sparkext-test/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtTest 2 | -------------------------------------------------------------------------------- /sparkext-example/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtExample 2 | -------------------------------------------------------------------------------- /sparkext-mllib/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtMllib 2 | -------------------------------------------------------------------------------- /project/GitHelper.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object GitHelper { 4 | 5 | def headSha(): String = Process("git rev-parse --short 
HEAD").!!.stripLineEnd 6 | 7 | } 8 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/package.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext 2 | 3 | import scala.util.Random 4 | 5 | 6 | package object example { 7 | val rnd = new Random() 8 | } 9 | -------------------------------------------------------------------------------- /project/VersionScheme.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object VersionScheme { 4 | 5 | object Keys { 6 | 7 | val isRelease = Def.settingKey[Boolean]("True if this is a release") 8 | 9 | val versionPrefix = Def.settingKey[String]( 10 | "Prefix of the version string") 11 | 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /sparkext-test/src/test/resources/log4j-turned-off.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=ERROR, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.err 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | import VersionScheme.Keys._ 2 | 3 | isRelease in ThisBuild := sys.props("release") == "true" 4 | 5 | versionPrefix in ThisBuild := "0.0.23" 6 | 7 | version in ThisBuild <<= Def.setting[String] { 8 | if (isRelease.value) { 9 | versionPrefix.value 10 | } else { 11 | val headSha = GitHelper.headSha() 12 | s"${versionPrefix.value}.$headSha" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /sparkext-test/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Verbose logging for Collective packages 9 | log4j.logger.com.collective=TRACE 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use Docker-based container (instead of OpenVZ) 2 | sudo: false 3 | 4 | cache: 5 | directories: 6 | - $HOME/.ivy2/cache 7 | - $HOME/.sbt/boot/scala-$TRAVIS_SCALA_VERSION 8 | 9 | language: scala 10 | scala: 11 | - 2.11.7 12 | - 2.10.6 13 | jdk: 14 | - oraclejdk8 15 | 16 | script: 17 | - sbt ++$TRAVIS_SCALA_VERSION -J-Xmx2512m clean test 18 | 19 | # Tricks to avoid unnecessary cache updates 20 | - find $HOME/.sbt -name "*.lock" | xargs rm 21 | - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm 22 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | 2 | object Dependencies { 3 | 4 | import Dependency._ 5 | 6 | val sparkExtSql 
= 7 | Seq( 8 | sparkSql % "provided" 9 | , Test.scalaTest 10 | ) 11 | 12 | val sparkExtMllib = 13 | Seq( 14 | sparkMLLib % "provided" 15 | , s2Geometry 16 | , Test.scalaTest 17 | ) 18 | 19 | val sparkExtTest = 20 | Seq( 21 | sparkSql % "provided" 22 | , Test.scalaTest 23 | ) 24 | 25 | val sparkExtExample = 26 | Seq( 27 | sparkMLLib 28 | ) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /sparkext-sql/src/main/scala/org/apache/spark/sql/ext/functions.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.ext 2 | 3 | import org.apache.spark.sql.catalyst.expressions._ 4 | import org.apache.spark.sql._ 5 | 6 | import scala.language.implicitConversions 7 | 8 | // scalastyle:off 9 | object functions { 10 | // scalastyle:on 11 | 12 | private[this] implicit def toColumn(expr: Expression): Column = Column(expr) 13 | 14 | // TODO: Workaround for https://issues.apache.org/jira/browse/SPARK-9301 15 | def collectArray(expr: Column): Column = CollectArray(expr.expr) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /project/Dependency.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | 4 | object Dependency { 5 | 6 | object V { 7 | 8 | val Spark = "1.5.2" 9 | val S2Geometry = "1.0" 10 | 11 | val ScalaTest = "2.2.4" 12 | 13 | } 14 | 15 | val sparkSql = "org.apache.spark" %% "spark-sql" % V.Spark 16 | val sparkMLLib = "org.apache.spark" %% "spark-mllib" % V.Spark 17 | 18 | val s2Geometry = "com.google.common.geometry" % "s2-geometry" % V.S2Geometry intransitive() 19 | 20 | object Test { 21 | 22 | val scalaTest = "org.scalatest" %% "scalatest" % V.ScalaTest % "test" 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies += "org.slf4j" % "slf4j-nop" % "1.7.5" 2 | 3 | resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven" 4 | 5 | // Dependency graph 6 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5") 7 | 8 | // Check Scala style 9 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.7.0") 10 | 11 | // Publish unified documentation to site 12 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3") 13 | 14 | // Publish to bintray 15 | addSbtPlugin("me.lessis" % "bintray-sbt" % "0.3.0") 16 | 17 | // Publish unidoc to Github pages 18 | addSbtPlugin("com.typesafe.sbt" % "sbt-site" % "0.7.1") 19 | 20 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.5.2") 21 | -------------------------------------------------------------------------------- /project/ShellPrompt.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | import scala.language.postfixOps 4 | 5 | object ShellPrompt { 6 | 7 | object devnull extends ProcessLogger { 8 | def info (s: => String): Unit = {} 9 | def error (s: => String): Unit = {} 10 | def buffer[T] (f: => T): T = f 11 | } 12 | 13 | val current = """\*\s+([\w-/]+)""".r 14 | 15 | def gitBranches = "git branch --no-color" lines_! 
devnull mkString
16 | 
17 |   val buildShellPrompt = {
18 |     (state: State) => {
19 |       val currBranch =
20 |         current findFirstMatchIn gitBranches map (_ group(1)) getOrElse "-"
21 |       val currProject = Project.extract (state).currentProject.id
22 |       "%s:%s> ".format (
23 |         currProject, currBranch
24 |       )
25 |     }
26 |   }
27 | }
28 | 
--------------------------------------------------------------------------------
/project/TestSettings.scala:
--------------------------------------------------------------------------------
1 | import sbt.Keys._
2 | import sbt._
3 | import org.scalastyle.sbt.ScalastylePlugin
4 | 
5 | 
6 | object TestSettings {
7 | 
8 |   private[this] lazy val checkScalastyle = taskKey[Unit]("checkScalastyle")
9 | 
10 |   def testSettings: Seq[Def.Setting[_]] = Seq(
11 |     fork in Test := true,
12 | 
13 |     // Run Scalastyle as a part of tests
14 |     checkScalastyle := ScalastylePlugin.scalastyle.in(Compile).toTask("").value,
15 |     test in Test <<= (test in Test) dependsOn checkScalastyle,
16 | 
17 |     // Disable logging in all tests
18 |     javaOptions in Test += "-Dlog4j.configuration=log4j-turned-off.properties",
19 | 
20 |     // Generate JUnit test reports
21 |     testOptions in Test <+= (target in Test) map {
22 |       t => Tests.Argument(TestFrameworks.ScalaTest, "-u", (t / "test-reports").toString)
23 |     }
24 |   )
25 | 
26 | }
27 | 
--------------------------------------------------------------------------------
/sparkext-mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricComputer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.mllib.evaluation
2 | 
3 | import org.apache.spark.mllib.evaluation.binary.{BinaryClassificationMetricComputer, BinaryConfusionMatrix, Recall}
4 | 
5 | /** Reach: the fraction of the total population that is predicted positive. Defined as 1.0 when the total population is empty. 
*/ 6 | private[evaluation] object Reach extends BinaryClassificationMetricComputer { 7 | override def apply(c: BinaryConfusionMatrix): Double = { 8 | val totalPopulation = c.numNegatives + c.numPositives 9 | if (totalPopulation == 0) { 10 | 1.0 11 | } else { 12 | (c.numTruePositives.toDouble + c.numFalsePositives.toDouble) / totalPopulation 13 | } 14 | } 15 | } 16 | 17 | private[evaluation] object Lift extends BinaryClassificationMetricComputer { 18 | override def apply(c: BinaryConfusionMatrix): Double = { 19 | Recall(c) / Reach(c) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /sparkext-test/src/test/scala/com/collective/TestSparkContext.scala: -------------------------------------------------------------------------------- 1 | package com.collective 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | import scala.concurrent.duration._ 7 | import scala.concurrent.{Await, Future} 8 | 9 | object TestSparkContext { 10 | 11 | private[this] val conf = 12 | new SparkConf() 13 | .setMaster("local[1]") 14 | .set("spark.local.ip","localhost") 15 | .set("spark.driver.host","localhost") 16 | .setAppName("Spark Ext Test") 17 | 18 | lazy val sc: SparkContext = new SparkContext(conf) 19 | 20 | lazy val sqlContext: SQLContext = new SQLContext(sc) 21 | } 22 | 23 | 24 | trait TestSparkContext { 25 | 26 | lazy val sc: SparkContext = TestSparkContext.sc 27 | 28 | lazy val sqlContext: SQLContext = TestSparkContext.sqlContext 29 | 30 | def waitFor[T](f: Future[T], timeout: Duration = 5.second): T = { 31 | Await.result(f, timeout) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/sharedParams.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.param.{Param, Params} 4 | 5 | /** 6 | * Trait for shared param keyCol. 7 | */ 8 | private[ml] trait HasKeyCol extends Params { 9 | 10 | /** 11 | * Param for category column name. 12 | * @group param 13 | */ 14 | final val keyCol: Param[String] = new Param[String](this, "keyCol", 15 | "Column that holds value for category name") 16 | 17 | /** @group getParam */ 18 | def getCategoryCol: String = $(keyCol) 19 | } 20 | 21 | /** 22 | * Trait for shared param valueCol. 23 | */ 24 | private[ml] trait HasValueCol extends Params { 25 | 26 | /** 27 | * Param for value column name. 
28 | * @group param 29 | */ 30 | val valueCol: Param[String] = new Param[String](this, "valueCol", 31 | "Column that holds a value for category") 32 | 33 | 34 | /** @group getParam */ 35 | def getValueCol: String = $(valueCol) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | 3 | ## Directory-based project format: 4 | .idea/ 5 | 6 | ## File-based project format: 7 | *.ipr 8 | *.iws 9 | 10 | ## Plugin-specific files: 11 | 12 | # IntelliJ 13 | /out/ 14 | 15 | # mpeltonen/sbt-idea plugin 16 | .idea_modules/ 17 | 18 | # JIRA plugin 19 | atlassian-ide-plugin.xml 20 | 21 | # Crashlytics plugin (for Android Studio and IntelliJ) 22 | com_crashlytics_export_strings.xml 23 | crashlytics.properties 24 | crashlytics-build.properties 25 | ### SBT template 26 | # Simple Build Tool 27 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control 28 | 29 | target/ 30 | lib_managed/ 31 | src_managed/ 32 | project/boot/ 33 | .history 34 | .cache 35 | ### Scala template 36 | *.class 37 | *.log 38 | 39 | # sbt specific 40 | .cache 41 | .history 42 | .lib/ 43 | dist/* 44 | target/ 45 | lib_managed/ 46 | src_managed/ 47 | project/boot/ 48 | project/plugins/project/ 49 | 50 | # Scala-IDE specific 51 | .scala_dependencies 52 | .worksheet 53 | 54 | # Created by .ignore support plugin (hsz.mobi) 55 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/InMemorySparkContext.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | 6 | import scala.concurrent.{Await, Future} 7 | import scala.concurrent.duration._ 8 | 9 | object InMemorySparkContext { 10 | 11 | private[this] val conf = 12 | new SparkConf() 13 | .setMaster("local[4]") 14 | .set("spark.local.ip", "localhost") 15 | .set("spark.driver.host", "localhost") 16 | .set("spark.sql.tungsten.enabled", "false") 17 | .setAppName("Spark Ext Example App") 18 | 19 | lazy val sc: SparkContext = new SparkContext(conf) 20 | 21 | lazy val sqlContext: SQLContext = new SQLContext(sc) 22 | } 23 | 24 | 25 | trait InMemorySparkContext { 26 | 27 | lazy val sc: SparkContext = InMemorySparkContext.sc 28 | 29 | lazy val sqlContext: SQLContext = InMemorySparkContext.sqlContext 30 | 31 | def waitFor[T](f: Future[T], timeout: Duration = 5.second): T = { 32 | Await.result(f, timeout) 33 | } 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/StringToShortIndexerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.ml.attribute.{NominalAttribute, Attribute} 5 | import org.scalatest.FlatSpec 6 | 7 | class StringToShortIndexerSpec extends FlatSpec with TestSparkContext { 8 | 9 | "StringToShortIndexer" should "assign correct index for columns" in { 10 | val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) 11 | val df = sqlContext.createDataFrame(data).toDF("id", "label") 12 | val indexer = new StringToShortIndexer() 13 | 
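      // fit() orders labels by descending frequency, so "a" (3 rows) -> 0, "c" (2 rows) -> 1, "b" (1 row) -> 2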
.setInputCol("label") 14 | .setOutputCol("labelIndex") 15 | .fit(df) 16 | 17 | val transformed = indexer.transform(df) 18 | val attr = Attribute.fromStructField(transformed.schema("labelIndex")) 19 | .asInstanceOf[NominalAttribute] 20 | assert(attr.values.get === Array("a", "c", "b")) 21 | val output = transformed.select("id", "labelIndex").map { r => 22 | (r.getInt(0), r.getShort(1)) 23 | }.collect().toSet 24 | // a -> 0, b -> 2, c -> 1 25 | val expected = Set((0, 0), (1, 2), (2, 1), (3, 0), (4, 0), (5, 1)) 26 | assert(output === expected) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricComputerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.evaluation 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.evaluation.binary.{Recall, BinaryConfusionMatrixImpl, BinaryLabelCounter} 5 | import org.scalatest.{GivenWhenThen, FlatSpec} 6 | 7 | class BinaryModelMetricComputerSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 8 | 9 | val confusions = Seq( 10 | BinaryConfusionMatrixImpl(new BinaryLabelCounter(1, 0), new BinaryLabelCounter(5, 5)), 11 | BinaryConfusionMatrixImpl(new BinaryLabelCounter(5, 2), new BinaryLabelCounter(5, 5)) 12 | ) 13 | 14 | behavior of "AudienceReach" 15 | confusions foreach { 16 | b => { 17 | it should s"compute proper reach for $b" in { 18 | Given(s"confusion matrix entry $b") 19 | val expectedAudienceReach = (b.count.numPositives + b.count.numNegatives).toDouble / 20 | (b.totalCount.numNegatives + b.totalCount.numPositives) 21 | 22 | Then(s"audience reach should be equal to $expectedAudienceReach") 23 | assert(Reach(b) === expectedAudienceReach) 24 | } 25 | } 26 | } 27 | 28 | behavior of "Lift" 29 | confusions foreach { 30 | b => { 31 | it should s"compute proper lift for $b" in { 32 | Given(s"confusion matrix entry $b") 33 | val expectedAudienceReach = (b.count.numPositives + b.count.numNegatives).toDouble / 34 | (b.totalCount.numNegatives + b.totalCount.numPositives) 35 | val expectedLift = Recall(b)/expectedAudienceReach 36 | 37 | Then(s"lift should be equal to $expectedLift") 38 | assert(Lift(b) === expectedLift) 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/S2CellTransformerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest.{FlatSpec, GivenWhenThen} 7 | 8 | 9 | class S2CellTransformerSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 10 | 11 | val schema = StructType(Seq( 12 | StructField("city", StringType), 13 | StructField("lat", DoubleType), 14 | StructField("lon", DoubleType) 15 | )) 16 | 17 | val cities = sqlContext.createDataFrame(sc.parallelize(Seq( 18 | Row("New York", 40.7142700, -74.0059700), 19 | Row("London", 51.50722, -0.12750), 20 | Row("Princeton", 40.3487200, -74.6590500) 21 | )), schema) 22 | 23 | def cellMap(rows: Array[Row]): Map[String, String] = { 24 | rows.map { case Row(city: String, _, _, cell: String) => city -> cell }.toMap 25 | } 26 | 27 | "S2 Cell Transformer" should "compute S2 Cell Id for level = 6" in { 28 | 
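    // at level 6 an S2 cell spans on the order of 100+ km, so New York and Princeton (~70 km apart) are expected to fall into the same cell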
Given("S2 Cell Transformer with level = 6") 29 | val s2CellTransformer = new S2CellTransformer().setLevel(6) 30 | val transformed = s2CellTransformer.transform(cities) 31 | val cells = cellMap(transformed.collect()) 32 | Then("New York should be in the same cell with Princeton") 33 | assert(cells("New York") == cells("Princeton")) 34 | } 35 | 36 | it should "compute S2 Cell Id for level = 12" in { 37 | Given("S2 Cell Transformer with level = 12") 38 | val s2CellTransformer = new S2CellTransformer().setLevel(12) 39 | val transformed = s2CellTransformer.transform(cities) 40 | val cells = cellMap(transformed.collect()) 41 | Then("all cities should in it's onw cell") 42 | assert(cells.values.toSet.size == 3) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.evaluation 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.scalatest.{GivenWhenThen, FlatSpec} 6 | 7 | /** 8 | * We are just testing gains and lift methods. 9 | * Since code for this class was copied from spark 1.5.0 10 | */ 11 | class BinaryModelMetricsSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 12 | 13 | val scoreAndLabels: RDD[(Double, Double)] = sc.parallelize(Seq( 14 | (0.8, 0.0), 15 | (0.7, 1.0), 16 | (0.3, 0.0), 17 | (0.9, 1.0), 18 | (0.6, 0.0), 19 | (0.6, 1.0), 20 | (0.6, 0.0), 21 | (0.8, 1.0), 22 | (0.2, 0.0), 23 | (0.5, 1.0) 24 | ), 1) 25 | 26 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels) 27 | 28 | behavior of "BinaryModelMetrics" 29 | 30 | it should "compute gains chart" in { 31 | Given(s"score and labels set with 7 unique scores") 32 | When("creating BinaryModelMetrics without bins specified") 33 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels) 34 | val gainsChart = modelMetricsNoBin.gains() 35 | 36 | Then("resulting gains chart should have 9 pair of coordinates") 37 | assert(gainsChart.count() === 9) 38 | } 39 | 40 | 41 | it should "compute gains chart with numBins = 3" in { 42 | Given(s"score and labels set with 7 unique scores") 43 | When("creating BinaryModelMetrics with 3 bins specified") 44 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels, 3) 45 | val gainsChart = modelMetricsNoBin.gains() 46 | 47 | val expectedGainsPoints = (1 + Math.ceil(7.toDouble/(7/3)) + 1).toInt 48 | Then(s"resulting gains chart should have $expectedGainsPoints pair of coordinates") 49 | assert(gainsChart.count() === expectedGainsPoints) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/SplitOptimizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.scalatest._ 4 | 5 | class SplitOptimizerSpec extends FlatSpec with ShouldMatchers with SplitOptimizer { 6 | 7 | "SplitOptimizer" should "get from diff to original values" in { 8 | val diff = Array(0.1, 0.21, 0.05, 0.5) 9 | assert(fromDiff(diff).toSeq == Seq(0.1, 0.31, 0.36, 0.86)) 10 | } 11 | 12 | it should "get diff from original values" in { 13 | val values = Array(0.1, 0.31, 0.37, 0.88) 14 | assert(toDiff(values).toSeq == Seq(0.1, 0.21, 0.06, 0.51)) 15 | } 16 | 17 | it should "calculate perfect split of 9" in { 18 | 
val x = (0 until 100).toArray.map(_.toDouble + math.random - math.random) 19 | 20 | val splits = optimalSplit(x, 9) 21 | assert(splits.length == 9) 22 | 23 | splits.zipWithIndex.foreach { case (s, idx) => 24 | s should be ((idx + 1) * (x.length.toDouble / 10) +- 2.5) 25 | } 26 | } 27 | 28 | it should "calculate perfect split for highly skewed data" in { 29 | 30 | // R: x <- exp(rnorm(1000)) 31 | 32 | // Heavy right skewed data 33 | val g = breeze.stats.distributions.Gaussian(0, 1) 34 | val skewed = g.sample(1000).map(d => math.exp(d)).toArray 35 | 36 | val splits = optimalSplit(skewed, 9) 37 | assert(splits.length == 9) 38 | 39 | val cnt = counts(skewed)(splits) 40 | assert(cnt.sum == skewed.length) 41 | 42 | cnt.foreach { count => 43 | count should be((skewed.length / 10) +- 5) 44 | } 45 | } 46 | 47 | private def counts(x: Array[Double])(p: Seq[Double]): Seq[Int] = { 48 | val splits = Double.NegativeInfinity +: p :+ Double.PositiveInfinity 49 | 50 | val count = splits.sliding(2) map { case split => 51 | val low = split(0) 52 | val high = split(1) 53 | val filter = (v: Double) => v >= low && v < high 54 | x.count(filter) 55 | } 56 | 57 | count.toSeq 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest._ 7 | 8 | import scala.collection.mutable 9 | 10 | class GatherSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 11 | 12 | val schema = StructType(Seq( 13 | StructField("cookie_id", StringType), 14 | StructField("site", StringType), 15 | StructField("impressions", LongType) 16 | )) 17 | 18 | val cookie1 = "cookie1" 19 | val cookie2 = "cookie2" 20 | val cookie3 = "cookie3" 21 | 22 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq( 23 | Row(cookie1, "google.com", 10L), 24 | Row(cookie1, "cnn.com", 14L), 25 | Row(cookie1, "google.com", 2L), 26 | Row(cookie2, "bbc.com", 20L), 27 | Row(cookie2, "auto.com", null), 28 | Row(cookie2, "auto.com", 1L), 29 | Row(cookie3, "sport.com", 100L) 30 | )), schema) 31 | 32 | "Gather Transformer" should "transform 'long' DataFrame into 'wide'" in { 33 | val gather = new Gather() 34 | .setPrimaryKeyCols("cookie_id") 35 | .setKeyCol("site") 36 | .setValueCol("impressions") 37 | .setOutputCol("sites") 38 | 39 | val gathered = gather.transform(impressionLog) 40 | 41 | val lookupMap: Map[String, Map[String, Double]] = 42 | gathered.collect().map { case Row(cookieId: String, map: mutable.WrappedArray[_]) => 43 | val imps = map.map { case Row(site: String, impressions: Double) => site -> impressions }.toMap 44 | cookieId -> imps 45 | }.toMap 46 | 47 | assert(lookupMap(cookie1)("google.com") == 12.0) 48 | assert(lookupMap(cookie1)("cnn.com") == 14.0) 49 | assert(lookupMap(cookie2)("bbc.com") == 20.0) 50 | assert(lookupMap(cookie2)("auto.com") == 1.0) 51 | assert(lookupMap(cookie3)("sport.com") == 100.0) 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /sparkext-sql/src/test/scala/org/apache/spark/sql/ExtAggregatesSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | import 
com.collective.TestSparkContext 4 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 5 | import org.scalatest.FlatSpec 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.ext.functions._ 8 | 9 | import scala.collection.mutable 10 | 11 | class ExtAggregatesSpec extends FlatSpec with TestSparkContext { 12 | 13 | val schema = StructType(Seq( 14 | StructField("cookie_id", StringType), 15 | StructField("site", StringType), 16 | StructField("impressions", LongType) 17 | )) 18 | 19 | val cookie1 = "cookie1" 20 | val cookie2 = "cookie2" 21 | val cookie3 = "cookie3" 22 | 23 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq( 24 | Row(cookie1, "google.com", 10L), 25 | Row(cookie1, "cnn.com", 14L), 26 | Row(cookie1, "google.com", 2L), 27 | Row(cookie2, "bbc.com", 20L), 28 | Row(cookie2, "auto.com", null), 29 | Row(cookie2, "auto.com", 1L), 30 | Row(cookie3, "sport.com", 100L) 31 | )), schema) 32 | 33 | "Ext Aggregates" should "collect column values as array" in { 34 | val cookies = impressionLog 35 | .select(collectArray(col("cookie_id"))) 36 | .first().getAs[mutable.WrappedArray[String]](0) 37 | assert(cookies.length == 7) 38 | assert(cookies.toSet.size == 3) 39 | } 40 | 41 | it should "collect distinct values as array" in { 42 | val distinctCookies = impressionLog.select(col("cookie_id")) 43 | .distinct() 44 | .select(collectArray(col("cookie_id"))) 45 | .first().getAs[mutable.WrappedArray[String]](0) 46 | assert(distinctCookies.length == 3) 47 | } 48 | 49 | it should "collect values after group by" in { 50 | val result = impressionLog 51 | .groupBy(col("cookie_id")) 52 | .agg(collectArray(col("site"))) 53 | 54 | val cookieSites = result.collect().map { case Row(cookie: String, sites: mutable.WrappedArray[_]) => 55 | cookie -> sites.toSeq 56 | }.toMap 57 | 58 | assert(cookieSites(cookie1).length == 3) 59 | assert(cookieSites(cookie2).length == 3) 60 | assert(cookieSites(cookie3).length == 1) 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/sampling/DownsamplingSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import java.util.UUID 4 | 5 | import com.collective.TestSparkContext 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest._ 9 | 10 | import scala.util.Random 11 | 12 | 13 | class DownsamplingSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 14 | 15 | val schema = StructType(Seq( 16 | StructField("cookie_id", StringType), 17 | StructField("label", DoubleType) 18 | )) 19 | 20 | def cookieId = UUID.randomUUID().toString 21 | 22 | def positives(n: Int): Seq[Row] = Seq.fill(n)(Row(cookieId, 1.0)) 23 | def negatives(n: Int): Seq[Row] = Seq.fill(n)(Row(cookieId, 0.0)) 24 | 25 | val dataset1 = sqlContext.createDataFrame(sc.parallelize(Random.shuffle(positives(100) ++ negatives(900))), schema) 26 | val dataset2 = sqlContext.createDataFrame(sc.parallelize(Random.shuffle(positives(100) ++ negatives(9000))), schema) 27 | 28 | "Downsampling" should "skip sampling if class ratio is below threshold" in { 29 | val downsampling = new Downsampling() 30 | .setLabelCol("label") 31 | .setOutputCol("sample_weight") 32 | .setPrimaryClass(1.0) 33 | 34 | val model = downsampling.fit(dataset1) 35 | assert(model.sampleFraction.isEmpty) 36 | 37 | val sampled = 
model.transform(dataset1) 38 | assert(sampled.schema("sample_weight").dataType == DoubleType) 39 | 40 | val w = sampled.select("sample_weight").collect().map(_.getDouble(0)).toSet 41 | assert(w.size == 1) 42 | assert(w.head == 1.0) 43 | } 44 | 45 | it should "sample negatives if class ratio is above threshold" in { 46 | val downsampling = new Downsampling() 47 | .setLabelCol("label") 48 | .setOutputCol("sample_weight") 49 | .setMaxClassRatio(29.0) 50 | .setPrimaryClass(1.0) 51 | 52 | val model = downsampling.fit(dataset2) 53 | assert(model.sampleFraction.isDefined) 54 | val fraction = model.sampleFraction.get 55 | val expectedFraction = 2900.0 / 9000 56 | fraction should (be >= 0.9 * expectedFraction and be <= 1.1 * expectedFraction) 57 | 58 | val sampled = model.transform(dataset2) 59 | assert(sampled.schema("sample_weight").dataType == DoubleType) 60 | 61 | sampled.count() should (be >= 2900L and be <= 3100L) 62 | 63 | val sampleWeight = sampled.select("label", "sample_weight").collect().map(r => r.getDouble(0) -> r.getDouble(1)).toMap 64 | assert(sampleWeight.size == 2) 65 | 66 | val expectedSampleWeight = 9000.0 / 2900 67 | sampleWeight(1.0) should equal (1.0) 68 | sampleWeight(0.0) should (be >= 0.9 * expectedSampleWeight and be <= 1.1 * expectedSampleWeight) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/S2CellTransformer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.google.common.geometry.{S2LatLng, S2CellId} 4 | import org.apache.spark.ml.Transformer 5 | import org.apache.spark.ml.attribute.NominalAttribute 6 | import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types.{DoubleType, StructType} 11 | 12 | /** 13 | * Transform latitude and longitude into S2 Cell id 14 | */ 15 | class S2CellTransformer(override val uid: String) extends Transformer { 16 | 17 | def this() = this(Identifiable.randomUID("S2CellTransformer")) 18 | 19 | // Input/Output column names 20 | 21 | val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") 22 | 23 | val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") 24 | 25 | val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") 26 | 27 | val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", 28 | (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) 29 | 30 | // Default parameters 31 | 32 | setDefault( 33 | latCol -> "lat", 34 | lonCol -> "lon", 35 | cellCol -> "cell", 36 | level -> 10 37 | ) 38 | 39 | def getLatCol: String = $(latCol) 40 | 41 | def getLonCol: String = $(lonCol) 42 | 43 | def getCellCol: String = $(cellCol) 44 | 45 | def getLevel: Int = $(level) 46 | 47 | def setLatCol(value: String): this.type = set(latCol, value) 48 | 49 | def setLonCol(value: String): this.type = set(lonCol, value) 50 | 51 | def setCellCol(value: String): this.type = set(cellCol, value) 52 | 53 | def setLevel(value: Int): this.type = set(level, value) 54 | 55 | override def transform(dataset: DataFrame): DataFrame = { 56 | val outputSchema = transformSchema(dataset.schema) 57 | val currentLevel = $(level) 58 | val t = udf { (lat: Double, lon: Double) => 59 | 
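      // leaf cell for the (lat, lon) point, then its ancestor at the configured level, emitted as a token string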
val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) 60 | cellId.parent(currentLevel).toToken 61 | } 62 | val metadata = outputSchema($(cellCol)).metadata 63 | dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) 64 | } 65 | 66 | override def transformSchema(schema: StructType): StructType = { 67 | val latColumnName = $(latCol) 68 | val latDataType = schema(latColumnName).dataType 69 | require(latDataType == DoubleType, 70 | s"The latitude column $latColumnName must be Double type, " + 71 | s"but got $latDataType.") 72 | 73 | val lonColumnName = $(lonCol) 74 | val lonDataType = schema(lonColumnName).dataType 75 | require(lonDataType == DoubleType, 76 | s"The longitude column $lonColumnName must be Double type, " + 77 | s"but got $lonDataType.") 78 | 79 | val inputFields = schema.fields 80 | val outputColName = $(cellCol) 81 | require(inputFields.forall(_.name != outputColName), 82 | s"Output column $outputColName already exists.") 83 | 84 | val attr = NominalAttribute.defaultAttr.withName($(cellCol)) 85 | val outputFields = inputFields :+ attr.toStructField() 86 | StructType(outputFields) 87 | } 88 | 89 | override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) 90 | } 91 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/DataGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import java.io.{PrintWriter, File} 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Generate dummy dataset based on positive/negative predictors: site visitation log + geo location log 9 | */ 10 | object DataGenerator extends App with PositivePredictors with NegativePredictors { 11 | 12 | val (positive, negative) = Seq.fill(1000)(Random.alphanumeric.take(15).mkString).splitAt(100) 13 | 14 | val (pSites, pGeo, pResp) = generateDataset(positive, positivePredictors, negativePredictors, response = 1) 15 | val (nSites, nGeo, nResp) = generateDataset(negative, negativePredictors, positivePredictors, response = 0) 16 | 17 | // Write site impression log 18 | val sitesW = new PrintWriter(new File("sites.csv")) 19 | sitesW.println("cookie,site,impressions") 20 | (pSites ++ nSites).foreach(sitesW.println) 21 | sitesW.close() 22 | 23 | // Write geo impression log 24 | val geoW = new PrintWriter(new File("geo.csv")) 25 | geoW.println("cookie,lat,lon,impressions") 26 | (pGeo ++ nGeo).foreach(geoW.println) 27 | geoW.close() 28 | 29 | // Write response log 30 | val responseW = new PrintWriter(new File("response.csv")) 31 | responseW.println("cookie,response") 32 | (pResp ++ nResp).foreach(responseW.println) 33 | responseW.close() 34 | 35 | private def generateDataset( 36 | cookies: Seq[String], 37 | primaryPredictors: Predictors, 38 | secondaryPredictors: Predictors, 39 | response: Int, 40 | primaryImpMean: Int = 10, 41 | secondaryImpMean: Int = 3 42 | ): (Seq[String], Seq[String], Seq[String]) = { 43 | 44 | def impressions(mean: Int): Int = math.max(1, mean + (mean * rnd.nextGaussian()).toInt) 45 | 46 | val sites = cookies.flatMap { cookie => 47 | val primary = primaryPredictors.sites(6).map((_, impressions(primaryImpMean))) 48 | val secondary = secondaryPredictors.sites(3).map((_, impressions(secondaryImpMean))) 49 | (primary ++ secondary) map { case (site, imp) => s"$cookie,$site,$imp" } 50 | } 51 | 52 | val geo = cookies.flatMap { cookie => 53 | val primary = 
primaryPredictors.latLon(2).map((_, impressions(primaryImpMean))) 54 | val secondary = secondaryPredictors.latLon(1).map((_, impressions(secondaryImpMean))) 55 | (primary ++ secondary) map { case ((lat, lon), imp) => s"$cookie,$lat,$lon,$imp" } 56 | } 57 | 58 | val resp = cookies.map { cookie => s"$cookie,$response" } 59 | 60 | (sites, geo, resp) 61 | } 62 | 63 | } 64 | 65 | trait Predictors { 66 | 67 | def lat: Double 68 | def lon: Double 69 | def allSites: Seq[String] 70 | 71 | def sites(n: Int): Seq[String] = 72 | rnd.shuffle(allSites).take(1 + rnd.nextInt(n)) 73 | 74 | def latLon(n: Int): Seq[(Double, Double)] = 75 | Seq.fill(1 + rnd.nextInt(n))((lat + 3 * rnd.nextGaussian(), lon + 3 * rnd.nextGaussian())) 76 | } 77 | 78 | trait PositivePredictors { 79 | 80 | val positivePredictors = new Predictors { 81 | 82 | // New York 83 | val lat = 40.7127 84 | val lon = 74.0059 85 | 86 | val allSites = Seq( 87 | "google.com", "facebook.com", "amazon.com", 88 | "youtube.com", "yahoo.com", "ebay.com", "wikipedia.org", 89 | "twitter.com", "craiglist.com", "reddit.com", "netflix.com", 90 | "live.com", "bing.com", "linkedin.com", "pinterest.com" 91 | ) 92 | 93 | } 94 | } 95 | 96 | trait NegativePredictors { 97 | 98 | val negativePredictors = new Predictors { 99 | 100 | // Los Angeles 101 | val lat = 34.0500 102 | val lon = 118.2500 103 | 104 | val allSites = Seq( 105 | "imgur.com", "go.com", "tumblr.com", "espn.go.com", 106 | "cnn.com", "paypal.com", "chase.com", "instagram.com", "blogpost.com", 107 | "t.co", "msn.com", "imdb.com", "nytimes.com", "walmart.com", 108 | "huffingtonpost.com", "yelp.com", "diply.com" 109 | ) 110 | 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/BinningSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import java.util.UUID 4 | 5 | import com.collective.TestSparkContext 6 | import org.apache.spark.mllib.linalg.{SparseVector, Vector} 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.types._ 9 | import org.scalatest.{FlatSpec, GivenWhenThen, ShouldMatchers} 10 | 11 | class BinningSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 12 | 13 | val schema = StructType(Seq( 14 | StructField("cookie_id", StringType), 15 | StructField("num_days", IntegerType), 16 | StructField("ctr", DoubleType), 17 | StructField("actions", DoubleType) 18 | )) 19 | 20 | val N = 1000 21 | 22 | def cookieId = UUID.randomUUID().toString 23 | 24 | val users = sqlContext.createDataFrame(sc.parallelize((1 to N).map { i => 25 | Row(cookieId, i, math.random, if (math.random > 0.5) 10 * math.random else null) 26 | }), schema) 27 | 28 | "Optimal Binning" should "compute binning for ctr" in { 29 | val optimalBinning = new OptimalBinning() 30 | .setInputCol("ctr") 31 | .setOutputCol("ctr_bin") 32 | .setNumBins(5) 33 | 34 | val binning = optimalBinning.fit(users) 35 | 36 | assert(binning.getSplits.length == 6) 37 | binning.getSplits(1) should be(0.20 +- 0.5) 38 | binning.getSplits(2) should be(0.40 +- 0.5) 39 | binning.getSplits(3) should be(0.60 +- 0.5) 40 | binning.getSplits(4) should be(0.80 +- 0.5) 41 | 42 | val binned = binning.transform(users).collect() 43 | assert(binned.length == N) 44 | } 45 | 46 | "Binning" should "bin DoubleType column" in { 47 | val binning = new Binning() 48 | .setInputCol("ctr") 49 | .setOutputCol("ctr_bin") 50 | 
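      // the splits below define four bins over [0, 1): [0, 0.25), [0.25, 0.5), [0.5, 0.75), [0.75, 1.0)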
.setSplits(Array(0.0, 0.25, 0.5, 0.75, 1.0)) 51 | 52 | def validate(ctr: Double, bin: Vector) = { 53 | assert(bin.size == 4) 54 | assert(bin.toSparse.indices.length == 1) 55 | assert(ctr match { 56 | case v if v >= 0.0 && v < 0.25 => bin.toSparse.indices.head == 0 57 | case v if v >= 0.25 && v < 0.50 => bin.toSparse.indices.head == 1 58 | case v if v >= 0.50 && v < 0.75 => bin.toSparse.indices.head == 2 59 | case v if v >= 0.75 && v < 1.0 => bin.toSparse.indices.head == 3 60 | 61 | }) 62 | } 63 | 64 | val binned = binning.transform(users) 65 | binned.collect().foreach { case Row(_, _, ctr: Double, _, bin: SparseVector) => 66 | validate(ctr, bin) 67 | } 68 | } 69 | 70 | it should "bin IntegerType column" in { 71 | val binning = new Binning() 72 | .setInputCol("num_days") 73 | .setOutputCol("num_days_bin") 74 | .setSplits(Array(0.0, 400, 800, 1000)) 75 | 76 | def validate(numDays: Int, bin: Vector) = { 77 | assert(bin.size == 3) 78 | assert(bin.toSparse.indices.length == 1) 79 | assert(numDays match { 80 | case v if v >= 0 && v < 400 => bin.toSparse.indices.head == 0 81 | case v if v >= 400 && v < 800 => bin.toSparse.indices.head == 1 82 | case v if v >= 800 && v <= 1000 => bin.toSparse.indices.head == 2 83 | }) 84 | } 85 | 86 | val binned = binning.transform(users) 87 | 88 | binned.collect().foreach { case Row(_, numDays: Int, _, _, bin: SparseVector) => 89 | validate(numDays, bin) 90 | } 91 | } 92 | 93 | it should "fail to bin StringType column" in { 94 | val binning = new Binning() 95 | .setInputCol("cookie_id") 96 | .setOutputCol("cookie_id_bins") 97 | .setSplits(Array(0.0, 400, 800, 1000)) 98 | 99 | intercept[IllegalArgumentException] { 100 | binning.transform(users) 101 | } 102 | } 103 | 104 | it should "bin column with nulls" in { 105 | val binning = new Binning() 106 | .setInputCol("actions") 107 | .setOutputCol("actions_bins") 108 | .setSplits(Array(0.0, 4.0, 8.0, 10.0)) 109 | 110 | binning.transform(users) 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/StringToShortIndexer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.ml.{Estimator, Model} 5 | import org.apache.spark.ml.attribute.NominalAttribute 6 | import org.apache.spark.ml.param._ 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types._ 11 | import org.apache.spark.util.collection.OpenHashMap 12 | 13 | /** 14 | * A label indexer that maps a string column of labels to an ML column of label indices. 15 | * If the input column is numeric, we cast it to string and index the string values. 16 | * The indices are in [0, numLabels), ordered by label frequencies. 17 | * So the most frequent label gets index 0. 
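 * For example, fitting on the labels ("a", "b", "a") produces the mapping a -> 0, b -> 1.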
18 | * 19 | * In contrast to Spark [[StringIndexer]] use Short for labels (instead of Double) 20 | */ 21 | class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel] 22 | with StringIndexerBase { 23 | 24 | def this() = this(Identifiable.randomUID("strShortIdx")) 25 | 26 | def setInputCol(value: String): this.type = set(inputCol, value) 27 | 28 | def setOutputCol(value: String): this.type = set(outputCol, value) 29 | 30 | override def fit(dataset: DataFrame): StringToShortIndexerModel = { 31 | val counts = dataset.select(col($(inputCol)).cast(StringType)) 32 | .map(_.getString(0)) 33 | .countByValue() 34 | val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray 35 | require(labels.length <= Short.MaxValue, 36 | s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") 37 | copyValues(new StringToShortIndexerModel(uid, labels).setParent(this)) 38 | } 39 | 40 | override def transformSchema(schema: StructType): StructType = { 41 | validateAndTransformSchema(schema) 42 | } 43 | 44 | override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra) 45 | } 46 | 47 | class StringToShortIndexerModel ( 48 | override val uid: String, 49 | val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase { 50 | 51 | def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) 52 | 53 | require(labels.length <= Short.MaxValue, 54 | s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") 55 | 56 | private val labelToIndex: OpenHashMap[String, Short] = { 57 | val n = labels.length.toShort 58 | val map = new OpenHashMap[String, Short](n) 59 | var i: Short = 0 60 | while (i < n) { 61 | map.update(labels(i), i) 62 | i = (i + 1).toShort 63 | } 64 | map 65 | } 66 | 67 | def setInputCol(value: String): this.type = set(inputCol, value) 68 | 69 | def setOutputCol(value: String): this.type = set(outputCol, value) 70 | 71 | override def transform(dataset: DataFrame): DataFrame = { 72 | if (!dataset.schema.fieldNames.contains($(inputCol))) { 73 | logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " + 74 | "Skip StringToShortIndexerModel.") 75 | return dataset 76 | } 77 | 78 | val indexer = udf { label: String => 79 | if (labelToIndex.contains(label)) { 80 | labelToIndex(label) 81 | } else { 82 | // TODO: handle unseen labels 83 | throw new SparkException(s"Unseen label: $label.") 84 | } 85 | } 86 | val outputColName = $(outputCol) 87 | val metadata = NominalAttribute.defaultAttr 88 | .withName(outputColName).withValues(labels).toMetadata() 89 | dataset.select(col("*"), 90 | indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) 91 | } 92 | 93 | override def transformSchema(schema: StructType): StructType = { 94 | if (schema.fieldNames.contains($(inputCol))) { 95 | validateAndTransformSchema(schema) 96 | } else { 97 | // If the input column does not exist during transformation, we skip StringToShortIndexerModel. 
98 | schema 99 | } 100 | } 101 | 102 | override def copy(extra: ParamMap): StringToShortIndexerModel = { 103 | val copied = new StringToShortIndexerModel(uid, labels) 104 | copyValues(copied, extra).setParent(parent) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/Gather.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.param.shared.HasOutputCol 6 | import org.apache.spark.ml.util.Identifiable 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.ext.functions._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types._ 11 | 12 | private[feature] trait GatherParams extends Params with HasKeyCol with HasValueCol with HasOutputCol { 13 | 14 | val primaryKeyCols: Param[Array[String]] = new StringArrayParam(this, "primaryKeyCols", 15 | "Primary key column names", 16 | ParamValidators.arrayLengthGt(0)) 17 | 18 | val valueAgg: Param[String] = new Param[String](this, "valueAgg", 19 | "Aggregate function applied to valueCol: 'sum' or 'count'", 20 | ParamValidators.inArray(Array("sum", "count"))) 21 | 22 | def getPrimaryKeyCols: Array[String] = $(primaryKeyCols) 23 | 24 | def getValueAgg: String = $(valueAgg) 25 | } 26 | 27 | /** 28 | * Inspired by R `tidyr` and `reshape2` packages. Convert long [[org.apache.spark.sql.DataFrame DataFrame]] with values 29 | * for each key into wide [[org.apache.spark.sql.DataFrame DataFrame]], applying aggregation function if single 30 | * key has multiple values 31 | * {{{ 32 | * cookie_id | site_id | impressions 33 | * ----------|---------|-------------- 34 | * cookieAA | 123 | 10 35 | * cookieAA | 123 | 5 36 | * cookieAA | 456 | 20 37 | * }}} 38 | * 39 | * gathered using `sum` agregate 40 | * 41 | * {{{ 42 | * cookie_id | output_col 43 | * ----------|------------------------ 44 | * cookieAA | [{ site_id: 123, impressions: 15.0 }, { site_id: 456, impressions: 20.0 }] 45 | * }}} 46 | */ 47 | class Gather(override val uid: String) extends Transformer with GatherParams { 48 | 49 | def this() = this(Identifiable.randomUID("gather")) 50 | 51 | def setPrimaryKeyCols(value: String*): this.type = set(primaryKeyCols, value.toArray) 52 | 53 | def setKeyCol(value: String): this.type = set(keyCol, value) 54 | 55 | def setValueCol(value: String): this.type = set(valueCol, value) 56 | 57 | def setValueAgg(value: String): this.type = set(valueAgg, value) 58 | 59 | def setOutputCol(value: String): this.type = set(outputCol, value) 60 | 61 | setDefault( 62 | valueAgg -> "sum" 63 | ) 64 | 65 | override def transform(dataset: DataFrame): DataFrame = { 66 | val outputSchema = transformSchema(dataset.schema) 67 | 68 | val pkCols = $(primaryKeyCols).map(col) 69 | 70 | val grouped = dataset.groupBy(pkCols :+ col($(keyCol)) : _*) 71 | val aggregateCol = s"${uid}_value_aggregate" 72 | val aggregated = $(valueAgg) match { 73 | case "sum" => grouped.agg(sum($(valueCol)) as aggregateCol) 74 | case "count" => grouped.agg(count($(valueCol)) as aggregateCol) 75 | } 76 | 77 | val metadata = outputSchema($(outputCol)).metadata 78 | 79 | aggregated 80 | .groupBy(pkCols: _*) 81 | .agg(collectArray(struct( 82 | col($(keyCol)), 83 | col(aggregateCol).cast(DoubleType).as($(valueCol)) 84 | )).as($(outputCol), metadata)) 85 | } 86 | 87 | override def 
transformSchema(schema: StructType): StructType = {
88 |     val valueFunName = $(valueAgg)
89 | 
90 |     val keyColName = $(keyCol)
91 |     val keyColDataType = schema(keyColName).dataType
92 |     keyColDataType match {
93 |       case _: NumericType =>
94 |       case _: StringType =>
95 |       case other =>
96 |         throw new IllegalArgumentException(s"Key column data type $other is not supported.")
97 |     }
98 | 
99 |     val valueColName = $(valueCol)
100 |     val valueColDataType = schema(valueColName).dataType
101 |     valueColDataType match {
102 |       case _: NumericType =>
103 |       case _: StringType if valueFunName == "count" =>
104 |       case other =>
105 |         throw new IllegalArgumentException(s"Value data type $other is not supported with value aggregate $valueFunName.")
106 |     }
107 | 
108 |     val pkFields = $(primaryKeyCols).map(schema.apply)
109 |     val rollupType = StructType(Array(
110 |       StructField($(keyCol), keyColDataType),
111 |       StructField($(valueCol), DoubleType)
112 |     ))
113 |     val rollupField = StructField($(outputCol), ArrayType(rollupType), nullable = false)
114 | 
115 |     StructType(pkFields :+ rollupField)
116 |   }
117 | 
118 |   override def copy(extra: ParamMap): Gather = defaultCopy(extra)
119 | 
120 | }
121 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Ext
2 | 
3 | [![Build Status](https://travis-ci.org/collectivemedia/spark-ext.svg?branch=master)](https://travis-ci.org/collectivemedia/spark-ext)
4 | 
5 | Spark ML transformers, estimators, Spark SQL aggregations, and other utilities that are missing in Apache Spark.
6 | 
7 | That's how we are doing [Audience Modeling](https://databricks.com/blog/2015/10/20/audience-modeling-with-spark-ml-pipelines.html) at Collective.
8 | 
9 | ## Where to get it
10 | 
11 | ``` scala
12 | resolvers += "Collective Media Bintray" at "https://dl.bintray.com/collectivemedia/releases"
13 | ```
14 | 
15 | And use the following library dependencies:
16 | 
17 | ```
18 | libraryDependencies += "com.collective.sparkext" %% "sparkext-sql" % "0.0.23"
19 | libraryDependencies += "com.collective.sparkext" %% "sparkext-mllib" % "0.0.23"
20 | ```
21 | 
22 | ## Testing
23 | 
24 |     sbt test
25 | 
26 | ## Spark SQL
27 | 
28 | ``` scala
29 | val schema = StructType(Seq(
30 |   StructField("cookie_id", StringType),
31 |   StructField("site", StringType),
32 |   StructField("impressions", LongType)
33 | ))
34 | 
35 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq(
36 |   Row("cookie_1", "google.com", 10L),
37 |   Row("cookie_2", "cnn.com", 14L),
38 |   ...
39 | )), schema) 40 | ``` 41 | 42 | #### CollectArray 43 | 44 | Aggregation function that collects all values from a column 45 | 46 | ``` scala 47 | import org.apache.spark.sql.ext.functions._ 48 | 49 | // collects all sites for cookie (with duplicates) 50 | impressionLog 51 | .groupBy(col("cookie_id")) 52 | .agg(collectArray(col("site"))) 53 | ``` 54 | 55 | ## Spark ML 56 | 57 | #### S2 Geometry CellId transformer 58 | 59 | Gets Google S2 Geometry CellId from decimal `lat` and `lon` 60 | 61 | ``` scala 62 | val schema = StructType(Seq( 63 | StructField("city", StringType), 64 | StructField("lat", DoubleType), 65 | StructField("lon", DoubleType) 66 | )) 67 | 68 | val cities = sqlContext.createDataFrame(sc.parallelize(Seq( 69 | Row("New York", 40.7142700, -74.0059700), 70 | Row("London", 51.50722, -0.12750), 71 | Row("Princeton", 40.3487200, -74.6590500) 72 | )), schema) 73 | 74 | val s2CellTransformer = new S2CellTransformer().setLevel(6) 75 | s2CellTransformer.transform(cities) 76 | ``` 77 | 78 | #### Optimal Binning 79 | 80 | Continuous features may need to be transformed to binary format using binning to account for nonlinearity. In general, 81 | binning attempts to break a set of ordered values into evenly distributed groups, such that each group 82 | contains approximately the same number of values from the sample. 83 | 84 | #### Gather 85 | 86 | Inspired by R `tidyr` and `reshape2` packages. Convert `long` `DataFrame` with values 87 | for each key into `wide` `DataFrame`, applying aggregation function if single 88 | key has multiple values 89 | 90 | cookie_id | site_id | impressions 91 | ----------|---------|------------- 92 | cookieAA | 123 | 10 93 | cookieAA | 123 | 5 94 | cookieAA | 456 | 20 95 | 96 | ``` scala 97 | val gather = new Gather() 98 | .setPrimaryKeyCols("cookie_id") 99 | .setKeyCol("site_id") 100 | .setValueCol("impressions") 101 | .setOutputCol("sites") 102 | val gathered = gather.transform(siteLog) 103 | ``` 104 | 105 | cookie_id | sites 106 | ----------|------------- 107 | cookieAA | [{ site_id: 123, impressions: 15.0 }, { site_id: 456, impressions: 20.0 }] 108 | 109 | #### Gather Encoder 110 | 111 | Encode categorical key-value pairs using dummy variables. 112 | 113 | cookie_id | sites 114 | ----------|------------------------------------------------------------------------ 115 | cookieAA | [{ site_id: 1, impressions: 15.0 }, { site_id: 2, impressions: 20.0 }] 116 | cookieBB | [{ site_id: 2, impressions: 7.0 }, { site_id: 3, impressions: 5.0 }] 117 | 118 | transformed into 119 | 120 | cookie_id | site_features 121 | ----------|------------------------ 122 | cookieAA | [ 15.0 , 20.0 , 0 ] 123 | cookieBB | [ 0.0 , 7.0 , 5.0 ] 124 | 125 | Optionally apply dimensionality reduction using `top` transformation: 126 | - Top coverage, is selecting categorical values by computing the count of distinct users for each value, 127 | sorting the values in descending order by the count of users, and choosing the top values from the resulting 128 | list such that the sum of the distinct user counts over these values covers c percent of all users, 129 | for example, selecting top sites covering 99% of users. 130 | 131 | 132 | #### Downsampling Negatives 133 | 134 | If class ratio between positives and negatives is too high, you might want to downsample all you negatives before building a model. 
135 | 136 | ``` scala 137 | val downsampling = new Downsampling() 138 | .setLabelCol("label") 139 | .setOutputCol("sample_weight") 140 | .setMaxClassRatio(30.0) 141 | .setPrimaryClass(1.0) // positive class to keep as-is 142 | ``` 143 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/sampling/Downsampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.param.shared.{HasLabelCol, HasOutputCol} 4 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} 5 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 6 | import org.apache.spark.ml.{Estimator, Model} 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.types._ 10 | 11 | private[sampling] trait DownsamplingParams 12 | extends Params with HasLabelCol with HasOutputCol { 13 | 14 | val primaryClass: Param[Double] = new Param[Double](this, "primaryClass", 15 | "Primary class to keep (0.0 or 1.0)", 16 | (v: Double) => v == 0.0 || v == 1.0) 17 | 18 | val sampleWithReplacement: Param[Boolean] = new Param[Boolean](this, "sampleWithReplacement", 19 | "Sample secondary class with replacement") 20 | 21 | def getPrimaryClass: Double = $(primaryClass) 22 | 23 | def getSampleWithReplacement: Boolean = $(sampleWithReplacement) 24 | 25 | setDefault(outputCol, uid + "_sample_weight") 26 | 27 | protected def validateAndTransformSchema(schema: StructType): StructType = { 28 | val labelColName = $(labelCol) 29 | val labelColDataType = schema(labelColName).dataType 30 | labelColDataType match { 31 | case _: DoubleType => 32 | case other => 33 | throw new IllegalArgumentException(s"Label column data type $other is not supported.") 34 | } 35 | SchemaUtils.appendColumn(schema, StructField(getOutputCol, DoubleType, nullable = false)) 36 | } 37 | 38 | } 39 | 40 | /** 41 | * Downsample input dataset in order to reduce class ratio 42 | * between positive (primary) and negative (secondary) classes 43 | */ 44 | class Downsampling(override val uid: String) extends Estimator[DownsamplingModel] with DownsamplingParams { 45 | 46 | def this() = this(Identifiable.randomUID("downsampling")) 47 | 48 | val maxClassRatio: Param[Double] = new Param[Double](this, "maxClassRatio", 49 | "Max class ratio", 50 | (v: Double) => ParamValidators.gt(0.0)(v) && ParamValidators.ltEq(1000.0)(v)) 51 | 52 | def getMaxClassRatio: Double = $(maxClassRatio) 53 | 54 | def setLabelCol(value: String): this.type = set(labelCol, value) 55 | 56 | def setOutputCol(value: String): this.type = set(outputCol, value) 57 | 58 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 59 | setDefault(primaryClass -> 1.0) 60 | 61 | def setMaxClassRatio(value: Double): this.type = set(maxClassRatio, value) 62 | setDefault(maxClassRatio -> 30.0) 63 | 64 | def setSampleWithReplacement(value: Boolean): this.type = set(sampleWithReplacement, value) 65 | setDefault(sampleWithReplacement -> false) 66 | 67 | override def fit(dataset: DataFrame): DownsamplingModel = { 68 | log.info(s"Compute downsampling model with primary class: $getPrimaryClass") 69 | 70 | val primaryCnt = dataset.filter(col(getLabelCol) === getPrimaryClass).count() 71 | val secondaryCnt = dataset.filter(col(getLabelCol) !== getPrimaryClass).count() 72 | 73 | require(primaryCnt > 0, 74 | s"Primary class $getPrimaryClass should be presented in 
dataset") 75 | 76 | val classRatio = secondaryCnt.toDouble / primaryCnt 77 | 78 | if (classRatio <= getMaxClassRatio) { 79 | log.debug(s"Class ratio: $classRatio is below max class ratio: $getMaxClassRatio. Skip downsampling.") 80 | copyValues(new DownsamplingModel(uid, None).setParent(this)) 81 | } else { 82 | val desiredSecondaryCnt = primaryCnt * getMaxClassRatio 83 | val sampleFraction = desiredSecondaryCnt / secondaryCnt 84 | log.debug(s"Class ratio: $classRatio is above max class ratio: $getMaxClassRatio. Sample fraction: $sampleFraction") 85 | copyValues(new DownsamplingModel(uid, Some(sampleFraction)).setParent(this)) 86 | } 87 | 88 | } 89 | 90 | override def transformSchema(schema: StructType): StructType = { 91 | validateAndTransformSchema(schema) 92 | } 93 | 94 | override def copy(extra: ParamMap): Downsampling = defaultCopy(extra) 95 | 96 | } 97 | 98 | class DownsamplingModel( 99 | override val uid: String, 100 | val sampleFraction: Option[Double] 101 | ) extends Model[DownsamplingModel] with DownsamplingParams { 102 | 103 | def this(sampleFraction: Option[Double]) = this(Identifiable.randomUID("downsampling"), sampleFraction) 104 | 105 | def setLabelCol(value: String): this.type = set(labelCol, value) 106 | 107 | def setOutputCol(value: String): this.type = set(outputCol, value) 108 | 109 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 110 | setDefault(primaryClass -> 1.0) 111 | 112 | def setSampleWithReplacement(value: Boolean): this.type = set(sampleWithReplacement, value) 113 | setDefault(sampleWithReplacement -> false) 114 | 115 | override def transform(dataset: DataFrame): DataFrame = sampleFraction match { 116 | case None => 117 | log.debug(s"Skip dataset downsampling") 118 | dataset.select(col("*"), lit(1.0) as getOutputCol) 119 | 120 | case Some(fraction) => 121 | log.debug(s"Downsample dataset with sample fraction: $fraction") 122 | 123 | val primary = dataset.filter(col(getLabelCol) === getPrimaryClass) 124 | .select(col("*"), lit(1.0) as getOutputCol) 125 | 126 | val secondary = dataset.filter(col(getLabelCol) !== getPrimaryClass) 127 | .sample(withReplacement = getSampleWithReplacement, fraction) 128 | .select(col("*"), lit(1.0 / fraction) as getOutputCol) 129 | 130 | primary.unionAll(secondary) 131 | } 132 | 133 | override def transformSchema(schema: StructType): StructType = { 134 | validateAndTransformSchema(schema) 135 | } 136 | 137 | override def copy(extra: ParamMap): DownsamplingModel = { 138 | val copied = new DownsamplingModel(uid, sampleFraction) 139 | copyValues(copied, extra).setParent(parent) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherEncoderModelSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.{DataFrame, Row} 7 | import org.scalatest.FlatSpec 8 | 9 | class GatherEncoderModelSpec extends FlatSpec with TestSparkContext { 10 | 11 | val schema = StructType(Seq( 12 | StructField("cookie_id", StringType), 13 | StructField("sites", ArrayType(StructType(Seq( 14 | StructField("site", StringType), 15 | StructField("site_id", IntegerType), 16 | StructField("impressions", LongType 17 | ))), containsNull = true)) 18 | )) 19 | 20 | val cookie1 = "cookie1" 21 | val 
cookie2 = "cookie2" 22 | val cookie3 = "cookie3" 23 | val cookie4 = "cookie4" 24 | val cookie5 = "cookie5" 25 | 26 | val (google, googleId) = "google.com" -> 1 27 | val (cnn, cnnId) = "cnn.com" -> 2 28 | val (bbc, bbcId) = "bbc.com" -> 3 29 | val (auto, autoId) = "auto.com" -> 4 30 | val (moto, motoId) = "moto.com" -> 5 31 | val (sport, sportId) = "sport.com" -> 6 32 | 33 | val dataset = sqlContext.createDataFrame(sc.parallelize(Seq( 34 | Row(cookie1, Array( 35 | Row(google, googleId, 12L), 36 | Row(cnn, cnnId, 14L) 37 | )), 38 | Row(cookie2, Array( 39 | Row(bbc, bbcId, 20L), 40 | Row(auto, autoId, 1L), 41 | Row(moto, motoId, 3L) 42 | )), 43 | Row(cookie3, Array( 44 | Row(sport, sportId, 100L) 45 | )), 46 | Row(cookie4, Array.empty[Row]), 47 | Row(cookie5, null) 48 | )), schema) 49 | 50 | def createEncoder(keys: Array[Any]) = 51 | new GatherEncoderModel(keys) 52 | .setInputCol("sites") 53 | .setOutputCol("features") 54 | .setKeyCol("site") 55 | .setValueCol("impressions") 56 | 57 | val sites: Array[Any] = Array(google, bbc, cnn) 58 | val siteIds: Array[Any] = Array(googleId, bbcId, cnnId) 59 | 60 | def toFeatures(encoder: GatherEncoderModel, dataset: DataFrame): Map[String, Vector] = { 61 | val encodedDf = encoder.transform(dataset).select("cookie_id", "features") 62 | encodedDf.collect().map { case Row(cookieId: String, features: Vector) => 63 | cookieId -> features 64 | }.toMap 65 | } 66 | 67 | "Gather Encoder Model" should "encode categories ignoring all other" in { 68 | val sitesEncoder = createEncoder(sites).setAllOther(false) 69 | val siteIdsEncoder = createEncoder(siteIds).setKeyCol("site_id").setAllOther(false) 70 | 71 | // Check that type of the keys doesn't matter 72 | val siteFeatures = toFeatures(sitesEncoder, dataset) 73 | val idFeatures = toFeatures(siteIdsEncoder, dataset) 74 | assert(siteFeatures == idFeatures) 75 | 76 | assert(siteFeatures(cookie1).size == 3) 77 | assert(siteFeatures(cookie1).toSparse.indices.toSeq == 0 :: 2 :: Nil) 78 | assert(siteFeatures(cookie1).toSparse.values.toSeq == 12 :: 14 :: Nil) 79 | 80 | assert(siteFeatures(cookie2).size == 3) 81 | assert(siteFeatures(cookie2).toSparse.indices.toSeq == 1 :: Nil) 82 | assert(siteFeatures(cookie2).toSparse.values.toSeq == 20 :: Nil) 83 | 84 | def assertEmptyFeatures(cookie: String): Unit = { 85 | assert(siteFeatures(cookie).size == 3) 86 | assert(siteFeatures(cookie).toSparse.indices.toSeq == Nil) 87 | assert(siteFeatures(cookie).toSparse.values.toSeq == Nil) 88 | } 89 | 90 | assertEmptyFeatures(cookie3) 91 | assertEmptyFeatures(cookie4) 92 | assertEmptyFeatures(cookie5) 93 | } 94 | 95 | it should "encode categories with all other" in { 96 | val sitesEncoder = createEncoder(sites).setAllOther(true) 97 | val features = toFeatures(sitesEncoder, dataset) 98 | 99 | assert(features(cookie1).size == 4) 100 | assert(features(cookie1).toSparse.indices.toSeq == 0 :: 2 :: Nil) 101 | assert(features(cookie1).toSparse.values.toSeq == 12 :: 14 :: Nil) 102 | 103 | assert(features(cookie2).size == 4) 104 | assert(features(cookie2).toSparse.indices.toSeq == 1 :: 3 :: Nil) 105 | assert(features(cookie2).toSparse.values.toSeq == 20 :: 4 :: Nil) 106 | 107 | assert(features(cookie3).size == 4) 108 | assert(features(cookie3).toSparse.indices.toSeq == 3 :: Nil) 109 | assert(features(cookie3).toSparse.values.toSeq == 100 :: Nil) 110 | 111 | def assertEmptyFeatures(cookie: String): Unit = { 112 | assert(features(cookie).size == 4) 113 | assert(features(cookie).toSparse.indices.toSeq == Nil) 114 | 
assert(features(cookie).toSparse.values.toSeq == Nil) 115 | } 116 | 117 | assertEmptyFeatures(cookie4) 118 | assertEmptyFeatures(cookie5) 119 | } 120 | 121 | it should "remove input col" in { 122 | val sitesEncoder = createEncoder(sites).setKeepInputCol(false) 123 | val encoded = sitesEncoder.transform(dataset) 124 | assert(encoded.schema.size == dataset.schema.size) 125 | assert(!encoded.schema.exists(_.name == "sites")) 126 | } 127 | 128 | it should "fail to encode with empty key set" in { 129 | val encoder = createEncoder(Array.empty) 130 | intercept[IllegalArgumentException] { 131 | encoder.transform(dataset) 132 | } 133 | } 134 | 135 | it should "output empty vectors for empty keys with all other disabled" in { 136 | val sitesEncoder = createEncoder(Array.empty) 137 | .setFailOnEmptyKeys(false) 138 | .setAllOther(false) 139 | val features = toFeatures(sitesEncoder, dataset) 140 | assert(features(cookie1).size == 0) 141 | } 142 | 143 | it should "put all values into all other column for empty keys" in { 144 | val sitesEncoder = createEncoder(Array.empty) 145 | .setFailOnEmptyKeys(false) 146 | .setAllOther(true) 147 | 148 | val features = toFeatures(sitesEncoder, dataset) 149 | 150 | assert(features(cookie1).toArray.toSeq == Seq(26.0)) 151 | assert(features(cookie2).toArray.toSeq == Seq(24.0)) 152 | assert(features(cookie3).toArray.toSeq == Seq(100.0)) 153 | 154 | def assertEmptyFeatures(cookie: String): Unit = { 155 | assert(features(cookie).size == 1) 156 | assert(features(cookie).toSparse.indices.toSeq == Nil) 157 | assert(features(cookie).toSparse.values.toSeq == Nil) 158 | } 159 | 160 | assertEmptyFeatures(cookie4) 161 | assertEmptyFeatures(cookie5) 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherEncoderSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest.FlatSpec 7 | 8 | class GatherEncoderSpec extends FlatSpec with TestSparkContext { 9 | 10 | val schema = StructType(Seq( 11 | StructField("cookie_id", StringType), 12 | StructField("sites", ArrayType(StructType(Seq( 13 | StructField("site", StringType), 14 | StructField("impressions", LongType 15 | ))), containsNull = false)) 16 | )) 17 | 18 | val cookie1 = "cookie1" 19 | val cookie2 = "cookie2" 20 | val cookie3 = "cookie3" 21 | val cookie4 = "cookie4" 22 | val cookie5 = "cookie5" 23 | 24 | val dataset = sqlContext.createDataFrame(sc.parallelize( 25 | Seq.fill(250)(Row(cookie1, Array( // 250 * 2 = 500 // total: 500 // cover: 50% 26 | Row("google.com", 12L), 27 | Row("cnn.com", 14L) 28 | ))) ++ 29 | Seq.fill(100)(Row(cookie2, Array( // 100 * 3 = 300 // total: 800 // cover: 80% 30 | Row("bbc.com", 20L), 31 | Row("auto.com", 1L), 32 | Row("moto.com", 3L) 33 | ))) ++ 34 | Seq.fill(80)(Row(cookie3, Array( // 80 // total: 880 // cover: 88% 35 | Row("sport.com", 100L) 36 | ))) ++ 37 | Seq.fill(50)(Row(cookie3, Array( // 50 // total: 930 // cover: 93% 38 | Row("netflix.com", 1L) 39 | ))) ++ 40 | Seq.fill(40)(Row(cookie3, Array( // 40 // total: 970 // cover: 97% 41 | Row("amazon.com", 1L) 42 | ))) ++ 43 | Seq.fill(30)(Row(cookie3, Array( // 30 // total: 1000 // cover: 100% 44 | Row("imdb.com", 1L) 45 | ))) ++ 46 | Seq.fill(150)(Row(cookie4, Array( // 0 : cookie_id doesn't have any site 
statistics 47 | ))) ++ 48 | Seq.fill(150)(Row(cookie5, null // 0 : check that null doesn't break anything 49 | )) 50 | ), schema) 51 | 52 | // Empty and Null dataset can arise from outer joins in bigger pipelines 53 | 54 | val emptyDataset = sqlContext.createDataFrame(sc.parallelize( 55 | Seq.fill(250)(Row(cookie1, Array.empty[Row])) ++ 56 | Seq.fill(100)(Row(cookie2, Array.empty[Row])) ++ 57 | Seq.fill(80)(Row(cookie3, Array.empty[Row])) 58 | ), schema) 59 | 60 | val nullDataset = sqlContext.createDataFrame(sc.parallelize( 61 | Seq.fill(250)(Row(cookie1, null)) ++ 62 | Seq.fill(100)(Row(cookie2, null)) ++ 63 | Seq.fill(80)(Row(cookie3, null)) 64 | ), schema) 65 | 66 | def topEncoder: GatherEncoder = new GatherEncoder() 67 | .setInputCol("sites") 68 | .setOutputCol("features") 69 | .setKeyCol("site") 70 | .setValueCol("impressions") 71 | .setTransformation("top") 72 | 73 | def indexEncoder: GatherEncoder = topEncoder 74 | .setTransformation("index") 75 | 76 | "Index Gather Encoder" should "collect all keys when support is 1%" in { 77 | val encoder = indexEncoder.setSupport(1.0) 78 | val features = encoder.fit(dataset) 79 | assert(features.modelKeys.length == 9) 80 | } 81 | 82 | it should "support key exclusion when support is 1%" in { 83 | val encoder = indexEncoder.setSupport(1.0).setExcludeKeys(Set("imdb.com")) 84 | val features = encoder.fit(dataset) 85 | assert(features.modelKeys.length == 8) 86 | assert(!features.modelKeys.contains("imdb.com")) 87 | } 88 | 89 | it should "exclude imdb.com for 3.1% support" in { 90 | val encoder = indexEncoder.setSupport(3.1) 91 | val features = encoder.fit(dataset) 92 | assert(features.modelKeys.length == 8) 93 | assert(!features.modelKeys.contains("imdb.com")) 94 | } 95 | 96 | it should "exclude imdb.com and amazon.com for 4.1% support" in { 97 | val encoder = indexEncoder.setSupport(4.1) 98 | val features = encoder.fit(dataset) 99 | assert(features.modelKeys.length == 7) 100 | assert(!features.modelKeys.contains("imdb.com")) 101 | assert(!features.modelKeys.contains("amazon.com")) 102 | } 103 | 104 | "Top Gather Encoder" should "collect all keys when cover is 100.0" in { 105 | val encoder = topEncoder.setCover(100.0) 106 | val features = encoder.fit(dataset) 107 | assert(features.modelKeys.length == 9) 108 | } 109 | 110 | it should "support key exclusion when cover is 100.0" in { 111 | val encoder = topEncoder.setCover(100.0).setExcludeKeys(Set("imdb.com")) 112 | val features = encoder.fit(dataset) 113 | assert(features.modelKeys.length == 8) 114 | assert(!features.modelKeys.contains("imdb.com")) 115 | } 116 | 117 | it should "exclude imdb.com for 95% coverage" in { 118 | val encoder = topEncoder.setCover(95.0) 119 | val features = encoder.fit(dataset) 120 | assert(features.modelKeys.length == 8) 121 | assert(!features.modelKeys.contains("imdb.com")) 122 | } 123 | 124 | it should "support key exclusion when cover is 95%" in { 125 | val encoder = topEncoder.setCover(95.0).setExcludeKeys(Set("amazon.com")) 126 | val features = encoder.fit(dataset) 127 | assert(features.modelKeys.length == 7) 128 | // Imdb excluded by coverage 129 | assert(!features.modelKeys.contains("imdb.com")) 130 | // Amazon excluded explicitly 131 | assert(!features.modelKeys.contains("amazon.com")) 132 | } 133 | 134 | it should "exclude amazon.com for 90% coverage" in { 135 | val encoder = topEncoder.setCover(90.0) 136 | val features = encoder.fit(dataset) 137 | assert(features.modelKeys.length == 7) 138 | assert(!features.modelKeys.contains("amazon.com")) 139 | } 140 | 141 | it 
should "exclude netflix.com for 85% coverage" in { 142 | val encoder = topEncoder.setCover(85.0) 143 | val features = encoder.fit(dataset) 144 | assert(features.modelKeys.length == 6) 145 | assert(!features.modelKeys.contains("netflix.com")) 146 | } 147 | 148 | it should "exclude sport.com for 75% coverage" in { 149 | val encoder = topEncoder.setCover(75.0) 150 | val features = encoder.fit(dataset) 151 | assert(features.modelKeys.length == 5) 152 | assert(!features.modelKeys.contains("sport.com")) 153 | } 154 | 155 | it should "get empty key set for empty dataset" in { 156 | val encoder = topEncoder 157 | val features = encoder.fit(emptyDataset) 158 | assert(features.modelKeys.isEmpty) 159 | } 160 | 161 | it should "get empty key set for null dataset" in { 162 | val encoder = topEncoder 163 | val features = encoder.fit(nullDataset) 164 | assert(features.modelKeys.isEmpty) 165 | } 166 | 167 | 168 | } 169 | -------------------------------------------------------------------------------- /sparkext-sql/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.catalyst.expressions 2 | 3 | import org.apache.spark.sql.catalyst.InternalRow 4 | import org.apache.spark.sql.types.{GenericArrayData, ArrayType, DataType} 5 | import scala.collection.mutable 6 | 7 | case class CollectArray(expression: Expression) extends PartialAggregate1 { 8 | def this() = this(null) 9 | 10 | override def children: Seq[Expression] = expression :: Nil 11 | 12 | override def nullable: Boolean = false 13 | override def dataType: DataType = ArrayType(expression.dataType, containsNull = false) 14 | override def toString: String = s"COLLECT_ARRAY($expression)" 15 | override def newInstance(): CollectArrayFunction = new CollectArrayFunction(expression, this) 16 | 17 | override def asPartial: SplitEvaluation = { 18 | val partialSet = Alias(CollectPartialArray(expression), "partialArrays")() 19 | SplitEvaluation( 20 | CombinePartialArrays(partialSet.toAttribute), 21 | partialSet :: Nil) 22 | } 23 | } 24 | 25 | case class CollectArrayFunction( 26 | @transient expr: Expression, 27 | @transient base: AggregateExpression1) 28 | extends AggregateFunction1 { 29 | 30 | def this() = this(null, null) // Required for serialization. 
31 | 32 | // Reducing GC pressure with this trick 33 | 34 | var firstValue: Any = _ 35 | var builder: mutable.ListBuffer[Any] = _ 36 | 37 | override def update(input: InternalRow): Unit = { 38 | val evaluatedExpr = expr.eval(input) 39 | if (evaluatedExpr != null) { 40 | if (firstValue == null && builder == null) { 41 | // Got first value 42 | firstValue = evaluatedExpr 43 | } else if (firstValue != null && builder == null) { 44 | // Got second value 45 | builder = mutable.ListBuffer.empty[Any] 46 | builder += firstValue 47 | builder += evaluatedExpr 48 | firstValue = null 49 | } else if (firstValue == null && builder != null) { 50 | // Got 2+ values 51 | builder += evaluatedExpr 52 | } else { 53 | throw new IllegalStateException(s"Both state variables are defined") 54 | } 55 | } 56 | } 57 | 58 | override def eval(input: InternalRow): Any = { 59 | if (firstValue == null && builder == null) { 60 | new GenericArrayData(Array.empty) 61 | } else if (firstValue != null && builder == null) { 62 | new GenericArrayData(Array(firstValue)) 63 | } else if (firstValue == null && builder != null) { 64 | new GenericArrayData(builder.toArray) 65 | } else { 66 | throw new IllegalStateException("Both state variables are defined") 67 | } 68 | } 69 | } 70 | 71 | case class CollectPartialArray(expression: Expression) extends AggregateExpression1 { 72 | def this() = this(null) 73 | 74 | override def children: Seq[Expression] = expression :: Nil 75 | override def nullable: Boolean = false 76 | override def dataType: DataType = ArrayType(expression.dataType, containsNull = false) 77 | override def toString: String = s"AddToPartialArray($expression)" 78 | override def newInstance(): CollectPartialArrayFunction = 79 | new CollectPartialArrayFunction(expression, this) 80 | } 81 | 82 | case class CollectPartialArrayFunction( 83 | @transient expr: Expression, 84 | @transient base: AggregateExpression1) 85 | extends AggregateFunction1 { 86 | 87 | def this() = this(null, null) // Required for serialization. 
88 | 89 | // Reducing GC pressure with this trick 90 | 91 | var firstValue: Any = _ 92 | var builder: mutable.ListBuffer[Any] = _ 93 | 94 | override def update(input: InternalRow): Unit = { 95 | val evaluatedExpr = expr.eval(input) 96 | if (evaluatedExpr != null) { 97 | if (firstValue == null && builder == null) { 98 | // Got first value 99 | firstValue = evaluatedExpr 100 | } else if (firstValue != null && builder == null) { 101 | // Got second value 102 | builder = mutable.ListBuffer.empty[Any] 103 | builder += firstValue 104 | builder += evaluatedExpr 105 | firstValue = null 106 | } else if (firstValue == null && builder != null) { 107 | // Got 2+ values 108 | builder += evaluatedExpr 109 | } else { 110 | throw new IllegalStateException(s"Both state variables are defined") 111 | } 112 | } 113 | } 114 | 115 | override def eval(input: InternalRow): Any = { 116 | if (firstValue == null && builder == null) { 117 | new GenericArrayData(Array.empty) 118 | } else if (firstValue != null && builder == null) { 119 | new GenericArrayData(Array(firstValue)) 120 | } else if (firstValue == null && builder != null) { 121 | new GenericArrayData(builder.toArray) 122 | } else { 123 | throw new IllegalStateException("Both state variables are defined") 124 | } 125 | } 126 | } 127 | 128 | case class CombinePartialArrays(inputSet: Expression) extends AggregateExpression1 { 129 | def this() = this(null) 130 | 131 | override def children: Seq[Expression] = inputSet :: Nil 132 | override def nullable: Boolean = false 133 | override def dataType: DataType = inputSet.dataType 134 | override def toString: String = s"CombinePartialArrays($inputSet)" 135 | override def newInstance(): CombinePartialArraysFunction = { 136 | new CombinePartialArraysFunction(inputSet, this) 137 | } 138 | } 139 | 140 | case class CombinePartialArraysFunction( 141 | @transient inputSet: Expression, 142 | @transient base: AggregateExpression1) 143 | extends AggregateFunction1 { 144 | 145 | def this() = this(null, null) // Required for serialization. 
146 | 147 | // Reducing GC pressure with this trick 148 | 149 | var firstArray: GenericArrayData = _ 150 | var builder: mutable.ListBuffer[Any] = _ 151 | 152 | override def update(input: InternalRow): Unit = { 153 | val inputSetEval = inputSet.eval(input).asInstanceOf[GenericArrayData] 154 | 155 | if (firstArray == null && builder == null) { 156 | // Got first array 157 | firstArray = inputSetEval 158 | } else if (firstArray != null && builder == null) { 159 | // Got second value 160 | builder = mutable.ListBuffer.empty[Any] 161 | val inputIterator = firstArray.array.iterator ++ inputSetEval.array.iterator 162 | while (inputIterator.hasNext) { 163 | builder += inputIterator.next 164 | } 165 | firstArray = null 166 | } else if (firstArray == null && builder != null) { 167 | // Got 2+ values 168 | val inputIterator = inputSetEval.array.iterator 169 | while (inputIterator.hasNext) { 170 | builder += inputIterator.next 171 | } 172 | } else { 173 | throw new IllegalStateException(s"Both state variables are defined") 174 | } 175 | } 176 | 177 | override def eval(input: InternalRow): Any = { 178 | if (firstArray == null && builder == null) { 179 | new GenericArrayData(Array.empty) 180 | } else if (firstArray != null && builder == null) { 181 | firstArray 182 | } else if (firstArray == null && builder != null) { 183 | new GenericArrayData(builder.toArray) 184 | } else { 185 | throw new IllegalStateException("Both state variables are defined") 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/SparkMlExtExample.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import org.apache.log4j.Logger 4 | import org.apache.log4j.varia.NullAppender 5 | import org.apache.spark.ml.Pipeline 6 | import org.apache.spark.ml.attribute.AttributeGroup 7 | import org.apache.spark.ml.classification.LogisticRegression 8 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 9 | import org.apache.spark.ml.feature.{VectorAssembler, GatherEncoder, S2CellTransformer, Gather} 10 | import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} 11 | import org.apache.spark.mllib.evaluation.BinaryModelMetrics 12 | import org.apache.spark.mllib.linalg.DenseVector 13 | import org.apache.spark.sql.functions._ 14 | import org.apache.spark.sql.{Row, DataFrame} 15 | import org.apache.spark.sql.types._ 16 | 17 | object SparkMlExtExample extends App with Sites with Geo with Response { 18 | 19 | import sqlContext.implicits._ 20 | 21 | turnOffLogging() 22 | 23 | println(s"Run Spark ML Ext Example application") 24 | 25 | println(s"Sites data frame size = ${sitesDf.count()}") 26 | println(s"Geo data frame size = ${geoDf.count()}") 27 | println(s"Response data frame size = ${responseDf.count()} ") 28 | 29 | // Gather site visitation log 30 | val gatherSites = new Gather() 31 | .setPrimaryKeyCols(Sites.cookie) 32 | .setKeyCol(Sites.site) 33 | .setValueCol(Sites.impressions) 34 | .setOutputCol("sites") 35 | 36 | // Transform lat/lon into S2 Cell Id 37 | val s2Transformer = new S2CellTransformer() 38 | .setLevel(5) 39 | .setCellCol("s2_cell") 40 | 41 | // Gather S2 CellId log 42 | val gatherS2Cells = new Gather() 43 | .setPrimaryKeyCols(Geo.cookie) 44 | .setKeyCol("s2_cell") 45 | .setValueCol(Geo.impressions) 46 | .setOutputCol("s2_cells") 47 | 48 | // Gather raw data into wide rows 49 | val gatheredSites = 
gatherSites.transform(sitesDf) 50 | val gatheredCells = gatherS2Cells.transform(s2Transformer.transform(geoDf)) 51 | 52 | // Assemble input dataset 53 | val dataset = responseDf.as("response") 54 | .join(gatheredSites, responseDf(Response.cookie) === gatheredSites(Sites.cookie)) 55 | .join(gatheredCells, responseDf(Response.cookie) === gatheredCells(Sites.cookie)) 56 | .select( 57 | $"response.*", 58 | $"sites", 59 | $"s2_cells" 60 | ).cache() 61 | 62 | println(s"Input dataset size = ${dataset.count()}") 63 | 64 | dataset.show(10) 65 | 66 | // Split dataset into test/train sets 67 | val trainPct = 0.1 68 | val Array(trainSet, testSet) = dataset.randomSplit(Array(1 - trainPct, trainPct)) 69 | 70 | // Setup ML Pipeline stages 71 | 72 | // Encode site data 73 | val encodeSites = new GatherEncoder() 74 | .setInputCol("sites") 75 | .setOutputCol("sites_f") 76 | .setKeyCol(Sites.site) 77 | .setValueCol(Sites.impressions) 78 | 79 | // Encode S2 Cell data 80 | val encodeS2Cells = new GatherEncoder() 81 | .setInputCol("s2_cells") 82 | .setOutputCol("s2_cells_f") 83 | .setKeyCol("s2_cell") 84 | .setValueCol(Geo.impressions) 85 | .setCover(0.95) 86 | 87 | // Assemble feature vectors together 88 | val assemble = new VectorAssembler() 89 | .setInputCols(Array("sites_f", "s2_cells_f")) 90 | .setOutputCol("features") 91 | 92 | // Extract features label information 93 | val dummyPipeline = new Pipeline() 94 | .setStages(Array(encodeSites, encodeS2Cells, assemble)) 95 | val out = dummyPipeline.fit(dataset).transform(dataset) 96 | val attrGroup = AttributeGroup.fromStructField(out.schema("features")) 97 | 98 | val attributes = attrGroup.attributes.get 99 | println(s"Num features = ${attributes.length}") 100 | attributes.zipWithIndex.foreach { case (attr, idx) => 101 | println(s" - $idx = $attr") 102 | } 103 | 104 | // Build logistic regression using featurized statistics 105 | val lr = new LogisticRegression() 106 | .setFeaturesCol("features") 107 | .setLabelCol(Response.response) 108 | .setProbabilityCol("probability") 109 | 110 | // Define pipeline with 4 stages 111 | val pipeline = new Pipeline() 112 | .setStages(Array(encodeSites, encodeS2Cells, assemble, lr)) 113 | 114 | val evaluator = new BinaryClassificationEvaluator() 115 | .setLabelCol(Response.response) 116 | 117 | val crossValidator = new CrossValidator() 118 | .setEstimator(pipeline) 119 | .setEvaluator(evaluator) 120 | 121 | val paramGrid = new ParamGridBuilder() 122 | .addGrid(lr.elasticNetParam, Array(0.1, 0.5)) 123 | .build() 124 | 125 | crossValidator.setEstimatorParamMaps(paramGrid) 126 | crossValidator.setNumFolds(2) 127 | 128 | println(s"Train model on train set") 129 | val cvModel = crossValidator.fit(trainSet) 130 | 131 | println(s"Score test set") 132 | val testScores = cvModel.transform(testSet) 133 | 134 | val scoreAndLabels = testScores 135 | .select(col("probability"), col(Response.response)) 136 | .map { case Row(probability: DenseVector, label: Double) => 137 | val predictedActionProbability = probability(1) 138 | (predictedActionProbability, label) 139 | } 140 | 141 | println("Evaluate model") 142 | val metrics = new BinaryModelMetrics(scoreAndLabels) 143 | val auc = metrics.areaUnderROC() 144 | 145 | println(s"Model AUC: $auc") 146 | 147 | private def turnOffLogging(): Unit = { 148 | Logger.getRootLogger.removeAllAppenders() 149 | Logger.getRootLogger.addAppender(new NullAppender()) 150 | } 151 | } 152 | 153 | trait Sites extends InMemorySparkContext { 154 | 155 | object Sites { 156 | val cookie = "cookie" 157 | val site = 
"site" 158 | val impressions = "impressions" 159 | 160 | val schema = StructType(Array( 161 | StructField(cookie, StringType), 162 | StructField(site, StringType), 163 | StructField(impressions, IntegerType) 164 | )) 165 | } 166 | 167 | lazy val sitesDf: DataFrame = { 168 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/sites.csv")).getLines() 169 | val rows = lines.map(_.split(",")).drop(1) collect { 170 | case Array(cookie, site, impressions) => Row(cookie, site, impressions.toInt) 171 | } 172 | val rdd = sc.parallelize(rows.toSeq) 173 | sqlContext.createDataFrame(rdd, Sites.schema) 174 | } 175 | 176 | } 177 | 178 | trait Geo extends InMemorySparkContext { 179 | 180 | object Geo { 181 | val cookie = "cookie" 182 | val lat = "lat" 183 | val lon = "lon" 184 | val impressions = "impressions" 185 | 186 | val schema = StructType(Array( 187 | StructField(cookie, StringType), 188 | StructField(lat, DoubleType), 189 | StructField(lon, DoubleType), 190 | StructField(impressions, IntegerType) 191 | )) 192 | } 193 | 194 | lazy val geoDf: DataFrame = { 195 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/geo.csv")).getLines() 196 | val rows = lines.map(_.split(",")).drop(1) collect { 197 | case Array(cookie, lat, lon, impressions) => Row(cookie, lat.toDouble, lon.toDouble, impressions.toInt) 198 | } 199 | val rdd = sc.parallelize(rows.toSeq) 200 | sqlContext.createDataFrame(rdd, Geo.schema) 201 | } 202 | 203 | } 204 | 205 | trait Response extends InMemorySparkContext { 206 | 207 | object Response { 208 | val cookie = "cookie" 209 | val response = "response" 210 | 211 | val schema = StructType(Array( 212 | StructField(cookie, StringType), 213 | StructField(response, DoubleType) 214 | )) 215 | } 216 | 217 | lazy val responseDf: DataFrame = { 218 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/response.csv")).getLines() 219 | val rows = lines.map(_.split(",")).drop(1) collect { 220 | case Array(cookie, response) => Row(cookie, response.toDouble) 221 | } 222 | val rdd = sc.parallelize(rows.toSeq) 223 | sqlContext.createDataFrame(rdd, Response.schema) 224 | } 225 | 226 | } 227 | 228 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/TestingUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml 2 | 3 | import org.apache.spark.mllib.linalg.{Matrix, Vector} 4 | import org.scalatest.exceptions.TestFailedException 5 | 6 | 7 | object TestingUtils { 8 | 9 | val ABS_TOL_MSG = " using absolute tolerance" 10 | val REL_TOL_MSG = " using relative tolerance" 11 | 12 | /** 13 | * Private helper function for comparing two values using relative tolerance. 14 | * Note that if x or y is extremely close to zero, i.e., smaller than Double.MinPositiveValue, 15 | * the relative tolerance is meaningless, so the exception will be raised to warn users. 
16 | */ 17 | private def RelativeErrorComparison(x: Double, y: Double, eps: Double): Boolean = { 18 | val absX = math.abs(x) 19 | val absY = math.abs(y) 20 | val diff = math.abs(x - y) 21 | if (x == y) { 22 | true 23 | } else if (absX < Double.MinPositiveValue || absY < Double.MinPositiveValue) { 24 | throw new TestFailedException( 25 | s"$x or $y is extremely close to zero, so the relative tolerance is meaningless.", 0) 26 | } else { 27 | diff < eps * math.min(absX, absY) 28 | } 29 | } 30 | 31 | /** 32 | * Private helper function for comparing two values using absolute tolerance. 33 | */ 34 | private def AbsoluteErrorComparison(x: Double, y: Double, eps: Double): Boolean = { 35 | math.abs(x - y) < eps 36 | } 37 | 38 | case class CompareDoubleRightSide( 39 | fun: (Double, Double, Double) => Boolean, y: Double, eps: Double, method: String) 40 | 41 | /** 42 | * Implicit class for comparing two double values using relative tolerance or absolute tolerance. 43 | */ 44 | implicit class DoubleWithAlmostEquals(val x: Double) { 45 | 46 | /** 47 | * When the difference of two values are within eps, returns true; otherwise, returns false. 48 | */ 49 | def ~=(r: CompareDoubleRightSide): Boolean = r.fun(x, r.y, r.eps) 50 | 51 | /** 52 | * When the difference of two values are within eps, returns false; otherwise, returns true. 53 | */ 54 | def !~=(r: CompareDoubleRightSide): Boolean = !r.fun(x, r.y, r.eps) 55 | 56 | /** 57 | * Throws exception when the difference of two values are NOT within eps; 58 | * otherwise, returns true. 59 | */ 60 | def ~==(r: CompareDoubleRightSide): Boolean = { 61 | if (!r.fun(x, r.y, r.eps)) { 62 | throw new TestFailedException( 63 | s"Expected $x and ${r.y} to be within ${r.eps}${r.method}.", 0) 64 | } 65 | true 66 | } 67 | 68 | /** 69 | * Throws exception when the difference of two values are within eps; otherwise, returns true. 70 | */ 71 | def !~==(r: CompareDoubleRightSide): Boolean = { 72 | if (r.fun(x, r.y, r.eps)) { 73 | throw new TestFailedException( 74 | s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method}.", 0) 75 | } 76 | true 77 | } 78 | 79 | /** 80 | * Comparison using absolute tolerance. 81 | */ 82 | def absTol(eps: Double): CompareDoubleRightSide = 83 | CompareDoubleRightSide(AbsoluteErrorComparison, x, eps, ABS_TOL_MSG) 84 | 85 | /** 86 | * Comparison using relative tolerance. 87 | */ 88 | def relTol(eps: Double): CompareDoubleRightSide = 89 | CompareDoubleRightSide(RelativeErrorComparison, x, eps, REL_TOL_MSG) 90 | 91 | override def toString: String = x.toString 92 | } 93 | 94 | case class CompareVectorRightSide( 95 | fun: (Vector, Vector, Double) => Boolean, y: Vector, eps: Double, method: String) 96 | 97 | /** 98 | * Implicit class for comparing two vectors using relative tolerance or absolute tolerance. 99 | */ 100 | implicit class VectorWithAlmostEquals(val x: Vector) { 101 | 102 | /** 103 | * When the difference of two vectors are within eps, returns true; otherwise, returns false. 104 | */ 105 | def ~=(r: CompareVectorRightSide): Boolean = r.fun(x, r.y, r.eps) 106 | 107 | /** 108 | * When the difference of two vectors are within eps, returns false; otherwise, returns true. 109 | */ 110 | def !~=(r: CompareVectorRightSide): Boolean = !r.fun(x, r.y, r.eps) 111 | 112 | /** 113 | * Throws exception when the difference of two vectors are NOT within eps; 114 | * otherwise, returns true. 
115 | */ 116 | def ~==(r: CompareVectorRightSide): Boolean = { 117 | if (!r.fun(x, r.y, r.eps)) { 118 | throw new TestFailedException( 119 | s"Expected $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0) 120 | } 121 | true 122 | } 123 | 124 | /** 125 | * Throws exception when the difference of two vectors are within eps; otherwise, returns true. 126 | */ 127 | def !~==(r: CompareVectorRightSide): Boolean = { 128 | if (r.fun(x, r.y, r.eps)) { 129 | throw new TestFailedException( 130 | s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0) 131 | } 132 | true 133 | } 134 | 135 | /** 136 | * Comparison using absolute tolerance. 137 | */ 138 | def absTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide( 139 | (x: Vector, y: Vector, eps: Double) => { 140 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps) 141 | }, x, eps, ABS_TOL_MSG) 142 | 143 | /** 144 | * Comparison using relative tolerance. Note that comparing against sparse vector 145 | * with elements having value of zero will raise exception because it involves with 146 | * comparing against zero. 147 | */ 148 | def relTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide( 149 | (x: Vector, y: Vector, eps: Double) => { 150 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps) 151 | }, x, eps, REL_TOL_MSG) 152 | 153 | override def toString: String = x.toString 154 | } 155 | 156 | case class CompareMatrixRightSide( 157 | fun: (Matrix, Matrix, Double) => Boolean, y: Matrix, eps: Double, method: String) 158 | 159 | /** 160 | * Implicit class for comparing two matrices using relative tolerance or absolute tolerance. 161 | */ 162 | implicit class MatrixWithAlmostEquals(val x: Matrix) { 163 | 164 | /** 165 | * When the difference of two matrices are within eps, returns true; otherwise, returns false. 166 | */ 167 | def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps) 168 | 169 | /** 170 | * When the difference of two matrices are within eps, returns false; otherwise, returns true. 171 | */ 172 | def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps) 173 | 174 | /** 175 | * Throws exception when the difference of two matrices are NOT within eps; 176 | * otherwise, returns true. 177 | */ 178 | def ~==(r: CompareMatrixRightSide): Boolean = { 179 | if (!r.fun(x, r.y, r.eps)) { 180 | throw new TestFailedException( 181 | s"Expected \n$x\n and \n${r.y}\n to be within ${r.eps}${r.method} for all elements.", 0) 182 | } 183 | true 184 | } 185 | 186 | /** 187 | * Throws exception when the difference of two matrices are within eps; otherwise, returns true. 188 | */ 189 | def !~==(r: CompareMatrixRightSide): Boolean = { 190 | if (r.fun(x, r.y, r.eps)) { 191 | throw new TestFailedException( 192 | s"Did not expect \n$x\n and \n${r.y}\n to be within " + 193 | "${r.eps}${r.method} for all elements.", 0) 194 | } 195 | true 196 | } 197 | 198 | /** 199 | * Comparison using absolute tolerance. 200 | */ 201 | def absTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide( 202 | (x: Matrix, y: Matrix, eps: Double) => { 203 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps) 204 | }, x, eps, ABS_TOL_MSG) 205 | 206 | /** 207 | * Comparison using relative tolerance. Note that comparing against sparse vector 208 | * with elements having value of zero will raise exception because it involves with 209 | * comparing against zero. 
210 | */ 211 | def relTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide( 212 | (x: Matrix, y: Matrix, eps: Double) => { 213 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps) 214 | }, x, eps, REL_TOL_MSG) 215 | 216 | override def toString: String = x.toString 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/classification/LocalLogisticRegressionSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.sql.Row 7 | import org.scalatest._ 8 | 9 | import scala.util.Random 10 | import scala.util.control.Breaks._ 11 | import org.apache.spark.ml.TestingUtils._ 12 | 13 | /** 14 | * Copy Pasted from Spark LogisticRegressionSuite to verify that nothing is broken 15 | */ 16 | object LocalLogisticRegressionSpec { 17 | 18 | // Generate input of the form Y = logistic(offset + scale*X) 19 | def generateLogisticInput( 20 | offset: Double, 21 | scale: Double, 22 | nPoints: Int, 23 | seed: Int): Seq[LabeledPoint] = { 24 | val rnd = new Random(seed) 25 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 26 | 27 | val y = (0 until nPoints).map { i => 28 | val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i)))) 29 | if (rnd.nextDouble() < p) 1.0 else 0.0 30 | } 31 | 32 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i))))) 33 | testData 34 | } 35 | 36 | /** 37 | * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the 38 | * model weights and mean/variance of the features. The synthetic data will be drawn from 39 | * the probability distribution constructed by weights using the following formula. 40 | * 41 | * P(y = 0 | x) = 1 / norm 42 | * P(y = 1 | x) = exp(x * w_1) / norm 43 | * P(y = 2 | x) = exp(x * w_2) / norm 44 | * ... 45 | * P(y = k-1 | x) = exp(x * w_{k-1}) / norm 46 | * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) 47 | * 48 | * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector 49 | * will be (k - 1) * (n + 1) if `addIntercept == true`, and 50 | * if `addIntercept != true`, the dimension will be (k - 1) * n. 51 | * @param xMean the mean of the generated features. Lots of time, if the features are not properly 52 | * standardized, the algorithm with poor implementation will have difficulty 53 | * to converge. 54 | * @param xVariance the variance of the generated features. 55 | * @param addIntercept whether to add intercept. 56 | * @param nPoints the number of instance of generated data. 57 | * @param seed the seed for random generator. For consistent testing result, it will be fixed. 
58 | */ 59 | def generateMultinomialLogisticInput( 60 | weights: Array[Double], 61 | xMean: Array[Double], 62 | xVariance: Array[Double], 63 | addIntercept: Boolean, 64 | nPoints: Int, 65 | seed: Int): Seq[LabeledPoint] = { 66 | val rnd = new Random(seed) 67 | 68 | val xDim = xMean.length 69 | val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim 70 | val nClasses = weights.length / xWithInterceptsDim + 1 71 | 72 | val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian()))) 73 | 74 | x.foreach { vector => 75 | // This doesn't work if `vector` is a sparse vector. 76 | val vectorArray = vector.toArray 77 | var i = 0 78 | val len = vectorArray.length 79 | while (i < len) { 80 | vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) 81 | i += 1 82 | } 83 | } 84 | 85 | val y = (0 until nPoints).map { idx => 86 | val xArray = x(idx).toArray 87 | val margins = Array.ofDim[Double](nClasses) 88 | val probs = Array.ofDim[Double](nClasses) 89 | 90 | for (i <- 0 until nClasses - 1) { 91 | for (j <- 0 until xDim) margins(i + 1) += weights(i * xWithInterceptsDim + j) * xArray(j) 92 | if (addIntercept) margins(i + 1) += weights((i + 1) * xWithInterceptsDim - 1) 93 | } 94 | // Preventing the overflow when we compute the probability 95 | val maxMargin = margins.max 96 | if (maxMargin > 0) for (i <- 0 until nClasses) margins(i) -= maxMargin 97 | 98 | // Computing the probabilities for each class from the margins. 99 | val norm = { 100 | var temp = 0.0 101 | for (i <- 0 until nClasses) { 102 | probs(i) = math.exp(margins(i)) 103 | temp += probs(i) 104 | } 105 | temp 106 | } 107 | for (i <- 0 until nClasses) probs(i) /= norm 108 | 109 | // Compute the cumulative probability so we can generate a random number and assign a label. 
110 | for (i <- 1 until nClasses) probs(i) += probs(i - 1) 111 | val p = rnd.nextDouble() 112 | var y = 0 113 | breakable { 114 | for (i <- 0 until nClasses) { 115 | if (p < probs(i)) { 116 | y = i 117 | break 118 | } 119 | } 120 | } 121 | y 122 | } 123 | 124 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), x(i))) 125 | testData 126 | } 127 | 128 | } 129 | 130 | // Runs local Logistic Regression 131 | class LocalLogisticRegressionSpec extends AbstractLocalLogisticRegressionSpec("Local", 1) 132 | 133 | // Runs default Spark Logistic Regression 134 | class DefaultLogisticRegressionSpec extends AbstractLocalLogisticRegressionSpec("Default", 2) 135 | 136 | abstract class AbstractLocalLogisticRegressionSpec(name: String, partitions: Int) 137 | extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 138 | 139 | import LocalLogisticRegressionSpec._ 140 | 141 | private val eps: Double = 1e-5 142 | 143 | lazy val dataset = sqlContext 144 | .createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)).repartition(partitions) 145 | 146 | lazy val binaryDataset = { 147 | val nPoints = 10000 148 | val weights = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) 149 | val xMean = Array(5.843, 3.057, 3.758, 1.199) 150 | val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) 151 | 152 | sqlContext.createDataFrame( 153 | generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42)).repartition(partitions) 154 | } 155 | 156 | s"$name LogisticRegression" should "test logistic regression: Predictor, Classifier methods" in { 157 | val lr = new LocalLogisticRegression 158 | 159 | val model = lr.fit(dataset) 160 | assert(model.numClasses === 2) 161 | 162 | val results = model.transform(dataset) 163 | 164 | // Compare rawPrediction with probability 165 | results.select("rawPrediction", "probability").collect().foreach { 166 | case Row(raw: Vector, prob: Vector) => 167 | assert(raw.size === 2) 168 | assert(prob.size === 2) 169 | val probFromRaw1 = 1.0 / (1.0 + math.exp(-raw(1))) 170 | assert(prob(1) ~== probFromRaw1 relTol eps) 171 | assert(prob(0) ~== 1.0 - probFromRaw1 relTol eps) 172 | } 173 | 174 | // Compare prediction with probability 175 | results.select("prediction", "probability").collect().foreach { 176 | case Row(pred: Double, prob: Vector) => 177 | val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 178 | assert(pred == predFromProb) 179 | } 180 | } 181 | 182 | it should "test binary logistic regression with intercept with L1 regularization" in { 183 | val trainer1 = (new LocalLogisticRegression).setFitIntercept(true) 184 | .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) 185 | val trainer2 = (new LocalLogisticRegression).setFitIntercept(true) 186 | .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) 187 | 188 | val model1 = trainer1.fit(binaryDataset) 189 | val model2 = trainer2.fit(binaryDataset) 190 | 191 | /* 192 | Using the following R code to load the data and train the model using glmnet package. 193 | 194 | library("glmnet") 195 | data <- read.csv("path", header=FALSE) 196 | label = factor(data$V1) 197 | features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) 198 | weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12)) 199 | weights 200 | 201 | 5 x 1 sparse Matrix of class "dgCMatrix" 202 | s0 203 | (Intercept) -0.05627428 204 | data.V2 . 205 | data.V3 . 
206 | data.V4 -0.04325749 207 | data.V5 -0.02481551 208 | */ 209 | val interceptR1 = -0.05627428 210 | val weightsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551) 211 | 212 | assert(model1.intercept ~== interceptR1 relTol 1E-2) 213 | assert(model1.weights ~= weightsR1 absTol 2E-2) 214 | 215 | /* 216 | Using the following R code to load the data and train the model using glmnet package. 217 | 218 | library("glmnet") 219 | data <- read.csv("path", header=FALSE) 220 | label = factor(data$V1) 221 | features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) 222 | weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, 223 | standardize=FALSE)) 224 | weights 225 | 226 | 5 x 1 sparse Matrix of class "dgCMatrix" 227 | s0 228 | (Intercept) 0.3722152 229 | data.V2 . 230 | data.V3 . 231 | data.V4 -0.1665453 232 | data.V5 . 233 | */ 234 | val interceptR2 = 0.3722152 235 | val weightsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) 236 | 237 | assert(model2.intercept ~== interceptR2 relTol 1E-2) 238 | assert(model2.weights ~= weightsR2 absTol 1E-3) 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/Binning.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.optimize.{ApproximateGradientFunction, DiffFunction, LBFGS} 5 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, BinaryAttribute} 6 | import org.apache.spark.ml.param._ 7 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 8 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 9 | import org.apache.spark.ml.{Estimator, Model} 10 | import org.apache.spark.mllib.linalg.Vectors 11 | import org.apache.spark.sql.DataFrame 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} 14 | 15 | 16 | private[feature] trait BinningBase extends Params with HasInputCol with HasOutputCol 17 | 18 | class OptimalBinning(override val uid: String) extends Estimator[Binning] with BinningBase with SplitOptimizer { 19 | 20 | def this() = this(Identifiable.randomUID("optimalBinning")) 21 | 22 | val numBins: Param[Int] = new Param[Int](this, "numBins", "Number of bins", 23 | ParamValidators.gt(2)) 24 | 25 | val sampleSize: Param[Int] = new Param[Int](this, "sampleSize", "Size of a sample used for split optimizer", 26 | ParamValidators.gt(1000)) 27 | 28 | def getNumBins: Int = $(numBins) 29 | 30 | def getSampleSize: Int = $(sampleSize) 31 | 32 | def setNumBins(value: Int): this.type = set(numBins, value) 33 | 34 | def setSampleSize(value: Int): this.type = set(sampleSize, value) 35 | 36 | def setInputCol(value: String): this.type = set(inputCol, value) 37 | 38 | def setOutputCol(value: String): this.type = set(outputCol, value) 39 | 40 | setDefault( 41 | numBins -> 5, 42 | sampleSize -> 10000 43 | ) 44 | 45 | override def fit(dataset: DataFrame): Binning = { 46 | transformSchema(dataset.schema, logging = true) 47 | 48 | val notNulls = dataset.filter(col($(inputCol)).isNotNull) 49 | val inputSize = notNulls.count() 50 | val fraction = if ($(sampleSize) >= inputSize) 1.0D else $(sampleSize).toDouble / inputSize 51 | val sample = notNulls.select(col($(inputCol)).cast(DoubleType)).sample(withReplacement = false, fraction) 52 | 53 | val x = sample.collect().map(_.getDouble(0)) 54 | 55 | 
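// Note: the sample is collected to the driver, so `sampleSize` bounds the number of points
// handed to the LBFGS-based split optimizer below.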
log.debug(s"Collected sample size of: ${x.length}") 56 | 57 | // Doesn't make any sense to do binning if no enough sample points available 58 | require(x.length > ${numBins} * 10, 59 | s"Number of sample points for binning is too small") 60 | 61 | // Find optimal split with -Inf, +Inf bounds 62 | val splits = Double.NegativeInfinity +: optimalSplit(x, $(numBins) - 1) :+ Double.PositiveInfinity 63 | val bins = splits.sliding(2).map(bin => s"[${bin.mkString(", ")})").toArray 64 | log.debug(s"Calculated optimal split. Bins: ${bins.mkString(", ")}") 65 | 66 | copyValues(new Binning(uid).setSplits(splits).setParent(this)) 67 | } 68 | 69 | override def transformSchema(schema: StructType): StructType = { 70 | val inputColName = $(inputCol) 71 | val inputDataType = schema(inputColName).dataType 72 | inputDataType match { 73 | case _: NumericType => 74 | case other => 75 | throw new IllegalArgumentException(s"Data type $other is not supported.") 76 | } 77 | // Names of bins are not available at this point 78 | val attrGroup = new AttributeGroup($(outputCol), $(numBins)) 79 | SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 80 | } 81 | 82 | override def copy(extra: ParamMap): Estimator[Binning] = defaultCopy(extra) 83 | } 84 | 85 | 86 | /** 87 | * Based on [[org.apache.spark.ml.feature.Bucketizer Bucketizer]], except that 88 | * instead of [[org.apache.spark.ml.attribute.NominalAttribute NominalAttribute]] it 89 | * outputs [[org.apache.spark.ml.attribute.AttributeGroup AttributeGroup]] column 90 | */ 91 | final class Binning(override val uid: String) 92 | extends Model[Binning] with BinningBase { 93 | 94 | def this() = this(Identifiable.randomUID("binning")) 95 | 96 | val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits", 97 | "Split points for mapping continuous features into bins. With n+1 splits, there are n " + 98 | "bins. A bin defined by splits x,y holds values in the range [x,y) except the last " + 99 | "bin, which also includes y. The splits should be strictly increasing. 
" + 100 | "Values at -inf, inf must be explicitly provided to cover all Double values; " + 101 | "otherwise, values outside the splits specified will be treated as errors.", 102 | Bucketizer.checkSplits) 103 | 104 | def getSplits: Array[Double] = $(splits) 105 | 106 | def setSplits(value: Array[Double]): this.type = set(splits, value) 107 | 108 | def setInputCol(value: String): this.type = set(inputCol, value) 109 | 110 | def setOutputCol(value: String): this.type = set(outputCol, value) 111 | 112 | override def transform(dataset: DataFrame): DataFrame = { 113 | val outputSchema = transformSchema(dataset.schema) 114 | val numBins = ${splits}.length - 1 115 | val t = udf { feature: Double => 116 | val binIdx = Bucketizer.binarySearchForBuckets($(splits), feature).toInt 117 | Vectors.sparse(numBins, Seq((binIdx, 1.0))) 118 | } 119 | val metadata = outputSchema($(outputCol)).metadata 120 | dataset.select(col("*"), t(col($(inputCol)).cast(DoubleType)).as($(outputCol), metadata)) 121 | } 122 | 123 | override def transformSchema(schema: StructType): StructType = { 124 | val inputColName = $(inputCol) 125 | val inputDataType = schema(inputColName).dataType 126 | inputDataType match { 127 | case _: NumericType => 128 | case other => 129 | throw new IllegalArgumentException(s"Data type $other is not supported.") 130 | } 131 | val bins = $(splits).sliding(2).map(bin => s"[${bin.mkString(", ")})").toArray 132 | val attrs: Array[Attribute] = bins.map(bin => new BinaryAttribute(Some(bin))) 133 | val attrGroup = new AttributeGroup($(outputCol), attrs) 134 | SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 135 | } 136 | 137 | override def copy(extra: ParamMap): Binning = { 138 | defaultCopy[Binning](extra).setParent(parent) 139 | } 140 | } 141 | 142 | /** 143 | * Compute optimal split to have the same number of points in each bucket/bin 144 | */ 145 | trait SplitOptimizer { 146 | 147 | protected def fromDiff(diff: Array[Double]): Array[Double] = { 148 | diff.scanLeft(0D)((acc, v) => acc + v).drop(1) 149 | } 150 | 151 | protected def toDiff(values: Array[Double]): Array[Double] = { 152 | 153 | if (values.isEmpty) { 154 | Array.empty 155 | } else if (values.length == 1) { 156 | values 157 | } else { 158 | val diff = values.sliding(2) map { 159 | case s if s.length == 2 => s(1) - s(0) 160 | case s => sys.error(s"Unexpected sliding window: $s") 161 | } 162 | (values.head +: diff.toSeq).toArray 163 | } 164 | } 165 | 166 | protected def quantiles(x: Array[Double])(percentiles: Array[Double]): Array[Double] = { 167 | val as = x.sorted 168 | percentiles.map({ p => 169 | val i = p * (as.length - 1) 170 | val lb = i.toInt 171 | val ub = math.ceil(i).toInt 172 | val w = i - lb 173 | val quantile = as(lb) * (1 - w) + as(ub) * w 174 | quantile 175 | })(collection.breakOut) 176 | } 177 | 178 | /** 179 | * Mean squared error from ideal split 180 | */ 181 | protected def error(counts: Array[Int]): Double = { 182 | val sum = counts.sum 183 | val bins = counts.length 184 | counts.map(_ - (sum / bins)).map(math.pow(_, 2)).sum / bins 185 | } 186 | 187 | protected class OptimalSplitTargetFunction( 188 | x: Array[Double], 189 | splits: Int 190 | ) extends DiffFunction[DenseVector[Double]] { 191 | 192 | // Calculate starting point based on quantile split 193 | val init: DenseVector[Double] = { 194 | val percentile = (1 to splits) map (_.toDouble / (splits + 1)) 195 | DenseVector.apply(toDiff(quantiles(x)(percentile.toArray))) 196 | } 197 | 198 | // Target minimization function 199 | private val targetFunction: 
DenseVector[Double] => Double = 200 | p => error(counts(p)) 201 | 202 | def counts(p: DenseVector[Double]): Array[Int] = { 203 | val splits = Double.NegativeInfinity +: fromDiff(p.toArray) :+ Double.PositiveInfinity 204 | 205 | val count = splits.sliding(2) map { 206 | case split if split.length == 2 => 207 | val low = split(0) 208 | val high = split(1) 209 | val filter = (v: Double) => v >= low && v < high 210 | x.count(filter) 211 | 212 | case split => sys.error(s"Unexpected split: $split") 213 | } 214 | 215 | count.toArray 216 | } 217 | 218 | private val gradient = new ApproximateGradientFunction(targetFunction) 219 | 220 | def calculate(p: DenseVector[Double]): (Double, DenseVector[Double]) = { 221 | (targetFunction(p), gradient.gradientAt(p)) 222 | } 223 | } 224 | 225 | /** 226 | * Compute optimal split values so that points are 227 | * uniformly distributed across the resulting bins 228 | * 229 | * @param x input data that needs to be split 230 | * @param splits number of splits 231 | * @param maxIter max iterations for LBFGS optimizer 232 | * @param m memory parameter for LBFGS optimizer 233 | * @return optimal interior split points (excluding the -Inf/+Inf bounds) 234 | */ 235 | def optimalSplit( 236 | x: Array[Double], 237 | splits: Int, 238 | maxIter: Int = 100, 239 | m: Int = 3 240 | ): Array[Double] = { 241 | 242 | // Binning requires at least 3 splits 243 | require(splits >= 3, s"Target splits should be greater than or equal to 3") 244 | 245 | val lbfgs = new LBFGS[DenseVector[Double]](maxIter, m) 246 | val f = new OptimalSplitTargetFunction(x, splits) 247 | 248 | fromDiff(lbfgs.minimize(f, f.init).toArray) 249 | } 250 | 251 | } 252 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryModelMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.mllib.evaluation 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.spark.annotation.Experimental 22 | import org.apache.spark.mllib.evaluation.binary._ 23 | import org.apache.spark.rdd.{RDD, UnionRDD} 24 | import org.apache.spark.sql.DataFrame 25 | 26 | /** 27 | * :: Experimental :: 28 | * Evaluator for binary classification. 29 | * 30 | * @param scoreAndLabels an RDD of (score, label) pairs. 31 | * @param numBins if greater than 0, then the curves (ROC curve, PR curve) computed internally 32 | * will be down-sampled to this many "bins". If 0, no down-sampling will occur. 
33 | * This is useful because the curve contains a point for each distinct score 34 | * in the input, and this could be as large as the input itself -- millions of 35 | * points or more, when thousands may be entirely sufficient to summarize 36 | * the curve. After down-sampling, the curves will instead be made of approximately 37 | * `numBins` points instead. Points are made from bins of equal numbers of 38 | * consecutive points. The size of each bin is 39 | * `floor(scoreAndLabels.count() / numBins)`, which means the resulting number 40 | * of bins may not exactly equal numBins. The last bin in each partition may 41 | * be smaller as a result, meaning there may be an extra sample at 42 | * partition boundaries. 43 | * @since 1.3.0 44 | */ 45 | @Experimental 46 | class BinaryModelMetrics( 47 | val scoreAndLabels: RDD[(Double, Double)], 48 | val numBins: Int) extends Logging { 49 | 50 | require(numBins >= 0, "numBins must be nonnegative") 51 | 52 | /** 53 | * Defaults `numBins` to 0. 54 | * @since 1.0.0 55 | */ 56 | def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) 57 | 58 | /** 59 | * An auxiliary constructor taking a DataFrame. 60 | * @param scoreAndLabels a DataFrame with two double columns: score and label 61 | */ 62 | private[mllib] def this(scoreAndLabels: DataFrame) = 63 | this(scoreAndLabels.map(r => (r.getDouble(0), r.getDouble(1)))) 64 | 65 | /** 66 | * Unpersist intermediate RDDs used in the computation. 67 | * @since 1.0.0 68 | */ 69 | def unpersist() { 70 | cumulativeCounts.unpersist() 71 | } 72 | 73 | /** 74 | * Returns thresholds in descending order. 75 | * @since 1.0.0 76 | */ 77 | def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) 78 | 79 | def gains(): RDD[(Double, Double)] = { 80 | val gainsChart = createCurve(Reach, Recall) 81 | val sc = confusions.context 82 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 83 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 84 | new UnionRDD[(Double, Double)](sc, Seq(first, gainsChart, last)) 85 | } 86 | 87 | def lift(): RDD[(Double, Double)] = createCurve(Reach, Lift) 88 | 89 | /** 90 | * Returns the receiver operating characteristic (ROC) curve, 91 | * which is an RDD of (false positive rate, true positive rate) 92 | * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. 93 | * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic 94 | * @since 1.0.0 95 | */ 96 | def roc(): RDD[(Double, Double)] = { 97 | val rocCurve = createCurve(FalsePositiveRate, Recall) 98 | val sc = confusions.context 99 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 100 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 101 | new UnionRDD[(Double, Double)](sc, Seq(first, rocCurve, last)) 102 | } 103 | 104 | /** 105 | * Computes the area under the receiver operating characteristic (ROC) curve. 106 | * @since 1.0.0 107 | */ 108 | def areaUnderROC(): Double = AreaUnderCurve.of(roc()) 109 | 110 | /** 111 | * Returns the precision-recall curve, which is an RDD of (recall, precision), 112 | * NOT (precision, recall), with (0.0, 1.0) prepended to it. 113 | * @see http://en.wikipedia.org/wiki/Precision_and_recall 114 | * @since 1.0.0 115 | */ 116 | def pr(): RDD[(Double, Double)] = { 117 | val prCurve = createCurve(Recall, Precision) 118 | val sc = confusions.context 119 | val first = sc.makeRDD(Seq((0.0, 1.0)), 1) 120 | first.union(prCurve) 121 | } 122 | 123 | /** 124 | * Computes the area under the precision-recall curve. 
125 | * @since 1.0.0 126 | */ 127 | def areaUnderPR(): Double = AreaUnderCurve.of(pr()) 128 | 129 | /** 130 | * Returns the (threshold, F-Measure) curve. 131 | * @param beta the beta factor in F-Measure computation. 132 | * @return an RDD of (threshold, F-Measure) pairs. 133 | * @see http://en.wikipedia.org/wiki/F1_score 134 | * @since 1.0.0 135 | */ 136 | def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) 137 | 138 | /** 139 | * Returns the (threshold, F-Measure) curve with beta = 1.0. 140 | * @since 1.0.0 141 | */ 142 | def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) 143 | 144 | /** 145 | * Returns the (threshold, precision) curve. 146 | * @since 1.0.0 147 | */ 148 | def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) 149 | 150 | /** 151 | * Returns the (threshold, recall) curve. 152 | * @since 1.0.0 153 | */ 154 | def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) 155 | 156 | private lazy val ( 157 | cumulativeCounts: RDD[(Double, BinaryLabelCounter)], 158 | confusions: RDD[(Double, BinaryConfusionMatrix)]) = { 159 | // Create a bin for each distinct score value, count positives and negatives within each bin, 160 | // and then sort by score values in descending order. 161 | val counts = scoreAndLabels.combineByKey( 162 | createCombiner = (label: Double) => new BinaryLabelCounter(0L, 0L) += label, 163 | mergeValue = (c: BinaryLabelCounter, label: Double) => c += label, 164 | mergeCombiners = (c1: BinaryLabelCounter, c2: BinaryLabelCounter) => c1 += c2 165 | ).sortByKey(ascending = false) 166 | 167 | val binnedCounts = 168 | // Only down-sample if bins is > 0 169 | if (numBins == 0) { 170 | // Use original directly 171 | counts 172 | } else { 173 | val countsSize = counts.count() 174 | // Group the iterator into chunks of about countsSize / numBins points, 175 | // so that the resulting number of bins is about numBins 176 | var grouping = countsSize / numBins 177 | if (grouping < 2) { 178 | // numBins was more than half of the size; no real point in down-sampling to bins 179 | logInfo(s"Curve is too small ($countsSize) for $numBins bins to be useful") 180 | counts 181 | } else { 182 | if (grouping >= Int.MaxValue) { 183 | logWarning( 184 | s"Curve too large ($countsSize) for $numBins bins; capping at ${Int.MaxValue}") 185 | grouping = Int.MaxValue 186 | } 187 | counts.mapPartitions(_.grouped(grouping.toInt).map { pairs => 188 | // The score of the combined point will be just the first one's score 189 | val firstScore = pairs.head._1 190 | // The point will contain all counts in this chunk 191 | val agg = new BinaryLabelCounter() 192 | pairs.foreach(pair => agg += pair._2) 193 | (firstScore, agg) 194 | }) 195 | } 196 | } 197 | 198 | val agg = binnedCounts.values.mapPartitions { iter => 199 | val agg = new BinaryLabelCounter() 200 | iter.foreach(agg += _) 201 | Iterator(agg) 202 | }.collect() 203 | val partitionwiseCumulativeCounts = 204 | agg.scanLeft(new BinaryLabelCounter())( 205 | (agg: BinaryLabelCounter, c: BinaryLabelCounter) => agg.clone() += c) 206 | val totalCount = partitionwiseCumulativeCounts.last 207 | logInfo(s"Total counts: $totalCount") 208 | val cumulativeCounts = binnedCounts.mapPartitionsWithIndex( 209 | (index: Int, iter: Iterator[(Double, BinaryLabelCounter)]) => { 210 | val cumCount = partitionwiseCumulativeCounts(index) 211 | iter.map { case (score, c) => 212 | cumCount += c 213 | (score, cumCount.clone()) 214 | } 215 | }, preservesPartitioning = true) 216 | 
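    // `partitionwiseCumulativeCounts(index)` holds the label counts of all partitions that
    // precede partition `index` (prefix sums computed on the driver via scanLeft), so each
    // partition can produce its running cumulative counts in a single local pass.
    // The result is persisted because `thresholds()`, `confusions`, and every derived curve
    // re-read this RDD.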
cumulativeCounts.persist() 217 | val confusions = cumulativeCounts.map { case (score, cumCount) => 218 | (score, BinaryConfusionMatrixImpl(cumCount, totalCount).asInstanceOf[BinaryConfusionMatrix]) 219 | } 220 | (cumulativeCounts, confusions) 221 | } 222 | 223 | /** Creates a curve of (threshold, metric). */ 224 | private def createCurve(y: BinaryClassificationMetricComputer): RDD[(Double, Double)] = { 225 | confusions.map { case (s, c) => 226 | (s, y(c)) 227 | } 228 | } 229 | 230 | /** Creates a curve of (metricX, metricY). */ 231 | private def createCurve( 232 | x: BinaryClassificationMetricComputer, 233 | y: BinaryClassificationMetricComputer): RDD[(Double, Double)] = { 234 | confusions.map { case (_, c) => 235 | (x(c), y(c)) 236 | } 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 23 | 24 | 25 | Scalastyle standard configuration 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | true 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | ^FunSuite[A-Za-z]*$ 98 | Tests must extend org.apache.spark.SparkFunSuite instead. 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | ^println$ 108 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 800> 163 | 164 | 165 | 166 | 167 | 30 168 | 169 | 170 | 171 | 172 | 10 173 | 174 | 175 | 176 | 177 | 50 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | -1,0,1,2,3 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/classification/LocalLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import breeze.linalg.{DenseVector => BDV} 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} 5 | import org.apache.spark.ml.param._ 6 | import org.apache.spark.ml.util.Identifiable 7 | import org.apache.spark.mllib.linalg._ 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer 10 | import org.apache.spark.sql.DataFrame 11 | import org.apache.spark.{Logging, SparkException} 12 | 13 | import scala.collection.mutable 14 | 15 | /** 16 | * Local version of [[LogisticRegression]] 17 | * When DataFrame is too small that it can easily fit into single node it doesn't make sense 18 | * to build model using RDD, it can be built on single node. 
Essentially using Spark 19 | * as distributed Executor 20 | */ 21 | class LocalLogisticRegression(override val uid: String) 22 | extends ProbabilisticClassifier[Vector, LocalLogisticRegression, LogisticRegressionModel] 23 | with LogisticRegressionParams with Logging { 24 | 25 | def this() = this(Identifiable.randomUID("locallogreg")) 26 | 27 | def setRegParam(value: Double): this.type = set(regParam, value) 28 | setDefault(regParam -> 0.0) 29 | 30 | def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) 31 | setDefault(elasticNetParam -> 0.0) 32 | 33 | def setMaxIter(value: Int): this.type = set(maxIter, value) 34 | setDefault(maxIter -> 100) 35 | 36 | def setTol(value: Double): this.type = set(tol, value) 37 | setDefault(tol -> 1E-6) 38 | 39 | def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) 40 | setDefault(fitIntercept -> true) 41 | 42 | def setStandardization(value: Boolean): this.type = set(standardization, value) 43 | setDefault(standardization -> true) 44 | 45 | override def setThreshold(value: Double): this.type = super.setThreshold(value) 46 | 47 | override def getThreshold: Double = super.getThreshold 48 | 49 | override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) 50 | 51 | override def getThresholds: Array[Double] = super.getThresholds 52 | 53 | private def trainLocal(instances: Array[(Double, Vector)]): (LogisticRegressionModel, Array[Double]) = { 54 | 55 | val (summarizer, labelSummarizer) = 56 | instances.foldLeft((new MultivariateOnlineSummarizer, new MultiClassSummarizer)) { 57 | case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer), 58 | (label: Double, features: Vector)) => 59 | (summarizer.add(features), labelSummarizer.add(label)) 60 | } 61 | 62 | val histogram = labelSummarizer.histogram 63 | val numInvalid = labelSummarizer.countInvalid 64 | val numClasses = histogram.length 65 | val numFeatures = summarizer.mean.size 66 | 67 | if (numInvalid != 0) { 68 | val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + 69 | s"Found $numInvalid invalid labels." 70 | logError(msg) 71 | throw new SparkException(msg) 72 | } 73 | 74 | if (numClasses > 2) { 75 | val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + 76 | s"binary classification. Found $numClasses in the input dataset." 
77 | logError(msg) 78 | throw new SparkException(msg) 79 | } 80 | 81 | val featuresMean = summarizer.mean.toArray 82 | val featuresStd = summarizer.variance.toArray.map(math.sqrt) 83 | 84 | val regParamL1 = $(elasticNetParam) * $(regParam) 85 | val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) 86 | 87 | val costFun = new LocalLogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization), 88 | featuresStd, featuresMean, regParamL2) 89 | 90 | val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { 91 | new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) 92 | } else { 93 | def regParamL1Fun = (index: Int) => { 94 | // Remove the L1 penalization on the intercept 95 | if (index == numFeatures) { 96 | 0.0 97 | } else { 98 | if ($(standardization)) { 99 | regParamL1 100 | } else { 101 | // If `standardization` is false, we still standardize the data 102 | // to improve the rate of convergence; as a result, we have to 103 | // perform this reverse standardization by penalizing each component 104 | // differently to get effectively the same objective function when 105 | // the training dataset is not standardized. 106 | if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 107 | } 108 | } 109 | } 110 | new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) 111 | } 112 | 113 | val initialWeightsWithIntercept = 114 | Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) 115 | 116 | if ($(fitIntercept)) { 117 | /* 118 | For binary logistic regression, when we initialize the weights as zeros, 119 | it will converge faster if we initialize the intercept such that 120 | it follows the distribution of the labels. 121 | 122 | {{{ 123 | P(0) = 1 / (1 + \exp(b)), and 124 | P(1) = \exp(b) / (1 + \exp(b)) 125 | }}}, hence 126 | {{{ 127 | b = \log{P(1) / P(0)} = \log{count_1 / count_0} 128 | }}} 129 | */ 130 | initialWeightsWithIntercept.toArray(numFeatures) 131 | = math.log(histogram(1).toDouble / histogram(0).toDouble) 132 | } 133 | 134 | val states = optimizer.iterations(new CachedDiffFunction(costFun), 135 | initialWeightsWithIntercept.toBreeze.toDenseVector) 136 | 137 | val (weights, intercept, objectiveHistory) = { 138 | /* 139 | Note that in Logistic Regression, the objective history (loss + regularization) 140 | is log-likelihood which is invariance under feature standardization. As a result, 141 | the objective history from optimizer is the same as the one in the original space. 142 | */ 143 | val arrayBuilder = mutable.ArrayBuilder.make[Double] 144 | var state: optimizer.State = null 145 | while (states.hasNext) { 146 | state = states.next() 147 | arrayBuilder += state.adjustedValue 148 | } 149 | 150 | if (state == null) { 151 | val msg = s"${optimizer.getClass.getName} failed." 152 | logError(msg) 153 | throw new SparkException(msg) 154 | } 155 | 156 | /* 157 | The weights are trained in the scaled space; we're converting them back to 158 | the original space. 159 | Note that the intercept in scaled space and original space is the same; 160 | as a result, no scaling is needed. 
161 | */ 162 | val rawWeights = state.x.toArray.clone() 163 | var i = 0 164 | while (i < numFeatures) { 165 | rawWeights(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } 166 | i += 1 167 | } 168 | 169 | if ($(fitIntercept)) { 170 | (Vectors.dense(rawWeights.dropRight(1)).compressed, rawWeights.last, arrayBuilder.result()) 171 | } else { 172 | (Vectors.dense(rawWeights).compressed, 0.0, arrayBuilder.result()) 173 | } 174 | } 175 | 176 | val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) 177 | 178 | (model, objectiveHistory) 179 | } 180 | 181 | override protected def train(dataset: DataFrame): LogisticRegressionModel = { 182 | 183 | if (dataset.rdd.partitions.length == 1) { 184 | log.info(s"Build LogisticRegression in local mode") 185 | 186 | val (model, objectiveHistory) = extractLabeledPoints(dataset).map { 187 | case LabeledPoint(label: Double, features: Vector) => (label, features) 188 | }.mapPartitions { instances => 189 | Seq(trainLocal(instances.toArray)).toIterator 190 | }.first() 191 | 192 | val logRegSummary = new BinaryLogisticRegressionTrainingSummary( 193 | model.transform(dataset), 194 | $(probabilityCol), 195 | $(labelCol), 196 | objectiveHistory) 197 | model.setSummary(logRegSummary) 198 | 199 | } else { 200 | log.info(s"Fallback to distributed LogisticRegression") 201 | 202 | val that = classOf[LogisticRegression].getConstructor(classOf[String]).newInstance(uid) 203 | val logisticRegression = copyValues(that) 204 | // Scala Reflection magic to call protected train method 205 | val ru = scala.reflect.runtime.universe 206 | import ru._ 207 | val m = ru.runtimeMirror(logisticRegression.getClass.getClassLoader) 208 | val im = m.reflect(logisticRegression) 209 | val trainMethod = typeOf[LogisticRegression].declaration(newTermName("train")).asMethod 210 | val mm = im.reflectMethod(trainMethod) 211 | mm.apply(dataset).asInstanceOf[LogisticRegressionModel] 212 | } 213 | } 214 | 215 | override def copy(extra: ParamMap): LocalLogisticRegression = defaultCopy(extra) 216 | } 217 | 218 | /** 219 | * Local version of [[LogisticCostFun]] 220 | */ 221 | private class LocalLogisticCostFun( 222 | data: Array[(Double, Vector)], 223 | numClasses: Int, 224 | fitIntercept: Boolean, 225 | standardization: Boolean, 226 | featuresStd: Array[Double], 227 | featuresMean: Array[Double], 228 | regParamL2: Double) extends DiffFunction[BDV[Double]] { 229 | 230 | override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = { 231 | val numFeatures = featuresStd.length 232 | val w = Vectors.fromBreeze(weights) 233 | 234 | val logisticAggregator = data.foldLeft(new LogisticAggregator(w, numClasses, fitIntercept, 235 | featuresStd, featuresMean)) { 236 | case (aggregator, (label, features)) => aggregator.add(label, features) 237 | } 238 | 239 | val totalGradientArray = logisticAggregator.gradient.toArray 240 | 241 | // regVal is the sum of weight squares excluding intercept for L2 regularization. 242 | val regVal = if (regParamL2 == 0.0) { 243 | 0.0 244 | } else { 245 | var sum = 0.0 246 | w.foreachActive { (index, value) => 247 | // If `fitIntercept` is true, the last term which is intercept doesn't 248 | // contribute to the regularization. 249 | if (index != numFeatures) { 250 | // The following code will compute the loss of the regularization; also 251 | // the gradient of the regularization, and add back to totalGradientArray. 
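            // Concretely: in the standardized case each component adds regParamL2 * w(i) to the
            // gradient and w(i)^2 to `sum`; in the non-standardized case w(i) is first rescaled
            // by 1 / featuresStd(i)^2. The common 0.5 * regParamL2 factor is applied to `sum`
            // once, below.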
252 | sum += { 253 | if (standardization) { 254 | totalGradientArray(index) += regParamL2 * value 255 | value * value 256 | } else { 257 | if (featuresStd(index) != 0.0) { 258 | // If `standardization` is false, we still standardize the data 259 | // to improve the rate of convergence; as a result, we have to 260 | // perform this reverse standardization by penalizing each component 261 | // differently to get effectively the same objective function when 262 | // the training dataset is not standardized. 263 | val temp = value / (featuresStd(index) * featuresStd(index)) 264 | totalGradientArray(index) += regParamL2 * temp 265 | value * temp 266 | } else { 267 | 0.0 268 | } 269 | } 270 | } 271 | } 272 | } 273 | 0.5 * regParamL2 * sum 274 | } 275 | 276 | (logisticAggregator.loss + regVal, new BDV(totalGradientArray)) 277 | } 278 | } 279 | 280 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/GatherEncoder.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} 4 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 5 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} 6 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 7 | import org.apache.spark.ml.{Estimator, Model} 8 | import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.functions._ 11 | import org.apache.spark.sql.types._ 12 | 13 | import scala.collection.mutable 14 | 15 | 16 | private[feature] trait GatherEncoderParams 17 | extends Params with HasInputCol with HasOutputCol with HasKeyCol with HasValueCol { 18 | 19 | val transformation: Param[String] = new Param[String](this, "transformation", 20 | "Transformation type: [top, index]", 21 | ParamValidators.inArray(Array("top", "index"))) 22 | 23 | val support: Param[Double] = new Param[Double](this, "support", 24 | "Minimum support", 25 | ParamValidators.inRange(0.0, 100.0)) 26 | 27 | val cover: Param[Double] = new Param[Double](this, "cover", 28 | "Top coverage", 29 | ParamValidators.inRange(0.0, 100.0)) 30 | 31 | val allOther: Param[Boolean] = new Param[Boolean](this, "allOther", 32 | "Add all other column") 33 | 34 | val keepInputCol: Param[Boolean] = new Param[Boolean](this, "keepInputCol", 35 | "Keep input column in transformed data frame") 36 | 37 | val failOnEmptyKeys: Param[Boolean] = new Param[Boolean](this, "failOnEmptyKeys", 38 | "Fail if gathered key set is empty") 39 | 40 | val excludeKeys: Param[Set[Any]] = new Param[Set[Any]](this, "excludeKeys", 41 | "Exclude given keys from encoded model") 42 | 43 | def getCover: Double = $(cover) 44 | 45 | def getAllOther: Boolean = $(allOther) 46 | 47 | def getKeepInputCol: Boolean = $(keepInputCol) 48 | 49 | def getFailOnEmptyKeys: Boolean = ${failOnEmptyKeys} 50 | 51 | def getExcludeKeys: Set[Any] = $(excludeKeys) 52 | 53 | protected def validateSchema(schema: StructType): Unit = { 54 | // Check that inputCol is array of StructType 55 | val inputColName = $(inputCol) 56 | val inputColDataType = schema(inputColName).dataType 57 | val inputColStructSchema = inputColDataType match { 58 | case ArrayType(structType: StructType, _) => 59 | structType 60 | case other => 61 | throw new IllegalArgumentException(s"Input column data type $other is not supported.") 
62 | } 63 | 64 | // Check that key type is supported 65 | val keyColName = $(keyCol) 66 | val keyColDataType = inputColStructSchema(keyColName).dataType 67 | keyColDataType match { 68 | case _: NumericType => 69 | case _: StringType => 70 | case other => 71 | throw new IllegalArgumentException(s"Key column data type $other is not supported.") 72 | } 73 | 74 | // Check that value type is numerical 75 | val valueColName = $(valueCol) 76 | val valueColDataType = inputColStructSchema(valueColName).dataType 77 | valueColDataType match { 78 | case _: NumericType => 79 | case other => 80 | throw new IllegalArgumentException(s"Value data type $other is not supported.") 81 | } 82 | } 83 | 84 | } 85 | 86 | /** 87 | * Encode categorical values collected by [[Gather]] transformation as feature vector using 88 | * dummy variables inside [[org.apache.spark.ml.attribute.AttributeGroup AttributeGroup]] 89 | * with attached metadata 90 | * 91 | * {{{ 92 | * cookie_id | sites 93 | * ----------|------------------------------------------------------------------------ 94 | * cookieAA | [{ site_id: 1, impressions: 15.0 }, { site_id: 2, impressions: 20.0 }] 95 | * cookieBB | [{ site_id: 2, impressions: 7.0 }, { site_id: 3, impressions: 5.0 }] 96 | * }}} 97 | * 98 | * transformed into 99 | * 100 | * {{{ 101 | * cookie_id | site_features 102 | * ----------|------------------------ 103 | * cookieAA | [ 15.0 , 20.0 , 0 ] 104 | * cookieBB | [ 0.0 , 7.0 , 5.0 ] 105 | * }}} 106 | * 107 | * Optionally apply dimensionality reduction using top transformation: 108 | * - Top coverage, is selecting categorical values by computing the count of distinct users for each value, 109 | * sorting the values in descending order by the count of users, and choosing the top values from the resulting 110 | * list such that the sum of the distinct user counts over these values covers c percent of all users, 111 | * for example, selecting top geographic locations covering 99% of users. 112 | * - Minimum Support, is selecting categorical values such that at least c percent of users have this value, 113 | * for example, web sites that account for at least c percent of traffic. 
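 *
 * A minimal usage sketch, assuming a `gathered` DataFrame shaped like the example above
 * (the `gathered` variable and the column names are illustrative, not part of the API):
 *
 * {{{
 *   val encoder = new GatherEncoder()
 *     .setInputCol("sites")
 *     .setKeyCol("site_id")
 *     .setValueCol("impressions")
 *     .setOutputCol("site_features")
 *     .setTransformation("top")
 *     .setCover(99.0)
 *
 *   val model: GatherEncoderModel = encoder.fit(gathered)
 *   val encoded: DataFrame = model.transform(gathered)
 * }}}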
114 | */ 115 | class GatherEncoder(override val uid: String) extends Estimator[GatherEncoderModel] with GatherEncoderParams { 116 | 117 | def this() = this(Identifiable.randomUID("gatheredEncoder")) 118 | 119 | def setInputCol(value: String): this.type = set(inputCol, value) 120 | 121 | def setOutputCol(value: String): this.type = set(outputCol, value) 122 | 123 | def setKeyCol(value: String): this.type = set(keyCol, value) 124 | 125 | def setValueCol(value: String): this.type = set(valueCol, value) 126 | 127 | def setTransformation(value: String): this.type = set(transformation, value) 128 | 129 | def setSupport(value: Double): this.type = set(support, value) 130 | 131 | def setCover(value: Double): this.type = set(cover, value) 132 | 133 | def setAllOther(value: Boolean): this.type = set(allOther, value) 134 | 135 | def setKeepInputCol(value: Boolean): this.type = set(keepInputCol, value) 136 | 137 | def setFailOnEmptyKeys(value: Boolean): this.type = set(failOnEmptyKeys, value) 138 | 139 | def setExcludeKeys(value: Set[Any]): this.type = set(excludeKeys, value) 140 | 141 | setDefault( 142 | transformation -> "top", 143 | support -> 0.1, 144 | cover -> 100.0, 145 | allOther -> false, 146 | keepInputCol -> true, 147 | failOnEmptyKeys -> true, 148 | excludeKeys -> Set.empty[Any] 149 | ) 150 | 151 | private def computeTopKeys(dataset: DataFrame): Array[Any] = { 152 | val inputColName = $(inputCol) 153 | val keyColName = $(keyCol) 154 | val coverVal = $(cover) 155 | 156 | log.info(s"Compute top transformation." + 157 | s"Key column: $keyColName " + 158 | s"Cover: $coverVal") 159 | 160 | if (coverVal == 100.0) { 161 | // With cover 100% it's required to collect all keys 162 | val keyCol = s"${uid}_key" 163 | dataset.select(explode(col(s"$inputColName.$keyColName")) as keyCol) 164 | .groupBy(keyCol).agg(col(keyCol)).collect().map(_.get(0)) 165 | .filter(k => !getExcludeKeys.contains(k)) 166 | } else { 167 | 168 | val key = s"${uid}_key" 169 | val grouped: DataFrame = dataset.select(explode(col(s"$inputColName.$keyColName")) as key).groupBy(key).count() 170 | val keys: Array[(Any, Long)] = grouped.collect().map { row => 171 | val key = row.get(0) 172 | val cnt = row.getLong(1) 173 | (key, cnt) 174 | } 175 | 176 | log.debug(s"Collected ${keys.length} top keys for key column: $keyColName") 177 | 178 | val topKeys = keys.sortBy(_._2)(implicitly[Ordering[Long]].reverse) filter { 179 | case (k, _) => !getExcludeKeys.contains(k) 180 | } 181 | 182 | // Get number of columns below cover threshold 183 | val threshold = ($(cover) / 100) * topKeys.map(_._2).sum 184 | val keysBelowThreshold = topKeys.map(_._2).scanLeft(0L)((cum, cnt) => cum + cnt).takeWhile(_ < threshold).length 185 | 186 | topKeys.take(keysBelowThreshold).map(_._1) 187 | } 188 | } 189 | 190 | private def computeIndexKeys(dataset: DataFrame): Array[Any] = { 191 | val inputColName = $(inputCol) 192 | val keyColName = $(keyCol) 193 | val supportVal = $(support) 194 | 195 | log.info(s"Compute index transformation." 
+ 196 | s"Key column: $keyColName " + 197 | s"Support: $supportVal") 198 | 199 | val key = s"${uid}_key" 200 | val grouped: DataFrame = dataset.select(explode(col(s"$inputColName.$keyColName")) as key).groupBy(key).count() 201 | 202 | // Get support threshold 203 | val totalCount = grouped.select(sum("count")).first().getLong(0) 204 | val threshold = (supportVal / 100) * totalCount 205 | 206 | val aboveThresholdKeys: Array[(Any, Long)] = 207 | grouped.filter(col("count") >= threshold).collect().map { row => 208 | val key = row.get(0) 209 | val cnt = row.getLong(1) 210 | (key, cnt) 211 | } 212 | 213 | log.debug(s"Collected '${aboveThresholdKeys.length}' support keys " + 214 | s"above threshold: $threshold for key column: $keyColName") 215 | 216 | val supportKeys = aboveThresholdKeys.sortBy(_._2)(implicitly[Ordering[Long]].reverse) filter { 217 | case (k, _) => !getExcludeKeys.contains(k) 218 | } 219 | 220 | supportKeys.map(_._1) 221 | } 222 | 223 | override def fit(dataset: DataFrame): GatherEncoderModel = { 224 | validateSchema(dataset.schema) 225 | 226 | val transformationVal = $(transformation) 227 | val inputColName = $(inputCol) 228 | val keyColName = $(keyCol) 229 | val valueColName = $(valueCol) 230 | 231 | log.info(s"Fit gather encoder for input column: $inputColName. " + 232 | s"Key column: $keyColName " + 233 | s"Value column: $valueColName " + 234 | s"Transformation: $transformationVal " + 235 | s"All other: ${$(allOther)}.") 236 | 237 | val gatherKeys: Array[Any] = transformationVal match { 238 | case "top" => computeTopKeys(dataset) 239 | case "index" => computeIndexKeys(dataset) 240 | case unknown => 241 | throw new IllegalArgumentException(s"Invalid gather transformation type: $unknown") 242 | } 243 | 244 | copyValues(new GatherEncoderModel(uid, gatherKeys).setParent(this)) 245 | } 246 | 247 | override def transformSchema(schema: StructType): StructType = { 248 | validateSchema(schema) 249 | // at this point labels and size of feature vectors is unknown 250 | val outputSchema = SchemaUtils.appendColumn(schema, StructField($(outputCol), new VectorUDT)) 251 | 252 | if (getKeepInputCol) { 253 | outputSchema 254 | } else { 255 | StructType(outputSchema.filter(_.name != getInputCol)) 256 | } 257 | } 258 | 259 | override def copy(extra: ParamMap): GatherEncoder = defaultCopy(extra) 260 | 261 | } 262 | 263 | /** 264 | * Model fitted by [[GatherEncoder]] 265 | * 266 | * @param modelKeys Ordered list of keys, corresponding column indices in feature vector 267 | */ 268 | class GatherEncoderModel( 269 | override val uid: String, 270 | val modelKeys: Array[Any] 271 | ) extends Model[GatherEncoderModel] with GatherEncoderParams { 272 | 273 | def this(keys: Array[Any]) = this(Identifiable.randomUID("gatheredEncoder"), keys) 274 | 275 | def setInputCol(value: String): this.type = set(inputCol, value) 276 | 277 | def setOutputCol(value: String): this.type = set(outputCol, value) 278 | 279 | def setKeyCol(value: String): this.type = set(keyCol, value) 280 | 281 | def setValueCol(value: String): this.type = set(valueCol, value) 282 | 283 | def setTransformation(value: String): this.type = set(transformation, value) 284 | 285 | def setSupport(value: Double): this.type = set(support, value) 286 | 287 | def setCover(value: Double): this.type = set(cover, value) 288 | 289 | def setAllOther(value: Boolean): this.type = set(allOther, value) 290 | 291 | def setKeepInputCol(value: Boolean): this.type = set(keepInputCol, value) 292 | 293 | def setFailOnEmptyKeys(value: Boolean): this.type = 
set(failOnEmptyKeys, value) 294 | 295 | setDefault( 296 | cover -> 100.0, 297 | allOther -> true, 298 | keepInputCol -> true, 299 | failOnEmptyKeys -> true 300 | ) 301 | 302 | private val labels: Array[String] = modelKeys.map(_.toString) 303 | 304 | private val keyIndex: Map[Any, Int] = modelKeys.zipWithIndex.toMap 305 | 306 | override def transform(dataset: DataFrame): DataFrame = { 307 | 308 | if (modelKeys.isEmpty && getFailOnEmptyKeys) { 309 | throw new IllegalArgumentException(s"Can't encode gathered data with empty model keys. " + 310 | s"Check that input column '$getInputCol' has data.") 311 | } 312 | 313 | if (modelKeys.isEmpty && !getFailOnEmptyKeys) { 314 | log.warn(s"Gathered data has empty key set. Check input column $getInputCol") 315 | } 316 | 317 | val outputSchema = transformSchema(dataset.schema) 318 | 319 | val inputColName = $(inputCol) 320 | val keyColName = $(keyCol) 321 | val valueColName = $(valueCol) 322 | 323 | val allOtherEnabled = $(allOther) 324 | val featureSize = if (allOtherEnabled) modelKeys.length + 1 else modelKeys.length 325 | 326 | val encoder = udf { (keys: mutable.WrappedArray[AnyRef], values: mutable.WrappedArray[Double]) => 327 | 328 | if (featureSize == 0) { 329 | // Special case for empty model keys 330 | Vectors.dense(Array.empty[Double]) 331 | 332 | } else if (keys == null && values == null) { 333 | Vectors.sparse(featureSize, Nil) 334 | 335 | } else if (keys != null && values != null) { 336 | 337 | require(keys.length == values.length, 338 | s"Keys names length doesn't match with values length") 339 | 340 | if (keys.length > 0) { 341 | var i: Int = 0 342 | val elements = mutable.Map.empty[Int, Double] 343 | while (i < keys.length) { 344 | val key = keys(i) 345 | val value = values(i) 346 | 347 | keyIndex.get(key) match { 348 | // Take latest value for key 349 | case Some(idx) => 350 | elements(idx) = value 351 | // Accumulate values is all other enabled 352 | case None if allOtherEnabled => 353 | val allOther = elements.getOrElse(modelKeys.length, 0.0) 354 | elements.update(modelKeys.length, allOther + value) 355 | // Ignore key if all other is disables 356 | case None => 357 | } 358 | 359 | i += 1 360 | } 361 | Vectors.sparse(featureSize, elements.toBuffer) 362 | 363 | } else { 364 | Vectors.sparse(featureSize, Nil) 365 | } 366 | 367 | } else { 368 | throw new IllegalArgumentException(s"Keys and Values are not consistent") 369 | } 370 | } 371 | 372 | val outputColName = $(outputCol) 373 | val metadata = outputSchema($(outputCol)).metadata 374 | 375 | val encodedCol = encoder( 376 | dataset(s"$inputColName.$keyColName"), 377 | dataset(s"$inputColName.$valueColName").cast(ArrayType(DoubleType)) 378 | ).as(outputColName, metadata) 379 | 380 | if (getKeepInputCol) { 381 | dataset.select(col("*"), encodedCol) 382 | } else { 383 | val cols = dataset.schema.fieldNames.filter(_ != getInputCol).map(col) 384 | dataset.select(cols :+ encodedCol: _*) 385 | } 386 | } 387 | 388 | override def transformSchema(schema: StructType): StructType = { 389 | validateSchema(schema) 390 | 391 | val attrLabels = if ($(allOther)) labels :+ "all other" else labels 392 | val attrs: Array[Attribute] = attrLabels.map(lbl => new NumericAttribute(Some(lbl))) 393 | val attrGroup = new AttributeGroup($(outputCol), attrs) 394 | val outputSchema = SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 395 | 396 | if (getKeepInputCol) { 397 | outputSchema 398 | } else { 399 | StructType(outputSchema.filter(_.name != getInputCol)) 400 | } 401 | } 402 | 403 | override def 
copy(extra: ParamMap): GatherEncoderModel = { 404 | val copied = new GatherEncoderModel(uid, modelKeys) 405 | copyValues(copied, extra).setParent(parent) 406 | } 407 | 408 | } 409 | -------------------------------------------------------------------------------- /sparkext-example/src/main/resources/response.csv: -------------------------------------------------------------------------------- 1 | cookie,response 2 | wKgQaV0lHZanDrp,1 3 | rfTZLbQDwbu5mXV,1 4 | r1CSY234HTYdvE3,1 5 | 20ep6ddsVckCmFy,1 6 | Jga3f9JqZIuXBJ1,1 7 | 2THd1TMYVXjFLlI,1 8 | RmBr0GvcnHepocE,1 9 | 6bNRJD0f8rrvNHZ,1 10 | o9EXMfVigHmaoQM,1 11 | iLSH5Yhxsg5uf4q,1 12 | tmHEHROFGmji3zM,1 13 | gjR5HgiHWtNZuqK,1 14 | kdx8i3MJqLmDJV6,1 15 | vypM7m3z6SSRvGZ,1 16 | xq63eKSvrJFq5aL,1 17 | W4VBodItMK8475Q,1 18 | ZFjrfbTYBxadHQ1,1 19 | VAG5hYt89GntLYU,1 20 | 9WNbUDJvypHi4JK,1 21 | cJ5FhIM6lXTIFFZ,1 22 | zpTvRW58SN2CJ4E,1 23 | UJlUVPXDc4OaWmn,1 24 | dPWY5AiqqWVG3JJ,1 25 | 9fMT2NB9DFWl4Ox,1 26 | t9QW1NCFCWbPOJt,1 27 | c6tA8uzWFr9t1im,1 28 | 2ra0Q3AQp8qSlc9,1 29 | jVVrUj6wCsWcTOs,1 30 | qFmzD8NrgLmpvux,1 31 | VsPopNiYNgrDuls,1 32 | PU9OtYfnS2oOyCM,1 33 | 8lt5BpBLw2ahM1N,1 34 | mG6WRTKprzaTokJ,1 35 | U3J2Nv1EKokw4XS,1 36 | JCPgwGGXPFXINhC,1 37 | 1XmlUyrrVrBkiik,1 38 | E2YwHbCRoKXzzEc,1 39 | VwO5WFhxOooeCWb,1 40 | MDJqzz1Lsf4QfCX,1 41 | ehI13pvBcz8CEPr,1 42 | vtfqDMpCBLrpRLN,1 43 | 7lf3IRE7hPAM18w,1 44 | 50SWlFbA9or3KUJ,1 45 | bCUamTnw4qIEZ4J,1 46 | xwj65CbKrSuCygd,1 47 | Alq7HpCKRFcRgt8,1 48 | KOHN2oFFdv7UBLQ,1 49 | kwad6cmTpUk95Cr,1 50 | QxJ8zBHwkPNOxRZ,1 51 | v8EUlcsKj0KvYPr,1 52 | 9YYbK3joLYHQTGA,1 53 | fP8bh0C4xFuStUO,1 54 | Tn5LnVL9DjRcOCa,1 55 | SjExrrkZvl12k05,1 56 | fwN0yAl9GtRoAnY,1 57 | laSq5tUROYvMcID,1 58 | lJuqKJuDA3w5BZz,1 59 | cdkFNkwOneAQrND,1 60 | Lc3Rb0Gjd52eV3f,1 61 | JVLsLcx55JY1RZB,1 62 | DFnWOsYjpNFSw4u,1 63 | lpVHS3s9iJN9rg7,1 64 | kS5awEFgijnfKpC,1 65 | nNi7JzDeLIqC7Gi,1 66 | tvwF4qazBMkEBtP,1 67 | cdBUHoZRBS0mRXN,1 68 | T4g7WskDNNfOzUt,1 69 | lC18SA8sWsRldOJ,1 70 | z38D6CRkiCUH1Je,1 71 | meyO2Y4fOq3ZvpG,1 72 | KuPG6y3WgCom6OZ,1 73 | VjigIqWnYJMXBkJ,1 74 | px8OpJHykJjcTM8,1 75 | 4A4ZV38VHHUKl6E,1 76 | KzBsGh8zNkgJddb,1 77 | eNMQM0vh18nQfVY,1 78 | a57XcvW4vxZ473U,1 79 | hkvRWXnqTT2hrac,1 80 | pXP2e9mM6kkeYwt,1 81 | DcRbZkX92pDKHL8,1 82 | z70255CEsUQPmLw,1 83 | pDwUTusdFjPjwQM,1 84 | sfAW2avORX4ZT1F,1 85 | 4mZGAbuXKO3Qo6I,1 86 | pa9fn7FOC04FKXZ,1 87 | 6kXq4DMYsjKrLwf,1 88 | VcZsVQOQPAyz00E,1 89 | E2KHEHzA5lAIZtH,1 90 | 9iVEePY5KXgRpPf,1 91 | WkwvVLxSjRHxG5P,1 92 | GRLd5YubtEAyQlT,1 93 | 2X2ZBYw0vN6W5pX,1 94 | diL9uidQfbrPZYd,1 95 | AsIboE6VpP8liw6,1 96 | wffozKQkAi7BA97,1 97 | RaTLueWZCzjSsd1,1 98 | x0DHdKd9q1tJlMn,1 99 | KJOIbSDiRDe89R7,1 100 | xdx58SVBtxisreg,1 101 | Lxdrm3ZBgJ01n7r,1 102 | nPKHjz4XuXLjNat,0 103 | OoZYvAABOK8MpNu,0 104 | bv7jb8IcBPt5Ltu,0 105 | SjIdtw8LPsFqC4H,0 106 | PJqpUjktDxT6Hnv,0 107 | AQTT3ls2mO33hJK,0 108 | cvsPwm76aEMBtiL,0 109 | P7WoOvi9arjw9Y5,0 110 | ugRKYkq4lXlFnCw,0 111 | 8VZYKXzTFPEe5TZ,0 112 | R3aoS4yr0uXNBKn,0 113 | xlBcZLHtiogfmdy,0 114 | iBK8oiA48Ht2Ny2,0 115 | mkkCJ48GPs3QMN8,0 116 | 0PFARBmT6ALY8tS,0 117 | stdjSXIbzYvN3t8,0 118 | PPhUck0Krpe2nRw,0 119 | 3shCPyPOra01iFu,0 120 | RrAvFltqb1TzE5h,0 121 | QTP5HroNmKQeF59,0 122 | TmLjI8nN9gfLWRn,0 123 | TMENYL4j7vEty47,0 124 | NIqL0zJnZLkIU4b,0 125 | htip4jmn20j9N4q,0 126 | 1pcUDVIUWEcp0xv,0 127 | Xs2p7YI1wYpv9eo,0 128 | mAXtAGZc11HSaXl,0 129 | 2nHw31xl5g6RDcX,0 130 | daOsbOvkkpt4Zrv,0 131 | 3nWP6lH7EIJoz9z,0 132 | BrQ2ixqg5JY5b3G,0 133 | aKqDseFXcsFB151,0 134 | jlC7SgoA9sMKpEu,0 135 | xQZzaY4Qrwbcd8X,0 136 | G48sd0cGVKChY0d,0 137 | 
[... response.csv continues with one `<id>,<response>` row per line (all responses 0 in this portion), through line 1002 ...]
--------------------------------------------------------------------------------