├── project ├── build.properties ├── GitHelper.scala ├── VersionScheme.scala ├── Dependencies.scala ├── Dependency.scala ├── plugins.sbt ├── ShellPrompt.scala └── TestSettings.scala ├── sparkext-sql ├── build.sbt └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ ├── ext │ │ └── functions.scala │ │ └── catalyst │ │ └── expressions │ │ └── aggregates.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── sql │ └── ExtAggregatesSpec.scala ├── sparkext-test ├── build.sbt └── src │ └── test │ ├── resources │ ├── log4j-turned-off.properties │ └── log4j.properties │ └── scala │ └── com │ └── collective │ └── TestSparkContext.scala ├── sparkext-example ├── build.sbt └── src │ └── main │ ├── scala │ └── com │ │ └── collective │ │ └── sparkext │ │ └── example │ │ ├── package.scala │ │ ├── InMemorySparkContext.scala │ │ ├── DataGenerator.scala │ │ └── SparkMlExtExample.scala │ └── resources │ └── response.csv ├── sparkext-mllib ├── build.sbt └── src │ ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ ├── mllib │ │ └── evaluation │ │ │ ├── BinaryModelMetricComputer.scala │ │ │ └── BinaryModelMetrics.scala │ │ └── ml │ │ ├── feature │ │ ├── sharedParams.scala │ │ ├── S2CellTransformer.scala │ │ ├── StringToShortIndexer.scala │ │ ├── Gather.scala │ │ ├── Binning.scala │ │ └── GatherEncoder.scala │ │ ├── sampling │ │ └── Downsampling.scala │ │ └── classification │ │ └── LocalLogisticRegression.scala │ └── test │ └── scala │ └── org │ └── apache │ └── spark │ ├── ml │ ├── feature │ │ ├── StringToShortIndexerSpec.scala │ │ ├── S2CellTransformerSpec.scala │ │ ├── SplitOptimizerSpec.scala │ │ ├── GatherSpec.scala │ │ ├── BinningSpec.scala │ │ ├── GatherEncoderModelSpec.scala │ │ └── GatherEncoderSpec.scala │ ├── sampling │ │ └── DownsamplingSpec.scala │ ├── TestingUtils.scala │ └── classification │ │ └── LocalLogisticRegressionSpec.scala │ └── mllib │ └── evaluation │ ├── BinaryModelMetricComputerSpec.scala │ └── BinaryModelMetricsSpec.scala ├── version.sbt ├── .travis.yml ├── .gitignore ├── README.md ├── scalastyle-config.xml └── LICENSE /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.8 2 | -------------------------------------------------------------------------------- /sparkext-sql/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtSql 2 | -------------------------------------------------------------------------------- /sparkext-test/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtTest 2 | -------------------------------------------------------------------------------- /sparkext-example/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtExample 2 | -------------------------------------------------------------------------------- /sparkext-mllib/build.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies ++= Dependencies.sparkExtMllib 2 | -------------------------------------------------------------------------------- /project/GitHelper.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object GitHelper { 4 | 5 | def headSha(): String = Process("git rev-parse --short 
HEAD").!!.stripLineEnd 6 | 7 | } 8 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/package.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext 2 | 3 | import scala.util.Random 4 | 5 | 6 | package object example { 7 | val rnd = new Random() 8 | } 9 | -------------------------------------------------------------------------------- /project/VersionScheme.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object VersionScheme { 4 | 5 | object Keys { 6 | 7 | val isRelease = Def.settingKey[Boolean]("True if this is a release") 8 | 9 | val versionPrefix = Def.settingKey[String]( 10 | "Prefix of the version string") 11 | 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /sparkext-test/src/test/resources/log4j-turned-off.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=ERROR, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.err 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | import VersionScheme.Keys._ 2 | 3 | isRelease in ThisBuild := sys.props("release") == "true" 4 | 5 | versionPrefix in ThisBuild := "0.0.23" 6 | 7 | version in ThisBuild <<= Def.setting[String] { 8 | if (isRelease.value) { 9 | versionPrefix.value 10 | } else { 11 | val headSha = GitHelper.headSha() 12 | s"${versionPrefix.value}.$headSha" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /sparkext-test/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Verbose logging for Collective packages 9 | log4j.logger.com.collective=TRACE 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use Docker-based container (instead of OpenVZ) 2 | sudo: false 3 | 4 | cache: 5 | directories: 6 | - $HOME/.ivy2/cache 7 | - $HOME/.sbt/boot/scala-$TRAVIS_SCALA_VERSION 8 | 9 | language: scala 10 | scala: 11 | - 2.11.7 12 | - 2.10.6 13 | jdk: 14 | - oraclejdk8 15 | 16 | script: 17 | - sbt ++$TRAVIS_SCALA_VERSION -J-Xmx2512m clean test 18 | 19 | # Tricks to avoid unnecessary cache updates 20 | - find $HOME/.sbt -name "*.lock" | xargs rm 21 | - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm 22 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | 2 | object Dependencies { 3 | 4 | import Dependency._ 5 | 6 | val sparkExtSql 
= 7 | Seq( 8 | sparkSql % "provided" 9 | , Test.scalaTest 10 | ) 11 | 12 | val sparkExtMllib = 13 | Seq( 14 | sparkMLLib % "provided" 15 | , s2Geometry 16 | , Test.scalaTest 17 | ) 18 | 19 | val sparkExtTest = 20 | Seq( 21 | sparkSql % "provided" 22 | , Test.scalaTest 23 | ) 24 | 25 | val sparkExtExample = 26 | Seq( 27 | sparkMLLib 28 | ) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /sparkext-sql/src/main/scala/org/apache/spark/sql/ext/functions.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.ext 2 | 3 | import org.apache.spark.sql.catalyst.expressions._ 4 | import org.apache.spark.sql._ 5 | 6 | import scala.language.implicitConversions 7 | 8 | // scalastyle:off 9 | object functions { 10 | // scalastyle:on 11 | 12 | private[this] implicit def toColumn(expr: Expression): Column = Column(expr) 13 | 14 | // TODO: Workaround for https://issues.apache.org/jira/browse/SPARK-9301 15 | def collectArray(expr: Column): Column = CollectArray(expr.expr) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /project/Dependency.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | 4 | object Dependency { 5 | 6 | object V { 7 | 8 | val Spark = "1.5.2" 9 | val S2Geometry = "1.0" 10 | 11 | val ScalaTest = "2.2.4" 12 | 13 | } 14 | 15 | val sparkSql = "org.apache.spark" %% "spark-sql" % V.Spark 16 | val sparkMLLib = "org.apache.spark" %% "spark-mllib" % V.Spark 17 | 18 | val s2Geometry = "com.google.common.geometry" % "s2-geometry" % V.S2Geometry intransitive() 19 | 20 | object Test { 21 | 22 | val scalaTest = "org.scalatest" %% "scalatest" % V.ScalaTest % "test" 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | libraryDependencies += "org.slf4j" % "slf4j-nop" % "1.7.5" 2 | 3 | resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven" 4 | 5 | // Dependency graph 6 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5") 7 | 8 | // Check Scala style 9 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.7.0") 10 | 11 | // Publish unified documentation to site 12 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.3.3") 13 | 14 | // Publish to bintray 15 | addSbtPlugin("me.lessis" % "bintray-sbt" % "0.3.0") 16 | 17 | // Publish unidoc to Github pages 18 | addSbtPlugin("com.typesafe.sbt" % "sbt-site" % "0.7.1") 19 | 20 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.5.2") 21 | -------------------------------------------------------------------------------- /project/ShellPrompt.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | import scala.language.postfixOps 4 | 5 | object ShellPrompt { 6 | 7 | object devnull extends ProcessLogger { 8 | def info (s: => String): Unit = {} 9 | def error (s: => String): Unit = {} 10 | def buffer[T] (f: => T): T = f 11 | } 12 | 13 | val current = """\*\s+([\w-/]+)""".r 14 | 15 | def gitBranches = "git branch --no-color" lines_! 
devnull mkString
16 | 
17 |   val buildShellPrompt = {
18 |     (state: State) => {
19 |       val currBranch =
20 |         current findFirstMatchIn gitBranches map (_ group(1)) getOrElse "-"
21 |       val currProject = Project.extract (state).currentProject.id
22 |       "%s:%s> ".format (
23 |         currProject, currBranch
24 |       )
25 |     }
26 |   }
27 | }
28 | 
--------------------------------------------------------------------------------
/project/TestSettings.scala:
--------------------------------------------------------------------------------
1 | import sbt.Keys._
2 | import sbt._
3 | import org.scalastyle.sbt.ScalastylePlugin
4 | 
5 | 
6 | object TestSettings {
7 | 
8 |   private[this] lazy val checkScalastyle = taskKey[Unit]("checkScalastyle")
9 | 
10 |   def testSettings: Seq[Def.Setting[_]] = Seq(
11 |     fork in Test := true,
12 | 
13 |     // Run Scalastyle as a part of tests
14 |     checkScalastyle := ScalastylePlugin.scalastyle.in(Compile).toTask("").value,
15 |     test in Test <<= (test in Test) dependsOn checkScalastyle,
16 | 
17 |     // Disable logging in all tests
18 |     javaOptions in Test += "-Dlog4j.configuration=log4j-turned-off.properties",
19 | 
20 |     // Generate JUnit test reports
21 |     testOptions in Test <+= (target in Test) map {
22 |       t => Tests.Argument(TestFrameworks.ScalaTest, "-u", (t / "test-reports").toString)
23 |     }
24 |   )
25 | 
26 | }
27 | 
--------------------------------------------------------------------------------
/sparkext-mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricComputer.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.mllib.evaluation
2 | 
3 | import org.apache.spark.mllib.evaluation.binary.{BinaryClassificationMetricComputer, BinaryConfusionMatrix, Recall}
4 | 
5 | /** Reach: the fraction of the total population that is predicted positive. Defined as 1.0 when the total population is empty. 
*/ 6 | private[evaluation] object Reach extends BinaryClassificationMetricComputer { 7 | override def apply(c: BinaryConfusionMatrix): Double = { 8 | val totalPopulation = c.numNegatives + c.numPositives 9 | if (totalPopulation == 0) { 10 | 1.0 11 | } else { 12 | (c.numTruePositives.toDouble + c.numFalsePositives.toDouble) / totalPopulation 13 | } 14 | } 15 | } 16 | 17 | private[evaluation] object Lift extends BinaryClassificationMetricComputer { 18 | override def apply(c: BinaryConfusionMatrix): Double = { 19 | Recall(c) / Reach(c) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /sparkext-test/src/test/scala/com/collective/TestSparkContext.scala: -------------------------------------------------------------------------------- 1 | package com.collective 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | import scala.concurrent.duration._ 7 | import scala.concurrent.{Await, Future} 8 | 9 | object TestSparkContext { 10 | 11 | private[this] val conf = 12 | new SparkConf() 13 | .setMaster("local[1]") 14 | .set("spark.local.ip","localhost") 15 | .set("spark.driver.host","localhost") 16 | .setAppName("Spark Ext Test") 17 | 18 | lazy val sc: SparkContext = new SparkContext(conf) 19 | 20 | lazy val sqlContext: SQLContext = new SQLContext(sc) 21 | } 22 | 23 | 24 | trait TestSparkContext { 25 | 26 | lazy val sc: SparkContext = TestSparkContext.sc 27 | 28 | lazy val sqlContext: SQLContext = TestSparkContext.sqlContext 29 | 30 | def waitFor[T](f: Future[T], timeout: Duration = 5.second): T = { 31 | Await.result(f, timeout) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/sharedParams.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.param.{Param, Params} 4 | 5 | /** 6 | * Trait for shared param keyCol. 7 | */ 8 | private[ml] trait HasKeyCol extends Params { 9 | 10 | /** 11 | * Param for category column name. 12 | * @group param 13 | */ 14 | final val keyCol: Param[String] = new Param[String](this, "keyCol", 15 | "Column that holds value for category name") 16 | 17 | /** @group getParam */ 18 | def getCategoryCol: String = $(keyCol) 19 | } 20 | 21 | /** 22 | * Trait for shared param valueCol. 23 | */ 24 | private[ml] trait HasValueCol extends Params { 25 | 26 | /** 27 | * Param for value column name. 
28 | * @group param 29 | */ 30 | val valueCol: Param[String] = new Param[String](this, "valueCol", 31 | "Column that holds a value for category") 32 | 33 | 34 | /** @group getParam */ 35 | def getValueCol: String = $(valueCol) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | 3 | ## Directory-based project format: 4 | .idea/ 5 | 6 | ## File-based project format: 7 | *.ipr 8 | *.iws 9 | 10 | ## Plugin-specific files: 11 | 12 | # IntelliJ 13 | /out/ 14 | 15 | # mpeltonen/sbt-idea plugin 16 | .idea_modules/ 17 | 18 | # JIRA plugin 19 | atlassian-ide-plugin.xml 20 | 21 | # Crashlytics plugin (for Android Studio and IntelliJ) 22 | com_crashlytics_export_strings.xml 23 | crashlytics.properties 24 | crashlytics-build.properties 25 | ### SBT template 26 | # Simple Build Tool 27 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control 28 | 29 | target/ 30 | lib_managed/ 31 | src_managed/ 32 | project/boot/ 33 | .history 34 | .cache 35 | ### Scala template 36 | *.class 37 | *.log 38 | 39 | # sbt specific 40 | .cache 41 | .history 42 | .lib/ 43 | dist/* 44 | target/ 45 | lib_managed/ 46 | src_managed/ 47 | project/boot/ 48 | project/plugins/project/ 49 | 50 | # Scala-IDE specific 51 | .scala_dependencies 52 | .worksheet 53 | 54 | # Created by .ignore support plugin (hsz.mobi) 55 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/InMemorySparkContext.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | 6 | import scala.concurrent.{Await, Future} 7 | import scala.concurrent.duration._ 8 | 9 | object InMemorySparkContext { 10 | 11 | private[this] val conf = 12 | new SparkConf() 13 | .setMaster("local[4]") 14 | .set("spark.local.ip", "localhost") 15 | .set("spark.driver.host", "localhost") 16 | .set("spark.sql.tungsten.enabled", "false") 17 | .setAppName("Spark Ext Example App") 18 | 19 | lazy val sc: SparkContext = new SparkContext(conf) 20 | 21 | lazy val sqlContext: SQLContext = new SQLContext(sc) 22 | } 23 | 24 | 25 | trait InMemorySparkContext { 26 | 27 | lazy val sc: SparkContext = InMemorySparkContext.sc 28 | 29 | lazy val sqlContext: SQLContext = InMemorySparkContext.sqlContext 30 | 31 | def waitFor[T](f: Future[T], timeout: Duration = 5.second): T = { 32 | Await.result(f, timeout) 33 | } 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/StringToShortIndexerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.ml.attribute.{NominalAttribute, Attribute} 5 | import org.scalatest.FlatSpec 6 | 7 | class StringToShortIndexerSpec extends FlatSpec with TestSparkContext { 8 | 9 | "StringToShortIndexer" should "assign correct index for columns" in { 10 | val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) 11 | val df = sqlContext.createDataFrame(data).toDF("id", "label") 12 | val indexer = new StringToShortIndexer() 13 | 
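      // fit() orders labels by descending frequency, so "a" (3 rows) -> 0, "c" (2 rows) -> 1, "b" (1 row) -> 2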
.setInputCol("label") 14 | .setOutputCol("labelIndex") 15 | .fit(df) 16 | 17 | val transformed = indexer.transform(df) 18 | val attr = Attribute.fromStructField(transformed.schema("labelIndex")) 19 | .asInstanceOf[NominalAttribute] 20 | assert(attr.values.get === Array("a", "c", "b")) 21 | val output = transformed.select("id", "labelIndex").map { r => 22 | (r.getInt(0), r.getShort(1)) 23 | }.collect().toSet 24 | // a -> 0, b -> 2, c -> 1 25 | val expected = Set((0, 0), (1, 2), (2, 1), (3, 0), (4, 0), (5, 1)) 26 | assert(output === expected) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricComputerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.evaluation 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.evaluation.binary.{Recall, BinaryConfusionMatrixImpl, BinaryLabelCounter} 5 | import org.scalatest.{GivenWhenThen, FlatSpec} 6 | 7 | class BinaryModelMetricComputerSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 8 | 9 | val confusions = Seq( 10 | BinaryConfusionMatrixImpl(new BinaryLabelCounter(1, 0), new BinaryLabelCounter(5, 5)), 11 | BinaryConfusionMatrixImpl(new BinaryLabelCounter(5, 2), new BinaryLabelCounter(5, 5)) 12 | ) 13 | 14 | behavior of "AudienceReach" 15 | confusions foreach { 16 | b => { 17 | it should s"compute proper reach for $b" in { 18 | Given(s"confusion matrix entry $b") 19 | val expectedAudienceReach = (b.count.numPositives + b.count.numNegatives).toDouble / 20 | (b.totalCount.numNegatives + b.totalCount.numPositives) 21 | 22 | Then(s"audience reach should be equal to $expectedAudienceReach") 23 | assert(Reach(b) === expectedAudienceReach) 24 | } 25 | } 26 | } 27 | 28 | behavior of "Lift" 29 | confusions foreach { 30 | b => { 31 | it should s"compute proper lift for $b" in { 32 | Given(s"confusion matrix entry $b") 33 | val expectedAudienceReach = (b.count.numPositives + b.count.numNegatives).toDouble / 34 | (b.totalCount.numNegatives + b.totalCount.numPositives) 35 | val expectedLift = Recall(b)/expectedAudienceReach 36 | 37 | Then(s"lift should be equal to $expectedLift") 38 | assert(Lift(b) === expectedLift) 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/S2CellTransformerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest.{FlatSpec, GivenWhenThen} 7 | 8 | 9 | class S2CellTransformerSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 10 | 11 | val schema = StructType(Seq( 12 | StructField("city", StringType), 13 | StructField("lat", DoubleType), 14 | StructField("lon", DoubleType) 15 | )) 16 | 17 | val cities = sqlContext.createDataFrame(sc.parallelize(Seq( 18 | Row("New York", 40.7142700, -74.0059700), 19 | Row("London", 51.50722, -0.12750), 20 | Row("Princeton", 40.3487200, -74.6590500) 21 | )), schema) 22 | 23 | def cellMap(rows: Array[Row]): Map[String, String] = { 24 | rows.map { case Row(city: String, _, _, cell: String) => city -> cell }.toMap 25 | } 26 | 27 | "S2 Cell Transformer" should "compute S2 Cell Id for level = 6" in { 28 | 
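    // at level 6 an S2 cell spans on the order of 100+ km, so New York and Princeton (~70 km apart) are expected to fall into the same cell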
Given("S2 Cell Transformer with level = 6") 29 | val s2CellTransformer = new S2CellTransformer().setLevel(6) 30 | val transformed = s2CellTransformer.transform(cities) 31 | val cells = cellMap(transformed.collect()) 32 | Then("New York should be in the same cell with Princeton") 33 | assert(cells("New York") == cells("Princeton")) 34 | } 35 | 36 | it should "compute S2 Cell Id for level = 12" in { 37 | Given("S2 Cell Transformer with level = 12") 38 | val s2CellTransformer = new S2CellTransformer().setLevel(12) 39 | val transformed = s2CellTransformer.transform(cities) 40 | val cells = cellMap(transformed.collect()) 41 | Then("all cities should in it's onw cell") 42 | assert(cells.values.toSet.size == 3) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/mllib/evaluation/BinaryModelMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.evaluation 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.scalatest.{GivenWhenThen, FlatSpec} 6 | 7 | /** 8 | * We are just testing gains and lift methods. 9 | * Since code for this class was copied from spark 1.5.0 10 | */ 11 | class BinaryModelMetricsSpec extends FlatSpec with GivenWhenThen with TestSparkContext { 12 | 13 | val scoreAndLabels: RDD[(Double, Double)] = sc.parallelize(Seq( 14 | (0.8, 0.0), 15 | (0.7, 1.0), 16 | (0.3, 0.0), 17 | (0.9, 1.0), 18 | (0.6, 0.0), 19 | (0.6, 1.0), 20 | (0.6, 0.0), 21 | (0.8, 1.0), 22 | (0.2, 0.0), 23 | (0.5, 1.0) 24 | ), 1) 25 | 26 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels) 27 | 28 | behavior of "BinaryModelMetrics" 29 | 30 | it should "compute gains chart" in { 31 | Given(s"score and labels set with 7 unique scores") 32 | When("creating BinaryModelMetrics without bins specified") 33 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels) 34 | val gainsChart = modelMetricsNoBin.gains() 35 | 36 | Then("resulting gains chart should have 9 pair of coordinates") 37 | assert(gainsChart.count() === 9) 38 | } 39 | 40 | 41 | it should "compute gains chart with numBins = 3" in { 42 | Given(s"score and labels set with 7 unique scores") 43 | When("creating BinaryModelMetrics with 3 bins specified") 44 | val modelMetricsNoBin = new BinaryModelMetrics(scoreAndLabels, 3) 45 | val gainsChart = modelMetricsNoBin.gains() 46 | 47 | val expectedGainsPoints = (1 + Math.ceil(7.toDouble/(7/3)) + 1).toInt 48 | Then(s"resulting gains chart should have $expectedGainsPoints pair of coordinates") 49 | assert(gainsChart.count() === expectedGainsPoints) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/SplitOptimizerSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.scalatest._ 4 | 5 | class SplitOptimizerSpec extends FlatSpec with ShouldMatchers with SplitOptimizer { 6 | 7 | "SplitOptimizer" should "get from diff to original values" in { 8 | val diff = Array(0.1, 0.21, 0.05, 0.5) 9 | assert(fromDiff(diff).toSeq == Seq(0.1, 0.31, 0.36, 0.86)) 10 | } 11 | 12 | it should "get diff from original values" in { 13 | val values = Array(0.1, 0.31, 0.37, 0.88) 14 | assert(toDiff(values).toSeq == Seq(0.1, 0.21, 0.06, 0.51)) 15 | } 16 | 17 | it should "calculate perfect split of 9" in { 18 | 
val x = (0 until 100).toArray.map(_.toDouble + math.random - math.random) 19 | 20 | val splits = optimalSplit(x, 9) 21 | assert(splits.length == 9) 22 | 23 | splits.zipWithIndex.foreach { case (s, idx) => 24 | s should be ((idx + 1) * (x.length.toDouble / 10) +- 2.5) 25 | } 26 | } 27 | 28 | it should "calculate perfect split for highly skewed data" in { 29 | 30 | // R: x <- exp(rnorm(1000)) 31 | 32 | // Heavy right skewed data 33 | val g = breeze.stats.distributions.Gaussian(0, 1) 34 | val skewed = g.sample(1000).map(d => math.exp(d)).toArray 35 | 36 | val splits = optimalSplit(skewed, 9) 37 | assert(splits.length == 9) 38 | 39 | val cnt = counts(skewed)(splits) 40 | assert(cnt.sum == skewed.length) 41 | 42 | cnt.foreach { count => 43 | count should be((skewed.length / 10) +- 5) 44 | } 45 | } 46 | 47 | private def counts(x: Array[Double])(p: Seq[Double]): Seq[Int] = { 48 | val splits = Double.NegativeInfinity +: p :+ Double.PositiveInfinity 49 | 50 | val count = splits.sliding(2) map { case split => 51 | val low = split(0) 52 | val high = split(1) 53 | val filter = (v: Double) => v >= low && v < high 54 | x.count(filter) 55 | } 56 | 57 | count.toSeq 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest._ 7 | 8 | import scala.collection.mutable 9 | 10 | class GatherSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 11 | 12 | val schema = StructType(Seq( 13 | StructField("cookie_id", StringType), 14 | StructField("site", StringType), 15 | StructField("impressions", LongType) 16 | )) 17 | 18 | val cookie1 = "cookie1" 19 | val cookie2 = "cookie2" 20 | val cookie3 = "cookie3" 21 | 22 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq( 23 | Row(cookie1, "google.com", 10L), 24 | Row(cookie1, "cnn.com", 14L), 25 | Row(cookie1, "google.com", 2L), 26 | Row(cookie2, "bbc.com", 20L), 27 | Row(cookie2, "auto.com", null), 28 | Row(cookie2, "auto.com", 1L), 29 | Row(cookie3, "sport.com", 100L) 30 | )), schema) 31 | 32 | "Gather Transformer" should "transform 'long' DataFrame into 'wide'" in { 33 | val gather = new Gather() 34 | .setPrimaryKeyCols("cookie_id") 35 | .setKeyCol("site") 36 | .setValueCol("impressions") 37 | .setOutputCol("sites") 38 | 39 | val gathered = gather.transform(impressionLog) 40 | 41 | val lookupMap: Map[String, Map[String, Double]] = 42 | gathered.collect().map { case Row(cookieId: String, map: mutable.WrappedArray[_]) => 43 | val imps = map.map { case Row(site: String, impressions: Double) => site -> impressions }.toMap 44 | cookieId -> imps 45 | }.toMap 46 | 47 | assert(lookupMap(cookie1)("google.com") == 12.0) 48 | assert(lookupMap(cookie1)("cnn.com") == 14.0) 49 | assert(lookupMap(cookie2)("bbc.com") == 20.0) 50 | assert(lookupMap(cookie2)("auto.com") == 1.0) 51 | assert(lookupMap(cookie3)("sport.com") == 100.0) 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /sparkext-sql/src/test/scala/org/apache/spark/sql/ExtAggregatesSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql 2 | 3 | import 
com.collective.TestSparkContext 4 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 5 | import org.scalatest.FlatSpec 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.ext.functions._ 8 | 9 | import scala.collection.mutable 10 | 11 | class ExtAggregatesSpec extends FlatSpec with TestSparkContext { 12 | 13 | val schema = StructType(Seq( 14 | StructField("cookie_id", StringType), 15 | StructField("site", StringType), 16 | StructField("impressions", LongType) 17 | )) 18 | 19 | val cookie1 = "cookie1" 20 | val cookie2 = "cookie2" 21 | val cookie3 = "cookie3" 22 | 23 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq( 24 | Row(cookie1, "google.com", 10L), 25 | Row(cookie1, "cnn.com", 14L), 26 | Row(cookie1, "google.com", 2L), 27 | Row(cookie2, "bbc.com", 20L), 28 | Row(cookie2, "auto.com", null), 29 | Row(cookie2, "auto.com", 1L), 30 | Row(cookie3, "sport.com", 100L) 31 | )), schema) 32 | 33 | "Ext Aggregates" should "collect column values as array" in { 34 | val cookies = impressionLog 35 | .select(collectArray(col("cookie_id"))) 36 | .first().getAs[mutable.WrappedArray[String]](0) 37 | assert(cookies.length == 7) 38 | assert(cookies.toSet.size == 3) 39 | } 40 | 41 | it should "collect distinct values as array" in { 42 | val distinctCookies = impressionLog.select(col("cookie_id")) 43 | .distinct() 44 | .select(collectArray(col("cookie_id"))) 45 | .first().getAs[mutable.WrappedArray[String]](0) 46 | assert(distinctCookies.length == 3) 47 | } 48 | 49 | it should "collect values after group by" in { 50 | val result = impressionLog 51 | .groupBy(col("cookie_id")) 52 | .agg(collectArray(col("site"))) 53 | 54 | val cookieSites = result.collect().map { case Row(cookie: String, sites: mutable.WrappedArray[_]) => 55 | cookie -> sites.toSeq 56 | }.toMap 57 | 58 | assert(cookieSites(cookie1).length == 3) 59 | assert(cookieSites(cookie2).length == 3) 60 | assert(cookieSites(cookie3).length == 1) 61 | 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/sampling/DownsamplingSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import java.util.UUID 4 | 5 | import com.collective.TestSparkContext 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.types._ 8 | import org.scalatest._ 9 | 10 | import scala.util.Random 11 | 12 | 13 | class DownsamplingSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 14 | 15 | val schema = StructType(Seq( 16 | StructField("cookie_id", StringType), 17 | StructField("label", DoubleType) 18 | )) 19 | 20 | def cookieId = UUID.randomUUID().toString 21 | 22 | def positives(n: Int): Seq[Row] = Seq.fill(n)(Row(cookieId, 1.0)) 23 | def negatives(n: Int): Seq[Row] = Seq.fill(n)(Row(cookieId, 0.0)) 24 | 25 | val dataset1 = sqlContext.createDataFrame(sc.parallelize(Random.shuffle(positives(100) ++ negatives(900))), schema) 26 | val dataset2 = sqlContext.createDataFrame(sc.parallelize(Random.shuffle(positives(100) ++ negatives(9000))), schema) 27 | 28 | "Downsampling" should "skip sampling if class ratio is below threshold" in { 29 | val downsampling = new Downsampling() 30 | .setLabelCol("label") 31 | .setOutputCol("sample_weight") 32 | .setPrimaryClass(1.0) 33 | 34 | val model = downsampling.fit(dataset1) 35 | assert(model.sampleFraction.isEmpty) 36 | 37 | val sampled = 
model.transform(dataset1) 38 | assert(sampled.schema("sample_weight").dataType == DoubleType) 39 | 40 | val w = sampled.select("sample_weight").collect().map(_.getDouble(0)).toSet 41 | assert(w.size == 1) 42 | assert(w.head == 1.0) 43 | } 44 | 45 | it should "sample negatives if class ratio is above threshold" in { 46 | val downsampling = new Downsampling() 47 | .setLabelCol("label") 48 | .setOutputCol("sample_weight") 49 | .setMaxClassRatio(29.0) 50 | .setPrimaryClass(1.0) 51 | 52 | val model = downsampling.fit(dataset2) 53 | assert(model.sampleFraction.isDefined) 54 | val fraction = model.sampleFraction.get 55 | val expectedFraction = 2900.0 / 9000 56 | fraction should (be >= 0.9 * expectedFraction and be <= 1.1 * expectedFraction) 57 | 58 | val sampled = model.transform(dataset2) 59 | assert(sampled.schema("sample_weight").dataType == DoubleType) 60 | 61 | sampled.count() should (be >= 2900L and be <= 3100L) 62 | 63 | val sampleWeight = sampled.select("label", "sample_weight").collect().map(r => r.getDouble(0) -> r.getDouble(1)).toMap 64 | assert(sampleWeight.size == 2) 65 | 66 | val expectedSampleWeight = 9000.0 / 2900 67 | sampleWeight(1.0) should equal (1.0) 68 | sampleWeight(0.0) should (be >= 0.9 * expectedSampleWeight and be <= 1.1 * expectedSampleWeight) 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/S2CellTransformer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.google.common.geometry.{S2LatLng, S2CellId} 4 | import org.apache.spark.ml.Transformer 5 | import org.apache.spark.ml.attribute.NominalAttribute 6 | import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types.{DoubleType, StructType} 11 | 12 | /** 13 | * Transform latitude and longitude into S2 Cell id 14 | */ 15 | class S2CellTransformer(override val uid: String) extends Transformer { 16 | 17 | def this() = this(Identifiable.randomUID("S2CellTransformer")) 18 | 19 | // Input/Output column names 20 | 21 | val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") 22 | 23 | val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") 24 | 25 | val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") 26 | 27 | val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", 28 | (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) 29 | 30 | // Default parameters 31 | 32 | setDefault( 33 | latCol -> "lat", 34 | lonCol -> "lon", 35 | cellCol -> "cell", 36 | level -> 10 37 | ) 38 | 39 | def getLatCol: String = $(latCol) 40 | 41 | def getLonCol: String = $(lonCol) 42 | 43 | def getCellCol: String = $(cellCol) 44 | 45 | def getLevel: Int = $(level) 46 | 47 | def setLatCol(value: String): this.type = set(latCol, value) 48 | 49 | def setLonCol(value: String): this.type = set(lonCol, value) 50 | 51 | def setCellCol(value: String): this.type = set(cellCol, value) 52 | 53 | def setLevel(value: Int): this.type = set(level, value) 54 | 55 | override def transform(dataset: DataFrame): DataFrame = { 56 | val outputSchema = transformSchema(dataset.schema) 57 | val currentLevel = $(level) 58 | val t = udf { (lat: Double, lon: Double) => 59 | 
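      // leaf cell for the (lat, lon) point, then its ancestor at the configured level, emitted as a token string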
val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) 60 | cellId.parent(currentLevel).toToken 61 | } 62 | val metadata = outputSchema($(cellCol)).metadata 63 | dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) 64 | } 65 | 66 | override def transformSchema(schema: StructType): StructType = { 67 | val latColumnName = $(latCol) 68 | val latDataType = schema(latColumnName).dataType 69 | require(latDataType == DoubleType, 70 | s"The latitude column $latColumnName must be Double type, " + 71 | s"but got $latDataType.") 72 | 73 | val lonColumnName = $(lonCol) 74 | val lonDataType = schema(lonColumnName).dataType 75 | require(lonDataType == DoubleType, 76 | s"The longitude column $lonColumnName must be Double type, " + 77 | s"but got $lonDataType.") 78 | 79 | val inputFields = schema.fields 80 | val outputColName = $(cellCol) 81 | require(inputFields.forall(_.name != outputColName), 82 | s"Output column $outputColName already exists.") 83 | 84 | val attr = NominalAttribute.defaultAttr.withName($(cellCol)) 85 | val outputFields = inputFields :+ attr.toStructField() 86 | StructType(outputFields) 87 | } 88 | 89 | override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) 90 | } 91 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/DataGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import java.io.{PrintWriter, File} 4 | 5 | import scala.util.Random 6 | 7 | /** 8 | * Generate dummy dataset based on positive/negative predictors: site visitation log + geo location log 9 | */ 10 | object DataGenerator extends App with PositivePredictors with NegativePredictors { 11 | 12 | val (positive, negative) = Seq.fill(1000)(Random.alphanumeric.take(15).mkString).splitAt(100) 13 | 14 | val (pSites, pGeo, pResp) = generateDataset(positive, positivePredictors, negativePredictors, response = 1) 15 | val (nSites, nGeo, nResp) = generateDataset(negative, negativePredictors, positivePredictors, response = 0) 16 | 17 | // Write site impression log 18 | val sitesW = new PrintWriter(new File("sites.csv")) 19 | sitesW.println("cookie,site,impressions") 20 | (pSites ++ nSites).foreach(sitesW.println) 21 | sitesW.close() 22 | 23 | // Write geo impression log 24 | val geoW = new PrintWriter(new File("geo.csv")) 25 | geoW.println("cookie,lat,lon,impressions") 26 | (pGeo ++ nGeo).foreach(geoW.println) 27 | geoW.close() 28 | 29 | // Write response log 30 | val responseW = new PrintWriter(new File("response.csv")) 31 | responseW.println("cookie,response") 32 | (pResp ++ nResp).foreach(responseW.println) 33 | responseW.close() 34 | 35 | private def generateDataset( 36 | cookies: Seq[String], 37 | primaryPredictors: Predictors, 38 | secondaryPredictors: Predictors, 39 | response: Int, 40 | primaryImpMean: Int = 10, 41 | secondaryImpMean: Int = 3 42 | ): (Seq[String], Seq[String], Seq[String]) = { 43 | 44 | def impressions(mean: Int): Int = math.max(1, mean + (mean * rnd.nextGaussian()).toInt) 45 | 46 | val sites = cookies.flatMap { cookie => 47 | val primary = primaryPredictors.sites(6).map((_, impressions(primaryImpMean))) 48 | val secondary = secondaryPredictors.sites(3).map((_, impressions(secondaryImpMean))) 49 | (primary ++ secondary) map { case (site, imp) => s"$cookie,$site,$imp" } 50 | } 51 | 52 | val geo = cookies.flatMap { cookie => 53 | val primary = 
primaryPredictors.latLon(2).map((_, impressions(primaryImpMean))) 54 | val secondary = secondaryPredictors.latLon(1).map((_, impressions(secondaryImpMean))) 55 | (primary ++ secondary) map { case ((lat, lon), imp) => s"$cookie,$lat,$lon,$imp" } 56 | } 57 | 58 | val resp = cookies.map { cookie => s"$cookie,$response" } 59 | 60 | (sites, geo, resp) 61 | } 62 | 63 | } 64 | 65 | trait Predictors { 66 | 67 | def lat: Double 68 | def lon: Double 69 | def allSites: Seq[String] 70 | 71 | def sites(n: Int): Seq[String] = 72 | rnd.shuffle(allSites).take(1 + rnd.nextInt(n)) 73 | 74 | def latLon(n: Int): Seq[(Double, Double)] = 75 | Seq.fill(1 + rnd.nextInt(n))((lat + 3 * rnd.nextGaussian(), lon + 3 * rnd.nextGaussian())) 76 | } 77 | 78 | trait PositivePredictors { 79 | 80 | val positivePredictors = new Predictors { 81 | 82 | // New York 83 | val lat = 40.7127 84 | val lon = 74.0059 85 | 86 | val allSites = Seq( 87 | "google.com", "facebook.com", "amazon.com", 88 | "youtube.com", "yahoo.com", "ebay.com", "wikipedia.org", 89 | "twitter.com", "craiglist.com", "reddit.com", "netflix.com", 90 | "live.com", "bing.com", "linkedin.com", "pinterest.com" 91 | ) 92 | 93 | } 94 | } 95 | 96 | trait NegativePredictors { 97 | 98 | val negativePredictors = new Predictors { 99 | 100 | // Los Angeles 101 | val lat = 34.0500 102 | val lon = 118.2500 103 | 104 | val allSites = Seq( 105 | "imgur.com", "go.com", "tumblr.com", "espn.go.com", 106 | "cnn.com", "paypal.com", "chase.com", "instagram.com", "blogpost.com", 107 | "t.co", "msn.com", "imdb.com", "nytimes.com", "walmart.com", 108 | "huffingtonpost.com", "yelp.com", "diply.com" 109 | ) 110 | 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/BinningSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import java.util.UUID 4 | 5 | import com.collective.TestSparkContext 6 | import org.apache.spark.mllib.linalg.{SparseVector, Vector} 7 | import org.apache.spark.sql.Row 8 | import org.apache.spark.sql.types._ 9 | import org.scalatest.{FlatSpec, GivenWhenThen, ShouldMatchers} 10 | 11 | class BinningSpec extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 12 | 13 | val schema = StructType(Seq( 14 | StructField("cookie_id", StringType), 15 | StructField("num_days", IntegerType), 16 | StructField("ctr", DoubleType), 17 | StructField("actions", DoubleType) 18 | )) 19 | 20 | val N = 1000 21 | 22 | def cookieId = UUID.randomUUID().toString 23 | 24 | val users = sqlContext.createDataFrame(sc.parallelize((1 to N).map { i => 25 | Row(cookieId, i, math.random, if (math.random > 0.5) 10 * math.random else null) 26 | }), schema) 27 | 28 | "Optimal Binning" should "compute binning for ctr" in { 29 | val optimalBinning = new OptimalBinning() 30 | .setInputCol("ctr") 31 | .setOutputCol("ctr_bin") 32 | .setNumBins(5) 33 | 34 | val binning = optimalBinning.fit(users) 35 | 36 | assert(binning.getSplits.length == 6) 37 | binning.getSplits(1) should be(0.20 +- 0.5) 38 | binning.getSplits(2) should be(0.40 +- 0.5) 39 | binning.getSplits(3) should be(0.60 +- 0.5) 40 | binning.getSplits(4) should be(0.80 +- 0.5) 41 | 42 | val binned = binning.transform(users).collect() 43 | assert(binned.length == N) 44 | } 45 | 46 | "Binning" should "bin DoubleType column" in { 47 | val binning = new Binning() 48 | .setInputCol("ctr") 49 | .setOutputCol("ctr_bin") 50 | 
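      // the splits below define four bins over [0, 1): [0, 0.25), [0.25, 0.5), [0.5, 0.75), [0.75, 1.0)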
.setSplits(Array(0.0, 0.25, 0.5, 0.75, 1.0)) 51 | 52 | def validate(ctr: Double, bin: Vector) = { 53 | assert(bin.size == 4) 54 | assert(bin.toSparse.indices.length == 1) 55 | assert(ctr match { 56 | case v if v >= 0.0 && v < 0.25 => bin.toSparse.indices.head == 0 57 | case v if v >= 0.25 && v < 0.50 => bin.toSparse.indices.head == 1 58 | case v if v >= 0.50 && v < 0.75 => bin.toSparse.indices.head == 2 59 | case v if v >= 0.75 && v < 1.0 => bin.toSparse.indices.head == 3 60 | 61 | }) 62 | } 63 | 64 | val binned = binning.transform(users) 65 | binned.collect().foreach { case Row(_, _, ctr: Double, _, bin: SparseVector) => 66 | validate(ctr, bin) 67 | } 68 | } 69 | 70 | it should "bin IntegerType column" in { 71 | val binning = new Binning() 72 | .setInputCol("num_days") 73 | .setOutputCol("num_days_bin") 74 | .setSplits(Array(0.0, 400, 800, 1000)) 75 | 76 | def validate(numDays: Int, bin: Vector) = { 77 | assert(bin.size == 3) 78 | assert(bin.toSparse.indices.length == 1) 79 | assert(numDays match { 80 | case v if v >= 0 && v < 400 => bin.toSparse.indices.head == 0 81 | case v if v >= 400 && v < 800 => bin.toSparse.indices.head == 1 82 | case v if v >= 800 && v <= 1000 => bin.toSparse.indices.head == 2 83 | }) 84 | } 85 | 86 | val binned = binning.transform(users) 87 | 88 | binned.collect().foreach { case Row(_, numDays: Int, _, _, bin: SparseVector) => 89 | validate(numDays, bin) 90 | } 91 | } 92 | 93 | it should "fail to bin StringType column" in { 94 | val binning = new Binning() 95 | .setInputCol("cookie_id") 96 | .setOutputCol("cookie_id_bins") 97 | .setSplits(Array(0.0, 400, 800, 1000)) 98 | 99 | intercept[IllegalArgumentException] { 100 | binning.transform(users) 101 | } 102 | } 103 | 104 | it should "bin column with nulls" in { 105 | val binning = new Binning() 106 | .setInputCol("actions") 107 | .setOutputCol("actions_bins") 108 | .setSplits(Array(0.0, 4.0, 8.0, 10.0)) 109 | 110 | binning.transform(users) 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/StringToShortIndexer.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.SparkException 4 | import org.apache.spark.ml.{Estimator, Model} 5 | import org.apache.spark.ml.attribute.NominalAttribute 6 | import org.apache.spark.ml.param._ 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.sql.DataFrame 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types._ 11 | import org.apache.spark.util.collection.OpenHashMap 12 | 13 | /** 14 | * A label indexer that maps a string column of labels to an ML column of label indices. 15 | * If the input column is numeric, we cast it to string and index the string values. 16 | * The indices are in [0, numLabels), ordered by label frequencies. 17 | * So the most frequent label gets index 0. 
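 * For example, fitting on the labels ("a", "b", "a") produces the mapping a -> 0, b -> 1.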
18 | * 19 | * In contrast to Spark [[StringIndexer]] use Short for labels (instead of Double) 20 | */ 21 | class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel] 22 | with StringIndexerBase { 23 | 24 | def this() = this(Identifiable.randomUID("strShortIdx")) 25 | 26 | def setInputCol(value: String): this.type = set(inputCol, value) 27 | 28 | def setOutputCol(value: String): this.type = set(outputCol, value) 29 | 30 | override def fit(dataset: DataFrame): StringToShortIndexerModel = { 31 | val counts = dataset.select(col($(inputCol)).cast(StringType)) 32 | .map(_.getString(0)) 33 | .countByValue() 34 | val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray 35 | require(labels.length <= Short.MaxValue, 36 | s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") 37 | copyValues(new StringToShortIndexerModel(uid, labels).setParent(this)) 38 | } 39 | 40 | override def transformSchema(schema: StructType): StructType = { 41 | validateAndTransformSchema(schema) 42 | } 43 | 44 | override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra) 45 | } 46 | 47 | class StringToShortIndexerModel ( 48 | override val uid: String, 49 | val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase { 50 | 51 | def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) 52 | 53 | require(labels.length <= Short.MaxValue, 54 | s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") 55 | 56 | private val labelToIndex: OpenHashMap[String, Short] = { 57 | val n = labels.length.toShort 58 | val map = new OpenHashMap[String, Short](n) 59 | var i: Short = 0 60 | while (i < n) { 61 | map.update(labels(i), i) 62 | i = (i + 1).toShort 63 | } 64 | map 65 | } 66 | 67 | def setInputCol(value: String): this.type = set(inputCol, value) 68 | 69 | def setOutputCol(value: String): this.type = set(outputCol, value) 70 | 71 | override def transform(dataset: DataFrame): DataFrame = { 72 | if (!dataset.schema.fieldNames.contains($(inputCol))) { 73 | logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " + 74 | "Skip StringToShortIndexerModel.") 75 | return dataset 76 | } 77 | 78 | val indexer = udf { label: String => 79 | if (labelToIndex.contains(label)) { 80 | labelToIndex(label) 81 | } else { 82 | // TODO: handle unseen labels 83 | throw new SparkException(s"Unseen label: $label.") 84 | } 85 | } 86 | val outputColName = $(outputCol) 87 | val metadata = NominalAttribute.defaultAttr 88 | .withName(outputColName).withValues(labels).toMetadata() 89 | dataset.select(col("*"), 90 | indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) 91 | } 92 | 93 | override def transformSchema(schema: StructType): StructType = { 94 | if (schema.fieldNames.contains($(inputCol))) { 95 | validateAndTransformSchema(schema) 96 | } else { 97 | // If the input column does not exist during transformation, we skip StringToShortIndexerModel. 
98 | schema 99 | } 100 | } 101 | 102 | override def copy(extra: ParamMap): StringToShortIndexerModel = { 103 | val copied = new StringToShortIndexerModel(uid, labels) 104 | copyValues(copied, extra).setParent(parent) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/Gather.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.param.shared.HasOutputCol 6 | import org.apache.spark.ml.util.Identifiable 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.ext.functions._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.sql.types._ 11 | 12 | private[feature] trait GatherParams extends Params with HasKeyCol with HasValueCol with HasOutputCol { 13 | 14 | val primaryKeyCols: Param[Array[String]] = new StringArrayParam(this, "primaryKeyCols", 15 | "Primary key column names", 16 | ParamValidators.arrayLengthGt(0)) 17 | 18 | val valueAgg: Param[String] = new Param[String](this, "valueAgg", 19 | "Aggregate function applied to valueCol: 'sum' or 'count'", 20 | ParamValidators.inArray(Array("sum", "count"))) 21 | 22 | def getPrimaryKeyCols: Array[String] = $(primaryKeyCols) 23 | 24 | def getValueAgg: String = $(valueAgg) 25 | } 26 | 27 | /** 28 | * Inspired by R `tidyr` and `reshape2` packages. Convert long [[org.apache.spark.sql.DataFrame DataFrame]] with values 29 | * for each key into wide [[org.apache.spark.sql.DataFrame DataFrame]], applying aggregation function if single 30 | * key has multiple values 31 | * {{{ 32 | * cookie_id | site_id | impressions 33 | * ----------|---------|-------------- 34 | * cookieAA | 123 | 10 35 | * cookieAA | 123 | 5 36 | * cookieAA | 456 | 20 37 | * }}} 38 | * 39 | * gathered using `sum` agregate 40 | * 41 | * {{{ 42 | * cookie_id | output_col 43 | * ----------|------------------------ 44 | * cookieAA | [{ site_id: 123, impressions: 15.0 }, { site_id: 456, impressions: 20.0 }] 45 | * }}} 46 | */ 47 | class Gather(override val uid: String) extends Transformer with GatherParams { 48 | 49 | def this() = this(Identifiable.randomUID("gather")) 50 | 51 | def setPrimaryKeyCols(value: String*): this.type = set(primaryKeyCols, value.toArray) 52 | 53 | def setKeyCol(value: String): this.type = set(keyCol, value) 54 | 55 | def setValueCol(value: String): this.type = set(valueCol, value) 56 | 57 | def setValueAgg(value: String): this.type = set(valueAgg, value) 58 | 59 | def setOutputCol(value: String): this.type = set(outputCol, value) 60 | 61 | setDefault( 62 | valueAgg -> "sum" 63 | ) 64 | 65 | override def transform(dataset: DataFrame): DataFrame = { 66 | val outputSchema = transformSchema(dataset.schema) 67 | 68 | val pkCols = $(primaryKeyCols).map(col) 69 | 70 | val grouped = dataset.groupBy(pkCols :+ col($(keyCol)) : _*) 71 | val aggregateCol = s"${uid}_value_aggregate" 72 | val aggregated = $(valueAgg) match { 73 | case "sum" => grouped.agg(sum($(valueCol)) as aggregateCol) 74 | case "count" => grouped.agg(count($(valueCol)) as aggregateCol) 75 | } 76 | 77 | val metadata = outputSchema($(outputCol)).metadata 78 | 79 | aggregated 80 | .groupBy(pkCols: _*) 81 | .agg(collectArray(struct( 82 | col($(keyCol)), 83 | col(aggregateCol).cast(DoubleType).as($(valueCol)) 84 | )).as($(outputCol), metadata)) 85 | } 86 | 87 | override def 
transformSchema(schema: StructType): StructType = {
88 |     val valueFunName = $(valueAgg)
89 | 
90 |     val keyColName = $(keyCol)
91 |     val keyColDataType = schema(keyColName).dataType
92 |     keyColDataType match {
93 |       case _: NumericType =>
94 |       case _: StringType =>
95 |       case other =>
96 |         throw new IllegalArgumentException(s"Key column data type $other is not supported.")
97 |     }
98 | 
99 |     val valueColName = $(valueCol)
100 |     val valueColDataType = schema(valueColName).dataType
101 |     valueColDataType match {
102 |       case _: NumericType =>
103 |       case _: StringType if valueFunName == "count" =>
104 |       case other =>
105 |         throw new IllegalArgumentException(s"Value data type $other is not supported with value aggregate $valueFunName.")
106 |     }
107 | 
108 |     val pkFields = $(primaryKeyCols).map(schema.apply)
109 |     val rollupType = StructType(Array(
110 |       StructField($(keyCol), keyColDataType),
111 |       StructField($(valueCol), DoubleType)
112 |     ))
113 |     val rollupField = StructField($(outputCol), ArrayType(rollupType), nullable = false)
114 | 
115 |     StructType(pkFields :+ rollupField)
116 |   }
117 | 
118 |   override def copy(extra: ParamMap): Gather = defaultCopy(extra)
119 | 
120 | }
121 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Ext
2 | 
3 | [![Build Status](https://travis-ci.org/collectivemedia/spark-ext.svg?branch=master)](https://travis-ci.org/collectivemedia/spark-ext)
4 | 
5 | Spark ML transformers, estimators, Spark SQL aggregations, and other utilities that are missing in Apache Spark.
6 | 
7 | That's how we are doing [Audience Modeling](https://databricks.com/blog/2015/10/20/audience-modeling-with-spark-ml-pipelines.html) at Collective.
8 | 
9 | ## Where to get it
10 | 
11 | ``` scala
12 | resolvers += "Collective Media Bintray" at "https://dl.bintray.com/collectivemedia/releases"
13 | ```
14 | 
15 | And use the following library dependencies:
16 | 
17 | ```
18 | libraryDependencies += "com.collective.sparkext" %% "sparkext-sql" % "0.0.23"
19 | libraryDependencies += "com.collective.sparkext" %% "sparkext-mllib" % "0.0.23"
20 | ```
21 | 
22 | ## Testing
23 | 
24 |     sbt test
25 | 
26 | ## Spark SQL
27 | 
28 | ``` scala
29 | val schema = StructType(Seq(
30 |   StructField("cookie_id", StringType),
31 |   StructField("site", StringType),
32 |   StructField("impressions", LongType)
33 | ))
34 | 
35 | val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq(
36 |   Row("cookie_1", "google.com", 10L),
37 |   Row("cookie_2", "cnn.com", 14L),
38 |   ...
39 | )), schema) 40 | ``` 41 | 42 | #### CollectArray 43 | 44 | Aggregation function that collects all values from a column 45 | 46 | ``` scala 47 | import org.apache.spark.sql.ext.functions._ 48 | 49 | // collects all sites for cookie (with duplicates) 50 | impressionLog 51 | .groupBy(col("cookie_id")) 52 | .agg(collectArray(col("site"))) 53 | ``` 54 | 55 | ## Spark ML 56 | 57 | #### S2 Geometry CellId transformer 58 | 59 | Gets Google S2 Geometry CellId from decimal `lat` and `lon` 60 | 61 | ``` scala 62 | val schema = StructType(Seq( 63 | StructField("city", StringType), 64 | StructField("lat", DoubleType), 65 | StructField("lon", DoubleType) 66 | )) 67 | 68 | val cities = sqlContext.createDataFrame(sc.parallelize(Seq( 69 | Row("New York", 40.7142700, -74.0059700), 70 | Row("London", 51.50722, -0.12750), 71 | Row("Princeton", 40.3487200, -74.6590500) 72 | )), schema) 73 | 74 | val s2CellTransformer = new S2CellTransformer().setLevel(6) 75 | s2CellTransformer.transform(cities) 76 | ``` 77 | 78 | #### Optimal Binning 79 | 80 | Continuous features may need to be transformed to binary format using binning to account for nonlinearity. In general, 81 | binning attempts to break a set of ordered values into evenly distributed groups, such that each group 82 | contains approximately the same number of values from the sample. 83 | 84 | #### Gather 85 | 86 | Inspired by R `tidyr` and `reshape2` packages. Convert `long` `DataFrame` with values 87 | for each key into `wide` `DataFrame`, applying aggregation function if single 88 | key has multiple values 89 | 90 | cookie_id | site_id | impressions 91 | ----------|---------|------------- 92 | cookieAA | 123 | 10 93 | cookieAA | 123 | 5 94 | cookieAA | 456 | 20 95 | 96 | ``` scala 97 | val gather = new Gather() 98 | .setPrimaryKeyCols("cookie_id") 99 | .setKeyCol("site_id") 100 | .setValueCol("impressions") 101 | .setOutputCol("sites") 102 | val gathered = gather.transform(siteLog) 103 | ``` 104 | 105 | cookie_id | sites 106 | ----------|------------- 107 | cookieAA | [{ site_id: 123, impressions: 15.0 }, { site_id: 456, impressions: 20.0 }] 108 | 109 | #### Gather Encoder 110 | 111 | Encode categorical key-value pairs using dummy variables. 112 | 113 | cookie_id | sites 114 | ----------|------------------------------------------------------------------------ 115 | cookieAA | [{ site_id: 1, impressions: 15.0 }, { site_id: 2, impressions: 20.0 }] 116 | cookieBB | [{ site_id: 2, impressions: 7.0 }, { site_id: 3, impressions: 5.0 }] 117 | 118 | transformed into 119 | 120 | cookie_id | site_features 121 | ----------|------------------------ 122 | cookieAA | [ 15.0 , 20.0 , 0 ] 123 | cookieBB | [ 0.0 , 7.0 , 5.0 ] 124 | 125 | Optionally apply dimensionality reduction using `top` transformation: 126 | - Top coverage, is selecting categorical values by computing the count of distinct users for each value, 127 | sorting the values in descending order by the count of users, and choosing the top values from the resulting 128 | list such that the sum of the distinct user counts over these values covers c percent of all users, 129 | for example, selecting top sites covering 99% of users. 130 | 131 | 132 | #### Downsampling Negatives 133 | 134 | If class ratio between positives and negatives is too high, you might want to downsample all you negatives before building a model. 
135 | 136 | ``` scala 137 | val downsampling = new Downsampling() 138 | .setLabelCol("label") 139 | .setOutputCol("sample_weight") 140 | .setMaxClassRatio(30.0) 141 | .setPrimaryClass(1.0) // positive class to keep as-is 142 | ``` 143 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/sampling/Downsampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.param.shared.{HasLabelCol, HasOutputCol} 4 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} 5 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 6 | import org.apache.spark.ml.{Estimator, Model} 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.sql.types._ 10 | 11 | private[sampling] trait DownsamplingParams 12 | extends Params with HasLabelCol with HasOutputCol { 13 | 14 | val primaryClass: Param[Double] = new Param[Double](this, "primaryClass", 15 | "Primary class to keep (0.0 or 1.0)", 16 | (v: Double) => v == 0.0 || v == 1.0) 17 | 18 | val sampleWithReplacement: Param[Boolean] = new Param[Boolean](this, "sampleWithReplacement", 19 | "Sample secondary class with replacement") 20 | 21 | def getPrimaryClass: Double = $(primaryClass) 22 | 23 | def getSampleWithReplacement: Boolean = $(sampleWithReplacement) 24 | 25 | setDefault(outputCol, uid + "_sample_weight") 26 | 27 | protected def validateAndTransformSchema(schema: StructType): StructType = { 28 | val labelColName = $(labelCol) 29 | val labelColDataType = schema(labelColName).dataType 30 | labelColDataType match { 31 | case _: DoubleType => 32 | case other => 33 | throw new IllegalArgumentException(s"Label column data type $other is not supported.") 34 | } 35 | SchemaUtils.appendColumn(schema, StructField(getOutputCol, DoubleType, nullable = false)) 36 | } 37 | 38 | } 39 | 40 | /** 41 | * Downsample input dataset in order to reduce class ratio 42 | * between positive (primary) and negative (secondary) classes 43 | */ 44 | class Downsampling(override val uid: String) extends Estimator[DownsamplingModel] with DownsamplingParams { 45 | 46 | def this() = this(Identifiable.randomUID("downsampling")) 47 | 48 | val maxClassRatio: Param[Double] = new Param[Double](this, "maxClassRatio", 49 | "Max class ratio", 50 | (v: Double) => ParamValidators.gt(0.0)(v) && ParamValidators.ltEq(1000.0)(v)) 51 | 52 | def getMaxClassRatio: Double = $(maxClassRatio) 53 | 54 | def setLabelCol(value: String): this.type = set(labelCol, value) 55 | 56 | def setOutputCol(value: String): this.type = set(outputCol, value) 57 | 58 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 59 | setDefault(primaryClass -> 1.0) 60 | 61 | def setMaxClassRatio(value: Double): this.type = set(maxClassRatio, value) 62 | setDefault(maxClassRatio -> 30.0) 63 | 64 | def setSampleWithReplacement(value: Boolean): this.type = set(sampleWithReplacement, value) 65 | setDefault(sampleWithReplacement -> false) 66 | 67 | override def fit(dataset: DataFrame): DownsamplingModel = { 68 | log.info(s"Compute downsampling model with primary class: $getPrimaryClass") 69 | 70 | val primaryCnt = dataset.filter(col(getLabelCol) === getPrimaryClass).count() 71 | val secondaryCnt = dataset.filter(col(getLabelCol) !== getPrimaryClass).count() 72 | 73 | require(primaryCnt > 0, 74 | s"Primary class $getPrimaryClass should be presented in 
dataset") 75 | 76 | val classRatio = secondaryCnt.toDouble / primaryCnt 77 | 78 | if (classRatio <= getMaxClassRatio) { 79 | log.debug(s"Class ratio: $classRatio is below max class ratio: $getMaxClassRatio. Skip downsampling.") 80 | copyValues(new DownsamplingModel(uid, None).setParent(this)) 81 | } else { 82 | val desiredSecondaryCnt = primaryCnt * getMaxClassRatio 83 | val sampleFraction = desiredSecondaryCnt / secondaryCnt 84 | log.debug(s"Class ratio: $classRatio is above max class ratio: $getMaxClassRatio. Sample fraction: $sampleFraction") 85 | copyValues(new DownsamplingModel(uid, Some(sampleFraction)).setParent(this)) 86 | } 87 | 88 | } 89 | 90 | override def transformSchema(schema: StructType): StructType = { 91 | validateAndTransformSchema(schema) 92 | } 93 | 94 | override def copy(extra: ParamMap): Downsampling = defaultCopy(extra) 95 | 96 | } 97 | 98 | class DownsamplingModel( 99 | override val uid: String, 100 | val sampleFraction: Option[Double] 101 | ) extends Model[DownsamplingModel] with DownsamplingParams { 102 | 103 | def this(sampleFraction: Option[Double]) = this(Identifiable.randomUID("downsampling"), sampleFraction) 104 | 105 | def setLabelCol(value: String): this.type = set(labelCol, value) 106 | 107 | def setOutputCol(value: String): this.type = set(outputCol, value) 108 | 109 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 110 | setDefault(primaryClass -> 1.0) 111 | 112 | def setSampleWithReplacement(value: Boolean): this.type = set(sampleWithReplacement, value) 113 | setDefault(sampleWithReplacement -> false) 114 | 115 | override def transform(dataset: DataFrame): DataFrame = sampleFraction match { 116 | case None => 117 | log.debug(s"Skip dataset downsampling") 118 | dataset.select(col("*"), lit(1.0) as getOutputCol) 119 | 120 | case Some(fraction) => 121 | log.debug(s"Downsample dataset with sample fraction: $fraction") 122 | 123 | val primary = dataset.filter(col(getLabelCol) === getPrimaryClass) 124 | .select(col("*"), lit(1.0) as getOutputCol) 125 | 126 | val secondary = dataset.filter(col(getLabelCol) !== getPrimaryClass) 127 | .sample(withReplacement = getSampleWithReplacement, fraction) 128 | .select(col("*"), lit(1.0 / fraction) as getOutputCol) 129 | 130 | primary.unionAll(secondary) 131 | } 132 | 133 | override def transformSchema(schema: StructType): StructType = { 134 | validateAndTransformSchema(schema) 135 | } 136 | 137 | override def copy(extra: ParamMap): DownsamplingModel = { 138 | val copied = new DownsamplingModel(uid, sampleFraction) 139 | copyValues(copied, extra).setParent(parent) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherEncoderModelSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.{DataFrame, Row} 7 | import org.scalatest.FlatSpec 8 | 9 | class GatherEncoderModelSpec extends FlatSpec with TestSparkContext { 10 | 11 | val schema = StructType(Seq( 12 | StructField("cookie_id", StringType), 13 | StructField("sites", ArrayType(StructType(Seq( 14 | StructField("site", StringType), 15 | StructField("site_id", IntegerType), 16 | StructField("impressions", LongType 17 | ))), containsNull = true)) 18 | )) 19 | 20 | val cookie1 = "cookie1" 21 | val 
cookie2 = "cookie2" 22 | val cookie3 = "cookie3" 23 | val cookie4 = "cookie4" 24 | val cookie5 = "cookie5" 25 | 26 | val (google, googleId) = "google.com" -> 1 27 | val (cnn, cnnId) = "cnn.com" -> 2 28 | val (bbc, bbcId) = "bbc.com" -> 3 29 | val (auto, autoId) = "auto.com" -> 4 30 | val (moto, motoId) = "moto.com" -> 5 31 | val (sport, sportId) = "sport.com" -> 6 32 | 33 | val dataset = sqlContext.createDataFrame(sc.parallelize(Seq( 34 | Row(cookie1, Array( 35 | Row(google, googleId, 12L), 36 | Row(cnn, cnnId, 14L) 37 | )), 38 | Row(cookie2, Array( 39 | Row(bbc, bbcId, 20L), 40 | Row(auto, autoId, 1L), 41 | Row(moto, motoId, 3L) 42 | )), 43 | Row(cookie3, Array( 44 | Row(sport, sportId, 100L) 45 | )), 46 | Row(cookie4, Array.empty[Row]), 47 | Row(cookie5, null) 48 | )), schema) 49 | 50 | def createEncoder(keys: Array[Any]) = 51 | new GatherEncoderModel(keys) 52 | .setInputCol("sites") 53 | .setOutputCol("features") 54 | .setKeyCol("site") 55 | .setValueCol("impressions") 56 | 57 | val sites: Array[Any] = Array(google, bbc, cnn) 58 | val siteIds: Array[Any] = Array(googleId, bbcId, cnnId) 59 | 60 | def toFeatures(encoder: GatherEncoderModel, dataset: DataFrame): Map[String, Vector] = { 61 | val encodedDf = encoder.transform(dataset).select("cookie_id", "features") 62 | encodedDf.collect().map { case Row(cookieId: String, features: Vector) => 63 | cookieId -> features 64 | }.toMap 65 | } 66 | 67 | "Gather Encoder Model" should "encode categories ignoring all other" in { 68 | val sitesEncoder = createEncoder(sites).setAllOther(false) 69 | val siteIdsEncoder = createEncoder(siteIds).setKeyCol("site_id").setAllOther(false) 70 | 71 | // Check that type of the keys doesn't matter 72 | val siteFeatures = toFeatures(sitesEncoder, dataset) 73 | val idFeatures = toFeatures(siteIdsEncoder, dataset) 74 | assert(siteFeatures == idFeatures) 75 | 76 | assert(siteFeatures(cookie1).size == 3) 77 | assert(siteFeatures(cookie1).toSparse.indices.toSeq == 0 :: 2 :: Nil) 78 | assert(siteFeatures(cookie1).toSparse.values.toSeq == 12 :: 14 :: Nil) 79 | 80 | assert(siteFeatures(cookie2).size == 3) 81 | assert(siteFeatures(cookie2).toSparse.indices.toSeq == 1 :: Nil) 82 | assert(siteFeatures(cookie2).toSparse.values.toSeq == 20 :: Nil) 83 | 84 | def assertEmptyFeatures(cookie: String): Unit = { 85 | assert(siteFeatures(cookie).size == 3) 86 | assert(siteFeatures(cookie).toSparse.indices.toSeq == Nil) 87 | assert(siteFeatures(cookie).toSparse.values.toSeq == Nil) 88 | } 89 | 90 | assertEmptyFeatures(cookie3) 91 | assertEmptyFeatures(cookie4) 92 | assertEmptyFeatures(cookie5) 93 | } 94 | 95 | it should "encode categories with all other" in { 96 | val sitesEncoder = createEncoder(sites).setAllOther(true) 97 | val features = toFeatures(sitesEncoder, dataset) 98 | 99 | assert(features(cookie1).size == 4) 100 | assert(features(cookie1).toSparse.indices.toSeq == 0 :: 2 :: Nil) 101 | assert(features(cookie1).toSparse.values.toSeq == 12 :: 14 :: Nil) 102 | 103 | assert(features(cookie2).size == 4) 104 | assert(features(cookie2).toSparse.indices.toSeq == 1 :: 3 :: Nil) 105 | assert(features(cookie2).toSparse.values.toSeq == 20 :: 4 :: Nil) 106 | 107 | assert(features(cookie3).size == 4) 108 | assert(features(cookie3).toSparse.indices.toSeq == 3 :: Nil) 109 | assert(features(cookie3).toSparse.values.toSeq == 100 :: Nil) 110 | 111 | def assertEmptyFeatures(cookie: String): Unit = { 112 | assert(features(cookie).size == 4) 113 | assert(features(cookie).toSparse.indices.toSeq == Nil) 114 | 
assert(features(cookie).toSparse.values.toSeq == Nil) 115 | } 116 | 117 | assertEmptyFeatures(cookie4) 118 | assertEmptyFeatures(cookie5) 119 | } 120 | 121 | it should "remove input col" in { 122 | val sitesEncoder = createEncoder(sites).setKeepInputCol(false) 123 | val encoded = sitesEncoder.transform(dataset) 124 | assert(encoded.schema.size == dataset.schema.size) 125 | assert(!encoded.schema.exists(_.name == "sites")) 126 | } 127 | 128 | it should "fail to encode with empty key set" in { 129 | val encoder = createEncoder(Array.empty) 130 | intercept[IllegalArgumentException] { 131 | encoder.transform(dataset) 132 | } 133 | } 134 | 135 | it should "output empty vectors for empty keys with all other disabled" in { 136 | val sitesEncoder = createEncoder(Array.empty) 137 | .setFailOnEmptyKeys(false) 138 | .setAllOther(false) 139 | val features = toFeatures(sitesEncoder, dataset) 140 | assert(features(cookie1).size == 0) 141 | } 142 | 143 | it should "put all values into all other column for empty keys" in { 144 | val sitesEncoder = createEncoder(Array.empty) 145 | .setFailOnEmptyKeys(false) 146 | .setAllOther(true) 147 | 148 | val features = toFeatures(sitesEncoder, dataset) 149 | 150 | assert(features(cookie1).toArray.toSeq == Seq(26.0)) 151 | assert(features(cookie2).toArray.toSeq == Seq(24.0)) 152 | assert(features(cookie3).toArray.toSeq == Seq(100.0)) 153 | 154 | def assertEmptyFeatures(cookie: String): Unit = { 155 | assert(features(cookie).size == 1) 156 | assert(features(cookie).toSparse.indices.toSeq == Nil) 157 | assert(features(cookie).toSparse.values.toSeq == Nil) 158 | } 159 | 160 | assertEmptyFeatures(cookie4) 161 | assertEmptyFeatures(cookie5) 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/feature/GatherEncoderSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.scalatest.FlatSpec 7 | 8 | class GatherEncoderSpec extends FlatSpec with TestSparkContext { 9 | 10 | val schema = StructType(Seq( 11 | StructField("cookie_id", StringType), 12 | StructField("sites", ArrayType(StructType(Seq( 13 | StructField("site", StringType), 14 | StructField("impressions", LongType 15 | ))), containsNull = false)) 16 | )) 17 | 18 | val cookie1 = "cookie1" 19 | val cookie2 = "cookie2" 20 | val cookie3 = "cookie3" 21 | val cookie4 = "cookie4" 22 | val cookie5 = "cookie5" 23 | 24 | val dataset = sqlContext.createDataFrame(sc.parallelize( 25 | Seq.fill(250)(Row(cookie1, Array( // 250 * 2 = 500 // total: 500 // cover: 50% 26 | Row("google.com", 12L), 27 | Row("cnn.com", 14L) 28 | ))) ++ 29 | Seq.fill(100)(Row(cookie2, Array( // 100 * 3 = 300 // total: 800 // cover: 80% 30 | Row("bbc.com", 20L), 31 | Row("auto.com", 1L), 32 | Row("moto.com", 3L) 33 | ))) ++ 34 | Seq.fill(80)(Row(cookie3, Array( // 80 // total: 880 // cover: 88% 35 | Row("sport.com", 100L) 36 | ))) ++ 37 | Seq.fill(50)(Row(cookie3, Array( // 50 // total: 930 // cover: 93% 38 | Row("netflix.com", 1L) 39 | ))) ++ 40 | Seq.fill(40)(Row(cookie3, Array( // 40 // total: 970 // cover: 97% 41 | Row("amazon.com", 1L) 42 | ))) ++ 43 | Seq.fill(30)(Row(cookie3, Array( // 30 // total: 1000 // cover: 100% 44 | Row("imdb.com", 1L) 45 | ))) ++ 46 | Seq.fill(150)(Row(cookie4, Array( // 0 : cookie_id doesn't have any site 
statistics 47 | ))) ++ 48 | Seq.fill(150)(Row(cookie5, null // 0 : check that null doesn't break anything 49 | )) 50 | ), schema) 51 | 52 | // Empty and Null dataset can arise from outer joins in bigger pipelines 53 | 54 | val emptyDataset = sqlContext.createDataFrame(sc.parallelize( 55 | Seq.fill(250)(Row(cookie1, Array.empty[Row])) ++ 56 | Seq.fill(100)(Row(cookie2, Array.empty[Row])) ++ 57 | Seq.fill(80)(Row(cookie3, Array.empty[Row])) 58 | ), schema) 59 | 60 | val nullDataset = sqlContext.createDataFrame(sc.parallelize( 61 | Seq.fill(250)(Row(cookie1, null)) ++ 62 | Seq.fill(100)(Row(cookie2, null)) ++ 63 | Seq.fill(80)(Row(cookie3, null)) 64 | ), schema) 65 | 66 | def topEncoder: GatherEncoder = new GatherEncoder() 67 | .setInputCol("sites") 68 | .setOutputCol("features") 69 | .setKeyCol("site") 70 | .setValueCol("impressions") 71 | .setTransformation("top") 72 | 73 | def indexEncoder: GatherEncoder = topEncoder 74 | .setTransformation("index") 75 | 76 | "Index Gather Encoder" should "collect all keys when support is 1%" in { 77 | val encoder = indexEncoder.setSupport(1.0) 78 | val features = encoder.fit(dataset) 79 | assert(features.modelKeys.length == 9) 80 | } 81 | 82 | it should "support key exclusion when support is 1%" in { 83 | val encoder = indexEncoder.setSupport(1.0).setExcludeKeys(Set("imdb.com")) 84 | val features = encoder.fit(dataset) 85 | assert(features.modelKeys.length == 8) 86 | assert(!features.modelKeys.contains("imdb.com")) 87 | } 88 | 89 | it should "exclude imdb.com for 3.1% support" in { 90 | val encoder = indexEncoder.setSupport(3.1) 91 | val features = encoder.fit(dataset) 92 | assert(features.modelKeys.length == 8) 93 | assert(!features.modelKeys.contains("imdb.com")) 94 | } 95 | 96 | it should "exclude imdb.com and amazon.com for 4.1% support" in { 97 | val encoder = indexEncoder.setSupport(4.1) 98 | val features = encoder.fit(dataset) 99 | assert(features.modelKeys.length == 7) 100 | assert(!features.modelKeys.contains("imdb.com")) 101 | assert(!features.modelKeys.contains("amazon.com")) 102 | } 103 | 104 | "Top Gather Encoder" should "collect all keys when cover is 100.0" in { 105 | val encoder = topEncoder.setCover(100.0) 106 | val features = encoder.fit(dataset) 107 | assert(features.modelKeys.length == 9) 108 | } 109 | 110 | it should "support key exclusion when cover is 100.0" in { 111 | val encoder = topEncoder.setCover(100.0).setExcludeKeys(Set("imdb.com")) 112 | val features = encoder.fit(dataset) 113 | assert(features.modelKeys.length == 8) 114 | assert(!features.modelKeys.contains("imdb.com")) 115 | } 116 | 117 | it should "exclude imdb.com for 95% coverage" in { 118 | val encoder = topEncoder.setCover(95.0) 119 | val features = encoder.fit(dataset) 120 | assert(features.modelKeys.length == 8) 121 | assert(!features.modelKeys.contains("imdb.com")) 122 | } 123 | 124 | it should "support key exclusion when cover is 95%" in { 125 | val encoder = topEncoder.setCover(95.0).setExcludeKeys(Set("amazon.com")) 126 | val features = encoder.fit(dataset) 127 | assert(features.modelKeys.length == 7) 128 | // Imdb excluded by coverage 129 | assert(!features.modelKeys.contains("imdb.com")) 130 | // Amazon excluded explicitly 131 | assert(!features.modelKeys.contains("amazon.com")) 132 | } 133 | 134 | it should "exclude amazon.com for 90% coverage" in { 135 | val encoder = topEncoder.setCover(90.0) 136 | val features = encoder.fit(dataset) 137 | assert(features.modelKeys.length == 7) 138 | assert(!features.modelKeys.contains("amazon.com")) 139 | } 140 | 141 | it 
should "exclude netflix.com for 85% coverage" in { 142 | val encoder = topEncoder.setCover(85.0) 143 | val features = encoder.fit(dataset) 144 | assert(features.modelKeys.length == 6) 145 | assert(!features.modelKeys.contains("netflix.com")) 146 | } 147 | 148 | it should "exclude sport.com for 75% coverage" in { 149 | val encoder = topEncoder.setCover(75.0) 150 | val features = encoder.fit(dataset) 151 | assert(features.modelKeys.length == 5) 152 | assert(!features.modelKeys.contains("sport.com")) 153 | } 154 | 155 | it should "get empty key set for empty dataset" in { 156 | val encoder = topEncoder 157 | val features = encoder.fit(emptyDataset) 158 | assert(features.modelKeys.isEmpty) 159 | } 160 | 161 | it should "get empty key set for null dataset" in { 162 | val encoder = topEncoder 163 | val features = encoder.fit(nullDataset) 164 | assert(features.modelKeys.isEmpty) 165 | } 166 | 167 | 168 | } 169 | -------------------------------------------------------------------------------- /sparkext-sql/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregates.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.catalyst.expressions 2 | 3 | import org.apache.spark.sql.catalyst.InternalRow 4 | import org.apache.spark.sql.types.{GenericArrayData, ArrayType, DataType} 5 | import scala.collection.mutable 6 | 7 | case class CollectArray(expression: Expression) extends PartialAggregate1 { 8 | def this() = this(null) 9 | 10 | override def children: Seq[Expression] = expression :: Nil 11 | 12 | override def nullable: Boolean = false 13 | override def dataType: DataType = ArrayType(expression.dataType, containsNull = false) 14 | override def toString: String = s"COLLECT_ARRAY($expression)" 15 | override def newInstance(): CollectArrayFunction = new CollectArrayFunction(expression, this) 16 | 17 | override def asPartial: SplitEvaluation = { 18 | val partialSet = Alias(CollectPartialArray(expression), "partialArrays")() 19 | SplitEvaluation( 20 | CombinePartialArrays(partialSet.toAttribute), 21 | partialSet :: Nil) 22 | } 23 | } 24 | 25 | case class CollectArrayFunction( 26 | @transient expr: Expression, 27 | @transient base: AggregateExpression1) 28 | extends AggregateFunction1 { 29 | 30 | def this() = this(null, null) // Required for serialization. 
31 | 32 | // Reducing GC pressure with this trick 33 | 34 | var firstValue: Any = _ 35 | var builder: mutable.ListBuffer[Any] = _ 36 | 37 | override def update(input: InternalRow): Unit = { 38 | val evaluatedExpr = expr.eval(input) 39 | if (evaluatedExpr != null) { 40 | if (firstValue == null && builder == null) { 41 | // Got first value 42 | firstValue = evaluatedExpr 43 | } else if (firstValue != null && builder == null) { 44 | // Got second value 45 | builder = mutable.ListBuffer.empty[Any] 46 | builder += firstValue 47 | builder += evaluatedExpr 48 | firstValue = null 49 | } else if (firstValue == null && builder != null) { 50 | // Got 2+ values 51 | builder += evaluatedExpr 52 | } else { 53 | throw new IllegalStateException(s"Both state variables are defined") 54 | } 55 | } 56 | } 57 | 58 | override def eval(input: InternalRow): Any = { 59 | if (firstValue == null && builder == null) { 60 | new GenericArrayData(Array.empty) 61 | } else if (firstValue != null && builder == null) { 62 | new GenericArrayData(Array(firstValue)) 63 | } else if (firstValue == null && builder != null) { 64 | new GenericArrayData(builder.toArray) 65 | } else { 66 | throw new IllegalStateException("Both state variables are defined") 67 | } 68 | } 69 | } 70 | 71 | case class CollectPartialArray(expression: Expression) extends AggregateExpression1 { 72 | def this() = this(null) 73 | 74 | override def children: Seq[Expression] = expression :: Nil 75 | override def nullable: Boolean = false 76 | override def dataType: DataType = ArrayType(expression.dataType, containsNull = false) 77 | override def toString: String = s"AddToPartialArray($expression)" 78 | override def newInstance(): CollectPartialArrayFunction = 79 | new CollectPartialArrayFunction(expression, this) 80 | } 81 | 82 | case class CollectPartialArrayFunction( 83 | @transient expr: Expression, 84 | @transient base: AggregateExpression1) 85 | extends AggregateFunction1 { 86 | 87 | def this() = this(null, null) // Required for serialization. 
88 | 89 | // Reducing GC pressure with this trick 90 | 91 | var firstValue: Any = _ 92 | var builder: mutable.ListBuffer[Any] = _ 93 | 94 | override def update(input: InternalRow): Unit = { 95 | val evaluatedExpr = expr.eval(input) 96 | if (evaluatedExpr != null) { 97 | if (firstValue == null && builder == null) { 98 | // Got first value 99 | firstValue = evaluatedExpr 100 | } else if (firstValue != null && builder == null) { 101 | // Got second value 102 | builder = mutable.ListBuffer.empty[Any] 103 | builder += firstValue 104 | builder += evaluatedExpr 105 | firstValue = null 106 | } else if (firstValue == null && builder != null) { 107 | // Got 2+ values 108 | builder += evaluatedExpr 109 | } else { 110 | throw new IllegalStateException(s"Both state variables are defined") 111 | } 112 | } 113 | } 114 | 115 | override def eval(input: InternalRow): Any = { 116 | if (firstValue == null && builder == null) { 117 | new GenericArrayData(Array.empty) 118 | } else if (firstValue != null && builder == null) { 119 | new GenericArrayData(Array(firstValue)) 120 | } else if (firstValue == null && builder != null) { 121 | new GenericArrayData(builder.toArray) 122 | } else { 123 | throw new IllegalStateException("Both state variables are defined") 124 | } 125 | } 126 | } 127 | 128 | case class CombinePartialArrays(inputSet: Expression) extends AggregateExpression1 { 129 | def this() = this(null) 130 | 131 | override def children: Seq[Expression] = inputSet :: Nil 132 | override def nullable: Boolean = false 133 | override def dataType: DataType = inputSet.dataType 134 | override def toString: String = s"CombinePartialArrays($inputSet)" 135 | override def newInstance(): CombinePartialArraysFunction = { 136 | new CombinePartialArraysFunction(inputSet, this) 137 | } 138 | } 139 | 140 | case class CombinePartialArraysFunction( 141 | @transient inputSet: Expression, 142 | @transient base: AggregateExpression1) 143 | extends AggregateFunction1 { 144 | 145 | def this() = this(null, null) // Required for serialization. 
146 | 147 | // Reducing GC pressure with this trick 148 | 149 | var firstArray: GenericArrayData = _ 150 | var builder: mutable.ListBuffer[Any] = _ 151 | 152 | override def update(input: InternalRow): Unit = { 153 | val inputSetEval = inputSet.eval(input).asInstanceOf[GenericArrayData] 154 | 155 | if (firstArray == null && builder == null) { 156 | // Got first array 157 | firstArray = inputSetEval 158 | } else if (firstArray != null && builder == null) { 159 | // Got second value 160 | builder = mutable.ListBuffer.empty[Any] 161 | val inputIterator = firstArray.array.iterator ++ inputSetEval.array.iterator 162 | while (inputIterator.hasNext) { 163 | builder += inputIterator.next 164 | } 165 | firstArray = null 166 | } else if (firstArray == null && builder != null) { 167 | // Got 2+ values 168 | val inputIterator = inputSetEval.array.iterator 169 | while (inputIterator.hasNext) { 170 | builder += inputIterator.next 171 | } 172 | } else { 173 | throw new IllegalStateException(s"Both state variables are defined") 174 | } 175 | } 176 | 177 | override def eval(input: InternalRow): Any = { 178 | if (firstArray == null && builder == null) { 179 | new GenericArrayData(Array.empty) 180 | } else if (firstArray != null && builder == null) { 181 | firstArray 182 | } else if (firstArray == null && builder != null) { 183 | new GenericArrayData(builder.toArray) 184 | } else { 185 | throw new IllegalStateException("Both state variables are defined") 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /sparkext-example/src/main/scala/com/collective/sparkext/example/SparkMlExtExample.scala: -------------------------------------------------------------------------------- 1 | package com.collective.sparkext.example 2 | 3 | import org.apache.log4j.Logger 4 | import org.apache.log4j.varia.NullAppender 5 | import org.apache.spark.ml.Pipeline 6 | import org.apache.spark.ml.attribute.AttributeGroup 7 | import org.apache.spark.ml.classification.LogisticRegression 8 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 9 | import org.apache.spark.ml.feature.{VectorAssembler, GatherEncoder, S2CellTransformer, Gather} 10 | import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} 11 | import org.apache.spark.mllib.evaluation.BinaryModelMetrics 12 | import org.apache.spark.mllib.linalg.DenseVector 13 | import org.apache.spark.sql.functions._ 14 | import org.apache.spark.sql.{Row, DataFrame} 15 | import org.apache.spark.sql.types._ 16 | 17 | object SparkMlExtExample extends App with Sites with Geo with Response { 18 | 19 | import sqlContext.implicits._ 20 | 21 | turnOffLogging() 22 | 23 | println(s"Run Spark ML Ext Example application") 24 | 25 | println(s"Sites data frame size = ${sitesDf.count()}") 26 | println(s"Geo data frame size = ${geoDf.count()}") 27 | println(s"Response data frame size = ${responseDf.count()} ") 28 | 29 | // Gather site visitation log 30 | val gatherSites = new Gather() 31 | .setPrimaryKeyCols(Sites.cookie) 32 | .setKeyCol(Sites.site) 33 | .setValueCol(Sites.impressions) 34 | .setOutputCol("sites") 35 | 36 | // Transform lat/lon into S2 Cell Id 37 | val s2Transformer = new S2CellTransformer() 38 | .setLevel(5) 39 | .setCellCol("s2_cell") 40 | 41 | // Gather S2 CellId log 42 | val gatherS2Cells = new Gather() 43 | .setPrimaryKeyCols(Geo.cookie) 44 | .setKeyCol("s2_cell") 45 | .setValueCol(Geo.impressions) 46 | .setOutputCol("s2_cells") 47 | 48 | // Gather raw data into wide rows 49 | val gatheredSites = 
gatherSites.transform(sitesDf) 50 | val gatheredCells = gatherS2Cells.transform(s2Transformer.transform(geoDf)) 51 | 52 | // Assemble input dataset 53 | val dataset = responseDf.as("response") 54 | .join(gatheredSites, responseDf(Response.cookie) === gatheredSites(Sites.cookie)) 55 | .join(gatheredCells, responseDf(Response.cookie) === gatheredCells(Sites.cookie)) 56 | .select( 57 | $"response.*", 58 | $"sites", 59 | $"s2_cells" 60 | ).cache() 61 | 62 | println(s"Input dataset size = ${dataset.count()}") 63 | 64 | dataset.show(10) 65 | 66 | // Split dataset into test/train sets 67 | val trainPct = 0.1 68 | val Array(trainSet, testSet) = dataset.randomSplit(Array(1 - trainPct, trainPct)) 69 | 70 | // Setup ML Pipeline stages 71 | 72 | // Encode site data 73 | val encodeSites = new GatherEncoder() 74 | .setInputCol("sites") 75 | .setOutputCol("sites_f") 76 | .setKeyCol(Sites.site) 77 | .setValueCol(Sites.impressions) 78 | 79 | // Encode S2 Cell data 80 | val encodeS2Cells = new GatherEncoder() 81 | .setInputCol("s2_cells") 82 | .setOutputCol("s2_cells_f") 83 | .setKeyCol("s2_cell") 84 | .setValueCol(Geo.impressions) 85 | .setCover(0.95) 86 | 87 | // Assemble feature vectors together 88 | val assemble = new VectorAssembler() 89 | .setInputCols(Array("sites_f", "s2_cells_f")) 90 | .setOutputCol("features") 91 | 92 | // Extract features label information 93 | val dummyPipeline = new Pipeline() 94 | .setStages(Array(encodeSites, encodeS2Cells, assemble)) 95 | val out = dummyPipeline.fit(dataset).transform(dataset) 96 | val attrGroup = AttributeGroup.fromStructField(out.schema("features")) 97 | 98 | val attributes = attrGroup.attributes.get 99 | println(s"Num features = ${attributes.length}") 100 | attributes.zipWithIndex.foreach { case (attr, idx) => 101 | println(s" - $idx = $attr") 102 | } 103 | 104 | // Build logistic regression using featurized statistics 105 | val lr = new LogisticRegression() 106 | .setFeaturesCol("features") 107 | .setLabelCol(Response.response) 108 | .setProbabilityCol("probability") 109 | 110 | // Define pipeline with 4 stages 111 | val pipeline = new Pipeline() 112 | .setStages(Array(encodeSites, encodeS2Cells, assemble, lr)) 113 | 114 | val evaluator = new BinaryClassificationEvaluator() 115 | .setLabelCol(Response.response) 116 | 117 | val crossValidator = new CrossValidator() 118 | .setEstimator(pipeline) 119 | .setEvaluator(evaluator) 120 | 121 | val paramGrid = new ParamGridBuilder() 122 | .addGrid(lr.elasticNetParam, Array(0.1, 0.5)) 123 | .build() 124 | 125 | crossValidator.setEstimatorParamMaps(paramGrid) 126 | crossValidator.setNumFolds(2) 127 | 128 | println(s"Train model on train set") 129 | val cvModel = crossValidator.fit(trainSet) 130 | 131 | println(s"Score test set") 132 | val testScores = cvModel.transform(testSet) 133 | 134 | val scoreAndLabels = testScores 135 | .select(col("probability"), col(Response.response)) 136 | .map { case Row(probability: DenseVector, label: Double) => 137 | val predictedActionProbability = probability(1) 138 | (predictedActionProbability, label) 139 | } 140 | 141 | println("Evaluate model") 142 | val metrics = new BinaryModelMetrics(scoreAndLabels) 143 | val auc = metrics.areaUnderROC() 144 | 145 | println(s"Model AUC: $auc") 146 | 147 | private def turnOffLogging(): Unit = { 148 | Logger.getRootLogger.removeAllAppenders() 149 | Logger.getRootLogger.addAppender(new NullAppender()) 150 | } 151 | } 152 | 153 | trait Sites extends InMemorySparkContext { 154 | 155 | object Sites { 156 | val cookie = "cookie" 157 | val site = 
"site" 158 | val impressions = "impressions" 159 | 160 | val schema = StructType(Array( 161 | StructField(cookie, StringType), 162 | StructField(site, StringType), 163 | StructField(impressions, IntegerType) 164 | )) 165 | } 166 | 167 | lazy val sitesDf: DataFrame = { 168 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/sites.csv")).getLines() 169 | val rows = lines.map(_.split(",")).drop(1) collect { 170 | case Array(cookie, site, impressions) => Row(cookie, site, impressions.toInt) 171 | } 172 | val rdd = sc.parallelize(rows.toSeq) 173 | sqlContext.createDataFrame(rdd, Sites.schema) 174 | } 175 | 176 | } 177 | 178 | trait Geo extends InMemorySparkContext { 179 | 180 | object Geo { 181 | val cookie = "cookie" 182 | val lat = "lat" 183 | val lon = "lon" 184 | val impressions = "impressions" 185 | 186 | val schema = StructType(Array( 187 | StructField(cookie, StringType), 188 | StructField(lat, DoubleType), 189 | StructField(lon, DoubleType), 190 | StructField(impressions, IntegerType) 191 | )) 192 | } 193 | 194 | lazy val geoDf: DataFrame = { 195 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/geo.csv")).getLines() 196 | val rows = lines.map(_.split(",")).drop(1) collect { 197 | case Array(cookie, lat, lon, impressions) => Row(cookie, lat.toDouble, lon.toDouble, impressions.toInt) 198 | } 199 | val rdd = sc.parallelize(rows.toSeq) 200 | sqlContext.createDataFrame(rdd, Geo.schema) 201 | } 202 | 203 | } 204 | 205 | trait Response extends InMemorySparkContext { 206 | 207 | object Response { 208 | val cookie = "cookie" 209 | val response = "response" 210 | 211 | val schema = StructType(Array( 212 | StructField(cookie, StringType), 213 | StructField(response, DoubleType) 214 | )) 215 | } 216 | 217 | lazy val responseDf: DataFrame = { 218 | val lines = scala.io.Source.fromInputStream(this.getClass.getResourceAsStream("/response.csv")).getLines() 219 | val rows = lines.map(_.split(",")).drop(1) collect { 220 | case Array(cookie, response) => Row(cookie, response.toDouble) 221 | } 222 | val rdd = sc.parallelize(rows.toSeq) 223 | sqlContext.createDataFrame(rdd, Response.schema) 224 | } 225 | 226 | } 227 | 228 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/TestingUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml 2 | 3 | import org.apache.spark.mllib.linalg.{Matrix, Vector} 4 | import org.scalatest.exceptions.TestFailedException 5 | 6 | 7 | object TestingUtils { 8 | 9 | val ABS_TOL_MSG = " using absolute tolerance" 10 | val REL_TOL_MSG = " using relative tolerance" 11 | 12 | /** 13 | * Private helper function for comparing two values using relative tolerance. 14 | * Note that if x or y is extremely close to zero, i.e., smaller than Double.MinPositiveValue, 15 | * the relative tolerance is meaningless, so the exception will be raised to warn users. 
16 | */ 17 | private def RelativeErrorComparison(x: Double, y: Double, eps: Double): Boolean = { 18 | val absX = math.abs(x) 19 | val absY = math.abs(y) 20 | val diff = math.abs(x - y) 21 | if (x == y) { 22 | true 23 | } else if (absX < Double.MinPositiveValue || absY < Double.MinPositiveValue) { 24 | throw new TestFailedException( 25 | s"$x or $y is extremely close to zero, so the relative tolerance is meaningless.", 0) 26 | } else { 27 | diff < eps * math.min(absX, absY) 28 | } 29 | } 30 | 31 | /** 32 | * Private helper function for comparing two values using absolute tolerance. 33 | */ 34 | private def AbsoluteErrorComparison(x: Double, y: Double, eps: Double): Boolean = { 35 | math.abs(x - y) < eps 36 | } 37 | 38 | case class CompareDoubleRightSide( 39 | fun: (Double, Double, Double) => Boolean, y: Double, eps: Double, method: String) 40 | 41 | /** 42 | * Implicit class for comparing two double values using relative tolerance or absolute tolerance. 43 | */ 44 | implicit class DoubleWithAlmostEquals(val x: Double) { 45 | 46 | /** 47 | * When the difference of two values are within eps, returns true; otherwise, returns false. 48 | */ 49 | def ~=(r: CompareDoubleRightSide): Boolean = r.fun(x, r.y, r.eps) 50 | 51 | /** 52 | * When the difference of two values are within eps, returns false; otherwise, returns true. 53 | */ 54 | def !~=(r: CompareDoubleRightSide): Boolean = !r.fun(x, r.y, r.eps) 55 | 56 | /** 57 | * Throws exception when the difference of two values are NOT within eps; 58 | * otherwise, returns true. 59 | */ 60 | def ~==(r: CompareDoubleRightSide): Boolean = { 61 | if (!r.fun(x, r.y, r.eps)) { 62 | throw new TestFailedException( 63 | s"Expected $x and ${r.y} to be within ${r.eps}${r.method}.", 0) 64 | } 65 | true 66 | } 67 | 68 | /** 69 | * Throws exception when the difference of two values are within eps; otherwise, returns true. 70 | */ 71 | def !~==(r: CompareDoubleRightSide): Boolean = { 72 | if (r.fun(x, r.y, r.eps)) { 73 | throw new TestFailedException( 74 | s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method}.", 0) 75 | } 76 | true 77 | } 78 | 79 | /** 80 | * Comparison using absolute tolerance. 81 | */ 82 | def absTol(eps: Double): CompareDoubleRightSide = 83 | CompareDoubleRightSide(AbsoluteErrorComparison, x, eps, ABS_TOL_MSG) 84 | 85 | /** 86 | * Comparison using relative tolerance. 87 | */ 88 | def relTol(eps: Double): CompareDoubleRightSide = 89 | CompareDoubleRightSide(RelativeErrorComparison, x, eps, REL_TOL_MSG) 90 | 91 | override def toString: String = x.toString 92 | } 93 | 94 | case class CompareVectorRightSide( 95 | fun: (Vector, Vector, Double) => Boolean, y: Vector, eps: Double, method: String) 96 | 97 | /** 98 | * Implicit class for comparing two vectors using relative tolerance or absolute tolerance. 99 | */ 100 | implicit class VectorWithAlmostEquals(val x: Vector) { 101 | 102 | /** 103 | * When the difference of two vectors are within eps, returns true; otherwise, returns false. 104 | */ 105 | def ~=(r: CompareVectorRightSide): Boolean = r.fun(x, r.y, r.eps) 106 | 107 | /** 108 | * When the difference of two vectors are within eps, returns false; otherwise, returns true. 109 | */ 110 | def !~=(r: CompareVectorRightSide): Boolean = !r.fun(x, r.y, r.eps) 111 | 112 | /** 113 | * Throws exception when the difference of two vectors are NOT within eps; 114 | * otherwise, returns true. 
115 | */ 116 | def ~==(r: CompareVectorRightSide): Boolean = { 117 | if (!r.fun(x, r.y, r.eps)) { 118 | throw new TestFailedException( 119 | s"Expected $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0) 120 | } 121 | true 122 | } 123 | 124 | /** 125 | * Throws exception when the difference of two vectors are within eps; otherwise, returns true. 126 | */ 127 | def !~==(r: CompareVectorRightSide): Boolean = { 128 | if (r.fun(x, r.y, r.eps)) { 129 | throw new TestFailedException( 130 | s"Did not expect $x and ${r.y} to be within ${r.eps}${r.method} for all elements.", 0) 131 | } 132 | true 133 | } 134 | 135 | /** 136 | * Comparison using absolute tolerance. 137 | */ 138 | def absTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide( 139 | (x: Vector, y: Vector, eps: Double) => { 140 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps) 141 | }, x, eps, ABS_TOL_MSG) 142 | 143 | /** 144 | * Comparison using relative tolerance. Note that comparing against sparse vector 145 | * with elements having value of zero will raise exception because it involves with 146 | * comparing against zero. 147 | */ 148 | def relTol(eps: Double): CompareVectorRightSide = CompareVectorRightSide( 149 | (x: Vector, y: Vector, eps: Double) => { 150 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps) 151 | }, x, eps, REL_TOL_MSG) 152 | 153 | override def toString: String = x.toString 154 | } 155 | 156 | case class CompareMatrixRightSide( 157 | fun: (Matrix, Matrix, Double) => Boolean, y: Matrix, eps: Double, method: String) 158 | 159 | /** 160 | * Implicit class for comparing two matrices using relative tolerance or absolute tolerance. 161 | */ 162 | implicit class MatrixWithAlmostEquals(val x: Matrix) { 163 | 164 | /** 165 | * When the difference of two matrices are within eps, returns true; otherwise, returns false. 166 | */ 167 | def ~=(r: CompareMatrixRightSide): Boolean = r.fun(x, r.y, r.eps) 168 | 169 | /** 170 | * When the difference of two matrices are within eps, returns false; otherwise, returns true. 171 | */ 172 | def !~=(r: CompareMatrixRightSide): Boolean = !r.fun(x, r.y, r.eps) 173 | 174 | /** 175 | * Throws exception when the difference of two matrices are NOT within eps; 176 | * otherwise, returns true. 177 | */ 178 | def ~==(r: CompareMatrixRightSide): Boolean = { 179 | if (!r.fun(x, r.y, r.eps)) { 180 | throw new TestFailedException( 181 | s"Expected \n$x\n and \n${r.y}\n to be within ${r.eps}${r.method} for all elements.", 0) 182 | } 183 | true 184 | } 185 | 186 | /** 187 | * Throws exception when the difference of two matrices are within eps; otherwise, returns true. 188 | */ 189 | def !~==(r: CompareMatrixRightSide): Boolean = { 190 | if (r.fun(x, r.y, r.eps)) { 191 | throw new TestFailedException( 192 | s"Did not expect \n$x\n and \n${r.y}\n to be within " + 193 | "${r.eps}${r.method} for all elements.", 0) 194 | } 195 | true 196 | } 197 | 198 | /** 199 | * Comparison using absolute tolerance. 200 | */ 201 | def absTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide( 202 | (x: Matrix, y: Matrix, eps: Double) => { 203 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 absTol eps) 204 | }, x, eps, ABS_TOL_MSG) 205 | 206 | /** 207 | * Comparison using relative tolerance. Note that comparing against sparse vector 208 | * with elements having value of zero will raise exception because it involves with 209 | * comparing against zero. 
210 | */ 211 | def relTol(eps: Double): CompareMatrixRightSide = CompareMatrixRightSide( 212 | (x: Matrix, y: Matrix, eps: Double) => { 213 | x.toArray.zip(y.toArray).forall(x => x._1 ~= x._2 relTol eps) 214 | }, x, eps, REL_TOL_MSG) 215 | 216 | override def toString: String = x.toString 217 | } 218 | 219 | } 220 | -------------------------------------------------------------------------------- /sparkext-mllib/src/test/scala/org/apache/spark/ml/classification/LocalLogisticRegressionSpec.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import com.collective.TestSparkContext 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.sql.Row 7 | import org.scalatest._ 8 | 9 | import scala.util.Random 10 | import scala.util.control.Breaks._ 11 | import org.apache.spark.ml.TestingUtils._ 12 | 13 | /** 14 | * Copy Pasted from Spark LogisticRegressionSuite to verify that nothing is broken 15 | */ 16 | object LocalLogisticRegressionSpec { 17 | 18 | // Generate input of the form Y = logistic(offset + scale*X) 19 | def generateLogisticInput( 20 | offset: Double, 21 | scale: Double, 22 | nPoints: Int, 23 | seed: Int): Seq[LabeledPoint] = { 24 | val rnd = new Random(seed) 25 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 26 | 27 | val y = (0 until nPoints).map { i => 28 | val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i)))) 29 | if (rnd.nextDouble() < p) 1.0 else 0.0 30 | } 31 | 32 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i))))) 33 | testData 34 | } 35 | 36 | /** 37 | * Generates `k` classes multinomial synthetic logistic input in `n` dimensional space given the 38 | * model weights and mean/variance of the features. The synthetic data will be drawn from 39 | * the probability distribution constructed by weights using the following formula. 40 | * 41 | * P(y = 0 | x) = 1 / norm 42 | * P(y = 1 | x) = exp(x * w_1) / norm 43 | * P(y = 2 | x) = exp(x * w_2) / norm 44 | * ... 45 | * P(y = k-1 | x) = exp(x * w_{k-1}) / norm 46 | * where norm = 1 + exp(x * w_1) + exp(x * w_2) + ... + exp(x * w_{k-1}) 47 | * 48 | * @param weights matrix is flatten into a vector; as a result, the dimension of weights vector 49 | * will be (k - 1) * (n + 1) if `addIntercept == true`, and 50 | * if `addIntercept != true`, the dimension will be (k - 1) * n. 51 | * @param xMean the mean of the generated features. Lots of time, if the features are not properly 52 | * standardized, the algorithm with poor implementation will have difficulty 53 | * to converge. 54 | * @param xVariance the variance of the generated features. 55 | * @param addIntercept whether to add intercept. 56 | * @param nPoints the number of instance of generated data. 57 | * @param seed the seed for random generator. For consistent testing result, it will be fixed. 
58 | */ 59 | def generateMultinomialLogisticInput( 60 | weights: Array[Double], 61 | xMean: Array[Double], 62 | xVariance: Array[Double], 63 | addIntercept: Boolean, 64 | nPoints: Int, 65 | seed: Int): Seq[LabeledPoint] = { 66 | val rnd = new Random(seed) 67 | 68 | val xDim = xMean.length 69 | val xWithInterceptsDim = if (addIntercept) xDim + 1 else xDim 70 | val nClasses = weights.length / xWithInterceptsDim + 1 71 | 72 | val x = Array.fill[Vector](nPoints)(Vectors.dense(Array.fill[Double](xDim)(rnd.nextGaussian()))) 73 | 74 | x.foreach { vector => 75 | // This doesn't work if `vector` is a sparse vector. 76 | val vectorArray = vector.toArray 77 | var i = 0 78 | val len = vectorArray.length 79 | while (i < len) { 80 | vectorArray(i) = vectorArray(i) * math.sqrt(xVariance(i)) + xMean(i) 81 | i += 1 82 | } 83 | } 84 | 85 | val y = (0 until nPoints).map { idx => 86 | val xArray = x(idx).toArray 87 | val margins = Array.ofDim[Double](nClasses) 88 | val probs = Array.ofDim[Double](nClasses) 89 | 90 | for (i <- 0 until nClasses - 1) { 91 | for (j <- 0 until xDim) margins(i + 1) += weights(i * xWithInterceptsDim + j) * xArray(j) 92 | if (addIntercept) margins(i + 1) += weights((i + 1) * xWithInterceptsDim - 1) 93 | } 94 | // Preventing the overflow when we compute the probability 95 | val maxMargin = margins.max 96 | if (maxMargin > 0) for (i <- 0 until nClasses) margins(i) -= maxMargin 97 | 98 | // Computing the probabilities for each class from the margins. 99 | val norm = { 100 | var temp = 0.0 101 | for (i <- 0 until nClasses) { 102 | probs(i) = math.exp(margins(i)) 103 | temp += probs(i) 104 | } 105 | temp 106 | } 107 | for (i <- 0 until nClasses) probs(i) /= norm 108 | 109 | // Compute the cumulative probability so we can generate a random number and assign a label. 
110 | for (i <- 1 until nClasses) probs(i) += probs(i - 1) 111 | val p = rnd.nextDouble() 112 | var y = 0 113 | breakable { 114 | for (i <- 0 until nClasses) { 115 | if (p < probs(i)) { 116 | y = i 117 | break 118 | } 119 | } 120 | } 121 | y 122 | } 123 | 124 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), x(i))) 125 | testData 126 | } 127 | 128 | } 129 | 130 | // Runs local Logistic Regression 131 | class LocalLogisticRegressionSpec extends AbstractLocalLogisticRegressionSpec("Local", 1) 132 | 133 | // Runs default Spark Logistic Regression 134 | class DefaultLogisticRegressionSpec extends AbstractLocalLogisticRegressionSpec("Default", 2) 135 | 136 | abstract class AbstractLocalLogisticRegressionSpec(name: String, partitions: Int) 137 | extends FlatSpec with GivenWhenThen with ShouldMatchers with TestSparkContext { 138 | 139 | import LocalLogisticRegressionSpec._ 140 | 141 | private val eps: Double = 1e-5 142 | 143 | lazy val dataset = sqlContext 144 | .createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)).repartition(partitions) 145 | 146 | lazy val binaryDataset = { 147 | val nPoints = 10000 148 | val weights = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) 149 | val xMean = Array(5.843, 3.057, 3.758, 1.199) 150 | val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) 151 | 152 | sqlContext.createDataFrame( 153 | generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42)).repartition(partitions) 154 | } 155 | 156 | s"$name LogisticRegression" should "test logistic regression: Predictor, Classifier methods" in { 157 | val lr = new LocalLogisticRegression 158 | 159 | val model = lr.fit(dataset) 160 | assert(model.numClasses === 2) 161 | 162 | val results = model.transform(dataset) 163 | 164 | // Compare rawPrediction with probability 165 | results.select("rawPrediction", "probability").collect().foreach { 166 | case Row(raw: Vector, prob: Vector) => 167 | assert(raw.size === 2) 168 | assert(prob.size === 2) 169 | val probFromRaw1 = 1.0 / (1.0 + math.exp(-raw(1))) 170 | assert(prob(1) ~== probFromRaw1 relTol eps) 171 | assert(prob(0) ~== 1.0 - probFromRaw1 relTol eps) 172 | } 173 | 174 | // Compare prediction with probability 175 | results.select("prediction", "probability").collect().foreach { 176 | case Row(pred: Double, prob: Vector) => 177 | val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 178 | assert(pred == predFromProb) 179 | } 180 | } 181 | 182 | it should "test binary logistic regression with intercept with L1 regularization" in { 183 | val trainer1 = (new LocalLogisticRegression).setFitIntercept(true) 184 | .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(true) 185 | val trainer2 = (new LocalLogisticRegression).setFitIntercept(true) 186 | .setElasticNetParam(1.0).setRegParam(0.12).setStandardization(false) 187 | 188 | val model1 = trainer1.fit(binaryDataset) 189 | val model2 = trainer2.fit(binaryDataset) 190 | 191 | /* 192 | Using the following R code to load the data and train the model using glmnet package. 193 | 194 | library("glmnet") 195 | data <- read.csv("path", header=FALSE) 196 | label = factor(data$V1) 197 | features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) 198 | weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12)) 199 | weights 200 | 201 | 5 x 1 sparse Matrix of class "dgCMatrix" 202 | s0 203 | (Intercept) -0.05627428 204 | data.V2 . 205 | data.V3 . 
206 | data.V4 -0.04325749 207 | data.V5 -0.02481551 208 | */ 209 | val interceptR1 = -0.05627428 210 | val weightsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551) 211 | 212 | assert(model1.intercept ~== interceptR1 relTol 1E-2) 213 | assert(model1.weights ~= weightsR1 absTol 2E-2) 214 | 215 | /* 216 | Using the following R code to load the data and train the model using glmnet package. 217 | 218 | library("glmnet") 219 | data <- read.csv("path", header=FALSE) 220 | label = factor(data$V1) 221 | features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) 222 | weights = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12, 223 | standardize=FALSE)) 224 | weights 225 | 226 | 5 x 1 sparse Matrix of class "dgCMatrix" 227 | s0 228 | (Intercept) 0.3722152 229 | data.V2 . 230 | data.V3 . 231 | data.V4 -0.1665453 232 | data.V5 . 233 | */ 234 | val interceptR2 = 0.3722152 235 | val weightsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) 236 | 237 | assert(model2.intercept ~== interceptR2 relTol 1E-2) 238 | assert(model2.weights ~= weightsR2 absTol 1E-3) 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/Binning.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import breeze.linalg.DenseVector 4 | import breeze.optimize.{ApproximateGradientFunction, DiffFunction, LBFGS} 5 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, BinaryAttribute} 6 | import org.apache.spark.ml.param._ 7 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 8 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 9 | import org.apache.spark.ml.{Estimator, Model} 10 | import org.apache.spark.mllib.linalg.Vectors 11 | import org.apache.spark.sql.DataFrame 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} 14 | 15 | 16 | private[feature] trait BinningBase extends Params with HasInputCol with HasOutputCol 17 | 18 | class OptimalBinning(override val uid: String) extends Estimator[Binning] with BinningBase with SplitOptimizer { 19 | 20 | def this() = this(Identifiable.randomUID("optimalBinning")) 21 | 22 | val numBins: Param[Int] = new Param[Int](this, "numBins", "Number of bins", 23 | ParamValidators.gt(2)) 24 | 25 | val sampleSize: Param[Int] = new Param[Int](this, "sampleSize", "Size of a sample used for split optimizer", 26 | ParamValidators.gt(1000)) 27 | 28 | def getNumBins: Int = $(numBins) 29 | 30 | def getSampleSize: Int = $(sampleSize) 31 | 32 | def setNumBins(value: Int): this.type = set(numBins, value) 33 | 34 | def setSampleSize(value: Int): this.type = set(sampleSize, value) 35 | 36 | def setInputCol(value: String): this.type = set(inputCol, value) 37 | 38 | def setOutputCol(value: String): this.type = set(outputCol, value) 39 | 40 | setDefault( 41 | numBins -> 5, 42 | sampleSize -> 10000 43 | ) 44 | 45 | override def fit(dataset: DataFrame): Binning = { 46 | transformSchema(dataset.schema, logging = true) 47 | 48 | val notNulls = dataset.filter(col($(inputCol)).isNotNull) 49 | val inputSize = notNulls.count() 50 | val fraction = if ($(sampleSize) >= inputSize) 1.0D else $(sampleSize).toDouble / inputSize 51 | val sample = notNulls.select(col($(inputCol)).cast(DoubleType)).sample(withReplacement = false, fraction) 52 | 53 | val x = sample.collect().map(_.getDouble(0)) 54 | 55 | 
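// Note: the sample is collected to the driver, so `sampleSize` bounds the number of points
// handed to the LBFGS-based split optimizer below.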
log.debug(s"Collected sample size of: ${x.length}") 56 | 57 | // Doesn't make any sense to do binning if no enough sample points available 58 | require(x.length > ${numBins} * 10, 59 | s"Number of sample points for binning is too small") 60 | 61 | // Find optimal split with -Inf, +Inf bounds 62 | val splits = Double.NegativeInfinity +: optimalSplit(x, $(numBins) - 1) :+ Double.PositiveInfinity 63 | val bins = splits.sliding(2).map(bin => s"[${bin.mkString(", ")})").toArray 64 | log.debug(s"Calculated optimal split. Bins: ${bins.mkString(", ")}") 65 | 66 | copyValues(new Binning(uid).setSplits(splits).setParent(this)) 67 | } 68 | 69 | override def transformSchema(schema: StructType): StructType = { 70 | val inputColName = $(inputCol) 71 | val inputDataType = schema(inputColName).dataType 72 | inputDataType match { 73 | case _: NumericType => 74 | case other => 75 | throw new IllegalArgumentException(s"Data type $other is not supported.") 76 | } 77 | // Names of bins are not available at this point 78 | val attrGroup = new AttributeGroup($(outputCol), $(numBins)) 79 | SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 80 | } 81 | 82 | override def copy(extra: ParamMap): Estimator[Binning] = defaultCopy(extra) 83 | } 84 | 85 | 86 | /** 87 | * Based on [[org.apache.spark.ml.feature.Bucketizer Bucketizer]], except that 88 | * instead of [[org.apache.spark.ml.attribute.NominalAttribute NominalAttribute]] it 89 | * outputs [[org.apache.spark.ml.attribute.AttributeGroup AttributeGroup]] column 90 | */ 91 | final class Binning(override val uid: String) 92 | extends Model[Binning] with BinningBase { 93 | 94 | def this() = this(Identifiable.randomUID("binning")) 95 | 96 | val splits: DoubleArrayParam = new DoubleArrayParam(this, "splits", 97 | "Split points for mapping continuous features into bins. With n+1 splits, there are n " + 98 | "bins. A bin defined by splits x,y holds values in the range [x,y) except the last " + 99 | "bin, which also includes y. The splits should be strictly increasing. 
" + 100 | "Values at -inf, inf must be explicitly provided to cover all Double values; " + 101 | "otherwise, values outside the splits specified will be treated as errors.", 102 | Bucketizer.checkSplits) 103 | 104 | def getSplits: Array[Double] = $(splits) 105 | 106 | def setSplits(value: Array[Double]): this.type = set(splits, value) 107 | 108 | def setInputCol(value: String): this.type = set(inputCol, value) 109 | 110 | def setOutputCol(value: String): this.type = set(outputCol, value) 111 | 112 | override def transform(dataset: DataFrame): DataFrame = { 113 | val outputSchema = transformSchema(dataset.schema) 114 | val numBins = ${splits}.length - 1 115 | val t = udf { feature: Double => 116 | val binIdx = Bucketizer.binarySearchForBuckets($(splits), feature).toInt 117 | Vectors.sparse(numBins, Seq((binIdx, 1.0))) 118 | } 119 | val metadata = outputSchema($(outputCol)).metadata 120 | dataset.select(col("*"), t(col($(inputCol)).cast(DoubleType)).as($(outputCol), metadata)) 121 | } 122 | 123 | override def transformSchema(schema: StructType): StructType = { 124 | val inputColName = $(inputCol) 125 | val inputDataType = schema(inputColName).dataType 126 | inputDataType match { 127 | case _: NumericType => 128 | case other => 129 | throw new IllegalArgumentException(s"Data type $other is not supported.") 130 | } 131 | val bins = $(splits).sliding(2).map(bin => s"[${bin.mkString(", ")})").toArray 132 | val attrs: Array[Attribute] = bins.map(bin => new BinaryAttribute(Some(bin))) 133 | val attrGroup = new AttributeGroup($(outputCol), attrs) 134 | SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 135 | } 136 | 137 | override def copy(extra: ParamMap): Binning = { 138 | defaultCopy[Binning](extra).setParent(parent) 139 | } 140 | } 141 | 142 | /** 143 | * Compute optimal split to have the same number of points in each bucket/bin 144 | */ 145 | trait SplitOptimizer { 146 | 147 | protected def fromDiff(diff: Array[Double]): Array[Double] = { 148 | diff.scanLeft(0D)((acc, v) => acc + v).drop(1) 149 | } 150 | 151 | protected def toDiff(values: Array[Double]): Array[Double] = { 152 | 153 | if (values.isEmpty) { 154 | Array.empty 155 | } else if (values.length == 1) { 156 | values 157 | } else { 158 | val diff = values.sliding(2) map { 159 | case s if s.length == 2 => s(1) - s(0) 160 | case s => sys.error(s"Unexpected sliding window: $s") 161 | } 162 | (values.head +: diff.toSeq).toArray 163 | } 164 | } 165 | 166 | protected def quantiles(x: Array[Double])(percentiles: Array[Double]): Array[Double] = { 167 | val as = x.sorted 168 | percentiles.map({ p => 169 | val i = p * (as.length - 1) 170 | val lb = i.toInt 171 | val ub = math.ceil(i).toInt 172 | val w = i - lb 173 | val quantile = as(lb) * (1 - w) + as(ub) * w 174 | quantile 175 | })(collection.breakOut) 176 | } 177 | 178 | /** 179 | * Mean squared error from ideal split 180 | */ 181 | protected def error(counts: Array[Int]): Double = { 182 | val sum = counts.sum 183 | val bins = counts.length 184 | counts.map(_ - (sum / bins)).map(math.pow(_, 2)).sum / bins 185 | } 186 | 187 | protected class OptimalSplitTargetFunction( 188 | x: Array[Double], 189 | splits: Int 190 | ) extends DiffFunction[DenseVector[Double]] { 191 | 192 | // Calculate starting point based on quantile split 193 | val init: DenseVector[Double] = { 194 | val percentile = (1 to splits) map (_.toDouble / (splits + 1)) 195 | DenseVector.apply(toDiff(quantiles(x)(percentile.toArray))) 196 | } 197 | 198 | // Target minimization function 199 | private val targetFunction: 
DenseVector[Double] => Double = 200 | p => error(counts(p)) 201 | 202 | def counts(p: DenseVector[Double]): Array[Int] = { 203 | val splits = Double.NegativeInfinity +: fromDiff(p.toArray) :+ Double.PositiveInfinity 204 | 205 | val count = splits.sliding(2) map { 206 | case split if split.length == 2 => 207 | val low = split(0) 208 | val high = split(1) 209 | val filter = (v: Double) => v >= low && v < high 210 | x.count(filter) 211 | 212 | case split => sys.error(s"Unexpected split: $split") 213 | } 214 | 215 | count.toArray 216 | } 217 | 218 | private val gradient = new ApproximateGradientFunction(targetFunction) 219 | 220 | def calculate(p: DenseVector[Double]): (Double, DenseVector[Double]) = { 221 | (targetFunction(p), gradient.gradientAt(p)) 222 | } 223 | } 224 | 225 | /** 226 | * Compute optimal split values so that points are 227 | * uniformly distributed across the resulting bins 228 | * 229 | * @param x input data that needs to be split 230 | * @param splits number of splits 231 | * @param maxIter max iterations for LBFGS optimizer 232 | * @param m memory parameter for LBFGS optimizer 233 | * @return optimal interior split points (excluding the -Inf/+Inf bounds) 234 | */ 235 | def optimalSplit( 236 | x: Array[Double], 237 | splits: Int, 238 | maxIter: Int = 100, 239 | m: Int = 3 240 | ): Array[Double] = { 241 | 242 | // Binning requires at least 3 splits 243 | require(splits >= 3, s"Target splits should be greater than or equal to 3") 244 | 245 | val lbfgs = new LBFGS[DenseVector[Double]](maxIter, m) 246 | val f = new OptimalSplitTargetFunction(x, splits) 247 | 248 | fromDiff(lbfgs.minimize(f, f.init).toArray) 249 | } 250 | 251 | } 252 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryModelMetrics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.mllib.evaluation 19 | 20 | import org.apache.spark.Logging 21 | import org.apache.spark.annotation.Experimental 22 | import org.apache.spark.mllib.evaluation.binary._ 23 | import org.apache.spark.rdd.{RDD, UnionRDD} 24 | import org.apache.spark.sql.DataFrame 25 | 26 | /** 27 | * :: Experimental :: 28 | * Evaluator for binary classification. 29 | * 30 | * @param scoreAndLabels an RDD of (score, label) pairs. 31 | * @param numBins if greater than 0, then the curves (ROC curve, PR curve) computed internally 32 | * will be down-sampled to this many "bins". If 0, no down-sampling will occur. 
33 | * This is useful because the curve contains a point for each distinct score 34 | * in the input, and this could be as large as the input itself -- millions of 35 | * points or more, when thousands may be entirely sufficient to summarize 36 | * the curve. After down-sampling, the curves will instead be made of approximately 37 | * `numBins` points instead. Points are made from bins of equal numbers of 38 | * consecutive points. The size of each bin is 39 | * `floor(scoreAndLabels.count() / numBins)`, which means the resulting number 40 | * of bins may not exactly equal numBins. The last bin in each partition may 41 | * be smaller as a result, meaning there may be an extra sample at 42 | * partition boundaries. 43 | * @since 1.3.0 44 | */ 45 | @Experimental 46 | class BinaryModelMetrics( 47 | val scoreAndLabels: RDD[(Double, Double)], 48 | val numBins: Int) extends Logging { 49 | 50 | require(numBins >= 0, "numBins must be nonnegative") 51 | 52 | /** 53 | * Defaults `numBins` to 0. 54 | * @since 1.0.0 55 | */ 56 | def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) 57 | 58 | /** 59 | * An auxiliary constructor taking a DataFrame. 60 | * @param scoreAndLabels a DataFrame with two double columns: score and label 61 | */ 62 | private[mllib] def this(scoreAndLabels: DataFrame) = 63 | this(scoreAndLabels.map(r => (r.getDouble(0), r.getDouble(1)))) 64 | 65 | /** 66 | * Unpersist intermediate RDDs used in the computation. 67 | * @since 1.0.0 68 | */ 69 | def unpersist() { 70 | cumulativeCounts.unpersist() 71 | } 72 | 73 | /** 74 | * Returns thresholds in descending order. 75 | * @since 1.0.0 76 | */ 77 | def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) 78 | 79 | def gains(): RDD[(Double, Double)] = { 80 | val gainsChart = createCurve(Reach, Recall) 81 | val sc = confusions.context 82 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 83 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 84 | new UnionRDD[(Double, Double)](sc, Seq(first, gainsChart, last)) 85 | } 86 | 87 | def lift(): RDD[(Double, Double)] = createCurve(Reach, Lift) 88 | 89 | /** 90 | * Returns the receiver operating characteristic (ROC) curve, 91 | * which is an RDD of (false positive rate, true positive rate) 92 | * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. 93 | * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic 94 | * @since 1.0.0 95 | */ 96 | def roc(): RDD[(Double, Double)] = { 97 | val rocCurve = createCurve(FalsePositiveRate, Recall) 98 | val sc = confusions.context 99 | val first = sc.makeRDD(Seq((0.0, 0.0)), 1) 100 | val last = sc.makeRDD(Seq((1.0, 1.0)), 1) 101 | new UnionRDD[(Double, Double)](sc, Seq(first, rocCurve, last)) 102 | } 103 | 104 | /** 105 | * Computes the area under the receiver operating characteristic (ROC) curve. 106 | * @since 1.0.0 107 | */ 108 | def areaUnderROC(): Double = AreaUnderCurve.of(roc()) 109 | 110 | /** 111 | * Returns the precision-recall curve, which is an RDD of (recall, precision), 112 | * NOT (precision, recall), with (0.0, 1.0) prepended to it. 113 | * @see http://en.wikipedia.org/wiki/Precision_and_recall 114 | * @since 1.0.0 115 | */ 116 | def pr(): RDD[(Double, Double)] = { 117 | val prCurve = createCurve(Recall, Precision) 118 | val sc = confusions.context 119 | val first = sc.makeRDD(Seq((0.0, 1.0)), 1) 120 | first.union(prCurve) 121 | } 122 | 123 | /** 124 | * Computes the area under the precision-recall curve. 
125 | * @since 1.0.0 126 | */ 127 | def areaUnderPR(): Double = AreaUnderCurve.of(pr()) 128 | 129 | /** 130 | * Returns the (threshold, F-Measure) curve. 131 | * @param beta the beta factor in F-Measure computation. 132 | * @return an RDD of (threshold, F-Measure) pairs. 133 | * @see http://en.wikipedia.org/wiki/F1_score 134 | * @since 1.0.0 135 | */ 136 | def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) 137 | 138 | /** 139 | * Returns the (threshold, F-Measure) curve with beta = 1.0. 140 | * @since 1.0.0 141 | */ 142 | def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) 143 | 144 | /** 145 | * Returns the (threshold, precision) curve. 146 | * @since 1.0.0 147 | */ 148 | def precisionByThreshold(): RDD[(Double, Double)] = createCurve(Precision) 149 | 150 | /** 151 | * Returns the (threshold, recall) curve. 152 | * @since 1.0.0 153 | */ 154 | def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall) 155 | 156 | private lazy val ( 157 | cumulativeCounts: RDD[(Double, BinaryLabelCounter)], 158 | confusions: RDD[(Double, BinaryConfusionMatrix)]) = { 159 | // Create a bin for each distinct score value, count positives and negatives within each bin, 160 | // and then sort by score values in descending order. 161 | val counts = scoreAndLabels.combineByKey( 162 | createCombiner = (label: Double) => new BinaryLabelCounter(0L, 0L) += label, 163 | mergeValue = (c: BinaryLabelCounter, label: Double) => c += label, 164 | mergeCombiners = (c1: BinaryLabelCounter, c2: BinaryLabelCounter) => c1 += c2 165 | ).sortByKey(ascending = false) 166 | 167 | val binnedCounts = 168 | // Only down-sample if bins is > 0 169 | if (numBins == 0) { 170 | // Use original directly 171 | counts 172 | } else { 173 | val countsSize = counts.count() 174 | // Group the iterator into chunks of about countsSize / numBins points, 175 | // so that the resulting number of bins is about numBins 176 | var grouping = countsSize / numBins 177 | if (grouping < 2) { 178 | // numBins was more than half of the size; no real point in down-sampling to bins 179 | logInfo(s"Curve is too small ($countsSize) for $numBins bins to be useful") 180 | counts 181 | } else { 182 | if (grouping >= Int.MaxValue) { 183 | logWarning( 184 | s"Curve too large ($countsSize) for $numBins bins; capping at ${Int.MaxValue}") 185 | grouping = Int.MaxValue 186 | } 187 | counts.mapPartitions(_.grouped(grouping.toInt).map { pairs => 188 | // The score of the combined point will be just the first one's score 189 | val firstScore = pairs.head._1 190 | // The point will contain all counts in this chunk 191 | val agg = new BinaryLabelCounter() 192 | pairs.foreach(pair => agg += pair._2) 193 | (firstScore, agg) 194 | }) 195 | } 196 | } 197 | 198 | val agg = binnedCounts.values.mapPartitions { iter => 199 | val agg = new BinaryLabelCounter() 200 | iter.foreach(agg += _) 201 | Iterator(agg) 202 | }.collect() 203 | val partitionwiseCumulativeCounts = 204 | agg.scanLeft(new BinaryLabelCounter())( 205 | (agg: BinaryLabelCounter, c: BinaryLabelCounter) => agg.clone() += c) 206 | val totalCount = partitionwiseCumulativeCounts.last 207 | logInfo(s"Total counts: $totalCount") 208 | val cumulativeCounts = binnedCounts.mapPartitionsWithIndex( 209 | (index: Int, iter: Iterator[(Double, BinaryLabelCounter)]) => { 210 | val cumCount = partitionwiseCumulativeCounts(index) 211 | iter.map { case (score, c) => 212 | cumCount += c 213 | (score, cumCount.clone()) 214 | } 215 | }, preservesPartitioning = true) 216 | 
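    // `partitionwiseCumulativeCounts(index)` holds the label counts of all partitions that
    // precede partition `index` (prefix sums computed on the driver via scanLeft), so each
    // partition can produce its running cumulative counts in a single local pass.
    // The result is persisted because `thresholds()`, `confusions`, and every derived curve
    // re-read this RDD.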
cumulativeCounts.persist() 217 | val confusions = cumulativeCounts.map { case (score, cumCount) => 218 | (score, BinaryConfusionMatrixImpl(cumCount, totalCount).asInstanceOf[BinaryConfusionMatrix]) 219 | } 220 | (cumulativeCounts, confusions) 221 | } 222 | 223 | /** Creates a curve of (threshold, metric). */ 224 | private def createCurve(y: BinaryClassificationMetricComputer): RDD[(Double, Double)] = { 225 | confusions.map { case (s, c) => 226 | (s, y(c)) 227 | } 228 | } 229 | 230 | /** Creates a curve of (metricX, metricY). */ 231 | private def createCurve( 232 | x: BinaryClassificationMetricComputer, 233 | y: BinaryClassificationMetricComputer): RDD[(Double, Double)] = { 234 | confusions.map { case (_, c) => 235 | (x(c), y(c)) 236 | } 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 23 | 24 | 25 | Scalastyle standard configuration 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | true 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | ^FunSuite[A-Za-z]*$ 98 | Tests must extend org.apache.spark.SparkFunSuite instead. 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | ^println$ 108 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 800> 163 | 164 | 165 | 166 | 167 | 30 168 | 169 | 170 | 171 | 172 | 10 173 | 174 | 175 | 176 | 177 | 50 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | -1,0,1,2,3 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/classification/LocalLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import breeze.linalg.{DenseVector => BDV} 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} 5 | import org.apache.spark.ml.param._ 6 | import org.apache.spark.ml.util.Identifiable 7 | import org.apache.spark.mllib.linalg._ 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer 10 | import org.apache.spark.sql.DataFrame 11 | import org.apache.spark.{Logging, SparkException} 12 | 13 | import scala.collection.mutable 14 | 15 | /** 16 | * Local version of [[LogisticRegression]] 17 | * When DataFrame is too small that it can easily fit into single node it doesn't make sense 18 | * to build model using RDD, it can be built on single node. 
Essentially using Spark 19 | * as distributed Executor 20 | */ 21 | class LocalLogisticRegression(override val uid: String) 22 | extends ProbabilisticClassifier[Vector, LocalLogisticRegression, LogisticRegressionModel] 23 | with LogisticRegressionParams with Logging { 24 | 25 | def this() = this(Identifiable.randomUID("locallogreg")) 26 | 27 | def setRegParam(value: Double): this.type = set(regParam, value) 28 | setDefault(regParam -> 0.0) 29 | 30 | def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) 31 | setDefault(elasticNetParam -> 0.0) 32 | 33 | def setMaxIter(value: Int): this.type = set(maxIter, value) 34 | setDefault(maxIter -> 100) 35 | 36 | def setTol(value: Double): this.type = set(tol, value) 37 | setDefault(tol -> 1E-6) 38 | 39 | def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) 40 | setDefault(fitIntercept -> true) 41 | 42 | def setStandardization(value: Boolean): this.type = set(standardization, value) 43 | setDefault(standardization -> true) 44 | 45 | override def setThreshold(value: Double): this.type = super.setThreshold(value) 46 | 47 | override def getThreshold: Double = super.getThreshold 48 | 49 | override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) 50 | 51 | override def getThresholds: Array[Double] = super.getThresholds 52 | 53 | private def trainLocal(instances: Array[(Double, Vector)]): (LogisticRegressionModel, Array[Double]) = { 54 | 55 | val (summarizer, labelSummarizer) = 56 | instances.foldLeft((new MultivariateOnlineSummarizer, new MultiClassSummarizer)) { 57 | case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer), 58 | (label: Double, features: Vector)) => 59 | (summarizer.add(features), labelSummarizer.add(label)) 60 | } 61 | 62 | val histogram = labelSummarizer.histogram 63 | val numInvalid = labelSummarizer.countInvalid 64 | val numClasses = histogram.length 65 | val numFeatures = summarizer.mean.size 66 | 67 | if (numInvalid != 0) { 68 | val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + 69 | s"Found $numInvalid invalid labels." 70 | logError(msg) 71 | throw new SparkException(msg) 72 | } 73 | 74 | if (numClasses > 2) { 75 | val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + 76 | s"binary classification. Found $numClasses in the input dataset." 
77 | logError(msg) 78 | throw new SparkException(msg) 79 | } 80 | 81 | val featuresMean = summarizer.mean.toArray 82 | val featuresStd = summarizer.variance.toArray.map(math.sqrt) 83 | 84 | val regParamL1 = $(elasticNetParam) * $(regParam) 85 | val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) 86 | 87 | val costFun = new LocalLogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization), 88 | featuresStd, featuresMean, regParamL2) 89 | 90 | val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { 91 | new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) 92 | } else { 93 | def regParamL1Fun = (index: Int) => { 94 | // Remove the L1 penalization on the intercept 95 | if (index == numFeatures) { 96 | 0.0 97 | } else { 98 | if ($(standardization)) { 99 | regParamL1 100 | } else { 101 | // If `standardization` is false, we still standardize the data 102 | // to improve the rate of convergence; as a result, we have to 103 | // perform this reverse standardization by penalizing each component 104 | // differently to get effectively the same objective function when 105 | // the training dataset is not standardized. 106 | if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 107 | } 108 | } 109 | } 110 | new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) 111 | } 112 | 113 | val initialWeightsWithIntercept = 114 | Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) 115 | 116 | if ($(fitIntercept)) { 117 | /* 118 | For binary logistic regression, when we initialize the weights as zeros, 119 | it will converge faster if we initialize the intercept such that 120 | it follows the distribution of the labels. 121 | 122 | {{{ 123 | P(0) = 1 / (1 + \exp(b)), and 124 | P(1) = \exp(b) / (1 + \exp(b)) 125 | }}}, hence 126 | {{{ 127 | b = \log{P(1) / P(0)} = \log{count_1 / count_0} 128 | }}} 129 | */ 130 | initialWeightsWithIntercept.toArray(numFeatures) 131 | = math.log(histogram(1).toDouble / histogram(0).toDouble) 132 | } 133 | 134 | val states = optimizer.iterations(new CachedDiffFunction(costFun), 135 | initialWeightsWithIntercept.toBreeze.toDenseVector) 136 | 137 | val (weights, intercept, objectiveHistory) = { 138 | /* 139 | Note that in Logistic Regression, the objective history (loss + regularization) 140 | is log-likelihood which is invariance under feature standardization. As a result, 141 | the objective history from optimizer is the same as the one in the original space. 142 | */ 143 | val arrayBuilder = mutable.ArrayBuilder.make[Double] 144 | var state: optimizer.State = null 145 | while (states.hasNext) { 146 | state = states.next() 147 | arrayBuilder += state.adjustedValue 148 | } 149 | 150 | if (state == null) { 151 | val msg = s"${optimizer.getClass.getName} failed." 152 | logError(msg) 153 | throw new SparkException(msg) 154 | } 155 | 156 | /* 157 | The weights are trained in the scaled space; we're converting them back to 158 | the original space. 159 | Note that the intercept in scaled space and original space is the same; 160 | as a result, no scaling is needed. 
161 | */ 162 | val rawWeights = state.x.toArray.clone() 163 | var i = 0 164 | while (i < numFeatures) { 165 | rawWeights(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } 166 | i += 1 167 | } 168 | 169 | if ($(fitIntercept)) { 170 | (Vectors.dense(rawWeights.dropRight(1)).compressed, rawWeights.last, arrayBuilder.result()) 171 | } else { 172 | (Vectors.dense(rawWeights).compressed, 0.0, arrayBuilder.result()) 173 | } 174 | } 175 | 176 | val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) 177 | 178 | (model, objectiveHistory) 179 | } 180 | 181 | override protected def train(dataset: DataFrame): LogisticRegressionModel = { 182 | 183 | if (dataset.rdd.partitions.length == 1) { 184 | log.info(s"Build LogisticRegression in local mode") 185 | 186 | val (model, objectiveHistory) = extractLabeledPoints(dataset).map { 187 | case LabeledPoint(label: Double, features: Vector) => (label, features) 188 | }.mapPartitions { instances => 189 | Seq(trainLocal(instances.toArray)).toIterator 190 | }.first() 191 | 192 | val logRegSummary = new BinaryLogisticRegressionTrainingSummary( 193 | model.transform(dataset), 194 | $(probabilityCol), 195 | $(labelCol), 196 | objectiveHistory) 197 | model.setSummary(logRegSummary) 198 | 199 | } else { 200 | log.info(s"Fallback to distributed LogisticRegression") 201 | 202 | val that = classOf[LogisticRegression].getConstructor(classOf[String]).newInstance(uid) 203 | val logisticRegression = copyValues(that) 204 | // Scala Reflection magic to call protected train method 205 | val ru = scala.reflect.runtime.universe 206 | import ru._ 207 | val m = ru.runtimeMirror(logisticRegression.getClass.getClassLoader) 208 | val im = m.reflect(logisticRegression) 209 | val trainMethod = typeOf[LogisticRegression].declaration(newTermName("train")).asMethod 210 | val mm = im.reflectMethod(trainMethod) 211 | mm.apply(dataset).asInstanceOf[LogisticRegressionModel] 212 | } 213 | } 214 | 215 | override def copy(extra: ParamMap): LocalLogisticRegression = defaultCopy(extra) 216 | } 217 | 218 | /** 219 | * Local version of [[LogisticCostFun]] 220 | */ 221 | private class LocalLogisticCostFun( 222 | data: Array[(Double, Vector)], 223 | numClasses: Int, 224 | fitIntercept: Boolean, 225 | standardization: Boolean, 226 | featuresStd: Array[Double], 227 | featuresMean: Array[Double], 228 | regParamL2: Double) extends DiffFunction[BDV[Double]] { 229 | 230 | override def calculate(weights: BDV[Double]): (Double, BDV[Double]) = { 231 | val numFeatures = featuresStd.length 232 | val w = Vectors.fromBreeze(weights) 233 | 234 | val logisticAggregator = data.foldLeft(new LogisticAggregator(w, numClasses, fitIntercept, 235 | featuresStd, featuresMean)) { 236 | case (aggregator, (label, features)) => aggregator.add(label, features) 237 | } 238 | 239 | val totalGradientArray = logisticAggregator.gradient.toArray 240 | 241 | // regVal is the sum of weight squares excluding intercept for L2 regularization. 242 | val regVal = if (regParamL2 == 0.0) { 243 | 0.0 244 | } else { 245 | var sum = 0.0 246 | w.foreachActive { (index, value) => 247 | // If `fitIntercept` is true, the last term which is intercept doesn't 248 | // contribute to the regularization. 249 | if (index != numFeatures) { 250 | // The following code will compute the loss of the regularization; also 251 | // the gradient of the regularization, and add back to totalGradientArray. 
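            // Concretely: in the standardized case each component adds regParamL2 * w(i) to the
            // gradient and w(i)^2 to `sum`; in the non-standardized case w(i) is first rescaled
            // by 1 / featuresStd(i)^2. The common 0.5 * regParamL2 factor is applied to `sum`
            // once, below.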
252 | sum += { 253 | if (standardization) { 254 | totalGradientArray(index) += regParamL2 * value 255 | value * value 256 | } else { 257 | if (featuresStd(index) != 0.0) { 258 | // If `standardization` is false, we still standardize the data 259 | // to improve the rate of convergence; as a result, we have to 260 | // perform this reverse standardization by penalizing each component 261 | // differently to get effectively the same objective function when 262 | // the training dataset is not standardized. 263 | val temp = value / (featuresStd(index) * featuresStd(index)) 264 | totalGradientArray(index) += regParamL2 * temp 265 | value * temp 266 | } else { 267 | 0.0 268 | } 269 | } 270 | } 271 | } 272 | } 273 | 0.5 * regParamL2 * sum 274 | } 275 | 276 | (logisticAggregator.loss + regVal, new BDV(totalGradientArray)) 277 | } 278 | } 279 | 280 | -------------------------------------------------------------------------------- /sparkext-mllib/src/main/scala/org/apache/spark/ml/feature/GatherEncoder.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.feature 2 | 3 | import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} 4 | import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} 5 | import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} 6 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 7 | import org.apache.spark.ml.{Estimator, Model} 8 | import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.functions._ 11 | import org.apache.spark.sql.types._ 12 | 13 | import scala.collection.mutable 14 | 15 | 16 | private[feature] trait GatherEncoderParams 17 | extends Params with HasInputCol with HasOutputCol with HasKeyCol with HasValueCol { 18 | 19 | val transformation: Param[String] = new Param[String](this, "transformation", 20 | "Transformation type: [top, index]", 21 | ParamValidators.inArray(Array("top", "index"))) 22 | 23 | val support: Param[Double] = new Param[Double](this, "support", 24 | "Minimum support", 25 | ParamValidators.inRange(0.0, 100.0)) 26 | 27 | val cover: Param[Double] = new Param[Double](this, "cover", 28 | "Top coverage", 29 | ParamValidators.inRange(0.0, 100.0)) 30 | 31 | val allOther: Param[Boolean] = new Param[Boolean](this, "allOther", 32 | "Add all other column") 33 | 34 | val keepInputCol: Param[Boolean] = new Param[Boolean](this, "keepInputCol", 35 | "Keep input column in transformed data frame") 36 | 37 | val failOnEmptyKeys: Param[Boolean] = new Param[Boolean](this, "failOnEmptyKeys", 38 | "Fail if gathered key set is empty") 39 | 40 | val excludeKeys: Param[Set[Any]] = new Param[Set[Any]](this, "excludeKeys", 41 | "Exclude given keys from encoded model") 42 | 43 | def getCover: Double = $(cover) 44 | 45 | def getAllOther: Boolean = $(allOther) 46 | 47 | def getKeepInputCol: Boolean = $(keepInputCol) 48 | 49 | def getFailOnEmptyKeys: Boolean = ${failOnEmptyKeys} 50 | 51 | def getExcludeKeys: Set[Any] = $(excludeKeys) 52 | 53 | protected def validateSchema(schema: StructType): Unit = { 54 | // Check that inputCol is array of StructType 55 | val inputColName = $(inputCol) 56 | val inputColDataType = schema(inputColName).dataType 57 | val inputColStructSchema = inputColDataType match { 58 | case ArrayType(structType: StructType, _) => 59 | structType 60 | case other => 61 | throw new IllegalArgumentException(s"Input column data type $other is not supported.") 
62 | } 63 | 64 | // Check that key type is supported 65 | val keyColName = $(keyCol) 66 | val keyColDataType = inputColStructSchema(keyColName).dataType 67 | keyColDataType match { 68 | case _: NumericType => 69 | case _: StringType => 70 | case other => 71 | throw new IllegalArgumentException(s"Key column data type $other is not supported.") 72 | } 73 | 74 | // Check that value type is numerical 75 | val valueColName = $(valueCol) 76 | val valueColDataType = inputColStructSchema(valueColName).dataType 77 | valueColDataType match { 78 | case _: NumericType => 79 | case other => 80 | throw new IllegalArgumentException(s"Value data type $other is not supported.") 81 | } 82 | } 83 | 84 | } 85 | 86 | /** 87 | * Encode categorical values collected by [[Gather]] transformation as feature vector using 88 | * dummy variables inside [[org.apache.spark.ml.attribute.AttributeGroup AttributeGroup]] 89 | * with attached metadata 90 | * 91 | * {{{ 92 | * cookie_id | sites 93 | * ----------|------------------------------------------------------------------------ 94 | * cookieAA | [{ site_id: 1, impressions: 15.0 }, { site_id: 2, impressions: 20.0 }] 95 | * cookieBB | [{ site_id: 2, impressions: 7.0 }, { site_id: 3, impressions: 5.0 }] 96 | * }}} 97 | * 98 | * transformed into 99 | * 100 | * {{{ 101 | * cookie_id | site_features 102 | * ----------|------------------------ 103 | * cookieAA | [ 15.0 , 20.0 , 0 ] 104 | * cookieBB | [ 0.0 , 7.0 , 5.0 ] 105 | * }}} 106 | * 107 | * Optionally apply dimensionality reduction using top transformation: 108 | * - Top coverage, is selecting categorical values by computing the count of distinct users for each value, 109 | * sorting the values in descending order by the count of users, and choosing the top values from the resulting 110 | * list such that the sum of the distinct user counts over these values covers c percent of all users, 111 | * for example, selecting top geographic locations covering 99% of users. 112 | * - Minimum Support, is selecting categorical values such that at least c percent of users have this value, 113 | * for example, web sites that account for at least c percent of traffic. 
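 *
 * A minimal usage sketch, assuming a `gathered` DataFrame shaped like the example above
 * (the `gathered` variable and the column names are illustrative, not part of the API):
 *
 * {{{
 *   val encoder = new GatherEncoder()
 *     .setInputCol("sites")
 *     .setKeyCol("site_id")
 *     .setValueCol("impressions")
 *     .setOutputCol("site_features")
 *     .setTransformation("top")
 *     .setCover(99.0)
 *
 *   val model: GatherEncoderModel = encoder.fit(gathered)
 *   val encoded: DataFrame = model.transform(gathered)
 * }}}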
114 | */ 115 | class GatherEncoder(override val uid: String) extends Estimator[GatherEncoderModel] with GatherEncoderParams { 116 | 117 | def this() = this(Identifiable.randomUID("gatheredEncoder")) 118 | 119 | def setInputCol(value: String): this.type = set(inputCol, value) 120 | 121 | def setOutputCol(value: String): this.type = set(outputCol, value) 122 | 123 | def setKeyCol(value: String): this.type = set(keyCol, value) 124 | 125 | def setValueCol(value: String): this.type = set(valueCol, value) 126 | 127 | def setTransformation(value: String): this.type = set(transformation, value) 128 | 129 | def setSupport(value: Double): this.type = set(support, value) 130 | 131 | def setCover(value: Double): this.type = set(cover, value) 132 | 133 | def setAllOther(value: Boolean): this.type = set(allOther, value) 134 | 135 | def setKeepInputCol(value: Boolean): this.type = set(keepInputCol, value) 136 | 137 | def setFailOnEmptyKeys(value: Boolean): this.type = set(failOnEmptyKeys, value) 138 | 139 | def setExcludeKeys(value: Set[Any]): this.type = set(excludeKeys, value) 140 | 141 | setDefault( 142 | transformation -> "top", 143 | support -> 0.1, 144 | cover -> 100.0, 145 | allOther -> false, 146 | keepInputCol -> true, 147 | failOnEmptyKeys -> true, 148 | excludeKeys -> Set.empty[Any] 149 | ) 150 | 151 | private def computeTopKeys(dataset: DataFrame): Array[Any] = { 152 | val inputColName = $(inputCol) 153 | val keyColName = $(keyCol) 154 | val coverVal = $(cover) 155 | 156 | log.info(s"Compute top transformation." + 157 | s"Key column: $keyColName " + 158 | s"Cover: $coverVal") 159 | 160 | if (coverVal == 100.0) { 161 | // With cover 100% it's required to collect all keys 162 | val keyCol = s"${uid}_key" 163 | dataset.select(explode(col(s"$inputColName.$keyColName")) as keyCol) 164 | .groupBy(keyCol).agg(col(keyCol)).collect().map(_.get(0)) 165 | .filter(k => !getExcludeKeys.contains(k)) 166 | } else { 167 | 168 | val key = s"${uid}_key" 169 | val grouped: DataFrame = dataset.select(explode(col(s"$inputColName.$keyColName")) as key).groupBy(key).count() 170 | val keys: Array[(Any, Long)] = grouped.collect().map { row => 171 | val key = row.get(0) 172 | val cnt = row.getLong(1) 173 | (key, cnt) 174 | } 175 | 176 | log.debug(s"Collected ${keys.length} top keys for key column: $keyColName") 177 | 178 | val topKeys = keys.sortBy(_._2)(implicitly[Ordering[Long]].reverse) filter { 179 | case (k, _) => !getExcludeKeys.contains(k) 180 | } 181 | 182 | // Get number of columns below cover threshold 183 | val threshold = ($(cover) / 100) * topKeys.map(_._2).sum 184 | val keysBelowThreshold = topKeys.map(_._2).scanLeft(0L)((cum, cnt) => cum + cnt).takeWhile(_ < threshold).length 185 | 186 | topKeys.take(keysBelowThreshold).map(_._1) 187 | } 188 | } 189 | 190 | private def computeIndexKeys(dataset: DataFrame): Array[Any] = { 191 | val inputColName = $(inputCol) 192 | val keyColName = $(keyCol) 193 | val supportVal = $(support) 194 | 195 | log.info(s"Compute index transformation." 
+ 196 | s"Key column: $keyColName " + 197 | s"Support: $supportVal") 198 | 199 | val key = s"${uid}_key" 200 | val grouped: DataFrame = dataset.select(explode(col(s"$inputColName.$keyColName")) as key).groupBy(key).count() 201 | 202 | // Get support threshold 203 | val totalCount = grouped.select(sum("count")).first().getLong(0) 204 | val threshold = (supportVal / 100) * totalCount 205 | 206 | val aboveThresholdKeys: Array[(Any, Long)] = 207 | grouped.filter(col("count") >= threshold).collect().map { row => 208 | val key = row.get(0) 209 | val cnt = row.getLong(1) 210 | (key, cnt) 211 | } 212 | 213 | log.debug(s"Collected '${aboveThresholdKeys.length}' support keys " + 214 | s"above threshold: $threshold for key column: $keyColName") 215 | 216 | val supportKeys = aboveThresholdKeys.sortBy(_._2)(implicitly[Ordering[Long]].reverse) filter { 217 | case (k, _) => !getExcludeKeys.contains(k) 218 | } 219 | 220 | supportKeys.map(_._1) 221 | } 222 | 223 | override def fit(dataset: DataFrame): GatherEncoderModel = { 224 | validateSchema(dataset.schema) 225 | 226 | val transformationVal = $(transformation) 227 | val inputColName = $(inputCol) 228 | val keyColName = $(keyCol) 229 | val valueColName = $(valueCol) 230 | 231 | log.info(s"Fit gather encoder for input column: $inputColName. " + 232 | s"Key column: $keyColName " + 233 | s"Value column: $valueColName " + 234 | s"Transformation: $transformationVal " + 235 | s"All other: ${$(allOther)}.") 236 | 237 | val gatherKeys: Array[Any] = transformationVal match { 238 | case "top" => computeTopKeys(dataset) 239 | case "index" => computeIndexKeys(dataset) 240 | case unknown => 241 | throw new IllegalArgumentException(s"Invalid gather transformation type: $unknown") 242 | } 243 | 244 | copyValues(new GatherEncoderModel(uid, gatherKeys).setParent(this)) 245 | } 246 | 247 | override def transformSchema(schema: StructType): StructType = { 248 | validateSchema(schema) 249 | // at this point labels and size of feature vectors is unknown 250 | val outputSchema = SchemaUtils.appendColumn(schema, StructField($(outputCol), new VectorUDT)) 251 | 252 | if (getKeepInputCol) { 253 | outputSchema 254 | } else { 255 | StructType(outputSchema.filter(_.name != getInputCol)) 256 | } 257 | } 258 | 259 | override def copy(extra: ParamMap): GatherEncoder = defaultCopy(extra) 260 | 261 | } 262 | 263 | /** 264 | * Model fitted by [[GatherEncoder]] 265 | * 266 | * @param modelKeys Ordered list of keys, corresponding column indices in feature vector 267 | */ 268 | class GatherEncoderModel( 269 | override val uid: String, 270 | val modelKeys: Array[Any] 271 | ) extends Model[GatherEncoderModel] with GatherEncoderParams { 272 | 273 | def this(keys: Array[Any]) = this(Identifiable.randomUID("gatheredEncoder"), keys) 274 | 275 | def setInputCol(value: String): this.type = set(inputCol, value) 276 | 277 | def setOutputCol(value: String): this.type = set(outputCol, value) 278 | 279 | def setKeyCol(value: String): this.type = set(keyCol, value) 280 | 281 | def setValueCol(value: String): this.type = set(valueCol, value) 282 | 283 | def setTransformation(value: String): this.type = set(transformation, value) 284 | 285 | def setSupport(value: Double): this.type = set(support, value) 286 | 287 | def setCover(value: Double): this.type = set(cover, value) 288 | 289 | def setAllOther(value: Boolean): this.type = set(allOther, value) 290 | 291 | def setKeepInputCol(value: Boolean): this.type = set(keepInputCol, value) 292 | 293 | def setFailOnEmptyKeys(value: Boolean): this.type = 
set(failOnEmptyKeys, value) 294 | 295 | setDefault( 296 | cover -> 100.0, 297 | allOther -> true, 298 | keepInputCol -> true, 299 | failOnEmptyKeys -> true 300 | ) 301 | 302 | private val labels: Array[String] = modelKeys.map(_.toString) 303 | 304 | private val keyIndex: Map[Any, Int] = modelKeys.zipWithIndex.toMap 305 | 306 | override def transform(dataset: DataFrame): DataFrame = { 307 | 308 | if (modelKeys.isEmpty && getFailOnEmptyKeys) { 309 | throw new IllegalArgumentException(s"Can't encode gathered data with empty model keys. " + 310 | s"Check that input column '$getInputCol' has data.") 311 | } 312 | 313 | if (modelKeys.isEmpty && !getFailOnEmptyKeys) { 314 | log.warn(s"Gathered data has empty key set. Check input column $getInputCol") 315 | } 316 | 317 | val outputSchema = transformSchema(dataset.schema) 318 | 319 | val inputColName = $(inputCol) 320 | val keyColName = $(keyCol) 321 | val valueColName = $(valueCol) 322 | 323 | val allOtherEnabled = $(allOther) 324 | val featureSize = if (allOtherEnabled) modelKeys.length + 1 else modelKeys.length 325 | 326 | val encoder = udf { (keys: mutable.WrappedArray[AnyRef], values: mutable.WrappedArray[Double]) => 327 | 328 | if (featureSize == 0) { 329 | // Special case for empty model keys 330 | Vectors.dense(Array.empty[Double]) 331 | 332 | } else if (keys == null && values == null) { 333 | Vectors.sparse(featureSize, Nil) 334 | 335 | } else if (keys != null && values != null) { 336 | 337 | require(keys.length == values.length, 338 | s"Keys names length doesn't match with values length") 339 | 340 | if (keys.length > 0) { 341 | var i: Int = 0 342 | val elements = mutable.Map.empty[Int, Double] 343 | while (i < keys.length) { 344 | val key = keys(i) 345 | val value = values(i) 346 | 347 | keyIndex.get(key) match { 348 | // Take latest value for key 349 | case Some(idx) => 350 | elements(idx) = value 351 | // Accumulate values is all other enabled 352 | case None if allOtherEnabled => 353 | val allOther = elements.getOrElse(modelKeys.length, 0.0) 354 | elements.update(modelKeys.length, allOther + value) 355 | // Ignore key if all other is disables 356 | case None => 357 | } 358 | 359 | i += 1 360 | } 361 | Vectors.sparse(featureSize, elements.toBuffer) 362 | 363 | } else { 364 | Vectors.sparse(featureSize, Nil) 365 | } 366 | 367 | } else { 368 | throw new IllegalArgumentException(s"Keys and Values are not consistent") 369 | } 370 | } 371 | 372 | val outputColName = $(outputCol) 373 | val metadata = outputSchema($(outputCol)).metadata 374 | 375 | val encodedCol = encoder( 376 | dataset(s"$inputColName.$keyColName"), 377 | dataset(s"$inputColName.$valueColName").cast(ArrayType(DoubleType)) 378 | ).as(outputColName, metadata) 379 | 380 | if (getKeepInputCol) { 381 | dataset.select(col("*"), encodedCol) 382 | } else { 383 | val cols = dataset.schema.fieldNames.filter(_ != getInputCol).map(col) 384 | dataset.select(cols :+ encodedCol: _*) 385 | } 386 | } 387 | 388 | override def transformSchema(schema: StructType): StructType = { 389 | validateSchema(schema) 390 | 391 | val attrLabels = if ($(allOther)) labels :+ "all other" else labels 392 | val attrs: Array[Attribute] = attrLabels.map(lbl => new NumericAttribute(Some(lbl))) 393 | val attrGroup = new AttributeGroup($(outputCol), attrs) 394 | val outputSchema = SchemaUtils.appendColumn(schema, attrGroup.toStructField()) 395 | 396 | if (getKeepInputCol) { 397 | outputSchema 398 | } else { 399 | StructType(outputSchema.filter(_.name != getInputCol)) 400 | } 401 | } 402 | 403 | override def 
copy(extra: ParamMap): GatherEncoderModel = { 404 | val copied = new GatherEncoderModel(uid, modelKeys) 405 | copyValues(copied, extra).setParent(parent) 406 | } 407 | 408 | } 409 | -------------------------------------------------------------------------------- /sparkext-example/src/main/resources/response.csv: -------------------------------------------------------------------------------- 1 | cookie,response 2 | wKgQaV0lHZanDrp,1 3 | rfTZLbQDwbu5mXV,1 4 | r1CSY234HTYdvE3,1 5 | 20ep6ddsVckCmFy,1 6 | Jga3f9JqZIuXBJ1,1 7 | 2THd1TMYVXjFLlI,1 8 | RmBr0GvcnHepocE,1 9 | 6bNRJD0f8rrvNHZ,1 10 | o9EXMfVigHmaoQM,1 11 | iLSH5Yhxsg5uf4q,1 12 | tmHEHROFGmji3zM,1 13 | gjR5HgiHWtNZuqK,1 14 | kdx8i3MJqLmDJV6,1 15 | vypM7m3z6SSRvGZ,1 16 | xq63eKSvrJFq5aL,1 17 | W4VBodItMK8475Q,1 18 | ZFjrfbTYBxadHQ1,1 19 | VAG5hYt89GntLYU,1 20 | 9WNbUDJvypHi4JK,1 21 | cJ5FhIM6lXTIFFZ,1 22 | zpTvRW58SN2CJ4E,1 23 | UJlUVPXDc4OaWmn,1 24 | dPWY5AiqqWVG3JJ,1 25 | 9fMT2NB9DFWl4Ox,1 26 | t9QW1NCFCWbPOJt,1 27 | c6tA8uzWFr9t1im,1 28 | 2ra0Q3AQp8qSlc9,1 29 | jVVrUj6wCsWcTOs,1 30 | qFmzD8NrgLmpvux,1 31 | VsPopNiYNgrDuls,1 32 | PU9OtYfnS2oOyCM,1 33 | 8lt5BpBLw2ahM1N,1 34 | mG6WRTKprzaTokJ,1 35 | U3J2Nv1EKokw4XS,1 36 | JCPgwGGXPFXINhC,1 37 | 1XmlUyrrVrBkiik,1 38 | E2YwHbCRoKXzzEc,1 39 | VwO5WFhxOooeCWb,1 40 | MDJqzz1Lsf4QfCX,1 41 | ehI13pvBcz8CEPr,1 42 | vtfqDMpCBLrpRLN,1 43 | 7lf3IRE7hPAM18w,1 44 | 50SWlFbA9or3KUJ,1 45 | bCUamTnw4qIEZ4J,1 46 | xwj65CbKrSuCygd,1 47 | Alq7HpCKRFcRgt8,1 48 | KOHN2oFFdv7UBLQ,1 49 | kwad6cmTpUk95Cr,1 50 | QxJ8zBHwkPNOxRZ,1 51 | v8EUlcsKj0KvYPr,1 52 | 9YYbK3joLYHQTGA,1 53 | fP8bh0C4xFuStUO,1 54 | Tn5LnVL9DjRcOCa,1 55 | SjExrrkZvl12k05,1 56 | fwN0yAl9GtRoAnY,1 57 | laSq5tUROYvMcID,1 58 | lJuqKJuDA3w5BZz,1 59 | cdkFNkwOneAQrND,1 60 | Lc3Rb0Gjd52eV3f,1 61 | JVLsLcx55JY1RZB,1 62 | DFnWOsYjpNFSw4u,1 63 | lpVHS3s9iJN9rg7,1 64 | kS5awEFgijnfKpC,1 65 | nNi7JzDeLIqC7Gi,1 66 | tvwF4qazBMkEBtP,1 67 | cdBUHoZRBS0mRXN,1 68 | T4g7WskDNNfOzUt,1 69 | lC18SA8sWsRldOJ,1 70 | z38D6CRkiCUH1Je,1 71 | meyO2Y4fOq3ZvpG,1 72 | KuPG6y3WgCom6OZ,1 73 | VjigIqWnYJMXBkJ,1 74 | px8OpJHykJjcTM8,1 75 | 4A4ZV38VHHUKl6E,1 76 | KzBsGh8zNkgJddb,1 77 | eNMQM0vh18nQfVY,1 78 | a57XcvW4vxZ473U,1 79 | hkvRWXnqTT2hrac,1 80 | pXP2e9mM6kkeYwt,1 81 | DcRbZkX92pDKHL8,1 82 | z70255CEsUQPmLw,1 83 | pDwUTusdFjPjwQM,1 84 | sfAW2avORX4ZT1F,1 85 | 4mZGAbuXKO3Qo6I,1 86 | pa9fn7FOC04FKXZ,1 87 | 6kXq4DMYsjKrLwf,1 88 | VcZsVQOQPAyz00E,1 89 | E2KHEHzA5lAIZtH,1 90 | 9iVEePY5KXgRpPf,1 91 | WkwvVLxSjRHxG5P,1 92 | GRLd5YubtEAyQlT,1 93 | 2X2ZBYw0vN6W5pX,1 94 | diL9uidQfbrPZYd,1 95 | AsIboE6VpP8liw6,1 96 | wffozKQkAi7BA97,1 97 | RaTLueWZCzjSsd1,1 98 | x0DHdKd9q1tJlMn,1 99 | KJOIbSDiRDe89R7,1 100 | xdx58SVBtxisreg,1 101 | Lxdrm3ZBgJ01n7r,1 102 | nPKHjz4XuXLjNat,0 103 | OoZYvAABOK8MpNu,0 104 | bv7jb8IcBPt5Ltu,0 105 | SjIdtw8LPsFqC4H,0 106 | PJqpUjktDxT6Hnv,0 107 | AQTT3ls2mO33hJK,0 108 | cvsPwm76aEMBtiL,0 109 | P7WoOvi9arjw9Y5,0 110 | ugRKYkq4lXlFnCw,0 111 | 8VZYKXzTFPEe5TZ,0 112 | R3aoS4yr0uXNBKn,0 113 | xlBcZLHtiogfmdy,0 114 | iBK8oiA48Ht2Ny2,0 115 | mkkCJ48GPs3QMN8,0 116 | 0PFARBmT6ALY8tS,0 117 | stdjSXIbzYvN3t8,0 118 | PPhUck0Krpe2nRw,0 119 | 3shCPyPOra01iFu,0 120 | RrAvFltqb1TzE5h,0 121 | QTP5HroNmKQeF59,0 122 | TmLjI8nN9gfLWRn,0 123 | TMENYL4j7vEty47,0 124 | NIqL0zJnZLkIU4b,0 125 | htip4jmn20j9N4q,0 126 | 1pcUDVIUWEcp0xv,0 127 | Xs2p7YI1wYpv9eo,0 128 | mAXtAGZc11HSaXl,0 129 | 2nHw31xl5g6RDcX,0 130 | daOsbOvkkpt4Zrv,0 131 | 3nWP6lH7EIJoz9z,0 132 | BrQ2ixqg5JY5b3G,0 133 | aKqDseFXcsFB151,0 134 | jlC7SgoA9sMKpEu,0 135 | xQZzaY4Qrwbcd8X,0 136 | G48sd0cGVKChY0d,0 137 | 
[... response.csv continues with one `<id>,<response>` row per line (all responses 0 in this portion), through line 1002 ...]
--------------------------------------------------------------------------------