├── project
│   ├── build.properties
│   └── build.sbt
├── .travis.yml
├── .gitignore
├── src
│   ├── main
│   │   └── scala
│   │       ├── frl
│   │       │   └── driesprong
│   │       │       └── outlierdetection
│   │       │           └── EvaluateOutlierDetection.scala
│   │       └── org
│   │           └── apache
│   │               └── spark
│   │                   └── ml
│   │                       └── outlierdetection
│   │                           └── StochasticOutlierDetection.scala
│   └── test
│       └── scala
│           └── org
│               └── apache
│                   └── spark
│                       └── ml
│                           └── outlierdetection
│                               ├── StochasticOutlierDetectionIntegrationTest.scala
│                               └── StochasticOutlierDetectionTest.scala
├── cars-outliers.py
├── LICENSE
└── README.md

/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=1.3.2
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: scala
scala:
  - 2.11.12
script:
  - sbt clean coverage test
after_success:
  - sbt coveralls
--------------------------------------------------------------------------------
/project/build.sbt:
--------------------------------------------------------------------------------
resolvers += Classpaths.sbtPluginReleases

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")

addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.2.7")

addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.7")

addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.0-M2")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# Fokko specific
events/*

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet

# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
--------------------------------------------------------------------------------
/src/main/scala/frl/driesprong/outlierdetection/EvaluateOutlierDetection.scala:
--------------------------------------------------------------------------------
package frl.driesprong.outlierdetection

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{concat_ws, lit, max, min}

object EvaluateOutlierDetection {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Stochastic Outlier Selection")
      .getOrCreate()

    var df = spark.read.option("header", "true").csv("data/cardataset.csv")

    val vectorColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "MSRP")

    // Cast the metric columns to doubles and rescale them to [0, 1]
    vectorColumns.foreach { col =>
      df = df.withColumn(col, df(col).cast("Double"))
      val minValue = lit(df.select(min(df(col))).first()(0))
      val maxValue = lit(df.select(max(df(col))).first()(0))
      println("Col " + col + " min " + minValue + ", max: " + maxValue)
      df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
    }

    val assembler = new VectorAssembler().setInputCols(vectorColumns).setOutputCol("vector")

    df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))

    // Drop the rows with missing values and combine the columns into a single vector
    df = assembler.setHandleInvalid("skip").transform(df)

    val output = org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionDf(df)

    output.collect()
  }
}
--------------------------------------------------------------------------------
/cars-outliers.py:
--------------------------------------------------------------------------------
# Run inside a pyspark shell, where `spark` and `sc` are predefined
import math
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, lit, min, max, asc, desc, concat_ws
from pyspark.mllib.common import _java2py

df = spark.read.option("header", "true").csv("data/cardataset.csv")

metric_columns = ["Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP"]

# Scale the columns to [0, 1] and remove the empty ones
for column_name in metric_columns:
    df = df.withColumn(column_name, col(column_name).cast("Double"))
    min_value = df.select(min(col(column_name)).alias("min")).first().asDict()['min']
    max_value = df.select(max(col(column_name)).alias("max")).first().asDict()['max']
    print("Col " + column_name + " min " + str(min_value) + ", max: " + str(max_value))
    df = df.where(col(column_name).isNotNull())
    df = df.withColumn(column_name, (col(column_name) - lit(min_value)) / (lit(max_value) - lit(min_value)))


df = df.withColumn("label", concat_ws(" ", col("Make"), col("Model"), col("Year"), col("Engine Fuel Type"), col("Transmission Type")))

# Count the number of rows
num = df.count()

# Remove the missing vectors and combine all the columns to a single vector
ass = VectorAssembler(inputCols=metric_columns, outputCol="vector")
df = ass.setHandleInvalid("skip").transform(df).repartition(22)

# As perplexity, use the sqrt of the number of rows
perplexity = math.sqrt(num)

# Handle on the JVM so we can reach the Scala implementation
jvm = sc._jvm

sos = jvm.org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionPython(spark._jwrapped, df._jdf, "label", "vector", perplexity, 1e-9, 5000)

# Reconstruct the Python DF
result_df = _java2py(sc, sos)

result_df.orderBy(asc("score")).show(22, False)

result_df.orderBy(desc("score")).show(22, False)
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetectionIntegrationTest.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.scalatest._

class StochasticOutlierDetectionIntegrationTest extends FlatSpec with Matchers with BeforeAndAfter {

  "Running the SOS algorithm " should "give some sensible outcome" in {

    val partitions = 22

    val spark = SparkSession
      .builder()
      .master("local[*]")
      .config("spark.driver.allowMultipleContexts", value = true)
      .getOrCreate()

    var df = spark.read.option("header", "true").csv("data/cardataset.csv")

    val metricColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP")

    metricColumns.foreach { col =>
      df = df.withColumn(col, df(col).cast("Double"))
      val minValue = lit(df.select(min(df(col))).first()(0))
      val maxValue = lit(df.select(max(df(col))).first()(0))
      println("Col " + col + " min " + minValue + ", max: " + maxValue)
      df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
    }

    val ass = new VectorAssembler().setInputCols(metricColumns).setOutputCol("vector")

    df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))

    df = ass.setHandleInvalid("skip").transform(df)

    val num = df.count()

    val output = StochasticOutlierDetection.performOutlierDetectionDf(df.repartition(partitions), perplexity = Math.sqrt(num))

    val result = spark.createDataFrame(output).toDF("label", "score").cache()

    // Every input row should receive exactly one outlier score
    result.count() should be(num)

    spark.stop()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetection.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import breeze.linalg.functions.euclideanDistance
import breeze.linalg.{DenseVector, sum}
import org.apache.spark.ml.linalg.{DenseVector => SparkDenseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}

object StochasticOutlierDetection {
  val defaultPerplexity = 30.0
  val defaultEps = 1e-12
  val defaultIterations = 5000

  /**
   * Helper function to make the algorithm available to the Python world
   */
  def performOutlierDetectionPython(sqlContext: SQLContext,
                                    ds: Dataset[Row],
                                    labelColumn: String = "label",
                                    vectorColumn: String = "vector",
                                    perplexity: Double = defaultPerplexity,
                                    eps: Double = defaultEps,
                                    maxIterations: Int = defaultIterations): DataFrame = {
    val rdd = performOutlierDetectionDf(ds, labelColumn, vectorColumn, perplexity, eps, maxIterations)

    sqlContext.createDataFrame(rdd).toDF(labelColumn, "score")
  }

  def performOutlierDetectionDs(ds: Dataset[Row],
                                labelColumn: String = "label",
                                vectorColumn: String = "vector",
                                perplexity: Double = defaultPerplexity,
                                eps: Double = defaultEps,
                                maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    performOutlierDetectionDf(ds, labelColumn, vectorColumn, perplexity, eps, maxIterations)
  }

  def performOutlierDetectionDf(df: DataFrame,
                                labelColumn: String = "label",
                                vectorColumn: String = "vector",
                                perplexity: Double = defaultPerplexity,
                                eps: Double = defaultEps,
                                maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    val rdd = df.select(labelColumn, vectorColumn).rdd
      .map(row => (row.getAs[String](labelColumn), row.getAs[SparkDenseVector](vectorColumn)))
    performOutlierDetectionRdd(rdd, perplexity, eps, maxIterations)
  }

  def performOutlierDetectionRdd(inputVectors: RDD[(String, SparkDenseVector)],
                                 perplexity: Double = defaultPerplexity,
                                 eps: Double = defaultEps,
                                 maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    val withIndices = inputVectors.zipWithIndex().map(_.swap)

    val labels = withIndices.mapValues(_._1)
    val vectors = withIndices.mapValues(_._2)

    // Only pass the vectors in
    val dMatrix = computeDistanceMatrix(vectors)
    val aMatrix = computeAffinityMatrix(dMatrix, perplexity, maxIterations, eps)
    val bMatrix = computeBindingProbabilities(aMatrix)
    val oMatrix = computeOutlierProbability(bMatrix)

    oMatrix.join(labels).map(_._2.swap)
  }


  /**
   * Computes the entropy for a given vector of distances, given a value of beta
   *
   * @param D    The input vector
   * @param beta The given beta
   * @return A tuple of the entropy (the log of the perplexity) and the affinity vector
   */
  private[outlierdetection] def getPerplexity(D: DenseVector[Double], beta: Double): (Double, DenseVector[Double]) = {
    val A = D.map(a => Math.exp(-a * beta))
    val sumA = sum(A)
    val h = Math.log(sumA) + beta * sum(A :* D) / sumA
    (h, A)
  }

  /**
   * Computes the affinity for a given row of the distance matrix
   *
   * @param D             The input vector of the given row
   * @param logPerplexity The log taken from the perplexity
   * @param maxIterations The maximum number of iterations before giving up
   * @param eps           The accepted error
   * @param iteration     The current iteration
   * @param beta          The current approximated beta
   * @param betaMin       The lower bound of the beta
   * @param betaMax       The upper bound of the beta
   * @return The affinity vector whose entropy matches the requested log-perplexity within eps
   */
  @scala.annotation.tailrec
  private[outlierdetection] def binarySearch(D: DenseVector[Double],
                                             logPerplexity: Double,
                                             maxIterations: Int,
                                             eps: Double,
                                             iteration: Int = 0,
                                             beta: Double = 1.0,
                                             betaMin: Double = Double.NegativeInfinity,
                                             betaMax: Double = Double.PositiveInfinity): DenseVector[Double] = {

    val (h, matA) = getPerplexity(D, beta)
    val hDiff = h - logPerplexity

    if (iteration < maxIterations && Math.abs(hDiff) > eps) {
      val (newBeta, newBetaMin, newBetaMax) = if (hDiff.isNaN) {
        // If the beta is too high, it might result in a NaN
        (beta / 10.0, betaMin, betaMax)
      } else {
        if (hDiff > 0)
          (if (betaMax == Double.PositiveInfinity || betaMax == Double.NegativeInfinity)
            beta * 2.0
          else
            (beta + betaMax) / 2.0, beta, betaMax)
        else
          (if (betaMin == Double.PositiveInfinity || betaMin == Double.NegativeInfinity)
            beta / 2.0
          else
            (beta + betaMin) / 2.0, betaMin, beta)
      }

      binarySearch(D, logPerplexity, maxIterations, eps, iteration + 1, newBeta, newBetaMin, newBetaMax)
    }
    else
      matA
  }

  /**
   * Converts the distances into affinities by approximating the beta of each row
   * through a binary search. The search stops once the perplexity is within the
   * given tolerance, or when the maximum number of iterations is exhausted.
   *
   * @param dMatrix       The distance matrix
   * @param perplexity    The perplexity
   * @param maxIterations The maximum number of iterations before stopping
   * @param eps           The accepted error to stop early
   * @return The affinity matrix
   */
  private[outlierdetection] def computeAffinityMatrix(dMatrix: RDD[(Long, Array[Double])],
                                                      perplexity: Double = defaultPerplexity,
                                                      maxIterations: Int = defaultIterations,
                                                      eps: Double = defaultEps): RDD[(Long, DenseVector[Double])] =
    dMatrix.mapValues(r => binarySearch(new DenseVector(r), Math.log(perplexity), maxIterations, eps))

  /**
   * Scales the binding probabilities by dividing the values in the vector by the total sum
   *
   * @param rows The affinity values
   * @return Scaled affinity values where the sum is equal to 1.0
   */
  private[outlierdetection] def computeBindingProbabilities(rows: RDD[(Long, DenseVector[Double])]): RDD[(Long, Array[Double])] =
    rows.mapValues(row => (row :/ sum(row)).toArray)

  /**
   * Accepts an RDD of (Index, Vector) and computes the distance from each vector to every
   * other vector. The diagonal is removed, since for computing the affinity we do not want
   * to include the distance of a vector to itself.
   *
   * @param data RDD of (Index, Vector) as input
   * @return RDD of (Index, Vector) where the position in the vector is the Index of the other vector
   */
  private[outlierdetection] def computeDistanceMatrix(data: RDD[(Long, SparkDenseVector)]): RDD[(Long, Array[Double])] =
    data
      .cache()
      .cartesian(data)
      .map(row => row._1._1 -> (row._2._1, euclideanDistance(row._1._2.asBreeze, row._2._2.asBreeze)))
      // Remove the distance to itself, i.e. the diagonal of the matrix
      .filter(row => row._1 != row._2._1)
      .groupByKey()
      .mapValues(_.toArray.sortBy(_._1).map(_._2))

  /**
   * Multiplies, for each observation, the terms (1 - binding probability) over all other
   * observations, which yields the final outlier probability in [0, 1]
   *
   * @param rows The rows with the binding probabilities
   * @return The outlier probabilities for each of the observations
   */
  def computeOutlierProbability(rows: RDD[(Long, Array[Double])]): RDD[(Long, Double)] =
    rows
      .flatMap(row =>
        row._2.zipWithIndex.map(value => {
          // Shift the column index past the removed diagonal, so every value is keyed
          // by the observation it refers to
          val beyondDiagonal = if (value._2 >= row._1) 1L else 0L
          (value._2 + beyondDiagonal, value._1)
        })
      )
      .groupByKey()
      .mapValues(vector => vector.fold(1.0)((a, b) => a * (1.0 - b)))

}
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetectionTest.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import breeze.linalg.{DenseVector, sum}
import org.apache.spark.ml.linalg.{DenseVector => SparkDenseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest._

// Unit tests based on the reference Python implementation at https://github.com/jeroenjanssens/sos
class StochasticOutlierDetectionTest extends FlatSpec with Matchers with BeforeAndAfter {

  val spark: SparkSession = SparkSession
    .builder()
    .master("local[*]")
    .config("spark.driver.allowMultipleContexts", value = true)
    .getOrCreate()

  val perplexity = 3.0

  val epsilon = 1e-9f
  implicit val doubleEq: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(epsilon)

  "Computing the distance matrix " should "give symmetrical distances" in {

    val seqData: Seq[(Long, SparkDenseVector)] = Seq(
      (0L, new SparkDenseVector(Array(1.0, 3.0))),
      (1L, new SparkDenseVector(Array(5.0, 1.0))),
      (2L, new SparkDenseVector(Array(2.2, 2.2)))
    )

    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(seqData)

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data).collectAsMap()

    dMatrix.size should be(seqData.length)
    // No diagonal
    dMatrix.head._2.length should be(seqData.length - 1)

    dMatrix(0)(0) should be(dMatrix(1)(0))
    dMatrix(0)(1) should be(dMatrix(2)(0))
  }

  "Computing the distance matrix " should "give the correct distances" in {

    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 2.0))),
        (2L, new SparkDenseVector(Array(5.0, 1.0)))
      ))

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data).collectAsMap()

    dMatrix(0L) should be(Array(Math.sqrt(2.0), Math.sqrt(Math.pow(1.0 - 5.0, 2) + Math.pow(1.0 - 1.0, 2))))
    dMatrix(1L) should be(Array(Math.sqrt(2.0), Math.sqrt(Math.pow(2.0 - 5.0, 2) + Math.pow(2.0 - 1.0, 2))))
    dMatrix(2L) should be(Array(Math.sqrt(16.0), Math.sqrt(10.0)))
  }

  "Computing the perplexity of the vector " should "give the correct error" in {

    val vector = new DenseVector(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 9.0, 10.0))

    val output = Array(
      3.67879441e-01,
      1.35335283e-01,
      4.97870684e-02,
      1.83156389e-02,
      6.73794700e-03,
      2.47875218e-03,
      3.35462628e-04,
      1.23409804e-04,
      4.53999298e-05
    )

    // With this loose tolerance (eps = 1.0) the search accepts the initial beta of 1.0,
    // so the expected affinities are simply exp(-D)
    val search = StochasticOutlierDetection.binarySearch(vector, Math.log(perplexity), 500, eps = 1.0).toArray

    assert(search.length == output.length)
    search.zip(output).foreach(v => assert(v._1 === v._2))
  }

  "Computing the perplexity" should "give the correct perplexity" in {

    val output = StochasticOutlierDetection.getPerplexity(DenseVector(1.0, 2.0, 3.0, 4.0), 3)

    /*
    >>> get_perplexity(np.array([1, 2, 3, 4]), 3)
    (0.2081763951839819, array([4.97870684e-02, 2.47875218e-03, 1.23409804e-04, 6.14421235e-06]))
    */

    output._1 should be(0.2081763951839819)
    output._2 should be(DenseVector(0.049787068367863944, 0.0024787521766663585, 1.2340980408667956E-04, 6.14421235332821E-06))
  }

  "Compute the affinity" should "give the correct affinity" in {

    // The datapoints
    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 1.0))),
        (2L, new SparkDenseVector(Array(1.0, 2.0))),
        (3L, new SparkDenseVector(Array(2.0, 2.0))),
        (4L, new SparkDenseVector(Array(5.0, 8.0))) // The outlier!
      ))

    /*
    Spark output of the distance matrix (with the diagonal still included):
    +---+--------------------------------------------------------------------------------+
    |_1 |_2                                                                              |
    +---+--------------------------------------------------------------------------------+
    |0  |[0.0, 1.0, 1.0, 1.4142135623730951, 8.06225774829855]                           |
    |1  |[1.0, 0.0, 1.4142135623730951, 1.0, 7.615773105863909]                          |
    |2  |[1.0, 1.4142135623730951, 0.0, 1.0, 7.211102550927978]                          |
    |3  |[1.4142135623730951, 1.0, 1.0, 0.0, 6.708203932499369]                          |
    |4  |[8.06225774829855, 7.615773105863909, 7.211102550927978, 6.708203932499369, 0.0]|
    +---+--------------------------------------------------------------------------------+

    Reference output of the Python implementation:

    df = pd.DataFrame([
      [1.0, 1.0],
      [2.0, 1.0],
      [1.0, 2.0],
      [2.0, 2.0],
      [5.0, 8.0]
    ])

    >>> D = distance.squareform(distance.pdist(df, 'euclidean'))
    >>> D
    array([[0.        , 1.        , 1.        , 1.41421356, 8.06225775],
           [1.        , 0.        , 1.41421356, 1.        , 7.61577311],
           [1.        , 1.41421356, 0.        , 1.        , 7.21110255],
           [1.41421356, 1.        , 1.        , 0.        , 6.70820393],
           [8.06225775, 7.61577311, 7.21110255, 6.70820393, 0.        ]])
    */

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data)

    val dMatrixLocal = dMatrix.collectAsMap()

    dMatrixLocal.size should be(5)
    // No diagonal
    dMatrixLocal.head._2.length should be(4)

    dMatrixLocal(0) should be(Array(1.0, 1.0, 1.4142135623730951, 8.06225774829855))
    dMatrixLocal(1) should be(Array(1.0, 1.4142135623730951, 1.0, 7.615773105863909))
    dMatrixLocal(2) should be(Array(1.0, 1.4142135623730951, 1.0, 7.211102550927978))
    dMatrixLocal(3) should be(Array(1.4142135623730951, 1.0, 1.0, 6.708203932499369))
    dMatrixLocal(4) should be(Array(8.06225774829855, 7.615773105863909, 7.211102550927978, 6.708203932499369))

    val aMatrix = StochasticOutlierDetection.computeAffinityMatrix(
      dMatrix,
      perplexity).collectAsMap()

    /*
    Reference output:
    >>> A = d2a(D)
    >>> A
    array([[0.00000000e+00, 4.64662766e-01, 4.64662766e-01, 3.38268740e-01, 2.07195222e-03],
           [4.48046270e-01, 0.00000000e+00, 3.21289157e-01, 4.48046270e-01, 2.21082346e-03],
           [4.31925257e-01, 3.05063254e-01, 0.00000000e+00, 4.31925257e-01, 2.34905955e-03],
           [2.83704490e-01, 4.10315559e-01, 4.10315559e-01, 0.00000000e+00, 2.53931484e-03],
           [1.65024585e-06, 3.44967767e-06, 6.73004987e-06, 1.54422171e-05, 0.00000000e+00]])
    */

    aMatrix.size should be(5)
    aMatrix.head._2.size should be(4)
    aMatrix(0) should be(DenseVector(0.46466276524892347, 0.46466276524892347, 0.3382687394706771, 0.002071952211481348))
    aMatrix(1) should be(DenseVector(0.44804626736592407, 0.32128915387335244, 0.44804626736592407, 0.002210823345964273))
    aMatrix(2) should be(DenseVector(0.43192525600789167, 0.3050632526240005, 0.43192525600789167, 0.0023490595179782026))
    aMatrix(3) should be(DenseVector(0.2837044890481323, 0.41031555870116004, 0.41031555870116004, 0.0025393148189380038))
    aMatrix(4) should be(DenseVector(1.6502458086112328E-6, 3.4496775759417726E-6, 6.730049701899862E-6, 1.544221669896851E-5))
  }

  "Verify the binding probabilities " should "give the correct probabilities" in {

    // The affinity matrix (the rows hold affinities, not distances)
    val aMatrix = spark.sparkContext.parallelize(
      Seq(
        (0L, new DenseVector(Array(6.61626106e-112, 1.27343495e-088))),
        (1L, new DenseVector(Array(2.21858114e-020, 1.12846575e-044))),
        (2L, new DenseVector(Array(1.48949023e-010, 1.60381089e-028)))
      ))

    val bMatrix = StochasticOutlierDetection.computeBindingProbabilities(aMatrix).map(_._2).sortBy(dist => sum(dist)).collect()

    assert(bMatrix(0)(0) === 5.19560192e-24)
    assert(bMatrix(0)(1) === 1.00000000e+00)

    assert(bMatrix(1)(0) === 1.00000000e+00)
    assert(bMatrix(1)(1) === 5.08642993e-25)

    assert(bMatrix(2)(0) === 1.00000000e+00)
    assert(bMatrix(2)(1) === 1.07675154e-18)
  }

  "Verifying the product " should "provide valid products" in {

    val data = spark.sparkContext.parallelize(
      Seq(
        (0L, Array(/*0.0,*/ 0.5, 0.3)),
        (1L, Array(0.25, /*0.0,*/ 0.1)),
        (2L, Array(0.8, 0.8 /*, 0.0*/))
      ))

    val oMatrix = StochasticOutlierDetection.computeOutlierProbability(data).collectAsMap()

    /*
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> df = pd.DataFrame([[0.0, 0.5, 0.3],
    ...                    [0.25, 0.0, 0.1],
    ...                    [0.8, 0.8, 0.0]])
    >>>
    >>> np.prod(1 - df, 0)
    0    0.15
    1    0.10
    2    0.63
    */

    val out0 = (1.0 - 0.0) * (1.0 - 0.25) * (1.0 - 0.8) // 0.15
    val out1 = (1.0 - 0.5) * (1.0 - 0.0) * (1.0 - 0.8) // 0.10
    val out2 = (1.0 - 0.3) * (1.0 - 0.1) * (1.0 - 0.0) // 0.63

    assert(oMatrix.size == 3)

    assert(oMatrix(0) === out0)
    assert(oMatrix(1) === out1)
    assert(oMatrix(2) === out2)
  }

  "Verifying the output of the SOS algorithm " should "assign the one true outlier" in {

    // The datapoints
    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 1.0))),
        (2L, new SparkDenseVector(Array(1.0, 2.0))),
        (3L, new SparkDenseVector(Array(2.0, 2.0))),
        (4L, new SparkDenseVector(Array(5.0, 8.0))) // The outlier!
      ))

    // Process the steps of the algorithm
    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data)

    val aMatrix = StochasticOutlierDetection.computeAffinityMatrix(
      dMatrix,
      perplexity)

    val bMatrix = StochasticOutlierDetection.computeBindingProbabilities(aMatrix)

    val oMatrix = StochasticOutlierDetection.computeOutlierProbability(bMatrix)

    // Return the scores to the driver
    val output = oMatrix.collectAsMap()

    assert(output.size == 5)
    assert(output(0) === 0.27900944792028953)
    assert(output(1) === 0.25775014551682535)
    assert(output(2) === 0.22136130977995763)
    assert(output(3) === 0.12707053787018444)
    assert(output(4) === 0.99227799024537555184) // The outlier!
  }

}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![License](http://img.shields.io/:license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.txt)
[![Build Status](https://travis-ci.org/Fokko/spark-stochastic-outlier-selection.svg?branch=master)](https://travis-ci.org/Fokko/spark-stochastic-outlier-selection)
[![Coverage Status](https://coveralls.io/repos/Fokko/spark-stochastic-outlier-selection/badge.svg?branch=master&service=github)](https://coveralls.io/github/Fokko/spark-stochastic-outlier-selection?branch=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/frl.driesprong/spark-stochastic-outlier-selection_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/frl.driesprong/spark-stochastic-outlier-selection_2.11)

# Stochastic Outlier Selection on Apache Spark

Stochastic Outlier Selection (SOS) is an unsupervised outlier selection algorithm. It uses the concept of affinity to compute an outlier probability for each data point.

For more information about SOS, see the technical report: J.H.M. Janssens, F. Huszar, E.O. Postma, and H.J. van den Herik. [Stochastic Outlier Selection](https://github.com/jeroenjanssens/sos/blob/master/doc/sos-ticc-tr-2012-001.pdf?raw=true). Technical Report TiCC TR 2012-001, Tilburg University, Tilburg, the Netherlands, 2012.

## Selecting outliers from data

The current implementation accepts an RDD of `(String, DenseVector)` pairs, i.e. a label together with a feature vector, and returns each label paired with its degree of outlierness.

The current implementation only supports the Euclidean distance, but this can be extended.
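As a minimal sketch of that RDD interface (a made-up five-point dataset; this assumes a running `SparkSession` named `spark` and the library on the classpath):

```
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection

// Pair every observation with a label
val input = spark.sparkContext.parallelize(Seq(
  ("a", new DenseVector(Array(1.0, 1.0))),
  ("b", new DenseVector(Array(2.0, 1.0))),
  ("c", new DenseVector(Array(1.0, 2.0))),
  ("d", new DenseVector(Array(2.0, 2.0))),
  ("e", new DenseVector(Array(5.0, 8.0))) // the outlier
))

// Returns an RDD of (label, outlier probability in [0, 1])
val scores = StochasticOutlierDetection.performOutlierDetectionRdd(input, perplexity = 3.0)
scores.collect().foreach(println)
```

Internally, the score of a point is the product of `1 - p` over the binding probabilities `p` that all the other points assign to it, so a point that no other point considers a close neighbour ends up with a probability close to 1.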
# Example

As a small example of how to use the algorithm, we have the following dataset:

```
scala> val df = spark.read.option("header", "true").csv("data/cardataset.csv")
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 14 more fields]

scala> df.show()
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
|Make|     Model|Year|    Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|    Driven_Wheels|Number of Doors|     Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity| MSRP|
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
| BMW|1 Series M|2011|premium unleaded ...|      335|               6|           MANUAL| rear wheel drive|              2|Factory Tuner,Lux...|     Compact|        Coupe|         26|      19|      3916|46135|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         28|      19|      3916|40650|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|36350|
| BMW|  1 Series|2011|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      18|      3916|29450|
| BMW|  1 Series|2011|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      18|      3916|34500|
| BMW|  1 Series|2012|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      18|      3916|31200|
| BMW|  1 Series|2012|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         26|      17|      3916|44100|
| BMW|  1 Series|2012|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|39300|
| BMW|  1 Series|2012|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      18|      3916|36900|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         27|      18|      3916|37200|
| BMW|  1 Series|2013|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|39600|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      19|      3916|31500|
| BMW|  1 Series|2013|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         28|      19|      3916|44400|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      19|      3916|37200|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      19|      3916|31500|
| BMW|  1 Series|2013|premium unleaded ...|      320|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|  Convertible|         25|      18|      3916|48250|
| BMW|  1 Series|2013|premium unleaded ...|      320|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|43550|
|Audi|       100|1992|    regular unleaded|      172|               6|           MANUAL|front wheel drive|              4|              Luxury|     Midsize|        Sedan|         24|      17|      3105| 2000|
|Audi|       100|1992|    regular unleaded|      172|               6|           MANUAL|front wheel drive|              4|              Luxury|     Midsize|        Sedan|         24|      17|      3105| 2000|
|Audi|       100|1992|    regular unleaded|      172|               6|        AUTOMATIC|  all wheel drive|              4|              Luxury|     Midsize|        Wagon|         20|      16|      3105| 2000|
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
only showing top 20 rows
```

Borrowed from Kaggle: https://www.kaggle.com/CooperUnion/cardataset. The data is scraped from Edmunds, which serves the US, so we might expect a somewhat different mix of cars compared to Europe or elsewhere.

The run might take some time since the algorithm is [quadratic in runtime](https://en.wikipedia.org/wiki/Big_O_notation): for this example we feed in 11,816 rows, which boils down to a dense 11,816 x 11,816 distance matrix of 139,617,856 doubles (roughly 1.1 GB at 8 bytes per double).

## SOS using Scala

```
MacBook-Pro-van-Fokko:spark-stochastic-outlier-selection fokkodriesprong$ spark-shell --jars target/scala-2.11/spark-stochastic-outlier-selection_2.11-0.1.0.jar
19/09/24 12:22:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://192.168.185.146:4040
Spark context available as 'sc' (master = local[*], app id = local-1569320579133).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_172)
Type in expressions to have them evaluated.
Type :help for more information.

scala> val partitions = 22
partitions: Int = 22

scala> var df = spark.read.option("header", "true").csv("data/cardataset.csv")
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 14 more fields]

scala> val metricColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP")
metricColumns: Array[String] = Array(Engine HP, Engine Cylinders, highway MPG, city mpg, Popularity, MSRP)

scala> metricColumns.foreach { col =>
     |   df = df.withColumn(col, df(col).cast("Double"))
     |   val minValue = lit(df.select(min(df(col))).first()(0))
     |   val maxValue = lit(df.select(max(df(col))).first()(0))
     |   println("Col " + col + " min " + minValue + ", max: " + maxValue)
     |   df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
     | }
Col Engine HP min 55.0, max: 1001.0
Col Engine Cylinders min 0.0, max: 16.0
Col highway MPG min 12.0, max: 354.0
Col city mpg min 7.0, max: 137.0
Col Popularity min 2.0, max: 5657.0
Col MSRP min 2000.0, max: 2065902.0

scala> import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.VectorAssembler

scala> val ass = new VectorAssembler().setInputCols(metricColumns).setOutputCol("vector")
ass: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_20553ab89e72

scala> df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 15 more fields]

scala> df = ass.setHandleInvalid("skip").transform(df)
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 16 more fields]

scala> val num = df.count()
num: Long = 11816

scala> import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection
import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection

scala> val output = StochasticOutlierDetection.performOutlierDetectionDf(df.repartition(partitions), perplexity = Math.sqrt(num))
output: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[124] at map at StochasticOutlierDetection.scala:68

scala> val result = spark.createDataFrame(output).toDF("label", "score").cache()
result: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: string, score: double]

scala> result.orderBy(asc("score")).show(22, false)
+---------------------------------------------------------------+-------------------+
|label                                                          |score              |
+---------------------------------------------------------------+-------------------+
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.21279828599808828|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21293609075531097|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.2131071436509756 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.213830724190521  |
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.213858747973501  |
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21396034795422958|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21396720338838232|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140147473705729 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21406002357897783|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21415665139143406|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21418025039928237|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21448192959828502|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21649415207000774|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21656724572039546|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21659862416390824|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21665169162695752|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21667668432518386|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.2167653474971194 |
|Infiniti EX 2011 premium unleaded (recommended) AUTOMATIC      |0.21723682724336277|
|Infiniti EX 2012 premium unleaded (recommended) AUTOMATIC      |0.21725696468322728|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21726098834132695|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.2172733380874455 |
+---------------------------------------------------------------+-------------------+
only showing top 22 rows


scala> result.orderBy(desc("score")).show(22, false)
+----------------------------------------------------------------------+------------------+
|label                                                                 |score             |
+----------------------------------------------------------------------+------------------+
|Audi A6 2017 premium unleaded (recommended) AUTOMATED_MANUAL          |0.9999142633116896|
|Porsche 718 Cayman 2017 premium unleaded (required) MANUAL            |0.9805813377048511|
|Rolls-Royce Corniche 2001 premium unleaded (required) AUTOMATIC       |0.9799020737648789|
|Acura NSX 2017 premium unleaded (required) AUTOMATED_MANUAL           |0.9686265528700773|
|Volkswagen Touareg 2 2008 diesel AUTOMATIC                            |0.9620829288975413|
|Mitsubishi Mighty Max Pickup 1994 regular unleaded MANUAL             |0.9484213630955308|
|Chrysler Aspen 2009 regular unleaded AUTOMATIC                        |0.9271092903462673|
|Oldsmobile Cutlass Ciera 1994 regular unleaded AUTOMATIC              |0.9119041256238049|
|Volkswagen Touareg 2015 premium unleaded (recommended) AUTOMATIC      |0.9067141244279047|
|Chrysler TC 1990 regular unleaded MANUAL                              |0.8942706414341941|
|BMW 7 Series 2015 premium unleaded (required) AUTOMATIC               |0.887326636862905 |
|Ferrari Enzo 2003 premium unleaded (required) AUTOMATED_MANUAL        |0.876073945508002 |
|Audi 80 1990 regular unleaded MANUAL                                  |0.8712155642417887|
|Mitsubishi Vanwagon 1990 regular unleaded AUTOMATIC                   |0.8604594927230855|
|Lamborghini Reventon 2008 premium unleaded (required) AUTOMATED_MANUAL|0.8569533527940364|
|Saab 900 1996 regular unleaded MANUAL                                 |0.8517167561224662|
|Ford Focus RS 2017 premium unleaded (recommended) MANUAL              |0.8369213995663755|
|Hyundai Elantra 2017 regular unleaded AUTOMATED_MANUAL                |0.8145671662996115|
|Ford Focus 2017 regular unleaded MANUAL                               |0.8080970764431054|
|Mercedes-Benz E-Class 2015 premium unleaded (required) AUTOMATIC      |0.8045429849252246|
|BMW M4 GTS 2016 premium unleaded (required) AUTOMATED_MANUAL          |0.7994977767039853|
|Chevrolet Cruze 2015 diesel AUTOMATIC                                 |0.7935704861212711|
+----------------------------------------------------------------------+------------------+
only showing top 22 rows
```
## SOS using PySpark

```
MacBook-Pro-van-Fokko:spark-stochastic-outlier-selection fokkodriesprong$ pyspark --jars target/scala-2.11/spark-stochastic-outlier-selection_2.11-0.1.0.jar
Python 3.6.6 (v3.6.6:4cf1f54eb7, Jun 26 2018, 19:50:54)
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
19/09/24 12:11:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.6.6 (v3.6.6:4cf1f54eb7, Jun 26 2018 19:50:54)
SparkSession available as 'spark'.
>>> import math
>>> from pyspark.ml.feature import VectorAssembler
>>> from pyspark.sql.functions import col, lit, min, max, asc, desc, concat_ws
>>> from pyspark.mllib.common import _java2py
>>>
>>> df = spark.read.option("header", "true").csv("data/cardataset.csv")
>>>
>>> metric_columns = ["Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP"]
>>>
>>> # Scale the columns and remove the empty ones
... for column_name in metric_columns:
...     df = df.withColumn(column_name, col(column_name).cast("Double"))
...     minValue = lit(df.select(min(col(column_name)).alias("min")).first().asDict()['min'])
...     maxValue = lit(df.select(max(col(column_name)).alias("max")).first().asDict()['max'])
...     print("Col " + column_name + " min " + str(minValue) + ", max: " + str(maxValue))
...     df = df.where(col(column_name).isNotNull())
...     df = df.withColumn(column_name, (col(column_name) - minValue) / (maxValue - minValue))
...
Col Engine HP min Column, max: Column
Col Engine Cylinders min Column, max: Column
Col highway MPG min Column, max: Column
Col city mpg min Column, max: Column
Col Popularity min Column, max: Column
Col MSRP min Column, max: Column
>>>
>>> df = df.withColumn("label", concat_ws(" ", col("Make"), col("Model"), col("Year"), col("Engine Fuel Type"), col("Transmission Type")))
>>>
>>> # Count the number of rows
... num = df.count()
>>> num
11816
>>>
>>> # Remove the missing vectors and combine all the columns to a single vector
... ass = VectorAssembler(inputCols=metric_columns, outputCol="vector")
>>> df = ass.setHandleInvalid("skip").transform(df).repartition(22)
>>>
>>> # As perplexity, use the sqrt of the number of rows
... perplexity = math.sqrt(num)
>>>
>>> # Some helpers for the Java reference objects
... jvm = sc._jvm
>>>
>>> sos = jvm.org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionPython(spark._jwrapped, df._jdf, "label", "vector", perplexity, 1e-9, 5000)
>>>
>>> # Reconstruct the Python DF
... result_df = _java2py(sc, sos)
>>>
>>> result_df.orderBy(asc("score")).show(22, False)
+---------------------------------------------------------------+-------------------+
|label                                                          |score              |
+---------------------------------------------------------------+-------------------+
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.2127982860017602 |
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21293609075267098|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21310714365721425|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21383072418134913|
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.21385874797566645|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21396034794647772|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21396720338004566|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140147473677278 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140600235686727 |
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21415665138439838|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21418025039550595|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21448192959623724|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21649415206093467|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21656724570838756|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21659862415217437|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21665169161813994|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21667668431185894|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21676534748163712|
|Infiniti EX 2011 premium unleaded (recommended) AUTOMATIC      |0.2172368272372638 |
|Infiniti EX 2012 premium unleaded (recommended) AUTOMATIC      |0.21725696467803174|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.2172609883362982 |
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21727333807551774|
+---------------------------------------------------------------+-------------------+
only showing top 22 rows

>>> result_df.orderBy(desc("score")).show(22, False)
+----------------------------------------------------------------------+------------------+
|label                                                                 |score             |
+----------------------------------------------------------------------+------------------+
|Audi A6 2017 premium unleaded (recommended) AUTOMATED_MANUAL          |0.9999142633116839|
|Porsche 718 Cayman 2017 premium unleaded (required) MANUAL            |0.980581337706238 |
|Rolls-Royce Corniche 2001 premium unleaded (required) AUTOMATIC       |0.9799020737640686|
|Acura NSX 2017 premium unleaded (required) AUTOMATED_MANUAL           |0.9686265528721012|
|Volkswagen Touareg 2 2008 diesel AUTOMATIC                            |0.9620829288975116|
|Mitsubishi Mighty Max Pickup 1994 regular unleaded MANUAL             |0.9484213631039752|
|Chrysler Aspen 2009 regular unleaded AUTOMATIC                        |0.9271092903411216|
|Oldsmobile Cutlass Ciera 1994 regular unleaded AUTOMATIC              |0.911904125634653 |
|Volkswagen Touareg 2015 premium unleaded (recommended) AUTOMATIC      |0.9067141244284443|
|Chrysler TC 1990 regular unleaded MANUAL                              |0.8942706414327495|
|BMW 7 Series 2015 premium unleaded (required) AUTOMATIC               |0.8873266368631819|
|Ferrari Enzo 2003 premium unleaded (required) AUTOMATED_MANUAL        |0.8760739455080375|
|Audi 80 1990 regular unleaded MANUAL                                  |0.871215564244082 |
|Mitsubishi Vanwagon 1990 regular unleaded AUTOMATIC                   |0.8604594927168016|
|Lamborghini Reventon 2008 premium unleaded (required) AUTOMATED_MANUAL|0.8569533527937525|
|Saab 900 1996 regular unleaded MANUAL                                 |0.8517167561268112|
|Ford Focus RS 2017 premium unleaded (recommended) MANUAL              |0.8369213995605174|
|Hyundai Elantra 2017 regular unleaded AUTOMATED_MANUAL                |0.8145671663030841|
|Ford Focus 2017 regular unleaded MANUAL                               |0.8080970764403019|
|Mercedes-Benz E-Class 2015 premium unleaded (required) AUTOMATIC      |0.8045429849258207|
|BMW M4 GTS 2016 premium unleaded (required) AUTOMATED_MANUAL          |0.7994977767050019|
|Chevrolet Cruze 2015 diesel AUTOMATIC                                 |0.7935704861253007|
+----------------------------------------------------------------------+------------------+
only showing top 22 rows
```
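
## Building

To build and test the library locally, something along these lines should work (assuming sbt is installed; the test command mirrors the Travis CI configuration above):

```
sbt clean coverage test   # run the test suite with scoverage enabled, as on CI
sbt package               # build the jar under target/scala-2.11/
```
--------------------------------------------------------------------------------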