├── project
│   ├── build.properties
│   └── build.sbt
├── .travis.yml
├── .gitignore
├── src
│   ├── main
│   │   └── scala
│   │       ├── frl
│   │       │   └── driesprong
│   │       │       └── outlierdetection
│   │       │           └── EvaluateOutlierDetection.scala
│   │       └── org
│   │           └── apache
│   │               └── spark
│   │                   └── ml
│   │                       └── outlierdetection
│   │                           └── StochasticOutlierDetection.scala
│   └── test
│       └── scala
│           └── org
│               └── apache
│                   └── spark
│                       └── ml
│                           └── outlierdetection
│                               ├── StochasticOutlierDetectionIntegrationTest.scala
│                               └── StochasticOutlierDetectionTest.scala
├── cars-outliers.py
├── LICENSE
└── README.md

/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=1.3.2
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: scala
scala:
  - 2.11.12
script:
  - sbt clean coverage test
after_success:
  - sbt coveralls
--------------------------------------------------------------------------------
/project/build.sbt:
--------------------------------------------------------------------------------
resolvers += Classpaths.sbtPluginReleases

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")

addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.2.7")

addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.7")

addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.0-M2")
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# Fokko specific
events/*

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet

# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
--------------------------------------------------------------------------------
/src/main/scala/frl/driesprong/outlierdetection/EvaluateOutlierDetection.scala:
--------------------------------------------------------------------------------
package frl.driesprong.outlierdetection

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{concat_ws, lit, max, min}

object EvaluateOutlierDetection {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Stochastic Outlier Selection")
      .getOrCreate()

    var df = spark.read.option("header", "true").csv("data/cardataset.csv")

    val vectorColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "MSRP")

    // Cast the metric columns to doubles and rescale them to [0, 1]
    vectorColumns.foreach { col =>
      df = df.withColumn(col, df(col).cast("Double"))
      val minValue = lit(df.select(min(df(col))).first()(0))
      val maxValue = lit(df.select(max(df(col))).first()(0))
      println("Col " + col + " min " + minValue + ", max: " + maxValue)
      df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
    }

    val assembler = new VectorAssembler().setInputCols(vectorColumns).setOutputCol("vector")

    df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))

    // Drop the rows with missing values and combine the columns into a single vector
    df = assembler.setHandleInvalid("skip").transform(df)

    val output = org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionDf(df)

    output.collect()
  }
}
--------------------------------------------------------------------------------
/cars-outliers.py:
--------------------------------------------------------------------------------
# Run inside a pyspark shell, where `spark` and `sc` are predefined
import math
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, lit, min, max, asc, desc, concat_ws
from pyspark.mllib.common import _java2py

df = spark.read.option("header", "true").csv("data/cardataset.csv")

metric_columns = ["Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP"]

# Scale the columns to [0, 1] and remove the empty ones
for column_name in metric_columns:
    df = df.withColumn(column_name, col(column_name).cast("Double"))
    min_value = df.select(min(col(column_name)).alias("min")).first().asDict()['min']
    max_value = df.select(max(col(column_name)).alias("max")).first().asDict()['max']
    print("Col " + column_name + " min " + str(min_value) + ", max: " + str(max_value))
    df = df.where(col(column_name).isNotNull())
    df = df.withColumn(column_name, (col(column_name) - lit(min_value)) / (lit(max_value) - lit(min_value)))


df = df.withColumn("label", concat_ws(" ", col("Make"), col("Model"), col("Year"), col("Engine Fuel Type"), col("Transmission Type")))

# Count the number of rows
num = df.count()

# Remove the missing vectors and combine all the columns to a single vector
ass = VectorAssembler(inputCols=metric_columns, outputCol="vector")
df = ass.setHandleInvalid("skip").transform(df).repartition(22)

# As perplexity, use the sqrt of the number of rows
perplexity = math.sqrt(num)

# Handle on the JVM so we can reach the Scala implementation
jvm = sc._jvm

sos = jvm.org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionPython(spark._jwrapped, df._jdf, "label", "vector", perplexity, 1e-9, 5000)

# Reconstruct the Python DF
result_df = _java2py(sc, sos)

result_df.orderBy(asc("score")).show(22, False)

result_df.orderBy(desc("score")).show(22, False)
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetectionIntegrationTest.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.scalatest._

class StochasticOutlierDetectionIntegrationTest extends FlatSpec with Matchers with BeforeAndAfter {

  "Running the SOS algorithm " should "give some sensible outcome" in {

    val partitions = 22

    val spark = SparkSession
      .builder()
      .master("local[*]")
      .config("spark.driver.allowMultipleContexts", value = true)
      .getOrCreate()

    var df = spark.read.option("header", "true").csv("data/cardataset.csv")

    val metricColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP")

    metricColumns.foreach { col =>
      df = df.withColumn(col, df(col).cast("Double"))
      val minValue = lit(df.select(min(df(col))).first()(0))
      val maxValue = lit(df.select(max(df(col))).first()(0))
      println("Col " + col + " min " + minValue + ", max: " + maxValue)
      df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
    }

    val ass = new VectorAssembler().setInputCols(metricColumns).setOutputCol("vector")

    df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))

    df = ass.setHandleInvalid("skip").transform(df)

    val num = df.count()

    val output = StochasticOutlierDetection.performOutlierDetectionDf(df.repartition(partitions), perplexity = Math.sqrt(num))

    val result = spark.createDataFrame(output).toDF("label", "score").cache()

    // Every input row should receive exactly one outlier score
    result.count() should be(num)

    spark.stop()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetection.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import breeze.linalg.functions.euclideanDistance
import breeze.linalg.{DenseVector, sum}
import org.apache.spark.ml.linalg.{DenseVector => SparkDenseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}

object StochasticOutlierDetection {
  val defaultPerplexity = 30.0
  val defaultEps = 1e-12
  val defaultIterations = 5000

  /**
   * Helper function to make the algorithm available to the Python world
   */
  def performOutlierDetectionPython(sqlContext: SQLContext,
                                    ds: Dataset[Row],
                                    labelColumn: String = "label",
                                    vectorColumn: String = "vector",
                                    perplexity: Double = defaultPerplexity,
                                    eps: Double = defaultEps,
                                    maxIterations: Int = defaultIterations): DataFrame = {
    val rdd = performOutlierDetectionDf(ds, labelColumn, vectorColumn, perplexity, eps, maxIterations)

    sqlContext.createDataFrame(rdd).toDF(labelColumn, "score")
  }

  def performOutlierDetectionDs(ds: Dataset[Row],
                                labelColumn: String = "label",
                                vectorColumn: String = "vector",
                                perplexity: Double = defaultPerplexity,
                                eps: Double = defaultEps,
                                maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    performOutlierDetectionDf(ds, labelColumn, vectorColumn, perplexity, eps, maxIterations)
  }

  def performOutlierDetectionDf(df: DataFrame,
                                labelColumn: String = "label",
                                vectorColumn: String = "vector",
                                perplexity: Double = defaultPerplexity,
                                eps: Double = defaultEps,
                                maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    val rdd = df.select(labelColumn, vectorColumn).rdd
      .map(row => (row.getAs[String](labelColumn), row.getAs[SparkDenseVector](vectorColumn)))
    performOutlierDetectionRdd(rdd, perplexity, eps, maxIterations)
  }

  def performOutlierDetectionRdd(inputVectors: RDD[(String, SparkDenseVector)],
                                 perplexity: Double = defaultPerplexity,
                                 eps: Double = defaultEps,
                                 maxIterations: Int = defaultIterations): RDD[(String, Double)] = {

    val withIndices = inputVectors.zipWithIndex().map(_.swap)

    val labels = withIndices.mapValues(_._1)
    val vectors = withIndices.mapValues(_._2)

    // Only pass the vectors in
    val dMatrix = computeDistanceMatrix(vectors)
    val aMatrix = computeAffinityMatrix(dMatrix, perplexity, maxIterations, eps)
    val bMatrix = computeBindingProbabilities(aMatrix)
    val oMatrix = computeOutlierProbability(bMatrix)

    oMatrix.join(labels).map(_._2.swap)
  }


  /**
   * Computes the entropy for a given vector of distances, given a value of beta
   *
   * @param D    The input vector
   * @param beta The given beta
   * @return A tuple of the entropy (the log of the perplexity) and the affinity vector
   */
  private[outlierdetection] def getPerplexity(D: DenseVector[Double], beta: Double): (Double, DenseVector[Double]) = {
    val A = D.map(a => Math.exp(-a * beta))
    val sumA = sum(A)
    val h = Math.log(sumA) + beta * sum(A :* D) / sumA
    (h, A)
  }

  /**
   * Computes the affinity for a given row of the distance matrix
   *
   * @param D             The input vector of the given row
   * @param logPerplexity The log taken from the perplexity
   * @param maxIterations The maximum number of iterations before giving up
   * @param eps           The accepted error
   * @param iteration     The current iteration
   * @param beta          The current approximated beta
   * @param betaMin       The lower bound of the beta
   * @param betaMax       The upper bound of the beta
   * @return The affinity vector whose entropy matches the requested log-perplexity within eps
   */
  @scala.annotation.tailrec
  private[outlierdetection] def binarySearch(D: DenseVector[Double],
                                             logPerplexity: Double,
                                             maxIterations: Int,
                                             eps: Double,
                                             iteration: Int = 0,
                                             beta: Double = 1.0,
                                             betaMin: Double = Double.NegativeInfinity,
                                             betaMax: Double = Double.PositiveInfinity): DenseVector[Double] = {

    val (h, matA) = getPerplexity(D, beta)
    val hDiff = h - logPerplexity

    if (iteration < maxIterations && Math.abs(hDiff) > eps) {
      val (newBeta, newBetaMin, newBetaMax) = if (hDiff.isNaN) {
        // If the beta is too high, it might result in a NaN
        (beta / 10.0, betaMin, betaMax)
      } else {
        if (hDiff > 0)
          (if (betaMax == Double.PositiveInfinity || betaMax == Double.NegativeInfinity)
            beta * 2.0
          else
            (beta + betaMax) / 2.0, beta, betaMax)
        else
          (if (betaMin == Double.PositiveInfinity || betaMin == Double.NegativeInfinity)
            beta / 2.0
          else
            (beta + betaMin) / 2.0, betaMin, beta)
      }

      binarySearch(D, logPerplexity, maxIterations, eps, iteration + 1, newBeta, newBetaMin, newBetaMax)
    }
    else
      matA
  }

  /**
   * Converts the distances into affinities by approximating the beta of each row
   * through a binary search. The search stops once the perplexity is within the
   * given tolerance, or when the maximum number of iterations is exhausted.
   *
   * @param dMatrix       The distance matrix
   * @param perplexity    The perplexity
   * @param maxIterations The maximum number of iterations before stopping
   * @param eps           The accepted error to stop early
   * @return The affinity matrix
   */
  private[outlierdetection] def computeAffinityMatrix(dMatrix: RDD[(Long, Array[Double])],
                                                      perplexity: Double = defaultPerplexity,
                                                      maxIterations: Int = defaultIterations,
                                                      eps: Double = defaultEps): RDD[(Long, DenseVector[Double])] =
    dMatrix.mapValues(r => binarySearch(new DenseVector(r), Math.log(perplexity), maxIterations, eps))

  /**
   * Scales the binding probabilities by dividing the values in the vector by the total sum
   *
   * @param rows The affinity values
   * @return Scaled affinity values where the sum is equal to 1.0
   */
  private[outlierdetection] def computeBindingProbabilities(rows: RDD[(Long, DenseVector[Double])]): RDD[(Long, Array[Double])] =
    rows.mapValues(row => (row :/ sum(row)).toArray)

  /**
   * Accepts an RDD of (Index, Vector) and computes the distance from each vector to every
   * other vector. The diagonal is removed, since for computing the affinity we do not want
   * to include the distance of a vector to itself.
   *
   * @param data RDD of (Index, Vector) as input
   * @return RDD of (Index, Vector) where the position in the vector is the Index of the other vector
   */
  private[outlierdetection] def computeDistanceMatrix(data: RDD[(Long, SparkDenseVector)]): RDD[(Long, Array[Double])] =
    data
      .cache()
      .cartesian(data)
      .map(row => row._1._1 -> (row._2._1, euclideanDistance(row._1._2.asBreeze, row._2._2.asBreeze)))
      // Remove the distance to itself, i.e. the diagonal of the matrix
      .filter(row => row._1 != row._2._1)
      .groupByKey()
      .mapValues(_.toArray.sortBy(_._1).map(_._2))

  /**
   * Multiplies, for each observation, the terms (1 - binding probability) over all other
   * observations, which yields the final outlier probability in [0, 1]
   *
   * @param rows The rows with the binding probabilities
   * @return The outlier probabilities for each of the observations
   */
  def computeOutlierProbability(rows: RDD[(Long, Array[Double])]): RDD[(Long, Double)] =
    rows
      .flatMap(row =>
        row._2.zipWithIndex.map(value => {
          // Shift the column index past the removed diagonal, so every value is keyed
          // by the observation it refers to
          val beyondDiagonal = if (value._2 >= row._1) 1L else 0L
          (value._2 + beyondDiagonal, value._1)
        })
      )
      .groupByKey()
      .mapValues(vector => vector.fold(1.0)((a, b) => a * (1.0 - b)))

}
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/outlierdetection/StochasticOutlierDetectionTest.scala:
--------------------------------------------------------------------------------
package org.apache.spark.ml.outlierdetection

import breeze.linalg.{DenseVector, sum}
import org.apache.spark.ml.linalg.{DenseVector => SparkDenseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalactic.{Equality, TolerantNumerics}
import org.scalatest._

// Unit tests based on the reference Python implementation at https://github.com/jeroenjanssens/sos
class StochasticOutlierDetectionTest extends FlatSpec with Matchers with BeforeAndAfter {

  val spark: SparkSession = SparkSession
    .builder()
    .master("local[*]")
    .config("spark.driver.allowMultipleContexts", value = true)
    .getOrCreate()

  val perplexity = 3.0

  val epsilon = 1e-9f
  implicit val doubleEq: Equality[Double] = TolerantNumerics.tolerantDoubleEquality(epsilon)

  "Computing the distance matrix " should "give symmetrical distances" in {

    val seqData: Seq[(Long, SparkDenseVector)] = Seq(
      (0L, new SparkDenseVector(Array(1.0, 3.0))),
      (1L, new SparkDenseVector(Array(5.0, 1.0))),
      (2L, new SparkDenseVector(Array(2.2, 2.2)))
    )

    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(seqData)

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data).collectAsMap()

    dMatrix.size should be(seqData.length)
    // No diagonal
    dMatrix.head._2.length should be(seqData.length - 1)

    dMatrix(0)(0) should be(dMatrix(1)(0))
    dMatrix(0)(1) should be(dMatrix(2)(0))
  }

  "Computing the distance matrix " should "give the correct distances" in {

    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 2.0))),
        (2L, new SparkDenseVector(Array(5.0, 1.0)))
      ))

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data).collectAsMap()

    dMatrix(0L) should be(Array(Math.sqrt(2.0), Math.sqrt(Math.pow(1.0 - 5.0, 2) + Math.pow(1.0 - 1.0, 2))))
    dMatrix(1L) should be(Array(Math.sqrt(2.0), Math.sqrt(Math.pow(2.0 - 5.0, 2) + Math.pow(2.0 - 1.0, 2))))
    dMatrix(2L) should be(Array(Math.sqrt(16.0), Math.sqrt(10.0)))
  }

  "Computing the perplexity of the vector " should "give the correct error" in {

    val vector = new DenseVector(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0, 9.0, 10.0))

    val output = Array(
      3.67879441e-01,
      1.35335283e-01,
      4.97870684e-02,
      1.83156389e-02,
      6.73794700e-03,
      2.47875218e-03,
      3.35462628e-04,
      1.23409804e-04,
      4.53999298e-05
    )

    // With this loose tolerance (eps = 1.0) the search accepts the initial beta of 1.0,
    // so the expected affinities are simply exp(-D)
    val search = StochasticOutlierDetection.binarySearch(vector, Math.log(perplexity), 500, eps = 1.0).toArray

    assert(search.length == output.length)
    search.zip(output).foreach(v => assert(v._1 === v._2))
  }

  "Computing the perplexity" should "give the correct perplexity" in {

    val output = StochasticOutlierDetection.getPerplexity(DenseVector(1.0, 2.0, 3.0, 4.0), 3)

    /*
    >>> get_perplexity(np.array([1, 2, 3, 4]), 3)
    (0.2081763951839819, array([4.97870684e-02, 2.47875218e-03, 1.23409804e-04, 6.14421235e-06]))
    */

    output._1 should be(0.2081763951839819)
    output._2 should be(DenseVector(0.049787068367863944, 0.0024787521766663585, 1.2340980408667956E-04, 6.14421235332821E-06))
  }

  "Compute the affinity" should "give the correct affinity" in {

    // The datapoints
    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 1.0))),
        (2L, new SparkDenseVector(Array(1.0, 2.0))),
        (3L, new SparkDenseVector(Array(2.0, 2.0))),
        (4L, new SparkDenseVector(Array(5.0, 8.0))) // The outlier!
      ))

    /*
    Spark output of the distance matrix (with the diagonal still included):
    +---+--------------------------------------------------------------------------------+
    |_1 |_2                                                                              |
    +---+--------------------------------------------------------------------------------+
    |0  |[0.0, 1.0, 1.0, 1.4142135623730951, 8.06225774829855]                           |
    |1  |[1.0, 0.0, 1.4142135623730951, 1.0, 7.615773105863909]                          |
    |2  |[1.0, 1.4142135623730951, 0.0, 1.0, 7.211102550927978]                          |
    |3  |[1.4142135623730951, 1.0, 1.0, 0.0, 6.708203932499369]                          |
    |4  |[8.06225774829855, 7.615773105863909, 7.211102550927978, 6.708203932499369, 0.0]|
    +---+--------------------------------------------------------------------------------+

    Reference output of the Python implementation:

    df = pd.DataFrame([
      [1.0, 1.0],
      [2.0, 1.0],
      [1.0, 2.0],
      [2.0, 2.0],
      [5.0, 8.0]
    ])

    >>> D = distance.squareform(distance.pdist(df, 'euclidean'))
    >>> D
    array([[0.        , 1.        , 1.        , 1.41421356, 8.06225775],
           [1.        , 0.        , 1.41421356, 1.        , 7.61577311],
           [1.        , 1.41421356, 0.        , 1.        , 7.21110255],
           [1.41421356, 1.        , 1.        , 0.        , 6.70820393],
           [8.06225775, 7.61577311, 7.21110255, 6.70820393, 0.        ]])
    */

    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data)

    val dMatrixLocal = dMatrix.collectAsMap()

    dMatrixLocal.size should be(5)
    // No diagonal
    dMatrixLocal.head._2.length should be(4)

    dMatrixLocal(0) should be(Array(1.0, 1.0, 1.4142135623730951, 8.06225774829855))
    dMatrixLocal(1) should be(Array(1.0, 1.4142135623730951, 1.0, 7.615773105863909))
    dMatrixLocal(2) should be(Array(1.0, 1.4142135623730951, 1.0, 7.211102550927978))
    dMatrixLocal(3) should be(Array(1.4142135623730951, 1.0, 1.0, 6.708203932499369))
    dMatrixLocal(4) should be(Array(8.06225774829855, 7.615773105863909, 7.211102550927978, 6.708203932499369))

    val aMatrix = StochasticOutlierDetection.computeAffinityMatrix(
      dMatrix,
      perplexity).collectAsMap()

    /*
    Reference output:
    >>> A = d2a(D)
    >>> A
    array([[0.00000000e+00, 4.64662766e-01, 4.64662766e-01, 3.38268740e-01, 2.07195222e-03],
           [4.48046270e-01, 0.00000000e+00, 3.21289157e-01, 4.48046270e-01, 2.21082346e-03],
           [4.31925257e-01, 3.05063254e-01, 0.00000000e+00, 4.31925257e-01, 2.34905955e-03],
           [2.83704490e-01, 4.10315559e-01, 4.10315559e-01, 0.00000000e+00, 2.53931484e-03],
           [1.65024585e-06, 3.44967767e-06, 6.73004987e-06, 1.54422171e-05, 0.00000000e+00]])
    */

    aMatrix.size should be(5)
    aMatrix.head._2.size should be(4)
    aMatrix(0) should be(DenseVector(0.46466276524892347, 0.46466276524892347, 0.3382687394706771, 0.002071952211481348))
    aMatrix(1) should be(DenseVector(0.44804626736592407, 0.32128915387335244, 0.44804626736592407, 0.002210823345964273))
    aMatrix(2) should be(DenseVector(0.43192525600789167, 0.3050632526240005, 0.43192525600789167, 0.0023490595179782026))
    aMatrix(3) should be(DenseVector(0.2837044890481323, 0.41031555870116004, 0.41031555870116004, 0.0025393148189380038))
    aMatrix(4) should be(DenseVector(1.6502458086112328E-6, 3.4496775759417726E-6, 6.730049701899862E-6, 1.544221669896851E-5))
  }

  "Verify the binding probabilities " should "give the correct probabilities" in {

    // The affinity matrix (the rows hold affinities, not distances)
    val aMatrix = spark.sparkContext.parallelize(
      Seq(
        (0L, new DenseVector(Array(6.61626106e-112, 1.27343495e-088))),
        (1L, new DenseVector(Array(2.21858114e-020, 1.12846575e-044))),
        (2L, new DenseVector(Array(1.48949023e-010, 1.60381089e-028)))
      ))

    val bMatrix = StochasticOutlierDetection.computeBindingProbabilities(aMatrix).map(_._2).sortBy(dist => sum(dist)).collect()

    assert(bMatrix(0)(0) === 5.19560192e-24)
    assert(bMatrix(0)(1) === 1.00000000e+00)

    assert(bMatrix(1)(0) === 1.00000000e+00)
    assert(bMatrix(1)(1) === 5.08642993e-25)

    assert(bMatrix(2)(0) === 1.00000000e+00)
    assert(bMatrix(2)(1) === 1.07675154e-18)
  }

  "Verifying the product " should "provide valid products" in {

    val data = spark.sparkContext.parallelize(
      Seq(
        (0L, Array(/*0.0,*/ 0.5, 0.3)),
        (1L, Array(0.25, /*0.0,*/ 0.1)),
        (2L, Array(0.8, 0.8 /*, 0.0*/))
      ))

    val oMatrix = StochasticOutlierDetection.computeOutlierProbability(data).collectAsMap()

    /*
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> df = pd.DataFrame([[0.0, 0.5, 0.3],
    ...                    [0.25, 0.0, 0.1],
    ...                    [0.8, 0.8, 0.0]])
    >>>
    >>> np.prod(1 - df, 0)
    0    0.15
    1    0.10
    2    0.63
    */

    val out0 = (1.0 - 0.0) * (1.0 - 0.25) * (1.0 - 0.8) // 0.15
    val out1 = (1.0 - 0.5) * (1.0 - 0.0) * (1.0 - 0.8) // 0.10
    val out2 = (1.0 - 0.3) * (1.0 - 0.1) * (1.0 - 0.0) // 0.63

    assert(oMatrix.size == 3)

    assert(oMatrix(0) === out0)
    assert(oMatrix(1) === out1)
    assert(oMatrix(2) === out2)
  }

  "Verifying the output of the SOS algorithm " should "assign the one true outlier" in {

    // The datapoints
    val data: RDD[(Long, SparkDenseVector)] = spark.sparkContext.parallelize(
      Seq(
        (0L, new SparkDenseVector(Array(1.0, 1.0))),
        (1L, new SparkDenseVector(Array(2.0, 1.0))),
        (2L, new SparkDenseVector(Array(1.0, 2.0))),
        (3L, new SparkDenseVector(Array(2.0, 2.0))),
        (4L, new SparkDenseVector(Array(5.0, 8.0))) // The outlier!
      ))

    // Process the steps of the algorithm
    val dMatrix = StochasticOutlierDetection.computeDistanceMatrix(data)

    val aMatrix = StochasticOutlierDetection.computeAffinityMatrix(
      dMatrix,
      perplexity)

    val bMatrix = StochasticOutlierDetection.computeBindingProbabilities(aMatrix)

    val oMatrix = StochasticOutlierDetection.computeOutlierProbability(bMatrix)

    // Return the scores to the driver
    val output = oMatrix.collectAsMap()

    assert(output.size == 5)
    assert(output(0) === 0.27900944792028953)
    assert(output(1) === 0.25775014551682535)
    assert(output(2) === 0.22136130977995763)
    assert(output(3) === 0.12707053787018444)
    assert(output(4) === 0.99227799024537555184) // The outlier!
  }

}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![License](http://img.shields.io/:license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.txt)
[![Build Status](https://travis-ci.org/Fokko/spark-stochastic-outlier-selection.svg?branch=master)](https://travis-ci.org/Fokko/spark-stochastic-outlier-selection)
[![Coverage Status](https://coveralls.io/repos/Fokko/spark-stochastic-outlier-selection/badge.svg?branch=master&service=github)](https://coveralls.io/github/Fokko/spark-stochastic-outlier-selection?branch=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/frl.driesprong/spark-stochastic-outlier-selection_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/frl.driesprong/spark-stochastic-outlier-selection_2.11)

# Stochastic Outlier Selection on Apache Spark

Stochastic Outlier Selection (SOS) is an unsupervised outlier selection algorithm. It uses the concept of affinity to compute an outlier probability for each data point.

For more information about SOS, see the technical report: J.H.M. Janssens, F. Huszar, E.O. Postma, and H.J. van den Herik. [Stochastic Outlier Selection](https://github.com/jeroenjanssens/sos/blob/master/doc/sos-ticc-tr-2012-001.pdf?raw=true). Technical Report TiCC TR 2012-001, Tilburg University, Tilburg, the Netherlands, 2012.

## Selecting outliers from data

The current implementation accepts an RDD of `(String, DenseVector)` pairs, i.e. a label together with a feature vector, and returns each label paired with its degree of outlierness.

The current implementation only supports the Euclidean distance, but this can be extended.
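As a minimal sketch of that RDD interface (a made-up five-point dataset; this assumes a running `SparkSession` named `spark` and the library on the classpath):

```
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection

// Pair every observation with a label
val input = spark.sparkContext.parallelize(Seq(
  ("a", new DenseVector(Array(1.0, 1.0))),
  ("b", new DenseVector(Array(2.0, 1.0))),
  ("c", new DenseVector(Array(1.0, 2.0))),
  ("d", new DenseVector(Array(2.0, 2.0))),
  ("e", new DenseVector(Array(5.0, 8.0))) // the outlier
))

// Returns an RDD of (label, outlier probability in [0, 1])
val scores = StochasticOutlierDetection.performOutlierDetectionRdd(input, perplexity = 3.0)
scores.collect().foreach(println)
```

Internally, the score of a point is the product of `1 - p` over the binding probabilities `p` that all the other points assign to it, so a point that no other point considers a close neighbour ends up with a probability close to 1.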
# Example

As a small example of how to use the algorithm, we have the following dataset:

```
scala> val df = spark.read.option("header", "true").csv("data/cardataset.csv")
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 14 more fields]

scala> df.show()
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
|Make|     Model|Year|    Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|    Driven_Wheels|Number of Doors|     Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity| MSRP|
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
| BMW|1 Series M|2011|premium unleaded ...|      335|               6|           MANUAL| rear wheel drive|              2|Factory Tuner,Lux...|     Compact|        Coupe|         26|      19|      3916|46135|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         28|      19|      3916|40650|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|36350|
| BMW|  1 Series|2011|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      18|      3916|29450|
| BMW|  1 Series|2011|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      18|      3916|34500|
| BMW|  1 Series|2012|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      18|      3916|31200|
| BMW|  1 Series|2012|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         26|      17|      3916|44100|
| BMW|  1 Series|2012|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|39300|
| BMW|  1 Series|2012|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      18|      3916|36900|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         27|      18|      3916|37200|
| BMW|  1 Series|2013|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|39600|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      19|      3916|31500|
| BMW|  1 Series|2013|premium unleaded ...|      300|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible|         28|      19|      3916|44400|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|              Luxury|     Compact|  Convertible|         28|      19|      3916|37200|
| BMW|  1 Series|2013|premium unleaded ...|      230|               6|           MANUAL| rear wheel drive|              2|  Luxury,Performance|     Compact|        Coupe|         28|      19|      3916|31500|
| BMW|  1 Series|2013|premium unleaded ...|      320|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|  Convertible|         25|      18|      3916|48250|
| BMW|  1 Series|2013|premium unleaded ...|      320|               6|           MANUAL| rear wheel drive|              2|Luxury,High-Perfo...|     Compact|        Coupe|         28|      20|      3916|43550|
|Audi|       100|1992|    regular unleaded|      172|               6|           MANUAL|front wheel drive|              4|              Luxury|     Midsize|        Sedan|         24|      17|      3105| 2000|
|Audi|       100|1992|    regular unleaded|      172|               6|           MANUAL|front wheel drive|              4|              Luxury|     Midsize|        Sedan|         24|      17|      3105| 2000|
|Audi|       100|1992|    regular unleaded|      172|               6|        AUTOMATIC|  all wheel drive|              4|              Luxury|     Midsize|        Wagon|         20|      16|      3105| 2000|
+----+----------+----+--------------------+---------+----------------+-----------------+-----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
only showing top 20 rows
```

Borrowed from Kaggle: https://www.kaggle.com/CooperUnion/cardataset. The data is scraped from Edmunds, which serves the US, so we might expect a somewhat different mix of cars compared to Europe or elsewhere.

The run might take some time since the algorithm is [quadratic in runtime](https://en.wikipedia.org/wiki/Big_O_notation): for this example we feed in 11,816 rows, which boils down to a dense 11,816 x 11,816 distance matrix of 139,617,856 doubles (roughly 1.1 GB at 8 bytes per double).

## SOS using Scala

```
MacBook-Pro-van-Fokko:spark-stochastic-outlier-selection fokkodriesprong$ spark-shell --jars target/scala-2.11/spark-stochastic-outlier-selection_2.11-0.1.0.jar
19/09/24 12:22:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Spark context Web UI available at http://192.168.185.146:4040
Spark context available as 'sc' (master = local[*], app id = local-1569320579133).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Scala version 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_172)
Type in expressions to have them evaluated.
Type :help for more information.

scala> val partitions = 22
partitions: Int = 22

scala> var df = spark.read.option("header", "true").csv("data/cardataset.csv")
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 14 more fields]

scala> val metricColumns = Array("Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP")
metricColumns: Array[String] = Array(Engine HP, Engine Cylinders, highway MPG, city mpg, Popularity, MSRP)

scala> metricColumns.foreach { col =>
     |   df = df.withColumn(col, df(col).cast("Double"))
     |   val minValue = lit(df.select(min(df(col))).first()(0))
     |   val maxValue = lit(df.select(max(df(col))).first()(0))
     |   println("Col " + col + " min " + minValue + ", max: " + maxValue)
     |   df = df.withColumn(col, (df(col) - minValue) / (maxValue - minValue))
     | }
Col Engine HP min 55.0, max: 1001.0
Col Engine Cylinders min 0.0, max: 16.0
Col highway MPG min 12.0, max: 354.0
Col city mpg min 7.0, max: 137.0
Col Popularity min 2.0, max: 5657.0
Col MSRP min 2000.0, max: 2065902.0

scala> import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.VectorAssembler

scala> val ass = new VectorAssembler().setInputCols(metricColumns).setOutputCol("vector")
ass: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_20553ab89e72

scala> df = df.withColumn("label", concat_ws(" ", df("Make"), df("Model"), df("Year"), df("Engine Fuel Type"), df("Transmission Type")))
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 15 more fields]

scala> df = ass.setHandleInvalid("skip").transform(df)
df: org.apache.spark.sql.DataFrame = [Make: string, Model: string ... 16 more fields]

scala> val num = df.count()
num: Long = 11816

scala> import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection
import org.apache.spark.ml.outlierdetection.StochasticOutlierDetection

scala> val output = StochasticOutlierDetection.performOutlierDetectionDf(df.repartition(partitions), perplexity = Math.sqrt(num))
output: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[124] at map at StochasticOutlierDetection.scala:68

scala> val result = spark.createDataFrame(output).toDF("label", "score").cache()
result: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: string, score: double]

scala> result.orderBy(asc("score")).show(22, false)
+---------------------------------------------------------------+-------------------+
|label                                                          |score              |
+---------------------------------------------------------------+-------------------+
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.21279828599808828|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21293609075531097|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.2131071436509756 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.213830724190521  |
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.213858747973501  |
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21396034795422958|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21396720338838232|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140147473705729 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21406002357897783|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21415665139143406|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21418025039928237|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21448192959828502|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21649415207000774|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21656724572039546|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21659862416390824|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21665169162695752|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21667668432518386|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.2167653474971194 |
|Infiniti EX 2011 premium unleaded (recommended) AUTOMATIC      |0.21723682724336277|
|Infiniti EX 2012 premium unleaded (recommended) AUTOMATIC      |0.21725696468322728|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21726098834132695|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.2172733380874455 |
+---------------------------------------------------------------+-------------------+
only showing top 22 rows


scala> result.orderBy(desc("score")).show(22, false)
+----------------------------------------------------------------------+------------------+
|label                                                                 |score             |
+----------------------------------------------------------------------+------------------+
|Audi A6 2017 premium unleaded (recommended) AUTOMATED_MANUAL          |0.9999142633116896|
|Porsche 718 Cayman 2017 premium unleaded (required) MANUAL            |0.9805813377048511|
|Rolls-Royce Corniche 2001 premium unleaded (required) AUTOMATIC       |0.9799020737648789|
|Acura NSX 2017 premium unleaded (required) AUTOMATED_MANUAL           |0.9686265528700773|
|Volkswagen Touareg 2 2008 diesel AUTOMATIC                            |0.9620829288975413|
|Mitsubishi Mighty Max Pickup 1994 regular unleaded MANUAL             |0.9484213630955308|
|Chrysler Aspen 2009 regular unleaded AUTOMATIC                        |0.9271092903462673|
|Oldsmobile Cutlass Ciera 1994 regular unleaded AUTOMATIC              |0.9119041256238049|
|Volkswagen Touareg 2015 premium unleaded (recommended) AUTOMATIC      |0.9067141244279047|
|Chrysler TC 1990 regular unleaded MANUAL                              |0.8942706414341941|
|BMW 7 Series 2015 premium unleaded (required) AUTOMATIC               |0.887326636862905 |
|Ferrari Enzo 2003 premium unleaded (required) AUTOMATED_MANUAL        |0.876073945508002 |
|Audi 80 1990 regular unleaded MANUAL                                  |0.8712155642417887|
|Mitsubishi Vanwagon 1990 regular unleaded AUTOMATIC                   |0.8604594927230855|
|Lamborghini Reventon 2008 premium unleaded (required) AUTOMATED_MANUAL|0.8569533527940364|
|Saab 900 1996 regular unleaded MANUAL                                 |0.8517167561224662|
|Ford Focus RS 2017 premium unleaded (recommended) MANUAL              |0.8369213995663755|
|Hyundai Elantra 2017 regular unleaded AUTOMATED_MANUAL                |0.8145671662996115|
|Ford Focus 2017 regular unleaded MANUAL                               |0.8080970764431054|
|Mercedes-Benz E-Class 2015 premium unleaded (required) AUTOMATIC      |0.8045429849252246|
|BMW M4 GTS 2016 premium unleaded (required) AUTOMATED_MANUAL          |0.7994977767039853|
|Chevrolet Cruze 2015 diesel AUTOMATIC                                 |0.7935704861212711|
+----------------------------------------------------------------------+------------------+
only showing top 22 rows
```
## SOS using PySpark

```
MacBook-Pro-van-Fokko:spark-stochastic-outlier-selection fokkodriesprong$ pyspark --jars target/scala-2.11/spark-stochastic-outlier-selection_2.11-0.1.0.jar
Python 3.6.6 (v3.6.6:4cf1f54eb7, Jun 26 2018, 19:50:54)
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
19/09/24 12:11:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.6.6 (v3.6.6:4cf1f54eb7, Jun 26 2018 19:50:54)
SparkSession available as 'spark'.
>>> import math
>>> from pyspark.ml.feature import VectorAssembler
>>> from pyspark.sql.functions import col, lit, min, max, asc, desc, concat_ws
>>> from pyspark.mllib.common import _java2py
>>>
>>> df = spark.read.option("header", "true").csv("data/cardataset.csv")
>>>
>>> metric_columns = ["Engine HP", "Engine Cylinders", "highway MPG", "city mpg", "Popularity", "MSRP"]
>>>
>>> # Scale the columns and remove the empty ones
... for column_name in metric_columns:
...     df = df.withColumn(column_name, col(column_name).cast("Double"))
...     minValue = lit(df.select(min(col(column_name)).alias("min")).first().asDict()['min'])
...     maxValue = lit(df.select(max(col(column_name)).alias("max")).first().asDict()['max'])
...     print("Col " + column_name + " min " + str(minValue) + ", max: " + str(maxValue))
...     df = df.where(col(column_name).isNotNull())
...     df = df.withColumn(column_name, (col(column_name) - minValue) / (maxValue - minValue))
...
Col Engine HP min Column, max: Column
Col Engine Cylinders min Column, max: Column
Col highway MPG min Column, max: Column
Col city mpg min Column, max: Column
Col Popularity min Column, max: Column
Col MSRP min Column, max: Column
>>>
>>> df = df.withColumn("label", concat_ws(" ", col("Make"), col("Model"), col("Year"), col("Engine Fuel Type"), col("Transmission Type")))
>>>
>>> # Count the number of rows
... num = df.count()
>>> num
11816
>>>
>>> # Remove the missing vectors and combine all the columns to a single vector
... ass = VectorAssembler(inputCols=metric_columns, outputCol="vector")
>>> df = ass.setHandleInvalid("skip").transform(df).repartition(22)
>>>
>>> # As perplexity, use the sqrt of the number of rows
... perplexity = math.sqrt(num)
>>>
>>> # Some helpers for the Java reference objects
... jvm = sc._jvm
>>>
>>> sos = jvm.org.apache.spark.ml.outlierdetection.StochasticOutlierDetection.performOutlierDetectionPython(spark._jwrapped, df._jdf, "label", "vector", perplexity, 1e-9, 5000)
>>>
>>> # Reconstruct the Python DF
... result_df = _java2py(sc, sos)
>>>
>>> result_df.orderBy(asc("score")).show(22, False)
+---------------------------------------------------------------+-------------------+
|label                                                          |score              |
+---------------------------------------------------------------+-------------------+
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.2127982860017602 |
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21293609075267098|
|Chevrolet Sonic 2016 regular unleaded MANUAL                   |0.21310714365721425|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21383072418134913|
|Chevrolet Sonic 2015 regular unleaded MANUAL                   |0.21385874797566645|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21396034794647772|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21396720338004566|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140147473677278 |
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.2140600235686727 |
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.21415665138439838|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21418025039550595|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21448192959623724|
|GMC Sierra 1500 Classic 2007 flex-fuel (unleaded/E85) AUTOMATIC|0.21649415206093467|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21656724570838756|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21659862415217437|
|GMC Sierra 1500 2015 flex-fuel (unleaded/E85) AUTOMATIC        |0.21665169161813994|
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21667668431185894|
|GMC Sierra 1500 2016 flex-fuel (unleaded/E85) AUTOMATIC        |0.21676534748163712|
|Infiniti EX 2011 premium unleaded (recommended) AUTOMATIC      |0.2172368272372638 |
|Infiniti EX 2012 premium unleaded (recommended) AUTOMATIC      |0.21725696467803174|
|GMC Sierra 1500 Classic 2007 regular unleaded AUTOMATIC        |0.2172609883362982 |
|GMC Sierra 1500 2017 flex-fuel (unleaded/E85) AUTOMATIC        |0.21727333807551774|
+---------------------------------------------------------------+-------------------+
only showing top 22 rows

>>> result_df.orderBy(desc("score")).show(22, False)
+----------------------------------------------------------------------+------------------+
|label                                                                 |score             |
+----------------------------------------------------------------------+------------------+
|Audi A6 2017 premium unleaded (recommended) AUTOMATED_MANUAL          |0.9999142633116839|
|Porsche 718 Cayman 2017 premium unleaded (required) MANUAL            |0.980581337706238 |
|Rolls-Royce Corniche 2001 premium unleaded (required) AUTOMATIC       |0.9799020737640686|
|Acura NSX 2017 premium unleaded (required) AUTOMATED_MANUAL           |0.9686265528721012|
|Volkswagen Touareg 2 2008 diesel AUTOMATIC                            |0.9620829288975116|
|Mitsubishi Mighty Max Pickup 1994 regular unleaded MANUAL             |0.9484213631039752|
|Chrysler Aspen 2009 regular unleaded AUTOMATIC                        |0.9271092903411216|
|Oldsmobile Cutlass Ciera 1994 regular unleaded AUTOMATIC              |0.911904125634653 |
|Volkswagen Touareg 2015 premium unleaded (recommended) AUTOMATIC      |0.9067141244284443|
|Chrysler TC 1990 regular unleaded MANUAL                              |0.8942706414327495|
|BMW 7 Series 2015 premium unleaded (required) AUTOMATIC               |0.8873266368631819|
|Ferrari Enzo 2003 premium unleaded (required) AUTOMATED_MANUAL        |0.8760739455080375|
|Audi 80 1990 regular unleaded MANUAL                                  |0.871215564244082 |
|Mitsubishi Vanwagon 1990 regular unleaded AUTOMATIC                   |0.8604594927168016|
|Lamborghini Reventon 2008 premium unleaded (required) AUTOMATED_MANUAL|0.8569533527937525|
|Saab 900 1996 regular unleaded MANUAL                                 |0.8517167561268112|
|Ford Focus RS 2017 premium unleaded (recommended) MANUAL              |0.8369213995605174|
|Hyundai Elantra 2017 regular unleaded AUTOMATED_MANUAL                |0.8145671663030841|
|Ford Focus 2017 regular unleaded MANUAL                               |0.8080970764403019|
|Mercedes-Benz E-Class 2015 premium unleaded (required) AUTOMATIC      |0.8045429849258207|
|BMW M4 GTS 2016 premium unleaded (required) AUTOMATED_MANUAL          |0.7994977767050019|
|Chevrolet Cruze 2015 diesel AUTOMATIC                                 |0.7935704861253007|
+----------------------------------------------------------------------+------------------+
only showing top 22 rows
```
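
## Building

To build and test the library locally, something along these lines should work (assuming sbt is installed; the test command mirrors the Travis CI configuration above):

```
sbt clean coverage test   # run the test suite with scoverage enabled, as on CI
sbt package               # build the jar under target/scala-2.11/
```
--------------------------------------------------------------------------------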