├── .gitignore
├── README.md
├── images
│   ├── outlier-detection-overview.png
│   └── predictiveworks.png
├── pom.xml
└── src
    └── main
        ├── resources
        │   ├── application.conf
        │   ├── features.xml
        │   ├── server.conf
        │   └── states.xml
        └── scala
            └── de
                └── kp
                    └── spark
                        └── outlier
                            ├── Configuration.scala
                            ├── KMeansDetector.scala
                            ├── MarkovDetector.scala
                            ├── OutlierServer.scala
                            ├── RequestContext.scala
                            ├── actor
                            │   ├── BaseActor.scala
                            │   ├── KMeansActor.scala
                            │   ├── MarkovActor.scala
                            │   ├── OutlierMaster.scala
                            │   ├── OutlierMiner.scala
                            │   ├── OutlierQuestor.scala
                            │   └── TrainActor.scala
                            ├── api
                            │   └── AkkaApi.scala
                            ├── app
                            │   └── TrainApp.scala
                            ├── markov
                            │   ├── DoubleMatrix.scala
                            │   ├── MarkovBuilder.scala
                            │   ├── StateMetrics.scala
                            │   └── TransitionMatrix.scala
                            ├── model
                            │   └── Model.scala
                            ├── spec
                            │   ├── StateSpec.scala
                            │   └── VectorSpec.scala
                            └── util
                                ├── MathHelper.scala
                                └── Optimizer.scala

/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 | 
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 | 
17 | # idea
18 | .idea
19 | *.iml
20 | 
21 | # building
22 | target
23 | build
24 | null
25 | tmp*
26 | temp*
27 | dist
28 | test-output
29 | build.log
30 | 
31 | # other scm
32 | .svn
33 | .CVS
34 | .hg*
35 | 
36 | # switch to regexp syntax.
37 | #   syntax: regexp
38 | #   ^\.pc/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Predictiveworks.](https://raw.githubusercontent.com/skrusche63/spark-outlier/master/images/predictiveworks.png)
2 | 
3 | **Predictiveworks.** is an open ensemble of predictive engines that covers a wide range of today's analytics requirements and brings the power of predictive analytics to Elasticsearch.
4 | 
5 | ## Reactive Outlier Detection Engine
6 | 
7 | ![Outlier Detection Engine Overview](https://raw.githubusercontent.com/skrusche63/spark-outlier/master/images/outlier-detection-overview.png)
8 | 
9 | The Outlier Detection Engine is one of the nine members of this open ensemble. It is built to find anomalies in large-scale datasets and in human behavior, aimed at advanced risk reduction.
--------------------------------------------------------------------------------
/images/outlier-detection-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/outlier-detection-overview.png
--------------------------------------------------------------------------------
/images/predictiveworks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/predictiveworks.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2 |   <modelVersion>4.0.0</modelVersion>
3 |   <groupId>spark-outlier</groupId>
4 |   <artifactId>spark-outlier</artifactId>
5 |   <version>0.2.2</version>
6 |   <name>Spark-Outlier</name>
7 |   <description>Reactive Outlier Detection Engine</description>
8 |   <inceptionYear>2010</inceptionYear>
9 |   <licenses>
10 |     <license>
11 |       <name>My License</name>
12 |       <url>http://....</url>
13 |       <distribution>repo</distribution>
14 |     </license>
15 |   </licenses>
16 | 
17 |   <properties>
18 |     <maven.compiler.source>1.6</maven.compiler.source>
19 |     <maven.compiler.target>1.6</maven.compiler.target>
20 |     <encoding>UTF-8</encoding>
21 |     <scala.tools.version>2.10</scala.tools.version>
22 |     <scala.version>2.10.0</scala.version>
23 |     <spark.version>1.2.0</spark.version>
24 |   </properties>
25 | 
26 |   <dependencies>
27 |     <dependency>
28 |       <groupId>org.scala-lang</groupId>
29 |       <artifactId>scala-library</artifactId>
30 |       <version>${scala.version}</version>
31 |     </dependency>
32 | 
33 |     <!-- Test -->
34 |     <dependency>
35 |       <groupId>junit</groupId>
36 |       <artifactId>junit</artifactId>
37 |       <version>4.11</version>
38 |       <scope>test</scope>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.specs2</groupId>
42 |       <artifactId>specs2_${scala.tools.version}</artifactId>
43 |       <version>1.13</version>
44 |       <scope>test</scope>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.scalatest</groupId>
48 |       <artifactId>scalatest_${scala.tools.version}</artifactId>
49 |       <version>2.0.M6-SNAP8</version>
50 |       <scope>test</scope>
51 |     </dependency>
52 | 
53 |     <dependency>
54 |       <groupId>org.apache.spark</groupId>
55 |       <artifactId>spark-core_2.10</artifactId>
56 |       <version>${spark.version}</version>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.apache.spark</groupId>
60 |       <artifactId>spark-mllib_2.10</artifactId>
61 |       <version>${spark.version}</version>
62 |     </dependency>
63 | 
64 |     <dependency>
65 |       <groupId>cascading</groupId>
66 |       <artifactId>cascading-core</artifactId>
67 |       <version>2.5.4</version>
68 |     </dependency>
69 |     <dependency>
70 |       <groupId>cascading</groupId>
71 |       <artifactId>cascading-hadoop</artifactId>
72 |       <version>2.5.4</version>
73 |     </dependency>
74 | 
75 |     <dependency>
76 |       <groupId>org.elasticsearch</groupId>
77 |       <artifactId>elasticsearch-hadoop</artifactId>
78 |       <version>2.0.0</version>
79 |     </dependency>
80 |     <dependency>
81 |       <groupId>org.elasticsearch</groupId>
82 |       <artifactId>elasticsearch</artifactId>
83 |       <version>1.3.2</version>
84 |     </dependency>
85 | 
86 |     <dependency>
87 |       <groupId>org.json4s</groupId>
88 |       <artifactId>json4s-native_2.10</artifactId>
89 |       <version>3.2.10</version>
90 |     </dependency>
91 | 
92 |     <dependency>
93 |       <groupId>redis.clients</groupId>
94 |       <artifactId>jedis</artifactId>
95 |       <version>2.5.2</version>
96 |     </dependency>
97 | 
98 |     <dependency>
99 |       <groupId>org.clapper</groupId>
100 |       <artifactId>argot_2.10</artifactId>
101 |       <version>1.0.3</version>
102 |     </dependency>
103 |   </dependencies>
104 | 
105 |   <repositories>
106 |     <repository>
107 |       <id>conjars.org</id>
108 |       <url>http://conjars.org/repo</url>
109 |     </repository>
110 |   </repositories>
111 | 
112 |   <build>
113 |     <sourceDirectory>src/main/scala</sourceDirectory>
114 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
115 |     <plugins>
116 |       <plugin>
117 |         <groupId>net.alchim31.maven</groupId>
118 |         <artifactId>scala-maven-plugin</artifactId>
119 |         <version>3.1.3</version>
120 |         <executions>
121 |           <execution>
122 |             <goals>
123 |               <goal>compile</goal>
124 |               <goal>testCompile</goal>
125 |             </goals>
126 |             <configuration>
127 |               <args>
128 |                 <arg>-make:transitive</arg>
129 |                 <arg>-dependencyfile</arg>
130 |                 <arg>${project.build.directory}/.scala_dependencies</arg>
131 |               </args>
132 |             </configuration>
133 |           </execution>
134 |         </executions>
135 |       </plugin>
136 |       <plugin>
137 |         <groupId>org.apache.maven.plugins</groupId>
138 |         <artifactId>maven-surefire-plugin</artifactId>
139 |         <version>2.13</version>
140 |         <configuration>
141 |           <useFile>false</useFile>
142 |           <disableXmlReport>true</disableXmlReport>
143 |           <includes>
144 |             <include>**/*Test.*</include>
145 |             <include>**/*Suite.*</include>
146 |           </includes>
147 |         </configuration>
148 |       </plugin>
149 |     </plugins>
150 |   </build>
151 | 
152 |   <organization>
153 |     <name>Dr. Krusche &amp; Partner PartG</name>
154 |     <url>http://www.dr-kruscheundpartner.com</url>
155 |   </organization>
156 |   <url>https://github.com/skrusche63/spark-outlier</url>
157 | </project>
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 |   loglevel = INFO
3 |   stdout-loglevel = INFO
4 |   loggers = ["akka.event.slf4j.Slf4jLogger"]
5 | }
6 | 
7 | actor {
8 |   duration = 10
9 |   retries = 10
10 |   timeout = 10
11 | }
12 | 
13 | #
14 | # Access to cassandra is provided by Datastax' spark-cassandra-connector; the respective
15 | # configuration parameters can be retrieved from here:
16 | #
17 | # https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md
18 | #
19 | cassandra {
20 |   spark.cassandra.connection.host="127.0.0.1"
21 | }
22 | 
23 | elastic {
24 |   es.nodes="localhost"
25 |   es.port="9200"
26 |   es.resource=""
27 |   es.query=""
28 | }
29 | 
30 | file {
31 |   items=""
32 |   features=""
33 | }
34 | 
35 | hbase {
36 |   spark.hbase.host="127.0.0.1"
37 | }
38 | 
39 | mongo {
40 |   mongo.input.uri="mongodb://127.0.0.1:27017/beowulf.input"
41 | }
42 | 
43 | mysql {
44 |   url="127.0.0.1:8889"
45 |   database="analytics"
46 |   user="root"
47 |   password="root"
48 | }
49 | 
50 | redis {
51 |   host="127.0.0.1"
52 |   port="6379"
53 | }
54 | 
55 | #
56 | # Configuration parameters for the REST API
57 | # of the Outlier Detection Engine
58 | #
59 | rest {
60 |   host="127.0.0.1"
61 |   port=9000
62 | }
63 | 
64 | spark {
65 |   spark.executor.memory="1g"
66 |   spark.kryoserializer.buffer.mb="256"
67 | }
--------------------------------------------------------------------------------
/src/main/resources/features.xml:
--------------------------------------------------------------------------------
1 | 
2 | row
3 | col
4 | label
5 | value
--------------------------------------------------------------------------------
/src/main/resources/server.conf:
--------------------------------------------------------------------------------
1 | akka {
2 |   actor {
3 |     provider = "akka.remote.RemoteActorRefProvider"
4 |   }
5 |   remote {
6 |     enabled-transports = ["akka.remote.netty.tcp"]
7 |     netty.tcp {
8 |       hostname = "127.0.0.1"
9 |       port = 2604
10 |     }
11 |     log-sent-messages = on
12 |     log-received-messages = on
13 |   }
14 | }
--------------------------------------------------------------------------------
/src/main/resources/states.xml:
--------------------------------------------------------------------------------
1 | 
2 | site
3 | user
4 | timestamp
5 | state
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/Configuration.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
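 *
 * A usage sketch for the Configuration object defined in this file
 * (illustrative; it mirrors how BaseActor consumes it further below):
 *
 *   val (host, port) = Configuration.redis
 *   val cache = new RedisCache(host, port.toInt)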
19 | */ 20 | 21 | import com.typesafe.config.ConfigFactory 22 | import org.apache.hadoop.conf.{Configuration => HConf} 23 | 24 | import de.kp.spark.core.{Configuration => CoreConf} 25 | 26 | object Configuration extends CoreConf { 27 | 28 | /* Load configuration for router */ 29 | val path = "application.conf" 30 | val config = ConfigFactory.load(path) 31 | 32 | override def actor:(Int,Int,Int) = { 33 | 34 | val cfg = config.getConfig("actor") 35 | 36 | val duration = cfg.getInt("duration") 37 | val retries = cfg.getInt("retries") 38 | val timeout = cfg.getInt("timeout") 39 | 40 | (duration,retries,timeout) 41 | 42 | } 43 | 44 | override def cassandra:Map[String,String] = { 45 | 46 | val cfg = config.getConfig("cassandra") 47 | val conf = Map( 48 | "spark.cassandra.connection.host" -> cfg.getString("spark.cassandra.connection.host") 49 | ) 50 | 51 | conf 52 | 53 | } 54 | 55 | override def elastic:HConf = { 56 | 57 | val cfg = config.getConfig("elastic") 58 | val conf = new HConf() 59 | 60 | conf.set("es.nodes",cfg.getString("es.nodes")) 61 | conf.set("es.port",cfg.getString("es.port")) 62 | 63 | conf.set("es.resource", cfg.getString("es.resource")) 64 | conf.set("es.query", cfg.getString("es.query")) 65 | 66 | conf 67 | 68 | } 69 | 70 | override def hbase:Map[String,String] = { 71 | 72 | val cfg = config.getConfig("hbase") 73 | val conf = Map( 74 | "spark.hbase.host" -> cfg.getString("spark.hbase.host") 75 | ) 76 | 77 | conf 78 | 79 | } 80 | 81 | override def input:List[String] = { 82 | 83 | val cfg = config.getConfig("file") 84 | 85 | val items = cfg.getString("items") 86 | val features = cfg.getString("features") 87 | 88 | List(items,features) 89 | 90 | } 91 | 92 | override def mongo:HConf = { 93 | 94 | val cfg = config.getConfig("mongo") 95 | val conf = new HConf() 96 | 97 | conf.set("mongo.input.uri",cfg.getString("mongo.input.uri")) 98 | conf 99 | 100 | } 101 | 102 | override def mysql:(String,String,String,String) = { 103 | 104 | val cfg = config.getConfig("mysql") 105 | 106 | val url = cfg.getString("url") 107 | val db = cfg.getString("database") 108 | 109 | val user = cfg.getString("user") 110 | val password = cfg.getString("password") 111 | 112 | (url,db,user,password) 113 | 114 | } 115 | 116 | override def output:List[String] = null 117 | 118 | override def redis:(String,String) = { 119 | 120 | val cfg = config.getConfig("redis") 121 | 122 | val host = cfg.getString("host") 123 | val port = cfg.getString("port") 124 | 125 | (host,port) 126 | 127 | } 128 | 129 | override def rest:(String,Int) = { 130 | 131 | val cfg = config.getConfig("rest") 132 | 133 | val host = cfg.getString("host") 134 | val port = cfg.getInt("port") 135 | 136 | (host,port) 137 | 138 | } 139 | 140 | override def spark:Map[String,String] = { 141 | 142 | val cfg = config.getConfig("spark") 143 | 144 | Map( 145 | "spark.executor.memory" -> cfg.getString("spark.executor.memory"), 146 | "spark.kryoserializer.buffer.mb" -> cfg.getString("spark.kryoserializer.buffer.mb") 147 | ) 148 | 149 | } 150 | 151 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/KMeansDetector.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 
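 *
 * Usage sketch for the KMeansDetector defined in this file (values are
 * illustrative; the RDD[LabeledPoint] is typically built via
 * VectorHandler.vector2LabeledPoints, see KMeansActor):
 *
 *   val detector = new KMeansDetector()
 *   val outliers = detector.find(dataset, "entropy", 20, 10)
 *   // List[ClusteredPoint]: per cluster, the 10 points farthest
 *   // from their cluster center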
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.rdd.RDD
22 | 
23 | import org.apache.spark.mllib.clustering.KMeans
24 | import org.apache.spark.mllib.linalg.Vectors
25 | 
26 | import de.kp.spark.core.model._
27 | import de.kp.spark.outlier.util.{MathHelper,Optimizer}
28 | 
29 | /**
30 |  * KMeansDetector is a general-purpose outlier detector that
31 |  * finds outliers in sets of labeled feature vectors
32 |  */
33 | class KMeansDetector extends Serializable {
34 | 
35 |   def find(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int,top:Int):List[ClusteredPoint] = {
36 | 
37 |     val (k,normdata) = prepare(data,strategy,iterations)
38 |     detect(normdata,k,iterations,top)
39 | 
40 |   }
41 | 
42 |   def detect(normdata:RDD[LabeledPoint],k:Int,iterations:Int,top:Int):List[ClusteredPoint] = {
43 | 
44 |     val sc = normdata.context
45 | 
46 |     /*
47 |      * STEP #1: Compute KMeans model
48 |      */
49 |     val vectors = normdata.map(point => Vectors.dense(point.features))
50 | 
51 |     val model = KMeans.train(vectors,k,iterations)
52 |     val centroids = model.clusterCenters
53 | 
54 |     /*
55 |      * STEP #2: Calculate the distances of all points from their cluster
56 |      * centers; outliers are those with the farthest distance
57 |      */
58 |     val bcmodel = sc.broadcast(model)
59 |     val points = normdata.map(point => {
60 | 
61 |       val vector = Vectors.dense(point.features)
62 | 
63 |       val cluster = bcmodel.value.predict(vector)
64 |       val centroid = bcmodel.value.clusterCenters(cluster)
65 | 
66 |       val distance = Optimizer.distance(centroid.toArray,vector.toArray)
67 | 
68 |       (cluster,distance,point)
69 | 
70 |     })
71 | 
72 |     /*
73 |      * Retrieve the top points (LabeledPoint) with respect to their clusters;
74 |      * the cluster identifier is used as a grouping mechanism to specify which
75 |      * features belong to which centroid
76 |      */
77 |     val bctop = sc.broadcast(top)
78 |     points.groupBy(_._1).flatMap(x => x._2.toList.sortBy(_._2).reverse.take(bctop.value)).map(data => {
79 | 
80 |       val (cluster,distance,point) = data
81 |       new ClusteredPoint(cluster,distance,point)
82 | 
83 |     }).collect().toList
84 | 
85 |   }
86 | 
87 |   def prepare(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int):(Int,RDD[LabeledPoint]) = {
88 | 
89 |     /*
90 |      * STEP #1: Normalize data
91 |      */
92 |     val idlabels = data.map(p => (p.id,p.label))
93 | 
94 |     val features = data.map(p => p.features)
95 | 
96 |     val normalized = MathHelper.normalize(features)
97 |     val normdata = idlabels.zip(normalized).map{case((id,label),features) => LabeledPoint(id,label, features)}
98 | 
99 |     /*
100 |      * STEP #2: Find the optimal number of clusters
101 |      */
102 | 
103 |     /* Range of candidate cluster counts */
104 |     val range = (5 to 40 by 5)
105 | 
106 |     val k = strategy match {
107 | 
108 |       case "distance" => Optimizer.optimizeByDistance(normdata, range, iterations)
109 | 
110 |       case "entropy" => Optimizer.optimizeByEntropy(normdata, range, iterations)
111 | 
112 |     }
113 | 
114
| (k, normdata) 115 | 116 | } 117 | 118 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/MarkovDetector.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.rdd.RDD 22 | 23 | import de.kp.spark.core.model._ 24 | import de.kp.spark.outlier.markov.{MarkovBuilder,StateMetrics,TransitionMatrix} 25 | 26 | /** 27 | * The MarkovDetector discovers outliers from registered behavior. 28 | */ 29 | class MarkovDetector(@transient ctx:RequestContext,scale:Int,states:Array[String]) extends Serializable { 30 | 31 | val metrics = new StateMetrics(states) 32 | 33 | def detect(sequences:RDD[Behavior],algorithm:String,threshold:Double,matrix:TransitionMatrix):RDD[Outlier] = { 34 | 35 | val bmatrix = ctx.sc.broadcast(matrix) 36 | sequences.map(seq => { 37 | 38 | val (site,user,states) = (seq.site,seq.user,seq.states) 39 | val metric = algorithm match { 40 | 41 | case "missprob" => metrics.missProbMetric(states,bmatrix.value) 42 | 43 | case "missrate" => metrics.missRateMetric(states,bmatrix.value) 44 | 45 | case "entreduc" => metrics.entropyReductionMetric(states,bmatrix.value) 46 | 47 | } 48 | 49 | val flag = if (metric > threshold) "yes" else "no" 50 | Outlier(site,user,states,metric,flag) 51 | 52 | }) 53 | 54 | } 55 | 56 | def train(sequences:RDD[Behavior]):TransitionMatrix = { 57 | new MarkovBuilder(scale,states).build(sequences) 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/OutlierServer.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
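 *
 * Usage sketch for the MarkovDetector defined above (values are
 * illustrative; 'sequences' is an RDD[Behavior] as produced by
 * StateHandler.state2Behavior, see MarkovActor):
 *
 *   val detector = new MarkovDetector(ctx, 1, states)
 *   val matrix   = detector.train(sequences)
 *   val outliers = detector.detect(sequences, "missprob", 0.8, matrix)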
19 | */
20 | 
21 | import akka.actor.{ActorSystem,Props}
22 | import com.typesafe.config.ConfigFactory
23 | 
24 | import de.kp.spark.core.SparkService
25 | import de.kp.spark.outlier.api.AkkaApi
26 | 
27 | /**
28 |  * The OutlierServer supports two different approaches to outlier discovery: one is based
29 |  * on cluster analysis and determines outlier feature sets by their distance to the
30 |  * cluster centers. This approach is independent of a specific use case and concentrates on
31 |  * the extraction and evaluation of (equal-size) feature vectors. The other approach to
32 |  * outlier discovery has a strong focus on the customers' purchase behavior and detects
33 |  * those customers that behave differently from all other customers.
34 |  */
35 | object OutlierServer extends SparkService {
36 | 
37 |   private val sc = createCtxLocal("OutlierContext",Configuration.spark)
38 | 
39 |   def main(args: Array[String]) {
40 | 
41 |     val ctx = new RequestContext(sc)
42 | 
43 |     /**
44 |      * AKKA API
45 |      */
46 |     val conf:String = "server.conf"
47 | 
48 |     val akkaSystem = ActorSystem("akka-server",ConfigFactory.load(conf))
49 |     sys.addShutdownHook(akkaSystem.shutdown)
50 | 
51 |     new AkkaApi(akkaSystem,ctx).start()
52 | 
53 |     println("AKKA API activated.")
54 | 
55 |   }
56 | 
57 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/RequestContext.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.sql.SQLContext
23 | 
24 | class RequestContext(
25 |   /*
26 |    * Reference to the common SparkContext; this context can be used
27 |    * to access HDFS based data sources or leverage the Spark machine
28 |    * learning library or other Spark based functionality
29 |    */
30 |   @transient val sc:SparkContext) extends Serializable {
31 | 
32 |   val sqlc = new SQLContext(sc)
33 |   val config = Configuration
34 | 
35 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/BaseActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props} 22 | 23 | import de.kp.spark.core.model._ 24 | import de.kp.spark.core.redis.RedisCache 25 | 26 | import de.kp.spark.outlier.Configuration 27 | import de.kp.spark.outlier.model._ 28 | 29 | abstract class BaseActor extends Actor with ActorLogging { 30 | 31 | val (host,port) = Configuration.redis 32 | val cache = new RedisCache(host,port.toInt) 33 | 34 | protected def failure(req:ServiceRequest,message:String):ServiceResponse = { 35 | 36 | if (req == null) { 37 | val data = Map("message" -> message) 38 | new ServiceResponse("","",data,OutlierStatus.FAILURE) 39 | 40 | } else { 41 | val data = Map("uid" -> req.data("uid"), "message" -> message) 42 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE) 43 | 44 | } 45 | 46 | } 47 | 48 | protected def response(req:ServiceRequest,missing:Boolean):ServiceResponse = { 49 | 50 | val uid = req.data("uid") 51 | 52 | if (missing == true) { 53 | val data = Map("uid" -> uid, "message" -> Messages.MISSING_PARAMETERS(uid)) 54 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE) 55 | 56 | } else { 57 | val data = Map("uid" -> uid, "message" -> Messages.OUTLIER_DETECTION_STARTED(uid)) 58 | new ServiceResponse(req.service,req.task,data,OutlierStatus.STARTED) 59 | 60 | 61 | } 62 | 63 | } 64 | 65 | protected def serialize(resp:ServiceResponse) = Serializer.serializeResponse(resp) 66 | 67 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/KMeansActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
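 *
 * In addition to the common request fields, validate() below requires
 * 'top', 'iterations' and 'strategy'; an illustrative train request
 * (the service name is an assumption):
 *
 *   new ServiceRequest("outlier", "train", Map(
 *     "uid"        -> "job-42",
 *     "algorithm"  -> "KMEANS",
 *     "source"     -> "ELASTIC",
 *     "top"        -> "10",       // outliers kept per cluster
 *     "iterations" -> "20",       // KMeans training iterations
 *     "strategy"   -> "entropy"   // or "distance", see Optimizer
 *   ))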
19 | */ 20 | 21 | import de.kp.spark.core.Names 22 | import de.kp.spark.core.model._ 23 | 24 | import de.kp.spark.outlier.{KMeansDetector,RequestContext} 25 | import de.kp.spark.outlier.model._ 26 | 27 | import de.kp.spark.core.source.VectorSource 28 | import de.kp.spark.core.source.handler.VectorHandler 29 | 30 | import de.kp.spark.core.redis.RedisDB 31 | 32 | import de.kp.spark.outlier.spec.VectorSpec 33 | import scala.collection.mutable.ArrayBuffer 34 | 35 | class KMeansActor(@transient ctx:RequestContext) extends TrainActor(ctx) { 36 | 37 | val redis = new RedisDB(host,port.toInt) 38 | 39 | override def validate(req:ServiceRequest) { 40 | 41 | if (req.data.contains("top") == false) 42 | throw new Exception("Parameter 'top' is missing.") 43 | 44 | if (req.data.contains("iterations") == false) 45 | throw new Exception("Parameter 'iterations' is missing.") 46 | 47 | if (req.data.contains("strategy") == false) 48 | throw new Exception("Parameter 'strategy' is missing.") 49 | 50 | } 51 | 52 | override def train(req:ServiceRequest) { 53 | 54 | val source = new VectorSource(ctx.sc,ctx.config,new VectorSpec(req)) 55 | val dataset = VectorHandler.vector2LabeledPoints(source.connect(req)) 56 | 57 | val params = ArrayBuffer.empty[Param] 58 | 59 | val top = req.data("top").toInt 60 | params += Param("top","integer",top.toString) 61 | 62 | val strategy = req.data("strategy").asInstanceOf[String] 63 | params += Param("strategy","string",strategy) 64 | 65 | val iter = req.data("iterations").toInt 66 | params += Param("iterations","integer",iter.toString) 67 | 68 | cache.addParams(req, params.toList) 69 | 70 | val points = new KMeansDetector().find(dataset,strategy,iter,top).toList 71 | savePoints(req,ClusteredPoints(points)) 72 | 73 | } 74 | 75 | private def savePoints(req:ServiceRequest,points:ClusteredPoints) { 76 | redis.addPoints(req,points) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/MarkovActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
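 *
 * In addition to the common request fields, validate() below requires
 * the Markov-specific parameters; an illustrative train request (the
 * service name and state labels are assumptions):
 *
 *   new ServiceRequest("outlier", "train", Map(
 *     "uid"       -> "job-43",
 *     "algorithm" -> "MARKOV",
 *     "source"    -> "ELASTIC",
 *     "scale"     -> "1",          // scale factor of the transition matrix
 *     "states"    -> "S1,S2,S3",   // comma-separated state alphabet
 *     "strategy"  -> "missprob",   // or "missrate" | "entreduc"
 *     "threshold" -> "0.8"         // metric values above this flag an outlier
 *   ))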
19 | */ 20 | 21 | import de.kp.spark.core.Names 22 | import de.kp.spark.core.model._ 23 | 24 | import de.kp.spark.core.source.StateSource 25 | import de.kp.spark.core.source.handler.StateHandler 26 | 27 | import de.kp.spark.core.redis.RedisDB 28 | 29 | import de.kp.spark.outlier.RequestContext 30 | import de.kp.spark.outlier.model._ 31 | 32 | import de.kp.spark.outlier.MarkovDetector 33 | import de.kp.spark.outlier.spec.StateSpec 34 | 35 | import scala.collection.mutable.ArrayBuffer 36 | 37 | class MarkovActor(@transient ctx:RequestContext) extends TrainActor(ctx) { 38 | 39 | val redis = new RedisDB(host,port.toInt) 40 | 41 | override def validate(req:ServiceRequest) { 42 | 43 | if (req.data.contains("scale") == false) 44 | throw new Exception("Parameter 'scale' is missing.") 45 | 46 | if (req.data.contains("states") == false) 47 | throw new Exception("Parameter 'states' is missing.") 48 | 49 | if (req.data.contains("strategy") == false) 50 | throw new Exception("Parameter 'strategy' is missing.") 51 | 52 | if (req.data.contains("threshold") == false) 53 | throw new Exception("Parameter 'threshold' is missing.") 54 | 55 | } 56 | 57 | override def train(req:ServiceRequest) { 58 | 59 | val source = new StateSource(ctx.sc,ctx.config,new StateSpec(req)) 60 | val sequences = StateHandler.state2Behavior(source.connect(req)) 61 | 62 | val scale = req.data(Names.REQ_SCALE).toInt 63 | val states = req.data(Names.REQ_STATES).split(",") 64 | 65 | val detector = new MarkovDetector(ctx,scale,states) 66 | 67 | val model = detector.train(sequences) 68 | 69 | val params = ArrayBuffer.empty[Param] 70 | 71 | val strategy = req.data("strategy") 72 | params += Param("strategy","string",strategy) 73 | 74 | val threshold = req.data("threshold").toDouble 75 | params += Param("threshold","double",threshold.toString) 76 | 77 | cache.addParams(req, params.toList) 78 | 79 | val outliers = detector.detect(sequences,strategy,threshold,model).collect().toList 80 | 81 | saveOutliers(req,new Outliers(outliers)) 82 | 83 | } 84 | 85 | private def saveOutliers(req:ServiceRequest,outliers:Outliers) { 86 | redis.addOutliers(req,outliers) 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/OutlierMaster.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
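 *
 * Clients address the master below with serialized requests, as the
 * Handler in TrainApp does; a minimal round trip (sketch, service
 * name illustrative):
 *
 *   val master = system.actorOf(Props(new OutlierMaster(ctx)))
 *   master ! Serializer.serializeRequest(new ServiceRequest("outlier","train",params))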
19 | */
20 | 
21 | import akka.actor.{ActorRef,Props}
22 | 
23 | import de.kp.spark.core.actor._
24 | import de.kp.spark.core.model._
25 | 
26 | import de.kp.spark.outlier.{Configuration,RequestContext}
27 | 
28 | class OutlierMaster(@transient ctx:RequestContext) extends BaseMaster(Configuration) {
29 | 
30 |   protected def actor(worker:String):ActorRef = {
31 | 
32 |     worker match {
33 |       /*
34 |        * Metadata management is part of the core functionality; field or metadata
35 |        * specifications can be registered in, and retrieved from a Redis database.
36 |        */
37 |       case "fields" => context.actorOf(Props(new FieldQuestor(Configuration)))
38 |       case "register" => context.actorOf(Props(new BaseRegistrar(Configuration)))
39 |       /*
40 |        * Index management is part of the core functionality; an Elasticsearch
41 |        * index can be created and appropriate (tracked) items can be saved.
42 |        */
43 |       case "index" => context.actorOf(Props(new BaseIndexer(Configuration)))
44 |       case "track" => context.actorOf(Props(new BaseTracker(Configuration)))
45 | 
46 |       case "params" => context.actorOf(Props(new ParamQuestor(Configuration)))
47 |       /*
48 |        * Request the actual status of an outlier detection
49 |        * task; note that get requests should only be invoked after
50 |        * having retrieved a FINISHED status.
51 |        *
52 |        * Status management is part of the core functionality.
53 |        */
54 |       case "status" => context.actorOf(Props(new StatusQuestor(Configuration)))
55 | 
56 |       case "get" => context.actorOf(Props(new OutlierQuestor()))
57 |       case "train" => context.actorOf(Props(new OutlierMiner(ctx)))
58 | 
59 |       case _ => null
60 | 
61 |     }
62 | 
63 |   }
64 | 
65 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/OutlierMiner.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import akka.actor.{ActorRef,Props}
22 | 
23 | import de.kp.spark.core.Names
24 | 
25 | import de.kp.spark.core.actor._
26 | import de.kp.spark.core.model._
27 | 
28 | import de.kp.spark.outlier.{Configuration,RequestContext}
29 | import de.kp.spark.outlier.model._
30 | 
31 | /**
32 |  * The focus of the OutlierMiner is on the model building task,
33 |  * either for cluster-analysis-based outlier detection or for Markov-based state models.
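 *
 * Every request is validated first: it must carry a unique 'uid', a
 * supported 'algorithm' and a registered 'source'. Two illustrative
 * data maps and the rejections they trigger in validate():
 *
 *   Map("uid" -> "job-1")                        // NO_ALGORITHM_PROVIDED
 *   Map("uid" -> "job-1", "algorithm" -> "LOF")  // ALGORITHM_IS_UNKNOWN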
34 | */ 35 | class OutlierMiner(@transient ctx:RequestContext) extends BaseTrainer(Configuration) { 36 | 37 | protected def validate(req:ServiceRequest):Option[String] = { 38 | 39 | val uid = req.data(Names.REQ_UID) 40 | 41 | if (cache.statusExists(req)) { 42 | val message = Messages.TASK_ALREADY_STARTED(uid) 43 | return Some(message) 44 | 45 | } 46 | 47 | req.data.get(Names.REQ_ALGORITHM) match { 48 | 49 | case None => { 50 | return Some(Messages.NO_ALGORITHM_PROVIDED(uid)) 51 | } 52 | 53 | case Some(algorithm) => { 54 | if (Algorithms.isAlgorithm(algorithm) == false) { 55 | return Some(Messages.ALGORITHM_IS_UNKNOWN(uid,algorithm)) 56 | } 57 | 58 | } 59 | 60 | } 61 | 62 | req.data.get(Names.REQ_SOURCE) match { 63 | 64 | case None => { 65 | return Some(Messages.NO_SOURCE_PROVIDED(uid)) 66 | } 67 | 68 | case Some(source) => { 69 | if (Sources.isSource(source) == false) { 70 | return Some(Messages.SOURCE_IS_UNKNOWN(uid,source)) 71 | } 72 | } 73 | 74 | } 75 | 76 | None 77 | 78 | } 79 | 80 | /** 81 | * This is a helper method to determine which actor has to be 82 | * created to support the requested algorithm; actually KMeans 83 | * and Markov based algorithms are supported. 84 | */ 85 | protected def actor(req:ServiceRequest):ActorRef = { 86 | 87 | val algorithm = req.data(Names.REQ_ALGORITHM) 88 | if (algorithm == Algorithms.KMEANS) { 89 | context.actorOf(Props(new KMeansActor(ctx))) 90 | 91 | } else { 92 | context.actorOf(Props(new MarkovActor(ctx))) 93 | 94 | } 95 | 96 | } 97 | 98 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/OutlierQuestor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
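 *
 * Retrieval sketch: results are requested with a task of the form
 * 'get:<topic>', which is split on ':' below; the service name is
 * illustrative:
 *
 *   new ServiceRequest("outlier", "get:state",  Map("uid" -> "job-43"))  // Markov outliers
 *   new ServiceRequest("outlier", "get:vector", Map("uid" -> "job-42"))  // clustered points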
19 | */ 20 | 21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props} 22 | 23 | import de.kp.spark.core.Names 24 | import de.kp.spark.core.model._ 25 | 26 | import de.kp.spark.core.redis.RedisDB 27 | 28 | import de.kp.spark.outlier.model._ 29 | 30 | class OutlierQuestor extends BaseActor { 31 | 32 | implicit val ec = context.dispatcher 33 | private val redis = new RedisDB(host,port.toInt) 34 | 35 | def receive = { 36 | 37 | case req:ServiceRequest => { 38 | 39 | val origin = sender 40 | val uid = req.data("uid") 41 | 42 | val Array(task,topic) = req.task.split(":") 43 | topic match { 44 | 45 | case "state" => { 46 | 47 | val response = { 48 | 49 | if (redis.outliersExists(req) == false) { 50 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid)) 51 | 52 | } else { 53 | 54 | val outliers = redis.outliers(req) 55 | 56 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> outliers) 57 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS) 58 | 59 | } 60 | } 61 | 62 | origin ! response 63 | context.stop(self) 64 | 65 | } 66 | 67 | case "vector" => { 68 | 69 | val response = { 70 | 71 | if (redis.pointsExist(req) == false) { 72 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid)) 73 | 74 | } else { 75 | 76 | val points = redis.points(req) 77 | 78 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> points) 79 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS) 80 | 81 | } 82 | 83 | } 84 | origin ! response 85 | context.stop(self) 86 | 87 | } 88 | 89 | case _ => { 90 | 91 | val msg = Messages.TASK_IS_UNKNOWN(uid,req.task) 92 | 93 | origin ! failure(req,msg) 94 | context.stop(self) 95 | 96 | } 97 | 98 | } 99 | 100 | } 101 | 102 | } 103 | 104 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/TrainActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import de.kp.spark.core.model._ 22 | 23 | import de.kp.spark.outlier.RequestContext 24 | import de.kp.spark.outlier.model._ 25 | 26 | class TrainActor(@transient ctx:RequestContext) extends BaseActor { 27 | 28 | def receive = { 29 | 30 | case req:ServiceRequest => { 31 | 32 | val origin = sender 33 | val missing = try { 34 | 35 | validate(req) 36 | false 37 | 38 | } catch { 39 | case e:Exception => true 40 | 41 | } 42 | 43 | origin ! 
response(req, missing)
44 | 
45 |       if (missing == false) {
46 | 
47 |         try {
48 | 
49 |           /* Update cache */
50 |           cache.addStatus(req,OutlierStatus.TRAINING_STARTED)
51 | 
52 |           train(req)
53 | 
54 |           /* Update cache */
55 |           cache.addStatus(req,OutlierStatus.TRAINING_FINISHED)
56 | 
57 |         } catch {
58 |           case e:Exception => cache.addStatus(req,OutlierStatus.FAILURE)
59 |         }
60 | 
61 |       }
62 | 
63 |       context.stop(self)
64 | 
65 |     }
66 | 
67 |     case _ => {
68 | 
69 |       log.error("unknown request.")
70 |       context.stop(self)
71 | 
72 |     }
73 | 
74 |   }
75 | 
76 |   protected def validate(req:ServiceRequest) = {}
77 | 
78 |   protected def train(req:ServiceRequest) {}
79 | 
80 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/api/AkkaApi.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.api
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import akka.actor.{ActorSystem,Props}
22 | 
23 | import de.kp.spark.outlier.RequestContext
24 | import de.kp.spark.outlier.actor.OutlierMaster
25 | 
26 | class AkkaApi(system:ActorSystem,@transient val ctx:RequestContext) {
27 | 
28 |   val master = system.actorOf(Props(new OutlierMaster(ctx)), name="outlier-master")
29 | 
30 |   def start() {
31 |     /* Block the calling thread until the actor system has terminated */
32 |     system.awaitTermination()
33 |   }
34 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/app/TrainApp.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.app
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
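 *
 * Command-line sketch (option names as defined in createParams below;
 * the HOCON config keys mirror those consumed there, and all values
 * are illustrative):
 *
 *   java -cp spark-outlier.jar de.kp.spark.outlier.app.TrainApp \
 *     --key site-1 --uid job-42 --name kmeans-train --config train
 *
 *   // train.conf (loaded from the classpath via ConfigFactory.load)
 *   algo       = "KMEANS"
 *   source     = "ELASTIC"
 *   strategy   = "entropy"
 *   k          = 10
 *   top        = 10
 *   iterations = 20
 *   threshold  = 0.8
 *   scale      = 1
 *   states     = "S1,S2,S3"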
19 | */
20 | 
21 | import org.apache.spark.SparkContext
22 | 
23 | import akka.actor._
24 | import com.typesafe.config.ConfigFactory
25 | 
26 | import org.clapper.argot._
27 | 
28 | import de.kp.spark.core.Names
29 | import de.kp.spark.core.model._
30 | 
31 | import de.kp.spark.core.actor.Supervisor
32 | import de.kp.spark.core.SparkService
33 | 
34 | import de.kp.spark.outlier.{Configuration,RequestContext}
35 | 
36 | import de.kp.spark.outlier.actor.OutlierMaster
37 | import de.kp.spark.outlier.model._
38 | 
39 | import scala.concurrent.duration.DurationInt
40 | import scala.collection.mutable.HashMap
41 | 
42 | object TrainApp extends SparkService {
43 | 
44 |   protected val sc = createCtxLocal("OutlierContext",Configuration.spark)
45 |   protected val system = ActorSystem("OutlierSystem")
46 | 
47 |   protected val inbox = Inbox.create(system)
48 | 
49 |   sys.addShutdownHook({
50 |     /*
51 |      * In case of a system shutdown, we also make clear
52 |      * that the SparkContext is properly stopped as well
53 |      * as the respective Akka actor system
54 |      */
55 |     sc.stop
56 |     system.shutdown
57 | 
58 |   })
59 | 
60 |   def main(args:Array[String]) {
61 | 
62 |     try {
63 | 
64 |       val req_params = createParams(args)
65 |       val req = new ServiceRequest("context","train:model",req_params)
66 | 
67 |       val ctx = new RequestContext(sc)
68 |       val actor = system.actorOf(Props(new Handler(ctx)))
69 | 
70 |       inbox.watch(actor)
71 |       actor ! req
72 | 
73 |       /* Fall back to a 10 minute timeout if none is configured */
74 |       val timeout = DurationInt(req_params.getOrElse("timeout","10").toInt).minute
75 | 
76 |       while (inbox.receive(timeout).isInstanceOf[Terminated] == false) {}
77 |       sys.exit
78 | 
79 |     } catch {
80 |       case e:Exception => {
81 | 
82 |         println(e.getMessage)
83 |         sys.exit
84 | 
85 |       }
86 | 
87 |     }
88 | 
89 |   }
90 | 
91 |   protected def createParams(args:Array[String]):Map[String,String] = {
92 | 
93 |     import ArgotConverters._
94 | 
95 |     val parser = new ArgotParser(
96 |       programName = "Outlier Analysis Engine",
97 |       compactUsage = true,
98 |       preUsage = Some("Version %s. Copyright (c) 2015, %s.".format("1.0","Dr. Krusche & Partner PartG"))
99 |     )
100 | 
101 |     val site = parser.option[String](List("key"),"key","Unique application key")
102 |     val uid = parser.option[String](List("uid"),"uid","Unique job identifier")
103 | 
104 |     val name = parser.option[String](List("name"),"name","Unique job designator")
105 | 
106 |     val config = parser.option[String](List("config"),"config","Configuration file")
107 |     parser.parse(args)
108 | 
109 |     /* Collect parameters */
110 |     val params = HashMap.empty[String,String]
111 | 
112 |     /* Validate parameters */
113 |     site.value match {
114 | 
115 |       case None => parser.usage("Parameter 'key' is missing.")
116 |       case Some(value) => params += "site" -> value
117 | 
118 |     }
119 | 
120 |     uid.value match {
121 | 
122 |       case None => parser.usage("Parameter 'uid' is missing.")
123 |       case Some(value) => params += "uid" -> value
124 | 
125 |     }
126 | 
127 |     name.value match {
128 | 
129 |       case None => parser.usage("Parameter 'name' is missing.")
130 |       case Some(value) => params += "name" -> value
131 | 
132 |     }
133 | 
134 |     config.value match {
135 | 
136 |       case None => parser.usage("Parameter 'config' is missing.")
137 |       case Some(value) => {
138 | 
139 |         val cfg = ConfigFactory.load(value)
140 | 
141 |         val algo = cfg.getString("algo")
142 |         if (Algorithms.isAlgorithm(algo) == false)
143 |           parser.usage("Parameter 'algo' must be one of [KMEANS, MARKOV].")
144 | 
145 |         params += "algorithm" -> algo
146 |         params += "source" -> cfg.getString("source")
147 | 
148 |         /* COMMON */
149 |         params += "strategy" -> cfg.getString("strategy")
150 | 
151 |         /* KMEANS: KMeansActor validates 'top' and 'iterations' */
152 |         params += "k" -> cfg.getInt("k").toString
153 |         params += "top" -> cfg.getInt("top").toString
154 |         params += "iterations" -> cfg.getInt("iterations").toString
155 | 
156 |         /* MARKOV */
157 |         params += "threshold" -> cfg.getDouble("threshold").toString
158 | 
159 |         params += "scale" -> cfg.getInt("scale").toString
160 |         params += "states" -> cfg.getString("states")
161 | 
162 |       }
163 | 
164 |     }
165 | 
166 |     /* Add timestamp as global parameter */
167 |     params += "timestamp" -> new java.util.Date().getTime.toString
168 |     params.toMap
169 | 
170 |   }
171 | 
172 | }
173 | 
174 | class Handler(@transient ctx:RequestContext) extends Actor {
175 | 
176 |   private val config = Configuration
177 |   def receive = {
178 | 
179 |     case req:ServiceRequest => {
180 | 
181 |       val start = new java.util.Date().getTime
182 |       println("Trainer started at " + start)
183 | 
184 |       val master = context.actorOf(Props(new OutlierMaster(ctx)))
185 |       master ! Serializer.serializeRequest(req)
186 | 
187 |       val status = OutlierStatus.TRAINING_FINISHED
188 |       val supervisor = context.actorOf(Props(new Supervisor(req,status,config)))
189 | 
190 |     }
191 | 
192 |     case evt:StatusEvent => {
193 |       /*
194 |        * The StatusEvent message is returned from the
195 |        * supervisor actor and specifies that the model
196 |        * training task has been finished
197 |        */
198 |       val end = new java.util.Date().getTime
199 |       println("Trainer finished at " + end)
200 | 
201 |       context.stop(self)
202 | 
203 |     }
204 | 
205 |     case msg:String => {
206 | 
207 |       val end = new java.util.Date().getTime
208 |       println("Trainer finished at " + end)
209 | 
210 |       val response = Serializer.deserializeResponse(msg)
211 | 
212 |       println("Message: " + response.data("message").toString)
213 |       println("Status: " + response.status)
214 | 
215 |     }
216 | 
217 |   }
218 | 
219 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/DoubleMatrix.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import scala.collection.mutable.ArrayBuffer
22 | import scala.Array.canBuildFrom
23 | 
24 | class DoubleMatrix(numRow:Int,numCol:Int) {
25 | 
26 |   protected val table:Array[Array[Double]] = Array.fill[Double](numRow,numCol)(0.0)
27 | 
28 |   protected var rowLabels = Array.empty[String]
29 |   protected var colLabels = Array.empty[String]
30 | 
31 |   def setStates(rowStates:Array[String], colStates:Array[String]) {
32 | 
33 |     this.rowLabels = rowStates
34 |     this.colLabels = colStates
35 | 
36 |   }
37 | 
38 |   def set(row:Int,col:Int,valu:Double) {
39 |     table(row)(col) = valu
40 |   }
41 | 
42 |   def get(row:Int,col:Int):Double = table(row)(col)
43 | 
44 |   def getRow(row:Int):Array[Double] = table(row)
45 | 
46 |   def getRow(rowLabel:String):Array[Double] = table(rowLabels.indexOf(rowLabel))
47 | 
48 |   def getRowLabel(row:Int) = rowLabels(row)
49 | 
50 |   def getColLabel(col:Int) = colLabels(col)
51 | 
52 |   def add(row:Int,col:Int,valu:Double) {
53 |     table(row)(col) = table(row)(col) + valu
54 |   }
55 | 
56 |   def add(rowLabel:String,colLabel:String,valu:Double) {
57 | 
58 |     val (row,col) = getRowCol(rowLabel,colLabel)
59 |     table(row)(col) += valu
60 | 
61 |   }
62 | 
63 |   def increment(row:Int,col:Int) {
64 |     table(row)(col) = table(row)(col) + 1
65 |   }
66 | 
67 |   def increment(rowLabel:String, colLabel:String) {
68 | 
69 |     val (row,col) = getRowCol(rowLabel, colLabel)
70 |     table(row)(col) = table(row)(col) + 1
71 | 
72 |   }
73 | 
74 |   def getRowSum(row:Int):Double = table(row).sum
75 | 
76 |   def getColumnSum(col:Int):Double = {
77 | 
78 |     var sum:Double = 0
79 |     (0 until numRow).foreach(row => sum += table(row)(col))
80 | 
81 |     sum
82 | 
83 |   }
84 | 
85 |   def serialize():String = {
86 | 
87 |     val output = ArrayBuffer.empty[String]
88 |     (0 until numRow).foreach(row => output += serializeRow(row))
89 | 
90 |     output.mkString(";")
91 | 
92 |   }
93 | 
94 |   def serializeRow(row:Int):String = table(row).mkString(",")
95 | 
96 |   def deserialize(data:String) {
97 | 
98 |     val rows = data.split(";")
99 |     (0 until rows.length).foreach(row => deserializeRow(row,rows(row)))
100 | 
101 |   }
102 | 
103 |   def deserializeRow(row:Int,data:String) {
104 |     table(row) = data.split(",").map(_.toDouble)
105 |   }
106 | 
107 |   private def getRowCol(rowLabel:String,colLabel:String):(Int,Int) = {
108 | 
109 |     val row = rowLabels.indexOf(rowLabel)
110 |     val col = colLabels.indexOf(colLabel)
111 | 
112 |     (row,col)
113 | 
114 |   }
115 | 
116 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/MarkovBuilder.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.rdd.RDD
22 | 
23 | import de.kp.spark.core.model._
24 | import scala.collection.mutable.HashMap
25 | 
26 | private case class Pair(ant:String,con:String)
27 | 
28 | class MarkovBuilder(scaleDef:Int,stateDefs:Array[String]) extends Serializable {
29 | 
30 |   def build(dataset:RDD[Behavior]):TransitionMatrix = {
31 | 
32 |     def seqOp(support:HashMap[Pair,Int],seq:Behavior):HashMap[Pair,Int] = {
33 | 
34 |       val (site,user,states) = (seq.site,seq.user,seq.states)
35 |       /*
36 |        * The pair support aggregates over all sites and users provided;
37 |        * for an outlier detection, we assume that this is the best way
38 |        * to determine state transition probabilities
39 |        */
40 |       for (i <- 1 until states.size) {
41 | 
42 |         val pair = new Pair(states(i-1),states(i))
43 | 
44 |         support.get(pair) match {
45 |           case None => support += pair -> 1
46 |           case Some(count) => support += pair -> (count + 1)
47 |         }
48 | 
49 |       }
50 | 
51 |       support
52 | 
53 |     }
54 | 
55 |     /*
56 |      * As the dataset is coalesced into a single partition, seqOp already
57 |      * accumulates the complete support map; combOp is only invoked with
58 |      * the empty zero value, so returning supp2 is sufficient
59 |      */
60 |     def combOp(supp1:HashMap[Pair,Int],supp2:HashMap[Pair,Int]):HashMap[Pair,Int] = supp2
61 | 
62 |     /* Build pair support */
63 |     val pairsupp = dataset.coalesce(1, false).aggregate(HashMap.empty[Pair,Int])(seqOp,combOp)
64 | 
65 |     /* Setup transition matrix and add pair support */
66 |     val dim = stateDefs.length
67 | 
68 |     val matrix = new TransitionMatrix(dim,dim)
69 |     matrix.setScale(scaleDef)
70 | 
71 |     matrix.setStates(stateDefs, stateDefs)
72 |     for ((pair,support) <- pairsupp) {
73 |       matrix.add(pair.ant, pair.con, support)
74 |     }
75 | 
76 |     /* Normalize the matrix content and transform support into probabilities */
77 |     matrix.normalize()
78 | 
79 |     matrix
80 | 
81 |   }
82 | 
83 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/StateMetrics.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | 
3 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
4 | *
5 | * This file is part of the Spark-Outlier project
6 | * (https://github.com/skrusche63/spark-outlier).
7 | *
8 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
9 | * terms of the GNU General Public License as published by the Free Software
10 | * Foundation, either version 3 of the License, or (at your option) any later
11 | * version.
12 | *
13 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
14 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 | * You should have received a copy of the GNU General Public License along with
17 | * Spark-Outlier.
18 | *
19 | * If not, see <http://www.gnu.org/licenses/>.
20 | */
21 | 
22 | class StateMetrics(stateDefs:Array[String]) extends Serializable {
23 | 
24 |   /*
25 |    * Miss Probability Metric
26 |    *
27 |    * For any pair of consecutive transaction states t(i) and t(j) in a sequence,
28 |    * the following quantity is calculated: For the row corresponding to t(i), we
29 |    * sum all the probabilities except for the target state t(j).
30 |    *
31 |    * F(t(i), t(j)) = Sum(P(t(i), t(k)) | k != j) where P(t(i), t(k)) is the probability
32 |    * of transitioning from transaction state t(i) to t(k)
33 |    *
34 |    * Then we sum F over all the transaction state pairs in the sequence and normalize by
35 |    * the number of such pairs.
36 |    */
37 |   def missProbMetric(states:List[String],model:TransitionMatrix):Double = {
38 | 
39 |     var F:Double = 0
40 |     var count:Int = 0
41 | 
42 |     for (i <- 1 until states.size) {
43 | 
44 |       val srcIndex = stateDefs.indexOf(states(i-1))
45 |       val tarIndex = stateDefs.indexOf(states(i))
46 | 
47 |       /* Sum all probabilities except the target state */
48 |       for (j <- 0 until stateDefs.length) {
49 |         if (j != tarIndex)
50 |           F += model.get(srcIndex,j)
51 |       }
52 | 
53 |       count += 1
54 |     }
55 | 
56 |     val metric = F / count
57 |     metric
58 | 
59 |   }
60 | 
61 |   /*
62 |    * Miss Rate Metric
63 |    *
64 |    * For any transition, if the transition corresponds to the maximum probability
65 |    * target state, the value is 0, otherwise it is 1.
66 |    *
67 |    * F(t(i), t(j)) = 0 if t(j) = t(k) else 1 where t(k) is the target state when P(t(i), t(k)) = max(P(t(i), t(l)) for all l
68 |    *
69 |    * Then we sum F over all the transaction state pairs in the sequence and normalize by
70 |    * the number of such pairs.
71 |    */
72 |   def missRateMetric(states:List[String],model:TransitionMatrix):Double = {
73 | 
74 |     var F:Double = 0
75 |     var count:Int = 0
76 | 
77 |     for (i <- 1 until states.size) {
78 | 
79 |       val srcIndex = stateDefs.indexOf(states(i-1))
80 |       val tarIndex = stateDefs.indexOf(states(i))
81 | 
82 |       /* Determine the column that holds the maximum transition probability */
83 |       val row = model.getRow(srcIndex)
84 |       val maxIndex = row.indexOf(row.max)
85 | 
86 |       F += (if (tarIndex == maxIndex) 0 else 1)
87 |       count += 1
88 | 
89 |     }
90 | 
91 |     val metric = F / count
92 |     metric
93 | 
94 |   }
95 | 
96 |   /*
97 |    * Entropy Reduction Metric
98 |    *
99 |    * We calculate two quantities F and G as below. For a given row, F is the entropy excluding the target state for the state pair
100 |    * under consideration. G is the entropy for the whole row.
101 |    *
102 |    * F(t(i), t(j)) = sum (-P(t(i), t(k)) log(P(t(i), t(k)) | t(k) != t(j)
103 |    * G(t(i)) = sum (-P(t(i), t(k)) log(P(t(i), t(k))
104 |    *
105 |    * We sum F and G over all consecutive state pairs and divide the two sums.
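 *
 * Worked example (illustrative): for a row with transition
 * probabilities (0.9, 0.1) and an observed transition into the
 * second state,
 *
 *   F = -0.9*log(0.9)                  = 0.095
 *   G = -0.9*log(0.9) - 0.1*log(0.1)   = 0.325
 *
 * so for a sequence consisting of this single pair the metric is
 * F/G = 0.29.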
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/TransitionMatrix.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.markov
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

class TransitionMatrix(numRow:Int,numCol:Int) extends DoubleMatrix(numRow,numCol) {

  private var scale = 100

  def setScale(scale:Int) {
    this.scale = scale
  }

  def normalize() {
    /*
     * Laplace correction: in every row that contains at least one zero
     * value, each cell is shifted by 1; this avoids zero probabilities
     * and thereby undefined terms in the entropy based metrics
     */
    (0 until numRow).foreach(row => {

      val transProbs = getRow(row)
      if (transProbs.min == 0) {
        (0 until numCol).foreach(col => table(row)(col) += 1)
      }

    })

    /*
     * Normalize the transition support; after this step every row sums
     * up to the value of 'scale', i.e. for the default scale of 100 the
     * cells hold transition probabilities in percent
     */
    (0 until numRow).foreach(row => {
      val rowSum = getRowSum(row)
      (0 until numCol).foreach(col => table(row)(col) = (table(row)(col) * scale) / rowSum)
    })

  }

}
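/*
 * A worked example (added for illustration): assume a 2 x 2 matrix with the
 * raw pair supports below and the default scale of 100.
 *
 *   row 0: (3, 1) -> no zero cell, no correction   -> (75.00, 25.00)
 *   row 1: (4, 0) -> Laplace correction to (5, 1)  -> (83.33, 16.67)
 *
 * The correction keeps unseen transitions at a small, non-zero probability
 * instead of an impossible one.
 */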
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/model/Model.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.model
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._

object Algorithms {

  val KMEANS:String = "KMEANS"
  val MARKOV:String = "MARKOV"

  private def algorithms = List(KMEANS,MARKOV)
  def isAlgorithm(algorithm:String):Boolean = algorithms.contains(algorithm)

}

object Serializer extends BaseSerializer

object Messages extends BaseMessages {

  def MISSING_PARAMETERS(uid:String):String = String.format("""Parameters are missing for uid '%s'.""", uid)

  def NO_METHOD_PROVIDED(uid:String):String = String.format("""No method provided for uid '%s'.""", uid)

  def METHOD_NOT_SUPPORTED(uid:String):String = String.format("""The provided method is not supported for uid '%s'.""", uid)

  def OUTLIER_DETECTION_STARTED(uid:String) = String.format("""Outlier detection started for uid '%s'.""", uid)

  def OUTLIERS_DO_NOT_EXIST(uid:String):String = String.format("""The outliers for uid '%s' do not exist.""", uid)

}

object OutlierStatus extends BaseStatus {

  val DATASET:String = "dataset"
  val TRAINED:String = "trained"

  val STARTED:String = "started"
  val STOPPED:String = "stopped"

  val FINISHED:String = "finished"
  val RUNNING:String = "running"

}
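/*
 * A minimal validation sketch (added for illustration, not part of the
 * original file): how the objects above are typically combined when a
 * request arrives; the 'algorithm' parameter name is an assumption of
 * this example.
 */
object ModelDemo {

  def validate(uid:String,algorithm:String):Option[String] = {

    /* Returns a ready-to-send error message, or None if the request is valid */
    if (algorithm == null || algorithm.isEmpty) Some(Messages.NO_METHOD_PROVIDED(uid))
    else if (!Algorithms.isAlgorithm(algorithm)) Some(Messages.METHOD_NOT_SUPPORTED(uid))
    else None

  }

}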
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/StateSpec.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.spec
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._
import de.kp.spark.core.redis.RedisCache

import de.kp.spark.core.spec.Fields
import de.kp.spark.outlier.Configuration

import scala.xml._
import scala.collection.mutable.Buffer

class StateSpec(req:ServiceRequest) extends Fields {

  val path = "states.xml"

  val (host,port) = Configuration.redis
  val cache = new RedisCache(host,port.toInt)

  private val fields = load

  def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap

  def names:List[String] = fields.map(_.name)

  def types:List[String] = fields.map(_.datatype)

  /*
   * Loading is implemented as a method so that 'fields' above can safely
   * invoke it during construction; a val defined further down the class
   * body would still be uninitialized (null) at that point
   */
  private def load:List[Field] = {

    val data = Buffer.empty[Field]

    try {

      if (cache.fieldsExist(req)) {

        /* Prefer the field specification registered in the Redis cache */
        val fieldspec = cache.fields(req)
        for (field <- fieldspec) {
          data += Field(field.name,field.datatype,field.value)
        }

      } else {

        /* Otherwise fall back to the bundled states.xml resource */
        val root = XML.load(getClass.getClassLoader.getResource(path))
        for (field <- root \ "field") {

          val _name = (field \ "@name").toString
          val _type = (field \ "@type").toString

          val _mapping = field.text

          data += Field(_name,_type,_mapping)

        }

      }

    } catch {
      /* If neither source is available, the field list stays empty */
      case e:Exception => {}
    }

    data.toList

  }

}
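/*
 * A hypothetical states.xml (added for illustration) in the shape this
 * loader expects: one <field> element per attribute, with the mapping as
 * the element text; the root element name is irrelevant to the loader,
 * and the concrete field names below are assumptions, not project defaults.
 *
 *   <fields>
 *     <field name="site" type="string">site</field>
 *     <field name="user" type="string">user</field>
 *     <field name="state" type="string">state</field>
 *   </fields>
 *
 * VectorSpec below consumes features.xml through exactly the same code path.
 */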
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/VectorSpec.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.spec
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._
import de.kp.spark.core.redis.RedisCache

import de.kp.spark.core.spec.Fields
import de.kp.spark.outlier.Configuration

import scala.xml._
import scala.collection.mutable.Buffer

class VectorSpec(req:ServiceRequest) extends Fields {

  val path = "features.xml"

  val (host,port) = Configuration.redis
  val cache = new RedisCache(host,port.toInt)

  private val fields = load

  def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap

  def names:List[String] = fields.map(_.name)

  def types:List[String] = fields.map(_.datatype)

  /*
   * Loading is implemented as a method so that 'fields' above can safely
   * invoke it during construction; a val defined further down the class
   * body would still be uninitialized (null) at that point
   */
  private def load:List[Field] = {

    val data = Buffer.empty[Field]

    try {

      if (cache.fieldsExist(req)) {

        /* Prefer the field specification registered in the Redis cache */
        val fieldspec = cache.fields(req)
        for (field <- fieldspec) {
          data += Field(field.name,field.datatype,field.value)
        }

      } else {

        /* Otherwise fall back to the bundled features.xml resource */
        val root = XML.load(getClass.getClassLoader.getResource(path))
        for (field <- root \ "field") {

          val _name = (field \ "@name").toString
          val _type = (field \ "@type").toString

          val _mapping = field.text

          data += Field(_name,_type,_mapping)

        }

      }

    } catch {
      /* If neither source is available, the field list stays empty */
      case e:Exception => {}
    }

    data.toList

  }

}
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/MathHelper.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.util
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import org.apache.spark.rdd.RDD

object MathHelper {

  /**
   * Entropy (base 2) of a dataset containing integers
   */
  def intEntropy(data:TraversableOnce[Int]):Double = {

    val invLog2 = 1.0 / Math.log(2)

    /* Materialize first: a TraversableOnce must not be traversed twice */
    val positives = data.toList.filter(_ > 0)
    if (positives.size > 0) {

      val sum:Double = positives.sum
      val invSum = 1.0 / sum

      positives.map { positive =>

        val p = positive.toDouble * invSum
        -p * Math.log(p) * invLog2

      }.sum

    } else {
      0.0
    }

  }

  /**
   * Entropy (base 2) of a dataset containing strings; it may be
   * used as a measure of the homogeneity of the strings
   */
  def strEntropy(data:TraversableOnce[String]):Double = {

    val invLog2 = 1.0 / Math.log(2)

    /* Materialize first: a TraversableOnce must not be traversed twice */
    val items = data.toList

    val len = items.size
    if (len > 1) {

      val invLen = 1.0 / len.toDouble
      var ent = 0.0

      for (str <- items.distinct) {
        /*
         * Probability to find a certain value within the dataset
         */
        val pstr = items.count(x => x == str).toDouble * invLen
        ent -= pstr * Math.log(pstr) * invLog2

      }

      ent

    } else {
      0.0

    }

  }

  /**
   * Data is a distributed list of feature vectors (Array[Double]) with the
   * following semantic: vector = [f_0,f_1,f_2, ...]; i.e. each vector holds
   * a certain value for feature i at position i. Normalizing these data means
   * that one has to normalize all values of feature f_0, all values of f_1, etc.
   */
  def normalize(data:RDD[Array[Double]]):RDD[Array[Double]] = {

    val total = data.count()

    /*
     * Each column of the data matrix is assigned to a certain feature;
     * we therefore have to sum up the values of each column independently
     * and build the mean value
     */
    val sums = data.reduce((a,b) => a.zip(b).map(t => t._1 + t._2))
    val means = sums.map(_ / total)

    /*
     * We build the standard deviation for the values of each column; the
     * squares are built per element BEFORE summing, so that combining the
     * partial sums of two partitions never squares an aggregate again
     */
    val sumSquares = data.map(_.map(v => v * v)).reduce((a,b) => a.zip(b).map(t => t._1 + t._2))

    val stdevs = sumSquares.zip(sums).map {
      case (sumSq,sum) => Math.sqrt(total*sumSq - sum*sum) / total
    }

    /*
     * Finally, for each column (or feature), every single value gets
     * normalized using the mean value and the standard deviation
     */
    data.map(vector =>
      (vector,means,stdevs).zipped.map((value,mean,stdev) =>
        if (stdev <= 0) value - mean else (value - mean) / stdev
      )
    )

  }

}
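/*
 * A minimal sketch (added for illustration, not part of the original file),
 * assuming a SparkContext `sc`: the entropy drops to 0 for a perfectly
 * homogeneous dataset, and normalize centers every feature column around 0.
 */
object MathHelperDemo {

  def demo(sc:org.apache.spark.SparkContext) {

    /* 1.0 bit for a perfect 50:50 split, 0.0 for a single repeated value */
    println(MathHelper.strEntropy(List("a","b","a","b")))
    println(MathHelper.strEntropy(List("a","a","a","a")))

    /* Column-wise z-score normalization of two 2-dimensional vectors */
    val vectors = sc.parallelize(Seq(Array(1.0,10.0),Array(3.0,30.0)))
    MathHelper.normalize(vectors).collect().foreach(v => println(v.mkString(",")))

  }

}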
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/Optimizer.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.util
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

import de.kp.spark.core.model.LabeledPoint

object Optimizer {

  /**
   * Determine, from a range of cluster numbers, the number for which the
   * mean entropy of all cluster labels is minimal; note that the entropy
   * is an indicator for the homogeneity of the cluster labels
   */
  def optimizeByEntropy(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {

    /* Evaluate the candidate cluster numbers in parallel and pick the best */
    val scores = range.par.map(k => (k, clusterEntropy(data,k,iterations))).toList
    scores.minBy(_._2)._1

  }

  def clusterEntropy(data:RDD[LabeledPoint],clusters:Int,iterations:Int):Double = {

    val vectors = data.map(point => Vectors.dense(point.features))
    val model = KMeans.train(vectors,clusters,iterations)

    /* Group the labels by their predicted cluster and average the entropies */
    val entropies = data.map(point => {

      val cluster = model.predict(Vectors.dense(point.features))
      (cluster,point.label)

    }).groupBy(_._1).map(group => MathHelper.strEntropy(group._2.map(_._2))).collect()

    entropies.sum / entropies.size

  }

  /**
   * Determine, from a range of cluster numbers, the number for which the mean
   * distance between the cluster points and their cluster centers is minimal
   */
  def optimizeByDistance(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {

    val scores = range.par.map(k => (k, clusterDistance(data, k, iterations))).toList
    scores.minBy(_._2)._1

  }

  /** Euclidean distance between two feature vectors */
  def distance(a:Array[Double], b:Array[Double]) =
    Math.sqrt(a.zip(b).map(p => p._1 - p._2).map(d => d * d).sum)

  /**
   * This method calculates the mean distance of all data (vectors) from
   * their centroids, given certain clustering parameters; the method may
   * be used to score clusterings
   */
  def clusterDistance(data:RDD[LabeledPoint], clusters:Int, iterations:Int):Double = {

    val vectors = data.map(point => Vectors.dense(point.features))
    val model = KMeans.train(vectors,clusters,iterations)
    /**
     * Centroid: vector that specifies the centre of a certain cluster
     */
    val centroids = model.clusterCenters

    val distances = data.map(point => {

      val cluster = model.predict(Vectors.dense(point.features))
      val centroid = centroids(cluster)

      distance(centroid.toArray,point.features)

    }).collect()

    distances.sum / distances.size

  }

}
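/*
 * A minimal sketch (added for illustration, not part of the original file),
 * assuming a SparkContext `sc` and that LabeledPoint(label,features) from
 * de.kp.spark.core.model wraps a string label and an Array[Double] feature
 * vector; the constructor argument order is an assumption of this example.
 */
object OptimizerDemo {

  def demo(sc:org.apache.spark.SparkContext) {

    /* Two well-separated groups, so a cluster number of 2 should win */
    val points = sc.parallelize(Seq(
      LabeledPoint("a",Array(0.0,0.1)),
      LabeledPoint("a",Array(0.1,0.0)),
      LabeledPoint("b",Array(9.0,9.1)),
      LabeledPoint("b",Array(9.1,9.0))
    ))

    /* Search k in 2..4 with 10 KMeans iterations per candidate */
    println(Optimizer.optimizeByDistance(points,2 to 4,10))

  }

}
--------------------------------------------------------------------------------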