├── .gitignore
├── README.md
├── images
│   ├── outlier-detection-overview.png
│   └── predictiveworks.png
├── pom.xml
└── src
    └── main
        ├── resources
        │   ├── application.conf
        │   ├── features.xml
        │   ├── server.conf
        │   └── states.xml
        └── scala
            └── de
                └── kp
                    └── spark
                        └── outlier
                            ├── Configuration.scala
                            ├── KMeansDetector.scala
                            ├── MarkovDetector.scala
                            ├── OutlierServer.scala
                            ├── RequestContext.scala
                            ├── actor
                            │   ├── BaseActor.scala
                            │   ├── KMeansActor.scala
                            │   ├── MarkovActor.scala
                            │   ├── OutlierMaster.scala
                            │   ├── OutlierMiner.scala
                            │   ├── OutlierQuestor.scala
                            │   └── TrainActor.scala
                            ├── api
                            │   └── AkkaApi.scala
                            ├── app
                            │   └── TrainApp.scala
                            ├── markov
                            │   ├── DoubleMatrix.scala
                            │   ├── MarkovBuilder.scala
                            │   ├── StateMetrics.scala
                            │   └── TransitionMatrix.scala
                            ├── model
                            │   └── Model.scala
                            ├── spec
                            │   ├── StateSpec.scala
                            │   └── VectorSpec.scala
                            └── util
                                ├── MathHelper.scala
                                └── Optimizer.scala

/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 | 
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 | 
17 | # idea
18 | .idea
19 | *.iml
20 | 
21 | # building
22 | target
23 | build
24 | null
25 | tmp*
26 | temp*
27 | dist
28 | test-output
29 | build.log
30 | 
31 | # other scm
32 | .svn
33 | .CVS
34 | .hg*
35 | 
36 | # switch to regexp syntax.
37 | #   syntax: regexp
38 | #   ^\.pc/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Predictiveworks.](https://raw.githubusercontent.com/skrusche63/spark-outlier/master/images/predictiveworks.png)
2 | 
3 | **Predictiveworks.** is an open ensemble of predictive engines that covers a wide range of today's analytics requirements and brings the power of predictive analytics to Elasticsearch.
4 | 
5 | ## Reactive Outlier Detection Engine
6 | 
7 | ![Outlier Detection Engine Overview](https://raw.githubusercontent.com/skrusche63/spark-outlier/master/images/outlier-detection-overview.png)
8 | 
9 | The Outlier Detection Engine is one of the nine members of this open ensemble. It is built to find anomalies in large-scale datasets and in human behavior, aimed at advanced risk reduction.
--------------------------------------------------------------------------------
/images/outlier-detection-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/outlier-detection-overview.png
--------------------------------------------------------------------------------
/images/predictiveworks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/predictiveworks.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2 |   <modelVersion>4.0.0</modelVersion>
3 |   <groupId>spark-outlier</groupId>
4 |   <artifactId>spark-outlier</artifactId>
5 |   <version>0.2.2</version>
6 |   <name>Spark-Outlier</name>
7 |   <description>Reactive Outlier Detection Engine</description>
8 |   <inceptionYear>2010</inceptionYear>
9 |   <licenses>
10 |     <license>
11 |       <name>My License</name>
12 |       <url>http://....</url>
13 |       <distribution>repo</distribution>
14 |     </license>
15 |   </licenses>
16 | 
17 |   <properties>
18 |     <maven.compiler.source>1.6</maven.compiler.source>
19 |     <maven.compiler.target>1.6</maven.compiler.target>
20 |     <encoding>UTF-8</encoding>
21 |     <scala.tools.version>2.10</scala.tools.version>
22 |     <scala.version>2.10.0</scala.version>
23 |     <spark.version>1.2.0</spark.version>
24 |   </properties>
25 | 
26 |   <dependencies>
27 |     <dependency>
28 |       <groupId>org.scala-lang</groupId>
29 |       <artifactId>scala-library</artifactId>
30 |       <version>${scala.version}</version>
31 |     </dependency>
32 | 
33 |     <!-- Test -->
34 |     <dependency>
35 |       <groupId>junit</groupId>
36 |       <artifactId>junit</artifactId>
37 |       <version>4.11</version>
38 |       <scope>test</scope>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.specs2</groupId>
42 |       <artifactId>specs2_${scala.tools.version}</artifactId>
43 |       <version>1.13</version>
44 |       <scope>test</scope>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.scalatest</groupId>
48 |       <artifactId>scalatest_${scala.tools.version}</artifactId>
49 |       <version>2.0.M6-SNAP8</version>
50 |       <scope>test</scope>
51 |     </dependency>
52 | 
53 |     <dependency>
54 |       <groupId>org.apache.spark</groupId>
55 |       <artifactId>spark-core_2.10</artifactId>
56 |       <version>${spark.version}</version>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.apache.spark</groupId>
60 |       <artifactId>spark-mllib_2.10</artifactId>
61 |       <version>${spark.version}</version>
62 |     </dependency>
63 | 
64 |     <dependency>
65 |       <groupId>cascading</groupId>
66 |       <artifactId>cascading-core</artifactId>
67 |       <version>2.5.4</version>
68 |     </dependency>
69 |     <dependency>
70 |       <groupId>cascading</groupId>
71 |       <artifactId>cascading-hadoop</artifactId>
72 |       <version>2.5.4</version>
73 |     </dependency>
74 | 
75 |     <dependency>
76 |       <groupId>org.elasticsearch</groupId>
77 |       <artifactId>elasticsearch-hadoop</artifactId>
78 |       <version>2.0.0</version>
79 |     </dependency>
80 |     <dependency>
81 |       <groupId>org.elasticsearch</groupId>
82 |       <artifactId>elasticsearch</artifactId>
83 |       <version>1.3.2</version>
84 |     </dependency>
85 | 
86 |     <dependency>
87 |       <groupId>org.json4s</groupId>
88 |       <artifactId>json4s-native_2.10</artifactId>
89 |       <version>3.2.10</version>
90 |     </dependency>
91 | 
92 |     <dependency>
93 |       <groupId>redis.clients</groupId>
94 |       <artifactId>jedis</artifactId>
95 |       <version>2.5.2</version>
96 |     </dependency>
97 | 
98 |     <dependency>
99 |       <groupId>org.clapper</groupId>
100 |       <artifactId>argot_2.10</artifactId>
101 |       <version>1.0.3</version>
102 |     </dependency>
103 |   </dependencies>
104 | 
105 |   <repositories>
106 |     <repository>
107 |       <id>conjars.org</id>
108 |       <url>http://conjars.org/repo</url>
109 |     </repository>
110 |   </repositories>
111 | 
112 |   <build>
113 |     <sourceDirectory>src/main/scala</sourceDirectory>
114 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
115 |     <plugins>
116 |       <plugin>
117 |         <groupId>net.alchim31.maven</groupId>
118 |         <artifactId>scala-maven-plugin</artifactId>
119 |         <version>3.1.3</version>
120 |         <executions>
121 |           <execution>
122 |             <goals>
123 |               <goal>compile</goal>
124 |               <goal>testCompile</goal>
125 |             </goals>
126 |             <configuration>
127 |               <args>
128 |                 <arg>-make:transitive</arg>
129 |                 <arg>-dependencyfile</arg>
130 |                 <arg>${project.build.directory}/.scala_dependencies</arg>
131 |               </args>
132 |             </configuration>
133 |           </execution>
134 |         </executions>
135 |       </plugin>
136 |       <plugin>
137 |         <groupId>org.apache.maven.plugins</groupId>
138 |         <artifactId>maven-surefire-plugin</artifactId>
139 |         <version>2.13</version>
140 |         <configuration>
141 |           <useFile>false</useFile>
142 |           <disableXmlReport>true</disableXmlReport>
143 |           <includes>
144 |             <include>**/*Test.*</include>
145 |             <include>**/*Suite.*</include>
146 |           </includes>
147 |         </configuration>
148 |       </plugin>
149 |     </plugins>
150 |   </build>
151 | 
152 |   <organization>
153 |     <name>Dr. Krusche &amp; Partner PartG</name>
154 |     <url>http://www.dr-kruscheundpartner.com</url>
155 |   </organization>
156 |   <url>https://github.com/skrusche63/spark-outlier</url>
157 | </project>
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 |   loglevel = INFO
3 |   stdout-loglevel = INFO
4 |   loggers = ["akka.event.slf4j.Slf4jLogger"]
5 | }
6 | 
7 | actor {
8 |   duration = 10
9 |   retries = 10
10 |   timeout = 10
11 | }
12 | 
13 | #
14 | # Access to cassandra is provided by Datastax' spark-cassandra-connector; the respective
15 | # configuration parameters can be retrieved from here:
16 | #
17 | # https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md
18 | #
19 | cassandra {
20 |   spark.cassandra.connection.host="127.0.0.1"
21 | }
22 | 
23 | elastic {
24 |   es.nodes="localhost"
25 |   es.port="9200"
26 |   es.resource=""
27 |   es.query=""
28 | }
29 | 
30 | file {
31 |   items=""
32 |   features=""
33 | }
34 | 
35 | hbase {
36 |   spark.hbase.host="127.0.0.1"
37 | }
38 | 
39 | mongo {
40 |   mongo.input.uri="mongodb://127.0.0.1:27017/beowulf.input"
41 | }
42 | 
43 | mysql {
44 |   url="127.0.0.1:8889"
45 |   database="analytics"
46 |   user="root"
47 |   password="root"
48 | }
49 | 
50 | redis {
51 |   host="127.0.0.1"
52 |   port="6379"
53 | }
54 | 
55 | #
56 | # Configuration parameters for the REST API
57 | # of the Outlier Detection Engine
58 | #
59 | rest {
60 |   host="127.0.0.1"
61 |   port=9000
62 | }
63 | 
64 | spark {
65 |   spark.executor.memory="1g"
66 |   spark.kryoserializer.buffer.mb="256"
67 | }
--------------------------------------------------------------------------------
/src/main/resources/features.xml:
--------------------------------------------------------------------------------
1 | 
2 | row
3 | col
4 | label
5 | value
--------------------------------------------------------------------------------
/src/main/resources/server.conf:
--------------------------------------------------------------------------------
1 | akka {
2 |   actor {
3 |     provider = "akka.remote.RemoteActorRefProvider"
4 |   }
5 |   remote {
6 |     enabled-transports = ["akka.remote.netty.tcp"]
7 |     netty.tcp {
8 |       hostname = "127.0.0.1"
9 |       port = 2604
10 |     }
11 |     log-sent-messages = on
12 |     log-received-messages = on
13 |   }
14 | }
--------------------------------------------------------------------------------
/src/main/resources/states.xml:
--------------------------------------------------------------------------------
1 | 
2 | site
3 | user
4 | timestamp
5 | state
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/Configuration.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
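 *
 * A usage sketch for the Configuration object defined in this file
 * (illustrative; it mirrors how BaseActor consumes it further below):
 *
 *   val (host, port) = Configuration.redis
 *   val cache = new RedisCache(host, port.toInt)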
19 | */ 20 | 21 | import com.typesafe.config.ConfigFactory 22 | import org.apache.hadoop.conf.{Configuration => HConf} 23 | 24 | import de.kp.spark.core.{Configuration => CoreConf} 25 | 26 | object Configuration extends CoreConf { 27 | 28 | /* Load configuration for router */ 29 | val path = "application.conf" 30 | val config = ConfigFactory.load(path) 31 | 32 | override def actor:(Int,Int,Int) = { 33 | 34 | val cfg = config.getConfig("actor") 35 | 36 | val duration = cfg.getInt("duration") 37 | val retries = cfg.getInt("retries") 38 | val timeout = cfg.getInt("timeout") 39 | 40 | (duration,retries,timeout) 41 | 42 | } 43 | 44 | override def cassandra:Map[String,String] = { 45 | 46 | val cfg = config.getConfig("cassandra") 47 | val conf = Map( 48 | "spark.cassandra.connection.host" -> cfg.getString("spark.cassandra.connection.host") 49 | ) 50 | 51 | conf 52 | 53 | } 54 | 55 | override def elastic:HConf = { 56 | 57 | val cfg = config.getConfig("elastic") 58 | val conf = new HConf() 59 | 60 | conf.set("es.nodes",cfg.getString("es.nodes")) 61 | conf.set("es.port",cfg.getString("es.port")) 62 | 63 | conf.set("es.resource", cfg.getString("es.resource")) 64 | conf.set("es.query", cfg.getString("es.query")) 65 | 66 | conf 67 | 68 | } 69 | 70 | override def hbase:Map[String,String] = { 71 | 72 | val cfg = config.getConfig("hbase") 73 | val conf = Map( 74 | "spark.hbase.host" -> cfg.getString("spark.hbase.host") 75 | ) 76 | 77 | conf 78 | 79 | } 80 | 81 | override def input:List[String] = { 82 | 83 | val cfg = config.getConfig("file") 84 | 85 | val items = cfg.getString("items") 86 | val features = cfg.getString("features") 87 | 88 | List(items,features) 89 | 90 | } 91 | 92 | override def mongo:HConf = { 93 | 94 | val cfg = config.getConfig("mongo") 95 | val conf = new HConf() 96 | 97 | conf.set("mongo.input.uri",cfg.getString("mongo.input.uri")) 98 | conf 99 | 100 | } 101 | 102 | override def mysql:(String,String,String,String) = { 103 | 104 | val cfg = config.getConfig("mysql") 105 | 106 | val url = cfg.getString("url") 107 | val db = cfg.getString("database") 108 | 109 | val user = cfg.getString("user") 110 | val password = cfg.getString("password") 111 | 112 | (url,db,user,password) 113 | 114 | } 115 | 116 | override def output:List[String] = null 117 | 118 | override def redis:(String,String) = { 119 | 120 | val cfg = config.getConfig("redis") 121 | 122 | val host = cfg.getString("host") 123 | val port = cfg.getString("port") 124 | 125 | (host,port) 126 | 127 | } 128 | 129 | override def rest:(String,Int) = { 130 | 131 | val cfg = config.getConfig("rest") 132 | 133 | val host = cfg.getString("host") 134 | val port = cfg.getInt("port") 135 | 136 | (host,port) 137 | 138 | } 139 | 140 | override def spark:Map[String,String] = { 141 | 142 | val cfg = config.getConfig("spark") 143 | 144 | Map( 145 | "spark.executor.memory" -> cfg.getString("spark.executor.memory"), 146 | "spark.kryoserializer.buffer.mb" -> cfg.getString("spark.kryoserializer.buffer.mb") 147 | ) 148 | 149 | } 150 | 151 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/KMeansDetector.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 
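 *
 * Usage sketch for the KMeansDetector defined in this file (values are
 * illustrative; the RDD[LabeledPoint] is typically built via
 * VectorHandler.vector2LabeledPoints, see KMeansActor):
 *
 *   val detector = new KMeansDetector()
 *   val outliers = detector.find(dataset, "entropy", 20, 10)
 *   // List[ClusteredPoint]: per cluster, the 10 points farthest
 *   // from their cluster center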
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.rdd.RDD
22 | 
23 | import org.apache.spark.mllib.clustering.KMeans
24 | import org.apache.spark.mllib.linalg.Vectors
25 | 
26 | import de.kp.spark.core.model._
27 | import de.kp.spark.outlier.util.{MathHelper,Optimizer}
28 | 
29 | /**
30 |  * KMeansDetector is a general-purpose outlier detector that
31 |  * finds outliers in sets of labeled feature vectors
32 |  */
33 | class KMeansDetector extends Serializable {
34 | 
35 |   def find(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int,top:Int):List[ClusteredPoint] = {
36 | 
37 |     val (k,normdata) = prepare(data,strategy,iterations)
38 |     detect(normdata,k,iterations,top)
39 | 
40 |   }
41 | 
42 |   def detect(normdata:RDD[LabeledPoint],k:Int,iterations:Int,top:Int):List[ClusteredPoint] = {
43 | 
44 |     val sc = normdata.context
45 | 
46 |     /*
47 |      * STEP #1: Compute KMeans model
48 |      */
49 |     val vectors = normdata.map(point => Vectors.dense(point.features))
50 | 
51 |     val model = KMeans.train(vectors,k,iterations)
52 |     val centroids = model.clusterCenters
53 | 
54 |     /*
55 |      * STEP #2: Calculate the distances of all points from their cluster
56 |      * centers; outliers are those with the farthest distance
57 |      */
58 |     val bcmodel = sc.broadcast(model)
59 |     val points = normdata.map(point => {
60 | 
61 |       val vector = Vectors.dense(point.features)
62 | 
63 |       val cluster = bcmodel.value.predict(vector)
64 |       val centroid = bcmodel.value.clusterCenters(cluster)
65 | 
66 |       val distance = Optimizer.distance(centroid.toArray,vector.toArray)
67 | 
68 |       (cluster,distance,point)
69 | 
70 |     })
71 | 
72 |     /*
73 |      * Retrieve the top points (LabeledPoint) with respect to their clusters;
74 |      * the cluster identifier is used as a grouping mechanism to specify which
75 |      * features belong to which centroid
76 |      */
77 |     val bctop = sc.broadcast(top)
78 |     points.groupBy(_._1).flatMap(x => x._2.toList.sortBy(_._2).reverse.take(bctop.value)).map(data => {
79 | 
80 |       val (cluster,distance,point) = data
81 |       new ClusteredPoint(cluster,distance,point)
82 | 
83 |     }).collect().toList
84 | 
85 |   }
86 | 
87 |   def prepare(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int):(Int,RDD[LabeledPoint]) = {
88 | 
89 |     /*
90 |      * STEP #1: Normalize data
91 |      */
92 |     val idlabels = data.map(p => (p.id,p.label))
93 | 
94 |     val features = data.map(p => p.features)
95 | 
96 |     val normalized = MathHelper.normalize(features)
97 |     val normdata = idlabels.zip(normalized).map{case((id,label),features) => LabeledPoint(id,label, features)}
98 | 
99 |     /*
100 |      * STEP #2: Find the optimal number of clusters
101 |      */
102 | 
103 |     /* Range of candidate cluster counts */
104 |     val range = (5 to 40 by 5)
105 | 
106 |     val k = strategy match {
107 | 
108 |       case "distance" => Optimizer.optimizeByDistance(normdata, range, iterations)
109 | 
110 |       case "entropy" => Optimizer.optimizeByEntropy(normdata, range, iterations)
111 | 
112 |     }
113 | 
114
| (k, normdata) 115 | 116 | } 117 | 118 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/MarkovDetector.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import org.apache.spark.rdd.RDD 22 | 23 | import de.kp.spark.core.model._ 24 | import de.kp.spark.outlier.markov.{MarkovBuilder,StateMetrics,TransitionMatrix} 25 | 26 | /** 27 | * The MarkovDetector discovers outliers from registered behavior. 28 | */ 29 | class MarkovDetector(@transient ctx:RequestContext,scale:Int,states:Array[String]) extends Serializable { 30 | 31 | val metrics = new StateMetrics(states) 32 | 33 | def detect(sequences:RDD[Behavior],algorithm:String,threshold:Double,matrix:TransitionMatrix):RDD[Outlier] = { 34 | 35 | val bmatrix = ctx.sc.broadcast(matrix) 36 | sequences.map(seq => { 37 | 38 | val (site,user,states) = (seq.site,seq.user,seq.states) 39 | val metric = algorithm match { 40 | 41 | case "missprob" => metrics.missProbMetric(states,bmatrix.value) 42 | 43 | case "missrate" => metrics.missRateMetric(states,bmatrix.value) 44 | 45 | case "entreduc" => metrics.entropyReductionMetric(states,bmatrix.value) 46 | 47 | } 48 | 49 | val flag = if (metric > threshold) "yes" else "no" 50 | Outlier(site,user,states,metric,flag) 51 | 52 | }) 53 | 54 | } 55 | 56 | def train(sequences:RDD[Behavior]):TransitionMatrix = { 57 | new MarkovBuilder(scale,states).build(sequences) 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/OutlierServer.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
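 *
 * Usage sketch for the MarkovDetector defined above (values are
 * illustrative; 'sequences' is an RDD[Behavior] as produced by
 * StateHandler.state2Behavior, see MarkovActor):
 *
 *   val detector = new MarkovDetector(ctx, 1, states)
 *   val matrix   = detector.train(sequences)
 *   val outliers = detector.detect(sequences, "missprob", 0.8, matrix)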
19 | */
20 | 
21 | import akka.actor.{ActorSystem,Props}
22 | import com.typesafe.config.ConfigFactory
23 | 
24 | import de.kp.spark.core.SparkService
25 | import de.kp.spark.outlier.api.AkkaApi
26 | 
27 | /**
28 |  * The OutlierServer supports two different approaches to outlier discovery: one is based
29 |  * on cluster analysis and determines outlier feature sets by their distance to the
30 |  * cluster centers. This approach is independent of a specific use case and concentrates on
31 |  * the extraction and evaluation of (equal-size) feature vectors. The other approach to
32 |  * outlier discovery has a strong focus on the customers' purchase behavior and detects
33 |  * those customers that behave differently from all other customers.
34 |  */
35 | object OutlierServer extends SparkService {
36 | 
37 |   private val sc = createCtxLocal("OutlierContext",Configuration.spark)
38 | 
39 |   def main(args: Array[String]) {
40 | 
41 |     val ctx = new RequestContext(sc)
42 | 
43 |     /**
44 |      * AKKA API
45 |      */
46 |     val conf:String = "server.conf"
47 | 
48 |     val akkaSystem = ActorSystem("akka-server",ConfigFactory.load(conf))
49 |     sys.addShutdownHook(akkaSystem.shutdown)
50 | 
51 |     new AkkaApi(akkaSystem,ctx).start()
52 | 
53 |     println("AKKA API activated.")
54 | 
55 |   }
56 | 
57 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/RequestContext.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.sql.SQLContext
23 | 
24 | class RequestContext(
25 |   /*
26 |    * Reference to the common SparkContext; this context can be used
27 |    * to access HDFS based data sources or leverage the Spark machine
28 |    * learning library or other Spark based functionality
29 |    */
30 |   @transient val sc:SparkContext) extends Serializable {
31 | 
32 |   val sqlc = new SQLContext(sc)
33 |   val config = Configuration
34 | 
35 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/BaseActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props} 22 | 23 | import de.kp.spark.core.model._ 24 | import de.kp.spark.core.redis.RedisCache 25 | 26 | import de.kp.spark.outlier.Configuration 27 | import de.kp.spark.outlier.model._ 28 | 29 | abstract class BaseActor extends Actor with ActorLogging { 30 | 31 | val (host,port) = Configuration.redis 32 | val cache = new RedisCache(host,port.toInt) 33 | 34 | protected def failure(req:ServiceRequest,message:String):ServiceResponse = { 35 | 36 | if (req == null) { 37 | val data = Map("message" -> message) 38 | new ServiceResponse("","",data,OutlierStatus.FAILURE) 39 | 40 | } else { 41 | val data = Map("uid" -> req.data("uid"), "message" -> message) 42 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE) 43 | 44 | } 45 | 46 | } 47 | 48 | protected def response(req:ServiceRequest,missing:Boolean):ServiceResponse = { 49 | 50 | val uid = req.data("uid") 51 | 52 | if (missing == true) { 53 | val data = Map("uid" -> uid, "message" -> Messages.MISSING_PARAMETERS(uid)) 54 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE) 55 | 56 | } else { 57 | val data = Map("uid" -> uid, "message" -> Messages.OUTLIER_DETECTION_STARTED(uid)) 58 | new ServiceResponse(req.service,req.task,data,OutlierStatus.STARTED) 59 | 60 | 61 | } 62 | 63 | } 64 | 65 | protected def serialize(resp:ServiceResponse) = Serializer.serializeResponse(resp) 66 | 67 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/KMeansActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
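 *
 * In addition to the common request fields, validate() below requires
 * 'top', 'iterations' and 'strategy'; an illustrative train request
 * (the service name is an assumption):
 *
 *   new ServiceRequest("outlier", "train", Map(
 *     "uid"        -> "job-42",
 *     "algorithm"  -> "KMEANS",
 *     "source"     -> "ELASTIC",
 *     "top"        -> "10",       // outliers kept per cluster
 *     "iterations" -> "20",       // KMeans training iterations
 *     "strategy"   -> "entropy"   // or "distance", see Optimizer
 *   ))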
19 | */ 20 | 21 | import de.kp.spark.core.Names 22 | import de.kp.spark.core.model._ 23 | 24 | import de.kp.spark.outlier.{KMeansDetector,RequestContext} 25 | import de.kp.spark.outlier.model._ 26 | 27 | import de.kp.spark.core.source.VectorSource 28 | import de.kp.spark.core.source.handler.VectorHandler 29 | 30 | import de.kp.spark.core.redis.RedisDB 31 | 32 | import de.kp.spark.outlier.spec.VectorSpec 33 | import scala.collection.mutable.ArrayBuffer 34 | 35 | class KMeansActor(@transient ctx:RequestContext) extends TrainActor(ctx) { 36 | 37 | val redis = new RedisDB(host,port.toInt) 38 | 39 | override def validate(req:ServiceRequest) { 40 | 41 | if (req.data.contains("top") == false) 42 | throw new Exception("Parameter 'top' is missing.") 43 | 44 | if (req.data.contains("iterations") == false) 45 | throw new Exception("Parameter 'iterations' is missing.") 46 | 47 | if (req.data.contains("strategy") == false) 48 | throw new Exception("Parameter 'strategy' is missing.") 49 | 50 | } 51 | 52 | override def train(req:ServiceRequest) { 53 | 54 | val source = new VectorSource(ctx.sc,ctx.config,new VectorSpec(req)) 55 | val dataset = VectorHandler.vector2LabeledPoints(source.connect(req)) 56 | 57 | val params = ArrayBuffer.empty[Param] 58 | 59 | val top = req.data("top").toInt 60 | params += Param("top","integer",top.toString) 61 | 62 | val strategy = req.data("strategy").asInstanceOf[String] 63 | params += Param("strategy","string",strategy) 64 | 65 | val iter = req.data("iterations").toInt 66 | params += Param("iterations","integer",iter.toString) 67 | 68 | cache.addParams(req, params.toList) 69 | 70 | val points = new KMeansDetector().find(dataset,strategy,iter,top).toList 71 | savePoints(req,ClusteredPoints(points)) 72 | 73 | } 74 | 75 | private def savePoints(req:ServiceRequest,points:ClusteredPoints) { 76 | redis.addPoints(req,points) 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/MarkovActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
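 *
 * In addition to the common request fields, validate() below requires
 * the Markov-specific parameters; an illustrative train request (the
 * service name and state labels are assumptions):
 *
 *   new ServiceRequest("outlier", "train", Map(
 *     "uid"       -> "job-43",
 *     "algorithm" -> "MARKOV",
 *     "source"    -> "ELASTIC",
 *     "scale"     -> "1",          // scale factor of the transition matrix
 *     "states"    -> "S1,S2,S3",   // comma-separated state alphabet
 *     "strategy"  -> "missprob",   // or "missrate" | "entreduc"
 *     "threshold" -> "0.8"         // metric values above this flag an outlier
 *   ))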
19 | */ 20 | 21 | import de.kp.spark.core.Names 22 | import de.kp.spark.core.model._ 23 | 24 | import de.kp.spark.core.source.StateSource 25 | import de.kp.spark.core.source.handler.StateHandler 26 | 27 | import de.kp.spark.core.redis.RedisDB 28 | 29 | import de.kp.spark.outlier.RequestContext 30 | import de.kp.spark.outlier.model._ 31 | 32 | import de.kp.spark.outlier.MarkovDetector 33 | import de.kp.spark.outlier.spec.StateSpec 34 | 35 | import scala.collection.mutable.ArrayBuffer 36 | 37 | class MarkovActor(@transient ctx:RequestContext) extends TrainActor(ctx) { 38 | 39 | val redis = new RedisDB(host,port.toInt) 40 | 41 | override def validate(req:ServiceRequest) { 42 | 43 | if (req.data.contains("scale") == false) 44 | throw new Exception("Parameter 'scale' is missing.") 45 | 46 | if (req.data.contains("states") == false) 47 | throw new Exception("Parameter 'states' is missing.") 48 | 49 | if (req.data.contains("strategy") == false) 50 | throw new Exception("Parameter 'strategy' is missing.") 51 | 52 | if (req.data.contains("threshold") == false) 53 | throw new Exception("Parameter 'threshold' is missing.") 54 | 55 | } 56 | 57 | override def train(req:ServiceRequest) { 58 | 59 | val source = new StateSource(ctx.sc,ctx.config,new StateSpec(req)) 60 | val sequences = StateHandler.state2Behavior(source.connect(req)) 61 | 62 | val scale = req.data(Names.REQ_SCALE).toInt 63 | val states = req.data(Names.REQ_STATES).split(",") 64 | 65 | val detector = new MarkovDetector(ctx,scale,states) 66 | 67 | val model = detector.train(sequences) 68 | 69 | val params = ArrayBuffer.empty[Param] 70 | 71 | val strategy = req.data("strategy") 72 | params += Param("strategy","string",strategy) 73 | 74 | val threshold = req.data("threshold").toDouble 75 | params += Param("threshold","double",threshold.toString) 76 | 77 | cache.addParams(req, params.toList) 78 | 79 | val outliers = detector.detect(sequences,strategy,threshold,model).collect().toList 80 | 81 | saveOutliers(req,new Outliers(outliers)) 82 | 83 | } 84 | 85 | private def saveOutliers(req:ServiceRequest,outliers:Outliers) { 86 | redis.addOutliers(req,outliers) 87 | } 88 | 89 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/OutlierMaster.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
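 *
 * Clients address the master below with serialized requests, as the
 * Handler in TrainApp does; a minimal round trip (sketch, service
 * name illustrative):
 *
 *   val master = system.actorOf(Props(new OutlierMaster(ctx)))
 *   master ! Serializer.serializeRequest(new ServiceRequest("outlier","train",params))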
19 | */
20 | 
21 | import akka.actor.{ActorRef,Props}
22 | 
23 | import de.kp.spark.core.actor._
24 | import de.kp.spark.core.model._
25 | 
26 | import de.kp.spark.outlier.{Configuration,RequestContext}
27 | 
28 | class OutlierMaster(@transient ctx:RequestContext) extends BaseMaster(Configuration) {
29 | 
30 |   protected def actor(worker:String):ActorRef = {
31 | 
32 |     worker match {
33 |       /*
34 |        * Metadata management is part of the core functionality; field or metadata
35 |        * specifications can be registered in, and retrieved from a Redis database.
36 |        */
37 |       case "fields" => context.actorOf(Props(new FieldQuestor(Configuration)))
38 |       case "register" => context.actorOf(Props(new BaseRegistrar(Configuration)))
39 |       /*
40 |        * Index management is part of the core functionality; an Elasticsearch
41 |        * index can be created and appropriate (tracked) items can be saved.
42 |        */
43 |       case "index" => context.actorOf(Props(new BaseIndexer(Configuration)))
44 |       case "track" => context.actorOf(Props(new BaseTracker(Configuration)))
45 | 
46 |       case "params" => context.actorOf(Props(new ParamQuestor(Configuration)))
47 |       /*
48 |        * Request the actual status of an outlier detection
49 |        * task; note that get requests should only be invoked after
50 |        * having retrieved a FINISHED status.
51 |        *
52 |        * Status management is part of the core functionality.
53 |        */
54 |       case "status" => context.actorOf(Props(new StatusQuestor(Configuration)))
55 | 
56 |       case "get" => context.actorOf(Props(new OutlierQuestor()))
57 |       case "train" => context.actorOf(Props(new OutlierMiner(ctx)))
58 | 
59 |       case _ => null
60 | 
61 |     }
62 | 
63 |   }
64 | 
65 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/OutlierMiner.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import akka.actor.{ActorRef,Props}
22 | 
23 | import de.kp.spark.core.Names
24 | 
25 | import de.kp.spark.core.actor._
26 | import de.kp.spark.core.model._
27 | 
28 | import de.kp.spark.outlier.{Configuration,RequestContext}
29 | import de.kp.spark.outlier.model._
30 | 
31 | /**
32 |  * The focus of the OutlierMiner is on the model building task,
33 |  * either for cluster-analysis-based outlier detection or for Markov-based state models.
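 *
 * Every request is validated first: it must carry a unique 'uid', a
 * supported 'algorithm' and a registered 'source'. Two illustrative
 * data maps and the rejections they trigger in validate():
 *
 *   Map("uid" -> "job-1")                        // NO_ALGORITHM_PROVIDED
 *   Map("uid" -> "job-1", "algorithm" -> "LOF")  // ALGORITHM_IS_UNKNOWN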
34 | */ 35 | class OutlierMiner(@transient ctx:RequestContext) extends BaseTrainer(Configuration) { 36 | 37 | protected def validate(req:ServiceRequest):Option[String] = { 38 | 39 | val uid = req.data(Names.REQ_UID) 40 | 41 | if (cache.statusExists(req)) { 42 | val message = Messages.TASK_ALREADY_STARTED(uid) 43 | return Some(message) 44 | 45 | } 46 | 47 | req.data.get(Names.REQ_ALGORITHM) match { 48 | 49 | case None => { 50 | return Some(Messages.NO_ALGORITHM_PROVIDED(uid)) 51 | } 52 | 53 | case Some(algorithm) => { 54 | if (Algorithms.isAlgorithm(algorithm) == false) { 55 | return Some(Messages.ALGORITHM_IS_UNKNOWN(uid,algorithm)) 56 | } 57 | 58 | } 59 | 60 | } 61 | 62 | req.data.get(Names.REQ_SOURCE) match { 63 | 64 | case None => { 65 | return Some(Messages.NO_SOURCE_PROVIDED(uid)) 66 | } 67 | 68 | case Some(source) => { 69 | if (Sources.isSource(source) == false) { 70 | return Some(Messages.SOURCE_IS_UNKNOWN(uid,source)) 71 | } 72 | } 73 | 74 | } 75 | 76 | None 77 | 78 | } 79 | 80 | /** 81 | * This is a helper method to determine which actor has to be 82 | * created to support the requested algorithm; actually KMeans 83 | * and Markov based algorithms are supported. 84 | */ 85 | protected def actor(req:ServiceRequest):ActorRef = { 86 | 87 | val algorithm = req.data(Names.REQ_ALGORITHM) 88 | if (algorithm == Algorithms.KMEANS) { 89 | context.actorOf(Props(new KMeansActor(ctx))) 90 | 91 | } else { 92 | context.actorOf(Props(new MarkovActor(ctx))) 93 | 94 | } 95 | 96 | } 97 | 98 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/OutlierQuestor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 
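 *
 * Retrieval sketch: results are requested with a task of the form
 * 'get:<topic>', which is split on ':' below; the service name is
 * illustrative:
 *
 *   new ServiceRequest("outlier", "get:state",  Map("uid" -> "job-43"))  // Markov outliers
 *   new ServiceRequest("outlier", "get:vector", Map("uid" -> "job-42"))  // clustered points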
19 | */ 20 | 21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props} 22 | 23 | import de.kp.spark.core.Names 24 | import de.kp.spark.core.model._ 25 | 26 | import de.kp.spark.core.redis.RedisDB 27 | 28 | import de.kp.spark.outlier.model._ 29 | 30 | class OutlierQuestor extends BaseActor { 31 | 32 | implicit val ec = context.dispatcher 33 | private val redis = new RedisDB(host,port.toInt) 34 | 35 | def receive = { 36 | 37 | case req:ServiceRequest => { 38 | 39 | val origin = sender 40 | val uid = req.data("uid") 41 | 42 | val Array(task,topic) = req.task.split(":") 43 | topic match { 44 | 45 | case "state" => { 46 | 47 | val response = { 48 | 49 | if (redis.outliersExists(req) == false) { 50 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid)) 51 | 52 | } else { 53 | 54 | val outliers = redis.outliers(req) 55 | 56 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> outliers) 57 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS) 58 | 59 | } 60 | } 61 | 62 | origin ! response 63 | context.stop(self) 64 | 65 | } 66 | 67 | case "vector" => { 68 | 69 | val response = { 70 | 71 | if (redis.pointsExist(req) == false) { 72 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid)) 73 | 74 | } else { 75 | 76 | val points = redis.points(req) 77 | 78 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> points) 79 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS) 80 | 81 | } 82 | 83 | } 84 | origin ! response 85 | context.stop(self) 86 | 87 | } 88 | 89 | case _ => { 90 | 91 | val msg = Messages.TASK_IS_UNKNOWN(uid,req.task) 92 | 93 | origin ! failure(req,msg) 94 | context.stop(self) 95 | 96 | } 97 | 98 | } 99 | 100 | } 101 | 102 | } 103 | 104 | } -------------------------------------------------------------------------------- /src/main/scala/de/kp/spark/outlier/actor/TrainActor.scala: -------------------------------------------------------------------------------- 1 | package de.kp.spark.outlier.actor 2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG 3 | * 4 | * This file is part of the Spark-Outlier project 5 | * (https://github.com/skrusche63/spark-outlier). 6 | * 7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the 8 | * terms of the GNU General Public License as published by the Free Software 9 | * Foundation, either version 3 of the License, or (at your option) any later 10 | * version. 11 | * 12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY 13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details. 15 | * You should have received a copy of the GNU General Public License along with 16 | * Spark-Outlier. 17 | * 18 | * If not, see . 19 | */ 20 | 21 | import de.kp.spark.core.model._ 22 | 23 | import de.kp.spark.outlier.RequestContext 24 | import de.kp.spark.outlier.model._ 25 | 26 | class TrainActor(@transient ctx:RequestContext) extends BaseActor { 27 | 28 | def receive = { 29 | 30 | case req:ServiceRequest => { 31 | 32 | val origin = sender 33 | val missing = try { 34 | 35 | validate(req) 36 | false 37 | 38 | } catch { 39 | case e:Exception => true 40 | 41 | } 42 | 43 | origin ! 
response(req, missing)
44 | 
45 |       if (missing == false) {
46 | 
47 |         try {
48 | 
49 |           /* Update cache */
50 |           cache.addStatus(req,OutlierStatus.TRAINING_STARTED)
51 | 
52 |           train(req)
53 | 
54 |           /* Update cache */
55 |           cache.addStatus(req,OutlierStatus.TRAINING_FINISHED)
56 | 
57 |         } catch {
58 |           case e:Exception => cache.addStatus(req,OutlierStatus.FAILURE)
59 |         }
60 | 
61 |       }
62 | 
63 |       context.stop(self)
64 | 
65 |     }
66 | 
67 |     case _ => {
68 | 
69 |       log.error("unknown request.")
70 |       context.stop(self)
71 | 
72 |     }
73 | 
74 |   }
75 | 
76 |   protected def validate(req:ServiceRequest) = {}
77 | 
78 |   protected def train(req:ServiceRequest) {}
79 | 
80 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/api/AkkaApi.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.api
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import akka.actor.{ActorSystem,Props}
22 | 
23 | import de.kp.spark.outlier.RequestContext
24 | import de.kp.spark.outlier.actor.OutlierMaster
25 | 
26 | class AkkaApi(system:ActorSystem,@transient val ctx:RequestContext) {
27 | 
28 |   val master = system.actorOf(Props(new OutlierMaster(ctx)), name="outlier-master")
29 | 
30 |   def start() {
31 |     /* Block the calling thread until the actor system has terminated */
32 |     system.awaitTermination()
33 |   }
34 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/app/TrainApp.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.app
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
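 *
 * Command-line sketch (option names as defined in createParams below;
 * the HOCON config keys mirror those consumed there, and all values
 * are illustrative):
 *
 *   java -cp spark-outlier.jar de.kp.spark.outlier.app.TrainApp \
 *     --key site-1 --uid job-42 --name kmeans-train --config train
 *
 *   // train.conf (loaded from the classpath via ConfigFactory.load)
 *   algo       = "KMEANS"
 *   source     = "ELASTIC"
 *   strategy   = "entropy"
 *   k          = 10
 *   top        = 10
 *   iterations = 20
 *   threshold  = 0.8
 *   scale      = 1
 *   states     = "S1,S2,S3"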
19 | */
20 | 
21 | import org.apache.spark.SparkContext
22 | 
23 | import akka.actor._
24 | import com.typesafe.config.ConfigFactory
25 | 
26 | import org.clapper.argot._
27 | 
28 | import de.kp.spark.core.Names
29 | import de.kp.spark.core.model._
30 | 
31 | import de.kp.spark.core.actor.Supervisor
32 | import de.kp.spark.core.SparkService
33 | 
34 | import de.kp.spark.outlier.{Configuration,RequestContext}
35 | 
36 | import de.kp.spark.outlier.actor.OutlierMaster
37 | import de.kp.spark.outlier.model._
38 | 
39 | import scala.concurrent.duration.DurationInt
40 | import scala.collection.mutable.HashMap
41 | 
42 | object TrainApp extends SparkService {
43 | 
44 |   protected val sc = createCtxLocal("OutlierContext",Configuration.spark)
45 |   protected val system = ActorSystem("OutlierSystem")
46 | 
47 |   protected val inbox = Inbox.create(system)
48 | 
49 |   sys.addShutdownHook({
50 |     /*
51 |      * In case of a system shutdown, we also make clear
52 |      * that the SparkContext is properly stopped as well
53 |      * as the respective Akka actor system
54 |      */
55 |     sc.stop
56 |     system.shutdown
57 | 
58 |   })
59 | 
60 |   def main(args:Array[String]) {
61 | 
62 |     try {
63 | 
64 |       val req_params = createParams(args)
65 |       val req = new ServiceRequest("context","train:model",req_params)
66 | 
67 |       val ctx = new RequestContext(sc)
68 |       val actor = system.actorOf(Props(new Handler(ctx)))
69 | 
70 |       inbox.watch(actor)
71 |       actor ! req
72 | 
73 |       /* Fall back to a 10 minute timeout if none is configured */
74 |       val timeout = DurationInt(req_params.getOrElse("timeout","10").toInt).minute
75 | 
76 |       while (inbox.receive(timeout).isInstanceOf[Terminated] == false) {}
77 |       sys.exit
78 | 
79 |     } catch {
80 |       case e:Exception => {
81 | 
82 |         println(e.getMessage)
83 |         sys.exit
84 | 
85 |       }
86 | 
87 |     }
88 | 
89 |   }
90 | 
91 |   protected def createParams(args:Array[String]):Map[String,String] = {
92 | 
93 |     import ArgotConverters._
94 | 
95 |     val parser = new ArgotParser(
96 |       programName = "Outlier Analysis Engine",
97 |       compactUsage = true,
98 |       preUsage = Some("Version %s. Copyright (c) 2015, %s.".format("1.0","Dr. Krusche & Partner PartG"))
99 |     )
100 | 
101 |     val site = parser.option[String](List("key"),"key","Unique application key")
102 |     val uid = parser.option[String](List("uid"),"uid","Unique job identifier")
103 | 
104 |     val name = parser.option[String](List("name"),"name","Unique job designator")
105 | 
106 |     val config = parser.option[String](List("config"),"config","Configuration file")
107 |     parser.parse(args)
108 | 
109 |     /* Collect parameters */
110 |     val params = HashMap.empty[String,String]
111 | 
112 |     /* Validate parameters */
113 |     site.value match {
114 | 
115 |       case None => parser.usage("Parameter 'key' is missing.")
116 |       case Some(value) => params += "site" -> value
117 | 
118 |     }
119 | 
120 |     uid.value match {
121 | 
122 |       case None => parser.usage("Parameter 'uid' is missing.")
123 |       case Some(value) => params += "uid" -> value
124 | 
125 |     }
126 | 
127 |     name.value match {
128 | 
129 |       case None => parser.usage("Parameter 'name' is missing.")
130 |       case Some(value) => params += "name" -> value
131 | 
132 |     }
133 | 
134 |     config.value match {
135 | 
136 |       case None => parser.usage("Parameter 'config' is missing.")
137 |       case Some(value) => {
138 | 
139 |         val cfg = ConfigFactory.load(value)
140 | 
141 |         val algo = cfg.getString("algo")
142 |         if (Algorithms.isAlgorithm(algo) == false)
143 |           parser.usage("Parameter 'algo' must be one of [KMEANS, MARKOV].")
144 | 
145 |         params += "algorithm" -> algo
146 |         params += "source" -> cfg.getString("source")
147 | 
148 |         /* COMMON */
149 |         params += "strategy" -> cfg.getString("strategy")
150 | 
151 |         /* KMEANS: KMeansActor validates 'top' and 'iterations' */
152 |         params += "k" -> cfg.getInt("k").toString
153 |         params += "top" -> cfg.getInt("top").toString
154 |         params += "iterations" -> cfg.getInt("iterations").toString
155 | 
156 |         /* MARKOV */
157 |         params += "threshold" -> cfg.getDouble("threshold").toString
158 | 
159 |         params += "scale" -> cfg.getInt("scale").toString
160 |         params += "states" -> cfg.getString("states")
161 | 
162 |       }
163 | 
164 |     }
165 | 
166 |     /* Add timestamp as global parameter */
167 |     params += "timestamp" -> new java.util.Date().getTime.toString
168 |     params.toMap
169 | 
170 |   }
171 | 
172 | }
173 | 
174 | class Handler(@transient ctx:RequestContext) extends Actor {
175 | 
176 |   private val config = Configuration
177 |   def receive = {
178 | 
179 |     case req:ServiceRequest => {
180 | 
181 |       val start = new java.util.Date().getTime
182 |       println("Trainer started at " + start)
183 | 
184 |       val master = context.actorOf(Props(new OutlierMaster(ctx)))
185 |       master ! Serializer.serializeRequest(req)
186 | 
187 |       val status = OutlierStatus.TRAINING_FINISHED
188 |       val supervisor = context.actorOf(Props(new Supervisor(req,status,config)))
189 | 
190 |     }
191 | 
192 |     case evt:StatusEvent => {
193 |       /*
194 |        * The StatusEvent message is returned from the
195 |        * supervisor actor and specifies that the model
196 |        * training task has been finished
197 |        */
198 |       val end = new java.util.Date().getTime
199 |       println("Trainer finished at " + end)
200 | 
201 |       context.stop(self)
202 | 
203 |     }
204 | 
205 |     case msg:String => {
206 | 
207 |       val end = new java.util.Date().getTime
208 |       println("Trainer finished at " + end)
209 | 
210 |       val response = Serializer.deserializeResponse(msg)
211 | 
212 |       println("Message: " + response.data("message").toString)
213 |       println("Status: " + response.status)
214 | 
215 |     }
216 | 
217 |   }
218 | 
219 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/DoubleMatrix.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import scala.collection.mutable.ArrayBuffer
22 | import scala.Array.canBuildFrom
23 | 
24 | class DoubleMatrix(numRow:Int,numCol:Int) {
25 | 
26 |   protected val table:Array[Array[Double]] = Array.fill[Double](numRow,numCol)(0.0)
27 | 
28 |   protected var rowLabels = Array.empty[String]
29 |   protected var colLabels = Array.empty[String]
30 | 
31 |   def setStates(rowStates:Array[String], colStates:Array[String]) {
32 | 
33 |     this.rowLabels = rowStates
34 |     this.colLabels = colStates
35 | 
36 |   }
37 | 
38 |   def set(row:Int,col:Int,valu:Double) {
39 |     table(row)(col) = valu
40 |   }
41 | 
42 |   def get(row:Int,col:Int):Double = table(row)(col)
43 | 
44 |   def getRow(row:Int):Array[Double] = table(row)
45 | 
46 |   def getRow(rowLabel:String):Array[Double] = table(rowLabels.indexOf(rowLabel))
47 | 
48 |   def getRowLabel(row:Int) = rowLabels(row)
49 | 
50 |   def getColLabel(col:Int) = colLabels(col)
51 | 
52 |   def add(row:Int,col:Int,valu:Double) {
53 |     table(row)(col) = table(row)(col) + valu
54 |   }
55 | 
56 |   def add(rowLabel:String,colLabel:String,valu:Double) {
57 | 
58 |     val (row,col) = getRowCol(rowLabel,colLabel)
59 |     table(row)(col) += valu
60 | 
61 |   }
62 | 
63 |   def increment(row:Int,col:Int) {
64 |     table(row)(col) = table(row)(col) + 1
65 |   }
66 | 
67 |   def increment(rowLabel:String, colLabel:String) {
68 | 
69 |     val (row,col) = getRowCol(rowLabel, colLabel)
70 |     table(row)(col) = table(row)(col) + 1
71 | 
72 |   }
73 | 
74 |   def getRowSum(row:Int):Double = table(row).sum
75 | 
76 |   def getColumnSum(col:Int):Double = {
77 | 
78 |     var sum:Double = 0
79 |     (0 until numRow).foreach(row => sum += table(row)(col))
80 | 
81 |     sum
82 | 
83 |   }
84 | 
85 |   def serialize():String = {
86 | 
87 |     val output = ArrayBuffer.empty[String]
88 |     (0 until numRow).foreach(row => output += serializeRow(row))
89 | 
90 |     output.mkString(";")
91 | 
92 |   }
93 | 
94 |   def serializeRow(row:Int):String = table(row).mkString(",")
95 | 
96 |   def deserialize(data:String) {
97 | 
98 |     val rows = data.split(";")
99 |     (0 until rows.length).foreach(row => deserializeRow(row,rows(row)))
100 | 
101 |   }
102 | 
103 |   def deserializeRow(row:Int,data:String) {
104 |     table(row) = data.split(",").map(_.toDouble)
105 |   }
106 | 
107 |   private def getRowCol(rowLabel:String,colLabel:String):(Int,Int) = {
108 | 
109 |     val row = rowLabels.indexOf(rowLabel)
110 |     val col = colLabels.indexOf(colLabel)
111 | 
112 |     (row,col)
113 | 
114 |   }
115 | 
116 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/MarkovBuilder.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 | 
21 | import org.apache.spark.rdd.RDD
22 | 
23 | import de.kp.spark.core.model._
24 | import scala.collection.mutable.HashMap
25 | 
26 | private case class Pair(ant:String,con:String)
27 | 
28 | class MarkovBuilder(scaleDef:Int,stateDefs:Array[String]) extends Serializable {
29 | 
30 |   def build(dataset:RDD[Behavior]):TransitionMatrix = {
31 | 
32 |     def seqOp(support:HashMap[Pair,Int],seq:Behavior):HashMap[Pair,Int] = {
33 | 
34 |       val (site,user,states) = (seq.site,seq.user,seq.states)
35 |       /*
36 |        * The pair support aggregates over all sites and users provided;
37 |        * for an outlier detection, we assume that this is the best way
38 |        * to determine state transition probabilities
39 |        */
40 |       for (i <- 1 until states.size) {
41 | 
42 |         val pair = new Pair(states(i-1),states(i))
43 | 
44 |         support.get(pair) match {
45 |           case None => support += pair -> 1
46 |           case Some(count) => support += pair -> (count + 1)
47 |         }
48 | 
49 |       }
50 | 
51 |       support
52 | 
53 |     }
54 | 
55 |     /*
56 |      * As the dataset is coalesced into a single partition, seqOp already
57 |      * accumulates the complete support map; combOp is only invoked with
58 |      * the empty zero value, so returning supp2 is sufficient
59 |      */
60 |     def combOp(supp1:HashMap[Pair,Int],supp2:HashMap[Pair,Int]):HashMap[Pair,Int] = supp2
61 | 
62 |     /* Build pair support */
63 |     val pairsupp = dataset.coalesce(1, false).aggregate(HashMap.empty[Pair,Int])(seqOp,combOp)
64 | 
65 |     /* Setup transition matrix and add pair support */
66 |     val dim = stateDefs.length
67 | 
68 |     val matrix = new TransitionMatrix(dim,dim)
69 |     matrix.setScale(scaleDef)
70 | 
71 |     matrix.setStates(stateDefs, stateDefs)
72 |     for ((pair,support) <- pairsupp) {
73 |       matrix.add(pair.ant, pair.con, support)
74 |     }
75 | 
76 |     /* Normalize the matrix content and transform support into probabilities */
77 |     matrix.normalize()
78 | 
79 |     matrix
80 | 
81 |   }
82 | 
83 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/StateMetrics.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | 
3 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
4 | *
5 | * This file is part of the Spark-Outlier project
6 | * (https://github.com/skrusche63/spark-outlier).
7 | *
8 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
9 | * terms of the GNU General Public License as published by the Free Software
10 | * Foundation, either version 3 of the License, or (at your option) any later
11 | * version.
12 | *
13 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
14 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 | * You should have received a copy of the GNU General Public License along with
17 | * Spark-Outlier.
18 | *
19 | * If not, see <http://www.gnu.org/licenses/>.
20 | */
21 | 
22 | class StateMetrics(stateDefs:Array[String]) extends Serializable {
23 | 
24 |   /*
25 |    * Miss Probability Metric
26 |    *
27 |    * For any pair of consecutive transaction states t(i) and t(j) in a sequence,
28 |    * the following quantity is calculated: For the row corresponding to t(i), we
29 |    * sum all the probabilities except for the target state t(j).
30 |    *
31 |    * F(t(i), t(j)) = Sum(P(t(i), t(k)) | k != j) where P(t(i), t(k)) is the probability
32 |    * of transitioning from transaction state t(i) to t(k)
33 |    *
34 |    * Then we sum F over all the transaction state pairs in the sequence and normalize by
35 |    * the number of such pairs.
36 |    */
37 |   def missProbMetric(states:List[String],model:TransitionMatrix):Double = {
38 | 
39 |     var F:Double = 0
40 |     var count:Int = 0
41 | 
42 |     for (i <- 1 until states.size) {
43 | 
44 |       val srcIndex = stateDefs.indexOf(states(i-1))
45 |       val tarIndex = stateDefs.indexOf(states(i))
46 | 
47 |       /* Sum all probabilities except the target state */
48 |       for (j <- 0 until stateDefs.length) {
49 |         if (j != tarIndex)
50 |           F += model.get(srcIndex,j)
51 |       }
52 | 
53 |       count += 1
54 |     }
55 | 
56 |     val metric = F / count
57 |     metric
58 | 
59 |   }
60 | 
61 |   /*
62 |    * Miss Rate Metric
63 |    *
64 |    * For any transition, if the transition corresponds to the maximum probability
65 |    * target state, the value is 0, otherwise it is 1.
66 |    *
67 |    * F(t(i), t(j)) = 0 if t(j) = t(k) else 1 where t(k) is the target state when P(t(i), t(k)) = max(P(t(i), t(l)) for all l
68 |    *
69 |    * Then we sum F over all the transaction state pairs in the sequence and normalize by
70 |    * the number of such pairs.
71 |    */
72 |   def missRateMetric(states:List[String],model:TransitionMatrix):Double = {
73 | 
74 |     var F:Double = 0
75 |     var count:Int = 0
76 | 
77 |     for (i <- 1 until states.size) {
78 | 
79 |       val srcIndex = stateDefs.indexOf(states(i-1))
80 |       val tarIndex = stateDefs.indexOf(states(i))
81 | 
82 |       /* Determine the column that holds the maximum transition probability */
83 |       val row = model.getRow(srcIndex)
84 |       val maxIndex = row.indexOf(row.max)
85 | 
86 |       F += (if (tarIndex == maxIndex) 0 else 1)
87 |       count += 1
88 | 
89 |     }
90 | 
91 |     val metric = F / count
92 |     metric
93 | 
94 |   }
95 | 
96 |   /*
97 |    * Entropy Reduction Metric
98 |    *
99 |    * We calculate two quantities F and G as below. For a given row, F is the entropy excluding the target state for the state pair
100 |    * under consideration. G is the entropy for the whole row.
101 |    *
102 |    * F(t(i), t(j)) = sum (-P(t(i), t(k)) log(P(t(i), t(k)) | t(k) != t(j)
103 |    * G(t(i)) = sum (-P(t(i), t(k)) log(P(t(i), t(k))
104 |    *
105 |    * We sum F and G over all consecutive state pairs and divide the two sums.
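 *
 * Worked example (illustrative): for a row with transition
 * probabilities (0.9, 0.1) and an observed transition into the
 * second state,
 *
 *   F = -0.9*log(0.9)                  = 0.095
 *   G = -0.9*log(0.9) - 0.1*log(0.1)   = 0.325
 *
 * so for a sequence consisting of this single pair the metric is
 * F/G = 0.29.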
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/TransitionMatrix.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.markov
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

class TransitionMatrix(numRow:Int,numCol:Int) extends DoubleMatrix(numRow,numCol) {

  private var scale = 100

  def setScale(scale:Int) {
    this.scale = scale
  }

  def normalize() {
    /*
     * Laplace correction: in every row that contains at least one zero
     * value, each cell is shifted by 1; this avoids zero probabilities
     * and thereby undefined terms in the entropy based metrics
     */
    (0 until numRow).foreach(row => {

      val transProbs = getRow(row)
      if (transProbs.min == 0) {
        (0 until numCol).foreach(col => table(row)(col) += 1)
      }

    })

    /*
     * Normalize the transition support; after this step every row sums
     * up to the value of 'scale', i.e. for the default scale of 100 the
     * cells hold transition probabilities in percent
     */
    (0 until numRow).foreach(row => {
      val rowSum = getRowSum(row)
      (0 until numCol).foreach(col => table(row)(col) = (table(row)(col) * scale) / rowSum)
    })

  }

}
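/*
 * A worked example (added for illustration): assume a 2 x 2 matrix with the
 * raw pair supports below and the default scale of 100.
 *
 *   row 0: (3, 1) -> no zero cell, no correction   -> (75.00, 25.00)
 *   row 1: (4, 0) -> Laplace correction to (5, 1)  -> (83.33, 16.67)
 *
 * The correction keeps unseen transitions at a small, non-zero probability
 * instead of an impossible one.
 */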
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/model/Model.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.model
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._

object Algorithms {

  val KMEANS:String = "KMEANS"
  val MARKOV:String = "MARKOV"

  private def algorithms = List(KMEANS,MARKOV)
  def isAlgorithm(algorithm:String):Boolean = algorithms.contains(algorithm)

}

object Serializer extends BaseSerializer

object Messages extends BaseMessages {

  def MISSING_PARAMETERS(uid:String):String = String.format("""Parameters are missing for uid '%s'.""", uid)

  def NO_METHOD_PROVIDED(uid:String):String = String.format("""No method provided for uid '%s'.""", uid)

  def METHOD_NOT_SUPPORTED(uid:String):String = String.format("""The provided method is not supported for uid '%s'.""", uid)

  def OUTLIER_DETECTION_STARTED(uid:String) = String.format("""Outlier detection started for uid '%s'.""", uid)

  def OUTLIERS_DO_NOT_EXIST(uid:String):String = String.format("""The outliers for uid '%s' do not exist.""", uid)

}

object OutlierStatus extends BaseStatus {

  val DATASET:String = "dataset"
  val TRAINED:String = "trained"

  val STARTED:String = "started"
  val STOPPED:String = "stopped"

  val FINISHED:String = "finished"
  val RUNNING:String = "running"

}
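/*
 * A minimal validation sketch (added for illustration, not part of the
 * original file): how the objects above are typically combined when a
 * request arrives; the 'algorithm' parameter name is an assumption of
 * this example.
 */
object ModelDemo {

  def validate(uid:String,algorithm:String):Option[String] = {

    /* Returns a ready-to-send error message, or None if the request is valid */
    if (algorithm == null || algorithm.isEmpty) Some(Messages.NO_METHOD_PROVIDED(uid))
    else if (!Algorithms.isAlgorithm(algorithm)) Some(Messages.METHOD_NOT_SUPPORTED(uid))
    else None

  }

}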
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/StateSpec.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.spec
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._
import de.kp.spark.core.redis.RedisCache

import de.kp.spark.core.spec.Fields
import de.kp.spark.outlier.Configuration

import scala.xml._
import scala.collection.mutable.Buffer

class StateSpec(req:ServiceRequest) extends Fields {

  val path = "states.xml"

  val (host,port) = Configuration.redis
  val cache = new RedisCache(host,port.toInt)

  private val fields = load

  def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap

  def names:List[String] = fields.map(_.name)

  def types:List[String] = fields.map(_.datatype)

  /*
   * Loading is implemented as a method so that 'fields' above can safely
   * invoke it during construction; a val defined further down the class
   * body would still be uninitialized (null) at that point
   */
  private def load:List[Field] = {

    val data = Buffer.empty[Field]

    try {

      if (cache.fieldsExist(req)) {

        /* Prefer the field specification registered in the Redis cache */
        val fieldspec = cache.fields(req)
        for (field <- fieldspec) {
          data += Field(field.name,field.datatype,field.value)
        }

      } else {

        /* Otherwise fall back to the bundled states.xml resource */
        val root = XML.load(getClass.getClassLoader.getResource(path))
        for (field <- root \ "field") {

          val _name = (field \ "@name").toString
          val _type = (field \ "@type").toString

          val _mapping = field.text

          data += Field(_name,_type,_mapping)

        }

      }

    } catch {
      /* If neither source is available, the field list stays empty */
      case e:Exception => {}
    }

    data.toList

  }

}
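/*
 * A hypothetical states.xml (added for illustration) in the shape this
 * loader expects: one <field> element per attribute, with the mapping as
 * the element text; the root element name is irrelevant to the loader,
 * and the concrete field names below are assumptions, not project defaults.
 *
 *   <fields>
 *     <field name="site" type="string">site</field>
 *     <field name="user" type="string">user</field>
 *     <field name="state" type="string">state</field>
 *   </fields>
 *
 * VectorSpec below consumes features.xml through exactly the same code path.
 */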
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/VectorSpec.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.spec
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import de.kp.spark.core.model._
import de.kp.spark.core.redis.RedisCache

import de.kp.spark.core.spec.Fields
import de.kp.spark.outlier.Configuration

import scala.xml._
import scala.collection.mutable.Buffer

class VectorSpec(req:ServiceRequest) extends Fields {

  val path = "features.xml"

  val (host,port) = Configuration.redis
  val cache = new RedisCache(host,port.toInt)

  private val fields = load

  def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap

  def names:List[String] = fields.map(_.name)

  def types:List[String] = fields.map(_.datatype)

  /*
   * Loading is implemented as a method so that 'fields' above can safely
   * invoke it during construction; a val defined further down the class
   * body would still be uninitialized (null) at that point
   */
  private def load:List[Field] = {

    val data = Buffer.empty[Field]

    try {

      if (cache.fieldsExist(req)) {

        /* Prefer the field specification registered in the Redis cache */
        val fieldspec = cache.fields(req)
        for (field <- fieldspec) {
          data += Field(field.name,field.datatype,field.value)
        }

      } else {

        /* Otherwise fall back to the bundled features.xml resource */
        val root = XML.load(getClass.getClassLoader.getResource(path))
        for (field <- root \ "field") {

          val _name = (field \ "@name").toString
          val _type = (field \ "@type").toString

          val _mapping = field.text

          data += Field(_name,_type,_mapping)

        }

      }

    } catch {
      /* If neither source is available, the field list stays empty */
      case e:Exception => {}
    }

    data.toList

  }

}
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/MathHelper.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.util
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import org.apache.spark.rdd.RDD

object MathHelper {

  /**
   * Entropy (base 2) of a dataset containing integers
   */
  def intEntropy(data:TraversableOnce[Int]):Double = {

    val invLog2 = 1.0 / Math.log(2)

    /* Materialize first: a TraversableOnce must not be traversed twice */
    val positives = data.toList.filter(_ > 0)
    if (positives.size > 0) {

      val sum:Double = positives.sum
      val invSum = 1.0 / sum

      positives.map { positive =>

        val p = positive.toDouble * invSum
        -p * Math.log(p) * invLog2

      }.sum

    } else {
      0.0
    }

  }

  /**
   * Entropy (base 2) of a dataset containing strings; it may be
   * used as a measure of the homogeneity of the strings
   */
  def strEntropy(data:TraversableOnce[String]):Double = {

    val invLog2 = 1.0 / Math.log(2)

    /* Materialize first: a TraversableOnce must not be traversed twice */
    val items = data.toList

    val len = items.size
    if (len > 1) {

      val invLen = 1.0 / len.toDouble
      var ent = 0.0

      for (str <- items.distinct) {
        /*
         * Probability to find a certain value within the dataset
         */
        val pstr = items.count(x => x == str).toDouble * invLen
        ent -= pstr * Math.log(pstr) * invLog2

      }

      ent

    } else {
      0.0

    }

  }

  /**
   * Data is a distributed list of feature vectors (Array[Double]) with the
   * following semantic: vector = [f_0,f_1,f_2, ...]; i.e. each vector holds
   * a certain value for feature i at position i. Normalizing these data means
   * that one has to normalize all values of feature f_0, all values of f_1, etc.
   */
  def normalize(data:RDD[Array[Double]]):RDD[Array[Double]] = {

    val total = data.count()

    /*
     * Each column of the data matrix is assigned to a certain feature;
     * we therefore have to sum up the values of each column independently
     * and build the mean value
     */
    val sums = data.reduce((a,b) => a.zip(b).map(t => t._1 + t._2))
    val means = sums.map(_ / total)

    /*
     * We build the standard deviation for the values of each column; the
     * squares are built per element BEFORE summing, so that combining the
     * partial sums of two partitions never squares an aggregate again
     */
    val sumSquares = data.map(_.map(v => v * v)).reduce((a,b) => a.zip(b).map(t => t._1 + t._2))

    val stdevs = sumSquares.zip(sums).map {
      case (sumSq,sum) => Math.sqrt(total*sumSq - sum*sum) / total
    }

    /*
     * Finally, for each column (or feature), every single value gets
     * normalized using the mean value and the standard deviation
     */
    data.map(vector =>
      (vector,means,stdevs).zipped.map((value,mean,stdev) =>
        if (stdev <= 0) value - mean else (value - mean) / stdev
      )
    )

  }

}
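/*
 * A minimal sketch (added for illustration, not part of the original file),
 * assuming a SparkContext `sc`: the entropy drops to 0 for a perfectly
 * homogeneous dataset, and normalize centers every feature column around 0.
 */
object MathHelperDemo {

  def demo(sc:org.apache.spark.SparkContext) {

    /* 1.0 bit for a perfect 50:50 split, 0.0 for a single repeated value */
    println(MathHelper.strEntropy(List("a","b","a","b")))
    println(MathHelper.strEntropy(List("a","a","a","a")))

    /* Column-wise z-score normalization of two 2-dimensional vectors */
    val vectors = sc.parallelize(Seq(Array(1.0,10.0),Array(3.0,30.0)))
    MathHelper.normalize(vectors).collect().foreach(v => println(v.mkString(",")))

  }

}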
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/Optimizer.scala:
--------------------------------------------------------------------------------
package de.kp.spark.outlier.util
/* Copyright (c) 2014 Dr. Krusche & Partner PartG
 *
 * This file is part of the Spark-Outlier project
 * (https://github.com/skrusche63/spark-outlier).
 *
 * Spark-Outlier is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
 * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with
 * Spark-Outlier.
 *
 * If not, see <http://www.gnu.org/licenses/>.
 */

import org.apache.spark.rdd.RDD

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

import de.kp.spark.core.model.LabeledPoint

object Optimizer {

  /**
   * Determine, from a range of cluster numbers, the number for which the
   * mean entropy of all cluster labels is minimal; note that the entropy
   * is an indicator for the homogeneity of the cluster labels
   */
  def optimizeByEntropy(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {

    /* Evaluate the candidate cluster numbers in parallel and pick the best */
    val scores = range.par.map(k => (k, clusterEntropy(data,k,iterations))).toList
    scores.minBy(_._2)._1

  }

  def clusterEntropy(data:RDD[LabeledPoint],clusters:Int,iterations:Int):Double = {

    val vectors = data.map(point => Vectors.dense(point.features))
    val model = KMeans.train(vectors,clusters,iterations)

    /* Group the labels by their predicted cluster and average the entropies */
    val entropies = data.map(point => {

      val cluster = model.predict(Vectors.dense(point.features))
      (cluster,point.label)

    }).groupBy(_._1).map(group => MathHelper.strEntropy(group._2.map(_._2))).collect()

    entropies.sum / entropies.size

  }

  /**
   * Determine, from a range of cluster numbers, the number for which the mean
   * distance between the cluster points and their cluster centers is minimal
   */
  def optimizeByDistance(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {

    val scores = range.par.map(k => (k, clusterDistance(data, k, iterations))).toList
    scores.minBy(_._2)._1

  }

  /** Euclidean distance between two feature vectors */
  def distance(a:Array[Double], b:Array[Double]) =
    Math.sqrt(a.zip(b).map(p => p._1 - p._2).map(d => d * d).sum)

  /**
   * This method calculates the mean distance of all data (vectors) from
   * their centroids, given certain clustering parameters; the method may
   * be used to score clusterings
   */
  def clusterDistance(data:RDD[LabeledPoint], clusters:Int, iterations:Int):Double = {

    val vectors = data.map(point => Vectors.dense(point.features))
    val model = KMeans.train(vectors,clusters,iterations)
    /**
     * Centroid: vector that specifies the centre of a certain cluster
     */
    val centroids = model.clusterCenters

    val distances = data.map(point => {

      val cluster = model.predict(Vectors.dense(point.features))
      val centroid = centroids(cluster)

      distance(centroid.toArray,point.features)

    }).collect()

    distances.sum / distances.size

  }

}
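/*
 * A minimal sketch (added for illustration, not part of the original file),
 * assuming a SparkContext `sc` and that LabeledPoint(label,features) from
 * de.kp.spark.core.model wraps a string label and an Array[Double] feature
 * vector; the constructor argument order is an assumption of this example.
 */
object OptimizerDemo {

  def demo(sc:org.apache.spark.SparkContext) {

    /* Two well-separated groups, so a cluster number of 2 should win */
    val points = sc.parallelize(Seq(
      LabeledPoint("a",Array(0.0,0.1)),
      LabeledPoint("a",Array(0.1,0.0)),
      LabeledPoint("b",Array(9.0,9.1)),
      LabeledPoint("b",Array(9.1,9.0))
    ))

    /* Search k in 2..4 with 10 KMeans iterations per candidate */
    println(Optimizer.optimizeByDistance(points,2 to 4,10))

  }

}
--------------------------------------------------------------------------------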