├── .gitignore
├── README.md
├── images
├── outlier-detection-overview.png
└── predictiveworks.png
├── pom.xml
└── src
└── main
├── resources
├── application.conf
├── features.xml
├── server.conf
└── states.xml
└── scala
└── de
└── kp
└── spark
└── outlier
├── Configuration.scala
├── KMeansDetector.scala
├── MarkovDetector.scala
├── OutlierServer.scala
├── RequestContext.scala
├── actor
├── BaseActor.scala
├── KMeansActor.scala
├── MarkovActor.scala
├── OutlierMaster.scala
├── OutlierMiner.scala
├── OutlierQuestor.scala
└── TrainActor.scala
├── api
└── AkkaApi.scala
├── app
└── TrainApp.scala
├── markov
├── DoubleMatrix.scala
├── MarkovBuilder.scala
├── StateMetrics.scala
└── TransitionMatrix.scala
├── model
└── Model.scala
├── spec
├── StateSpec.scala
└── VectorSpec.scala
└── util
├── MathHelper.scala
└── Optimizer.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 |
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 |
17 | # idea
18 | .idea
19 | *.iml
20 |
21 | # building
22 | target
23 | build
24 | null
25 | tmp*
26 | temp*
27 | dist
28 | test-output
29 | build.log
30 |
31 | # other scm
32 | .svn
33 | .CVS
34 | .hg*
35 |
36 | # switch to regexp syntax.
37 | # syntax: regexp
38 | # ^\.pc/
39 |
40 | # stray build output not in target directory
41 | build.log
42 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![Predictiveworks](images/predictiveworks.png)
2 |
3 | **Predictiveworks.** is an open ensemble of predictive engines designed to cover a wide range of today's analytics requirements. It brings the power of predictive analytics to Elasticsearch.
4 |
5 | ## Reactive Outlier Detection Engine
6 |
7 | ![Outlier Detection Overview](images/outlier-detection-overview.png)
8 |
9 | The Outlier Detection Engine is one of the nine members of the open ensemble. It is built to find anomalies in large-scale datasets and in human behavior,
10 | supporting advanced risk reduction.
11 |
--------------------------------------------------------------------------------
/images/outlier-detection-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/outlier-detection-overview.png
--------------------------------------------------------------------------------
/images/predictiveworks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skrusche63/spark-outlier/a02b7835dc8c8b194e52311e450d855d7e9624b5/images/predictiveworks.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2 |   <modelVersion>4.0.0</modelVersion>
3 |   <groupId>spark-outlier</groupId>
4 |   <artifactId>spark-outlier</artifactId>
5 |   <version>0.2.2</version>
6 |   <name>Spark-Outlier</name>
7 |   <description>Reactive Outlier Detection Engine</description>
8 |   <inceptionYear>2010</inceptionYear>
9 |
10 |   <licenses>
11 |     <license>
12 |       <name>My License</name>
13 |       <url>http://....</url>
14 |       <distribution>repo</distribution>
15 |     </license>
16 |   </licenses>
17 |
18 |   <properties>
19 |     <maven.compiler.source>1.6</maven.compiler.source>
20 |     <maven.compiler.target>1.6</maven.compiler.target>
21 |     <encoding>UTF-8</encoding>
22 |     <scala.tools.version>2.10</scala.tools.version>
23 |     <scala.version>2.10.0</scala.version>
24 |     <spark.version>1.2.0</spark.version>
25 |   </properties>
26 |
27 |   <dependencies>
28 |     <dependency>
29 |       <groupId>org.scala-lang</groupId>
30 |       <artifactId>scala-library</artifactId>
31 |       <version>${scala.version}</version>
32 |     </dependency>
33 |
34 |     <dependency>
35 |       <groupId>junit</groupId>
36 |       <artifactId>junit</artifactId>
37 |       <version>4.11</version>
38 |       <scope>test</scope>
39 |     </dependency>
40 |     <dependency>
41 |       <groupId>org.specs2</groupId>
42 |       <artifactId>specs2_${scala.tools.version}</artifactId>
43 |       <version>1.13</version>
44 |       <scope>test</scope>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.scalatest</groupId>
48 |       <artifactId>scalatest_${scala.tools.version}</artifactId>
49 |       <version>2.0.M6-SNAP8</version>
50 |       <scope>test</scope>
51 |     </dependency>
52 |
53 |     <dependency>
54 |       <groupId>org.apache.spark</groupId>
55 |       <artifactId>spark-core_2.10</artifactId>
56 |       <version>${spark.version}</version>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.apache.spark</groupId>
60 |       <artifactId>spark-mllib_2.10</artifactId>
61 |       <version>${spark.version}</version>
62 |     </dependency>
63 |
64 |     <dependency>
65 |       <groupId>cascading</groupId>
66 |       <artifactId>cascading-core</artifactId>
67 |       <version>2.5.4</version>
68 |     </dependency>
69 |     <dependency>
70 |       <groupId>cascading</groupId>
71 |       <artifactId>cascading-hadoop</artifactId>
72 |       <version>2.5.4</version>
73 |     </dependency>
74 |
75 |     <dependency>
76 |       <groupId>org.elasticsearch</groupId>
77 |       <artifactId>elasticsearch-hadoop</artifactId>
78 |       <version>2.0.0</version>
79 |     </dependency>
80 |     <dependency>
81 |       <groupId>org.elasticsearch</groupId>
82 |       <artifactId>elasticsearch</artifactId>
83 |       <version>1.3.2</version>
84 |     </dependency>
85 |
86 |     <dependency>
87 |       <groupId>org.json4s</groupId>
88 |       <artifactId>json4s-native_2.10</artifactId>
89 |       <version>3.2.10</version>
90 |     </dependency>
91 |
92 |     <dependency>
93 |       <groupId>redis.clients</groupId>
94 |       <artifactId>jedis</artifactId>
95 |       <version>2.5.2</version>
96 |     </dependency>
97 |
98 |     <dependency>
99 |       <groupId>org.clapper</groupId>
100 |       <artifactId>argot_2.10</artifactId>
101 |       <version>1.0.3</version>
102 |     </dependency>
103 |   </dependencies>
104 |
105 |   <repositories>
106 |     <repository>
107 |       <id>conjars.org</id>
108 |       <url>http://conjars.org/repo</url>
109 |     </repository>
110 |   </repositories>
111 |
112 |   <build>
113 |     <sourceDirectory>src/main/scala</sourceDirectory>
114 |     <testSourceDirectory>src/test/scala</testSourceDirectory>
115 |     <plugins>
116 |       <plugin>
117 |         <groupId>net.alchim31.maven</groupId>
118 |         <artifactId>scala-maven-plugin</artifactId>
119 |         <version>3.1.3</version>
120 |         <executions>
121 |           <execution>
122 |             <goals>
123 |               <goal>compile</goal>
124 |               <goal>testCompile</goal>
125 |             </goals>
126 |             <configuration>
127 |               <args>
128 |                 <arg>-make:transitive</arg>
129 |                 <arg>-dependencyfile</arg>
130 |                 <arg>${project.build.directory}/.scala_dependencies</arg>
131 |               </args>
132 |             </configuration>
133 |           </execution>
134 |         </executions>
135 |       </plugin>
136 |       <plugin>
137 |         <groupId>org.apache.maven.plugins</groupId>
138 |         <artifactId>maven-surefire-plugin</artifactId>
139 |         <version>2.13</version>
140 |         <configuration>
141 |           <useFile>false</useFile>
142 |           <disableXmlReport>true</disableXmlReport>
143 |           <includes>
144 |             <include>**/*Test.*</include>
145 |             <include>**/*Suite.*</include>
146 |           </includes>
147 |         </configuration>
148 |       </plugin>
149 |     </plugins>
150 |   </build>
151 |
152 |   <organization>
153 |     <name>Dr. Krusche &amp; Partner PartG</name>
154 |     <url>http://www.dr-kruscheundpartner.com</url>
155 |   </organization>
156 |   <url>https://github.com/skrusche63/spark-outlier</url>
157 |
158 | </project>
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | loglevel = INFO
3 | stdout-loglevel = INFO
4 | loggers = ["akka.event.slf4j.Slf4jLogger"]
5 | }
6 |
7 | actor {
8 | duration = 10
9 | retries = 10
10 | timeout = 10
11 | }
12 |
13 | #
14 | # Access to cassandra is provided by Datastax' spark-cassandra-connector; the respective
15 | # configuration parameters can be retrieved from here:
16 | #
17 | # https://github.com/datastax/spark-cassandra-connector/blob/master/doc/0_quick_start.md
18 | #
19 | cassandra {
20 | spark.cassandra.connection.host="127.0.0.1"
21 | }
22 |
23 | elastic {
24 | es.nodes="localhost"
25 | es.port="9200"
26 | es.resource=""
27 | es.query=""
28 | }
29 |
30 | file {
31 | items=""
32 | features=""
33 | }
34 |
35 | hbase {
36 | spark.hbase.host="127.0.0.1"
37 | }
38 |
39 | mongo {
40 | mongo.input.uri="mongodb://127.0.0.1:27017/beowulf.input"
41 | }
42 |
43 | mysql {
44 | url="127.0.0.1:8889"
45 | database="analytics"
46 | user="root"
47 | password="root"
48 | }
49 |
50 | redis {
51 | host="127.0.0.1"
52 | port="6379"
53 | }
54 |
55 | #
56 | # Configuration parameters for the REST API
57 | # of the Outlier Detection Engine
58 | #
59 | rest {
60 | host="127.0.0.1"
61 | port=9000
62 | }
63 |
64 | spark {
65 | spark.executor.memory="1g"
66 | spark.kryoserializer.buffer.mb="256"
67 | }
--------------------------------------------------------------------------------
/src/main/resources/features.xml:
--------------------------------------------------------------------------------
1 |
2 | row
3 | col
4 | label
5 | value
6 |
--------------------------------------------------------------------------------
/src/main/resources/server.conf:
--------------------------------------------------------------------------------
1 | akka {
2 | actor {
3 | provider = "akka.remote.RemoteActorRefProvider"
4 | }
5 | remote {
6 | enabled-transports = ["akka.remote.netty.tcp"]
7 | netty.tcp {
8 | hostname = "127.0.0.1"
9 | port = 2604
10 | }
11 | log-sent-messages = on
12 | log-received-messages = on
13 | }
14 | }
--------------------------------------------------------------------------------
/src/main/resources/states.xml:
--------------------------------------------------------------------------------
1 |
2 | site
3 | user
4 | timestamp
5 | state
6 |
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/Configuration.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import com.typesafe.config.ConfigFactory
22 | import org.apache.hadoop.conf.{Configuration => HConf}
23 |
24 | import de.kp.spark.core.{Configuration => CoreConf}
25 |
26 | object Configuration extends CoreConf {
27 |
28 | /* Load configuration for router */
29 | val path = "application.conf"
30 | val config = ConfigFactory.load(path)
31 |
32 | override def actor:(Int,Int,Int) = {
33 |
34 | val cfg = config.getConfig("actor")
35 |
36 | val duration = cfg.getInt("duration")
37 | val retries = cfg.getInt("retries")
38 | val timeout = cfg.getInt("timeout")
39 |
40 | (duration,retries,timeout)
41 |
42 | }
43 |
44 | override def cassandra:Map[String,String] = {
45 |
46 | val cfg = config.getConfig("cassandra")
47 | val conf = Map(
48 | "spark.cassandra.connection.host" -> cfg.getString("spark.cassandra.connection.host")
49 | )
50 |
51 | conf
52 |
53 | }
54 |
55 | override def elastic:HConf = {
56 |
57 | val cfg = config.getConfig("elastic")
58 | val conf = new HConf()
59 |
60 | conf.set("es.nodes",cfg.getString("es.nodes"))
61 | conf.set("es.port",cfg.getString("es.port"))
62 |
63 | conf.set("es.resource", cfg.getString("es.resource"))
64 | conf.set("es.query", cfg.getString("es.query"))
65 |
66 | conf
67 |
68 | }
69 |
70 | override def hbase:Map[String,String] = {
71 |
72 | val cfg = config.getConfig("hbase")
73 | val conf = Map(
74 | "spark.hbase.host" -> cfg.getString("spark.hbase.host")
75 | )
76 |
77 | conf
78 |
79 | }
80 |
81 | override def input:List[String] = {
82 |
83 | val cfg = config.getConfig("file")
84 |
85 | val items = cfg.getString("items")
86 | val features = cfg.getString("features")
87 |
88 | List(items,features)
89 |
90 | }
91 |
92 | override def mongo:HConf = {
93 |
94 | val cfg = config.getConfig("mongo")
95 | val conf = new HConf()
96 |
97 | conf.set("mongo.input.uri",cfg.getString("mongo.input.uri"))
98 | conf
99 |
100 | }
101 |
102 | override def mysql:(String,String,String,String) = {
103 |
104 | val cfg = config.getConfig("mysql")
105 |
106 | val url = cfg.getString("url")
107 | val db = cfg.getString("database")
108 |
109 | val user = cfg.getString("user")
110 | val password = cfg.getString("password")
111 |
112 | (url,db,user,password)
113 |
114 | }
115 |
116 | override def output:List[String] = null
117 |
118 | override def redis:(String,String) = {
119 |
120 | val cfg = config.getConfig("redis")
121 |
122 | val host = cfg.getString("host")
123 | val port = cfg.getString("port")
124 |
125 | (host,port)
126 |
127 | }
128 |
129 | override def rest:(String,Int) = {
130 |
131 | val cfg = config.getConfig("rest")
132 |
133 | val host = cfg.getString("host")
134 | val port = cfg.getInt("port")
135 |
136 | (host,port)
137 |
138 | }
139 |
140 | override def spark:Map[String,String] = {
141 |
142 | val cfg = config.getConfig("spark")
143 |
144 | Map(
145 | "spark.executor.memory" -> cfg.getString("spark.executor.memory"),
146 | "spark.kryoserializer.buffer.mb" -> cfg.getString("spark.kryoserializer.buffer.mb")
147 | )
148 |
149 | }
150 |
151 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/KMeansDetector.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 |
23 | import org.apache.spark.mllib.clustering.KMeans
24 | import org.apache.spark.mllib.linalg.Vectors
25 |
26 | import de.kp.spark.core.model._
27 | import de.kp.spark.outlier.util.{MathHelper,Optimizer}
28 |
29 | /**
30 | * KMeansDetector is a general purpose outlier detector that
31 | * detects outliers in sets of labeled features
32 | */
33 | class KMeansDetector extends Serializable {
34 |
35 | def find(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int,top:Int):List[ClusteredPoint] = {
36 |
37 | val (k,normdata) = prepare(data,strategy,iterations)
38 | detect(normdata,k,iterations,top)
39 |
40 | }
41 |
42 | def detect(normdata:RDD[LabeledPoint],k:Int,iterations:Int,top:Int):List[ClusteredPoint] = {
43 |
44 | val sc = normdata.context
45 |
46 | /*
47 | * STEP #1: Compute KMeans model
48 | */
49 | val vectors = normdata.map(point => Vectors.dense(point.features))
50 |
51 | val model = KMeans.train(vectors,k,iterations)
52 | val centroids = model.clusterCenters
53 |
54 | /*
55 | * STEP #2: Calculate the distances for all points from their clusters;
56 | * outliers are those that have the farest distance
57 | */
58 | val bcmodel = sc.broadcast(model)
59 | val points = normdata.map(point => {
60 |
61 | val vector = Vectors.dense(point.features)
62 |
63 | val cluster = bcmodel.value.predict(vector)
64 | val centroid = bcmodel.value.clusterCenters(cluster)
65 |
66 | val distance = Optimizer.distance(centroid.toArray,vector.toArray)
67 |
68 | (cluster,distance,point)
69 |
70 | })
71 |
72 | /*
73 | * Retrieve the 'top' farthest features (LabeledPoint) with respect to their clusters;
74 | * the cluster identifier is used as a grouping mechanism to specify which
75 | * features belong to which centroid
76 | */
77 | val bctop = sc.broadcast(top)
78 | points.groupBy(_._1).flatMap(x => x._2.toList.sortBy(_._2).reverse.take(bctop.value)).map(data => {
79 |
80 | val (cluster,distance,point) = data
81 | new ClusteredPoint(cluster,distance,point)
82 |
83 | }).collect().toList
84 |
85 | }
86 |
87 | def prepare(data:RDD[LabeledPoint],strategy:String="entropy",iterations:Int):(Int,RDD[LabeledPoint]) = {
88 |
89 | /*
90 | * STEP #1: Normalize data
91 | */
92 | val idlabels = data.map(p => (p.id,p.label))
93 |
94 | val features = data.map(p => p.features)
95 |
96 | val normalized = MathHelper.normalize(features)
97 | val normdata = idlabels.zip(normalized).map{case((id,label),features) => LabeledPoint(id,label, features)}
98 |
99 | /*
100 | * STEP #2: Find optimal number of clusters
101 | */
102 |
103 | /* Range of cluster center */
104 | val range = (5 to 40 by 5)
105 |
106 | val k = strategy match {
107 |
108 | case "distance" => Optimizer.optimizeByDistance(normdata, range, iterations)
109 |
110 | case "entropy" => Optimizer.optimizeByEntropy(normdata, range, iterations)
111 |
112 | }
113 |
114 | (k, normdata)
115 |
116 | }
117 |
118 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/MarkovDetector.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 |
23 | import de.kp.spark.core.model._
24 | import de.kp.spark.outlier.markov.{MarkovBuilder,StateMetrics,TransitionMatrix}
25 |
26 | /**
27 | * The MarkovDetector discovers outliers in recorded behavioral state sequences.
28 | */
29 | class MarkovDetector(@transient ctx:RequestContext,scale:Int,states:Array[String]) extends Serializable {
30 |
31 | val metrics = new StateMetrics(states)
32 |
33 | def detect(sequences:RDD[Behavior],algorithm:String,threshold:Double,matrix:TransitionMatrix):RDD[Outlier] = {
34 |
35 | val bmatrix = ctx.sc.broadcast(matrix)
36 | sequences.map(seq => {
37 |
38 | val (site,user,states) = (seq.site,seq.user,seq.states)
39 | val metric = algorithm match {
40 |
41 | case "missprob" => metrics.missProbMetric(states,bmatrix.value)
42 |
43 | case "missrate" => metrics.missRateMetric(states,bmatrix.value)
44 |
45 | case "entreduc" => metrics.entropyReductionMetric(states,bmatrix.value)
46 |
47 | }
48 |
49 | val flag = if (metric > threshold) "yes" else "no"
50 | Outlier(site,user,states,metric,flag)
51 |
52 | })
53 |
54 | }
55 |
56 | def train(sequences:RDD[Behavior]):TransitionMatrix = {
57 | new MarkovBuilder(scale,states).build(sequences)
58 | }
59 |
60 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/OutlierServer.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{ActorSystem,Props}
22 | import com.typesafe.config.ConfigFactory
23 |
24 | import de.kp.spark.core.SparkService
25 | import de.kp.spark.outlier.api.AkkaApi
26 |
27 | /**
28 | * The OutlierServer supports two different approaches to outlier discovery; one is based
29 | * on cluster analysis and determines outlier feature sets by their distance to the
30 | * cluster centers. This approach is independent of a specific use case and concentrates on
31 | * the extraction and evaluation of (equal-size) feature vectors. The other approach to
32 | * outlier discovery has a strong focus on the customers' purchase behavior and detects
33 | * those customers that behave differently from all other customers.
34 | */
35 | object OutlierServer extends SparkService {
36 |
37 | private val sc = createCtxLocal("IntentContext",Configuration.spark)
38 |
39 | def main(args: Array[String]) {
40 |
41 | val ctx = new RequestContext(sc)
42 |
43 | /**
44 | * AKKA API
45 | */
46 | val conf:String = "server.conf"
47 |
48 | val akkaSystem = ActorSystem("akka-server",ConfigFactory.load(conf))
49 | sys.addShutdownHook(akkaSystem.shutdown)
50 |
51 | new AkkaApi(akkaSystem,ctx).start()
52 |
53 | println("AKKA API activated.")
54 |
55 | }
56 |
57 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/RequestContext.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.sql.SQLContext
23 |
24 | class RequestContext( /*
25 | * Reference to the common SparkContext; this context can be used
26 | * to access HDFS based data sources or leverage the Spark machine
27 | * learning library or other Spark based functionality
28 | */
29 | @transient val sc:SparkContext) extends Serializable {
30 |
31 | val sqlc = new SQLContext(sc)
32 | val config = Configuration
33 |
34 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/BaseActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props}
22 |
23 | import de.kp.spark.core.model._
24 | import de.kp.spark.core.redis.RedisCache
25 |
26 | import de.kp.spark.outlier.Configuration
27 | import de.kp.spark.outlier.model._
28 |
29 | abstract class BaseActor extends Actor with ActorLogging {
30 |
31 | val (host,port) = Configuration.redis
32 | val cache = new RedisCache(host,port.toInt)
33 |
34 | protected def failure(req:ServiceRequest,message:String):ServiceResponse = {
35 |
36 | if (req == null) {
37 | val data = Map("message" -> message)
38 | new ServiceResponse("","",data,OutlierStatus.FAILURE)
39 |
40 | } else {
41 | val data = Map("uid" -> req.data("uid"), "message" -> message)
42 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE)
43 |
44 | }
45 |
46 | }
47 |
48 | protected def response(req:ServiceRequest,missing:Boolean):ServiceResponse = {
49 |
50 | val uid = req.data("uid")
51 |
52 | if (missing == true) {
53 | val data = Map("uid" -> uid, "message" -> Messages.MISSING_PARAMETERS(uid))
54 | new ServiceResponse(req.service,req.task,data,OutlierStatus.FAILURE)
55 |
56 | } else {
57 | val data = Map("uid" -> uid, "message" -> Messages.OUTLIER_DETECTION_STARTED(uid))
58 | new ServiceResponse(req.service,req.task,data,OutlierStatus.STARTED)
59 |
60 |
61 | }
62 |
63 | }
64 |
65 | protected def serialize(resp:ServiceResponse) = Serializer.serializeResponse(resp)
66 |
67 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/KMeansActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.Names
22 | import de.kp.spark.core.model._
23 |
24 | import de.kp.spark.outlier.{KMeansDetector,RequestContext}
25 | import de.kp.spark.outlier.model._
26 |
27 | import de.kp.spark.core.source.VectorSource
28 | import de.kp.spark.core.source.handler.VectorHandler
29 |
30 | import de.kp.spark.core.redis.RedisDB
31 |
32 | import de.kp.spark.outlier.spec.VectorSpec
33 | import scala.collection.mutable.ArrayBuffer
34 |
35 | class KMeansActor(@transient ctx:RequestContext) extends TrainActor(ctx) {
36 |
37 | val redis = new RedisDB(host,port.toInt)
38 |
39 | override def validate(req:ServiceRequest) {
40 |
41 | if (req.data.contains("top") == false)
42 | throw new Exception("Parameter 'top' is missing.")
43 |
44 | if (req.data.contains("iterations") == false)
45 | throw new Exception("Parameter 'iterations' is missing.")
46 |
47 | if (req.data.contains("strategy") == false)
48 | throw new Exception("Parameter 'strategy' is missing.")
49 |
50 | }
51 |
52 | override def train(req:ServiceRequest) {
53 |
54 | val source = new VectorSource(ctx.sc,ctx.config,new VectorSpec(req))
55 | val dataset = VectorHandler.vector2LabeledPoints(source.connect(req))
56 |
57 | val params = ArrayBuffer.empty[Param]
58 |
59 | val top = req.data("top").toInt
60 | params += Param("top","integer",top.toString)
61 |
62 | val strategy = req.data("strategy").asInstanceOf[String]
63 | params += Param("strategy","string",strategy)
64 |
65 | val iter = req.data("iterations").toInt
66 | params += Param("iterations","integer",iter.toString)
67 |
68 | cache.addParams(req, params.toList)
69 |
70 | val points = new KMeansDetector().find(dataset,strategy,iter,top).toList
71 | savePoints(req,ClusteredPoints(points))
72 |
73 | }
74 |
75 | private def savePoints(req:ServiceRequest,points:ClusteredPoints) {
76 | redis.addPoints(req,points)
77 | }
78 |
79 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/MarkovActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.Names
22 | import de.kp.spark.core.model._
23 |
24 | import de.kp.spark.core.source.StateSource
25 | import de.kp.spark.core.source.handler.StateHandler
26 |
27 | import de.kp.spark.core.redis.RedisDB
28 |
29 | import de.kp.spark.outlier.RequestContext
30 | import de.kp.spark.outlier.model._
31 |
32 | import de.kp.spark.outlier.MarkovDetector
33 | import de.kp.spark.outlier.spec.StateSpec
34 |
35 | import scala.collection.mutable.ArrayBuffer
36 |
37 | class MarkovActor(@transient ctx:RequestContext) extends TrainActor(ctx) {
38 |
39 | val redis = new RedisDB(host,port.toInt)
40 |
41 | override def validate(req:ServiceRequest) {
42 |
43 | if (req.data.contains("scale") == false)
44 | throw new Exception("Parameter 'scale' is missing.")
45 |
46 | if (req.data.contains("states") == false)
47 | throw new Exception("Parameter 'states' is missing.")
48 |
49 | if (req.data.contains("strategy") == false)
50 | throw new Exception("Parameter 'strategy' is missing.")
51 |
52 | if (req.data.contains("threshold") == false)
53 | throw new Exception("Parameter 'threshold' is missing.")
54 |
55 | }
56 |
57 | override def train(req:ServiceRequest) {
58 |
59 | val source = new StateSource(ctx.sc,ctx.config,new StateSpec(req))
60 | val sequences = StateHandler.state2Behavior(source.connect(req))
61 |
62 | val scale = req.data(Names.REQ_SCALE).toInt
63 | val states = req.data(Names.REQ_STATES).split(",")
64 |
65 | val detector = new MarkovDetector(ctx,scale,states)
66 |
67 | val model = detector.train(sequences)
68 |
69 | val params = ArrayBuffer.empty[Param]
70 |
71 | val strategy = req.data("strategy")
72 | params += Param("strategy","string",strategy)
73 |
74 | val threshold = req.data("threshold").toDouble
75 | params += Param("threshold","double",threshold.toString)
76 |
77 | cache.addParams(req, params.toList)
78 |
79 | val outliers = detector.detect(sequences,strategy,threshold,model).collect().toList
80 |
81 | saveOutliers(req,new Outliers(outliers))
82 |
83 | }
84 |
85 | private def saveOutliers(req:ServiceRequest,outliers:Outliers) {
86 | redis.addOutliers(req,outliers)
87 | }
88 |
89 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/OutlierMaster.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{ActorRef,Props}
22 |
23 | import de.kp.spark.core.actor._
24 | import de.kp.spark.core.model._
25 |
26 | import de.kp.spark.outlier.{Configuration,RequestContext}
27 |
28 | class OutlierMaster(@transient ctx:RequestContext) extends BaseMaster(Configuration) {
29 |
30 | protected def actor(worker:String):ActorRef = {
31 |
32 | worker match {
33 | /*
34 | * Metadata management is part of the core functionality; field or metadata
35 | * specifications can be registered in, and retrieved from a Redis database.
36 | */
37 | case "fields" => context.actorOf(Props(new FieldQuestor(Configuration)))
38 | case "register" => context.actorOf(Props(new BaseRegistrar(Configuration)))
39 | /*
40 | * Index management is part of the core functionality; an Elasticsearch
41 | * index can be created and appropriate (tracked) items can be saved.
42 | */
43 | case "index" => context.actorOf(Props(new BaseIndexer(Configuration)))
44 | case "track" => context.actorOf(Props(new BaseTracker(Configuration)))
45 |
46 | case "params" => context.actorOf(Props(new ParamQuestor(Configuration)))
47 | /*
48 | * Request the actual status of an outlier detection
49 | * task; note that get requests should only be invoked after
50 | * having retrieved a FINISHED status.
51 | *
52 | * Status management is part of the core functionality.
53 | */
54 | case "status" => context.actorOf(Props(new StatusQuestor(Configuration)))
55 |
56 | case "get" => context.actorOf(Props(new OutlierQuestor()))
57 | case "train" => context.actorOf(Props(new OutlierMiner(ctx)))
58 |
59 | case _ => null
60 |
61 | }
62 |
63 | }
64 |
65 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/OutlierMiner.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{ActorRef,Props}
22 |
23 | import de.kp.spark.core.Names
24 |
25 | import de.kp.spark.core.actor._
26 | import de.kp.spark.core.model._
27 |
28 | import de.kp.spark.outlier.{Configuration,RequestContext}
29 | import de.kp.spark.outlier.model._
30 |
31 | /**
32 | * The focus of the OutlierMiner is on the model building task, either
33 | * for cluster-analysis based feature outliers or Markov based state outliers.
34 | */
35 | class OutlierMiner(@transient ctx:RequestContext) extends BaseTrainer(Configuration) {
36 |
37 | protected def validate(req:ServiceRequest):Option[String] = {
38 |
39 | val uid = req.data(Names.REQ_UID)
40 |
41 | if (cache.statusExists(req)) {
42 | val message = Messages.TASK_ALREADY_STARTED(uid)
43 | return Some(message)
44 |
45 | }
46 |
47 | req.data.get(Names.REQ_ALGORITHM) match {
48 |
49 | case None => {
50 | return Some(Messages.NO_ALGORITHM_PROVIDED(uid))
51 | }
52 |
53 | case Some(algorithm) => {
54 | if (Algorithms.isAlgorithm(algorithm) == false) {
55 | return Some(Messages.ALGORITHM_IS_UNKNOWN(uid,algorithm))
56 | }
57 |
58 | }
59 |
60 | }
61 |
62 | req.data.get(Names.REQ_SOURCE) match {
63 |
64 | case None => {
65 | return Some(Messages.NO_SOURCE_PROVIDED(uid))
66 | }
67 |
68 | case Some(source) => {
69 | if (Sources.isSource(source) == false) {
70 | return Some(Messages.SOURCE_IS_UNKNOWN(uid,source))
71 | }
72 | }
73 |
74 | }
75 |
76 | None
77 |
78 | }
79 |
80 | /**
81 | * This is a helper method to determine which actor has to be
82 | * created to support the requested algorithm; currently KMeans
83 | * and Markov based algorithms are supported.
84 | */
85 | protected def actor(req:ServiceRequest):ActorRef = {
86 |
87 | val algorithm = req.data(Names.REQ_ALGORITHM)
88 | if (algorithm == Algorithms.KMEANS) {
89 | context.actorOf(Props(new KMeansActor(ctx)))
90 |
91 | } else {
92 | context.actorOf(Props(new MarkovActor(ctx)))
93 |
94 | }
95 |
96 | }
97 |
98 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/OutlierQuestor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{Actor,ActorLogging,ActorRef,Props}
22 |
23 | import de.kp.spark.core.Names
24 | import de.kp.spark.core.model._
25 |
26 | import de.kp.spark.core.redis.RedisDB
27 |
28 | import de.kp.spark.outlier.model._
29 |
30 | class OutlierQuestor extends BaseActor {
31 |
32 | implicit val ec = context.dispatcher
33 | private val redis = new RedisDB(host,port.toInt)
34 |
35 | def receive = {
36 |
37 | case req:ServiceRequest => {
38 |
39 | val origin = sender
40 | val uid = req.data("uid")
41 |
42 | val Array(task,topic) = req.task.split(":")
43 | topic match {
44 |
45 | case "state" => {
46 |
47 | val response = {
48 |
49 | if (redis.outliersExists(req) == false) {
50 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid))
51 |
52 | } else {
53 |
54 | val outliers = redis.outliers(req)
55 |
56 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> outliers)
57 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS)
58 |
59 | }
60 | }
61 |
62 | origin ! response
63 | context.stop(self)
64 |
65 | }
66 |
67 | case "vector" => {
68 |
69 | val response = {
70 |
71 | if (redis.pointsExist(req) == false) {
72 | failure(req,Messages.OUTLIERS_DO_NOT_EXIST(uid))
73 |
74 | } else {
75 |
76 | val points = redis.points(req)
77 |
78 | val data = Map(Names.REQ_UID -> uid, Names.REQ_RESPONSE -> points)
79 | new ServiceResponse(req.service,req.task,data,OutlierStatus.SUCCESS)
80 |
81 | }
82 |
83 | }
84 | origin ! response
85 | context.stop(self)
86 |
87 | }
88 |
89 | case _ => {
90 |
91 | val msg = Messages.TASK_IS_UNKNOWN(uid,req.task)
92 |
93 | origin ! failure(req,msg)
94 | context.stop(self)
95 |
96 | }
97 |
98 | }
99 |
100 | }
101 |
102 | }
103 |
104 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/actor/TrainActor.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.actor
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.model._
22 |
23 | import de.kp.spark.outlier.RequestContext
24 | import de.kp.spark.outlier.model._
25 |
26 | class TrainActor(@transient ctx:RequestContext) extends BaseActor {
27 |
28 | def receive = {
29 |
30 | case req:ServiceRequest => {
31 |
32 | val origin = sender
33 | val missing = try {
34 |
35 | validate(req)
36 | false
37 |
38 | } catch {
39 | case e:Exception => true
40 |
41 | }
42 |
43 | origin ! response(req, missing)
44 |
45 | if (missing == false) {
46 |
47 | try {
48 |
49 | /* Update cache */
50 | cache.addStatus(req,OutlierStatus.TRAINING_STARTED)
51 |
52 | train(req)
53 |
54 | /* Update cache */
55 | cache.addStatus(req,OutlierStatus.TRAINING_FINISHED)
56 |
57 | } catch {
58 | case e:Exception => cache.addStatus(req,OutlierStatus.FAILURE)
59 | }
60 |
61 | }
62 |
63 | context.stop(self)
64 |
65 | }
66 |
67 | case _ => {
68 |
69 | log.error("unknown request.")
70 | context.stop(self)
71 |
72 | }
73 |
74 | }
75 |
76 | protected def validate(req:ServiceRequest) = {}
77 |
78 | protected def train(req:ServiceRequest) {}
79 |
80 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/api/AkkaApi.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.api
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import akka.actor.{ActorSystem,Props}
22 |
23 | import de.kp.spark.outlier.RequestContext
24 | import de.kp.spark.outlier.actor.OutlierMaster
25 |
26 | class AkkaApi(system:ActorSystem,@transient val ctx:RequestContext) {
27 |
28 | val master = system.actorOf(Props(new OutlierMaster(ctx)), name="outlier-master")
29 |
30 | def start() {
31 | while (true) {} /* block the main thread so the actor system keeps serving requests */
32 | }
33 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/app/TrainApp.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.app
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.SparkContext
22 |
23 | import akka.actor._
24 | import com.typesafe.config.ConfigFactory
25 |
26 | import org.clapper.argot._
27 |
28 | import de.kp.spark.core.Names
29 | import de.kp.spark.core.model._
30 |
31 | import de.kp.spark.core.actor.Supervisor
32 | import de.kp.spark.core.SparkService
33 |
34 | import de.kp.spark.outlier.{Configuration,RequestContext}
35 |
36 | import de.kp.spark.outlier.actor.OutlierMaster
37 | import de.kp.spark.outlier.model._
38 |
39 | import scala.concurrent.duration.DurationInt
40 | import scala.collection.mutable.HashMap
41 |
42 | object TrainApp extends SparkService {
43 |
44 | protected val sc = createCtxLocal("OutlierContext",Configuration.spark)
45 | protected val system = ActorSystem("OutlierSystem")
46 |
47 | protected val inbox = Inbox.create(system)
48 |
49 | sys.addShutdownHook({
50 | /*
51 | * In case of a system shutdown, we also make clear
52 | * that the SparkContext is properly stopped as well
53 | * as the respective Akka actor system
54 | */
55 | sc.stop
56 | system.shutdown
57 |
58 | })
59 |
60 | def main(args:Array[String]) {
61 |
62 | try {
63 |
64 | val req_params = createParams(args)
65 | val req = new ServiceRequest("context","train:model",req_params)
66 |
67 | val ctx = new RequestContext(sc)
68 | val actor = system.actorOf(Props(new Handler(ctx)))
69 |
70 | inbox.watch(actor)
71 | actor ! req
72 |
73 | val timeout = DurationInt(req_params("timeout").toInt).minute
74 |
75 | while (inbox.receive(timeout).isInstanceOf[Terminated] == false) {}
76 | sys.exit
77 |
78 | } catch {
79 | case e:Exception => {
80 |
81 | println(e.getMessage)
82 | sys.exit
83 |
84 | }
85 |
86 | }
87 |
88 | }
89 |
90 | protected def createParams(args:Array[String]):Map[String,String] = {
91 |
92 | import ArgotConverters._
93 |
94 | val parser = new ArgotParser(
95 | programName = "Outlier Analysis Engine",
96 | compactUsage = true,
97 | preUsage = Some("Version %s. Copyright (c) 2015, %s.".format("1.0","Dr. Krusche & Partner PartG"))
98 | )
99 |
100 | val site = parser.option[String](List("key"),"key","Unique application key")
101 | val uid = parser.option[String](List("uid"),"uid","Unique job identifier")
102 |
103 | val name = parser.option[String](List("name"),"name","Unique job designator")
104 |
105 | val config = parser.option[String](List("config"),"config","Configuration file")
106 | parser.parse(args)
107 |
108 | /* Collect parameters */
109 | val params = HashMap.empty[String,String]
110 |
111 | /* Validate parameters */
112 | site.value match {
113 |
114 | case None => parser.usage("Parameter 'key' is missing.")
115 | case Some(value) => params += "site" -> value
116 |
117 | }
118 |
119 | uid.value match {
120 |
121 | case None => parser.usage("Parameter 'uid' is missing.")
122 | case Some(value) => params += "uid" -> value
123 |
124 | }
125 |
126 | name.value match {
127 |
128 | case None => parser.usage("Parameter 'name' is missing.")
129 | case Some(value) => params += "name" -> value
130 |
131 | }
132 |
133 | config.value match {
134 |
135 | case None => parser.usage("Parameter 'config' is missing.")
136 | case Some(value) => {
137 |
138 | val cfg = ConfigFactory.load(value)
139 |
140 | val algo = cfg.getString("algo")
141 | if (Algorithms.isAlgorithm(algo) == false)
142 | parser.usage("Parameter 'algo' must be one of [KMEANS, SKMEANS].")
143 |
144 | params += "algorithm" -> algo
145 | params += "source" -> cfg.getString("source")
146 |
147 | /* COMMON */
148 | params += "strategy" -> cfg.getString("strategy")
149 |
150 | /* KMEANS */
151 | params += "k" -> cfg.getInt("k").toString
152 |
153 | /* MARKOV */
154 | params += "threshold" -> cfg.getDouble("threshold").toString
155 |
156 | params += "scale" -> cfg.getInt("scale").toString
157 | params += "states" -> cfg.getString("states")
158 |
159 | }
160 |
161 | }
162 |
163 | /* Add timestamp as global parameter */
164 | params += "timestamp" -> new java.util.Date().getTime.toString
165 | params.toMap
166 |
167 | }
168 |
169 | }
170 |
171 | class Handler(@transient ctx:RequestContext) extends Actor {
172 |
173 | private val config = Configuration
174 | def receive = {
175 |
176 | case req:ServiceRequest => {
177 |
178 | val start = new java.util.Date().getTime
179 | println("Trainer started at " + start)
180 |
181 | val master = context.actorOf(Props(new OutlierMaster(ctx)))
182 | master ! Serializer.serializeRequest(req)
183 |
184 | val status = OutlierStatus.TRAINING_FINISHED
185 | val supervisor = context.actorOf(Props(new Supervisor(req,status,config)))
186 |
187 | }
188 |
189 | case evt:StatusEvent => {
190 | /*
191 | * The StatusEvent message is returned from the
192 | * supervisor actor and specifies that the model
193 | * training task has been finished
194 | */
195 | val end = new java.util.Date().getTime
196 | println("Trainer finished at " + end)
197 |
198 | context.stop(self)
199 |
200 | }
201 |
202 | case msg:String => {
203 |
204 | val end = new java.util.Date().getTime
205 | println("Trainer finished at " + end)
206 |
207 | val response = Serializer.deserializeResponse(msg)
208 |
209 | println("Message: " + response.data("message").toString)
210 | println("Status: " + response.status)
211 |
212 | }
213 |
214 | }
215 |
216 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/DoubleMatrix.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import scala.collection.mutable.ArrayBuffer
22 | import scala.Array.canBuildFrom
23 |
24 | class DoubleMatrix(numRow:Int,numCol:Int) {
25 |
26 | protected val table:Array[Array[Double]] = Array.fill[Double](numRow,numCol)(0.0)
27 |
28 | protected var rowLabels = Array.empty[String]
29 | protected var colLabels = Array.empty[String]
30 |
31 | def setStates(rowStates:Array[String], colStates:Array[String]) {
32 |
33 | this.rowLabels = rowStates
34 | this.colLabels = colStates
35 |
36 | }
37 |
38 | def set(row:Int,col:Int,valu:Double) {
39 | table(row)(col) = valu
40 | }
41 |
42 | def get(row:Int,col:Int):Double = table(row)(col)
43 |
44 | def getRow(row:Int):Array[Double] = table(row)
45 |
46 | def getRow(rowLabel:String):Array[Double] = table(rowLabels.indexOf(rowLabel))
47 |
48 | def getRowLabel(row:Int) = rowLabels(row)
49 |
50 | def getColLabel(col:Int) = colLabels(col)
51 |
52 | def add(row:Int,col:Int,valu:Double) {
53 | table(row)(col) = table(row)(col) + valu
54 | }
55 |
56 | def add(rowLabel:String,colLabel:String,valu:Double) {
57 |
58 | val (row,col) = getRowCol(rowLabel,colLabel)
59 | table(row)(col) += valu
60 |
61 | }
62 |
63 | def increment(row:Int,col:Int) {
64 | table(row)(col) = table(row)(col) + 1
65 | }
66 |
67 | def increment(rowLabel:String, colLabel:String) {
68 |
69 | val (row,col) = getRowCol(rowLabel, colLabel)
70 | table(row)(col) = table(row)(col) + 1
71 |
72 | }
73 |
74 | def getRowSum(row:Int):Double = table(row).sum
75 |
76 | def getColumnSum(col:Int):Double = {
77 |
78 | var sum:Double = 0
79 | (0 until numRow).foreach(row => sum += table(row)(col))
80 |
81 | sum
82 |
83 | }
84 |
85 | def serialize():String = {
86 |
87 | val output = ArrayBuffer.empty[String]
88 | (0 until numRow).foreach(row => output += serializeRow(row))
89 |
90 | output.mkString(";")
91 |
92 | }
93 |
94 | def serializeRow(row:Int):String = table(row).mkString(",")
95 |
96 | def deserialize(data:String) {
97 |
98 | val rows = data.split(";")
99 | (0 until rows.length).foreach(row => deserializeRow(row,rows(row)))
100 |
101 | }
102 |
103 | def deserializeRow(row:Int,data:String) {
104 | table(row) = data.split(",").map(_.toDouble)
105 | }
106 |
107 |
108 | private def getRowCol(rowLabel:String,colLabel:String):(Int,Int) = {
109 |
110 | val row = rowLabels.indexOf(rowLabel)
111 | val col = colLabels.indexOf(colLabel)
112 |
113 | (row,col)
114 |
115 | }
116 |
117 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/MarkovBuilder.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 |
23 | import de.kp.spark.core.model._
24 | import scala.collection.mutable.HashMap
25 |
26 | private case class Pair(ant:String,con:String)
27 |
28 | class MarkovBuilder(scaleDef:Int,stateDefs:Array[String]) extends Serializable {
29 |
30 | def build(dataset:RDD[Behavior]):TransitionMatrix = {
31 |
32 | def seqOp(support:HashMap[Pair,Int],seq:Behavior):HashMap[Pair,Int] = {
33 |
34 | val (site,user,states) = (seq.site,seq.user,seq.states)
35 | /*
36 | * The pair support aggregates over all sites and users provided;
37 | * for an outlier detection, we assume that this is the best way
38 | * to determine state transition probabilities
39 | */
40 | for (i <- 1 until states.size) {
41 |
42 | val pair = new Pair(states(i-1),states(i))
43 |
44 | support.get(pair) match {
45 | case None => support += pair -> 1
46 | case Some(count) => support += pair -> (count + 1)
47 | }
48 |
49 | }
50 |
51 | support
52 |
53 | }
54 |
55 | /* With coalesce(1,false) there is a single partition, so supp1 is only the empty initial value */
56 | def combOp(supp1:HashMap[Pair,Int],supp2:HashMap[Pair,Int]):HashMap[Pair,Int] = supp2
57 |
58 | /* Build pair support */
59 | val pairsupp = dataset.coalesce(1, false).aggregate(HashMap.empty[Pair,Int])(seqOp,combOp)
60 |
61 | /* Setup transition matrix and add pair support*/
62 | val dim = stateDefs.length
63 |
64 | val matrix = new TransitionMatrix(dim,dim)
65 | matrix.setScale(scaleDef)
66 |
67 | matrix.setStates(stateDefs, stateDefs)
68 | for ((pair,support) <- pairsupp) {
69 | matrix.add(pair.ant, pair.con, support)
70 | }
71 |
72 | /* Normalize the matrix content and transform support into probabilities */
73 | matrix.normalize()
74 |
75 | matrix
76 |
77 | }
78 |
79 | }
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/StateMetrics.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 |
3 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
4 | *
5 | * This file is part of the Spark-Outlier project
6 | * (https://github.com/skrusche63/spark-outlier).
7 | *
8 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
9 | * terms of the GNU General Public License as published by the Free Software
10 | * Foundation, either version 3 of the License, or (at your option) any later
11 | * version.
12 | *
13 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
14 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 | * You should have received a copy of the GNU General Public License along with
17 | * Spark-Outlier.
18 | *
19 | * If not, see <http://www.gnu.org/licenses/>.
20 | */
21 |
22 | class StateMetrics(stateDefs:Array[String]) extends Serializable {
23 |
24 | /*
25 | * Miss Probability Metric
26 | *
27 | * For any pair of consecutive transaction states t(i) and t(j) in a sequence,
28 | * the following quantity is calculated: for the row corresponding to t(i), we
29 | * sum all the probabilities except that of the target state t(j).
30 | *
31 | * F(t(i), t(j)) = Sum(P(t(i), t(k)) | k != j) where P(t(i), t(k)) is the probability
32 | * of transitioning from transaction state t(i) to t(k)
33 | *
34 | * Then we sum F over all the transaction state pairs in the sequence and normalize by
35 | * the number of such pairs.
36 | */
37 |
38 | def missProbMetric(states:List[String],model:TransitionMatrix):Double = {
39 |
40 | var F:Double = 0
41 | var count:Int = 0
42 |
43 | for (i <- 1 until states.size) {
44 |
45 | val srcIndex = stateDefs.indexOf(states(i-1))
46 | val tarIndex = stateDefs.indexOf(states(i))
47 |
48 | /* Sum all probabilities except the target state */
49 | for (j <- 0 until stateDefs.length) {
50 | if (j != tarIndex)
51 | F += model.get(srcIndex,j)
52 | }
53 |
54 | count += 1
55 | }
56 |
57 | val metric = F / count
58 | metric
59 |
60 | }
61 |
62 | /*
63 | * Miss Rate Metric
64 | *
65 | * For any transition, if it corresponds to the maximum probability target state, the value is 0; otherwise it is 1.
66 | *
67 | * F(t(i), t(j)) = 0 if t(j) = t(k), else 1, where t(k) is the target state with P(t(i), t(k)) = max(P(t(i), t(l))) over all l
68 | *
69 | * Then we sum F over all the transaction state pairs in the sequence and normalize by
70 | * the number of such pairs.
71 | */
72 | def missRateMetric(states:List[String],model:TransitionMatrix):Double = {
73 |
74 | var F:Double = 0
75 | var count:Int = 0
76 |
77 | for (i <- 1 until states.size) {
78 |
79 | val srcIndex = stateDefs.indexOf(states(i-1))
80 | val tarIndex = stateDefs.indexOf(states(i))
81 |
82 | val row = model.getRow(srcIndex); val maxIndex = row.indexOf(row.max) /* index of the most probable target state */
83 |
84 | F += (if (tarIndex == maxIndex) 0 else 1)
85 | count += 1
86 |
87 | }
88 |
89 | val metric = F / count
90 | metric
91 |
92 | }
93 |
94 | /*
95 | * Entropy Reduction Metric
96 | *
97 | * We calculate two quantities F and G as below. For a given row, F is the entropy excluding the target state of the state pair
98 | * under consideration; G is the entropy of the whole row.
99 | *
100 | * F(t(i), t(j)) = Sum(-P(t(i), t(k)) * log(P(t(i), t(k))) | t(k) != t(j))
101 | * G(t(i)) = Sum(-P(t(i), t(k)) * log(P(t(i), t(k))))
102 | *
103 | * We sum F and G over all consecutive state pairs and divide the two sums.
104 | */
105 | def entropyReductionMetric(states:List[String],model:TransitionMatrix):Double = {
106 |
107 | var F:Double = 0
108 | var G:Double = 0
109 |
110 | for (i <- 1 until states.size) {
111 |
112 | val srcIndex = stateDefs.indexOf(states(i-1))
113 | val tarIndex = stateDefs.indexOf(states(i))
114 |
115 | for (j <- 0 until stateDefs.length) {
116 |
117 | val prob = model.get(srcIndex,j)
118 | val entropy = -prob * Math.log(prob) /* prob > 0 after normalize(), so the log stays finite */
119 |
120 |
121 | if (j != tarIndex) F += entropy
122 | G += entropy
123 |
124 | }
125 |
126 | }
127 |
128 | val metric = F / G
129 | metric
130 |
131 | }
132 |
133 | }
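134 |
135 | /* A worked sketch with hypothetical numbers: for stateDefs = Array("A","B")
136 |  * and a normalized model whose row for "A" is [0.8, 0.2], the sequence
137 |  *
138 |  *   List("A","A","B")
139 |  *
140 |  * contains two transitions: A -> A hits the row maximum (contributes 0 to F),
141 |  * A -> B misses it (contributes 1), so missRateMetric = 1 / 2 = 0.5.
142 |  */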
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/markov/TransitionMatrix.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.markov
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | class TransitionMatrix(numRow:Int,numCol:Int) extends DoubleMatrix(numRow,numCol) {
22 |
23 | private var scale = 100
24 |
25 | def setScale(scale:Int) {
26 | this.scale = scale
27 | }
28 |
29 | def normalize() {
30 | /*
31 | * Laplace correction: every cell of a row that contains
32 | * at least one zero value is shifted by 1
33 | */
34 | (0 until numRow).foreach(row => {
35 |
36 | val transProbs = getRow(row)
37 | if (transProbs.min == 0) {
38 | (0 until numCol).foreach(col => table(row)(col) += 1)
39 | }
40 |
41 | })
42 |
43 | /* Normalize transition support */
44 | (0 until numRow).foreach(row => {
45 | val rowSum = getRowSum(row)
46 | (0 until numCol).foreach(col => table(row)(col) = (table(row)(col) * scale) / rowSum)
47 | })
48 |
49 | }
50 |
51 | }
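52 |
53 | /* A worked sketch of normalize() with the default scale of 100: a row with
54 |  * raw supports [3, 0, 1] contains a zero, so the Laplace correction shifts
55 |  * it to [4, 1, 2]; the row sum is then 7 and the normalized row becomes
56 |  * approximately [57.1, 14.3, 28.6], i.e. transition probabilities scaled
57 |  * to 100.
58 |  */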
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/model/Model.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.model
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.model._
22 |
23 | object Algorithms {
24 |
25 | val KMEANS:String = "KMEANS"
26 | val MARKOV:String = "MARKOV"
27 |
28 | private def algorithms = List(KMEANS,MARKOV)
29 | def isAlgorithm(algorithm:String):Boolean = algorithms.contains(algorithm)
30 |
31 | }
32 |
33 | object Serializer extends BaseSerializer
34 |
35 | object Messages extends BaseMessages {
36 |
37 | def MISSING_PARAMETERS(uid:String):String = String.format("""Parameters are missing for uid '%s'.""", uid)
38 |
39 | def NO_METHOD_PROVIDED(uid:String):String = String.format("""No method provided for uid '%s'.""", uid)
40 |
41 | def METHOD_NOT_SUPPORTED(uid:String):String = String.format("""The provided method is not supported for uid '%s'.""", uid)
42 |
43 | def OUTLIER_DETECTION_STARTED(uid:String) = String.format("""Outlier detection started for uid '%s'.""", uid)
44 |
45 | def OUTLIERS_DO_NOT_EXIST(uid:String):String = String.format("""The outliers for uid '%s' do not exist.""", uid)
46 |
47 | }
48 |
49 | object OutlierStatus extends BaseStatus {
50 |
51 | val DATASET:String = "dataset"
52 | val TRAINED:String = "trained"
53 |
54 | val STARTED:String = "started"
55 | val STOPPED:String = "stopped"
56 |
57 | val FINISHED:String = "finished"
58 | val RUNNING:String = "running"
59 |
60 | }
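61 |
62 | /* A typical guard built from these constants (sketch; 'req' and 'uid' are
63 |  * hypothetical, as is the 'data' map on the request):
64 |  *
65 |  *   val algorithm = req.data("algorithm")
66 |  *   if (!Algorithms.isAlgorithm(algorithm)) {
67 |  *     // answer with Messages.METHOD_NOT_SUPPORTED(uid)
68 |  *   }
69 |  */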
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/StateSpec.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.spec
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.model._
22 | import de.kp.spark.core.redis.RedisCache
23 |
24 | import de.kp.spark.core.spec.Fields
25 | import de.kp.spark.outlier.Configuration
26 |
27 | import scala.xml._
28 | import scala.collection.mutable.Buffer
29 |
30 | class StateSpec(req:ServiceRequest) extends Fields {
31 |
32 | val path = "states.xml"
33 |
34 | val (host,port) = Configuration.redis
35 | val cache = new RedisCache(host,port.toInt)
36 |
37 | private val fields = load
38 |
39 | def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap
40 |
41 | def names:List[String] = fields.map(_.name)
42 |
43 | def types:List[String] = fields.map(_.datatype)
44 |
45 | private def load:List[Field] = { /* def, not val: 'fields' above would otherwise capture an uninitialized forward reference */
46 |
47 | val data = Buffer.empty[Field]
48 |
49 | try {
50 |
51 | if (cache.fieldsExist(req)) {
52 |
53 | val fieldspec = cache.fields(req)
54 | for (field <- fieldspec) {
55 | data += Field(field.name,field.datatype,field.value)
56 | }
57 |
58 | } else {
59 |
60 | val root = XML.load(getClass.getClassLoader.getResource(path))
61 | for (field <- root \ "field") {
62 |
63 | val _name = (field \ "@name").toString
64 | val _type = (field \ "@type").toString
65 |
66 | val _mapping = field.text
67 |
68 | data += Field(_name,_type,_mapping)
69 |
70 | }
71 |
72 | }
73 |
74 | } catch {
75 | case e:Exception => {} /* swallow and fall back to an empty field list */
76 | }
77 |
78 | data.toList
79 |
80 | }
81 |
82 | }
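83 |
84 | /* The fallback loader above expects <field> elements with 'name' and 'type'
85 |  * attributes and the mapping as element text; a plausible states.xml is
86 |  * therefore (root element and field names here are illustrative, not the
87 |  * shipped file):
88 |  *
89 |  *   <fields>
90 |  *     <field name="site" type="string">site</field>
91 |  *     <field name="state" type="string">state</field>
92 |  *   </fields>
93 |  */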
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/spec/VectorSpec.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.spec
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import de.kp.spark.core.model._
22 | import de.kp.spark.core.redis.RedisCache
23 |
24 | import de.kp.spark.core.spec.Fields
25 | import de.kp.spark.outlier.Configuration
26 |
27 | import scala.xml._
28 | import scala.collection.mutable.Buffer
29 |
30 | class VectorSpec(req:ServiceRequest) extends Fields {
31 |
32 | val path = "features.xml"
33 |
34 | val (host,port) = Configuration.redis
35 | val cache = new RedisCache(host,port.toInt)
36 |
37 | private val fields = load
38 |
39 | def mapping:Map[String,String] = fields.map(x => (x.name,x.value)).toMap
40 |
41 | def names:List[String] = fields.map(_.name)
42 |
43 | def types:List[String] = fields.map(_.datatype)
44 |
45 | private def load:List[Field] = { /* def, not val: 'fields' above would otherwise capture an uninitialized forward reference */
46 |
47 | val data = Buffer.empty[Field]
48 |
49 | try {
50 |
51 | if (cache.fieldsExist(req)) {
52 |
53 | val fieldspec = cache.fields(req)
54 | for (field <- fieldspec) {
55 | data += Field(field.name,field.datatype,field.value)
56 | }
57 |
58 | } else {
59 |
60 | val root = XML.load(getClass.getClassLoader.getResource(path))
61 | for (field <- root \ "field") {
62 |
63 | val _name = (field \ "@name").toString
64 | val _type = (field \ "@type").toString
65 |
66 | val _mapping = field.text
67 |
68 | data += Field(_name,_type,_mapping)
69 |
70 | }
71 |
72 | }
73 |
74 | } catch {
75 | case e:Exception => {} /* swallow and fall back to an empty field list */
76 | }
77 |
78 | data.toList
79 |
80 | }
81 |
82 | }
83 |
84 |
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/MathHelper.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.util
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 |
23 | object MathHelper {
24 |
25 | /**
26 | * Base-2 entropy (in bits) of a dataset containing integer counts
27 | */
28 | def intEntropy(data:TraversableOnce[Int]):Double = {
29 |
30 | val invLog2 = 1.0 / Math.log(2)
31 |
32 | val positives = data.toList.filter(_ > 0) /* materialize once: a TraversableOnce must not be traversed twice */
33 | if (positives.nonEmpty) {
34 |
35 | val sum: Double = positives.sum
36 | val invSum = 1.0 / sum.toDouble
37 |
38 | positives.map {positive =>
39 |
40 | val p = positive.toDouble * invSum
41 | -p * Math.log(p) * invLog2 /* report entropy in bits, consistent with strEntropy */
42 |
43 | }.sum
44 |
45 | } else {
46 | 0.0
47 | }
48 |
49 | }
50 |
51 | /**
52 | * Base-2 entropy of a dataset containing strings; it may be
53 | * used as a measure of the homogeneity of the strings
54 | */
55 | def strEntropy(data:TraversableOnce[String]):Double = {
56 |
57 | val invLog2 = 1.0 / Math.log(2)
58 | /* materialize once: a TraversableOnce must not be traversed twice */
59 | val items = data.toList; val len = items.size
60 | if (len > 1) {
61 |
62 | val invLen = 1.0 / len.toDouble
63 | var ent = 0.0
64 |
65 | for (str <- items.distinct) {
66 | /*
67 | * Probability to find a certain value within the dataset
68 | */
69 | val pstr = items.count(x => x == str).toDouble * invLen
70 | ent -= pstr * Math.log(pstr) * invLog2
71 |
72 | }
73 |
74 | ent
75 |
76 | } else {
77 | 0.0
78 |
79 | }
80 |
81 | }
82 |
83 | /**
84 | * Data is a distributed list of feature vectors (Array[Double]) with the
85 | * following semantics: vector = [f_0,f_1,f_2, ...]; i.e. each vector holds
86 | * the value of feature i at position i. Normalizing these data means
87 | * normalizing all values of feature f_0, all values of f_1, etc. independently
88 | */
89 | def normalize(data:RDD[Array[Double]]):RDD[Array[Double]] = {
90 |
91 | val total = data.count()
92 |
93 | /*
94 | * Each column of the data matrix is assigned to a certain feature;
95 | * we therefore have to sum up the values of each column independently
96 | * and build the mean value
97 | */
98 | val sums = data.reduce((a,b) => a.zip(b).map(t => t._1 + t._2))
99 | val means = sums.map(_ / total)
100 |
101 | /*
102 | * We build the standard deviation for the values of each column
103 | */
104 | val len = sums.length
105 |
106 | val init = new Array[Double](len)
107 | val sumSquares = data.aggregate(init)((acc,v) => acc.zip(v).map(t => t._1 + t._2*t._2),(a,b) => a.zip(b).map(t => t._1 + t._2)) /* aggregate, not fold: the combiner must add partial sums without squaring them again */
108 |
109 | val stdevs = sumSquares.zip(sums).map {
110 | case(sumSq,sum) => Math.sqrt(total*sumSq - sum*sum) / total
111 | }
112 |
113 | /*
114 | * Finally for each column (or feature), each single values gets
115 | * normalized using the mean value and standard deviations
116 | */
117 | val normdata = data.map(vector =>
118 |
119 | (vector,means,stdevs).zipped.map((value,mean,stdev) => {
120 | if (stdev <= 0) (value-mean) else (value-mean) / stdev
121 |
122 | })
123 |
124 | )
125 |
126 | normdata
127 |
128 | }
129 |
130 | }
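131 |
132 | /* Quick checks for the two entropy helpers (both use the base-2 convention):
133 |  *
134 |  *   MathHelper.strEntropy(List("a","a","b","b"))   // 1.0 bit, maximal mix
135 |  *   MathHelper.strEntropy(List("a","a","a","a"))   // 0.0, fully homogeneous
136 |  *   MathHelper.intEntropy(List(1,1))               // 1.0 bit (two equal counts)
137 |  */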
--------------------------------------------------------------------------------
/src/main/scala/de/kp/spark/outlier/util/Optimizer.scala:
--------------------------------------------------------------------------------
1 | package de.kp.spark.outlier.util
2 | /* Copyright (c) 2014 Dr. Krusche & Partner PartG
3 | *
4 | * This file is part of the Spark-Outlier project
5 | * (https://github.com/skrusche63/spark-outlier).
6 | *
7 | * Spark-Outlier is free software: you can redistribute it and/or modify it under the
8 | * terms of the GNU General Public License as published by the Free Software
9 | * Foundation, either version 3 of the License, or (at your option) any later
10 | * version.
11 | *
12 | * Spark-Outlier is distributed in the hope that it will be useful, but WITHOUT ANY
13 | * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 | * A PARTICULAR PURPOSE. See the GNU General Public License for more details.
15 | * You should have received a copy of the GNU General Public License along with
16 | * Spark-Outlier.
17 | *
18 | * If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | import org.apache.spark.rdd.RDD
22 |
23 | import org.apache.spark.mllib.clustering.KMeans
24 | import org.apache.spark.mllib.linalg.Vectors
25 |
26 | import de.kp.spark.core.model.LabeledPoint
27 |
28 | object Optimizer {
29 |
30 | /**
31 | * Determine, from a range of cluster counts, the one for which the mean
32 | * entropy of the cluster labels is minimal; note that the entropy is
33 | * an indicator of the homogeneity of the cluster labels
34 | */
35 | def optimizeByEntropy(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {
36 |
37 | val scores = range.par.map(k => (k, clusterEntropy(data,k,iterations))).toList
38 | scores.sortBy(_._2).head._1
39 |
40 | }
41 |
42 | def clusterEntropy(data: RDD[LabeledPoint],clusters:Int,iterations:Int):Double = {
43 |
44 | val vectors = data.map(point => Vectors.dense(point.features))
45 | val model = KMeans.train(vectors,clusters,iterations)
46 |
47 | val entropies = data.map(point => {
48 |
49 | val cluster = model.predict(Vectors.dense(point.features))
50 | (cluster,point.label)
51 |
52 | }).groupBy(_._1).map(data => MathHelper.strEntropy(data._2.map(_._2))).collect()
53 |
54 | entropies.sum / entropies.size
55 |
56 | }
57 |
58 | /**
59 | * Determine, from a range of cluster counts, the one for which the mean
60 | * distance between cluster points and their cluster centers is minimal
61 | */
62 | def optimizeByDistance(data:RDD[LabeledPoint],range:Range,iterations:Int):Int = {
63 |
64 | val scores = range.par.map(k => (k, clusterDistance(data, k, iterations))).toList
65 | scores.sortBy(_._2).head._1
66 |
67 | }
68 |
69 | def distance(a:Array[Double], b:Array[Double]) =
70 | Math.sqrt(a.zip(b).map(p => p._1 - p._2).map(d => d * d).sum)
71 |
72 | /**
73 | * This method calculates the mean distance of all data (vectors) from
74 | * their centroids, given certain clustering parameters; the method may
75 | * be used to score clusters
76 | */
77 | def clusterDistance(data: RDD[LabeledPoint], clusters:Int, iterations:Int):Double = {
78 |
79 | val vectors = data.map(point => Vectors.dense(point.features))
80 | val model = KMeans.train(vectors,clusters,iterations)
81 | /**
82 | * Centroid: Vector that specifies the centre of a certain cluster
83 | */
84 | val centroids = model.clusterCenters
85 |
86 | val distances = data.map(point => {
87 |
88 | val cluster = model.predict(Vectors.dense(point.features))
89 | val centroid = centroids(cluster)
90 |
91 | distance(centroid.toArray,point.features)
92 |
93 | }).collect()
94 |
95 | distances.sum / distances.size
96 |
97 | }
98 |
99 | }
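100 |
101 | /* A minimal usage sketch (hypothetical 'points'): choose the cluster count
102 |  * in [2,10] that minimizes the mean point-to-centroid distance, then train
103 |  * the final model with it:
104 |  *
105 |  *   val k = Optimizer.optimizeByDistance(points, 2 to 10, 20)
106 |  *   val model = KMeans.train(points.map(p => Vectors.dense(p.features)), k, 20)
107 |  */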
--------------------------------------------------------------------------------