├── spark ├── project │ ├── build.properties │ └── plugins.sbt ├── version.sbt ├── src │ └── main │ │ └── scala │ │ └── org │ │ └── beymani │ │ ├── sanity │ │ └── WordCount.scala │ │ └── spark │ │ ├── common │ │ ├── PseudoRelevanceThresholdFinder.scala │ │ ├── OutlierScoreLevelShift.scala │ │ └── OutlierCounter.scala │ │ ├── seq │ │ └── LocalNeighborhoodDetector.scala │ │ └── pc │ │ └── PrincipalComponentPredictor.scala └── build.sbt ├── manifest.mf ├── resource ├── IntroductionToBeymani.docx ├── cpsale.conf ├── vib.conf ├── mmfr.properties ├── ouli.sh ├── mhist.sh ├── mm_seqn.sh ├── avdi.sh ├── negr.sh ├── rede.sh ├── dsort.sh ├── mdist.sh ├── nede.sh ├── mm_modl.sh ├── rt_predict.properties ├── hist.json ├── bsm.json ├── model_calibration_tutorial.txt ├── ecommDataStream.json ├── build_storm.xml ├── xaction_states.rb ├── knn_udr.properties ├── spark_dependency.txt ├── beymani_spark.xml ├── epid.conf ├── vib.sh ├── cpsale.sh ├── ae_ticket.properties ├── xaction_queue.py ├── mob_loc.properties ├── ecomm_hierarchy.json ├── cyd.conf ├── alarm_threshold_tuning_tutorial.txt ├── bsm.conf ├── jar_dependency.txt ├── unsup_model_drift_detection_tutorial.txt ├── epid.sh ├── sup_model_drift_detection_tutorial.txt ├── monitoring_order_processing_system_with_isolation_forest.txt ├── ticket.conf ├── cycle_detection_tutorial.txt ├── proximity_tutorial.txt ├── autoencoder_based_cust_svc_case_anomaly_detection.txt ├── machinary_fault_detection_with_subsequence_anomaly_tutorial.txt ├── salean.sh ├── issue_service_time_anomaly_detection_tutorial.txt ├── sales_data_change_point_detection_tutorial.txt ├── cyd.sh ├── health_monitoring_data_anomaly_detection_tutorial.txt ├── and.conf ├── salean.conf ├── ticket.sh ├── ecomm.conf ├── bsm.sh ├── quarantine_violation_detection_tutorial.txt ├── cct.rb ├── cpu_usage_anomaly_det_tutorial.txt ├── rel_density_tutorial.txt ├── real_time_fraud_prediction_tutorial.txt ├── retail_sale_monitoring_with_anomaly_detection_tutorial.txt ├── and_spark.sh └── ecomm.sh ├── .gitignore ├── src └── main │ └── java │ └── org │ └── beymani │ ├── util │ ├── SequencedScore.java │ ├── SeequenceScoreAggregator.java │ ├── DataStream.java │ ├── SequenceMatcher.java │ └── DataStreamSchema.java │ ├── predictor │ ├── PredictorSpout.java │ ├── EntropyIncreaseBasedPredictor.java │ ├── EstimatedProbabilityBasedPredictor.java │ ├── ExtremeValuePredictor.java │ ├── FileSpout.java │ ├── EstimatedCumProbabilityBasedPredictor.java │ ├── ModelBasedPredictor.java │ ├── OutlierPredictor.java │ ├── MahalanobisDistancePredictor.java │ ├── EstimatedMetaProbabilityBasedPredictor.java │ └── InterPercentileDifferenceBasedPredictor.java │ └── proximity │ └── RelativeDensity.java ├── python └── app │ ├── wsbot.py │ ├── cpsale.py │ ├── mvand.py │ ├── bvib.py │ ├── olss.py │ ├── bls.py │ └── cpu_usage.py └── README.md /spark/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.3 2 | -------------------------------------------------------------------------------- /spark/version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.0-SNAPSHOT" -------------------------------------------------------------------------------- /manifest.mf: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | X-COMMENT: Main-Class will be added automatically by build 3 | 4 | 
-------------------------------------------------------------------------------- /resource/IntroductionToBeymani.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pranab/beymani/HEAD/resource/IntroductionToBeymani.docx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/**/* 2 | .settings/ 3 | .project 4 | .classpath 5 | /target 6 | spark/project/project 7 | spark/project/target 8 | spark/target 9 | spark/lib_managed 10 | spark/src_managed 11 | spark/project/boot 12 | spark/tmp 13 | project/ 14 | .history 15 | spark/dist 16 | .DS_Store 17 | .cache 18 | spark/bin 19 | .class 20 | .ivy2 21 | 22 | -------------------------------------------------------------------------------- /resource/cpsale.conf: -------------------------------------------------------------------------------- 1 | changePointDetector { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [2] 6 | seq.fieldOrd = 1 7 | window.size = 200 8 | stat.type = CVM 9 | stat.critValue = 38.863 10 | seq.chPtOutFilePath = "file:///Users/pranab/Projects/bin/beymani/other/cpsale" 11 | debug.on = true 12 | save.output = true 13 | } 14 | -------------------------------------------------------------------------------- /resource/vib.conf: -------------------------------------------------------------------------------- 1 | 2 | subSequenceDistanceDetector { 3 | field.delim.in = "," 4 | field.delim.out = "," 5 | id.fieldOrdinals = [0] 6 | attr.ordinal = 2 7 | seq.fieldOrd = 1 8 | window.size = 40 9 | score.threshold = 0.2 10 | ref.filePath = "file:///Users/pranab/Projects/bin/beymani/other/vib/vib_ref.txt" 11 | output.precision = 3 12 | debug.on = true 13 | save.output = true 14 | } -------------------------------------------------------------------------------- /resource/mmfr.properties: -------------------------------------------------------------------------------- 1 | field.delim.regex=, 2 | field.delim.out=, 3 | num.reducer=1 4 | debug.on=false 5 | 6 | #Projection 7 | pro.projection.operation=grouping 8 | pro.key.field=0 9 | pro.projection.field=2 10 | 11 | #MarkovStateTransitionModel 12 | mst.skip.field.count=1 13 | mst.model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS 14 | mst.trans.prob.scale=1 15 | -------------------------------------------------------------------------------- /resource/ouli.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.NumericSorter 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/avdi 6 | OUT_PATH=/user/pranab/cct/ouli 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mhist.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.MultiVarHistogram 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input 6 | OUT_PATH=/user/pranab/cct/mhist 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo 
"removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mm_seqn.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.Projection 3 | 4 | echo "running mr" 5 | IN_PATH=/Users/pranab/mmfr/input 6 | OUT_PATH=/Users/pranab/mmfr/sequence 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/avdi.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.AverageDistance 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/simi 6 | OUT_PATH=/user/pranab/cct/avdi 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/negr.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.AverageDistance 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/simi 6 | OUT_PATH=/user/pranab/cct/negr 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/rede.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.RelativeDensity 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/nede 6 | OUT_PATH=/user/pranab/cct/rede 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/dsort.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.dist.DistributionSorter 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/mdist 6 | OUT_PATH=/user/pranab/cct/dsort 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mdist.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | 
CLASS_NAME=org.beymani.dist.MultiVariateDistribution 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input 6 | OUT_PATH=/user/pranab/cct/mdist 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/nede.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.NeighborDensity 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input/nede 6 | OUT_PATH=/user/pranab/cct/nede 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mm_modl.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar 2 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel 3 | 4 | echo "running mr" 5 | IN_PATH=/Users/pranab/mmfr/sequence 6 | OUT_PATH=/Users/pranab/mmfr/model 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/rt_predict.properties: -------------------------------------------------------------------------------- 1 | 2 | predictor.model=mm 3 | predictor.spout.threads=1 4 | predictor.bolt.threads=2 5 | num.workers=1 6 | debug=on 7 | 8 | messaging.provider=redis 9 | redis.server.host=localhost 10 | redis.server.port=6379 11 | redis.markov.model.key=xactionMarkovModel 12 | redis.input.queue=xactionQueue 13 | local.predictor=true 14 | state.seq.window.size=5 15 | state.ordinal=1 16 | detection.algorithm=missProbability 17 | metric.threshold=0.96 18 | redis.output.queue=fraudQueue 19 | -------------------------------------------------------------------------------- /resource/hist.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields" : 3 | [ 4 | { 5 | "name" : "xid", 6 | "ordinal" : 0, 7 | "id" : true, 8 | "dataType" : "string" 9 | }, 10 | { 11 | "name" : "time", 12 | "ordinal" : 1, 13 | "dataType" : "int", 14 | "bucketWidth" : 60 15 | }, 16 | { 17 | "name" : "amount", 18 | "ordinal" : 2, 19 | "dataType" : "double", 20 | "bucketWidth" : 100 21 | }, 22 | { 23 | "name" : "vendor", 24 | "ordinal" : 3, 25 | "dataType" : "categorical" 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /resource/bsm.json: -------------------------------------------------------------------------------- 1 | { 2 | "attributes" : 3 | [ 4 | { 5 | "name" : "devID", 6 | "ordinal" : 0, 7 | "dataType" : "string", 8 | "targetFieldOrdinals" : [0] 9 | }, 10 | { 11 | "name" : "timeStamp", 12 | "ordinal" : 1, 13 | "dataType" : "long", 14 | "targetFieldOrdinals" : [1] 15 | }, 16 | { 17 | "name" : "measurement", 18 | "ordinal" : 2, 19 | "dataType" : "int", 20 | "buckeWidth" : 5.0, 21 | "transformers" : ["discretizerTrans"], 22 | 
"targetFieldOrdinals" : [2] 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /spark/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1") 4 | 5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 6 | 7 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4") 8 | 9 | resolvers ++= Seq( 10 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 11 | "Akka Repository" at "https://repo.akka.io/releases/", 12 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools" 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /resource/model_calibration_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for finding calibration properties of a machine lrarning model. 2 | 3 | Setup 4 | ===== 5 | Make sure you have ../lib ../supv directories with all the python files in there wrt 6 | where heart_disease.py is. Alternatively you can use ../python/app directory of avenir as 7 | your working directory 8 | 9 | Generate data and train model 10 | ============================= 11 | Please refer to heart_disease_prediction_with_random_forest_tutorial.txt 12 | 13 | Global calibration 14 | ================== 15 | ./heart_disease.py calib 16 | 17 | Local caliv=bration 18 | =================== 19 | ./heart_disease.py calibLoc -------------------------------------------------------------------------------- /resource/ecommDataStream.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataStreams" : 3 | [ 4 | { 5 | "id" : "corp", 6 | "type" : "root", 7 | "parentId" : "none", 8 | "parentType" : "none", 9 | "singleton" : true 10 | }, 11 | { 12 | "id" : "sale", 13 | "type" : "sale", 14 | "parentId" : "root", 15 | "parentType" : "root", 16 | "singleton" : true 17 | }, 18 | { 19 | "id" : "*", 20 | "type" : "prodSale", 21 | "parentId" : "sale", 22 | "parentType" : "sale", 23 | "singleton" : false 24 | }, 25 | { 26 | "id" : "scAbandon", 27 | "type" : "scAbandon", 28 | "parentId" : "root", 29 | "parentType" : "root", 30 | "singleton" : true 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /resource/build_storm.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Packaging into a single uber JAR 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /resource/xaction_states.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require '../lib/util.rb' 4 | 5 | 6 | custCount = ARGV[0].to_i 7 | 8 | custIDs = [] 9 | amountDist = CategoricalField.new("L",35,"M",53,"H",12) 10 | typeDist = CategoricalField.new("N",85,"H",15) 11 | timeElapsedDist = CategoricalField.new("L",35,"N",45,"S",20) 12 | 13 | 14 | idGen = IdGenerator.new 15 | 1.upto custCount do 16 | custIDs << idGen.generate(10) 17 | end 18 | 19 | #num of transactions 20 | 1.upto 15 do 21 | #number of customers 22 | 1.upto custCount do 23 | if (rand(10) < 9) 24 | cid = custIDs[rand(custIDs.length)] 25 | xid = idGen.generate(12) 26 | puts 
"#{cid},#{xid},#{amountDist.value}#{typeDist.value}#{timeElapsedDist.value}" 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /resource/knn_udr.properties: -------------------------------------------------------------------------------- 1 | common.mode=train 2 | common.model.directory=model 3 | common.model.file=knn_udr_model 4 | common.preprocessing=scale 5 | common.scaling.method=minmax 6 | common.verbose=True 7 | common.logging.file=./log/knn.log 8 | common.logging.level=info 9 | train.data.file=chdr.txt 10 | train.data.fields=1,2,3,4,5,6,7,8 11 | train.data.feature.fields=0,1,2,3,4,5,6 12 | train.data.class.field=7 13 | train.num.neighbors=9 14 | train.neighbor.weight=_ 15 | train.neighbor.search.algo=_ 16 | train.neighbor.search.leaf.size=_ 17 | train.neighbor.dist.metric=_ 18 | train.neighbor.dist.metric.pow=_ 19 | train.success.criterion=_ 20 | train.model.save=_ 21 | train.score.method=_ 22 | predict.data.file=chdr.txt 23 | predict.data.fields=1,2,3,4,5,6,7,8 24 | predict.data.feature.fields=0,1,2,3,4,5,6 25 | predict.use.saved.model=_ 26 | 27 | 28 | -------------------------------------------------------------------------------- /resource/spark_dependency.txt: -------------------------------------------------------------------------------- 1 | Build all necessary jars 2 | ======================== 3 | in chombo 4 | mvn clean install 5 | sbt publishLocal 6 | 7 | in chombo/spark 8 | sbt clean package 9 | sbt publishLocal 10 | 11 | in hoidla 12 | mvn clean install 13 | sbt publishLocal 14 | 15 | in beymani 16 | mvn clean install 17 | sbt publishLocal 18 | 19 | in beymani/spark 20 | sbt clean package 21 | 22 | Build uber jar 23 | ============== 24 | ant -f beymani_spark.xml 25 | 26 | uber jar file name is uber-beymani-spark-1.0.jar 27 | 28 | If you are using Spark 2.0+, please add the following line to beymani_spark.xml, because 29 | type safe jar is not included in newer versions of Spark 30 | 31 | 32 | 33 | Please change the directory path, as per your environment -------------------------------------------------------------------------------- /resource/beymani_spark.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Packaging into a single uber JAR 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /resource/epid.conf: -------------------------------------------------------------------------------- 1 | outRangeBasedPredictor { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [2,3] 6 | score.threshold = 0.80 7 | seq.fieldOrd=1 8 | exp.const = 2000.0 9 | attr.weights = [0.5, 0.5] 10 | attr.weightStrategy = max 11 | range.global = false 12 | range.filePath="/Users/pranab/Projects/bin/beymani/other/epid/outr/qualist.txt" 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | inRangeBasedPredictor { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0] 21 | attr.ordinals = [2,3] 22 | score.threshold = 0.500 23 | seq.fieldOrd=1 24 | exp.const=5000.0 25 | attr.weights = [0.5, 0.5] 26 | attr.weightStrategy = max 27 | range.global=true 28 | range.globalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/inr/uniq_qualist.txt" 29 | range.LocalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/qua_lo_loc.txt" 30 | debug.on = true 31 | save.output = true 32 | } 
-------------------------------------------------------------------------------- /resource/vib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar 7 | MASTER=spark://akash:7077 8 | 9 | case "$1" in 10 | 11 | "olPred") 12 | echo "running SubSequenceDistanceDetector" 13 | CLASS_NAME=org.beymani.spark.seq.SubSequenceDistanceDetector 14 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/vib/* 15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/vib 16 | rm -rf ./output/vib 17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 18 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT vib.conf 19 | rm -rf ./output/vib/_SUCCESS 20 | ls -l ./output/vib 21 | for f in ./output/vib/* 22 | do 23 | echo "number of outliers in $f" 24 | cat $f | grep ,O | wc -l 25 | done 26 | ;; 27 | 28 | *) 29 | echo "unknown operation $1" 30 | ;; 31 | 32 | esac -------------------------------------------------------------------------------- /resource/cpsale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "cpInp") 11 | echo "args: data_file " 12 | cp $2 $PROJECT_HOME/bin/beymani/input/cpsale/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/cpsale/ 14 | ;; 15 | 16 | "cpPred") 17 | echo "running ChangePointDetector Spark job" 18 | CLASS_NAME=org.beymani.spark.misc.ChangePointDetector 19 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/cpsale/* 20 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/cpsale 21 | rm -rf ./output/cpsale 22 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 23 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT cpsale.conf 24 | wc -l ./output/cpsale/part-00000 25 | wc -l ./output/cpsale/part-00001 26 | ;; 27 | 28 | 29 | *) 30 | echo "unknown operation $1" 31 | ;; 32 | 33 | esac -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/sanity/WordCount.scala: -------------------------------------------------------------------------------- 1 | package org.beymani.sanity 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.SparkContext._ 5 | 6 | object WordCount { 7 | def main(args: Array[String]) { 8 | val master = args.length match { 9 | case x: Int if x > 0 => args(0) 10 | case _ => "local" 11 | } 12 | val sc = new SparkContext(master, "WordCount", System.getenv("SPARK_HOME")) 13 | val input = args.length match { 14 | case x: Int if x > 1 => sc.textFile(args(1)) 15 | case _ => sc.parallelize(List("pandas", "i like pandas")) 16 | } 17 | val words = input.flatMap(line => line.split(" ")) 18 | args.length match { 19 | case x: Int if x > 2 => { 20 | val counts = words.map(word => (word, 1)).reduceByKey{case (x,y) => x + y} 21 | counts.saveAsTextFile(args(2)) 22 | } 23 | case _ => { 24 | val wc = words.countByValue() 25 | println(wc.mkString(",")) 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- 
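
As background for the SubSequenceDistanceDetector job run by vib.sh above (configured in vib.conf), the idea is that each sliding window of the test series is compared with reference subsequences taken from normal data, and a window whose distance to the nearest reference subsequence exceeds score.threshold is reported as an outlier. The Python sketch below illustrates that idea with a length normalized Euclidean distance on synthetic data; it is not the Spark implementation, and the signal, window size and threshold are made up.

import math

def window_distance(w1, w2):
    # Euclidean distance between two windows, normalized by window length
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(w1, w2))) / len(w1)

def subseq_scores(series, ref_series, window_size):
    refs = [ref_series[i:i + window_size] for i in range(len(ref_series) - window_size + 1)]
    scores = []
    for i in range(len(series) - window_size + 1):
        win = series[i:i + window_size]
        # score of a window is its distance to the closest reference subsequence
        scores.append(min(window_distance(win, r) for r in refs))
    return scores

# synthetic vibration like signal with a high frequency component injected as a fault
ref = [math.sin(0.2 * i) for i in range(200)]
test = [math.sin(0.2 * i) for i in range(200)]
for i in range(120, 160):
    test[i] += 0.8 * math.sin(2.5 * i)

scores = subseq_scores(test, ref, window_size=40)
outliers = [i for i, s in enumerate(scores) if s > 0.05]
print("flagged windows:", len(outliers), "first flagged window starts at:", outliers[0])
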
/resource/ae_ticket.properties: -------------------------------------------------------------------------------- 1 | common.mode=training 2 | common.model.directory=./model/ae 3 | common.model.file=cus.mod 4 | common.preprocessing=scale 5 | common.scaling.method=zscale 6 | common.verbose=True 7 | common.device=_ 8 | train.data.file=cus_tr.txt 9 | train.data.fields=1,2,3,4,5,6,7,8 10 | train.data.feature.fields=0,1,2,3,4,5,6 11 | train.num.input=7 12 | train.num.hidden.units=6,5 13 | train.encoder.activations=relu,sigmoid 14 | train.decoder.activations=sigmoid,sigmoid 15 | train.batch.size=32 16 | train.num.iterations=200 17 | train.loss.reduction=_ 18 | train.lossFn=mse 19 | train.optimizer=_ 20 | train.opt.learning.rate=.001 21 | train.opt.weight.decay=_ 22 | train.opt.momentum=_ 23 | train.opt.eps=_ 24 | train.opt.dampening=_ 25 | train.opt.momentum.nesterov=_ 26 | train.opt.betas=_ 27 | train.opt.alpha=_ 28 | train.noise.scale=0.05 29 | train.tied.weights=True 30 | train.model.save=False 31 | train.track.error=True 32 | train.batch.intv=5 33 | train.loss.av.window=5 34 | train.loss.diff.threshold=0.001 35 | encode.use.saved.model=_ 36 | encode.data.file=cus_te.txt 37 | encode.feat.pad.size=50 -------------------------------------------------------------------------------- /spark/build.sbt: -------------------------------------------------------------------------------- 1 | name := "beymani-spark" 2 | 3 | organization := "org.beymani" 4 | 5 | version := "1.0" 6 | 7 | scalaVersion := "2.12.0" 8 | 9 | scalacOptions := Seq("-unchecked", "-deprecation") 10 | 11 | isSnapshot := true 12 | 13 | libraryDependencies ++=Seq( 14 | "org.apache.spark" %% "spark-core" % "3.0.0-preview" % "provided", 15 | "org.apache.spark" %% "spark-streaming" % "3.0.0-preview" % "provided", 16 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.0.0-preview", 17 | "org.apache.commons" % "commons-lang3" % "3.0", 18 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3", 19 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.12" % "2.9.4", 20 | "org.apache.lucene" % "lucene-core" % "7.1.0", 21 | "org.apache.lucene" % "lucene-analyzers-common" % "7.1.0", 22 | "junit" % "junit" % "4.7" % "test", 23 | "org.scalatest" % "scalatest_2.10" % "2.0" % "test", 24 | "org.chombo" %% "chombo-spark" % "1.0", 25 | "mawazo" %% "chombo" % "1.0", 26 | "mawazo" %% "beymani" % "1.0", 27 | "mawazo" %% "hoidla" % "1.0", 28 | "mawazo" %% "avenir" % "1.0", 29 | "gov.nist.math" % "jama" % "1.0.3" 30 | ) 31 | -------------------------------------------------------------------------------- /resource/xaction_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import redis 5 | 6 | op = sys.argv[1] 7 | r = redis.StrictRedis(host='localhost', port=6379, db=0) 8 | 9 | if (op == "setModel"): 10 | modelFile = sys.argv[2] 11 | with open (modelFile, "r") as myfile: 12 | modelData=myfile.read() 13 | 14 | r.set('xactionMarkovModel', modelData) 15 | elif (op == "getModel"): 16 | model = r.get("xactionMarkovModel") 17 | print model 18 | elif (op == "writeQueue"): 19 | xactionFile = sys.argv[2] 20 | with open (xactionFile, "r") as myfile: 21 | for line in myfile.readlines(): 22 | #print line.rstrip('\n') 23 | r.lpush("xactionQueue", line.rstrip('\n')) 24 | elif (op == "readQueue"): 25 | while True: 26 | line = r.rpop("xactionQueue") 27 | if line is not None: 28 | print line 29 | else: 30 | break 31 | elif (op == "queueLength"): 32 | qlen = 
r.llen("xactionQueue") 33 | print qlen 34 | elif (op == "readOutQueue"): 35 | while True: 36 | out = r.rpop("fraudQueue") 37 | if out is not None: 38 | print out 39 | else: 40 | break 41 | elif (op == "outQueueLength"): 42 | qlen = r.llen("fraudQueue") 43 | print qlen 44 | -------------------------------------------------------------------------------- /resource/mob_loc.properties: -------------------------------------------------------------------------------- 1 | common.verbose=_ 2 | population.num.hours=48 3 | population.sampling.interval=5 4 | population.size=1000 5 | population.num.family=200 6 | population.family.size.mean=_ 7 | population.family.size.sd=_ 8 | population.working.family.percentage=_ 9 | population.retired.one.person.family.percentage=_ 10 | region.lat.min=37.000 11 | region.lat.max=37.500 12 | region.long.min=-122.500 13 | region.long.max=-122.000 14 | region.num.business=_ 15 | region.biz.size.mean=_ 16 | region.biz.size.size.sd=_ 17 | region.num.office"=_ 18 | region.office.size.mean=_ 19 | region.biz.size.size.sd=_ 20 | region.num.schools=_ 21 | region.num.colleges=_ 22 | region.quarantine.list.file=qualist.txt 23 | region.num.locations=2 24 | region.loc.size=0.0024 25 | region.quarantine.loc.file=qualoc.txt 26 | region.quarantine.num.violation=5 27 | region.residence.list.file=res_loc.txt 28 | region.work.list.file=work_loc.txt 29 | region.school.list.file=school_loc.txt 30 | region.medical.facility.list.file=med_loc.txt 31 | region.shopping.area.list.file=shop_loc.txt 32 | region.entertainment.area.list.file=ent_loc.txt 33 | region.large.event.area.list.file=event_loc.txt 34 | region.open.space.list.file=open_loc.txt 35 | 36 | 37 | -------------------------------------------------------------------------------- /resource/ecomm_hierarchy.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataStreams" : 3 | [ 4 | { 5 | "id" : "corp", 6 | "type" : "root", 7 | "parentId" : "none", 8 | "parentType" : "none", 9 | "singleton" : true 10 | }, 11 | { 12 | "id" : "sale", 13 | "type" : "sale", 14 | "parentId" : "root", 15 | "parentType" : "root", 16 | "singleton" : true 17 | }, 18 | { 19 | "id" : "electronics", 20 | "type" : "dept", 21 | "parentId" : "sale", 22 | "parentType" : "sale", 23 | "singleton" : false, 24 | "childrenId" : ["31W6CN4OGP","ATROK5G187","54RLEB9L5J","P3N63F2TPP","L674KMOI01","38A2F7U4XK","L0668572D0","BS6RHF2PV2","C88L3DYBB9","NX23WR8JJW"] 25 | }, 26 | { 27 | "id" : "clothing", 28 | "type" : "dept", 29 | "parentId" : "sale", 30 | "parentType" : "sale", 31 | "singleton" : false, 32 | "childrenId" : ["IYZN3F9WCX","2DPXUFR93R","7MRHFY4L70","3FHQOJ45IJ","H4T8785L41","P3RVWCZS37","GZ4819T12I","OGX2037784","9021SDZ1O6","U62K213GI2"] 33 | }, 34 | { 35 | "id" : "*", 36 | "type" : "prodSale", 37 | "parentId" : "electronics", 38 | "parentType" : "dept", 39 | "singleton" : false 40 | }, 41 | { 42 | "id" : "*", 43 | "type" : "prodSale", 44 | "parentId" : "clothing", 45 | "parentType" : "dept", 46 | "singleton" : false 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SequencedScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import org.chombo.util.Pair; 21 | 22 | /** 23 | * Outlier score for a sequence element 24 | * @author pranab 25 | * 26 | */ 27 | public class SequencedScore extends Pair { 28 | private static final long serialVersionUID = 4277362152194891790L; 29 | 30 | public SequencedScore(long seq, double score) { 31 | super(seq, score); 32 | } 33 | 34 | public long getSeq() { 35 | return left; 36 | } 37 | 38 | public double getScore() { 39 | return right; 40 | } 41 | 42 | public void setScore(double score) { 43 | right = score; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /resource/cyd.conf: -------------------------------------------------------------------------------- 1 | 2 | numericalAttrStats { 3 | field.delim.in = "," 4 | field.delim.out = "," 5 | id.fieldOrdinals = [0] 6 | attr.ordinals = [2] 7 | seasonal.analysis = false 8 | part.bySeasonCycle = false 9 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 10 | time.fieldOrdinal = 1 11 | time.inMili = false 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | temporalAggregator { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | attr.ordinals = [2] 21 | id.fieldOrdinals = [0] 22 | time.fieldOrdinal = 1 23 | time.inMili = false 24 | aggr.windowTimeUnit = "hour" 25 | aggr.windowTimeLength = 1 26 | aggr.type = "average" 27 | output.compact = true 28 | output.precision = 3 29 | debug.on = true 30 | save.output = true 31 | } 32 | 33 | autoCorrelation { 34 | field.delim.in = "," 35 | field.delim.out = "," 36 | seq.fieldOrdinal = 1 37 | id.fieldOrdinals = [0] 38 | attr.ordinals = [2] 39 | output.precision = 3 40 | coor.lags = [24, 48, 168] 41 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt" 42 | mean.fieldOrd = 4 43 | debug.on = true 44 | save.output = true 45 | } 46 | 47 | typedUniqueValueCounter { 48 | field.delim.in = "," 49 | field.delim.out = "," 50 | id.fieldOrdinals = [0, 1, 2] 51 | attr.ordinals = [5] 52 | attr.5.type = "double" 53 | seasonal.analysis = true 54 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 55 | time.fieldOrdinal = 4 56 | time.inMili = false 57 | output.precision = 3 58 | debug.on = true 59 | save.output = true 60 | } 61 | 62 | -------------------------------------------------------------------------------- /resource/alarm_threshold_tuning_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for tuning the threshold in anomaly detection system based on supervised learning 2 | using user feedback data 3 | 4 | Environment 5 | =========== 6 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 7 | environment 8 | 9 | Build 10 | ===== 11 | Follow instructions in spark_dependency.txt 12 | 13 | Python dependency 14 | ================= 15 | The shell script commands for data generation run python scripts for data generation. Before you run 16 | the data generation commands do the following 17 | 1. 
checkout project avenir 18 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file 19 | 20 | Generate outlier detected data 21 | ============================== 22 | Please follow the tutorial cpu_usage_anomaly_det_tutorial.txt to generate data with outliers detected. 23 | Consolidate Spark generated output files into 1 file 24 | 25 | Simulate user feedback 26 | ====================== 27 | ./cpu_usage.py feedback 28 | 29 | outlier_file_name = file generated in the previous step 30 | cur_threshold = threshold set outlier detection spark jobs. It's the parameter score.threshold 31 | in and.conf file 32 | new_threshold = if set higher than cur_threshold, it will simulate the case false positive 33 | i.e too many alarms 34 | 35 | Run spark job 36 | ============= 37 | ./and_spark.sh thLearn 38 | 39 | Configuration 40 | ============= 41 | It's in and.conf file. Through the parameter split.points multiple split points are provided. -------------------------------------------------------------------------------- /resource/bsm.conf: -------------------------------------------------------------------------------- 1 | 2 | #device data 3 | dataTransformer { 4 | field.delim.in = "," 5 | field.delim.out = "," 6 | schema.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm.json" 7 | debug.on = true 8 | save.output = true 9 | transformers { 10 | discretizerTrans { 11 | } 12 | } 13 | } 14 | 15 | markovStateTransitionModel { 16 | field.delim.in = "," 17 | field.delim.out = "," 18 | id.field.ordinals = [0] 19 | seq.start.ordinal = 0 20 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"] 21 | output.precision = 3 22 | data.seqLongFormat = true 23 | seq.field.ordinal = 1 24 | state.field.ordinal = 2 25 | data.mergeKeysNeeded = true 26 | data.laplaceCorrNeeded = true 27 | output.compact = false 28 | debug.on = true 29 | save.output = true 30 | } 31 | 32 | markovChainPredictor { 33 | field.delim.in = "," 34 | field.delim.out = "," 35 | predictor.strategy = "conditinalProbability" 36 | id.fieldOrdinals = [0] 37 | output.precision = 6 38 | score.threshold = 3.7 39 | attr.ordinal = 2 40 | seq.fieldOrd = 1 41 | window.size = 4 42 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"] 43 | stateTrans.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm_mod.txt" 44 | stateTrans.compact = false 45 | model.global = true 46 | ignore.missingModel = false 47 | exp.const = -1.0 48 | debug.on = true 49 | save.output = true 50 | } -------------------------------------------------------------------------------- /resource/jar_dependency.txt: -------------------------------------------------------------------------------- 1 | Dependent jars 2 | ============== 3 | beymani depends on the following jar libraries. Most of them are third party except for 4 | chombo. For these two you could either checkout the jars and place them in your 5 | local maven repo or you could build them. 
6 | 7 | jackson-core-lgpl-1.6.3.jar 8 | jackson-mapper-lgpl-1.6.3.jar 9 | chombo-1.0.jar 10 | commons-lang-3.1.jar 11 | jedis-2.2.1.jar 12 | 13 | 14 | Building dependent jars 15 | ======================= 16 | Follow these steps if you have decided to build the jars for chombo and hoidla 17 | 18 | Checkout project chombo and run 19 | mvn clean install 20 | 21 | 22 | Handling dependency 23 | =================== 24 | There are many ways to handle dependency in Hadoop 25 | 26 | 1. Use libjar command line options as below 27 | hadoop jar xyz.jar com.example.MyMapreduce -libjars path1/lib1.jar,path2/lib2.jar 28 | 29 | 2. Use maven shade plugin to package all jars into one uber jar. The following needs to 30 | be added to the build element in pom.xml 31 | 32 | ....... 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-shade-plugin 37 | 38 | 39 | package 40 | 41 | shade 42 | 43 | 44 | 45 | 46 | uber-${artifactId}-${version} 47 | 48 | 49 | 50 | ....... 51 | 52 | 53 | 3. Use ant to package all dependent jars. You could use ../resource/build_hadoop.xml as an example 54 | 55 | 4. Copy all jars to hadoop lib directory in all nodes 56 | -------------------------------------------------------------------------------- /resource/unsup_model_drift_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for unsupervised concept drift detection of deployed supervised machine learning 2 | models with nearest neighbor count algorithm. We will use ecommerce customer churn data 3 | 4 | 5 | Setup 6 | ===== 7 | Make sure you have python/lib, python/mlextra and python/supv directories of avenir project with all the 8 | python files wrt where codrift.py is as a peer directory i.e at ../lib , ../mlextra and ../supv 9 | 10 | Generate data for no drift case 11 | =============================== 12 | - generate refrence churn data 13 | ./codrift.py genrc > ch.txt 14 | where 15 | bsamp = num of samples e.g 1000 16 | noise_level = noise level in data e.g 0.05 17 | 18 | - set class label to 1 19 | ./codrift.py recl ch.txt 1 > chref.txt 20 | 21 | - generate current churn data 22 | ./codrift.py genrc chref.txt > ch.txt 23 | 24 | - set class label to 0 25 | ./codrift.py recl ch.txt 0 > chnew.txt 26 | 27 | - concatenate files 28 | cat chref.txt > chndr.txt 29 | cat chnew.txt >> chndr.txt 30 | 31 | No drift case 32 | ============= 33 | - ensure following settings in knn_udr.properties 34 | train.data.file=chndr.txt 35 | predict.data.file=chndr.txt 36 | 37 | - run 38 | ./codrift.py udrift knn_udr.properties 39 | 40 | Generate data for drift case 41 | ============================ 42 | - generate distribution shifted new data for second half 43 | ./codrift.py dish chnew.txt > chnewd.txt 44 | 45 | - concatenate files 46 | cat chref.txt > chdr.txt 47 | cat chnewd.txt >> chdr.txt 48 | 49 | Drift case 50 | ========== 51 | - ensure following settings in knn_udr.properties 52 | train.data.file=chdr.txt 53 | predict.data.file=chdr.txt 54 | 55 | - run 56 | ./codrift.py udrift knn_udr.properties 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /resource/epid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash.local:7077 7 | 8 | case "$1" in 9 | 10 | 
"cpQuaLocData") 11 | echo "args: data_file " 12 | cp $2 $PROJECT_HOME/bin/beymani/other/epid/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/other/epid/$3/ 14 | ;; 15 | 16 | 17 | "cpLocData") 18 | echo "args: test_data_file " 19 | cp $2 $PROJECT_HOME/bin/beymani/input/epid/$3/ 20 | ls -l $PROJECT_HOME/bin/beymani/input/epid/$3/ 21 | ;; 22 | 23 | "olPredOu") 24 | echo "running OutRangeBasedPredictor Spark job" 25 | CLASS_NAME=org.beymani.spark.misc.OutRangeBasedPredictor 26 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/outr/* 27 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/outr 28 | rm -rf ./output/epid/outr 29 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 30 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf 31 | echo "number of outliers" 32 | wc -l ./output/epid/outr/part-00000 33 | wc -l ./output/epid/outr/part-00001 34 | ;; 35 | 36 | "olPredIn") 37 | echo "running InRangeBasedPredictor Spark job" 38 | CLASS_NAME=org.beymani.spark.misc.InRangeBasedPredictor 39 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/inr/* 40 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/inr 41 | rm -rf ./output/epid/inr 42 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 43 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf 44 | echo "number of outliers" 45 | wc -l ./output/epid/inr/part-00000 46 | wc -l ./output/epid/inr/part-00001 47 | ;; 48 | 49 | *) 50 | echo "unknown operation $1" 51 | ;; 52 | 53 | esac -------------------------------------------------------------------------------- /resource/sup_model_drift_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is forconcept drift detection of supervised machine learning models with EDDM algorithm 2 | 3 | 4 | Setup 5 | ===== 6 | Make sure you have python/lib directory of avenir project with all the python files wrt 7 | where codrift.py is as a peer directory i.e at ../lib Copy sucodr.py from beymani/python/lob 8 | directory to your lib directory 9 | 10 | Generate Data 11 | ============= 12 | - Generate refrence model prediction data 13 | ./codrift.py agen > er1.txt 14 | where 15 | nsamp = num of samples e.g. 2000 16 | er_rate = error rate e.g 0.1 17 | 18 | - Generate model prediction data with drift present 19 | ./codrift.py agen > er2.txt 20 | where 21 | trans = transition point for drift e.g 0.4 which means drift will appear after 22 | the first 40% of the data 23 | dr_er_rate = increased error rate after drift e.g 0.2 24 | 25 | Create reference statistics 26 | =========================== 27 | Make sure you have directory called model under the working directory 28 | 29 | Run 30 | ./codrift.py eddm er1.txt true 31 | where 32 | bootstrap_size = no of samples to be used boot strapping and creating referenece statistic e.g 600 33 | it will detect drift for the remaining samples. In our case it won't because er1.txt does not contain 34 | any error data with drift 35 | 36 | Detect drift 37 | ============ 38 | ./codrift.py eddm er2.txt 39 | In our case drift will be detected, because about half way through the error data, the error rate dobles 40 | to simulate drift. For real prouction data, you may or may not find drift 41 | 42 | Ensemble and hierarchy of drift detectors 43 | ========================================= 44 | There are aggregate functions in sucodr.py, that can be used to implement ensemble of detector e.g. 
45 | LFR 46 | 47 | -------------------------------------------------------------------------------- /resource/monitoring_order_processing_system_with_isolation_forest.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for monitring an order processing system with isolation forest based anomaly detection. 2 | It uses log records generated by the order processing business workflow system. 3 | 4 | Environment 5 | =========== 6 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 7 | environment by editing ecomm.sh 8 | 9 | Build 10 | ===== 11 | Follow instructions in spark_dependency.txt 12 | 13 | Python dependency 14 | ================= 15 | Before you run python scripts for data generation please do the following 16 | 1. checkout project avenir 17 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of ecomm.py file 18 | 19 | You could run ecomm.py from the python/app directory of beymani where it resides or copy it 20 | some where else 21 | 22 | Generate order processing data 23 | ============================== 24 | ./ecomm.py ordProcessRecs > orpr.txt 25 | where 26 | num_orders = num of orders e.g 200 27 | 28 | Insert outliers 29 | =============== 30 | ./ecomm.py olOrdPr orpr.txt > rorpr.txt 31 | where 32 | ol_percent = outlier percentage e.g 10 33 | 34 | Run anomaly detector Spark job 35 | ============================== 36 | Set score.threshold in ecomm.conf to some reasoable value e.g 0.5 37 | 38 | Run Spark job 39 | ./ecomm.sh orpOlPred 40 | 41 | Get upper tail statistics of outlier scores 42 | =========================================== 43 | ./olss.py sttest ./output/ecom/orp 0 hist 44 | 45 | Run anomaly detector Spark job with new threshold value 46 | ======================================================= 47 | Choose your threshold based on some confidence limit e.g 0.9 from the output of the lasr step Use that 48 | value to set score.threshold in ecomm.conf 49 | 50 | Run Spark job again 51 | ./ecomm.sh orpOlPred 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /resource/ticket.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0,1] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["hourOfDay"] 9 | time.fieldOrdinal = 2 10 | time.inMili = false 11 | min.sampleCount = 100 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | numericalAttrMedian { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0] 21 | attr.ordinals = [4] 22 | seasonal.analysis = false 23 | operation.type = "mad" 24 | hdfs.file = false 25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt" 26 | seasonal.cycleType = ["hourOfDay"] 27 | time.fieldOrdinal = 1 28 | time.inMili = false 29 | output.precision = 6 30 | min.samplecount = 100 31 | debug.on = true 32 | save.output = true 33 | } 34 | 35 | statsBasedOutlierPredictor { 36 | field.delim.in = "," 37 | field.delim.out = "," 38 | predictor.strategy = "robustZscore" 39 | id.fieldOrdinals = [0] 40 | attr.ordinals = [4] 41 | score.threshold = 0.7 42 | exp.const = -1.0 43 | outlier.polarity = "all" 44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/stats.txt" 45 | mean.fldOrd = 4 46 | hdfs.file = false 47 
| attr.weights = [1] 48 | attr.weightStrategy = "weightedAverage" 49 | robustZscore { 50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt" 51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/mad.txt" 52 | } 53 | seasonal.analysis = false 54 | seasonal.cycleType = ["hourOfDay"] 55 | time.fieldOrdinal = 1 56 | time.inMili = false 57 | output.precision = 3 58 | output.outliers = false 59 | rem.outliers = false 60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 61 | debug.on = true 62 | save.output = true 63 | } 64 | -------------------------------------------------------------------------------- /resource/cycle_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for cycle detection in time series data using auto correlation. A set of 2 | candidate lags are provided. The lag with the highest correlation corresponds to a cycle. 3 | 4 | 5 | Environment 6 | =========== 7 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 8 | environment 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file 20 | 21 | Create input data 22 | ================= 23 | ./and_spark.sh crInput 24 | 25 | where 26 | num_of_days = number of days e.g 15 27 | reading_intervaL = reading interval in sec e.g. 300 28 | num_servers = number of servers e.g. 4 29 | output_file = output file, we will use c.txt from now on 30 | 31 | Copy output to input path for NumericalAttrStats and TemporalAggregator spark jobs 32 | 33 | Run Spark job for stats 34 | ======================= 35 | ./cyd.sh numStat 36 | 37 | Copy and consolidate stats file 38 | =============================== 39 | ./and_spark.sh crStatsFile 40 | 41 | Aggregate to hourly 42 | =================== 43 | If the sampling interval is in minutes or sec aggregate to hourly average 44 | ./cyd.sh tempAggr 45 | 46 | Copy and consolidate aggregate output 47 | ===================================== 48 | ./cyd.sh crAucInput 49 | 50 | Run Spark job for auto correlation 51 | ================================== 52 | ./cyd.sh autoCor 53 | 54 | Configuration 55 | ============= 56 | Configuration is in cyd.conf. Make changes as necessary 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /resource/proximity_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial provides details of finding outliers based on average distance to neighbors. 2 | It uses two MR jobs, SameTypeSimilarity and AverageDistance. If you want to use credit card 3 | transactions as input, you could use cct.rb to generate data. Make sure that utol.rb is in the path 4 | ../lib. util.rb can be checked out from my project visitante. It's under script/ruby/lib directory 5 | in that project. 
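
Before going through the scripts below, the idea behind AverageDistance can be summarized in a few lines of Python: for every record compute the average distance to its k nearest neighbors, and treat records with unusually large averages as outlier candidates. This is only a sketch of the concept on made up data, not the map reduce implementation used in this tutorial.

import math
import random

def avg_knn_distance(records, k):
    scores = []
    for i, r in enumerate(records):
        dists = sorted(math.dist(r, o) for j, o in enumerate(records) if j != i)
        # average distance to the k nearest neighbors
        scores.append(sum(dists[:k]) / k)
    return scores

random.seed(1)
# a dense cluster of normal records plus one record far away from everything else
data = [(random.gauss(100, 10), random.gauss(50, 5)) for _ in range(200)]
data.append((400.0, 400.0))

scores = avg_knn_distance(data, k=10)
worst = max(range(len(data)), key=lambda i: scores[i])
print("most outlying record:", worst, "average distance to neighbors:", round(scores[worst], 1))
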
6 | 7 | Transaction Simarity 8 | ==================== 9 | Herte is the script for SameTypeSimilarity 10 | 11 | JAR_NAME=/home/pranab/Projects/sifarish/target/sifarish-1.0.jar 12 | CLASS_NAME=org.sifarish.feature.SameTypeSimilarity 13 | 14 | echo "running mr" 15 | IN_PATH=/user/pranab/cct/input 16 | OUT_PATH=/user/pranab/cct/simi 17 | echo "input $IN_PATH output $OUT_PATH" 18 | hadoop fs -rmr $OUT_PATH 19 | echo "removed output dir" 20 | 21 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 22 | 23 | Average Ditsance to Neighbors 24 | ============================= 25 | Here is a sample script for AverageDistance 26 | 27 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 28 | CLASS_NAME=org.beymani.proximity.AverageDistance 29 | 30 | echo "running mr" 31 | IN_PATH=/user/pranab/cct/simi 32 | OUT_PATH=/user/pranab/cct/avdi 33 | echo "input $IN_PATH output $OUT_PATH" 34 | hadoop fs -rmr $OUT_PATH 35 | echo "removed output dir" 36 | 37 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 38 | 39 | Configuration 40 | ============= 41 | Here is a sample cct.properties 42 | 43 | field.delim.regex=, 44 | field.delim=, 45 | num.reducer=1 46 | sts.bucket.count=1000 47 | sts.same.schema.file.path=/pranab/meta/prod/prod.json 48 | avd.top.match.count=10 49 | avd.top.match.average=true 50 | avd.top.match.density=false 51 | avd.top.match.grouping=false 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /resource/autoencoder_based_cust_svc_case_anomaly_detection.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anaomaly detection for service time for an issue processing system data using 2 | auto encoder. 3 | 4 | 5 | Environment 6 | =========== 7 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 8 | environment 9 | 10 | 11 | Python dependency 12 | ================= 13 | The shell script commands for data generation run python scripts for data generation. Before you run 14 | the data generation commands do the following 15 | 1. checkout project avenir 16 | 2. copy the directories avenir/python/lib avenir/python/mlextra and avenir/python/unsup directory to ../lib 17 | ../mlextra and ../unsup with respect to your location of cpu_usage.py file 18 | 19 | 20 | Create normal data for modeling 21 | =============================== 22 | ./ticket.py genx > cus_tr.txt 23 | 24 | where 25 | num_issues = number of issues e.g 2000 26 | 27 | 28 | Create test data 29 | ================ 30 | ./ticket.py genx > cus.txt 31 | where 32 | num_issues = number of issues e.g 200 33 | 34 | insert outliers 35 | /ticket.py iolx cus.txt > cus_te.txt 36 | 37 | where 38 | > v.txt 24 | 25 | where 26 | num_secs = num of secs in past for which vibration data data is generated e.g 7 27 | 28 | -Split into reference and prediction data 29 | split -l10000 v.txt 30 | mv xaa vib_ref.txt 31 | 32 | -Insert outliers in prediction or test data data 33 | ./bvib.py iol xab > vib_pred.txt 34 | failure_onset_time = time from beginning of test data where outlier in inserted. 
Outlier is 35 | in the form of 2 high frequecy componenets 36 | 37 | -You could plot the data around where outliers were introduced as follows 38 | ./bvib.py iplot vib_pred.txt K87JG9F6 900 1100 39 | 40 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data 41 | 42 | -Copy reference and prediction data 43 | cp vib_ref.txt ./other/vib/ 44 | cp vib_pred.txt ./input/vib/ 45 | 46 | 47 | Run Spark Job 48 | ============= 49 | Run 50 | ./vib.sh olPred 51 | 52 | Plot outlier data 53 | ================= 54 | ./bvib.py oplot ./output/vib/part-00000 K87JG9F6 900 1100 55 | 56 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SeequenceScoreAggregator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * Manages outlier scores for data points in a sequence. 
A data point may belong to 25 | * multiple sequences and hence may have have multiple outlier scores 26 | * @author pranab 27 | * 28 | */ 29 | public class SeequenceScoreAggregator implements java.io.Serializable { 30 | private static final long serialVersionUID = 2181114339589177954L; 31 | private List scores = new ArrayList(); 32 | private int windowSize; 33 | 34 | 35 | /** 36 | * @param windowSize 37 | */ 38 | public SeequenceScoreAggregator(int windowSize) { 39 | super(); 40 | this.windowSize = windowSize; 41 | } 42 | 43 | 44 | /** 45 | * @param seq 46 | * @param score 47 | */ 48 | public void add(double score ) { 49 | scores.add(score); 50 | if (scores.size() > windowSize) { 51 | //set score to max of current and new score 52 | for (int i = scores.size() - windowSize; i < scores.size(); ++i) { 53 | double thisSeqScore = scores.get(i); 54 | if (thisSeqScore < score) { 55 | scores.set(i, score); 56 | } 57 | } 58 | } 59 | } 60 | 61 | /** 62 | * @return 63 | */ 64 | public List getScores() { 65 | return scores; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /resource/salean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | *) 11 | echo "unknown operation $1" 12 | ;; 13 | 14 | 15 | "numStat") 16 | echo "running NumericalAttrStats Spark job" 17 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 18 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 19 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/stat 20 | rm -rf ./output/stat 21 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 22 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf 23 | ;; 24 | 25 | "numMstat") 26 | echo "running NumericalAttrMedian Spark job" 27 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 28 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 29 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/mstat 30 | rm -rf ./output/san/mstat 31 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 32 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf 33 | ;; 34 | 35 | "cpMed") 36 | echo "copying median files" 37 | MED_FILES=$PROJECT_HOME/bin/beymani/output/san/mstat/* 38 | META_DIR=$PROJECT_HOME/bin/beymani/meta/san 39 | cp /dev/null $META_DIR/$2 40 | for f in $MED_FILES 41 | do 42 | echo "Copying file $f ..." 
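# append this Spark output part file to the consolidated median file in the meta/san directory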
43 | cat $f >> $META_DIR/$2 44 | done 45 | ls -l $META_DIR 46 | ;; 47 | 48 | "olPred") 49 | echo "running StatsBasedOutlierPredictor Spark job" 50 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 51 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 52 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/olp 53 | rm -rf ./output/san/olp 54 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 55 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT salean.conf 56 | echo "number of outliers" 57 | wc -l ./output/olp/part-00000 58 | wc -l ./output/olp/part-00001 59 | ;; 60 | 61 | esac -------------------------------------------------------------------------------- /resource/issue_service_time_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection of service time in issue processing system data using 2 | statistical modeling. To be more specific we will be using a z score based technique 3 | 4 | 5 | Environment 6 | =========== 7 | Paths etc shown here correspond to my environment. Please change them as needed for your 8 | environment 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 20 | 21 | 22 | Create normal data for modeling 23 | =============================== 24 | ./ticket.py gen <num_issues> > tick_tr.txt 25 | 26 | where 27 | num_issues = number of issues e.g 2000 28 | 29 | Copy modeling data 30 | ./ticket.sh loadInp tick_tr.txt train 31 | 32 | Create test data 33 | ================ 34 | ./ticket.py gen <num_issues> > tick.txt 35 | where 36 | num_issues = number of issues e.g 200 37 | 38 | insert outliers 39 | ./ticket.py iol tick.txt > tick_pred.txt 40 | 41 | where 42 | > cps.txt 22 | 23 | num_days = num of days in past for which sales data is generated 24 | 25 | Generate distribution for CVM two sample statistic 26 | =================================================== 27 | We use Monte Carlo simulation to generate the distribution. When run, it will output the 28 | upper tail statistic to the console. Save the output somewhere. You will need it to configure the Spark job 29 | 30 | Checkout the project avenir. In the python/app directory run the following 31 | 32 | ./tsstat.py cvm <num_iter> <num_samp> 33 | num_iter = num of iterations for the simulator e.g 2000 34 | num_samp = num of samples for generated samples, which should be half the window size (the parameter 35 | window.size in cpsale.conf). I have set this parameter to 200. So num_samp should be 100 36 | 37 | You could skip this step if you use the value already set for the parameter stat.critValue 38 | 39 | Copy input to Spark directory 40 | ============================= 41 | ./cpsale.sh cpInp cps.txt 42 | 43 | Run Spark Job 44 | ============= 45 | Choose an upper critical value for a confidence interval anywhere between .95 and .99 from the 46 | output of the MC simulator we ran earlier.
Set the parameter stat.critValue in cpsale.conf 47 | 48 | Run 49 | ./cpsale.sh cpPred 50 | 51 | Plot sales data and change points 52 | ================================= 53 | ./cpsale.py plot cps.txt DK75HUI45X ./output/cpsale/part-00000 54 | 55 | DK75HUI45X is the ID of the product that change point in sales data 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/PredictorSpout.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | 19 | package org.beymani.predictor; 20 | 21 | import java.util.Map; 22 | 23 | import org.chombo.storm.MessageQueue; 24 | 25 | import backtype.storm.spout.SpoutOutputCollector; 26 | import backtype.storm.task.TopologyContext; 27 | import backtype.storm.topology.OutputFieldsDeclarer; 28 | import backtype.storm.topology.base.BaseRichSpout; 29 | import backtype.storm.tuple.Fields; 30 | import backtype.storm.tuple.Values; 31 | 32 | /** 33 | * @author pranab 34 | * 35 | */ 36 | public class PredictorSpout extends BaseRichSpout { 37 | private SpoutOutputCollector collector; 38 | private Map conf; 39 | private String messageQueue; 40 | private MessageQueue msgQueue; 41 | private static final String NIL = "nil"; 42 | 43 | @Override 44 | public void open(Map conf, TopologyContext context, 45 | SpoutOutputCollector collector) { 46 | this.collector = collector; 47 | this.conf = conf; 48 | messageQueue = conf.get("redis.input.queue").toString(); 49 | msgQueue = MessageQueue.createMessageQueue(conf, messageQueue); 50 | } 51 | 52 | @Override 53 | public void nextTuple() { 54 | String message = msgQueue.receive(); 55 | if(null != message && !message.equals(NIL)) { 56 | int pos = message.indexOf(","); 57 | String entityID = message.substring(0, pos); 58 | String recordData = message.substring(pos+1); 59 | collector.emit(new Values(entityID, recordData)); 60 | } 61 | 62 | } 63 | 64 | @Override 65 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 66 | declarer.declare(new Fields("entityID", "recordData")); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/DataStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.io.Serializable; 21 | import java.util.List; 22 | 23 | import org.codehaus.jackson.annotate.JsonIgnoreProperties; 24 | 25 | /** 26 | * @author pranab 27 | * 28 | */ 29 | @JsonIgnoreProperties(ignoreUnknown = true) 30 | public class DataStream implements Serializable{ 31 | private String id; 32 | private String type; 33 | private String parentId; 34 | private String parentType; 35 | private List childrenId; 36 | private boolean singleton; 37 | 38 | /** 39 | * 40 | */ 41 | public DataStream() { 42 | } 43 | 44 | /** 45 | * @return 46 | */ 47 | public String getId() { 48 | return id; 49 | } 50 | 51 | public void setId(String id) { 52 | this.id = id; 53 | } 54 | 55 | public String getType() { 56 | return type; 57 | } 58 | 59 | public void setType(String type) { 60 | this.type = type; 61 | } 62 | 63 | public String getParentId() { 64 | return parentId; 65 | } 66 | 67 | public void setParentId(String parentId) { 68 | this.parentId = parentId; 69 | } 70 | 71 | public String getParentType() { 72 | return parentType; 73 | } 74 | 75 | public void setParentType(String parentType) { 76 | this.parentType = parentType; 77 | } 78 | 79 | public List getChildrenId() { 80 | return childrenId; 81 | } 82 | 83 | public void setChildrenId(List childrenId) { 84 | this.childrenId = childrenId; 85 | } 86 | 87 | public boolean isSingleton() { 88 | return singleton; 89 | } 90 | 91 | public void setSingleton(boolean singleton) { 92 | this.singleton = singleton; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /resource/cyd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "numStat") 11 | echo "running NumericalAttrStats Spark job" 12 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 13 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 14 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/mea 15 | rm -rf ./output/mea 16 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 17 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 18 | ;; 19 | 20 | "crStatsFile") 21 | echo "copying and consolidating stats file" 22 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00000 > $PROJECT_HOME/bin/beymani/other/auc/stats.txt 23 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00001 >> $PROJECT_HOME/bin/beymani/other/auc/stats.txt 24 | ls -l $PROJECT_HOME/bin/beymani/other/auc 25 | ;; 26 | 27 | "tempAggr") 28 | echo "running TemporalAggregator Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg 32 | rm -rf ./output/teg 33 | $SPARK_HOME/bin/spark-submit 
--class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 35 | ;; 36 | 37 | "crAucInput") 38 | echo "copying and consolidating temporal aggregation output file" 39 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00000 > $PROJECT_HOME/bin/beymani/input/auc/cusage.txt 40 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00001 >> $PROJECT_HOME/bin/beymani/input/auc/cusage.txt 41 | ls -l $PROJECT_HOME/bin/beymani/input/auc 42 | ;; 43 | 44 | "autoCor") 45 | echo "running AutoCorrelation Spark job" 46 | CLASS_NAME=org.chombo.spark.explore.AutoCorrelation 47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/auc/cusage.txt 48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/auc 49 | rm -rf ./output/auc 50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 52 | ;; 53 | 54 | *) 55 | echo "unknown operation $1" 56 | ;; 57 | 58 | esac -------------------------------------------------------------------------------- /resource/health_monitoring_data_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection in health monitoring data. Sequence anomaly is detected 2 | with a Markov chain model. 3 | 4 | Environment 5 | =========== 6 | Paths etc shown here correspond to my environment. Please change them as needed for your 7 | environment. The script bsm.sh is for running spark jobs and various other tasks. The configuration 8 | is in bsm.conf 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 20 | 21 | Create device reading mean and std dev 22 | ====================================== 23 | ./bls.py stat <num_dev> > dstat.txt 24 | 25 | num_dev = number of devices e.g 200 26 | 27 | Create training data 28 | ==================== 29 | ./bls.py gen dstat.txt normal <num_days> > <train_data_file> 30 | where 31 | num_days = num of days for which data should be generated (e.g 300) 32 | train_data_file = training data file 33 | 34 | Copy to the spark input directory.
35 | cp <train_data_file> ./input/bsm/train 36 | 37 | Copy meta data file 38 | ==================== 39 | cp bsm.json ./meta 40 | 41 | Discretize training data 42 | ======================== 43 | Run discretization spark job 44 | ./bsm.sh transformTrain 45 | 46 | Discretization step is set to 5 in bsm.conf 47 | 48 | Build model 49 | =========== 50 | Run Spark job 51 | ./bsm.sh stateTrans 52 | 53 | Consolidate model files 54 | ======================= 55 | Copy all Spark generated files into one 56 | ./bsm.sh cpModel 57 | 58 | Create test data 59 | ================ 60 | Create test data with outliers 61 | ./bls.py gen dstat.txt anomaly <num_days> > <test_data_file> 62 | num_days = num of days for which data should be generated (e.g 30) 63 | test_data_file = test data file name 64 | 65 | Copy file 66 | cp <test_data_file> ./input/bsm/pred 67 | 68 | Discretize test data 69 | ==================== 70 | Run discretization spark job 71 | ./bsm.sh transformPred 72 | 73 | Anomaly prediction Spark job 74 | ============================ 75 | ./bsm.sh olPredict 76 | 77 | -------------------------------------------------------------------------------- /python/app/wsbot.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License.
17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import statistics 23 | import matplotlib.pyplot as plt 24 | sys.path.append(os.path.abspath("../lib")) 25 | sys.path.append(os.path.abspath("../mlextra")) 26 | from util import * 27 | from sampler import * 28 | 29 | """ 30 | data generation for web session 31 | """ 32 | if __name__ == "__main__": 33 | op = sys.argv[1] 34 | if op == "gen": 35 | numSamp = int(sys.argv[2]) 36 | if len(sys.argv) == 4: 37 | percenNormal = int(sys.argv[3]) 38 | else: 39 | percenNormal = -1 40 | 41 | hrOfDay = [NormalSampler(14,3), UniformNumericSampler(0,23)] 42 | numPage = [NormalSampler(12,2.5), NormalSampler(50,5)] 43 | pageDurAv = [NormalSampler(60, 15), NormalSampler(1,.1)] 44 | prRevFrac = [NormalSampler(.5,.1), NormalSampler(.9,.05)] 45 | shopCart = [BernoulliTrialSampler(.6), BernoulliTrialSampler(.2)] 46 | checkout = [BernoulliTrialSampler(.4), BernoulliTrialSampler(0)] 47 | logOut = [BernoulliTrialSampler(.8), BernoulliTrialSampler(.95)] 48 | 49 | idLists = [genIdList(100, 12), genIdList(80, 12)] 50 | 51 | for _ in range(numSamp): 52 | if percenNormal > 0: 53 | if isEventSampled(percenNormal): 54 | di = 0 55 | else: 56 | di = 1 57 | else: 58 | di = 0 59 | uid = selectRandomFromList(idLists[di]) 60 | hd = int(hrOfDay[di].sample()) 61 | nup = int(numPage[di].sample()) 62 | pdu = pageDurAv[di].sample() 63 | prev = prRevFrac[di].sample() 64 | sc = toIntFromBoolean(shopCart[di].sample()) 65 | co = toIntFromBoolean(checkout[di].sample()) 66 | if di == 1: 67 | co = 0 68 | lo = toIntFromBoolean(logOut[di].sample()) 69 | 70 | print("{},{},{},{:.3f},{:.3f},{},{},{}".format(uid,hd,nup,pdu,prev,sc,co,lo)) 71 | 72 | -------------------------------------------------------------------------------- /resource/and.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 9 | time.fieldOrdinal = 1 10 | time.inMili = false 11 | min.sampleCount = 10 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | 18 | statsBasedOutlierPredictor { 19 | field.delim.in = "," 20 | field.delim.out = "," 21 | predictor.strategy = "zscore" 22 | id.fieldOrdinals = [0] 23 | attr.ordinals = [3] 24 | score.threshold = 3.30 25 | score.thresholdNorm = 0.90 26 | exp.const = -1.0 27 | outlier.polarity = "high" 28 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt" 29 | mean.fldOrd = 4 30 | hdfs.file = false 31 | attr.weights = [1.0] 32 | attr.weightStrategy = "weightedAverage" 33 | zscore { 34 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt" 35 | } 36 | seasonal.analysis = true 37 | part.bySeasonCycle = true 38 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 39 | time.fieldOrdinal = 1 40 | time.inMili = false 41 | output.precision = 3 42 | output.outliers = false 43 | rem.outliers = false 44 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 45 | debug.on = true 46 | save.output = true 47 | } 48 | 49 | thresholdLearner { 50 | field.delim.in = "," 51 | field.delim.out = "," 52 | score.fldOrd = 4 53 | cls.fldOrd = 7 54 | split.points = [0.925, 0.930, 0.935, 0.940, 0.945, 0.950, 0.955, 0.960, 0.965, 0.970, 0.975] 55 | pos.clsLabel = "T" 56 | splitting.algo = "entropy" 57 | debug.on = true 
58 | save.output = true 59 | } 60 | 61 | temporalAggregator { 62 | field.delim.in = "," 63 | field.delim.out = "," 64 | attr.ordinals = [2] 65 | id.fieldOrdinals = [0] 66 | time.fieldOrdinal = 1 67 | time.inMili = false 68 | aggr.windowTimeUnit = "hour" 69 | aggr.windowTimeLength = 1 70 | aggr.type = "average" 71 | output.compact = true 72 | output.precision = 3 73 | debug.on = true 74 | save.output = true 75 | } 76 | 77 | autoCorrelation { 78 | field.delim.in = "," 79 | field.delim.out = "," 80 | attr.ordinals = [2] 81 | id.fieldOrdinals = [0] 82 | seq.fieldOrdinal = 1 83 | output.precision = 3 84 | coor.lags = [24, 168] 85 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt" 86 | mean.fieldOrd = 5 87 | debug.on = true 88 | save.output = true 89 | } 90 | 91 | 92 | -------------------------------------------------------------------------------- /resource/salean.conf: -------------------------------------------------------------------------------- 1 | timeIntervalGenerator { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | time.fieldOrdinal = 1 6 | time.keepField = true 7 | debug.on = true 8 | save.output = true 9 | } 10 | 11 | numericalAttrStats { 12 | field.delim.in = "," 13 | field.delim.out = "," 14 | id.fieldOrdinals = [0] 15 | attr.ordinals = [2,3] 16 | seasonal.analysis = true 17 | part.bySeasonCycle = true 18 | seasonal.cycleType = ["nightDayHourOfDay"] 19 | time.fieldOrdinal = 1 20 | time.inMili = false 21 | min.sampleCount = 200 22 | output.precision = 3 23 | debug.on = true 24 | save.output = true 25 | } 26 | 27 | numericalAttrMedian { 28 | field.delim.in = "," 29 | field.delim.out = "," 30 | id.fieldOrdinals = [0] 31 | attr.ordinals = [2,3] 32 | seasonal.analysis = true 33 | operation.type = "med" 34 | med.file.path = "" 35 | hdfs.file = false 36 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt" 37 | seasonal.cycleType = ["nightDayHourOfDay"] 38 | time.fieldOrdinal = 1 39 | time.inMili = false 40 | output.precision = 6 41 | min.samplecount = 200 42 | debug.on = true 43 | save.output = true 44 | } 45 | 46 | filter { 47 | field.delim.in = "," 48 | field.delim.out = "," 49 | id.fieldOrdinals = [0] 50 | selection.filter = "" 51 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt" 52 | schema.file.path = "/Users/pranab/Projects/bin/beymani/meta/sales.conf" 53 | debug.on = true 54 | save.output = true 55 | } 56 | 57 | statsBasedOutlierPredictor { 58 | field.delim.in = "," 59 | field.delim.out = "," 60 | predictor.strategy = "robustZscore" 61 | id.fieldOrdinals = [0] 62 | attr.ordinals = [2,3] 63 | score.threshold = 0.95 64 | score.thresholdNorm = 0.90 65 | outlier.polarity = "all" 66 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt" 67 | mean.fldOrd = 4 68 | hdfs.file = false 69 | attr.weights = [0.4, 0.6] 70 | attr.weightStrategy = "weightedAverage" 71 | robustZscore { 72 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt" 73 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/mad.txt" 74 | } 75 | seasonal.analysis = true 76 | seasonal.cycleType = ["nightDayHourOfDay"] 77 | time.fieldOrdinal = 1 78 | time.inMili = false 79 | output.precision = 3 80 | output.outliers = false 81 | rem.outliers = false 82 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 83 | debug.on = true 84 | save.output = true 85 | } 86 | 
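A note on the robustZscore strategy configured in salean.conf above: each attribute value is scored by its distance from the per key (and per seasonal cycle) median, measured in MAD units, using the med.txt and mad.txt files the config points to. The Python sketch below only illustrates that calculation; the 1.4826 consistency constant, the example numbers and the function name are my assumptions and not taken from the Spark implementation, which additionally normalizes the score before comparing it with score.threshold.

# minimal robust z-score sketch (assumed formula, for illustration only)
def robust_zscore(value, median, mad, eps=1e-9):
    # 1.4826 * MAD approximates the standard deviation for normally distributed data
    return abs(value - median) / (1.4826 * mad + eps)

# hypothetical seasonal median and MAD for one (id, hour of day) key
print(round(robust_zscore(152.0, 110.0, 8.0), 3))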
-------------------------------------------------------------------------------- /resource/ticket.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "loadInp") 11 | rm $PROJECT_HOME/bin/beymani/input/ticket/$3/* 12 | cp $2 $PROJECT_HOME/bin/beymani/input/ticket/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/ticket/$3/ 14 | ;; 15 | 16 | 17 | "numStat") 18 | echo "running NumericalAttrStats Spark job" 19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/* 21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/stat 22 | rm -rf ./output/ticket/stat 23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf 25 | ;; 26 | 27 | "numMstat") 28 | echo "running NumericalAttrMedian Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/* 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/mstat 32 | rm -rf ./output/ticket/mstat 33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf 35 | rm ./output/ticket/mstat/_SUCCESS 36 | ls -l ./output/ticket/mstat 37 | ;; 38 | 39 | "bkMod") 40 | echo "backing up model files" 41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ticket/mstat/* 42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ticket 43 | META_FILE=$META_DIR/$2 44 | echo "copying to $META_FILE" 45 | cp /dev/null $META_FILE 46 | for f in $MED_FILES 47 | do 48 | echo "Copying file $f ..." 
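# append this Spark output part file to the consolidated model file named by the second argument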
49 | cat $f >> $META_FILE 50 | done 51 | ls -l $META_FILE 52 | ;; 53 | 54 | "olPred") 55 | echo "running StatsBasedOutlierPredictor Spark job" 56 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 57 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/pred/* 58 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/olp 59 | rm -rf ./output/ticket/olp 60 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 61 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ticket.conf 62 | rm ./output/ticket/olp/_SUCCESS 63 | ls -l ./output/ticket/olp 64 | cat ./output/ecom/ticket/part-00000 | grep ,O 65 | ;; 66 | 67 | *) 68 | echo "unknown operation $1" 69 | ;; 70 | 71 | esac 72 | -------------------------------------------------------------------------------- /resource/ecomm.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0,1] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["hourOfDay"] 9 | time.fieldOrdinal = 2 10 | time.inMili = false 11 | min.sampleCount = 100 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | numericalAttrMedian { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0,1] 21 | attr.ordinals = [3] 22 | seasonal.analysis = false 23 | operation.type = "mad" 24 | hdfs.file = false 25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt" 26 | seasonal.cycleType = ["hourOfDay"] 27 | time.fieldOrdinal = 2 28 | time.inMili = false 29 | output.precision = 6 30 | min.samplecount = 100 31 | debug.on = true 32 | save.output = true 33 | } 34 | 35 | statsBasedOutlierPredictor { 36 | field.delim.in = "," 37 | field.delim.out = "," 38 | predictor.strategy = "robustZscore" 39 | id.fieldOrdinals = [0,1] 40 | attr.ordinals = [3] 41 | score.threshold = 0.7 42 | exp.const = 1.5 43 | outlier.polarity = "all" 44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/stats.txt" 45 | mean.fldOrd = 4 46 | hdfs.file = false 47 | attr.weights = [1] 48 | attr.weightStrategy = "weightedAverage" 49 | robustZscore { 50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt" 51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/mad.txt" 52 | } 53 | seasonal.analysis = false 54 | seasonal.cycleType = ["hourOfDay"] 55 | time.fieldOrdinal = 2 56 | time.inMili = false 57 | output.precision = 3 58 | output.outliers = false 59 | rem.outliers = false 60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 61 | debug.on = true 62 | save.output = true 63 | } 64 | 65 | outlierAggregator { 66 | field.delim.in = "," 67 | field.delim.out = "," 68 | type.field.ordinal = 0 69 | id.field.ordinal = 1 70 | seq.field.ordinal = 2 71 | quant.field.ordinal = 3 72 | stream.schmaFilePath = "/Users/pranab/Projects/bin/beymani/meta/ecom/ecommDataStream.json" 73 | output.precision = 3 74 | debug.on = true 75 | save.output = true 76 | } 77 | 78 | isolationForestModel { 79 | field.delim.in = "," 80 | field.delim.out = "," 81 | attr.ordinals = [1,3,4,5,7] 82 | score.threshold = .450 83 | num.tree = 64 84 | subsample.size = 256 85 | max.depth = 10 86 | rec.count = 1788 87 | output.precision = 3 88 | debug.on = true 89 | save.output = true 90 | } 
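The isolationForestModel block above (64 trees, subsample size 256, score threshold 0.450, 1788 records) can be approximated outside Spark for quick sanity checks with scikit-learn. The sketch below is an assumed equivalent and not the code the Spark job runs; the random matrix simply stands in for the five attribute columns listed in attr.ordinals.

# illustrative isolation forest with settings mirroring isolationForestModel (assumption)
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.rand(1788, 5)                       # stand-in for the real attribute columns
forest = IsolationForest(n_estimators=64, max_samples=256, random_state=42).fit(X)
scores = -forest.score_samples(X)                 # anomaly score in (0, 1], higher means more anomalous
outliers = scores > 0.45                          # analogous to score.threshold = .450
print(outliers.sum(), "records flagged")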
-------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EntropyIncreaseBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.util.Map; 21 | 22 | /** 23 | * Predict outlier based on increase of entropy resulting from including outlier point 24 | * @author pranab 25 | * 26 | */ 27 | public class EntropyIncreaseBasedPredictor extends DistributionBasedPredictor { 28 | private double entropy; 29 | private double baseConvConst = Math.log(2); 30 | private String subFieldDelim = ":"; 31 | 32 | public EntropyIncreaseBasedPredictor(Map conf) { 33 | super(conf); 34 | 35 | //entropy 36 | entropy = 0; 37 | for (String bucketKey : distrModel.keySet()) { 38 | double pr = ((double)distrModel.get(bucketKey)) / totalCount; 39 | entropy += -pr * Math.log(pr) / baseConvConst; 40 | } 41 | } 42 | 43 | @Override 44 | public double execute(String entityID, String record) { 45 | double score = 0; 46 | String thisBucketKey = getBucketKey(record); 47 | 48 | //new entropy 49 | double newEntropy = 0; 50 | int newTotalCount = totalCount + 1; 51 | boolean bucketFound = false; 52 | double pr = 0; 53 | for (String bucketKey : distrModel.keySet()) { 54 | if (bucketKey.equals(thisBucketKey)) { 55 | pr = ((double)distrModel.get(bucketKey) + 1) / newTotalCount; 56 | bucketFound = true; 57 | } else { 58 | pr = ((double)distrModel.get(bucketKey)) / newTotalCount; 59 | } 60 | newEntropy += -pr * Math.log(pr) / baseConvConst; 61 | } 62 | 63 | if (!bucketFound) { 64 | pr = 1.0 / newTotalCount; 65 | newEntropy += -pr * Math.log(pr) / baseConvConst; 66 | } 67 | 68 | if (newEntropy > entropy) { 69 | score = (newEntropy - entropy) / entropy; 70 | } 71 | 72 | if (score > scoreThreshold) { 73 | //write if above threshold 74 | outQueue.send(entityID + " " + score); 75 | } 76 | return score; 77 | } 78 | 79 | @Override 80 | public double execute(String[] items, String compKey) { 81 | //TODO 82 | double score = 0; 83 | 84 | return score; 85 | } 86 | 87 | @Override 88 | public boolean isValid(String compKey) { 89 | // TODO Auto-generated method stub 90 | return true; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /resource/bsm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar 7 | MASTER=spark://akash:7077 8 | 9 | case "$1" in 10 | 11 | "transformTrain") 12 | echo "running DataTransformer" 13 | CLASS_NAME=org.chombo.spark.etl.DataTransformer 14 | 
INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/train/* 15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans 16 | rm -rf ./output/bsm/train/trans 17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 18 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf 19 | rm -rf ./output/bsm/train/trans/_SUCCESS 20 | ;; 21 | 22 | "stateTrans") 23 | echo "running MarkovStateTransitionModel" 24 | CLASS_NAME=org.avenir.spark.sequence.MarkovStateTransitionModel 25 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans/* 26 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/sttr 27 | rm -rf ./output/bsm/train/sttr 28 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 29 | --conf spark.ui.killEnabled=true --master $MASTER $AVENIR_JAR_NAME $INPUT $OUTPUT bsm.conf 30 | rm -rf ./output/bsm/train/sttr/_SUCCESS 31 | ;; 32 | 33 | "transformPred") 34 | echo "running DataTransformer" 35 | CLASS_NAME=org.chombo.spark.etl.DataTransformer 36 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/pred/* 37 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans 38 | rm -rf ./output/bsm/trans 39 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 40 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf 41 | rm -rf ./output/bsm/pred/trans/_SUCCESS 42 | ;; 43 | 44 | "cpModel") 45 | echo "copying model files" 46 | MOD_FILES=$PROJECT_HOME/bin/beymani/output/bsm/train/sttr/* 47 | META_DIR=$PROJECT_HOME/bin/beymani/meta 48 | cp /dev/null $META_DIR/bsm_mod.txt 49 | for f in $MOD_FILES 50 | do 51 | echo "Copying file $f ..." 52 | cat $f >> $META_DIR/bsm_mod.txt 53 | done 54 | ;; 55 | 56 | "olPredict") 57 | echo "running MarkovChainPredictor" 58 | CLASS_NAME=org.beymani.spark.seq.MarkovChainPredictor 59 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans/* 60 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/oul 61 | rm -rf ./output/bsm/pred/oul 62 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 63 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT bsm.conf 64 | rm -rf ./output/bsm/pred/oul/_SUCCESS 65 | ls -l ./output/bsm/pred/oul 66 | for f in ./output/bsm/pred/oul/* 67 | do 68 | echo "number of outliers in $f" 69 | cat $f | grep ,O | wc -l 70 | done 71 | 72 | ;; 73 | 74 | *) 75 | echo "unknown operation $1" 76 | ;; 77 | 78 | esac -------------------------------------------------------------------------------- /resource/quarantine_violation_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for detecting quarantine violation based mobile location anomaly. Violation 2 | could be because of quarantined people miving out of quarantined location or non quarantined people 3 | visiting quarantined locations 4 | 5 | 6 | Environment 7 | =========== 8 | Make sure you have ../lib directory with all the python files wrt where mob_loc.py is. 9 | Please refer to resource/spark_dependency.txt for building the jar for Spark. 10 | All the configuration data generation python script are mob_loc.properties. 
Make sure all the 11 | directories for data as in epid.sh are created 12 | 13 | Generate data for out of range violation 14 | ======================================== 15 | Phone numbers and quarantine location 16 | python3 mob_loc.py genQuaLoc mob_loc.properties > qualist.txt 17 | 18 | quarantined people movement location data 19 | python3 mob_loc.py quaLoc mob_loc.properties > qualoc.txt 20 | 21 | insert outliers in movement location data (quarantined person moving out of quarantined location) 22 | python3 mob_loc.py quaLocOutlier mob_loc.properties > qualocou.txt 23 | 24 | Copy data 25 | ========= 26 | quarantine location 27 | ./epid.sh cpQuaLocData qualist.txt outr 28 | 29 | quarantined people movement location data 30 | ./epid.sh cpLocData qualoc.txt outr 31 | 32 | Spark job for out of range outlier 33 | ================================= 34 | ./epid.sh olPredOu 35 | 36 | Generate data for in range violation 37 | ======================================== 38 | all locations data 39 | python3 mob_loc.py genLoc mob_loc.properties > res_loc.txt 40 | python3 mob_loc.py genLoc mob_loc.properties > work_loc.txt 41 | python3 mob_loc.py genLoc mob_loc.properties > school_loc.txt 42 | python3 mob_loc.py genLoc mob_loc.properties > med_loc.txt 43 | python3 mob_loc.py genLoc mob_loc.properties > shop_loc.txt 44 | python3 mob_loc.py genLoc mob_loc.properties > ent_loc.txt 45 | python3 mob_loc.py genLoc mob_loc.properties > event_loc.txt 46 | python3 mob_loc.py genLoc mob_loc.properties > open_loc.txt 47 | 48 | Here are the region.num.locations and region.loc.size values. You have to set them before generating locations 49 | for each location type 50 | residence 200 .0002 51 | work 10 .0005 52 | school 3 .0020 53 | medical 3 .0004 54 | shopping area 5 .0020 55 | entertainment area 5 .0010 56 | large event area 2 .0008 57 | open space 2 .0024 58 | 59 | quarantined locations 60 | python3 mob_loc.py uniqQuaLoc mob_loc.properties > uniq_qualist.txt 61 | 62 | people movement location data 63 | python3 mob_loc.py genMovement mob_loc.properties > move_loc.txt 64 | 65 | Copy data 66 | ========= 67 | quarantine location 68 | ./epid.sh cpQuaLocData uniq_qualist.txt inr 69 | 70 | quarantined people movement location data 71 | ./epid.sh cpLocData move_loc.txt inr 72 | 73 | Spark job for in range outlier 74 | ============================== 75 | ./epid.sh olPredIn 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | Beymani consists of a set of Hadoop, Spark and Storm based tools for outlier and anomaly 3 | detection, which can be used for fraud detection, intrusion detection etc.
4 | 5 | ## Philosophy 6 | * Simple to use 7 | * Input output in CSV format 8 | * Metadata defined in simple JSON file 9 | * Extremely configurable with tons of configuration knobs 10 | 11 | ## Blogs 12 | The following blogs of mine are good source of details of beymani 13 | * http://pkghosh.wordpress.com/2012/01/02/fraudsters-outliers-and-big-data-2/ 14 | * http://pkghosh.wordpress.com/2012/02/18/fraudsters-are-not-model-citizens/ 15 | * http://pkghosh.wordpress.com/2012/06/18/its-a-lonely-life-for-outliers/ 16 | * http://pkghosh.wordpress.com/2012/10/18/relative-density-and-outliers/ 17 | * http://pkghosh.wordpress.com/2013/10/21/real-time-fraud-detection-with-sequence-mining/ 18 | * https://pkghosh.wordpress.com/2018/09/18/contextual-outlier-detection-with-statistical-modeling-on-spark/ 19 | * https://pkghosh.wordpress.com/2018/10/15/learning-alarm-threshold-from-user-feedback-using-decision-tree-on-spark/ 20 | * https://pkghosh.wordpress.com/2019/07/25/time-series-sequence-anomaly-detection-with-markov-chain-on-spark/ 21 | * https://pkghosh.wordpress.com/2020/09/27/time-series-change-point-detection-with-two-sample-statistic-on-spark-with-application-for-retail-sales-data/ 22 | * https://pkghosh.wordpress.com/2020/12/24/concept-drift-detection-techniques-with-python-implementation-for-supervised-machine-learning-models/ 23 | * https://pkghosh.wordpress.com/2021/01/20/customer-service-quality-monitoring-with-autoencoder-based-anomalous-case-detection/ 24 | * https://pkghosh.wordpress.com/2021/06/28/ecommerce-order-processing-system-monitoring-with-isolation-forest-based-anomaly-detection-on-spark/ 25 | 26 | ## Algorithms 27 | * Univarite distribution model 28 | * Multi variate sequence or multi gram distribution model 29 | * Average instance Distance 30 | * Relative instance Density 31 | * Markov chain with sequence data 32 | * Spectral residue for sequence data 33 | * Quantized symbol mapping for sequence data 34 | * Local outlier factor for multivariate data 35 | * Instance clustering 36 | * Sequence clustering 37 | * Change point detection 38 | * Isolation Forest for multivariate data 39 | * Auto Encoder for multivariate data 40 | 41 | ## Getting started 42 | Project's resource directory has various tutorial documents for the use cases described in 43 | the blogs. 44 | 45 | ## Build 46 | For Hadoop 1 47 | * mvn clean install 48 | 49 | For Hadoop 2 (non yarn) 50 | * git checkout nuovo 51 | * mvn clean install 52 | 53 | For Hadoop 2 (yarn) 54 | * git checkout nuovo 55 | * mvn clean install -P yarn 56 | 57 | For Spark 58 | * mvn clean install 59 | * sbt publishLocal 60 | * in ./spark sbt clean package 61 | 62 | ## Help 63 | Please feel free to email me at pkghosh99@gmail.com 64 | 65 | ## Contribution 66 | Contributors are welcome. 
Please email me at pkghosh99@gmail.com 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /resource/cct.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | count = ARGV[0].to_i 4 | 5 | amount_dist = [ 6 | 10,10, 7 | 17,17,17, 8 | 25,25,25,25,25, 9 | 37,37,37,37,37,37,37, 10 | 45,45,45,45,45, 11 | 66,66,66,66, 12 | 82,82,82,82, 13 | 150,150,150, 14 | 220,220, 15 | 300,300, 16 | 500, 17 | 1000, 18 | 2000 19 | ] 20 | 21 | time_dist = [ 22 | 0,0,0, 23 | 1,1,1,1, 24 | 2,2,2,2,2,2,2, 25 | 3,3,3,3,3, 26 | 4,4,4, 27 | 5,5, 28 | 6, 29 | 7, 30 | 8, 31 | 9, 32 | 10, 33 | 11, 34 | 12, 35 | 13, 36 | 14, 37 | 15, 38 | 16,16, 39 | 17,17,17, 40 | 18,18, 41 | 19, 42 | 20, 43 | 21,21, 44 | 22,22,22, 45 | 23 46 | ] 47 | 48 | vendors = ['grocery', 'restaurant', 'drug store', 'super market', 'electronic store', 'clothing store', 'jewellery store', 49 | 'air fare', 'hotel', 'car rental'] 50 | 51 | vendor_dist = [ 52 | 0,0,0,0,0,0,0,0,0, 53 | 1,1,1, 54 | 2,2,2,2,2,2, 55 | 3,3,3,3, 56 | 4,4, 57 | 5,5,5, 58 | 7,7,7, 59 | 8,8, 60 | 9,9 61 | ] 62 | 63 | 64 | vendor_amount_dist = { 65 | 'grocery' => [ 66 | 10,10, 67 | 20,20,20,20, 68 | 30,30,30,30,30,30,30, 69 | 50,50,50,50,50,50,50,50,50, 70 | 70,70,70,70, 71 | 100,100, 72 | 150 73 | ], 74 | 75 | 'restaurant' => [ 76 | 10,10, 77 | 20,20,20,20,20, 78 | 27,27, 79 | 35, 80 | 50 81 | ], 82 | 83 | 'drug store' => [ 84 | 12,12, 85 | 23,23,23,23,23, 86 | 37,37,37, 87 | 45,45, 88 | 60 89 | ], 90 | 91 | 'super market' => [ 92 | 25,25, 93 | 38,38,38, 94 | 49,49,49,49,49,49, 95 | 68,68,68, 96 | 112,112, 97 | 185, 98 | 250 99 | ], 100 | 101 | 'electronic store' => [ 102 | 60,60, 103 | 90,90, 104 | 120,120,120,120, 105 | 190,190,190,190,190, 106 | 250,250,250, 107 | 300,300, 108 | 500 109 | ], 110 | 111 | 'clothing store' => [ 112 | 30,30, 113 | 50,50,50,50, 114 | 70,70,70, 115 | 90,90, 116 | 150, 117 | 200 118 | ], 119 | 120 | 'jewellery store' => [ 121 | 100, 122 | 170,170, 123 | 260,260,260, 124 | 310,310, 125 | 400 126 | ], 127 | 128 | 'air fare' => [ 129 | 110,110, 130 | 180,180,180, 131 | 310,310,310,310,310, 132 | 520,520, 133 | 600 134 | ], 135 | 136 | 'hotel' => [ 137 | 110,110,110, 138 | 230,230,230,230, 139 | 300, 140 | 400 141 | ], 142 | 143 | 'car rental' => [ 144 | 60,60, 145 | 110,110,110,110, 146 | 150,150, 147 | 200 148 | ] 149 | 150 | } 151 | 152 | key = ['0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 153 | 'P','Q','R','S','T','U','V','W','X','Y','Z'] 154 | 155 | def gen_id(key) 156 | id = '' 157 | 1.upto 8 do #!/usr/bin/ruby 158 | 159 | require '../lib/util.rb' 160 | 161 | userCount = ARGV[0].to_i 162 | 163 | id << key[rand(key.length)] 164 | end 165 | return id 166 | end 167 | 168 | def sample(dist, mult, floor, percent) 169 | b = rand(dist.length) 170 | val = dist[b] 171 | val = val * mult 172 | percent = rand(percent) 173 | percent = percent < floor ? floor : percent 174 | 175 | dev = (val * percent) / 100 176 | if ((rand(100) % 2) == 0) 177 | val = val + dev 178 | else 179 | val = val - dev 180 | end 181 | val = val < 0 ? 0 : val 182 | val 183 | end 184 | 185 | 1.upto count do 186 | id = gen_id(key) 187 | time = sample(time_dist, 60, 2, 8) 188 | time = time > 1440 ? 
1440 : time 189 | v = vendor_dist[rand(vendor_dist.length)] 190 | vendor = vendors[v] 191 | am = sample(vendor_amount_dist[vendor], 100, 4, 12) 192 | puts "#{id}[]#{time}[]#{am/100}.#{am%100}[]#{vendor}" 193 | end 194 | 195 | 196 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | 25 | /** 26 | * Estimated probability based outlier prediction 27 | * @author pranab 28 | * 29 | */ 30 | public class EstimatedProbabilityBasedPredictor extends DistributionBasedPredictor { 31 | 32 | /** 33 | * Storm usage 34 | * @param conf 35 | */ 36 | public EstimatedProbabilityBasedPredictor(Map conf) { 37 | super(conf); 38 | realTimeDetection = true; 39 | } 40 | 41 | /** 42 | * @param config 43 | * @param distrFilePathParam 44 | * @param hdfsFileParam 45 | * @param schemaFilePathParam 46 | * @param scoreThresholdParam 47 | * @throws IOException 48 | */ 49 | public EstimatedProbabilityBasedPredictor(Map config, String idOrdinalsParam, 50 | String distrFilePathParam, String hdfsFileParam, String schemaFilePathParam, 51 | String seasonalParam, String fieldDelimParam, String scoreThresholdParam) throws IOException { 52 | super(config, idOrdinalsParam, distrFilePathParam, hdfsFileParam, schemaFilePathParam, 53 | seasonalParam, fieldDelimParam, scoreThresholdParam); 54 | } 55 | 56 | /** 57 | * Hadoop MR usage 58 | * @param config 59 | * @param distrFilePath 60 | * @throws IOException 61 | */ 62 | public EstimatedProbabilityBasedPredictor(Configuration config, String distrFilePath, String scoreThresholdParam) throws IOException { 63 | super(config, distrFilePath); 64 | scoreThreshold = Double.parseDouble( config.get( scoreThresholdParam)); 65 | } 66 | 67 | @Override 68 | public double execute(String entityID, String record) { 69 | String bucketKey = getBucketKey(record); 70 | Integer count = distrModel.get(bucketKey); 71 | double pr = null != count ? (((double)count) / totalCount) : 0; 72 | double score = 1.0 - pr; 73 | scoreAboveThreshold = score > scoreThreshold; 74 | if (realTimeDetection && scoreAboveThreshold) { 75 | //write if above threshold 76 | outQueue.send(entityID + " " + score); 77 | } 78 | return score; 79 | } 80 | 81 | @Override 82 | public double execute(String[] items, String compKey) { 83 | String bucketKey = getBucketKey(items); 84 | Map distrModel = keyedDistrModel.get(compKey); 85 | Integer count = distrModel.get(bucketKey); 86 | int totalCount = totalCounts.get(compKey); 87 | double pr = null != count ? 
(((double)count) / totalCount) : 0; 88 | double score = 1.0 - pr; 89 | return score; 90 | } 91 | 92 | @Override 93 | public boolean isValid(String compKey) { 94 | // TODO Auto-generated method stub 95 | return true; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/ExtremeValuePredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.beymani.util.OutlierScoreAggregator; 24 | import org.chombo.util.BasicUtils; 25 | 26 | /** 27 | * @author pranab 28 | * 29 | */ 30 | public class ExtremeValuePredictor extends ZscorePredictor { 31 | 32 | /** 33 | * @param config 34 | * @param idOrdinalsParam 35 | * @param attrListParam 36 | * @param fieldDelimParam 37 | * @param attrWeightParam 38 | * @param statsFilePathParam 39 | * @param seasonalParam 40 | * @param hdfsFileParam 41 | * @param scoreThresholdParam 42 | * @param expConstParam 43 | * @throws IOException 44 | */ 45 | public ExtremeValuePredictor(Map config,String idOrdinalsParam, String attrListParam, 46 | String fieldDelimParam, String attrWeightParam,String statsFilePathParam, String seasonalParam, 47 | String hdfsFileParam, String scoreThresholdParam,String expConstParam, String ignoreMissingStatParam, 48 | String scoreAggggregationStrtaegyParam) throws IOException { 49 | super(config, idOrdinalsParam, attrListParam, fieldDelimParam, attrWeightParam, 50 | statsFilePathParam, seasonalParam, hdfsFileParam, scoreThresholdParam, 51 | expConstParam, ignoreMissingStatParam, scoreAggggregationStrtaegyParam); 52 | } 53 | 54 | /* (non-Javadoc) 55 | * @see org.beymani.predictor.ZscorePredictor#execute(java.lang.String[], java.lang.String) 56 | */ 57 | @Override 58 | public double execute(String[] items, String compKey) { 59 | double score = 0; 60 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 61 | double thisScore = 0; 62 | for (int ord : attrOrdinals) { 63 | double val = Double.parseDouble(items[ord]); 64 | double d = 0; 65 | double e = 0; 66 | if (null != idOrdinals) { 67 | if (statsManager.statsExists(compKey, ord)) { 68 | d = Math.abs( val - statsManager.getMean(compKey,ord)); 69 | e = Math.exp(-d / statsManager.getStdDev(compKey, ord)); 70 | thisScore = Math.exp(-e); 71 | scoreAggregator.addScore(thisScore); 72 | } else { 73 | scoreAggregator.addScore(); 74 | } 75 | } else { 76 | d = Math.abs( val - statsManager.getMean(ord)); 77 | e = Math.exp(-d / statsManager.getStdDev(ord)); 78 | thisScore = Math.exp(-e); 79 | scoreAggregator.addScore(thisScore); 80 | } 81 | } 82 | //aggregate score 83 | score = getAggregateScore(scoreAggregator); 84 | 85 | //exponential normalization 
86 | if (expConst > 0) { 87 | score = BasicUtils.expScale(expConst, score); 88 | } 89 | 90 | scoreAboveThreshold = score > scoreThreshold; 91 | return score; 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /python/app/cpsale.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import statistics 23 | import matplotlib.pyplot as plt 24 | sys.path.append(os.path.abspath("../lib")) 25 | sys.path.append(os.path.abspath("../mlextra")) 26 | from util import * 27 | from sampler import * 28 | from mcsim import * 29 | 30 | """ 31 | cannibalized product sale 32 | """ 33 | 34 | values = list() 35 | def psale(args): 36 | i = 0 37 | q1 = int(args[i]) 38 | q1 = q1 if q1 >= 0 else 0 39 | i += 1 40 | q2 = int(args[i]) 41 | q2 = q2 if q2 >= 0 else 0 42 | i += 1 43 | pid1 = args[i] 44 | i += 1 45 | pid2 = args[i] 46 | i += 1 47 | ptime = args[i] 48 | i += 1 49 | iter = args[i] 50 | ctime = ptime + iter * 3600 51 | print("{},{},{}".format(pid1, ctime, q1)) 52 | print("{},{},{}".format(pid2, ctime, q2)) 53 | values.append(q1) 54 | 55 | 56 | if __name__ == "__main__": 57 | op = sys.argv[1] 58 | if op == "gen": 59 | numDays = int(sys.argv[2]) 60 | numIter = 24 * numDays 61 | curTime, pastTime = pastTime(numDays, "d") 62 | pastTime = dayAlign(pastTime) 63 | tsStart = int(0.6 * numIter) 64 | trEnd = tsStart + 30 65 | trSl = -2.0 66 | cy = np.array([-20.0, -35.0, -55.0, -65.0, -70.0, -70.0, -50.0, -30.0, -5.0, 15.0, 35.0, 50.0, 67 | 65.0, 65.0, 55.0, 50.0, 40.0, 30.0, 25.0, 35.0, 30.0, 20.0, 5.0, -15.0]) 68 | cy1 = 0.7 * cy 69 | cy2 = 0.7 * cy1 70 | cy3 = 0.3 * cy1 71 | simulator = MonteCarloSimulator(numIter, psale, "./log/mcsim.log", "info") 72 | simulator.registerNormalSamplerWithTrendCycle(100, 10, 0, cy1) 73 | simulator.registerNormalSamplerWithTrendCycle(150, 20, 0.01, cy2) 74 | simulator.registerExtraArgs("DK75HUI45X", "GHT56FGT8K", pastTime) 75 | trSampler = NormalSamplerWithTrendCycle(100.0, 10.0, trSl , cy1) 76 | simulator.setSampler(0, tsStart, trSampler) 77 | newSampler = NormalSamplerWithTrendCycle(40, 12, 0, cy3) 78 | simulator.setSampler(0, trEnd, newSampler) 79 | 80 | simulator.run() 81 | #drawLine(values, 250) 82 | 83 | elif op == "plot": 84 | filePath = sys.argv[2] 85 | rid = sys.argv[3] 86 | filt = lambda r : r[0] == rid 87 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(filePath, filt))) 88 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(filePath, filt))) 89 | it = xvalues[0] 90 | if len(sys.argv) == 5: 91 | cpFilePath = sys.argv[4] 92 | cdvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(cpFilePath, filt))) 93 | cxvalues = list(map(lambda r : int(r[2]), fileFiltRecGen(cpFilePath, filt))) 94 | i = 0 95 | for t in 
cxvalues: 96 | plt.axvline(t, 0, .9, color="r") 97 | i += 1 98 | plt.plot(xvalues, dvalues, "b") 99 | plt.show() 100 | else: 101 | plt.plot(xvalues, dvalues, "b") 102 | plt.show() 103 | 104 | 105 | -------------------------------------------------------------------------------- /resource/cpu_usage_anomaly_det_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection in CPU usage data using statistical modeling. To be more specific 2 | we will be using a z score based technique. The model first gets built with outliers in the data. The detected outliers 3 | are removed and the model is built again, but this time without outliers in the data. 4 | 5 | 6 | Environment 7 | =========== 8 | Paths etc shown here correspond to my environment. Please change them as needed for your 9 | environment 10 | 11 | Build 12 | ===== 13 | Follow instructions in spark_dependency.txt 14 | 15 | Python dependency 16 | ================= 17 | The shell script commands for data generation run python scripts for data generation. Before you run 18 | the data generation commands do the following 19 | 1. checkout project avenir 20 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 21 | 22 | 23 | Create base normal data 24 | ======================= 25 | ./and_spark.sh crInput true 26 | 27 | where 28 | num_of_days = number of days e.g 10 29 | reading_interval = reading interval in sec e.g. 300 30 | num_servers = number of servers e.g. 4 31 | output_file = output file, we will use cusage.txt from now on 32 | 33 | - insert outliers 34 | ./and_spark.sh insOutliers <normal_data_file> <with_outlier_data_file> 35 | 36 | where 37 | normal_data_file = normal data file (cusage.txt) 38 | with_outlier_data_file = data file with outliers (cusage.txt) 39 | 40 | -copy 41 | ./and_spark.sh cpModData <with_outlier_data_file> 42 | 43 | where 44 | with_outlier_data_file = data file with outliers (cusage.txt) 45 | 46 | Run Spark job for stats 47 | ======================= 48 | ./and_spark.sh numStat 49 | 50 | Copy and consolidate stats file 51 | =============================== 52 | ./and_spark.sh crStatsFile 53 | 54 | Run Spark job to detect outliers 55 | ================================ 56 | - set 57 | score.threshold = 2.0 58 | output.outliers = true 59 | rem.outliers = true 60 | 61 | - run 62 | ./and_spark.sh olPred 63 | 64 | Copy and consolidate clean file 65 | =============================== 66 | ./and_spark.sh crCleanFile 67 | 68 | Create and copy test data 69 | ========================= 70 | - create 71 | ./and_spark.sh crInput true 77 | 78 | where 79 | normal_data_file = normal data file (c.txt) 80 | with_outlier_data_file = data file with outliers (cusage.txt) 81 | 82 | - copy 83 | ./and_spark.sh cpTestData <with_outlier_data_file> 84 | 85 | where 86 | with_outlier_data_file = data file with outliers (cusage.txt) 87 | 88 | 89 | Run Spark job for stats again with clean data 90 | ============================================= 91 | ./and_spark.sh numStat 92 | 93 | Copy and consolidate stats file 94 | =============================== 95 | ./and_spark.sh crStatsFile 96 | 97 | 98 | Run Spark job to detect outliers 99 | ================================ 100 | - set 101 | score.threshold = 3.3 102 | output.outliers = false 103 | rem.outliers = false 104 | 105 | - run 106 | ./and_spark.sh olPred 107 | 108 | Configuration 109 | ============= 110 | Configuration is in and.conf.
Make changes as necessary 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/FileSpout.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.File; 21 | import java.io.FileNotFoundException; 22 | import java.util.Arrays; 23 | import java.util.Comparator; 24 | import java.util.Map; 25 | import java.util.Scanner; 26 | 27 | import backtype.storm.spout.SpoutOutputCollector; 28 | import backtype.storm.task.TopologyContext; 29 | import backtype.storm.topology.OutputFieldsDeclarer; 30 | import backtype.storm.topology.base.BaseRichSpout; 31 | import backtype.storm.tuple.Fields; 32 | import backtype.storm.tuple.Values; 33 | 34 | /** 35 | * @author pranab 36 | * 37 | */ 38 | public class FileSpout extends BaseRichSpout { 39 | private SpoutOutputCollector collector; 40 | private Map conf; 41 | private File[] files; 42 | private Scanner scanner; 43 | /** 44 | * 45 | */ 46 | private int curFileIndex = 0; 47 | 48 | @Override 49 | public void open(Map conf, TopologyContext context, 50 | SpoutOutputCollector collector) { 51 | this.collector = collector; 52 | this.conf = conf; 53 | 54 | String dirPath = conf.get("file.spout.dir.path").toString(); 55 | File dir = new File(dirPath); 56 | files = dir.listFiles(); 57 | Arrays.sort(files, new Comparator(){ 58 | public int compare(File f1, File f2) { 59 | int res = f1.lastModified() < f2.lastModified() ? -1 : ( f1.lastModified() > f2.lastModified() ? 
1 : 0); 60 | return res; 61 | } }); 62 | 63 | openNextFile(); 64 | } 65 | 66 | @Override 67 | public void nextTuple() { 68 | String record = readFile(); 69 | String[] items = record.split("\\s+"); 70 | String entityID = items[0]; 71 | String recordData = items[1]; 72 | collector.emit(new Values(entityID, recordData)); 73 | } 74 | 75 | /** 76 | * @return 77 | */ 78 | private String readFile() { 79 | String record = null; 80 | if (scanner.hasNextLine()) { 81 | record = scanner.nextLine(); 82 | } else { 83 | if (++curFileIndex < files.length) { 84 | openNextFile(); 85 | if (scanner.hasNextLine()) { 86 | record = scanner.nextLine(); 87 | } 88 | } else { 89 | //no more files to read 90 | } 91 | } 92 | return record; 93 | } 94 | 95 | /** 96 | * 97 | */ 98 | private void openNextFile() { 99 | try { 100 | scanner = new Scanner(files[curFileIndex]); 101 | } catch (FileNotFoundException e) { 102 | throw new IllegalStateException("file not found"); 103 | } 104 | } 105 | 106 | /* (non-Javadoc) 107 | * @see backtype.storm.topology.IComponent#declareOutputFields(backtype.storm.topology.OutputFieldsDeclarer) 108 | */ 109 | @Override 110 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 111 | declarer.declare(new Fields("entityID", "recordData")); 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /python/app/mvand.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 
17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import sklearn as sk 25 | from sklearn.ensemble import IsolationForest 26 | from pyod.models.auto_encoder import AutoEncoder 27 | sys.path.append(os.path.abspath("../lib")) 28 | sys.path.append(os.path.abspath("../mlextra")) 29 | from util import * 30 | from mlutil import * 31 | from sampler import * 32 | 33 | """ 34 | Anomaly detection with isolation forest 35 | """ 36 | if __name__ == "__main__": 37 | op = sys.argv[1] 38 | filePath = sys.argv[2] 39 | window = 20 40 | beg = 0 41 | end = beg + window 42 | if op == "isfo": 43 | #anomaly detection in in service ticket data with isolation porest 44 | scId = sys.argv[3] 45 | colStr = sys.argv[4] 46 | columns = strToIntArray(colStr) 47 | filt = lambda r : r[0] == scId 48 | data = np.array(getFileAsFiltFloatMatrix(filePath, filt, colStr)) 49 | nsamp = data.shape[0] 50 | isf = IsolationForest(contamination=0.1) 51 | ypred = isf.fit_predict(data) 52 | colors = ["m", "g", "b", "c", "y"] 53 | 54 | for a in data: 55 | a[2] = a[2] / 24 56 | while True: 57 | inp = input("begin offset: ") 58 | beg = int(inp) 59 | end = beg + window 60 | if beg >= 0: 61 | for i in range(len(columns)): 62 | dvalues = data[:,i] 63 | ci = i % 5 64 | plt.plot(dvalues[beg:end], colors[ci]) 65 | count = 0 66 | for i in range(beg, end, 1): 67 | if ypred[i] == -1: 68 | plt.axvline(i - beg, 0, .9, color="r") 69 | count += 1 70 | print("num of outlier {}".format(count)) 71 | plt.show() 72 | else: 73 | print("quitting") 74 | break 75 | 76 | elif op == "auen": 77 | #anomaly detection in web session with auto encoder 78 | teFilePath = sys.argv[3] 79 | columns = sys.argv[4] 80 | auen = AutoEncoder(hidden_neurons =[7,5,3,5,7]) 81 | trData = np.array(getFileAsFloatMatrix(filePath, columns)) 82 | trNsamp = trData.shape[0] 83 | teData = np.array(getFileAsFloatMatrix(teFilePath, columns)) 84 | aData = np.vstack((trData, teData)) 85 | aData = scaleData(aData, "zscale") 86 | print(aData.shape) 87 | trData = aData[:trNsamp, :] 88 | teData = aData[trNsamp:, :] 89 | print(trData.shape) 90 | print(teData.shape) 91 | 92 | auen.fit(trData) 93 | scores = auen.decision_function(teData) 94 | 95 | while True: 96 | inp = input("begin offset: ") 97 | beg = int(inp) 98 | end = beg + window 99 | if beg >= 0: 100 | plt.plot(scores[beg:end], color="b") 101 | count = 0 102 | for i in range(beg, end, 1): 103 | if scores[i] > 17: 104 | plt.axvline(i - beg, 0, .9, color="r") 105 | count += 1 106 | print("num of outlier {}".format(count)) 107 | plt.show() 108 | else: 109 | print("quitting") 110 | break 111 | 112 | 113 | -------------------------------------------------------------------------------- /resource/rel_density_tutorial.txt: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | Here are the global config properties to set in the properties file. Please make changes as necessary 4 | 5 | debug.on=true 6 | field.delim=, 7 | field.delim.regex=, 8 | num.reducer=1 9 | 10 | Configuration settings for individual map reduce jobs are described below 11 | 12 | Map Reduce Jobs 13 | =============== 14 | 15 | 1. Similarity calculation 16 | ------------------------- 17 | run SameTypeSimilarity 18 | 19 | Make sure properties are set as below in the configuration properties file 20 | 21 | sts.same.schema.file.path=/user/pranab/cct/meta/cct.json 22 | sts.bucket.count=10 23 | sts.distance.scale=1000 24 | 25 | 2. 
Density calculation. 26 | ---------------------- 27 | Here is a sample script. It uses the output of the SameTypeSimilarity MR as input 28 | 29 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 30 | CLASS_NAME=org.beymani.proximity.AverageDistance 31 | 32 | echo "running mr" 33 | IN_PATH=/user/pranab/cct/simi 34 | OUT_PATH=/user/pranab/cct/avdi 35 | echo "input $IN_PATH output $OUT_PATH" 36 | hadoop fs -rmr $OUT_PATH 37 | echo "removed output dir" 38 | 39 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 40 | 41 | Make sure properties are set as below, so that density is output 42 | 43 | avd.top.match.average=false 44 | avd.top.match.density=true 45 | avd.top.match.grouping=false 46 | 47 | 3. Calculate neighborhood groups 48 | -------------------------------- 49 | Use the same MR as before. Watch the configuration params at the end of this section 50 | 51 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 52 | CLASS_NAME=org.beymani.proximity.AverageDistance 53 | 54 | echo "running mr" 55 | IN_PATH=/user/pranab/cct/simi 56 | OUT_PATH=/user/pranab/cct/negrp 57 | echo "input $IN_PATH output $OUT_PATH" 58 | hadoop fs -rmr $OUT_PATH 59 | echo "removed output dir" 60 | 61 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 62 | 63 | Make sure properties are set as below, so that the neighborhood group is output 64 | 65 | avd.top.match.average=false 66 | avd.top.match.density=false 67 | avd.top.match.grouping=true 68 | 69 | 4. Find Neighborhood and Density. 70 | -------------------------------- 71 | Here is a sample script. Before running, make sure the output of steps 2 and 3 is copied 72 | or moved to the input dir for this MR. Change the prefix of the output of step 2 73 | to what is defined in the config param density.file.prefix 74 | 75 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 76 | CLASS_NAME=org.beymani.proximity.NeighborDensity 77 | 78 | echo "running mr" 79 | IN_PATH=/user/pranab/cct/input/nede 80 | OUT_PATH=/user/pranab/cct/nede 81 | echo "input $IN_PATH output $OUT_PATH" 82 | hadoop fs -rmr $OUT_PATH 83 | echo "removed output dir" 84 | 85 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 86 | 87 | Make sure properties are set as below 88 | 89 | ned.density.file.prefix=den 90 | 91 | 5. Relative density calculation 92 | ------------------------------- 93 | It uses the output of step 4 as input. Here is the sample script 94 | 95 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 96 | CLASS_NAME=org.beymani.proximity.RelativeDensity 97 | 98 | echo "running mr" 99 | IN_PATH=/user/pranab/cct/nede 100 | OUT_PATH=/user/pranab/cct/rede 101 | echo "input $IN_PATH output $OUT_PATH" 102 | hadoop fs -rmr $OUT_PATH 103 | echo "removed output dir" 104 | 105 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SequenceMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License.
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | 19 | package org.beymani.util; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * Various sequence matching algorithms 26 | * @author pranab 27 | * 28 | * @param 29 | */ 30 | public class SequenceMatcher { 31 | private List seqData = new ArrayList(); 32 | private int maxSize; 33 | private double sim; 34 | private boolean normalize; 35 | private boolean similarity; 36 | private int matchSize; 37 | 38 | public SequenceMatcher(boolean normalize, boolean similarity) { 39 | this.normalize = normalize; 40 | this.similarity = similarity; 41 | } 42 | 43 | public SequenceMatcher(int maxSize,boolean normalized, boolean similarity) { 44 | this(normalized, similarity); 45 | this.maxSize = maxSize; 46 | } 47 | 48 | public void add(T item) { 49 | seqData.add(item); 50 | if (maxSize > 0 && seqData.size() > maxSize) { 51 | seqData.remove(0); 52 | } 53 | } 54 | 55 | /** 56 | * Simple positional matching 57 | * @param other 58 | * @return 59 | */ 60 | public double matchCount(SequenceMatcher other) { 61 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size(); 62 | sim = 0; 63 | for (int i = 0; i < matchSize; ++i) { 64 | if (seqData.get(i).equals(other.seqData.get(i))) { 65 | ++sim; 66 | } 67 | } 68 | prepeareResult(matchSize); 69 | return sim; 70 | } 71 | 72 | /** 73 | * Positional matching with higher reward for adjacent mactches 74 | * @param other 75 | * @return 76 | */ 77 | public double adjacencyRewardedMatchCount(SequenceMatcher other) { 78 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size(); 79 | sim = 0; 80 | int adjCount = 1; 81 | for (int i = 0; i < matchSize; ++i) { 82 | if (seqData.get(i).equals(other.seqData.get(i))) { 83 | sim += adjCount; 84 | ++adjCount; 85 | } else { 86 | adjCount = 1; 87 | } 88 | } 89 | prepeareResult(matchSize); 90 | return sim; 91 | } 92 | 93 | /** 94 | * Positional matching with higher reward for adjacent mactches 95 | * @param other 96 | * @return 97 | */ 98 | public double maxCommonSubSeqMatchCount(SequenceMatcher other) { 99 | int matchSize = seqData.size() < other.seqData.size() ? 
seqData.size() : other.seqData.size(); 100 | sim = 0; 101 | int adjCount = 0; 102 | for (int i = 0; i < matchSize; ++i) { 103 | if (seqData.get(i).equals(other.seqData.get(i))) { 104 | ++adjCount; 105 | } else { 106 | if (adjCount > sim) { 107 | sim = adjCount; 108 | } 109 | adjCount = 0; 110 | } 111 | } 112 | prepeareResult(matchSize * (matchSize + 1) / 2); 113 | return sim; 114 | } 115 | 116 | /** 117 | * @param scale 118 | */ 119 | private void prepeareResult(int scale) { 120 | if (normalize) { 121 | sim /= scale; 122 | if (!similarity) { 123 | sim = 1.0 - sim; 124 | } 125 | } else { 126 | if (!similarity) { 127 | sim = scale - sim; 128 | } 129 | } 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedCumProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | public class EstimatedCumProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 29 | 30 | public EstimatedCumProbabilityBasedPredictor(Map conf) { 31 | super(conf); 32 | } 33 | 34 | /** 35 | * @param config 36 | * @param idOrdinalsParam 37 | * @param attrListParam 38 | * @param distrFilePathParam 39 | * @param hdfsFileParam 40 | * @param schemaFilePathParam 41 | * @param attrWeightParam 42 | * @param seasonalParam 43 | * @param fieldDelimParam 44 | * @param scoreThresholdParam 45 | * @param ignoreMissingDistrParam 46 | * @throws IOException 47 | */ 48 | public EstimatedCumProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 49 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 50 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 51 | String scoreAggggregationStrtaegyParam) 52 | throws IOException { 53 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 54 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", "exp.const", 55 | scoreAggggregationStrtaegyParam); 56 | } 57 | 58 | /** 59 | * @param config 60 | * @param distrFilePathParam 61 | * @param attrWeightParam 62 | * @param scoreThresholdParam 63 | * @param fieldDelimParam 64 | * @throws IOException 65 | */ 66 | public EstimatedCumProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 67 | String scoreThresholdParam, String fieldDelimParam) 
68 | throws IOException { 69 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 70 | } 71 | 72 | @Override 73 | public double execute(String[] items, String compKey) { 74 | double score = 0; 75 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 76 | double thisScore = 0; 77 | for (int ord : attrOrdinals) { 78 | String keyWithFldOrd = compKey + fieldDelim + ord; 79 | double val = Double.parseDouble(items[ord]); 80 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 81 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 82 | if (null != hist) { 83 | double distr = hist.findCumDistr(val); 84 | thisScore = distr < 0.5 ? 1.0 - distr : distr; 85 | scoreAggregator.addScore(thisScore); 86 | } else { 87 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 88 | scoreAggregator.addScore(); 89 | } 90 | } 91 | //aggregate score 92 | score = getAggregateScore(scoreAggregator); 93 | 94 | scoreAboveThreshold = score > scoreThreshold; 95 | return score; 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /resource/real_time_fraud_prediction_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for real time fraud detection using Hadoop and Storm. It uses a Markov chain 2 | as the predictive model. Make necessary changes to paths etc. to suit your environment. 3 | 4 | Dependency 5 | ========== 6 | The project has a dependency on chombo. Please do the build as below for chombo and avenir respectively 7 | mvn clean install 8 | 9 | Please refer to jar_dependency.txt for details of the dependencies 10 | 11 | The easiest way is to use ant as follows 12 | ant -f build_storm.xml 13 | 14 | Generate input data 15 | =================== 16 | Get util.rb from the project visitante. Put a copy of the file in ../lib 17 | ./xaction_states.rb 5000 > xact_training.txt 18 | 19 | where 5000 is the number of customers 20 | Copy the output file to HDFS input directory /Users/pranab/mmfr/input 21 | 22 | Generate transaction sequence data with MR 23 | ========================================== 24 | Run this script.
This MR belongs to the project chombo 25 | 26 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 27 | CLASS_NAME=org.chombo.mr.Projection 28 | 29 | echo "running mr" 30 | IN_PATH=/Users/pranab/mmfr/input 31 | OUT_PATH=/Users/pranab/mmfr/sequence 32 | echo "input $IN_PATH output $OUT_PATH" 33 | hadoop fs -rmr $OUT_PATH 34 | echo "removed output dir" 35 | 36 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 37 | 38 | Generate Markov chain model with MR 39 | =================================== 40 | Run this script 41 | 42 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar 43 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel 44 | 45 | echo "running mr" 46 | IN_PATH=/Users/pranab/mmfr/sequence 47 | OUT_PATH=/Users/pranab/mmfr/model 48 | echo "input $IN_PATH output $OUT_PATH" 49 | hadoop fs -rmr $OUT_PATH 50 | echo "removed output dir" 51 | 52 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 53 | 54 | Copy the MR output 55 | ================= 56 | hadoop fs -get /Users/pranab/mmfr/model/part-r-00000 xmodel.txt 57 | 58 | Store model in Redis 59 | ==================== 60 | ./xaction_queue.py setModel xmodel.txt 61 | 62 | Generate test transaction data 63 | ============================== 64 | ./xaction_states.rb 200 > xact_test.txt 65 | 66 | Write test data to Redis queue 67 | ============================== 68 | ./xaction_queue.py writeQueue xact_test.txt 69 | 70 | Build uber jar for storm deployment 71 | =================================== 72 | ant -f build_storm.xml 73 | 74 | Deploy storm topology 75 | ===================== 76 | storm jar uber-beymani-1.0.jar org.beymani.predictor.OutlierPredictor NoFraud rt_predict.properties 77 | 78 | Get output 79 | ========== 80 | After you have verified in the Storm UI that all data has been processed, get the output from the Redis 81 | output queue 82 | 83 | ./xaction_queue.py readOutQueue 84 | 85 | Hadoop configuration 86 | ==================== 87 | field.delim.regex=, 88 | field.delim.out=, 89 | num.reducer=1 90 | debug.on=false 91 | 92 | #Projection 93 | projection.operation=grouping 94 | key.field=0 95 | projection.field=2 96 | 97 | #MarkovStateTransitionModel 98 | skip.field.count=1 99 | model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS 100 | trans.prob.scale=1 101 | 102 | Storm configuration 103 | =================== 104 | predictor.model=mm 105 | predictor.spout.threads=1 106 | predictor.bolt.threads=2 107 | num.workers=1 108 | debug=on 109 | 110 | redis.server.host=localhost 111 | redis.server.port=6379 112 | redis.markov.model.key=xactionMarkovModel 113 | redis.input.queue=xactionQueue 114 | local.predictor=true 115 | state.seq.window.size=5 116 | state.ordinal=1 117 | detection.algorithm=missProbability 118 | metric.threshold=0.96 119 | redis.output.queue=fraudQueue 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /resource/retail_sale_monitoring_with_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for ecommerce retail sale monitoring based anomaly detection for hourly sales data. 2 | Robust zscore is used for anomaly detection. The data hierarchy is org -> sale -> dept -> product sale 3 | 4 | Dependent script 5 | ================ 6 | Checkout the project avenir.
Copy the lib directory under python to a directory at the same level 7 | as your working directory for the python script ecomm.py 8 | 9 | Build and Deployment 10 | ==================== 11 | Please refer to resource/spark_dependency.txt for building all jars and the final uber jar file 12 | 13 | Script and configuration 14 | ======================== 15 | Feel free to make changes in the script ecomm.sh and the configuration file ecomm.conf as per your 16 | environment 17 | 18 | Generate stats for hourly sales 19 | =============================== 20 | ./ecomm.py prStat > prstat.txt 21 | 22 | where 23 | num_product = number of products e.g. 20 24 | 25 | Generate training data 26 | ====================== 27 | ./ecomm.py prSale prstat.txt > sale_tr.txt 28 | 29 | where 30 | interval = amount of time into the past e.g. 30 31 | time_unit = time unit, d for day and h for hour 32 | 33 | Generate prediction data 34 | ======================== 35 | ./ecomm.py prSale prstat.txt > sale.txt 36 | 37 | 38 | Insert outlier 39 | ./ecomm.py olPrSale sale.txt > sale_pr.txt 40 | 41 | where 42 | outlier_percentage = percentage of outliers e.g. 10 43 | 44 | Copy training data 45 | ================== 46 | ./ecomm.sh loadInp sale_tr.txt training 47 | 48 | Run spark job for basic stats 49 | ============================= 50 | ./ecomm.sh numStat 51 | 52 | Run spark job for median 53 | ======================== 54 | Set the following in ecomm.conf for numericalAttrMedian 55 | operation.type = "med" 56 | 57 | Run 58 | ./ecomm.sh numMstat 59 | 60 | Copy median file 61 | ================ 62 | ./ecomm.sh bkMod med.txt 63 | 64 | It generates the med.txt file 65 | 66 | Run spark job for median absolute deviation 67 | =========================================== 68 | Set the following in ecomm.conf for numericalAttrMedian 69 | operation.type = "mad" 70 | 71 | Run 72 | ./ecomm.sh numMstat 73 | 74 | Copy median absolute deviation file 75 | =================================== 76 | ./ecomm.sh bkMod mad.txt 77 | 78 | It generates mad.txt 79 | 80 | Copy prediction data 81 | ==================== 82 | ./ecomm.sh loadInp sale_pr.txt pred 83 | 84 | Run spark job for prediction 85 | ============================ 86 | ./ecomm.sh olPred 87 | 88 | Copy prediction output into one file 89 | ==================================== 90 | ./ecomm.sh bkOut psale/olp.txt 91 | 92 | All output gets written to olp.txt 93 | 94 | Run spark job to aggregate to dept 95 | ================================== 96 | Clean aggregator input dir 97 | ./ecomm.sh rmAggrInp 98 | 99 | Copy to aggregator input dir 100 | ./ecomm.sh loadAggrInp psale/olp.txt 101 | 102 | Run aggregator spark job 103 | ./ecomm.sh aggrOl 104 | 105 | Copy aggregator output into one file 106 | ./ecomm.sh bkOutAggr dept/olp.txt 107 | 108 | Run spark job to aggregate to sale 109 | ================================== 110 | Clean aggregator input dir 111 | ./ecomm.sh rmAggrInp 112 | 113 | Copy to aggregator input dir 114 | ./ecomm.sh loadAggrInp dept/olp.txt 115 | 116 | Run aggregator 117 | ./ecomm.sh aggrOl 118 | 119 | Copy aggregator output into one file 120 | ./ecomm.sh bkOutAggr sale/olp.txt 121 | 122 | Run spark job to aggregate to organization 123 | ========================================== 124 | Clean aggregator input dir 125 | ./ecomm.sh rmAggrInp 126 | 127 | Copy to aggregator input dir 128 | ./ecomm.sh loadAggrInp sale/olp.txt 129 | 130 | Run aggregator 131 | ./ecomm.sh aggrOl 132 | 133 | Copy aggregator output into one file 134 | ./ecomm.sh bkOutAggr org/olp.txt 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143
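For intuition, the robust zscore used in this tutorial can be sketched in a few lines of Python. This is only an illustration, not the actual Spark implementation; the sample sale numbers and the 0.6745 scaling constant are assumptions for the example.

import statistics

def robust_zscore(values):
    # robust zscore replaces the mean with the median and the std dev with the median absolute deviation (MAD)
    med = statistics.median(values)
    mad = statistics.median([abs(v - med) for v in values])
    # 0.6745 makes MAD comparable to a standard deviation for normally distributed data
    return [0.6745 * (v - med) / mad for v in values]

# hourly product sale quantities; the value 320 stands out with a large score
sales = [120, 115, 130, 125, 320, 118]
for v, z in zip(sales, robust_zscore(sales)):
    print("{} robust zscore {:.2f}".format(v, z))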
| 144 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/ModelBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.Serializable; 21 | import java.util.Map; 22 | 23 | import org.beymani.util.OutlierScoreAggregator; 24 | import org.chombo.util.BasicUtils; 25 | import org.chombo.util.ConfigUtility; 26 | 27 | /** 28 | * Base class for all model based predictors 29 | * @author pranab 30 | * 31 | */ 32 | public abstract class ModelBasedPredictor implements Serializable { 33 | private static final long serialVersionUID = -8813946272356265424L; 34 | protected boolean realTimeDetection; 35 | protected double scoreThreshold; 36 | protected boolean scoreAboveThreshold; 37 | protected boolean partition = false; 38 | protected double expConst = 1.0; 39 | protected int[] idOrdinals; 40 | protected int[] attrOrdinals; 41 | protected double[] attrWeights; 42 | protected boolean ignoreMissingStat; 43 | protected String fieldDelim; 44 | protected boolean seasonal; 45 | 46 | private String aggregationStrategy; 47 | 48 | 49 | public ModelBasedPredictor() { 50 | 51 | } 52 | 53 | /** 54 | * @param config 55 | * @param attrWeightParam 56 | * @param scoreAggggregationStrtaegyParam 57 | */ 58 | public ModelBasedPredictor(Map config, String attrWeightParam, String scoreAggggregationStrtaegyParam) { 59 | attrWeights = ConfigUtility.getDoubleArray(config, attrWeightParam); 60 | aggregationStrategy = ConfigUtility.getString(config, scoreAggggregationStrtaegyParam);; 61 | } 62 | 63 | /** 64 | * @param entityID 65 | * @param record 66 | * @return 67 | */ 68 | public abstract double execute(String entityID, String record); 69 | 70 | /** 71 | * @param items 72 | * @param compKey 73 | * @return 74 | */ 75 | public abstract double execute(String[] items, String compKey); 76 | 77 | 78 | /** 79 | * @return 80 | */ 81 | public boolean isScoreAboveThreshold() { 82 | return scoreAboveThreshold; 83 | } 84 | 85 | /** 86 | * @return 87 | */ 88 | public ModelBasedPredictor withPartition() { 89 | partition = true; 90 | return this; 91 | } 92 | 93 | /** 94 | * @param ignoreMissingStat 95 | * @return 96 | */ 97 | public ModelBasedPredictor withIgnoreMissingStat(boolean ignoreMissingStat) { 98 | this.ignoreMissingStat = ignoreMissingStat; 99 | return this; 100 | } 101 | 102 | 103 | /** 104 | * @param compKey 105 | * @return 106 | */ 107 | public abstract boolean isValid(String compKey); 108 | 109 | /** 110 | * @return 111 | */ 112 | public double getAggregateScore(OutlierScoreAggregator scoreAggregator) { 113 | double aggrScore = 0; 114 | if (aggregationStrategy.equals("average")) { 115 | aggrScore = scoreAggregator.getAverage(); 116 | } else if 
(aggregationStrategy.equals("weightedAverage")) { 117 | aggrScore = scoreAggregator.getWeightedAverage(); 118 | } else if (aggregationStrategy.equals("median")) { 119 | aggrScore = scoreAggregator.getMedian(); 120 | } else if (aggregationStrategy.equals("max")) { 121 | aggrScore = scoreAggregator.getMax(); 122 | } else if (aggregationStrategy.equals("min")) { 123 | aggrScore = scoreAggregator.getMin(); 124 | } else { 125 | BasicUtils.assertFail("invalid outlier score aggregation strategy " + aggregationStrategy); 126 | } 127 | return aggrScore; 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /python/app/bvib.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | from random import randint 21 | import time 22 | import math 23 | from datetime import datetime 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | from util import * 27 | from mlutil import * 28 | from sampler import * 29 | 30 | """ 31 | MAchinary vibration time series with multiple harmonic components and random noise 32 | Inserts outlier with high frequency components indicating failure 33 | """ 34 | 35 | def sinComponents(params): 36 | """ 37 | returns list sine components 38 | """ 39 | comps = list() 40 | for i in range(0, len(params), 2): 41 | amp = params[i] 42 | per = params[i + 1] 43 | phase = randomFloat(0, 2.0 * math.pi) 44 | co = (amp, per, phase) 45 | comps.append(co) 46 | return comps 47 | 48 | def addSines(comps, sampTm): 49 | """ 50 | adds multiple sine comopnents 51 | """ 52 | val = 0 53 | for c in comps: 54 | t = 2.0 * math.pi * (sampTm % c[1]) / c[1] 55 | val += c[0] * math.sin(c[2] + t) 56 | return val 57 | 58 | if __name__ == "__main__": 59 | op = sys.argv[1] 60 | if op == "gen": 61 | #generate data 62 | ids = ["HG56SDFE", "K87JG9F6"] 63 | comps = dict() 64 | comps["HG56SDFE"] = sinComponents([52,40,76,20,5,80,7,30]) 65 | comps["K87JG9F6"] = sinComponents([56,42,74,18,6,84,9,28]) 66 | noise= NormalSampler(0,3) 67 | dur = int(sys.argv[2]) * 1000 68 | ctime = curTimeMs() 69 | ptime = ctime - dur 70 | sintv = 1 71 | stime = ptime 72 | while stime < ctime: 73 | for mid in ids: 74 | val = addSines(comps[mid], stime) + noise.sample() 75 | print("{},{},{:.3f}".format(mid, stime, val)) 76 | stime += sintv 77 | 78 | elif op == "iplot": 79 | #plot 80 | fpath = sys.argv[2] 81 | mid = sys.argv[3] 82 | beg = int(sys.argv[4]) 83 | end = int(sys.argv[5]) 84 | filt = lambda r : r[0] == mid 85 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt))) 86 | drawLine(dvalues[beg:end]) 87 | 88 | elif op == "iol": 89 | #insert outliers 90 | fpath = sys.argv[2] 91 | delay = int(sys.argv[3]) * 1000 * 2 92 | ocomps = sinComponents([36,12,30,8]) 93 | i = 0 94 | for rec in 
fileRecGen(fpath, ","): 95 | mid = rec[0] 96 | if mid == "K87JG9F6" and i > delay: 97 | val = float(rec[2]) 98 | stime = int(rec[1]) 99 | val += addSines(ocomps, stime) 100 | rec[2] = "{:.3f}".format(val) 101 | print(",".join(rec)) 102 | i += 1 103 | 104 | elif op == "oplot": 105 | #plot outliers 106 | fpath = sys.argv[2] 107 | mid = sys.argv[3] 108 | beg = int(sys.argv[4]) 109 | end = int(sys.argv[5]) 110 | filt = lambda r : r[0] == mid 111 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt))) 112 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt))) 113 | plt.plot(xvalues[beg:end], dvalues[beg:end]) 114 | plt.title("outlier score") 115 | plt.show() 116 | 117 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt))) 118 | plt.plot(xvalues, dvalues, "b") 119 | ofilt = lambda r : r[0] == mid and r[4] == "O" 120 | oxvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, ofilt))) 121 | for t in oxvalues: 122 | plt.axvline(t, 0, .9, color="r") 123 | plt.title("outliers") 124 | plt.show() 125 | 126 | 127 | else: 128 | exitWithMsg("ivalid command") 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/OutlierPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 
16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.FileInputStream; 21 | import java.util.Map; 22 | import java.util.Properties; 23 | 24 | import backtype.storm.Config; 25 | import backtype.storm.StormSubmitter; 26 | import backtype.storm.task.OutputCollector; 27 | import backtype.storm.task.TopologyContext; 28 | import backtype.storm.topology.OutputFieldsDeclarer; 29 | import backtype.storm.topology.TopologyBuilder; 30 | import backtype.storm.topology.base.BaseRichBolt; 31 | import backtype.storm.tuple.Fields; 32 | import backtype.storm.tuple.Tuple; 33 | 34 | /** 35 | * Storm topolgy driver for outlier detection 36 | * @author pranab 37 | * 38 | */ 39 | public class OutlierPredictor { 40 | 41 | /** 42 | * @author pranab 43 | * 44 | */ 45 | public static class PredictorBolt extends BaseRichBolt { 46 | private OutputCollector collector; 47 | private ModelBasedPredictor predictor; 48 | 49 | /* (non-Javadoc) 50 | * @see backtype.storm.task.IBolt#prepare(java.util.Map, backtype.storm.task.TopologyContext, backtype.storm.task.OutputCollector) 51 | */ 52 | public void prepare(Map stormConf, TopologyContext context, 53 | OutputCollector collector) { 54 | this.collector = collector; 55 | String strategy = stormConf.get("predictor.model").toString(); 56 | if (strategy.equals("mm")){ 57 | predictor = new MarkovModelPredictor(stormConf); 58 | } 59 | } 60 | 61 | /* (non-Javadoc) 62 | * @see backtype.storm.task.IBolt#execute(backtype.storm.tuple.Tuple) 63 | */ 64 | public void execute(Tuple input) { 65 | String entityID = input.getString(0); 66 | String record = input.getString(1); 67 | double score = predictor.execute( entityID, record); 68 | 69 | //write score to db 70 | 71 | //ack 72 | collector.ack(input); 73 | } 74 | 75 | @Override 76 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 77 | 78 | } 79 | 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | String topologyName = args[0]; 84 | String configFilePath = args[1]; 85 | 86 | FileInputStream fis = new FileInputStream(configFilePath); 87 | Properties configProps = new Properties(); 88 | configProps.load(fis); 89 | 90 | //intialize config 91 | Config conf = new Config(); 92 | conf.setDebug(true); 93 | for (Object key : configProps.keySet()){ 94 | String keySt = key.toString(); 95 | String val = configProps.getProperty(keySt); 96 | conf.put(keySt, val); 97 | } 98 | 99 | //spout 100 | TopologyBuilder builder = new TopologyBuilder(); 101 | int spoutThreads = Integer.parseInt(configProps.getProperty("predictor.spout.threads")); 102 | builder.setSpout("predictorSpout", new PredictorSpout(), spoutThreads); 103 | 104 | //detector bolt 105 | int boltThreads = Integer.parseInt(configProps.getProperty("predictor.bolt.threads")); 106 | builder.setBolt("predictor", new PredictorBolt(), boltThreads) 107 | .fieldsGrouping("predictorSpout", new Fields("entityID")); 108 | 109 | //submit topology 110 | int numWorkers = Integer.parseInt(configProps.getProperty("num.workers")); 111 | conf.setNumWorkers(numWorkers); 112 | StormSubmitter.submitTopology(topologyName, conf, builder.createTopology()); 113 | 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /python/app/olss.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use 
this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | import time 21 | import math 22 | import statistics 23 | import ntpath 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | sys.path.append(os.path.abspath("../mlextra")) 27 | from util import * 28 | from mlutil import * 29 | from mcsim import * 30 | 31 | """ 32 | Statistical test for outlier score to determine suitable score threshold 33 | """ 34 | 35 | def getKeyedOlScores(dirPath, keyLen): 36 | ''' 37 | extracts outlier score from spark output files 38 | ''' 39 | filePaths = getAllFiles(dirPath) 40 | scores = dict() 41 | if keyLen == 0: 42 | kstr = "all" 43 | for fpath in filePaths: 44 | fname = ntpath.basename(fpath) 45 | if fname.startswith("part"): 46 | print("processing {}".format(fpath)) 47 | for rec in fileRecGen(fpath, ","): 48 | if keyLen > 0: 49 | kstr = ",".join(rec[0:keyLen]) 50 | score = float(rec[-2]) 51 | vl = scores.get(kstr) 52 | if vl is None: 53 | vl = list() 54 | scores[kstr] = vl 55 | vl.append(score) 56 | return scores 57 | 58 | def olScoreStat(dirPath, keyLen, shoHist): 59 | """ 60 | upper tail statistic for outlier score 61 | """ 62 | filePaths = getAllFiles(dirPath) 63 | scores = dict() 64 | if keyLen == 0: 65 | kstr = "all" 66 | for fpath in filePaths: 67 | fname = ntpath.basename(fpath) 68 | if fname.startswith("part"): 69 | print("processing {}".format(fpath)) 70 | for rec in fileRecGen(fpath, ","): 71 | if keyLen > 0: 72 | kstr = ",".join(rec[0:keyLen]) 73 | score = float(rec[-2]) 74 | vl = scores.get(kstr) 75 | if vl is None: 76 | vl = list() 77 | scores[kstr] = vl 78 | vl.append(score) 79 | 80 | print("outlier score upper tail stats") 81 | sim = MonteCarloSimulator(None,None,None,None) 82 | for kstr, vl in scores.items(): 83 | sim.setOutput(vl) 84 | if shoHist: 85 | sim.drawHist("outlier score", "score", "freq") 86 | stats = sim.getUpperTailStat(0) 87 | print("key: {}".format(kstr)) 88 | for s in stats: 89 | print("{:.3f} {:.3f}".format(s[0], s[1])) 90 | 91 | def olScoreEvStat(dirPath, keyLen, prTh, exPrTh): 92 | """ 93 | extreme value statistic for outlier score 94 | Paper: Anomaly Detection in Streams with Extreme Value Theory by Siffer, 95 | """ 96 | scores = getKeyedOlScores(dirPath, keyLen) 97 | 98 | sim = MonteCarloSimulator(None,None,None,None) 99 | for kstr, vl in scores.items(): 100 | sim.setOutput(vl) 101 | vth = sim.getCritValue(self, prTh) 102 | 103 | #values above threshold 104 | y = list(filter(lambda v : v > vth, vl)) 105 | ymax = max(y) 106 | ymin = min(y) 107 | ymean = statistics.mean(y) 108 | xsmin = -1.0 / ymax 109 | xsmax = 2.0 * (ymean - ymin) / (ymean * ymean) 110 | delta = (xsmax - xsmin) / 100 111 | for xs in floatRange(xsmin, xsmax, delta): 112 | pass 113 | 114 | 115 | 116 | if __name__ == "__main__": 117 | technique = sys.argv[1] 118 | dirPath = sys.argv[2] 119 | keyLen = int(sys.argv[3]) 120 | 121 | if technique == "sttest": 122 | """ outlier score upper tail statistics """ 123 | shoHist = sys.argv[4] == "hist" if len(sys.argv) == 5 else False 124 | 
olScoreStat(dirPath, keyLen, shoHist) 125 | 126 | elif technique == "exvstat": 127 | """ extreme value statistic for outlier score """ 128 | prTh = float(sys.argv[4]) 129 | exPrTh = float(sys.argv[5]) 130 | olScoreEvStat(dirPath, keyLen, prTh, exPrTh) 131 | else: 132 | exitWithMsg("invalid technique") 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/MahalanobisDistancePredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.chombo.math.MathUtils; 24 | import org.chombo.stats.MultiVariateStatsManager; 25 | import org.chombo.util.BasicUtils; 26 | import org.chombo.util.ConfigUtility; 27 | 28 | import Jama.Matrix; 29 | 30 | 31 | /** 32 | * Predictor based on Mahalanobis distance for multi variate data 33 | * @author pranab 34 | * 35 | */ 36 | public class MahalanobisDistancePredictor extends ModelBasedPredictor { 37 | private MultiVariateStatsManager statsManager; 38 | 39 | /** 40 | * @param config 41 | * @param idOrdinalsParam 42 | * @param attrListParam 43 | * @param fieldDelimParam 44 | * @param statsFilePathParam 45 | * @param seasonalParam 46 | * @param hdfsFileParam 47 | * @param scoreThresholdParam 48 | * @param expConstParam 49 | * @param ignoreMissingStatParam 50 | * @param scoreAggggregationStrtaegyParam 51 | * @throws IOException 52 | */ 53 | public MahalanobisDistancePredictor(Map config, String idOrdinalsParam, String attrListParam, 54 | String fieldDelimParam, String statsFilePathParam, String seasonalParam,String hdfsFileParam, 55 | String scoreThresholdParam, String expConstParam, String ignoreMissingStatParam) 56 | throws IOException { 57 | idOrdinals = ConfigUtility.getIntArray(config, idOrdinalsParam); 58 | attrOrdinals = ConfigUtility.getIntArray(config, attrListParam); 59 | fieldDelim = ConfigUtility.getString(config, fieldDelimParam, ","); 60 | 61 | String statsFilePath = ConfigUtility.getString(config, statsFilePathParam); 62 | boolean hdfsFilePath = ConfigUtility.getBoolean(config, hdfsFileParam); 63 | seasonal = ConfigUtility.getBoolean(config, seasonalParam); 64 | statsManager = new MultiVariateStatsManager(statsFilePath, fieldDelim, hdfsFilePath); 65 | scoreThreshold = ConfigUtility.getDouble(config, scoreThresholdParam); 66 | realTimeDetection = true; 67 | expConst = ConfigUtility.getDouble(config, expConstParam); 68 | ignoreMissingStat = ConfigUtility.getBoolean(config, ignoreMissingStatParam); 69 | } 70 | 71 | @Override 72 | public double execute(String entityID, String record) { 73 | // TODO Auto-generated method stub 74 | return 0; 75 | } 76 | 77 | @Override 78 | public double execute(String[] items, String compKey) { 79 | double 
score = 0; 80 | if (statsManager.statsExists(compKey)) { 81 | //extract input vector and subtract mean vector 82 | double[] data = BasicUtils.extractFieldsAsDoubleArray(items , attrOrdinals); 83 | Matrix input = MathUtils.createRowMatrix(data); 84 | Matrix inputOffset = MathUtils.subtractMatrix(input, statsManager.getMeanVec(compKey)); 85 | Matrix inputOffsetTr = MathUtils.transposeMatrix(inputOffset); 86 | 87 | 88 | //mahalanobis distance 89 | Matrix invCovar = statsManager.getInvCoVarMatrix(compKey); 90 | Matrix maDist = MathUtils.multiplyMatrix(inputOffset, invCovar); 91 | maDist = MathUtils.multiplyMatrix(maDist, inputOffsetTr); 92 | score = MathUtils.scalarFromMatrix(maDist); 93 | } else { 94 | BasicUtils.assertCondition(!ignoreMissingStat, "missing stats for key " + compKey ); 95 | } 96 | 97 | //exponential normalization 98 | if (expConst > 0) { 99 | score = BasicUtils.expScale(expConst, score); 100 | } 101 | 102 | scoreAboveThreshold = score > scoreThreshold; 103 | return score; 104 | } 105 | 106 | @Override 107 | public boolean isValid(String compKey) { 108 | return statsManager.statsExists(compKey); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedMetaProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | /** 29 | * Based on probability of probability p(f(y) < f(x)). 
f(x) is density function 30 | * @author pranab 31 | * 32 | */ 33 | public class EstimatedMetaProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 34 | 35 | public EstimatedMetaProbabilityBasedPredictor(Map conf) { 36 | super(conf); 37 | } 38 | 39 | /** 40 | * @param config 41 | * @param idOrdinalsParam 42 | * @param attrListParam 43 | * @param distrFilePathParam 44 | * @param hdfsFileParam 45 | * @param schemaFilePathParam 46 | * @param attrWeightParam 47 | * @param seasonalParam 48 | * @param fieldDelimParam 49 | * @param scoreThresholdParam 50 | * @param ignoreMissingDistrParam 51 | * @throws IOException 52 | */ 53 | public EstimatedMetaProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 54 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 55 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 56 | String scoreStrategyParam, String expConstParam, String scoreAggggregationStrtaegyParam) 57 | throws IOException { 58 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 59 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, scoreStrategyParam, expConstParam, 60 | scoreAggggregationStrtaegyParam); 61 | } 62 | 63 | /** 64 | * @param config 65 | * @param distrFilePathParam 66 | * @param attrWeightParam 67 | * @param scoreThresholdParam 68 | * @param fieldDelimParam 69 | * @throws IOException 70 | */ 71 | public EstimatedMetaProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 72 | String scoreThresholdParam, String fieldDelimParam) 73 | throws IOException { 74 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 75 | } 76 | 77 | @Override 78 | public double execute(String[] items, String compKey) { 79 | double score = 0; 80 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 81 | double thisScore = 0; 82 | for (int ord : attrOrdinals) { 83 | String keyWithFldOrd = compKey + fieldDelim + ord; 84 | double val = Double.parseDouble(items[ord]); 85 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 86 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 87 | if (null != hist) { 88 | double distr = hist.findMetaDistr(val); 89 | if (scoreStrategy.equals("inverse")) { 90 | thisScore = 1.0 - distr; 91 | } else { 92 | if (distr > 0) { 93 | thisScore = -Math.log(distr); 94 | } else { 95 | thisScore = 20.0; 96 | } 97 | } 98 | scoreAggregator.addScore(thisScore); 99 | } else { 100 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 101 | scoreAggregator.addScore(); 102 | } 103 | } 104 | //aggregate score 105 | score = getAggregateScore(scoreAggregator); 106 | 107 | //exponential normalization 108 | if (expConst > 0) { 109 | score = BasicUtils.expScale(expConst, score); 110 | } 111 | 112 | scoreAboveThreshold = score > scoreThreshold; 113 | return score; 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /python/app/bls.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in 
compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | from random import randint 21 | import time 22 | import uuid 23 | import threading 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | from util import * 27 | from sampler import * 28 | 29 | def createAnomaly(high): 30 | if high: 31 | reading = randomFloat(120, 200) 32 | else: 33 | reading = randomFloat(60, 80) 34 | return reading 35 | 36 | if __name__ == "__main__": 37 | op = sys.argv[1] 38 | 39 | #device stats 40 | if op == "stat": 41 | #normal mean 80 - 100 sd 1 - 5 42 | #anomaly mean 120 - 160 sd 1 - 5 43 | numDevs = int(sys.argv[2]) 44 | mmin = int(sys.argv[3]) 45 | mmax = int(sys.argv[4]) 46 | smin = int(sys.argv[5]) 47 | smax = int(sys.argv[6]) 48 | for i in range(numDevs): 49 | mean = randomFloat(mmin, mmax) 50 | sd = randomFloat(smin, smax) 51 | devId = genID(12) 52 | #print "%s,%.3f,%.3f" %(devId, mean, sd) 53 | print("{},{:.3f},{:.3f}".format(devId, mean, sd)) 54 | 55 | #generate reading 56 | elif op == "gen": 57 | statFile = sys.argv[2] 58 | numDays = int(sys.argv[3]) 59 | modeNorm = (sys.argv[4] == "normal") 60 | 61 | devices = [] 62 | for rec in fileRecGen(statFile, ","): 63 | ds = (rec[0], float(rec[1]), float(rec[2])) 64 | devices.append(ds) 65 | 66 | 67 | numDevs = len(devices) 68 | distrs = list(map(lambda d: GaussianRejectSampler(d[1],d[2]), devices)) 69 | 70 | curTime = int(time.time()) 71 | pastTime = curTime - (numDays + 1) * secInDay 72 | pastTime = (pastTime / secInDay) * secInDay + secInHour * 15 73 | sampTime = pastTime 74 | sampIntv = secInDay 75 | 76 | anm = dict() 77 | anmDesc = dict() 78 | while(sampTime < curTime): 79 | for i in range(numDevs): 80 | d = devices[i] 81 | did = d[0] 82 | ts = sampTime + randint(-1000, 1000) 83 | sampled = False 84 | anomalyRate = 10 if (modeNorm) else 20 85 | if isEventSampled(anomalyRate): 86 | if not did in anm: 87 | #create anomaly 88 | high = isEventSampled(80) 89 | reading = createAnomaly(high) 90 | appendKeyedList(anm, did, reading) 91 | length = randint(1, 2) if(modeNorm) else randint(3, 7) 92 | desc = (length, high) 93 | anmDesc[did] = desc 94 | sampled = True 95 | #print "**** anomaly created %s, %d" %(did, reading) 96 | 97 | if not sampled: 98 | if did in anm: 99 | # ongoing anomaly 100 | ans = anm[did] 101 | desc = anmDesc[did] 102 | towardsNorm = len(ans) == desc[0] 103 | an = ans[0] 104 | if len(ans) == desc[0]: 105 | # moving toward normal from anomaly 106 | if isEventSampled(60): 107 | sampled = True 108 | reading = 0.85 * an if(desc[1]) else 1.15 * an 109 | #print "**** moving back to normal %s, %d" %(did, reading) 110 | del anm[did] 111 | del anmDesc[did] 112 | elif len(ans) < desc[0]: 113 | # continue anomaly 114 | reading = createAnomaly(desc[1]) 115 | appendKeyedList(anm, did, reading) 116 | sampled = True 117 | #print "**** anomaly continued %s, %d" %(did, reading) 118 | 119 | if not sampled: 120 | # normal 121 | reading = distrs[i].sample() 122 | 123 | #print "%s,%d,%d" %(did, ts, int(reading)) 124 | print("{},{},{}".format(did, ts, int(reading))) 125 | 
sampTime += sampIntv 126 | 127 | elif op == "oplot": 128 | #plot outliers 129 | fpath = sys.argv[2] 130 | mid = sys.argv[3] 131 | filt = lambda r : r[0] == mid 132 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt))) 133 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt))) 134 | plt.plot(xvalues, dvalues) 135 | plt.title("outlier score") 136 | plt.show() 137 | -------------------------------------------------------------------------------- /python/app/cpu_usage.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import os 4 | import sys 5 | from random import randint 6 | import time 7 | import uuid 8 | import threading 9 | sys.path.append(os.path.abspath("../lib")) 10 | from util import * 11 | from sampler import * 12 | 13 | op = sys.argv[1] 14 | secInHour = 60 * 60 15 | secInDay = 24 * secInHour 16 | secInWeek = 7 * secInDay 17 | secInYear = 365 * secInDay 18 | 19 | wkDayMean = 60 20 | wkDayStdDev = 12 21 | wkEndMean = 30 22 | wkEndStdDev = 8 23 | 24 | if op == "usage": 25 | numDays = int(sys.argv[2]) 26 | sampIntv = int(sys.argv[3]) 27 | numServers = int(sys.argv[4]) 28 | 29 | outDayInWeek = True 30 | s = 5 31 | if len(sys.argv) > 5: 32 | #print(sys.argv[5]) 33 | if sys.argv[5] == "false" or sys.argv[5] == "f": 34 | outDayInWeek = False 35 | s = 6 36 | 37 | serverList = None 38 | if len(sys.argv) > s: 39 | #server ID from stats file 40 | sfile = sys.argv[s] 41 | #print(sfile) 42 | servers = set() 43 | for rec in fileRecGen(sfile, ","): 44 | #print(rec[0]) 45 | servers.add(rec[0]) 46 | serverList = list(servers) 47 | else: 48 | #generate server ID 49 | serverList = list() 50 | for i in range(numServers): 51 | serverList.append(genID(10)) 52 | 53 | curTime = int(time.time()) 54 | pastTime = curTime - (numDays + 1) * secInDay 55 | sampTime = pastTime 56 | usageDistr = [GaussianRejectSampler(wkDayMean,wkDayStdDev), GaussianRejectSampler(wkEndMean,wkEndStdDev)] 57 | 58 | while(sampTime < curTime): 59 | secIntoDay = sampTime % secInDay 60 | #hourIntoDay = secIntoDay / secInHour 61 | 62 | secIntoWeek = sampTime % secInWeek 63 | daysIntoWeek = int(secIntoWeek / secInDay) 64 | 65 | if daysIntoWeek >= 0 and daysIntoWeek <= 4: 66 | distr = usageDistr[0] 67 | else: 68 | distr = usageDistr[1] 69 | 70 | for server in serverList: 71 | usage = distr.sample() 72 | if (usage < 0): 73 | usage = 5 74 | elif usage > 100: 75 | usage = 100 76 | usage = int(usage) 77 | st = sampTime + randint(-2,2) 78 | if outDayInWeek: 79 | #print "%s,%d,%d,%d" %(server, st, daysIntoWeek, usage) 80 | print("{},{},{},{}".format(server, st, daysIntoWeek, usage)) 81 | else: 82 | #print "%s,%d,%d" %(server, st, usage) 83 | print("{},{},{}".format(server, st, usage)) 84 | 85 | sampTime = sampTime + sampIntv 86 | 87 | elif op == "anomaly": 88 | fileName = sys.argv[2] 89 | count = 0 90 | for rec in fileRecGen(fileName, ","): 91 | if isEventSampled(8): 92 | dow = int(rec[2]) 93 | if dow < 5: 94 | rec[3] = str(randint(94, 100)) 95 | else: 96 | rec[3] = str(randint(54, 100)) 97 | count += 1 98 | mrec = ",".join(rec) 99 | print(mrec) 100 | #print "num of anomalous records " + str(count) 101 | 102 | elif op == "feedback": 103 | fileName = sys.argv[2] 104 | curThreshold = float(sys.argv[3]) 105 | newThreshold = float(sys.argv[4]) 106 | margin = curThreshold + 0.6 * (newThreshold - curThreshold) 107 | count = 0 108 | for rec in fileRecGen(fileName, ","): 109 | score = float(rec[4]) 110 | label = rec[5] 111 | if newThreshold > 
curThreshold: 112 | #false positive 113 | if label == "O": 114 | if score > newThreshold: 115 | flabel = "O" 116 | cl = "T" 117 | else: 118 | if score < margin or isEventSampled(90): 119 | flabel = "N" 120 | cl = "F" 121 | count += 1 122 | else: 123 | flabel = "O" 124 | cl = "T" 125 | else: 126 | flabel = "N" 127 | cl = "F" 128 | else: 129 | #false negative 130 | if label == "O": 131 | flabel = "O" 132 | cl = "T" 133 | else: 134 | if score > newThreshold: 135 | if score > margin or isEventSampled(90): 136 | flabel = "O" 137 | cl = "T" 138 | count += 1 139 | else: 140 | flabel = "N" 141 | cl = "F" 142 | else: 143 | flabel = "N" 144 | cl = "F" 145 | rec.append(flabel) 146 | rec.append(cl) 147 | mrec = ",".join(rec) 148 | print(mrec) 149 | #print count 150 | 151 | elif op == "addTrend": 152 | fileName = sys.argv[2] 153 | trendYearlyPercentRate = float(sys.argv[3]) 154 | trendPerSec = trendYearlyPercentRate / secInYear 155 | start = None 156 | for rec in fileRecGen(fileName, ","): 157 | ts = int(rec[1]) 158 | usage = float(rec[3]) 159 | if start is None: 160 | start = ts 161 | else: 162 | usage = usage + (ts - start) * trendPerSec 163 | usageStr = "%.3f" %(usage) 164 | rec[3] = usageStr 165 | mrec = ",".join(rec) 166 | print(mrec) 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/PseudoRelevanceThresholdFinder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | 28 | /** 29 | * Finds score threshold based on pseudo relevance, e.g. 
top n or top n percent 30 | * @author pranab 31 | * 32 | */ 33 | object PseudoRelevanceThresholdFinder extends JobConfiguration { 34 | /** 35 | * @param args 36 | * @return 37 | */ 38 | def main(args: Array[String]) { 39 | val appName = "outlierCounter" 40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 41 | val config = createConfig(configFile) 42 | val sparkConf = createSparkConf(appName, config, false) 43 | val sparkCntxt = new SparkContext(sparkConf) 44 | val appConfig = config.getConfig(appName) 45 | 46 | //configuration params 47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length") 50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 51 | val relevanceThreshold = getMandatoryDoubleParam(appConfig, "relevance.threshold", "missing relevance threshold") 52 | val relevanceAsPercentage = getBooleanParamOrElse(appConfig, "relevance.asPercentage", true) 53 | val minSampleCount = getMandatoryIntParam(appConfig, "sample.minCount", "missing min sample count") 54 | val thresholdPath = getMandatoryStringParam(appConfig, "threshold.filePath", "missing stat file path") 55 | val thresholdMap = BasicUtils.getKeyedValues(thresholdPath, keyLen, keyLen) 56 | val defaultThreshold = getMandatoryDoubleParam(appConfig, "threshold.default", "missing default threshold") 57 | val debugOn = appConfig.getBoolean("debug.on") 58 | val saveOutput = appConfig.getBoolean("save.output") 59 | 60 | //input 61 | val data = sparkCntxt.textFile(inputPath) 62 | 63 | val keyedThresholds = data.map(line => { 64 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 65 | val keyRec = Record(items, 0, keyLen) 66 | val last = items.length - 1 67 | val score = items(last -1).toDouble 68 | (keyRec, score) 69 | }).groupByKey.map(r => { 70 | val key = r._1 71 | val scores = r._2.toList 72 | val sortedScores = scores.sortWith((v1,v2) => v1 > v2) 73 | val size = sortedScores.length 74 | val threshold = 75 | if (size > minSampleCount) { 76 | //threshold as average of 3 scores around the relevance cutoff index 77 | val thresholdIndex = 78 | if (relevanceAsPercentage) { 79 | ((size * relevanceThreshold) / 100).toInt - 1 80 | } else { 81 | val indx = relevanceThreshold.toInt - 1 82 | if (indx > size-2) { 83 | throw new IllegalStateException("absolute threshold value too big") 84 | } 85 | indx 86 | } 87 | sortedScores.slice(thresholdIndex - 1, thresholdIndex + 2).sum / 3 88 | } else { 89 | //use existing threshold or default 90 | val keyStr = key.toString(fieldDelimOut) 91 | if (thresholdMap.containsKey(keyStr)) thresholdMap.get(keyStr).toDouble 92 | else defaultThreshold 93 | } 94 | key.toString(fieldDelimOut) + fieldDelimOut + BasicUtils.formatDouble(threshold, precision) 95 | }) 96 | 97 | if (debugOn) { 98 | val records = keyedThresholds.collect.slice(0, 20) 99 | records.foreach(r => println(r)) 100 | } 101 | 102 | if(saveOutput) { 103 | keyedThresholds.saveAsTextFile(outputPath) 104 | } 105 | 106 | } 107 | } -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/DataStreamSchema.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the 
License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.io.FileInputStream; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.Serializable; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | import org.chombo.util.BasicUtils; 28 | import org.codehaus.jackson.annotate.JsonIgnoreProperties; 29 | import org.codehaus.jackson.map.ObjectMapper; 30 | 31 | @JsonIgnoreProperties(ignoreUnknown = true) 32 | public class DataStreamSchema implements Serializable { 33 | private List<DataStream> dataStreams; 34 | 35 | /** 36 | * 37 | */ 38 | public DataStreamSchema() { 39 | } 40 | 41 | /** 42 | * @return 43 | */ 44 | public List<DataStream> getDataStreams() { 45 | return dataStreams; 46 | } 47 | 48 | /** 49 | * @param dataStreams 50 | */ 51 | public void setDataStreams(List<DataStream> dataStreams) { 52 | this.dataStreams = dataStreams; 53 | } 54 | 55 | /** 56 | * @param type 57 | * @return 58 | */ 59 | public DataStream findByType(String type) { 60 | DataStream stream = null; 61 | for (DataStream daStrm : dataStreams) { 62 | if (daStrm.getType().equals(type)) { 63 | stream = daStrm; 64 | break; 65 | } 66 | } 67 | return stream; 68 | } 69 | 70 | /** 71 | * @param type 72 | * @return 73 | */ 74 | public List<DataStream> findAllByType(String type) { 75 | List<DataStream> streams = new ArrayList<DataStream>(); 76 | for (DataStream daStrm : dataStreams) { 77 | if (daStrm.getType().equals(type)) { 78 | streams.add(daStrm); 79 | } 80 | } 81 | return streams; 82 | } 83 | 84 | /** 85 | * @param type 86 | * @return 87 | */ 88 | public DataStream findByTypeAndId(String type, String id) { 89 | DataStream stream = null; 90 | for (DataStream daStrm : dataStreams) { 91 | if (daStrm.getId().equals("*")) { 92 | if (daStrm.getType().equals(type)) { 93 | boolean done = false; 94 | List<DataStream> parents = findAllByType(daStrm.getParentType()); 95 | for (DataStream pa : parents) { 96 | List<String> children = pa.getChildrenId(); 97 | BasicUtils.assertNotNull(children, "missing child ID list in parent"); 98 | if (children.contains(id)) { 99 | BasicUtils.assertCondition(daStrm.getParentId().equals(pa.getId()), "mismatched parent ID"); 100 | stream = daStrm; 101 | done = true; 102 | break; 103 | } 104 | } 105 | if (done) 106 | break; 107 | } 108 | } else { 109 | if (daStrm.getType().equals(type) && daStrm.getId().equals(id)) { 110 | stream = daStrm; 111 | break; 112 | } 113 | } 114 | } 115 | return stream; 116 | } 117 | 118 | /** 119 | * @param type 120 | * @param id 121 | * @return 122 | */ 123 | public DataStream findParent(String type, String id) { 124 | DataStream parentStream = null; 125 | DataStream stream = findByType(type); 126 | BasicUtils.assertNotNull(stream, "could not find data stream object"); 127 | parentStream = findByType(stream.getParentType()); 128 | if (!parentStream.isSingleton()) { 129 | //instance based 130 | stream = findByTypeAndId(type, id); 131 | parentStream = findByTypeAndId(stream.getParentType(), stream.getParentId()); 132 | } 133 | return parentStream; 134 | } 135 | 136 | /** 137 | * @param type 138 | * @return 139 | */ 140 | public String findParentType(String type) { 
141 | DataStream stream = findByType(type); 142 | BasicUtils.assertNotNull(stream, "coud not find data stream object"); 143 | return stream.getParentType(); 144 | } 145 | 146 | /** 147 | * @param path 148 | * @return 149 | * @throws IOException 150 | */ 151 | public static DataStreamSchema loadDataStreamSchema(String path) throws IOException { 152 | InputStream fs = new FileInputStream(path); 153 | ObjectMapper mapper = new ObjectMapper(); 154 | DataStreamSchema schema = mapper.readValue(fs, DataStreamSchema.class); 155 | return schema; 156 | } 157 | 158 | } 159 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/InterPercentileDifferenceBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | /** 29 | * Inter percentile difference (25% and 75%) based predictor 30 | * @author pranab 31 | * 32 | */ 33 | public class InterPercentileDifferenceBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 34 | private static final int QUARTER_PERECENTILE = 25; 35 | private static final int THREE_QUARTER_PERECENTILE = 75; 36 | 37 | /** 38 | * @param conf 39 | */ 40 | public InterPercentileDifferenceBasedPredictor(Map conf) { 41 | super(conf); 42 | } 43 | 44 | /** 45 | * @param config 46 | * @param idOrdinalsParam 47 | * @param attrListParam 48 | * @param distrFilePathParam 49 | * @param hdfsFileParam 50 | * @param schemaFilePathParam 51 | * @param attrWeightParam 52 | * @param seasonalParam 53 | * @param fieldDelimParam 54 | * @param scoreThresholdParam 55 | * @param ignoreMissingDistrParam 56 | * @throws IOException 57 | */ 58 | public InterPercentileDifferenceBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 59 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 60 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 61 | String expConstParam, String scoreAggggregationStrtaegyParam) 62 | throws IOException { 63 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 64 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", expConstParam, 65 | scoreAggggregationStrtaegyParam); 66 | } 67 | 68 | /** 69 | * @param config 70 | * @param distrFilePathParam 71 | * @param attrWeightParam 72 | * @param scoreThresholdParam 73 | * @param fieldDelimParam 74 | * @throws IOException 75 | */ 
76 | public InterPercentileDifferenceBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 77 | String scoreThresholdParam, String fieldDelimParam) 78 | throws IOException { 79 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 80 | } 81 | 82 | /* (non-Javadoc) 83 | * @see org.beymani.predictor.EsimatedAttrtibuteProbabilityBasedPredictor#execute(java.lang.String[], java.lang.String) 84 | */ 85 | @Override 86 | public double execute(String[] items, String compKey) { 87 | double score = 0; 88 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 89 | double thisScore = 0; 90 | for (int ord : attrOrdinals) { 91 | String keyWithFldOrd = compKey + fieldDelim + ord; 92 | double val = Double.parseDouble(items[ord]); 93 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 94 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 95 | if (null != hist) { 96 | double quarterPercentile = hist.getQuantile(QUARTER_PERECENTILE); 97 | double threeQuarterPercentile = hist.getQuantile(THREE_QUARTER_PERECENTILE); 98 | double percentileDiff = threeQuarterPercentile - quarterPercentile; 99 | if (val < quarterPercentile) { 100 | thisScore = (quarterPercentile - val) / percentileDiff; 101 | } else if (val > threeQuarterPercentile){ 102 | thisScore = (val - threeQuarterPercentile) / percentileDiff; 103 | } 104 | scoreAggregator.addScore(thisScore); 105 | } else { 106 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 107 | scoreAggregator.addScore(); 108 | } 109 | } 110 | //aggregate score 111 | score = getAggregateScore(scoreAggregator); 112 | 113 | //exponential normalization 114 | if (expConst > 0) { 115 | score = BasicUtils.expScale(expConst, score); 116 | } 117 | 118 | scoreAboveThreshold = score > scoreThreshold; 119 | return score; 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/OutlierScoreLevelShift.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 
16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import scala.Array.canBuildFrom 21 | import scala.collection.JavaConverters._ 22 | import org.apache.spark.SparkContext 23 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 24 | import org.chombo.spark.common.GeneralUtility 25 | import org.chombo.spark.common.JobConfiguration 26 | import org.chombo.spark.common.Record 27 | import org.chombo.util.BasicUtils 28 | import org.hoidla.window.SizeBoundFloatStatsWindow 29 | 30 | /** 31 | * Outlier detection based on level shift outlier score from any algorithm 32 | * @author pranab 33 | */ 34 | object OutlierScoreLevelShift extends JobConfiguration with GeneralUtility { 35 | 36 | /** 37 | * @param args 38 | * @return 39 | */ 40 | def main(args: Array[String]) { 41 | val appName = "outlierScoreLevelShift" 42 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 43 | val config = createConfig(configFile) 44 | val sparkConf = createSparkConf(appName, config, false) 45 | val sparkCntxt = new SparkContext(sparkConf) 46 | val appConfig = config.getConfig(appName) 47 | 48 | //configuration params 49 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 50 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 51 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal") 52 | val keyLen = getMandatoryIntParam(appConfig, "key.length", "missing key length") 53 | val longWindowSize = getMandatoryIntParam(appConfig, "window.longSize", "missing long window size") 54 | val shortWindowSize = getMandatoryIntParam(appConfig, "window.shortSize", "missing short window size") 55 | val minZscore = getMandatoryDoubleParam(appConfig, "zscore.min", "missing min z score") 56 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false) 57 | val saveOutput = getBooleanParamOrElse(appConfig,"save.output", true) 58 | 59 | //input 60 | val data = sparkCntxt.textFile(inputPath) 61 | 62 | val taggedData = data.map(line => { 63 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 64 | val key = Record(items, 0, keyLen) 65 | (key, items) 66 | }).groupByKey.flatMap(r => { 67 | val longWindow = new SizeBoundFloatStatsWindow(longWindowSize) 68 | val shortWindow = new SizeBoundFloatStatsWindow(shortWindowSize) 69 | val values = r._2.toArray.sortBy(v => { 70 | v(seqFieldOrd).toLong 71 | }) 72 | val newTags = values.map(v => { 73 | val score = v(v.size - 2).toDouble 74 | val tag = v(v.size - 1) 75 | longWindow.add(score) 76 | shortWindow.add(score) 77 | var newTag = "" 78 | if (longWindow.isFull()) { 79 | val loMean = longWindow.getMean() 80 | val loStdDev = longWindow.getStdDev() 81 | val shMean = shortWindow.getMean() 82 | val levelBasedScore = (shMean - loMean) / loStdDev; 83 | newTag = if (levelBasedScore > minZscore) "O" else "N" 84 | } else { 85 | newTag = tag 86 | } 87 | val rec = Record(2) 88 | rec.add(tag,newTag) 89 | }) 90 | 91 | //propagate outlier tag 92 | for (i <- longWindowSize to newTags.length -1) { 93 | if(newTags(i).getString(1) == "O") { 94 | for (j <- i - shortWindowSize + 1 to i - 1) { 95 | val tag = if (newTags(j).getString(0) == "I") "I" else "O" 96 | val rec = Record(2) 97 | rec.add(newTags(j).getString(0), tag) 98 | newTags(j) = rec 99 | } 100 | } 101 | } 102 | 103 | val recValues = values.map(v => Record(v)) 104 | newTags.zip(recValues).map(r => { 105 | val newTag = r._1.getString(1) 106 | val rec = r._2.getString(0) 107 | rec + fieldDelimOut + newTag 
108 | }) 109 | }) 110 | 111 | if (debugOn) { 112 | val records = taggedData.collect 113 | records.slice(0, 100).foreach(r => println(r)) 114 | } 115 | 116 | if(saveOutput) { 117 | taggedData.saveAsTextFile(outputPath) 118 | } 119 | 120 | } 121 | } -------------------------------------------------------------------------------- /resource/and_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "crInput") 11 | echo "args: num_of_days time_interval(sec) num_of_servers output_file" 12 | ./cpu_usage.py usage $2 $3 $4 true > $5 13 | ls -l $5 14 | ;; 15 | 16 | "crTestInput") 17 | ./cpu_usage.py usage $2 $3 $4 true $5 > $6 18 | ls -l $6 19 | ;; 20 | 21 | "insOutliers") 22 | echo "args: normal_data_file output_file" 23 | ./cpu_usage.py anomaly $2 > $3 24 | ls -l $3 25 | ;; 26 | 27 | "cpModData") 28 | echo "args: modeling_data_file " 29 | rm $PROJECT_HOME/bin/beymani/input/olp/* 30 | rm $PROJECT_HOME/bin/beymani/nas/olp/* 31 | cp $2 $PROJECT_HOME/bin/beymani/input/nas/ 32 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/ 33 | ls -l $PROJECT_HOME/bin/beymani/input/nas 34 | ls -l $PROJECT_HOME/bin/beymani/input/olp 35 | ;; 36 | 37 | "cpTestData") 38 | echo "args: test_data_file " 39 | rm $PROJECT_HOME/bin/beymani/input/olp/* 40 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/ 41 | ls -l $PROJECT_HOME/bin/beymani/input/olp 42 | ;; 43 | 44 | "numStat") 45 | echo "running NumericalAttrStats Spark job" 46 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/nas/cusage.txt 48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/nas 49 | rm -rf ./output/nas 50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf 52 | ls -l $PROJECT_HOME/bin/beymani/output/nas/ 53 | ;; 54 | 55 | "crStatsFile") 56 | echo "copying and consolidating stats file" 57 | rm $PROJECT_HOME/bin/beymani/output/nas/_SUCCESS 58 | SFILE=$PROJECT_HOME/bin/beymani/other/olp/stats.txt 59 | cp /dev/null $SFILE 60 | for f in $PROJECT_HOME/bin/beymani/output/nas/* 61 | do 62 | echo "Copying file $f ..." 
63 | cat $f >> $SFILE 64 | done 65 | ls -l $PROJECT_HOME/bin/beymani/other/olp 66 | ;; 67 | 68 | "olPred") 69 | echo "running StatsBasedOutlierPredictor Spark job" 70 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 71 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/olp/* 72 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/olp 73 | rm -rf ./output/olp 74 | rm -rf ./other/olp/clean 75 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 76 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf 77 | rm ./output/olp/_SUCCESS 78 | for f in ./output/olp/* 79 | do 80 | echo "number of records in $f" 81 | wc -l $f 82 | done 83 | 84 | for f in ./output/olp/* 85 | do 86 | echo "number of outliers in $f" 87 | cat $f | grep ,O | wc -l 88 | done 89 | 90 | ;; 91 | 92 | "crCleanFile") 93 | echo "copying, consolidating and moving clean training data file" 94 | rm $PROJECT_HOME/bin/beymani/other/olp/clean/_SUCCESS 95 | CFILE=$PROJECT_HOME/bin/beymani/other/olp/clean/cusage.txt 96 | cp /dev/null $CFILE 97 | echo "creating clean file $CFILE" 98 | for f in $PROJECT_HOME/bin/beymani/other/olp/clean/* 99 | do 100 | echo "Copying file $f ..." 101 | cat $f >> $CFILE 102 | done 103 | echo "copying clean file to model input directory" 104 | mv $PROJECT_HOME/bin/beymani/input/nas/cusage.txt $PROJECT_HOME/bin/beymani/other/nas/cusage_1.txt 105 | mv $CFILE $PROJECT_HOME/bin/beymani/input/nas/cusage.txt 106 | echo "backing up current model file" 107 | mv $PROJECT_HOME/bin/beymani/other/olp/stats.txt $PROJECT_HOME/bin/beymani/other/olp/stats_1.txt 108 | ls -l $PROJECT_HOME/bin/beymani/input/nas/ 109 | ;; 110 | 111 | 112 | "mvOutlFile") 113 | echo "moving outlier output file" 114 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00000 > $PROJECT_HOME/bin/beymani/other/olp/outl.txt 115 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00001 >> $PROJECT_HOME/bin/beymani/other/olp/outl.txt 116 | ;; 117 | 118 | "thLearn") 119 | echo "running ThresholdLearner Spark job" 120 | CLASS_NAME=org.beymani.spark.common.ThresholdLearner 121 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/thl/olf.txt 122 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/thl 123 | rm -rf ./output/thl 124 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 125 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf 126 | ;; 127 | 128 | "tempAggr") 129 | echo "running TemporalAggregator Spark job" 130 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator 131 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 132 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg 133 | rm -rf ./output/teg 134 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 135 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf 136 | ;; 137 | 138 | 139 | *) 140 | echo "unknown operation $1" 141 | ;; 142 | 143 | esac -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/OutlierCounter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | 28 | /** 29 | * Outlier count statistics 30 | * @author pranab 31 | * 32 | */ 33 | object OutlierCounter extends JobConfiguration { 34 | /** 35 | * @param args 36 | * @return 37 | */ 38 | def main(args: Array[String]) { 39 | val appName = "outlierCounter" 40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 41 | val config = createConfig(configFile) 42 | val sparkConf = createSparkConf(appName, config, false) 43 | val sparkCntxt = new SparkContext(sparkConf) 44 | val appConfig = config.getConfig(appName) 45 | 46 | //configuration params 47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length") 50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 51 | val insertTimeStamp = getBooleanParamOrElse(appConfig, "output.insertTmStmp", false) 52 | val tmStmp = if (insertTimeStamp) System.currentTimeMillis() else 0 53 | val normTag = "N" 54 | val outlierTag = "O" 55 | val indeterTag = "I" 56 | val totalTag = "T" 57 | val debugOn = appConfig.getBoolean("debug.on") 58 | val saveOutput = appConfig.getBoolean("save.output") 59 | 60 | //input 61 | val data = sparkCntxt.textFile(inputPath) 62 | 63 | //key by record key and record status 64 | val keyedCounters = data.flatMap(line => { 65 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 66 | val counters = for (i <- 0 to 1) yield { 67 | val keyRec = Record(keyLen+1, items, 0, keyLen) 68 | if (i == 0) keyRec.addString(items(items.length-1)) 69 | else keyRec.addString(totalTag) 70 | (keyRec, 1) 71 | } 72 | counters 73 | }).reduceByKey((v1,v2) => v1+v2) 74 | 75 | //formatted count statistics for each key 76 | val formattedCountRecs = keyedCounters.map(r => { 77 | val keyRec = Record(r._1, 0, keyLen) 78 | val valRec = Record(2) 79 | valRec.addString(r._1.getString(keyLen)) 80 | valRec.addInt(r._2) 81 | (keyRec, valRec) 82 | }).groupByKey().map(r => { 83 | val key = r._1 84 | val values = r._2.toArray 85 | var outlierCount = 0 86 | var indeterCount = 0 87 | var normCount = 0 88 | var totalCount = 0 89 | for (v <- values) { 90 | v.getString(0) match { 91 | case `outlierTag` => outlierCount = v.getInt(1) 92 | case `indeterTag` => indeterCount = v.getInt(1) 93 | case `normTag` => normCount = v.getInt(1) 94 | case `totalTag` => totalCount = v.getInt(1) 95 | } 96 | } 97 | val outlierPercent = (outlierCount * 100).toDouble / totalCount 98 | val indeterPercent = (indeterCount * 100).toDouble / totalCount 99 | val normPercent = (normCount * 100).toDouble / totalCount 100 | 101 | val stBld = new 
StringBuilder(key.toString(fieldDelimOut)) 102 | if (insertTimeStamp) 103 | stBld.append(fieldDelimOut).append(tmStmp) 104 | stBld. 105 | append(fieldDelimOut).append(outlierCount). 106 | append(fieldDelimOut).append(BasicUtils.formatDouble(outlierPercent, precision)). 107 | append(fieldDelimOut).append(indeterCount). 108 | append(fieldDelimOut).append(BasicUtils.formatDouble(indeterPercent, precision)). 109 | append(fieldDelimOut).append(normCount). 110 | append(fieldDelimOut).append(BasicUtils.formatDouble(normPercent, precision)). 111 | append(fieldDelimOut).append(totalCount) 112 | 113 | stBld.toString() 114 | }) 115 | 116 | if (debugOn) { 117 | val records = formattedCountRecs.collect.slice(0, 20) 118 | records.foreach(r => println(r)) 119 | } 120 | 121 | if(saveOutput) { 122 | formattedCountRecs.saveAsTextFile(outputPath) 123 | } 124 | } 125 | 126 | } -------------------------------------------------------------------------------- /resource/ecomm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "loadInp") 11 | rm $PROJECT_HOME/bin/beymani/input/ecom/$3/* 12 | cp $2 $PROJECT_HOME/bin/beymani/input/ecom/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/ecom/$3/ 14 | ;; 15 | 16 | 17 | "numStat") 18 | echo "running NumericalAttrStats Spark job" 19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/* 21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/stat 22 | rm -rf ./output/ecom/stat 23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf 25 | ;; 26 | 27 | "numMstat") 28 | echo "running NumericalAttrMedian Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/* 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/mstat 32 | rm -rf ./output/ecom/mstat 33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf 35 | rm ./output/ecom/mstat/_SUCCESS 36 | ls -l ./output/ecom/mstat 37 | ;; 38 | 39 | "bkMod") 40 | echo "backing up model files" 41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ecom/mstat/* 42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom 43 | META_FILE=$META_DIR/$2 44 | echo "copying to $META_FILE" 45 | cp /dev/null $META_FILE 46 | for f in $MED_FILES 47 | do 48 | echo "Copying file $f ..." 
49 | cat $f >> $META_FILE 50 | done 51 | ls -l $META_FILE 52 | ;; 53 | 54 | "cpMod") 55 | echo "copying model files files from backup" 56 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom 57 | cp $META_DIR/$2 $META_DIR/ 58 | ls -l $META_DIR 59 | ;; 60 | 61 | "olPred") 62 | echo "running StatsBasedOutlierPredictor Spark job" 63 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 64 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/pred/* 65 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/olp 66 | rm -rf ./output/ecom/olp 67 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 68 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 69 | rm ./output/ecom/olp/_SUCCESS 70 | ls -l ./output/ecom/olp 71 | cat ./output/ecom/olp/part-00000 | grep ,O 72 | ;; 73 | 74 | "chkOl") 75 | echo "number of outliers" 76 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/* 77 | for f in $OUT_FILES 78 | do 79 | echo "checking file $f ..." 80 | wc -l $f 81 | done 82 | ;; 83 | 84 | "bkOut") 85 | echo "backing up outlier output files" 86 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/* 87 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 88 | BK_FILE=$BK_DIR/$2 89 | cp /dev/null $BK_FILE 90 | for f in $OUT_FILES 91 | do 92 | echo "Copying file $f ..." 93 | cat $f >> $BK_FILE 94 | done 95 | ls -l $BK_FILE 96 | ;; 97 | 98 | "rmAggrInp") 99 | echo "removing outlier aggregation input files" 100 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr 101 | rm $IN_DIR/* 102 | ls -l $IN_DIR 103 | ;; 104 | 105 | "loadAggrInp") 106 | echo "copying outlier output files for aggregation" 107 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr/ 108 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 109 | cp $BK_DIR/$2 $IN_DIR 110 | ls -l $IN_DIR 111 | ;; 112 | 113 | 114 | "aggrOl") 115 | echo "running OutlierAggregator Spark job" 116 | CLASS_NAME=org.beymani.spark.common.OutlierAggregator 117 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/aggr/* 118 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/aggr 119 | rm -rf ./output/ecom/aggr 120 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 121 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 122 | rm ./output/ecom/aggr/_SUCCESS 123 | ls -l ./output/ecom/aggr 124 | cat ./output/ecom/aggr/part-00000 | grep ,O 125 | ;; 126 | 127 | 128 | "bkOutAggr") 129 | echo "backing up aggregator output files" 130 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/aggr/* 131 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 132 | BK_FILE=$BK_DIR/$2 133 | cp /dev/null $BK_FILE 134 | for f in $OUT_FILES 135 | do 136 | echo "Copying file $f ..." 
137 | cat $f >> $BK_FILE 138 | done 139 | ls -l $BK_FILE 140 | ;; 141 | 142 | "orpOlPred") 143 | echo "running IsolationForestModel Spark job" 144 | CLASS_NAME=org.beymani.spark.multi.IsolationForestModel 145 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/orp/* 146 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/orp 147 | rm -rf ./output/ecom/orp 148 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 149 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 150 | rm ./output/ecom/orp/_SUCCESS 151 | ls -l ./output/ecom/orp 152 | cat ./output/ecom/orp/part-00000 | grep ,O 153 | ;; 154 | 155 | *) 156 | echo "unknown operation $1" 157 | ;; 158 | 159 | esac -------------------------------------------------------------------------------- /src/main/java/org/beymani/proximity/RelativeDensity.java: -------------------------------------------------------------------------------- 1 | package org.beymani.proximity; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.util.Tool; 17 | import org.apache.hadoop.util.ToolRunner; 18 | import org.apache.log4j.Level; 19 | import org.apache.log4j.Logger; 20 | import org.chombo.util.TextInt; 21 | import org.chombo.util.Tuple; 22 | import org.chombo.util.Utility; 23 | 24 | public class RelativeDensity extends Configured implements Tool { 25 | 26 | @Override 27 | public int run(String[] args) throws Exception { 28 | Job job = new Job(getConf()); 29 | String jobName = "Relative density"; 30 | job.setJobName(jobName); 31 | 32 | job.setJarByClass(RelativeDensity.class); 33 | 34 | FileInputFormat.addInputPath(job, new Path(args[0])); 35 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 36 | 37 | job.setMapperClass(RelativeDensity.DensityMapper.class); 38 | job.setReducerClass(RelativeDensity.DensityReducer.class); 39 | 40 | job.setMapOutputKeyClass(Text.class); 41 | job.setMapOutputValueClass(Tuple.class); 42 | 43 | job.setOutputKeyClass(NullWritable.class); 44 | job.setOutputValueClass(Text.class); 45 | 46 | Utility.setConfiguration(job.getConfiguration()); 47 | 48 | job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1)); 49 | 50 | int status = job.waitForCompletion(true) ? 
0 : 1; 51 | return status; 52 | } 53 | 54 | public static class DensityMapper extends Mapper<LongWritable, Text, Text, Tuple> { 55 | private String fieldDelimRegex; 56 | private String fieldDelim; 57 | private String[] items ; 58 | private Text outKey = new Text(); 59 | private Tuple outVal = new Tuple(); 60 | 61 | protected void setup(Context context) throws IOException, InterruptedException { 62 | fieldDelim = context.getConfiguration().get("field.delim", ","); 63 | fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]"); 64 | } 65 | 66 | @Override 67 | protected void map(LongWritable key, Text value, Context context) 68 | throws IOException, InterruptedException { 69 | outVal.initialize(); 70 | items = value.toString().split(fieldDelimRegex); 71 | outKey.set(items[0]); 72 | outVal.add(items[1], Integer.parseInt(items[2])); 73 | context.write(outKey, outVal); 74 | } 75 | } 76 | 77 | /** 78 | * @author pranab 79 | * 80 | */ 81 | public static class DensityReducer extends Reducer<Text, Tuple, NullWritable, Text> { 82 | private String fieldDelim; 83 | private String groupID; 84 | private String entityID; 85 | private int sumDensity; 86 | private int density; 87 | private int relDensity; 88 | private Text outVal = new Text(); 89 | private int relDensityScale; 90 | private static final Logger LOG = Logger.getLogger(DensityReducer.class); 91 | 92 | protected void setup(Context context) throws IOException, InterruptedException { 93 | Configuration conf = context.getConfiguration(); 94 | fieldDelim = conf.get("field.delim", ","); 95 | relDensityScale = context.getConfiguration().getInt("red.reltive.density.scale", 1000); 96 | if (conf.getBoolean("debug.on", false)) { 97 | LOG.setLevel(Level.DEBUG); 98 | } 99 | } 100 | 101 | /* (non-Javadoc) 102 | * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context) 103 | */ 104 | protected void reduce(Text key, Iterable<Tuple> values, Context context) 105 | throws IOException, InterruptedException { 106 | groupID = key.toString(); 107 | sumDensity = 0; 108 | density = 0; 109 | for (Tuple val : values) { 110 | entityID = val.getString(0); 111 | if (entityID.equals(groupID)) { 112 | density = val.getInt(1); 113 | LOG.debug("entityID:" + entityID + " density:" + density); 114 | } 115 | sumDensity += val.getInt(1); 116 | } 117 | 118 | relDensity = (density * relDensityScale) / sumDensity; 119 | outVal.set(groupID + fieldDelim +relDensity); 120 | context.write(NullWritable.get(), outVal); 121 | } 122 | 123 | } 124 | 125 | /** 126 | * @param args 127 | */ 128 | public static void main(String[] args) throws Exception { 129 | int exitCode = ToolRunner.run(new RelativeDensity(), args); 130 | System.exit(exitCode); 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/seq/LocalNeighborhoodDetector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. 
See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.seq 19 | 20 | import org.apache.spark.rdd.RDD 21 | import scala.collection.mutable.ArrayBuffer 22 | import scala.collection.JavaConverters._ 23 | import scala.util.control.Breaks._ 24 | import org.apache.spark.SparkContext 25 | import org.beymani.spark.common.OutlierUtility 26 | import org.chombo.spark.common.GeneralUtility 27 | import org.chombo.spark.common.JobConfiguration 28 | import org.chombo.spark.common.Record 29 | import org.chombo.util.BasicUtils 30 | import org.chombo.math.MathUtils 31 | import org.beymani.util.SeequenceScoreAggregator 32 | import org.hoidla.window.LocalNeighborhoodWindow 33 | 34 | 35 | /** 36 | * Anomaly detection in sequence data based on nearest neighboers within an window. 37 | * @author pranab 38 | * 39 | */ 40 | object LocalNeighborhoodDetector extends JobConfiguration with GeneralUtility with OutlierUtility { 41 | 42 | /** 43 | * @param args 44 | * @return 45 | */ 46 | def main(args: Array[String]) { 47 | val appName = "localNeighborhoodDetector" 48 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 49 | val config = createConfig(configFile) 50 | val sparkConf = createSparkConf(appName, config, false) 51 | val sparkCntxt = new SparkContext(sparkConf) 52 | val appConfig = config.getConfig(appName) 53 | 54 | //configuration params 55 | val fieldDelimIn = appConfig.getString("field.delim.in") 56 | val fieldDelimOut = appConfig.getString("field.delim.out") 57 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 58 | val keyFieldOrdinals = toOptionalIntArray(getOptionalIntListParam(appConfig, "id.fieldOrdinals")) 59 | val attrOrd = getMandatoryIntParam(appConfig, "attr.ordinal") 60 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal") 61 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold") 62 | val windowSize = getIntParamOrElse(appConfig, "window.size", 3) 63 | val neighborhoodDist = getDoubleParamOrElse(appConfig, "neighborhood.dist", -1.0) 64 | val debugOn = appConfig.getBoolean("debug.on") 65 | val saveOutput = appConfig.getBoolean("save.output") 66 | 67 | BasicUtils.assertCondition(windowSize % 2 == 1, "window size should be odd") 68 | val keyLen = getOptinalArrayLength(keyFieldOrdinals, 1) 69 | val neighborhoodDistBased = neighborhoodDist > 0 70 | val neighborhoodSize = getConditionalMandatoryIntParam(!neighborhoodDistBased, appConfig, "neighborhood.size", 71 | "neighborhoosd size must be provided") 72 | 73 | //input 74 | val data = sparkCntxt.textFile(inputPath) 75 | val keyedData = getKeyedValueWithSeq(data, fieldDelimIn, keyLen, keyFieldOrdinals, seqFieldOrd) 76 | 77 | //records with tag and score 78 | val taggedData = keyedData.groupByKey.flatMap(v => { 79 | val key = v._1 80 | val values = v._2.toList.sortBy(v => v.getLong(0)) 81 | val size = values.length 82 | val coffset = windowSize / 2 83 | val window = if (neighborhoodDistBased) { 84 | new LocalNeighborhoodWindow(windowSize, neighborhoodDist) 85 | } else { 86 | new LocalNeighborhoodWindow(windowSize, neighborhoodSize) 87 | } 88 | val scores = Array.fill[Double](size)(0) 89 | for (i <- 0 to size - 1) { 90 | val v = values(i) 91 | val line = v.getString(1) 92 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 93 | val quant = items(attrOrd).toDouble 94 | window.add(quant) 95 | if 
(window.isProcessed()) { 96 | val score = if (neighborhoodDistBased) window.getNumNeighbosWithin().toDouble 97 | else window.getAvNeighborDist() 98 | scores(i - coffset) = score 99 | } 100 | } 101 | 102 | //append score and tag 103 | val recScores = values.map(r => r.getString(1)).zip(scores) 104 | recScores.map(r => { 105 | val rec = r._1 106 | val score = r._2 107 | val tag = if (score > scoreThreshold) "O" else "N" 108 | rec + fieldDelimOut + BasicUtils.formatDouble(score, precision) + fieldDelimOut + tag 109 | }) 110 | }) 111 | 112 | if (debugOn) { 113 | val records = taggedData.collect 114 | records.slice(0, 50).foreach(r => println(r)) 115 | } 116 | 117 | if(saveOutput) { 118 | taggedData.saveAsTextFile(outputPath) 119 | } 120 | 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/pc/PrincipalComponentPredictor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.pc 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | import org.beymani.spark.common.OutlierUtility 28 | import org.chombo.spark.common.GeneralUtility 29 | import org.avenir.util.PrincipalCompState 30 | import org.chombo.math.MathUtils 31 | 32 | /** 33 | * PCA based outlier prediction 34 | * @author pranab 35 | * 36 | */ 37 | object PrincipalComponentPredictor extends JobConfiguration with GeneralUtility { 38 | /** 39 | * @param args 40 | * @return 41 | */ 42 | def main(args: Array[String]) { 43 | val appName = "principalComponentPredictor" 44 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 45 | val config = createConfig(configFile) 46 | val sparkConf = createSparkConf(appName, config, false) 47 | val sparkCntxt = new SparkContext(sparkConf) 48 | val appConfig = config.getConfig(appName) 49 | 50 | //configurations 51 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 52 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 53 | val keyFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "id.field.ordinals")) 54 | val quantFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "quant.field.ordinals")) 55 | val seqFieldOrd = getMandatoryIntParam( appConfig, "seq.field.ordinal", "missing sequence field ordinal") 56 | val dimension = quantFieldOrdinals.length 57 | val stateFilePath = this.getMandatoryStringParam(appConfig, "state.filePath", "missing pc state file path") 58 | val compState = 
PrincipalCompState.load(stateFilePath, fieldDelimOut).asScala.toMap 59 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold") 60 | val expConst = getDoubleParamOrElse(appConfig, "exp.const", 1.0) 61 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 62 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false) 63 | val saveOutput = getBooleanParamOrElse(appConfig, "save.output", true) 64 | 65 | //pc matrix and transposed pc matrix 66 | val pcFun = (state: PrincipalCompState) => { 67 | val pcArr = state.getPrincComps() 68 | val pc = MathUtils.createMatrix(pcArr) 69 | val pcTr = pc.transpose() 70 | (pc, pcTr) 71 | } 72 | val pcMa = updateMapValues(compState, pcFun) 73 | 74 | val data = sparkCntxt.textFile(inputPath) 75 | val taggedData = data.map(line => { 76 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 77 | val keyRec = Record(items, keyFieldOrdinals) 78 | val keyStr = keyRec.toString(fieldDelimIn) 79 | val quantFields = BasicUtils.extractFieldsAsDoubleArray(items, quantFieldOrdinals) 80 | var score = 0.0 81 | val tag = pcMa.get(keyStr) match { 82 | case Some(pc) => { 83 | val pcHidden = pc._1 84 | val pcNorm = pc._2 85 | val daNorm = MathUtils.createColMatrix(quantFields) 86 | 87 | //regenerate 88 | val daHidden = MathUtils.multiplyMatrix(pcHidden, daNorm) 89 | val daRegen = MathUtils.multiplyMatrix(pcNorm, daHidden) 90 | 91 | //error 92 | val quantFieldsGen = MathUtils.arrayFromColumnMatrix(daRegen) 93 | score = MathUtils.vectorDiffNorm(quantFields, quantFieldsGen) 94 | if (expConst > 0) { 95 | score = BasicUtils.expScale(expConst, score) 96 | } 97 | if (score < scoreThreshold) "N" else "O" 98 | } 99 | case None => "I" 100 | } 101 | val newRec = new Array[String](items.length + 2) 102 | Array.copy(items, 0, newRec, 0, items.length) 103 | newRec(newRec.length-2) = BasicUtils.formatDouble(score, precision) 104 | newRec(newRec.length-1) = tag 105 | (keyRec, newRec) 106 | }) 107 | 108 | //group by key and sort by sequence 109 | val serTaggedData = groupByKeySortBySeq(taggedData, seqFieldOrd, fieldDelimOut) 110 | 111 | if (debugOn) { 112 | val records = serTaggedData.collect 113 | records.slice(0, 50).foreach(r => println(r)) 114 | } 115 | 116 | if(saveOutput) { 117 | serTaggedData.saveAsTextFile(outputPath) 118 | } 119 | 120 | } 121 | } --------------------------------------------------------------------------------
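
Note: PrincipalComponentPredictor above scores a record by projecting it onto the stored principal components, regenerating it from that projection, and using the norm of the reconstruction error as the outlier score. The following is a minimal, self-contained sketch of that idea, not part of the repo: the function name pcaOutlierScores, the use of numpy, the exponential normalization formula, the threshold value and the synthetic data are all assumptions made for illustration, and the repo's expScale and threshold handling may differ.

import numpy as np

def pcaOutlierScores(data, numComps=2, expConst=0.0):
    #center data and derive principal components from the covariance matrix
    centered = data - data.mean(axis=0)
    eigVals, eigVecs = np.linalg.eigh(np.cov(centered, rowvar=False))
    pc = eigVecs[:, np.argsort(eigVals)[::-1][:numComps]]
    #project onto the principal subspace and regenerate the records
    hidden = centered @ pc
    regen = hidden @ pc.T
    #reconstruction error norm is the outlier score
    scores = np.linalg.norm(centered - regen, axis=1)
    if expConst > 0:
        #assumed exponential normalization to (0, 1)
        scores = 1.0 - np.exp(-scores / expConst)
    return scores

if __name__ == "__main__":
    rng = np.random.default_rng(42)
    #normal records lie close to a 2 dimensional plane in 3 dimensional space
    xy = rng.normal(0, 1, size=(200, 2))
    z = xy.sum(axis=1, keepdims=True) + rng.normal(0, 0.1, size=(200, 1))
    normal = np.hstack([xy, z])
    #outliers sit well off that plane, so their reconstruction error is large
    oxy = rng.normal(0, 1, size=(5, 2))
    oz = oxy.sum(axis=1, keepdims=True) + 5.0
    outliers = np.hstack([oxy, oz])
    data = np.vstack([normal, outliers])
    scores = pcaOutlierScores(data, numComps=2, expConst=2.0)
    tags = np.where(scores > 0.5, "O", "N")
    print(int((tags == "O").sum()), "records tagged as outliers")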