├── spark
├── project
│ ├── build.properties
│ └── plugins.sbt
├── version.sbt
├── src
│ └── main
│ │ └── scala
│ │ └── org
│ │ └── beymani
│ │ ├── sanity
│ │ └── WordCount.scala
│ │ └── spark
│ │ ├── common
│ │ ├── PseudoRelevanceThresholdFinder.scala
│ │ ├── OutlierScoreLevelShift.scala
│ │ └── OutlierCounter.scala
│ │ ├── seq
│ │ └── LocalNeighborhoodDetector.scala
│ │ └── pc
│ │ └── PrincipalComponentPredictor.scala
└── build.sbt
├── manifest.mf
├── resource
├── IntroductionToBeymani.docx
├── cpsale.conf
├── vib.conf
├── mmfr.properties
├── ouli.sh
├── mhist.sh
├── mm_seqn.sh
├── avdi.sh
├── negr.sh
├── rede.sh
├── dsort.sh
├── mdist.sh
├── nede.sh
├── mm_modl.sh
├── rt_predict.properties
├── hist.json
├── bsm.json
├── model_calibration_tutorial.txt
├── ecommDataStream.json
├── build_storm.xml
├── xaction_states.rb
├── knn_udr.properties
├── spark_dependency.txt
├── beymani_spark.xml
├── epid.conf
├── vib.sh
├── cpsale.sh
├── ae_ticket.properties
├── xaction_queue.py
├── mob_loc.properties
├── ecomm_hierarchy.json
├── cyd.conf
├── alarm_threshold_tuning_tutorial.txt
├── bsm.conf
├── jar_dependency.txt
├── unsup_model_drift_detection_tutorial.txt
├── epid.sh
├── sup_model_drift_detection_tutorial.txt
├── monitoring_order_processing_system_with_isolation_forest.txt
├── ticket.conf
├── cycle_detection_tutorial.txt
├── proximity_tutorial.txt
├── autoencoder_based_cust_svc_case_anomaly_detection.txt
├── machinary_fault_detection_with_subsequence_anomaly_tutorial.txt
├── salean.sh
├── issue_service_time_anomaly_detection_tutorial.txt
├── sales_data_change_point_detection_tutorial.txt
├── cyd.sh
├── health_monitoring_data_anomaly_detection_tutorial.txt
├── and.conf
├── salean.conf
├── ticket.sh
├── ecomm.conf
├── bsm.sh
├── quarantine_violation_detection_tutorial.txt
├── cct.rb
├── cpu_usage_anomaly_det_tutorial.txt
├── rel_density_tutorial.txt
├── real_time_fraud_prediction_tutorial.txt
├── retail_sale_monitoring_with_anomaly_detection_tutorial.txt
├── and_spark.sh
└── ecomm.sh
├── .gitignore
├── src
└── main
│ └── java
│ └── org
│ └── beymani
│ ├── util
│ ├── SequencedScore.java
│ ├── SeequenceScoreAggregator.java
│ ├── DataStream.java
│ ├── SequenceMatcher.java
│ └── DataStreamSchema.java
│ ├── predictor
│ ├── PredictorSpout.java
│ ├── EntropyIncreaseBasedPredictor.java
│ ├── EstimatedProbabilityBasedPredictor.java
│ ├── ExtremeValuePredictor.java
│ ├── FileSpout.java
│ ├── EstimatedCumProbabilityBasedPredictor.java
│ ├── ModelBasedPredictor.java
│ ├── OutlierPredictor.java
│ ├── MahalanobisDistancePredictor.java
│ ├── EstimatedMetaProbabilityBasedPredictor.java
│ └── InterPercentileDifferenceBasedPredictor.java
│ └── proximity
│ └── RelativeDensity.java
├── python
└── app
│ ├── wsbot.py
│ ├── cpsale.py
│ ├── mvand.py
│ ├── bvib.py
│ ├── olss.py
│ ├── bls.py
│ └── cpu_usage.py
└── README.md
/spark/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.3.3
2 |
--------------------------------------------------------------------------------
/spark/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "1.0-SNAPSHOT"
--------------------------------------------------------------------------------
/manifest.mf:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | X-COMMENT: Main-Class will be added automatically by build
3 |
4 |
--------------------------------------------------------------------------------
/resource/IntroductionToBeymani.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pranab/beymani/HEAD/resource/IntroductionToBeymani.docx
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/**/*
2 | .settings/
3 | .project
4 | .classpath
5 | /target
6 | spark/project/project
7 | spark/project/target
8 | spark/target
9 | spark/lib_managed
10 | spark/src_managed
11 | spark/project/boot
12 | spark/tmp
13 | project/
14 | .history
15 | spark/dist
16 | .DS_Store
17 | .cache
18 | spark/bin
19 | .class
20 | .ivy2
21 |
22 |
--------------------------------------------------------------------------------
/resource/cpsale.conf:
--------------------------------------------------------------------------------
1 | changePointDetector {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0]
5 | attr.ordinals = [2]
6 | seq.fieldOrd = 1
7 | window.size = 200
8 | stat.type = CVM
9 | stat.critValue = 38.863
10 | seq.chPtOutFilePath = "file:///Users/pranab/Projects/bin/beymani/other/cpsale"
11 | debug.on = true
12 | save.output = true
13 | }
14 |
--------------------------------------------------------------------------------
/resource/vib.conf:
--------------------------------------------------------------------------------
1 |
2 | subSequenceDistanceDetector {
3 | field.delim.in = ","
4 | field.delim.out = ","
5 | id.fieldOrdinals = [0]
6 | attr.ordinal = 2
7 | seq.fieldOrd = 1
8 | window.size = 40
9 | score.threshold = 0.2
10 | ref.filePath = "file:///Users/pranab/Projects/bin/beymani/other/vib/vib_ref.txt"
11 | output.precision = 3
12 | debug.on = true
13 | save.output = true
14 | }
--------------------------------------------------------------------------------
/resource/mmfr.properties:
--------------------------------------------------------------------------------
1 | field.delim.regex=,
2 | field.delim.out=,
3 | num.reducer=1
4 | debug.on=false
5 |
6 | #Projection
7 | pro.projection.operation=grouping
8 | pro.key.field=0
9 | pro.projection.field=2
10 |
11 | #MarkovStateTransitionModel
12 | mst.skip.field.count=1
13 | mst.model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS
14 | mst.trans.prob.scale=1
15 |
--------------------------------------------------------------------------------
/resource/ouli.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar
2 | CLASS_NAME=org.chombo.mr.NumericSorter
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/avdi
6 | OUT_PATH=/user/pranab/cct/ouli
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/mhist.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar
2 | CLASS_NAME=org.chombo.mr.MultiVarHistogram
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/input
6 | OUT_PATH=/user/pranab/cct/mhist
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/mm_seqn.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar
2 | CLASS_NAME=org.chombo.mr.Projection
3 |
4 | echo "running mr"
5 | IN_PATH=/Users/pranab/mmfr/input
6 | OUT_PATH=/Users/pranab/mmfr/sequence
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/avdi.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.proximity.AverageDistance
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/simi
6 | OUT_PATH=/user/pranab/cct/avdi
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/negr.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.proximity.AverageDistance
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/simi
6 | OUT_PATH=/user/pranab/cct/negr
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/rede.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.proximity.RelativeDensity
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/nede
6 | OUT_PATH=/user/pranab/cct/rede
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/dsort.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.dist.DistributionSorter
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/mdist
6 | OUT_PATH=/user/pranab/cct/dsort
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/mdist.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.dist.MultiVariateDistribution
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/input
6 | OUT_PATH=/user/pranab/cct/mdist
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/nede.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
2 | CLASS_NAME=org.beymani.proximity.NeighborDensity
3 |
4 | echo "running mr"
5 | IN_PATH=/user/pranab/cct/input/nede
6 | OUT_PATH=/user/pranab/cct/nede
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/mm_modl.sh:
--------------------------------------------------------------------------------
1 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar
2 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel
3 |
4 | echo "running mr"
5 | IN_PATH=/Users/pranab/mmfr/sequence
6 | OUT_PATH=/Users/pranab/mmfr/model
7 | echo "input $IN_PATH output $OUT_PATH"
8 | hadoop fs -rmr $OUT_PATH
9 | echo "removed output dir"
10 |
11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH
12 |
--------------------------------------------------------------------------------
/resource/rt_predict.properties:
--------------------------------------------------------------------------------
1 |
2 | predictor.model=mm
3 | predictor.spout.threads=1
4 | predictor.bolt.threads=2
5 | num.workers=1
6 | debug=on
7 |
8 | messaging.provider=redis
9 | redis.server.host=localhost
10 | redis.server.port=6379
11 | redis.markov.model.key=xactionMarkovModel
12 | redis.input.queue=xactionQueue
13 | local.predictor=true
14 | state.seq.window.size=5
15 | state.ordinal=1
16 | detection.algorithm=missProbability
17 | metric.threshold=0.96
18 | redis.output.queue=fraudQueue
19 |
--------------------------------------------------------------------------------
/resource/hist.json:
--------------------------------------------------------------------------------
1 | {
2 | "fields" :
3 | [
4 | {
5 | "name" : "xid",
6 | "ordinal" : 0,
7 | "id" : true,
8 | "dataType" : "string"
9 | },
10 | {
11 | "name" : "time",
12 | "ordinal" : 1,
13 | "dataType" : "int",
14 | "bucketWidth" : 60
15 | },
16 | {
17 | "name" : "amount",
18 | "ordinal" : 2,
19 | "dataType" : "double",
20 | "bucketWidth" : 100
21 | },
22 | {
23 | "name" : "vendor",
24 | "ordinal" : 3,
25 | "dataType" : "categorical"
26 | }
27 | ]
28 | }
29 |
--------------------------------------------------------------------------------
/resource/bsm.json:
--------------------------------------------------------------------------------
1 | {
2 | "attributes" :
3 | [
4 | {
5 | "name" : "devID",
6 | "ordinal" : 0,
7 | "dataType" : "string",
8 | "targetFieldOrdinals" : [0]
9 | },
10 | {
11 | "name" : "timeStamp",
12 | "ordinal" : 1,
13 | "dataType" : "long",
14 | "targetFieldOrdinals" : [1]
15 | },
16 | {
17 | "name" : "measurement",
18 | "ordinal" : 2,
19 | "dataType" : "int",
20 | "buckeWidth" : 5.0,
21 | "transformers" : ["discretizerTrans"],
22 | "targetFieldOrdinals" : [2]
23 | }
24 | ]
25 | }
--------------------------------------------------------------------------------
/spark/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
2 |
3 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")
4 |
5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
6 |
7 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4")
8 |
9 | resolvers ++= Seq(
10 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
11 | "Akka Repository" at "https://repo.akka.io/releases/",
12 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools"
13 | )
14 |
15 |
16 |
--------------------------------------------------------------------------------
/resource/model_calibration_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for finding the calibration properties of a machine learning model.
2 |
3 | Setup
4 | =====
5 | Make sure the ../lib and ../supv directories, with all their python files, exist relative to
6 | where heart_disease.py is. Alternatively you can use the ../python/app directory of avenir as
7 | your working directory
8 |
9 | Generate data and train model
10 | =============================
11 | Please refer to heart_disease_prediction_with_random_forest_tutorial.txt
12 |
13 | Global calibration
14 | ==================
15 | ./heart_disease.py calib
16 |
17 | Local calibration
18 | ===================
19 | ./heart_disease.py calibLoc
--------------------------------------------------------------------------------
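
The global calibration step above essentially compares predicted probabilities against observed outcome frequencies. A minimal standalone sketch of that idea using scikit-learn's calibration_curve; the file names and the use of scikit-learn are assumptions made for illustration, the real logic lives in heart_disease.py of the avenir project.

import numpy as np
from sklearn.calibration import calibration_curve

# y_true: actual class labels, y_prob: predicted positive class probabilities,
# assumed to be saved by the prediction step (hypothetical file names)
y_true = np.loadtxt("hd_labels.txt")
y_prob = np.loadtxt("hd_pred_prob.txt")

# bin the predictions and compare the mean predicted probability with the
# observed positive fraction in each bin; a well calibrated model lies close
# to the diagonal
frac_pos, mean_pred = calibration_curve(y_true, y_prob, n_bins=10)
for mp, fp in zip(mean_pred, frac_pos):
    print("mean predicted %.3f  observed positive fraction %.3f" % (mp, fp))
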
/resource/ecommDataStream.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataStreams" :
3 | [
4 | {
5 | "id" : "corp",
6 | "type" : "root",
7 | "parentId" : "none",
8 | "parentType" : "none",
9 | "singleton" : true
10 | },
11 | {
12 | "id" : "sale",
13 | "type" : "sale",
14 | "parentId" : "root",
15 | "parentType" : "root",
16 | "singleton" : true
17 | },
18 | {
19 | "id" : "*",
20 | "type" : "prodSale",
21 | "parentId" : "sale",
22 | "parentType" : "sale",
23 | "singleton" : false
24 | },
25 | {
26 | "id" : "scAbandon",
27 | "type" : "scAbandon",
28 | "parentId" : "root",
29 | "parentType" : "root",
30 | "singleton" : true
31 | }
32 | ]
33 | }
--------------------------------------------------------------------------------
/resource/build_storm.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Packaging into a single uber JAR
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/resource/xaction_states.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ruby
2 |
3 | require '../lib/util.rb'
4 |
5 |
6 | custCount = ARGV[0].to_i
7 |
8 | custIDs = []
9 | amountDist = CategoricalField.new("L",35,"M",53,"H",12)
10 | typeDist = CategoricalField.new("N",85,"H",15)
11 | timeElapsedDist = CategoricalField.new("L",35,"N",45,"S",20)
12 |
13 |
14 | idGen = IdGenerator.new
15 | 1.upto custCount do
16 | custIDs << idGen.generate(10)
17 | end
18 |
19 | #num of transactions
20 | 1.upto 15 do
21 | #number of customers
22 | 1.upto custCount do
23 | if (rand(10) < 9)
24 | cid = custIDs[rand(custIDs.length)]
25 | xid = idGen.generate(12)
26 | puts "#{cid},#{xid},#{amountDist.value}#{typeDist.value}#{timeElapsedDist.value}"
27 | end
28 | end
29 | end
30 |
--------------------------------------------------------------------------------
/resource/knn_udr.properties:
--------------------------------------------------------------------------------
1 | common.mode=train
2 | common.model.directory=model
3 | common.model.file=knn_udr_model
4 | common.preprocessing=scale
5 | common.scaling.method=minmax
6 | common.verbose=True
7 | common.logging.file=./log/knn.log
8 | common.logging.level=info
9 | train.data.file=chdr.txt
10 | train.data.fields=1,2,3,4,5,6,7,8
11 | train.data.feature.fields=0,1,2,3,4,5,6
12 | train.data.class.field=7
13 | train.num.neighbors=9
14 | train.neighbor.weight=_
15 | train.neighbor.search.algo=_
16 | train.neighbor.search.leaf.size=_
17 | train.neighbor.dist.metric=_
18 | train.neighbor.dist.metric.pow=_
19 | train.success.criterion=_
20 | train.model.save=_
21 | train.score.method=_
22 | predict.data.file=chdr.txt
23 | predict.data.fields=1,2,3,4,5,6,7,8
24 | predict.data.feature.fields=0,1,2,3,4,5,6
25 | predict.use.saved.model=_
26 |
27 |
28 |
--------------------------------------------------------------------------------
/resource/spark_dependency.txt:
--------------------------------------------------------------------------------
1 | Build all necessary jars
2 | ========================
3 | in chombo
4 | mvn clean install
5 | sbt publishLocal
6 |
7 | in chombo/spark
8 | sbt clean package
9 | sbt publishLocal
10 |
11 | in hoidla
12 | mvn clean install
13 | sbt publishLocal
14 |
15 | in beymani
16 | mvn clean install
17 | sbt publishLocal
18 |
19 | in beymani/spark
20 | sbt clean package
21 |
22 | Build uber jar
23 | ==============
24 | ant -f beymani_spark.xml
25 |
26 | uber jar file name is uber-beymani-spark-1.0.jar
27 |
28 | If you are using Spark 2.0+, please add the following line to beymani_spark.xml, because
29 | the Typesafe config jar is not included in newer versions of Spark
30 |
31 |
32 |
33 | Please change the directory path, as per your environment
--------------------------------------------------------------------------------
/resource/beymani_spark.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Packaging into a single uber JAR
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/resource/epid.conf:
--------------------------------------------------------------------------------
1 | outRangeBasedPredictor {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0]
5 | attr.ordinals = [2,3]
6 | score.threshold = 0.80
7 | seq.fieldOrd=1
8 | exp.const = 2000.0
9 | attr.weights = [0.5, 0.5]
10 | attr.weightStrategy = max
11 | range.global = false
12 | range.filePath="/Users/pranab/Projects/bin/beymani/other/epid/outr/qualist.txt"
13 | debug.on = true
14 | save.output = true
15 | }
16 |
17 | inRangeBasedPredictor {
18 | field.delim.in = ","
19 | field.delim.out = ","
20 | id.fieldOrdinals = [0]
21 | attr.ordinals = [2,3]
22 | score.threshold = 0.500
23 | seq.fieldOrd=1
24 | exp.const=5000.0
25 | attr.weights = [0.5, 0.5]
26 | attr.weightStrategy = max
27 | range.global=true
28 | range.globalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/inr/uniq_qualist.txt"
29 | range.LocalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/qua_lo_loc.txt"
30 | debug.on = true
31 | save.output = true
32 | }
--------------------------------------------------------------------------------
/resource/vib.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar
7 | MASTER=spark://akash:7077
8 |
9 | case "$1" in
10 |
11 | "olPred")
12 | echo "running SubSequenceDistanceDetector"
13 | CLASS_NAME=org.beymani.spark.seq.SubSequenceDistanceDetector
14 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/vib/*
15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/vib
16 | rm -rf ./output/vib
17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
18 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT vib.conf
19 | rm -rf ./output/vib/_SUCCESS
20 | ls -l ./output/vib
21 | for f in ./output/vib/*
22 | do
23 | echo "number of outliers in $f"
24 | cat $f | grep ,O | wc -l
25 | done
26 | ;;
27 |
28 | *)
29 | echo "unknown operation $1"
30 | ;;
31 |
32 | esac
--------------------------------------------------------------------------------
/resource/cpsale.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
10 | "cpInp")
11 | echo "args: data_file "
12 | cp $2 $PROJECT_HOME/bin/beymani/input/cpsale/
13 | ls -l $PROJECT_HOME/bin/beymani/input/cpsale/
14 | ;;
15 |
16 | "cpPred")
17 | echo "running ChangePointDetector Spark job"
18 | CLASS_NAME=org.beymani.spark.misc.ChangePointDetector
19 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/cpsale/*
20 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/cpsale
21 | rm -rf ./output/cpsale
22 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
23 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT cpsale.conf
24 | wc -l ./output/cpsale/part-00000
25 | wc -l ./output/cpsale/part-00001
26 | ;;
27 |
28 |
29 | *)
30 | echo "unknown operation $1"
31 | ;;
32 |
33 | esac
--------------------------------------------------------------------------------
/spark/src/main/scala/org/beymani/sanity/WordCount.scala:
--------------------------------------------------------------------------------
1 | package org.beymani.sanity
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.SparkContext._
5 |
6 | object WordCount {
7 | def main(args: Array[String]) {
8 | val master = args.length match {
9 | case x: Int if x > 0 => args(0)
10 | case _ => "local"
11 | }
12 | val sc = new SparkContext(master, "WordCount", System.getenv("SPARK_HOME"))
13 | val input = args.length match {
14 | case x: Int if x > 1 => sc.textFile(args(1))
15 | case _ => sc.parallelize(List("pandas", "i like pandas"))
16 | }
17 | val words = input.flatMap(line => line.split(" "))
18 | args.length match {
19 | case x: Int if x > 2 => {
20 | val counts = words.map(word => (word, 1)).reduceByKey{case (x,y) => x + y}
21 | counts.saveAsTextFile(args(2))
22 | }
23 | case _ => {
24 | val wc = words.countByValue()
25 | println(wc.mkString(","))
26 | }
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/resource/ae_ticket.properties:
--------------------------------------------------------------------------------
1 | common.mode=training
2 | common.model.directory=./model/ae
3 | common.model.file=cus.mod
4 | common.preprocessing=scale
5 | common.scaling.method=zscale
6 | common.verbose=True
7 | common.device=_
8 | train.data.file=cus_tr.txt
9 | train.data.fields=1,2,3,4,5,6,7,8
10 | train.data.feature.fields=0,1,2,3,4,5,6
11 | train.num.input=7
12 | train.num.hidden.units=6,5
13 | train.encoder.activations=relu,sigmoid
14 | train.decoder.activations=sigmoid,sigmoid
15 | train.batch.size=32
16 | train.num.iterations=200
17 | train.loss.reduction=_
18 | train.lossFn=mse
19 | train.optimizer=_
20 | train.opt.learning.rate=.001
21 | train.opt.weight.decay=_
22 | train.opt.momentum=_
23 | train.opt.eps=_
24 | train.opt.dampening=_
25 | train.opt.momentum.nesterov=_
26 | train.opt.betas=_
27 | train.opt.alpha=_
28 | train.noise.scale=0.05
29 | train.tied.weights=True
30 | train.model.save=False
31 | train.track.error=True
32 | train.batch.intv=5
33 | train.loss.av.window=5
34 | train.loss.diff.threshold=0.001
35 | encode.use.saved.model=_
36 | encode.data.file=cus_te.txt
37 | encode.feat.pad.size=50
--------------------------------------------------------------------------------
/spark/build.sbt:
--------------------------------------------------------------------------------
1 | name := "beymani-spark"
2 |
3 | organization := "org.beymani"
4 |
5 | version := "1.0"
6 |
7 | scalaVersion := "2.12.0"
8 |
9 | scalacOptions := Seq("-unchecked", "-deprecation")
10 |
11 | isSnapshot := true
12 |
13 | libraryDependencies ++=Seq(
14 | "org.apache.spark" %% "spark-core" % "3.0.0-preview" % "provided",
15 | "org.apache.spark" %% "spark-streaming" % "3.0.0-preview" % "provided",
16 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.0.0-preview",
17 | "org.apache.commons" % "commons-lang3" % "3.0",
18 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3",
19 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.12" % "2.9.4",
20 | "org.apache.lucene" % "lucene-core" % "7.1.0",
21 | "org.apache.lucene" % "lucene-analyzers-common" % "7.1.0",
22 | "junit" % "junit" % "4.7" % "test",
23 | "org.scalatest" % "scalatest_2.10" % "2.0" % "test",
24 | "org.chombo" %% "chombo-spark" % "1.0",
25 | "mawazo" %% "chombo" % "1.0",
26 | "mawazo" %% "beymani" % "1.0",
27 | "mawazo" %% "hoidla" % "1.0",
28 | "mawazo" %% "avenir" % "1.0",
29 | "gov.nist.math" % "jama" % "1.0.3"
30 | )
31 |
--------------------------------------------------------------------------------
/resource/xaction_queue.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import sys
4 | import redis
5 |
6 | op = sys.argv[1]
7 | r = redis.StrictRedis(host='localhost', port=6379, db=0)
8 |
9 | if (op == "setModel"):
10 | modelFile = sys.argv[2]
11 | with open (modelFile, "r") as myfile:
12 | modelData=myfile.read()
13 |
14 | r.set('xactionMarkovModel', modelData)
15 | elif (op == "getModel"):
16 | model = r.get("xactionMarkovModel")
17 | print model
18 | elif (op == "writeQueue"):
19 | xactionFile = sys.argv[2]
20 | with open (xactionFile, "r") as myfile:
21 | for line in myfile.readlines():
22 | #print line.rstrip('\n')
23 | r.lpush("xactionQueue", line.rstrip('\n'))
24 | elif (op == "readQueue"):
25 | while True:
26 | line = r.rpop("xactionQueue")
27 | if line is not None:
28 | print line
29 | else:
30 | break
31 | elif (op == "queueLength"):
32 | qlen = r.llen("xactionQueue")
33 | print qlen
34 | elif (op == "readOutQueue"):
35 | while True:
36 | out = r.rpop("fraudQueue")
37 | if out is not None:
38 | print out
39 | else:
40 | break
41 | elif (op == "outQueueLength"):
42 | qlen = r.llen("fraudQueue")
43 | print qlen
44 |
--------------------------------------------------------------------------------
/resource/mob_loc.properties:
--------------------------------------------------------------------------------
1 | common.verbose=_
2 | population.num.hours=48
3 | population.sampling.interval=5
4 | population.size=1000
5 | population.num.family=200
6 | population.family.size.mean=_
7 | population.family.size.sd=_
8 | population.working.family.percentage=_
9 | population.retired.one.person.family.percentage=_
10 | region.lat.min=37.000
11 | region.lat.max=37.500
12 | region.long.min=-122.500
13 | region.long.max=-122.000
14 | region.num.business=_
15 | region.biz.size.mean=_
16 | region.biz.size.size.sd=_
17 | region.num.office=_
18 | region.office.size.mean=_
19 | region.biz.size.size.sd=_
20 | region.num.schools=_
21 | region.num.colleges=_
22 | region.quarantine.list.file=qualist.txt
23 | region.num.locations=2
24 | region.loc.size=0.0024
25 | region.quarantine.loc.file=qualoc.txt
26 | region.quarantine.num.violation=5
27 | region.residence.list.file=res_loc.txt
28 | region.work.list.file=work_loc.txt
29 | region.school.list.file=school_loc.txt
30 | region.medical.facility.list.file=med_loc.txt
31 | region.shopping.area.list.file=shop_loc.txt
32 | region.entertainment.area.list.file=ent_loc.txt
33 | region.large.event.area.list.file=event_loc.txt
34 | region.open.space.list.file=open_loc.txt
35 |
36 |
37 |
--------------------------------------------------------------------------------
/resource/ecomm_hierarchy.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataStreams" :
3 | [
4 | {
5 | "id" : "corp",
6 | "type" : "root",
7 | "parentId" : "none",
8 | "parentType" : "none",
9 | "singleton" : true
10 | },
11 | {
12 | "id" : "sale",
13 | "type" : "sale",
14 | "parentId" : "root",
15 | "parentType" : "root",
16 | "singleton" : true
17 | },
18 | {
19 | "id" : "electronics",
20 | "type" : "dept",
21 | "parentId" : "sale",
22 | "parentType" : "sale",
23 | "singleton" : false,
24 | "childrenId" : ["31W6CN4OGP","ATROK5G187","54RLEB9L5J","P3N63F2TPP","L674KMOI01","38A2F7U4XK","L0668572D0","BS6RHF2PV2","C88L3DYBB9","NX23WR8JJW"]
25 | },
26 | {
27 | "id" : "clothing",
28 | "type" : "dept",
29 | "parentId" : "sale",
30 | "parentType" : "sale",
31 | "singleton" : false,
32 | "childrenId" : ["IYZN3F9WCX","2DPXUFR93R","7MRHFY4L70","3FHQOJ45IJ","H4T8785L41","P3RVWCZS37","GZ4819T12I","OGX2037784","9021SDZ1O6","U62K213GI2"]
33 | },
34 | {
35 | "id" : "*",
36 | "type" : "prodSale",
37 | "parentId" : "electronics",
38 | "parentType" : "dept",
39 | "singleton" : false
40 | },
41 | {
42 | "id" : "*",
43 | "type" : "prodSale",
44 | "parentId" : "clothing",
45 | "parentType" : "dept",
46 | "singleton" : false
47 | }
48 | ]
49 | }
--------------------------------------------------------------------------------
/src/main/java/org/beymani/util/SequencedScore.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.util;
19 |
20 | import org.chombo.util.Pair;
21 |
22 | /**
23 | * Outlier score for a sequence element
24 | * @author pranab
25 | *
26 | */
27 | public class SequencedScore extends Pair<Long, Double> {
28 | private static final long serialVersionUID = 4277362152194891790L;
29 |
30 | public SequencedScore(long seq, double score) {
31 | super(seq, score);
32 | }
33 |
34 | public long getSeq() {
35 | return left;
36 | }
37 |
38 | public double getScore() {
39 | return right;
40 | }
41 |
42 | public void setScore(double score) {
43 | right = score;
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/resource/cyd.conf:
--------------------------------------------------------------------------------
1 |
2 | numericalAttrStats {
3 | field.delim.in = ","
4 | field.delim.out = ","
5 | id.fieldOrdinals = [0]
6 | attr.ordinals = [2]
7 | seasonal.analysis = false
8 | part.bySeasonCycle = false
9 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"]
10 | time.fieldOrdinal = 1
11 | time.inMili = false
12 | output.precision = 3
13 | debug.on = true
14 | save.output = true
15 | }
16 |
17 | temporalAggregator {
18 | field.delim.in = ","
19 | field.delim.out = ","
20 | attr.ordinals = [2]
21 | id.fieldOrdinals = [0]
22 | time.fieldOrdinal = 1
23 | time.inMili = false
24 | aggr.windowTimeUnit = "hour"
25 | aggr.windowTimeLength = 1
26 | aggr.type = "average"
27 | output.compact = true
28 | output.precision = 3
29 | debug.on = true
30 | save.output = true
31 | }
32 |
33 | autoCorrelation {
34 | field.delim.in = ","
35 | field.delim.out = ","
36 | seq.fieldOrdinal = 1
37 | id.fieldOrdinals = [0]
38 | attr.ordinals = [2]
39 | output.precision = 3
40 | coor.lags = [24, 48, 168]
41 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt"
42 | mean.fieldOrd = 4
43 | debug.on = true
44 | save.output = true
45 | }
46 |
47 | typedUniqueValueCounter {
48 | field.delim.in = ","
49 | field.delim.out = ","
50 | id.fieldOrdinals = [0, 1, 2]
51 | attr.ordinals = [5]
52 | attr.5.type = "double"
53 | seasonal.analysis = true
54 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"]
55 | time.fieldOrdinal = 4
56 | time.inMili = false
57 | output.precision = 3
58 | debug.on = true
59 | save.output = true
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/resource/alarm_threshold_tuning_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for tuning the threshold in an anomaly detection system based on supervised learning,
2 | using user feedback data
3 |
4 | Environment
5 | ===========
6 | Paths etc shown here correspond to my environment. Please change them as needed for your
7 | environment
8 |
9 | Build
10 | =====
11 | Follow instructions in spark_dependency.txt
12 |
13 | Python dependency
14 | =================
15 | The shell script commands for data generation run python scripts. Before you run
16 | the data generation commands do the following
17 | 1. checkout project avenir
18 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file
19 |
20 | Generate outlier detected data
21 | ==============================
22 | Please follow the tutorial cpu_usage_anomaly_det_tutorial.txt to generate data with outliers detected.
23 | Consolidate Spark generated output files into 1 file
24 |
25 | Simulate user feedback
26 | ======================
27 | ./cpu_usage.py feedback
28 |
29 | outlier_file_name = file generated in the previous step
30 | cur_threshold = threshold set in the outlier detection Spark jobs. It's the parameter score.threshold
31 | in the and.conf file
32 | new_threshold = if set higher than cur_threshold, it will simulate the false positive case,
33 | i.e. too many alarms
34 |
35 | Run spark job
36 | =============
37 | ./and_spark.sh thLearn
38 |
39 | Configuration
40 | =============
41 | It's in the and.conf file. Multiple split points are provided through the parameter split.points.
--------------------------------------------------------------------------------
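
Conceptually, the thLearn job evaluates each candidate split point against the user feedback and keeps the one with the lowest alarm cost. A rough standalone sketch of that selection logic; the file name, column layout and cost function are assumptions, not the actual Spark job.

import numpy as np

# each row: outlier score, feedback label (1 = true anomaly, 0 = false alarm)
data = np.loadtxt("feedback.txt", delimiter=",")
scores, labels = data[:, 0], data[:, 1]

split_points = [0.80, 0.85, 0.90, 0.95]      # candidate thresholds (split.points)
best = None
for th in split_points:
    alarms = scores > th
    false_pos = np.sum(alarms & (labels == 0))
    false_neg = np.sum(~alarms & (labels == 1))
    cost = false_pos + false_neg             # the two could be weighted differently
    if best is None or cost < best[1]:
        best = (th, cost)
print("chosen threshold %.2f with cost %d" % best)
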
/resource/bsm.conf:
--------------------------------------------------------------------------------
1 |
2 | #device data
3 | dataTransformer {
4 | field.delim.in = ","
5 | field.delim.out = ","
6 | schema.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm.json"
7 | debug.on = true
8 | save.output = true
9 | transformers {
10 | discretizerTrans {
11 | }
12 | }
13 | }
14 |
15 | markovStateTransitionModel {
16 | field.delim.in = ","
17 | field.delim.out = ","
18 | id.field.ordinals = [0]
19 | seq.start.ordinal = 0
20 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"]
21 | output.precision = 3
22 | data.seqLongFormat = true
23 | seq.field.ordinal = 1
24 | state.field.ordinal = 2
25 | data.mergeKeysNeeded = true
26 | data.laplaceCorrNeeded = true
27 | output.compact = false
28 | debug.on = true
29 | save.output = true
30 | }
31 |
32 | markovChainPredictor {
33 | field.delim.in = ","
34 | field.delim.out = ","
35 | predictor.strategy = "conditinalProbability"
36 | id.fieldOrdinals = [0]
37 | output.precision = 6
38 | score.threshold = 3.7
39 | attr.ordinal = 2
40 | seq.fieldOrd = 1
41 | window.size = 4
42 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"]
43 | stateTrans.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm_mod.txt"
44 | stateTrans.compact = false
45 | model.global = true
46 | ignore.missingModel = false
47 | exp.const = -1.0
48 | debug.on = true
49 | save.output = true
50 | }
--------------------------------------------------------------------------------
/resource/jar_dependency.txt:
--------------------------------------------------------------------------------
1 | Dependent jars
2 | ==============
3 | beymani depends on the following jar libraries. Most of them are third party except for
4 | chombo and hoidla. For these two you could either check out the jars and place them in your
5 | local maven repo or you could build them.
6 |
7 | jackson-core-lgpl-1.6.3.jar
8 | jackson-mapper-lgpl-1.6.3.jar
9 | chombo-1.0.jar
10 | commons-lang-3.1.jar
11 | jedis-2.2.1.jar
12 |
13 |
14 | Building dependent jars
15 | =======================
16 | Follow these steps if you have decided to build the jars for chombo and hoidla
17 |
18 | Checkout project chombo and run
19 | mvn clean install
20 |
21 |
22 | Handling dependency
23 | ===================
24 | There are many ways to handle dependencies in Hadoop
25 |
26 | 1. Use the -libjars command line option as below
27 | hadoop jar xyz.jar com.example.MyMapreduce -libjars path1/lib1.jar,path2/lib2.jar
28 |
29 | 2. Use maven shade plugin to package all jars into one uber jar. The following needs to
30 | be added to the build element in pom.xml
31 |
32 | .......
33 | <plugins>
34 |   <plugin>
35 |     <groupId>org.apache.maven.plugins</groupId>
36 |     <artifactId>maven-shade-plugin</artifactId>
37 |     <executions>
38 |       <execution>
39 |         <phase>package</phase>
40 |         <goals>
41 |           <goal>shade</goal>
42 |         </goals>
43 |       </execution>
44 |     </executions>
45 |     <configuration>
46 |       <finalName>uber-${artifactId}-${version}</finalName>
47 |     </configuration>
48 |   </plugin>
49 | </plugins>
50 | .......
51 |
52 |
53 | 3. Use ant to package all dependent jars. You could use ../resource/build_hadoop.xml as an example
54 |
55 | 4. Copy all jars to hadoop lib directory in all nodes
56 |
--------------------------------------------------------------------------------
/resource/unsup_model_drift_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for unsupervised concept drift detection of deployed supervised machine learning
2 | models with the nearest neighbor count algorithm. We will use ecommerce customer churn data
3 |
4 |
5 | Setup
6 | =====
7 | Make sure you have the python/lib, python/mlextra and python/supv directories of the avenir project, with
8 | all their python files, as peer directories of where codrift.py is, i.e. at ../lib, ../mlextra and ../supv
9 |
10 | Generate data for no drift case
11 | ===============================
12 | - generate reference churn data
13 | ./codrift.py genrc > ch.txt
14 | where
15 | bsamp = num of samples e.g 1000
16 | noise_level = noise level in data e.g 0.05
17 |
18 | - set class label to 1
19 | ./codrift.py recl ch.txt 1 > chref.txt
20 |
21 | - generate current churn data
22 | ./codrift.py genrc chref.txt > ch.txt
23 |
24 | - set class label to 0
25 | ./codrift.py recl ch.txt 0 > chnew.txt
26 |
27 | - concatenate files
28 | cat chref.txt > chndr.txt
29 | cat chnew.txt >> chndr.txt
30 |
31 | No drift case
32 | =============
33 | - ensure following settings in knn_udr.properties
34 | train.data.file=chndr.txt
35 | predict.data.file=chndr.txt
36 |
37 | - run
38 | ./codrift.py udrift knn_udr.properties
39 |
40 | Generate data for drift case
41 | ============================
42 | - generate distribution shifted new data for second half
43 | ./codrift.py dish chnew.txt > chnewd.txt
44 |
45 | - concatenate files
46 | cat chref.txt > chdr.txt
47 | cat chnewd.txt >> chdr.txt
48 |
49 | Drift case
50 | ==========
51 | - ensure following settings in knn_udr.properties
52 | train.data.file=chdr.txt
53 | predict.data.file=chdr.txt
54 |
55 | - run
56 | ./codrift.py udrift knn_udr.properties
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
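
One common way to realize the nearest neighbor based check above is to ask whether a nearest neighbor classifier can separate the reference records (label 1) from the current records (label 0); if it can do much better than chance, the two distributions differ and drift is likely. A minimal sketch of that idea using scikit-learn; the column layout of chdr.txt and the 0.6 cutoff are assumptions, and codrift.py has its own implementation.

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# assumed layout: numeric feature columns followed by the 1/0 reference vs
# current label set by the recl command above
data = np.loadtxt("chdr.txt", delimiter=",")
X, y = data[:, :-1], data[:, -1]

# if nearest neighbors can tell reference from current data much better than
# chance, the distributions differ, i.e. drift is likely
acc = cross_val_score(KNeighborsClassifier(n_neighbors=9), X, y, cv=5).mean()
print("separability %.3f -> %s" % (acc, "drift" if acc > 0.6 else "no drift"))
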
/resource/epid.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash.local:7077
7 |
8 | case "$1" in
9 |
10 | "cpQuaLocData")
11 | echo "args: data_file "
12 | cp $2 $PROJECT_HOME/bin/beymani/other/epid/$3/
13 | ls -l $PROJECT_HOME/bin/beymani/other/epid/$3/
14 | ;;
15 |
16 |
17 | "cpLocData")
18 | echo "args: test_data_file "
19 | cp $2 $PROJECT_HOME/bin/beymani/input/epid/$3/
20 | ls -l $PROJECT_HOME/bin/beymani/input/epid/$3/
21 | ;;
22 |
23 | "olPredOu")
24 | echo "running OutRangeBasedPredictor Spark job"
25 | CLASS_NAME=org.beymani.spark.misc.OutRangeBasedPredictor
26 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/outr/*
27 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/outr
28 | rm -rf ./output/epid/outr
29 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
30 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf
31 | echo "number of outliers"
32 | wc -l ./output/epid/outr/part-00000
33 | wc -l ./output/epid/outr/part-00001
34 | ;;
35 |
36 | "olPredIn")
37 | echo "running InRangeBasedPredictor Spark job"
38 | CLASS_NAME=org.beymani.spark.misc.InRangeBasedPredictor
39 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/inr/*
40 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/inr
41 | rm -rf ./output/epid/inr
42 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
43 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf
44 | echo "number of outliers"
45 | wc -l ./output/epid/inr/part-00000
46 | wc -l ./output/epid/inr/part-00001
47 | ;;
48 |
49 | *)
50 | echo "unknown operation $1"
51 | ;;
52 |
53 | esac
--------------------------------------------------------------------------------
/resource/sup_model_drift_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for concept drift detection of supervised machine learning models with the EDDM algorithm
2 |
3 |
4 | Setup
5 | =====
6 | Make sure you have the python/lib directory of the avenir project, with all its python files, as a peer
7 | directory of where codrift.py is, i.e. at ../lib. Copy sucodr.py from the beymani/python/lib
8 | directory to your lib directory
9 |
10 | Generate Data
11 | =============
12 | - Generate reference model prediction data
13 | ./codrift.py agen > er1.txt
14 | where
15 | nsamp = num of samples e.g. 2000
16 | er_rate = error rate e.g 0.1
17 |
18 | - Generate model prediction data with drift present
19 | ./codrift.py agen > er2.txt
20 | where
21 | trans = transition point for drift e.g 0.4 which means drift will appear after
22 | the first 40% of the data
23 | dr_er_rate = increased error rate after drift e.g 0.2
24 |
25 | Create reference statistics
26 | ===========================
27 | Make sure you have directory called model under the working directory
28 |
29 | Run
30 | ./codrift.py eddm er1.txt true
31 | where
32 | bootstrap_size = number of samples to be used for bootstrapping and creating the reference statistic, e.g. 600.
33 | It will detect drift for the remaining samples. In our case it won't, because er1.txt does not contain
34 | any error data with drift
35 |
36 | Detect drift
37 | ============
38 | ./codrift.py eddm er2.txt
39 | In our case drift will be detected, because about half way through the error data, the error rate doubles
40 | to simulate drift. For real production data, you may or may not find drift
41 |
42 | Ensemble and hierarchy of drift detectors
43 | =========================================
44 | There are aggregate functions in sucodr.py that can be used to implement an ensemble of detectors, e.g.
45 | LFR
46 |
47 |
--------------------------------------------------------------------------------
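
EDDM works on the spacing between consecutive classification errors: it tracks the running mean and standard deviation of that spacing and signals drift when the spacing shrinks well below its historical peak. A simplified standalone sketch of the idea; sucodr.py has the actual implementation, and the warmup size and beta value here are illustrative assumptions.

import numpy as np

def eddm(errors, warmup=600, beta=0.9):
    """errors: sequence of 0/1 model error indicators.
    Returns the index where drift is signalled, or None."""
    dists = []            # distances, in samples, between consecutive errors
    last_err = None
    peak = 0.0            # largest mean + 2*std of the distance seen so far
    for i, e in enumerate(errors):
        if e != 1:
            continue
        if last_err is not None:
            dists.append(i - last_err)
        last_err = i
        if len(dists) < 2:
            continue
        level = np.mean(dists) + 2.0 * np.std(dists)
        if i < warmup:                       # bootstrap phase: only track the peak
            peak = max(peak, level)
        elif peak > 0 and level / peak < beta:
            return i                         # error spacing collapsed -> drift
        else:
            peak = max(peak, level)
    return None
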
/resource/monitoring_order_processing_system_with_isolation_forest.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for monitoring an order processing system with isolation forest based anomaly detection.
2 | It uses log records generated by the order processing business workflow system.
3 |
4 | Environment
5 | ===========
6 | Paths etc shown here correspond to my environment. Please change them as needed for your
7 | environment by editing ecomm.sh
8 |
9 | Build
10 | =====
11 | Follow instructions in spark_dependency.txt
12 |
13 | Python dependency
14 | =================
15 | Before you run python scripts for data generation please do the following
16 | 1. checkout project avenir
17 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of ecomm.py file
18 |
19 | You could run ecomm.py from the python/app directory of beymani where it resides, or copy it
20 | somewhere else
21 |
22 | Generate order processing data
23 | ==============================
24 | ./ecomm.py ordProcessRecs > orpr.txt
25 | where
26 | num_orders = num of orders e.g 200
27 |
28 | Insert outliers
29 | ===============
30 | ./ecomm.py olOrdPr orpr.txt > rorpr.txt
31 | where
32 | ol_percent = outlier percentage e.g 10
33 |
34 | Run anomaly detector Spark job
35 | ==============================
36 | Set score.threshold in ecomm.conf to some reasonable value e.g 0.5
37 |
38 | Run Spark job
39 | ./ecomm.sh orpOlPred
40 |
41 | Get upper tail statistics of outlier scores
42 | ===========================================
43 | ./olss.py sttest ./output/ecom/orp 0 hist
44 |
45 | Run anomaly detector Spark job with new threshold value
46 | =======================================================
47 | Choose your threshold based on some confidence limit, e.g. 0.9, from the output of the last step. Use that
48 | value to set score.threshold in ecomm.conf
49 |
50 | Run Spark job again
51 | ./ecomm.sh orpOlPred
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
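
The Spark job does the real scoring; the sketch below only illustrates the isolation forest idea, where records that get isolated with fewer random splits receive higher anomaly scores. It uses scikit-learn, which is not a dependency of this project, and the file layout, column selection and threshold are assumptions.

import numpy as np
from sklearn.ensemble import IsolationForest

# assumed layout: an ID column followed by numeric workflow features
data = np.loadtxt("rorpr.txt", delimiter=",", usecols=range(1, 6))

forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
forest.fit(data)
scores = -forest.score_samples(data)   # flip sign so larger means more anomalous

threshold = 0.55                       # plays the role of score.threshold, on a different scale
for row, sc in zip(data, scores):
    if sc > threshold:
        print("outlier score %.3f for record %s" % (sc, row))
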
/resource/ticket.conf:
--------------------------------------------------------------------------------
1 | numericalAttrStats {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0,1]
5 | attr.ordinals = [3]
6 | seasonal.analysis = true
7 | part.bySeasonCycle = true
8 | seasonal.cycleType = ["hourOfDay"]
9 | time.fieldOrdinal = 2
10 | time.inMili = false
11 | min.sampleCount = 100
12 | output.precision = 3
13 | debug.on = true
14 | save.output = true
15 | }
16 |
17 | numericalAttrMedian {
18 | field.delim.in = ","
19 | field.delim.out = ","
20 | id.fieldOrdinals = [0]
21 | attr.ordinals = [4]
22 | seasonal.analysis = false
23 | operation.type = "mad"
24 | hdfs.file = false
25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt"
26 | seasonal.cycleType = ["hourOfDay"]
27 | time.fieldOrdinal = 1
28 | time.inMili = false
29 | output.precision = 6
30 | min.samplecount = 100
31 | debug.on = true
32 | save.output = true
33 | }
34 |
35 | statsBasedOutlierPredictor {
36 | field.delim.in = ","
37 | field.delim.out = ","
38 | predictor.strategy = "robustZscore"
39 | id.fieldOrdinals = [0]
40 | attr.ordinals = [4]
41 | score.threshold = 0.7
42 | exp.const = -1.0
43 | outlier.polarity = "all"
44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/stats.txt"
45 | mean.fldOrd = 4
46 | hdfs.file = false
47 | attr.weights = [1]
48 | attr.weightStrategy = "weightedAverage"
49 | robustZscore {
50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt"
51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/mad.txt"
52 | }
53 | seasonal.analysis = false
54 | seasonal.cycleType = ["hourOfDay"]
55 | time.fieldOrdinal = 1
56 | time.inMili = false
57 | output.precision = 3
58 | output.outliers = false
59 | rem.outliers = false
60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean"
61 | debug.on = true
62 | save.output = true
63 | }
64 |
--------------------------------------------------------------------------------
/resource/cycle_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for cycle detection in time series data using auto correlation. A set of
2 | candidate lags are provided. The lag with the highest correlation corresponds to a cycle.
3 |
4 |
5 | Environment
6 | ===========
7 | Paths etc shown here correspond to my environment. Please change them as needed for your
8 | environment
9 |
10 | Build
11 | =====
12 | Follow instructions in spark_dependency.txt
13 |
14 | Python dependency
15 | =================
16 | The shell script commands for data generation run python scripts. Before you run
17 | the data generation commands do the following
18 | 1. checkout project avenir
19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file
20 |
21 | Create input data
22 | =================
23 | ./and_spark.sh crInput
24 |
25 | where
26 | num_of_days = number of days e.g 15
27 | reading_interval = reading interval in sec e.g. 300
28 | num_servers = number of servers e.g. 4
29 | output_file = output file, we will use c.txt from now on
30 |
31 | Copy output to input path for NumericalAttrStats and TemporalAggregator spark jobs
32 |
33 | Run Spark job for stats
34 | =======================
35 | ./cyd.sh numStat
36 |
37 | Copy and consolidate stats file
38 | ===============================
39 | ./and_spark.sh crStatsFile
40 |
41 | Aggregate to hourly
42 | ===================
43 | If the sampling interval is in minutes or seconds, aggregate to an hourly average
44 | ./cyd.sh tempAggr
45 |
46 | Copy and consolidate aggregate output
47 | =====================================
48 | ./cyd.sh crAucInput
49 |
50 | Run Spark job for auto correlation
51 | ==================================
52 | ./cyd.sh autoCor
53 |
54 | Configuration
55 | =============
56 | Configuration is in cyd.conf. Make changes as necessary
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
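
The autoCorrelation Spark job computes the correlation of the aggregated series with itself at each candidate lag (coor.lags in cyd.conf); the lag with the highest correlation indicates the dominant cycle. A minimal numpy sketch of the same computation on a single hourly series; the input file name is an assumption.

import numpy as np

def autocorr(x, lag):
    """Autocorrelation of series x at the given lag, normalized by total variance."""
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    return np.dot(x[:-lag], x[lag:]) / np.dot(x, x)

series = np.loadtxt("hourly_usage.txt")      # hourly aggregated values for one server
for lag in (24, 48, 168):                    # candidate lags, e.g. daily and weekly cycles
    print("lag %4d  autocorrelation %.3f" % (lag, autocorr(series, lag)))
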
/resource/proximity_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial provides details of finding outliers based on average distance to neighbors.
2 | It uses two MR jobs, SameTypeSimilarity and AverageDistance. If you want to use credit card
3 | transactions as input, you could use cct.rb to generate data. Make sure that util.rb is in the path
4 | ../lib. util.rb can be checked out from my project visitante. It's under the script/ruby/lib directory
5 | in that project.
6 |
7 | Transaction Similarity
8 | ======================
9 | Here is the script for SameTypeSimilarity
10 |
11 | JAR_NAME=/home/pranab/Projects/sifarish/target/sifarish-1.0.jar
12 | CLASS_NAME=org.sifarish.feature.SameTypeSimilarity
13 |
14 | echo "running mr"
15 | IN_PATH=/user/pranab/cct/input
16 | OUT_PATH=/user/pranab/cct/simi
17 | echo "input $IN_PATH output $OUT_PATH"
18 | hadoop fs -rmr $OUT_PATH
19 | echo "removed output dir"
20 |
21 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
22 |
23 | Average Distance to Neighbors
24 | =============================
25 | Here is a sample script for AverageDistance
26 |
27 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
28 | CLASS_NAME=org.beymani.proximity.AverageDistance
29 |
30 | echo "running mr"
31 | IN_PATH=/user/pranab/cct/simi
32 | OUT_PATH=/user/pranab/cct/avdi
33 | echo "input $IN_PATH output $OUT_PATH"
34 | hadoop fs -rmr $OUT_PATH
35 | echo "removed output dir"
36 |
37 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
38 |
39 | Configuration
40 | =============
41 | Here is a sample cct.properties
42 |
43 | field.delim.regex=,
44 | field.delim=,
45 | num.reducer=1
46 | sts.bucket.count=1000
47 | sts.same.schema.file.path=/pranab/meta/prod/prod.json
48 | avd.top.match.count=10
49 | avd.top.match.average=true
50 | avd.top.match.density=false
51 | avd.top.match.grouping=false
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
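
The two MR jobs compute pairwise similarities and then the average distance of each record to its closest matches. A minimal in-memory sketch of the same idea; the file name, the numeric-only feature layout and the use of scikit-learn are assumptions made for illustration.

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.loadtxt("xactions.txt", delimiter=",")     # numeric transaction features

k = 10                                            # analogous to avd.top.match.count
nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
dist, _ = nn.kneighbors(X)                        # first neighbor is the point itself
avg_dist = dist[:, 1:].mean(axis=1)

# records with the largest average distance to their neighbors are the
# outlier candidates
for idx in np.argsort(avg_dist)[-5:][::-1]:
    print("record %d  average neighbor distance %.3f" % (idx, avg_dist[idx]))
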
/resource/autoencoder_based_cust_svc_case_anomaly_detection.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for anomaly detection of service time for an issue processing system, using an
2 | autoencoder.
3 |
4 |
5 | Environment
6 | ===========
7 | Paths etc shown here correspond to my environment. Please change them as needed for your
8 | environment
9 |
10 |
11 | Python dependency
12 | =================
13 | The shell script commands for data generation run python scripts. Before you run
14 | the data generation commands do the following
15 | 1. checkout project avenir
16 | 2. copy the directories avenir/python/lib, avenir/python/mlextra and avenir/python/unsup to ../lib,
17 | ../mlextra and ../unsup with respect to the location of your cpu_usage.py file
18 |
19 |
20 | Create normal data for modeling
21 | ===============================
22 | ./ticket.py genx > cus_tr.txt
23 |
24 | where
25 | num_issues = number of issues e.g 2000
26 |
27 |
28 | Create test data
29 | ================
30 | ./ticket.py genx > cus.txt
31 | where
32 | num_issues = number of issues e.g 200
33 |
34 | Insert outliers
35 | ./ticket.py iolx cus.txt > cus_te.txt
36 |
37 | where
--------------------------------------------------------------------------------
/resource/machinary_fault_detection_with_subsequence_anomaly_tutorial.txt:
--------------------------------------------------------------------------------
23 |  > v.txt
24 |
25 | where
26 | num_secs = num of secs in the past for which vibration data is generated e.g 7
27 |
28 | -Split into reference and prediction data
29 | split -l10000 v.txt
30 | mv xaa vib_ref.txt
31 |
32 | -Insert outliers in prediction or test data
33 | ./bvib.py iol xab > vib_pred.txt
34 | failure_onset_time = time from the beginning of the test data where the outlier is inserted. The outlier is
35 | in the form of 2 high frequency components
36 |
37 | -You could plot the data around where outliers were introduced as follows
38 | ./bvib.py iplot vib_pred.txt K87JG9F6 900 1100
39 |
40 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data
41 |
42 | -Copy reference and prediction data
43 | cp vib_ref.txt ./other/vib/
44 | cp vib_pred.txt ./input/vib/
45 |
46 |
47 | Run Spark Job
48 | =============
49 | Run
50 | ./vib.sh olPred
51 |
52 | Plot outlier data
53 | =================
54 | ./bvib.py oplot ./output/vib/part-00000 K87JG9F6 900 1100
55 |
56 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
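
The SubSequenceDistanceDetector scores each window of the test series by its distance to the reference subsequences and flags windows whose score exceeds score.threshold. A rough numpy sketch of that idea; the file names, the single value column and the normalization are assumptions, not the Spark job's exact implementation.

import numpy as np

ref = np.loadtxt("vib_ref_values.txt")    # reference vibration values, one machine
test = np.loadtxt("vib_pred_values.txt")  # test vibration values, same machine

w = 40                                    # window.size in vib.conf
ref_wins = np.array([ref[i:i + w] for i in range(len(ref) - w + 1)])

for i in range(len(test) - w + 1):
    win = test[i:i + w]
    # distance to the closest reference subsequence, normalized by window size
    d = np.sqrt(((ref_wins - win) ** 2).sum(axis=1)).min() / w
    if d > 0.2:                           # score.threshold in vib.conf
        print("outlier subsequence starting at offset %d  score %.3f" % (i, d))
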
/src/main/java/org/beymani/util/SeequenceScoreAggregator.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.util;
19 |
20 | import java.util.ArrayList;
21 | import java.util.List;
22 |
23 | /**
24 | * Manages outlier scores for data points in a sequence. A data point may belong to
25 |  * multiple sequences and hence may have multiple outlier scores
26 | * @author pranab
27 | *
28 | */
29 | public class SeequenceScoreAggregator implements java.io.Serializable {
30 | private static final long serialVersionUID = 2181114339589177954L;
31 | 	private List<Double> scores = new ArrayList<Double>();
32 | private int windowSize;
33 |
34 |
35 | /**
36 | * @param windowSize
37 | */
38 | public SeequenceScoreAggregator(int windowSize) {
39 | super();
40 | this.windowSize = windowSize;
41 | }
42 |
43 |
44 | /**
45 | * @param seq
46 | * @param score
47 | */
48 | public void add(double score ) {
49 | scores.add(score);
50 | if (scores.size() > windowSize) {
51 | //set score to max of current and new score
52 | for (int i = scores.size() - windowSize; i < scores.size(); ++i) {
53 | double thisSeqScore = scores.get(i);
54 | if (thisSeqScore < score) {
55 | scores.set(i, score);
56 | }
57 | }
58 | }
59 | }
60 |
61 | /**
62 | * @return
63 | */
64 | 	public List<Double> getScores() {
65 | return scores;
66 | }
67 |
68 | }
69 |
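For reference, the windowed max score aggregation implemented above can be sketched in a few lines of Python; this is only an illustration of the algorithm, not part of the library:

def aggregate_scores(scores, window_size):
    # mirrors SeequenceScoreAggregator: each new sequence score raises the
    # scores of the points in the trailing window if it is higher
    agg = []
    for score in scores:
        agg.append(score)
        if len(agg) > window_size:
            for i in range(len(agg) - window_size, len(agg)):
                if agg[i] < score:
                    agg[i] = score
    return agg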
--------------------------------------------------------------------------------
/resource/salean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
15 | "numStat")
16 | echo "running NumericalAttrStats Spark job"
17 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats
18 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/*
19 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/stat
20 | 	rm -rf ./output/san/stat
21 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
22 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf
23 | ;;
24 |
25 | "numMstat")
26 | echo "running NumericalAttrMedian Spark job"
27 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian
28 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/*
29 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/mstat
30 | rm -rf ./output/san/mstat
31 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
32 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf
33 | ;;
34 |
35 | "cpMed")
36 | echo "copying median files"
37 | MED_FILES=$PROJECT_HOME/bin/beymani/output/san/mstat/*
38 | META_DIR=$PROJECT_HOME/bin/beymani/meta/san
39 | cp /dev/null $META_DIR/$2
40 | for f in $MED_FILES
41 | do
42 | echo "Copying file $f ..."
43 | cat $f >> $META_DIR/$2
44 | done
45 | ls -l $META_DIR
46 | ;;
47 |
48 | "olPred")
49 | echo "running StatsBasedOutlierPredictor Spark job"
50 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor
51 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/*
52 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/olp
53 | rm -rf ./output/san/olp
54 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
55 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT salean.conf
56 | echo "number of outliers"
57 | 	wc -l ./output/san/olp/part-00000
58 | 	wc -l ./output/san/olp/part-00001
59 | 	;;
60 | 
61 | 	*)
62 | 		echo "unknown operation $1"
63 | 	;;
64 | 
65 | esac
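The olPred operation uses the robustZscore strategy configured in salean.conf, which scores each value by its distance from the seasonal median scaled by the MAD. A minimal Python sketch of that score (the 1.4826 constant and the threshold below are illustrative assumptions):

import statistics

def robust_zscores(values):
    # robust z score: distance from the median scaled by the MAD
    med = statistics.median(values)
    mad = statistics.median([abs(v - med) for v in values])
    return [abs(v - med) / (1.4826 * mad) for v in values]

scores = robust_zscores([10.2, 9.8, 10.5, 35.0, 10.1])
outliers = [s > 3.0 for s in scores]    # 35.0 stands out as the outlier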
--------------------------------------------------------------------------------
/resource/issue_service_time_anomaly_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for anomaly detection of service time in issue processing system data using
2 | statistical modeling. To be more specific, we will be using a z score based technique
3 |
4 |
5 | Environment
6 | ===========
7 | Paths etc. shown here correspond to my environment. Please change them as needed for your
8 | environment
9 |
10 | Build
11 | =====
12 | Follow instructions in spark_dependency.txt
13 |
14 | Python dependency
15 | =================
16 | The shell script commands for data generation run Python scripts. Before you run
17 | the data generation commands do the following
18 | 1. check out the avenir project
19 | 2. copy the avenir/python/lib directory to ../lib relative to the location of the cpu_usage.py file
20 |
21 |
22 | Create normal data for modeling
23 | ===============================
24 | ./ticket.py gen > tick_tr.txt
25 |
26 | where
27 | num_issues = number of issues e.g 2000
28 |
29 | Copy modeling data
30 | ./ticket.sh loadInp tick_tr.txt train
31 |
32 | Create test data
33 | ================
34 | ./ticket.py gen > tick.txt
35 | where
36 | num_issues = number of issues e.g 200
37 |
38 | insert outliers
39 | ./ticket.py iol tick.txt > tick_pred.txt
40 |
41 | where
42 | > cps.txt
22 |
23 | num_days = num of days in past for which sales data is generated
24 |
25 | Generate distribution for CVM two sample statistic
26 | ===================================================
27 | We use Monte Carlo simulation to generate the distribution. When run, it will output the upper tail
28 | statistic to the console. Save the output somewhere; you will need it to configure the Spark job
29 |
30 | Checkout the project avenir. In the python/app directory run the following
31 |
32 | ./tsstat.py cvm
33 | num_iter = num of iterations for the simulator e.g 2000
34 | num_samp = num of samples for generated samples, which should be half the window size (the parameter
35 | window.size in cpsale.conf). I have set this parameter to 200. So num_samp should be 100
36 |
37 | You could skip this step if you use the values already set for the parameter stat.critValue
38 |
39 | Copy input to Spark directory
40 | =============================
41 | ./cpsale.sh cpInp cps.txt
42 |
43 | Run Spark Job
44 | =============
45 | Choose an upper critical value for a confidence interval anywhere between .95 and .99 from the
46 | output of the MC simulator we ran earlier. Set the parameter stat.critValue in cpsale.conf
47 |
48 | Run
49 | ./cpsale.sh cpPred
50 |
51 | Plot sales data and change points
52 | =================================
53 | ./cpsale.py plot cps.txt DK75HUI45X ./output/cpsale/part-00000
54 |
55 | DK75HUI45X is the ID of the product that has a change point in its sales data
56 |
57 |
58 |
59 |
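For reference, the CVM two sample statistic used by the change point detector compares the empirical distributions of the two halves of the sliding window. A minimal Python sketch of the statistic (the windowing and normalization in the Spark job may differ):

import numpy as np

def cvm_two_sample(x, y):
    # Cramer-von Mises two sample statistic from the empirical CDFs
    x, y = np.sort(np.asarray(x, float)), np.sort(np.asarray(y, float))
    n, m = len(x), len(y)
    z = np.concatenate([x, y])
    fx = np.searchsorted(x, z, side="right") / n
    fy = np.searchsorted(y, z, side="right") / m
    return n * m / float(n + m) ** 2 * np.sum((fx - fy) ** 2)

A change point is flagged when the statistic for the two window halves exceeds the critical value (stat.critValue) obtained from the MC simulation.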
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/PredictorSpout.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 |
19 | package org.beymani.predictor;
20 |
21 | import java.util.Map;
22 |
23 | import org.chombo.storm.MessageQueue;
24 |
25 | import backtype.storm.spout.SpoutOutputCollector;
26 | import backtype.storm.task.TopologyContext;
27 | import backtype.storm.topology.OutputFieldsDeclarer;
28 | import backtype.storm.topology.base.BaseRichSpout;
29 | import backtype.storm.tuple.Fields;
30 | import backtype.storm.tuple.Values;
31 |
32 | /**
33 | * @author pranab
34 | *
35 | */
36 | public class PredictorSpout extends BaseRichSpout {
37 | private SpoutOutputCollector collector;
38 | private Map conf;
39 | private String messageQueue;
40 | private MessageQueue msgQueue;
41 | private static final String NIL = "nil";
42 |
43 | @Override
44 | public void open(Map conf, TopologyContext context,
45 | SpoutOutputCollector collector) {
46 | this.collector = collector;
47 | this.conf = conf;
48 | messageQueue = conf.get("redis.input.queue").toString();
49 | msgQueue = MessageQueue.createMessageQueue(conf, messageQueue);
50 | }
51 |
52 | @Override
53 | public void nextTuple() {
54 | String message = msgQueue.receive();
55 | if(null != message && !message.equals(NIL)) {
56 | int pos = message.indexOf(",");
57 | String entityID = message.substring(0, pos);
58 | String recordData = message.substring(pos+1);
59 | collector.emit(new Values(entityID, recordData));
60 | }
61 |
62 | }
63 |
64 | @Override
65 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
66 | declarer.declare(new Fields("entityID", "recordData"));
67 | }
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/util/DataStream.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.util;
19 |
20 | import java.io.Serializable;
21 | import java.util.List;
22 |
23 | import org.codehaus.jackson.annotate.JsonIgnoreProperties;
24 |
25 | /**
26 | * @author pranab
27 | *
28 | */
29 | @JsonIgnoreProperties(ignoreUnknown = true)
30 | public class DataStream implements Serializable{
31 | private String id;
32 | private String type;
33 | private String parentId;
34 | private String parentType;
35 | private List childrenId;
36 | private boolean singleton;
37 |
38 | /**
39 | *
40 | */
41 | public DataStream() {
42 | }
43 |
44 | /**
45 | * @return
46 | */
47 | public String getId() {
48 | return id;
49 | }
50 |
51 | public void setId(String id) {
52 | this.id = id;
53 | }
54 |
55 | public String getType() {
56 | return type;
57 | }
58 |
59 | public void setType(String type) {
60 | this.type = type;
61 | }
62 |
63 | public String getParentId() {
64 | return parentId;
65 | }
66 |
67 | public void setParentId(String parentId) {
68 | this.parentId = parentId;
69 | }
70 |
71 | public String getParentType() {
72 | return parentType;
73 | }
74 |
75 | public void setParentType(String parentType) {
76 | this.parentType = parentType;
77 | }
78 |
79 | public List getChildrenId() {
80 | return childrenId;
81 | }
82 |
83 | public void setChildrenId(List childrenId) {
84 | this.childrenId = childrenId;
85 | }
86 |
87 | public boolean isSingleton() {
88 | return singleton;
89 | }
90 |
91 | public void setSingleton(boolean singleton) {
92 | this.singleton = singleton;
93 | }
94 |
95 | }
96 |
--------------------------------------------------------------------------------
/resource/cyd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
10 | "numStat")
11 | echo "running NumericalAttrStats Spark job"
12 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats
13 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt
14 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/mea
15 | rm -rf ./output/mea
16 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
17 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf
18 | ;;
19 |
20 | "crStatsFile")
21 | echo "copying and consolidating stats file"
22 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00000 > $PROJECT_HOME/bin/beymani/other/auc/stats.txt
23 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00001 >> $PROJECT_HOME/bin/beymani/other/auc/stats.txt
24 | ls -l $PROJECT_HOME/bin/beymani/other/auc
25 | ;;
26 |
27 | "tempAggr")
28 | echo "running TemporalAggregator Spark job"
29 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator
30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt
31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg
32 | rm -rf ./output/teg
33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf
35 | ;;
36 |
37 | "crAucInput")
38 | 	echo "copying and consolidating temporal aggregation output file"
39 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00000 > $PROJECT_HOME/bin/beymani/input/auc/cusage.txt
40 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00001 >> $PROJECT_HOME/bin/beymani/input/auc/cusage.txt
41 | ls -l $PROJECT_HOME/bin/beymani/input/auc
42 | ;;
43 |
44 | "autoCor")
45 | echo "running AutoCorrelation Spark job"
46 | CLASS_NAME=org.chombo.spark.explore.AutoCorrelation
47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/auc/cusage.txt
48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/auc
49 | rm -rf ./output/auc
50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf
52 | ;;
53 |
54 | *)
55 | echo "unknown operation $1"
56 | ;;
57 |
58 | esac
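The autoCor operation detects cycles by computing the autocorrelation of the hourly aggregated series at the configured lags (24 and 168 hours in and.conf, i.e. daily and weekly cycles). A minimal Python sketch of lag autocorrelation, shown here on a synthetic series for illustration only:

import numpy as np

def autocorrelation(series, lag):
    x = np.asarray(series, dtype=float) - np.mean(series)
    return float(np.sum(x[:-lag] * x[lag:]) / np.sum(x * x))

hours = np.arange(24 * 14)                            # two weeks of hourly data
series = np.sin(2 * np.pi * hours / 24) + np.random.normal(0, 0.2, len(hours))
print(autocorrelation(series, 24), autocorrelation(series, 168))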
--------------------------------------------------------------------------------
/resource/health_monitoring_data_anomaly_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for anomaly detection in health monitoring data. Sequence anomaly is detected
2 | with a Markov chain model.
3 |
4 | Environment
5 | ===========
6 | Paths etc. shown here correspond to my environment. Please change them as needed for your
7 | environment. The script bsm.sh is for running Spark jobs and various other tasks. The configuration
8 | is in bsm.conf
9 |
10 | Build
11 | =====
12 | Follow instructions in spark_dependency.txt
13 |
14 | Python dependency
15 | =================
16 | The shell script commands for data generation run Python scripts. Before you run
17 | the data generation commands do the following
18 | 1. check out the avenir project
19 | 2. copy the avenir/python/lib directory to ../lib relative to the location of the cpu_usage.py file
20 |
21 | Create device reading mean and std dev
22 | ======================================
23 | ./bls.py stat > dstat.txt
24 |
25 | num_dev = number of devices e.g 200
26 |
27 | Create training data
28 | ====================
29 | ./bls.py gen dstat.txt normal >
30 | where
31 | num_days = num of days for which data should be generated (e.g 300)
32 | train_data_file = training data file
33 |
34 | Copy to the spark input directory.
35 | cp ./input/bsm/train
36 |
37 | Copy meta data file
38 | ====================
39 | cp bsm.json ./meta
40 |
41 | Discretize training data
42 | ========================
43 | Run the discretization Spark job
44 | ./bsm.sh transformTrain
45 |
46 | Discretization step is set to 5 in bsm.conf
47 |
48 | Build model
49 | ===========
50 | Run Spark job
51 | ./bsm.sh stateTrans
52 |
53 | Consolidate model files
54 | =======================
55 | Copy all Spark generated files into one
56 | ./bsm.sh cpModel
57 |
58 | Create test data
59 | ================
60 | Create test data with outliers
61 | ./bls.py gen dstat.txt anomaly >
62 | num_days = num of days for which data should be generated (e.g 30)
63 | test_data_file = test data file name
64 |
65 | Copy file
66 | cp ./input/bsm/pred
67 |
68 | Discretize test data
69 | ====================
70 | Run the discretization Spark job
71 | ./bsm.sh transformPred
72 |
73 | Anomaly prediction Spark job
74 | ============================
75 | ./bsm.sh olPredict
76 |
77 |
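The olPredict step scores each window of discretized states by how improbable its state transitions are under the learned Markov chain. A minimal Python sketch of that idea (the exact metric used by MarkovChainPredictor, e.g. miss probability, may differ; states and probabilities below are made up):

import math

def sequence_score(states, trans_prob, floor=1e-4):
    # average negative log probability of the observed transitions; higher means more anomalous
    logp = 0.0
    for prev, cur in zip(states, states[1:]):
        logp += math.log(trans_prob.get((prev, cur), floor))
    return -logp / (len(states) - 1)

trans = {("L", "L"): 0.8, ("L", "H"): 0.2, ("H", "L"): 0.5, ("H", "H"): 0.5}
print(sequence_score(["L", "L", "H", "L", "H", "H"], trans))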
--------------------------------------------------------------------------------
/python/app/wsbot.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | # Package imports
19 | import os
20 | import sys
21 | import random
22 | import statistics
23 | import matplotlib.pyplot as plt
24 | sys.path.append(os.path.abspath("../lib"))
25 | sys.path.append(os.path.abspath("../mlextra"))
26 | from util import *
27 | from sampler import *
28 |
29 | """
30 | data generation for web session
31 | """
32 | if __name__ == "__main__":
33 | op = sys.argv[1]
34 | if op == "gen":
35 | numSamp = int(sys.argv[2])
36 | if len(sys.argv) == 4:
37 | percenNormal = int(sys.argv[3])
38 | else:
39 | percenNormal = -1
40 |
41 | hrOfDay = [NormalSampler(14,3), UniformNumericSampler(0,23)]
42 | numPage = [NormalSampler(12,2.5), NormalSampler(50,5)]
43 | pageDurAv = [NormalSampler(60, 15), NormalSampler(1,.1)]
44 | prRevFrac = [NormalSampler(.5,.1), NormalSampler(.9,.05)]
45 | shopCart = [BernoulliTrialSampler(.6), BernoulliTrialSampler(.2)]
46 | checkout = [BernoulliTrialSampler(.4), BernoulliTrialSampler(0)]
47 | logOut = [BernoulliTrialSampler(.8), BernoulliTrialSampler(.95)]
48 |
49 | idLists = [genIdList(100, 12), genIdList(80, 12)]
50 |
51 | for _ in range(numSamp):
52 | if percenNormal > 0:
53 | if isEventSampled(percenNormal):
54 | di = 0
55 | else:
56 | di = 1
57 | else:
58 | di = 0
59 | uid = selectRandomFromList(idLists[di])
60 | hd = int(hrOfDay[di].sample())
61 | nup = int(numPage[di].sample())
62 | pdu = pageDurAv[di].sample()
63 | prev = prRevFrac[di].sample()
64 | sc = toIntFromBoolean(shopCart[di].sample())
65 | co = toIntFromBoolean(checkout[di].sample())
66 | if di == 1:
67 | co = 0
68 | lo = toIntFromBoolean(logOut[di].sample())
69 |
70 | print("{},{},{},{:.3f},{:.3f},{},{},{}".format(uid,hd,nup,pdu,prev,sc,co,lo))
71 |
72 |
--------------------------------------------------------------------------------
/resource/and.conf:
--------------------------------------------------------------------------------
1 | numericalAttrStats {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0]
5 | attr.ordinals = [3]
6 | seasonal.analysis = true
7 | part.bySeasonCycle = true
8 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"]
9 | time.fieldOrdinal = 1
10 | time.inMili = false
11 | min.sampleCount = 10
12 | output.precision = 3
13 | debug.on = true
14 | save.output = true
15 | }
16 |
17 |
18 | statsBasedOutlierPredictor {
19 | field.delim.in = ","
20 | field.delim.out = ","
21 | predictor.strategy = "zscore"
22 | id.fieldOrdinals = [0]
23 | attr.ordinals = [3]
24 | score.threshold = 3.30
25 | score.thresholdNorm = 0.90
26 | exp.const = -1.0
27 | outlier.polarity = "high"
28 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt"
29 | mean.fldOrd = 4
30 | hdfs.file = false
31 | attr.weights = [1.0]
32 | attr.weightStrategy = "weightedAverage"
33 | zscore {
34 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt"
35 | }
36 | seasonal.analysis = true
37 | part.bySeasonCycle = true
38 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"]
39 | time.fieldOrdinal = 1
40 | time.inMili = false
41 | output.precision = 3
42 | output.outliers = false
43 | rem.outliers = false
44 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean"
45 | debug.on = true
46 | save.output = true
47 | }
48 |
49 | thresholdLearner {
50 | field.delim.in = ","
51 | field.delim.out = ","
52 | score.fldOrd = 4
53 | cls.fldOrd = 7
54 | split.points = [0.925, 0.930, 0.935, 0.940, 0.945, 0.950, 0.955, 0.960, 0.965, 0.970, 0.975]
55 | pos.clsLabel = "T"
56 | splitting.algo = "entropy"
57 | debug.on = true
58 | save.output = true
59 | }
60 |
61 | temporalAggregator {
62 | field.delim.in = ","
63 | field.delim.out = ","
64 | attr.ordinals = [2]
65 | id.fieldOrdinals = [0]
66 | time.fieldOrdinal = 1
67 | time.inMili = false
68 | aggr.windowTimeUnit = "hour"
69 | aggr.windowTimeLength = 1
70 | aggr.type = "average"
71 | output.compact = true
72 | output.precision = 3
73 | debug.on = true
74 | save.output = true
75 | }
76 |
77 | autoCorrelation {
78 | field.delim.in = ","
79 | field.delim.out = ","
80 | attr.ordinals = [2]
81 | id.fieldOrdinals = [0]
82 | seq.fieldOrdinal = 1
83 | output.precision = 3
84 | coor.lags = [24, 168]
85 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt"
86 | mean.fieldOrd = 5
87 | debug.on = true
88 | save.output = true
89 | }
90 |
91 |
92 |
--------------------------------------------------------------------------------
/resource/salean.conf:
--------------------------------------------------------------------------------
1 | timeIntervalGenerator {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0]
5 | time.fieldOrdinal = 1
6 | time.keepField = true
7 | debug.on = true
8 | save.output = true
9 | }
10 |
11 | numericalAttrStats {
12 | field.delim.in = ","
13 | field.delim.out = ","
14 | id.fieldOrdinals = [0]
15 | attr.ordinals = [2,3]
16 | seasonal.analysis = true
17 | part.bySeasonCycle = true
18 | seasonal.cycleType = ["nightDayHourOfDay"]
19 | time.fieldOrdinal = 1
20 | time.inMili = false
21 | min.sampleCount = 200
22 | output.precision = 3
23 | debug.on = true
24 | save.output = true
25 | }
26 |
27 | numericalAttrMedian {
28 | field.delim.in = ","
29 | field.delim.out = ","
30 | id.fieldOrdinals = [0]
31 | attr.ordinals = [2,3]
32 | seasonal.analysis = true
33 | operation.type = "med"
34 | med.file.path = ""
35 | hdfs.file = false
36 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt"
37 | seasonal.cycleType = ["nightDayHourOfDay"]
38 | time.fieldOrdinal = 1
39 | time.inMili = false
40 | output.precision = 6
41 | min.samplecount = 200
42 | debug.on = true
43 | save.output = true
44 | }
45 |
46 | filter {
47 | field.delim.in = ","
48 | field.delim.out = ","
49 | id.fieldOrdinals = [0]
50 | selection.filter = ""
51 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt"
52 | schema.file.path = "/Users/pranab/Projects/bin/beymani/meta/sales.conf"
53 | debug.on = true
54 | save.output = true
55 | }
56 |
57 | statsBasedOutlierPredictor {
58 | field.delim.in = ","
59 | field.delim.out = ","
60 | predictor.strategy = "robustZscore"
61 | id.fieldOrdinals = [0]
62 | attr.ordinals = [2,3]
63 | score.threshold = 0.95
64 | score.thresholdNorm = 0.90
65 | outlier.polarity = "all"
66 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt"
67 | mean.fldOrd = 4
68 | hdfs.file = false
69 | attr.weights = [0.4, 0.6]
70 | attr.weightStrategy = "weightedAverage"
71 | robustZscore {
72 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt"
73 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/mad.txt"
74 | }
75 | seasonal.analysis = true
76 | seasonal.cycleType = ["nightDayHourOfDay"]
77 | time.fieldOrdinal = 1
78 | time.inMili = false
79 | output.precision = 3
80 | output.outliers = false
81 | rem.outliers = false
82 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean"
83 | debug.on = true
84 | save.output = true
85 | }
86 |
--------------------------------------------------------------------------------
/resource/ticket.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
10 | "loadInp")
11 | rm $PROJECT_HOME/bin/beymani/input/ticket/$3/*
12 | cp $2 $PROJECT_HOME/bin/beymani/input/ticket/$3/
13 | ls -l $PROJECT_HOME/bin/beymani/input/ticket/$3/
14 | ;;
15 |
16 |
17 | "numStat")
18 | echo "running NumericalAttrStats Spark job"
19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats
20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/*
21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/stat
22 | rm -rf ./output/ticket/stat
23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf
25 | ;;
26 |
27 | "numMstat")
28 | echo "running NumericalAttrMedian Spark job"
29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian
30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/*
31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/mstat
32 | rm -rf ./output/ticket/mstat
33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf
35 | rm ./output/ticket/mstat/_SUCCESS
36 | ls -l ./output/ticket/mstat
37 | ;;
38 |
39 | "bkMod")
40 | echo "backing up model files"
41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ticket/mstat/*
42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ticket
43 | META_FILE=$META_DIR/$2
44 | echo "copying to $META_FILE"
45 | cp /dev/null $META_FILE
46 | for f in $MED_FILES
47 | do
48 | echo "Copying file $f ..."
49 | cat $f >> $META_FILE
50 | done
51 | ls -l $META_FILE
52 | ;;
53 |
54 | "olPred")
55 | echo "running StatsBasedOutlierPredictor Spark job"
56 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor
57 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/pred/*
58 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/olp
59 | rm -rf ./output/ticket/olp
60 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
61 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ticket.conf
62 | rm ./output/ticket/olp/_SUCCESS
63 | ls -l ./output/ticket/olp
64 | 	cat ./output/ticket/olp/part-00000 | grep ,O
65 | ;;
66 |
67 | *)
68 | echo "unknown operation $1"
69 | ;;
70 |
71 | esac
72 |
--------------------------------------------------------------------------------
/resource/ecomm.conf:
--------------------------------------------------------------------------------
1 | numericalAttrStats {
2 | field.delim.in = ","
3 | field.delim.out = ","
4 | id.fieldOrdinals = [0,1]
5 | attr.ordinals = [3]
6 | seasonal.analysis = true
7 | part.bySeasonCycle = true
8 | seasonal.cycleType = ["hourOfDay"]
9 | time.fieldOrdinal = 2
10 | time.inMili = false
11 | min.sampleCount = 100
12 | output.precision = 3
13 | debug.on = true
14 | save.output = true
15 | }
16 |
17 | numericalAttrMedian {
18 | field.delim.in = ","
19 | field.delim.out = ","
20 | id.fieldOrdinals = [0,1]
21 | attr.ordinals = [3]
22 | seasonal.analysis = false
23 | operation.type = "mad"
24 | hdfs.file = false
25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt"
26 | seasonal.cycleType = ["hourOfDay"]
27 | time.fieldOrdinal = 2
28 | time.inMili = false
29 | output.precision = 6
30 | min.samplecount = 100
31 | debug.on = true
32 | save.output = true
33 | }
34 |
35 | statsBasedOutlierPredictor {
36 | field.delim.in = ","
37 | field.delim.out = ","
38 | predictor.strategy = "robustZscore"
39 | id.fieldOrdinals = [0,1]
40 | attr.ordinals = [3]
41 | score.threshold = 0.7
42 | exp.const = 1.5
43 | outlier.polarity = "all"
44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/stats.txt"
45 | mean.fldOrd = 4
46 | hdfs.file = false
47 | attr.weights = [1]
48 | attr.weightStrategy = "weightedAverage"
49 | robustZscore {
50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt"
51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/mad.txt"
52 | }
53 | seasonal.analysis = false
54 | seasonal.cycleType = ["hourOfDay"]
55 | time.fieldOrdinal = 2
56 | time.inMili = false
57 | output.precision = 3
58 | output.outliers = false
59 | rem.outliers = false
60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean"
61 | debug.on = true
62 | save.output = true
63 | }
64 |
65 | outlierAggregator {
66 | field.delim.in = ","
67 | field.delim.out = ","
68 | type.field.ordinal = 0
69 | id.field.ordinal = 1
70 | seq.field.ordinal = 2
71 | quant.field.ordinal = 3
72 | stream.schmaFilePath = "/Users/pranab/Projects/bin/beymani/meta/ecom/ecommDataStream.json"
73 | output.precision = 3
74 | debug.on = true
75 | save.output = true
76 | }
77 |
78 | isolationForestModel {
79 | field.delim.in = ","
80 | field.delim.out = ","
81 | attr.ordinals = [1,3,4,5,7]
82 | score.threshold = .450
83 | num.tree = 64
84 | subsample.size = 256
85 | max.depth = 10
86 | rec.count = 1788
87 | output.precision = 3
88 | debug.on = true
89 | save.output = true
90 | }
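The isolationForestModel parameters above map to the usual isolation forest hyperparameters (number of trees, subsample size, maximum depth, score threshold). For a rough sense of the technique, here is a minimal sketch with scikit-learn; this is an assumption for illustration only, since the actual job uses beymani's own Spark implementation:

import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.normal(0, 1, size=(1788, 5))            # stand-in for the selected attributes
model = IsolationForest(n_estimators=64, max_samples=256, random_state=42).fit(X)
scores = -model.score_samples(X)                       # higher means more anomalous
outliers = scores > 0.45                               # analogous to score.threshold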
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/EntropyIncreaseBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.util.Map;
21 |
22 | /**
23 | * Predict outlier based on increase of entropy resulting from including outlier point
24 | * @author pranab
25 | *
26 | */
27 | public class EntropyIncreaseBasedPredictor extends DistributionBasedPredictor {
28 | private double entropy;
29 | private double baseConvConst = Math.log(2);
30 | private String subFieldDelim = ":";
31 |
32 | public EntropyIncreaseBasedPredictor(Map conf) {
33 | super(conf);
34 |
35 | //entropy
36 | entropy = 0;
37 | for (String bucketKey : distrModel.keySet()) {
38 | double pr = ((double)distrModel.get(bucketKey)) / totalCount;
39 | entropy += -pr * Math.log(pr) / baseConvConst;
40 | }
41 | }
42 |
43 | @Override
44 | public double execute(String entityID, String record) {
45 | double score = 0;
46 | String thisBucketKey = getBucketKey(record);
47 |
48 | //new entropy
49 | double newEntropy = 0;
50 | int newTotalCount = totalCount + 1;
51 | boolean bucketFound = false;
52 | double pr = 0;
53 | for (String bucketKey : distrModel.keySet()) {
54 | if (bucketKey.equals(thisBucketKey)) {
55 | pr = ((double)distrModel.get(bucketKey) + 1) / newTotalCount;
56 | bucketFound = true;
57 | } else {
58 | pr = ((double)distrModel.get(bucketKey)) / newTotalCount;
59 | }
60 | newEntropy += -pr * Math.log(pr) / baseConvConst;
61 | }
62 |
63 | if (!bucketFound) {
64 | pr = 1.0 / newTotalCount;
65 | newEntropy += -pr * Math.log(pr) / baseConvConst;
66 | }
67 |
68 | if (newEntropy > entropy) {
69 | score = (newEntropy - entropy) / entropy;
70 | }
71 |
72 | if (score > scoreThreshold) {
73 | //write if above threshold
74 | outQueue.send(entityID + " " + score);
75 | }
76 | return score;
77 | }
78 |
79 | @Override
80 | public double execute(String[] items, String compKey) {
81 | //TODO
82 | double score = 0;
83 |
84 | return score;
85 | }
86 |
87 | @Override
88 | public boolean isValid(String compKey) {
89 | // TODO Auto-generated method stub
90 | return true;
91 | }
92 |
93 | }
94 |
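The entropy increase score computed above can be sketched in Python as follows; it mirrors the logic of the execute method (base 2 entropy over histogram buckets, relative increase when the new point's bucket is counted) and is for illustration only:

import math

def entropy(counts, total):
    return sum(-(c / total) * math.log2(c / total) for c in counts.values() if c > 0)

def entropy_increase_score(counts, bucket):
    # relative entropy increase from adding one point to the given bucket
    total = sum(counts.values())
    base = entropy(counts, total)
    new_counts = dict(counts)
    new_counts[bucket] = new_counts.get(bucket, 0) + 1
    new = entropy(new_counts, total + 1)
    return max(0.0, (new - base) / base)

counts = {"b1": 50, "b2": 30, "b3": 20}
print(entropy_increase_score(counts, "b4"))    # unseen bucket gives a larger increase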
--------------------------------------------------------------------------------
/resource/bsm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar
7 | MASTER=spark://akash:7077
8 |
9 | case "$1" in
10 |
11 | "transformTrain")
12 | echo "running DataTransformer"
13 | CLASS_NAME=org.chombo.spark.etl.DataTransformer
14 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/train/*
15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans
16 | rm -rf ./output/bsm/train/trans
17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
18 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf
19 | rm -rf ./output/bsm/train/trans/_SUCCESS
20 | ;;
21 |
22 | "stateTrans")
23 | echo "running MarkovStateTransitionModel"
24 | CLASS_NAME=org.avenir.spark.sequence.MarkovStateTransitionModel
25 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans/*
26 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/sttr
27 | rm -rf ./output/bsm/train/sttr
28 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
29 | --conf spark.ui.killEnabled=true --master $MASTER $AVENIR_JAR_NAME $INPUT $OUTPUT bsm.conf
30 | rm -rf ./output/bsm/train/sttr/_SUCCESS
31 | ;;
32 |
33 | "transformPred")
34 | echo "running DataTransformer"
35 | CLASS_NAME=org.chombo.spark.etl.DataTransformer
36 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/pred/*
37 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans
38 | 	rm -rf ./output/bsm/pred/trans
39 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
40 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf
41 | rm -rf ./output/bsm/pred/trans/_SUCCESS
42 | ;;
43 |
44 | "cpModel")
45 | echo "copying model files"
46 | MOD_FILES=$PROJECT_HOME/bin/beymani/output/bsm/train/sttr/*
47 | META_DIR=$PROJECT_HOME/bin/beymani/meta
48 | cp /dev/null $META_DIR/bsm_mod.txt
49 | for f in $MOD_FILES
50 | do
51 | echo "Copying file $f ..."
52 | cat $f >> $META_DIR/bsm_mod.txt
53 | done
54 | ;;
55 |
56 | "olPredict")
57 | echo "running MarkovChainPredictor"
58 | CLASS_NAME=org.beymani.spark.seq.MarkovChainPredictor
59 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans/*
60 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/oul
61 | rm -rf ./output/bsm/pred/oul
62 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
63 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT bsm.conf
64 | rm -rf ./output/bsm/pred/oul/_SUCCESS
65 | ls -l ./output/bsm/pred/oul
66 | for f in ./output/bsm/pred/oul/*
67 | do
68 | echo "number of outliers in $f"
69 | cat $f | grep ,O | wc -l
70 | done
71 |
72 | ;;
73 |
74 | *)
75 | echo "unknown operation $1"
76 | ;;
77 |
78 | esac
--------------------------------------------------------------------------------
/resource/quarantine_violation_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for detecting quarantine violation based on mobile location anomaly. Violation
2 | could be because of quarantined people moving out of their quarantine location or non quarantined people
3 | visiting quarantined locations
4 |
5 |
6 | Environment
7 | ===========
8 | Make sure you have a ../lib directory with all the Python library files relative to where mob_loc.py is.
9 | Please refer to resource/spark_dependency.txt for building the jar for Spark.
10 | All the configuration for the data generation Python script is in mob_loc.properties. Make sure all the
11 | data directories referenced in epid.sh are created
12 |
13 | Generate data for out of range violation
14 | ========================================
15 | Phone numbers and quarantine location
16 | python3 mob_loc.py genQuaLoc mob_loc.properties > qualist.txt
17 |
18 | quarantined people movement location data
19 | python3 mob_loc.py quaLoc mob_loc.properties > qualoc.txt
20 |
21 | insert outliers in movement location data (quarantined person moving out of quarantined location)
22 | python3 mob_loc.py quaLocOutlier mob_loc.properties > qualocou.txt
23 |
24 | Copy data
25 | =========
26 | quarantine location
27 | ./epid.sh cpQuaLocData qualist.txt outr
28 |
29 | quarantined people movement location data
30 | ./epid.sh cpLocData qualoc.txt outr
31 |
32 | Spark job going out range outlier
33 | =================================
34 | ./epid.sh olPredOu
35 |
36 | Generate data for in range violation
37 | ====================================
38 | all locations data
39 | python3 mob_loc.py genLoc mob_loc.properties > res_loc.txt
40 | python3 mob_loc.py genLoc mob_loc.properties > work_loc.txt
41 | python3 mob_loc.py genLoc mob_loc.properties > school_loc.txt
42 | python3 mob_loc.py genLoc mob_loc.properties > med_loc.txt
43 | python3 mob_loc.py genLoc mob_loc.properties > shop_loc.txt
44 | python3 mob_loc.py genLoc mob_loc.properties > ent_loc.txt
45 | python3 mob_loc.py genLoc mob_loc.properties > event_loc.txt
46 | python3 mob_loc.py genLoc mob_loc.properties > open_loc.txt
47 |
48 | Here are the region.num.locations and region.loc.size values. You have to set them before generating
49 | locations for each location type
50 | residence 200 .0002
51 | work 10 .0005
52 | school 3 .0020
53 | medical 3 .0004
54 | shopping area 5 .0020
55 | entertainment area 5 .0010
56 | large event area 2 .0008
57 | open space 2 .0024
58 |
59 | quarantined locations
60 | python3 mob_loc.py uniqQuaLoc mob_loc.properties > uniq_qualist.txt
61 |
62 | people movement location data
63 | python3 mob_loc.py genMovement mob_loc.properties > move_loc.txt
64 |
65 | Copy data
66 | =========
67 | quarantine location
68 | ./epid.sh cpQuaLocData uniq_qualist.txt inr
69 |
70 | quarantined people movement location data
71 | ./epid.sh cpLocData move_loc.txt inr
72 |
73 | Spark job for in range outlier
74 | ==============================
75 | ./epid.sh olPredIn
76 |
77 |
78 |
79 |
80 |
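Both Spark jobs essentially check whether a reported location falls inside or outside a small region around a quarantine location. A minimal Python sketch of that check (the square region shape, the box size and the coordinates below are assumptions for illustration):

def in_region(lat, lon, center_lat, center_lon, size):
    # true if the point lies inside a square region of the given size around the center
    return abs(lat - center_lat) <= size / 2 and abs(lon - center_lon) <= size / 2

# out of range violation: a quarantined phone reports a location outside its region
print(not in_region(37.4482, -122.1612, 37.4471, -122.1600, 0.0002))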
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 | Beymani consists of a set of Hadoop, Spark and Storm based tools for outlier and anomaly
3 | detection, which can be used for fraud detection, intrusion detection etc.
4 |
5 | ## Philosophy
6 | * Simple to use
7 | * Input output in CSV format
8 | * Metadata defined in simple JSON file
9 | * Extremely configurable with tons of configuration knobs
10 |
11 | ## Blogs
12 | The following blogs of mine are a good source of details on beymani
13 | * http://pkghosh.wordpress.com/2012/01/02/fraudsters-outliers-and-big-data-2/
14 | * http://pkghosh.wordpress.com/2012/02/18/fraudsters-are-not-model-citizens/
15 | * http://pkghosh.wordpress.com/2012/06/18/its-a-lonely-life-for-outliers/
16 | * http://pkghosh.wordpress.com/2012/10/18/relative-density-and-outliers/
17 | * http://pkghosh.wordpress.com/2013/10/21/real-time-fraud-detection-with-sequence-mining/
18 | * https://pkghosh.wordpress.com/2018/09/18/contextual-outlier-detection-with-statistical-modeling-on-spark/
19 | * https://pkghosh.wordpress.com/2018/10/15/learning-alarm-threshold-from-user-feedback-using-decision-tree-on-spark/
20 | * https://pkghosh.wordpress.com/2019/07/25/time-series-sequence-anomaly-detection-with-markov-chain-on-spark/
21 | * https://pkghosh.wordpress.com/2020/09/27/time-series-change-point-detection-with-two-sample-statistic-on-spark-with-application-for-retail-sales-data/
22 | * https://pkghosh.wordpress.com/2020/12/24/concept-drift-detection-techniques-with-python-implementation-for-supervised-machine-learning-models/
23 | * https://pkghosh.wordpress.com/2021/01/20/customer-service-quality-monitoring-with-autoencoder-based-anomalous-case-detection/
24 | * https://pkghosh.wordpress.com/2021/06/28/ecommerce-order-processing-system-monitoring-with-isolation-forest-based-anomaly-detection-on-spark/
25 |
26 | ## Algorithms
27 | * Univariate distribution model
28 | * Multi variate sequence or multi gram distribution model
29 | * Average instance Distance
30 | * Relative instance Density
31 | * Markov chain with sequence data
32 | * Spectral residue for sequence data
33 | * Quantized symbol mapping for sequence data
34 | * Local outlier factor for multivariate data
35 | * Instance clustering
36 | * Sequence clustering
37 | * Change point detection
38 | * Isolation Forest for multivariate data
39 | * Auto Encoder for multivariate data
40 |
41 | ## Getting started
42 | The project's resource directory has various tutorial documents for the use cases described in
43 | the blogs.
44 |
45 | ## Build
46 | For Hadoop 1
47 | * mvn clean install
48 |
49 | For Hadoop 2 (non yarn)
50 | * git checkout nuovo
51 | * mvn clean install
52 |
53 | For Hadoop 2 (yarn)
54 | * git checkout nuovo
55 | * mvn clean install -P yarn
56 |
57 | For Spark
58 | * mvn clean install
59 | * sbt publishLocal
60 | * in ./spark sbt clean package
61 |
62 | ## Help
63 | Please feel free to email me at pkghosh99@gmail.com
64 |
65 | ## Contribution
66 | Contributors are welcome. Please email me at pkghosh99@gmail.com
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/resource/cct.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/ruby
2 |
3 | count = ARGV[0].to_i
4 |
5 | amount_dist = [
6 | 10,10,
7 | 17,17,17,
8 | 25,25,25,25,25,
9 | 37,37,37,37,37,37,37,
10 | 45,45,45,45,45,
11 | 66,66,66,66,
12 | 82,82,82,82,
13 | 150,150,150,
14 | 220,220,
15 | 300,300,
16 | 500,
17 | 1000,
18 | 2000
19 | ]
20 |
21 | time_dist = [
22 | 0,0,0,
23 | 1,1,1,1,
24 | 2,2,2,2,2,2,2,
25 | 3,3,3,3,3,
26 | 4,4,4,
27 | 5,5,
28 | 6,
29 | 7,
30 | 8,
31 | 9,
32 | 10,
33 | 11,
34 | 12,
35 | 13,
36 | 14,
37 | 15,
38 | 16,16,
39 | 17,17,17,
40 | 18,18,
41 | 19,
42 | 20,
43 | 21,21,
44 | 22,22,22,
45 | 23
46 | ]
47 |
48 | vendors = ['grocery', 'restaurant', 'drug store', 'super market', 'electronic store', 'clothing store', 'jewellery store',
49 | 'air fare', 'hotel', 'car rental']
50 |
51 | vendor_dist = [
52 | 0,0,0,0,0,0,0,0,0,
53 | 1,1,1,
54 | 2,2,2,2,2,2,
55 | 3,3,3,3,
56 | 4,4,
57 | 5,5,5,
58 | 7,7,7,
59 | 8,8,
60 | 9,9
61 | ]
62 |
63 |
64 | vendor_amount_dist = {
65 | 'grocery' => [
66 | 10,10,
67 | 20,20,20,20,
68 | 30,30,30,30,30,30,30,
69 | 50,50,50,50,50,50,50,50,50,
70 | 70,70,70,70,
71 | 100,100,
72 | 150
73 | ],
74 |
75 | 'restaurant' => [
76 | 10,10,
77 | 20,20,20,20,20,
78 | 27,27,
79 | 35,
80 | 50
81 | ],
82 |
83 | 'drug store' => [
84 | 12,12,
85 | 23,23,23,23,23,
86 | 37,37,37,
87 | 45,45,
88 | 60
89 | ],
90 |
91 | 'super market' => [
92 | 25,25,
93 | 38,38,38,
94 | 49,49,49,49,49,49,
95 | 68,68,68,
96 | 112,112,
97 | 185,
98 | 250
99 | ],
100 |
101 | 'electronic store' => [
102 | 60,60,
103 | 90,90,
104 | 120,120,120,120,
105 | 190,190,190,190,190,
106 | 250,250,250,
107 | 300,300,
108 | 500
109 | ],
110 |
111 | 'clothing store' => [
112 | 30,30,
113 | 50,50,50,50,
114 | 70,70,70,
115 | 90,90,
116 | 150,
117 | 200
118 | ],
119 |
120 | 'jewellery store' => [
121 | 100,
122 | 170,170,
123 | 260,260,260,
124 | 310,310,
125 | 400
126 | ],
127 |
128 | 'air fare' => [
129 | 110,110,
130 | 180,180,180,
131 | 310,310,310,310,310,
132 | 520,520,
133 | 600
134 | ],
135 |
136 | 'hotel' => [
137 | 110,110,110,
138 | 230,230,230,230,
139 | 300,
140 | 400
141 | ],
142 |
143 | 'car rental' => [
144 | 60,60,
145 | 110,110,110,110,
146 | 150,150,
147 | 200
148 | ]
149 |
150 | }
151 |
152 | key = ['0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
153 | 'P','Q','R','S','T','U','V','W','X','Y','Z']
154 |
155 | def gen_id(key)
156 | id = ''
157 | 	1.upto 8 do
163 | id << key[rand(key.length)]
164 | end
165 | return id
166 | end
167 |
168 | def sample(dist, mult, floor, percent)
169 | b = rand(dist.length)
170 | val = dist[b]
171 | val = val * mult
172 | percent = rand(percent)
173 | percent = percent < floor ? floor : percent
174 |
175 | dev = (val * percent) / 100
176 | if ((rand(100) % 2) == 0)
177 | val = val + dev
178 | else
179 | val = val - dev
180 | end
181 | val = val < 0 ? 0 : val
182 | val
183 | end
184 |
185 | 1.upto count do
186 | id = gen_id(key)
187 | time = sample(time_dist, 60, 2, 8)
188 | time = time > 1440 ? 1440 : time
189 | v = vendor_dist[rand(vendor_dist.length)]
190 | vendor = vendors[v]
191 | am = sample(vendor_amount_dist[vendor], 100, 4, 12)
192 | puts "#{id}[]#{time}[]#{am/100}.#{am%100}[]#{vendor}"
193 | end
194 |
195 |
196 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/EstimatedProbabilityBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.apache.hadoop.conf.Configuration;
24 |
25 | /**
26 | * Estimated probability based outlier prediction
27 | * @author pranab
28 | *
29 | */
30 | public class EstimatedProbabilityBasedPredictor extends DistributionBasedPredictor {
31 |
32 | /**
33 | * Storm usage
34 | * @param conf
35 | */
36 | public EstimatedProbabilityBasedPredictor(Map conf) {
37 | super(conf);
38 | realTimeDetection = true;
39 | }
40 |
41 | /**
42 | * @param config
43 | * @param distrFilePathParam
44 | * @param hdfsFileParam
45 | * @param schemaFilePathParam
46 | * @param scoreThresholdParam
47 | * @throws IOException
48 | */
49 | public EstimatedProbabilityBasedPredictor(Map config, String idOrdinalsParam,
50 | String distrFilePathParam, String hdfsFileParam, String schemaFilePathParam,
51 | String seasonalParam, String fieldDelimParam, String scoreThresholdParam) throws IOException {
52 | super(config, idOrdinalsParam, distrFilePathParam, hdfsFileParam, schemaFilePathParam,
53 | seasonalParam, fieldDelimParam, scoreThresholdParam);
54 | }
55 |
56 | /**
57 | * Hadoop MR usage
58 | * @param config
59 | * @param distrFilePath
60 | * @throws IOException
61 | */
62 | public EstimatedProbabilityBasedPredictor(Configuration config, String distrFilePath, String scoreThresholdParam) throws IOException {
63 | super(config, distrFilePath);
64 | scoreThreshold = Double.parseDouble( config.get( scoreThresholdParam));
65 | }
66 |
67 | @Override
68 | public double execute(String entityID, String record) {
69 | String bucketKey = getBucketKey(record);
70 | Integer count = distrModel.get(bucketKey);
71 | double pr = null != count ? (((double)count) / totalCount) : 0;
72 | double score = 1.0 - pr;
73 | scoreAboveThreshold = score > scoreThreshold;
74 | if (realTimeDetection && scoreAboveThreshold) {
75 | //write if above threshold
76 | outQueue.send(entityID + " " + score);
77 | }
78 | return score;
79 | }
80 |
81 | @Override
82 | public double execute(String[] items, String compKey) {
83 | String bucketKey = getBucketKey(items);
84 | Map distrModel = keyedDistrModel.get(compKey);
85 | Integer count = distrModel.get(bucketKey);
86 | int totalCount = totalCounts.get(compKey);
87 | double pr = null != count ? (((double)count) / totalCount) : 0;
88 | double score = 1.0 - pr;
89 | return score;
90 | }
91 |
92 | @Override
93 | public boolean isValid(String compKey) {
94 | // TODO Auto-generated method stub
95 | return true;
96 | }
97 | }
98 |
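The score in this predictor is one minus the estimated probability of the record's histogram bucket. A minimal Python sketch of the same calculation (bucket key construction is simplified; the counts below are made up for illustration):

def estimated_probability_score(bucket_counts, bucket_key):
    # outlier score = 1 - estimated probability of the record's bucket
    total = sum(bucket_counts.values())
    return 1.0 - bucket_counts.get(bucket_key, 0) / total

counts = {"low:low": 400, "low:high": 80, "high:low": 15, "high:high": 5}
print(estimated_probability_score(counts, "high:high"))    # rare bucket, score near 1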
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/ExtremeValuePredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.beymani.util.OutlierScoreAggregator;
24 | import org.chombo.util.BasicUtils;
25 |
26 | /**
27 | * @author pranab
28 | *
29 | */
30 | public class ExtremeValuePredictor extends ZscorePredictor {
31 |
32 | /**
33 | * @param config
34 | * @param idOrdinalsParam
35 | * @param attrListParam
36 | * @param fieldDelimParam
37 | * @param attrWeightParam
38 | * @param statsFilePathParam
39 | * @param seasonalParam
40 | * @param hdfsFileParam
41 | * @param scoreThresholdParam
42 | * @param expConstParam
43 | * @throws IOException
44 | */
45 | public ExtremeValuePredictor(Map config,String idOrdinalsParam, String attrListParam,
46 | String fieldDelimParam, String attrWeightParam,String statsFilePathParam, String seasonalParam,
47 | String hdfsFileParam, String scoreThresholdParam,String expConstParam, String ignoreMissingStatParam,
48 | String scoreAggggregationStrtaegyParam) throws IOException {
49 | super(config, idOrdinalsParam, attrListParam, fieldDelimParam, attrWeightParam,
50 | statsFilePathParam, seasonalParam, hdfsFileParam, scoreThresholdParam,
51 | expConstParam, ignoreMissingStatParam, scoreAggggregationStrtaegyParam);
52 | }
53 |
54 | /* (non-Javadoc)
55 | * @see org.beymani.predictor.ZscorePredictor#execute(java.lang.String[], java.lang.String)
56 | */
57 | @Override
58 | public double execute(String[] items, String compKey) {
59 | double score = 0;
60 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights);
61 | double thisScore = 0;
62 | for (int ord : attrOrdinals) {
63 | double val = Double.parseDouble(items[ord]);
64 | double d = 0;
65 | double e = 0;
66 | if (null != idOrdinals) {
67 | if (statsManager.statsExists(compKey, ord)) {
68 | d = Math.abs( val - statsManager.getMean(compKey,ord));
69 | e = Math.exp(-d / statsManager.getStdDev(compKey, ord));
70 | thisScore = Math.exp(-e);
71 | scoreAggregator.addScore(thisScore);
72 | } else {
73 | scoreAggregator.addScore();
74 | }
75 | } else {
76 | d = Math.abs( val - statsManager.getMean(ord));
77 | e = Math.exp(-d / statsManager.getStdDev(ord));
78 | thisScore = Math.exp(-e);
79 | scoreAggregator.addScore(thisScore);
80 | }
81 | }
82 | //aggregate score
83 | score = getAggregateScore(scoreAggregator);
84 |
85 | //exponential normalization
86 | if (expConst > 0) {
87 | score = BasicUtils.expScale(expConst, score);
88 | }
89 |
90 | scoreAboveThreshold = score > scoreThreshold;
91 | return score;
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
/python/app/cpsale.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | # Package imports
19 | import os
20 | import sys
21 | import random
22 | import statistics
23 | import matplotlib.pyplot as plt
24 | sys.path.append(os.path.abspath("../lib"))
25 | sys.path.append(os.path.abspath("../mlextra"))
26 | from util import *
27 | from sampler import *
28 | from mcsim import *
29 |
30 | """
31 | cannibalized product sale
32 | """
33 |
34 | values = list()
35 | def psale(args):
36 | i = 0
37 | q1 = int(args[i])
38 | q1 = q1 if q1 >= 0 else 0
39 | i += 1
40 | q2 = int(args[i])
41 | q2 = q2 if q2 >= 0 else 0
42 | i += 1
43 | pid1 = args[i]
44 | i += 1
45 | pid2 = args[i]
46 | i += 1
47 | ptime = args[i]
48 | i += 1
49 | iter = args[i]
50 | ctime = ptime + iter * 3600
51 | print("{},{},{}".format(pid1, ctime, q1))
52 | print("{},{},{}".format(pid2, ctime, q2))
53 | values.append(q1)
54 |
55 |
56 | if __name__ == "__main__":
57 | op = sys.argv[1]
58 | if op == "gen":
59 | numDays = int(sys.argv[2])
60 | numIter = 24 * numDays
61 | curTime, pastTime = pastTime(numDays, "d")
62 | pastTime = dayAlign(pastTime)
63 | tsStart = int(0.6 * numIter)
64 | trEnd = tsStart + 30
65 | trSl = -2.0
66 | cy = np.array([-20.0, -35.0, -55.0, -65.0, -70.0, -70.0, -50.0, -30.0, -5.0, 15.0, 35.0, 50.0,
67 | 65.0, 65.0, 55.0, 50.0, 40.0, 30.0, 25.0, 35.0, 30.0, 20.0, 5.0, -15.0])
68 | cy1 = 0.7 * cy
69 | cy2 = 0.7 * cy1
70 | cy3 = 0.3 * cy1
71 | simulator = MonteCarloSimulator(numIter, psale, "./log/mcsim.log", "info")
72 | simulator.registerNormalSamplerWithTrendCycle(100, 10, 0, cy1)
73 | simulator.registerNormalSamplerWithTrendCycle(150, 20, 0.01, cy2)
74 | simulator.registerExtraArgs("DK75HUI45X", "GHT56FGT8K", pastTime)
75 | trSampler = NormalSamplerWithTrendCycle(100.0, 10.0, trSl , cy1)
76 | simulator.setSampler(0, tsStart, trSampler)
77 | newSampler = NormalSamplerWithTrendCycle(40, 12, 0, cy3)
78 | simulator.setSampler(0, trEnd, newSampler)
79 |
80 | simulator.run()
81 | #drawLine(values, 250)
82 |
83 | elif op == "plot":
84 | filePath = sys.argv[2]
85 | rid = sys.argv[3]
86 | filt = lambda r : r[0] == rid
87 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(filePath, filt)))
88 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(filePath, filt)))
89 | it = xvalues[0]
90 | if len(sys.argv) == 5:
91 | cpFilePath = sys.argv[4]
92 | cdvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(cpFilePath, filt)))
93 | cxvalues = list(map(lambda r : int(r[2]), fileFiltRecGen(cpFilePath, filt)))
94 | i = 0
95 | for t in cxvalues:
96 | plt.axvline(t, 0, .9, color="r")
97 | i += 1
98 | plt.plot(xvalues, dvalues, "b")
99 | plt.show()
100 | else:
101 | plt.plot(xvalues, dvalues, "b")
102 | plt.show()
103 |
104 |
105 |
--------------------------------------------------------------------------------
/resource/cpu_usage_anomaly_det_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for anomaly detection in CPU usage data using statistical modeling. To be more specific
2 | we will be using a z score based technique. The model gets built with outliers in the data. The detected outliers
3 | are removed and the model is built again, but this time without outliers in the data.
4 |
5 |
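For intuition, here is a minimal Python sketch of the two pass z score idea described above.
It is a standalone illustration with made up numbers, not the actual Spark job; the thresholds
mirror the 2.0 and 3.3 values used later in this tutorial.

import statistics

def zscores(values, mean, sd):
    return [abs(v - mean) / sd for v in values]

# first pass: stats estimated with outliers still present, lower threshold
values = [52.0, 48.0, 55.0, 95.0, 50.0, 47.0, 53.0]
mean, sd = statistics.mean(values), statistics.stdev(values)
outliers = [v for v, z in zip(values, zscores(values, mean, sd)) if z > 2.0]

# second pass: rebuild stats after removing the detected outliers and
# score new readings against the clean model with a higher threshold e.g. 3.3
clean = [v for v in values if v not in outliers]
mean, sd = statistics.mean(clean), statistics.stdev(clean)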
6 | Environment
7 | ===========
8 | Paths etc. shown here correspond to my environment. Please change them as needed for your
9 | environment
10 |
11 | Build
12 | =====
13 | Follow instructions in spark_dependency.txt
14 |
15 | Python dependency
16 | =================
17 | The shell script commands for data generation run python scripts. Before you run
18 | the data generation commands, do the following
19 | 1. checkout project avenir
20 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file
21 |
22 |
23 | Create base normal data
24 | =======================
25 | ./and_spark.sh crInput true
26 |
27 | where
28 | num_of_days = number of days e.g. 10
29 | reading_interval = reading interval in sec e.g. 300
30 | num_servers = number of servers e.g. 4
31 | output_file = output file, we will use cusage.txt from now on
32 |
33 | - insert outliers
34 | ./and_spark.sh insOutliers
35 |
36 | where
37 | normal_data_file = normal data file (cusage.txt)
38 | with_outlier_data_file = data file with outliers (cusage.txt)
39 |
40 | -copy
41 | ./and_spark.sh cpModData
42 |
43 | where
44 | with_outlier_data_file = data file with outliers (cusage.txt)
45 |
46 | Run Spark job for stats
47 | =======================
48 | ./and_spark.sh numStat
49 |
50 | Copy and consolidate stats file
51 | ===============================
52 | ./and_spark.sh crStatsFile
53 |
54 | Run Spark job to detect outliers
55 | ================================
56 | - set
57 | score.threshold = 2.0
58 | output.outliers = true
59 | rem.outliers = true
60 |
61 | - run
62 | ./and_spark.sh olPred
63 |
64 | Copy and consolidate clean file
65 | ===============================
66 | ./and_spark.sh crCleanFile
67 |
68 | Create and copy test data
69 | =========================
70 | - create
71 | ./and_spark.sh crInput true
77 |
78 | where
79 | normal_data_file = normal data file (cusage.txt)
80 | with_outlier_data_file = data file with outliers (cusage.txt)
81 |
82 | - copy
83 | ./and_spark.sh cpTestData
84 |
85 | where
86 | with_outlier_data_file = data file with outliers (cusage.txt)
87 |
88 |
89 | Run Spark job for stats again with clean data
90 | =============================================
91 | ./and_spark.sh numStat
92 |
93 | Copy and consolidate stats file
94 | ===============================
95 | ./and_spark.sh crStatsFile
96 |
97 |
98 | Run Spark job to detect outliers
99 | ================================
100 | - set
101 | score.threshold = 3.3
102 | output.outliers = false
103 | rem.outliers = false
104 |
105 | - run
106 | ./and_spark.sh olPred
107 |
108 | Configuration
109 | =============
110 | Configuration is in and.conf. Make changes as necessary
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/FileSpout.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.File;
21 | import java.io.FileNotFoundException;
22 | import java.util.Arrays;
23 | import java.util.Comparator;
24 | import java.util.Map;
25 | import java.util.Scanner;
26 |
27 | import backtype.storm.spout.SpoutOutputCollector;
28 | import backtype.storm.task.TopologyContext;
29 | import backtype.storm.topology.OutputFieldsDeclarer;
30 | import backtype.storm.topology.base.BaseRichSpout;
31 | import backtype.storm.tuple.Fields;
32 | import backtype.storm.tuple.Values;
33 |
34 | /**
35 | * @author pranab
36 | *
37 | */
38 | public class FileSpout extends BaseRichSpout {
39 | private SpoutOutputCollector collector;
40 | private Map conf;
41 | private File[] files;
42 | private Scanner scanner;
43 | /**
44 | *
45 | */
46 | private int curFileIndex = 0;
47 |
48 | @Override
49 | public void open(Map conf, TopologyContext context,
50 | SpoutOutputCollector collector) {
51 | this.collector = collector;
52 | this.conf = conf;
53 |
54 | String dirPath = conf.get("file.spout.dir.path").toString();
55 | File dir = new File(dirPath);
56 | files = dir.listFiles();
57 | 		Arrays.sort(files, new Comparator<File>(){
58 | public int compare(File f1, File f2) {
59 | int res = f1.lastModified() < f2.lastModified() ? -1 : ( f1.lastModified() > f2.lastModified() ? 1 : 0);
60 | return res;
61 | } });
62 |
63 | openNextFile();
64 | }
65 |
66 | @Override
67 | public void nextTuple() {
68 | 		String record = readFile();
69 | 		if (null == record) return;  //input exhausted
69 | String[] items = record.split("\\s+");
70 | String entityID = items[0];
71 | String recordData = items[1];
72 | collector.emit(new Values(entityID, recordData));
73 | }
74 |
75 | /**
76 | * @return
77 | */
78 | private String readFile() {
79 | String record = null;
80 | if (scanner.hasNextLine()) {
81 | record = scanner.nextLine();
82 | } else {
83 | if (++curFileIndex < files.length) {
84 | openNextFile();
85 | if (scanner.hasNextLine()) {
86 | record = scanner.nextLine();
87 | }
88 | } else {
89 | //no more files to read
90 | }
91 | }
92 | return record;
93 | }
94 |
95 | /**
96 | *
97 | */
98 | private void openNextFile() {
99 | try {
100 | scanner = new Scanner(files[curFileIndex]);
101 | } catch (FileNotFoundException e) {
102 | throw new IllegalStateException("file not found");
103 | }
104 | }
105 |
106 | /* (non-Javadoc)
107 | * @see backtype.storm.topology.IComponent#declareOutputFields(backtype.storm.topology.OutputFieldsDeclarer)
108 | */
109 | @Override
110 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
111 | declarer.declare(new Fields("entityID", "recordData"));
112 | }
113 |
114 | }
115 |
--------------------------------------------------------------------------------
/python/app/mvand.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | # Package imports
19 | import os
20 | import sys
21 | import random
22 | import matplotlib.pyplot as plt
23 | import numpy as np
24 | import sklearn as sk
25 | from sklearn.ensemble import IsolationForest
26 | from pyod.models.auto_encoder import AutoEncoder
27 | sys.path.append(os.path.abspath("../lib"))
28 | sys.path.append(os.path.abspath("../mlextra"))
29 | from util import *
30 | from mlutil import *
31 | from sampler import *
32 |
33 | """
34 | Anomaly detection with isolation forest and autoencoder
35 | """
36 | if __name__ == "__main__":
37 | op = sys.argv[1]
38 | filePath = sys.argv[2]
39 | window = 20
40 | beg = 0
41 | end = beg + window
42 | if op == "isfo":
43 | 		#anomaly detection in service ticket data with isolation forest
44 | scId = sys.argv[3]
45 | colStr = sys.argv[4]
46 | columns = strToIntArray(colStr)
47 | filt = lambda r : r[0] == scId
48 | data = np.array(getFileAsFiltFloatMatrix(filePath, filt, colStr))
49 | nsamp = data.shape[0]
50 | isf = IsolationForest(contamination=0.1)
51 | ypred = isf.fit_predict(data)
52 | colors = ["m", "g", "b", "c", "y"]
53 |
54 | for a in data:
55 | a[2] = a[2] / 24
56 | while True:
57 | inp = input("begin offset: ")
58 | beg = int(inp)
59 | end = beg + window
60 | if beg >= 0:
61 | for i in range(len(columns)):
62 | dvalues = data[:,i]
63 | ci = i % 5
64 | plt.plot(dvalues[beg:end], colors[ci])
65 | count = 0
66 | for i in range(beg, end, 1):
67 | if ypred[i] == -1:
68 | plt.axvline(i - beg, 0, .9, color="r")
69 | count += 1
70 | print("num of outlier {}".format(count))
71 | plt.show()
72 | else:
73 | print("quitting")
74 | break
75 |
76 | elif op == "auen":
77 | #anomaly detection in web session with auto encoder
78 | teFilePath = sys.argv[3]
79 | columns = sys.argv[4]
80 | auen = AutoEncoder(hidden_neurons =[7,5,3,5,7])
81 | trData = np.array(getFileAsFloatMatrix(filePath, columns))
82 | trNsamp = trData.shape[0]
83 | teData = np.array(getFileAsFloatMatrix(teFilePath, columns))
84 | aData = np.vstack((trData, teData))
85 | aData = scaleData(aData, "zscale")
86 | print(aData.shape)
87 | trData = aData[:trNsamp, :]
88 | teData = aData[trNsamp:, :]
89 | print(trData.shape)
90 | print(teData.shape)
91 |
92 | auen.fit(trData)
93 | scores = auen.decision_function(teData)
94 |
95 | while True:
96 | inp = input("begin offset: ")
97 | beg = int(inp)
98 | end = beg + window
99 | if beg >= 0:
100 | plt.plot(scores[beg:end], color="b")
101 | count = 0
102 | for i in range(beg, end, 1):
103 | if scores[i] > 17:
104 | plt.axvline(i - beg, 0, .9, color="r")
105 | count += 1
106 | print("num of outlier {}".format(count))
107 | plt.show()
108 | else:
109 | print("quitting")
110 | break
111 |
112 |
113 |
--------------------------------------------------------------------------------
/resource/rel_density_tutorial.txt:
--------------------------------------------------------------------------------
1 | Configuration
2 | =============
3 | Here are the global config properties to set in the properties file. Please make changes as necessary
4 |
5 | debug.on=true
6 | field.delim=,
7 | field.delim.regex=,
8 | num.reducer=1
9 |
10 | Configuration settings for individual map reduce jobs are described below
11 |
12 | Map Reduce Jobs
13 | ===============
14 |
15 | 1. Similarity calculation
16 | -------------------------
17 | run SameTypeSimilarity
18 |
19 | Make sure properties are set as below in the configuration properties file
20 |
21 | sts.same.schema.file.path=/user/pranab/cct/meta/cct.json
22 | sts.bucket.count=10
23 | sts.distance.scale=1000
24 |
25 | 2. Density calculation.
26 | ----------------------
27 | Here is a sample script. It uses the output of SameTypeSimilarity MR as input
28 |
29 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
30 | CLASS_NAME=org.beymani.proximity.AverageDistance
31 |
32 | echo "running mr"
33 | IN_PATH=/user/pranab/cct/simi
34 | OUT_PATH=/user/pranab/cct/avdi
35 | echo "input $IN_PATH output $OUT_PATH"
36 | hadoop fs -rmr $OUT_PATH
37 | echo "removed output dir"
38 |
39 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
40 |
41 | Make sure properties are set as below, so that density is output
42 |
43 | avd.top.match.average=false
44 | avd.top.match.density=true
45 | avd.top.match.grouping=false
46 |
47 | 3. Calculate neighborhood groups
48 | --------------------------------
49 | Use the same MR as before. Note the configuration params at the end of this section
50 |
51 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
52 | CLASS_NAME=org.beymani.proximity.AverageDistance
53 |
54 | echo "running mr"
55 | IN_PATH=/user/pranab/cct/simi
56 | OUT_PATH=/user/pranab/cct/negrp
57 | echo "input $IN_PATH output $OUT_PATH"
58 | hadoop fs -rmr $OUT_PATH
59 | echo "removed output dir"
60 |
61 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
62 |
63 | Make sure properties are set as below, so that neighborhood group is output
64 |
65 | avd.top.match.average=false
66 | avd.top.match.density=false
67 | avd.top.match.grouping=true
68 |
69 | 4. Find Neighborhood and Density.
70 | --------------------------------
71 | Here is a sample script. Before running, make sure the output of steps 2 and 3 is copied
72 | or moved to the input dir for this MR. Change the prefix of the output of step 2
73 | to what is defined in the config param density.file.prefix
74 |
75 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
76 | CLASS_NAME=org.beymani.proximity.NeighborDensity
77 |
78 | echo "running mr"
79 | IN_PATH=/user/pranab/cct/input/nede
80 | OUT_PATH=/user/pranab/cct/nede
81 | echo "input $IN_PATH output $OUT_PATH"
82 | hadoop fs -rmr $OUT_PATH
83 | echo "removed output dir"
84 |
85 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
86 |
87 | Make sure properties are set as below
88 |
89 | ned.density.file.prefix=den
90 |
91 | 5. Relative density calculation
92 | -------------------------------
93 | It uses the output of step 4 as input. Here is the sample script
94 |
95 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar
96 | CLASS_NAME=org.beymani.proximity.RelativeDensity
97 |
98 | echo "running mr"
99 | IN_PATH=/user/pranab/cct/nede
100 | OUT_PATH=/user/pranab/cct/rede
101 | echo "input $IN_PATH output $OUT_PATH"
102 | hadoop fs -rmr $OUT_PATH
103 | echo "removed output dir"
104 |
105 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH
106 |
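As a side note, the relative density idea behind this last job can be sketched in a few lines
of Python. This is only an illustration of the concept with made up numbers; the MR jobs above
compute it at scale from the neighborhood output of the previous steps.

# density of each point and the ids of its nearest neighbors (made up numbers)
density = {"p1": 0.80, "p2": 0.75, "p3": 0.78, "p4": 0.05}
neighbors = {"p4": ["p1", "p2", "p3"]}

def relative_density(pid):
    # ratio of a point's density to the average density of its neighborhood;
    # a value much smaller than 1 suggests an outlier
    nd = [density[n] for n in neighbors[pid]]
    return density[pid] / (sum(nd) / len(nd))

print(relative_density("p4"))   # about 0.06, far less dense than its neighbors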
107 |
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/util/SequenceMatcher.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 |
19 | package org.beymani.util;
20 |
21 | import java.util.ArrayList;
22 | import java.util.List;
23 |
24 | /**
25 | * Various sequence matching algorithms
26 | * @author pranab
27 | *
28 |  * @param <T> sequence item type
29 | */
30 | public class SequenceMatcher<T> {
31 | 	private List<T> seqData = new ArrayList<T>();
32 | private int maxSize;
33 | private double sim;
34 | private boolean normalize;
35 | private boolean similarity;
36 | private int matchSize;
37 |
38 | public SequenceMatcher(boolean normalize, boolean similarity) {
39 | this.normalize = normalize;
40 | this.similarity = similarity;
41 | }
42 |
43 | public SequenceMatcher(int maxSize,boolean normalized, boolean similarity) {
44 | this(normalized, similarity);
45 | this.maxSize = maxSize;
46 | }
47 |
48 | public void add(T item) {
49 | seqData.add(item);
50 | if (maxSize > 0 && seqData.size() > maxSize) {
51 | seqData.remove(0);
52 | }
53 | }
54 |
55 | /**
56 | * Simple positional matching
57 | * @param other
58 | * @return
59 | */
60 | 	public double matchCount(SequenceMatcher<T> other) {
61 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size();
62 | sim = 0;
63 | for (int i = 0; i < matchSize; ++i) {
64 | if (seqData.get(i).equals(other.seqData.get(i))) {
65 | ++sim;
66 | }
67 | }
68 | prepeareResult(matchSize);
69 | return sim;
70 | }
71 |
72 | /**
73 | 	 * Positional matching with higher reward for adjacent matches
74 | * @param other
75 | * @return
76 | */
77 | 	public double adjacencyRewardedMatchCount(SequenceMatcher<T> other) {
78 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size();
79 | sim = 0;
80 | int adjCount = 1;
81 | for (int i = 0; i < matchSize; ++i) {
82 | if (seqData.get(i).equals(other.seqData.get(i))) {
83 | sim += adjCount;
84 | ++adjCount;
85 | } else {
86 | adjCount = 1;
87 | }
88 | }
89 | prepeareResult(matchSize);
90 | return sim;
91 | }
92 |
93 | /**
94 | 	 * Positional matching based on the longest run of consecutive matches
95 | * @param other
96 | * @return
97 | */
98 | 	public double maxCommonSubSeqMatchCount(SequenceMatcher<T> other) {
99 | int matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size();
100 | sim = 0;
101 | int adjCount = 0;
102 | for (int i = 0; i < matchSize; ++i) {
103 | if (seqData.get(i).equals(other.seqData.get(i))) {
104 | ++adjCount;
105 | } else {
106 | if (adjCount > sim) {
107 | sim = adjCount;
108 | }
109 | adjCount = 0;
110 | }
111 | }
112 | prepeareResult(matchSize * (matchSize + 1) / 2);
113 | return sim;
114 | }
115 |
116 | /**
117 | * @param scale
118 | */
119 | private void prepeareResult(int scale) {
120 | if (normalize) {
121 | sim /= scale;
122 | if (!similarity) {
123 | sim = 1.0 - sim;
124 | }
125 | } else {
126 | if (!similarity) {
127 | sim = scale - sim;
128 | }
129 | }
130 | }
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/EstimatedCumProbabilityBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.apache.hadoop.conf.Configuration;
24 | import org.beymani.util.OutlierScoreAggregator;
25 | import org.chombo.stats.HistogramStat;
26 | import org.chombo.util.BasicUtils;
27 |
28 | public class EstimatedCumProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor {
29 |
30 | public EstimatedCumProbabilityBasedPredictor(Map conf) {
31 | super(conf);
32 | }
33 |
34 | /**
35 | * @param config
36 | * @param idOrdinalsParam
37 | * @param attrListParam
38 | * @param distrFilePathParam
39 | * @param hdfsFileParam
40 | * @param schemaFilePathParam
41 | * @param attrWeightParam
42 | * @param seasonalParam
43 | * @param fieldDelimParam
44 | * @param scoreThresholdParam
45 | * @param ignoreMissingDistrParam
46 | * @throws IOException
47 | */
48 | public EstimatedCumProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam,
49 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam,
50 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam,
51 | String scoreAggggregationStrtaegyParam)
52 | throws IOException {
53 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam,
54 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", "exp.const",
55 | scoreAggggregationStrtaegyParam);
56 | }
57 |
58 | /**
59 | * @param config
60 | * @param distrFilePathParam
61 | * @param attrWeightParam
62 | * @param scoreThresholdParam
63 | * @param fieldDelimParam
64 | * @throws IOException
65 | */
66 | public EstimatedCumProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam,
67 | String scoreThresholdParam, String fieldDelimParam)
68 | throws IOException {
69 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam);
70 | }
71 |
72 | @Override
73 | public double execute(String[] items, String compKey) {
74 | double score = 0;
75 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights);
76 | double thisScore = 0;
77 | for (int ord : attrOrdinals) {
78 | String keyWithFldOrd = compKey + fieldDelim + ord;
79 | double val = Double.parseDouble(items[ord]);
80 | System.out.println("keyWithFldOrd " + keyWithFldOrd);
81 | HistogramStat hist = keyedHist.get(keyWithFldOrd);
82 | if (null != hist) {
83 | double distr = hist.findCumDistr(val);
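				//two tailed score: a cumulative probability near 0 or near 1, i.e. either tail, maps to a score close to 1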
84 | thisScore = distr < 0.5 ? 1.0 - distr : distr;
85 | scoreAggregator.addScore(thisScore);
86 | } else {
87 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd);
88 | scoreAggregator.addScore();
89 | }
90 | }
91 | //aggregate score
92 | score = getAggregateScore(scoreAggregator);
93 |
94 | scoreAboveThreshold = score > scoreThreshold;
95 | return score;
96 | }
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/resource/real_time_fraud_prediction_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for real time fraud detection using Hadoop and Storm. It uses a Markov chain
2 | as the predictive model. Make necessary changes to paths etc to suit your environment.
3 |
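As background, here is a minimal Python sketch of the kind of Markov chain based scoring used
here. The state names come from the model.states list further down; the transition probabilities
and the exact scoring formula are simplified assumptions for illustration (the deployed predictor
uses the missProbability algorithm configured below).

# toy transition probabilities learned from historical transaction sequences
trans = {("LNL", "LNN"): 0.6, ("LNN", "HHS"): 0.05, ("HHS", "HHS"): 0.1}

def miss_prob_score(states):
    # roughly, how improbable the observed transitions are under the model;
    # a high score over the recent state window suggests fraud
    probs = [trans.get((s1, s2), 0.0) for s1, s2 in zip(states, states[1:])]
    return sum(1.0 - p for p in probs) / len(probs)

window = ["LNL", "LNN", "HHS", "HHS"]   # last few states for one customer
print(miss_prob_score(window))          # flag if above a threshold e.g. 0.96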
4 | Dependency
5 | ==========
6 | The project has a dependency on chombo. Please do the build as below for chombo and avenir respectively
7 | mvn clean install
8 |
9 | Please refer to jar_dependency.txt for details of dependency
10 |
11 | The easiest way is to use ant as follows
12 | ant -f build_storm.xml
13 |
14 | Generate input data
15 | ===================
16 | Get util.rb from the project visitante. Put a copy of the file in ../lib
17 | ./xaction_states.rb 5000 > xact_training.txt
18 |
19 | where 5000 is the number of customers
20 | Copy the output file to HDFS input directory /Users/pranab/mmfr/input
21 |
22 | Generate transaction sequence data with MR
23 | ==========================================
24 | Run this script. This MR belongs to the project chombo
25 |
26 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar
27 | CLASS_NAME=org.chombo.mr.Projection
28 |
29 | echo "running mr"
30 | IN_PATH=/Users/pranab/mmfr/input
31 | OUT_PATH=/Users/pranab/mmfr/sequence
32 | echo "input $IN_PATH output $OUT_PATH"
33 | hadoop fs -rmr $OUT_PATH
34 | echo "removed output dir"
35 |
36 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH
37 |
38 | Generate markov chain model with MR
39 | ===================================
40 | Run this script
41 |
42 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar
43 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel
44 |
45 | echo "running mr"
46 | IN_PATH=/Users/pranab/mmfr/sequence
47 | OUT_PATH=/Users/pranab/mmfr/model
48 | echo "input $IN_PATH output $OUT_PATH"
49 | hadoop fs -rmr $OUT_PATH
50 | echo "removed output dir"
51 |
52 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH
53 |
54 | Copy the MR output
55 | =================
56 | hadoop fs -get /Users/pranab/mmfr/model/part-r-00000 xmodel.txt
57 |
58 | Store model in Redis
59 | ====================
60 | ./xaction_queue.py setModel xmodel.txt
61 |
62 | Generate test transaction data
63 | ==============================
64 | ./xaction_states.rb 200 > xact_test.txt
65 |
66 | Write test data to Redis queue
67 | ==============================
68 | ./xaction_queue.py writeQueue xact_test.txt
69 |
70 | Build uber jar for storm deployment
71 | ===================================
72 | ant -f build_storm.xml
73 |
74 | Deploy storm topology
75 | =====================
76 | storm jar uber-beymani-1.0.jar org.beymani.predictor.OutlierPredictor NoFraud rt_predict.properties
77 |
78 | Get output
79 | ==========
80 | From the Storm UI, check that all data has been processed, then get the output from the redis
81 | output queue
82 |
83 | ./xaction_queue.py readOutQueue
84 |
85 | Hadoop configuration
86 | ====================
87 | field.delim.regex=,
88 | field.delim.out=,
89 | num.reducer=1
90 | debug.on=false
91 |
92 | #Projection
93 | projection.operation=grouping
94 | key.field=0
95 | projection.field=2
96 |
97 | #MarkovStateTransitionModel
98 | skip.field.count=1
99 | model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS
100 | trans.prob.scale=1
101 |
102 | Storm configuration
103 | ===================
104 | predictor.model=mm
105 | predictor.spout.threads=1
106 | predictor.bolt.threads=2
107 | num.workers=1
108 | debug=on
109 |
110 | redis.server.host=localhost
111 | redis.server.port=6379
112 | redis.markov.model.key=xactionMarkovModel
113 | redis.input.queue=xactionQueue
114 | local.predictor=true
115 | state.seq.window.size=5
116 | state.ordinal=1
117 | detection.algorithm=missProbability
118 | metric.threshold=0.96
119 | redis.output.queue=fraudQueue
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/resource/retail_sale_monitoring_with_anomaly_detection_tutorial.txt:
--------------------------------------------------------------------------------
1 | This tutorial is for ecommerce retail sale monitoring based on anomaly detection for hourly sales data.
2 | Robust z score is used for anomaly detection. The data hierarchy is org -> sale -> dept -> product sale
3 |
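For intuition, here is a minimal Python sketch of the robust z score idea, built from the median
and median absolute deviation that the Spark jobs below produce. The sample numbers are made up
for illustration.

import statistics

def robust_zscore(x, med, mad):
    # 0.6745 makes MAD comparable to a standard deviation for normally distributed data
    return 0.6745 * abs(x - med) / mad

hourly_sales = [120, 132, 118, 125, 410, 122]
med = statistics.median(hourly_sales)
mad = statistics.median([abs(v - med) for v in hourly_sales])
scores = [robust_zscore(v, med, mad) for v in hourly_sales]
# a reading whose score exceeds the configured threshold is flagged as an outlier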
4 | Dependent script
5 | ================
6 | Checkout the project avenir. Copy the lib directory under python to a directory at the same level
7 | as your working directory for python script ecomm.py
8 |
9 | Build and Deployment
10 | ====================
11 | Please refer to resource/spark_dependency.txt for building all jars and the final uber jar file
12 |
13 | Script and configuration
14 | ========================
15 | Feel free to make changes in the script ecomm.sh and the configuration file ecomm.conf as per your
16 | environment
17 |
18 | Generate stats for hourly sales
19 | ===============================
20 | ./ecomm.py prStat > prstat.txt
21 |
22 | where
23 | num_product = num of products e.g 20
24 |
25 | Generate training data
26 | ======================
27 | ./ecomm.py prSale prstat.txt > sale_tr.txt
28 |
29 | where
30 | interval = amount of time into past e.g 30
31 | time_unit = time unit d for day and h for hour
32 |
33 | Generate prediction data
34 | ========================
35 | ./ecomm.py prSale prstat.txt > sale.txt
36 |
37 |
38 | Insert outlier
39 | ./ecomm.py olPrSale sale.txt > sale_pr.txt
40 |
41 | where
42 | outlier_percentage = percentage of outliers e.g 10
43 |
44 | Copy training data
45 | ==================
46 | ./ecomm.sh loadInp sale_tr.txt training
47 |
48 | Run spark job for basic stats
49 | =============================
50 | ./ecomm.sh numStat
51 |
52 | Run spark job for median
53 | ========================
54 | Set the following in ecomm.conf for numericalAttrMedian
55 | operation.type = "med"
56 |
57 | Run
58 | ./ecomm.sh numMstat
59 |
60 | Copy median file
61 | ================
62 | ./ecomm.sh bkMod med.txt
63 |
64 | It generates med.txt file
65 |
66 | Run spark job for median absolute deviation
67 | ===========================================
68 | Set the following in ecomm.conf for numericalAttrMedian
69 | operation.type = "mad"
70 |
71 | Run
72 | ./ecomm.sh numMstat
73 |
74 | Copy median absolute deviation file
75 | ===================================
76 | ./ecomm.sh bkMod mad.txt
77 |
78 | It generates mad.txt
79 |
80 | Copy prediction data
81 | ====================
82 | ./ecomm.sh loadInp sale_pr.txt pred
83 |
84 | Run spark job for prediction
85 | ============================
86 | ./ecomm.sh olPred
87 |
88 | Copy prediction output into one file
89 | ====================================
90 | ./ecomm.sh bkOut psale/olp.txt
91 |
92 | All output gets written to olp.txt
93 |
94 | Run spark job to aggregate to dept
95 | ==================================
96 | Clean aggregator input dir
97 | ./ecomm.sh rmAggrInp
98 |
99 | Copy to aggregator input dir
100 | ./ecomm.sh loadAggrInp psale/olp.txt
101 |
102 | Run aggregator spark job
103 | ./ecomm.sh aggrOl
104 |
105 | Copy aggregator output into one file
106 | ./ecomm.sh bkOutAggr dept/olp.txt
107 |
108 | Run spark job to aggregate to sale
109 | ==================================
110 | Clean aggregator input dir
111 | ./ecomm.sh rmAggrInp
112 |
113 | Copy to aggregator input dir
114 | ./ecomm.sh loadAggrInp dept/olp.txt
115 |
116 | Run aggregator
117 | ./ecomm.sh aggrOl
118 |
119 | Copy aggregator output into one file
120 | ./ecomm.sh bkOutAggr sale/olp.txt
121 |
122 | Run spark job to aggregate to organization
123 | ==========================================
124 | Clean aggregator input dir
125 | ./ecomm.sh rmAggrInp
126 |
127 | Copy to aggregator input dir
128 | ./ecomm.sh loadAggrInp sale/olp.txt
129 |
130 | Run aggregator
131 | ./ecomm.sh aggrOl
132 |
133 | Copy aggregator output into one file
134 | ./ecomm.sh bkOutAggr org/olp.txt
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/ModelBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.Serializable;
21 | import java.util.Map;
22 |
23 | import org.beymani.util.OutlierScoreAggregator;
24 | import org.chombo.util.BasicUtils;
25 | import org.chombo.util.ConfigUtility;
26 |
27 | /**
28 | * Base class for all model based predictors
29 | * @author pranab
30 | *
31 | */
32 | public abstract class ModelBasedPredictor implements Serializable {
33 | private static final long serialVersionUID = -8813946272356265424L;
34 | protected boolean realTimeDetection;
35 | protected double scoreThreshold;
36 | protected boolean scoreAboveThreshold;
37 | protected boolean partition = false;
38 | protected double expConst = 1.0;
39 | protected int[] idOrdinals;
40 | protected int[] attrOrdinals;
41 | protected double[] attrWeights;
42 | protected boolean ignoreMissingStat;
43 | protected String fieldDelim;
44 | protected boolean seasonal;
45 |
46 | private String aggregationStrategy;
47 |
48 |
49 | public ModelBasedPredictor() {
50 |
51 | }
52 |
53 | /**
54 | * @param config
55 | * @param attrWeightParam
56 | * @param scoreAggggregationStrtaegyParam
57 | */
58 | public ModelBasedPredictor(Map config, String attrWeightParam, String scoreAggggregationStrtaegyParam) {
59 | attrWeights = ConfigUtility.getDoubleArray(config, attrWeightParam);
60 | 		aggregationStrategy = ConfigUtility.getString(config, scoreAggggregationStrtaegyParam);
61 | }
62 |
63 | /**
64 | * @param entityID
65 | * @param record
66 | * @return
67 | */
68 | public abstract double execute(String entityID, String record);
69 |
70 | /**
71 | * @param items
72 | * @param compKey
73 | * @return
74 | */
75 | public abstract double execute(String[] items, String compKey);
76 |
77 |
78 | /**
79 | * @return
80 | */
81 | public boolean isScoreAboveThreshold() {
82 | return scoreAboveThreshold;
83 | }
84 |
85 | /**
86 | * @return
87 | */
88 | public ModelBasedPredictor withPartition() {
89 | partition = true;
90 | return this;
91 | }
92 |
93 | /**
94 | * @param ignoreMissingStat
95 | * @return
96 | */
97 | public ModelBasedPredictor withIgnoreMissingStat(boolean ignoreMissingStat) {
98 | this.ignoreMissingStat = ignoreMissingStat;
99 | return this;
100 | }
101 |
102 |
103 | /**
104 | * @param compKey
105 | * @return
106 | */
107 | public abstract boolean isValid(String compKey);
108 |
109 | /**
110 | * @return
111 | */
112 | public double getAggregateScore(OutlierScoreAggregator scoreAggregator) {
113 | double aggrScore = 0;
114 | if (aggregationStrategy.equals("average")) {
115 | aggrScore = scoreAggregator.getAverage();
116 | } else if (aggregationStrategy.equals("weightedAverage")) {
117 | aggrScore = scoreAggregator.getWeightedAverage();
118 | } else if (aggregationStrategy.equals("median")) {
119 | aggrScore = scoreAggregator.getMedian();
120 | } else if (aggregationStrategy.equals("max")) {
121 | aggrScore = scoreAggregator.getMax();
122 | } else if (aggregationStrategy.equals("min")) {
123 | aggrScore = scoreAggregator.getMin();
124 | } else {
125 | BasicUtils.assertFail("invalid outlier score aggregation strategy " + aggregationStrategy);
126 | }
127 | return aggrScore;
128 | }
129 |
130 | }
131 |
--------------------------------------------------------------------------------
/python/app/bvib.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | import os
19 | import sys
20 | from random import randint
21 | import time
22 | import math
23 | from datetime import datetime
24 | import matplotlib.pyplot as plt
25 | sys.path.append(os.path.abspath("../lib"))
26 | from util import *
27 | from mlutil import *
28 | from sampler import *
29 |
30 | """
31 | Machinery vibration time series with multiple harmonic components and random noise
32 | Inserts outliers with high frequency components indicating failure
33 | """
34 |
35 | def sinComponents(params):
36 | """
37 | 	returns a list of sine components
38 | """
39 | comps = list()
40 | for i in range(0, len(params), 2):
41 | amp = params[i]
42 | per = params[i + 1]
43 | phase = randomFloat(0, 2.0 * math.pi)
44 | co = (amp, per, phase)
45 | comps.append(co)
46 | return comps
47 |
48 | def addSines(comps, sampTm):
49 | """
50 | 	adds multiple sine components
51 | """
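	# each component contributes amp * sin(phase + 2 * pi * (t mod period) / period)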
52 | val = 0
53 | for c in comps:
54 | t = 2.0 * math.pi * (sampTm % c[1]) / c[1]
55 | val += c[0] * math.sin(c[2] + t)
56 | return val
57 |
58 | if __name__ == "__main__":
59 | op = sys.argv[1]
60 | if op == "gen":
61 | #generate data
62 | ids = ["HG56SDFE", "K87JG9F6"]
63 | comps = dict()
64 | comps["HG56SDFE"] = sinComponents([52,40,76,20,5,80,7,30])
65 | comps["K87JG9F6"] = sinComponents([56,42,74,18,6,84,9,28])
66 | noise= NormalSampler(0,3)
67 | dur = int(sys.argv[2]) * 1000
68 | ctime = curTimeMs()
69 | ptime = ctime - dur
70 | sintv = 1
71 | stime = ptime
72 | while stime < ctime:
73 | for mid in ids:
74 | val = addSines(comps[mid], stime) + noise.sample()
75 | print("{},{},{:.3f}".format(mid, stime, val))
76 | stime += sintv
77 |
78 | elif op == "iplot":
79 | #plot
80 | fpath = sys.argv[2]
81 | mid = sys.argv[3]
82 | beg = int(sys.argv[4])
83 | end = int(sys.argv[5])
84 | filt = lambda r : r[0] == mid
85 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt)))
86 | drawLine(dvalues[beg:end])
87 |
88 | elif op == "iol":
89 | #insert outliers
90 | fpath = sys.argv[2]
91 | delay = int(sys.argv[3]) * 1000 * 2
92 | ocomps = sinComponents([36,12,30,8])
93 | i = 0
94 | for rec in fileRecGen(fpath, ","):
95 | mid = rec[0]
96 | if mid == "K87JG9F6" and i > delay:
97 | val = float(rec[2])
98 | stime = int(rec[1])
99 | val += addSines(ocomps, stime)
100 | rec[2] = "{:.3f}".format(val)
101 | print(",".join(rec))
102 | i += 1
103 |
104 | elif op == "oplot":
105 | #plot outliers
106 | fpath = sys.argv[2]
107 | mid = sys.argv[3]
108 | beg = int(sys.argv[4])
109 | end = int(sys.argv[5])
110 | filt = lambda r : r[0] == mid
111 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt)))
112 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt)))
113 | plt.plot(xvalues[beg:end], dvalues[beg:end])
114 | plt.title("outlier score")
115 | plt.show()
116 |
117 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt)))
118 | plt.plot(xvalues, dvalues, "b")
119 | ofilt = lambda r : r[0] == mid and r[4] == "O"
120 | oxvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, ofilt)))
121 | for t in oxvalues:
122 | plt.axvline(t, 0, .9, color="r")
123 | plt.title("outliers")
124 | plt.show()
125 |
126 |
127 | else:
128 | 		exitWithMsg("invalid command")
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/OutlierPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.FileInputStream;
21 | import java.util.Map;
22 | import java.util.Properties;
23 |
24 | import backtype.storm.Config;
25 | import backtype.storm.StormSubmitter;
26 | import backtype.storm.task.OutputCollector;
27 | import backtype.storm.task.TopologyContext;
28 | import backtype.storm.topology.OutputFieldsDeclarer;
29 | import backtype.storm.topology.TopologyBuilder;
30 | import backtype.storm.topology.base.BaseRichBolt;
31 | import backtype.storm.tuple.Fields;
32 | import backtype.storm.tuple.Tuple;
33 |
34 | /**
35 |  * Storm topology driver for outlier detection
36 | * @author pranab
37 | *
38 | */
39 | public class OutlierPredictor {
40 |
41 | /**
42 | * @author pranab
43 | *
44 | */
45 | public static class PredictorBolt extends BaseRichBolt {
46 | private OutputCollector collector;
47 | private ModelBasedPredictor predictor;
48 |
49 | /* (non-Javadoc)
50 | * @see backtype.storm.task.IBolt#prepare(java.util.Map, backtype.storm.task.TopologyContext, backtype.storm.task.OutputCollector)
51 | */
52 | public void prepare(Map stormConf, TopologyContext context,
53 | OutputCollector collector) {
54 | this.collector = collector;
55 | String strategy = stormConf.get("predictor.model").toString();
56 | if (strategy.equals("mm")){
57 | predictor = new MarkovModelPredictor(stormConf);
58 | }
59 | }
60 |
61 | /* (non-Javadoc)
62 | * @see backtype.storm.task.IBolt#execute(backtype.storm.tuple.Tuple)
63 | */
64 | public void execute(Tuple input) {
65 | String entityID = input.getString(0);
66 | String record = input.getString(1);
67 | double score = predictor.execute( entityID, record);
68 |
69 | //write score to db
70 |
71 | //ack
72 | collector.ack(input);
73 | }
74 |
75 | @Override
76 | public void declareOutputFields(OutputFieldsDeclarer declarer) {
77 |
78 | }
79 |
80 | }
81 |
82 | public static void main(String[] args) throws Exception {
83 | String topologyName = args[0];
84 | String configFilePath = args[1];
85 |
86 | FileInputStream fis = new FileInputStream(configFilePath);
87 | Properties configProps = new Properties();
88 | configProps.load(fis);
89 |
90 | 		//initialize config
91 | Config conf = new Config();
92 | conf.setDebug(true);
93 | for (Object key : configProps.keySet()){
94 | String keySt = key.toString();
95 | String val = configProps.getProperty(keySt);
96 | conf.put(keySt, val);
97 | }
98 |
99 | //spout
100 | TopologyBuilder builder = new TopologyBuilder();
101 | int spoutThreads = Integer.parseInt(configProps.getProperty("predictor.spout.threads"));
102 | builder.setSpout("predictorSpout", new PredictorSpout(), spoutThreads);
103 |
104 | //detector bolt
105 | int boltThreads = Integer.parseInt(configProps.getProperty("predictor.bolt.threads"));
106 | builder.setBolt("predictor", new PredictorBolt(), boltThreads)
107 | .fieldsGrouping("predictorSpout", new Fields("entityID"));
108 |
109 | //submit topology
110 | int numWorkers = Integer.parseInt(configProps.getProperty("num.workers"));
111 | conf.setNumWorkers(numWorkers);
112 | StormSubmitter.submitTopology(topologyName, conf, builder.createTopology());
113 |
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/python/app/olss.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | import os
19 | import sys
20 | import time
21 | import math
22 | import statistics
23 | import ntpath
24 | import matplotlib.pyplot as plt
25 | sys.path.append(os.path.abspath("../lib"))
26 | sys.path.append(os.path.abspath("../mlextra"))
27 | from util import *
28 | from mlutil import *
29 | from mcsim import *
30 |
31 | """
32 | Statistical test for outlier score to determine suitable score threshold
33 | """
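# typical invocation, as inferred from the argument handling in the main section below:
#   ./olss.py sttest <spark_output_dir> <key_length> [hist]
#   ./olss.py exvstat <spark_output_dir> <key_length> <prob_threshold> <extreme_prob_threshold>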
34 |
35 | def getKeyedOlScores(dirPath, keyLen):
36 | '''
37 | extracts outlier score from spark output files
38 | '''
39 | filePaths = getAllFiles(dirPath)
40 | scores = dict()
41 | if keyLen == 0:
42 | kstr = "all"
43 | for fpath in filePaths:
44 | fname = ntpath.basename(fpath)
45 | if fname.startswith("part"):
46 | print("processing {}".format(fpath))
47 | for rec in fileRecGen(fpath, ","):
48 | if keyLen > 0:
49 | kstr = ",".join(rec[0:keyLen])
50 | score = float(rec[-2])
51 | vl = scores.get(kstr)
52 | if vl is None:
53 | vl = list()
54 | scores[kstr] = vl
55 | vl.append(score)
56 | return scores
57 |
58 | def olScoreStat(dirPath, keyLen, shoHist):
59 | """
60 | upper tail statistic for outlier score
61 | """
62 | filePaths = getAllFiles(dirPath)
63 | scores = dict()
64 | if keyLen == 0:
65 | kstr = "all"
66 | for fpath in filePaths:
67 | fname = ntpath.basename(fpath)
68 | if fname.startswith("part"):
69 | print("processing {}".format(fpath))
70 | for rec in fileRecGen(fpath, ","):
71 | if keyLen > 0:
72 | kstr = ",".join(rec[0:keyLen])
73 | score = float(rec[-2])
74 | vl = scores.get(kstr)
75 | if vl is None:
76 | vl = list()
77 | scores[kstr] = vl
78 | vl.append(score)
79 |
80 | print("outlier score upper tail stats")
81 | sim = MonteCarloSimulator(None,None,None,None)
82 | for kstr, vl in scores.items():
83 | sim.setOutput(vl)
84 | if shoHist:
85 | sim.drawHist("outlier score", "score", "freq")
86 | stats = sim.getUpperTailStat(0)
87 | print("key: {}".format(kstr))
88 | for s in stats:
89 | print("{:.3f} {:.3f}".format(s[0], s[1]))
90 |
91 | def olScoreEvStat(dirPath, keyLen, prTh, exPrTh):
92 | """
93 | extreme value statistic for outlier score
94 | 	Paper: Anomaly Detection in Streams with Extreme Value Theory by Siffer et al.
95 | """
96 | scores = getKeyedOlScores(dirPath, keyLen)
97 |
98 | sim = MonteCarloSimulator(None,None,None,None)
99 | for kstr, vl in scores.items():
100 | sim.setOutput(vl)
101 | 		vth = sim.getCritValue(prTh)
102 |
103 | #values above threshold
104 | y = list(filter(lambda v : v > vth, vl))
105 | ymax = max(y)
106 | ymin = min(y)
107 | ymean = statistics.mean(y)
108 | xsmin = -1.0 / ymax
109 | xsmax = 2.0 * (ymean - ymin) / (ymean * ymean)
110 | delta = (xsmax - xsmin) / 100
111 | for xs in floatRange(xsmin, xsmax, delta):
112 | pass
113 |
114 |
115 |
116 | if __name__ == "__main__":
117 | technique = sys.argv[1]
118 | dirPath = sys.argv[2]
119 | keyLen = int(sys.argv[3])
120 |
121 | if technique == "sttest":
122 | """ outlier score upper tail statistics """
123 | shoHist = sys.argv[4] == "hist" if len(sys.argv) == 5 else False
124 | olScoreStat(dirPath, keyLen, shoHist)
125 |
126 | elif technique == "exvstat":
127 | """ extreme value statistic for outlier score """
128 | prTh = float(sys.argv[4])
129 | exPrTh = float(sys.argv[5])
130 | olScoreEvStat(dirPath, keyLen, prTh, exPrTh)
131 | else:
132 | exitWithMsg("invalid technique")
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/MahalanobisDistancePredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.chombo.math.MathUtils;
24 | import org.chombo.stats.MultiVariateStatsManager;
25 | import org.chombo.util.BasicUtils;
26 | import org.chombo.util.ConfigUtility;
27 |
28 | import Jama.Matrix;
29 |
30 |
31 | /**
32 | * Predictor based on Mahalanobis distance for multi variate data
33 | * @author pranab
34 | *
35 | */
36 | public class MahalanobisDistancePredictor extends ModelBasedPredictor {
37 | private MultiVariateStatsManager statsManager;
38 |
39 | /**
40 | * @param config
41 | * @param idOrdinalsParam
42 | * @param attrListParam
43 | * @param fieldDelimParam
44 | * @param statsFilePathParam
45 | * @param seasonalParam
46 | * @param hdfsFileParam
47 | * @param scoreThresholdParam
48 | * @param expConstParam
49 | * @param ignoreMissingStatParam
50 | * @param scoreAggggregationStrtaegyParam
51 | * @throws IOException
52 | */
53 | public MahalanobisDistancePredictor(Map config, String idOrdinalsParam, String attrListParam,
54 | String fieldDelimParam, String statsFilePathParam, String seasonalParam,String hdfsFileParam,
55 | String scoreThresholdParam, String expConstParam, String ignoreMissingStatParam)
56 | throws IOException {
57 | idOrdinals = ConfigUtility.getIntArray(config, idOrdinalsParam);
58 | attrOrdinals = ConfigUtility.getIntArray(config, attrListParam);
59 | fieldDelim = ConfigUtility.getString(config, fieldDelimParam, ",");
60 |
61 | String statsFilePath = ConfigUtility.getString(config, statsFilePathParam);
62 | boolean hdfsFilePath = ConfigUtility.getBoolean(config, hdfsFileParam);
63 | seasonal = ConfigUtility.getBoolean(config, seasonalParam);
64 | statsManager = new MultiVariateStatsManager(statsFilePath, fieldDelim, hdfsFilePath);
65 | scoreThreshold = ConfigUtility.getDouble(config, scoreThresholdParam);
66 | realTimeDetection = true;
67 | expConst = ConfigUtility.getDouble(config, expConstParam);
68 | ignoreMissingStat = ConfigUtility.getBoolean(config, ignoreMissingStatParam);
69 | }
70 |
71 | @Override
72 | public double execute(String entityID, String record) {
73 | // TODO Auto-generated method stub
74 | return 0;
75 | }
76 |
77 | @Override
78 | public double execute(String[] items, String compKey) {
79 | double score = 0;
80 | if (statsManager.statsExists(compKey)) {
81 | //extract input vector and subtract mean vector
82 | double[] data = BasicUtils.extractFieldsAsDoubleArray(items , attrOrdinals);
83 | Matrix input = MathUtils.createRowMatrix(data);
84 | Matrix inputOffset = MathUtils.subtractMatrix(input, statsManager.getMeanVec(compKey));
85 | Matrix inputOffsetTr = MathUtils.transposeMatrix(inputOffset);
86 |
87 |
88 | //mahalanobis distance
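			//squared distance (x - mean) * inverseCovariance * transpose(x - mean); a larger value means more anomalous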
89 | Matrix invCovar = statsManager.getInvCoVarMatrix(compKey);
90 | Matrix maDist = MathUtils.multiplyMatrix(inputOffset, invCovar);
91 | maDist = MathUtils.multiplyMatrix(maDist, inputOffsetTr);
92 | score = MathUtils.scalarFromMatrix(maDist);
93 | } else {
94 | BasicUtils.assertCondition(!ignoreMissingStat, "missing stats for key " + compKey );
95 | }
96 |
97 | //exponential normalization
98 | if (expConst > 0) {
99 | score = BasicUtils.expScale(expConst, score);
100 | }
101 |
102 | scoreAboveThreshold = score > scoreThreshold;
103 | return score;
104 | }
105 |
106 | @Override
107 | public boolean isValid(String compKey) {
108 | return statsManager.statsExists(compKey);
109 | }
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/EstimatedMetaProbabilityBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.apache.hadoop.conf.Configuration;
24 | import org.beymani.util.OutlierScoreAggregator;
25 | import org.chombo.stats.HistogramStat;
26 | import org.chombo.util.BasicUtils;
27 |
28 | /**
29 | * Based on probability of probability p(f(y) < f(x)). f(x) is density function
30 | * @author pranab
31 | *
32 | */
33 | public class EstimatedMetaProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor {
34 |
35 | public EstimatedMetaProbabilityBasedPredictor(Map conf) {
36 | super(conf);
37 | }
38 |
39 | /**
40 | * @param config
41 | * @param idOrdinalsParam
42 | * @param attrListParam
43 | * @param distrFilePathParam
44 | * @param hdfsFileParam
45 | * @param schemaFilePathParam
46 | * @param attrWeightParam
47 | * @param seasonalParam
48 | * @param fieldDelimParam
49 | * @param scoreThresholdParam
50 | * @param ignoreMissingDistrParam
51 | * @throws IOException
52 | */
53 | public EstimatedMetaProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam,
54 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam,
55 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam,
56 | String scoreStrategyParam, String expConstParam, String scoreAggggregationStrtaegyParam)
57 | throws IOException {
58 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam,
59 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, scoreStrategyParam, expConstParam,
60 | scoreAggggregationStrtaegyParam);
61 | }
62 |
63 | /**
64 | * @param config
65 | * @param distrFilePathParam
66 | * @param attrWeightParam
67 | * @param scoreThresholdParam
68 | * @param fieldDelimParam
69 | * @throws IOException
70 | */
71 | public EstimatedMetaProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam,
72 | String scoreThresholdParam, String fieldDelimParam)
73 | throws IOException {
74 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam);
75 | }
76 |
77 | @Override
78 | public double execute(String[] items, String compKey) {
79 | double score = 0;
80 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights);
81 | double thisScore = 0;
82 | for (int ord : attrOrdinals) {
83 | String keyWithFldOrd = compKey + fieldDelim + ord;
84 | double val = Double.parseDouble(items[ord]);
85 | System.out.println("keyWithFldOrd " + keyWithFldOrd);
86 | HistogramStat hist = keyedHist.get(keyWithFldOrd);
87 | if (null != hist) {
88 | double distr = hist.findMetaDistr(val);
89 | if (scoreStrategy.equals("inverse")) {
90 | thisScore = 1.0 - distr;
91 | } else {
92 | if (distr > 0) {
93 | thisScore = -Math.log(distr);
94 | } else {
95 | thisScore = 20.0;
96 | }
97 | }
98 | scoreAggregator.addScore(thisScore);
99 | } else {
100 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd);
101 | scoreAggregator.addScore();
102 | }
103 | }
104 | //aggregate score
105 | score = getAggregateScore(scoreAggregator);
106 |
107 | //exponential normalization
108 | if (expConst > 0) {
109 | score = BasicUtils.expScale(expConst, score);
110 | }
111 |
112 | scoreAboveThreshold = score > scoreThreshold;
113 | return score;
114 | }
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/python/app/bls.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | # avenir-python: Machine Learning
4 | # Author: Pranab Ghosh
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License"); you
7 | # may not use this file except in compliance with the License. You may
8 | # obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15 | # implied. See the License for the specific language governing
16 | # permissions and limitations under the License.
17 |
18 | import os
19 | import sys
20 | from random import randint
21 | import time
22 | import uuid
23 | import threading
24 | import matplotlib.pyplot as plt
25 | sys.path.append(os.path.abspath("../lib"))
26 | from util import *
27 | from sampler import *
28 |
29 | def createAnomaly(high):
30 | if high:
31 | reading = randomFloat(120, 200)
32 | else:
33 | reading = randomFloat(60, 80)
34 | return reading
35 |
36 | if __name__ == "__main__":
37 | op = sys.argv[1]
38 |
39 | #device stats
40 | if op == "stat":
41 | #normal mean 80 - 100 sd 1 - 5
42 | #anomaly mean 120 - 160 sd 1 - 5
43 | numDevs = int(sys.argv[2])
44 | mmin = int(sys.argv[3])
45 | mmax = int(sys.argv[4])
46 | smin = int(sys.argv[5])
47 | smax = int(sys.argv[6])
48 | for i in range(numDevs):
49 | mean = randomFloat(mmin, mmax)
50 | sd = randomFloat(smin, smax)
51 | devId = genID(12)
52 | #print "%s,%.3f,%.3f" %(devId, mean, sd)
53 | print("{},{:.3f},{:.3f}".format(devId, mean, sd))
54 |
55 | #generate reading
56 | elif op == "gen":
57 | statFile = sys.argv[2]
58 | numDays = int(sys.argv[3])
59 | modeNorm = (sys.argv[4] == "normal")
60 |
61 | devices = []
62 | for rec in fileRecGen(statFile, ","):
63 | ds = (rec[0], float(rec[1]), float(rec[2]))
64 | devices.append(ds)
65 |
66 |
67 | numDevs = len(devices)
68 | distrs = list(map(lambda d: GaussianRejectSampler(d[1],d[2]), devices))
69 |
70 | curTime = int(time.time())
71 | pastTime = curTime - (numDays + 1) * secInDay
72 | 		pastTime = (pastTime // secInDay) * secInDay + secInHour * 15  # integer division keeps the start aligned to a day boundary
73 | sampTime = pastTime
74 | sampIntv = secInDay
75 |
76 | anm = dict()
77 | anmDesc = dict()
78 | while(sampTime < curTime):
79 | for i in range(numDevs):
80 | d = devices[i]
81 | did = d[0]
82 | ts = sampTime + randint(-1000, 1000)
83 | sampled = False
84 | anomalyRate = 10 if (modeNorm) else 20
85 | if isEventSampled(anomalyRate):
86 | if not did in anm:
87 | #create anomaly
88 | high = isEventSampled(80)
89 | reading = createAnomaly(high)
90 | appendKeyedList(anm, did, reading)
91 | length = randint(1, 2) if(modeNorm) else randint(3, 7)
92 | desc = (length, high)
93 | anmDesc[did] = desc
94 | sampled = True
95 | #print "**** anomaly created %s, %d" %(did, reading)
96 |
97 | if not sampled:
98 | if did in anm:
99 | # ongoing anomaly
100 | ans = anm[did]
101 | desc = anmDesc[did]
102 | towardsNorm = len(ans) == desc[0]
103 | an = ans[0]
104 | if len(ans) == desc[0]:
105 | # moving toward normal from anomaly
106 | if isEventSampled(60):
107 | sampled = True
108 | reading = 0.85 * an if(desc[1]) else 1.15 * an
109 | #print "**** moving back to normal %s, %d" %(did, reading)
110 | del anm[did]
111 | del anmDesc[did]
112 | elif len(ans) < desc[0]:
113 | # continue anomaly
114 | reading = createAnomaly(desc[1])
115 | appendKeyedList(anm, did, reading)
116 | sampled = True
117 | #print "**** anomaly continued %s, %d" %(did, reading)
118 |
119 | if not sampled:
120 | # normal
121 | reading = distrs[i].sample()
122 |
123 | #print "%s,%d,%d" %(did, ts, int(reading))
124 | print("{},{},{}".format(did, ts, int(reading)))
125 | sampTime += sampIntv
126 |
127 | elif op == "oplot":
128 | #plot outliers
129 | fpath = sys.argv[2]
130 | mid = sys.argv[3]
131 | filt = lambda r : r[0] == mid
132 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt)))
133 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt)))
134 | plt.plot(xvalues, dvalues)
135 | plt.title("outlier score")
136 | plt.show()
137 |
--------------------------------------------------------------------------------
/python/app/cpu_usage.py:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/python3
2 |
3 | import os
4 | import sys
5 | from random import randint
6 | import time
7 | import uuid
8 | import threading
9 | sys.path.append(os.path.abspath("../lib"))
10 | from util import *
11 | from sampler import *
12 |
13 | op = sys.argv[1]
14 | secInHour = 60 * 60
15 | secInDay = 24 * secInHour
16 | secInWeek = 7 * secInDay
17 | secInYear = 365 * secInDay
18 |
19 | wkDayMean = 60
20 | wkDayStdDev = 12
21 | wkEndMean = 30
22 | wkEndStdDev = 8
23 |
24 | if op == "usage":
25 | numDays = int(sys.argv[2])
26 | sampIntv = int(sys.argv[3])
27 | numServers = int(sys.argv[4])
28 |
29 | outDayInWeek = True
30 | s = 5
31 | if len(sys.argv) > 5:
32 | #print(sys.argv[5])
33 | if sys.argv[5] == "false" or sys.argv[5] == "f":
34 | outDayInWeek = False
35 | s = 6
36 |
37 | serverList = None
38 | if len(sys.argv) > s:
39 | #server ID from stats file
40 | sfile = sys.argv[s]
41 | #print(sfile)
42 | servers = set()
43 | for rec in fileRecGen(sfile, ","):
44 | #print(rec[0])
45 | servers.add(rec[0])
46 | serverList = list(servers)
47 | else:
48 | #generate server ID
49 | serverList = list()
50 | for i in range(numServers):
51 | serverList.append(genID(10))
52 |
53 | curTime = int(time.time())
54 | pastTime = curTime - (numDays + 1) * secInDay
55 | sampTime = pastTime
56 | usageDistr = [GaussianRejectSampler(wkDayMean,wkDayStdDev), GaussianRejectSampler(wkEndMean,wkEndStdDev)]
57 |
58 | while(sampTime < curTime):
59 | secIntoDay = sampTime % secInDay
60 | #hourIntoDay = secIntoDay / secInHour
61 |
62 | secIntoWeek = sampTime % secInWeek
63 | daysIntoWeek = int(secIntoWeek / secInDay)
64 |
65 | if daysIntoWeek >= 0 and daysIntoWeek <= 4:
66 | distr = usageDistr[0]
67 | else:
68 | distr = usageDistr[1]
69 |
70 | for server in serverList:
71 | usage = distr.sample()
72 | if (usage < 0):
73 | usage = 5
74 | elif usage > 100:
75 | usage = 100
76 | usage = int(usage)
77 | st = sampTime + randint(-2,2)
78 | if outDayInWeek:
79 | #print "%s,%d,%d,%d" %(server, st, daysIntoWeek, usage)
80 | print("{},{},{},{}".format(server, st, daysIntoWeek, usage))
81 | else:
82 | #print "%s,%d,%d" %(server, st, usage)
83 | print("{},{},{}".format(server, st, usage))
84 |
85 | sampTime = sampTime + sampIntv
86 |
87 | elif op == "anomaly":
88 | fileName = sys.argv[2]
89 | count = 0
90 | for rec in fileRecGen(fileName, ","):
91 | if isEventSampled(8):
92 | dow = int(rec[2])
93 | if dow < 5:
94 | rec[3] = str(randint(94, 100))
95 | else:
96 | rec[3] = str(randint(54, 100))
97 | count += 1
98 | mrec = ",".join(rec)
99 | print(mrec)
100 | #print "num of anomalous records " + str(count)
101 |
102 | elif op == "feedback":
103 | fileName = sys.argv[2]
104 | curThreshold = float(sys.argv[3])
105 | newThreshold = float(sys.argv[4])
106 | margin = curThreshold + 0.6 * (newThreshold - curThreshold)
107 | count = 0
108 | for rec in fileRecGen(fileName, ","):
109 | score = float(rec[4])
110 | label = rec[5]
111 | if newThreshold > curThreshold:
112 | #false positive
113 | if label == "O":
114 | if score > newThreshold:
115 | flabel = "O"
116 | cl = "T"
117 | else:
118 | if score < margin or isEventSampled(90):
119 | flabel = "N"
120 | cl = "F"
121 | count += 1
122 | else:
123 | flabel = "O"
124 | cl = "T"
125 | else:
126 | flabel = "N"
127 | cl = "F"
128 | else:
129 | #false negative
130 | if label == "O":
131 | flabel = "O"
132 | cl = "T"
133 | else:
134 | if score > newThreshold:
135 | if score > margin or isEventSampled(90):
136 | flabel = "O"
137 | cl = "T"
138 | count += 1
139 | else:
140 | flabel = "N"
141 | cl = "F"
142 | else:
143 | flabel = "N"
144 | cl = "F"
145 | rec.append(flabel)
146 | rec.append(cl)
147 | mrec = ",".join(rec)
148 | print(mrec)
149 | #print count
150 |
151 | elif op == "addTrend":
152 | fileName = sys.argv[2]
153 | trendYearlyPercentRate = float(sys.argv[3])
154 | trenPerSec = trendYearlyPercentRate / secInYear
155 | start = None
156 | for rec in fileRecGen(fileName, ","):
157 | ts = int(rec[1])
158 | usage = float(rec[3])
159 | if start is None:
160 | start = ts
161 | else:
162 | usage = usage + (ts - start) * trenPerSec
163 | 		usageStr = "%.3f" %(usage)
164 | rec[3] = usageStr
165 | mrec = ",".join(rec)
166 | print(mrec)
167 |
168 |
169 |
170 |
--------------------------------------------------------------------------------
/spark/src/main/scala/org/beymani/spark/common/PseudoRelevanceThresholdFinder.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani-spark: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.spark.common
19 |
20 | import org.chombo.spark.common.JobConfiguration
21 | import org.apache.spark.SparkContext
22 | import scala.collection.JavaConverters._
23 | import org.chombo.util.BasicUtils
24 | import org.chombo.spark.common.Record
25 | import org.chombo.util.BaseAttribute
26 | import com.typesafe.config.Config
27 |
28 | /**
29 |  * Finds threshold based on pseudo relevance, e.g. top n or top n percent
30 | * @author pranab
31 | *
32 | */
33 | object PseudoRelevanceThresholdFinder extends JobConfiguration {
34 | /**
35 | * @param args
36 | * @return
37 | */
38 | def main(args: Array[String]) {
39 | val appName = "outlierCounter"
40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
41 | val config = createConfig(configFile)
42 | val sparkConf = createSparkConf(appName, config, false)
43 | val sparkCntxt = new SparkContext(sparkConf)
44 | val appConfig = config.getConfig(appName)
45 |
46 | //configuration params
47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",")
48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",")
49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length")
50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3)
51 | val relevanceThreshold = getMandatoryDoubleParam(appConfig, "relevance.threshold", "missing relevance threshold")
52 | val relevanceAsPercentage = getBooleanParamOrElse(appConfig, "relevance.asPercentage", true)
53 | val minSampleCount = getMandatoryIntParam(appConfig, "sample.minCount", "missing min sample count")
54 | val thresholdPath = getMandatoryStringParam(appConfig, "threshold.filePath", "missing stat file path")
55 | val thresholdMap = BasicUtils.getKeyedValues(thresholdPath, keyLen, keyLen)
56 | val defaultThreshold = getMandatoryDoubleParam(appConfig, "threshold.default", "missing default threshold")
57 | val debugOn = appConfig.getBoolean("debug.on")
58 | val saveOutput = appConfig.getBoolean("save.output")
59 |
60 | //input
61 | val data = sparkCntxt.textFile(inputPath)
62 |
63 | val keyedThresholds = data.map(line => {
64 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn)
65 | val keyRec = Record(items, 0, keyLen)
66 | val last = items.length - 1
67 | val score = items(last -1).toDouble
68 | (keyRec, score)
69 | }).groupByKey.map(r => {
70 | val key = r._1
71 | val scores = r._2.toList
72 | val sortedScores = scores.sortWith((v1,v2) => v1 > v2)
73 | val size = sortedScores.length
74 | val threshold =
75 | if (size > minSampleCount) {
76 | //find threshold
77 | val thresholdIndex =
78 | if (relevanceAsPercentage) {
79 | ((size * relevanceThreshold) / 100).toInt - 1
80 | } else {
81 | val indx = relevanceThreshold.toInt - 1
82 | if (indx > size-2) {
83 | throw new IllegalStateException("absolute threshold value too big")
84 | }
85 | indx
86 | }
87 | 	        sortedScores.slice(thresholdIndex - 1, thresholdIndex + 2).sum / 3
88 | } else {
89 | //use existing threshold or default
90 | val keyStr = key.toString(fieldDelimOut)
91 | if (thresholdMap.containsKey(keyStr)) thresholdMap.get(keyStr).toDouble
92 | else defaultThreshold
93 | }
94 | key.toString(fieldDelimOut) + fieldDelimOut + BasicUtils.formatDouble(threshold, precision)
95 | })
96 |
97 | if (debugOn) {
98 | val records = keyedThresholds.collect.slice(0, 20)
99 | records.foreach(r => println(r))
100 | }
101 |
102 | if(saveOutput) {
103 | keyedThresholds.saveAsTextFile(outputPath)
104 | }
105 |
106 | }
107 | }
--------------------------------------------------------------------------------
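Note: the job above derives a per-key score threshold directly from the scored data: scores are sorted in descending order, the cut point is the top n (absolute) or top n percent position, and the threshold is an average of a few scores around that cut point; keys with too few samples fall back to a previously learned or default threshold. A minimal Python sketch of the cut-point logic for one key, for illustration only (the 3-score averaging around the cut point follows the intent of the code above):

    def pseudo_relevance_threshold(scores, relevance, as_percentage=True,
                                   min_samples=10, default=1.0):
        if len(scores) <= min_samples:
            return default            # fall back when the sample is too small
        ranked = sorted(scores, reverse=True)
        if as_percentage:
            idx = int(len(ranked) * relevance / 100) - 1
        else:
            idx = int(relevance) - 1
        idx = max(1, min(idx, len(ranked) - 2))
        # average the score at the cut point with its two neighbors
        return sum(ranked[idx - 1:idx + 2]) / 3.0

    scores = [0.9, 0.8, 0.75, 0.6, 0.5, 0.4, 0.3, 0.25, 0.2, 0.1, 0.05]
    print(pseudo_relevance_threshold(scores, 20))   # threshold near the top 20 percent cut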
/src/main/java/org/beymani/util/DataStreamSchema.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.util;
19 |
20 | import java.io.FileInputStream;
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 | import java.io.Serializable;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 |
27 | import org.chombo.util.BasicUtils;
28 | import org.codehaus.jackson.annotate.JsonIgnoreProperties;
29 | import org.codehaus.jackson.map.ObjectMapper;
30 |
31 | @JsonIgnoreProperties(ignoreUnknown = true)
32 | public class DataStreamSchema implements Serializable {
33 | 	private List<DataStream> dataStreams;
34 |
35 | /**
36 | *
37 | */
38 | public DataStreamSchema() {
39 | }
40 |
41 | /**
42 | * @return
43 | */
44 | 	public List<DataStream> getDataStreams() {
45 | return dataStreams;
46 | }
47 |
48 | /**
49 | * @param dataStreams
50 | */
51 | 	public void setDataStreams(List<DataStream> dataStreams) {
52 | this.dataStreams = dataStreams;
53 | }
54 |
55 | /**
56 | * @param type
57 | * @return
58 | */
59 | public DataStream findByType(String type) {
60 | DataStream stream = null;
61 | for (DataStream daStrm : dataStreams) {
62 | if (daStrm.getType().equals(type)) {
63 | stream = daStrm;
64 | break;
65 | }
66 | }
67 | return stream;
68 | }
69 |
70 | /**
71 | * @param type
72 | * @return
73 | */
74 | 	public List<DataStream> findAllByType(String type) {
75 | 		List<DataStream> streams = new ArrayList<DataStream>();
76 | for (DataStream daStrm : dataStreams) {
77 | if (daStrm.getType().equals(type)) {
78 | streams.add(daStrm);
79 | }
80 | }
81 | return streams;
82 | }
83 |
84 | /**
85 | * @param type
86 | * @return
87 | */
88 | public DataStream findByTypeAndId(String type, String id) {
89 | DataStream stream = null;
90 | for (DataStream daStrm : dataStreams) {
91 | if (daStrm.getId().equals("*")) {
92 | if (daStrm.getType().equals(type)) {
93 | boolean done = false;
94 | 					List<DataStream> parents = findAllByType(daStrm.getParentType());
95 | for (DataStream pa : parents) {
96 | 						List<String> children = pa.getChildrenId();
97 | BasicUtils.assertNotNull(children, "missing child ID list in parent");
98 | if (children.contains(id)) {
99 | BasicUtils.assertCondition(daStrm.getParentId().equals(pa.getId()), "mismatched parent ID");
100 | stream = daStrm;
101 | done = true;
102 | break;
103 | }
104 | }
105 | if (done)
106 | break;
107 | }
108 | } else {
109 | if (daStrm.getType().equals(type) && daStrm.getId().equals(id)) {
110 | stream = daStrm;
111 | break;
112 | }
113 | }
114 | }
115 | return stream;
116 | }
117 |
118 | /**
119 | * @param type
120 | * @param id
121 | * @return
122 | */
123 | public DataStream findParent(String type, String id) {
124 | DataStream parentStream = null;
125 | DataStream stream = findByType(type);
126 | BasicUtils.assertNotNull(stream, "coud not find data stream object");
127 | parentStream = findByType(stream.getParentType());
128 | if (!parentStream.isSingleton()) {
129 | //instance based
130 | stream = findByTypeAndId(type, id);
131 | parentStream = findByTypeAndId(stream.getParentType(), stream.getParentId());
132 | }
133 | return parentStream;
134 | }
135 |
136 | /**
137 | * @param type
138 | * @return
139 | */
140 | public String findParentType(String type) {
141 | DataStream stream = findByType(type);
142 | BasicUtils.assertNotNull(stream, "coud not find data stream object");
143 | return stream.getParentType();
144 | }
145 |
146 | /**
147 | * @param path
148 | * @return
149 | * @throws IOException
150 | */
151 | public static DataStreamSchema loadDataStreamSchema(String path) throws IOException {
152 | InputStream fs = new FileInputStream(path);
153 | ObjectMapper mapper = new ObjectMapper();
154 | DataStreamSchema schema = mapper.readValue(fs, DataStreamSchema.class);
155 | return schema;
156 | }
157 |
158 | }
159 |
--------------------------------------------------------------------------------
/src/main/java/org/beymani/predictor/InterPercentileDifferenceBasedPredictor.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.predictor;
19 |
20 | import java.io.IOException;
21 | import java.util.Map;
22 |
23 | import org.apache.hadoop.conf.Configuration;
24 | import org.beymani.util.OutlierScoreAggregator;
25 | import org.chombo.stats.HistogramStat;
26 | import org.chombo.util.BasicUtils;
27 |
28 | /**
29 | * Inter percentile difference (25% and 75%) based predictor
30 | * @author pranab
31 | *
32 | */
33 | public class InterPercentileDifferenceBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor {
34 | private static final int QUARTER_PERECENTILE = 25;
35 | private static final int THREE_QUARTER_PERECENTILE = 75;
36 |
37 | /**
38 | * @param conf
39 | */
40 | public InterPercentileDifferenceBasedPredictor(Map conf) {
41 | super(conf);
42 | }
43 |
44 | /**
45 | * @param config
46 | * @param idOrdinalsParam
47 | * @param attrListParam
48 | * @param distrFilePathParam
49 | * @param hdfsFileParam
50 | * @param schemaFilePathParam
51 | * @param attrWeightParam
52 | * @param seasonalParam
53 | * @param fieldDelimParam
54 | * @param scoreThresholdParam
55 | * @param ignoreMissingDistrParam
56 | * @throws IOException
57 | */
58 | public InterPercentileDifferenceBasedPredictor(Map config,String idOrdinalsParam, String attrListParam,
59 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam,
60 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam,
61 | String expConstParam, String scoreAggggregationStrtaegyParam)
62 | throws IOException {
63 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam,
64 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", expConstParam,
65 | scoreAggggregationStrtaegyParam);
66 | }
67 |
68 | /**
69 | * @param config
70 | * @param distrFilePathParam
71 | * @param attrWeightParam
72 | * @param scoreThresholdParam
73 | * @param fieldDelimParam
74 | * @throws IOException
75 | */
76 | public InterPercentileDifferenceBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam,
77 | String scoreThresholdParam, String fieldDelimParam)
78 | throws IOException {
79 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam);
80 | }
81 |
82 | /* (non-Javadoc)
83 | * @see org.beymani.predictor.EsimatedAttrtibuteProbabilityBasedPredictor#execute(java.lang.String[], java.lang.String)
84 | */
85 | @Override
86 | public double execute(String[] items, String compKey) {
87 | double score = 0;
88 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights);
89 | double thisScore = 0;
90 | for (int ord : attrOrdinals) {
91 | String keyWithFldOrd = compKey + fieldDelim + ord;
92 | double val = Double.parseDouble(items[ord]);
93 | System.out.println("keyWithFldOrd " + keyWithFldOrd);
94 | HistogramStat hist = keyedHist.get(keyWithFldOrd);
95 | if (null != hist) {
96 | double quarterPercentile = hist.getQuantile(QUARTER_PERECENTILE);
97 | double threeQuarterPercentile = hist.getQuantile(THREE_QUARTER_PERECENTILE);
98 | double percentileDiff = threeQuarterPercentile - quarterPercentile;
99 | if (val < quarterPercentile) {
100 | thisScore = (quarterPercentile - val) / percentileDiff;
101 | } else if (val > threeQuarterPercentile){
102 | thisScore = (val - threeQuarterPercentile) / percentileDiff;
103 | 				} else { thisScore = 0; }
104 | scoreAggregator.addScore(thisScore);
105 | } else {
106 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd);
107 | scoreAggregator.addScore();
108 | }
109 | }
110 | //aggregate score
111 | score = getAggregateScore(scoreAggregator);
112 |
113 | //exponential normalization
114 | if (expConst > 0) {
115 | score = BasicUtils.expScale(expConst, score);
116 | }
117 |
118 | scoreAboveThreshold = score > scoreThreshold;
119 | return score;
120 | }
121 |
122 | }
123 |
--------------------------------------------------------------------------------
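Note: the predictor above scores each attribute value by how far it falls outside the 25th-75th percentile band of its historical distribution, normalized by the inter-percentile range; values inside the band score 0. A minimal Python sketch of the per-attribute score, for illustration only:

    def iqr_score(x, q25, q75):
        # distance outside the 25-75 percentile band, in units of the band width
        band = q75 - q25
        if x < q25:
            return (q25 - x) / band
        if x > q75:
            return (x - q75) / band
        return 0.0

    print(iqr_score(12.0, 40.0, 60.0))   # 1.4 band widths below the band
    print(iqr_score(50.0, 40.0, 60.0))   # 0.0, inside the band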
/spark/src/main/scala/org/beymani/spark/common/OutlierScoreLevelShift.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani-spark: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.spark.common
19 |
20 | import scala.Array.canBuildFrom
21 | import scala.collection.JavaConverters._
22 | import org.apache.spark.SparkContext
23 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
24 | import org.chombo.spark.common.GeneralUtility
25 | import org.chombo.spark.common.JobConfiguration
26 | import org.chombo.spark.common.Record
27 | import org.chombo.util.BasicUtils
28 | import org.hoidla.window.SizeBoundFloatStatsWindow
29 |
30 | /**
31 |  * Outlier detection based on a level shift in the outlier scores from any algorithm
32 | * @author pranab
33 | */
34 | object OutlierScoreLevelShift extends JobConfiguration with GeneralUtility {
35 |
36 | /**
37 | * @param args
38 | * @return
39 | */
40 | def main(args: Array[String]) {
41 | val appName = "outlierScoreLevelShift"
42 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
43 | val config = createConfig(configFile)
44 | val sparkConf = createSparkConf(appName, config, false)
45 | val sparkCntxt = new SparkContext(sparkConf)
46 | val appConfig = config.getConfig(appName)
47 |
48 | //configuration params
49 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",")
50 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",")
51 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal")
52 | val keyLen = getMandatoryIntParam(appConfig, "key.length", "missing key length")
53 | val longWindowSize = getMandatoryIntParam(appConfig, "window.longSize", "missing long window size")
54 | val shortWindowSize = getMandatoryIntParam(appConfig, "window.shortSize", "missing short window size")
55 | val minZscore = getMandatoryDoubleParam(appConfig, "zscore.min", "missing min z score")
56 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false)
57 | val saveOutput = getBooleanParamOrElse(appConfig,"save.output", true)
58 |
59 | //input
60 | val data = sparkCntxt.textFile(inputPath)
61 |
62 | val taggedData = data.map(line => {
63 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn)
64 | val key = Record(items, 0, keyLen)
65 | (key, items)
66 | }).groupByKey.flatMap(r => {
67 | val longWindow = new SizeBoundFloatStatsWindow(longWindowSize)
68 | val shortWindow = new SizeBoundFloatStatsWindow(shortWindowSize)
69 | val values = r._2.toArray.sortBy(v => {
70 | v(seqFieldOrd).toLong
71 | })
72 | val newTags = values.map(v => {
73 | val score = v(v.size - 2).toDouble
74 | val tag = v(v.size - 1)
75 | longWindow.add(score)
76 | shortWindow.add(score)
77 | var newTag = ""
78 | if (longWindow.isFull()) {
79 | val loMean = longWindow.getMean()
80 | val loStdDev = longWindow.getStdDev()
81 | val shMean = shortWindow.getMean()
82 | val levelBasedScore = (shMean - loMean) / loStdDev;
83 | newTag = if (levelBasedScore > minZscore) "O" else "N"
84 | } else {
85 | newTag = tag
86 | }
87 | val rec = Record(2)
88 | rec.add(tag,newTag)
89 | })
90 |
91 | //propagate outlier tag
92 | for (i <- longWindowSize to newTags.length -1) {
93 | if(newTags(i).getString(1) == "O") {
94 | for (j <- i - shortWindowSize + 1 to i - 1) {
95 | val tag = if (newTags(j).getString(0) == "I") "I" else "O"
96 | val rec = Record(2)
97 | rec.add(newTags(j).getString(0), tag)
98 | newTags(j) = rec
99 | }
100 | }
101 | }
102 |
103 | val recValues = values.map(v => Record(v))
104 | newTags.zip(recValues).map(r => {
105 | val newTag = r._1.getString(1)
106 | val rec = r._2.getString(0)
107 | rec + fieldDelimOut + newTag
108 | })
109 | })
110 |
111 | if (debugOn) {
112 | val records = taggedData.collect
113 | records.slice(0, 100).foreach(r => println(r))
114 | }
115 |
116 | if(saveOutput) {
117 | taggedData.saveAsTextFile(outputPath)
118 | }
119 |
120 | }
121 | }
--------------------------------------------------------------------------------
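Note: the level-shift logic above maintains a long and a short sliding window over the per-record outlier scores and re-tags a record as an outlier when the short-window mean sits more than zscore.min long-window standard deviations above the long-window mean; the outlier tag is then propagated backward over the short window. A minimal Python sketch of the core test, for illustration only (window bookkeeping simplified, no tag propagation):

    from collections import deque

    def level_shift_tags(scores, long_size=20, short_size=5, min_zscore=2.0):
        long_w, short_w = deque(maxlen=long_size), deque(maxlen=short_size)
        tags = []
        for s in scores:
            long_w.append(s)
            short_w.append(s)
            if len(long_w) == long_size:
                mean_l = sum(long_w) / long_size
                std_l = (sum((v - mean_l) ** 2 for v in long_w) / long_size) ** 0.5
                mean_s = sum(short_w) / short_size
                z = (mean_s - mean_l) / std_l if std_l > 0 else 0.0
                tags.append("O" if z > min_zscore else "N")
            else:
                tags.append("N")      # keep the existing tag until the long window fills
        return tags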
/resource/and_spark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
10 | "crInput")
11 | echo "args: num_of_days time_interval(sec) num_of_servers output_file"
12 | ./cpu_usage.py usage $2 $3 $4 true > $5
13 | ls -l $5
14 | ;;
15 |
16 | "crTestInput")
17 | ./cpu_usage.py usage $2 $3 $4 true $5 > $6
18 | ls -l $6
19 | ;;
20 |
21 | "insOutliers")
22 | echo "args: normal_data_file output_file"
23 | ./cpu_usage.py anomaly $2 > $3
24 | ls -l $3
25 | ;;
26 |
27 | "cpModData")
28 | echo "args: modeling_data_file "
29 | rm $PROJECT_HOME/bin/beymani/input/olp/*
30 |     rm $PROJECT_HOME/bin/beymani/input/nas/*
31 | cp $2 $PROJECT_HOME/bin/beymani/input/nas/
32 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/
33 | ls -l $PROJECT_HOME/bin/beymani/input/nas
34 | ls -l $PROJECT_HOME/bin/beymani/input/olp
35 | ;;
36 |
37 | "cpTestData")
38 | echo "args: test_data_file "
39 | rm $PROJECT_HOME/bin/beymani/input/olp/*
40 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/
41 | ls -l $PROJECT_HOME/bin/beymani/input/olp
42 | ;;
43 |
44 | "numStat")
45 | echo "running NumericalAttrStats Spark job"
46 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats
47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/nas/cusage.txt
48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/nas
49 | rm -rf ./output/nas
50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf
52 | ls -l $PROJECT_HOME/bin/beymani/output/nas/
53 | ;;
54 |
55 | "crStatsFile")
56 | echo "copying and consolidating stats file"
57 | rm $PROJECT_HOME/bin/beymani/output/nas/_SUCCESS
58 | SFILE=$PROJECT_HOME/bin/beymani/other/olp/stats.txt
59 | cp /dev/null $SFILE
60 | for f in $PROJECT_HOME/bin/beymani/output/nas/*
61 | do
62 | echo "Copying file $f ..."
63 | cat $f >> $SFILE
64 | done
65 | ls -l $PROJECT_HOME/bin/beymani/other/olp
66 | ;;
67 |
68 | "olPred")
69 | echo "running StatsBasedOutlierPredictor Spark job"
70 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor
71 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/olp/*
72 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/olp
73 | rm -rf ./output/olp
74 | rm -rf ./other/olp/clean
75 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
76 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf
77 | rm ./output/olp/_SUCCESS
78 | for f in ./output/olp/*
79 | do
80 | echo "number of records in $f"
81 | wc -l $f
82 | done
83 |
84 | for f in ./output/olp/*
85 | do
86 | echo "number of outliers in $f"
87 | cat $f | grep ,O | wc -l
88 | done
89 |
90 | ;;
91 |
92 | "crCleanFile")
93 | echo "copying, consolidating and moving clean training data file"
94 | rm $PROJECT_HOME/bin/beymani/other/olp/clean/_SUCCESS
95 | CFILE=$PROJECT_HOME/bin/beymani/other/olp/clean/cusage.txt
96 | cp /dev/null $CFILE
97 | echo "creating clean file $CFILE"
98 | for f in $PROJECT_HOME/bin/beymani/other/olp/clean/*
99 | do
100 | echo "Copying file $f ..."
101 | cat $f >> $CFILE
102 | done
103 | echo "copying clean file to model input directory"
104 | mv $PROJECT_HOME/bin/beymani/input/nas/cusage.txt $PROJECT_HOME/bin/beymani/other/nas/cusage_1.txt
105 | mv $CFILE $PROJECT_HOME/bin/beymani/input/nas/cusage.txt
106 | echo "backing up current model file"
107 | mv $PROJECT_HOME/bin/beymani/other/olp/stats.txt $PROJECT_HOME/bin/beymani/other/olp/stats_1.txt
108 | ls -l $PROJECT_HOME/bin/beymani/input/nas/
109 | ;;
110 |
111 |
112 | "mvOutlFile")
113 | echo "moving outlier output file"
114 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00000 > $PROJECT_HOME/bin/beymani/other/olp/outl.txt
115 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00001 >> $PROJECT_HOME/bin/beymani/other/olp/outl.txt
116 | ;;
117 |
118 | "thLearn")
119 | echo "running ThresholdLearner Spark job"
120 | CLASS_NAME=org.beymani.spark.common.ThresholdLearner
121 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/thl/olf.txt
122 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/thl
123 | rm -rf ./output/thl
124 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
125 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf
126 | ;;
127 |
128 | "tempAggr")
129 | echo "running TemporalAggregator Spark job"
130 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator
131 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt
132 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg
133 | rm -rf ./output/teg
134 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
135 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf
136 | ;;
137 |
138 |
139 | *)
140 | echo "unknown operation $1"
141 | ;;
142 |
143 | esac
--------------------------------------------------------------------------------
/spark/src/main/scala/org/beymani/spark/common/OutlierCounter.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani-spark: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.spark.common
19 |
20 | import org.chombo.spark.common.JobConfiguration
21 | import org.apache.spark.SparkContext
22 | import scala.collection.JavaConverters._
23 | import org.chombo.util.BasicUtils
24 | import org.chombo.spark.common.Record
25 | import org.chombo.util.BaseAttribute
26 | import com.typesafe.config.Config
27 |
28 | /**
29 | * Outlier count statistics
30 | * @author pranab
31 | *
32 | */
33 | object OutlierCounter extends JobConfiguration {
34 | /**
35 | * @param args
36 | * @return
37 | */
38 | def main(args: Array[String]) {
39 | val appName = "outlierCounter"
40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
41 | val config = createConfig(configFile)
42 | val sparkConf = createSparkConf(appName, config, false)
43 | val sparkCntxt = new SparkContext(sparkConf)
44 | val appConfig = config.getConfig(appName)
45 |
46 | //configuration params
47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",")
48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",")
49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length")
50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3)
51 | val insertTimeStamp = getBooleanParamOrElse(appConfig, "output.insertTmStmp", false)
52 | val tmStmp = if (insertTimeStamp) System.currentTimeMillis() else 0
53 | val normTag = "N"
54 | val outlierTag = "O"
55 | val indeterTag = "I"
56 | val totalTag = "T"
57 | val debugOn = appConfig.getBoolean("debug.on")
58 | val saveOutput = appConfig.getBoolean("save.output")
59 |
60 | //input
61 | val data = sparkCntxt.textFile(inputPath)
62 |
63 | //key by record key and record status
64 | val keyedCounters = data.flatMap(line => {
65 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn)
66 | val counters = for (i <- 0 to 1) yield {
67 | val keyRec = Record(keyLen+1, items, 0, keyLen)
68 | if (i == 0) keyRec.addString(items(items.length-1))
69 | else keyRec.addString(totalTag)
70 | (keyRec, 1)
71 | }
72 | counters
73 | }).reduceByKey((v1,v2) => v1+v2)
74 |
75 | //formatted count statistics for each key
76 | val formattedCountRecs = keyedCounters.map(r => {
77 | val keyRec = Record(r._1, 0, keyLen)
78 | val valRec = Record(2)
79 | valRec.addString(r._1.getString(keyLen))
80 | valRec.addInt(r._2)
81 | (keyRec, valRec)
82 | }).groupByKey().map(r => {
83 | val key = r._1
84 | val values = r._2.toArray
85 | var outlierCount = 0
86 | var indeterCount = 0
87 | var normCount = 0
88 | var totalCount = 0
89 | for (v <- values) {
90 | v.getString(0) match {
91 | case `outlierTag` => outlierCount = v.getInt(1)
92 | case `indeterTag` => indeterCount = v.getInt(1)
93 | case `normTag` => normCount = v.getInt(1)
94 | case `totalTag` => totalCount = v.getInt(1)
95 | }
96 | }
97 | val outlierPercent = (outlierCount * 100).toDouble / totalCount
98 | val indeterPercent = (indeterCount * 100).toDouble / totalCount
99 | val normPercent = (normCount * 100).toDouble / totalCount
100 |
101 | val stBld = new StringBuilder(key.toString(fieldDelimOut))
102 | if (insertTimeStamp)
103 | stBld.append(fieldDelimOut).append(tmStmp)
104 | stBld.
105 | append(fieldDelimOut).append(outlierCount).
106 | append(fieldDelimOut).append(BasicUtils.formatDouble(outlierPercent, precision)).
107 | append(fieldDelimOut).append(indeterCount).
108 | append(fieldDelimOut).append(BasicUtils.formatDouble(indeterPercent, precision)).
109 | append(fieldDelimOut).append(normCount).
110 | append(fieldDelimOut).append(BasicUtils.formatDouble(normPercent, precision)).
111 | append(fieldDelimOut).append(totalCount)
112 |
113 | stBld.toString()
114 | })
115 |
116 | if (debugOn) {
117 | val records = formattedCountRecs.collect.slice(0, 20)
118 | records.foreach(r => println(r))
119 | }
120 |
121 | if(saveOutput) {
122 | formattedCountRecs.saveAsTextFile(outputPath)
123 | }
124 | }
125 |
126 | }
--------------------------------------------------------------------------------
/resource/ecomm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PROJECT_HOME=/Users/pranab/Projects
4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar
5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar
6 | MASTER=spark://akash:7077
7 |
8 | case "$1" in
9 |
10 | "loadInp")
11 | rm $PROJECT_HOME/bin/beymani/input/ecom/$3/*
12 | cp $2 $PROJECT_HOME/bin/beymani/input/ecom/$3/
13 | ls -l $PROJECT_HOME/bin/beymani/input/ecom/$3/
14 | ;;
15 |
16 |
17 | "numStat")
18 | echo "running NumericalAttrStats Spark job"
19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats
20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/*
21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/stat
22 | rm -rf ./output/ecom/stat
23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf
25 | ;;
26 |
27 | "numMstat")
28 | echo "running NumericalAttrMedian Spark job"
29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian
30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/*
31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/mstat
32 | rm -rf ./output/ecom/mstat
33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf
35 | rm ./output/ecom/mstat/_SUCCESS
36 | ls -l ./output/ecom/mstat
37 | ;;
38 |
39 | "bkMod")
40 | echo "backing up model files"
41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ecom/mstat/*
42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom
43 | META_FILE=$META_DIR/$2
44 | echo "copying to $META_FILE"
45 | cp /dev/null $META_FILE
46 | for f in $MED_FILES
47 | do
48 | echo "Copying file $f ..."
49 | cat $f >> $META_FILE
50 | done
51 | ls -l $META_FILE
52 | ;;
53 |
54 | "cpMod")
55 | echo "copying model files files from backup"
56 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom
57 | cp $META_DIR/$2 $META_DIR/
58 | ls -l $META_DIR
59 | ;;
60 |
61 | "olPred")
62 | echo "running StatsBasedOutlierPredictor Spark job"
63 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor
64 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/pred/*
65 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/olp
66 | rm -rf ./output/ecom/olp
67 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
68 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf
69 | rm ./output/ecom/olp/_SUCCESS
70 | ls -l ./output/ecom/olp
71 | cat ./output/ecom/olp/part-00000 | grep ,O
72 | ;;
73 |
74 | "chkOl")
75 | echo "number of outliers"
76 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/*
77 | for f in $OUT_FILES
78 | do
79 | echo "checking file $f ..."
80 | wc -l $f
81 | done
82 | ;;
83 |
84 | "bkOut")
85 | echo "backing up outlier output files"
86 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/*
87 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup
88 | BK_FILE=$BK_DIR/$2
89 | cp /dev/null $BK_FILE
90 | for f in $OUT_FILES
91 | do
92 | echo "Copying file $f ..."
93 | cat $f >> $BK_FILE
94 | done
95 | ls -l $BK_FILE
96 | ;;
97 |
98 | "rmAggrInp")
99 | echo "removing outlier aggregation input files"
100 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr
101 | rm $IN_DIR/*
102 | ls -l $IN_DIR
103 | ;;
104 |
105 | "loadAggrInp")
106 | echo "copying outlier output files for aggregation"
107 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr/
108 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup
109 | cp $BK_DIR/$2 $IN_DIR
110 | ls -l $IN_DIR
111 | ;;
112 |
113 |
114 | "aggrOl")
115 | echo "running OutlierAggregator Spark job"
116 | CLASS_NAME=org.beymani.spark.common.OutlierAggregator
117 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/aggr/*
118 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/aggr
119 | rm -rf ./output/ecom/aggr
120 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
121 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf
122 | rm ./output/ecom/aggr/_SUCCESS
123 | ls -l ./output/ecom/aggr
124 | cat ./output/ecom/aggr/part-00000 | grep ,O
125 | ;;
126 |
127 |
128 | "bkOutAggr")
129 | echo "backing up aggregator output files"
130 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/aggr/*
131 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup
132 | BK_FILE=$BK_DIR/$2
133 | cp /dev/null $BK_FILE
134 | for f in $OUT_FILES
135 | do
136 | echo "Copying file $f ..."
137 | cat $f >> $BK_FILE
138 | done
139 | ls -l $BK_FILE
140 | ;;
141 |
142 | "orpOlPred")
143 | echo "running IsolationForestModel Spark job"
144 | CLASS_NAME=org.beymani.spark.multi.IsolationForestModel
145 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/orp/*
146 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/orp
147 | rm -rf ./output/ecom/orp
148 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \
149 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf
150 | rm ./output/ecom/orp/_SUCCESS
151 | ls -l ./output/ecom/orp
152 | cat ./output/ecom/orp/part-00000 | grep ,O
153 | ;;
154 |
155 | *)
156 | echo "unknown operation $1"
157 | ;;
158 |
159 | esac
--------------------------------------------------------------------------------
/src/main/java/org/beymani/proximity/RelativeDensity.java:
--------------------------------------------------------------------------------
1 | package org.beymani.proximity;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.conf.Configuration;
6 | import org.apache.hadoop.conf.Configured;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.hadoop.io.LongWritable;
9 | import org.apache.hadoop.io.NullWritable;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.mapreduce.Job;
12 | import org.apache.hadoop.mapreduce.Mapper;
13 | import org.apache.hadoop.mapreduce.Reducer;
14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
16 | import org.apache.hadoop.util.Tool;
17 | import org.apache.hadoop.util.ToolRunner;
18 | import org.apache.log4j.Level;
19 | import org.apache.log4j.Logger;
20 | import org.chombo.util.TextInt;
21 | import org.chombo.util.Tuple;
22 | import org.chombo.util.Utility;
23 |
24 | public class RelativeDensity extends Configured implements Tool {
25 |
26 | @Override
27 | public int run(String[] args) throws Exception {
28 | Job job = new Job(getConf());
29 | String jobName = "Relative density";
30 | job.setJobName(jobName);
31 |
32 | job.setJarByClass(RelativeDensity.class);
33 |
34 | FileInputFormat.addInputPath(job, new Path(args[0]));
35 | FileOutputFormat.setOutputPath(job, new Path(args[1]));
36 |
37 | job.setMapperClass(RelativeDensity.DensityMapper.class);
38 | job.setReducerClass(RelativeDensity.DensityReducer.class);
39 |
40 | job.setMapOutputKeyClass(Text.class);
41 | job.setMapOutputValueClass(Tuple.class);
42 |
43 | job.setOutputKeyClass(NullWritable.class);
44 | job.setOutputValueClass(Text.class);
45 |
46 | Utility.setConfiguration(job.getConfiguration());
47 |
48 | job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));
49 |
50 | int status = job.waitForCompletion(true) ? 0 : 1;
51 | return status;
52 | }
53 |
54 | 	public static class DensityMapper extends Mapper<LongWritable, Text, Text, Tuple> {
55 | private String fieldDelimRegex;
56 | private String fieldDelim;
57 | private String[] items ;
58 | private Text outKey = new Text();
59 | private Tuple outVal = new Tuple();
60 |
61 | protected void setup(Context context) throws IOException, InterruptedException {
62 | fieldDelim = context.getConfiguration().get("field.delim", ",");
63 | fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]");
64 | }
65 |
66 | @Override
67 | protected void map(LongWritable key, Text value, Context context)
68 | throws IOException, InterruptedException {
69 | outVal.initialize();
70 | items = value.toString().split(fieldDelimRegex);
71 | outKey.set(items[0]);
72 | outVal.add(items[1], Integer.parseInt(items[2]));
73 | context.write(outKey, outVal);
74 | }
75 | }
76 |
77 | /**
78 | * @author pranab
79 | *
80 | */
81 | 	public static class DensityReducer extends Reducer<Text, Tuple, NullWritable, Text> {
82 | private String fieldDelim;
83 | private String groupID;
84 | private String entityID;
85 | private int sumDensity;
86 | private int density;
87 | private int relDensity;
88 | private Text outVal = new Text();
89 | private int relDensityScale;
90 | private static final Logger LOG = Logger.getLogger(DensityReducer.class);
91 |
92 | protected void setup(Context context) throws IOException, InterruptedException {
93 | Configuration conf = context.getConfiguration();
94 | fieldDelim = conf.get("field.delim", ",");
95 | relDensityScale = context.getConfiguration().getInt("red.reltive.density.scale", 1000);
96 | if (conf.getBoolean("debug.on", false)) {
97 | LOG.setLevel(Level.DEBUG);
98 | }
99 | }
100 |
101 | /* (non-Javadoc)
102 | * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
103 | */
104 |     	protected void reduce(Text key, Iterable<Tuple> values, Context context)
105 | throws IOException, InterruptedException {
106 | groupID = key.toString();
107 | sumDensity = 0;
108 | density = 0;
109 | for (Tuple val : values) {
110 | entityID = val.getString(0);
111 | if (entityID.equals(groupID)) {
112 | density = val.getInt(1);
113 | LOG.debug("entityID:" + entityID + " density:" + density);
114 | }
115 | sumDensity += val.getInt(1);
116 | }
117 |
118 | relDensity = (density * relDensityScale) / sumDensity;
119 | outVal.set(groupID + fieldDelim +relDensity);
120 | context.write(NullWritable.get(), outVal);
121 | }
122 |
123 | }
124 |
125 | /**
126 | * @param args
127 | */
128 | public static void main(String[] args) throws Exception {
129 | int exitCode = ToolRunner.run(new RelativeDensity(), args);
130 | System.exit(exitCode);
131 | }
132 |
133 |
134 | }
135 |
--------------------------------------------------------------------------------
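Note: the map-reduce job above computes, for each entity, its density relative to the combined density of its neighborhood group, scaled by an integer factor (1000 by default); a low relative density marks the entity as sparse compared to its neighbors. A minimal Python sketch of the reducer arithmetic, for illustration only:

    def relative_density(entity_density, group_densities, scale=1000):
        # group_densities includes the entity's own density
        return (entity_density * scale) // sum(group_densities)

    print(relative_density(4, [4, 10, 12, 14]))   # 100, i.e. sparse relative to the group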
/spark/src/main/scala/org/beymani/spark/seq/LocalNeighborhoodDetector.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani-spark: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.spark.seq
19 |
20 | import org.apache.spark.rdd.RDD
21 | import scala.collection.mutable.ArrayBuffer
22 | import scala.collection.JavaConverters._
23 | import scala.util.control.Breaks._
24 | import org.apache.spark.SparkContext
25 | import org.beymani.spark.common.OutlierUtility
26 | import org.chombo.spark.common.GeneralUtility
27 | import org.chombo.spark.common.JobConfiguration
28 | import org.chombo.spark.common.Record
29 | import org.chombo.util.BasicUtils
30 | import org.chombo.math.MathUtils
31 | import org.beymani.util.SeequenceScoreAggregator
32 | import org.hoidla.window.LocalNeighborhoodWindow
33 |
34 |
35 | /**
36 |  * Anomaly detection in sequence data based on nearest neighbors within a window.
37 | * @author pranab
38 | *
39 | */
40 | object LocalNeighborhoodDetector extends JobConfiguration with GeneralUtility with OutlierUtility {
41 |
42 | /**
43 | * @param args
44 | * @return
45 | */
46 | def main(args: Array[String]) {
47 | val appName = "localNeighborhoodDetector"
48 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
49 | val config = createConfig(configFile)
50 | val sparkConf = createSparkConf(appName, config, false)
51 | val sparkCntxt = new SparkContext(sparkConf)
52 | val appConfig = config.getConfig(appName)
53 |
54 | //configuration params
55 | val fieldDelimIn = appConfig.getString("field.delim.in")
56 | val fieldDelimOut = appConfig.getString("field.delim.out")
57 | val precision = getIntParamOrElse(appConfig, "output.precision", 3)
58 | val keyFieldOrdinals = toOptionalIntArray(getOptionalIntListParam(appConfig, "id.fieldOrdinals"))
59 | val attrOrd = getMandatoryIntParam(appConfig, "attr.ordinal")
60 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal")
61 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold")
62 | val windowSize = getIntParamOrElse(appConfig, "window.size", 3)
63 | val neighborhoodDist = getDoubleParamOrElse(appConfig, "neighborhood.dist", -1.0)
64 | val debugOn = appConfig.getBoolean("debug.on")
65 | val saveOutput = appConfig.getBoolean("save.output")
66 |
67 | BasicUtils.assertCondition(windowSize % 2 == 1, "window size should be odd")
68 | val keyLen = getOptinalArrayLength(keyFieldOrdinals, 1)
69 | val neighborhoodDistBased = neighborhoodDist > 0
70 | val neighborhoodSize = getConditionalMandatoryIntParam(!neighborhoodDistBased, appConfig, "neighborhood.size",
71 | "neighborhoosd size must be provided")
72 |
73 | //input
74 | val data = sparkCntxt.textFile(inputPath)
75 | val keyedData = getKeyedValueWithSeq(data, fieldDelimIn, keyLen, keyFieldOrdinals, seqFieldOrd)
76 |
77 | //records with tag and score
78 | val taggedData = keyedData.groupByKey.flatMap(v => {
79 | val key = v._1
80 | val values = v._2.toList.sortBy(v => v.getLong(0))
81 | val size = values.length
82 | val coffset = windowSize / 2
83 | val window = if (neighborhoodDistBased) {
84 | new LocalNeighborhoodWindow(windowSize, neighborhoodDist)
85 | } else {
86 | new LocalNeighborhoodWindow(windowSize, neighborhoodSize)
87 | }
88 | val scores = Array.fill[Double](size)(0)
89 | for (i <- 0 to size - 1) {
90 | val v = values(i)
91 | val line = v.getString(1)
92 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn)
93 | val quant = items(attrOrd).toDouble
94 | window.add(quant)
95 | if (window.isProcessed()) {
96 | val score = if (neighborhoodDistBased) window.getNumNeighbosWithin().toDouble
97 | else window.getAvNeighborDist()
98 | scores(i - coffset) = score
99 | }
100 | }
101 |
102 | //append score and tag
103 | val recScores = values.map(r => r.getString(1)).zip(scores)
104 | recScores.map(r => {
105 | val rec = r._1
106 | val score = r._2
107 | val tag = if (score > scoreThreshold) "O" else "N"
108 | rec + fieldDelimOut + BasicUtils.formatDouble(score, precision) + fieldDelimOut + tag
109 | })
110 | })
111 |
112 | if (debugOn) {
113 | val records = taggedData.collect
114 | records.slice(0, 50).foreach(r => println(r))
115 | }
116 |
117 | if(saveOutput) {
118 | taggedData.saveAsTextFile(outputPath)
119 | }
120 |
121 | }
122 |
123 | }
--------------------------------------------------------------------------------
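Note: the detector above slides an odd-sized window over each key's time-ordered sequence and scores the center point either by the number of neighbors within a given distance or by its average distance to the other points in the window; the score is then thresholded into "O"/"N" tags. A minimal Python sketch of the average-distance variant, for illustration only (the window mechanics of LocalNeighborhoodWindow are simplified):

    def avg_neighbor_dist_scores(values, window_size=5):
        # score for each center point: mean absolute distance to the other points in its window
        half = window_size // 2
        scores = [0.0] * len(values)
        for i in range(half, len(values) - half):
            center = values[i]
            neighbors = values[i - half:i] + values[i + 1:i + half + 1]
            scores[i] = sum(abs(center - n) for n in neighbors) / len(neighbors)
        return scores

    print(avg_neighbor_dist_scores([10, 11, 10, 30, 10, 11, 10]))   # peak score at the spike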
/spark/src/main/scala/org/beymani/spark/pc/PrincipalComponentPredictor.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * beymani-spark: Outlier and anomaly detection
3 | * Author: Pranab Ghosh
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License"); you
6 | * may not use this file except in compliance with the License. You may
7 | * obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14 | * implied. See the License for the specific language governing
15 | * permissions and limitations under the License.
16 | */
17 |
18 | package org.beymani.spark.pc
19 |
20 | import org.chombo.spark.common.JobConfiguration
21 | import org.apache.spark.SparkContext
22 | import scala.collection.JavaConverters._
23 | import org.chombo.util.BasicUtils
24 | import org.chombo.spark.common.Record
25 | import org.chombo.util.BaseAttribute
26 | import com.typesafe.config.Config
27 | import org.beymani.spark.common.OutlierUtility
28 | import org.chombo.spark.common.GeneralUtility
29 | import org.avenir.util.PrincipalCompState
30 | import org.chombo.math.MathUtils
31 |
32 | /**
33 | * PCA based outlier prediction
34 | * @author pranab
35 | *
36 | */
37 | object PrincipalComponentPredictor extends JobConfiguration with GeneralUtility {
38 | /**
39 | * @param args
40 | * @return
41 | */
42 | def main(args: Array[String]) {
43 | val appName = "principalComponentPredictor"
44 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
45 | val config = createConfig(configFile)
46 | val sparkConf = createSparkConf(appName, config, false)
47 | val sparkCntxt = new SparkContext(sparkConf)
48 | val appConfig = config.getConfig(appName)
49 |
50 | //configurations
51 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",")
52 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",")
53 | val keyFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "id.field.ordinals"))
54 | val quantFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "quant.field.ordinals"))
55 | val seqFieldOrd = getMandatoryIntParam( appConfig, "seq.field.ordinal", "missing sequence field ordinal")
56 | val dimension = quantFieldOrdinals.length
57 | val stateFilePath = this.getMandatoryStringParam(appConfig, "state.filePath", "missing pc state file path")
58 | val compState = PrincipalCompState.load(stateFilePath, fieldDelimOut).asScala.toMap
59 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold")
60 | val expConst = getDoubleParamOrElse(appConfig, "exp.const", 1.0)
61 | val precision = getIntParamOrElse(appConfig, "output.precision", 3)
62 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false)
63 | val saveOutput = getBooleanParamOrElse(appConfig, "save.output", true)
64 |
65 | //pc matrix and transposed pc matrix
66 | val pcFun = (state: PrincipalCompState) => {
67 | val pcArr = state.getPrincComps()
68 | val pc = MathUtils.createMatrix(pcArr)
69 | val pcTr = pc.transpose()
70 | (pc, pcTr)
71 | }
72 | val pcMa = updateMapValues(compState, pcFun)
73 |
74 | val data = sparkCntxt.textFile(inputPath)
75 | val taggedData = data.map(line => {
76 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn)
77 | val keyRec = Record(items, keyFieldOrdinals)
78 | val keyStr = keyRec.toString(fieldDelimIn)
79 | 	   val quantFields = BasicUtils.extractFieldsAsDoubleArray(items, quantFieldOrdinals)
80 | 	   var score = 0.0
81 | val tag = pcMa.get(keyStr) match {
82 | case Some(pc) => {
83 | val pcHidden = pc._1
84 | val pcNorm = pc._2
85 | val daNorm = MathUtils.createColMatrix(quantFields)
86 |
87 | //regenerate
88 | val daHideen = MathUtils.multiplyMatrix(pcHidden, daNorm)
89 | val daRegen = MathUtils.multiplyMatrix(pcNorm, daHideen)
90 |
91 | //error
92 | val quantFieldsGen = MathUtils.arrayFromColumnMatrix(daRegen)
93 | 	        score = MathUtils.vectorDiffNorm(quantFields, quantFieldsGen)
94 | if (expConst > 0) {
95 | score = BasicUtils.expScale(expConst, score)
96 | }
97 | if (score < scoreThreshold) "N" else "O"
98 | }
99 | case None => "I"
100 | }
101 | val newRec = new Array[String](items.length + 2)
102 | Array.copy(items, 0, newRec, 0, items.length)
103 | newRec(newRec.length-2) = BasicUtils.formatDouble(score, precision)
104 | newRec(newRec.length-1) = tag
105 | (keyRec, newRec)
106 | })
107 |
108 | //group by key and sort by sequence
109 | val serTaggedData = groupByKeySortBySeq(taggedData, seqFieldOrd, fieldDelimOut)
110 |
111 | if (debugOn) {
112 | val records = serTaggedData.collect
113 | records.slice(0, 50).foreach(r => println(r))
114 | }
115 |
116 | if(saveOutput) {
117 | serTaggedData.saveAsTextFile(outputPath)
118 | }
119 |
120 | }
121 | }
--------------------------------------------------------------------------------
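Note: the predictor above projects each record's quantitative fields onto the principal components saved in the state file, reconstructs the record from that projection, and uses the norm of the reconstruction error as the outlier score, optionally exp-scaled before thresholding. A minimal numpy sketch of the score, for illustration only (the exact form of the exponential scaling is an assumption):

    import numpy as np

    def pca_outlier_score(x, princ_comps, exp_const=0.0):
        # princ_comps: k x d matrix whose rows are principal component vectors
        hidden = princ_comps @ x            # project onto the components
        regen = princ_comps.T @ hidden      # reconstruct in the original space
        score = np.linalg.norm(x - regen)   # reconstruction error
        if exp_const > 0:
            score = 1.0 - np.exp(-exp_const * score)
        return score

    pc = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])        # keep the first two dimensions
    print(pca_outlier_score(np.array([2.0, 1.0, 5.0]), pc))  # error comes entirely from the third dimension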