├── spark ├── project │ ├── build.properties │ └── plugins.sbt ├── version.sbt ├── src │ └── main │ │ └── scala │ │ └── org │ │ └── beymani │ │ ├── sanity │ │ └── WordCount.scala │ │ └── spark │ │ ├── common │ │ ├── PseudoRelevanceThresholdFinder.scala │ │ ├── OutlierScoreLevelShift.scala │ │ └── OutlierCounter.scala │ │ ├── seq │ │ └── LocalNeighborhoodDetector.scala │ │ └── pc │ │ └── PrincipalComponentPredictor.scala └── build.sbt ├── manifest.mf ├── resource ├── IntroductionToBeymani.docx ├── cpsale.conf ├── vib.conf ├── mmfr.properties ├── ouli.sh ├── mhist.sh ├── mm_seqn.sh ├── avdi.sh ├── negr.sh ├── rede.sh ├── dsort.sh ├── mdist.sh ├── nede.sh ├── mm_modl.sh ├── rt_predict.properties ├── hist.json ├── bsm.json ├── model_calibration_tutorial.txt ├── ecommDataStream.json ├── build_storm.xml ├── xaction_states.rb ├── knn_udr.properties ├── spark_dependency.txt ├── beymani_spark.xml ├── epid.conf ├── vib.sh ├── cpsale.sh ├── ae_ticket.properties ├── xaction_queue.py ├── mob_loc.properties ├── ecomm_hierarchy.json ├── cyd.conf ├── alarm_threshold_tuning_tutorial.txt ├── bsm.conf ├── jar_dependency.txt ├── unsup_model_drift_detection_tutorial.txt ├── epid.sh ├── sup_model_drift_detection_tutorial.txt ├── monitoring_order_processing_system_with_isolation_forest.txt ├── ticket.conf ├── cycle_detection_tutorial.txt ├── proximity_tutorial.txt ├── autoencoder_based_cust_svc_case_anomaly_detection.txt ├── machinary_fault_detection_with_subsequence_anomaly_tutorial.txt ├── salean.sh ├── issue_service_time_anomaly_detection_tutorial.txt ├── sales_data_change_point_detection_tutorial.txt ├── cyd.sh ├── health_monitoring_data_anomaly_detection_tutorial.txt ├── and.conf ├── salean.conf ├── ticket.sh ├── ecomm.conf ├── bsm.sh ├── quarantine_violation_detection_tutorial.txt ├── cct.rb ├── cpu_usage_anomaly_det_tutorial.txt ├── rel_density_tutorial.txt ├── real_time_fraud_prediction_tutorial.txt ├── retail_sale_monitoring_with_anomaly_detection_tutorial.txt ├── and_spark.sh └── ecomm.sh ├── .gitignore ├── src └── main │ └── java │ └── org │ └── beymani │ ├── util │ ├── SequencedScore.java │ ├── SeequenceScoreAggregator.java │ ├── DataStream.java │ ├── SequenceMatcher.java │ └── DataStreamSchema.java │ ├── predictor │ ├── PredictorSpout.java │ ├── EntropyIncreaseBasedPredictor.java │ ├── EstimatedProbabilityBasedPredictor.java │ ├── ExtremeValuePredictor.java │ ├── FileSpout.java │ ├── EstimatedCumProbabilityBasedPredictor.java │ ├── ModelBasedPredictor.java │ ├── OutlierPredictor.java │ ├── MahalanobisDistancePredictor.java │ ├── EstimatedMetaProbabilityBasedPredictor.java │ └── InterPercentileDifferenceBasedPredictor.java │ └── proximity │ └── RelativeDensity.java ├── python └── app │ ├── wsbot.py │ ├── cpsale.py │ ├── mvand.py │ ├── bvib.py │ ├── olss.py │ ├── bls.py │ └── cpu_usage.py └── README.md /spark/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.3.3 2 | -------------------------------------------------------------------------------- /spark/version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "1.0-SNAPSHOT" -------------------------------------------------------------------------------- /manifest.mf: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | X-COMMENT: Main-Class will be added automatically by build 3 | 4 | 
-------------------------------------------------------------------------------- /resource/IntroductionToBeymani.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pranab/beymani/HEAD/resource/IntroductionToBeymani.docx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/**/* 2 | .settings/ 3 | .project 4 | .classpath 5 | /target 6 | spark/project/project 7 | spark/project/target 8 | spark/target 9 | spark/lib_managed 10 | spark/src_managed 11 | spark/project/boot 12 | spark/tmp 13 | project/ 14 | .history 15 | spark/dist 16 | .DS_Store 17 | .cache 18 | spark/bin 19 | .class 20 | .ivy2 21 | 22 | -------------------------------------------------------------------------------- /resource/cpsale.conf: -------------------------------------------------------------------------------- 1 | changePointDetector { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [2] 6 | seq.fieldOrd = 1 7 | window.size = 200 8 | stat.type = CVM 9 | stat.critValue = 38.863 10 | seq.chPtOutFilePath = "file:///Users/pranab/Projects/bin/beymani/other/cpsale" 11 | debug.on = true 12 | save.output = true 13 | } 14 | -------------------------------------------------------------------------------- /resource/vib.conf: -------------------------------------------------------------------------------- 1 | 2 | subSequenceDistanceDetector { 3 | field.delim.in = "," 4 | field.delim.out = "," 5 | id.fieldOrdinals = [0] 6 | attr.ordinal = 2 7 | seq.fieldOrd = 1 8 | window.size = 40 9 | score.threshold = 0.2 10 | ref.filePath = "file:///Users/pranab/Projects/bin/beymani/other/vib/vib_ref.txt" 11 | output.precision = 3 12 | debug.on = true 13 | save.output = true 14 | } -------------------------------------------------------------------------------- /resource/mmfr.properties: -------------------------------------------------------------------------------- 1 | field.delim.regex=, 2 | field.delim.out=, 3 | num.reducer=1 4 | debug.on=false 5 | 6 | #Projection 7 | pro.projection.operation=grouping 8 | pro.key.field=0 9 | pro.projection.field=2 10 | 11 | #MarkovStateTransitionModel 12 | mst.skip.field.count=1 13 | mst.model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS 14 | mst.trans.prob.scale=1 15 | -------------------------------------------------------------------------------- /resource/ouli.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.NumericSorter 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/avdi 6 | OUT_PATH=/user/pranab/cct/ouli 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mhist.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.MultiVarHistogram 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input 6 | OUT_PATH=/user/pranab/cct/mhist 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo 
"removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mm_seqn.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 2 | CLASS_NAME=org.chombo.mr.Projection 3 | 4 | echo "running mr" 5 | IN_PATH=/Users/pranab/mmfr/input 6 | OUT_PATH=/Users/pranab/mmfr/sequence 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/avdi.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.AverageDistance 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/simi 6 | OUT_PATH=/user/pranab/cct/avdi 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/negr.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.AverageDistance 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/simi 6 | OUT_PATH=/user/pranab/cct/negr 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/rede.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.RelativeDensity 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/nede 6 | OUT_PATH=/user/pranab/cct/rede 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/dsort.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.dist.DistributionSorter 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/mdist 6 | OUT_PATH=/user/pranab/cct/dsort 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mdist.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | 
CLASS_NAME=org.beymani.dist.MultiVariateDistribution 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input 6 | OUT_PATH=/user/pranab/cct/mdist 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/nede.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 2 | CLASS_NAME=org.beymani.proximity.NeighborDensity 3 | 4 | echo "running mr" 5 | IN_PATH=/user/pranab/cct/input/nede 6 | OUT_PATH=/user/pranab/cct/nede 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/mm_modl.sh: -------------------------------------------------------------------------------- 1 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar 2 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel 3 | 4 | echo "running mr" 5 | IN_PATH=/Users/pranab/mmfr/sequence 6 | OUT_PATH=/Users/pranab/mmfr/model 7 | echo "input $IN_PATH output $OUT_PATH" 8 | hadoop fs -rmr $OUT_PATH 9 | echo "removed output dir" 10 | 11 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 12 | -------------------------------------------------------------------------------- /resource/rt_predict.properties: -------------------------------------------------------------------------------- 1 | 2 | predictor.model=mm 3 | predictor.spout.threads=1 4 | predictor.bolt.threads=2 5 | num.workers=1 6 | debug=on 7 | 8 | messaging.provider=redis 9 | redis.server.host=localhost 10 | redis.server.port=6379 11 | redis.markov.model.key=xactionMarkovModel 12 | redis.input.queue=xactionQueue 13 | local.predictor=true 14 | state.seq.window.size=5 15 | state.ordinal=1 16 | detection.algorithm=missProbability 17 | metric.threshold=0.96 18 | redis.output.queue=fraudQueue 19 | -------------------------------------------------------------------------------- /resource/hist.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields" : 3 | [ 4 | { 5 | "name" : "xid", 6 | "ordinal" : 0, 7 | "id" : true, 8 | "dataType" : "string" 9 | }, 10 | { 11 | "name" : "time", 12 | "ordinal" : 1, 13 | "dataType" : "int", 14 | "bucketWidth" : 60 15 | }, 16 | { 17 | "name" : "amount", 18 | "ordinal" : 2, 19 | "dataType" : "double", 20 | "bucketWidth" : 100 21 | }, 22 | { 23 | "name" : "vendor", 24 | "ordinal" : 3, 25 | "dataType" : "categorical" 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /resource/bsm.json: -------------------------------------------------------------------------------- 1 | { 2 | "attributes" : 3 | [ 4 | { 5 | "name" : "devID", 6 | "ordinal" : 0, 7 | "dataType" : "string", 8 | "targetFieldOrdinals" : [0] 9 | }, 10 | { 11 | "name" : "timeStamp", 12 | "ordinal" : 1, 13 | "dataType" : "long", 14 | "targetFieldOrdinals" : [1] 15 | }, 16 | { 17 | "name" : "measurement", 18 | "ordinal" : 2, 19 | "dataType" : "int", 20 | "buckeWidth" : 5.0, 21 | "transformers" : ["discretizerTrans"], 22 | 
"targetFieldOrdinals" : [2] 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /spark/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1") 4 | 5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 6 | 7 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4") 8 | 9 | resolvers ++= Seq( 10 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 11 | "Akka Repository" at "https://repo.akka.io/releases/", 12 | "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools" 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /resource/model_calibration_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for finding calibration properties of a machine lrarning model. 2 | 3 | Setup 4 | ===== 5 | Make sure you have ../lib ../supv directories with all the python files in there wrt 6 | where heart_disease.py is. Alternatively you can use ../python/app directory of avenir as 7 | your working directory 8 | 9 | Generate data and train model 10 | ============================= 11 | Please refer to heart_disease_prediction_with_random_forest_tutorial.txt 12 | 13 | Global calibration 14 | ================== 15 | ./heart_disease.py calib 16 | 17 | Local caliv=bration 18 | =================== 19 | ./heart_disease.py calibLoc -------------------------------------------------------------------------------- /resource/ecommDataStream.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataStreams" : 3 | [ 4 | { 5 | "id" : "corp", 6 | "type" : "root", 7 | "parentId" : "none", 8 | "parentType" : "none", 9 | "singleton" : true 10 | }, 11 | { 12 | "id" : "sale", 13 | "type" : "sale", 14 | "parentId" : "root", 15 | "parentType" : "root", 16 | "singleton" : true 17 | }, 18 | { 19 | "id" : "*", 20 | "type" : "prodSale", 21 | "parentId" : "sale", 22 | "parentType" : "sale", 23 | "singleton" : false 24 | }, 25 | { 26 | "id" : "scAbandon", 27 | "type" : "scAbandon", 28 | "parentId" : "root", 29 | "parentType" : "root", 30 | "singleton" : true 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /resource/build_storm.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Packaging into a single uber JAR 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /resource/xaction_states.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require '../lib/util.rb' 4 | 5 | 6 | custCount = ARGV[0].to_i 7 | 8 | custIDs = [] 9 | amountDist = CategoricalField.new("L",35,"M",53,"H",12) 10 | typeDist = CategoricalField.new("N",85,"H",15) 11 | timeElapsedDist = CategoricalField.new("L",35,"N",45,"S",20) 12 | 13 | 14 | idGen = IdGenerator.new 15 | 1.upto custCount do 16 | custIDs << idGen.generate(10) 17 | end 18 | 19 | #num of transactions 20 | 1.upto 15 do 21 | #number of customers 22 | 1.upto custCount do 23 | if (rand(10) < 9) 24 | cid = custIDs[rand(custIDs.length)] 25 | xid = idGen.generate(12) 26 | puts 
"#{cid},#{xid},#{amountDist.value}#{typeDist.value}#{timeElapsedDist.value}" 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /resource/knn_udr.properties: -------------------------------------------------------------------------------- 1 | common.mode=train 2 | common.model.directory=model 3 | common.model.file=knn_udr_model 4 | common.preprocessing=scale 5 | common.scaling.method=minmax 6 | common.verbose=True 7 | common.logging.file=./log/knn.log 8 | common.logging.level=info 9 | train.data.file=chdr.txt 10 | train.data.fields=1,2,3,4,5,6,7,8 11 | train.data.feature.fields=0,1,2,3,4,5,6 12 | train.data.class.field=7 13 | train.num.neighbors=9 14 | train.neighbor.weight=_ 15 | train.neighbor.search.algo=_ 16 | train.neighbor.search.leaf.size=_ 17 | train.neighbor.dist.metric=_ 18 | train.neighbor.dist.metric.pow=_ 19 | train.success.criterion=_ 20 | train.model.save=_ 21 | train.score.method=_ 22 | predict.data.file=chdr.txt 23 | predict.data.fields=1,2,3,4,5,6,7,8 24 | predict.data.feature.fields=0,1,2,3,4,5,6 25 | predict.use.saved.model=_ 26 | 27 | 28 | -------------------------------------------------------------------------------- /resource/spark_dependency.txt: -------------------------------------------------------------------------------- 1 | Build all necessary jars 2 | ======================== 3 | in chombo 4 | mvn clean install 5 | sbt publishLocal 6 | 7 | in chombo/spark 8 | sbt clean package 9 | sbt publishLocal 10 | 11 | in hoidla 12 | mvn clean install 13 | sbt publishLocal 14 | 15 | in beymani 16 | mvn clean install 17 | sbt publishLocal 18 | 19 | in beymani/spark 20 | sbt clean package 21 | 22 | Build uber jar 23 | ============== 24 | ant -f beymani_spark.xml 25 | 26 | uber jar file name is uber-beymani-spark-1.0.jar 27 | 28 | If you are using Spark 2.0+, please add the following line to beymani_spark.xml, because 29 | type safe jar is not included in newer versions of Spark 30 | 31 | 32 | 33 | Please change the directory path, as per your environment -------------------------------------------------------------------------------- /resource/beymani_spark.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Packaging into a single uber JAR 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /resource/epid.conf: -------------------------------------------------------------------------------- 1 | outRangeBasedPredictor { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [2,3] 6 | score.threshold = 0.80 7 | seq.fieldOrd=1 8 | exp.const = 2000.0 9 | attr.weights = [0.5, 0.5] 10 | attr.weightStrategy = max 11 | range.global = false 12 | range.filePath="/Users/pranab/Projects/bin/beymani/other/epid/outr/qualist.txt" 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | inRangeBasedPredictor { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0] 21 | attr.ordinals = [2,3] 22 | score.threshold = 0.500 23 | seq.fieldOrd=1 24 | exp.const=5000.0 25 | attr.weights = [0.5, 0.5] 26 | attr.weightStrategy = max 27 | range.global=true 28 | range.globalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/inr/uniq_qualist.txt" 29 | range.LocalFilePath="/Users/pranab/Projects/bin/beymani/other/epid/qua_lo_loc.txt" 30 | debug.on = true 31 | save.output = true 32 | } 
-------------------------------------------------------------------------------- /resource/vib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar 7 | MASTER=spark://akash:7077 8 | 9 | case "$1" in 10 | 11 | "olPred") 12 | echo "running SubSequenceDistanceDetector" 13 | CLASS_NAME=org.beymani.spark.seq.SubSequenceDistanceDetector 14 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/vib/* 15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/vib 16 | rm -rf ./output/vib 17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 18 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT vib.conf 19 | rm -rf ./output/vib/_SUCCESS 20 | ls -l ./output/vib 21 | for f in ./output/vib/* 22 | do 23 | echo "number of outliers in $f" 24 | cat $f | grep ,O | wc -l 25 | done 26 | ;; 27 | 28 | *) 29 | echo "unknown operation $1" 30 | ;; 31 | 32 | esac -------------------------------------------------------------------------------- /resource/cpsale.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "cpInp") 11 | echo "args: data_file " 12 | cp $2 $PROJECT_HOME/bin/beymani/input/cpsale/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/cpsale/ 14 | ;; 15 | 16 | "cpPred") 17 | echo "running ChangePointDetector Spark job" 18 | CLASS_NAME=org.beymani.spark.misc.ChangePointDetector 19 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/cpsale/* 20 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/cpsale 21 | rm -rf ./output/cpsale 22 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 23 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT cpsale.conf 24 | wc -l ./output/cpsale/part-00000 25 | wc -l ./output/cpsale/part-00001 26 | ;; 27 | 28 | 29 | *) 30 | echo "unknown operation $1" 31 | ;; 32 | 33 | esac -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/sanity/WordCount.scala: -------------------------------------------------------------------------------- 1 | package org.beymani.sanity 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.SparkContext._ 5 | 6 | object WordCount { 7 | def main(args: Array[String]) { 8 | val master = args.length match { 9 | case x: Int if x > 0 => args(0) 10 | case _ => "local" 11 | } 12 | val sc = new SparkContext(master, "WordCount", System.getenv("SPARK_HOME")) 13 | val input = args.length match { 14 | case x: Int if x > 1 => sc.textFile(args(1)) 15 | case _ => sc.parallelize(List("pandas", "i like pandas")) 16 | } 17 | val words = input.flatMap(line => line.split(" ")) 18 | args.length match { 19 | case x: Int if x > 2 => { 20 | val counts = words.map(word => (word, 1)).reduceByKey{case (x,y) => x + y} 21 | counts.saveAsTextFile(args(2)) 22 | } 23 | case _ => { 24 | val wc = words.countByValue() 25 | println(wc.mkString(",")) 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- 
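
As background for the SubSequenceDistanceDetector job run by vib.sh above (configured in vib.conf), the idea is that each sliding window of the test series is compared with reference subsequences taken from normal data, and a window whose distance to the nearest reference subsequence exceeds score.threshold is reported as an outlier. The Python sketch below illustrates that idea with a length normalized Euclidean distance on synthetic data; it is not the Spark implementation, and the signal, window size and threshold are made up.

import math

def window_distance(w1, w2):
    # Euclidean distance between two windows, normalized by window length
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(w1, w2))) / len(w1)

def subseq_scores(series, ref_series, window_size):
    refs = [ref_series[i:i + window_size] for i in range(len(ref_series) - window_size + 1)]
    scores = []
    for i in range(len(series) - window_size + 1):
        win = series[i:i + window_size]
        # score of a window is its distance to the closest reference subsequence
        scores.append(min(window_distance(win, r) for r in refs))
    return scores

# synthetic vibration like signal with a high frequency component injected as a fault
ref = [math.sin(0.2 * i) for i in range(200)]
test = [math.sin(0.2 * i) for i in range(200)]
for i in range(120, 160):
    test[i] += 0.8 * math.sin(2.5 * i)

scores = subseq_scores(test, ref, window_size=40)
outliers = [i for i, s in enumerate(scores) if s > 0.05]
print("flagged windows:", len(outliers), "first flagged window starts at:", outliers[0])
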
/resource/ae_ticket.properties: -------------------------------------------------------------------------------- 1 | common.mode=training 2 | common.model.directory=./model/ae 3 | common.model.file=cus.mod 4 | common.preprocessing=scale 5 | common.scaling.method=zscale 6 | common.verbose=True 7 | common.device=_ 8 | train.data.file=cus_tr.txt 9 | train.data.fields=1,2,3,4,5,6,7,8 10 | train.data.feature.fields=0,1,2,3,4,5,6 11 | train.num.input=7 12 | train.num.hidden.units=6,5 13 | train.encoder.activations=relu,sigmoid 14 | train.decoder.activations=sigmoid,sigmoid 15 | train.batch.size=32 16 | train.num.iterations=200 17 | train.loss.reduction=_ 18 | train.lossFn=mse 19 | train.optimizer=_ 20 | train.opt.learning.rate=.001 21 | train.opt.weight.decay=_ 22 | train.opt.momentum=_ 23 | train.opt.eps=_ 24 | train.opt.dampening=_ 25 | train.opt.momentum.nesterov=_ 26 | train.opt.betas=_ 27 | train.opt.alpha=_ 28 | train.noise.scale=0.05 29 | train.tied.weights=True 30 | train.model.save=False 31 | train.track.error=True 32 | train.batch.intv=5 33 | train.loss.av.window=5 34 | train.loss.diff.threshold=0.001 35 | encode.use.saved.model=_ 36 | encode.data.file=cus_te.txt 37 | encode.feat.pad.size=50 -------------------------------------------------------------------------------- /spark/build.sbt: -------------------------------------------------------------------------------- 1 | name := "beymani-spark" 2 | 3 | organization := "org.beymani" 4 | 5 | version := "1.0" 6 | 7 | scalaVersion := "2.12.0" 8 | 9 | scalacOptions := Seq("-unchecked", "-deprecation") 10 | 11 | isSnapshot := true 12 | 13 | libraryDependencies ++=Seq( 14 | "org.apache.spark" %% "spark-core" % "3.0.0-preview" % "provided", 15 | "org.apache.spark" %% "spark-streaming" % "3.0.0-preview" % "provided", 16 | "org.apache.spark" %% "spark-streaming-kafka-0-10" % "3.0.0-preview", 17 | "org.apache.commons" % "commons-lang3" % "3.0", 18 | "com.fasterxml.jackson.core" % "jackson-databind" % "2.3.3", 19 | "com.fasterxml.jackson.module" % "jackson-module-scala_2.12" % "2.9.4", 20 | "org.apache.lucene" % "lucene-core" % "7.1.0", 21 | "org.apache.lucene" % "lucene-analyzers-common" % "7.1.0", 22 | "junit" % "junit" % "4.7" % "test", 23 | "org.scalatest" % "scalatest_2.10" % "2.0" % "test", 24 | "org.chombo" %% "chombo-spark" % "1.0", 25 | "mawazo" %% "chombo" % "1.0", 26 | "mawazo" %% "beymani" % "1.0", 27 | "mawazo" %% "hoidla" % "1.0", 28 | "mawazo" %% "avenir" % "1.0", 29 | "gov.nist.math" % "jama" % "1.0.3" 30 | ) 31 | -------------------------------------------------------------------------------- /resource/xaction_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import redis 5 | 6 | op = sys.argv[1] 7 | r = redis.StrictRedis(host='localhost', port=6379, db=0) 8 | 9 | if (op == "setModel"): 10 | modelFile = sys.argv[2] 11 | with open (modelFile, "r") as myfile: 12 | modelData=myfile.read() 13 | 14 | r.set('xactionMarkovModel', modelData) 15 | elif (op == "getModel"): 16 | model = r.get("xactionMarkovModel") 17 | print model 18 | elif (op == "writeQueue"): 19 | xactionFile = sys.argv[2] 20 | with open (xactionFile, "r") as myfile: 21 | for line in myfile.readlines(): 22 | #print line.rstrip('\n') 23 | r.lpush("xactionQueue", line.rstrip('\n')) 24 | elif (op == "readQueue"): 25 | while True: 26 | line = r.rpop("xactionQueue") 27 | if line is not None: 28 | print line 29 | else: 30 | break 31 | elif (op == "queueLength"): 32 | qlen = 
r.llen("xactionQueue") 33 | print qlen 34 | elif (op == "readOutQueue"): 35 | while True: 36 | out = r.rpop("fraudQueue") 37 | if out is not None: 38 | print out 39 | else: 40 | break 41 | elif (op == "outQueueLength"): 42 | qlen = r.llen("fraudQueue") 43 | print qlen 44 | -------------------------------------------------------------------------------- /resource/mob_loc.properties: -------------------------------------------------------------------------------- 1 | common.verbose=_ 2 | population.num.hours=48 3 | population.sampling.interval=5 4 | population.size=1000 5 | population.num.family=200 6 | population.family.size.mean=_ 7 | population.family.size.sd=_ 8 | population.working.family.percentage=_ 9 | population.retired.one.person.family.percentage=_ 10 | region.lat.min=37.000 11 | region.lat.max=37.500 12 | region.long.min=-122.500 13 | region.long.max=-122.000 14 | region.num.business=_ 15 | region.biz.size.mean=_ 16 | region.biz.size.size.sd=_ 17 | region.num.office"=_ 18 | region.office.size.mean=_ 19 | region.biz.size.size.sd=_ 20 | region.num.schools=_ 21 | region.num.colleges=_ 22 | region.quarantine.list.file=qualist.txt 23 | region.num.locations=2 24 | region.loc.size=0.0024 25 | region.quarantine.loc.file=qualoc.txt 26 | region.quarantine.num.violation=5 27 | region.residence.list.file=res_loc.txt 28 | region.work.list.file=work_loc.txt 29 | region.school.list.file=school_loc.txt 30 | region.medical.facility.list.file=med_loc.txt 31 | region.shopping.area.list.file=shop_loc.txt 32 | region.entertainment.area.list.file=ent_loc.txt 33 | region.large.event.area.list.file=event_loc.txt 34 | region.open.space.list.file=open_loc.txt 35 | 36 | 37 | -------------------------------------------------------------------------------- /resource/ecomm_hierarchy.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataStreams" : 3 | [ 4 | { 5 | "id" : "corp", 6 | "type" : "root", 7 | "parentId" : "none", 8 | "parentType" : "none", 9 | "singleton" : true 10 | }, 11 | { 12 | "id" : "sale", 13 | "type" : "sale", 14 | "parentId" : "root", 15 | "parentType" : "root", 16 | "singleton" : true 17 | }, 18 | { 19 | "id" : "electronics", 20 | "type" : "dept", 21 | "parentId" : "sale", 22 | "parentType" : "sale", 23 | "singleton" : false, 24 | "childrenId" : ["31W6CN4OGP","ATROK5G187","54RLEB9L5J","P3N63F2TPP","L674KMOI01","38A2F7U4XK","L0668572D0","BS6RHF2PV2","C88L3DYBB9","NX23WR8JJW"] 25 | }, 26 | { 27 | "id" : "clothing", 28 | "type" : "dept", 29 | "parentId" : "sale", 30 | "parentType" : "sale", 31 | "singleton" : false, 32 | "childrenId" : ["IYZN3F9WCX","2DPXUFR93R","7MRHFY4L70","3FHQOJ45IJ","H4T8785L41","P3RVWCZS37","GZ4819T12I","OGX2037784","9021SDZ1O6","U62K213GI2"] 33 | }, 34 | { 35 | "id" : "*", 36 | "type" : "prodSale", 37 | "parentId" : "electronics", 38 | "parentType" : "dept", 39 | "singleton" : false 40 | }, 41 | { 42 | "id" : "*", 43 | "type" : "prodSale", 44 | "parentId" : "clothing", 45 | "parentType" : "dept", 46 | "singleton" : false 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SequencedScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import org.chombo.util.Pair; 21 | 22 | /** 23 | * Outlier score for a sequence element 24 | * @author pranab 25 | * 26 | */ 27 | public class SequencedScore extends Pair { 28 | private static final long serialVersionUID = 4277362152194891790L; 29 | 30 | public SequencedScore(long seq, double score) { 31 | super(seq, score); 32 | } 33 | 34 | public long getSeq() { 35 | return left; 36 | } 37 | 38 | public double getScore() { 39 | return right; 40 | } 41 | 42 | public void setScore(double score) { 43 | right = score; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /resource/cyd.conf: -------------------------------------------------------------------------------- 1 | 2 | numericalAttrStats { 3 | field.delim.in = "," 4 | field.delim.out = "," 5 | id.fieldOrdinals = [0] 6 | attr.ordinals = [2] 7 | seasonal.analysis = false 8 | part.bySeasonCycle = false 9 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 10 | time.fieldOrdinal = 1 11 | time.inMili = false 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | temporalAggregator { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | attr.ordinals = [2] 21 | id.fieldOrdinals = [0] 22 | time.fieldOrdinal = 1 23 | time.inMili = false 24 | aggr.windowTimeUnit = "hour" 25 | aggr.windowTimeLength = 1 26 | aggr.type = "average" 27 | output.compact = true 28 | output.precision = 3 29 | debug.on = true 30 | save.output = true 31 | } 32 | 33 | autoCorrelation { 34 | field.delim.in = "," 35 | field.delim.out = "," 36 | seq.fieldOrdinal = 1 37 | id.fieldOrdinals = [0] 38 | attr.ordinals = [2] 39 | output.precision = 3 40 | coor.lags = [24, 48, 168] 41 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt" 42 | mean.fieldOrd = 4 43 | debug.on = true 44 | save.output = true 45 | } 46 | 47 | typedUniqueValueCounter { 48 | field.delim.in = "," 49 | field.delim.out = "," 50 | id.fieldOrdinals = [0, 1, 2] 51 | attr.ordinals = [5] 52 | attr.5.type = "double" 53 | seasonal.analysis = true 54 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 55 | time.fieldOrdinal = 4 56 | time.inMili = false 57 | output.precision = 3 58 | debug.on = true 59 | save.output = true 60 | } 61 | 62 | -------------------------------------------------------------------------------- /resource/alarm_threshold_tuning_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for tuning the threshold in anomaly detection system based on supervised learning 2 | using user feedback data 3 | 4 | Environment 5 | =========== 6 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 7 | environment 8 | 9 | Build 10 | ===== 11 | Follow instructions in spark_dependency.txt 12 | 13 | Python dependency 14 | ================= 15 | The shell script commands for data generation run python scripts for data generation. Before you run 16 | the data generation commands do the following 17 | 1. 
checkout project avenir 18 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file 19 | 20 | Generate outlier detected data 21 | ============================== 22 | Please follow the tutorial cpu_usage_anomaly_det_tutorial.txt to generate data with outliers detected. 23 | Consolidate Spark generated output files into 1 file 24 | 25 | Simulate user feedback 26 | ====================== 27 | ./cpu_usage.py feedback 28 | 29 | outlier_file_name = file generated in the previous step 30 | cur_threshold = threshold set outlier detection spark jobs. It's the parameter score.threshold 31 | in and.conf file 32 | new_threshold = if set higher than cur_threshold, it will simulate the case false positive 33 | i.e too many alarms 34 | 35 | Run spark job 36 | ============= 37 | ./and_spark.sh thLearn 38 | 39 | Configuration 40 | ============= 41 | It's in and.conf file. Through the parameter split.points multiple split points are provided. -------------------------------------------------------------------------------- /resource/bsm.conf: -------------------------------------------------------------------------------- 1 | 2 | #device data 3 | dataTransformer { 4 | field.delim.in = "," 5 | field.delim.out = "," 6 | schema.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm.json" 7 | debug.on = true 8 | save.output = true 9 | transformers { 10 | discretizerTrans { 11 | } 12 | } 13 | } 14 | 15 | markovStateTransitionModel { 16 | field.delim.in = "," 17 | field.delim.out = "," 18 | id.field.ordinals = [0] 19 | seq.start.ordinal = 0 20 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"] 21 | output.precision = 3 22 | data.seqLongFormat = true 23 | seq.field.ordinal = 1 24 | state.field.ordinal = 2 25 | data.mergeKeysNeeded = true 26 | data.laplaceCorrNeeded = true 27 | output.compact = false 28 | debug.on = true 29 | save.output = true 30 | } 31 | 32 | markovChainPredictor { 33 | field.delim.in = "," 34 | field.delim.out = "," 35 | predictor.strategy = "conditinalProbability" 36 | id.fieldOrdinals = [0] 37 | output.precision = 6 38 | score.threshold = 3.7 39 | attr.ordinal = 2 40 | seq.fieldOrd = 1 41 | window.size = 4 42 | state.list = ["8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29", "30","31","32","33","34","35","36","37","38","39","40","41","42"] 43 | stateTrans.filePath = "/Users/pranab/Projects/bin/beymani/meta/bsm_mod.txt" 44 | stateTrans.compact = false 45 | model.global = true 46 | ignore.missingModel = false 47 | exp.const = -1.0 48 | debug.on = true 49 | save.output = true 50 | } -------------------------------------------------------------------------------- /resource/jar_dependency.txt: -------------------------------------------------------------------------------- 1 | Dependent jars 2 | ============== 3 | beymani depends on the following jar libraries. Most of them are third party except for 4 | chombo. For these two you could either checkout the jars and place them in your 5 | local maven repo or you could build them. 
6 | 7 | jackson-core-lgpl-1.6.3.jar 8 | jackson-mapper-lgpl-1.6.3.jar 9 | chombo-1.0.jar 10 | commons-lang-3.1.jar 11 | jedis-2.2.1.jar 12 | 13 | 14 | Building dependent jars 15 | ======================= 16 | Follow these steps if you have decided to build the jars for chombo and hoidla 17 | 18 | Checkout project chombo and run 19 | mvn clean install 20 | 21 | 22 | Handling dependency 23 | =================== 24 | There are many ways to handle dependency in Hadoop 25 | 26 | 1. Use libjar command line options as below 27 | hadoop jar xyz.jar com.example.MyMapreduce -libjars path1/lib1.jar,path2/lib2.jar 28 | 29 | 2. Use maven shade plugin to package all jars into one uber jar. The following needs to 30 | be added to the build element in pom.xml 31 | 32 | ....... 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-shade-plugin 37 | 38 | 39 | package 40 | 41 | shade 42 | 43 | 44 | 45 | 46 | uber-${artifactId}-${version} 47 | 48 | 49 | 50 | ....... 51 | 52 | 53 | 3. Use ant to package all dependent jars. You could use ../resource/build_hadoop.xml as an example 54 | 55 | 4. Copy all jars to hadoop lib directory in all nodes 56 | -------------------------------------------------------------------------------- /resource/unsup_model_drift_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for unsupervised concept drift detection of deployed supervised machine learning 2 | models with nearest neighbor count algorithm. We will use ecommerce customer churn data 3 | 4 | 5 | Setup 6 | ===== 7 | Make sure you have python/lib, python/mlextra and python/supv directories of avenir project with all the 8 | python files wrt where codrift.py is as a peer directory i.e at ../lib , ../mlextra and ../supv 9 | 10 | Generate data for no drift case 11 | =============================== 12 | - generate refrence churn data 13 | ./codrift.py genrc > ch.txt 14 | where 15 | bsamp = num of samples e.g 1000 16 | noise_level = noise level in data e.g 0.05 17 | 18 | - set class label to 1 19 | ./codrift.py recl ch.txt 1 > chref.txt 20 | 21 | - generate current churn data 22 | ./codrift.py genrc chref.txt > ch.txt 23 | 24 | - set class label to 0 25 | ./codrift.py recl ch.txt 0 > chnew.txt 26 | 27 | - concatenate files 28 | cat chref.txt > chndr.txt 29 | cat chnew.txt >> chndr.txt 30 | 31 | No drift case 32 | ============= 33 | - ensure following settings in knn_udr.properties 34 | train.data.file=chndr.txt 35 | predict.data.file=chndr.txt 36 | 37 | - run 38 | ./codrift.py udrift knn_udr.properties 39 | 40 | Generate data for drift case 41 | ============================ 42 | - generate distribution shifted new data for second half 43 | ./codrift.py dish chnew.txt > chnewd.txt 44 | 45 | - concatenate files 46 | cat chref.txt > chdr.txt 47 | cat chnewd.txt >> chdr.txt 48 | 49 | Drift case 50 | ========== 51 | - ensure following settings in knn_udr.properties 52 | train.data.file=chdr.txt 53 | predict.data.file=chdr.txt 54 | 55 | - run 56 | ./codrift.py udrift knn_udr.properties 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /resource/epid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash.local:7077 7 | 8 | case "$1" in 9 | 10 | 
"cpQuaLocData") 11 | echo "args: data_file " 12 | cp $2 $PROJECT_HOME/bin/beymani/other/epid/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/other/epid/$3/ 14 | ;; 15 | 16 | 17 | "cpLocData") 18 | echo "args: test_data_file " 19 | cp $2 $PROJECT_HOME/bin/beymani/input/epid/$3/ 20 | ls -l $PROJECT_HOME/bin/beymani/input/epid/$3/ 21 | ;; 22 | 23 | "olPredOu") 24 | echo "running OutRangeBasedPredictor Spark job" 25 | CLASS_NAME=org.beymani.spark.misc.OutRangeBasedPredictor 26 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/outr/* 27 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/outr 28 | rm -rf ./output/epid/outr 29 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 30 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf 31 | echo "number of outliers" 32 | wc -l ./output/epid/outr/part-00000 33 | wc -l ./output/epid/outr/part-00001 34 | ;; 35 | 36 | "olPredIn") 37 | echo "running InRangeBasedPredictor Spark job" 38 | CLASS_NAME=org.beymani.spark.misc.InRangeBasedPredictor 39 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/epid/inr/* 40 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/epid/inr 41 | rm -rf ./output/epid/inr 42 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 43 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT epid.conf 44 | echo "number of outliers" 45 | wc -l ./output/epid/inr/part-00000 46 | wc -l ./output/epid/inr/part-00001 47 | ;; 48 | 49 | *) 50 | echo "unknown operation $1" 51 | ;; 52 | 53 | esac -------------------------------------------------------------------------------- /resource/sup_model_drift_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is forconcept drift detection of supervised machine learning models with EDDM algorithm 2 | 3 | 4 | Setup 5 | ===== 6 | Make sure you have python/lib directory of avenir project with all the python files wrt 7 | where codrift.py is as a peer directory i.e at ../lib Copy sucodr.py from beymani/python/lob 8 | directory to your lib directory 9 | 10 | Generate Data 11 | ============= 12 | - Generate refrence model prediction data 13 | ./codrift.py agen > er1.txt 14 | where 15 | nsamp = num of samples e.g. 2000 16 | er_rate = error rate e.g 0.1 17 | 18 | - Generate model prediction data with drift present 19 | ./codrift.py agen > er2.txt 20 | where 21 | trans = transition point for drift e.g 0.4 which means drift will appear after 22 | the first 40% of the data 23 | dr_er_rate = increased error rate after drift e.g 0.2 24 | 25 | Create reference statistics 26 | =========================== 27 | Make sure you have directory called model under the working directory 28 | 29 | Run 30 | ./codrift.py eddm er1.txt true 31 | where 32 | bootstrap_size = no of samples to be used boot strapping and creating referenece statistic e.g 600 33 | it will detect drift for the remaining samples. In our case it won't because er1.txt does not contain 34 | any error data with drift 35 | 36 | Detect drift 37 | ============ 38 | ./codrift.py eddm er2.txt 39 | In our case drift will be detected, because about half way through the error data, the error rate dobles 40 | to simulate drift. For real prouction data, you may or may not find drift 41 | 42 | Ensemble and hierarchy of drift detectors 43 | ========================================= 44 | There are aggregate functions in sucodr.py, that can be used to implement ensemble of detector e.g. 
45 | LFR 46 | 47 | -------------------------------------------------------------------------------- /resource/monitoring_order_processing_system_with_isolation_forest.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for monitring an order processing system with isolation forest based anomaly detection. 2 | It uses log records generated by the order processing business workflow system. 3 | 4 | Environment 5 | =========== 6 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 7 | environment by editing ecomm.sh 8 | 9 | Build 10 | ===== 11 | Follow instructions in spark_dependency.txt 12 | 13 | Python dependency 14 | ================= 15 | Before you run python scripts for data generation please do the following 16 | 1. checkout project avenir 17 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of ecomm.py file 18 | 19 | You could run ecomm.py from the python/app directory of beymani where it resides or copy it 20 | some where else 21 | 22 | Generate order processing data 23 | ============================== 24 | ./ecomm.py ordProcessRecs > orpr.txt 25 | where 26 | num_orders = num of orders e.g 200 27 | 28 | Insert outliers 29 | =============== 30 | ./ecomm.py olOrdPr orpr.txt > rorpr.txt 31 | where 32 | ol_percent = outlier percentage e.g 10 33 | 34 | Run anomaly detector Spark job 35 | ============================== 36 | Set score.threshold in ecomm.conf to some reasoable value e.g 0.5 37 | 38 | Run Spark job 39 | ./ecomm.sh orpOlPred 40 | 41 | Get upper tail statistics of outlier scores 42 | =========================================== 43 | ./olss.py sttest ./output/ecom/orp 0 hist 44 | 45 | Run anomaly detector Spark job with new threshold value 46 | ======================================================= 47 | Choose your threshold based on some confidence limit e.g 0.9 from the output of the lasr step Use that 48 | value to set score.threshold in ecomm.conf 49 | 50 | Run Spark job again 51 | ./ecomm.sh orpOlPred 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /resource/ticket.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0,1] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["hourOfDay"] 9 | time.fieldOrdinal = 2 10 | time.inMili = false 11 | min.sampleCount = 100 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | numericalAttrMedian { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0] 21 | attr.ordinals = [4] 22 | seasonal.analysis = false 23 | operation.type = "mad" 24 | hdfs.file = false 25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt" 26 | seasonal.cycleType = ["hourOfDay"] 27 | time.fieldOrdinal = 1 28 | time.inMili = false 29 | output.precision = 6 30 | min.samplecount = 100 31 | debug.on = true 32 | save.output = true 33 | } 34 | 35 | statsBasedOutlierPredictor { 36 | field.delim.in = "," 37 | field.delim.out = "," 38 | predictor.strategy = "robustZscore" 39 | id.fieldOrdinals = [0] 40 | attr.ordinals = [4] 41 | score.threshold = 0.7 42 | exp.const = -1.0 43 | outlier.polarity = "all" 44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/stats.txt" 45 | mean.fldOrd = 4 46 | hdfs.file = false 47 
| attr.weights = [1] 48 | attr.weightStrategy = "weightedAverage" 49 | robustZscore { 50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/med.txt" 51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ticket/mad.txt" 52 | } 53 | seasonal.analysis = false 54 | seasonal.cycleType = ["hourOfDay"] 55 | time.fieldOrdinal = 1 56 | time.inMili = false 57 | output.precision = 3 58 | output.outliers = false 59 | rem.outliers = false 60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 61 | debug.on = true 62 | save.output = true 63 | } 64 | -------------------------------------------------------------------------------- /resource/cycle_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for cycle detection in time series data using auto correlation. A set of 2 | candidate lags are provided. The lag with the highest correlation corresponds to a cycle. 3 | 4 | 5 | Environment 6 | =========== 7 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 8 | environment 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of cpu_usage.py file 20 | 21 | Create input data 22 | ================= 23 | ./and_spark.sh crInput 24 | 25 | where 26 | num_of_days = number of days e.g 15 27 | reading_intervaL = reading interval in sec e.g. 300 28 | num_servers = number of servers e.g. 4 29 | output_file = output file, we will use c.txt from now on 30 | 31 | Copy output to input path for NumericalAttrStats and TemporalAggregator spark jobs 32 | 33 | Run Spark job for stats 34 | ======================= 35 | ./cyd.sh numStat 36 | 37 | Copy and consolidate stats file 38 | =============================== 39 | ./and_spark.sh crStatsFile 40 | 41 | Aggregate to hourly 42 | =================== 43 | If the sampling interval is in minutes or sec aggregate to hourly average 44 | ./cyd.sh tempAggr 45 | 46 | Copy and consolidate aggregate output 47 | ===================================== 48 | ./cyd.sh crAucInput 49 | 50 | Run Spark job for auto correlation 51 | ================================== 52 | ./cyd.sh autoCor 53 | 54 | Configuration 55 | ============= 56 | Configuration is in cyd.conf. Make changes as necessary 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /resource/proximity_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial provides details of finding outliers based on average distance to neighbors. 2 | It uses two MR jobs, SameTypeSimilarity and AverageDistance. If you want to use credit card 3 | transactions as input, you could use cct.rb to generate data. Make sure that utol.rb is in the path 4 | ../lib. util.rb can be checked out from my project visitante. It's under script/ruby/lib directory 5 | in that project. 
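
Before going through the scripts below, the idea behind AverageDistance can be summarized in a few lines of Python: for every record compute the average distance to its k nearest neighbors, and treat records with unusually large averages as outlier candidates. This is only a sketch of the concept on made up data, not the map reduce implementation used in this tutorial.

import math
import random

def avg_knn_distance(records, k):
    scores = []
    for i, r in enumerate(records):
        dists = sorted(math.dist(r, o) for j, o in enumerate(records) if j != i)
        # average distance to the k nearest neighbors
        scores.append(sum(dists[:k]) / k)
    return scores

random.seed(1)
# a dense cluster of normal records plus one record far away from everything else
data = [(random.gauss(100, 10), random.gauss(50, 5)) for _ in range(200)]
data.append((400.0, 400.0))

scores = avg_knn_distance(data, k=10)
worst = max(range(len(data)), key=lambda i: scores[i])
print("most outlying record:", worst, "average distance to neighbors:", round(scores[worst], 1))
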
6 | 7 | Transaction Simarity 8 | ==================== 9 | Herte is the script for SameTypeSimilarity 10 | 11 | JAR_NAME=/home/pranab/Projects/sifarish/target/sifarish-1.0.jar 12 | CLASS_NAME=org.sifarish.feature.SameTypeSimilarity 13 | 14 | echo "running mr" 15 | IN_PATH=/user/pranab/cct/input 16 | OUT_PATH=/user/pranab/cct/simi 17 | echo "input $IN_PATH output $OUT_PATH" 18 | hadoop fs -rmr $OUT_PATH 19 | echo "removed output dir" 20 | 21 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 22 | 23 | Average Ditsance to Neighbors 24 | ============================= 25 | Here is a sample script for AverageDistance 26 | 27 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 28 | CLASS_NAME=org.beymani.proximity.AverageDistance 29 | 30 | echo "running mr" 31 | IN_PATH=/user/pranab/cct/simi 32 | OUT_PATH=/user/pranab/cct/avdi 33 | echo "input $IN_PATH output $OUT_PATH" 34 | hadoop fs -rmr $OUT_PATH 35 | echo "removed output dir" 36 | 37 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 38 | 39 | Configuration 40 | ============= 41 | Here is a sample cct.properties 42 | 43 | field.delim.regex=, 44 | field.delim=, 45 | num.reducer=1 46 | sts.bucket.count=1000 47 | sts.same.schema.file.path=/pranab/meta/prod/prod.json 48 | avd.top.match.count=10 49 | avd.top.match.average=true 50 | avd.top.match.density=false 51 | avd.top.match.grouping=false 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /resource/autoencoder_based_cust_svc_case_anomaly_detection.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anaomaly detection for service time for an issue processing system data using 2 | auto encoder. 3 | 4 | 5 | Environment 6 | =========== 7 | Path etc shown here corresposnds to my environment. Please Change them as needed for your 8 | environment 9 | 10 | 11 | Python dependency 12 | ================= 13 | The shell script commands for data generation run python scripts for data generation. Before you run 14 | the data generation commands do the following 15 | 1. checkout project avenir 16 | 2. copy the directories avenir/python/lib avenir/python/mlextra and avenir/python/unsup directory to ../lib 17 | ../mlextra and ../unsup with respect to your location of cpu_usage.py file 18 | 19 | 20 | Create normal data for modeling 21 | =============================== 22 | ./ticket.py genx > cus_tr.txt 23 | 24 | where 25 | num_issues = number of issues e.g 2000 26 | 27 | 28 | Create test data 29 | ================ 30 | ./ticket.py genx > cus.txt 31 | where 32 | num_issues = number of issues e.g 200 33 | 34 | insert outliers 35 | /ticket.py iolx cus.txt > cus_te.txt 36 | 37 | where 38 | > v.txt 24 | 25 | where 26 | num_secs = num of secs in past for which vibration data data is generated e.g 7 27 | 28 | -Split into reference and prediction data 29 | split -l10000 v.txt 30 | mv xaa vib_ref.txt 31 | 32 | -Insert outliers in prediction or test data data 33 | ./bvib.py iol xab > vib_pred.txt 34 | failure_onset_time = time from beginning of test data where outlier in inserted. 
Outlier is 35 | in the form of 2 high frequecy componenets 36 | 37 | -You could plot the data around where outliers were introduced as follows 38 | ./bvib.py iplot vib_pred.txt K87JG9F6 900 1100 39 | 40 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data 41 | 42 | -Copy reference and prediction data 43 | cp vib_ref.txt ./other/vib/ 44 | cp vib_pred.txt ./input/vib/ 45 | 46 | 47 | Run Spark Job 48 | ============= 49 | Run 50 | ./vib.sh olPred 51 | 52 | Plot outlier data 53 | ================= 54 | ./bvib.py oplot ./output/vib/part-00000 K87JG9F6 900 1100 55 | 56 | K87JG9F6 is the ID of the machine that is faulty and has outliers in the vibration data 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SeequenceScoreAggregator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * Manages outlier scores for data points in a sequence. 
A data point may belong to 25 | * multiple sequences and hence may have have multiple outlier scores 26 | * @author pranab 27 | * 28 | */ 29 | public class SeequenceScoreAggregator implements java.io.Serializable { 30 | private static final long serialVersionUID = 2181114339589177954L; 31 | private List scores = new ArrayList(); 32 | private int windowSize; 33 | 34 | 35 | /** 36 | * @param windowSize 37 | */ 38 | public SeequenceScoreAggregator(int windowSize) { 39 | super(); 40 | this.windowSize = windowSize; 41 | } 42 | 43 | 44 | /** 45 | * @param seq 46 | * @param score 47 | */ 48 | public void add(double score ) { 49 | scores.add(score); 50 | if (scores.size() > windowSize) { 51 | //set score to max of current and new score 52 | for (int i = scores.size() - windowSize; i < scores.size(); ++i) { 53 | double thisSeqScore = scores.get(i); 54 | if (thisSeqScore < score) { 55 | scores.set(i, score); 56 | } 57 | } 58 | } 59 | } 60 | 61 | /** 62 | * @return 63 | */ 64 | public List getScores() { 65 | return scores; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /resource/salean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | *) 11 | echo "unknown operation $1" 12 | ;; 13 | 14 | 15 | "numStat") 16 | echo "running NumericalAttrStats Spark job" 17 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 18 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 19 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/stat 20 | rm -rf ./output/stat 21 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 22 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf 23 | ;; 24 | 25 | "numMstat") 26 | echo "running NumericalAttrMedian Spark job" 27 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 28 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 29 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/mstat 30 | rm -rf ./output/san/mstat 31 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 32 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT salean.conf 33 | ;; 34 | 35 | "cpMed") 36 | echo "copying median files" 37 | MED_FILES=$PROJECT_HOME/bin/beymani/output/san/mstat/* 38 | META_DIR=$PROJECT_HOME/bin/beymani/meta/san 39 | cp /dev/null $META_DIR/$2 40 | for f in $MED_FILES 41 | do 42 | echo "Copying file $f ..." 
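# append this Spark output part file to the consolidated median file in the meta/san directory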
43 | cat $f >> $META_DIR/$2 44 | done 45 | ls -l $META_DIR 46 | ;; 47 | 48 | "olPred") 49 | echo "running StatsBasedOutlierPredictor Spark job" 50 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 51 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/san/* 52 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/san/olp 53 | rm -rf ./output/san/olp 54 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 55 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT salean.conf 56 | echo "number of outliers" 57 | wc -l ./output/olp/part-00000 58 | wc -l ./output/olp/part-00001 59 | ;; 60 | 61 | esac -------------------------------------------------------------------------------- /resource/issue_service_time_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection of service time in issue processing system data using 2 | statistical modeling. To be more specific we will be using a z score based technique 3 | 4 | 5 | Environment 6 | =========== 7 | Paths etc shown here correspond to my environment. Please change them as needed for your 8 | environment 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 20 | 21 | 22 | Create normal data for modeling 23 | =============================== 24 | ./ticket.py gen <num_issues> > tick_tr.txt 25 | 26 | where 27 | num_issues = number of issues e.g 2000 28 | 29 | Copy modeling data 30 | ./ticket.sh loadInp tick_tr.txt train 31 | 32 | Create test data 33 | ================ 34 | ./ticket.py gen <num_issues> > tick.txt 35 | where 36 | num_issues = number of issues e.g 200 37 | 38 | insert outliers 39 | ./ticket.py iol tick.txt > tick_pred.txt 40 | 41 | where 42 | > cps.txt 22 | 23 | num_days = num of days in past for which sales data is generated 24 | 25 | Generate distribution for CVM two sample statistic 26 | =================================================== 27 | We use Monte Carlo simulation to generate the distribution. When run, it will output the 28 | upper tail statistic to the console. Save the output somewhere. You will need it to configure the Spark job 29 | 30 | Checkout the project avenir. In the python/app directory run the following 31 | 32 | ./tsstat.py cvm <num_iter> <num_samp> 33 | num_iter = num of iterations for the simulator e.g 2000 34 | num_samp = num of samples for generated samples, which should be half the window size (the parameter 35 | window.size in cpsale.conf). I have set this parameter to 200. So num_samp should be 100 36 | 37 | You could skip this step if you use the value already set for the parameter stat.critValue 38 | 39 | Copy input to Spark directory 40 | ============================= 41 | ./cpsale.sh cpInp cps.txt 42 | 43 | Run Spark Job 44 | ============= 45 | Choose an upper critical value for a confidence interval anywhere between .95 and .99 from the 46 | output of the MC simulator we ran earlier.
Set the parameter stat.critValue in cpsale.conf 47 | 48 | Run 49 | ./cpsale.sh cpPred 50 | 51 | Plot sales data and change points 52 | ================================= 53 | ./cpsale.py plot cps.txt DK75HUI45X ./output/cpsale/part-00000 54 | 55 | DK75HUI45X is the ID of the product that change point in sales data 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/PredictorSpout.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | 19 | package org.beymani.predictor; 20 | 21 | import java.util.Map; 22 | 23 | import org.chombo.storm.MessageQueue; 24 | 25 | import backtype.storm.spout.SpoutOutputCollector; 26 | import backtype.storm.task.TopologyContext; 27 | import backtype.storm.topology.OutputFieldsDeclarer; 28 | import backtype.storm.topology.base.BaseRichSpout; 29 | import backtype.storm.tuple.Fields; 30 | import backtype.storm.tuple.Values; 31 | 32 | /** 33 | * @author pranab 34 | * 35 | */ 36 | public class PredictorSpout extends BaseRichSpout { 37 | private SpoutOutputCollector collector; 38 | private Map conf; 39 | private String messageQueue; 40 | private MessageQueue msgQueue; 41 | private static final String NIL = "nil"; 42 | 43 | @Override 44 | public void open(Map conf, TopologyContext context, 45 | SpoutOutputCollector collector) { 46 | this.collector = collector; 47 | this.conf = conf; 48 | messageQueue = conf.get("redis.input.queue").toString(); 49 | msgQueue = MessageQueue.createMessageQueue(conf, messageQueue); 50 | } 51 | 52 | @Override 53 | public void nextTuple() { 54 | String message = msgQueue.receive(); 55 | if(null != message && !message.equals(NIL)) { 56 | int pos = message.indexOf(","); 57 | String entityID = message.substring(0, pos); 58 | String recordData = message.substring(pos+1); 59 | collector.emit(new Values(entityID, recordData)); 60 | } 61 | 62 | } 63 | 64 | @Override 65 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 66 | declarer.declare(new Fields("entityID", "recordData")); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/DataStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.io.Serializable; 21 | import java.util.List; 22 | 23 | import org.codehaus.jackson.annotate.JsonIgnoreProperties; 24 | 25 | /** 26 | * @author pranab 27 | * 28 | */ 29 | @JsonIgnoreProperties(ignoreUnknown = true) 30 | public class DataStream implements Serializable{ 31 | private String id; 32 | private String type; 33 | private String parentId; 34 | private String parentType; 35 | private List childrenId; 36 | private boolean singleton; 37 | 38 | /** 39 | * 40 | */ 41 | public DataStream() { 42 | } 43 | 44 | /** 45 | * @return 46 | */ 47 | public String getId() { 48 | return id; 49 | } 50 | 51 | public void setId(String id) { 52 | this.id = id; 53 | } 54 | 55 | public String getType() { 56 | return type; 57 | } 58 | 59 | public void setType(String type) { 60 | this.type = type; 61 | } 62 | 63 | public String getParentId() { 64 | return parentId; 65 | } 66 | 67 | public void setParentId(String parentId) { 68 | this.parentId = parentId; 69 | } 70 | 71 | public String getParentType() { 72 | return parentType; 73 | } 74 | 75 | public void setParentType(String parentType) { 76 | this.parentType = parentType; 77 | } 78 | 79 | public List getChildrenId() { 80 | return childrenId; 81 | } 82 | 83 | public void setChildrenId(List childrenId) { 84 | this.childrenId = childrenId; 85 | } 86 | 87 | public boolean isSingleton() { 88 | return singleton; 89 | } 90 | 91 | public void setSingleton(boolean singleton) { 92 | this.singleton = singleton; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /resource/cyd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "numStat") 11 | echo "running NumericalAttrStats Spark job" 12 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 13 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 14 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/mea 15 | rm -rf ./output/mea 16 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 17 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 18 | ;; 19 | 20 | "crStatsFile") 21 | echo "copying and consolidating stats file" 22 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00000 > $PROJECT_HOME/bin/beymani/other/auc/stats.txt 23 | cat $PROJECT_HOME/bin/beymani/output/mea/part-00001 >> $PROJECT_HOME/bin/beymani/other/auc/stats.txt 24 | ls -l $PROJECT_HOME/bin/beymani/other/auc 25 | ;; 26 | 27 | "tempAggr") 28 | echo "running TemporalAggregator Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg 32 | rm -rf ./output/teg 33 | $SPARK_HOME/bin/spark-submit 
--class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 35 | ;; 36 | 37 | "crAucInput") 38 | echo "copying and consolidating temporal aggregation output file" 39 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00000 > $PROJECT_HOME/bin/beymani/input/auc/cusage.txt 40 | cat $PROJECT_HOME/bin/beymani/output/teg/part-00001 >> $PROJECT_HOME/bin/beymani/input/auc/cusage.txt 41 | ls -l $PROJECT_HOME/bin/beymani/input/auc 42 | ;; 43 | 44 | "autoCor") 45 | echo "running AutoCorrelation Spark job" 46 | CLASS_NAME=org.chombo.spark.explore.AutoCorrelation 47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/auc/cusage.txt 48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/auc 49 | rm -rf ./output/auc 50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT cyd.conf 52 | ;; 53 | 54 | *) 55 | echo "unknown operation $1" 56 | ;; 57 | 58 | esac -------------------------------------------------------------------------------- /resource/health_monitoring_data_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection in health monitoring data. Sequence anomaly is detected 2 | with a Markov chain model. 3 | 4 | Environment 5 | =========== 6 | Paths etc shown here correspond to my environment. Please change them as needed for your 7 | environment. The script bsm.sh is for running spark jobs and various other tasks. The configuration 8 | is in bsm.conf 9 | 10 | Build 11 | ===== 12 | Follow instructions in spark_dependency.txt 13 | 14 | Python dependency 15 | ================= 16 | The shell script commands for data generation run python scripts for data generation. Before you run 17 | the data generation commands do the following 18 | 1. checkout project avenir 19 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 20 | 21 | Create device reading mean and std dev 22 | ====================================== 23 | ./bls.py stat <num_dev> > dstat.txt 24 | 25 | num_dev = number of devices e.g 200 26 | 27 | Create training data 28 | ==================== 29 | ./bls.py gen dstat.txt normal <num_days> > <train_data_file> 30 | where 31 | num_days = num of days for which data should be generated (e.g 300) 32 | train_data_file = training data file 33 | 34 | Copy to the spark input directory.
35 | cp <train_data_file> ./input/bsm/train 36 | 37 | Copy meta data file 38 | ==================== 39 | cp bsm.json ./meta 40 | 41 | Discretize training data 42 | ======================== 43 | Run discretization spark job 44 | ./bsm.sh transformTrain 45 | 46 | Discretization step is set to 5 in bsm.conf 47 | 48 | Build model 49 | =========== 50 | Run Spark job 51 | ./bsm.sh stateTrans 52 | 53 | Consolidate model files 54 | ======================= 55 | Copy all Spark generated files into one 56 | ./bsm.sh cpModel 57 | 58 | Create test data 59 | ================ 60 | Create test data with outliers 61 | ./bls.py gen dstat.txt anomaly <num_days> > <test_data_file> 62 | num_days = num of days for which data should be generated (e.g 30) 63 | test_data_file = test data file name 64 | 65 | Copy file 66 | cp <test_data_file> ./input/bsm/pred 67 | 68 | Discretize test data 69 | ==================== 70 | Run discretization spark job 71 | ./bsm.sh transformPred 72 | 73 | Anomaly prediction Spark job 74 | ============================ 75 | ./bsm.sh olPredict 76 | 77 | -------------------------------------------------------------------------------- /python/app/wsbot.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License.
17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import statistics 23 | import matplotlib.pyplot as plt 24 | sys.path.append(os.path.abspath("../lib")) 25 | sys.path.append(os.path.abspath("../mlextra")) 26 | from util import * 27 | from sampler import * 28 | 29 | """ 30 | data generation for web session 31 | """ 32 | if __name__ == "__main__": 33 | op = sys.argv[1] 34 | if op == "gen": 35 | numSamp = int(sys.argv[2]) 36 | if len(sys.argv) == 4: 37 | percenNormal = int(sys.argv[3]) 38 | else: 39 | percenNormal = -1 40 | 41 | hrOfDay = [NormalSampler(14,3), UniformNumericSampler(0,23)] 42 | numPage = [NormalSampler(12,2.5), NormalSampler(50,5)] 43 | pageDurAv = [NormalSampler(60, 15), NormalSampler(1,.1)] 44 | prRevFrac = [NormalSampler(.5,.1), NormalSampler(.9,.05)] 45 | shopCart = [BernoulliTrialSampler(.6), BernoulliTrialSampler(.2)] 46 | checkout = [BernoulliTrialSampler(.4), BernoulliTrialSampler(0)] 47 | logOut = [BernoulliTrialSampler(.8), BernoulliTrialSampler(.95)] 48 | 49 | idLists = [genIdList(100, 12), genIdList(80, 12)] 50 | 51 | for _ in range(numSamp): 52 | if percenNormal > 0: 53 | if isEventSampled(percenNormal): 54 | di = 0 55 | else: 56 | di = 1 57 | else: 58 | di = 0 59 | uid = selectRandomFromList(idLists[di]) 60 | hd = int(hrOfDay[di].sample()) 61 | nup = int(numPage[di].sample()) 62 | pdu = pageDurAv[di].sample() 63 | prev = prRevFrac[di].sample() 64 | sc = toIntFromBoolean(shopCart[di].sample()) 65 | co = toIntFromBoolean(checkout[di].sample()) 66 | if di == 1: 67 | co = 0 68 | lo = toIntFromBoolean(logOut[di].sample()) 69 | 70 | print("{},{},{},{:.3f},{:.3f},{},{},{}".format(uid,hd,nup,pdu,prev,sc,co,lo)) 71 | 72 | -------------------------------------------------------------------------------- /resource/and.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 9 | time.fieldOrdinal = 1 10 | time.inMili = false 11 | min.sampleCount = 10 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | 18 | statsBasedOutlierPredictor { 19 | field.delim.in = "," 20 | field.delim.out = "," 21 | predictor.strategy = "zscore" 22 | id.fieldOrdinals = [0] 23 | attr.ordinals = [3] 24 | score.threshold = 3.30 25 | score.thresholdNorm = 0.90 26 | exp.const = -1.0 27 | outlier.polarity = "high" 28 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt" 29 | mean.fldOrd = 4 30 | hdfs.file = false 31 | attr.weights = [1.0] 32 | attr.weightStrategy = "weightedAverage" 33 | zscore { 34 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/olp/stats.txt" 35 | } 36 | seasonal.analysis = true 37 | part.bySeasonCycle = true 38 | seasonal.cycleType = ["weekDayOrWeekendOfWeek"] 39 | time.fieldOrdinal = 1 40 | time.inMili = false 41 | output.precision = 3 42 | output.outliers = false 43 | rem.outliers = false 44 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 45 | debug.on = true 46 | save.output = true 47 | } 48 | 49 | thresholdLearner { 50 | field.delim.in = "," 51 | field.delim.out = "," 52 | score.fldOrd = 4 53 | cls.fldOrd = 7 54 | split.points = [0.925, 0.930, 0.935, 0.940, 0.945, 0.950, 0.955, 0.960, 0.965, 0.970, 0.975] 55 | pos.clsLabel = "T" 56 | splitting.algo = "entropy" 57 | debug.on = true 
58 | save.output = true 59 | } 60 | 61 | temporalAggregator { 62 | field.delim.in = "," 63 | field.delim.out = "," 64 | attr.ordinals = [2] 65 | id.fieldOrdinals = [0] 66 | time.fieldOrdinal = 1 67 | time.inMili = false 68 | aggr.windowTimeUnit = "hour" 69 | aggr.windowTimeLength = 1 70 | aggr.type = "average" 71 | output.compact = true 72 | output.precision = 3 73 | debug.on = true 74 | save.output = true 75 | } 76 | 77 | autoCorrelation { 78 | field.delim.in = "," 79 | field.delim.out = "," 80 | attr.ordinals = [2] 81 | id.fieldOrdinals = [0] 82 | seq.fieldOrdinal = 1 83 | output.precision = 3 84 | coor.lags = [24, 168] 85 | stats.file.path = "/Users/pranab/Projects/bin/beymani/other/auc/stats.txt" 86 | mean.fieldOrd = 5 87 | debug.on = true 88 | save.output = true 89 | } 90 | 91 | 92 | -------------------------------------------------------------------------------- /resource/salean.conf: -------------------------------------------------------------------------------- 1 | timeIntervalGenerator { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0] 5 | time.fieldOrdinal = 1 6 | time.keepField = true 7 | debug.on = true 8 | save.output = true 9 | } 10 | 11 | numericalAttrStats { 12 | field.delim.in = "," 13 | field.delim.out = "," 14 | id.fieldOrdinals = [0] 15 | attr.ordinals = [2,3] 16 | seasonal.analysis = true 17 | part.bySeasonCycle = true 18 | seasonal.cycleType = ["nightDayHourOfDay"] 19 | time.fieldOrdinal = 1 20 | time.inMili = false 21 | min.sampleCount = 200 22 | output.precision = 3 23 | debug.on = true 24 | save.output = true 25 | } 26 | 27 | numericalAttrMedian { 28 | field.delim.in = "," 29 | field.delim.out = "," 30 | id.fieldOrdinals = [0] 31 | attr.ordinals = [2,3] 32 | seasonal.analysis = true 33 | operation.type = "med" 34 | med.file.path = "" 35 | hdfs.file = false 36 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt" 37 | seasonal.cycleType = ["nightDayHourOfDay"] 38 | time.fieldOrdinal = 1 39 | time.inMili = false 40 | output.precision = 6 41 | min.samplecount = 200 42 | debug.on = true 43 | save.output = true 44 | } 45 | 46 | filter { 47 | field.delim.in = "," 48 | field.delim.out = "," 49 | id.fieldOrdinals = [0] 50 | selection.filter = "" 51 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt" 52 | schema.file.path = "/Users/pranab/Projects/bin/beymani/meta/sales.conf" 53 | debug.on = true 54 | save.output = true 55 | } 56 | 57 | statsBasedOutlierPredictor { 58 | field.delim.in = "," 59 | field.delim.out = "," 60 | predictor.strategy = "robustZscore" 61 | id.fieldOrdinals = [0] 62 | attr.ordinals = [2,3] 63 | score.threshold = 0.95 64 | score.thresholdNorm = 0.90 65 | outlier.polarity = "all" 66 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/stats.txt" 67 | mean.fldOrd = 4 68 | hdfs.file = false 69 | attr.weights = [0.4, 0.6] 70 | attr.weightStrategy = "weightedAverage" 71 | robustZscore { 72 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/med.txt" 73 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/san/mad.txt" 74 | } 75 | seasonal.analysis = true 76 | seasonal.cycleType = ["nightDayHourOfDay"] 77 | time.fieldOrdinal = 1 78 | time.inMili = false 79 | output.precision = 3 80 | output.outliers = false 81 | rem.outliers = false 82 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 83 | debug.on = true 84 | save.output = true 85 | } 86 | 
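A note on the robustZscore strategy configured in salean.conf above: each attribute value is scored by its distance from the per key (and per seasonal cycle) median, measured in MAD units, using the med.txt and mad.txt files the config points to. The Python sketch below only illustrates that calculation; the 1.4826 consistency constant, the example numbers and the function name are my assumptions and not taken from the Spark implementation, which additionally normalizes the score before comparing it with score.threshold.

# minimal robust z-score sketch (assumed formula, for illustration only)
def robust_zscore(value, median, mad, eps=1e-9):
    # 1.4826 * MAD approximates the standard deviation for normally distributed data
    return abs(value - median) / (1.4826 * mad + eps)

# hypothetical seasonal median and MAD for one (id, hour of day) key
print(round(robust_zscore(152.0, 110.0, 8.0), 3))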
-------------------------------------------------------------------------------- /resource/ticket.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "loadInp") 11 | rm $PROJECT_HOME/bin/beymani/input/ticket/$3/* 12 | cp $2 $PROJECT_HOME/bin/beymani/input/ticket/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/ticket/$3/ 14 | ;; 15 | 16 | 17 | "numStat") 18 | echo "running NumericalAttrStats Spark job" 19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/* 21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/stat 22 | rm -rf ./output/ticket/stat 23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf 25 | ;; 26 | 27 | "numMstat") 28 | echo "running NumericalAttrMedian Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/train/* 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/mstat 32 | rm -rf ./output/ticket/mstat 33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ticket.conf 35 | rm ./output/ticket/mstat/_SUCCESS 36 | ls -l ./output/ticket/mstat 37 | ;; 38 | 39 | "bkMod") 40 | echo "backing up model files" 41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ticket/mstat/* 42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ticket 43 | META_FILE=$META_DIR/$2 44 | echo "copying to $META_FILE" 45 | cp /dev/null $META_FILE 46 | for f in $MED_FILES 47 | do 48 | echo "Copying file $f ..." 
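# append this Spark output part file to the consolidated model file named by the second argument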
49 | cat $f >> $META_FILE 50 | done 51 | ls -l $META_FILE 52 | ;; 53 | 54 | "olPred") 55 | echo "running StatsBasedOutlierPredictor Spark job" 56 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 57 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ticket/pred/* 58 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ticket/olp 59 | rm -rf ./output/ticket/olp 60 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 61 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ticket.conf 62 | rm ./output/ticket/olp/_SUCCESS 63 | ls -l ./output/ticket/olp 64 | cat ./output/ecom/ticket/part-00000 | grep ,O 65 | ;; 66 | 67 | *) 68 | echo "unknown operation $1" 69 | ;; 70 | 71 | esac 72 | -------------------------------------------------------------------------------- /resource/ecomm.conf: -------------------------------------------------------------------------------- 1 | numericalAttrStats { 2 | field.delim.in = "," 3 | field.delim.out = "," 4 | id.fieldOrdinals = [0,1] 5 | attr.ordinals = [3] 6 | seasonal.analysis = true 7 | part.bySeasonCycle = true 8 | seasonal.cycleType = ["hourOfDay"] 9 | time.fieldOrdinal = 2 10 | time.inMili = false 11 | min.sampleCount = 100 12 | output.precision = 3 13 | debug.on = true 14 | save.output = true 15 | } 16 | 17 | numericalAttrMedian { 18 | field.delim.in = "," 19 | field.delim.out = "," 20 | id.fieldOrdinals = [0,1] 21 | attr.ordinals = [3] 22 | seasonal.analysis = false 23 | operation.type = "mad" 24 | hdfs.file = false 25 | med.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt" 26 | seasonal.cycleType = ["hourOfDay"] 27 | time.fieldOrdinal = 2 28 | time.inMili = false 29 | output.precision = 6 30 | min.samplecount = 100 31 | debug.on = true 32 | save.output = true 33 | } 34 | 35 | statsBasedOutlierPredictor { 36 | field.delim.in = "," 37 | field.delim.out = "," 38 | predictor.strategy = "robustZscore" 39 | id.fieldOrdinals = [0,1] 40 | attr.ordinals = [3] 41 | score.threshold = 0.7 42 | exp.const = 1.5 43 | outlier.polarity = "all" 44 | stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/stats.txt" 45 | mean.fldOrd = 4 46 | hdfs.file = false 47 | attr.weights = [1] 48 | attr.weightStrategy = "weightedAverage" 49 | robustZscore { 50 | med.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/med.txt" 51 | mad.stats.file.path = "/Users/pranab/Projects/bin/beymani/meta/ecom/mad.txt" 52 | } 53 | seasonal.analysis = false 54 | seasonal.cycleType = ["hourOfDay"] 55 | time.fieldOrdinal = 2 56 | time.inMili = false 57 | output.precision = 3 58 | output.outliers = false 59 | rem.outliers = false 60 | clean.dataDirPath = "file:///Users/pranab/Projects/bin/beymani/other/olp/clean" 61 | debug.on = true 62 | save.output = true 63 | } 64 | 65 | outlierAggregator { 66 | field.delim.in = "," 67 | field.delim.out = "," 68 | type.field.ordinal = 0 69 | id.field.ordinal = 1 70 | seq.field.ordinal = 2 71 | quant.field.ordinal = 3 72 | stream.schmaFilePath = "/Users/pranab/Projects/bin/beymani/meta/ecom/ecommDataStream.json" 73 | output.precision = 3 74 | debug.on = true 75 | save.output = true 76 | } 77 | 78 | isolationForestModel { 79 | field.delim.in = "," 80 | field.delim.out = "," 81 | attr.ordinals = [1,3,4,5,7] 82 | score.threshold = .450 83 | num.tree = 64 84 | subsample.size = 256 85 | max.depth = 10 86 | rec.count = 1788 87 | output.precision = 3 88 | debug.on = true 89 | save.output = true 90 | } 
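The isolationForestModel block above (64 trees, subsample size 256, score threshold 0.450, 1788 records) can be approximated outside Spark for quick sanity checks with scikit-learn. The sketch below is an assumed equivalent and not the code the Spark job runs; the random matrix simply stands in for the five attribute columns listed in attr.ordinals.

# illustrative isolation forest with settings mirroring isolationForestModel (assumption)
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.rand(1788, 5)                       # stand-in for the real attribute columns
forest = IsolationForest(n_estimators=64, max_samples=256, random_state=42).fit(X)
scores = -forest.score_samples(X)                 # anomaly score in (0, 1], higher means more anomalous
outliers = scores > 0.45                          # analogous to score.threshold = .450
print(outliers.sum(), "records flagged")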
-------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EntropyIncreaseBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.util.Map; 21 | 22 | /** 23 | * Predict outlier based on increase of entropy resulting from including outlier point 24 | * @author pranab 25 | * 26 | */ 27 | public class EntropyIncreaseBasedPredictor extends DistributionBasedPredictor { 28 | private double entropy; 29 | private double baseConvConst = Math.log(2); 30 | private String subFieldDelim = ":"; 31 | 32 | public EntropyIncreaseBasedPredictor(Map conf) { 33 | super(conf); 34 | 35 | //entropy 36 | entropy = 0; 37 | for (String bucketKey : distrModel.keySet()) { 38 | double pr = ((double)distrModel.get(bucketKey)) / totalCount; 39 | entropy += -pr * Math.log(pr) / baseConvConst; 40 | } 41 | } 42 | 43 | @Override 44 | public double execute(String entityID, String record) { 45 | double score = 0; 46 | String thisBucketKey = getBucketKey(record); 47 | 48 | //new entropy 49 | double newEntropy = 0; 50 | int newTotalCount = totalCount + 1; 51 | boolean bucketFound = false; 52 | double pr = 0; 53 | for (String bucketKey : distrModel.keySet()) { 54 | if (bucketKey.equals(thisBucketKey)) { 55 | pr = ((double)distrModel.get(bucketKey) + 1) / newTotalCount; 56 | bucketFound = true; 57 | } else { 58 | pr = ((double)distrModel.get(bucketKey)) / newTotalCount; 59 | } 60 | newEntropy += -pr * Math.log(pr) / baseConvConst; 61 | } 62 | 63 | if (!bucketFound) { 64 | pr = 1.0 / newTotalCount; 65 | newEntropy += -pr * Math.log(pr) / baseConvConst; 66 | } 67 | 68 | if (newEntropy > entropy) { 69 | score = (newEntropy - entropy) / entropy; 70 | } 71 | 72 | if (score > scoreThreshold) { 73 | //write if above threshold 74 | outQueue.send(entityID + " " + score); 75 | } 76 | return score; 77 | } 78 | 79 | @Override 80 | public double execute(String[] items, String compKey) { 81 | //TODO 82 | double score = 0; 83 | 84 | return score; 85 | } 86 | 87 | @Override 88 | public boolean isValid(String compKey) { 89 | // TODO Auto-generated method stub 90 | return true; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /resource/bsm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 5 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 6 | AVENIR_JAR_NAME=$PROJECT_HOME/bin/avenir/uber-avenir-spark-1.0.jar 7 | MASTER=spark://akash:7077 8 | 9 | case "$1" in 10 | 11 | "transformTrain") 12 | echo "running DataTransformer" 13 | CLASS_NAME=org.chombo.spark.etl.DataTransformer 14 | 
INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/train/* 15 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans 16 | rm -rf ./output/bsm/train/trans 17 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 18 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf 19 | rm -rf ./output/bsm/train/trans/_SUCCESS 20 | ;; 21 | 22 | "stateTrans") 23 | echo "running MarkovStateTransitionModel" 24 | CLASS_NAME=org.avenir.spark.sequence.MarkovStateTransitionModel 25 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/trans/* 26 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/train/sttr 27 | rm -rf ./output/bsm/train/sttr 28 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 29 | --conf spark.ui.killEnabled=true --master $MASTER $AVENIR_JAR_NAME $INPUT $OUTPUT bsm.conf 30 | rm -rf ./output/bsm/train/sttr/_SUCCESS 31 | ;; 32 | 33 | "transformPred") 34 | echo "running DataTransformer" 35 | CLASS_NAME=org.chombo.spark.etl.DataTransformer 36 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/bsm/pred/* 37 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans 38 | rm -rf ./output/bsm/trans 39 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 40 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT bsm.conf 41 | rm -rf ./output/bsm/pred/trans/_SUCCESS 42 | ;; 43 | 44 | "cpModel") 45 | echo "copying model files" 46 | MOD_FILES=$PROJECT_HOME/bin/beymani/output/bsm/train/sttr/* 47 | META_DIR=$PROJECT_HOME/bin/beymani/meta 48 | cp /dev/null $META_DIR/bsm_mod.txt 49 | for f in $MOD_FILES 50 | do 51 | echo "Copying file $f ..." 52 | cat $f >> $META_DIR/bsm_mod.txt 53 | done 54 | ;; 55 | 56 | "olPredict") 57 | echo "running MarkovChainPredictor" 58 | CLASS_NAME=org.beymani.spark.seq.MarkovChainPredictor 59 | INPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/trans/* 60 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/bsm/pred/oul 61 | rm -rf ./output/bsm/pred/oul 62 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 63 | --conf spark.ui.killEnabled=true --master $MASTER $JAR_NAME $INPUT $OUTPUT bsm.conf 64 | rm -rf ./output/bsm/pred/oul/_SUCCESS 65 | ls -l ./output/bsm/pred/oul 66 | for f in ./output/bsm/pred/oul/* 67 | do 68 | echo "number of outliers in $f" 69 | cat $f | grep ,O | wc -l 70 | done 71 | 72 | ;; 73 | 74 | *) 75 | echo "unknown operation $1" 76 | ;; 77 | 78 | esac -------------------------------------------------------------------------------- /resource/quarantine_violation_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for detecting quarantine violation based mobile location anomaly. Violation 2 | could be because of quarantined people miving out of quarantined location or non quarantined people 3 | visiting quarantined locations 4 | 5 | 6 | Environment 7 | =========== 8 | Make sure you have ../lib directory with all the python files wrt where mob_loc.py is. 9 | Please refer to resource/spark_dependency.txt for building the jar for Spark. 10 | All the configuration data generation python script are mob_loc.properties. 
Make sure all the 11 | directories for data as in epid.sh are created 12 | 13 | Generate data for out of range violation 14 | ======================================== 15 | Phone numbers and quarantine location 16 | python3 mob_loc.py genQuaLoc mob_loc.properties > qualist.txt 17 | 18 | quarantined people movement location data 19 | python3 mob_loc.py quaLoc mob_loc.properties > qualoc.txt 20 | 21 | insert outliers in movement location data (quarantined person moving out of quarantined location) 22 | python3 mob_loc.py quaLocOutlier mob_loc.properties > qualocou.txt 23 | 24 | Copy data 25 | ========= 26 | quarantine location 27 | ./epid.sh cpQuaLocData qualist.txt outr 28 | 29 | quarantined people movement location data 30 | ./epid.sh cpLocData qualoc.txt outr 31 | 32 | Spark job for out of range outlier 33 | ================================= 34 | ./epid.sh olPredOu 35 | 36 | Generate data for in range violation 37 | ======================================== 38 | all locations data 39 | python3 mob_loc.py genLoc mob_loc.properties > res_loc.txt 40 | python3 mob_loc.py genLoc mob_loc.properties > work_loc.txt 41 | python3 mob_loc.py genLoc mob_loc.properties > school_loc.txt 42 | python3 mob_loc.py genLoc mob_loc.properties > med_loc.txt 43 | python3 mob_loc.py genLoc mob_loc.properties > shop_loc.txt 44 | python3 mob_loc.py genLoc mob_loc.properties > ent_loc.txt 45 | python3 mob_loc.py genLoc mob_loc.properties > event_loc.txt 46 | python3 mob_loc.py genLoc mob_loc.properties > open_loc.txt 47 | 48 | Here are the region.num.locations and region.loc.size values. You have to set them before generating locations 49 | for each location type 50 | residence 200 .0002 51 | work 10 .0005 52 | school 3 .0020 53 | medical 3 .0004 54 | shopping area 5 .0020 55 | entertainment area 5 .0010 56 | large event area 2 .0008 57 | open space 2 .0024 58 | 59 | quarantined locations 60 | python3 mob_loc.py uniqQuaLoc mob_loc.properties > uniq_qualist.txt 61 | 62 | people movement location data 63 | python3 mob_loc.py genMovement mob_loc.properties > move_loc.txt 64 | 65 | Copy data 66 | ========= 67 | quarantine location 68 | ./epid.sh cpQuaLocData uniq_qualist.txt inr 69 | 70 | quarantined people movement location data 71 | ./epid.sh cpLocData move_loc.txt inr 72 | 73 | Spark job for in range outlier 74 | ============================== 75 | ./epid.sh olPredIn 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | Beymani consists of a set of Hadoop, Spark and Storm based tools for outlier and anomaly 3 | detection, which can be used for fraud detection, intrusion detection etc.
4 | 5 | ## Philosophy 6 | * Simple to use 7 | * Input output in CSV format 8 | * Metadata defined in simple JSON file 9 | * Extremely configurable with tons of configuration knobs 10 | 11 | ## Blogs 12 | The following blogs of mine are good source of details of beymani 13 | * http://pkghosh.wordpress.com/2012/01/02/fraudsters-outliers-and-big-data-2/ 14 | * http://pkghosh.wordpress.com/2012/02/18/fraudsters-are-not-model-citizens/ 15 | * http://pkghosh.wordpress.com/2012/06/18/its-a-lonely-life-for-outliers/ 16 | * http://pkghosh.wordpress.com/2012/10/18/relative-density-and-outliers/ 17 | * http://pkghosh.wordpress.com/2013/10/21/real-time-fraud-detection-with-sequence-mining/ 18 | * https://pkghosh.wordpress.com/2018/09/18/contextual-outlier-detection-with-statistical-modeling-on-spark/ 19 | * https://pkghosh.wordpress.com/2018/10/15/learning-alarm-threshold-from-user-feedback-using-decision-tree-on-spark/ 20 | * https://pkghosh.wordpress.com/2019/07/25/time-series-sequence-anomaly-detection-with-markov-chain-on-spark/ 21 | * https://pkghosh.wordpress.com/2020/09/27/time-series-change-point-detection-with-two-sample-statistic-on-spark-with-application-for-retail-sales-data/ 22 | * https://pkghosh.wordpress.com/2020/12/24/concept-drift-detection-techniques-with-python-implementation-for-supervised-machine-learning-models/ 23 | * https://pkghosh.wordpress.com/2021/01/20/customer-service-quality-monitoring-with-autoencoder-based-anomalous-case-detection/ 24 | * https://pkghosh.wordpress.com/2021/06/28/ecommerce-order-processing-system-monitoring-with-isolation-forest-based-anomaly-detection-on-spark/ 25 | 26 | ## Algorithms 27 | * Univarite distribution model 28 | * Multi variate sequence or multi gram distribution model 29 | * Average instance Distance 30 | * Relative instance Density 31 | * Markov chain with sequence data 32 | * Spectral residue for sequence data 33 | * Quantized symbol mapping for sequence data 34 | * Local outlier factor for multivariate data 35 | * Instance clustering 36 | * Sequence clustering 37 | * Change point detection 38 | * Isolation Forest for multivariate data 39 | * Auto Encoder for multivariate data 40 | 41 | ## Getting started 42 | Project's resource directory has various tutorial documents for the use cases described in 43 | the blogs. 44 | 45 | ## Build 46 | For Hadoop 1 47 | * mvn clean install 48 | 49 | For Hadoop 2 (non yarn) 50 | * git checkout nuovo 51 | * mvn clean install 52 | 53 | For Hadoop 2 (yarn) 54 | * git checkout nuovo 55 | * mvn clean install -P yarn 56 | 57 | For Spark 58 | * mvn clean install 59 | * sbt publishLocal 60 | * in ./spark sbt clean package 61 | 62 | ## Help 63 | Please feel free to email me at pkghosh99@gmail.com 64 | 65 | ## Contribution 66 | Contributors are welcome. 
Please email me at pkghosh99@gmail.com 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /resource/cct.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | count = ARGV[0].to_i 4 | 5 | amount_dist = [ 6 | 10,10, 7 | 17,17,17, 8 | 25,25,25,25,25, 9 | 37,37,37,37,37,37,37, 10 | 45,45,45,45,45, 11 | 66,66,66,66, 12 | 82,82,82,82, 13 | 150,150,150, 14 | 220,220, 15 | 300,300, 16 | 500, 17 | 1000, 18 | 2000 19 | ] 20 | 21 | time_dist = [ 22 | 0,0,0, 23 | 1,1,1,1, 24 | 2,2,2,2,2,2,2, 25 | 3,3,3,3,3, 26 | 4,4,4, 27 | 5,5, 28 | 6, 29 | 7, 30 | 8, 31 | 9, 32 | 10, 33 | 11, 34 | 12, 35 | 13, 36 | 14, 37 | 15, 38 | 16,16, 39 | 17,17,17, 40 | 18,18, 41 | 19, 42 | 20, 43 | 21,21, 44 | 22,22,22, 45 | 23 46 | ] 47 | 48 | vendors = ['grocery', 'restaurant', 'drug store', 'super market', 'electronic store', 'clothing store', 'jewellery store', 49 | 'air fare', 'hotel', 'car rental'] 50 | 51 | vendor_dist = [ 52 | 0,0,0,0,0,0,0,0,0, 53 | 1,1,1, 54 | 2,2,2,2,2,2, 55 | 3,3,3,3, 56 | 4,4, 57 | 5,5,5, 58 | 7,7,7, 59 | 8,8, 60 | 9,9 61 | ] 62 | 63 | 64 | vendor_amount_dist = { 65 | 'grocery' => [ 66 | 10,10, 67 | 20,20,20,20, 68 | 30,30,30,30,30,30,30, 69 | 50,50,50,50,50,50,50,50,50, 70 | 70,70,70,70, 71 | 100,100, 72 | 150 73 | ], 74 | 75 | 'restaurant' => [ 76 | 10,10, 77 | 20,20,20,20,20, 78 | 27,27, 79 | 35, 80 | 50 81 | ], 82 | 83 | 'drug store' => [ 84 | 12,12, 85 | 23,23,23,23,23, 86 | 37,37,37, 87 | 45,45, 88 | 60 89 | ], 90 | 91 | 'super market' => [ 92 | 25,25, 93 | 38,38,38, 94 | 49,49,49,49,49,49, 95 | 68,68,68, 96 | 112,112, 97 | 185, 98 | 250 99 | ], 100 | 101 | 'electronic store' => [ 102 | 60,60, 103 | 90,90, 104 | 120,120,120,120, 105 | 190,190,190,190,190, 106 | 250,250,250, 107 | 300,300, 108 | 500 109 | ], 110 | 111 | 'clothing store' => [ 112 | 30,30, 113 | 50,50,50,50, 114 | 70,70,70, 115 | 90,90, 116 | 150, 117 | 200 118 | ], 119 | 120 | 'jewellery store' => [ 121 | 100, 122 | 170,170, 123 | 260,260,260, 124 | 310,310, 125 | 400 126 | ], 127 | 128 | 'air fare' => [ 129 | 110,110, 130 | 180,180,180, 131 | 310,310,310,310,310, 132 | 520,520, 133 | 600 134 | ], 135 | 136 | 'hotel' => [ 137 | 110,110,110, 138 | 230,230,230,230, 139 | 300, 140 | 400 141 | ], 142 | 143 | 'car rental' => [ 144 | 60,60, 145 | 110,110,110,110, 146 | 150,150, 147 | 200 148 | ] 149 | 150 | } 151 | 152 | key = ['0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O', 153 | 'P','Q','R','S','T','U','V','W','X','Y','Z'] 154 | 155 | def gen_id(key) 156 | id = '' 157 | 1.upto 8 do #!/usr/bin/ruby 158 | 159 | require '../lib/util.rb' 160 | 161 | userCount = ARGV[0].to_i 162 | 163 | id << key[rand(key.length)] 164 | end 165 | return id 166 | end 167 | 168 | def sample(dist, mult, floor, percent) 169 | b = rand(dist.length) 170 | val = dist[b] 171 | val = val * mult 172 | percent = rand(percent) 173 | percent = percent < floor ? floor : percent 174 | 175 | dev = (val * percent) / 100 176 | if ((rand(100) % 2) == 0) 177 | val = val + dev 178 | else 179 | val = val - dev 180 | end 181 | val = val < 0 ? 0 : val 182 | val 183 | end 184 | 185 | 1.upto count do 186 | id = gen_id(key) 187 | time = sample(time_dist, 60, 2, 8) 188 | time = time > 1440 ? 
1440 : time 189 | v = vendor_dist[rand(vendor_dist.length)] 190 | vendor = vendors[v] 191 | am = sample(vendor_amount_dist[vendor], 100, 4, 12) 192 | puts "#{id}[]#{time}[]#{am/100}.#{am%100}[]#{vendor}" 193 | end 194 | 195 | 196 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | 25 | /** 26 | * Estimated probability based outlier prediction 27 | * @author pranab 28 | * 29 | */ 30 | public class EstimatedProbabilityBasedPredictor extends DistributionBasedPredictor { 31 | 32 | /** 33 | * Storm usage 34 | * @param conf 35 | */ 36 | public EstimatedProbabilityBasedPredictor(Map conf) { 37 | super(conf); 38 | realTimeDetection = true; 39 | } 40 | 41 | /** 42 | * @param config 43 | * @param distrFilePathParam 44 | * @param hdfsFileParam 45 | * @param schemaFilePathParam 46 | * @param scoreThresholdParam 47 | * @throws IOException 48 | */ 49 | public EstimatedProbabilityBasedPredictor(Map config, String idOrdinalsParam, 50 | String distrFilePathParam, String hdfsFileParam, String schemaFilePathParam, 51 | String seasonalParam, String fieldDelimParam, String scoreThresholdParam) throws IOException { 52 | super(config, idOrdinalsParam, distrFilePathParam, hdfsFileParam, schemaFilePathParam, 53 | seasonalParam, fieldDelimParam, scoreThresholdParam); 54 | } 55 | 56 | /** 57 | * Hadoop MR usage 58 | * @param config 59 | * @param distrFilePath 60 | * @throws IOException 61 | */ 62 | public EstimatedProbabilityBasedPredictor(Configuration config, String distrFilePath, String scoreThresholdParam) throws IOException { 63 | super(config, distrFilePath); 64 | scoreThreshold = Double.parseDouble( config.get( scoreThresholdParam)); 65 | } 66 | 67 | @Override 68 | public double execute(String entityID, String record) { 69 | String bucketKey = getBucketKey(record); 70 | Integer count = distrModel.get(bucketKey); 71 | double pr = null != count ? (((double)count) / totalCount) : 0; 72 | double score = 1.0 - pr; 73 | scoreAboveThreshold = score > scoreThreshold; 74 | if (realTimeDetection && scoreAboveThreshold) { 75 | //write if above threshold 76 | outQueue.send(entityID + " " + score); 77 | } 78 | return score; 79 | } 80 | 81 | @Override 82 | public double execute(String[] items, String compKey) { 83 | String bucketKey = getBucketKey(items); 84 | Map distrModel = keyedDistrModel.get(compKey); 85 | Integer count = distrModel.get(bucketKey); 86 | int totalCount = totalCounts.get(compKey); 87 | double pr = null != count ? 
(((double)count) / totalCount) : 0; 88 | double score = 1.0 - pr; 89 | return score; 90 | } 91 | 92 | @Override 93 | public boolean isValid(String compKey) { 94 | // TODO Auto-generated method stub 95 | return true; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/ExtremeValuePredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.beymani.util.OutlierScoreAggregator; 24 | import org.chombo.util.BasicUtils; 25 | 26 | /** 27 | * @author pranab 28 | * 29 | */ 30 | public class ExtremeValuePredictor extends ZscorePredictor { 31 | 32 | /** 33 | * @param config 34 | * @param idOrdinalsParam 35 | * @param attrListParam 36 | * @param fieldDelimParam 37 | * @param attrWeightParam 38 | * @param statsFilePathParam 39 | * @param seasonalParam 40 | * @param hdfsFileParam 41 | * @param scoreThresholdParam 42 | * @param expConstParam 43 | * @throws IOException 44 | */ 45 | public ExtremeValuePredictor(Map config,String idOrdinalsParam, String attrListParam, 46 | String fieldDelimParam, String attrWeightParam,String statsFilePathParam, String seasonalParam, 47 | String hdfsFileParam, String scoreThresholdParam,String expConstParam, String ignoreMissingStatParam, 48 | String scoreAggggregationStrtaegyParam) throws IOException { 49 | super(config, idOrdinalsParam, attrListParam, fieldDelimParam, attrWeightParam, 50 | statsFilePathParam, seasonalParam, hdfsFileParam, scoreThresholdParam, 51 | expConstParam, ignoreMissingStatParam, scoreAggggregationStrtaegyParam); 52 | } 53 | 54 | /* (non-Javadoc) 55 | * @see org.beymani.predictor.ZscorePredictor#execute(java.lang.String[], java.lang.String) 56 | */ 57 | @Override 58 | public double execute(String[] items, String compKey) { 59 | double score = 0; 60 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 61 | double thisScore = 0; 62 | for (int ord : attrOrdinals) { 63 | double val = Double.parseDouble(items[ord]); 64 | double d = 0; 65 | double e = 0; 66 | if (null != idOrdinals) { 67 | if (statsManager.statsExists(compKey, ord)) { 68 | d = Math.abs( val - statsManager.getMean(compKey,ord)); 69 | e = Math.exp(-d / statsManager.getStdDev(compKey, ord)); 70 | thisScore = Math.exp(-e); 71 | scoreAggregator.addScore(thisScore); 72 | } else { 73 | scoreAggregator.addScore(); 74 | } 75 | } else { 76 | d = Math.abs( val - statsManager.getMean(ord)); 77 | e = Math.exp(-d / statsManager.getStdDev(ord)); 78 | thisScore = Math.exp(-e); 79 | scoreAggregator.addScore(thisScore); 80 | } 81 | } 82 | //aggregate score 83 | score = getAggregateScore(scoreAggregator); 84 | 85 | //exponential normalization 
86 | if (expConst > 0) { 87 | score = BasicUtils.expScale(expConst, score); 88 | } 89 | 90 | scoreAboveThreshold = score > scoreThreshold; 91 | return score; 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /python/app/cpsale.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import statistics 23 | import matplotlib.pyplot as plt 24 | sys.path.append(os.path.abspath("../lib")) 25 | sys.path.append(os.path.abspath("../mlextra")) 26 | from util import * 27 | from sampler import * 28 | from mcsim import * 29 | 30 | """ 31 | cannibalized product sale 32 | """ 33 | 34 | values = list() 35 | def psale(args): 36 | i = 0 37 | q1 = int(args[i]) 38 | q1 = q1 if q1 >= 0 else 0 39 | i += 1 40 | q2 = int(args[i]) 41 | q2 = q2 if q2 >= 0 else 0 42 | i += 1 43 | pid1 = args[i] 44 | i += 1 45 | pid2 = args[i] 46 | i += 1 47 | ptime = args[i] 48 | i += 1 49 | iter = args[i] 50 | ctime = ptime + iter * 3600 51 | print("{},{},{}".format(pid1, ctime, q1)) 52 | print("{},{},{}".format(pid2, ctime, q2)) 53 | values.append(q1) 54 | 55 | 56 | if __name__ == "__main__": 57 | op = sys.argv[1] 58 | if op == "gen": 59 | numDays = int(sys.argv[2]) 60 | numIter = 24 * numDays 61 | curTime, pastTime = pastTime(numDays, "d") 62 | pastTime = dayAlign(pastTime) 63 | tsStart = int(0.6 * numIter) 64 | trEnd = tsStart + 30 65 | trSl = -2.0 66 | cy = np.array([-20.0, -35.0, -55.0, -65.0, -70.0, -70.0, -50.0, -30.0, -5.0, 15.0, 35.0, 50.0, 67 | 65.0, 65.0, 55.0, 50.0, 40.0, 30.0, 25.0, 35.0, 30.0, 20.0, 5.0, -15.0]) 68 | cy1 = 0.7 * cy 69 | cy2 = 0.7 * cy1 70 | cy3 = 0.3 * cy1 71 | simulator = MonteCarloSimulator(numIter, psale, "./log/mcsim.log", "info") 72 | simulator.registerNormalSamplerWithTrendCycle(100, 10, 0, cy1) 73 | simulator.registerNormalSamplerWithTrendCycle(150, 20, 0.01, cy2) 74 | simulator.registerExtraArgs("DK75HUI45X", "GHT56FGT8K", pastTime) 75 | trSampler = NormalSamplerWithTrendCycle(100.0, 10.0, trSl , cy1) 76 | simulator.setSampler(0, tsStart, trSampler) 77 | newSampler = NormalSamplerWithTrendCycle(40, 12, 0, cy3) 78 | simulator.setSampler(0, trEnd, newSampler) 79 | 80 | simulator.run() 81 | #drawLine(values, 250) 82 | 83 | elif op == "plot": 84 | filePath = sys.argv[2] 85 | rid = sys.argv[3] 86 | filt = lambda r : r[0] == rid 87 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(filePath, filt))) 88 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(filePath, filt))) 89 | it = xvalues[0] 90 | if len(sys.argv) == 5: 91 | cpFilePath = sys.argv[4] 92 | cdvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(cpFilePath, filt))) 93 | cxvalues = list(map(lambda r : int(r[2]), fileFiltRecGen(cpFilePath, filt))) 94 | i = 0 95 | for t in 
cxvalues: 96 | plt.axvline(t, 0, .9, color="r") 97 | i += 1 98 | plt.plot(xvalues, dvalues, "b") 99 | plt.show() 100 | else: 101 | plt.plot(xvalues, dvalues, "b") 102 | plt.show() 103 | 104 | 105 | -------------------------------------------------------------------------------- /resource/cpu_usage_anomaly_det_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for anomaly detection in CPU usage data using statistical modeling. To be more specific 2 | we will be using a z score based technique. The model first gets built with outliers in the data. The detected outliers 3 | are removed and the model is built again, but this time without outliers in the data. 4 | 5 | 6 | Environment 7 | =========== 8 | Paths etc shown here correspond to my environment. Please change them as needed for your 9 | environment 10 | 11 | Build 12 | ===== 13 | Follow instructions in spark_dependency.txt 14 | 15 | Python dependency 16 | ================= 17 | The shell script commands for data generation run python scripts for data generation. Before you run 18 | the data generation commands do the following 19 | 1. checkout project avenir 20 | 2. copy the avenir/python/lib directory to ../lib with respect to your location of the cpu_usage.py file 21 | 22 | 23 | Create base normal data 24 | ======================= 25 | ./and_spark.sh crInput true 26 | 27 | where 28 | num_of_days = number of days e.g 10 29 | reading_interval = reading interval in sec e.g. 300 30 | num_servers = number of servers e.g. 4 31 | output_file = output file, we will use cusage.txt from now on 32 | 33 | - insert outliers 34 | ./and_spark.sh insOutliers <normal_data_file> <with_outlier_data_file> 35 | 36 | where 37 | normal_data_file = normal data file (cusage.txt) 38 | with_outlier_data_file = data file with outliers (cusage.txt) 39 | 40 | -copy 41 | ./and_spark.sh cpModData <with_outlier_data_file> 42 | 43 | where 44 | with_outlier_data_file = data file with outliers (cusage.txt) 45 | 46 | Run Spark job for stats 47 | ======================= 48 | ./and_spark.sh numStat 49 | 50 | Copy and consolidate stats file 51 | =============================== 52 | ./and_spark.sh crStatsFile 53 | 54 | Run Spark job to detect outliers 55 | ================================ 56 | - set 57 | score.threshold = 2.0 58 | output.outliers = true 59 | rem.outliers = true 60 | 61 | - run 62 | ./and_spark.sh olPred 63 | 64 | Copy and consolidate clean file 65 | =============================== 66 | ./and_spark.sh crCleanFile 67 | 68 | Create and copy test data 69 | ========================= 70 | - create 71 | ./and_spark.sh crInput true 77 | 78 | where 79 | normal_data_file = normal data file (c.txt) 80 | with_outlier_data_file = data file with outliers (cusage.txt) 81 | 82 | - copy 83 | ./and_spark.sh cpTestData <with_outlier_data_file> 84 | 85 | where 86 | with_outlier_data_file = data file with outliers (cusage.txt) 87 | 88 | 89 | Run Spark job for stats again with clean data 90 | ============================================= 91 | ./and_spark.sh numStat 92 | 93 | Copy and consolidate stats file 94 | =============================== 95 | ./and_spark.sh crStatsFile 96 | 97 | 98 | Run Spark job to detect outliers 99 | ================================ 100 | - set 101 | score.threshold = 3.3 102 | output.outliers = false 103 | rem.outliers = false 104 | 105 | - run 106 | ./and_spark.sh olPred 107 | 108 | Configuration 109 | ============= 110 | Configuration is in and.conf.
Make changes as necessary 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/FileSpout.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.File; 21 | import java.io.FileNotFoundException; 22 | import java.util.Arrays; 23 | import java.util.Comparator; 24 | import java.util.Map; 25 | import java.util.Scanner; 26 | 27 | import backtype.storm.spout.SpoutOutputCollector; 28 | import backtype.storm.task.TopologyContext; 29 | import backtype.storm.topology.OutputFieldsDeclarer; 30 | import backtype.storm.topology.base.BaseRichSpout; 31 | import backtype.storm.tuple.Fields; 32 | import backtype.storm.tuple.Values; 33 | 34 | /** 35 | * @author pranab 36 | * 37 | */ 38 | public class FileSpout extends BaseRichSpout { 39 | private SpoutOutputCollector collector; 40 | private Map conf; 41 | private File[] files; 42 | private Scanner scanner; 43 | /** 44 | * 45 | */ 46 | private int curFileIndex = 0; 47 | 48 | @Override 49 | public void open(Map conf, TopologyContext context, 50 | SpoutOutputCollector collector) { 51 | this.collector = collector; 52 | this.conf = conf; 53 | 54 | String dirPath = conf.get("file.spout.dir.path").toString(); 55 | File dir = new File(dirPath); 56 | files = dir.listFiles(); 57 | Arrays.sort(files, new Comparator(){ 58 | public int compare(File f1, File f2) { 59 | int res = f1.lastModified() < f2.lastModified() ? -1 : ( f1.lastModified() > f2.lastModified() ? 
1 : 0); 60 | return res; 61 | } }); 62 | 63 | openNextFile(); 64 | } 65 | 66 | @Override 67 | public void nextTuple() { 68 | String record = readFile(); 69 | String[] items = record.split("\\s+"); 70 | String entityID = items[0]; 71 | String recordData = items[1]; 72 | collector.emit(new Values(entityID, recordData)); 73 | } 74 | 75 | /** 76 | * @return 77 | */ 78 | private String readFile() { 79 | String record = null; 80 | if (scanner.hasNextLine()) { 81 | record = scanner.nextLine(); 82 | } else { 83 | if (++curFileIndex < files.length) { 84 | openNextFile(); 85 | if (scanner.hasNextLine()) { 86 | record = scanner.nextLine(); 87 | } 88 | } else { 89 | //no more files to read 90 | } 91 | } 92 | return record; 93 | } 94 | 95 | /** 96 | * 97 | */ 98 | private void openNextFile() { 99 | try { 100 | scanner = new Scanner(files[curFileIndex]); 101 | } catch (FileNotFoundException e) { 102 | throw new IllegalStateException("file not found"); 103 | } 104 | } 105 | 106 | /* (non-Javadoc) 107 | * @see backtype.storm.topology.IComponent#declareOutputFields(backtype.storm.topology.OutputFieldsDeclarer) 108 | */ 109 | @Override 110 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 111 | declarer.declare(new Fields("entityID", "recordData")); 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /python/app/mvand.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 
17 | 18 | # Package imports 19 | import os 20 | import sys 21 | import random 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import sklearn as sk 25 | from sklearn.ensemble import IsolationForest 26 | from pyod.models.auto_encoder import AutoEncoder 27 | sys.path.append(os.path.abspath("../lib")) 28 | sys.path.append(os.path.abspath("../mlextra")) 29 | from util import * 30 | from mlutil import * 31 | from sampler import * 32 | 33 | """ 34 | Anomaly detection with isolation forest 35 | """ 36 | if __name__ == "__main__": 37 | op = sys.argv[1] 38 | filePath = sys.argv[2] 39 | window = 20 40 | beg = 0 41 | end = beg + window 42 | if op == "isfo": 43 | #anomaly detection in in service ticket data with isolation porest 44 | scId = sys.argv[3] 45 | colStr = sys.argv[4] 46 | columns = strToIntArray(colStr) 47 | filt = lambda r : r[0] == scId 48 | data = np.array(getFileAsFiltFloatMatrix(filePath, filt, colStr)) 49 | nsamp = data.shape[0] 50 | isf = IsolationForest(contamination=0.1) 51 | ypred = isf.fit_predict(data) 52 | colors = ["m", "g", "b", "c", "y"] 53 | 54 | for a in data: 55 | a[2] = a[2] / 24 56 | while True: 57 | inp = input("begin offset: ") 58 | beg = int(inp) 59 | end = beg + window 60 | if beg >= 0: 61 | for i in range(len(columns)): 62 | dvalues = data[:,i] 63 | ci = i % 5 64 | plt.plot(dvalues[beg:end], colors[ci]) 65 | count = 0 66 | for i in range(beg, end, 1): 67 | if ypred[i] == -1: 68 | plt.axvline(i - beg, 0, .9, color="r") 69 | count += 1 70 | print("num of outlier {}".format(count)) 71 | plt.show() 72 | else: 73 | print("quitting") 74 | break 75 | 76 | elif op == "auen": 77 | #anomaly detection in web session with auto encoder 78 | teFilePath = sys.argv[3] 79 | columns = sys.argv[4] 80 | auen = AutoEncoder(hidden_neurons =[7,5,3,5,7]) 81 | trData = np.array(getFileAsFloatMatrix(filePath, columns)) 82 | trNsamp = trData.shape[0] 83 | teData = np.array(getFileAsFloatMatrix(teFilePath, columns)) 84 | aData = np.vstack((trData, teData)) 85 | aData = scaleData(aData, "zscale") 86 | print(aData.shape) 87 | trData = aData[:trNsamp, :] 88 | teData = aData[trNsamp:, :] 89 | print(trData.shape) 90 | print(teData.shape) 91 | 92 | auen.fit(trData) 93 | scores = auen.decision_function(teData) 94 | 95 | while True: 96 | inp = input("begin offset: ") 97 | beg = int(inp) 98 | end = beg + window 99 | if beg >= 0: 100 | plt.plot(scores[beg:end], color="b") 101 | count = 0 102 | for i in range(beg, end, 1): 103 | if scores[i] > 17: 104 | plt.axvline(i - beg, 0, .9, color="r") 105 | count += 1 106 | print("num of outlier {}".format(count)) 107 | plt.show() 108 | else: 109 | print("quitting") 110 | break 111 | 112 | 113 | -------------------------------------------------------------------------------- /resource/rel_density_tutorial.txt: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | Here are the global config properties to set in the properties file. Please make changes as necessary 4 | 5 | debug.on=true 6 | field.delim=, 7 | field.delim.regex=, 8 | num.reducer=1 9 | 10 | Configuration settings for individual map reduce jobs are described below 11 | 12 | Map Reduce Jobs 13 | =============== 14 | 15 | 1. Similarity calculation 16 | ------------------------- 17 | run SameTypeSimilarity 18 | 19 | Make sure properties are set as below in the configuration properties file 20 | 21 | sts.same.schema.file.path=/user/pranab/cct/meta/cct.json 22 | sts.bucket.count=10 23 | sts.distance.scale=1000 24 | 25 | 2. 
Density calculation. 26 | ---------------------- 27 | Here is a sample script. It uses the output of the SameTypeSimilarity MR as input 28 | 29 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 30 | CLASS_NAME=org.beymani.proximity.AverageDistance 31 | 32 | echo "running mr" 33 | IN_PATH=/user/pranab/cct/simi 34 | OUT_PATH=/user/pranab/cct/avdi 35 | echo "input $IN_PATH output $OUT_PATH" 36 | hadoop fs -rmr $OUT_PATH 37 | echo "removed output dir" 38 | 39 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 40 | 41 | Make sure properties are set as below, so that density is output 42 | 43 | avd.top.match.average=false 44 | avd.top.match.density=true 45 | avd.top.match.grouping=false 46 | 47 | 3. Calculate neighborhood groups 48 | -------------------------------- 49 | Use the same MR as before. Watch the configuration params at the end of this section 50 | 51 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 52 | CLASS_NAME=org.beymani.proximity.AverageDistance 53 | 54 | echo "running mr" 55 | IN_PATH=/user/pranab/cct/simi 56 | OUT_PATH=/user/pranab/cct/negrp 57 | echo "input $IN_PATH output $OUT_PATH" 58 | hadoop fs -rmr $OUT_PATH 59 | echo "removed output dir" 60 | 61 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 62 | 63 | Make sure properties are set as below, so that the neighborhood group is output 64 | 65 | avd.top.match.average=false 66 | avd.top.match.density=false 67 | avd.top.match.grouping=true 68 | 69 | 4. Find Neighborhood and Density. 70 | -------------------------------- 71 | Here is a sample script. Before running, make sure the output of steps 2 and 3 is copied 72 | or moved to the input dir for this MR. Change the prefix of the output of step 2 73 | to what is defined in the config param density.file.prefix 74 | 75 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 76 | CLASS_NAME=org.beymani.proximity.NeighborDensity 77 | 78 | echo "running mr" 79 | IN_PATH=/user/pranab/cct/input/nede 80 | OUT_PATH=/user/pranab/cct/nede 81 | echo "input $IN_PATH output $OUT_PATH" 82 | hadoop fs -rmr $OUT_PATH 83 | echo "removed output dir" 84 | 85 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 86 | 87 | Make sure properties are set as below 88 | 89 | ned.density.file.prefix=den 90 | 91 | 5. Relative density calculation 92 | ------------------------------- 93 | It uses the output of step 4 as input. Here is the sample script 94 | 95 | JAR_NAME=/home/pranab/Projects/beymani/target/beymani-1.0.jar 96 | CLASS_NAME=org.beymani.proximity.RelativeDensity 97 | 98 | echo "running mr" 99 | IN_PATH=/user/pranab/cct/nede 100 | OUT_PATH=/user/pranab/cct/rede 101 | echo "input $IN_PATH output $OUT_PATH" 102 | hadoop fs -rmr $OUT_PATH 103 | echo "removed output dir" 104 | 105 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/cct.properties $IN_PATH $OUT_PATH 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/SequenceMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License.
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | 19 | package org.beymani.util; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * Various sequence matching algorithms 26 | * @author pranab 27 | * 28 | * @param 29 | */ 30 | public class SequenceMatcher { 31 | private List seqData = new ArrayList(); 32 | private int maxSize; 33 | private double sim; 34 | private boolean normalize; 35 | private boolean similarity; 36 | private int matchSize; 37 | 38 | public SequenceMatcher(boolean normalize, boolean similarity) { 39 | this.normalize = normalize; 40 | this.similarity = similarity; 41 | } 42 | 43 | public SequenceMatcher(int maxSize,boolean normalized, boolean similarity) { 44 | this(normalized, similarity); 45 | this.maxSize = maxSize; 46 | } 47 | 48 | public void add(T item) { 49 | seqData.add(item); 50 | if (maxSize > 0 && seqData.size() > maxSize) { 51 | seqData.remove(0); 52 | } 53 | } 54 | 55 | /** 56 | * Simple positional matching 57 | * @param other 58 | * @return 59 | */ 60 | public double matchCount(SequenceMatcher other) { 61 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size(); 62 | sim = 0; 63 | for (int i = 0; i < matchSize; ++i) { 64 | if (seqData.get(i).equals(other.seqData.get(i))) { 65 | ++sim; 66 | } 67 | } 68 | prepeareResult(matchSize); 69 | return sim; 70 | } 71 | 72 | /** 73 | * Positional matching with higher reward for adjacent mactches 74 | * @param other 75 | * @return 76 | */ 77 | public double adjacencyRewardedMatchCount(SequenceMatcher other) { 78 | matchSize = seqData.size() < other.seqData.size() ? seqData.size() : other.seqData.size(); 79 | sim = 0; 80 | int adjCount = 1; 81 | for (int i = 0; i < matchSize; ++i) { 82 | if (seqData.get(i).equals(other.seqData.get(i))) { 83 | sim += adjCount; 84 | ++adjCount; 85 | } else { 86 | adjCount = 1; 87 | } 88 | } 89 | prepeareResult(matchSize); 90 | return sim; 91 | } 92 | 93 | /** 94 | * Positional matching with higher reward for adjacent mactches 95 | * @param other 96 | * @return 97 | */ 98 | public double maxCommonSubSeqMatchCount(SequenceMatcher other) { 99 | int matchSize = seqData.size() < other.seqData.size() ? 
seqData.size() : other.seqData.size(); 100 | sim = 0; 101 | int adjCount = 0; 102 | for (int i = 0; i < matchSize; ++i) { 103 | if (seqData.get(i).equals(other.seqData.get(i))) { 104 | ++adjCount; 105 | } else { 106 | if (adjCount > sim) { 107 | sim = adjCount; 108 | } 109 | adjCount = 0; 110 | } 111 | } 112 | prepeareResult(matchSize * (matchSize + 1) / 2); 113 | return sim; 114 | } 115 | 116 | /** 117 | * @param scale 118 | */ 119 | private void prepeareResult(int scale) { 120 | if (normalize) { 121 | sim /= scale; 122 | if (!similarity) { 123 | sim = 1.0 - sim; 124 | } 125 | } else { 126 | if (!similarity) { 127 | sim = scale - sim; 128 | } 129 | } 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedCumProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | public class EstimatedCumProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 29 | 30 | public EstimatedCumProbabilityBasedPredictor(Map conf) { 31 | super(conf); 32 | } 33 | 34 | /** 35 | * @param config 36 | * @param idOrdinalsParam 37 | * @param attrListParam 38 | * @param distrFilePathParam 39 | * @param hdfsFileParam 40 | * @param schemaFilePathParam 41 | * @param attrWeightParam 42 | * @param seasonalParam 43 | * @param fieldDelimParam 44 | * @param scoreThresholdParam 45 | * @param ignoreMissingDistrParam 46 | * @throws IOException 47 | */ 48 | public EstimatedCumProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 49 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 50 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 51 | String scoreAggggregationStrtaegyParam) 52 | throws IOException { 53 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 54 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", "exp.const", 55 | scoreAggggregationStrtaegyParam); 56 | } 57 | 58 | /** 59 | * @param config 60 | * @param distrFilePathParam 61 | * @param attrWeightParam 62 | * @param scoreThresholdParam 63 | * @param fieldDelimParam 64 | * @throws IOException 65 | */ 66 | public EstimatedCumProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 67 | String scoreThresholdParam, String fieldDelimParam) 
68 | throws IOException { 69 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 70 | } 71 | 72 | @Override 73 | public double execute(String[] items, String compKey) { 74 | double score = 0; 75 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 76 | double thisScore = 0; 77 | for (int ord : attrOrdinals) { 78 | String keyWithFldOrd = compKey + fieldDelim + ord; 79 | double val = Double.parseDouble(items[ord]); 80 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 81 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 82 | if (null != hist) { 83 | double distr = hist.findCumDistr(val); 84 | thisScore = distr < 0.5 ? 1.0 - distr : distr; 85 | scoreAggregator.addScore(thisScore); 86 | } else { 87 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 88 | scoreAggregator.addScore(); 89 | } 90 | } 91 | //aggregate score 92 | score = getAggregateScore(scoreAggregator); 93 | 94 | scoreAboveThreshold = score > scoreThreshold; 95 | return score; 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /resource/real_time_fraud_prediction_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for real time fraud detection using Hadoop and Storm. It uses a Markov chain 2 | as the predictive model. Make necessary changes to paths etc. to suit your environment. 3 | 4 | Dependency 5 | ========== 6 | The project has a dependency on chombo. Please do the build as below for chombo and avenir respectively 7 | mvn clean install 8 | 9 | Please refer to jar_dependency.txt for details of the dependencies 10 | 11 | The easiest way is to use ant as follows 12 | ant -f build_storm.xml 13 | 14 | Generate input data 15 | =================== 16 | Get util.rb from the project visitante. Put a copy of the file in ../lib 17 | ./xaction_states.rb 5000 > xact_training.txt 18 | 19 | where 5000 is the number of customers 20 | Copy the output file to HDFS input directory /Users/pranab/mmfr/input 21 | 22 | Generate transaction sequence data with MR 23 | ========================================== 24 | Run this script.
This MR belongs to the project chombo 25 | 26 | JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar 27 | CLASS_NAME=org.chombo.mr.Projection 28 | 29 | echo "running mr" 30 | IN_PATH=/Users/pranab/mmfr/input 31 | OUT_PATH=/Users/pranab/mmfr/sequence 32 | echo "input $IN_PATH output $OUT_PATH" 33 | hadoop fs -rmr $OUT_PATH 34 | echo "removed output dir" 35 | 36 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 37 | 38 | Generate Markov chain model with MR 39 | =================================== 40 | Run this script 41 | 42 | JAR_NAME=/home/pranab/Projects/avenir/target/avenir-1.0.jar 43 | CLASS_NAME=org.avenir.markov.MarkovStateTransitionModel 44 | 45 | echo "running mr" 46 | IN_PATH=/Users/pranab/mmfr/sequence 47 | OUT_PATH=/Users/pranab/mmfr/model 48 | echo "input $IN_PATH output $OUT_PATH" 49 | hadoop fs -rmr $OUT_PATH 50 | echo "removed output dir" 51 | 52 | hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/fraud/mmfr.properties $IN_PATH $OUT_PATH 53 | 54 | Copy the MR output 55 | ================= 56 | hadoop fs -get /Users/pranab/mmfr/model/part-r-00000 xmodel.txt 57 | 58 | Store model in Redis 59 | ==================== 60 | ./xaction_queue.py setModel xmodel.txt 61 | 62 | Generate test transaction data 63 | ============================== 64 | ./xaction_states.rb 200 > xact_test.txt 65 | 66 | Write test data to Redis queue 67 | ============================== 68 | ./xaction_queue.py writeQueue xact_test.txt 69 | 70 | Build uber jar for storm deployment 71 | =================================== 72 | ant -f build_storm.xml 73 | 74 | Deploy storm topology 75 | ===================== 76 | storm jar uber-beymani-1.0.jar org.beymani.predictor.OutlierPredictor NoFraud rt_predict.properties 77 | 78 | Get output 79 | ========== 80 | After you have verified in the Storm UI that all data has been processed, get the output from the Redis 81 | output queue 82 | 83 | ./xaction_queue.py readOutQueue 84 | 85 | Hadoop configuration 86 | ==================== 87 | field.delim.regex=, 88 | field.delim.out=, 89 | num.reducer=1 90 | debug.on=false 91 | 92 | #Projection 93 | projection.operation=grouping 94 | key.field=0 95 | projection.field=2 96 | 97 | #MarkovStateTransitionModel 98 | skip.field.count=1 99 | model.states=LNL,LNN,LNS,LHL,LHN,LHS,MNL,MNN,MNS,MHL,MHN,MHS,HNL,HNN,HNS,HHL,HHN,HHS 100 | trans.prob.scale=1 101 | 102 | Storm configuration 103 | =================== 104 | predictor.model=mm 105 | predictor.spout.threads=1 106 | predictor.bolt.threads=2 107 | num.workers=1 108 | debug=on 109 | 110 | redis.server.host=localhost 111 | redis.server.port=6379 112 | redis.markov.model.key=xactionMarkovModel 113 | redis.input.queue=xactionQueue 114 | local.predictor=true 115 | state.seq.window.size=5 116 | state.ordinal=1 117 | detection.algorithm=missProbability 118 | metric.threshold=0.96 119 | redis.output.queue=fraudQueue 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /resource/retail_sale_monitoring_with_anomaly_detection_tutorial.txt: -------------------------------------------------------------------------------- 1 | This tutorial is for ecommerce retail sale monitoring based anomaly detection for hourly sales data. 2 | Robust zscore is used for anomaly detection. The data hierarchy is org -> sale -> dept -> product sale 3 | 4 | Dependent script 5 | ================ 6 | Checkout the project avenir.
Copy the lib directory under python to a directory at the same level 7 | as your working directory for the python script ecomm.py 8 | 9 | Build and Deployment 10 | ==================== 11 | Please refer to resource/spark_dependency.txt for building all jars and the final uber jar file 12 | 13 | Script and configuration 14 | ======================== 15 | Feel free to make changes in the script ecomm.sh and the configuration file ecomm.conf as per your 16 | environment 17 | 18 | Generate stats for hourly sales 19 | =============================== 20 | ./ecomm.py prStat > prstat.txt 21 | 22 | where 23 | num_product = number of products e.g. 20 24 | 25 | Generate training data 26 | ====================== 27 | ./ecomm.py prSale prstat.txt > sale_tr.txt 28 | 29 | where 30 | interval = amount of time into the past e.g. 30 31 | time_unit = time unit, d for day and h for hour 32 | 33 | Generate prediction data 34 | ======================== 35 | ./ecomm.py prSale prstat.txt > sale.txt 36 | 37 | 38 | Insert outlier 39 | ./ecomm.py olPrSale sale.txt > sale_pr.txt 40 | 41 | where 42 | outlier_percentage = percentage of outliers e.g. 10 43 | 44 | Copy training data 45 | ================== 46 | ./ecomm.sh loadInp sale_tr.txt training 47 | 48 | Run spark job for basic stats 49 | ============================= 50 | ./ecomm.sh numStat 51 | 52 | Run spark job for median 53 | ======================== 54 | Set the following in ecomm.conf for numericalAttrMedian 55 | operation.type = "med" 56 | 57 | Run 58 | ./ecomm.sh numMstat 59 | 60 | Copy median file 61 | ================ 62 | ./ecomm.sh bkMod med.txt 63 | 64 | It generates the med.txt file 65 | 66 | Run spark job for median absolute deviation 67 | =========================================== 68 | Set the following in ecomm.conf for numericalAttrMedian 69 | operation.type = "mad" 70 | 71 | Run 72 | ./ecomm.sh numMstat 73 | 74 | Copy median absolute deviation file 75 | =================================== 76 | ./ecomm.sh bkMod mad.txt 77 | 78 | It generates mad.txt 79 | 80 | Copy prediction data 81 | ==================== 82 | ./ecomm.sh loadInp sale_pr.txt pred 83 | 84 | Run spark job for prediction 85 | ============================ 86 | ./ecomm.sh olPred 87 | 88 | Copy prediction output into one file 89 | ==================================== 90 | ./ecomm.sh bkOut psale/olp.txt 91 | 92 | All output gets written to olp.txt 93 | 94 | Run spark job to aggregate to dept 95 | ================================== 96 | Clean aggregator input dir 97 | ./ecomm.sh rmAggrInp 98 | 99 | Copy to aggregator input dir 100 | ./ecomm.sh loadAggrInp psale/olp.txt 101 | 102 | Run aggregator spark job 103 | ./ecomm.sh aggrOl 104 | 105 | Copy aggregator output into one file 106 | ./ecomm.sh bkOutAggr dept/olp.txt 107 | 108 | Run spark job to aggregate to sale 109 | ================================== 110 | Clean aggregator input dir 111 | ./ecomm.sh rmAggrInp 112 | 113 | Copy to aggregator input dir 114 | ./ecomm.sh loadAggrInp dept/olp.txt 115 | 116 | Run aggregator 117 | ./ecomm.sh aggrOl 118 | 119 | Copy aggregator output into one file 120 | ./ecomm.sh bkOutAggr sale/olp.txt 121 | 122 | Run spark job to aggregate to organization 123 | ========================================== 124 | Clean aggregator input dir 125 | ./ecomm.sh rmAggrInp 126 | 127 | Copy to aggregator input dir 128 | ./ecomm.sh loadAggrInp sale/olp.txt 129 | 130 | Run aggregator 131 | ./ecomm.sh aggrOl 132 | 133 | Copy aggregator output into one file 134 | ./ecomm.sh bkOutAggr org/olp.txt 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143
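For intuition, the robust zscore used in this tutorial can be sketched in a few lines of Python. This is only an illustration, not the actual Spark implementation; the sample sale numbers and the 0.6745 scaling constant are assumptions for the example.

import statistics

def robust_zscore(values):
    # robust zscore replaces the mean with the median and the std dev with the median absolute deviation (MAD)
    med = statistics.median(values)
    mad = statistics.median([abs(v - med) for v in values])
    # 0.6745 makes MAD comparable to a standard deviation for normally distributed data
    return [0.6745 * (v - med) / mad for v in values]

# hourly product sale quantities; the value 320 stands out with a large score
sales = [120, 115, 130, 125, 320, 118]
for v, z in zip(sales, robust_zscore(sales)):
    print("{} robust zscore {:.2f}".format(v, z))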
| 144 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/ModelBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.Serializable; 21 | import java.util.Map; 22 | 23 | import org.beymani.util.OutlierScoreAggregator; 24 | import org.chombo.util.BasicUtils; 25 | import org.chombo.util.ConfigUtility; 26 | 27 | /** 28 | * Base class for all model based predictors 29 | * @author pranab 30 | * 31 | */ 32 | public abstract class ModelBasedPredictor implements Serializable { 33 | private static final long serialVersionUID = -8813946272356265424L; 34 | protected boolean realTimeDetection; 35 | protected double scoreThreshold; 36 | protected boolean scoreAboveThreshold; 37 | protected boolean partition = false; 38 | protected double expConst = 1.0; 39 | protected int[] idOrdinals; 40 | protected int[] attrOrdinals; 41 | protected double[] attrWeights; 42 | protected boolean ignoreMissingStat; 43 | protected String fieldDelim; 44 | protected boolean seasonal; 45 | 46 | private String aggregationStrategy; 47 | 48 | 49 | public ModelBasedPredictor() { 50 | 51 | } 52 | 53 | /** 54 | * @param config 55 | * @param attrWeightParam 56 | * @param scoreAggggregationStrtaegyParam 57 | */ 58 | public ModelBasedPredictor(Map config, String attrWeightParam, String scoreAggggregationStrtaegyParam) { 59 | attrWeights = ConfigUtility.getDoubleArray(config, attrWeightParam); 60 | aggregationStrategy = ConfigUtility.getString(config, scoreAggggregationStrtaegyParam);; 61 | } 62 | 63 | /** 64 | * @param entityID 65 | * @param record 66 | * @return 67 | */ 68 | public abstract double execute(String entityID, String record); 69 | 70 | /** 71 | * @param items 72 | * @param compKey 73 | * @return 74 | */ 75 | public abstract double execute(String[] items, String compKey); 76 | 77 | 78 | /** 79 | * @return 80 | */ 81 | public boolean isScoreAboveThreshold() { 82 | return scoreAboveThreshold; 83 | } 84 | 85 | /** 86 | * @return 87 | */ 88 | public ModelBasedPredictor withPartition() { 89 | partition = true; 90 | return this; 91 | } 92 | 93 | /** 94 | * @param ignoreMissingStat 95 | * @return 96 | */ 97 | public ModelBasedPredictor withIgnoreMissingStat(boolean ignoreMissingStat) { 98 | this.ignoreMissingStat = ignoreMissingStat; 99 | return this; 100 | } 101 | 102 | 103 | /** 104 | * @param compKey 105 | * @return 106 | */ 107 | public abstract boolean isValid(String compKey); 108 | 109 | /** 110 | * @return 111 | */ 112 | public double getAggregateScore(OutlierScoreAggregator scoreAggregator) { 113 | double aggrScore = 0; 114 | if (aggregationStrategy.equals("average")) { 115 | aggrScore = scoreAggregator.getAverage(); 116 | } else if 
(aggregationStrategy.equals("weightedAverage")) { 117 | aggrScore = scoreAggregator.getWeightedAverage(); 118 | } else if (aggregationStrategy.equals("median")) { 119 | aggrScore = scoreAggregator.getMedian(); 120 | } else if (aggregationStrategy.equals("max")) { 121 | aggrScore = scoreAggregator.getMax(); 122 | } else if (aggregationStrategy.equals("min")) { 123 | aggrScore = scoreAggregator.getMin(); 124 | } else { 125 | BasicUtils.assertFail("invalid outlier score aggregation strategy " + aggregationStrategy); 126 | } 127 | return aggrScore; 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /python/app/bvib.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | from random import randint 21 | import time 22 | import math 23 | from datetime import datetime 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | from util import * 27 | from mlutil import * 28 | from sampler import * 29 | 30 | """ 31 | MAchinary vibration time series with multiple harmonic components and random noise 32 | Inserts outlier with high frequency components indicating failure 33 | """ 34 | 35 | def sinComponents(params): 36 | """ 37 | returns list sine components 38 | """ 39 | comps = list() 40 | for i in range(0, len(params), 2): 41 | amp = params[i] 42 | per = params[i + 1] 43 | phase = randomFloat(0, 2.0 * math.pi) 44 | co = (amp, per, phase) 45 | comps.append(co) 46 | return comps 47 | 48 | def addSines(comps, sampTm): 49 | """ 50 | adds multiple sine comopnents 51 | """ 52 | val = 0 53 | for c in comps: 54 | t = 2.0 * math.pi * (sampTm % c[1]) / c[1] 55 | val += c[0] * math.sin(c[2] + t) 56 | return val 57 | 58 | if __name__ == "__main__": 59 | op = sys.argv[1] 60 | if op == "gen": 61 | #generate data 62 | ids = ["HG56SDFE", "K87JG9F6"] 63 | comps = dict() 64 | comps["HG56SDFE"] = sinComponents([52,40,76,20,5,80,7,30]) 65 | comps["K87JG9F6"] = sinComponents([56,42,74,18,6,84,9,28]) 66 | noise= NormalSampler(0,3) 67 | dur = int(sys.argv[2]) * 1000 68 | ctime = curTimeMs() 69 | ptime = ctime - dur 70 | sintv = 1 71 | stime = ptime 72 | while stime < ctime: 73 | for mid in ids: 74 | val = addSines(comps[mid], stime) + noise.sample() 75 | print("{},{},{:.3f}".format(mid, stime, val)) 76 | stime += sintv 77 | 78 | elif op == "iplot": 79 | #plot 80 | fpath = sys.argv[2] 81 | mid = sys.argv[3] 82 | beg = int(sys.argv[4]) 83 | end = int(sys.argv[5]) 84 | filt = lambda r : r[0] == mid 85 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt))) 86 | drawLine(dvalues[beg:end]) 87 | 88 | elif op == "iol": 89 | #insert outliers 90 | fpath = sys.argv[2] 91 | delay = int(sys.argv[3]) * 1000 * 2 92 | ocomps = sinComponents([36,12,30,8]) 93 | i = 0 94 | for rec in 
fileRecGen(fpath, ","): 95 | mid = rec[0] 96 | if mid == "K87JG9F6" and i > delay: 97 | val = float(rec[2]) 98 | stime = int(rec[1]) 99 | val += addSines(ocomps, stime) 100 | rec[2] = "{:.3f}".format(val) 101 | print(",".join(rec)) 102 | i += 1 103 | 104 | elif op == "oplot": 105 | #plot outliers 106 | fpath = sys.argv[2] 107 | mid = sys.argv[3] 108 | beg = int(sys.argv[4]) 109 | end = int(sys.argv[5]) 110 | filt = lambda r : r[0] == mid 111 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt))) 112 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt))) 113 | plt.plot(xvalues[beg:end], dvalues[beg:end]) 114 | plt.title("outlier score") 115 | plt.show() 116 | 117 | dvalues = list(map(lambda r : float(r[2]), fileFiltRecGen(fpath, filt))) 118 | plt.plot(xvalues, dvalues, "b") 119 | ofilt = lambda r : r[0] == mid and r[4] == "O" 120 | oxvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, ofilt))) 121 | for t in oxvalues: 122 | plt.axvline(t, 0, .9, color="r") 123 | plt.title("outliers") 124 | plt.show() 125 | 126 | 127 | else: 128 | exitWithMsg("ivalid command") 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/OutlierPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 
16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.FileInputStream; 21 | import java.util.Map; 22 | import java.util.Properties; 23 | 24 | import backtype.storm.Config; 25 | import backtype.storm.StormSubmitter; 26 | import backtype.storm.task.OutputCollector; 27 | import backtype.storm.task.TopologyContext; 28 | import backtype.storm.topology.OutputFieldsDeclarer; 29 | import backtype.storm.topology.TopologyBuilder; 30 | import backtype.storm.topology.base.BaseRichBolt; 31 | import backtype.storm.tuple.Fields; 32 | import backtype.storm.tuple.Tuple; 33 | 34 | /** 35 | * Storm topolgy driver for outlier detection 36 | * @author pranab 37 | * 38 | */ 39 | public class OutlierPredictor { 40 | 41 | /** 42 | * @author pranab 43 | * 44 | */ 45 | public static class PredictorBolt extends BaseRichBolt { 46 | private OutputCollector collector; 47 | private ModelBasedPredictor predictor; 48 | 49 | /* (non-Javadoc) 50 | * @see backtype.storm.task.IBolt#prepare(java.util.Map, backtype.storm.task.TopologyContext, backtype.storm.task.OutputCollector) 51 | */ 52 | public void prepare(Map stormConf, TopologyContext context, 53 | OutputCollector collector) { 54 | this.collector = collector; 55 | String strategy = stormConf.get("predictor.model").toString(); 56 | if (strategy.equals("mm")){ 57 | predictor = new MarkovModelPredictor(stormConf); 58 | } 59 | } 60 | 61 | /* (non-Javadoc) 62 | * @see backtype.storm.task.IBolt#execute(backtype.storm.tuple.Tuple) 63 | */ 64 | public void execute(Tuple input) { 65 | String entityID = input.getString(0); 66 | String record = input.getString(1); 67 | double score = predictor.execute( entityID, record); 68 | 69 | //write score to db 70 | 71 | //ack 72 | collector.ack(input); 73 | } 74 | 75 | @Override 76 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 77 | 78 | } 79 | 80 | } 81 | 82 | public static void main(String[] args) throws Exception { 83 | String topologyName = args[0]; 84 | String configFilePath = args[1]; 85 | 86 | FileInputStream fis = new FileInputStream(configFilePath); 87 | Properties configProps = new Properties(); 88 | configProps.load(fis); 89 | 90 | //intialize config 91 | Config conf = new Config(); 92 | conf.setDebug(true); 93 | for (Object key : configProps.keySet()){ 94 | String keySt = key.toString(); 95 | String val = configProps.getProperty(keySt); 96 | conf.put(keySt, val); 97 | } 98 | 99 | //spout 100 | TopologyBuilder builder = new TopologyBuilder(); 101 | int spoutThreads = Integer.parseInt(configProps.getProperty("predictor.spout.threads")); 102 | builder.setSpout("predictorSpout", new PredictorSpout(), spoutThreads); 103 | 104 | //detector bolt 105 | int boltThreads = Integer.parseInt(configProps.getProperty("predictor.bolt.threads")); 106 | builder.setBolt("predictor", new PredictorBolt(), boltThreads) 107 | .fieldsGrouping("predictorSpout", new Fields("entityID")); 108 | 109 | //submit topology 110 | int numWorkers = Integer.parseInt(configProps.getProperty("num.workers")); 111 | conf.setNumWorkers(numWorkers); 112 | StormSubmitter.submitTopology(topologyName, conf, builder.createTopology()); 113 | 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /python/app/olss.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use 
this file except in compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | import time 21 | import math 22 | import statistics 23 | import ntpath 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | sys.path.append(os.path.abspath("../mlextra")) 27 | from util import * 28 | from mlutil import * 29 | from mcsim import * 30 | 31 | """ 32 | Statistical test for outlier score to determine suitable score threshold 33 | """ 34 | 35 | def getKeyedOlScores(dirPath, keyLen): 36 | ''' 37 | extracts outlier score from spark output files 38 | ''' 39 | filePaths = getAllFiles(dirPath) 40 | scores = dict() 41 | if keyLen == 0: 42 | kstr = "all" 43 | for fpath in filePaths: 44 | fname = ntpath.basename(fpath) 45 | if fname.startswith("part"): 46 | print("processing {}".format(fpath)) 47 | for rec in fileRecGen(fpath, ","): 48 | if keyLen > 0: 49 | kstr = ",".join(rec[0:keyLen]) 50 | score = float(rec[-2]) 51 | vl = scores.get(kstr) 52 | if vl is None: 53 | vl = list() 54 | scores[kstr] = vl 55 | vl.append(score) 56 | return scores 57 | 58 | def olScoreStat(dirPath, keyLen, shoHist): 59 | """ 60 | upper tail statistic for outlier score 61 | """ 62 | filePaths = getAllFiles(dirPath) 63 | scores = dict() 64 | if keyLen == 0: 65 | kstr = "all" 66 | for fpath in filePaths: 67 | fname = ntpath.basename(fpath) 68 | if fname.startswith("part"): 69 | print("processing {}".format(fpath)) 70 | for rec in fileRecGen(fpath, ","): 71 | if keyLen > 0: 72 | kstr = ",".join(rec[0:keyLen]) 73 | score = float(rec[-2]) 74 | vl = scores.get(kstr) 75 | if vl is None: 76 | vl = list() 77 | scores[kstr] = vl 78 | vl.append(score) 79 | 80 | print("outlier score upper tail stats") 81 | sim = MonteCarloSimulator(None,None,None,None) 82 | for kstr, vl in scores.items(): 83 | sim.setOutput(vl) 84 | if shoHist: 85 | sim.drawHist("outlier score", "score", "freq") 86 | stats = sim.getUpperTailStat(0) 87 | print("key: {}".format(kstr)) 88 | for s in stats: 89 | print("{:.3f} {:.3f}".format(s[0], s[1])) 90 | 91 | def olScoreEvStat(dirPath, keyLen, prTh, exPrTh): 92 | """ 93 | extreme value statistic for outlier score 94 | Paper: Anomaly Detection in Streams with Extreme Value Theory by Siffer, 95 | """ 96 | scores = getKeyedOlScores(dirPath, keyLen) 97 | 98 | sim = MonteCarloSimulator(None,None,None,None) 99 | for kstr, vl in scores.items(): 100 | sim.setOutput(vl) 101 | vth = sim.getCritValue(self, prTh) 102 | 103 | #values above threshold 104 | y = list(filter(lambda v : v > vth, vl)) 105 | ymax = max(y) 106 | ymin = min(y) 107 | ymean = statistics.mean(y) 108 | xsmin = -1.0 / ymax 109 | xsmax = 2.0 * (ymean - ymin) / (ymean * ymean) 110 | delta = (xsmax - xsmin) / 100 111 | for xs in floatRange(xsmin, xsmax, delta): 112 | pass 113 | 114 | 115 | 116 | if __name__ == "__main__": 117 | technique = sys.argv[1] 118 | dirPath = sys.argv[2] 119 | keyLen = int(sys.argv[3]) 120 | 121 | if technique == "sttest": 122 | """ outlier score upper tail statistics """ 123 | shoHist = sys.argv[4] == "hist" if len(sys.argv) == 5 else False 124 | 
olScoreStat(dirPath, keyLen, shoHist) 125 | 126 | elif technique == "exvstat": 127 | """ extreme value statistic for outlier score """ 128 | prTh = float(sys.argv[4]) 129 | exPrTh = float(sys.argv[5]) 130 | olScoreEvStat(dirPath, keyLen, prTh, exPrTh) 131 | else: 132 | exitWithMsg("invalid technique") 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/MahalanobisDistancePredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.chombo.math.MathUtils; 24 | import org.chombo.stats.MultiVariateStatsManager; 25 | import org.chombo.util.BasicUtils; 26 | import org.chombo.util.ConfigUtility; 27 | 28 | import Jama.Matrix; 29 | 30 | 31 | /** 32 | * Predictor based on Mahalanobis distance for multi variate data 33 | * @author pranab 34 | * 35 | */ 36 | public class MahalanobisDistancePredictor extends ModelBasedPredictor { 37 | private MultiVariateStatsManager statsManager; 38 | 39 | /** 40 | * @param config 41 | * @param idOrdinalsParam 42 | * @param attrListParam 43 | * @param fieldDelimParam 44 | * @param statsFilePathParam 45 | * @param seasonalParam 46 | * @param hdfsFileParam 47 | * @param scoreThresholdParam 48 | * @param expConstParam 49 | * @param ignoreMissingStatParam 50 | * @param scoreAggggregationStrtaegyParam 51 | * @throws IOException 52 | */ 53 | public MahalanobisDistancePredictor(Map config, String idOrdinalsParam, String attrListParam, 54 | String fieldDelimParam, String statsFilePathParam, String seasonalParam,String hdfsFileParam, 55 | String scoreThresholdParam, String expConstParam, String ignoreMissingStatParam) 56 | throws IOException { 57 | idOrdinals = ConfigUtility.getIntArray(config, idOrdinalsParam); 58 | attrOrdinals = ConfigUtility.getIntArray(config, attrListParam); 59 | fieldDelim = ConfigUtility.getString(config, fieldDelimParam, ","); 60 | 61 | String statsFilePath = ConfigUtility.getString(config, statsFilePathParam); 62 | boolean hdfsFilePath = ConfigUtility.getBoolean(config, hdfsFileParam); 63 | seasonal = ConfigUtility.getBoolean(config, seasonalParam); 64 | statsManager = new MultiVariateStatsManager(statsFilePath, fieldDelim, hdfsFilePath); 65 | scoreThreshold = ConfigUtility.getDouble(config, scoreThresholdParam); 66 | realTimeDetection = true; 67 | expConst = ConfigUtility.getDouble(config, expConstParam); 68 | ignoreMissingStat = ConfigUtility.getBoolean(config, ignoreMissingStatParam); 69 | } 70 | 71 | @Override 72 | public double execute(String entityID, String record) { 73 | // TODO Auto-generated method stub 74 | return 0; 75 | } 76 | 77 | @Override 78 | public double execute(String[] items, String compKey) { 79 | double 
score = 0; 80 | if (statsManager.statsExists(compKey)) { 81 | //extract input vector and subtract mean vector 82 | double[] data = BasicUtils.extractFieldsAsDoubleArray(items , attrOrdinals); 83 | Matrix input = MathUtils.createRowMatrix(data); 84 | Matrix inputOffset = MathUtils.subtractMatrix(input, statsManager.getMeanVec(compKey)); 85 | Matrix inputOffsetTr = MathUtils.transposeMatrix(inputOffset); 86 | 87 | 88 | //mahalanobis distance 89 | Matrix invCovar = statsManager.getInvCoVarMatrix(compKey); 90 | Matrix maDist = MathUtils.multiplyMatrix(inputOffset, invCovar); 91 | maDist = MathUtils.multiplyMatrix(maDist, inputOffsetTr); 92 | score = MathUtils.scalarFromMatrix(maDist); 93 | } else { 94 | BasicUtils.assertCondition(!ignoreMissingStat, "missing stats for key " + compKey ); 95 | } 96 | 97 | //exponential normalization 98 | if (expConst > 0) { 99 | score = BasicUtils.expScale(expConst, score); 100 | } 101 | 102 | scoreAboveThreshold = score > scoreThreshold; 103 | return score; 104 | } 105 | 106 | @Override 107 | public boolean isValid(String compKey) { 108 | return statsManager.statsExists(compKey); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/EstimatedMetaProbabilityBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | /** 29 | * Based on probability of probability p(f(y) < f(x)). 
f(x) is density function 30 | * @author pranab 31 | * 32 | */ 33 | public class EstimatedMetaProbabilityBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 34 | 35 | public EstimatedMetaProbabilityBasedPredictor(Map conf) { 36 | super(conf); 37 | } 38 | 39 | /** 40 | * @param config 41 | * @param idOrdinalsParam 42 | * @param attrListParam 43 | * @param distrFilePathParam 44 | * @param hdfsFileParam 45 | * @param schemaFilePathParam 46 | * @param attrWeightParam 47 | * @param seasonalParam 48 | * @param fieldDelimParam 49 | * @param scoreThresholdParam 50 | * @param ignoreMissingDistrParam 51 | * @throws IOException 52 | */ 53 | public EstimatedMetaProbabilityBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 54 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 55 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 56 | String scoreStrategyParam, String expConstParam, String scoreAggggregationStrtaegyParam) 57 | throws IOException { 58 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 59 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, scoreStrategyParam, expConstParam, 60 | scoreAggggregationStrtaegyParam); 61 | } 62 | 63 | /** 64 | * @param config 65 | * @param distrFilePathParam 66 | * @param attrWeightParam 67 | * @param scoreThresholdParam 68 | * @param fieldDelimParam 69 | * @throws IOException 70 | */ 71 | public EstimatedMetaProbabilityBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 72 | String scoreThresholdParam, String fieldDelimParam) 73 | throws IOException { 74 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 75 | } 76 | 77 | @Override 78 | public double execute(String[] items, String compKey) { 79 | double score = 0; 80 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 81 | double thisScore = 0; 82 | for (int ord : attrOrdinals) { 83 | String keyWithFldOrd = compKey + fieldDelim + ord; 84 | double val = Double.parseDouble(items[ord]); 85 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 86 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 87 | if (null != hist) { 88 | double distr = hist.findMetaDistr(val); 89 | if (scoreStrategy.equals("inverse")) { 90 | thisScore = 1.0 - distr; 91 | } else { 92 | if (distr > 0) { 93 | thisScore = -Math.log(distr); 94 | } else { 95 | thisScore = 20.0; 96 | } 97 | } 98 | scoreAggregator.addScore(thisScore); 99 | } else { 100 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 101 | scoreAggregator.addScore(); 102 | } 103 | } 104 | //aggregate score 105 | score = getAggregateScore(scoreAggregator); 106 | 107 | //exponential normalization 108 | if (expConst > 0) { 109 | score = BasicUtils.expScale(expConst, score); 110 | } 111 | 112 | scoreAboveThreshold = score > scoreThreshold; 113 | return score; 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /python/app/bls.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | # avenir-python: Machine Learning 4 | # Author: Pranab Ghosh 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); you 7 | # may not use this file except in 
compliance with the License. You may 8 | # obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 15 | # implied. See the License for the specific language governing 16 | # permissions and limitations under the License. 17 | 18 | import os 19 | import sys 20 | from random import randint 21 | import time 22 | import uuid 23 | import threading 24 | import matplotlib.pyplot as plt 25 | sys.path.append(os.path.abspath("../lib")) 26 | from util import * 27 | from sampler import * 28 | 29 | def createAnomaly(high): 30 | if high: 31 | reading = randomFloat(120, 200) 32 | else: 33 | reading = randomFloat(60, 80) 34 | return reading 35 | 36 | if __name__ == "__main__": 37 | op = sys.argv[1] 38 | 39 | #device stats 40 | if op == "stat": 41 | #normal mean 80 - 100 sd 1 - 5 42 | #anomaly mean 120 - 160 sd 1 - 5 43 | numDevs = int(sys.argv[2]) 44 | mmin = int(sys.argv[3]) 45 | mmax = int(sys.argv[4]) 46 | smin = int(sys.argv[5]) 47 | smax = int(sys.argv[6]) 48 | for i in range(numDevs): 49 | mean = randomFloat(mmin, mmax) 50 | sd = randomFloat(smin, smax) 51 | devId = genID(12) 52 | #print "%s,%.3f,%.3f" %(devId, mean, sd) 53 | print("{},{:.3f},{:.3f}".format(devId, mean, sd)) 54 | 55 | #generate reading 56 | elif op == "gen": 57 | statFile = sys.argv[2] 58 | numDays = int(sys.argv[3]) 59 | modeNorm = (sys.argv[4] == "normal") 60 | 61 | devices = [] 62 | for rec in fileRecGen(statFile, ","): 63 | ds = (rec[0], float(rec[1]), float(rec[2])) 64 | devices.append(ds) 65 | 66 | 67 | numDevs = len(devices) 68 | distrs = list(map(lambda d: GaussianRejectSampler(d[1],d[2]), devices)) 69 | 70 | curTime = int(time.time()) 71 | pastTime = curTime - (numDays + 1) * secInDay 72 | pastTime = (pastTime / secInDay) * secInDay + secInHour * 15 73 | sampTime = pastTime 74 | sampIntv = secInDay 75 | 76 | anm = dict() 77 | anmDesc = dict() 78 | while(sampTime < curTime): 79 | for i in range(numDevs): 80 | d = devices[i] 81 | did = d[0] 82 | ts = sampTime + randint(-1000, 1000) 83 | sampled = False 84 | anomalyRate = 10 if (modeNorm) else 20 85 | if isEventSampled(anomalyRate): 86 | if not did in anm: 87 | #create anomaly 88 | high = isEventSampled(80) 89 | reading = createAnomaly(high) 90 | appendKeyedList(anm, did, reading) 91 | length = randint(1, 2) if(modeNorm) else randint(3, 7) 92 | desc = (length, high) 93 | anmDesc[did] = desc 94 | sampled = True 95 | #print "**** anomaly created %s, %d" %(did, reading) 96 | 97 | if not sampled: 98 | if did in anm: 99 | # ongoing anomaly 100 | ans = anm[did] 101 | desc = anmDesc[did] 102 | towardsNorm = len(ans) == desc[0] 103 | an = ans[0] 104 | if len(ans) == desc[0]: 105 | # moving toward normal from anomaly 106 | if isEventSampled(60): 107 | sampled = True 108 | reading = 0.85 * an if(desc[1]) else 1.15 * an 109 | #print "**** moving back to normal %s, %d" %(did, reading) 110 | del anm[did] 111 | del anmDesc[did] 112 | elif len(ans) < desc[0]: 113 | # continue anomaly 114 | reading = createAnomaly(desc[1]) 115 | appendKeyedList(anm, did, reading) 116 | sampled = True 117 | #print "**** anomaly continued %s, %d" %(did, reading) 118 | 119 | if not sampled: 120 | # normal 121 | reading = distrs[i].sample() 122 | 123 | #print "%s,%d,%d" %(did, ts, int(reading)) 124 | print("{},{},{}".format(did, ts, int(reading))) 125 | 
sampTime += sampIntv 126 | 127 | elif op == "oplot": 128 | #plot outliers 129 | fpath = sys.argv[2] 130 | mid = sys.argv[3] 131 | filt = lambda r : r[0] == mid 132 | dvalues = list(map(lambda r : float(r[3]), fileFiltRecGen(fpath, filt))) 133 | xvalues = list(map(lambda r : int(r[1]), fileFiltRecGen(fpath, filt))) 134 | plt.plot(xvalues, dvalues) 135 | plt.title("outlier score") 136 | plt.show() 137 | -------------------------------------------------------------------------------- /python/app/cpu_usage.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python3 2 | 3 | import os 4 | import sys 5 | from random import randint 6 | import time 7 | import uuid 8 | import threading 9 | sys.path.append(os.path.abspath("../lib")) 10 | from util import * 11 | from sampler import * 12 | 13 | op = sys.argv[1] 14 | secInHour = 60 * 60 15 | secInDay = 24 * secInHour 16 | secInWeek = 7 * secInDay 17 | secInYear = 365 * secInDay 18 | 19 | wkDayMean = 60 20 | wkDayStdDev = 12 21 | wkEndMean = 30 22 | wkEndStdDev = 8 23 | 24 | if op == "usage": 25 | numDays = int(sys.argv[2]) 26 | sampIntv = int(sys.argv[3]) 27 | numServers = int(sys.argv[4]) 28 | 29 | outDayInWeek = True 30 | s = 5 31 | if len(sys.argv) > 5: 32 | #print(sys.argv[5]) 33 | if sys.argv[5] == "false" or sys.argv[5] == "f": 34 | outDayInWeek = False 35 | s = 6 36 | 37 | serverList = None 38 | if len(sys.argv) > s: 39 | #server ID from stats file 40 | sfile = sys.argv[s] 41 | #print(sfile) 42 | servers = set() 43 | for rec in fileRecGen(sfile, ","): 44 | #print(rec[0]) 45 | servers.add(rec[0]) 46 | serverList = list(servers) 47 | else: 48 | #generate server ID 49 | serverList = list() 50 | for i in range(numServers): 51 | serverList.append(genID(10)) 52 | 53 | curTime = int(time.time()) 54 | pastTime = curTime - (numDays + 1) * secInDay 55 | sampTime = pastTime 56 | usageDistr = [GaussianRejectSampler(wkDayMean,wkDayStdDev), GaussianRejectSampler(wkEndMean,wkEndStdDev)] 57 | 58 | while(sampTime < curTime): 59 | secIntoDay = sampTime % secInDay 60 | #hourIntoDay = secIntoDay / secInHour 61 | 62 | secIntoWeek = sampTime % secInWeek 63 | daysIntoWeek = int(secIntoWeek / secInDay) 64 | 65 | if daysIntoWeek >= 0 and daysIntoWeek <= 4: 66 | distr = usageDistr[0] 67 | else: 68 | distr = usageDistr[1] 69 | 70 | for server in serverList: 71 | usage = distr.sample() 72 | if (usage < 0): 73 | usage = 5 74 | elif usage > 100: 75 | usage = 100 76 | usage = int(usage) 77 | st = sampTime + randint(-2,2) 78 | if outDayInWeek: 79 | #print "%s,%d,%d,%d" %(server, st, daysIntoWeek, usage) 80 | print("{},{},{},{}".format(server, st, daysIntoWeek, usage)) 81 | else: 82 | #print "%s,%d,%d" %(server, st, usage) 83 | print("{},{},{}".format(server, st, usage)) 84 | 85 | sampTime = sampTime + sampIntv 86 | 87 | elif op == "anomaly": 88 | fileName = sys.argv[2] 89 | count = 0 90 | for rec in fileRecGen(fileName, ","): 91 | if isEventSampled(8): 92 | dow = int(rec[2]) 93 | if dow < 5: 94 | rec[3] = str(randint(94, 100)) 95 | else: 96 | rec[3] = str(randint(54, 100)) 97 | count += 1 98 | mrec = ",".join(rec) 99 | print(mrec) 100 | #print "num of anomalous records " + str(count) 101 | 102 | elif op == "feedback": 103 | fileName = sys.argv[2] 104 | curThreshold = float(sys.argv[3]) 105 | newThreshold = float(sys.argv[4]) 106 | margin = curThreshold + 0.6 * (newThreshold - curThreshold) 107 | count = 0 108 | for rec in fileRecGen(fileName, ","): 109 | score = float(rec[4]) 110 | label = rec[5] 111 | if newThreshold > 
curThreshold: 112 | #false positive 113 | if label == "O": 114 | if score > newThreshold: 115 | flabel = "O" 116 | cl = "T" 117 | else: 118 | if score < margin or isEventSampled(90): 119 | flabel = "N" 120 | cl = "F" 121 | count += 1 122 | else: 123 | flabel = "O" 124 | cl = "T" 125 | else: 126 | flabel = "N" 127 | cl = "F" 128 | else: 129 | #false negative 130 | if label == "O": 131 | flabel = "O" 132 | cl = "T" 133 | else: 134 | if score > newThreshold: 135 | if score > margin or isEventSampled(90): 136 | flabel = "O" 137 | cl = "T" 138 | count += 1 139 | else: 140 | flabel = "N" 141 | cl = "F" 142 | else: 143 | flabel = "N" 144 | cl = "F" 145 | rec.append(flabel) 146 | rec.append(cl) 147 | mrec = ",".join(rec) 148 | print(mrec) 149 | #print count 150 | 151 | elif op == "addTrend": 152 | fileName = sys.argv[2] 153 | trendYearlyPercentRate = float(sys.argv[3]) 154 | trendPerSec = trendYearlyPercentRate / secInYear 155 | start = None 156 | for rec in fileRecGen(fileName, ","): 157 | ts = int(rec[1]) 158 | usage = float(rec[3]) 159 | if start is None: 160 | start = ts 161 | else: 162 | usage = usage + (ts - start) * trendPerSec 163 | usageStr = "%.3f" %(usage) 164 | rec[3] = usageStr 165 | mrec = ",".join(rec) 166 | print(mrec) 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/PseudoRelevanceThresholdFinder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | 28 | /** 29 | * Finds score threshold based on pseudo relevance, e.g. 
top n or top n percent 30 | * @author pranab 31 | * 32 | */ 33 | object PseudoRelevanceThresholdFinder extends JobConfiguration { 34 | /** 35 | * @param args 36 | * @return 37 | */ 38 | def main(args: Array[String]) { 39 | val appName = "outlierCounter" 40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 41 | val config = createConfig(configFile) 42 | val sparkConf = createSparkConf(appName, config, false) 43 | val sparkCntxt = new SparkContext(sparkConf) 44 | val appConfig = config.getConfig(appName) 45 | 46 | //configuration params 47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length") 50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 51 | val relevanceThreshold = getMandatoryDoubleParam(appConfig, "relevance.threshold", "missing relevance threshold") 52 | val relevanceAsPercentage = getBooleanParamOrElse(appConfig, "relevance.asPercentage", true) 53 | val minSampleCount = getMandatoryIntParam(appConfig, "sample.minCount", "missing min sample count") 54 | val thresholdPath = getMandatoryStringParam(appConfig, "threshold.filePath", "missing stat file path") 55 | val thresholdMap = BasicUtils.getKeyedValues(thresholdPath, keyLen, keyLen) 56 | val defaultThreshold = getMandatoryDoubleParam(appConfig, "threshold.default", "missing default threshold") 57 | val debugOn = appConfig.getBoolean("debug.on") 58 | val saveOutput = appConfig.getBoolean("save.output") 59 | 60 | //input 61 | val data = sparkCntxt.textFile(inputPath) 62 | 63 | val keyedThresholds = data.map(line => { 64 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 65 | val keyRec = Record(items, 0, keyLen) 66 | val last = items.length - 1 67 | val score = items(last -1).toDouble 68 | (keyRec, score) 69 | }).groupByKey.map(r => { 70 | val key = r._1 71 | val scores = r._2.toList 72 | val sortedScores = scores.sortWith((v1,v2) => v1 > v2) 73 | val size = sortedScores.length 74 | val threshold = 75 | if (size > minSampleCount) { 76 | //threshold as average of 3 scores around the relevance cutoff index 77 | val thresholdIndex = 78 | if (relevanceAsPercentage) { 79 | ((size * relevanceThreshold) / 100).toInt - 1 80 | } else { 81 | val indx = relevanceThreshold.toInt - 1 82 | if (indx > size-2) { 83 | throw new IllegalStateException("absolute threshold value too big") 84 | } 85 | indx 86 | } 87 | sortedScores.slice(thresholdIndex - 1, thresholdIndex + 2).sum / 3 88 | } else { 89 | //use existing threshold or default 90 | val keyStr = key.toString(fieldDelimOut) 91 | if (thresholdMap.containsKey(keyStr)) thresholdMap.get(keyStr).toDouble 92 | else defaultThreshold 93 | } 94 | key.toString(fieldDelimOut) + fieldDelimOut + BasicUtils.formatDouble(threshold, precision) 95 | }) 96 | 97 | if (debugOn) { 98 | val records = keyedThresholds.collect.slice(0, 20) 99 | records.foreach(r => println(r)) 100 | } 101 | 102 | if(saveOutput) { 103 | keyedThresholds.saveAsTextFile(outputPath) 104 | } 105 | 106 | } 107 | } -------------------------------------------------------------------------------- /src/main/java/org/beymani/util/DataStreamSchema.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the 
License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.util; 19 | 20 | import java.io.FileInputStream; 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | import java.io.Serializable; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | import org.chombo.util.BasicUtils; 28 | import org.codehaus.jackson.annotate.JsonIgnoreProperties; 29 | import org.codehaus.jackson.map.ObjectMapper; 30 | 31 | @JsonIgnoreProperties(ignoreUnknown = true) 32 | public class DataStreamSchema implements Serializable { 33 | private List<DataStream> dataStreams; 34 | 35 | /** 36 | * 37 | */ 38 | public DataStreamSchema() { 39 | } 40 | 41 | /** 42 | * @return 43 | */ 44 | public List<DataStream> getDataStreams() { 45 | return dataStreams; 46 | } 47 | 48 | /** 49 | * @param dataStreams 50 | */ 51 | public void setDataStreams(List<DataStream> dataStreams) { 52 | this.dataStreams = dataStreams; 53 | } 54 | 55 | /** 56 | * @param type 57 | * @return 58 | */ 59 | public DataStream findByType(String type) { 60 | DataStream stream = null; 61 | for (DataStream daStrm : dataStreams) { 62 | if (daStrm.getType().equals(type)) { 63 | stream = daStrm; 64 | break; 65 | } 66 | } 67 | return stream; 68 | } 69 | 70 | /** 71 | * @param type 72 | * @return 73 | */ 74 | public List<DataStream> findAllByType(String type) { 75 | List<DataStream> streams = new ArrayList<DataStream>(); 76 | for (DataStream daStrm : dataStreams) { 77 | if (daStrm.getType().equals(type)) { 78 | streams.add(daStrm); 79 | } 80 | } 81 | return streams; 82 | } 83 | 84 | /** 85 | * @param type 86 | * @return 87 | */ 88 | public DataStream findByTypeAndId(String type, String id) { 89 | DataStream stream = null; 90 | for (DataStream daStrm : dataStreams) { 91 | if (daStrm.getId().equals("*")) { 92 | if (daStrm.getType().equals(type)) { 93 | boolean done = false; 94 | List<DataStream> parents = findAllByType(daStrm.getParentType()); 95 | for (DataStream pa : parents) { 96 | List<String> children = pa.getChildrenId(); 97 | BasicUtils.assertNotNull(children, "missing child ID list in parent"); 98 | if (children.contains(id)) { 99 | BasicUtils.assertCondition(daStrm.getParentId().equals(pa.getId()), "mismatched parent ID"); 100 | stream = daStrm; 101 | done = true; 102 | break; 103 | } 104 | } 105 | if (done) 106 | break; 107 | } 108 | } else { 109 | if (daStrm.getType().equals(type) && daStrm.getId().equals(id)) { 110 | stream = daStrm; 111 | break; 112 | } 113 | } 114 | } 115 | return stream; 116 | } 117 | 118 | /** 119 | * @param type 120 | * @param id 121 | * @return 122 | */ 123 | public DataStream findParent(String type, String id) { 124 | DataStream parentStream = null; 125 | DataStream stream = findByType(type); 126 | BasicUtils.assertNotNull(stream, "could not find data stream object"); 127 | parentStream = findByType(stream.getParentType()); 128 | if (!parentStream.isSingleton()) { 129 | //instance based 130 | stream = findByTypeAndId(type, id); 131 | parentStream = findByTypeAndId(stream.getParentType(), stream.getParentId()); 132 | } 133 | return parentStream; 134 | } 135 | 136 | /** 137 | * @param type 138 | * @return 139 | */ 140 | public String findParentType(String type) { 
141 | DataStream stream = findByType(type); 142 | BasicUtils.assertNotNull(stream, "coud not find data stream object"); 143 | return stream.getParentType(); 144 | } 145 | 146 | /** 147 | * @param path 148 | * @return 149 | * @throws IOException 150 | */ 151 | public static DataStreamSchema loadDataStreamSchema(String path) throws IOException { 152 | InputStream fs = new FileInputStream(path); 153 | ObjectMapper mapper = new ObjectMapper(); 154 | DataStreamSchema schema = mapper.readValue(fs, DataStreamSchema.class); 155 | return schema; 156 | } 157 | 158 | } 159 | -------------------------------------------------------------------------------- /src/main/java/org/beymani/predictor/InterPercentileDifferenceBasedPredictor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.predictor; 19 | 20 | import java.io.IOException; 21 | import java.util.Map; 22 | 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.beymani.util.OutlierScoreAggregator; 25 | import org.chombo.stats.HistogramStat; 26 | import org.chombo.util.BasicUtils; 27 | 28 | /** 29 | * Inter percentile difference (25% and 75%) based predictor 30 | * @author pranab 31 | * 32 | */ 33 | public class InterPercentileDifferenceBasedPredictor extends EsimatedAttrtibuteProbabilityBasedPredictor { 34 | private static final int QUARTER_PERECENTILE = 25; 35 | private static final int THREE_QUARTER_PERECENTILE = 75; 36 | 37 | /** 38 | * @param conf 39 | */ 40 | public InterPercentileDifferenceBasedPredictor(Map conf) { 41 | super(conf); 42 | } 43 | 44 | /** 45 | * @param config 46 | * @param idOrdinalsParam 47 | * @param attrListParam 48 | * @param distrFilePathParam 49 | * @param hdfsFileParam 50 | * @param schemaFilePathParam 51 | * @param attrWeightParam 52 | * @param seasonalParam 53 | * @param fieldDelimParam 54 | * @param scoreThresholdParam 55 | * @param ignoreMissingDistrParam 56 | * @throws IOException 57 | */ 58 | public InterPercentileDifferenceBasedPredictor(Map config,String idOrdinalsParam, String attrListParam, 59 | String distrFilePathParam, String hdfsFileParam,String schemaFilePathParam, String attrWeightParam, 60 | String seasonalParam, String fieldDelimParam,String scoreThresholdParam, String ignoreMissingDistrParam, 61 | String expConstParam, String scoreAggggregationStrtaegyParam) 62 | throws IOException { 63 | super(config, idOrdinalsParam, attrListParam, distrFilePathParam,hdfsFileParam, schemaFilePathParam, attrWeightParam, 64 | seasonalParam, fieldDelimParam, scoreThresholdParam,ignoreMissingDistrParam, "score.strategy", expConstParam, 65 | scoreAggggregationStrtaegyParam); 66 | } 67 | 68 | /** 69 | * @param config 70 | * @param distrFilePathParam 71 | * @param attrWeightParam 72 | * @param scoreThresholdParam 73 | * @param fieldDelimParam 74 | * @throws IOException 75 | */ 
76 | public InterPercentileDifferenceBasedPredictor(Configuration config,String distrFilePathParam, String attrWeightParam, 77 | String scoreThresholdParam, String fieldDelimParam) 78 | throws IOException { 79 | super(config, distrFilePathParam, attrWeightParam, scoreThresholdParam,fieldDelimParam); 80 | } 81 | 82 | /* (non-Javadoc) 83 | * @see org.beymani.predictor.EsimatedAttrtibuteProbabilityBasedPredictor#execute(java.lang.String[], java.lang.String) 84 | */ 85 | @Override 86 | public double execute(String[] items, String compKey) { 87 | double score = 0; 88 | OutlierScoreAggregator scoreAggregator = new OutlierScoreAggregator(attrWeights.length, attrWeights); 89 | double thisScore = 0; 90 | for (int ord : attrOrdinals) { 91 | String keyWithFldOrd = compKey + fieldDelim + ord; 92 | double val = Double.parseDouble(items[ord]); 93 | System.out.println("keyWithFldOrd " + keyWithFldOrd); 94 | HistogramStat hist = keyedHist.get(keyWithFldOrd); 95 | if (null != hist) { 96 | double quarterPercentile = hist.getQuantile(QUARTER_PERECENTILE); 97 | double threeQuarterPercentile = hist.getQuantile(THREE_QUARTER_PERECENTILE); 98 | double percentileDiff = threeQuarterPercentile - quarterPercentile; 99 | if (val < quarterPercentile) { 100 | thisScore = (quarterPercentile - val) / percentileDiff; 101 | } else if (val > threeQuarterPercentile){ 102 | thisScore = (val - threeQuarterPercentile) / percentileDiff; 103 | } 104 | scoreAggregator.addScore(thisScore); 105 | } else { 106 | BasicUtils.assertCondition(!ignoreMissingDistr, "missing distr for key " + keyWithFldOrd); 107 | scoreAggregator.addScore(); 108 | } 109 | } 110 | //aggregate score 111 | score = getAggregateScore(scoreAggregator); 112 | 113 | //exponential normalization 114 | if (expConst > 0) { 115 | score = BasicUtils.expScale(expConst, score); 116 | } 117 | 118 | scoreAboveThreshold = score > scoreThreshold; 119 | return score; 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/OutlierScoreLevelShift.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 
16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import scala.Array.canBuildFrom 21 | import scala.collection.JavaConverters._ 22 | import org.apache.spark.SparkContext 23 | import org.apache.spark.rdd.RDD.rddToPairRDDFunctions 24 | import org.chombo.spark.common.GeneralUtility 25 | import org.chombo.spark.common.JobConfiguration 26 | import org.chombo.spark.common.Record 27 | import org.chombo.util.BasicUtils 28 | import org.hoidla.window.SizeBoundFloatStatsWindow 29 | 30 | /** 31 | * Outlier detection based on level shift outlier score from any algorithm 32 | * @author pranab 33 | */ 34 | object OutlierScoreLevelShift extends JobConfiguration with GeneralUtility { 35 | 36 | /** 37 | * @param args 38 | * @return 39 | */ 40 | def main(args: Array[String]) { 41 | val appName = "outlierScoreLevelShift" 42 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 43 | val config = createConfig(configFile) 44 | val sparkConf = createSparkConf(appName, config, false) 45 | val sparkCntxt = new SparkContext(sparkConf) 46 | val appConfig = config.getConfig(appName) 47 | 48 | //configuration params 49 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 50 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 51 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal") 52 | val keyLen = getMandatoryIntParam(appConfig, "key.length", "missing key length") 53 | val longWindowSize = getMandatoryIntParam(appConfig, "window.longSize", "missing long window size") 54 | val shortWindowSize = getMandatoryIntParam(appConfig, "window.shortSize", "missing short window size") 55 | val minZscore = getMandatoryDoubleParam(appConfig, "zscore.min", "missing min z score") 56 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false) 57 | val saveOutput = getBooleanParamOrElse(appConfig,"save.output", true) 58 | 59 | //input 60 | val data = sparkCntxt.textFile(inputPath) 61 | 62 | val taggedData = data.map(line => { 63 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 64 | val key = Record(items, 0, keyLen) 65 | (key, items) 66 | }).groupByKey.flatMap(r => { 67 | val longWindow = new SizeBoundFloatStatsWindow(longWindowSize) 68 | val shortWindow = new SizeBoundFloatStatsWindow(shortWindowSize) 69 | val values = r._2.toArray.sortBy(v => { 70 | v(seqFieldOrd).toLong 71 | }) 72 | val newTags = values.map(v => { 73 | val score = v(v.size - 2).toDouble 74 | val tag = v(v.size - 1) 75 | longWindow.add(score) 76 | shortWindow.add(score) 77 | var newTag = "" 78 | if (longWindow.isFull()) { 79 | val loMean = longWindow.getMean() 80 | val loStdDev = longWindow.getStdDev() 81 | val shMean = shortWindow.getMean() 82 | val levelBasedScore = (shMean - loMean) / loStdDev; 83 | newTag = if (levelBasedScore > minZscore) "O" else "N" 84 | } else { 85 | newTag = tag 86 | } 87 | val rec = Record(2) 88 | rec.add(tag,newTag) 89 | }) 90 | 91 | //propagate outlier tag 92 | for (i <- longWindowSize to newTags.length -1) { 93 | if(newTags(i).getString(1) == "O") { 94 | for (j <- i - shortWindowSize + 1 to i - 1) { 95 | val tag = if (newTags(j).getString(0) == "I") "I" else "O" 96 | val rec = Record(2) 97 | rec.add(newTags(j).getString(0), tag) 98 | newTags(j) = rec 99 | } 100 | } 101 | } 102 | 103 | val recValues = values.map(v => Record(v)) 104 | newTags.zip(recValues).map(r => { 105 | val newTag = r._1.getString(1) 106 | val rec = r._2.getString(0) 107 | rec + fieldDelimOut + newTag 
108 | }) 109 | }) 110 | 111 | if (debugOn) { 112 | val records = taggedData.collect 113 | records.slice(0, 100).foreach(r => println(r)) 114 | } 115 | 116 | if(saveOutput) { 117 | taggedData.saveAsTextFile(outputPath) 118 | } 119 | 120 | } 121 | } -------------------------------------------------------------------------------- /resource/and_spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "crInput") 11 | echo "args: num_of_days time_interval(sec) num_of_servers output_file" 12 | ./cpu_usage.py usage $2 $3 $4 true > $5 13 | ls -l $5 14 | ;; 15 | 16 | "crTestInput") 17 | ./cpu_usage.py usage $2 $3 $4 true $5 > $6 18 | ls -l $6 19 | ;; 20 | 21 | "insOutliers") 22 | echo "args: normal_data_file output_file" 23 | ./cpu_usage.py anomaly $2 > $3 24 | ls -l $3 25 | ;; 26 | 27 | "cpModData") 28 | echo "args: modeling_data_file " 29 | rm $PROJECT_HOME/bin/beymani/input/olp/* 30 | rm $PROJECT_HOME/bin/beymani/nas/olp/* 31 | cp $2 $PROJECT_HOME/bin/beymani/input/nas/ 32 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/ 33 | ls -l $PROJECT_HOME/bin/beymani/input/nas 34 | ls -l $PROJECT_HOME/bin/beymani/input/olp 35 | ;; 36 | 37 | "cpTestData") 38 | echo "args: test_data_file " 39 | rm $PROJECT_HOME/bin/beymani/input/olp/* 40 | cp $2 $PROJECT_HOME/bin/beymani/input/olp/ 41 | ls -l $PROJECT_HOME/bin/beymani/input/olp 42 | ;; 43 | 44 | "numStat") 45 | echo "running NumericalAttrStats Spark job" 46 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 47 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/nas/cusage.txt 48 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/nas 49 | rm -rf ./output/nas 50 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 51 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf 52 | ls -l $PROJECT_HOME/bin/beymani/output/nas/ 53 | ;; 54 | 55 | "crStatsFile") 56 | echo "copying and consolidating stats file" 57 | rm $PROJECT_HOME/bin/beymani/output/nas/_SUCCESS 58 | SFILE=$PROJECT_HOME/bin/beymani/other/olp/stats.txt 59 | cp /dev/null $SFILE 60 | for f in $PROJECT_HOME/bin/beymani/output/nas/* 61 | do 62 | echo "Copying file $f ..." 
63 | cat $f >> $SFILE 64 | done 65 | ls -l $PROJECT_HOME/bin/beymani/other/olp 66 | ;; 67 | 68 | "olPred") 69 | echo "running StatsBasedOutlierPredictor Spark job" 70 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 71 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/olp/* 72 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/olp 73 | rm -rf ./output/olp 74 | rm -rf ./other/olp/clean 75 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 76 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf 77 | rm ./output/olp/_SUCCESS 78 | for f in ./output/olp/* 79 | do 80 | echo "number of records in $f" 81 | wc -l $f 82 | done 83 | 84 | for f in ./output/olp/* 85 | do 86 | echo "number of outliers in $f" 87 | cat $f | grep ,O | wc -l 88 | done 89 | 90 | ;; 91 | 92 | "crCleanFile") 93 | echo "copying, consolidating and moving clean training data file" 94 | rm $PROJECT_HOME/bin/beymani/other/olp/clean/_SUCCESS 95 | CFILE=$PROJECT_HOME/bin/beymani/other/olp/clean/cusage.txt 96 | cp /dev/null $CFILE 97 | echo "creating clean file $CFILE" 98 | for f in $PROJECT_HOME/bin/beymani/other/olp/clean/* 99 | do 100 | echo "Copying file $f ..." 101 | cat $f >> $CFILE 102 | done 103 | echo "copying clean file to model input directory" 104 | mv $PROJECT_HOME/bin/beymani/input/nas/cusage.txt $PROJECT_HOME/bin/beymani/other/nas/cusage_1.txt 105 | mv $CFILE $PROJECT_HOME/bin/beymani/input/nas/cusage.txt 106 | echo "backing up current model file" 107 | mv $PROJECT_HOME/bin/beymani/other/olp/stats.txt $PROJECT_HOME/bin/beymani/other/olp/stats_1.txt 108 | ls -l $PROJECT_HOME/bin/beymani/input/nas/ 109 | ;; 110 | 111 | 112 | "mvOutlFile") 113 | echo "moving outlier output file" 114 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00000 > $PROJECT_HOME/bin/beymani/other/olp/outl.txt 115 | cat $PROJECT_HOME/bin/beymani/output/olp/part-00001 >> $PROJECT_HOME/bin/beymani/other/olp/outl.txt 116 | ;; 117 | 118 | "thLearn") 119 | echo "running ThresholdLearner Spark job" 120 | CLASS_NAME=org.beymani.spark.common.ThresholdLearner 121 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/thl/olf.txt 122 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/thl 123 | rm -rf ./output/thl 124 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 125 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT and.conf 126 | ;; 127 | 128 | "tempAggr") 129 | echo "running TemporalAggregator Spark job" 130 | CLASS_NAME=org.chombo.spark.explore.TemporalAggregator 131 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/teg/cusage.txt 132 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/teg 133 | rm -rf ./output/teg 134 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 135 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT and.conf 136 | ;; 137 | 138 | 139 | *) 140 | echo "unknown operation $1" 141 | ;; 142 | 143 | esac -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/common/OutlierCounter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. 
You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.common 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | 28 | /** 29 | * Outlier count statistics 30 | * @author pranab 31 | * 32 | */ 33 | object OutlierCounter extends JobConfiguration { 34 | /** 35 | * @param args 36 | * @return 37 | */ 38 | def main(args: Array[String]) { 39 | val appName = "outlierCounter" 40 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 41 | val config = createConfig(configFile) 42 | val sparkConf = createSparkConf(appName, config, false) 43 | val sparkCntxt = new SparkContext(sparkConf) 44 | val appConfig = config.getConfig(appName) 45 | 46 | //configuration params 47 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 48 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 49 | val keyLen = getMandatoryIntParam(appConfig, "data.keyLen", "missing key length") 50 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 51 | val insertTimeStamp = getBooleanParamOrElse(appConfig, "output.insertTmStmp", false) 52 | val tmStmp = if (insertTimeStamp) System.currentTimeMillis() else 0 53 | val normTag = "N" 54 | val outlierTag = "O" 55 | val indeterTag = "I" 56 | val totalTag = "T" 57 | val debugOn = appConfig.getBoolean("debug.on") 58 | val saveOutput = appConfig.getBoolean("save.output") 59 | 60 | //input 61 | val data = sparkCntxt.textFile(inputPath) 62 | 63 | //key by record key and record status 64 | val keyedCounters = data.flatMap(line => { 65 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 66 | val counters = for (i <- 0 to 1) yield { 67 | val keyRec = Record(keyLen+1, items, 0, keyLen) 68 | if (i == 0) keyRec.addString(items(items.length-1)) 69 | else keyRec.addString(totalTag) 70 | (keyRec, 1) 71 | } 72 | counters 73 | }).reduceByKey((v1,v2) => v1+v2) 74 | 75 | //formatted count statistics for each key 76 | val formattedCountRecs = keyedCounters.map(r => { 77 | val keyRec = Record(r._1, 0, keyLen) 78 | val valRec = Record(2) 79 | valRec.addString(r._1.getString(keyLen)) 80 | valRec.addInt(r._2) 81 | (keyRec, valRec) 82 | }).groupByKey().map(r => { 83 | val key = r._1 84 | val values = r._2.toArray 85 | var outlierCount = 0 86 | var indeterCount = 0 87 | var normCount = 0 88 | var totalCount = 0 89 | for (v <- values) { 90 | v.getString(0) match { 91 | case `outlierTag` => outlierCount = v.getInt(1) 92 | case `indeterTag` => indeterCount = v.getInt(1) 93 | case `normTag` => normCount = v.getInt(1) 94 | case `totalTag` => totalCount = v.getInt(1) 95 | } 96 | } 97 | val outlierPercent = (outlierCount * 100).toDouble / totalCount 98 | val indeterPercent = (indeterCount * 100).toDouble / totalCount 99 | val normPercent = (normCount * 100).toDouble / totalCount 100 | 101 | val stBld = new 
StringBuilder(key.toString(fieldDelimOut)) 102 | if (insertTimeStamp) 103 | stBld.append(fieldDelimOut).append(tmStmp) 104 | stBld. 105 | append(fieldDelimOut).append(outlierCount). 106 | append(fieldDelimOut).append(BasicUtils.formatDouble(outlierPercent, precision)). 107 | append(fieldDelimOut).append(indeterCount). 108 | append(fieldDelimOut).append(BasicUtils.formatDouble(indeterPercent, precision)). 109 | append(fieldDelimOut).append(normCount). 110 | append(fieldDelimOut).append(BasicUtils.formatDouble(normPercent, precision)). 111 | append(fieldDelimOut).append(totalCount) 112 | 113 | stBld.toString() 114 | }) 115 | 116 | if (debugOn) { 117 | val records = formattedCountRecs.collect.slice(0, 20) 118 | records.foreach(r => println(r)) 119 | } 120 | 121 | if(saveOutput) { 122 | formattedCountRecs.saveAsTextFile(outputPath) 123 | } 124 | } 125 | 126 | } -------------------------------------------------------------------------------- /resource/ecomm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_HOME=/Users/pranab/Projects 4 | CHOMBO_JAR_NAME=$PROJECT_HOME/bin/chombo/uber-chombo-spark-1.0.jar 5 | BEYMANI_JAR_NAME=$PROJECT_HOME/bin/beymani/uber-beymani-spark-1.0.jar 6 | MASTER=spark://akash:7077 7 | 8 | case "$1" in 9 | 10 | "loadInp") 11 | rm $PROJECT_HOME/bin/beymani/input/ecom/$3/* 12 | cp $2 $PROJECT_HOME/bin/beymani/input/ecom/$3/ 13 | ls -l $PROJECT_HOME/bin/beymani/input/ecom/$3/ 14 | ;; 15 | 16 | 17 | "numStat") 18 | echo "running NumericalAttrStats Spark job" 19 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrStats 20 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/* 21 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/stat 22 | rm -rf ./output/ecom/stat 23 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 24 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf 25 | ;; 26 | 27 | "numMstat") 28 | echo "running NumericalAttrMedian Spark job" 29 | CLASS_NAME=org.chombo.spark.explore.NumericalAttrMedian 30 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/training/* 31 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/mstat 32 | rm -rf ./output/ecom/mstat 33 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 34 | --conf spark.ui.killEnabled=true --master $MASTER $CHOMBO_JAR_NAME $INPUT $OUTPUT ecomm.conf 35 | rm ./output/ecom/mstat/_SUCCESS 36 | ls -l ./output/ecom/mstat 37 | ;; 38 | 39 | "bkMod") 40 | echo "backing up model files" 41 | MED_FILES=$PROJECT_HOME/bin/beymani/output/ecom/mstat/* 42 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom 43 | META_FILE=$META_DIR/$2 44 | echo "copying to $META_FILE" 45 | cp /dev/null $META_FILE 46 | for f in $MED_FILES 47 | do 48 | echo "Copying file $f ..." 
49 | cat $f >> $META_FILE 50 | done 51 | ls -l $META_FILE 52 | ;; 53 | 54 | "cpMod") 55 | echo "copying model files files from backup" 56 | META_DIR=$PROJECT_HOME/bin/beymani/meta/ecom 57 | cp $META_DIR/$2 $META_DIR/ 58 | ls -l $META_DIR 59 | ;; 60 | 61 | "olPred") 62 | echo "running StatsBasedOutlierPredictor Spark job" 63 | CLASS_NAME=org.beymani.spark.dist.StatsBasedOutlierPredictor 64 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/pred/* 65 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/olp 66 | rm -rf ./output/ecom/olp 67 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 68 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 69 | rm ./output/ecom/olp/_SUCCESS 70 | ls -l ./output/ecom/olp 71 | cat ./output/ecom/olp/part-00000 | grep ,O 72 | ;; 73 | 74 | "chkOl") 75 | echo "number of outliers" 76 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/* 77 | for f in $OUT_FILES 78 | do 79 | echo "checking file $f ..." 80 | wc -l $f 81 | done 82 | ;; 83 | 84 | "bkOut") 85 | echo "backing up outlier output files" 86 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/olp/* 87 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 88 | BK_FILE=$BK_DIR/$2 89 | cp /dev/null $BK_FILE 90 | for f in $OUT_FILES 91 | do 92 | echo "Copying file $f ..." 93 | cat $f >> $BK_FILE 94 | done 95 | ls -l $BK_FILE 96 | ;; 97 | 98 | "rmAggrInp") 99 | echo "removing outlier aggregation input files" 100 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr 101 | rm $IN_DIR/* 102 | ls -l $IN_DIR 103 | ;; 104 | 105 | "loadAggrInp") 106 | echo "copying outlier output files for aggregation" 107 | IN_DIR=$PROJECT_HOME/bin/beymani/input/ecom/aggr/ 108 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 109 | cp $BK_DIR/$2 $IN_DIR 110 | ls -l $IN_DIR 111 | ;; 112 | 113 | 114 | "aggrOl") 115 | echo "running OutlierAggregator Spark job" 116 | CLASS_NAME=org.beymani.spark.common.OutlierAggregator 117 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/aggr/* 118 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/aggr 119 | rm -rf ./output/ecom/aggr 120 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 121 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 122 | rm ./output/ecom/aggr/_SUCCESS 123 | ls -l ./output/ecom/aggr 124 | cat ./output/ecom/aggr/part-00000 | grep ,O 125 | ;; 126 | 127 | 128 | "bkOutAggr") 129 | echo "backing up aggregator output files" 130 | OUT_FILES=$PROJECT_HOME/bin/beymani/output/ecom/aggr/* 131 | BK_DIR=$PROJECT_HOME/bin/beymani/output/ecom/bkup 132 | BK_FILE=$BK_DIR/$2 133 | cp /dev/null $BK_FILE 134 | for f in $OUT_FILES 135 | do 136 | echo "Copying file $f ..." 
137 | cat $f >> $BK_FILE 138 | done 139 | ls -l $BK_FILE 140 | ;; 141 | 142 | "orpOlPred") 143 | echo "running IsolationForestModel Spark job" 144 | CLASS_NAME=org.beymani.spark.multi.IsolationForestModel 145 | INPUT=file:///Users/pranab/Projects/bin/beymani/input/ecom/orp/* 146 | OUTPUT=file:///Users/pranab/Projects/bin/beymani/output/ecom/orp 147 | rm -rf ./output/ecom/orp 148 | $SPARK_HOME/bin/spark-submit --class $CLASS_NAME \ 149 | --conf spark.ui.killEnabled=true --master $MASTER $BEYMANI_JAR_NAME $INPUT $OUTPUT ecomm.conf 150 | rm ./output/ecom/orp/_SUCCESS 151 | ls -l ./output/ecom/orp 152 | cat ./output/ecom/orp/part-00000 | grep ,O 153 | ;; 154 | 155 | *) 156 | echo "unknown operation $1" 157 | ;; 158 | 159 | esac -------------------------------------------------------------------------------- /src/main/java/org/beymani/proximity/RelativeDensity.java: -------------------------------------------------------------------------------- 1 | package org.beymani.proximity; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.conf.Configured; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | import org.apache.hadoop.mapreduce.Reducer; 14 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 15 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 | import org.apache.hadoop.util.Tool; 17 | import org.apache.hadoop.util.ToolRunner; 18 | import org.apache.log4j.Level; 19 | import org.apache.log4j.Logger; 20 | import org.chombo.util.TextInt; 21 | import org.chombo.util.Tuple; 22 | import org.chombo.util.Utility; 23 | 24 | public class RelativeDensity extends Configured implements Tool { 25 | 26 | @Override 27 | public int run(String[] args) throws Exception { 28 | Job job = new Job(getConf()); 29 | String jobName = "Relative density"; 30 | job.setJobName(jobName); 31 | 32 | job.setJarByClass(RelativeDensity.class); 33 | 34 | FileInputFormat.addInputPath(job, new Path(args[0])); 35 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 36 | 37 | job.setMapperClass(RelativeDensity.DensityMapper.class); 38 | job.setReducerClass(RelativeDensity.DensityReducer.class); 39 | 40 | job.setMapOutputKeyClass(Text.class); 41 | job.setMapOutputValueClass(Tuple.class); 42 | 43 | job.setOutputKeyClass(NullWritable.class); 44 | job.setOutputValueClass(Text.class); 45 | 46 | Utility.setConfiguration(job.getConfiguration()); 47 | 48 | job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1)); 49 | 50 | int status = job.waitForCompletion(true) ? 
0 : 1; 51 | return status; 52 | } 53 | 54 | public static class DensityMapper extends Mapper<LongWritable, Text, Text, Tuple> { 55 | private String fieldDelimRegex; 56 | private String fieldDelim; 57 | private String[] items ; 58 | private Text outKey = new Text(); 59 | private Tuple outVal = new Tuple(); 60 | 61 | protected void setup(Context context) throws IOException, InterruptedException { 62 | fieldDelim = context.getConfiguration().get("field.delim", ","); 63 | fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]"); 64 | } 65 | 66 | @Override 67 | protected void map(LongWritable key, Text value, Context context) 68 | throws IOException, InterruptedException { 69 | outVal.initialize(); 70 | items = value.toString().split(fieldDelimRegex); 71 | outKey.set(items[0]); 72 | outVal.add(items[1], Integer.parseInt(items[2])); 73 | context.write(outKey, outVal); 74 | } 75 | } 76 | 77 | /** 78 | * @author pranab 79 | * 80 | */ 81 | public static class DensityReducer extends Reducer<Text, Tuple, NullWritable, Text> { 82 | private String fieldDelim; 83 | private String groupID; 84 | private String entityID; 85 | private int sumDensity; 86 | private int density; 87 | private int relDensity; 88 | private Text outVal = new Text(); 89 | private int relDensityScale; 90 | private static final Logger LOG = Logger.getLogger(DensityReducer.class); 91 | 92 | protected void setup(Context context) throws IOException, InterruptedException { 93 | Configuration conf = context.getConfiguration(); 94 | fieldDelim = conf.get("field.delim", ","); 95 | relDensityScale = context.getConfiguration().getInt("red.reltive.density.scale", 1000); 96 | if (conf.getBoolean("debug.on", false)) { 97 | LOG.setLevel(Level.DEBUG); 98 | } 99 | } 100 | 101 | /* (non-Javadoc) 102 | * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context) 103 | */ 104 | protected void reduce(Text key, Iterable<Tuple> values, Context context) 105 | throws IOException, InterruptedException { 106 | groupID = key.toString(); 107 | sumDensity = 0; 108 | density = 0; 109 | for (Tuple val : values) { 110 | entityID = val.getString(0); 111 | if (entityID.equals(groupID)) { 112 | density = val.getInt(1); 113 | LOG.debug("entityID:" + entityID + " density:" + density); 114 | } 115 | sumDensity += val.getInt(1); 116 | } 117 | 118 | relDensity = (density * relDensityScale) / sumDensity; 119 | outVal.set(groupID + fieldDelim +relDensity); 120 | context.write(NullWritable.get(), outVal); 121 | } 122 | 123 | } 124 | 125 | /** 126 | * @param args 127 | */ 128 | public static void main(String[] args) throws Exception { 129 | int exitCode = ToolRunner.run(new RelativeDensity(), args); 130 | System.exit(exitCode); 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/seq/LocalNeighborhoodDetector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anomaly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. 
See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.seq 19 | 20 | import org.apache.spark.rdd.RDD 21 | import scala.collection.mutable.ArrayBuffer 22 | import scala.collection.JavaConverters._ 23 | import scala.util.control.Breaks._ 24 | import org.apache.spark.SparkContext 25 | import org.beymani.spark.common.OutlierUtility 26 | import org.chombo.spark.common.GeneralUtility 27 | import org.chombo.spark.common.JobConfiguration 28 | import org.chombo.spark.common.Record 29 | import org.chombo.util.BasicUtils 30 | import org.chombo.math.MathUtils 31 | import org.beymani.util.SeequenceScoreAggregator 32 | import org.hoidla.window.LocalNeighborhoodWindow 33 | 34 | 35 | /** 36 | * Anomaly detection in sequence data based on nearest neighboers within an window. 37 | * @author pranab 38 | * 39 | */ 40 | object LocalNeighborhoodDetector extends JobConfiguration with GeneralUtility with OutlierUtility { 41 | 42 | /** 43 | * @param args 44 | * @return 45 | */ 46 | def main(args: Array[String]) { 47 | val appName = "localNeighborhoodDetector" 48 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 49 | val config = createConfig(configFile) 50 | val sparkConf = createSparkConf(appName, config, false) 51 | val sparkCntxt = new SparkContext(sparkConf) 52 | val appConfig = config.getConfig(appName) 53 | 54 | //configuration params 55 | val fieldDelimIn = appConfig.getString("field.delim.in") 56 | val fieldDelimOut = appConfig.getString("field.delim.out") 57 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 58 | val keyFieldOrdinals = toOptionalIntArray(getOptionalIntListParam(appConfig, "id.fieldOrdinals")) 59 | val attrOrd = getMandatoryIntParam(appConfig, "attr.ordinal") 60 | val seqFieldOrd = getMandatoryIntParam(appConfig, "seq.fieldOrd", "missing seq field ordinal") 61 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold") 62 | val windowSize = getIntParamOrElse(appConfig, "window.size", 3) 63 | val neighborhoodDist = getDoubleParamOrElse(appConfig, "neighborhood.dist", -1.0) 64 | val debugOn = appConfig.getBoolean("debug.on") 65 | val saveOutput = appConfig.getBoolean("save.output") 66 | 67 | BasicUtils.assertCondition(windowSize % 2 == 1, "window size should be odd") 68 | val keyLen = getOptinalArrayLength(keyFieldOrdinals, 1) 69 | val neighborhoodDistBased = neighborhoodDist > 0 70 | val neighborhoodSize = getConditionalMandatoryIntParam(!neighborhoodDistBased, appConfig, "neighborhood.size", 71 | "neighborhoosd size must be provided") 72 | 73 | //input 74 | val data = sparkCntxt.textFile(inputPath) 75 | val keyedData = getKeyedValueWithSeq(data, fieldDelimIn, keyLen, keyFieldOrdinals, seqFieldOrd) 76 | 77 | //records with tag and score 78 | val taggedData = keyedData.groupByKey.flatMap(v => { 79 | val key = v._1 80 | val values = v._2.toList.sortBy(v => v.getLong(0)) 81 | val size = values.length 82 | val coffset = windowSize / 2 83 | val window = if (neighborhoodDistBased) { 84 | new LocalNeighborhoodWindow(windowSize, neighborhoodDist) 85 | } else { 86 | new LocalNeighborhoodWindow(windowSize, neighborhoodSize) 87 | } 88 | val scores = Array.fill[Double](size)(0) 89 | for (i <- 0 to size - 1) { 90 | val v = values(i) 91 | val line = v.getString(1) 92 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 93 | val quant = items(attrOrd).toDouble 94 | window.add(quant) 95 | if 
(window.isProcessed()) { 96 | val score = if (neighborhoodDistBased) window.getNumNeighbosWithin().toDouble 97 | else window.getAvNeighborDist() 98 | scores(i - coffset) = score 99 | } 100 | } 101 | 102 | //append score and tag 103 | val recScores = values.map(r => r.getString(1)).zip(scores) 104 | recScores.map(r => { 105 | val rec = r._1 106 | val score = r._2 107 | val tag = if (score > scoreThreshold) "O" else "N" 108 | rec + fieldDelimOut + BasicUtils.formatDouble(score, precision) + fieldDelimOut + tag 109 | }) 110 | }) 111 | 112 | if (debugOn) { 113 | val records = taggedData.collect 114 | records.slice(0, 50).foreach(r => println(r)) 115 | } 116 | 117 | if(saveOutput) { 118 | taggedData.saveAsTextFile(outputPath) 119 | } 120 | 121 | } 122 | 123 | } -------------------------------------------------------------------------------- /spark/src/main/scala/org/beymani/spark/pc/PrincipalComponentPredictor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * beymani-spark: Outlier and anamoly detection 3 | * Author: Pranab Ghosh 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); you 6 | * may not use this file except in compliance with the License. You may 7 | * obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 14 | * implied. See the License for the specific language governing 15 | * permissions and limitations under the License. 16 | */ 17 | 18 | package org.beymani.spark.pc 19 | 20 | import org.chombo.spark.common.JobConfiguration 21 | import org.apache.spark.SparkContext 22 | import scala.collection.JavaConverters._ 23 | import org.chombo.util.BasicUtils 24 | import org.chombo.spark.common.Record 25 | import org.chombo.util.BaseAttribute 26 | import com.typesafe.config.Config 27 | import org.beymani.spark.common.OutlierUtility 28 | import org.chombo.spark.common.GeneralUtility 29 | import org.avenir.util.PrincipalCompState 30 | import org.chombo.math.MathUtils 31 | 32 | /** 33 | * PCA based outlier prediction 34 | * @author pranab 35 | * 36 | */ 37 | object PrincipalComponentPredictor extends JobConfiguration with GeneralUtility { 38 | /** 39 | * @param args 40 | * @return 41 | */ 42 | def main(args: Array[String]) { 43 | val appName = "principalComponentPredictor" 44 | val Array(inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3) 45 | val config = createConfig(configFile) 46 | val sparkConf = createSparkConf(appName, config, false) 47 | val sparkCntxt = new SparkContext(sparkConf) 48 | val appConfig = config.getConfig(appName) 49 | 50 | //configurations 51 | val fieldDelimIn = getStringParamOrElse(appConfig, "field.delim.in", ",") 52 | val fieldDelimOut = getStringParamOrElse(appConfig, "field.delim.out", ",") 53 | val keyFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "id.field.ordinals")) 54 | val quantFieldOrdinals = toIntArray(getMandatoryIntListParam(appConfig, "quant.field.ordinals")) 55 | val seqFieldOrd = getMandatoryIntParam( appConfig, "seq.field.ordinal", "missing sequence field ordinal") 56 | val dimension = quantFieldOrdinals.length 57 | val stateFilePath = this.getMandatoryStringParam(appConfig, "state.filePath", "missing pc state file path") 58 | val compState = 
PrincipalCompState.load(stateFilePath, fieldDelimOut).asScala.toMap 59 | val scoreThreshold = getMandatoryDoubleParam(appConfig, "score.threshold", "missing score threshold") 60 | val expConst = getDoubleParamOrElse(appConfig, "exp.const", 1.0) 61 | val precision = getIntParamOrElse(appConfig, "output.precision", 3) 62 | val debugOn = getBooleanParamOrElse(appConfig, "debug.on", false) 63 | val saveOutput = getBooleanParamOrElse(appConfig, "save.output", true) 64 | 65 | //pc matrix and transposed pc matrix 66 | val pcFun = (state: PrincipalCompState) => { 67 | val pcArr = state.getPrincComps() 68 | val pc = MathUtils.createMatrix(pcArr) 69 | val pcTr = pc.transpose() 70 | (pc, pcTr) 71 | } 72 | val pcMa = updateMapValues(compState, pcFun) 73 | 74 | val data = sparkCntxt.textFile(inputPath) 75 | val taggedData = data.map(line => { 76 | val items = BasicUtils.getTrimmedFields(line, fieldDelimIn) 77 | val keyRec = Record(items, keyFieldOrdinals) 78 | val keyStr = keyRec.toString(fieldDelimIn) 79 | val quantFields = BasicUtils.extractFieldsAsDoubleArray(items, quantFieldOrdinals) 80 | var score = 0.0 81 | val tag = pcMa.get(keyStr) match { 82 | case Some(pc) => { 83 | val pcHidden = pc._1 84 | val pcNorm = pc._2 85 | val daNorm = MathUtils.createColMatrix(quantFields) 86 | 87 | //regenerate 88 | val daHidden = MathUtils.multiplyMatrix(pcHidden, daNorm) 89 | val daRegen = MathUtils.multiplyMatrix(pcNorm, daHidden) 90 | 91 | //error 92 | val quantFieldsGen = MathUtils.arrayFromColumnMatrix(daRegen) 93 | score = MathUtils.vectorDiffNorm(quantFields, quantFieldsGen) 94 | if (expConst > 0) { 95 | score = BasicUtils.expScale(expConst, score) 96 | } 97 | if (score < scoreThreshold) "N" else "O" 98 | } 99 | case None => "I" 100 | } 101 | val newRec = new Array[String](items.length + 2) 102 | Array.copy(items, 0, newRec, 0, items.length) 103 | newRec(newRec.length-2) = BasicUtils.formatDouble(score, precision) 104 | newRec(newRec.length-1) = tag 105 | (keyRec, newRec) 106 | }) 107 | 108 | //group by key and sort by sequence 109 | val serTaggedData = groupByKeySortBySeq(taggedData, seqFieldOrd, fieldDelimOut) 110 | 111 | if (debugOn) { 112 | val records = serTaggedData.collect 113 | records.slice(0, 50).foreach(r => println(r)) 114 | } 115 | 116 | if(saveOutput) { 117 | serTaggedData.saveAsTextFile(outputPath) 118 | } 119 | 120 | } 121 | } --------------------------------------------------------------------------------
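
Note: PrincipalComponentPredictor above scores a record by projecting it onto the stored principal components, regenerating it from that projection, and using the norm of the reconstruction error as the outlier score. The following is a minimal, self-contained sketch of that idea, not part of the repo: the function name pcaOutlierScores, the use of numpy, the exponential normalization formula, the threshold value and the synthetic data are all assumptions made for illustration, and the repo's expScale and threshold handling may differ.

import numpy as np

def pcaOutlierScores(data, numComps=2, expConst=0.0):
    #center data and derive principal components from the covariance matrix
    centered = data - data.mean(axis=0)
    eigVals, eigVecs = np.linalg.eigh(np.cov(centered, rowvar=False))
    pc = eigVecs[:, np.argsort(eigVals)[::-1][:numComps]]
    #project onto the principal subspace and regenerate the records
    hidden = centered @ pc
    regen = hidden @ pc.T
    #reconstruction error norm is the outlier score
    scores = np.linalg.norm(centered - regen, axis=1)
    if expConst > 0:
        #assumed exponential normalization to (0, 1)
        scores = 1.0 - np.exp(-scores / expConst)
    return scores

if __name__ == "__main__":
    rng = np.random.default_rng(42)
    #normal records lie close to a 2 dimensional plane in 3 dimensional space
    xy = rng.normal(0, 1, size=(200, 2))
    z = xy.sum(axis=1, keepdims=True) + rng.normal(0, 0.1, size=(200, 1))
    normal = np.hstack([xy, z])
    #outliers sit well off that plane, so their reconstruction error is large
    oxy = rng.normal(0, 1, size=(5, 2))
    oz = oxy.sum(axis=1, keepdims=True) + 5.0
    outliers = np.hstack([oxy, oz])
    data = np.vstack([normal, outliers])
    scores = pcaOutlierScores(data, numComps=2, expConst=2.0)
    tags = np.where(scores > 0.5, "O", "N")
    print(int((tags == "O").sum()), "records tagged as outliers")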