├── results ├── plots │ ├── kmeans_simple.png │ ├── kmeans_one_hot_encoder.png │ ├── kmeans_one_hot_encoder_with_normalization.png │ ├── bisecting_kmeans_one_hot_encoder_with_normalization.png │ └── gaussian_mixture_one_hot_encoder_with_normalization.png ├── results201705271102_K-means_(20)_simple.txt ├── results201705271115_K-means_(30)_simple.txt ├── results201705271130_K-means_(40)_simple.txt ├── results201705271151_K-means_(50)_simple.txt ├── results201705271215_K-means_(60)_simple.txt ├── results201705271243_K-means_(70)_simple.txt ├── results201705271312_K-means_(80)_simple.txt ├── results201705271342_K-means_(90)_simple.txt ├── results201705271422_K-means_(100)_simple.txt ├── plots_axel │ ├── kmeans_simple.png │ ├── kmeans_one_hot_encoder.png │ ├── kmeans_one_hot_encoder_with_normalization.png │ ├── bisecting_kmeans_one_hot_encoder_with_normalization.png │ └── gaussian_mixture_one_hot_encoder_with_normalization.png ├── kmeans_simple │ ├── results201705301051_K-means_(20)_simple.txt │ ├── results201705301100_K-means_(30)_simple.txt │ ├── results201705301112_K-means_(40)_simple.txt │ ├── results201705301127_K-means_(50)_simple.txt │ ├── results201705301145_K-means_(60)_simple.txt │ ├── results201705301206_K-means_(70)_simple.txt │ ├── results201705301230_K-means_(80)_simple.txt │ ├── results201705301257_K-means_(90)_simple.txt │ └── results201705302321_K-means_(100)_simple.txt ├── results201705271431_K-means_(20)_with_one-hot_encoder.txt ├── results201705271444_K-means_(30)_with_one-hot_encoder.txt ├── results201705271500_K-means_(40)_with_one-hot_encoder.txt ├── results201705271519_K-means_(50)_with_one-hot_encoder.txt ├── results201705271541_K-means_(60)_with_one-hot_encoder.txt ├── results201705271607_K-means_(70)_with_one-hot_encoder.txt ├── results201705271637_K-means_(80)_with_one-hot_encoder.txt ├── results201705271710_K-means_(90)_with_one-hot_encoder.txt ├── results201705271747_K-means_(100)_with_one-hot_encoder.txt ├── kmeans_one_hot_encoder │ ├── results201705271231_K-means_(20)_with_one-hot_encoder.txt │ ├── results201705271241_K-means_(30)_with_one-hot_encoder.txt │ ├── results201705271255_K-means_(40)_with_one-hot_encoder.txt │ ├── results201705271312_K-means_(50)_with_one-hot_encoder.txt │ ├── results201705271333_K-means_(60)_with_one-hot_encoder.txt │ ├── results201705271358_K-means_(70)_with_one-hot_encoder.txt │ ├── results201705271426_K-means_(80)_with_one-hot_encoder.txt │ ├── results201705271458_K-means_(90)_with_one-hot_encoder.txt │ └── results201705271533_K-means_(100)_with_one-hot_encoder.txt ├── results201705271755_K-means_(20)_with_one-hot_encoder_with_normalization.txt ├── results201705271807_K-means_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705271822_K-means_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705271841_K-means_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705271904_K-means_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705271930_K-means_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705272000_K-means_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705272033_K-means_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705272111_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── results201705280046_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt ├── results201705272119_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt ├── 
results201705272131_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705272147_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705272206_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705272229_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705272255_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705272325_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705272357_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705280108_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705280141_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705280229_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705280333_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705280456_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705280710_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705280915_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705281152_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt ├── results201705280033_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── kmeans_one_hot_encoder_with_normalization │ ├── results201705310454_K-means_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310512_K-means_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310535_K-means_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310605_K-means_(50)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310641_K-means_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310722_K-means_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310809_K-means_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310902_K-means_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201705311001_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── bisecting_kmeans_one_hot_encoder_with_normalization │ ├── results201705311014_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311245_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311033_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311057_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311128_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311204_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311333_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311426_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201705311525_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt └── gaussian_mixture_one_hot_encoder_with_normalization │ ├── results201705311543_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311613_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311659_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311806_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt │ ├── 
results201705311937_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705312134_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201706010002_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201706010316_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201706010649_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt ├── LICENSE ├── README.md ├── plotScore.py ├── .gitignore └── src └── NetworkAnomalyDetection.scala /results/plots/kmeans_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_simple.png -------------------------------------------------------------------------------- /results/results201705271102_K-means_(20)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (20) simple 2 | Score=3411.9003628488085 3 | Duration=494.53377111 4 | -------------------------------------------------------------------------------- /results/results201705271115_K-means_(30)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (30) simple 2 | Score=1604.3522840254057 3 | Duration=743.344817602 4 | -------------------------------------------------------------------------------- /results/results201705271130_K-means_(40)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (40) simple 2 | Score=918.3628605831424 3 | Duration=929.610309607 4 | -------------------------------------------------------------------------------- /results/results201705271151_K-means_(50)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (50) simple 2 | Score=606.3685420150439 3 | Duration=1218.861908538 4 | -------------------------------------------------------------------------------- /results/results201705271215_K-means_(60)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (60) simple 2 | Score=507.20011532973143 3 | Duration=1443.970069177 4 | -------------------------------------------------------------------------------- /results/results201705271243_K-means_(70)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (70) simple 2 | Score=466.19325697272177 3 | Duration=1681.703710902 4 | -------------------------------------------------------------------------------- /results/results201705271312_K-means_(80)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (80) simple 2 | Score=402.7002490798157 3 | Duration=1780.824890324 4 | -------------------------------------------------------------------------------- /results/results201705271342_K-means_(90)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (90) simple 2 | Score=538.7185884007569 3 | Duration=1774.006552106 4 | -------------------------------------------------------------------------------- /results/results201705271422_K-means_(100)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (100) simple 2 | Score=506.4350529644009 3 | Duration=2395.402772907 4 | -------------------------------------------------------------------------------- 
/results/plots_axel/kmeans_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_simple.png -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301051_K-means_(20)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (20) simple 2 | Score=795028.6677444499 3 | Duration=381.519614201 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301100_K-means_(30)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (30) simple 2 | Score=374962.2418958273 3 | Duration=538.635203758 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301112_K-means_(40)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (40) simple 2 | Score=259926.9568822872 3 | Duration=711.311292832 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301127_K-means_(50)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (50) simple 2 | Score=183540.68086804406 3 | Duration=899.458520806 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301145_K-means_(60)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (60) simple 2 | Score=141014.08830478703 3 | Duration=1083.386044166 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301206_K-means_(70)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (70) simple 2 | Score=118370.73907939174 3 | Duration=1269.059388628 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301230_K-means_(80)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (80) simple 2 | Score=44144.134731446975 3 | Duration=1436.104775358 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301257_K-means_(90)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (90) simple 2 | Score=36740.48338868395 3 | Duration=1635.510372384 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705302321_K-means_(100)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (100) simple 2 | Score=30895.80296446874 3 | Duration=2813.514058399 4 | -------------------------------------------------------------------------------- /results/plots/kmeans_one_hot_encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_one_hot_encoder.png -------------------------------------------------------------------------------- /results/plots_axel/kmeans_one_hot_encoder.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_one_hot_encoder.png -------------------------------------------------------------------------------- /results/results201705271431_K-means_(20)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder 2 | Score=3411.9004558128668 3 | Duration=521.15065171 4 | -------------------------------------------------------------------------------- /results/results201705271444_K-means_(30)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder 2 | Score=1604.3523924730237 3 | Duration=827.13024721 4 | -------------------------------------------------------------------------------- /results/results201705271500_K-means_(40)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder 2 | Score=918.3629950314147 3 | Duration=914.994794978 4 | -------------------------------------------------------------------------------- /results/results201705271519_K-means_(50)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder 2 | Score=606.3686734230612 3 | Duration=1160.955209065 4 | -------------------------------------------------------------------------------- /results/results201705271541_K-means_(60)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder 2 | Score=507.2003489342253 3 | Duration=1332.972756026 4 | -------------------------------------------------------------------------------- /results/results201705271607_K-means_(70)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder 2 | Score=466.1934081517335 3 | Duration=1559.376549068 4 | -------------------------------------------------------------------------------- /results/results201705271637_K-means_(80)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder 2 | Score=402.7003742837811 3 | Duration=1774.606822868 4 | -------------------------------------------------------------------------------- /results/results201705271710_K-means_(90)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder 2 | Score=538.7186344059103 3 | Duration=1999.400565217 4 | -------------------------------------------------------------------------------- /results/results201705271747_K-means_(100)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder 2 | Score=506.4354273607242 3 | Duration=2232.683974512 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271231_K-means_(20)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder 2 | Score=795029.8476020518 3 | Duration=427.630177442 4 | 
-------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271241_K-means_(30)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder 2 | Score=374963.2057284806 3 | Duration=628.492676961 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271255_K-means_(40)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder 2 | Score=259927.68903096477 3 | Duration=830.547506794 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271312_K-means_(50)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder 2 | Score=183541.23454720704 3 | Duration=1039.745908218 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271333_K-means_(60)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder 2 | Score=141014.55503504092 3 | Duration=1253.997536722 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271358_K-means_(70)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder 2 | Score=118371.13932829347 3 | Duration=1469.034651508 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271426_K-means_(80)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder 2 | Score=44144.45446175979 3 | Duration=1682.14382306 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271458_K-means_(90)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder 2 | Score=36740.76732934029 3 | Duration=1905.240331292 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271533_K-means_(100)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder 2 | Score=30896.059641908327 3 | Duration=2121.976332771 4 | -------------------------------------------------------------------------------- /results/plots/kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots_axel/kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_one_hot_encoder_with_normalization.png 
-------------------------------------------------------------------------------- /results/results201705271755_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder with normalization 2 | Score=5.512477052689253 3 | Duration=467.247128525 4 | -------------------------------------------------------------------------------- /results/results201705271807_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder with normalization 2 | Score=8.346482286155416 3 | Duration=708.824423426 4 | -------------------------------------------------------------------------------- /results/results201705271822_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder with normalization 2 | Score=6.45572592987493 3 | Duration=907.843712033 4 | -------------------------------------------------------------------------------- /results/results201705271841_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder with normalization 2 | Score=5.923645877855369 3 | Duration=1125.301766206 4 | -------------------------------------------------------------------------------- /results/results201705271904_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder with normalization 2 | Score=6.154421118629562 3 | Duration=1378.721376253 4 | -------------------------------------------------------------------------------- /results/results201705271930_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder with normalization 2 | Score=4.756622128870819 3 | Duration=1586.597461245 4 | -------------------------------------------------------------------------------- /results/results201705272000_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder with normalization 2 | Score=4.448525376058372 3 | Duration=1776.086742568 4 | -------------------------------------------------------------------------------- /results/results201705272033_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder with normalization 2 | Score=4.407901770147693 3 | Duration=2011.189166835 4 | -------------------------------------------------------------------------------- /results/results201705272111_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder with normalization 2 | Score=3.581636032875701 3 | Duration=2243.538546857 4 | -------------------------------------------------------------------------------- /results/plots/bisecting_kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/bisecting_kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots/gaussian_mixture_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/gaussian_mixture_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/results201705280046_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (20) with one-hot encoder with normalization 2 | Score=0.2932092377554215 3 | Duration=743.056496329 4 | -------------------------------------------------------------------------------- /results/plots_axel/bisecting_kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/bisecting_kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots_axel/gaussian_mixture_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/gaussian_mixture_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/results201705272119_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (20) with one-hot encoder with normalization 2 | Score=8.857494697352449 3 | Duration=506.177613531 4 | -------------------------------------------------------------------------------- /results/results201705272131_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (30) with one-hot encoder with normalization 2 | Score=14.904728795235828 3 | Duration=715.761176323 4 | -------------------------------------------------------------------------------- /results/results201705272147_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (40) with one-hot encoder with normalization 2 | Score=13.691284625809638 3 | Duration=934.888378974 4 | -------------------------------------------------------------------------------- /results/results201705272206_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (50) with one-hot encoder with normalization 2 | Score=16.717111733929226 3 | Duration=1173.524747163 4 | -------------------------------------------------------------------------------- /results/results201705272229_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (60) with one-hot encoder with normalization 2 | 
Score=14.776863383877734 3 | Duration=1361.131594462 4 | -------------------------------------------------------------------------------- /results/results201705272255_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (70) with one-hot encoder with normalization 2 | Score=13.083036390239007 3 | Duration=1567.085641769 4 | -------------------------------------------------------------------------------- /results/results201705272325_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (80) with one-hot encoder with normalization 2 | Score=13.941691367677404 3 | Duration=1778.031726294 4 | -------------------------------------------------------------------------------- /results/results201705272357_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (90) with one-hot encoder with normalization 2 | Score=13.962600986212275 3 | Duration=1962.443723704 4 | -------------------------------------------------------------------------------- /results/results201705280108_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (30) with one-hot encoder with normalization 2 | Score=0.19547282517028208 3 | Duration=1315.244916256 4 | -------------------------------------------------------------------------------- /results/results201705280141_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (40) with one-hot encoder with normalization 2 | Score=0.14660461887771076 3 | Duration=2026.049716333 4 | -------------------------------------------------------------------------------- /results/results201705280229_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (50) with one-hot encoder with normalization 2 | Score=0.11728369510216834 3 | Duration=2873.354172106 4 | -------------------------------------------------------------------------------- /results/results201705280333_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (60) with one-hot encoder with normalization 2 | Score=0.09773641258514104 3 | Duration=3836.495340339 4 | -------------------------------------------------------------------------------- /results/results201705280456_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (70) with one-hot encoder with normalization 2 | Score=0.08377406793012049 3 | Duration=4934.065542179 4 | -------------------------------------------------------------------------------- /results/results201705280710_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (80) with one-hot encoder with normalization 2 | Score=0.07330230943885538 3 | Duration=6054.620509646 4 | 
-------------------------------------------------------------------------------- /results/results201705280915_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (90) with one-hot encoder with normalization 2 | Score=0.1348575655060487 3 | Duration=7445.378367397 4 | -------------------------------------------------------------------------------- /results/results201705281152_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (100) with one-hot encoder with normalization 2 | Score=0.05864184755108417 3 | Duration=9452.060132619 4 | -------------------------------------------------------------------------------- /results/results201705280033_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (100) with one-hot encoder with normalization 2 | Score=14.48039254193131 3 | Duration=2165.318778686 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310454_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder with normalization 2 | Score=4562.023772066879 3 | Duration=739.491825297 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310512_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder with normalization 2 | Score=2523.904568574377 3 | Duration=1080.107399075 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310535_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder with normalization 2 | Score=2961.8241047913543 3 | Duration=1429.234178146 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310605_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder with normalization 2 | Score=1014.2989684919021 3 | Duration=1775.269853862 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310641_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder with normalization 2 | Score=760.2434287550872 3 | Duration=2139.817159987 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310722_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder with normalization 2 | 
Score=487.2500268778508 3 | Duration=2480.196751522 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310809_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder with normalization 2 | Score=365.8395041984696 3 | Duration=2843.095978621 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310902_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder with normalization 2 | Score=252.89070654255804 3 | Duration=3175.465465776 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705311001_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder with normalization 2 | Score=197.126534840126 3 | Duration=3516.101870375 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311014_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (20) with one-hot encoder with normalization 2 | Score=3520.795209404406 3 | Duration=793.596314017 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311245_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (70) with one-hot encoder with normalization 2 | Score=684.7502042309432 3 | Duration=2509.28944881 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311543_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (20) with one-hot encoder with normalization 2 | Score=12209.316516395684 3 | Duration=1063.475780708 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311613_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (30) with one-hot encoder with normalization 2 | Score=8139.544344265546 3 | Duration=1814.226984862 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311659_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (40) with one-hot encoder with normalization 2 | Score=6104.658258199119 3 | Duration=2788.677829532 4 | -------------------------------------------------------------------------------- 
/results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311806_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (50) with one-hot encoder with normalization 2 | Score=4883.72660655933 3 | Duration=4007.448348552 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311937_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (60) with one-hot encoder with normalization 2 | Score=4069.772172132773 3 | Duration=5435.724346663 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705312134_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (70) with one-hot encoder with normalization 2 | Score=3488.3761475423494 3 | Duration=7058.942404606 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010002_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (80) with one-hot encoder with normalization 2 | Score=3052.3291290995594 3 | Duration=8891.728532267 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010316_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (90) with one-hot encoder with normalization 2 | Score=2713.181448087896 3 | Duration=11625.973742715 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010649_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (100) with one-hot encoder with normalization 2 | Score=2441.863303279665 3 | Duration=12777.292420437 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311033_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (30) with one-hot encoder with normalization 2 | Score=2103.692239687063 3 | Duration=1120.416655349 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311057_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (40) with one-hot encoder with normalization 2 | Score=1418.1541171113377 3 | Duration=1475.098686214 4 | -------------------------------------------------------------------------------- 
/results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311128_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (50) with one-hot encoder with normalization 2 | Score=1062.8226643714606 3 | Duration=1815.145936027 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311204_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (60) with one-hot encoder with normalization 2 | Score=836.3342938470117 3 | Duration=2154.915725311 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311333_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (80) with one-hot encoder with normalization 2 | Score=575.9102565141137 3 | Duration=2842.771511482 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311426_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (90) with one-hot encoder with normalization 2 | Score=493.21041806599266 3 | Duration=3196.980530802 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311525_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (100) with one-hot encoder with normalization 2 | Score=448.4686151150904 3 | Duration=3524.550971894 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Axel Fahy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NetworkAnomalyDetection 2 | 3 | Anomaly detection in network traffic using different clustering algorithms. 4 | 5 | Data must be located in the *data* folder. Due to its size, the dataset is not included in the repository (the *data* folder is git-ignored). 6 | 7 | The full dataset is available at https://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data 8 | 9 | The program can be run using IntelliJ or with sbt and spark-submit: 10 | 11 | ```$ sbt package``` 12 | 13 | ```$ spark-submit --class "NetworkAnomalyDetection" --driver-memory 6g target/scala-2.11/networkanomalydetection_2.11-0.1.jar``` 14 | 15 | This project has been developed using DataFrames from Spark MLlib. 16 | 17 | 18 | ## Data 19 | 20 | The dataset used for the training phase contains 4,898,431 packets and is 743 MB in size. Here is a preview of the data: 21 | 22 | ```0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal.``` 23 | 24 | The data come from a competition held in 1999 by the KDD Cup organization (an organization bringing together professional data miners). The goal of the competition was to build a model capable of detecting anomalies in network traffic. The packets were gathered from a US military network. 25 | 26 | ## Preprocessing 27 | 28 | - **Numerical features**: at first, only the numerical features were used. 29 | 30 | - **Categorical features**: then, categorical features were encoded using a one-hot encoder. 31 | 32 | - **Normalization**: finally, the feature vectors were normalized using the standard deviation of each feature. 33 | 34 | ## Algorithms 35 | 36 | - **K-means**: the center of a cluster is called a centroid. At the first iteration, K centroids are chosen randomly. Then, at each iteration, data points are assigned to their closest centroid and each centroid becomes the mean of the points of its cluster. 37 | 38 | - **Bisecting K-means**: same approach as K-means, except that it starts with a single cluster containing all data points. Then, at each iteration, a cluster is split in two using K-means until K clusters are obtained. 39 | 40 | - **Gaussian Mixture (GMM)**: the goal is to maximise the log-likelihood. The algorithm iterates until a convergence threshold or a maximum number of iterations is reached. The algorithm converges, but not necessarily to the global optimum. 41 | 42 | ## Evaluation 43 | 44 | To evaluate all models with the same technique, an evaluation function was created. The score is the sum of the squared Euclidean distances between each point and the centroid of its cluster (a sketch of the pipeline and of this scoring function is given below).
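As an illustration of the preprocessing and clustering steps described above, here is a minimal Spark ML sketch. It is not the project's actual `NetworkAnomalyDetection.scala`: the helper object, the `numericCols` parameter and the intermediate column names are illustrative assumptions, and it targets the Spark 2.x API (with `OneHotEncoder`) that the project imports.

```scala
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler}

object PipelineSketch {
  // Categorical columns of the KDD Cup 1999 dataset
  val categoricalCols = Seq("protocol_type", "service", "flag")

  // One-hot encoding of the categorical features, scaling by standard deviation, then K-means
  def buildPipeline(numericCols: Seq[String], k: Int): Pipeline = {
    val indexers = categoricalCols.map(c =>
      new StringIndexer().setInputCol(c).setOutputCol(s"${c}_idx"))
    val encoders = categoricalCols.map(c =>
      new OneHotEncoder().setInputCol(s"${c}_idx").setOutputCol(s"${c}_vec"))

    // Concatenate the numeric columns and the encoded vectors into a single feature vector
    val assembler = new VectorAssembler()
      .setInputCols((numericCols ++ categoricalCols.map(c => s"${c}_vec")).toArray)
      .setOutputCol("rawFeatures")

    // Normalize each dimension by its standard deviation (no mean centering)
    val scaler = new StandardScaler()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
      .setWithStd(true)
      .setWithMean(false)

    val kmeans = new KMeans()
      .setK(k)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")

    val stages: Seq[PipelineStage] = indexers ++ encoders ++ Seq(assembler, scaler, kmeans)
    new Pipeline().setStages(stages.toArray)
  }
}
```

A pipeline built this way would be fitted with something like `buildPipeline(numericCols, k = 20).fit(trainingData)`, mirroring the values of K (20 to 100) used in the results above.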
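The evaluation function mentioned above could take the following shape. This is again a sketch under assumptions: it expects the DataFrame produced by the fitted pipeline, with its `features` and `prediction` columns, and the centroids of the fitted K-means model.

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.DataFrame

object ScoreSketch {
  // Sum of squared Euclidean distances between each point and the centroid of its cluster.
  // `predictions` is the output of pipelineModel.transform(data);
  // `centroids` comes from the fitted model (e.g. KMeansModel.clusterCenters).
  def score(predictions: DataFrame, centroids: Array[Vector]): Double =
    predictions.select("prediction", "features").rdd.map { row =>
      val cluster = row.getInt(0)
      val point = row.getAs[Vector](1)
      Vectors.sqdist(point, centroids(cluster))
    }.sum()
}
```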
45 | 46 | -------------------------------------------------------------------------------- /plotScore.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import matplotlib.mlab as mlab 4 | 5 | from os import listdir 6 | from os.path import isfile, join 7 | import plotly.plotly as py 8 | 9 | listFiles = [] 10 | scoreNumber = [] 11 | y2 = [] 12 | 13 | #Plot scores for K-means Simple 14 | def kmeanSimple(scores): 15 | 16 | for score in scores: 17 | y2.append(score) 18 | 19 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 20 | plt.bar(x2, y2, label='Score', color='blue') 21 | plt.xlabel('K') 22 | plt.ylabel('Scores') 23 | plt.title('K-means simple') 24 | plt.legend() 25 | plt.show() 26 | 27 | #Plot score for K-means One Hot Encoder 28 | def kmeanOneHotEncoder(scores): 29 | 30 | for score in scores: 31 | y2.append(score) 32 | 33 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 34 | plt.bar(x2, y2, label='Score', color='green') 35 | plt.xlabel('K') 36 | plt.ylabel('Scores') 37 | plt.title('K-means One Hot Encoder') 38 | plt.legend() 39 | plt.show() 40 | 41 | #Plot scores for K-means One Hot Encoder with normalization 42 | def kmeanOneHotEncoderWithNormalization(scores): 43 | 44 | for score in scores: 45 | y2.append(score) 46 | 47 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 48 | plt.bar(x2, y2, label='Score', color='yellow') 49 | plt.xlabel('K') 50 | plt.ylabel('Scores') 51 | plt.title('K-means One Hot Encoder with normalization') 52 | plt.legend() 53 | plt.show() 54 | 55 | #Plot score for Bisecting K-means One Hot Encoder with normalization 56 | def bisectingKmeanOneHotEncoderWithNormalization(scores): 57 | 58 | for score in scores: 59 | y2.append(score) 60 | 61 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 62 | plt.bar(x2, y2, label='Score', color='red') 63 | plt.xlabel('K') 64 | plt.ylabel('Scores') 65 | plt.title('Bisecting K-means One Hot Encoder with normalization') 66 | plt.legend() 67 | plt.show() 68 | 69 | #Plot score Gaussian Mixture One Hot Encoder with normalization 70 | def gaussianMixtureOneHotEncoderWithNormalization(scores): 71 | 72 | for score in scores: 73 | y2.append(score) 74 | 75 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 76 | plt.bar(x2, y2, label='Score', color='orange') 77 | plt.xlabel('K') 78 | plt.ylabel('Scores') 79 | plt.title('Gaussian Mixture One Hot Encoder with normalization') 80 | plt.legend() 81 | plt.show() 82 | 83 | #Read score in file 84 | def readScoreInFile(filename): 85 | with open(filename) as f: 86 | lines = f.readlines() 87 | lines = [line.rstrip('\n') for line in lines] 88 | #print('lines: '+lines[1]) 89 | lineScore = lines[1].split('=') 90 | scoreNumber.append(float(lineScore[1])) 91 | print(filename+' : '+lineScore[1]) 92 | 93 | #Read files in folder 94 | def readFiles(pathToFolder): 95 | 96 | for f in listdir(pathToFolder): 97 | if isfile(join(pathToFolder, f)): 98 | listFiles.append(f) 99 | 100 | for el in listFiles: 101 | readScoreInFile(pathToFolder+'/'+el) 102 | 103 | #Read and plot each technique separately 104 | readFiles('results/kmeans_simple') 105 | kmeanSimple(scoreNumber) 106 | 107 | #readFiles('results/kmeans_one_hot_encoder') 108 | #kmeanOneHotEncoder(scoreNumber) 109 | 110 | #readFiles('results/kmeans_one_hot_encoder_with_normalization') 111 | #kmeanOneHotEncoderWithNormalization(scoreNumber) 112 | 113 | #readFiles('results/bisecting_kmeans_one_hot_encoder_with_normalization') 114 | #bisectingKmeanOneHotEncoderWithNormalization(scoreNumber) 115 | 116 | 
#readFiles('results/gaussian_mixture_one_hot_encoder_with_normalization') 117 | #gaussianMixtureOneHotEncoderWithNormalization(scoreNumber) 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Specific files ### 2 | 3 | # Data 4 | data/ 5 | 6 | # TagsClassifier temp folders 7 | derby.log 8 | metastore_db/ 9 | models/ 10 | spark-warehouse/ 11 | target/ 12 | 13 | ### C++ ### 14 | # Prerequisites 15 | *.d 16 | 17 | # Compiled Object files 18 | *.slo 19 | *.lo 20 | *.o 21 | *.obj 22 | 23 | # Precompiled Headers 24 | *.gch 25 | *.pch 26 | 27 | # Compiled Dynamic libraries 28 | *.so 29 | *.dylib 30 | *.dll 31 | 32 | # Fortran module files 33 | *.mod 34 | *.smod 35 | 36 | # Compiled Static libraries 37 | *.lai 38 | *.la 39 | *.a 40 | *.lib 41 | 42 | # Executables 43 | *.exe 44 | *.out 45 | *.app 46 | 47 | ### Intellij ### 48 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 49 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 50 | 51 | #Remove .idea folder 52 | .idea 53 | # User-specific stuff: 54 | .idea/**/workspace.xml 55 | .idea/**/tasks.xml 56 | .idea/dictionaries 57 | 58 | # Sensitive or high-churn files: 59 | .idea/**/dataSources/ 60 | .idea/**/dataSources.ids 61 | .idea/**/dataSources.xml 62 | .idea/**/dataSources.local.xml 63 | .idea/**/sqlDataSources.xml 64 | .idea/**/dynamic.xml 65 | .idea/**/uiDesigner.xml 66 | 67 | # Gradle: 68 | .idea/**/gradle.xml 69 | .idea/**/libraries 70 | 71 | # Mongo Explorer plugin: 72 | .idea/**/mongoSettings.xml 73 | 74 | ## File-based project format: 75 | *.iws 76 | 77 | ## Plugin-specific files: 78 | 79 | # IntelliJ 80 | /out/ 81 | 82 | # mpeltonen/sbt-idea plugin 83 | .idea_modules/ 84 | 85 | # JIRA plugin 86 | atlassian-ide-plugin.xml 87 | 88 | # Crashlytics plugin (for Android Studio and IntelliJ) 89 | com_crashlytics_export_strings.xml 90 | crashlytics.properties 91 | crashlytics-build.properties 92 | fabric.properties 93 | 94 | ### Intellij Patch ### 95 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 96 | 97 | # *.iml 98 | # modules.xml 99 | # .idea/misc.xml 100 | # *.ipr 101 | 102 | ### Java ### 103 | # Compiled class file 104 | *.class 105 | 106 | # Log file 107 | *.log 108 | 109 | # BlueJ files 110 | *.ctxt 111 | 112 | # Mobile Tools for Java (J2ME) 113 | .mtj.tmp/ 114 | 115 | # Package Files # 116 | *.jar 117 | *.war 118 | *.ear 119 | *.zip 120 | *.tar.gz 121 | *.rar 122 | 123 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 124 | hs_err_pid* 125 | 126 | ### LaTeX ### 127 | ## Core latex/pdflatex auxiliary files: 128 | *.aux 129 | *.lof 130 | *.lot 131 | *.fls 132 | *.toc 133 | *.fmt 134 | *.fot 135 | *.cb 136 | *.cb2 137 | 138 | ## Intermediate documents: 139 | *.dvi 140 | *-converted-to.* 141 | # these rules might exclude image files for figures etc. 
142 | # *.ps 143 | # *.eps 144 | # *.pdf 145 | 146 | ## Generated if empty string is given at "Please type another file name for output:" 147 | .pdf 148 | 149 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 150 | *.bbl 151 | *.bcf 152 | *.blg 153 | *-blx.aux 154 | *-blx.bib 155 | *.brf 156 | *.run.xml 157 | 158 | ## Build tool auxiliary files: 159 | *.fdb_latexmk 160 | *.synctex 161 | *.synctex(busy) 162 | *.synctex.gz 163 | *.synctex.gz(busy) 164 | *.pdfsync 165 | 166 | ## Auxiliary and intermediate files from other packages: 167 | # algorithms 168 | *.alg 169 | *.loa 170 | 171 | # achemso 172 | acs-*.bib 173 | 174 | # amsthm 175 | *.thm 176 | 177 | # beamer 178 | *.nav 179 | *.pre 180 | *.snm 181 | *.vrb 182 | 183 | # changes 184 | *.soc 185 | 186 | # cprotect 187 | *.cpt 188 | 189 | # elsarticle (documentclass of Elsevier journals) 190 | *.spl 191 | 192 | # endnotes 193 | *.ent 194 | 195 | # fixme 196 | *.lox 197 | 198 | # feynmf/feynmp 199 | *.mf 200 | *.mp 201 | *.t[1-9] 202 | *.t[1-9][0-9] 203 | *.tfm 204 | *.[1-9] 205 | *.[1-9][0-9] 206 | 207 | #(r)(e)ledmac/(r)(e)ledpar 208 | *.end 209 | *.?end 210 | *.[1-9][0-9][0-9] 211 | *.[1-9]R 212 | *.[1-9][0-9]R 213 | *.[1-9][0-9][0-9]R 214 | *.eledsec[1-9] 215 | *.eledsec[1-9]R 216 | *.eledsec[1-9][0-9] 217 | *.eledsec[1-9][0-9]R 218 | *.eledsec[1-9][0-9][0-9] 219 | *.eledsec[1-9][0-9][0-9]R 220 | 221 | # glossaries 222 | *.acn 223 | *.acr 224 | *.glg 225 | *.glo 226 | *.gls 227 | *.glsdefs 228 | 229 | # gnuplottex 230 | *-gnuplottex-* 231 | 232 | # gregoriotex 233 | *.gaux 234 | *.gtex 235 | 236 | # hyperref 237 | 238 | # knitr 239 | *-concordance.tex 240 | # TODO Comment the next line if you want to keep your tikz graphics files 241 | *.tikz 242 | *-tikzDictionary 243 | 244 | # listings 245 | *.lol 246 | 247 | # makeidx 248 | *.idx 249 | *.ilg 250 | *.ind 251 | *.ist 252 | 253 | # minitoc 254 | *.maf 255 | *.mlf 256 | *.mlt 257 | *.mtc[0-9]* 258 | 259 | # minted 260 | _minted* 261 | *.pyg 262 | 263 | # morewrites 264 | *.mw 265 | 266 | # mylatexformat 267 | 268 | # nomencl 269 | *.nlo 270 | 271 | # pax 272 | *.pax 273 | 274 | # sagetex 275 | *.sagetex.sage 276 | *.sagetex.py 277 | *.sagetex.scmd 278 | 279 | # scrwfile 280 | *.wrt 281 | 282 | # sympy 283 | *.sout 284 | *.sympy 285 | sympy-plots-for-*.tex/ 286 | 287 | # pdfcomment 288 | *.upa 289 | *.upb 290 | 291 | # pythontex 292 | *.pytxcode 293 | pythontex-files-*/ 294 | 295 | # thmtools 296 | *.loe 297 | 298 | # TikZ & PGF 299 | *.dpth 300 | *.md5 301 | *.auxlock 302 | 303 | # todonotes 304 | *.tdo 305 | 306 | # easy-todo 307 | *.lod 308 | 309 | # xindy 310 | *.xdy 311 | 312 | # xypic precompiled matrices 313 | *.xyc 314 | 315 | # endfloat 316 | *.ttt 317 | *.fff 318 | 319 | # Latexian 320 | TSWLatexianTemp* 321 | 322 | ## Editors: 323 | # WinEdt 324 | *.bak 325 | *.sav 326 | 327 | # Texpad 328 | .texpadtmp 329 | 330 | # Kile 331 | *.backup 332 | 333 | # KBibTeX 334 | *~[0-9]* 335 | 336 | # auto folder when using emacs and auctex 337 | /auto/* 338 | 339 | # expex forward references with \gathertags 340 | *-tags.tex 341 | 342 | ### Python ### 343 | # Byte-compiled / optimized / DLL files 344 | __pycache__/ 345 | *.py[cod] 346 | *$py.class 347 | 348 | # C extensions 349 | *.so 350 | 351 | # Distribution / packaging 352 | .Python 353 | env/ 354 | build/ 355 | develop-eggs/ 356 | dist/ 357 | downloads/ 358 | eggs/ 359 | .eggs/ 360 | lib/ 361 | lib64/ 362 | parts/ 363 | sdist/ 364 | var/ 365 | wheels/ 366 | *.egg-info/ 367 | .installed.cfg 368 | *.egg 369 | 370 | # PyInstaller 
371 | # Usually these files are written by a python script from a template 372 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 373 | *.manifest 374 | *.spec 375 | 376 | # Installer logs 377 | pip-log.txt 378 | pip-delete-this-directory.txt 379 | 380 | # Unit test / coverage reports 381 | htmlcov/ 382 | .tox/ 383 | .coverage 384 | .coverage.* 385 | .cache 386 | nosetests.xml 387 | coverage.xml 388 | *,cover 389 | .hypothesis/ 390 | 391 | # Translations 392 | *.mo 393 | *.pot 394 | 395 | # Django stuff: 396 | *.log 397 | local_settings.py 398 | 399 | # Flask stuff: 400 | instance/ 401 | .webassets-cache 402 | 403 | # Scrapy stuff: 404 | .scrapy 405 | 406 | # Sphinx documentation 407 | docs/_build/ 408 | 409 | # PyBuilder 410 | target/ 411 | 412 | # Jupyter Notebook 413 | .ipynb_checkpoints 414 | 415 | # pyenv 416 | .python-version 417 | 418 | # celery beat schedule file 419 | celerybeat-schedule 420 | 421 | # dotenv 422 | .env 423 | 424 | # virtualenv 425 | .venv 426 | venv/ 427 | ENV/ 428 | 429 | # Spyder project settings 430 | .spyderproject 431 | 432 | # Rope project settings 433 | .ropeproject 434 | 435 | ### Scala ### 436 | *.class 437 | *.log 438 | 439 | # sbt specific 440 | .cache 441 | .history 442 | .lib/ 443 | dist/* 444 | target/ 445 | lib_managed/ 446 | src_managed/ 447 | project/boot/ 448 | project/plugins/project/ 449 | 450 | # Scala-IDE specific 451 | .ensime 452 | .ensime_cache/ 453 | .scala_dependencies 454 | .worksheet 455 | 456 | # ENSIME specific 457 | 458 | ### Vim ### 459 | # swap 460 | [._]*.s[a-v][a-z] 461 | [._]*.sw[a-p] 462 | [._]s[a-v][a-z] 463 | [._]sw[a-p] 464 | # session 465 | Session.vim 466 | # temporary 467 | .netrwhist 468 | *~ 469 | # auto-generated tag files 470 | tags 471 | 472 | NetworkAnomalyDetection.iml 473 | .DS_Store 474 | -------------------------------------------------------------------------------- /src/NetworkAnomalyDetection.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Anomaly Detection in Network Traffic with different clustering algorithms. 3 | * 4 | * The implementation is done using the DataFrame-based API of Spark MLlib. 5 | * 6 | * Algorithms: 7 | * 8 | * - K-means (plain and bisecting) 9 | * - Gaussian Mixture Model (GMM) 10 | * 11 | * Categorical features are transformed into numerical features using a one-hot encoder. 12 | * Afterwards, all features are normalized. 13 | * 14 | * These different implementations are compared. 15 | * 16 | * Metrics used: 17 | * 18 | * - Mean distance between points and their centroids 19 | * 20 | * GMM is really slow (quadratic algorithm), so its evaluation is only done on 1% of the dataset. 21 | * 22 | * The basic implementation is based on chapter 5 (Anomaly Detection in Network Traffic with K-means clustering) 23 | * of the book Advanced Analytics with Spark. 24 | * However, this implementation uses the DataFrame-based API instead of the RDD-based API.
25 | * 26 | * Anomaly detection is done as follows: 27 | * 28 | * - Find the maximal distance within each cluster; these will be the thresholds 29 | * - For a new point, calculate its score (distance); if it is greater than the threshold of its cluster, 30 | * the point is an anomaly 31 | * 32 | * Datasource: https://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data 33 | * Test set: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html (corrected.gz) 34 | * 35 | * @author Axel Fahy 36 | * @author Rudolf Höhn 37 | * @author Brian Nydegger 38 | * @author Assaf Mahmoud 39 | * 40 | * @date 26.05.2017 41 | * 42 | */ 43 | 44 | import java.io.{File, PrintWriter} 45 | import java.text.SimpleDateFormat 46 | import java.util.Calendar 47 | 48 | import org.apache.spark.ml.{Pipeline, PipelineModel} 49 | import org.apache.spark.{SparkConf, SparkContext} 50 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 51 | import org.apache.spark.sql.types._ 52 | import org.apache.spark.ml.clustering._ 53 | import org.apache.spark.ml.feature.{OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler} 54 | import org.apache.spark.ml.linalg.{DenseVector, Vector} 55 | import org.apache.spark.sql.functions._ 56 | 57 | 58 | object NetworkAnomalyDetection { 59 | 60 | val DataPath = "data/kddcup.data.corrected" 61 | val TestPath = "data/test.data.corrected" 62 | 63 | // Fraction of the dataset used (1.0 for the full dataset) 64 | val Fraction = 1.0 65 | 66 | // Schema of data from csv file 67 | // Used when loading the data to have a correct structure 68 | val DataSchema = StructType(Array( 69 | StructField("duration", IntegerType, true), 70 | StructField("protocol_type", StringType, true), 71 | StructField("service", StringType, true), 72 | StructField("flag", StringType, true), 73 | StructField("src_bytes", IntegerType, true), 74 | StructField("dst_bytes", IntegerType, true), 75 | StructField("land", IntegerType, true), 76 | StructField("wrong_fragment", IntegerType, true), 77 | StructField("urgent", IntegerType, true), 78 | StructField("hot", IntegerType, true), 79 | StructField("num_failed_logins", IntegerType, true), 80 | StructField("logged_in", IntegerType, true), 81 | StructField("num_compromised", IntegerType, true), 82 | StructField("root_shell", IntegerType, true), 83 | StructField("su_attempted", IntegerType, true), 84 | StructField("num_root", IntegerType, true), 85 | StructField("num_file_creations", IntegerType, true), 86 | StructField("num_shells", IntegerType, true), 87 | StructField("num_access_files", IntegerType, true), 88 | StructField("num_outbound_cmds", IntegerType, true), 89 | StructField("is_host_login", IntegerType, true), 90 | StructField("is_guest_login", IntegerType, true), 91 | StructField("count", IntegerType, true), 92 | StructField("srv_count", IntegerType, true), 93 | StructField("serror_rate", DoubleType, true), 94 | StructField("srv_serror_rate", DoubleType, true), 95 | StructField("rerror_rate", DoubleType, true), 96 | StructField("srv_rerror_rate", DoubleType, true), 97 | StructField("same_srv_rate", DoubleType, true), 98 | StructField("diff_srv_rate", DoubleType, true), 99 | StructField("srv_diff_host_rate", DoubleType, true), 100 | StructField("dst_host_count", IntegerType, true), 101 | StructField("dst_host_srv_count", IntegerType, true), 102 | StructField("dst_host_same_srv_rate", DoubleType, true), 103 | StructField("dst_host_diff_srv_rate", DoubleType, true), 104 | StructField("dst_host_same_src_port_rate", DoubleType, true), 105 | StructField("dst_host_srv_diff_host_rate",
DoubleType, true), 106 | StructField("dst_host_serror_rate", DoubleType, true), 107 | StructField("dst_host_srv_serror_rate", DoubleType, true), 108 | StructField("dst_host_rerror_rate", DoubleType, true), 109 | StructField("dst_host_srv_rerror_rate", DoubleType, true), 110 | StructField("label", StringType, true))) 111 | 112 | def main(args: Array[String]): Unit = { 113 | // Creation of configuration and session 114 | val conf = new SparkConf() 115 | .setMaster("local") 116 | .setAppName("NetworkAnomalyDetection") 117 | .set("spark.driver.memory", "6g") 118 | 119 | val sc = new SparkContext(conf) 120 | sc.setCheckpointDir("checkpoints/") 121 | 122 | val spark = SparkSession 123 | .builder() 124 | .appName("NetworkAnomalyDetection") 125 | .getOrCreate() 126 | 127 | // Load the data into the schema created previously 128 | val rawDataDF = spark.read.format("com.databricks.spark.csv") 129 | .option("header", "false") 130 | .option("inferSchema", "true") 131 | .schema(DataSchema) 132 | .load(DataPath) 133 | 134 | val dataDF = rawDataDF.sample(false, Fraction, 42) 135 | println("Size of dataset=" + dataDF.count + " (total=" + rawDataDF.count + ")") 136 | val runClustering = new RunClustering(spark, dataDF) 137 | 138 | // K-means 139 | // The simple K-means run also performs anomaly detection. 140 | (20 to 100 by 10).map(k => (k, runClustering.kmeansSimple(k))) 141 | (20 to 100 by 10).map(k => (k, runClustering.kmeansOneHotEncoder(k))) 142 | (20 to 100 by 10).map(k => (k, runClustering.kmeansOneHotEncoderWithNormalization(k))) 143 | 144 | // Bisecting K-means 145 | (20 to 100 by 10).map(k => (k, runClustering.bisectingKmeansOneHotEncoderWithNormalization(k))) 146 | 147 | // Gaussian Mixture 148 | (20 to 100 by 10).map(k => (k, runClustering.gaussianMixtureOneHotEncoderWithNormalization(k))) 149 | } 150 | 151 | class RunClustering(private val spark: SparkSession, var data: DataFrame) { 152 | 153 | import spark.implicits._ 154 | 155 | // Categorical columns, excluded when selecting only numerical features 156 | val CategoricalColumns = Seq("label", "protocol_type", "service", "flag") 157 | 158 | /** 159 | * Calculate the Euclidean distance between a data point and its centroid 160 | * 161 | * @param centroid Vector with the components of the centroid 162 | * @param data Vector with the components of the data point 163 | * @return The distance between the data point and the centroid 164 | */ 165 | def distance(centroid: Vector, data: Vector): Double = 166 | // Transforming the vectors to arrays of doubles since arithmetic operations 167 | // on Vector are not implemented 168 | math.sqrt(centroid.toArray.zip(data.toArray) 169 | .map(p => p._1 - p._2).map(d => d * d).sum) 170 | 171 | /** 172 | * Compute the Euclidean distance between all points belonging to a centroid and the centroid in question 173 | * 174 | * @param centroid Vector with the components of the centroid 175 | * @param dataCentroid All data points (as Vector) belonging to the centroid 176 | * @return An array of doubles containing all the distances of a cluster (data with the same centroid) 177 | */ 178 | def distanceAllCluster(centroid: Vector, dataCentroid: Array[DenseVector]): Array[Double] = { 179 | dataCentroid.map(d => distance(centroid, d)) 180 | } 181 | 182 | /** 183 | * Calculate the clustering score 184 | * 185 | * For each cluster, select the data belonging to its centroid 186 | * and calculate the distances.
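 * As computed below (sketch): score = (1/k) * sum_j ( sum_{x in C_j} ||x - c_j|| / |C_j| ), where C_j is the set of points assigned to centroid c_j and empty clusters contribute 0.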
187 | * 188 | * @param centroids Array containing all the centroids 189 | * @param data Dataset used 190 | * @param k Number of clusters 191 | * @return The mean of the scores from all clusters 192 | */ 193 | def clusteringScore(centroids: Array[Vector], data: DataFrame, k: Int): Double = { 194 | val score = (0 until k).map{ k => 195 | val dataCentroid = data.filter($"prediction" === k) 196 | .select("features") 197 | .collect() 198 | .map { 199 | // Get the feature vectors in dense format 200 | case Row(v: Vector) => v.toDense 201 | } 202 | val s = distanceAllCluster(centroids(k), dataCentroid) 203 | if (s.length > 0) 204 | s.sum / s.length 205 | else 206 | s.sum // Sum will be 0 if no element in cluster 207 | } 208 | if (score.nonEmpty) 209 | score.sum / score.length 210 | else 211 | score.sum 212 | } 213 | 214 | /** 215 | * Get the maximum distance of each centroid 216 | * 217 | * @param centroids Array containing all the centroids 218 | * @param data DataFrame containing the data points 219 | * @param k The number of clusters 220 | * @return A Map with the cluster index as key and its maximum distance as value 221 | */ 222 | def maxByCentroid(centroids: Array[Vector], data: DataFrame, k: Int): Map[Int, Double] = { 223 | val max = (0 until k).map{ k => 224 | val dataCentroid = data.filter($"prediction" === k) 225 | .select("features") 226 | .collect() 227 | .map { 228 | // Get the feature vectors in dense format 229 | case Row(v: Vector) => v.toDense 230 | } 231 | val dist = distanceAllCluster(centroids(k), dataCentroid) 232 | if (dist.isEmpty) { 233 | (k, 0.0) 234 | } 235 | else 236 | (k, dist.max) 237 | } 238 | max.toMap 239 | } 240 | 241 | /** 242 | * Calculate the distance between a point and its centroid 243 | * 244 | * This is a UDF and must be run on a DataFrame. 245 | * Currying is used in order to pass additional parameters. 246 | * 247 | * The columns of the DataFrame to use: "features" and "prediction" 248 | * Uses the prediction column to know to which centroid the point belongs. 249 | * 250 | * @param centroids Centroids 251 | * @return The distance UDF 252 | */ 253 | def calculateDistance(centroids: Array[Vector]) = udf((v: Vector, k: Int) => { 254 | math.sqrt(centroids(k).toArray.zip(v.toArray) 255 | .map(p => p._1 - p._2).map(d => d * d).sum) 256 | }) 257 | 258 | /** 259 | * Check if a point is an anomaly 260 | * 261 | * If the score of a point is higher than the maximum of the cluster 262 | * in which it belongs, it is an anomaly.
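 * For example (hypothetical numbers): if max(3) = 12.7, a point assigned to cluster 3 with dist = 15.2 is flagged (the UDF returns 1), while one with dist = 9.4 is not (returns 0).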
263 | * 264 | * UDF run on the "dist" and "prediction" columns 265 | * 266 | * @param max Map containing the maximal value of each cluster 267 | * @return 1 if the packet is an anomaly, else 0 268 | */ 269 | def checkAnomaly(max: Map[Int, Double]) = udf((distance: Double, k: Int) => if (distance > max(k)) 1 else 0) 270 | 271 | /** 272 | * Get all the anomalies of a test set 273 | * 274 | * @param pipeline The pipeline used for the preprocessing 275 | * @param data The test data 276 | * @param centroids The centroids found on the training data 277 | * @param max Maximal value of each centroid 278 | * @return A DataFrame containing the anomalies 279 | */ 280 | def getAnomalies(pipeline: PipelineModel, data: DataFrame, centroids: Array[Vector], max: Map[Int, Double]) = { 281 | val predictDF = pipeline.transform(data) 282 | 283 | val distanceDF = predictDF.withColumn("dist", calculateDistance(centroids)(predictDF("features"), predictDF("prediction"))).checkpoint() 284 | val anomalies = distanceDF.withColumn("anomaly", checkAnomaly(max)(distanceDF("dist"), distanceDF("prediction"))).checkpoint() 285 | anomalies.filter($"anomaly" > 0) 286 | } 287 | 288 | /** 289 | * Anomaly detection on test set 290 | * 291 | * Get the maximal distance of each cluster and, for each point, check 292 | * whether its distance is higher than this maximum; if so, the point is an anomaly. 293 | * 294 | * @param dataDF Training data 295 | * @param pipelineModel Pipeline model used with the training 296 | * @param k Number of clusters 297 | * @return A DataFrame containing the anomalies 298 | */ 299 | def anomalyDectection(dataDF: DataFrame, pipelineModel: PipelineModel, k: Int): DataFrame = { 300 | // Load the data into the schema created previously 301 | val dataTestDF = spark.read.format("com.databricks.spark.csv") 302 | .option("header", "false") 303 | .option("inferSchema", "true") 304 | .schema(DataSchema) 305 | .load(TestPath) 306 | 307 | val testDF = dataTestDF.drop("label") 308 | testDF.cache() 309 | 310 | // Prediction 311 | val cluster = pipelineModel.transform(dataDF) 312 | 313 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 314 | 315 | // Get the centroids 316 | val centroids = kmeansModel.clusterCenters 317 | 318 | // Get the maximal distance for each cluster (on the training data) 319 | val max = this.maxByCentroid(centroids, cluster, k) 320 | 321 | // Detect anomalies on the test data 322 | val anomalies = getAnomalies(pipelineModel, testDF, centroids, max) 323 | testDF.unpersist() 324 | anomalies 325 | } 326 | 327 | /** 328 | * Write the result of a run into a file 329 | * 330 | * The filename is created dynamically from the current date and the algorithm used.
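 * For instance (illustrative timestamp): write2file(score, t0, "K-means (20) simple") run at 2018-01-01 12:00 would produce results201801011200_K-means_(20)_simple.txt.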
331 | * 332 | * @param score Score already calculated 333 | * @param startTime Start time of the computation 334 | * @param technique String with the name of the algorithm/preprocessing used 335 | */ 336 | def write2file(score: Double, startTime: Long, technique: String): Unit = { 337 | val format = new SimpleDateFormat("yyyyMMddHHmm") 338 | val pw = new PrintWriter(new File("results" + format.format(Calendar.getInstance().getTime) + 339 | "_" + technique.replaceAll(" ", "_") + ".txt")) 340 | try { 341 | println(technique) 342 | pw.write(s"$technique\n") 343 | println(s"Score=$score") 344 | pw.write(s"Score=$score\n") 345 | val duration = (System.nanoTime - startTime) / 1e9d 346 | println(s"Duration=$duration") 347 | pw.write(s"Duration=$duration\n") 348 | } finally { 349 | pw.close() 350 | } 351 | } 352 | 353 | /** 354 | * K-means with only numerical features, without normalization 355 | * 356 | * @param k Number of cluster 357 | */ 358 | def kmeansSimple(k: Int): Unit = { 359 | println(s"Running kmeansSimple ($k)") 360 | val startTime = System.nanoTime() 361 | // Remove the label column 362 | val dataDF = this.data.drop("label") 363 | dataDF.cache() 364 | val numericalColumns = dataDF.columns.diff(CategoricalColumns) 365 | 366 | // Creation of vector with features 367 | val assembler = new VectorAssembler() 368 | .setInputCols(numericalColumns) 369 | .setOutputCol("features") 370 | 371 | val kmeans = new KMeans() 372 | .setK(k) 373 | .setFeaturesCol("features") 374 | .setPredictionCol("prediction") 375 | .setSeed(1L) 376 | 377 | val pipeline = new Pipeline() 378 | .setStages(Array(assembler, kmeans)) 379 | 380 | val pipelineModel = pipeline.fit(dataDF) 381 | 382 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 383 | 384 | // Prediction 385 | val cluster = pipelineModel.transform(dataDF) 386 | 387 | // Get the centroids 388 | val centroids = kmeansModel.clusterCenters 389 | 390 | // Calculate the score 391 | val score = this.clusteringScore(centroids, cluster, k) 392 | 393 | this.write2file(score, startTime, "K-means (" + k + ") simple") 394 | 395 | // Anomaly detection 396 | val anomalies = this.anomalyDectection(dataDF, pipelineModel, k) 397 | // Save results to json file 398 | val format = new SimpleDateFormat("yyyyMMddHHmm") 399 | Thread.sleep(1000) 400 | anomalies.write.json("anomalies_" + format.format(Calendar.getInstance().getTime) + "_" + k + ".json") 401 | dataDF.unpersist() 402 | } 403 | 404 | /** 405 | * K-means using categorical features, without normalization 406 | * 407 | * Categorical features are encoded using the One-Hot encoder. 
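 * For example, protocol_type takes the values tcp, udp and icmp; StringIndexer first maps them to numeric indices (by descending frequency) and OneHotEncoder then turns each index into a 3-component binary vector such as (1.0, 0.0, 0.0) (illustrative; the actual index of each value depends on the data).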
408 | * 409 | * @param k Number of cluster 410 | */ 411 | def kmeansOneHotEncoder(k: Int): Unit = { 412 | println(s"Running kmeansOneHotEncoder ($k)") 413 | val startTime = System.nanoTime() 414 | // Remove the label column 415 | val dataDF = this.data.drop("label") 416 | dataDF.cache() 417 | 418 | // Indexing categorical columns 419 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 420 | c => new StringIndexer() 421 | .setInputCol(c) 422 | .setOutputCol(s"${c}_index") 423 | ).toArray 424 | 425 | // Encoding previously indexed columns 426 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 427 | c => new OneHotEncoder() 428 | .setInputCol(s"${c}_index") 429 | .setOutputCol(s"${c}_vec") 430 | .setDropLast(false) 431 | ).toArray 432 | 433 | // Creation of list of columns for vector assembler (with only numerical columns) 434 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 435 | 436 | // Creation of vector with features 437 | val assembler = new VectorAssembler() 438 | .setInputCols(assemblerColumns) 439 | .setOutputCol("features") 440 | 441 | val kmeans = new KMeans() 442 | .setK(k) 443 | .setFeaturesCol("features") 444 | .setPredictionCol("prediction") 445 | .setSeed(1L) 446 | 447 | val pipeline = new Pipeline() 448 | .setStages(indexer ++ encoder ++ Array(assembler, kmeans)) 449 | 450 | val pipelineModel = pipeline.fit(dataDF) 451 | 452 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 453 | 454 | // Prediction 455 | val cluster = pipelineModel.transform(dataDF) 456 | dataDF.unpersist() 457 | 458 | // Get the centroids 459 | val centroids = kmeansModel.clusterCenters 460 | 461 | // Calculate the score 462 | val score = this.clusteringScore(centroids, cluster, k) 463 | 464 | this.write2file(score, startTime, "K-means (" + k + ") with one-hot encoder") 465 | } 466 | 467 | /** 468 | * K-means using categorical features, with normalization 469 | * 470 | * Categorical features are encoded using the One-hot encoder. 471 | * One-hot encoder will map a column of label indices to a column of binary vectors. 
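 * (Since setDropLast(false) is used below, each encoded vector keeps one component per category instead of dropping the last one.)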
472 | * Normalization is done using the standard deviation. 473 | * 474 | * @param k Number of clusters 475 | */ 476 | def kmeansOneHotEncoderWithNormalization(k: Int): Unit = { 477 | println(s"Running kmeansOneHotEncoderWithNormalization ($k)") 478 | val startTime = System.nanoTime() 479 | // Remove the label column 480 | val dataDF = this.data.drop("label") 481 | dataDF.cache() 482 | 483 | // Indexing categorical columns 484 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 485 | c => new StringIndexer() 486 | .setInputCol(c) 487 | .setOutputCol(s"${c}_index") 488 | ).toArray 489 | 490 | // Encoding previously indexed columns 491 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 492 | c => new OneHotEncoder() 493 | .setInputCol(s"${c}_index") 494 | .setOutputCol(s"${c}_vec") 495 | .setDropLast(false) 496 | ).toArray 497 | 498 | // Creation of list of columns for vector assembler (with only numerical columns) 499 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 500 | 501 | // Creation of vector with features 502 | val assembler = new VectorAssembler() 503 | .setInputCols(assemblerColumns) 504 | .setOutputCol("featuresVector") 505 | 506 | // Normalization using standard deviation 507 | val scaler = new StandardScaler() 508 | .setInputCol("featuresVector") 509 | .setOutputCol("features") 510 | .setWithStd(true) 511 | .setWithMean(false) 512 | 513 | val kmeans = new KMeans() 514 | .setK(k) 515 | .setFeaturesCol("features") 516 | .setPredictionCol("prediction") 517 | .setSeed(1L) 518 | 519 | val pipeline = new Pipeline() 520 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, kmeans)) 521 | 522 | val pipelineModel = pipeline.fit(dataDF) 523 | 524 | // Prediction 525 | val cluster = pipelineModel.transform(dataDF) 526 | dataDF.unpersist() 527 | 528 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 529 | 530 | // Get the centroids 531 | val centroids = kmeansModel.clusterCenters 532 | 533 | // Calculate the score 534 | val score = this.clusteringScore(centroids, cluster, k) 535 | 536 | this.write2file(score, startTime, "K-means (" + k + ") with one-hot encoder with normalization") 537 | } 538 | 539 | /** 540 | * Bisecting K-means using categorical features, with normalization 541 | * 542 | * With Bisecting K-means, all observations start in one cluster 543 | * and splits are performed recursively in a "top-down" approach. 544 | * 545 | * Categorical features are encoded using the One-hot encoder. 546 | * One-hot encoder will map a column of label indices to a column of binary vectors.
547 | * Normalization is done using the standard deviation. 548 | * 549 | * @param k Number of clusters 550 | */ 551 | def bisectingKmeansOneHotEncoderWithNormalization(k: Int): Unit = { 552 | println(s"Running bisectingKmeansOneHotEncoderWithNormalization ($k)") 553 | val startTime = System.nanoTime() 554 | // Remove the label column 555 | val dataDF = this.data.drop("label") 556 | dataDF.cache() 557 | 558 | // Indexing categorical columns 559 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 560 | c => new StringIndexer() 561 | .setInputCol(c) 562 | .setOutputCol(s"${c}_index") 563 | ).toArray 564 | 565 | // Encoding previously indexed columns 566 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 567 | c => new OneHotEncoder() 568 | .setInputCol(s"${c}_index") 569 | .setOutputCol(s"${c}_vec") 570 | .setDropLast(false) 571 | ).toArray 572 | 573 | // Creation of list of columns for vector assembler (with only numerical columns) 574 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 575 | 576 | // Creation of vector with features 577 | val assembler = new VectorAssembler() 578 | .setInputCols(assemblerColumns) 579 | .setOutputCol("featuresVector") 580 | 581 | // Normalization using standard deviation 582 | val scaler = new StandardScaler() 583 | .setInputCol("featuresVector") 584 | .setOutputCol("features") 585 | .setWithStd(true) 586 | .setWithMean(false) 587 | 588 | val kmeans = new BisectingKMeans() 589 | .setK(k) 590 | .setFeaturesCol("features") 591 | .setPredictionCol("prediction") 592 | .setSeed(1L) 593 | 594 | val pipeline = new Pipeline() 595 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, kmeans)) 596 | 597 | val pipelineModel = pipeline.fit(dataDF) 598 | 599 | // Prediction 600 | val cluster = pipelineModel.transform(dataDF) 601 | dataDF.unpersist() 602 | 603 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[BisectingKMeansModel] 604 | 605 | // Get the centroids 606 | val centroids = kmeansModel.clusterCenters 607 | 608 | // Calculate the score 609 | val score = this.clusteringScore(centroids, cluster, k) 610 | 611 | this.write2file(score, startTime, "Bisecting K-means (" + k + ") with one-hot encoder with normalization") 612 | } 613 | 614 | /** 615 | * Gaussian Mixture Model 616 | * 617 | * Categorical features are encoded using the One-hot encoder. 618 | * One-hot encoder will map a column of label indices to a column of binary vectors. 619 | * Normalization is done using the standard deviation. 620 | * 621 | * GMM uses a quadratic algorithm and consequently takes a very long time to run. 622 | * This algorithm will only be used on 1% of the dataset.
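 * (For the clustering score, the means of the fitted Gaussian components are used as the "centroids"; see the extraction from GaussianMixtureModel below.)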
623 | * 624 | * @param k Number of cluster 625 | */ 626 | def gaussianMixtureOneHotEncoderWithNormalization(k: Int): Unit = { 627 | println(s"Running gaussianMixtureOneHotEncoderWithNormalization ($k)") 628 | val startTime = System.nanoTime() 629 | // Remove the label column 630 | val dataDF = this.data.drop("label") 631 | dataDF.cache() 632 | 633 | // Indexing categorical columns 634 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 635 | c => new StringIndexer() 636 | .setInputCol(c) 637 | .setOutputCol(s"${c}_index") 638 | ).toArray 639 | 640 | // Encoding previously indexed columns 641 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 642 | c => new OneHotEncoder() 643 | .setInputCol(s"${c}_index") 644 | .setOutputCol(s"${c}_vec") 645 | .setDropLast(false) 646 | ).toArray 647 | 648 | // Creation of list of columns for vector assembler (with only numerical columns) 649 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 650 | 651 | // Creation of vector with features 652 | val assembler = new VectorAssembler() 653 | .setInputCols(assemblerColumns) 654 | .setOutputCol("featuresVector") 655 | 656 | // Normalization using standard deviation 657 | val scaler = new StandardScaler() 658 | .setInputCol("featuresVector") 659 | .setOutputCol("features") 660 | .setWithStd(true) 661 | .setWithMean(false) 662 | 663 | val gaussianMixture = new GaussianMixture() 664 | .setK(k) 665 | .setFeaturesCol("features") 666 | .setPredictionCol("prediction") 667 | .setSeed(1L) 668 | 669 | val pipeline = new Pipeline() 670 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, gaussianMixture)) 671 | 672 | val pipelineModel = pipeline.fit(dataDF) 673 | 674 | val gmm = pipelineModel.stages.last.asInstanceOf[GaussianMixtureModel] 675 | 676 | // Prediction 677 | val cluster = pipelineModel.transform(dataDF) 678 | dataDF.unpersist() 679 | 680 | // Get the centroids 681 | val centroids = (0 until k).map(i => gmm.gaussians(i).mean).toArray 682 | 683 | // Calculate the score 684 | val score = this.clusteringScore(centroids, cluster, k) 685 | 686 | this.write2file(score, startTime, "GaussianMixture (" + k + ") with one-hot encoder with normalization") 687 | } 688 | } 689 | } 690 | --------------------------------------------------------------------------------
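Usage sketch (not part of the repository): assuming Spark is on the classpath and kmeansSimple above has written an anomalies JSON file, the flagged packets can be loaded back and summarized per cluster roughly as follows. The file name and object name below are illustrative, not defined anywhere in the project.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{count, lit, max}

object InspectAnomalies {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("InspectAnomalies")
      .getOrCreate()

    // kmeansSimple writes anomalies_<yyyyMMddHHmm>_<k>.json; this path is a placeholder
    val anomalies = spark.read.json("anomalies_201801011200_20.json")

    // Number of flagged points per cluster and the largest distance observed in each
    anomalies.groupBy("prediction")
      .agg(count(lit(1)).as("n_anomalies"), max("dist").as("max_dist"))
      .orderBy("prediction")
      .show()

    spark.stop()
  }
}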