├── results ├── plots │ ├── kmeans_simple.png │ ├── kmeans_one_hot_encoder.png │ ├── kmeans_one_hot_encoder_with_normalization.png │ ├── bisecting_kmeans_one_hot_encoder_with_normalization.png │ └── gaussian_mixture_one_hot_encoder_with_normalization.png ├── results201705271102_K-means_(20)_simple.txt ├── results201705271115_K-means_(30)_simple.txt ├── results201705271130_K-means_(40)_simple.txt ├── results201705271151_K-means_(50)_simple.txt ├── results201705271215_K-means_(60)_simple.txt ├── results201705271243_K-means_(70)_simple.txt ├── results201705271312_K-means_(80)_simple.txt ├── results201705271342_K-means_(90)_simple.txt ├── results201705271422_K-means_(100)_simple.txt ├── plots_axel │ ├── kmeans_simple.png │ ├── kmeans_one_hot_encoder.png │ ├── kmeans_one_hot_encoder_with_normalization.png │ ├── bisecting_kmeans_one_hot_encoder_with_normalization.png │ └── gaussian_mixture_one_hot_encoder_with_normalization.png ├── kmeans_simple │ ├── results201705301051_K-means_(20)_simple.txt │ ├── results201705301100_K-means_(30)_simple.txt │ ├── results201705301112_K-means_(40)_simple.txt │ ├── results201705301127_K-means_(50)_simple.txt │ ├── results201705301145_K-means_(60)_simple.txt │ ├── results201705301206_K-means_(70)_simple.txt │ ├── results201705301230_K-means_(80)_simple.txt │ ├── results201705301257_K-means_(90)_simple.txt │ └── results201705302321_K-means_(100)_simple.txt ├── results201705271431_K-means_(20)_with_one-hot_encoder.txt ├── results201705271444_K-means_(30)_with_one-hot_encoder.txt ├── results201705271500_K-means_(40)_with_one-hot_encoder.txt ├── results201705271519_K-means_(50)_with_one-hot_encoder.txt ├── results201705271541_K-means_(60)_with_one-hot_encoder.txt ├── results201705271607_K-means_(70)_with_one-hot_encoder.txt ├── results201705271637_K-means_(80)_with_one-hot_encoder.txt ├── results201705271710_K-means_(90)_with_one-hot_encoder.txt ├── results201705271747_K-means_(100)_with_one-hot_encoder.txt ├── kmeans_one_hot_encoder │ ├── results201705271231_K-means_(20)_with_one-hot_encoder.txt │ ├── results201705271241_K-means_(30)_with_one-hot_encoder.txt │ ├── results201705271255_K-means_(40)_with_one-hot_encoder.txt │ ├── results201705271312_K-means_(50)_with_one-hot_encoder.txt │ ├── results201705271333_K-means_(60)_with_one-hot_encoder.txt │ ├── results201705271358_K-means_(70)_with_one-hot_encoder.txt │ ├── results201705271426_K-means_(80)_with_one-hot_encoder.txt │ ├── results201705271458_K-means_(90)_with_one-hot_encoder.txt │ └── results201705271533_K-means_(100)_with_one-hot_encoder.txt ├── results201705271755_K-means_(20)_with_one-hot_encoder_with_normalization.txt ├── results201705271807_K-means_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705271822_K-means_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705271841_K-means_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705271904_K-means_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705271930_K-means_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705272000_K-means_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705272033_K-means_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705272111_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── results201705280046_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt ├── results201705272119_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt ├── 
results201705272131_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705272147_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705272206_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705272229_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705272255_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705272325_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705272357_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705280108_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt ├── results201705280141_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt ├── results201705280229_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt ├── results201705280333_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt ├── results201705280456_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt ├── results201705280710_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt ├── results201705280915_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt ├── results201705281152_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt ├── results201705280033_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── kmeans_one_hot_encoder_with_normalization │ ├── results201705310454_K-means_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310512_K-means_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310535_K-means_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310605_K-means_(50)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310641_K-means_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310722_K-means_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310809_K-means_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201705310902_K-means_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201705311001_K-means_(100)_with_one-hot_encoder_with_normalization.txt ├── bisecting_kmeans_one_hot_encoder_with_normalization │ ├── results201705311014_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311245_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311033_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311057_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311128_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311204_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311333_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311426_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201705311525_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt └── gaussian_mixture_one_hot_encoder_with_normalization │ ├── results201705311543_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311613_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311659_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt │ ├── results201705311806_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt │ ├── 
results201705311937_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt │ ├── results201705312134_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt │ ├── results201706010002_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt │ ├── results201706010316_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt │ └── results201706010649_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt ├── LICENSE ├── README.md ├── plotScore.py ├── .gitignore └── src └── NetworkAnomalyDetection.scala /results/plots/kmeans_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_simple.png -------------------------------------------------------------------------------- /results/results201705271102_K-means_(20)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (20) simple 2 | Score=3411.9003628488085 3 | Duration=494.53377111 4 | -------------------------------------------------------------------------------- /results/results201705271115_K-means_(30)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (30) simple 2 | Score=1604.3522840254057 3 | Duration=743.344817602 4 | -------------------------------------------------------------------------------- /results/results201705271130_K-means_(40)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (40) simple 2 | Score=918.3628605831424 3 | Duration=929.610309607 4 | -------------------------------------------------------------------------------- /results/results201705271151_K-means_(50)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (50) simple 2 | Score=606.3685420150439 3 | Duration=1218.861908538 4 | -------------------------------------------------------------------------------- /results/results201705271215_K-means_(60)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (60) simple 2 | Score=507.20011532973143 3 | Duration=1443.970069177 4 | -------------------------------------------------------------------------------- /results/results201705271243_K-means_(70)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (70) simple 2 | Score=466.19325697272177 3 | Duration=1681.703710902 4 | -------------------------------------------------------------------------------- /results/results201705271312_K-means_(80)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (80) simple 2 | Score=402.7002490798157 3 | Duration=1780.824890324 4 | -------------------------------------------------------------------------------- /results/results201705271342_K-means_(90)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (90) simple 2 | Score=538.7185884007569 3 | Duration=1774.006552106 4 | -------------------------------------------------------------------------------- /results/results201705271422_K-means_(100)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (100) simple 2 | Score=506.4350529644009 3 | Duration=2395.402772907 4 | -------------------------------------------------------------------------------- 
/results/plots_axel/kmeans_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_simple.png -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301051_K-means_(20)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (20) simple 2 | Score=795028.6677444499 3 | Duration=381.519614201 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301100_K-means_(30)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (30) simple 2 | Score=374962.2418958273 3 | Duration=538.635203758 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301112_K-means_(40)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (40) simple 2 | Score=259926.9568822872 3 | Duration=711.311292832 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301127_K-means_(50)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (50) simple 2 | Score=183540.68086804406 3 | Duration=899.458520806 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301145_K-means_(60)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (60) simple 2 | Score=141014.08830478703 3 | Duration=1083.386044166 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301206_K-means_(70)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (70) simple 2 | Score=118370.73907939174 3 | Duration=1269.059388628 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301230_K-means_(80)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (80) simple 2 | Score=44144.134731446975 3 | Duration=1436.104775358 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705301257_K-means_(90)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (90) simple 2 | Score=36740.48338868395 3 | Duration=1635.510372384 4 | -------------------------------------------------------------------------------- /results/kmeans_simple/results201705302321_K-means_(100)_simple.txt: -------------------------------------------------------------------------------- 1 | K-means (100) simple 2 | Score=30895.80296446874 3 | Duration=2813.514058399 4 | -------------------------------------------------------------------------------- /results/plots/kmeans_one_hot_encoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_one_hot_encoder.png -------------------------------------------------------------------------------- /results/plots_axel/kmeans_one_hot_encoder.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_one_hot_encoder.png -------------------------------------------------------------------------------- /results/results201705271431_K-means_(20)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder 2 | Score=3411.9004558128668 3 | Duration=521.15065171 4 | -------------------------------------------------------------------------------- /results/results201705271444_K-means_(30)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder 2 | Score=1604.3523924730237 3 | Duration=827.13024721 4 | -------------------------------------------------------------------------------- /results/results201705271500_K-means_(40)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder 2 | Score=918.3629950314147 3 | Duration=914.994794978 4 | -------------------------------------------------------------------------------- /results/results201705271519_K-means_(50)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder 2 | Score=606.3686734230612 3 | Duration=1160.955209065 4 | -------------------------------------------------------------------------------- /results/results201705271541_K-means_(60)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder 2 | Score=507.2003489342253 3 | Duration=1332.972756026 4 | -------------------------------------------------------------------------------- /results/results201705271607_K-means_(70)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder 2 | Score=466.1934081517335 3 | Duration=1559.376549068 4 | -------------------------------------------------------------------------------- /results/results201705271637_K-means_(80)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder 2 | Score=402.7003742837811 3 | Duration=1774.606822868 4 | -------------------------------------------------------------------------------- /results/results201705271710_K-means_(90)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder 2 | Score=538.7186344059103 3 | Duration=1999.400565217 4 | -------------------------------------------------------------------------------- /results/results201705271747_K-means_(100)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder 2 | Score=506.4354273607242 3 | Duration=2232.683974512 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271231_K-means_(20)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder 2 | Score=795029.8476020518 3 | Duration=427.630177442 4 | 
-------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271241_K-means_(30)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder 2 | Score=374963.2057284806 3 | Duration=628.492676961 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271255_K-means_(40)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder 2 | Score=259927.68903096477 3 | Duration=830.547506794 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271312_K-means_(50)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder 2 | Score=183541.23454720704 3 | Duration=1039.745908218 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271333_K-means_(60)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder 2 | Score=141014.55503504092 3 | Duration=1253.997536722 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271358_K-means_(70)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder 2 | Score=118371.13932829347 3 | Duration=1469.034651508 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271426_K-means_(80)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder 2 | Score=44144.45446175979 3 | Duration=1682.14382306 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271458_K-means_(90)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder 2 | Score=36740.76732934029 3 | Duration=1905.240331292 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder/results201705271533_K-means_(100)_with_one-hot_encoder.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder 2 | Score=30896.059641908327 3 | Duration=2121.976332771 4 | -------------------------------------------------------------------------------- /results/plots/kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots_axel/kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/kmeans_one_hot_encoder_with_normalization.png 
-------------------------------------------------------------------------------- /results/results201705271755_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder with normalization 2 | Score=5.512477052689253 3 | Duration=467.247128525 4 | -------------------------------------------------------------------------------- /results/results201705271807_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder with normalization 2 | Score=8.346482286155416 3 | Duration=708.824423426 4 | -------------------------------------------------------------------------------- /results/results201705271822_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder with normalization 2 | Score=6.45572592987493 3 | Duration=907.843712033 4 | -------------------------------------------------------------------------------- /results/results201705271841_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder with normalization 2 | Score=5.923645877855369 3 | Duration=1125.301766206 4 | -------------------------------------------------------------------------------- /results/results201705271904_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder with normalization 2 | Score=6.154421118629562 3 | Duration=1378.721376253 4 | -------------------------------------------------------------------------------- /results/results201705271930_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder with normalization 2 | Score=4.756622128870819 3 | Duration=1586.597461245 4 | -------------------------------------------------------------------------------- /results/results201705272000_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder with normalization 2 | Score=4.448525376058372 3 | Duration=1776.086742568 4 | -------------------------------------------------------------------------------- /results/results201705272033_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder with normalization 2 | Score=4.407901770147693 3 | Duration=2011.189166835 4 | -------------------------------------------------------------------------------- /results/results201705272111_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder with normalization 2 | Score=3.581636032875701 3 | Duration=2243.538546857 4 | -------------------------------------------------------------------------------- /results/plots/bisecting_kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/bisecting_kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots/gaussian_mixture_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots/gaussian_mixture_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/results201705280046_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (20) with one-hot encoder with normalization 2 | Score=0.2932092377554215 3 | Duration=743.056496329 4 | -------------------------------------------------------------------------------- /results/plots_axel/bisecting_kmeans_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/bisecting_kmeans_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/plots_axel/gaussian_mixture_one_hot_encoder_with_normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axelfahy/NetworkAnomalyDetection/HEAD/results/plots_axel/gaussian_mixture_one_hot_encoder_with_normalization.png -------------------------------------------------------------------------------- /results/results201705272119_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (20) with one-hot encoder with normalization 2 | Score=8.857494697352449 3 | Duration=506.177613531 4 | -------------------------------------------------------------------------------- /results/results201705272131_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (30) with one-hot encoder with normalization 2 | Score=14.904728795235828 3 | Duration=715.761176323 4 | -------------------------------------------------------------------------------- /results/results201705272147_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (40) with one-hot encoder with normalization 2 | Score=13.691284625809638 3 | Duration=934.888378974 4 | -------------------------------------------------------------------------------- /results/results201705272206_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (50) with one-hot encoder with normalization 2 | Score=16.717111733929226 3 | Duration=1173.524747163 4 | -------------------------------------------------------------------------------- /results/results201705272229_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (60) with one-hot encoder with normalization 2 | 
Score=14.776863383877734 3 | Duration=1361.131594462 4 | -------------------------------------------------------------------------------- /results/results201705272255_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (70) with one-hot encoder with normalization 2 | Score=13.083036390239007 3 | Duration=1567.085641769 4 | -------------------------------------------------------------------------------- /results/results201705272325_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (80) with one-hot encoder with normalization 2 | Score=13.941691367677404 3 | Duration=1778.031726294 4 | -------------------------------------------------------------------------------- /results/results201705272357_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (90) with one-hot encoder with normalization 2 | Score=13.962600986212275 3 | Duration=1962.443723704 4 | -------------------------------------------------------------------------------- /results/results201705280108_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (30) with one-hot encoder with normalization 2 | Score=0.19547282517028208 3 | Duration=1315.244916256 4 | -------------------------------------------------------------------------------- /results/results201705280141_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (40) with one-hot encoder with normalization 2 | Score=0.14660461887771076 3 | Duration=2026.049716333 4 | -------------------------------------------------------------------------------- /results/results201705280229_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (50) with one-hot encoder with normalization 2 | Score=0.11728369510216834 3 | Duration=2873.354172106 4 | -------------------------------------------------------------------------------- /results/results201705280333_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (60) with one-hot encoder with normalization 2 | Score=0.09773641258514104 3 | Duration=3836.495340339 4 | -------------------------------------------------------------------------------- /results/results201705280456_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (70) with one-hot encoder with normalization 2 | Score=0.08377406793012049 3 | Duration=4934.065542179 4 | -------------------------------------------------------------------------------- /results/results201705280710_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (80) with one-hot encoder with normalization 2 | Score=0.07330230943885538 3 | Duration=6054.620509646 4 | 
-------------------------------------------------------------------------------- /results/results201705280915_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (90) with one-hot encoder with normalization 2 | Score=0.1348575655060487 3 | Duration=7445.378367397 4 | -------------------------------------------------------------------------------- /results/results201705281152_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (100) with one-hot encoder with normalization 2 | Score=0.05864184755108417 3 | Duration=9452.060132619 4 | -------------------------------------------------------------------------------- /results/results201705280033_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (100) with one-hot encoder with normalization 2 | Score=14.48039254193131 3 | Duration=2165.318778686 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310454_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (20) with one-hot encoder with normalization 2 | Score=4562.023772066879 3 | Duration=739.491825297 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310512_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (30) with one-hot encoder with normalization 2 | Score=2523.904568574377 3 | Duration=1080.107399075 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310535_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (40) with one-hot encoder with normalization 2 | Score=2961.8241047913543 3 | Duration=1429.234178146 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310605_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (50) with one-hot encoder with normalization 2 | Score=1014.2989684919021 3 | Duration=1775.269853862 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310641_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (60) with one-hot encoder with normalization 2 | Score=760.2434287550872 3 | Duration=2139.817159987 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310722_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (70) with one-hot encoder with normalization 2 | 
Score=487.2500268778508 3 | Duration=2480.196751522 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310809_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (80) with one-hot encoder with normalization 2 | Score=365.8395041984696 3 | Duration=2843.095978621 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705310902_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (90) with one-hot encoder with normalization 2 | Score=252.89070654255804 3 | Duration=3175.465465776 4 | -------------------------------------------------------------------------------- /results/kmeans_one_hot_encoder_with_normalization/results201705311001_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | K-means (100) with one-hot encoder with normalization 2 | Score=197.126534840126 3 | Duration=3516.101870375 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311014_Bisecting_K-means_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (20) with one-hot encoder with normalization 2 | Score=3520.795209404406 3 | Duration=793.596314017 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311245_Bisecting_K-means_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (70) with one-hot encoder with normalization 2 | Score=684.7502042309432 3 | Duration=2509.28944881 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311543_GaussianMixture_(20)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (20) with one-hot encoder with normalization 2 | Score=12209.316516395684 3 | Duration=1063.475780708 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311613_GaussianMixture_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (30) with one-hot encoder with normalization 2 | Score=8139.544344265546 3 | Duration=1814.226984862 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311659_GaussianMixture_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (40) with one-hot encoder with normalization 2 | Score=6104.658258199119 3 | Duration=2788.677829532 4 | -------------------------------------------------------------------------------- 
/results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311806_GaussianMixture_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (50) with one-hot encoder with normalization 2 | Score=4883.72660655933 3 | Duration=4007.448348552 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705311937_GaussianMixture_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (60) with one-hot encoder with normalization 2 | Score=4069.772172132773 3 | Duration=5435.724346663 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201705312134_GaussianMixture_(70)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (70) with one-hot encoder with normalization 2 | Score=3488.3761475423494 3 | Duration=7058.942404606 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010002_GaussianMixture_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (80) with one-hot encoder with normalization 2 | Score=3052.3291290995594 3 | Duration=8891.728532267 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010316_GaussianMixture_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (90) with one-hot encoder with normalization 2 | Score=2713.181448087896 3 | Duration=11625.973742715 4 | -------------------------------------------------------------------------------- /results/gaussian_mixture_one_hot_encoder_with_normalization/results201706010649_GaussianMixture_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | GaussianMixture (100) with one-hot encoder with normalization 2 | Score=2441.863303279665 3 | Duration=12777.292420437 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311033_Bisecting_K-means_(30)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (30) with one-hot encoder with normalization 2 | Score=2103.692239687063 3 | Duration=1120.416655349 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311057_Bisecting_K-means_(40)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (40) with one-hot encoder with normalization 2 | Score=1418.1541171113377 3 | Duration=1475.098686214 4 | -------------------------------------------------------------------------------- 
/results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311128_Bisecting_K-means_(50)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (50) with one-hot encoder with normalization 2 | Score=1062.8226643714606 3 | Duration=1815.145936027 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311204_Bisecting_K-means_(60)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (60) with one-hot encoder with normalization 2 | Score=836.3342938470117 3 | Duration=2154.915725311 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311333_Bisecting_K-means_(80)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (80) with one-hot encoder with normalization 2 | Score=575.9102565141137 3 | Duration=2842.771511482 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311426_Bisecting_K-means_(90)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (90) with one-hot encoder with normalization 2 | Score=493.21041806599266 3 | Duration=3196.980530802 4 | -------------------------------------------------------------------------------- /results/bisecting_kmeans_one_hot_encoder_with_normalization/results201705311525_Bisecting_K-means_(100)_with_one-hot_encoder_with_normalization.txt: -------------------------------------------------------------------------------- 1 | Bisecting K-means (100) with one-hot encoder with normalization 2 | Score=448.4686151150904 3 | Duration=3524.550971894 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Axel Fahy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NetworkAnomalyDetection 2 | 3 | Anomaly detection in network traffic using different clustering algorithms. 4 | 5 | Data must be located in the *data* folder. Due to its size, the dataset is not included in the repository (the *data* folder is git-ignored). 6 | 7 | The full dataset is available at https://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data 8 | 9 | The program can be run using IntelliJ or with sbt and spark-submit: 10 | 11 | ```$ sbt package``` 12 | 13 | ```$ spark-submit --class "NetworkAnomalyDetection" --driver-memory 6g target/scala-2.11/networkanomalydetection_2.11-0.1.jar``` 14 | 15 | This project has been developed using DataFrames from Spark MLlib. 16 | 17 | 18 | ## Data 19 | 20 | The dataset used for the training phase contains 4,898,431 packets and is 743 MB in size. Here is a preview of the data: 21 | 22 | ```0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal.``` 23 | 24 | The data come from a competition held in 1999 by the KDD Cup organization (an organization bringing together professional data miners). The goal of the competition was to build a model capable of detecting anomalies in network traffic. The packets were gathered from a US military network. 25 | 26 | ## Preprocessing 27 | 28 | - **Numerical features**: at first, only the numerical features were used. 29 | 30 | - **Categorical features**: then, categorical features were encoded using a one-hot encoder. 31 | 32 | - **Normalization**: finally, the feature vectors were normalized using the standard deviation of each feature. 33 | 34 | ## Algorithms 35 | 36 | - **K-means**: the center of a cluster is called a centroid. At the first iteration, K centroids are chosen randomly. Then, at each iteration, data points are assigned to their closest centroid and each centroid becomes the mean of the points of its cluster. 37 | 38 | - **Bisecting K-means**: same approach as K-means, except that it starts with a single cluster containing all data points. Then, at each iteration, a cluster is split in two using K-means until K clusters are obtained. 39 | 40 | - **Gaussian Mixture (GMM)**: the goal is to maximise the log-likelihood. The algorithm iterates until a convergence threshold or a maximum number of iterations is reached. The algorithm converges, but not necessarily to the global optimum. 41 | 42 | ## Evaluation 43 | 44 | To evaluate all models with the same technique, an evaluation function was created. The score is the sum of the squared Euclidean distances between each point and the centroid of its cluster (a sketch of the pipeline and of this scoring function is given below).
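As an illustration of the preprocessing and clustering steps described above, here is a minimal Spark ML sketch. It is not the project's actual `NetworkAnomalyDetection.scala`: the helper object, the `numericCols` parameter and the intermediate column names are illustrative assumptions, and it targets the Spark 2.x API (with `OneHotEncoder`) that the project imports.

```scala
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler}

object PipelineSketch {
  // Categorical columns of the KDD Cup 1999 dataset
  val categoricalCols = Seq("protocol_type", "service", "flag")

  // One-hot encoding of the categorical features, scaling by standard deviation, then K-means
  def buildPipeline(numericCols: Seq[String], k: Int): Pipeline = {
    val indexers = categoricalCols.map(c =>
      new StringIndexer().setInputCol(c).setOutputCol(s"${c}_idx"))
    val encoders = categoricalCols.map(c =>
      new OneHotEncoder().setInputCol(s"${c}_idx").setOutputCol(s"${c}_vec"))

    // Concatenate the numeric columns and the encoded vectors into a single feature vector
    val assembler = new VectorAssembler()
      .setInputCols((numericCols ++ categoricalCols.map(c => s"${c}_vec")).toArray)
      .setOutputCol("rawFeatures")

    // Normalize each dimension by its standard deviation (no mean centering)
    val scaler = new StandardScaler()
      .setInputCol("rawFeatures")
      .setOutputCol("features")
      .setWithStd(true)
      .setWithMean(false)

    val kmeans = new KMeans()
      .setK(k)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")

    val stages: Seq[PipelineStage] = indexers ++ encoders ++ Seq(assembler, scaler, kmeans)
    new Pipeline().setStages(stages.toArray)
  }
}
```

A pipeline built this way would be fitted with something like `buildPipeline(numericCols, k = 20).fit(trainingData)`, mirroring the values of K (20 to 100) used in the results above.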
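The evaluation function mentioned above could take the following shape. This is again a sketch under assumptions: it expects the DataFrame produced by the fitted pipeline, with its `features` and `prediction` columns, and the centroids of the fitted K-means model.

```scala
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.DataFrame

object ScoreSketch {
  // Sum of squared Euclidean distances between each point and the centroid of its cluster.
  // `predictions` is the output of pipelineModel.transform(data);
  // `centroids` comes from the fitted model (e.g. KMeansModel.clusterCenters).
  def score(predictions: DataFrame, centroids: Array[Vector]): Double =
    predictions.select("prediction", "features").rdd.map { row =>
      val cluster = row.getInt(0)
      val point = row.getAs[Vector](1)
      Vectors.sqdist(point, centroids(cluster))
    }.sum()
}
```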
45 | 46 | -------------------------------------------------------------------------------- /plotScore.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import matplotlib.mlab as mlab 4 | 5 | from os import listdir 6 | from os.path import isfile, join 7 | import plotly.plotly as py 8 | 9 | listFiles = [] 10 | scoreNumber = [] 11 | y2 = [] 12 | 13 | #Plot scores for K-means Simple 14 | def kmeanSimple(scores): 15 | 16 | for score in scores: 17 | y2.append(score) 18 | 19 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 20 | plt.bar(x2, y2, label='Score', color='blue') 21 | plt.xlabel('K') 22 | plt.ylabel('Scores') 23 | plt.title('K-means simple') 24 | plt.legend() 25 | plt.show() 26 | 27 | #Plot score for K-means One Hot Encoder 28 | def kmeanOneHotEncoder(scores): 29 | 30 | for score in scores: 31 | y2.append(score) 32 | 33 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 34 | plt.bar(x2, y2, label='Score', color='green') 35 | plt.xlabel('K') 36 | plt.ylabel('Scores') 37 | plt.title('K-means One Hot Encoder') 38 | plt.legend() 39 | plt.show() 40 | 41 | #Plot scores for K-means One Hot Encoder with normalization 42 | def kmeanOneHotEncoderWithNormalization(scores): 43 | 44 | for score in scores: 45 | y2.append(score) 46 | 47 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 48 | plt.bar(x2, y2, label='Score', color='yellow') 49 | plt.xlabel('K') 50 | plt.ylabel('Scores') 51 | plt.title('K-means One Hot Encoder with normalization') 52 | plt.legend() 53 | plt.show() 54 | 55 | #Plot score for Bisecting K-means One Hot Encoder with normalization 56 | def bisectingKmeanOneHotEncoderWithNormalization(scores): 57 | 58 | for score in scores: 59 | y2.append(score) 60 | 61 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 62 | plt.bar(x2, y2, label='Score', color='red') 63 | plt.xlabel('K') 64 | plt.ylabel('Scores') 65 | plt.title('Bisecting K-means One Hot Encoder with normalization') 66 | plt.legend() 67 | plt.show() 68 | 69 | #Plot score Gaussian Mixture One Hot Encoder with normalization 70 | def gaussianMixtureOneHotEncoderWithNormalization(scores): 71 | 72 | for score in scores: 73 | y2.append(score) 74 | 75 | x2 = [20, 30, 40, 50, 60, 70, 80, 90, 100] 76 | plt.bar(x2, y2, label='Score', color='orange') 77 | plt.xlabel('K') 78 | plt.ylabel('Scores') 79 | plt.title('Gaussian Mixture One Hot Encoder with normalization') 80 | plt.legend() 81 | plt.show() 82 | 83 | #Read score in file 84 | def readScoreInFile(filename): 85 | with open(filename) as f: 86 | lines = f.readlines() 87 | lines = [line.rstrip('\n') for line in lines] 88 | #print('lines: '+lines[1]) 89 | lineScore = lines[1].split('=') 90 | scoreNumber.append(float(lineScore[1])) 91 | print(filename+' : '+lineScore[1]) 92 | 93 | #Read files in folder 94 | def readFiles(pathToFolder): 95 | 96 | for f in listdir(pathToFolder): 97 | if isfile(join(pathToFolder, f)): 98 | listFiles.append(f) 99 | 100 | for el in listFiles: 101 | readScoreInFile(pathToFolder+'/'+el) 102 | 103 | #Read and plot each technique separately 104 | readFiles('results/kmeans_simple') 105 | kmeanSimple(scoreNumber) 106 | 107 | #readFiles('results/kmeans_one_hot_encoder') 108 | #kmeanOneHotEncoder(scoreNumber) 109 | 110 | #readFiles('results/kmeans_one_hot_encoder_with_normalization') 111 | #kmeanOneHotEncoderWithNormalization(scoreNumber) 112 | 113 | #readFiles('results/bisecting_kmeans_one_hot_encoder_with_normalization') 114 | #bisectingKmeanOneHotEncoderWithNormalization(scoreNumber) 115 | 116 | 
#readFiles('results/gaussian_mixture_one_hot_encoder_with_normalization') 117 | #gaussianMixtureOneHotEncoderWithNormalization(scoreNumber) 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Specific files ### 2 | 3 | # Data 4 | data/ 5 | 6 | # TagsClassifier temp folders 7 | derby.log 8 | metastore_db/ 9 | models/ 10 | spark-warehouse/ 11 | target/ 12 | 13 | ### C++ ### 14 | # Prerequisites 15 | *.d 16 | 17 | # Compiled Object files 18 | *.slo 19 | *.lo 20 | *.o 21 | *.obj 22 | 23 | # Precompiled Headers 24 | *.gch 25 | *.pch 26 | 27 | # Compiled Dynamic libraries 28 | *.so 29 | *.dylib 30 | *.dll 31 | 32 | # Fortran module files 33 | *.mod 34 | *.smod 35 | 36 | # Compiled Static libraries 37 | *.lai 38 | *.la 39 | *.a 40 | *.lib 41 | 42 | # Executables 43 | *.exe 44 | *.out 45 | *.app 46 | 47 | ### Intellij ### 48 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 49 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 50 | 51 | #Remove .idea folder 52 | .idea 53 | # User-specific stuff: 54 | .idea/**/workspace.xml 55 | .idea/**/tasks.xml 56 | .idea/dictionaries 57 | 58 | # Sensitive or high-churn files: 59 | .idea/**/dataSources/ 60 | .idea/**/dataSources.ids 61 | .idea/**/dataSources.xml 62 | .idea/**/dataSources.local.xml 63 | .idea/**/sqlDataSources.xml 64 | .idea/**/dynamic.xml 65 | .idea/**/uiDesigner.xml 66 | 67 | # Gradle: 68 | .idea/**/gradle.xml 69 | .idea/**/libraries 70 | 71 | # Mongo Explorer plugin: 72 | .idea/**/mongoSettings.xml 73 | 74 | ## File-based project format: 75 | *.iws 76 | 77 | ## Plugin-specific files: 78 | 79 | # IntelliJ 80 | /out/ 81 | 82 | # mpeltonen/sbt-idea plugin 83 | .idea_modules/ 84 | 85 | # JIRA plugin 86 | atlassian-ide-plugin.xml 87 | 88 | # Crashlytics plugin (for Android Studio and IntelliJ) 89 | com_crashlytics_export_strings.xml 90 | crashlytics.properties 91 | crashlytics-build.properties 92 | fabric.properties 93 | 94 | ### Intellij Patch ### 95 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 96 | 97 | # *.iml 98 | # modules.xml 99 | # .idea/misc.xml 100 | # *.ipr 101 | 102 | ### Java ### 103 | # Compiled class file 104 | *.class 105 | 106 | # Log file 107 | *.log 108 | 109 | # BlueJ files 110 | *.ctxt 111 | 112 | # Mobile Tools for Java (J2ME) 113 | .mtj.tmp/ 114 | 115 | # Package Files # 116 | *.jar 117 | *.war 118 | *.ear 119 | *.zip 120 | *.tar.gz 121 | *.rar 122 | 123 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 124 | hs_err_pid* 125 | 126 | ### LaTeX ### 127 | ## Core latex/pdflatex auxiliary files: 128 | *.aux 129 | *.lof 130 | *.lot 131 | *.fls 132 | *.toc 133 | *.fmt 134 | *.fot 135 | *.cb 136 | *.cb2 137 | 138 | ## Intermediate documents: 139 | *.dvi 140 | *-converted-to.* 141 | # these rules might exclude image files for figures etc. 
142 | # *.ps 143 | # *.eps 144 | # *.pdf 145 | 146 | ## Generated if empty string is given at "Please type another file name for output:" 147 | .pdf 148 | 149 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 150 | *.bbl 151 | *.bcf 152 | *.blg 153 | *-blx.aux 154 | *-blx.bib 155 | *.brf 156 | *.run.xml 157 | 158 | ## Build tool auxiliary files: 159 | *.fdb_latexmk 160 | *.synctex 161 | *.synctex(busy) 162 | *.synctex.gz 163 | *.synctex.gz(busy) 164 | *.pdfsync 165 | 166 | ## Auxiliary and intermediate files from other packages: 167 | # algorithms 168 | *.alg 169 | *.loa 170 | 171 | # achemso 172 | acs-*.bib 173 | 174 | # amsthm 175 | *.thm 176 | 177 | # beamer 178 | *.nav 179 | *.pre 180 | *.snm 181 | *.vrb 182 | 183 | # changes 184 | *.soc 185 | 186 | # cprotect 187 | *.cpt 188 | 189 | # elsarticle (documentclass of Elsevier journals) 190 | *.spl 191 | 192 | # endnotes 193 | *.ent 194 | 195 | # fixme 196 | *.lox 197 | 198 | # feynmf/feynmp 199 | *.mf 200 | *.mp 201 | *.t[1-9] 202 | *.t[1-9][0-9] 203 | *.tfm 204 | *.[1-9] 205 | *.[1-9][0-9] 206 | 207 | #(r)(e)ledmac/(r)(e)ledpar 208 | *.end 209 | *.?end 210 | *.[1-9][0-9][0-9] 211 | *.[1-9]R 212 | *.[1-9][0-9]R 213 | *.[1-9][0-9][0-9]R 214 | *.eledsec[1-9] 215 | *.eledsec[1-9]R 216 | *.eledsec[1-9][0-9] 217 | *.eledsec[1-9][0-9]R 218 | *.eledsec[1-9][0-9][0-9] 219 | *.eledsec[1-9][0-9][0-9]R 220 | 221 | # glossaries 222 | *.acn 223 | *.acr 224 | *.glg 225 | *.glo 226 | *.gls 227 | *.glsdefs 228 | 229 | # gnuplottex 230 | *-gnuplottex-* 231 | 232 | # gregoriotex 233 | *.gaux 234 | *.gtex 235 | 236 | # hyperref 237 | 238 | # knitr 239 | *-concordance.tex 240 | # TODO Comment the next line if you want to keep your tikz graphics files 241 | *.tikz 242 | *-tikzDictionary 243 | 244 | # listings 245 | *.lol 246 | 247 | # makeidx 248 | *.idx 249 | *.ilg 250 | *.ind 251 | *.ist 252 | 253 | # minitoc 254 | *.maf 255 | *.mlf 256 | *.mlt 257 | *.mtc[0-9]* 258 | 259 | # minted 260 | _minted* 261 | *.pyg 262 | 263 | # morewrites 264 | *.mw 265 | 266 | # mylatexformat 267 | 268 | # nomencl 269 | *.nlo 270 | 271 | # pax 272 | *.pax 273 | 274 | # sagetex 275 | *.sagetex.sage 276 | *.sagetex.py 277 | *.sagetex.scmd 278 | 279 | # scrwfile 280 | *.wrt 281 | 282 | # sympy 283 | *.sout 284 | *.sympy 285 | sympy-plots-for-*.tex/ 286 | 287 | # pdfcomment 288 | *.upa 289 | *.upb 290 | 291 | # pythontex 292 | *.pytxcode 293 | pythontex-files-*/ 294 | 295 | # thmtools 296 | *.loe 297 | 298 | # TikZ & PGF 299 | *.dpth 300 | *.md5 301 | *.auxlock 302 | 303 | # todonotes 304 | *.tdo 305 | 306 | # easy-todo 307 | *.lod 308 | 309 | # xindy 310 | *.xdy 311 | 312 | # xypic precompiled matrices 313 | *.xyc 314 | 315 | # endfloat 316 | *.ttt 317 | *.fff 318 | 319 | # Latexian 320 | TSWLatexianTemp* 321 | 322 | ## Editors: 323 | # WinEdt 324 | *.bak 325 | *.sav 326 | 327 | # Texpad 328 | .texpadtmp 329 | 330 | # Kile 331 | *.backup 332 | 333 | # KBibTeX 334 | *~[0-9]* 335 | 336 | # auto folder when using emacs and auctex 337 | /auto/* 338 | 339 | # expex forward references with \gathertags 340 | *-tags.tex 341 | 342 | ### Python ### 343 | # Byte-compiled / optimized / DLL files 344 | __pycache__/ 345 | *.py[cod] 346 | *$py.class 347 | 348 | # C extensions 349 | *.so 350 | 351 | # Distribution / packaging 352 | .Python 353 | env/ 354 | build/ 355 | develop-eggs/ 356 | dist/ 357 | downloads/ 358 | eggs/ 359 | .eggs/ 360 | lib/ 361 | lib64/ 362 | parts/ 363 | sdist/ 364 | var/ 365 | wheels/ 366 | *.egg-info/ 367 | .installed.cfg 368 | *.egg 369 | 370 | # PyInstaller 
371 | # Usually these files are written by a python script from a template 372 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 373 | *.manifest 374 | *.spec 375 | 376 | # Installer logs 377 | pip-log.txt 378 | pip-delete-this-directory.txt 379 | 380 | # Unit test / coverage reports 381 | htmlcov/ 382 | .tox/ 383 | .coverage 384 | .coverage.* 385 | .cache 386 | nosetests.xml 387 | coverage.xml 388 | *,cover 389 | .hypothesis/ 390 | 391 | # Translations 392 | *.mo 393 | *.pot 394 | 395 | # Django stuff: 396 | *.log 397 | local_settings.py 398 | 399 | # Flask stuff: 400 | instance/ 401 | .webassets-cache 402 | 403 | # Scrapy stuff: 404 | .scrapy 405 | 406 | # Sphinx documentation 407 | docs/_build/ 408 | 409 | # PyBuilder 410 | target/ 411 | 412 | # Jupyter Notebook 413 | .ipynb_checkpoints 414 | 415 | # pyenv 416 | .python-version 417 | 418 | # celery beat schedule file 419 | celerybeat-schedule 420 | 421 | # dotenv 422 | .env 423 | 424 | # virtualenv 425 | .venv 426 | venv/ 427 | ENV/ 428 | 429 | # Spyder project settings 430 | .spyderproject 431 | 432 | # Rope project settings 433 | .ropeproject 434 | 435 | ### Scala ### 436 | *.class 437 | *.log 438 | 439 | # sbt specific 440 | .cache 441 | .history 442 | .lib/ 443 | dist/* 444 | target/ 445 | lib_managed/ 446 | src_managed/ 447 | project/boot/ 448 | project/plugins/project/ 449 | 450 | # Scala-IDE specific 451 | .ensime 452 | .ensime_cache/ 453 | .scala_dependencies 454 | .worksheet 455 | 456 | # ENSIME specific 457 | 458 | ### Vim ### 459 | # swap 460 | [._]*.s[a-v][a-z] 461 | [._]*.sw[a-p] 462 | [._]s[a-v][a-z] 463 | [._]sw[a-p] 464 | # session 465 | Session.vim 466 | # temporary 467 | .netrwhist 468 | *~ 469 | # auto-generated tag files 470 | tags 471 | 472 | NetworkAnomalyDetection.iml 473 | .DS_Store 474 | -------------------------------------------------------------------------------- /src/NetworkAnomalyDetection.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Anomaly Detection in Network Traffic with different clustering algorithms. 3 | * 4 | * The implementation is done using the DataFrame-based API of Spark MLlib. 5 | * 6 | * Algorithms: 7 | * 8 | * - K-means (plain and bisecting) 9 | * - Gaussian Mixture Model (GMM) 10 | * 11 | * Categorical features are transformed into numerical features using a one-hot encoder. 12 | * Afterwards, all features are normalized. 13 | * 14 | * These different implementations are compared. 15 | * 16 | * Metrics used: 17 | * 18 | * - Mean distance between points and their centroids 19 | * 20 | * GMM is really slow (quadratic algorithm), so its evaluation is only done on 1% of the dataset. 21 | * 22 | * The basic implementation is based on chapter 5 (Anomaly Detection in Network Traffic with K-means clustering) 23 | * of the book Advanced Analytics with Spark. 24 | * However, this implementation uses the DataFrame-based API instead of the RDD-based API.
25 | * 26 | * Anomaly detection is done as follows: 27 | * 28 | * - Find the maximal distance within each cluster; these will be the thresholds 29 | * - For a new point, calculate its score (distance); if it is greater than the threshold of its cluster, 30 | * the point is an anomaly 31 | * 32 | * Datasource: https://archive.ics.uci.edu/ml/datasets/KDD+Cup+1999+Data 33 | * Test set: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html (corrected.gz) 34 | * 35 | * @author Axel Fahy 36 | * @author Rudolf Höhn 37 | * @author Brian Nydegger 38 | * @author Assaf Mahmoud 39 | * 40 | * @date 26.05.2017 41 | * 42 | */ 43 | 44 | import java.io.{File, PrintWriter} 45 | import java.text.SimpleDateFormat 46 | import java.util.Calendar 47 | 48 | import org.apache.spark.ml.{Pipeline, PipelineModel} 49 | import org.apache.spark.{SparkConf, SparkContext} 50 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 51 | import org.apache.spark.sql.types._ 52 | import org.apache.spark.ml.clustering._ 53 | import org.apache.spark.ml.feature.{OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler} 54 | import org.apache.spark.ml.linalg.{DenseVector, Vector} 55 | import org.apache.spark.sql.functions._ 56 | 57 | 58 | object NetworkAnomalyDetection { 59 | 60 | val DataPath = "data/kddcup.data.corrected" 61 | val TestPath = "data/test.data.corrected" 62 | 63 | // Fraction of the dataset used (1.0 for the full dataset) 64 | val Fraction = 1.0 65 | 66 | // Schema of data from csv file 67 | // Used when loading the data to have a correct structure 68 | val DataSchema = StructType(Array( 69 | StructField("duration", IntegerType, true), 70 | StructField("protocol_type", StringType, true), 71 | StructField("service", StringType, true), 72 | StructField("flag", StringType, true), 73 | StructField("src_bytes", IntegerType, true), 74 | StructField("dst_bytes", IntegerType, true), 75 | StructField("land", IntegerType, true), 76 | StructField("wrong_fragment", IntegerType, true), 77 | StructField("urgent", IntegerType, true), 78 | StructField("hot", IntegerType, true), 79 | StructField("num_failed_logins", IntegerType, true), 80 | StructField("logged_in", IntegerType, true), 81 | StructField("num_compromised", IntegerType, true), 82 | StructField("root_shell", IntegerType, true), 83 | StructField("su_attempted", IntegerType, true), 84 | StructField("num_root", IntegerType, true), 85 | StructField("num_file_creations", IntegerType, true), 86 | StructField("num_shells", IntegerType, true), 87 | StructField("num_access_files", IntegerType, true), 88 | StructField("num_outbound_cmds", IntegerType, true), 89 | StructField("is_host_login", IntegerType, true), 90 | StructField("is_guest_login", IntegerType, true), 91 | StructField("count", IntegerType, true), 92 | StructField("srv_count", IntegerType, true), 93 | StructField("serror_rate", DoubleType, true), 94 | StructField("srv_serror_rate", DoubleType, true), 95 | StructField("rerror_rate", DoubleType, true), 96 | StructField("srv_rerror_rate", DoubleType, true), 97 | StructField("same_srv_rate", DoubleType, true), 98 | StructField("diff_srv_rate", DoubleType, true), 99 | StructField("srv_diff_host_rate", DoubleType, true), 100 | StructField("dst_host_count", IntegerType, true), 101 | StructField("dst_host_srv_count", IntegerType, true), 102 | StructField("dst_host_same_srv_rate", DoubleType, true), 103 | StructField("dst_host_diff_srv_rate", DoubleType, true), 104 | StructField("dst_host_same_src_port_rate", DoubleType, true), 105 | StructField("dst_host_srv_diff_host_rate",
DoubleType, true), 106 | StructField("dst_host_serror_rate", DoubleType, true), 107 | StructField("dst_host_srv_serror_rate", DoubleType, true), 108 | StructField("dst_host_rerror_rate", DoubleType, true), 109 | StructField("dst_host_srv_rerror_rate", DoubleType, true), 110 | StructField("label", StringType, true))) 111 | 112 | def main(args: Array[String]): Unit = { 113 | // Creation of configuration and session 114 | val conf = new SparkConf() 115 | .setMaster("local") 116 | .setAppName("NetworkAnomalyDetection") 117 | .set("spark.driver.memory", "6g") 118 | 119 | val sc = new SparkContext(conf) 120 | sc.setCheckpointDir("checkpoints/") 121 | 122 | val spark = SparkSession 123 | .builder() 124 | .appName("NetworkAnomalyDetection") 125 | .getOrCreate() 126 | 127 | // Load the data into the schema created previously 128 | val rawDataDF = spark.read.format("com.databricks.spark.csv") 129 | .option("header", "false") 130 | .option("inferSchema", "true") 131 | .schema(DataSchema) 132 | .load(DataPath) 133 | 134 | val dataDF = rawDataDF.sample(false, Fraction, 42) 135 | println("Size of dataset=" + dataDF.count + " (total=" + rawDataDF.count + ")") 136 | val runClustering = new RunClustering(spark, dataDF) 137 | 138 | // K-means 139 | // The simple K-means run also performs anomaly detection. 140 | (20 to 100 by 10).map(k => (k, runClustering.kmeansSimple(k))) 141 | (20 to 100 by 10).map(k => (k, runClustering.kmeansOneHotEncoder(k))) 142 | (20 to 100 by 10).map(k => (k, runClustering.kmeansOneHotEncoderWithNormalization(k))) 143 | 144 | // Bisecting K-means 145 | (20 to 100 by 10).map(k => (k, runClustering.bisectingKmeansOneHotEncoderWithNormalization(k))) 146 | 147 | // Gaussian Mixture 148 | (20 to 100 by 10).map(k => (k, runClustering.gaussianMixtureOneHotEncoderWithNormalization(k))) 149 | } 150 | 151 | class RunClustering(private val spark: SparkSession, var data: DataFrame) { 152 | 153 | import spark.implicits._ 154 | 155 | // Categorical columns, excluded when selecting only numerical features 156 | val CategoricalColumns = Seq("label", "protocol_type", "service", "flag") 157 | 158 | /** 159 | * Calculate the Euclidean distance between a data point and its centroid 160 | * 161 | * @param centroid Vector with the components of the centroid 162 | * @param data Vector with the components of the data point 163 | * @return The distance between the data point and the centroid 164 | */ 165 | def distance(centroid: Vector, data: Vector): Double = 166 | // Transforming the vectors to arrays of doubles since arithmetic operations 167 | // on Vector are not implemented 168 | math.sqrt(centroid.toArray.zip(data.toArray) 169 | .map(p => p._1 - p._2).map(d => d * d).sum) 170 | 171 | /** 172 | * Compute the Euclidean distance between all points belonging to a centroid and the centroid in question 173 | * 174 | * @param centroid Vector with the components of the centroid 175 | * @param dataCentroid All data points (as Vector) belonging to the centroid 176 | * @return An array of doubles containing all the distances of a cluster (data with the same centroid) 177 | */ 178 | def distanceAllCluster(centroid: Vector, dataCentroid: Array[DenseVector]): Array[Double] = { 179 | dataCentroid.map(d => distance(centroid, d)) 180 | } 181 | 182 | /** 183 | * Calculate the clustering score 184 | * 185 | * For each cluster, select the data belonging to its centroid 186 | * and calculate the distances.
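 * As computed below (sketch): score = (1/k) * sum_j ( sum_{x in C_j} ||x - c_j|| / |C_j| ), where C_j is the set of points assigned to centroid c_j and empty clusters contribute 0.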
187 | * 188 | * @param centroids Array containing all the centroids 189 | * @param data Dataset used 190 | * @param k Number of clusters 191 | * @return The mean of the scores from all clusters 192 | */ 193 | def clusteringScore(centroids: Array[Vector], data: DataFrame, k: Int): Double = { 194 | val score = (0 until k).map{ k => 195 | val dataCentroid = data.filter($"prediction" === k) 196 | .select("features") 197 | .collect() 198 | .map { 199 | // Get the feature vectors in dense format 200 | case Row(v: Vector) => v.toDense 201 | } 202 | val s = distanceAllCluster(centroids(k), dataCentroid) 203 | if (s.length > 0) 204 | s.sum / s.length 205 | else 206 | s.sum // Sum will be 0 if no element in cluster 207 | } 208 | if (score.nonEmpty) 209 | score.sum / score.length 210 | else 211 | score.sum 212 | } 213 | 214 | /** 215 | * Get the maximum distance of each centroid 216 | * 217 | * @param centroids Array containing all the centroids 218 | * @param data DataFrame containing the data points 219 | * @param k The number of clusters 220 | * @return A Map with the cluster index as key and its maximum distance as value 221 | */ 222 | def maxByCentroid(centroids: Array[Vector], data: DataFrame, k: Int): Map[Int, Double] = { 223 | val max = (0 until k).map{ k => 224 | val dataCentroid = data.filter($"prediction" === k) 225 | .select("features") 226 | .collect() 227 | .map { 228 | // Get the feature vectors in dense format 229 | case Row(v: Vector) => v.toDense 230 | } 231 | val dist = distanceAllCluster(centroids(k), dataCentroid) 232 | if (dist.isEmpty) { 233 | (k, 0.0) 234 | } 235 | else 236 | (k, dist.max) 237 | } 238 | max.toMap 239 | } 240 | 241 | /** 242 | * Calculate the distance between a point and its centroid 243 | * 244 | * This is a UDF and must be run on a DataFrame. 245 | * Currying is used in order to pass additional parameters. 246 | * 247 | * The columns of the DataFrame to use: "features" and "prediction" 248 | * Uses the prediction column to know to which centroid the point belongs. 249 | * 250 | * @param centroids Centroids 251 | * @return The distance UDF 252 | */ 253 | def calculateDistance(centroids: Array[Vector]) = udf((v: Vector, k: Int) => { 254 | math.sqrt(centroids(k).toArray.zip(v.toArray) 255 | .map(p => p._1 - p._2).map(d => d * d).sum) 256 | }) 257 | 258 | /** 259 | * Check if a point is an anomaly 260 | * 261 | * If the score of a point is higher than the maximum of the cluster 262 | * in which it belongs, it is an anomaly.
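 * For example (hypothetical numbers): if max(3) = 12.7, a point assigned to cluster 3 with dist = 15.2 is flagged (the UDF returns 1), while one with dist = 9.4 is not (returns 0).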
263 | * 264 | * UDF run on the "dist" and "prediction" columns 265 | * 266 | * @param max Map containing the maximal value of each cluster 267 | * @return 1 if the packet is an anomaly, else 0 268 | */ 269 | def checkAnomaly(max: Map[Int, Double]) = udf((distance: Double, k: Int) => if (distance > max(k)) 1 else 0) 270 | 271 | /** 272 | * Get all the anomalies of a test set 273 | * 274 | * @param pipeline The pipeline used for the preprocessing 275 | * @param data The test data 276 | * @param centroids The centroids found on the training data 277 | * @param max Maximal value of each centroid 278 | * @return A DataFrame containing the anomalies 279 | */ 280 | def getAnomalies(pipeline: PipelineModel, data: DataFrame, centroids: Array[Vector], max: Map[Int, Double]) = { 281 | val predictDF = pipeline.transform(data) 282 | 283 | val distanceDF = predictDF.withColumn("dist", calculateDistance(centroids)(predictDF("features"), predictDF("prediction"))).checkpoint() 284 | val anomalies = distanceDF.withColumn("anomaly", checkAnomaly(max)(distanceDF("dist"), distanceDF("prediction"))).checkpoint() 285 | anomalies.filter($"anomaly" > 0) 286 | } 287 | 288 | /** 289 | * Anomaly detection on test set 290 | * 291 | * Get the maximal distance of each cluster and, for each point, check 292 | * whether its distance is higher than this maximum; if so, the point is an anomaly. 293 | * 294 | * @param dataDF Training data 295 | * @param pipelineModel Pipeline model used with the training 296 | * @param k Number of clusters 297 | * @return A DataFrame containing the anomalies 298 | */ 299 | def anomalyDectection(dataDF: DataFrame, pipelineModel: PipelineModel, k: Int): DataFrame = { 300 | // Load the data into the schema created previously 301 | val dataTestDF = spark.read.format("com.databricks.spark.csv") 302 | .option("header", "false") 303 | .option("inferSchema", "true") 304 | .schema(DataSchema) 305 | .load(TestPath) 306 | 307 | val testDF = dataTestDF.drop("label") 308 | testDF.cache() 309 | 310 | // Prediction 311 | val cluster = pipelineModel.transform(dataDF) 312 | 313 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 314 | 315 | // Get the centroids 316 | val centroids = kmeansModel.clusterCenters 317 | 318 | // Get the maximal distance for each cluster (on the training data) 319 | val max = this.maxByCentroid(centroids, cluster, k) 320 | 321 | // Detect anomalies on the test data 322 | val anomalies = getAnomalies(pipelineModel, testDF, centroids, max) 323 | testDF.unpersist() 324 | anomalies 325 | } 326 | 327 | /** 328 | * Write the result of a run into a file 329 | * 330 | * The filename is created dynamically from the current date and the algorithm used.
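 * For instance (illustrative timestamp): write2file(score, t0, "K-means (20) simple") run at 2018-01-01 12:00 would produce results201801011200_K-means_(20)_simple.txt.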
331 | * 332 | * @param score Score already calculated 333 | * @param startTime Start time of the computation 334 | * @param technique String with the name of the algorithm/preprocessing used 335 | */ 336 | def write2file(score: Double, startTime: Long, technique: String): Unit = { 337 | val format = new SimpleDateFormat("yyyyMMddHHmm") 338 | val pw = new PrintWriter(new File("results" + format.format(Calendar.getInstance().getTime) + 339 | "_" + technique.replaceAll(" ", "_") + ".txt")) 340 | try { 341 | println(technique) 342 | pw.write(s"$technique\n") 343 | println(s"Score=$score") 344 | pw.write(s"Score=$score\n") 345 | val duration = (System.nanoTime - startTime) / 1e9d 346 | println(s"Duration=$duration") 347 | pw.write(s"Duration=$duration\n") 348 | } finally { 349 | pw.close() 350 | } 351 | } 352 | 353 | /** 354 | * K-means with only numerical features, without normalization 355 | * 356 | * @param k Number of cluster 357 | */ 358 | def kmeansSimple(k: Int): Unit = { 359 | println(s"Running kmeansSimple ($k)") 360 | val startTime = System.nanoTime() 361 | // Remove the label column 362 | val dataDF = this.data.drop("label") 363 | dataDF.cache() 364 | val numericalColumns = dataDF.columns.diff(CategoricalColumns) 365 | 366 | // Creation of vector with features 367 | val assembler = new VectorAssembler() 368 | .setInputCols(numericalColumns) 369 | .setOutputCol("features") 370 | 371 | val kmeans = new KMeans() 372 | .setK(k) 373 | .setFeaturesCol("features") 374 | .setPredictionCol("prediction") 375 | .setSeed(1L) 376 | 377 | val pipeline = new Pipeline() 378 | .setStages(Array(assembler, kmeans)) 379 | 380 | val pipelineModel = pipeline.fit(dataDF) 381 | 382 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 383 | 384 | // Prediction 385 | val cluster = pipelineModel.transform(dataDF) 386 | 387 | // Get the centroids 388 | val centroids = kmeansModel.clusterCenters 389 | 390 | // Calculate the score 391 | val score = this.clusteringScore(centroids, cluster, k) 392 | 393 | this.write2file(score, startTime, "K-means (" + k + ") simple") 394 | 395 | // Anomaly detection 396 | val anomalies = this.anomalyDectection(dataDF, pipelineModel, k) 397 | // Save results to json file 398 | val format = new SimpleDateFormat("yyyyMMddHHmm") 399 | Thread.sleep(1000) 400 | anomalies.write.json("anomalies_" + format.format(Calendar.getInstance().getTime) + "_" + k + ".json") 401 | dataDF.unpersist() 402 | } 403 | 404 | /** 405 | * K-means using categorical features, without normalization 406 | * 407 | * Categorical features are encoded using the One-Hot encoder. 
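 * For example, protocol_type takes the values tcp, udp and icmp; StringIndexer first maps them to numeric indices (by descending frequency) and OneHotEncoder then turns each index into a 3-component binary vector such as (1.0, 0.0, 0.0) (illustrative; the actual index of each value depends on the data).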
408 | * 409 | * @param k Number of cluster 410 | */ 411 | def kmeansOneHotEncoder(k: Int): Unit = { 412 | println(s"Running kmeansOneHotEncoder ($k)") 413 | val startTime = System.nanoTime() 414 | // Remove the label column 415 | val dataDF = this.data.drop("label") 416 | dataDF.cache() 417 | 418 | // Indexing categorical columns 419 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 420 | c => new StringIndexer() 421 | .setInputCol(c) 422 | .setOutputCol(s"${c}_index") 423 | ).toArray 424 | 425 | // Encoding previously indexed columns 426 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 427 | c => new OneHotEncoder() 428 | .setInputCol(s"${c}_index") 429 | .setOutputCol(s"${c}_vec") 430 | .setDropLast(false) 431 | ).toArray 432 | 433 | // Creation of list of columns for vector assembler (with only numerical columns) 434 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 435 | 436 | // Creation of vector with features 437 | val assembler = new VectorAssembler() 438 | .setInputCols(assemblerColumns) 439 | .setOutputCol("features") 440 | 441 | val kmeans = new KMeans() 442 | .setK(k) 443 | .setFeaturesCol("features") 444 | .setPredictionCol("prediction") 445 | .setSeed(1L) 446 | 447 | val pipeline = new Pipeline() 448 | .setStages(indexer ++ encoder ++ Array(assembler, kmeans)) 449 | 450 | val pipelineModel = pipeline.fit(dataDF) 451 | 452 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 453 | 454 | // Prediction 455 | val cluster = pipelineModel.transform(dataDF) 456 | dataDF.unpersist() 457 | 458 | // Get the centroids 459 | val centroids = kmeansModel.clusterCenters 460 | 461 | // Calculate the score 462 | val score = this.clusteringScore(centroids, cluster, k) 463 | 464 | this.write2file(score, startTime, "K-means (" + k + ") with one-hot encoder") 465 | } 466 | 467 | /** 468 | * K-means using categorical features, with normalization 469 | * 470 | * Categorical features are encoded using the One-hot encoder. 471 | * One-hot encoder will map a column of label indices to a column of binary vectors. 
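 * (Since setDropLast(false) is used below, each encoded vector keeps one component per category instead of dropping the last one.)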
472 | * Normalization is done using the standard deviation. 473 | * 474 | * @param k Number of clusters 475 | */ 476 | def kmeansOneHotEncoderWithNormalization(k: Int): Unit = { 477 | println(s"Running kmeansOneHotEncoderWithNormalization ($k)") 478 | val startTime = System.nanoTime() 479 | // Remove the label column 480 | val dataDF = this.data.drop("label") 481 | dataDF.cache() 482 | 483 | // Indexing categorical columns 484 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 485 | c => new StringIndexer() 486 | .setInputCol(c) 487 | .setOutputCol(s"${c}_index") 488 | ).toArray 489 | 490 | // Encoding previously indexed columns 491 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 492 | c => new OneHotEncoder() 493 | .setInputCol(s"${c}_index") 494 | .setOutputCol(s"${c}_vec") 495 | .setDropLast(false) 496 | ).toArray 497 | 498 | // Creation of list of columns for vector assembler (with only numerical columns) 499 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 500 | 501 | // Creation of vector with features 502 | val assembler = new VectorAssembler() 503 | .setInputCols(assemblerColumns) 504 | .setOutputCol("featuresVector") 505 | 506 | // Normalization using standard deviation 507 | val scaler = new StandardScaler() 508 | .setInputCol("featuresVector") 509 | .setOutputCol("features") 510 | .setWithStd(true) 511 | .setWithMean(false) 512 | 513 | val kmeans = new KMeans() 514 | .setK(k) 515 | .setFeaturesCol("features") 516 | .setPredictionCol("prediction") 517 | .setSeed(1L) 518 | 519 | val pipeline = new Pipeline() 520 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, kmeans)) 521 | 522 | val pipelineModel = pipeline.fit(dataDF) 523 | 524 | // Prediction 525 | val cluster = pipelineModel.transform(dataDF) 526 | dataDF.unpersist() 527 | 528 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[KMeansModel] 529 | 530 | // Get the centroids 531 | val centroids = kmeansModel.clusterCenters 532 | 533 | // Calculate the score 534 | val score = this.clusteringScore(centroids, cluster, k) 535 | 536 | this.write2file(score, startTime, "K-means (" + k + ") with one-hot encoder with normalization") 537 | } 538 | 539 | /** 540 | * Bisecting K-means using categorical features, with normalization 541 | * 542 | * With Bisecting K-means, all observations start in one cluster 543 | * and splits are performed recursively in a "top-down" approach. 544 | * 545 | * Categorical features are encoded using the One-hot encoder. 546 | * One-hot encoder will map a column of label indices to a column of binary vectors.
547 | * Normalization is done using the standard deviation. 548 | * 549 | * @param k Number of clusters 550 | */ 551 | def bisectingKmeansOneHotEncoderWithNormalization(k: Int): Unit = { 552 | println(s"Running bisectingKmeansOneHotEncoderWithNormalization ($k)") 553 | val startTime = System.nanoTime() 554 | // Remove the label column 555 | val dataDF = this.data.drop("label") 556 | dataDF.cache() 557 | 558 | // Indexing categorical columns 559 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 560 | c => new StringIndexer() 561 | .setInputCol(c) 562 | .setOutputCol(s"${c}_index") 563 | ).toArray 564 | 565 | // Encoding previously indexed columns 566 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 567 | c => new OneHotEncoder() 568 | .setInputCol(s"${c}_index") 569 | .setOutputCol(s"${c}_vec") 570 | .setDropLast(false) 571 | ).toArray 572 | 573 | // Creation of list of columns for vector assembler (with only numerical columns) 574 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 575 | 576 | // Creation of vector with features 577 | val assembler = new VectorAssembler() 578 | .setInputCols(assemblerColumns) 579 | .setOutputCol("featuresVector") 580 | 581 | // Normalization using standard deviation 582 | val scaler = new StandardScaler() 583 | .setInputCol("featuresVector") 584 | .setOutputCol("features") 585 | .setWithStd(true) 586 | .setWithMean(false) 587 | 588 | val kmeans = new BisectingKMeans() 589 | .setK(k) 590 | .setFeaturesCol("features") 591 | .setPredictionCol("prediction") 592 | .setSeed(1L) 593 | 594 | val pipeline = new Pipeline() 595 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, kmeans)) 596 | 597 | val pipelineModel = pipeline.fit(dataDF) 598 | 599 | // Prediction 600 | val cluster = pipelineModel.transform(dataDF) 601 | dataDF.unpersist() 602 | 603 | val kmeansModel = pipelineModel.stages.last.asInstanceOf[BisectingKMeansModel] 604 | 605 | // Get the centroids 606 | val centroids = kmeansModel.clusterCenters 607 | 608 | // Calculate the score 609 | val score = this.clusteringScore(centroids, cluster, k) 610 | 611 | this.write2file(score, startTime, "Bisecting K-means (" + k + ") with one-hot encoder with normalization") 612 | } 613 | 614 | /** 615 | * Gaussian Mixture Model 616 | * 617 | * Categorical features are encoded using the One-hot encoder. 618 | * One-hot encoder will map a column of label indices to a column of binary vectors. 619 | * Normalization is done using the standard deviation. 620 | * 621 | * GMM uses a quadratic algorithm and consequently takes a very long time to run. 622 | * This algorithm will only be used on 1% of the dataset.
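 * (For the clustering score, the means of the fitted Gaussian components are used as the "centroids"; see the extraction from GaussianMixtureModel below.)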
623 | * 624 | * @param k Number of cluster 625 | */ 626 | def gaussianMixtureOneHotEncoderWithNormalization(k: Int): Unit = { 627 | println(s"Running gaussianMixtureOneHotEncoderWithNormalization ($k)") 628 | val startTime = System.nanoTime() 629 | // Remove the label column 630 | val dataDF = this.data.drop("label") 631 | dataDF.cache() 632 | 633 | // Indexing categorical columns 634 | val indexer: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 635 | c => new StringIndexer() 636 | .setInputCol(c) 637 | .setOutputCol(s"${c}_index") 638 | ).toArray 639 | 640 | // Encoding previously indexed columns 641 | val encoder: Array[org.apache.spark.ml.PipelineStage] = CategoricalColumns.map( 642 | c => new OneHotEncoder() 643 | .setInputCol(s"${c}_index") 644 | .setOutputCol(s"${c}_vec") 645 | .setDropLast(false) 646 | ).toArray 647 | 648 | // Creation of list of columns for vector assembler (with only numerical columns) 649 | val assemblerColumns = (Set(dataDF.columns: _*) -- CategoricalColumns ++ CategoricalColumns.map(c => s"${c}_vec")).toArray 650 | 651 | // Creation of vector with features 652 | val assembler = new VectorAssembler() 653 | .setInputCols(assemblerColumns) 654 | .setOutputCol("featuresVector") 655 | 656 | // Normalization using standard deviation 657 | val scaler = new StandardScaler() 658 | .setInputCol("featuresVector") 659 | .setOutputCol("features") 660 | .setWithStd(true) 661 | .setWithMean(false) 662 | 663 | val gaussianMixture = new GaussianMixture() 664 | .setK(k) 665 | .setFeaturesCol("features") 666 | .setPredictionCol("prediction") 667 | .setSeed(1L) 668 | 669 | val pipeline = new Pipeline() 670 | .setStages(indexer ++ encoder ++ Array(assembler, scaler, gaussianMixture)) 671 | 672 | val pipelineModel = pipeline.fit(dataDF) 673 | 674 | val gmm = pipelineModel.stages.last.asInstanceOf[GaussianMixtureModel] 675 | 676 | // Prediction 677 | val cluster = pipelineModel.transform(dataDF) 678 | dataDF.unpersist() 679 | 680 | // Get the centroids 681 | val centroids = (0 until k).map(i => gmm.gaussians(i).mean).toArray 682 | 683 | // Calculate the score 684 | val score = this.clusteringScore(centroids, cluster, k) 685 | 686 | this.write2file(score, startTime, "GaussianMixture (" + k + ") with one-hot encoder with normalization") 687 | } 688 | } 689 | } 690 | --------------------------------------------------------------------------------
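Usage sketch (not part of the repository): assuming Spark is on the classpath and kmeansSimple above has written an anomalies JSON file, the flagged packets can be loaded back and summarized per cluster roughly as follows. The file name and object name below are illustrative, not defined anywhere in the project.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{count, lit, max}

object InspectAnomalies {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("InspectAnomalies")
      .getOrCreate()

    // kmeansSimple writes anomalies_<yyyyMMddHHmm>_<k>.json; this path is a placeholder
    val anomalies = spark.read.json("anomalies_201801011200_20.json")

    // Number of flagged points per cluster and the largest distance observed in each
    anomalies.groupBy("prediction")
      .agg(count(lit(1)).as("n_anomalies"), max("dist").as("max_dist"))
      .orderBy("prediction")
      .show()

    spark.stop()
  }
}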