├── .DS_Store ├── .ipynb_checkpoints └── CBAD-checkpoint.ipynb ├── CBAD OUTLINE.pdf ├── CBAD-Poster.pdf ├── CBAD.ipynb ├── CBAD.py ├── Dataset ├── .DS_Store └── KDDTrain+.csv ├── LICENSE └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/.DS_Store -------------------------------------------------------------------------------- /.ipynb_checkpoints/CBAD-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Main Libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd \n", 18 | "import time\n", 19 | "import os" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Clear" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "clear = lambda:os.system('clear')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Getting the dataset" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def getDataSet():# Getting the path of the dataset\n", 52 | " \n", 53 | " while True:\n", 54 | " print(\"**************************************************\")\n", 55 | " print(\"DATA SET MENU\")\n", 56 | " print(\"**************************************************\")\n", 57 | " print(\"1.NSL-KDD\")\n", 58 | " print(\"2.IDS 2017\")\n", 59 | " \n", 60 | " option = input(\"Option:\")\n", 61 | " \n", 62 | " if option == \"1\" or option == \"2\":\n", 63 | " break\n", 64 | " \n", 65 | " path = input(\"Path of the File:\")\n", 66 | " \n", 67 | " return path,option" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# Reading the dataset" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def readingData(path): #Reading the Dataset\n", 84 | " \n", 85 | " while True:\n", 86 | " \n", 87 | " option = input(\"Dataset has feature names[y/n]:\") \n", 88 | " \n", 89 | " if option == \"y\" or option == \"n\":\n", 90 | " break\n", 91 | " \n", 92 | " print(\"\\nReading Dataset...\") \n", 93 | " \n", 94 | " if option == \"y\":\n", 95 | " dataSet = pd.read_csv(path,low_memory=False)\n", 96 | " \n", 97 | " elif option == \"n\":\n", 98 | " dataSet = pd.read_csv(path, header = None,low_memory=False)\n", 99 | " \n", 100 | " return dataSet\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Check if missing data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def checkMissing(X):#This check if the dataset given has missing values.\n", 117 | " isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because (\"cannot unpack non-iterable numpy.bool object\")\n", 118 | " \n", 119 | " if isMissing == \"True\":\n", 120 | " #if data set has infinity values replace them with none\n", 121 | " X = X.replace('Infinity', np.nan) #Replacing 
Infinity values with nan values\n", 122 | " \n", 123 | " missingValIndex = []\n", 124 | " total = X.isnull().sum().sum()\n", 125 | " percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100\n", 126 | " \n", 127 | " for rows in X:\n", 128 | " \n", 129 | " if X[rows].isnull().sum() != 0:\n", 130 | " missingValIndex.append(rows)\n", 131 | " print(\"\\n\\n**************************************************\")\n", 132 | " print(\"Data has missing values\")\n", 133 | " print(\"**************************************************\")\n", 134 | " print(\"Features with missing values:\",missingValIndex)\n", 135 | " print(\"Total missing Values -> \" , total)\n", 136 | " print(percent,\"%\")\n", 137 | " \n", 138 | " return X\n", 139 | " \n", 140 | " else:\n", 141 | " \n", 142 | " return X\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Getting the features" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "ename": "IndentationError", 159 | "evalue": "unindent does not match any outer indentation level (, line 47)", 160 | "output_type": "error", 161 | "traceback": [ 162 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m47\u001b[0m\n\u001b[0;31m for rows in dataSet: #Getting features index with missing values\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "#Getting The data we want to test for the clustering algorithms\n", 168 | "def gettingVariables(dataSet,dataSetOption):# If the dataset is NSL-KDD it would get the features and the labels for it and if its IDS 2017 it would take the features and the labels for it and take careof missing values.\n", 169 | " \n", 170 | " if dataSetOption == \"1\":\n", 171 | " while True:\n", 172 | " print(\"\\n\\n**************************************************\")\n", 173 | " print(\"Variables Menu\")\n", 174 | " print(\"**************************************************\")\n", 175 | " print(\"1.Data set with categorical data oneHot encoded\")\n", 176 | " print(\"2.Data set with categorical data removed\")\n", 177 | " print(\"3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded\")\n", 178 | " option = input(\"Enter option :\")\n", 179 | " \n", 180 | " \n", 181 | " if option == \"1\" or option == \"2\" or option == \"3\":\n", 182 | " break\n", 183 | " else:\n", 184 | " \n", 185 | " print(\"Error\\n\\n\")\n", 186 | " \n", 187 | " \n", 188 | " if option == \"1\":\n", 189 | " #Getting the Dependent and independent Variables\n", 190 | " #In all the option we remove the dificulty level feature because we don't need it in our experiments\n", 191 | " \n", 192 | " \n", 193 | " X = dataSet.iloc[:,:-2].values # Data, Get all the rows and all the clums except all the colums - 2\n", 194 | " Y = dataSet.iloc[:,42].values# Labels\n", 195 | " return X,Y,option\n", 196 | " \n", 197 | " elif option == \"2\":\n", 198 | " #Removing Categorical data from the data set\n", 199 | " X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values\n", 200 | " Y = dataSet.iloc[:,42].values# Labels\n", 201 | " \n", 202 | " return X,Y,option\n", 203 | " \n", 204 | " elif option == \"3\":\n", 205 | " #for later Risk Encode - Categorical features\n", 206 | " X = 
dataSet.iloc[:,:-2].values\n", 207 | " Y = dataSet.iloc[:,42].values# Labels\n", 208 | " \n", 209 | " return X,Y,option\n", 210 | " \n", 211 | "\n", 212 | " elif dataSetOption == \"2\":\n", 213 | " #############################################################################\n", 214 | " #GETTING VARIABLES\n", 215 | " #############################################################################\n", 216 | " missingValIndex = []\n", 217 | " for rows in dataSet: #Getting features index with missing values\n", 218 | " if dataSet[rows].isnull().sum() != 0:\n", 219 | " missingValIndex.append(rows)\n", 220 | " \n", 221 | " X = dataSet.iloc[:,:-1].values#data\n", 222 | " #if names are not especified it will assign 0,1,2...n for the features name\n", 223 | " X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward Packets','Total Length of Fwd Packets',\n", 224 | " ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std',\n", 225 | " 'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean',\n", 226 | " ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean',\n", 227 | " ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s',\n", 228 | " ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count',\n", 229 | " ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length',\n", 230 | " 'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes',\n", 231 | " 'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min'])\n", 232 | " Y = dataSet.iloc[:,78].values#Labels\n", 233 | " \n", 234 | " #############################################################################\n", 235 | " #Variables Got \n", 236 | " #############################################################################\n", 237 | " \n", 238 | " #############################################################################\n", 239 | " #MANAGE MISSING DATA\n", 240 | " ############################################################################# \n", 241 | " \n", 242 | " while True:\n", 243 | " print(\"\\n\\n**************************************************\")\n", 244 | " print(\"Manage Missing Values \")\n", 245 | " print(\"**************************************************\")\n", 246 | " print(\"1.Eliminate Catg. 
w/ Missing Values\")\n", 247 | " print(\"2.Impute 0 for Missing Values\")\n", 248 | " print(\"3.Impute Mean for Missing Values\")\n", 249 | " print(\"4.Impute Median for Missing Values\")\n", 250 | " print(\"5.Impute Mode for Missing Values\")\n", 251 | " print(\"6.Simple Imputer\")\n", 252 | " missingDataOption = input(\"Option:\")\n", 253 | " \n", 254 | " if missingDataOption == \"1\" or missingDataOption == \"2\" or missingDataOption == \"3\" or missingDataOption == \"4\" or missingDataOption == \"5\" or missingDataOption == \"6\":\n", 255 | " break\n", 256 | " \n", 257 | " \n", 258 | " if missingDataOption == \"1\":\n", 259 | " deletedColumns = []\n", 260 | " numColumns = len(X.columns)\n", 261 | " #removing features with missing values\n", 262 | " for row in missingValIndex:\n", 263 | " deletedColumns.append(row)\n", 264 | " del X[row]\n", 265 | " \n", 266 | " print(\"#\\n\\n########################################################################\")\n", 267 | " print(\"Columns Succesfully Removed\")\n", 268 | " print(len(deletedColumns),\"of\",numColumns,\"were deleted\")\n", 269 | " print(\"Columns Names -> \",deletedColumns)\n", 270 | " print(\"#########################################################################\")\n", 271 | " \n", 272 | " elif missingDataOption == \"2\":\n", 273 | " #fill with 0\n", 274 | " for row in missingValIndex:\n", 275 | " X[row] = X[row].fillna(0)\n", 276 | " \n", 277 | " print(\"\\n\\n#########################################################################\")\n", 278 | " print(\"Sucessfully Filled Missing Values with 0\")\n", 279 | " print(\"#########################################################################\")\n", 280 | " \n", 281 | " \n", 282 | " elif missingDataOption == \"3\":\n", 283 | " #mean imputer\n", 284 | " for row in missingValIndex:\n", 285 | " X[row] = X[row].astype(float)\n", 286 | " X[row] = X[row].fillna(X[row].mean())\n", 287 | " \n", 288 | " print(\"\\n\\n#########################################################################\")\n", 289 | " print(\"Sucessfully Filled Missing Values with Mean\")\n", 290 | " print(\"#########################################################################\")\n", 291 | " \n", 292 | " elif missingDataOption == \"4\":\n", 293 | " #median imputer\n", 294 | " for row in missingValIndex:\n", 295 | " median = X[row].median()\n", 296 | " X[row].fillna(median, inplace=True)\n", 297 | " print(\"\\n\\n#########################################################################\")\n", 298 | " print(\"Sucessfully Filled Missing Values with Median\")\n", 299 | " print(\"#########################################################################\")\n", 300 | " \n", 301 | " elif missingDataOption == \"5\":\n", 302 | " #Mode imputer\n", 303 | " for row in missingValIndex:\n", 304 | " X[row] = X[row].fillna(X[row].mode()[0])\n", 305 | " \n", 306 | " print(\"\\n\\n#########################################################################\")\n", 307 | " print(\"Sucessfully Filled Missing Values with Mode \")\n", 308 | " print(\"#########################################################################\")\n", 309 | " \n", 310 | " elif missingDataOption == \"6\": \n", 311 | " from sklearn.impute import SimpleImputer\n", 312 | " #\"Imputation transformer for completing missing values.\"(Univariate)\n", 313 | " X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X) \n", 314 | " 
print(\"\\n\\n#########################################################################\")\n", 315 | " print(\"Sucessfully Imputed Simple Imputer \")\n", 316 | " print(\"#########################################################################\")\n", 317 | " \n", 318 | " \n", 319 | " option = \"None\" #This data does not have categorical features so dataOption is none \n", 320 | " return X,Y,option\n", 321 | " \n", 322 | "#############################################################################\n", 323 | "#END OF MISSING DATA\n", 324 | "#############################################################################" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "# Encoding Labels" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "def encodingLabels(Y,dataOption,datasetOption):# Encoding the labels with multi class or binary\n", 341 | " \n", 342 | " if datasetOption == \"1\": #Check if the data set choosen is NSL-KDD or IDS2017\n", 343 | " \n", 344 | " if dataOption == \"1\" or dataOption == \"2\" or dataOption == \"3\":\n", 345 | " \n", 346 | " while True:\n", 347 | " print(\"\\n\\n#########################################################################\")\n", 348 | " print(\"Encoding Menu\")\n", 349 | " print(\"#########################################################################\")\n", 350 | " print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n", 351 | " print(\"2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4\")\n", 352 | " encodeOption = input(\"Enter option :\") \n", 353 | " \n", 354 | " if encodeOption == \"1\" or encodeOption == \"2\":\n", 355 | " break\n", 356 | " else:\n", 357 | " \n", 358 | " print(\"Error\\n\\n\")\n", 359 | " \n", 360 | " \n", 361 | " if encodeOption == \"1\":\n", 362 | " #Binary Categories\n", 363 | " attackType = {'normal':\"normal\", 'neptune':\"abnormal\", 'warezclient':\"abnormal\", 'ipsweep':\"abnormal\",'back':\"abnormal\", 'smurf':\"abnormal\", 'rootkit':\"abnormal\",'satan':\"abnormal\", 'guess_passwd':\"abnormal\",'portsweep':\"abnormal\",'teardrop':\"abnormal\",'nmap':\"abnormal\",'pod':\"abnormal\",'ftp_write':\"abnormal\",'multihop':\"abnormal\",'buffer_overflow':\"abnormal\",'imap':\"abnormal\",'warezmaster':\"abnormal\",'phf':\"abnormal\",'land':\"abnormal\",'loadmodule':\"abnormal\",'spy':\"abnormal\",'perl':\"abnormal\"} \n", 364 | " attackEncodingCluster = {'normal':0,'abnormal':1}\n", 365 | " \n", 366 | " Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data\n", 367 | " Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal\n", 368 | " return Y,encodeOption\n", 369 | " \n", 370 | " elif encodeOption == \"2\":\n", 371 | " #4 Main Categories\n", 372 | " #normal = 0\n", 373 | " #DoS = 1\n", 374 | " #Probe = 2\n", 375 | " #R2L = 3\n", 376 | " #U2R = 4\n", 377 | " attackType = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} \n", 378 | " attackEncodingCluster = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main 
Categories\n", 379 | " \n", 380 | " Y[:] = [attackType[item] for item in Y[:]] #Encoding the main 4 categories\n", 381 | " Y[:] = [attackEncodingCluster[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n", 382 | " return Y,encodeOption\n", 383 | " else:\n", 384 | " return Y\n", 385 | " \n", 386 | " \n", 387 | " elif datasetOption == \"2\":#Check if the chosen data set is NSL-KDD or IDS2017\n", 388 | " print(\"\\n\\n#########################################################################\")\n", 389 | " print(\"Encoding Menu\")\n", 390 | " print(\"#########################################################################\")\n", 391 | " print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n", 392 | " print(\"2.Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5\")\n", 393 | " encodeOption = input(\"Enter option :\")\n", 394 | "\n", 395 | " if encodeOption == \"1\":\n", 396 | " Y = np.array(Y,dtype= object)\n", 397 | " attackType = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories\n", 398 | " attackEncoding = {'normal': 0, 'abnormal': 1}\n", 399 | " \n", 400 | " Y[:] = [attackType[item] for item in Y[:]]# Changing the attack names into binary categories\n", 401 | " Y[:] = [attackEncoding[item] for item in Y[:]]# Encoding the binary categories as 0 and 1\n", 402 | " return Y,encodeOption\n", 403 | " \n", 404 | " elif encodeOption == \"2\":\n", 405 | " Y = np.array(Y,dtype= object)\n", 406 | " attackEncoding = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories\n", 407 | " Y[:] = [attackEncoding[item] for item in Y[:]]# Changing the attack names into the main attack categories\n", 408 | " \n", 409 | " return Y,encodeOption\n", 410 | " \n", 411 | " else:\n", 412 | " return Y" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "# One Hot Encoding" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "#Encoding the categorical features with one hot encoding, using the main attack categories or binary categories\n", 429 | "def oneHotEncodingData(X,dataOption):\n", 430 | " \n", 431 | " from sklearn.preprocessing import OneHotEncoder\n", 432 | " from sklearn.compose import ColumnTransformer\n", 433 | " #We use one hot encoding to prevent the machine learning model from attributing an order to the categorical data. 
\n", 434 | " #What one hot encoding(ColumnTransformer) does is, it takes a column which has categorical data, \n", 435 | " #which has been label encoded, and then splits the column into multiple columns.\n", 436 | " #The numbers are replaced by 1s and 0s, depending on which column has what value\n", 437 | " #We don't need to do a label encoded step because ColumnTransformer do one hot encode and label encode!\n", 438 | " #Encoding the Independient Variable\n", 439 | " if dataOption == \"1\": #Only for dataset with Categorical Data\n", 440 | " transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1,2,3])], remainder=\"passthrough\")\n", 441 | " X = transform.fit_transform(X)\n", 442 | " print(\"\\n\\n#########################################################################\")\n", 443 | " print(\"Data has been successfully One Hot Encoded\")\n", 444 | " print(\"#########################################################################\")\n", 445 | "\n", 446 | " return X\n", 447 | " elif dataOption == \"3\": #Only for risk data, because we don't have risk values for protocol feature we do one hot encoding for only that feature and the other ones we do risk value encoding\n", 448 | " transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1])], remainder=\"passthrough\")\n", 449 | " X = transform.fit_transform(X)\n", 450 | " print(\"\\n\\n#########################################################################\")\n", 451 | " print(\"Data has been successfully One Hot Encoded\")\n", 452 | " print(\"#########################################################################\")\n", 453 | " return X\n", 454 | " \n", 455 | " else:\n", 456 | " return X #return data with no changes" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "# Risk Encoding" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "def riskEncodingData(X,dataOption):#Risk encoding categorical features\n", 473 | " #Manually Encoding for the attacks types only\n", 474 | " if dataOption == \"3\": #if data option is risk Value\n", 475 | " X = pd.DataFrame(X)\n", 476 | " servers = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1}\n", 477 | " X[2] = [servers[item] for item in X[2]]\n", 478 | "\n", 479 | " servers_Error = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} \n", 480 | " X[3] = [servers_Error[item] for item in X[3]]\n", 481 | "\n", 482 | " print(\"\\n\\n#########################################################################\")\n", 483 | " print(\"Data has been successfully risk 
Encoded\")\n", 484 | " print(\"#########################################################################\")\n", 485 | "\n", 486 | " return X\n", 487 | " \n", 488 | " else:\n", 489 | " \n", 490 | " return X #return data with no changes" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "# Scaling " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "def scaling(X):#Scalign the data with the normalize method, we scale the data to have it in the same range for the experiments\n", 507 | " \n", 508 | " \n", 509 | "\n", 510 | " while True:\n", 511 | " \n", 512 | " decision = input(\"Scale data [y/n]:\")\n", 513 | " \n", 514 | " if decision == \"y\" or decision == \"n\":\n", 515 | " break\n", 516 | " else:\n", 517 | " \n", 518 | " print(\"Error\\n\\n\")\n", 519 | " \n", 520 | " if decision == \"y\":\n", 521 | " \n", 522 | " from sklearn.preprocessing import MinMaxScaler\n", 523 | " #Transforms features by scaling each feature to a given range.\n", 524 | " X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)\n", 525 | " print(\"\\n\\n#########################################################################\")\n", 526 | " print(\"Data has been successfully scaled.\")\n", 527 | " print(\"#########################################################################\")\n", 528 | " return X\n", 529 | " \n", 530 | " else:\n", 531 | " return X\n" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "# Shuffle" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "def shuffleData(X):# currently a bug, if we do shuffleling the experiments resutls are not good, the order of the data does not affect the results\n", 548 | "\n", 549 | " from sklearn.utils import shuffle\n", 550 | " while True:\n", 551 | " option = input(\"Shuffle data [y]/[n]:\")\n", 552 | " \n", 553 | " if option == \"y\" or option == \"n\":\n", 554 | " break\n", 555 | " else:\n", 556 | " \n", 557 | " print(\"Error\\n\\n\")\n", 558 | " \n", 559 | " if option == \"y\":\n", 560 | " \n", 561 | " X = pd.DataFrame(X)\n", 562 | " X = shuffle(X)\n", 563 | " X.reset_index(inplace=True,drop=True)\n", 564 | " X = np.array(X)\n", 565 | " \n", 566 | " print(\"\\n\\n#########################################################################\")\n", 567 | " print(\"Data has been successfully shuffled.\")\n", 568 | " print(\"#########################################################################\")\n", 569 | " return X\n", 570 | " else:\n", 571 | " \n", 572 | " return X" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# KMEANS" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "def kmeansClustering(X,Y):#K-means algorithm \n", 589 | " from sklearn.cluster import KMeans\n", 590 | "\n", 591 | " while True:\n", 592 | " print(\"\\n\\n#########################################################################\")\n", 593 | " print(\"KMEANS ALGORITHM\")\n", 594 | " print(\"#########################################################################\")\n", 595 | " \n", 596 | " nClusters = input(\"Number of clusters:\")\n", 597 | " \n", 598 | " try:\n", 599 | " nClusters = int(nClusters)\n", 600 | " \n", 601 | " except 
ValueError:\n", 602 | " \n", 603 | " print(\"Error\\n\\n\")\n", 604 | " \n", 605 | " if type(nClusters) == int:\n", 606 | " n = 0\n", 607 | " clusters = []\n", 608 | " \n", 609 | " while n < nClusters:#Converting nClusters into a list of cluster indices for later use\n", 610 | " clusters.append(n)\n", 611 | " n+=1\n", 612 | " break\n", 613 | " \n", 614 | " while True:\n", 615 | " init = input(\"Initialization method [k-means++,random]:\")\n", 616 | " \n", 617 | " if init == \"k-means++\" or init == \"random\":\n", 618 | " break\n", 619 | "\n", 620 | " print(\"\\nClustering...\\n\")\n", 621 | " \n", 622 | " start_time = time.time()\n", 623 | " KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0)\n", 624 | " kmeans = KMEANS.fit(X)\n", 625 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 626 | " print(\"Data Successfully Clustered\")\n", 627 | " Z = kmeans.labels_\n", 628 | " inertia = KMEANS.inertia_\n", 629 | " #Kmeans Results\n", 630 | " kmeansR = pd.crosstab(Y,Z)\n", 631 | " maxVal = kmeansR.idxmax()\n", 632 | " \n", 633 | " return Z,clusters,kmeansR,maxVal,inertia\n" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "# Kmeans F1 Score" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans\n", 650 | " from sklearn.metrics import f1_score\n", 651 | " #Encoding data to F-score\n", 652 | " \n", 653 | " \n", 654 | " # This part of the code automatically assigns the max-occurring true label in each found cluster to that cluster, in order to evaluate the clustering with greater ease.\n", 655 | " n = 0 # counter\n", 656 | " dictionaryCluster = {} # creating an empty dictionary \n", 657 | " f1 = 0 #f1score\n", 658 | " average = ''\n", 659 | " \n", 660 | " while n < len(clusters):# while counter < number of clusters\n", 661 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 662 | " n+=1\n", 663 | " \n", 664 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 665 | " \n", 666 | " Y = np.array(Y,dtype = int) # Converting labels into an int array\n", 667 | " \n", 668 | " while True:\n", 669 | " \n", 670 | " average = input(\"Average Method[weighted,micro,macro,binary]:\")\n", 671 | " \n", 672 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == 'binary':\n", 673 | " break\n", 674 | " #score metric \n", 675 | " f1 = f1_score(Y,Z, average = average) #Ignores labels that were not predicted and scores labels that were predicted at least once\n", 676 | " \n", 677 | " return f1,dictionaryCluster" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "# KMEANS Normalized Mutual Info" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "\n", 694 | "def kNMI(Z,Y,maxVal,clusters):\n", 695 | " from sklearn.metrics import normalized_mutual_info_score\n", 696 | " \n", 697 | " # This part of the code automatically assigns the max-occurring true label in each found cluster to that cluster, in order to evaluate the clustering with greater ease.\n", 698 | " n = 0 # 
counter\n", 699 | " dictionaryCluster = {} # creating an empty dictionary \n", 700 | " NMI = 0\n", 701 | " average = ''\n", 702 | " \n", 703 | " while n < len(clusters):# while counter < number of clusters\n", 704 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 705 | " n+=1\n", 706 | " \n", 707 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 708 | " \n", 709 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 710 | " \n", 711 | " while True:\n", 712 | " \n", 713 | " average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n", 714 | " \n", 715 | " if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n", 716 | " break\n", 717 | " #Score metric \n", 718 | " NMI = normalized_mutual_info_score(Y, Z, average_method = average)\n", 719 | " \n", 720 | " return NMI,dictionaryCluster\n" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "# KMEANS Adjusted Random Score" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "def kARS(Z,Y,maxVal,clusters):\n", 737 | " from sklearn.metrics import adjusted_rand_score\n", 738 | " \n", 739 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 740 | " n = 0 # counter\n", 741 | " dictionaryCluster = {} # creating an empty dictionary \n", 742 | " ars = 0\n", 743 | " \n", 744 | " while n < len(clusters):# while counter < number of clusters\n", 745 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 746 | " n+=1\n", 747 | " \n", 748 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 749 | " \n", 750 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 751 | " \n", 752 | " #score metric\n", 753 | " ars = adjusted_rand_score(Y, Z)\n", 754 | " \n", 755 | " return ars,dictionaryCluster" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "# DBSCAN" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "def dbscanClustering(X,Y):#DBSCAN algorithm\n", 772 | " from sklearn.cluster import DBSCAN\n", 773 | " \n", 774 | " while True:\n", 775 | " \n", 776 | " print(\"\\n\\n#########################################################################\")\n", 777 | " print(\"DBSCAN ALGORITHM\")\n", 778 | " print(\"#########################################################################\")\n", 779 | " \n", 780 | " epsilon = input(\"epsilon[Decimal]:\")\n", 781 | " \n", 782 | " try:\n", 783 | " epsilon = float(epsilon)\n", 784 | " \n", 785 | " except ValueError:\n", 786 | " \n", 787 | " print(\"Enter a Decimal number\")\n", 788 | " \n", 789 | " \n", 790 | " if type(epsilon) == float:\n", 791 | " break\n", 792 | " \n", 793 | " while True:\n", 794 | " minSamples = input(\"Min Samples[Integer]:\")\n", 795 | " \n", 796 | " try:\n", 797 | " minSamples = int(minSamples)\n", 798 | " \n", 799 | " except 
ValueError:\n", 800 | " \n", 801 | " print(\"Enter a Integer Number\")\n", 802 | " \n", 803 | " if type(minSamples) == int:\n", 804 | " break\n", 805 | " \n", 806 | " while True:\n", 807 | " algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n", 808 | " \n", 809 | " if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n", 810 | " break\n", 811 | " \n", 812 | " else:\n", 813 | " print(\"Error\\n\\n\")\n", 814 | " \n", 815 | " \n", 816 | " print(\"\\nClustering...\\n\")\n", 817 | "\n", 818 | " #Compute DBSCAN\n", 819 | " start_time = time.time() \n", 820 | " db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X)\n", 821 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 822 | " print(\"Data Successfully Clustered\")\n", 823 | " \n", 824 | " \n", 825 | " core_samples_mask = np.zeros_like(db.labels_, dtype=bool)\n", 826 | " core_samples_mask[db.core_sample_indices_] = True\n", 827 | " \n", 828 | " Z = db.labels_\n", 829 | " # Number of clusters in labels, ignoring noise if present.\n", 830 | " n_clusters = len(set(Z))\n", 831 | " n_noise_ = list(Z).count(-1)\n", 832 | " \n", 833 | " n = -1 # DBSCAN return index -1 cluster\n", 834 | " clusters = []\n", 835 | " while n + 1 < n_clusters:\n", 836 | " clusters.append(n)\n", 837 | " n += 1\n", 838 | " \n", 839 | " #DBSCAN Results\n", 840 | " dbscanR = pd.crosstab(Y,Z)\n", 841 | " maxVal = dbscanR.idxmax()\n", 842 | " \n", 843 | " return Z,clusters,n_noise_,dbscanR,maxVal" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "# DBSCAN F1 Score" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": {}, 857 | "outputs": [], 858 | "source": [ 859 | "def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN\n", 860 | " from sklearn.metrics import f1_score\n", 861 | " #Encoding data to F-score\n", 862 | " \n", 863 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 864 | " n = 0 # counter\n", 865 | " c = -1 # - counter max Value has negative index\n", 866 | " dictionaryCluster = {} # creating an empty dictionary \n", 867 | " f1 = 0\n", 868 | " average = ''\n", 869 | " \n", 870 | " while n < len(clusters):# while counter < number of clusters\n", 871 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 872 | " n+=1\n", 873 | " c+=1\n", 874 | " \n", 875 | " \n", 876 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 877 | " \n", 878 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 879 | " while True:\n", 880 | " \n", 881 | " average = input(\"Average Method[weighted,micro,macro]:\")\n", 882 | " \n", 883 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n", 884 | " break\n", 885 | " \n", 886 | " else:\n", 887 | " \n", 888 | " print(\"Error\\n\\n\")\n", 889 | " #score metric\n", 890 | " f1 = f1_score(Y,Z, average = average)\n", 891 | " return f1,dictionaryCluster" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "# DBSCAN Mutual Info Score" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 
null, 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [ 907 | "def dbNMI(Z,Y,clusters,maxVal):# Mutual info score for dbscan\n", 908 | " from sklearn.metrics import normalized_mutual_info_score\n", 909 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 910 | " n = 0 # counter\n", 911 | " c = -1 # - counter max Value has negative index\n", 912 | " NMI = 0\n", 913 | " dictionaryCluster = {} # creating an empty dictionary \n", 914 | " average = ''\n", 915 | " \n", 916 | " while n < len(clusters):# while counter < number of clusters\n", 917 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 918 | " n+=1\n", 919 | " c+=1\n", 920 | " \n", 921 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 922 | "\n", 923 | " while True:\n", 924 | " \n", 925 | " average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n", 926 | " \n", 927 | " if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n", 928 | " break\n", 929 | " else:\n", 930 | " \n", 931 | " print(\"Error\\n\\n\")\n", 932 | " #score metric\n", 933 | " NMI = normalized_mutual_info_score(Y, Z, average_method= average)\n", 934 | " \n", 935 | " return NMI,dictionaryCluster" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "# DBSCAN Adjusted Random Score" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": null, 948 | "metadata": {}, 949 | "outputs": [], 950 | "source": [ 951 | "def dbARS(Z,Y,clusters,maxVal): # adjusted rand score for dbscan\n", 952 | " from sklearn.metrics import adjusted_rand_score\n", 953 | " \n", 954 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 955 | " n = 0 # counter\n", 956 | " c = -1 # - counter max Value has negative index\n", 957 | " ars = 0\n", 958 | " dictionaryCluster = {} # creating an empty dictionary \n", 959 | " \n", 960 | " while n < len(clusters):# while counter < number of clusters\n", 961 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 962 | " n+=1\n", 963 | " c+=1\n", 964 | " #score metric\n", 965 | " ars = adjusted_rand_score(Y,Z)\n", 966 | " \n", 967 | " return ars,dictionaryCluster" 968 | ] 969 | }, 970 | { 971 | "cell_type": "markdown", 972 | "metadata": {}, 973 | "source": [ 974 | "# Isolation Forest" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": null, 980 | "metadata": {}, 981 | "outputs": [], 982 | "source": [ 983 | "def isolationForest(X,Y):# isolation forest algorithm\n", 984 | " from sklearn.ensemble import IsolationForest\n", 985 | " \n", 986 | " while True:\n", 987 | " contamination = input(\"Contamination[Float 0 to 0.5]: \")\n", 988 | " \n", 989 | " try:\n", 990 | " contamination = float(contamination)\n", 991 | " \n", 992 | " except ValueError:\n", 993 | " \n", 994 | " print(\"Enter a Number\")\n", 995 | " \n", 996 | " if type(contamination) == float and (contamination >= 0 and contamination <= 0.5):\n", 997 | " break\n", 998 | " \n", 999 | " print(\"\\nClustering...\\n\") \n", 1000 | " \n", 1001 | " start_time = time.time() 
\n", 1002 | " Z = IsolationForest(max_samples = \"auto\",behaviour = \"new\",contamination = contamination).fit_predict(X)\n", 1003 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 1004 | " \n", 1005 | " Z = np.array(Z,dtype = object)\n", 1006 | " \n", 1007 | " ifR = pd.crosstab(Y,Z)\n", 1008 | " ifR = pd.DataFrame(ifR)\n", 1009 | " maxVal = ifR.idxmax()\n", 1010 | " \n", 1011 | " n = -1 # Isolation Forest return index -1 and 1 cluster\n", 1012 | " clusters = []\n", 1013 | " while n < len(ifR.columns):\n", 1014 | " clusters.append(n)\n", 1015 | " n += 2\n", 1016 | " \n", 1017 | " return Z,ifR,maxVal,clusters" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "# Isolation Forest F1 Score" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest\n", 1034 | " from sklearn.metrics import f1_score\n", 1035 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 1036 | " \n", 1037 | " n = 0 # counter\n", 1038 | " c = -1 # - counter max Value has negative index\n", 1039 | " f1 = 0\n", 1040 | " average = ''\n", 1041 | " dictionaryCluster = {} # creating an empty dictionary \n", 1042 | "\n", 1043 | " \n", 1044 | " while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n", 1045 | " dictionaryCluster[clusters[n]] = maxVal[c] \n", 1046 | " n+=1\n", 1047 | " c+=2\n", 1048 | " \n", 1049 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 1050 | " \n", 1051 | " Y = np.array(Y,dtype = int)\n", 1052 | " Z = np.array(Z,dtype = int)\n", 1053 | " \n", 1054 | " while True:\n", 1055 | " \n", 1056 | " average = input(\"Average Method[weighted,micro,macro]:\")\n", 1057 | " \n", 1058 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n", 1059 | " break\n", 1060 | " \n", 1061 | " else:\n", 1062 | " \n", 1063 | " print(\"Error\\n\\n\")\n", 1064 | " # score metric\n", 1065 | " f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n", 1066 | " \n", 1067 | " return f1,dictionaryCluster" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "# Local Outlier Factor" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": {}, 1081 | "outputs": [], 1082 | "source": [ 1083 | "def LOF(X,Y):# Local outlier factor algorithm\n", 1084 | " from sklearn.neighbors import LocalOutlierFactor \n", 1085 | " \n", 1086 | " while True:\n", 1087 | " contamination = input(\"Contamination[Float 0 to 0.5]: \")\n", 1088 | " \n", 1089 | " try:\n", 1090 | " contamination = float(contamination)\n", 1091 | " \n", 1092 | " except ValueError:\n", 1093 | " \n", 1094 | " print(\"Enter a Number\")\n", 1095 | " \n", 1096 | " if type(contamination) == float and (contamination > 0 and contamination <= 0.5):\n", 1097 | " break\n", 1098 | " \n", 1099 | " while True:\n", 1100 | " algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n", 1101 | " \n", 1102 | " if algorithm == \"auto\" or algorithm 
== \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n", 1103 | " break\n", 1104 | " else:\n", 1105 | " \n", 1106 | " print(\"Error\\n\\n\")\n", 1107 | " \n", 1108 | " print(\"\\nClustering...\\n\")\n", 1109 | " \n", 1110 | " start_time = time.time() \n", 1111 | " lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X)\n", 1112 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 1113 | " \n", 1114 | " lofR = pd.crosstab(Y,lof)\n", 1115 | " maxVal = lofR.idxmax()\n", 1116 | " \n", 1117 | " \n", 1118 | " n = -1 # LOF return index -1 and 1 cluster\n", 1119 | " clusters = []\n", 1120 | " while n < len(lofR.columns):\n", 1121 | " clusters.append(n)\n", 1122 | " n += 2\n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " return lof,lofR,maxVal,clusters" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": {}, 1132 | "source": [ 1133 | "# Local Outlier Factor F1 Score" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "metadata": {}, 1140 | "outputs": [], 1141 | "source": [ 1142 | "def lofF1(Z,Y,clusters,maxVal): # f1 score for local outlier factor\n", 1143 | " from sklearn.metrics import f1_score\n", 1144 | " \n", 1145 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 1146 | " n = 0 # counter\n", 1147 | " c = -1 # - counter max Value has negative index\n", 1148 | " f1 = 0\n", 1149 | " dictionaryCluster = {} # creating an empty dictionary \n", 1150 | " \n", 1151 | " while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n", 1152 | " dictionaryCluster[clusters[n]] = maxVal[c] \n", 1153 | " n+=1\n", 1154 | " c+=2\n", 1155 | " \n", 1156 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 1157 | " Y = np.array(Y,dtype = int)\n", 1158 | " Z = np.array(Z,dtype = int)\n", 1159 | " while True:\n", 1160 | " \n", 1161 | " average = input(\"Average Method[weighted,None,micro,macro]:\")\n", 1162 | " \n", 1163 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == \"None\":\n", 1164 | " break\n", 1165 | " \n", 1166 | " else:\n", 1167 | " \n", 1168 | " print(\"Error\\n\\n\")\n", 1169 | " f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n", 1170 | " \n", 1171 | " return f1,dictionaryCluster" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "markdown", 1176 | "metadata": {}, 1177 | "source": [ 1178 | "# Calling Functions" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "code", 1183 | "execution_count": null, 1184 | "metadata": {}, 1185 | "outputs": [], 1186 | "source": [ 1187 | "clear()\n", 1188 | "#Calling the functions\n", 1189 | "\n", 1190 | "##########################################################################\n", 1191 | "path,dataSetOption = getDataSet()\n", 1192 | "#########################################################################\n", 1193 | "#########################################################################\n", 1194 | "dataSet = readingData(path)\n", 1195 | "#########################################################################\n", 1196 | "#########################################################################\n", 1197 | 
"dataSet = checkMissing(dataSet)\n", 1198 | "#########################################################################\n", 1199 | "#########################################################################\n", 1200 | "data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms\n", 1201 | "#########################################################################\n", 1202 | "#########################################################################\n", 1203 | "try:\n", 1204 | " labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n", 1205 | "except ValueError:\n", 1206 | " labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n", 1207 | "#########################################################################\n", 1208 | "#########################################################################\n", 1209 | "data = riskEncodingData(data,dataOption)\n", 1210 | "#########################################################################\n", 1211 | "#########################################################################\n", 1212 | "data = oneHotEncodingData(data,dataOption) #One hot Encode with the complete data\n", 1213 | "#########################################################################\n", 1214 | "#########################################################################\n", 1215 | "data = scaling(data)\n", 1216 | "#########################################################################\n", 1217 | "#########################################################################\n", 1218 | "data = shuffleData(data)\n", 1219 | "#########################################################################\n", 1220 | "\n", 1221 | "#This menu is a option to run diferrent algorithms with the same preproceced data witouth the need of running all the code from 0 to make another experiment.\n", 1222 | "while True: \n", 1223 | " while True:\n", 1224 | " print(\"\\n\\n#########################################################################\")\n", 1225 | " print(\"Algorithm Menu\")\n", 1226 | " print(\"#########################################################################\")\n", 1227 | " \n", 1228 | " print(\"1.Kmeans\")\n", 1229 | " print(\"2.Dbscan\")\n", 1230 | " print(\"3.Isolation Forest\")\n", 1231 | " print(\"4.Local Factor Outlier\")\n", 1232 | " \n", 1233 | " algorithmOption = input(\"option:\")\n", 1234 | " \n", 1235 | " if algorithmOption == \"1\" or algorithmOption == \"2\" or algorithmOption == \"3\" or algorithmOption == \"4\":\n", 1236 | " break\n", 1237 | " else:\n", 1238 | " \n", 1239 | " print(\"Error\\n\\n\")\n", 1240 | "\n", 1241 | " \n", 1242 | " if algorithmOption == \"1\":\n", 1243 | " #########################################################################\n", 1244 | " #KMEANS\n", 1245 | " klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels)\n", 1246 | " print(\"#########################################################################\")\n", 1247 | " print(\"KMEANS RESULTS\\n\\n\")\n", 1248 | " print(\"Clusters -> \",kClusters,\"\\n\")\n", 1249 | " print(\"Inertia -> \",inertia)\n", 1250 | " print(kmeansR,\"\\n\\n\")\n", 1251 | " print(\"Max True Label\",\"\\n\\n\",maxKvalue)\n", 1252 | " print(\"#########################################################################\")\n", 1253 | " #########################################################################\n", 1254 | " 
print(\"\\n\\n#########################################################################\")\n", 1255 | " print(\"Kmeans Score Metrics Menu\")\n", 1256 | " print(\"#########################################################################\")\n", 1257 | " \n", 1258 | " while True:\n", 1259 | " print(\"1.F1 Score\")\n", 1260 | " print(\"2.Normalized Mutual Info Score\")\n", 1261 | " print(\"3.Adjusted Rand Score\")\n", 1262 | " \n", 1263 | " kScoreOption = input(\"option:\")\n", 1264 | " \n", 1265 | " if kScoreOption == \"1\" or kScoreOption == \"2\" or kScoreOption == \"3\":\n", 1266 | " break\n", 1267 | " else:\n", 1268 | " \n", 1269 | " print(\"Error\\n\\n\")\n", 1270 | " \n", 1271 | " if kScoreOption == \"1\":\n", 1272 | " #########################################################################\n", 1273 | " #F1 Score\n", 1274 | " kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters)\n", 1275 | " print(\"\\n\\n#########################################################################\")\n", 1276 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1277 | " print(\"KMEANS F1 Score -> \",kmeansF1)\n", 1278 | " print(\"#########################################################################\")\n", 1279 | " #########################################################################\n", 1280 | " \n", 1281 | " elif kScoreOption == \"2\":\n", 1282 | " #########################################################################\n", 1283 | " kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters)\n", 1284 | " print(\"\\n\\n#########################################################################\")\n", 1285 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1286 | " print(\"KMEANS Normalized Mutual Info Score -> \",kmeansNMI)\n", 1287 | " print(\"#########################################################################\")\n", 1288 | " #########################################################################\n", 1289 | " \n", 1290 | " elif kScoreOption == \"3\":\n", 1291 | " \n", 1292 | " #########################################################################\n", 1293 | " kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters)\n", 1294 | " print(\"\\n\\n#########################################################################\")\n", 1295 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1296 | " print(\"KMEANS Adjusted Rand Score -> \",kmeansARS)\n", 1297 | " print(\"#########################################################################\")\n", 1298 | " #########################################################################\n", 1299 | " \n", 1300 | " elif algorithmOption == \"2\":\n", 1301 | " #########################################################################\n", 1302 | " #DBSCAN\n", 1303 | " dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) \n", 1304 | " print(\"#########################################################################\")\n", 1305 | " print(\"DBSCAN RESULTS\\n\\n\")\n", 1306 | " print(\"Clusters -> \",dbClusters,\"\\n\")\n", 1307 | " print(dbscanR,\"\\n\\n\")\n", 1308 | " print(\"Noise -> \",nNoises)\n", 1309 | " print(\"Max True Label\",\"\\n\\n\",maxDBvalue)\n", 1310 | " print(\"#########################################################################\")\n", 1311 | " #########################################################################\n", 1312 | " 
print(\"\\n\\n#########################################################################\")\n", 1313 | " print(\"Dscan Score Metrics Menu\")\n", 1314 | " print(\"#########################################################################\")\n", 1315 | " print(\"1.F1 Score\")\n", 1316 | " print(\"2.Normalized Mutual Info Score\")\n", 1317 | " print(\"3.Adjusted Rand Score\")\n", 1318 | " \n", 1319 | " while True:\n", 1320 | " \n", 1321 | " dbScoreOption = input(\"option:\")\n", 1322 | " \n", 1323 | " if dbScoreOption == \"1\" or dbScoreOption == \"2\" or dbScoreOption == \"3\":\n", 1324 | " break\n", 1325 | " else:\n", 1326 | " \n", 1327 | " print(\"Error\\n\\n\")\n", 1328 | " \n", 1329 | " if dbScoreOption == \"1\":\n", 1330 | " #########################################################################\n", 1331 | " #F1 Score dbscan\n", 1332 | " dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue)\n", 1333 | " print(\"\\n\\n#########################################################################\")\n", 1334 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1335 | " print(\"DBSCAN F1 Score -> \",dbscanF1)\n", 1336 | " print(\"#########################################################################\")\n", 1337 | " #########################################################################\n", 1338 | " \n", 1339 | " elif dbScoreOption == \"2\":\n", 1340 | " #########################################################################\n", 1341 | " dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue)\n", 1342 | " print(\"\\n\\n#########################################################################\")\n", 1343 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1344 | " print(\"DBSCAN Normalized Mutual Info Score -> \",dbscanNMI)\n", 1345 | " print(\"#########################################################################\")\n", 1346 | " #########################################################################\n", 1347 | " \n", 1348 | " elif dbScoreOption == \"3\":\n", 1349 | " #########################################################################\n", 1350 | " dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue)\n", 1351 | " print(\"\\n\\n#########################################################################\")\n", 1352 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1353 | " print(\"DBSCAN Adjusted Rand Score -> \",dbscanARS)\n", 1354 | " print(\"#########################################################################\")\n", 1355 | " #########################################################################\n", 1356 | " \n", 1357 | " \n", 1358 | " elif algorithmOption == \"3\":\n", 1359 | " #########################################################################\n", 1360 | " ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels)\n", 1361 | " print(\"#########################################################################\")\n", 1362 | " print(\"Isolation Forest RESULTS\\n\\n\")\n", 1363 | " print(\"Clusters -> \",ifNclusters,\"\\n\")\n", 1364 | " print(ifR,\"\\n\\n\")\n", 1365 | " print(\"Max True Label\",\"\\n\\n\",MaxIfVal)\n", 1366 | " print(\"#########################################################################\")\n", 1367 | " #########################################################################\n", 1368 | " 
print(\"\\n\\n#########################################################################\")\n", 1369 | " print(\"Isolation Forest Score Metrics Menu\")\n", 1370 | " print(\"#########################################################################\")\n", 1371 | " print(\"1.F1 Score\")\n", 1372 | " \n", 1373 | " while True:\n", 1374 | " \n", 1375 | " ifScoreOption = input(\"option:\")\n", 1376 | " \n", 1377 | " if ifScoreOption == \"1\":\n", 1378 | " break\n", 1379 | " else:\n", 1380 | " \n", 1381 | " print(\"Error\\n\\n\")\n", 1382 | " \n", 1383 | " if ifScoreOption == \"1\":\n", 1384 | " \n", 1385 | " ##########################################################################\n", 1386 | " isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal)\n", 1387 | " print(\"\\n\\n#########################################################################\")\n", 1388 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1389 | " print(\"Isolation Forest F1 Score -> \",isolationForestF1)\n", 1390 | " print(\"#########################################################################\")\n", 1391 | " ##########################################################################\n", 1392 | " \n", 1393 | " elif algorithmOption == \"4\":\n", 1394 | " #########################################################################\n", 1395 | " LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels)\n", 1396 | " print(\"#########################################################################\")\n", 1397 | " print(\"Local Outlier Factor RESULTS\\n\\n\")\n", 1398 | " print(\"Clusters -> \",lofClusters,\"\\n\")\n", 1399 | " print(lofR,\"\\n\\n\")\n", 1400 | " print(\"Max True Label\",\"\\n\\n\",maxLOFvalue)\n", 1401 | " print(\"#########################################################################\")\n", 1402 | " #########################################################################\n", 1403 | " print(\"\\n\\n#########################################################################\")\n", 1404 | " print(\"LOF Score Metrics Menu\")\n", 1405 | " print(\"#########################################################################\")\n", 1406 | " print(\"1.F1 Score\")\n", 1407 | " \n", 1408 | " while True:\n", 1409 | " \n", 1410 | " lofScoreOption = input(\"option:\")\n", 1411 | " \n", 1412 | " if lofScoreOption == \"1\":\n", 1413 | " break\n", 1414 | " else:\n", 1415 | " \n", 1416 | " print(\"Error\\n\\n\")\n", 1417 | " \n", 1418 | " if lofScoreOption == \"1\":\n", 1419 | " \n", 1420 | " ##########################################################################\n", 1421 | " LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue)\n", 1422 | " print(\"\\n\\n#########################################################################\")\n", 1423 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1424 | " print(\"LOF F1 Score -> \",LOFf1)\n", 1425 | " print(\"#########################################################################\")\n", 1426 | " ##########################################################################\n", 1427 | " \n", 1428 | " while True: # If the user want to Make a new clustering algorithm test\n", 1429 | " \n", 1430 | " decision = input(\"Try another Clustering Algorithm[y/n]:\")\n", 1431 | " \n", 1432 | " if decision == \"y\" or decision == \"n\":\n", 1433 | " break\n", 1434 | " else:\n", 1435 | " \n", 1436 | " print(\"Error\\n\\n\")\n", 1437 | " \n", 1438 | " \n", 1439 | " if decision 
== \"n\":\n", 1440 | " break\n", 1441 | " \n", 1442 | " else:\n", 1443 | " clear()" 1444 | ] 1445 | } 1446 | ], 1447 | "metadata": { 1448 | "kernelspec": { 1449 | "display_name": "Python 3", 1450 | "language": "python", 1451 | "name": "python3" 1452 | }, 1453 | "language_info": { 1454 | "codemirror_mode": { 1455 | "name": "ipython", 1456 | "version": 3 1457 | }, 1458 | "file_extension": ".py", 1459 | "mimetype": "text/x-python", 1460 | "name": "python", 1461 | "nbconvert_exporter": "python", 1462 | "pygments_lexer": "ipython3", 1463 | "version": "3.7.3" 1464 | }, 1465 | "varInspector": { 1466 | "cols": { 1467 | "lenName": 16, 1468 | "lenType": 16, 1469 | "lenVar": 40 1470 | }, 1471 | "kernels_config": { 1472 | "python": { 1473 | "delete_cmd_postfix": "", 1474 | "delete_cmd_prefix": "del ", 1475 | "library": "var_list.py", 1476 | "varRefreshCmd": "print(var_dic_list())" 1477 | }, 1478 | "r": { 1479 | "delete_cmd_postfix": ") ", 1480 | "delete_cmd_prefix": "rm(", 1481 | "library": "var_list.r", 1482 | "varRefreshCmd": "cat(var_dic_list()) " 1483 | } 1484 | }, 1485 | "position": { 1486 | "height": "923px", 1487 | "left": "328px", 1488 | "right": "20px", 1489 | "top": "9px", 1490 | "width": "800px" 1491 | }, 1492 | "types_to_exclude": [ 1493 | "module", 1494 | "function", 1495 | "builtin_function_or_method", 1496 | "instance", 1497 | "_Feature" 1498 | ], 1499 | "window_display": false 1500 | } 1501 | }, 1502 | "nbformat": 4, 1503 | "nbformat_minor": 2 1504 | } 1505 | -------------------------------------------------------------------------------- /CBAD OUTLINE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/CBAD OUTLINE.pdf -------------------------------------------------------------------------------- /CBAD-Poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/CBAD-Poster.pdf -------------------------------------------------------------------------------- /CBAD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Main Libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd \n", 18 | "import time\n", 19 | "import os" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Clear" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "clear = lambda:os.system('clear')" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Getting the dataset" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "def getDataSet():# Getting the path of the dataset\n", 52 | " \n", 53 | " while True:\n", 54 | " print(\"**************************************************\")\n", 55 | " print(\"DATA SET MENU\")\n", 56 | " print(\"**************************************************\")\n", 57 | " print(\"1.NSL-KDD\")\n", 58 | " print(\"2.IDS 2017\")\n", 59 | " \n", 60 | " option = 
input(\"Option:\")\n", 61 | " \n", 62 | " if option == \"1\" or option == \"2\":\n", 63 | " break\n", 64 | " \n", 65 | " path = input(\"Path of the File:\")\n", 66 | " \n", 67 | " return path,option" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# Reading the dataset" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def readingData(path): #Reading the Dataset\n", 84 | " \n", 85 | " while True:\n", 86 | " \n", 87 | " option = input(\"Dataset has feature names[y/n]:\") \n", 88 | " \n", 89 | " if option == \"y\" or option == \"n\":\n", 90 | " break\n", 91 | " \n", 92 | " print(\"\\nReading Dataset...\") \n", 93 | " \n", 94 | " if option == \"y\":\n", 95 | " dataSet = pd.read_csv(path,low_memory=False)\n", 96 | " \n", 97 | " elif option == \"n\":\n", 98 | " dataSet = pd.read_csv(path, header = None,low_memory=False)\n", 99 | " \n", 100 | " return dataSet\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Check if missing data" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def checkMissing(X):#This check if the dataset given has missing values.\n", 117 | " isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because (\"cannot unpack non-iterable numpy.bool object\")\n", 118 | " \n", 119 | " if isMissing == \"True\":\n", 120 | " #if data set has infinity values replace them with none\n", 121 | " X = X.replace('Infinity', np.nan) #Replacing Infinity values with nan values\n", 122 | " \n", 123 | " missingValIndex = []\n", 124 | " total = X.isnull().sum().sum()\n", 125 | " percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100\n", 126 | " \n", 127 | " for rows in X:\n", 128 | " \n", 129 | " if X[rows].isnull().sum() != 0:\n", 130 | " missingValIndex.append(rows)\n", 131 | " print(\"\\n\\n**************************************************\")\n", 132 | " print(\"Data has missing values\")\n", 133 | " print(\"**************************************************\")\n", 134 | " print(\"Features with missing values:\",missingValIndex)\n", 135 | " print(\"Total missing Values -> \" , total)\n", 136 | " print(percent,\"%\")\n", 137 | " \n", 138 | " return X\n", 139 | " \n", 140 | " else:\n", 141 | " \n", 142 | " return X\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Getting the features" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "ename": "IndentationError", 159 | "evalue": "unindent does not match any outer indentation level (, line 47)", 160 | "output_type": "error", 161 | "traceback": [ 162 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m47\u001b[0m\n\u001b[0;31m for rows in dataSet: #Getting features index with missing values\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "#Getting The data we want to test for the clustering algorithms\n", 168 | "def gettingVariables(dataSet,dataSetOption):# If the dataset is NSL-KDD it would get the features and the labels for it and if its IDS 2017 it would take the features and the labels for it and take careof missing 
values.\n", 169 | " \n", 170 | " if dataSetOption == \"1\":\n", 171 | " while True:\n", 172 | " print(\"\\n\\n**************************************************\")\n", 173 | " print(\"Variables Menu\")\n", 174 | " print(\"**************************************************\")\n", 175 | " print(\"1.Data set with categorical data oneHot encoded\")\n", 176 | " print(\"2.Data set with categorical data removed\")\n", 177 | " print(\"3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded\")\n", 178 | " option = input(\"Enter option :\")\n", 179 | " \n", 180 | " \n", 181 | " if option == \"1\" or option == \"2\" or option == \"3\":\n", 182 | " break\n", 183 | " else:\n", 184 | " \n", 185 | " print(\"Error\\n\\n\")\n", 186 | " \n", 187 | " \n", 188 | " if option == \"1\":\n", 189 | " #Getting the Dependent and independent Variables\n", 190 | " #In all the option we remove the dificulty level feature because we don't need it in our experiments\n", 191 | " \n", 192 | " \n", 193 | " X = dataSet.iloc[:,:-2].values # Data, Get all the rows and all the clums except all the colums - 2\n", 194 | " Y = dataSet.iloc[:,42].values# Labels\n", 195 | " return X,Y,option\n", 196 | " \n", 197 | " elif option == \"2\":\n", 198 | " #Removing Categorical data from the data set\n", 199 | " X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values\n", 200 | " Y = dataSet.iloc[:,42].values# Labels\n", 201 | " \n", 202 | " return X,Y,option\n", 203 | " \n", 204 | " elif option == \"3\":\n", 205 | " #for later Risk Encode - Categorical features\n", 206 | " X = dataSet.iloc[:,:-2].values\n", 207 | " Y = dataSet.iloc[:,42].values# Labels\n", 208 | " \n", 209 | " return X,Y,option\n", 210 | " \n", 211 | "\n", 212 | " elif dataSetOption == \"2\":\n", 213 | " #############################################################################\n", 214 | " #GETTING VARIABLES\n", 215 | " #############################################################################\n", 216 | " missingValIndex = []\n", 217 | " for rows in dataSet: #Getting features index with missing values\n", 218 | " if dataSet[rows].isnull().sum() != 0:\n", 219 | " missingValIndex.append(rows)\n", 220 | " \n", 221 | " X = dataSet.iloc[:,:-1].values#data\n", 222 | " #if names are not especified it will assign 0,1,2...n for the features name\n", 223 | " X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward Packets','Total Length of Fwd Packets',\n", 224 | " ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std',\n", 225 | " 'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean',\n", 226 | " ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean',\n", 227 | " ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s',\n", 228 | " ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count',\n", 229 | " ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' 
Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length',\n", 230 | " 'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes',\n", 231 | " 'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min'])\n", 232 | " Y = dataSet.iloc[:,78].values#Labels\n", 233 | " \n", 234 | " #############################################################################\n", 235 | " #Variables Got \n", 236 | " #############################################################################\n", 237 | " \n", 238 | " #############################################################################\n", 239 | " #MANAGE MISSING DATA\n", 240 | " ############################################################################# \n", 241 | " \n", 242 | " while True:\n", 243 | " print(\"\\n\\n**************************************************\")\n", 244 | " print(\"Manage Missing Values \")\n", 245 | " print(\"**************************************************\")\n", 246 | " print(\"1.Eliminate Catg. w/ Missing Values\")\n", 247 | " print(\"2.Impute 0 for Missing Values\")\n", 248 | " print(\"3.Impute Mean for Missing Values\")\n", 249 | " print(\"4.Impute Median for Missing Values\")\n", 250 | " print(\"5.Impute Mode for Missing Values\")\n", 251 | " print(\"6.Simple Imputer\")\n", 252 | " missingDataOption = input(\"Option:\")\n", 253 | " \n", 254 | " if missingDataOption == \"1\" or missingDataOption == \"2\" or missingDataOption == \"3\" or missingDataOption == \"4\" or missingDataOption == \"5\" or missingDataOption == \"6\":\n", 255 | " break\n", 256 | " \n", 257 | " \n", 258 | " if missingDataOption == \"1\":\n", 259 | " deletedColumns = []\n", 260 | " numColumns = len(X.columns)\n", 261 | " #removing features with missing values\n", 262 | " for row in missingValIndex:\n", 263 | " deletedColumns.append(row)\n", 264 | " del X[row]\n", 265 | " \n", 266 | " print(\"#\\n\\n########################################################################\")\n", 267 | " print(\"Columns Succesfully Removed\")\n", 268 | " print(len(deletedColumns),\"of\",numColumns,\"were deleted\")\n", 269 | " print(\"Columns Names -> \",deletedColumns)\n", 270 | " print(\"#########################################################################\")\n", 271 | " \n", 272 | " elif missingDataOption == \"2\":\n", 273 | " #fill with 0\n", 274 | " for row in missingValIndex:\n", 275 | " X[row] = X[row].fillna(0)\n", 276 | " \n", 277 | " print(\"\\n\\n#########################################################################\")\n", 278 | " print(\"Sucessfully Filled Missing Values with 0\")\n", 279 | " print(\"#########################################################################\")\n", 280 | " \n", 281 | " \n", 282 | " elif missingDataOption == \"3\":\n", 283 | " #mean imputer\n", 284 | " for row in missingValIndex:\n", 285 | " X[row] = X[row].astype(float)\n", 286 | " X[row] = X[row].fillna(X[row].mean())\n", 287 | " \n", 288 | " print(\"\\n\\n#########################################################################\")\n", 289 | " print(\"Sucessfully Filled Missing Values with Mean\")\n", 290 | " print(\"#########################################################################\")\n", 291 | " \n", 292 | " elif missingDataOption == 
\"4\":\n", 293 | " #median imputer\n", 294 | " for row in missingValIndex:\n", 295 | " median = X[row].median()\n", 296 | " X[row].fillna(median, inplace=True)\n", 297 | " print(\"\\n\\n#########################################################################\")\n", 298 | " print(\"Sucessfully Filled Missing Values with Median\")\n", 299 | " print(\"#########################################################################\")\n", 300 | " \n", 301 | " elif missingDataOption == \"5\":\n", 302 | " #Mode imputer\n", 303 | " for row in missingValIndex:\n", 304 | " X[row] = X[row].fillna(X[row].mode()[0])\n", 305 | " \n", 306 | " print(\"\\n\\n#########################################################################\")\n", 307 | " print(\"Sucessfully Filled Missing Values with Mode \")\n", 308 | " print(\"#########################################################################\")\n", 309 | " \n", 310 | " elif missingDataOption == \"6\": \n", 311 | " from sklearn.impute import SimpleImputer\n", 312 | " #\"Imputation transformer for completing missing values.\"(Univariate)\n", 313 | " X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X) \n", 314 | " print(\"\\n\\n#########################################################################\")\n", 315 | " print(\"Sucessfully Imputed Simple Imputer \")\n", 316 | " print(\"#########################################################################\")\n", 317 | " \n", 318 | " \n", 319 | " option = \"None\" #This data does not have categorical features so dataOption is none \n", 320 | " return X,Y,option\n", 321 | " \n", 322 | "#############################################################################\n", 323 | "#END OF MISSING DATA\n", 324 | "#############################################################################" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "# Encoding Labels" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "def encodingLabels(Y,dataOption,datasetOption):# Encoding the labels with multi class or binary\n", 341 | " \n", 342 | " if datasetOption == \"1\": #Check if the data set choosen is NSL-KDD or IDS2017\n", 343 | " \n", 344 | " if dataOption == \"1\" or dataOption == \"2\" or dataOption == \"3\":\n", 345 | " \n", 346 | " while True:\n", 347 | " print(\"\\n\\n#########################################################################\")\n", 348 | " print(\"Encoding Menu\")\n", 349 | " print(\"#########################################################################\")\n", 350 | " print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n", 351 | " print(\"2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4\")\n", 352 | " encodeOption = input(\"Enter option :\") \n", 353 | " \n", 354 | " if encodeOption == \"1\" or encodeOption == \"2\":\n", 355 | " break\n", 356 | " else:\n", 357 | " \n", 358 | " print(\"Error\\n\\n\")\n", 359 | " \n", 360 | " \n", 361 | " if encodeOption == \"1\":\n", 362 | " #Binary Categories\n", 363 | " attackType = {'normal':\"normal\", 'neptune':\"abnormal\", 'warezclient':\"abnormal\", 'ipsweep':\"abnormal\",'back':\"abnormal\", 'smurf':\"abnormal\", 'rootkit':\"abnormal\",'satan':\"abnormal\", 
'guess_passwd':\"abnormal\",'portsweep':\"abnormal\",'teardrop':\"abnormal\",'nmap':\"abnormal\",'pod':\"abnormal\",'ftp_write':\"abnormal\",'multihop':\"abnormal\",'buffer_overflow':\"abnormal\",'imap':\"abnormal\",'warezmaster':\"abnormal\",'phf':\"abnormal\",'land':\"abnormal\",'loadmodule':\"abnormal\",'spy':\"abnormal\",'perl':\"abnormal\"} \n", 364 | " attackEncodingCluster = {'normal':0,'abnormal':1}\n", 365 | " \n", 366 | " Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data\n", 367 | " Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal\n", 368 | " return Y,encodeOption\n", 369 | " \n", 370 | " elif encodeOption == \"2\":\n", 371 | " #4 Main Categories\n", 372 | " #normal = 0\n", 373 | " #DoS = 1\n", 374 | " #Probe = 2\n", 375 | " #R2L = 3\n", 376 | " #U2R = 4\n", 377 | " attackType = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} \n", 378 | " attackEncodingCluster = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main Categories\n", 379 | " \n", 380 | " Y[:] = [attackType[item] for item in Y[:]] #Encoding the main 4 categories\n", 381 | " Y[:] = [attackEncodingCluster[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n", 382 | " return Y,encodeOption\n", 383 | " else:\n", 384 | " return Y\n", 385 | " \n", 386 | " \n", 387 | " elif datasetOption == \"2\":#Check if the data set choosen is NSL-KDD or IDS2017\n", 388 | " print(\"\\n\\n#########################################################################\")\n", 389 | " print(\"Encoding Menu\")\n", 390 | " print(\"#########################################################################\")\n", 391 | " print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n", 392 | " print(\"2. 
Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5\")\n", 393 | "            encodeOption = input(\"Enter option :\")\n", 394 | "\n", 395 | "            if encodeOption == \"2\":\n", 396 | "                Y = np.array(Y,dtype= object)\n", 397 | "                attackEncoding = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories\n", 398 | "                Y[:] = [attackEncoding[item] for item in Y[:]]# Encoding the attack names into the multiclass categories\n", 399 | "                \n", 400 | "                return Y,encodeOption\n", 401 | "            \n", 402 | "            elif encodeOption == \"1\":\n", 403 | "                Y = np.array(Y,dtype= object)\n", 404 | "                attackType = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories\n", 405 | "                attackEncoding = {'normal': 0, 'abnormal': 1}\n", 406 | "                \n", 407 | "                Y[:] = [attackType[item] for item in Y[:]]# Changing the names of attacks into binary categories\n", 408 | "                Y[:] = [attackEncoding[item] for item in Y[:]]# Encoding the binary categories as 0 and 1\n", 409 | "                return Y,encodeOption\n", 410 | "            \n", 411 | "            else:\n", 412 | "                return Y" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "# One Hot Encoding" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "#Encoding the categorical features using one hot encoding and using Main attacks categories or binary categories\n", 429 | "def oneHotEncodingData(X,dataOption):\n", 430 | "    \n", 431 | "    from sklearn.preprocessing import OneHotEncoder\n", 432 | "    from sklearn.compose import ColumnTransformer\n", 433 | "    #We use one hot encoding to prevent the machine learning model from attributing an order to the categorical data. 
\n", 434 | " #What one hot encoding(ColumnTransformer) does is, it takes a column which has categorical data, \n", 435 | " #which has been label encoded, and then splits the column into multiple columns.\n", 436 | " #The numbers are replaced by 1s and 0s, depending on which column has what value\n", 437 | " #We don't need to do a label encoded step because ColumnTransformer do one hot encode and label encode!\n", 438 | " #Encoding the Independient Variable\n", 439 | " if dataOption == \"1\": #Only for dataset with Categorical Data\n", 440 | " transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1,2,3])], remainder=\"passthrough\")\n", 441 | " X = transform.fit_transform(X)\n", 442 | " print(\"\\n\\n#########################################################################\")\n", 443 | " print(\"Data has been successfully One Hot Encoded\")\n", 444 | " print(\"#########################################################################\")\n", 445 | "\n", 446 | " return X\n", 447 | " elif dataOption == \"3\": #Only for risk data, because we don't have risk values for protocol feature we do one hot encoding for only that feature and the other ones we do risk value encoding\n", 448 | " transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1])], remainder=\"passthrough\")\n", 449 | " X = transform.fit_transform(X)\n", 450 | " print(\"\\n\\n#########################################################################\")\n", 451 | " print(\"Data has been successfully One Hot Encoded\")\n", 452 | " print(\"#########################################################################\")\n", 453 | " return X\n", 454 | " \n", 455 | " else:\n", 456 | " return X #return data with no changes" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "# Risk Encoding" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "def riskEncodingData(X,dataOption):#Risk encoding categorical features\n", 473 | " #Manually Encoding for the attacks types only\n", 474 | " if dataOption == \"3\": #if data option is risk Value\n", 475 | " X = pd.DataFrame(X)\n", 476 | " servers = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1}\n", 477 | " X[2] = [servers[item] for item in X[2]]\n", 478 | "\n", 479 | " servers_Error = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} \n", 480 | " X[3] = [servers_Error[item] for item in X[3]]\n", 481 | "\n", 482 | " print(\"\\n\\n#########################################################################\")\n", 483 | " print(\"Data has been successfully risk 
Encoded\")\n", 484 | " print(\"#########################################################################\")\n", 485 | "\n", 486 | " return X\n", 487 | " \n", 488 | " else:\n", 489 | " \n", 490 | " return X #return data with no changes" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "# Scaling " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "def scaling(X):#Scalign the data with the normalize method, we scale the data to have it in the same range for the experiments\n", 507 | " \n", 508 | " \n", 509 | "\n", 510 | " while True:\n", 511 | " \n", 512 | " decision = input(\"Scale data [y/n]:\")\n", 513 | " \n", 514 | " if decision == \"y\" or decision == \"n\":\n", 515 | " break\n", 516 | " else:\n", 517 | " \n", 518 | " print(\"Error\\n\\n\")\n", 519 | " \n", 520 | " if decision == \"y\":\n", 521 | " \n", 522 | " from sklearn.preprocessing import MinMaxScaler\n", 523 | " #Transforms features by scaling each feature to a given range.\n", 524 | " X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)\n", 525 | " print(\"\\n\\n#########################################################################\")\n", 526 | " print(\"Data has been successfully scaled.\")\n", 527 | " print(\"#########################################################################\")\n", 528 | " return X\n", 529 | " \n", 530 | " else:\n", 531 | " return X\n" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "# Shuffle" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "def shuffleData(X):# currently a bug, if we do shuffleling the experiments resutls are not good, the order of the data does not affect the results\n", 548 | "\n", 549 | " from sklearn.utils import shuffle\n", 550 | " while True:\n", 551 | " option = input(\"Shuffle data [y]/[n]:\")\n", 552 | " \n", 553 | " if option == \"y\" or option == \"n\":\n", 554 | " break\n", 555 | " else:\n", 556 | " \n", 557 | " print(\"Error\\n\\n\")\n", 558 | " \n", 559 | " if option == \"y\":\n", 560 | " \n", 561 | " X = pd.DataFrame(X)\n", 562 | " X = shuffle(X)\n", 563 | " X.reset_index(inplace=True,drop=True)\n", 564 | " X = np.array(X)\n", 565 | " \n", 566 | " print(\"\\n\\n#########################################################################\")\n", 567 | " print(\"Data has been successfully shuffled.\")\n", 568 | " print(\"#########################################################################\")\n", 569 | " return X\n", 570 | " else:\n", 571 | " \n", 572 | " return X" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# KMEANS" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "def kmeansClustering(X,Y):#K-means algorithm \n", 589 | " from sklearn.cluster import KMeans\n", 590 | "\n", 591 | " while True:\n", 592 | " print(\"\\n\\n#########################################################################\")\n", 593 | " print(\"KMEANS ALGORITHM\")\n", 594 | " print(\"#########################################################################\")\n", 595 | " \n", 596 | " nClusters = input(\"Number of clusters:\")\n", 597 | " \n", 598 | " try:\n", 599 | " nClusters = int(nClusters)\n", 600 | " \n", 601 | " except 
ValueError:\n", 602 | " \n", 603 | " print(\"Error\\n\\n\")\n", 604 | " \n", 605 | " if type(nClusters) == int:\n", 606 | " n = 0\n", 607 | " clusters = []\n", 608 | " \n", 609 | " while n < nClusters:#Converting nCluster into an array of n clusters [n] for use it later\n", 610 | " clusters.append(n)\n", 611 | " n+=1\n", 612 | " break\n", 613 | " \n", 614 | " while True:\n", 615 | " init = input(\"Initialization method [k-means++,random]:\")\n", 616 | " \n", 617 | " if init == \"k-means++\" or init == \"random\":\n", 618 | " break\n", 619 | "\n", 620 | " print(\"\\nClustering...\\n\")\n", 621 | " \n", 622 | " start_time = time.time()\n", 623 | " KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0)\n", 624 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 625 | " print(\"Data Successfully Clustered\")\n", 626 | " kmeans = KMEANS.fit(X)\n", 627 | " Z = kmeans.labels_\n", 628 | " inertia = KMEANS.inertia_\n", 629 | " #Kmeans Results\n", 630 | " kmeansR = pd.crosstab(Y,Z)\n", 631 | " maxVal = kmeansR.idxmax()\n", 632 | " \n", 633 | " return Z,clusters,kmeansR,maxVal,inertia\n" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "# Kmeans F1 Score" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans\n", 650 | " from sklearn.metrics import f1_score\n", 651 | " #Encoding data to F-score\n", 652 | " \n", 653 | " \n", 654 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 655 | " n = 0 # counter\n", 656 | " dictionaryCluster = {} # creating an empty dictionary \n", 657 | " f1 = 0 #f1score\n", 658 | " average = ''\n", 659 | " \n", 660 | " while n < len(clusters):# while counter < number of clusters\n", 661 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 662 | " n+=1\n", 663 | " \n", 664 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 665 | " \n", 666 | " Y = np.array(Y,dtype = int) # Converting labels into a int array\n", 667 | " \n", 668 | " while True:\n", 669 | " \n", 670 | " average = input(\"Average Method[weighted,micro,macro,binary]:\")\n", 671 | " \n", 672 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == 'binary':\n", 673 | " break\n", 674 | " #score metric \n", 675 | " f1 = f1_score(Y,Z, average = average) #Forget the labels that where not predicted and gives lables that were predicted at least once\n", 676 | " \n", 677 | " return f1,dictionaryCluster" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "# KMEANS Normal Mutial Info" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "\n", 694 | "def kNMI(Z,Y,maxVal,clusters):\n", 695 | " from sklearn.metrics import normalized_mutual_info_score\n", 696 | " \n", 697 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 698 | " n = 0 # 
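The matching step described in the comments above, where each found cluster is assigned the true label that occurs most often inside it and the relabelled assignments are then scored, can be seen end to end on a toy example. This is only an illustrative sketch, not the notebook's own data:

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

y_true   = np.array([0, 0, 0, 1, 1, 1])
clusters = np.array([2, 2, 2, 0, 1, 1])      # raw cluster ids returned by the algorithm
table    = pd.crosstab(y_true, clusters)     # rows: true labels, columns: cluster ids
majority = table.idxmax()                    # most frequent true label per cluster (the "maxVal" idea)
mapped   = np.array([majority[c] for c in clusters])
print(f1_score(y_true, mapped, average="weighted"))   # 1.0 for this toy case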
counter\n", 699 | " dictionaryCluster = {} # creating an empty dictionary \n", 700 | " NMI = 0\n", 701 | " average = ''\n", 702 | " \n", 703 | " while n < len(clusters):# while counter < number of clusters\n", 704 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 705 | " n+=1\n", 706 | " \n", 707 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 708 | " \n", 709 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 710 | " \n", 711 | " while True:\n", 712 | " \n", 713 | " average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n", 714 | " \n", 715 | " if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n", 716 | " break\n", 717 | " #Score metric \n", 718 | " NMI = normalized_mutual_info_score(Y, Z, average_method = average)\n", 719 | " \n", 720 | " return NMI,dictionaryCluster\n" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "# KMEANS Adjusted Random Score" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "def kARS(Z,Y,maxVal,clusters):\n", 737 | " from sklearn.metrics import adjusted_rand_score\n", 738 | " \n", 739 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 740 | " n = 0 # counter\n", 741 | " dictionaryCluster = {} # creating an empty dictionary \n", 742 | " ars = 0\n", 743 | " \n", 744 | " while n < len(clusters):# while counter < number of clusters\n", 745 | " dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 746 | " n+=1\n", 747 | " \n", 748 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 749 | " \n", 750 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 751 | " \n", 752 | " #score metric\n", 753 | " ars = adjusted_rand_score(Y, Z)\n", 754 | " \n", 755 | " return ars,dictionaryCluster" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "# DBSCAN" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "def dbscanClustering(X,Y):#DBSCAN algorithm\n", 772 | " from sklearn.cluster import DBSCAN\n", 773 | " \n", 774 | " while True:\n", 775 | " \n", 776 | " print(\"\\n\\n#########################################################################\")\n", 777 | " print(\"DBSCAN ALGORITHM\")\n", 778 | " print(\"#########################################################################\")\n", 779 | " \n", 780 | " epsilon = input(\"epsilon[Decimal]:\")\n", 781 | " \n", 782 | " try:\n", 783 | " epsilon = float(epsilon)\n", 784 | " \n", 785 | " except ValueError:\n", 786 | " \n", 787 | " print(\"Enter a Decimal number\")\n", 788 | " \n", 789 | " \n", 790 | " if type(epsilon) == float:\n", 791 | " break\n", 792 | " \n", 793 | " while True:\n", 794 | " minSamples = input(\"Min Samples[Integer]:\")\n", 795 | " \n", 796 | " try:\n", 797 | " minSamples = int(minSamples)\n", 798 | " \n", 799 | " except 
ValueError:\n", 800 | " \n", 801 | " print(\"Enter a Integer Number\")\n", 802 | " \n", 803 | " if type(minSamples) == int:\n", 804 | " break\n", 805 | " \n", 806 | " while True:\n", 807 | " algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n", 808 | " \n", 809 | " if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n", 810 | " break\n", 811 | " \n", 812 | " else:\n", 813 | " print(\"Error\\n\\n\")\n", 814 | " \n", 815 | " \n", 816 | " print(\"\\nClustering...\\n\")\n", 817 | "\n", 818 | " #Compute DBSCAN\n", 819 | " start_time = time.time() \n", 820 | " db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X)\n", 821 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 822 | " print(\"Data Successfully Clustered\")\n", 823 | " \n", 824 | " \n", 825 | " core_samples_mask = np.zeros_like(db.labels_, dtype=bool)\n", 826 | " core_samples_mask[db.core_sample_indices_] = True\n", 827 | " \n", 828 | " Z = db.labels_\n", 829 | " # Number of clusters in labels, ignoring noise if present.\n", 830 | " n_clusters = len(set(Z))\n", 831 | " n_noise_ = list(Z).count(-1)\n", 832 | " \n", 833 | " n = -1 # DBSCAN return index -1 cluster\n", 834 | " clusters = []\n", 835 | " while n + 1 < n_clusters:\n", 836 | " clusters.append(n)\n", 837 | " n += 1\n", 838 | " \n", 839 | " #DBSCAN Results\n", 840 | " dbscanR = pd.crosstab(Y,Z)\n", 841 | " maxVal = dbscanR.idxmax()\n", 842 | " \n", 843 | " return Z,clusters,n_noise_,dbscanR,maxVal" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "metadata": {}, 849 | "source": [ 850 | "# DBSCAN F1 Score" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": {}, 857 | "outputs": [], 858 | "source": [ 859 | "def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN\n", 860 | " from sklearn.metrics import f1_score\n", 861 | " #Encoding data to F-score\n", 862 | " \n", 863 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 864 | " n = 0 # counter\n", 865 | " c = -1 # - counter max Value has negative index\n", 866 | " dictionaryCluster = {} # creating an empty dictionary \n", 867 | " f1 = 0\n", 868 | " average = ''\n", 869 | " \n", 870 | " while n < len(clusters):# while counter < number of clusters\n", 871 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 872 | " n+=1\n", 873 | " c+=1\n", 874 | " \n", 875 | " \n", 876 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 877 | " \n", 878 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 879 | " while True:\n", 880 | " \n", 881 | " average = input(\"Average Method[weighted,micro,macro]:\")\n", 882 | " \n", 883 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n", 884 | " break\n", 885 | " \n", 886 | " else:\n", 887 | " \n", 888 | " print(\"Error\\n\\n\")\n", 889 | " #score metric\n", 890 | " f1 = f1_score(Y,Z, average = average)\n", 891 | " return f1,dictionaryCluster" 892 | ] 893 | }, 894 | { 895 | "cell_type": "markdown", 896 | "metadata": {}, 897 | "source": [ 898 | "# DBSCAN Mutual Info Score" 899 | ] 900 | }, 901 | { 902 | "cell_type": "code", 903 | "execution_count": 
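For readers less familiar with DBSCAN's output: points in dense neighbourhoods get cluster ids 0, 1, ..., while points that fit nowhere get the id -1 (noise). That -1 id also appears as a crosstab column, which is why the matching loops above start their counter at -1. A small illustrative example on toy data:

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

X = np.array([[0.0, 0.0], [0.1, 0.1], [0.2, 0.0],
              [5.0, 5.0], [5.1, 5.1], [9.0, 9.0]])
y_true = np.array([1, 1, 1, 0, 0, 1])               # made-up true labels
db = DBSCAN(eps=0.5, min_samples=2).fit(X)
print(db.labels_)                                    # [0 0 0 1 1 -1]; -1 marks the noise point
print(pd.crosstab(y_true, db.labels_).idxmax())      # Series indexed by cluster id, including -1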
null, 904 | "metadata": {}, 905 | "outputs": [], 906 | "source": [ 907 | "def dbNMI(Z,Y,clusters,maxVal):# Mutual info score for dbscan\n", 908 | " from sklearn.metrics import normalized_mutual_info_score\n", 909 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 910 | " n = 0 # counter\n", 911 | " c = -1 # - counter max Value has negative index\n", 912 | " NMI = 0\n", 913 | " dictionaryCluster = {} # creating an empty dictionary \n", 914 | " average = ''\n", 915 | " \n", 916 | " while n < len(clusters):# while counter < number of clusters\n", 917 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 918 | " n+=1\n", 919 | " c+=1\n", 920 | " \n", 921 | " Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n", 922 | "\n", 923 | " while True:\n", 924 | " \n", 925 | " average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n", 926 | " \n", 927 | " if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n", 928 | " break\n", 929 | " else:\n", 930 | " \n", 931 | " print(\"Error\\n\\n\")\n", 932 | " #score metric\n", 933 | " NMI = normalized_mutual_info_score(Y, Z, average_method= average)\n", 934 | " \n", 935 | " return NMI,dictionaryCluster" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "# DBSCAN Adjusted Random Score" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": null, 948 | "metadata": {}, 949 | "outputs": [], 950 | "source": [ 951 | "def dbARS(Z,Y,clusters,maxVal): # adjusted rand score for dbscan\n", 952 | " from sklearn.metrics import adjusted_rand_score\n", 953 | " \n", 954 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 955 | " n = 0 # counter\n", 956 | " c = -1 # - counter max Value has negative index\n", 957 | " ars = 0\n", 958 | " dictionaryCluster = {} # creating an empty dictionary \n", 959 | " \n", 960 | " while n < len(clusters):# while counter < number of clusters\n", 961 | " dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n", 962 | " n+=1\n", 963 | " c+=1\n", 964 | " #score metric\n", 965 | " ars = adjusted_rand_score(Y,Z)\n", 966 | " \n", 967 | " return ars,dictionaryCluster" 968 | ] 969 | }, 970 | { 971 | "cell_type": "markdown", 972 | "metadata": {}, 973 | "source": [ 974 | "# Isolation Forest" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": null, 980 | "metadata": {}, 981 | "outputs": [], 982 | "source": [ 983 | "def isolationForest(X,Y):# isolation forest algorithm\n", 984 | " from sklearn.ensemble import IsolationForest\n", 985 | " \n", 986 | " while True:\n", 987 | " contamination = input(\"Contamination[Float 0 to 0.5]: \")\n", 988 | " \n", 989 | " try:\n", 990 | " contamination = float(contamination)\n", 991 | " \n", 992 | " except ValueError:\n", 993 | " \n", 994 | " print(\"Enter a Number\")\n", 995 | " \n", 996 | " if type(contamination) == float and (contamination >= 0 and contamination <= 0.5):\n", 997 | " break\n", 998 | " \n", 999 | " print(\"\\nClustering...\\n\") \n", 1000 | " \n", 1001 | " start_time = time.time() 
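Unlike F1, the other two metrics offered in the score menus, normalized mutual information and the adjusted Rand score, compare the two groupings directly and are unchanged by a one-to-one renaming of cluster ids. A short illustration on toy labels only:

import numpy as np
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score

y_true  = np.array([0, 0, 1, 1, 2, 2])
y_clust = np.array([1, 1, 0, 0, 2, 2])   # same grouping, different cluster ids
print(normalized_mutual_info_score(y_true, y_clust, average_method="arithmetic"))  # 1.0
print(adjusted_rand_score(y_true, y_clust))                                        # 1.0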
\n", 1002 | " Z = IsolationForest(max_samples = \"auto\",behaviour = \"new\",contamination = contamination).fit_predict(X)\n", 1003 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 1004 | " \n", 1005 | " Z = np.array(Z,dtype = object)\n", 1006 | " \n", 1007 | " ifR = pd.crosstab(Y,Z)\n", 1008 | " ifR = pd.DataFrame(ifR)\n", 1009 | " maxVal = ifR.idxmax()\n", 1010 | " \n", 1011 | " n = -1 # Isolation Forest return index -1 and 1 cluster\n", 1012 | " clusters = []\n", 1013 | " while n < len(ifR.columns):\n", 1014 | " clusters.append(n)\n", 1015 | " n += 2\n", 1016 | " \n", 1017 | " return Z,ifR,maxVal,clusters" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "# Isolation Forest F1 Score" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest\n", 1034 | " from sklearn.metrics import f1_score\n", 1035 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 1036 | " \n", 1037 | " n = 0 # counter\n", 1038 | " c = -1 # - counter max Value has negative index\n", 1039 | " f1 = 0\n", 1040 | " average = ''\n", 1041 | " dictionaryCluster = {} # creating an empty dictionary \n", 1042 | "\n", 1043 | " \n", 1044 | " while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n", 1045 | " dictionaryCluster[clusters[n]] = maxVal[c] \n", 1046 | " n+=1\n", 1047 | " c+=2\n", 1048 | " \n", 1049 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 1050 | " \n", 1051 | " Y = np.array(Y,dtype = int)\n", 1052 | " Z = np.array(Z,dtype = int)\n", 1053 | " \n", 1054 | " while True:\n", 1055 | " \n", 1056 | " average = input(\"Average Method[weighted,micro,macro]:\")\n", 1057 | " \n", 1058 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n", 1059 | " break\n", 1060 | " \n", 1061 | " else:\n", 1062 | " \n", 1063 | " print(\"Error\\n\\n\")\n", 1064 | " # score metric\n", 1065 | " f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n", 1066 | " \n", 1067 | " return f1,dictionaryCluster" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "# Local Outlier Factor" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": {}, 1081 | "outputs": [], 1082 | "source": [ 1083 | "def LOF(X,Y):# Local outlier factor algorithm\n", 1084 | " from sklearn.neighbors import LocalOutlierFactor \n", 1085 | " \n", 1086 | " while True:\n", 1087 | " contamination = input(\"Contamination[Float 0 to 0.5]: \")\n", 1088 | " \n", 1089 | " try:\n", 1090 | " contamination = float(contamination)\n", 1091 | " \n", 1092 | " except ValueError:\n", 1093 | " \n", 1094 | " print(\"Enter a Number\")\n", 1095 | " \n", 1096 | " if type(contamination) == float and (contamination > 0 and contamination <= 0.5):\n", 1097 | " break\n", 1098 | " \n", 1099 | " while True:\n", 1100 | " algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n", 1101 | " \n", 1102 | " if algorithm == \"auto\" or algorithm 
== \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n", 1103 | " break\n", 1104 | " else:\n", 1105 | " \n", 1106 | " print(\"Error\\n\\n\")\n", 1107 | " \n", 1108 | " print(\"\\nClustering...\\n\")\n", 1109 | " \n", 1110 | " start_time = time.time() \n", 1111 | " lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X)\n", 1112 | " print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n", 1113 | " \n", 1114 | " lofR = pd.crosstab(Y,lof)\n", 1115 | " maxVal = lofR.idxmax()\n", 1116 | " \n", 1117 | " \n", 1118 | " n = -1 # LOF return index -1 and 1 cluster\n", 1119 | " clusters = []\n", 1120 | " while n < len(lofR.columns):\n", 1121 | " clusters.append(n)\n", 1122 | " n += 2\n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " return lof,lofR,maxVal,clusters" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": {}, 1132 | "source": [ 1133 | "# Local Outlier Factor F1 Score" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "metadata": {}, 1140 | "outputs": [], 1141 | "source": [ 1142 | "def lofF1(Z,Y,clusters,maxVal): # f1 score for local outlier factor\n", 1143 | " from sklearn.metrics import f1_score\n", 1144 | " \n", 1145 | " # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n", 1146 | " n = 0 # counter\n", 1147 | " c = -1 # - counter max Value has negative index\n", 1148 | " f1 = 0\n", 1149 | " dictionaryCluster = {} # creating an empty dictionary \n", 1150 | " \n", 1151 | " while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n", 1152 | " dictionaryCluster[clusters[n]] = maxVal[c] \n", 1153 | " n+=1\n", 1154 | " c+=2\n", 1155 | " \n", 1156 | " Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n", 1157 | " Y = np.array(Y,dtype = int)\n", 1158 | " Z = np.array(Z,dtype = int)\n", 1159 | " while True:\n", 1160 | " \n", 1161 | " average = input(\"Average Method[weighted,None,micro,macro]:\")\n", 1162 | " \n", 1163 | " if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == \"None\":\n", 1164 | " break\n", 1165 | " \n", 1166 | " else:\n", 1167 | " \n", 1168 | " print(\"Error\\n\\n\")\n", 1169 | " f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n", 1170 | " \n", 1171 | " return f1,dictionaryCluster" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "markdown", 1176 | "metadata": {}, 1177 | "source": [ 1178 | "# Calling Functions" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "code", 1183 | "execution_count": null, 1184 | "metadata": {}, 1185 | "outputs": [], 1186 | "source": [ 1187 | "clear()\n", 1188 | "#Calling the functions\n", 1189 | "\n", 1190 | "##########################################################################\n", 1191 | "path,dataSetOption = getDataSet()\n", 1192 | "#########################################################################\n", 1193 | "#########################################################################\n", 1194 | "dataSet = readingData(path)\n", 1195 | "#########################################################################\n", 1196 | "#########################################################################\n", 1197 | 
"dataSet = checkMissing(dataSet)\n", 1198 | "#########################################################################\n", 1199 | "#########################################################################\n", 1200 | "data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms\n", 1201 | "#########################################################################\n", 1202 | "#########################################################################\n", 1203 | "try:\n", 1204 | " labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n", 1205 | "except ValueError:\n", 1206 | " labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n", 1207 | "#########################################################################\n", 1208 | "#########################################################################\n", 1209 | "data = riskEncodingData(data,dataOption)\n", 1210 | "#########################################################################\n", 1211 | "#########################################################################\n", 1212 | "data = oneHotEncodingData(data,dataOption) #One hot Encode with the complete data\n", 1213 | "#########################################################################\n", 1214 | "#########################################################################\n", 1215 | "data = scaling(data)\n", 1216 | "#########################################################################\n", 1217 | "#########################################################################\n", 1218 | "data = shuffleData(data)\n", 1219 | "#########################################################################\n", 1220 | "\n", 1221 | "#This menu is a option to run diferrent algorithms with the same preproceced data witouth the need of running all the code from 0 to make another experiment.\n", 1222 | "while True: \n", 1223 | " while True:\n", 1224 | " print(\"\\n\\n#########################################################################\")\n", 1225 | " print(\"Algorithm Menu\")\n", 1226 | " print(\"#########################################################################\")\n", 1227 | " \n", 1228 | " print(\"1.Kmeans\")\n", 1229 | " print(\"2.Dbscan\")\n", 1230 | " print(\"3.Isolation Forest\")\n", 1231 | " print(\"4.Local Factor Outlier\")\n", 1232 | " \n", 1233 | " algorithmOption = input(\"option:\")\n", 1234 | " \n", 1235 | " if algorithmOption == \"1\" or algorithmOption == \"2\" or algorithmOption == \"3\" or algorithmOption == \"4\":\n", 1236 | " break\n", 1237 | " else:\n", 1238 | " \n", 1239 | " print(\"Error\\n\\n\")\n", 1240 | "\n", 1241 | " \n", 1242 | " if algorithmOption == \"1\":\n", 1243 | " #########################################################################\n", 1244 | " #KMEANS\n", 1245 | " klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels)\n", 1246 | " print(\"#########################################################################\")\n", 1247 | " print(\"KMEANS RESULTS\\n\\n\")\n", 1248 | " print(\"Clusters -> \",kClusters,\"\\n\")\n", 1249 | " print(\"Inertia -> \",inertia)\n", 1250 | " print(kmeansR,\"\\n\\n\")\n", 1251 | " print(\"Max True Label\",\"\\n\\n\",maxKvalue)\n", 1252 | " print(\"#########################################################################\")\n", 1253 | " #########################################################################\n", 1254 | " 
print(\"\\n\\n#########################################################################\")\n", 1255 | " print(\"Kmeans Score Metrics Menu\")\n", 1256 | " print(\"#########################################################################\")\n", 1257 | " \n", 1258 | " while True:\n", 1259 | " print(\"1.F1 Score\")\n", 1260 | " print(\"2.Normalized Mutual Info Score\")\n", 1261 | " print(\"3.Adjusted Rand Score\")\n", 1262 | " \n", 1263 | " kScoreOption = input(\"option:\")\n", 1264 | " \n", 1265 | " if kScoreOption == \"1\" or kScoreOption == \"2\" or kScoreOption == \"3\":\n", 1266 | " break\n", 1267 | " else:\n", 1268 | " \n", 1269 | " print(\"Error\\n\\n\")\n", 1270 | " \n", 1271 | " if kScoreOption == \"1\":\n", 1272 | " #########################################################################\n", 1273 | " #F1 Score\n", 1274 | " kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters)\n", 1275 | " print(\"\\n\\n#########################################################################\")\n", 1276 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1277 | " print(\"KMEANS F1 Score -> \",kmeansF1)\n", 1278 | " print(\"#########################################################################\")\n", 1279 | " #########################################################################\n", 1280 | " \n", 1281 | " elif kScoreOption == \"2\":\n", 1282 | " #########################################################################\n", 1283 | " kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters)\n", 1284 | " print(\"\\n\\n#########################################################################\")\n", 1285 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1286 | " print(\"KMEANS Normalized Mutual Info Score -> \",kmeansNMI)\n", 1287 | " print(\"#########################################################################\")\n", 1288 | " #########################################################################\n", 1289 | " \n", 1290 | " elif kScoreOption == \"3\":\n", 1291 | " \n", 1292 | " #########################################################################\n", 1293 | " kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters)\n", 1294 | " print(\"\\n\\n#########################################################################\")\n", 1295 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1296 | " print(\"KMEANS Adjusted Rand Score -> \",kmeansARS)\n", 1297 | " print(\"#########################################################################\")\n", 1298 | " #########################################################################\n", 1299 | " \n", 1300 | " elif algorithmOption == \"2\":\n", 1301 | " #########################################################################\n", 1302 | " #DBSCAN\n", 1303 | " dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) \n", 1304 | " print(\"#########################################################################\")\n", 1305 | " print(\"DBSCAN RESULTS\\n\\n\")\n", 1306 | " print(\"Clusters -> \",dbClusters,\"\\n\")\n", 1307 | " print(dbscanR,\"\\n\\n\")\n", 1308 | " print(\"Noise -> \",nNoises)\n", 1309 | " print(\"Max True Label\",\"\\n\\n\",maxDBvalue)\n", 1310 | " print(\"#########################################################################\")\n", 1311 | " #########################################################################\n", 1312 | " 
print(\"\\n\\n#########################################################################\")\n", 1313 | " print(\"Dscan Score Metrics Menu\")\n", 1314 | " print(\"#########################################################################\")\n", 1315 | " print(\"1.F1 Score\")\n", 1316 | " print(\"2.Normalized Mutual Info Score\")\n", 1317 | " print(\"3.Adjusted Rand Score\")\n", 1318 | " \n", 1319 | " while True:\n", 1320 | " \n", 1321 | " dbScoreOption = input(\"option:\")\n", 1322 | " \n", 1323 | " if dbScoreOption == \"1\" or dbScoreOption == \"2\" or dbScoreOption == \"3\":\n", 1324 | " break\n", 1325 | " else:\n", 1326 | " \n", 1327 | " print(\"Error\\n\\n\")\n", 1328 | " \n", 1329 | " if dbScoreOption == \"1\":\n", 1330 | " #########################################################################\n", 1331 | " #F1 Score dbscan\n", 1332 | " dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue)\n", 1333 | " print(\"\\n\\n#########################################################################\")\n", 1334 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1335 | " print(\"DBSCAN F1 Score -> \",dbscanF1)\n", 1336 | " print(\"#########################################################################\")\n", 1337 | " #########################################################################\n", 1338 | " \n", 1339 | " elif dbScoreOption == \"2\":\n", 1340 | " #########################################################################\n", 1341 | " dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue)\n", 1342 | " print(\"\\n\\n#########################################################################\")\n", 1343 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1344 | " print(\"DBSCAN Normalized Mutual Info Score -> \",dbscanNMI)\n", 1345 | " print(\"#########################################################################\")\n", 1346 | " #########################################################################\n", 1347 | " \n", 1348 | " elif dbScoreOption == \"3\":\n", 1349 | " #########################################################################\n", 1350 | " dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue)\n", 1351 | " print(\"\\n\\n#########################################################################\")\n", 1352 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1353 | " print(\"DBSCAN Adjusted Rand Score -> \",dbscanARS)\n", 1354 | " print(\"#########################################################################\")\n", 1355 | " #########################################################################\n", 1356 | " \n", 1357 | " \n", 1358 | " elif algorithmOption == \"3\":\n", 1359 | " #########################################################################\n", 1360 | " ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels)\n", 1361 | " print(\"#########################################################################\")\n", 1362 | " print(\"Isolation Forest RESULTS\\n\\n\")\n", 1363 | " print(\"Clusters -> \",ifNclusters,\"\\n\")\n", 1364 | " print(ifR,\"\\n\\n\")\n", 1365 | " print(\"Max True Label\",\"\\n\\n\",MaxIfVal)\n", 1366 | " print(\"#########################################################################\")\n", 1367 | " #########################################################################\n", 1368 | " 
print(\"\\n\\n#########################################################################\")\n", 1369 | " print(\"Isolation Forest Score Metrics Menu\")\n", 1370 | " print(\"#########################################################################\")\n", 1371 | " print(\"1.F1 Score\")\n", 1372 | " \n", 1373 | " while True:\n", 1374 | " \n", 1375 | " ifScoreOption = input(\"option:\")\n", 1376 | " \n", 1377 | " if ifScoreOption == \"1\":\n", 1378 | " break\n", 1379 | " else:\n", 1380 | " \n", 1381 | " print(\"Error\\n\\n\")\n", 1382 | " \n", 1383 | " if ifScoreOption == \"1\":\n", 1384 | " \n", 1385 | " ##########################################################################\n", 1386 | " isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal)\n", 1387 | " print(\"\\n\\n#########################################################################\")\n", 1388 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1389 | " print(\"Isolation Forest F1 Score -> \",isolationForestF1)\n", 1390 | " print(\"#########################################################################\")\n", 1391 | " ##########################################################################\n", 1392 | " \n", 1393 | " elif algorithmOption == \"4\":\n", 1394 | " #########################################################################\n", 1395 | " LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels)\n", 1396 | " print(\"#########################################################################\")\n", 1397 | " print(\"Local Outlier Factor RESULTS\\n\\n\")\n", 1398 | " print(\"Clusters -> \",lofClusters,\"\\n\")\n", 1399 | " print(lofR,\"\\n\\n\")\n", 1400 | " print(\"Max True Label\",\"\\n\\n\",maxLOFvalue)\n", 1401 | " print(\"#########################################################################\")\n", 1402 | " #########################################################################\n", 1403 | " print(\"\\n\\n#########################################################################\")\n", 1404 | " print(\"LOF Score Metrics Menu\")\n", 1405 | " print(\"#########################################################################\")\n", 1406 | " print(\"1.F1 Score\")\n", 1407 | " \n", 1408 | " while True:\n", 1409 | " \n", 1410 | " lofScoreOption = input(\"option:\")\n", 1411 | " \n", 1412 | " if lofScoreOption == \"1\":\n", 1413 | " break\n", 1414 | " else:\n", 1415 | " \n", 1416 | " print(\"Error\\n\\n\")\n", 1417 | " \n", 1418 | " if lofScoreOption == \"1\":\n", 1419 | " \n", 1420 | " ##########################################################################\n", 1421 | " LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue)\n", 1422 | " print(\"\\n\\n#########################################################################\")\n", 1423 | " print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n", 1424 | " print(\"LOF F1 Score -> \",LOFf1)\n", 1425 | " print(\"#########################################################################\")\n", 1426 | " ##########################################################################\n", 1427 | " \n", 1428 | " while True: # If the user want to Make a new clustering algorithm test\n", 1429 | " \n", 1430 | " decision = input(\"Try another Clustering Algorithm[y/n]:\")\n", 1431 | " \n", 1432 | " if decision == \"y\" or decision == \"n\":\n", 1433 | " break\n", 1434 | " else:\n", 1435 | " \n", 1436 | " print(\"Error\\n\\n\")\n", 1437 | " \n", 1438 | " \n", 1439 | " if decision 
== \"n\":\n", 1440 | " break\n", 1441 | " \n", 1442 | " else:\n", 1443 | " clear()" 1444 | ] 1445 | } 1446 | ], 1447 | "metadata": { 1448 | "kernelspec": { 1449 | "display_name": "Python 3", 1450 | "language": "python", 1451 | "name": "python3" 1452 | }, 1453 | "language_info": { 1454 | "codemirror_mode": { 1455 | "name": "ipython", 1456 | "version": 3 1457 | }, 1458 | "file_extension": ".py", 1459 | "mimetype": "text/x-python", 1460 | "name": "python", 1461 | "nbconvert_exporter": "python", 1462 | "pygments_lexer": "ipython3", 1463 | "version": "3.7.3" 1464 | }, 1465 | "varInspector": { 1466 | "cols": { 1467 | "lenName": 16, 1468 | "lenType": 16, 1469 | "lenVar": 40 1470 | }, 1471 | "kernels_config": { 1472 | "python": { 1473 | "delete_cmd_postfix": "", 1474 | "delete_cmd_prefix": "del ", 1475 | "library": "var_list.py", 1476 | "varRefreshCmd": "print(var_dic_list())" 1477 | }, 1478 | "r": { 1479 | "delete_cmd_postfix": ") ", 1480 | "delete_cmd_prefix": "rm(", 1481 | "library": "var_list.r", 1482 | "varRefreshCmd": "cat(var_dic_list()) " 1483 | } 1484 | }, 1485 | "position": { 1486 | "height": "923px", 1487 | "left": "328px", 1488 | "right": "20px", 1489 | "top": "9px", 1490 | "width": "800px" 1491 | }, 1492 | "types_to_exclude": [ 1493 | "module", 1494 | "function", 1495 | "builtin_function_or_method", 1496 | "instance", 1497 | "_Feature" 1498 | ], 1499 | "window_display": false 1500 | } 1501 | }, 1502 | "nbformat": 4, 1503 | "nbformat_minor": 2 1504 | } 1505 | -------------------------------------------------------------------------------- /CBAD.py: -------------------------------------------------------------------------------- 1 | #@authors:jeremyperez,bethanydanner 2 | 3 | #reset -f 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import time 8 | import os 9 | 10 | clear = lambda:os.system('clear') 11 | 12 | def getDataSet(): 13 | 14 | while True: 15 | print("**************************************************") 16 | print("DATA SET MENU") 17 | print("**************************************************") 18 | print("1.NSL-KDD") 19 | print("2.IDS 2017") 20 | 21 | option = input("Option:") 22 | 23 | if option == "1" or option == "2": 24 | break 25 | 26 | path = input("Path of the File:") 27 | 28 | return path,option 29 | 30 | def readingData(path): #Reading the Dataset 31 | 32 | while True: 33 | 34 | option = input("Dataset has feature names[y/n]:") 35 | 36 | if option == "y" or option == "n": 37 | break 38 | 39 | print("\nReading Dataset...") 40 | 41 | if option == "y": 42 | dataSet = pd.read_csv(path,low_memory=False) 43 | 44 | elif option == "n": 45 | dataSet = pd.read_csv(path, header = None,low_memory=False) 46 | 47 | return dataSet 48 | 49 | 50 | def checkMissing(X):#Checking if the dataset given has missing values. 
51 | isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because ("cannot unpack non-iterable numpy.bool object") 52 | 53 | if isMissing == "True": 54 | #Replacing vales = "Infinity" with "nan" values, if any such values exist in dataset 55 | X = X.replace('Infinity', np.nan) 56 | 57 | missingValIndex = [] 58 | total = X.isnull().sum().sum() 59 | percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100 60 | 61 | for rows in X: #Reporting percentages of missing values in dataset 62 | 63 | if X[rows].isnull().sum() != 0: 64 | missingValIndex.append(rows) 65 | print("\n\n**************************************************") 66 | print("Data has missing values") 67 | print("**************************************************") 68 | print("Features with missing values:",missingValIndex) 69 | print("Total missing Values -> " , total) 70 | print(percent,"%") 71 | 72 | return X 73 | 74 | else: 75 | 76 | return X 77 | 78 | 79 | #Getting the data we want to test for the clustering algorithms 80 | def gettingVariables(dataSet,dataSetOption): 81 | #Obtaining features and labels for either NSL-KDD or IDS 2017 dataset. 82 | #Handling categorical data if NSL-KDD dataset is chosen. 83 | #and handling missing values if IDS 2017 dataset is chosen. 84 | 85 | if dataSetOption == "1": 86 | while True: 87 | print("\n\n**************************************************") 88 | print("Variables Menu") 89 | print("**************************************************") 90 | print("1.Data set with categorical data oneHot encoded") 91 | print("2.Data set with categorical data removed") 92 | print("3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded") 93 | option = input("Enter option :") 94 | 95 | 96 | if option == "1" or option == "2" or option == "3": 97 | break 98 | else: 99 | 100 | print("Error\n\n") 101 | 102 | #Getting the dependent and independent Variables 103 | #Removing the dificulty level feature from NSL-KDD dataset because we are not using supervised learning in this project 104 | 105 | if option == "1": 106 | #Keeping categorical features in dataset in order to One Hot Encode later on 107 | X = dataSet.iloc[:,:-2].values #Getting all data except for the last two columns (namely difficulty level and labels) 108 | Y = dataSet.iloc[:,42].values #Labels 109 | return X,Y,option 110 | 111 | elif option == "2": 112 | #Removing categorical data from the data set 113 | X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values 114 | Y = dataSet.iloc[:,42].values #Labels 115 | 116 | return X,Y,option 117 | 118 | elif option == "3": 119 | #Keeping categorical features in order to encode with risk values later on 120 | X = dataSet.iloc[:,:-2].values 121 | Y = dataSet.iloc[:,42].values #Labels 122 | 123 | return X,Y,option 124 | 125 | 126 | elif dataSetOption == "2": 127 | ############################################################################# 128 | #GETTING VARIABLES 129 | ############################################################################# 130 | missingValIndex = [] 131 | for rows in dataSet: #Getting features index with missing values 132 | if dataSet[rows].isnull().sum() != 0: 133 | missingValIndex.append(rows) 134 | 135 | X = dataSet.iloc[:,:-1].values#data 136 | #Assigning 0,1,2...n for the feature names if names are not specified 137 | X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward 
Packets','Total Length of Fwd Packets', 138 | ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std', 139 | 'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean', 140 | ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean', 141 | ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s', 142 | ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count', 143 | ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length', 144 | 'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes', 145 | 'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min']) 146 | Y = dataSet.iloc[:,78].values#Labels 147 | 148 | ############################################################################# 149 | #Variables successfully obtained 150 | ############################################################################# 151 | 152 | ############################################################################# 153 | #MANAGE MISSING DATA 154 | ############################################################################# 155 | 156 | while True: 157 | print("\n\n**************************************************") 158 | print("Manage Missing Values ") 159 | print("**************************************************") 160 | print("1.Eliminate Catg. 
w/ Missing Values") 161 | print("2.Impute 0 for Missing Values") 162 | print("3.Impute Mean for Missing Values") 163 | print("4.Impute Median for Missing Values") 164 | print("5.Impute Mode for Missing Values") 165 | print("6.Simple Imputer") 166 | missingDataOption = input("Option:") 167 | 168 | if missingDataOption == "1" or missingDataOption == "2" or missingDataOption == "3" or missingDataOption == "4" or missingDataOption == "5" or missingDataOption == "6": 169 | break 170 | 171 | 172 | if missingDataOption == "1": 173 | deletedColumns = [] 174 | numColumns = len(X.columns) 175 | #Removing features with missing values 176 | for row in missingValIndex: 177 | deletedColumns.append(row) 178 | del X[row] 179 | 180 | print("#\n\n########################################################################") 181 | print("Columns Succesfully Removed") 182 | print(len(deletedColumns),"of",numColumns,"were deleted") 183 | print("Columns Names -> ",deletedColumns) 184 | print("#########################################################################") 185 | 186 | elif missingDataOption == "2": 187 | #Impute 0 for missing values 188 | for row in missingValIndex: 189 | X[row] = X[row].fillna(0) 190 | 191 | print("\n\n#########################################################################") 192 | print("Sucessfully Filled Missing Values with 0") 193 | print("#########################################################################") 194 | 195 | 196 | elif missingDataOption == "3": 197 | #Impute mean for missing values 198 | for row in missingValIndex: 199 | X[row] = X[row].astype(float) 200 | X[row] = X[row].fillna(X[row].mean()) 201 | 202 | print("\n\n#########################################################################") 203 | print("Sucessfully Filled Missing Values with Mean") 204 | print("#########################################################################") 205 | 206 | elif missingDataOption == "4": 207 | #Impute median for missing values 208 | for row in missingValIndex: 209 | median = X[row].median() 210 | X[row].fillna(median, inplace=True) 211 | print("\n\n#########################################################################") 212 | print("Sucessfully Filled Missing Values with Median") 213 | print("#########################################################################") 214 | 215 | elif missingDataOption == "5": 216 | #Impute mode for missing values 217 | for row in missingValIndex: 218 | X[row] = X[row].fillna(X[row].mode()[0]) 219 | 220 | print("\n\n#########################################################################") 221 | print("Sucessfully Filled Missing Values with Mode ") 222 | print("#########################################################################") 223 | 224 | elif missingDataOption == "6": 225 | from sklearn.impute import SimpleImputer 226 | #"Imputation transformer for completing missing values."(Univariate) 227 | X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X) 228 | print("\n\n#########################################################################") 229 | print("Sucessfully Imputed Simple Imputer ") 230 | print("#########################################################################") 231 | 232 | 233 | option = "None" #This data does not have categorical features so dataOption is none 234 | return X,Y,option 235 | 236 | ############################################################################# 237 | #END OF MISSING DATA 238 | 
############################################################################# 239 | 240 | 241 | 242 | 243 | 244 | 245 | def encodingLabels(Y,dataOption,datasetOption):#Encoding the labels with multiclass or binary labels 246 | 247 | if datasetOption == "1": #Checking if the data set chosen is NSL-KDD 248 | 249 | if dataOption == "1" or dataOption == "2" or dataOption == "3": 250 | 251 | while True: 252 | print("\n\n#########################################################################") 253 | print("Encoding Menu") 254 | print("#########################################################################") 255 | print("1.Binary true labels: normal = 0, abnormal = 1") 256 | print("2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4") 257 | encodeOption = input("Enter option :") 258 | 259 | if encodeOption == "1" or encodeOption == "2": 260 | break 261 | else: 262 | 263 | print("Error\n\n") 264 | 265 | 266 | if encodeOption == "1": 267 | #Binary Categories 268 | attackType = {'normal':"normal", 'neptune':"abnormal", 'warezclient':"abnormal", 'ipsweep':"abnormal",'back':"abnormal", 'smurf':"abnormal", 'rootkit':"abnormal",'satan':"abnormal", 'guess_passwd':"abnormal",'portsweep':"abnormal",'teardrop':"abnormal",'nmap':"abnormal",'pod':"abnormal",'ftp_write':"abnormal",'multihop':"abnormal",'buffer_overflow':"abnormal",'imap':"abnormal",'warezmaster':"abnormal",'phf':"abnormal",'land':"abnormal",'loadmodule':"abnormal",'spy':"abnormal",'perl':"abnormal"} 269 | attackEncodingCluster = {'normal':0,'abnormal':1} 270 | 271 | Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data 272 | Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal 273 | return Y,encodeOption 274 | 275 | elif encodeOption == "2": 276 | #Multiclass Categories 277 | #normal = 0 278 | #DoS = 1 279 | #Probe = 2 280 | #R2L = 3 281 | #U2R = 4 282 | attackType = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} 283 | attackEncodingCluster = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main Categories 284 | 285 | Y[:] = [attackType[item] for item in Y[:]] #Encoding the 22 fine-grain attack labels into the 4 main types of attacks, and leaving 'normal' as 'normal' 286 | Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of attacks into numerical data 287 | return Y,encodeOption 288 | else: 289 | return Y 290 | 291 | 292 | elif datasetOption == "2":#Checking if the data set chosen is IDS2017 293 | print("\n\n#########################################################################") 294 | print("Encoding Menu") 295 | print("#########################################################################") 296 | print("1.Binary true labels: normal = 0, abnormal = 1") 297 | print("2. 
Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5") 298 | encodeOption = input("Enter option :") 299 | 300 | if encodeOption == "1": 301 | Y = np.array(Y,dtype= object) 302 | attackEncoding = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories 303 | Y[:] = [attackEncoding[item] for item in Y[:]]#Changing the names of attacks into categorical data 304 | 305 | return Y,encodeOption 306 | 307 | elif encodeOption == "2": 308 | Y = np.array(Y,dtype= object) 309 | attackType = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories 310 | attackEncoding = {'normal': 0, 'abnormal': 1} 311 | 312 | Y[:] = [attackType[item] for item in Y[:]]#Changing the names of attacks into binary categories 313 | Y[:] = [attackEncoding[item] for item in Y[:]]#Changing the names of attacks into binary categories 314 | return Y,encodeOption 315 | 316 | else: 317 | return Y 318 | 319 | 320 | 321 | 322 | #Encoding the categorical features using one hot encoding and using Main attacks categories or binary categories 323 | def oneHotEncodingData(X,dataOption): 324 | 325 | from sklearn.preprocessing import OneHotEncoder 326 | from sklearn.compose import ColumnTransformer 327 | 328 | #Label encoding step is unnecessary because ColumnTransformer performs both one hot encoding and label encoding 329 | #Encoding the Independient Variable 330 | if dataOption == "1": #For One Hot Encoding all categorical data 331 | transform = ColumnTransformer([("Servers", OneHotEncoder(categories = "auto"), [1,2,3])], remainder="passthrough") 332 | X = transform.fit_transform(X) 333 | print("\n\n#########################################################################") 334 | print("Data has been successfully One Hot Encoded") 335 | print("#########################################################################") 336 | 337 | return X 338 | elif dataOption == "3": #For risk encoding categorical data: One Hot Encoding Protocol Feature because there is no risk value data for that feature, and it only has 3 attributes, limiting the number of added features by One Hot Encoding 339 | transform = ColumnTransformer([("Servers", OneHotEncoder(categories = "auto"), [1])], remainder="passthrough") 340 | X = transform.fit_transform(X) 341 | print("\n\n#########################################################################") 342 | print("Data has been successfully One Hot Encoded") 343 | print("#########################################################################") 344 | return X 345 | 346 | else: 347 | return X #Returning data with no changes 348 | 349 | 350 | def riskEncodingData(X,dataOption):#Assinging risk values to categorical features "Servers" and "Server Errors" 351 | #Manually Encoding for the attacks types only 352 | if dataOption == "3": #if data option is risk Value 353 | X = pd.DataFrame(X) 354 | servers = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 
'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1} 355 | X[2] = [servers[item] for item in X[2]] 356 | 357 | servers_Error = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} 358 | X[3] = [servers_Error[item] for item in X[3]] 359 | 360 | print("\n\n#########################################################################") 361 | print("Data has been successfully risk Encoded") 362 | print("#########################################################################") 363 | 364 | return X 365 | 366 | else: 367 | 368 | return X #Returning the data with no changes 369 | 370 | 371 | 372 | 373 | def scaling(X):#Scaling the data with the MinMaxScaler method so that values in each feature are in the same range for experiments. 374 | 375 | 376 | 377 | while True: 378 | 379 | decision = input("Scale data [y/n]:") 380 | 381 | if decision == "y" or decision == "n": 382 | break 383 | else: 384 | 385 | print("Error\n\n") 386 | 387 | if decision == "y": 388 | 389 | from sklearn.preprocessing import MinMaxScaler 390 | #Transforming features by scaling each feature to the given range, (0,1) 391 | X = MinMaxScaler(feature_range=(0, 1)).fit_transform(X) 392 | print("\n\n#########################################################################") 393 | print("Data has been successfully scaled.") 394 | print("#########################################################################") 395 | return X 396 | 397 | else: 398 | return X 399 | 400 | 401 | def shuffleData(X):#Shuffling the order of data instances. Currently this is a bug in code. If we experiment on shuffled data, the algorithms return nonsense results. 
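    # Note: only X is shuffled here; the true labels Y keep their original order, so any later
    # comparison of cluster labels against Y (e.g. pd.crosstab(Y, Z)) is misaligned row by row.
    # This likely explains the "nonsense results" mentioned in the comment above.
    # A possible fix (a sketch only, not applied in this script) is to shuffle both together:
    #     from sklearn.utils import shuffle
    #     X, Y = shuffle(X, Y)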
402 | 403 | from sklearn.utils import shuffle 404 | while True: 405 | option = input("Shuffle data [y]/[n]:") 406 | 407 | if option == "y" or option == "n": 408 | break 409 | else: 410 | 411 | print("Error\n\n") 412 | 413 | if option == "y": 414 | 415 | X = pd.DataFrame(X) 416 | X = shuffle(X) 417 | X.reset_index(inplace=True,drop=True) 418 | X = np.array(X) 419 | 420 | print("\n\n#########################################################################") 421 | print("Data has been successfully shuffled.") 422 | print("#########################################################################") 423 | return X 424 | else: 425 | 426 | return X 427 | 428 | 429 | 430 | 431 | #K-Means Algorithm 432 | def kmeansClustering(X,Y): 433 | from sklearn.cluster import KMeans 434 | 435 | while True: 436 | print("\n\n#########################################################################") 437 | print("KMEANS ALGORITHM") 438 | print("#########################################################################") 439 | 440 | nClusters = input("Number of clusters:") 441 | 442 | try: 443 | nClusters = int(nClusters) 444 | 445 | except ValueError: 446 | 447 | print("Error\n\n") 448 | 449 | if type(nClusters) == int: 450 | n = 0 451 | clusters = [] 452 | 453 | while n < nClusters:#Converting nClusters into an array of n clusters [n] for use it later 454 | clusters.append(n) 455 | n+=1 456 | break 457 | 458 | while True: 459 | init = input("Initialization method [k-means++,random]:") 460 | 461 | if init == "k-means++" or init == "random": 462 | break 463 | 464 | print("\nClustering...\n") 465 | 466 | start_time = time.time() 467 | KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0) 468 | print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time)) 469 | print("Data Successfully Clustered") 470 | kmeans = KMEANS.fit(X) 471 | Z = kmeans.labels_ 472 | inertia = KMEANS.inertia_ 473 | #Kmeans Results 474 | kmeansR = pd.crosstab(Y,Z) 475 | maxVal = kmeansR.idxmax() 476 | 477 | return Z,clusters,kmeansR,maxVal,inertia 478 | 479 | 480 | 481 | 482 | def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans 483 | from sklearn.metrics import f1_score 484 | #Encoding data to F-score 485 | 486 | 487 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
488 | n = 0 #counter 489 | dictionaryCluster = {} #creating an empty dictionary 490 | f1 = 0 #f1score 491 | average = '' 492 | 493 | while n < len(clusters):#while counter < number of clusters 494 | dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 495 | n+=1 496 | 497 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 498 | 499 | Y = np.array(Y,dtype = int) #Converting labels into an int array 500 | 501 | while True: 502 | 503 | average = input("Average Method[weighted,micro,macro,binary]:") 504 | 505 | if average == "weighted" or average == "micro" or average == "macro" or average == 'binary': 506 | break 507 | #score metric 508 | f1 = f1_score(Y,Z, average = average) 509 | 510 | return f1,dictionaryCluster 511 | 512 | 513 | 514 | def kNMI(Z,Y,maxVal,clusters): 515 | from sklearn.metrics import normalized_mutual_info_score 516 | 517 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 518 | n = 0 # counter 519 | dictionaryCluster = {} #creating an empty dictionary 520 | NMI = 0 521 | average = '' 522 | 523 | while n < len(clusters):#while counter < number of clusters 524 | dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 525 | n+=1 526 | 527 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 528 | 529 | Y = np.array(Y,dtype = int) #Making sure that labels are in an int array 530 | 531 | while True: 532 | 533 | average = input("Average Method[geometric,min,arithmetic,max]:") 534 | 535 | if average == "geometric" or average == "min" or average == "arithmetic" or average == "max": 536 | break 537 | #Score metric 538 | NMI = normalized_mutual_info_score(Y, Z, average_method = average) 539 | 540 | return NMI,dictionaryCluster 541 | 542 | 543 | 544 | def kARS(Z,Y,maxVal,clusters): 545 | from sklearn.metrics import adjusted_rand_score 546 | 547 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
548 | n = 0 # counter 549 | dictionaryCluster = {} #Creating an empty dictionary 550 | ars = 0 551 | 552 | while n < len(clusters):# while counter < number of clusters 553 | dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 554 | n+=1 555 | 556 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 557 | 558 | Y = np.array(Y,dtype = int) #Making sure that labels are in an int array 559 | 560 | #score metric 561 | ars = adjusted_rand_score(Y, Z) 562 | 563 | return ars,dictionaryCluster 564 | 565 | 566 | #DBSCAN Algorithm 567 | def dbscanClustering(X,Y): 568 | from sklearn.cluster import DBSCAN 569 | 570 | while True: 571 | 572 | print("\n\n#########################################################################") 573 | print("DBSCAN ALGORITHM") 574 | print("#########################################################################") 575 | 576 | epsilon = input("epsilon[Decimal]:") 577 | 578 | try: 579 | epsilon = float(epsilon) 580 | 581 | except ValueError: 582 | 583 | print("Enter a Decimal number") 584 | 585 | 586 | if type(epsilon) == float: 587 | break 588 | 589 | while True: 590 | minSamples = input("Min Samples[Integer]:") 591 | 592 | try: 593 | minSamples = int(minSamples) 594 | 595 | except ValueError: 596 | 597 | print("Enter a Integer Number") 598 | 599 | if type(minSamples) == int: 600 | break 601 | 602 | while True: 603 | algorithm = input("Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:") 604 | 605 | if algorithm == "auto" or algorithm == "ball_tree" or algorithm == "kd_tree" or algorithm == "brute": 606 | break 607 | 608 | else: 609 | print("Error\n\n") 610 | 611 | 612 | print("\nClustering...\n") 613 | 614 | #Computing DBSCAN 615 | start_time = time.time() 616 | db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X) 617 | print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time)) 618 | print("Data Successfully Clustered") 619 | 620 | 621 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 622 | core_samples_mask[db.core_sample_indices_] = True 623 | 624 | Z = db.labels_ 625 | # Number of clusters in labels, ignoring noise if present. 626 | n_clusters = len(set(Z)) 627 | n_noise_ = list(Z).count(-1) 628 | 629 | n = -1 #DBSCAN returns cluster with index -1 (anomalies) 630 | clusters = [] 631 | while n + 1 < n_clusters: 632 | clusters.append(n) 633 | n += 1 634 | 635 | #DBSCAN Results 636 | dbscanR = pd.crosstab(Y,Z) 637 | maxVal = dbscanR.idxmax() 638 | 639 | return Z,clusters,n_noise_,dbscanR,maxVal 640 | 641 | 642 | 643 | 644 | def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN 645 | from sklearn.metrics import f1_score 646 | #Encoding data to F-score 647 | 648 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
649 | n = 0 # counter 650 | c = -1 # - counter for when max Value has negative index 651 | dictionaryCluster = {} #Creating an empty dictionary 652 | f1 = 0 653 | average = '' 654 | 655 | while n < len(clusters):#while counter < number of clusters 656 | dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 657 | n+=1 658 | c+=1 659 | 660 | 661 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 662 | 663 | Y = np.array(Y,dtype = int) #Making sure that labels are in an int array 664 | while True: 665 | 666 | average = input("Average Method[weighted,micro,macro]:") 667 | 668 | if average == "weighted" or average == "micro" or average == "macro": 669 | break 670 | 671 | else: 672 | 673 | print("Error\n\n") 674 | #score metric 675 | f1 = f1_score(Y,Z, average = average) 676 | return f1,dictionaryCluster 677 | 678 | 679 | def dbNMI(Z,Y,clusters,maxVal):#Normalized Mutual Information score for DBSCAN 680 | from sklearn.metrics import normalized_mutual_info_score 681 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 682 | n = 0 # counter 683 | c = -1 # - counter max Value has negative index 684 | NMI = 0 685 | dictionaryCluster = {} #Creating an empty dictionary 686 | average = '' 687 | 688 | while n < len(clusters):#while counter < number of clusters 689 | dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 690 | n+=1 691 | c+=1 692 | 693 | Y = np.array(Y,dtype = int) #Making sure that labels are in an int array 694 | 695 | while True: 696 | 697 | average = input("Average Method[geometric,min,arithmetic,max]:") 698 | 699 | if average == "geometric" or average == "min" or average == "arithmetic" or average == "max": 700 | break 701 | else: 702 | 703 | print("Error\n\n") 704 | #score metric 705 | NMI = normalized_mutual_info_score(Y, Z, average_method= average) 706 | 707 | return NMI,dictionaryCluster 708 | 709 | def dbARS(Z,Y,clusters,maxVal): #Adjusted Rand Index score for DBSCAN 710 | from sklearn.metrics import adjusted_rand_score 711 | 712 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
713 | n = 0 # counter 714 | c = -1 # - counter max Value has negative index 715 | ars = 0 716 | dictionaryCluster = {} #Creating an empty dictionary 717 | 718 | while n < len(clusters):#while counter < number of clusters 719 | dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration 720 | n+=1 721 | c+=1 722 | #score metric 723 | ars = adjusted_rand_score(Y,Z) 724 | 725 | return ars,dictionaryCluster 726 | 727 | 728 | def isolationForest(X,Y):#Isolation Forest algorithm 729 | from sklearn.ensemble import IsolationForest 730 | 731 | while True: 732 | contamination = input("Contamination[Float 0 to 0.5]: ") 733 | 734 | try: 735 | contamination = float(contamination) 736 | 737 | except ValueError: 738 | 739 | print("Enter a Number") 740 | 741 | if type(contamination) == float and (contamination >= 0 and contamination <= 0.5): 742 | break 743 | 744 | print("\nClustering...\n") 745 | 746 | start_time = time.time() 747 | Z = IsolationForest(max_samples = "auto",behaviour = "new",contamination = contamination).fit_predict(X) 748 | print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time)) 749 | 750 | Z = np.array(Z,dtype = object) 751 | 752 | ifR = pd.crosstab(Y,Z) 753 | ifR = pd.DataFrame(ifR) 754 | maxVal = ifR.idxmax() 755 | 756 | n = -1 #Isolation Forest returns clusters with indicies -1 (outlier) and 1 (normal) 757 | clusters = [] 758 | while n < len(ifR.columns): 759 | clusters.append(n) 760 | n += 2 761 | 762 | return Z,ifR,maxVal,clusters 763 | 764 | def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest 765 | from sklearn.metrics import f1_score 766 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
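    # Note: Isolation Forest labels each point as -1 (anomaly) or 1 (normal), so the crosstab
    # columns -- and therefore the index of maxVal -- are -1 and 1. The counter c below starts
    # at -1 and steps by 2 so that it visits exactly those two index values.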
767 | 768 | n = 0 # counter 769 | c = -1 # - counter max Value has negative index 770 | f1 = 0 771 | average = '' 772 | dictionaryCluster = {} #Creating an empty dictionary 773 | 774 | 775 | while n < len(clusters): #Starting counter at -1 and incrementing by 2, because Isolation Forest returns -1 and 1 clusters 776 | dictionaryCluster[clusters[n]] = maxVal[c] 777 | n+=1 778 | c+=2 779 | 780 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 781 | 782 | Y = np.array(Y,dtype = int) 783 | Z = np.array(Z,dtype = int) 784 | 785 | while True: 786 | 787 | average = input("Average Method[weighted,micro,macro]:") 788 | 789 | if average == "weighted" or average == "micro" or average == "macro": 790 | break 791 | 792 | else: 793 | 794 | print("Error\n\n") 795 | #score metric 796 | f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted'] 797 | 798 | return f1,dictionaryCluster 799 | 800 | 801 | def LOF(X,Y):#Local Outlier Factor algorithm 802 | from sklearn.neighbors import LocalOutlierFactor 803 | 804 | while True: 805 | contamination = input("Contamination[Float 0 to 0.5]: ") 806 | 807 | try: 808 | contamination = float(contamination) 809 | 810 | except ValueError: 811 | 812 | print("Enter a Number") 813 | 814 | if type(contamination) == float and (contamination > 0 and contamination <= 0.5): 815 | break 816 | 817 | while True: 818 | algorithm = input("Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:") 819 | 820 | if algorithm == "auto" or algorithm == "ball_tree" or algorithm == "kd_tree" or algorithm == "brute": 821 | break 822 | else: 823 | 824 | print("Error\n\n") 825 | 826 | print("\nClustering...\n") 827 | 828 | start_time = time.time() 829 | lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X) 830 | print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time)) 831 | 832 | lofR = pd.crosstab(Y,lof) 833 | maxVal = lofR.idxmax() 834 | 835 | 836 | n = -1 #LOF returns index -1 and 1 cluster 837 | clusters = [] 838 | while n < len(lofR.columns): 839 | clusters.append(n) 840 | n += 2 841 | 842 | 843 | 844 | return lof,lofR,maxVal,clusters 845 | 846 | 847 | def lofF1(Z,Y,clusters,maxVal): #f1 score for local outlier factor 848 | from sklearn.metrics import f1_score 849 | 850 | #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease. 
851 | n = 0 # counter 852 | c = -1 # - counter max Value has negative index 853 | f1 = 0 854 | dictionaryCluster = {} # creating an empty dictionary 855 | 856 | while n < len(clusters): # Starting counter at -1 and incrementing by 2, because Isolation Forest returns -1 and 1 clusters 857 | dictionaryCluster[clusters[n]] = maxVal[c] 858 | n+=1 859 | c+=2 860 | 861 | Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value 862 | Y = np.array(Y,dtype = int) 863 | Z = np.array(Z,dtype = int) 864 | while True: 865 | 866 | average = input("Average Method[weighted,None,micro,macro]:") 867 | 868 | if average == "weighted" or average == "micro" or average == "macro" or average == "None": 869 | break 870 | 871 | else: 872 | 873 | print("Error\n\n") 874 | f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted'] 875 | 876 | return f1,dictionaryCluster 877 | 878 | clear() 879 | #Calling the functions 880 | 881 | ########################################################################## 882 | path,dataSetOption = getDataSet() 883 | ######################################################################### 884 | ######################################################################### 885 | dataSet = readingData(path) 886 | ######################################################################### 887 | ######################################################################### 888 | dataSet = checkMissing(dataSet) 889 | ######################################################################### 890 | ######################################################################### 891 | data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms 892 | ######################################################################### 893 | ######################################################################### 894 | try: 895 | labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels 896 | except ValueError: 897 | labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels 898 | ######################################################################### 899 | ######################################################################### 900 | data = riskEncodingData(data,dataOption) 901 | ######################################################################### 902 | ######################################################################### 903 | data = oneHotEncodingData(data,dataOption) #Applying One Hot Encoding with the complete data 904 | ######################################################################### 905 | ######################################################################### 906 | data = scaling(data) 907 | ######################################################################### 908 | ######################################################################### 909 | data = shuffleData(data) 910 | ######################################################################### 911 | 912 | #This menu is a option to run diferrent algorithms with the same preproceced data without needing to run all the code from the start to make another experiment. 
913 | while True: 914 | while True: 915 | print("\n\n#########################################################################") 916 | print("Algorithm Menu") 917 | print("#########################################################################") 918 | 919 | print("1.Kmeans") 920 | print("2.Dbscan") 921 | print("3.Isolation Forest") 922 | print("4.Local Factor Outlier") 923 | 924 | algorithmOption = input("option:") 925 | 926 | if algorithmOption == "1" or algorithmOption == "2" or algorithmOption == "3" or algorithmOption == "4": 927 | break 928 | else: 929 | 930 | print("Error\n\n") 931 | 932 | 933 | if algorithmOption == "1": 934 | ######################################################################### 935 | #KMEANS 936 | klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels) 937 | print("#########################################################################") 938 | print("KMEANS RESULTS\n\n") 939 | print("Clusters -> ",kClusters,"\n") 940 | print("Inertia -> ",inertia) 941 | print(kmeansR,"\n\n") 942 | print("Max True Label","\n\n",maxKvalue) 943 | print("#########################################################################") 944 | ######################################################################### 945 | print("\n\n#########################################################################") 946 | print("Kmeans Score Metrics Menu") 947 | print("#########################################################################") 948 | 949 | while True: 950 | print("1.F1 Score") 951 | print("2.Normalized Mutual Info Score") 952 | print("3.Adjusted Rand Score") 953 | 954 | kScoreOption = input("option:") 955 | 956 | if kScoreOption == "1" or kScoreOption == "2" or kScoreOption == "3": 957 | break 958 | else: 959 | 960 | print("Error\n\n") 961 | 962 | if kScoreOption == "1": 963 | ######################################################################### 964 | #F1 Score 965 | kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters) 966 | print("\n\n#########################################################################") 967 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 968 | print("KMEANS F1 Score -> ",kmeansF1) 969 | print("#########################################################################") 970 | ######################################################################### 971 | 972 | elif kScoreOption == "2": 973 | ######################################################################### 974 | kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters) 975 | print("\n\n#########################################################################") 976 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 977 | print("KMEANS Normalized Mutual Info Score -> ",kmeansNMI) 978 | print("#########################################################################") 979 | ######################################################################### 980 | 981 | elif kScoreOption == "3": 982 | 983 | ######################################################################### 984 | kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters) 985 | print("\n\n#########################################################################") 986 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 987 | print("KMEANS Adjusted Rand Score -> ",kmeansARS) 988 | print("#########################################################################") 989 | 
######################################################################### 990 | 991 | elif algorithmOption == "2": 992 | ######################################################################### 993 | #DBSCAN 994 | dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) 995 | print("#########################################################################") 996 | print("DBSCAN RESULTS\n\n") 997 | print("Clusters -> ",dbClusters,"\n") 998 | print(dbscanR,"\n\n") 999 | print("Noise -> ",nNoises) 1000 | print("Max True Label","\n\n",maxDBvalue) 1001 | print("#########################################################################") 1002 | ######################################################################### 1003 | print("\n\n#########################################################################") 1004 | print("Dscan Score Metrics Menu") 1005 | print("#########################################################################") 1006 | print("1.F1 Score") 1007 | print("2.Normalized Mutual Info Score") 1008 | print("3.Adjusted Rand Score") 1009 | 1010 | while True: 1011 | 1012 | dbScoreOption = input("option:") 1013 | 1014 | if dbScoreOption == "1" or dbScoreOption == "2" or dbScoreOption == "3": 1015 | break 1016 | else: 1017 | 1018 | print("Error\n\n") 1019 | 1020 | if dbScoreOption == "1": 1021 | ######################################################################### 1022 | #F1 Score DBSCAN 1023 | dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue) 1024 | print("\n\n#########################################################################") 1025 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 1026 | print("DBSCAN F1 Score -> ",dbscanF1) 1027 | print("#########################################################################") 1028 | ######################################################################### 1029 | 1030 | elif dbScoreOption == "2": 1031 | ######################################################################### 1032 | dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue) 1033 | print("\n\n#########################################################################") 1034 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 1035 | print("DBSCAN Normalized Mutual Info Score -> ",dbscanNMI) 1036 | print("#########################################################################") 1037 | ######################################################################### 1038 | 1039 | elif dbScoreOption == "3": 1040 | ######################################################################### 1041 | dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue) 1042 | print("\n\n#########################################################################") 1043 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 1044 | print("DBSCAN Adjusted Rand Score -> ",dbscanARS) 1045 | print("#########################################################################") 1046 | ######################################################################### 1047 | 1048 | 1049 | elif algorithmOption == "3": 1050 | ######################################################################### 1051 | ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels) 1052 | print("#########################################################################") 1053 | print("Isolation Forest RESULTS\n\n") 1054 | print("Clusters -> ",ifNclusters,"\n") 1055 | 
print(ifR,"\n\n") 1056 | print("Max True Label","\n\n",MaxIfVal) 1057 | print("#########################################################################") 1058 | ######################################################################### 1059 | print("\n\n#########################################################################") 1060 | print("Isolation Forest Score Metrics Menu") 1061 | print("#########################################################################") 1062 | print("1.F1 Score") 1063 | 1064 | while True: 1065 | 1066 | ifScoreOption = input("option:") 1067 | 1068 | if ifScoreOption == "1": 1069 | break 1070 | else: 1071 | 1072 | print("Error\n\n") 1073 | 1074 | if ifScoreOption == "1": 1075 | 1076 | ########################################################################## 1077 | isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal) 1078 | print("\n\n#########################################################################") 1079 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 1080 | print("Isolation Forest F1 Score -> ",isolationForestF1) 1081 | print("#########################################################################") 1082 | ########################################################################## 1083 | 1084 | elif algorithmOption == "4": 1085 | ######################################################################### 1086 | LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels) 1087 | print("#########################################################################") 1088 | print("Local Outlier Factor RESULTS\n\n") 1089 | print("Clusters -> ",lofClusters,"\n") 1090 | print(lofR,"\n\n") 1091 | print("Max True Label","\n\n",maxLOFvalue) 1092 | print("#########################################################################") 1093 | ######################################################################### 1094 | print("\n\n#########################################################################") 1095 | print("LOF Score Metrics Menu") 1096 | print("#########################################################################") 1097 | print("1.F1 Score") 1098 | 1099 | while True: 1100 | 1101 | lofScoreOption = input("option:") 1102 | 1103 | if lofScoreOption == "1": 1104 | break 1105 | else: 1106 | 1107 | print("Error\n\n") 1108 | 1109 | if lofScoreOption == "1": 1110 | 1111 | ########################################################################## 1112 | LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue) 1113 | print("\n\n#########################################################################") 1114 | print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned) 1115 | print("LOF F1 Score -> ",LOFf1) 1116 | print("#########################################################################") 1117 | ########################################################################## 1118 | 1119 | while True: #Asking if the user wants to run a new clustering algorithm test on the same data preprocessed in the same way 1120 | 1121 | decision = input("Try another Clustering Algorithm[y/n]:") 1122 | 1123 | if decision == "y" or decision == "n": 1124 | break 1125 | else: 1126 | 1127 | print("Error\n\n") 1128 | 1129 | 1130 | if decision == "n": 1131 | break 1132 | 1133 | else: 1134 | clear() -------------------------------------------------------------------------------- /Dataset/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/Dataset/.DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jeremy Perez
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Clustering Based Anomaly Detection
2 |
3 | ## Description
4 | This clustering based anomaly detection project implements unsupervised clustering algorithms on the [NSL-KDD](https://pdfs.semanticscholar.org/1b34/80021c4ab0f632efa99e01a9b073903c5554.pdf) and [IDS 2017](https://www.unb.ca/cic/datasets/ids-2017.html) datasets. The project includes options for preprocessing the datasets. It then clusters the datasets, mainly using the K-means and DBSCAN algorithms. Finally, it evaluates the clustering performed by the algorithms using standard metrics such as F-Score.
5 |
6 | ## Requirements
7 |
8 | * [Python >= 3.5](https://www.python.org/)
9 | * [Anaconda](https://www.anaconda.com/distribution/)
10 | * [scikit-learn](https://scikit-learn.org/stable/install.html)
11 | * [SciPy](https://www.scipy.org/#)
12 | * [NumPy](http://numpy.org/)
13 | * [joblib](https://joblib.readthedocs.io/en/latest/#)
14 | * [pandas](https://pandas.pydata.org/)
15 | * [Spyder environment](https://www.spyder-ide.org/)
16 |
17 | ## Installation
18 |
19 | For this project, we installed Anaconda-Navigator to use as our package and environment manager. Under the Environments tab in Anaconda, we created an environment and downloaded the libraries listed in the prerequisites for this project.
20 | This [guide](https://docs.anaconda.com/_downloads/9ee215ff15fde24bf01791d719084950/Anaconda-Starter-Guide.pdf) can help you use Anaconda.
21 |
22 |
23 | ## Code Details
24 | After you install all the requirements you should be able to run the code without any problems. This code is implemented to be user-friendly, and the steps are briefly explained below:
25 |
26 | ##### 1. Dataset option
27 | * ![image](https://user-images.githubusercontent.com/31083873/62171123-263b7400-b2eb-11e9-92ea-27dd3511b052.png)
28 | The user is asked to input which dataset will be analyzed in this run of the anomaly detection algorithms.
The two datasets that this project used contain different types of data and therefore require different types of preprocessing; thus, the user must choose which dataset to preprocess before beginning anomaly detection.
29 |
30 | ##### 2. Path
31 | * ![image](https://user-images.githubusercontent.com/31083873/62171230-816d6680-b2eb-11e9-814b-d6d2d2f819dd.png)
32 | The user is asked to input the path of the data set. After [downloading the dataset](https://www.unb.ca/cic/datasets/index.html) to your computer, copy the path to that dataset and input the path here.
33 |
34 | ##### 3. Variable Menu
35 | * ![image](https://user-images.githubusercontent.com/31083873/62171295-afeb4180-b2eb-11e9-8958-317cc71b9e43.png)
36 | The user is asked to choose the variables he or she wants to work with.
37 | As explained in step 1, the two data sets have different types of features. Specifically, the NSL-KDD Dataset has categorical data that must either be converted into numerical data or eliminated. The user can choose between three options for dealing with the categorical features on the NSL-KDD Dataset:
38 |
39 | 1. The data will keep its categorical features (protocols, service type, attack types, service error) and the data within those features will be [one hot encoded](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) (categorical features are encoded as numerical features).
40 |
41 | 2. The categorical features are removed from the data.
42 |
43 | 3. The categorical features (service type, attack types, and service error/flag) are encoded with [Risk Values](http://www.jatit.org/volumes/Vol65No3/13Vol65No3.pdf). Since protocols do not have associated risk values, they are one hot encoded.
44 |
45 | ##### 4. Encoding Menu
46 | * ![image](https://user-images.githubusercontent.com/31083873/62171931-ed50ce80-b2ed-11e9-9963-45de4cc4301e.png)
47 | The user is asked to encode the labels. The NSL-KDD Dataset contains 22 usual attack types plus the normal category for a total of 23 possible labels.
48 | 1. The labels are converted into binary labels (normal and abnormal). Every attack name that is not normal - in other words, that is an attack - is renamed with the label abnormal. After that, the labels are encoded into binary numbers where 0 is normal and 1 is abnormal. Because we can't calculate a metric score on categorical labels, the normal and abnormal labels must be converted to numeric data.
49 |
50 | 2. The labels are converted into 5 main categories (normal, DoS, Probe, U2R, R2L) using the information provided in [this analysis of the dataset](https://pdfs.semanticscholar.org/1b34/80021c4ab0f632efa99e01a9b073903c5554.pdf). After that, each attack is encoded into one of 5 numbers where normal is 0, DoS is 1, Probe is 2, R2L is 3, and U2R is 4.
51 |
52 | ##### 5. Scale the data
53 | * ![image](https://user-images.githubusercontent.com/31083873/62172317-1756c080-b2ef-11e9-873b-3c4a0f8fb0e9.png)
54 | The user is asked if he or she wants to scale the data. We use [Min Max Scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html). We do this because we want all features to be on the same range, and Min Max Scaler puts the data in a range of [0,1] by feature. This allows the distance-based anomaly detection algorithms to accurately analyze the data.
55 |
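For reference, here is a minimal sketch of what steps 4 and 5 amount to in code. The file name and the label column position are illustrative assumptions, not the exact names used in `CBAD.py`:

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Illustrative only: assumes an NSL-KDD style CSV where column 41 holds the
# attack name ("normal", "neptune", "smurf", ...). Adjust to your file.
dataSet = pd.read_csv("KDDTrain+.csv", header=None, low_memory=False)
labels = dataSet.iloc[:, 41]
features = dataSet.iloc[:, :41].select_dtypes(include="number")

# Step 4, option 1: collapse every attack name into "abnormal",
# then encode the labels as 0 = normal, 1 = abnormal.
binaryLabels = (labels != "normal").astype(int)

# Step 5: Min-Max scale every numeric feature into the [0, 1] range.
scaledFeatures = MinMaxScaler().fit_transform(features)

print(binaryLabels.value_counts())
print(scaledFeatures.min(), scaledFeatures.max())  # 0.0 and 1.0 after scaling
```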
56 | ##### 6. Shuffle the data
57 | * ![image](https://user-images.githubusercontent.com/31083873/62183286-db375600-b316-11e9-97e4-71f1440ee1ed.png)
58 | The user is asked if he or she wants to shuffle the data. Because one of the clustering algorithms, namely DBSCAN, could potentially return a different clustering depending on the order of the dataset, we attempted to shuffle the data and compare results. Unfortunately, the shuffled data returned clusters vastly different from the unshuffled data, giving us reason to believe that the shuffling algorithm was not working properly. Users are welcome to attempt shuffling the data but are forewarned that the shuffling may not return the desired results.
59 |
60 | ##### 7. Algorithm Menu
61 | * ![image](https://user-images.githubusercontent.com/31083873/62183597-0ff7dd00-b318-11e9-9bcf-d26b4f6ae0ac.png)
62 | The user is asked which anomaly detection algorithm he or she wants to use on the data. Each algorithm is discussed in greater detail in the Analyzing Dataset section.
63 |
64 | Each algorithm requires user-input parameters; a short sketch of how these parameters map onto the corresponding scikit-learn estimators follows at the end of this step.
65 |
66 | ###### K-Means
67 | ###### Initialization method
68 | * ![image](https://user-images.githubusercontent.com/31083873/62186624-2b68e500-b324-11e9-9fdb-c700ee87ee4c.png)
69 | K-Means provides different options for choosing the initial cluster centers. In this project, the user can choose either the random method or SciKitLearn's more sophisticated [k-means++](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) method.
70 | ###### Clusters
71 | * ![image](https://user-images.githubusercontent.com/31083873/62186784-97e3e400-b324-11e9-8505-d35d78ee9fc1.png)
72 | Users must choose the number of clusters for K-Means. The elbow method is popular for choosing the number of clusters. Read more below in the Analyzing Dataset section.
73 |
74 | ###### DBSCAN
75 | * ![image](https://user-images.githubusercontent.com/31083873/62664174-cfabe680-b937-11e9-8352-d9cd5550c7f3.png)
76 | DBSCAN needs two main parameters, epsilon and min samples. The algorithm parameter affects the run time; we concluded that brute is the fastest option for the NSL-KDD dataset.
77 |
78 | ###### Local Outlier Factor
79 | * ![image](https://user-images.githubusercontent.com/31083873/62664862-65487580-b93a-11e9-80e5-32dcff8b0ac1.png)
80 | Users must choose the ratio of anomalies in the dataset. This is called the contamination factor.
81 |
82 | ###### Isolation Forest
83 | * ![image](https://user-images.githubusercontent.com/51713553/62648301-c149d480-b90f-11e9-848f-1fbe843099cb.png)
84 | Users must choose the ratio of anomalies in the dataset. This is called the contamination factor.
85 |
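As a rough guide, the menu parameters above correspond to the following scikit-learn estimators. The numeric values shown (number of clusters, epsilon, contamination, and so on) are placeholders that a user would normally type into the menus, not recommended settings:

```python
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# K-Means: initialization method ("k-means++" or "random") and number of clusters K
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10)

# DBSCAN: epsilon neighborhood, minimum samples, and the neighbor-search
# algorithm ("brute" was the fastest for NSL-KDD in our runs)
dbscan = DBSCAN(eps=0.3, min_samples=10, algorithm="brute")

# LOF and Isolation Forest: contamination is the expected ratio of anomalies
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
iforest = IsolationForest(contamination=0.1, random_state=0)

# Typical usage on a preprocessed, scaled feature matrix X:
# kmeansLabels  = kmeans.fit_predict(X)       # cluster index per row
# dbscanLabels  = dbscan.fit_predict(X)       # cluster index, -1 means noise
# lofLabels     = lof.fit_predict(X)          # 1 = normal, -1 = anomaly
# iforestLabels = iforest.fit(X).predict(X)   # 1 = normal, -1 = anomaly
```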
86 | ##### 8. Scoring Metrics
87 | * ![image](https://user-images.githubusercontent.com/31083873/62186832-be098400-b324-11e9-9036-ae5413a4535e.png)
88 |
89 | * ![image](https://user-images.githubusercontent.com/51713553/62640889-bdae5180-b8ff-11e9-975d-f2c356561180.png)
90 | K-Means F1-score
91 |
92 |
93 | * ![image](https://user-images.githubusercontent.com/31083873/62664455-cb33fd80-b938-11e9-8032-72bb83af578d.png)
94 | DBSCAN F1-score
95 |
96 |
97 |
98 | ### Preprocessing Dataset
99 |
100 | This project was designed to be used with the NSL-KDD and IDS 2017 datasets, available for download [here](https://www.unb.ca/cic/datasets/index.html). The preprocessing options thus are specific to each dataset.
101 |
102 | The NSL-KDD dataset has [categorical data](https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/) that must be omitted or encoded as numerical data to be clustered. The options in this project for dealing with categorical data include omitting categorical features, One Hot Encoding categorical features, and [assigning risk values](http://www.jatit.org/volumes/Vol65No3/13Vol65No3.pdf) to Server Type and Flag features while One Hot Encoding Protocol data. One Hot Encoding is a process that takes each categorical option in a feature and makes that option a feature itself, assigning each data instance a "0" if it does not contain that category and a "1" if it does. While this option allows the user to keep the structure of the categorical data without assigning an arbitrary hierarchical ordering to the categories, it also increases the number of features and thus is not always optimal for already-large datasets. For this reason, the code offers three different methods of dealing with categorical data.
103 |
104 | The IDS-2017 dataset has missing values that must be dealt with as well. The code offers the user the option of deleting the columns with missing values, imputing "0", imputing the mean, median, or mode of the feature, or using scikit-learn's Iterative Imputer method.
105 |
106 | The interactive code asks the user to specify which of the two datasets he or she is using.
107 |
108 | ### Analyzing Dataset
109 |
110 | The code offers four different anomaly detection algorithms, namely K-Means, DBSCAN, Local Outlier Factor (LOF), and Isolation Forest. K-Means and DBSCAN are clustering algorithms, while LOF is a k-nearest-neighbors algorithm and Isolation Forest is a decision tree algorithm; the latter two use a contamination factor to classify data as normal or anomalous.
111 |
112 | [K-Means](https://www.youtube.com/watch?v=_aWzGGNrcic) clusters data by starting with user-specified K initial cluster centroids, and assigning all points to the nearest centroid. Based on the assignments, the algorithm recalculates the cluster centers and reassigns all points to the nearest cluster center. The algorithm repeats this process until the assignments stop changing, up to a default of 300 iterations. When the process ends, K-Means has clustered the data into K clusters. [SciKitLearn's K-Means algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) also lets the user specify the method for initialization, the way that the algorithm chooses which points to use as initial cluster centroids. In this project, the user specifies K, the number of initial cluster centroids and eventual clusters. A typical way of choosing K is the [elbow method](https://www.scikit-yb.org/en/latest/api/cluster/elbow.html). The implementation of K-Means in this project reports the sum of squared distances to cluster centers (the sum of squared errors, SSE) needed for the elbow plot, so a user can run tests with different values of K and plot each K against its SSE. A user can then subjectively choose the elbow point on such a plot to determine the best K, and can then conduct tests with this K. The researchers suggest using a few values of K around the elbow and comparing the evaluation metric scores generated for each K in order to determine the best value of K.
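For example, a user could loop over candidate values of K, record the SSE that scikit-learn exposes as `inertia_`, and plot it to find the elbow. This is a sketch only; the synthetic data stands in for the preprocessed NSL-KDD or IDS 2017 feature matrix:

```python
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Stand-in data; in practice X is the preprocessed, scaled feature matrix.
X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)

kValues = range(1, 11)
sse = []
for k in kValues:
    km = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=0).fit(X)
    sse.append(km.inertia_)  # sum of squared distances to the closest centroid

plt.plot(list(kValues), sse, marker="o")
plt.xlabel("K (number of clusters)")
plt.ylabel("SSE (inertia)")
plt.title("Elbow plot: pick K near the bend, then compare metric scores around it")
plt.show()
```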
113 |
114 | [Density-Based Spatial Clustering of Applications with Noise](https://medium.com/@elutins/dbscan-what-is-it-when-to-use-it-how-to-use-it-8bd506293818), or DBSCAN, relies on two user-input parameters, namely epsilon and minimum samples. Epsilon denotes the neighborhood of density to be explored for each data point, and minimum samples denotes the minimum number of samples needed to be within a point’s epsilon neighborhood for said point to be considered a core point. Points within another core point’s epsilon neighborhood, but not core points themselves, are considered border points. Meanwhile, points that are not within another core point’s epsilon neighborhood, and that are not core points themselves, are considered anomalous points or noise. DBSCAN finds clusters of core points and border points and reports those clusters along with a group of all of the anomalous points. [SciKitLearn's DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) offers the user other parameters to manipulate the specific way that DBSCAN calculates the clusters; this project uses all default parameters except for the algorithm parameter, for which the project specifies the 'brute' option in order to reduce run time.
115 | **DBSCAN run time will depend on how big the dataset is and what resources your computer has. Since "DBSCAN groups together points that are close to each other based on a distance measurement," it is slower than the K-Means algorithm (Salton do Prado, 2017). The DBSCAN experiments were run on a MacBook Pro with a 2.6 GHz i7 and 16 GB of RAM, using the brute option for the algorithm parameter; the average run time was 3 minutes. DBSCAN tests were also attempted on a MacBook Air with a 1.6 GHz i5 and 8 GB of RAM, but they never finished after 30 minutes because of that machine's limited processing capacity. Before running experiments with DBSCAN, make sure the computer can handle it.**
116 |
117 | [Local Outlier Factor](https://towardsdatascience.com/local-outlier-factor-for-anomaly-detection-cc0c770d2ebe), or LOF, begins with the parameter K, a default-set or user-chosen integer. For a specific point, the algorithm calculates the reach-distance to each point, which is essentially the distance from a specific point to another point with a small smoothing caveat for close points. The algorithm then takes the average of the reach-distances from a specific point to each of that point's K nearest neighbors. The inverse of this average is called the Local Reachability Density, or LRD. A point's high LRD indicates that the point exists in a highly dense neighborhood and does not have to travel far to encounter all K nearest neighbors, and a point's low LRD indicates the opposite, a low-density neighborhood. The algorithm calculates the LRD for each point and for each of that point's K nearest neighbors. Finally, the algorithm calculates the Local Outlier Factor (LOF) for each point by dividing the average LRD of that point's K nearest neighbors by the point's own LRD. An LOF around 1 indicates a point whose density is similar to that of its neighbors, and an LOF much greater than 1 indicates a point in a much lower-density neighborhood than its neighbors, and therefore a point that is likely an anomaly. [SciKitLearn's LOF algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html) returns the negative of each point's local outlier factor. In this code, one can choose an Offset value such that all points with a score more negative than that Offset value are labeled as anomalous points, and all points with a score equal to or more positive than that Offset value are labeled as normal points.
118 |
119 | Similarly to Local Outlier Factor, [Isolation Forest](https://towardsdatascience.com/outlier-detection-with-isolation-forest-3d190448d45e) returns for each point a score representing the probability of that particular point being an anomaly, and the user must choose a threshold for which scores will indicate an anomaly and which will indicate a normal instance. The algorithm generates the probability scores for each instance by the following process: _First, randomly choose a feature (column). Next, randomly choose a value between the min and max of that feature. Partition, or split, the data into two groups: those with values in that feature above the randomly chosen value, and those with values below. Now, choose one of the two groups again and split the data on a random point. Repeat until a single point is isolated. Obtain the number of splits required to isolate that point. Repeat this process, eventually isolating all points across many features, and obtain for each specific point the average number of splits required for that point to be isolated_. The theory behind Isolation Forest states that anomalies occur less frequently and differ more greatly than normal points, and therefore will require fewer partitions, or splits, to isolate them than normal points would require. Thus, a score based on the average number of splits, also known as the average path length, denotes the probability that a particular point is an anomaly. The score is adjusted such that a score near 1 denotes a likely anomaly, and a score near 0.5 denotes a likely normal point. Again, the user can set the contamination factor to indicate the threshold for scores labeled as anomaly and as normal.
120 |
121 |
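The sketch below shows where these raw scores live in scikit-learn and how the contamination threshold turns them into normal/anomaly labels. The data here is synthetic and the contamination value is only an example:

```python
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(950, 5)),   # mostly "normal" points
               rng.normal(6, 1, size=(50, 5))])   # a few far-away points

# LOF: fit_predict returns 1 (normal) / -1 (anomaly). The raw scores are the
# negated LOF values; anything more negative than offset_ is flagged.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lofPred = lof.fit_predict(X)
manualPred = np.where(lof.negative_outlier_factor_ < lof.offset_, -1, 1)
print("LOF anomalies:", (lofPred == -1).sum(),
      "| thresholding agrees:", (lofPred == manualPred).all())

# Isolation Forest: score_samples returns negated anomaly scores, so lower
# (more negative) means fewer splits were needed and the point is more anomalous.
iforest = IsolationForest(contamination=0.05, random_state=0).fit(X)
ifPred = iforest.predict(X)  # 1 = normal, -1 = anomaly
print("IF anomalies:", (ifPred == -1).sum(),
      "| most anomalous score:", iforest.score_samples(X).min())
```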
122 | ### Evaluating Clusters
123 |
124 | The code also offers multiple evaluation metrics for the user to choose from. Each metric depends on comparing the labels of the actual dataset with the labels given by the clustering, or the "true labels" with the "found labels". For both the NSL-KDD and the IDS 2017 datasets, both binary and multiclass labels are available to use as the "true labels", and users can specify their preference in the interactive code. Users can evaluate the clustering of their data with one of three different metrics, namely F-1 Score, Normalized Mutual Information Score (NMI), and Adjusted Rand Score (ARS).
125 |
126 | [F-Score](https://deepai.org/machine-learning-glossary-and-terms/f-score) is the harmonic mean of precision and recall. Precision is the ratio of correctly predicted positive values to all values predicted to be positive. In other words, precision indicates how sure the algorithm is that the found positive values are actually positive. Meanwhile, recall is the ratio of correctly predicted positive values to all values that are actually positive. In other words, recall indicates how sure the algorithm is that it did not miss any positive values in its positive labelings. One can weight either precision or recall to have more influence in the F-Score by changing the beta value in the F-beta function; however, this project opts to keep the weight between precision and recall equal by using the F-1 score.
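A minimal sketch of this kind of evaluation: each found cluster is matched to the true label it overlaps most (similar in spirit to the "Cluster Matchings by Maximum Intersection" the program prints, though the helper below is illustrative rather than the project's own function), and the relabeled clustering is then scored with F1:

```python
import numpy as np
from sklearn.metrics import f1_score

def matchClustersToLabels(found, true):
    """Relabel each found cluster with the true label it intersects most."""
    mapped = np.empty_like(true)
    for cluster in np.unique(found):
        members = (found == cluster)
        values, counts = np.unique(true[members], return_counts=True)
        mapped[members] = values[np.argmax(counts)]  # majority true label
    return mapped

# Toy example with binary true labels (0 = normal, 1 = abnormal)
trueLabels  = np.array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1])
foundLabels = np.array([2, 2, 2, 2, 0, 0, 0, 2, 1, 1])  # arbitrary cluster ids

mapped = matchClustersToLabels(foundLabels, trueLabels)
print("F1 score:", f1_score(trueLabels, mapped))
```

As a side note, NMI and ARS (described next) are invariant to how clusters are numbered, so they can also be computed directly on the raw found labels.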
127 |
128 | The [Normalized Mutual Information Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html) is based on entropy: it measures how much knowing the found labels reduces the uncertainty about the true labels, normalized so that 0 means the two labelings share no information and 1 means they agree perfectly.
129 |
130 | Instead of measuring entropy as the NMI score does, the [Adjusted Rand Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html) measures the similarity between the true labels and the found labels by counting pairs of points that are grouped consistently in both labelings. Furthermore, the ARS is adjusted for chance, while the NMI is not.
131 |
132 |
133 | ## Roadmap
134 |
135 | * Implement Dimensionality Reduction - Both datasets are considerably big and consume a lot of processing resources. We also want to find out whether dimensionality reduction would let us run the DBSCAN algorithm on a dataset even bigger than IDS 2017.
136 |
137 | ## Poster
138 | ![CBAD-Poster](https://user-images.githubusercontent.com/31083873/70267654-41c0fa80-1775-11ea-9fa4-2bc85b1a57a3.png)
139 |
140 |
141 |
142 | ## Authors and acknowledgment
143 | * Jeremy Perez
144 | * Bethany Danner
145 | * **Special thanks to Dr. Veronika Neeley for mentoring us throughout this project, and to Dr. Clem Izurieta for organizing the REU program at Montana State University. This work was funded by the [National Science Foundation](https://www.nsf.gov/)**.
146 |
147 | ## License
148 |
149 | MIT License
150 |
151 | Copyright (c) 2019 Jeremy Perez
152 |
153 | Permission is hereby granted, free of charge, to any person obtaining a copy
154 | of this software and associated documentation files (the "Software"), to deal
155 | in the Software without restriction, including without limitation the rights
156 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
157 | copies of the Software, and to permit persons to whom the Software is
158 | furnished to do so, subject to the following conditions:
159 |
160 | The above copyright notice and this permission notice shall be included in all
161 | copies or substantial portions of the Software.
162 |
163 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
164 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
166 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
167 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
168 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
169 | SOFTWARE.
170 |
171 | ## Project status
172 |
173 | Current bugs: After shuffling the data, results are not as expected.
--------------------------------------------------------------------------------