├── .DS_Store
├── .ipynb_checkpoints
    └── CBAD-checkpoint.ipynb
├── CBAD OUTLINE.pdf
├── CBAD-Poster.pdf
├── CBAD.ipynb
├── CBAD.py
├── Dataset
    ├── .DS_Store
    └── KDDTrain+.csv
├── LICENSE
└── README.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/.DS_Store


--------------------------------------------------------------------------------
/.ipynb_checkpoints/CBAD-checkpoint.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Main Libraries"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": 1,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": [
  16 |     "import numpy as np\n",
  17 |     "import pandas as pd \n",
  18 |     "import time\n",
  19 |     "import os"
  20 |    ]
  21 |   },
  22 |   {
  23 |    "cell_type": "markdown",
  24 |    "metadata": {},
  25 |    "source": [
  26 |     "# Clear"
  27 |    ]
  28 |   },
  29 |   {
  30 |    "cell_type": "code",
  31 |    "execution_count": 2,
  32 |    "metadata": {},
  33 |    "outputs": [],
  34 |    "source": [
  35 |     "clear = lambda:os.system('clear')"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "markdown",
  40 |    "metadata": {},
  41 |    "source": [
  42 |     "# Getting the dataset"
  43 |    ]
  44 |   },
  45 |   {
  46 |    "cell_type": "code",
  47 |    "execution_count": 3,
  48 |    "metadata": {},
  49 |    "outputs": [],
  50 |    "source": [
  51 |     "def getDataSet():# Getting the path of the dataset\n",
  52 |     "    \n",
  53 |     "    while True:\n",
  54 |     "        print(\"**************************************************\")\n",
  55 |     "        print(\"DATA SET MENU\")\n",
  56 |     "        print(\"**************************************************\")\n",
  57 |     "        print(\"1.NSL-KDD\")\n",
  58 |     "        print(\"2.IDS 2017\")\n",
  59 |     "        \n",
  60 |     "        option = input(\"Option:\")\n",
  61 |     "        \n",
  62 |     "        if option == \"1\" or option == \"2\":\n",
  63 |     "            break\n",
  64 |     "    \n",
  65 |     "    path = input(\"Path of the File:\")\n",
  66 |     "    \n",
  67 |     "    return path,option"
  68 |    ]
  69 |   },
  70 |   {
  71 |    "cell_type": "markdown",
  72 |    "metadata": {},
  73 |    "source": [
  74 |     "# Reading the dataset"
  75 |    ]
  76 |   },
  77 |   {
  78 |    "cell_type": "code",
  79 |    "execution_count": 4,
  80 |    "metadata": {},
  81 |    "outputs": [],
  82 |    "source": [
  83 |     "def readingData(path): #Reading the Dataset\n",
  84 |     "    \n",
  85 |     "    while True:\n",
  86 |     "        \n",
  87 |     "        option = input(\"Dataset has feature names[y/n]:\") \n",
  88 |     "        \n",
  89 |     "        if option == \"y\" or option == \"n\":\n",
  90 |     "            break\n",
  91 |     "            \n",
  92 |     "    print(\"\\nReading Dataset...\") \n",
  93 |     "        \n",
  94 |     "    if option == \"y\":\n",
  95 |     "        dataSet = pd.read_csv(path,low_memory=False)\n",
  96 |     "    \n",
  97 |     "    elif option == \"n\":\n",
  98 |     "        dataSet = pd.read_csv(path, header = None,low_memory=False)\n",
  99 |     "            \n",
 100 |     "    return dataSet\n"
 101 |    ]
 102 |   },
 103 |   {
 104 |    "cell_type": "markdown",
 105 |    "metadata": {},
 106 |    "source": [
 107 |     "# Check if missing data"
 108 |    ]
 109 |   },
 110 |   {
 111 |    "cell_type": "code",
 112 |    "execution_count": 5,
 113 |    "metadata": {},
 114 |    "outputs": [],
 115 |    "source": [
 116 |     "def checkMissing(X):#This check if the dataset given has missing values.\n",
 117 |     "    isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because (\"cannot unpack non-iterable numpy.bool object\")\n",
 118 |     "    \n",
 119 |     "    if isMissing == \"True\":\n",
 120 |     "        #if data set has infinity values replace them with none\n",
 121 |     "        X = X.replace('Infinity', np.nan) #Replacing Infinity values with nan values\n",
 122 |     "           \n",
 123 |     "        missingValIndex = []\n",
 124 |     "        total = X.isnull().sum().sum()\n",
 125 |     "        percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100\n",
 126 |     "            \n",
 127 |     "        for rows in X:\n",
 128 |     "                    \n",
 129 |     "            if X[rows].isnull().sum() != 0:\n",
 130 |     "                missingValIndex.append(rows)\n",
 131 |     "        print(\"\\n\\n**************************************************\")\n",
 132 |     "        print(\"Data has missing values\")\n",
 133 |     "        print(\"**************************************************\")\n",
 134 |     "        print(\"Features with missing values:\",missingValIndex)\n",
 135 |     "        print(\"Total missing Values -> \" , total)\n",
 136 |     "        print(percent,\"%\")\n",
 137 |     "        \n",
 138 |     "        return X\n",
 139 |     "    \n",
 140 |     "    else:\n",
 141 |     "        \n",
 142 |     "        return X\n"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "metadata": {},
 148 |    "source": [
 149 |     "# Getting the features"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "code",
 154 |    "execution_count": 7,
 155 |    "metadata": {},
 156 |    "outputs": [
 157 |     {
 158 |      "ename": "IndentationError",
 159 |      "evalue": "unindent does not match any outer indentation level (<tokenize>, line 47)",
 160 |      "output_type": "error",
 161 |      "traceback": [
 162 |       "\u001b[0;36m  File \u001b[0;32m\"<tokenize>\"\u001b[0;36m, line \u001b[0;32m47\u001b[0m\n\u001b[0;31m    for rows in dataSet: #Getting features index with missing values\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n"
 163 |      ]
 164 |     }
 165 |    ],
 166 |    "source": [
 167 |     "#Getting The data we want to test for the clustering algorithms\n",
 168 |     "def gettingVariables(dataSet,dataSetOption):# If the dataset is NSL-KDD it would get the features and the labels for it and if its IDS 2017 it would take the features and the labels for it and take careof missing values.\n",
 169 |     "   \n",
 170 |     "    if dataSetOption == \"1\":\n",
 171 |     "        while True:\n",
 172 |     "            print(\"\\n\\n**************************************************\")\n",
 173 |     "            print(\"Variables Menu\")\n",
 174 |     "            print(\"**************************************************\")\n",
 175 |     "            print(\"1.Data set with categorical data oneHot encoded\")\n",
 176 |     "            print(\"2.Data set with categorical data removed\")\n",
 177 |     "            print(\"3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded\")\n",
 178 |     "            option = input(\"Enter option :\")\n",
 179 |     "            \n",
 180 |     "            \n",
 181 |     "            if option == \"1\" or option == \"2\" or option == \"3\":\n",
 182 |     "                break\n",
 183 |     "            else:\n",
 184 |     "                \n",
 185 |     "                print(\"Error\\n\\n\")\n",
 186 |     "            \n",
 187 |     "        \n",
 188 |     "        if option == \"1\":\n",
 189 |     "            #Getting the Dependent and independent Variables\n",
 190 |     "            #In all the option we remove the dificulty level feature because we don't need it in our experiments\n",
 191 |     "            \n",
 192 |     "            \n",
 193 |     "            X = dataSet.iloc[:,:-2].values # Data, Get all the rows and all the clums except all the colums - 2\n",
 194 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 195 |     "            return X,Y,option\n",
 196 |     "        \n",
 197 |     "        elif option == \"2\":\n",
 198 |     "            #Removing Categorical data from the data set\n",
 199 |     "            X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values\n",
 200 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 201 |     "            \n",
 202 |     "            return X,Y,option\n",
 203 |     "        \n",
 204 |     "        elif option == \"3\":\n",
 205 |     "            #for later Risk Encode - Categorical features\n",
 206 |     "            X = dataSet.iloc[:,:-2].values\n",
 207 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 208 |     "            \n",
 209 |     "            return X,Y,option\n",
 210 |     "    \n",
 211 |     "\n",
 212 |     "    elif dataSetOption == \"2\":\n",
 213 |     "        #############################################################################\n",
 214 |     "        #GETTING VARIABLES\n",
 215 |     "        #############################################################################\n",
 216 |     "        missingValIndex = []\n",
 217 |     "        for rows in dataSet: #Getting features index with missing values\n",
 218 |     "            if dataSet[rows].isnull().sum() != 0:\n",
 219 |     "                    missingValIndex.append(rows)\n",
 220 |     "                \n",
 221 |     "        X = dataSet.iloc[:,:-1].values#data\n",
 222 |     "        #if names are not especified it will assign 0,1,2...n for the features name\n",
 223 |     "        X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward Packets','Total Length of Fwd Packets',\n",
 224 |     "                                      ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std',\n",
 225 |     "                                      'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean',\n",
 226 |     "                                      ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean',\n",
 227 |     "                                      ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s',\n",
 228 |     "                                      ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count',\n",
 229 |     "                                      ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length',\n",
 230 |     "                                      'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes',\n",
 231 |     "                                      'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min'])\n",
 232 |     "        Y = dataSet.iloc[:,78].values#Labels\n",
 233 |     "        \n",
 234 |     "        #############################################################################\n",
 235 |     "        #Variables Got \n",
 236 |     "        #############################################################################\n",
 237 |     "        \n",
 238 |     "    #############################################################################\n",
 239 |     "    #MANAGE MISSING DATA\n",
 240 |     "    #############################################################################   \n",
 241 |     "     \n",
 242 |     "        while True:\n",
 243 |     "            print(\"\\n\\n**************************************************\")\n",
 244 |     "            print(\"Manage Missing Values \")\n",
 245 |     "            print(\"**************************************************\")\n",
 246 |     "            print(\"1.Eliminate Catg. w/ Missing Values\")\n",
 247 |     "            print(\"2.Impute 0 for Missing Values\")\n",
 248 |     "            print(\"3.Impute Mean for Missing Values\")\n",
 249 |     "            print(\"4.Impute Median for Missing Values\")\n",
 250 |     "            print(\"5.Impute Mode for Missing Values\")\n",
 251 |     "            print(\"6.Simple Imputer\")\n",
 252 |     "            missingDataOption = input(\"Option:\")\n",
 253 |     "    \n",
 254 |     "            if missingDataOption == \"1\" or missingDataOption == \"2\" or missingDataOption == \"3\" or missingDataOption == \"4\" or missingDataOption == \"5\" or missingDataOption == \"6\":\n",
 255 |     "                break\n",
 256 |     "    \n",
 257 |     "    \n",
 258 |     "        if missingDataOption == \"1\":\n",
 259 |     "            deletedColumns = []\n",
 260 |     "            numColumns = len(X.columns)\n",
 261 |     "            #removing features with missing values\n",
 262 |     "            for row in missingValIndex:\n",
 263 |     "                deletedColumns.append(row)\n",
 264 |     "                del X[row]\n",
 265 |     "        \n",
 266 |     "            print(\"#\\n\\n########################################################################\")\n",
 267 |     "            print(\"Columns Succesfully Removed\")\n",
 268 |     "            print(len(deletedColumns),\"of\",numColumns,\"were deleted\")\n",
 269 |     "            print(\"Columns Names -> \",deletedColumns)\n",
 270 |     "            print(\"#########################################################################\")\n",
 271 |     "    \n",
 272 |     "        elif missingDataOption == \"2\":\n",
 273 |     "            #fill with 0\n",
 274 |     "            for row in missingValIndex:\n",
 275 |     "                X[row] = X[row].fillna(0)\n",
 276 |     "        \n",
 277 |     "            print(\"\\n\\n#########################################################################\")\n",
 278 |     "            print(\"Sucessfully Filled Missing Values with 0\")\n",
 279 |     "            print(\"#########################################################################\")\n",
 280 |     "    \n",
 281 |     "    \n",
 282 |     "        elif missingDataOption == \"3\":\n",
 283 |     "            #mean imputer\n",
 284 |     "            for row in missingValIndex:\n",
 285 |     "                X[row] = X[row].astype(float)\n",
 286 |     "                X[row] = X[row].fillna(X[row].mean())\n",
 287 |     "        \n",
 288 |     "            print(\"\\n\\n#########################################################################\")\n",
 289 |     "            print(\"Sucessfully Filled Missing Values with Mean\")\n",
 290 |     "            print(\"#########################################################################\")\n",
 291 |     "    \n",
 292 |     "        elif missingDataOption == \"4\":\n",
 293 |     "            #median imputer\n",
 294 |     "            for row in missingValIndex:\n",
 295 |     "                median = X[row].median()\n",
 296 |     "                X[row].fillna(median, inplace=True)\n",
 297 |     "            print(\"\\n\\n#########################################################################\")\n",
 298 |     "            print(\"Sucessfully Filled Missing Values with Median\")\n",
 299 |     "            print(\"#########################################################################\")\n",
 300 |     "    \n",
 301 |     "        elif missingDataOption == \"5\":\n",
 302 |     "            #Mode imputer\n",
 303 |     "            for row in missingValIndex:\n",
 304 |     "                X[row] = X[row].fillna(X[row].mode()[0])\n",
 305 |     "    \n",
 306 |     "            print(\"\\n\\n#########################################################################\")\n",
 307 |     "            print(\"Sucessfully Filled Missing Values with Mode \")\n",
 308 |     "            print(\"#########################################################################\")\n",
 309 |     "        \n",
 310 |     "        elif missingDataOption == \"6\": \n",
 311 |     "            from sklearn.impute import SimpleImputer\n",
 312 |     "            #\"Imputation transformer for completing missing values.\"(Univariate)\n",
 313 |     "            X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X)          \n",
 314 |     "            print(\"\\n\\n#########################################################################\")\n",
 315 |     "            print(\"Sucessfully Imputed Simple Imputer \")\n",
 316 |     "            print(\"#########################################################################\")\n",
 317 |     "                  \n",
 318 |     "                  \n",
 319 |     "        option = \"None\" #This data does not have categorical features so dataOption is none      \n",
 320 |     "        return X,Y,option\n",
 321 |     "       \n",
 322 |     "#############################################################################\n",
 323 |     "#END OF MISSING DATA\n",
 324 |     "#############################################################################"
 325 |    ]
 326 |   },
 327 |   {
 328 |    "cell_type": "markdown",
 329 |    "metadata": {},
 330 |    "source": [
 331 |     "# Encoding Labels"
 332 |    ]
 333 |   },
 334 |   {
 335 |    "cell_type": "code",
 336 |    "execution_count": null,
 337 |    "metadata": {},
 338 |    "outputs": [],
 339 |    "source": [
 340 |     "def encodingLabels(Y,dataOption,datasetOption):# Encoding the labels with multi class or binary\n",
 341 |     "    \n",
 342 |     "    if datasetOption == \"1\": #Check if the data set choosen is NSL-KDD or IDS2017\n",
 343 |     "        \n",
 344 |     "        if dataOption == \"1\" or dataOption == \"2\" or dataOption == \"3\":\n",
 345 |     "            \n",
 346 |     "            while True:\n",
 347 |     "                print(\"\\n\\n#########################################################################\")\n",
 348 |     "                print(\"Encoding Menu\")\n",
 349 |     "                print(\"#########################################################################\")\n",
 350 |     "                print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n",
 351 |     "                print(\"2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4\")\n",
 352 |     "                encodeOption = input(\"Enter option :\") \n",
 353 |     "    \n",
 354 |     "                if encodeOption == \"1\" or encodeOption == \"2\":\n",
 355 |     "                    break\n",
 356 |     "                else:\n",
 357 |     "                    \n",
 358 |     "                    print(\"Error\\n\\n\")\n",
 359 |     "    \n",
 360 |     "    \n",
 361 |     "            if encodeOption == \"1\":\n",
 362 |     "                #Binary Categories\n",
 363 |     "                attackType  = {'normal':\"normal\", 'neptune':\"abnormal\", 'warezclient':\"abnormal\", 'ipsweep':\"abnormal\",'back':\"abnormal\", 'smurf':\"abnormal\", 'rootkit':\"abnormal\",'satan':\"abnormal\", 'guess_passwd':\"abnormal\",'portsweep':\"abnormal\",'teardrop':\"abnormal\",'nmap':\"abnormal\",'pod':\"abnormal\",'ftp_write':\"abnormal\",'multihop':\"abnormal\",'buffer_overflow':\"abnormal\",'imap':\"abnormal\",'warezmaster':\"abnormal\",'phf':\"abnormal\",'land':\"abnormal\",'loadmodule':\"abnormal\",'spy':\"abnormal\",'perl':\"abnormal\"} \n",
 364 |     "                attackEncodingCluster  = {'normal':0,'abnormal':1}\n",
 365 |     "    \n",
 366 |     "                Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data\n",
 367 |     "                Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal\n",
 368 |     "                return Y,encodeOption\n",
 369 |     "    \n",
 370 |     "            elif encodeOption == \"2\":\n",
 371 |     "                #4 Main Categories\n",
 372 |     "                #normal = 0\n",
 373 |     "                #DoS = 1\n",
 374 |     "                #Probe = 2\n",
 375 |     "                #R2L = 3\n",
 376 |     "                #U2R = 4\n",
 377 |     "                attackType  = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} \n",
 378 |     "                attackEncodingCluster  = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main Categories\n",
 379 |     "    \n",
 380 |     "                Y[:] = [attackType[item] for item in Y[:]] #Encoding the main 4 categories\n",
 381 |     "                Y[:] = [attackEncodingCluster[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n",
 382 |     "                return Y,encodeOption\n",
 383 |     "        else:\n",
 384 |     "            return Y\n",
 385 |     "    \n",
 386 |     "    \n",
 387 |     "    elif datasetOption == \"2\":#Check if the data set choosen is NSL-KDD or IDS2017\n",
 388 |     "        print(\"\\n\\n#########################################################################\")\n",
 389 |     "        print(\"Encoding Menu\")\n",
 390 |     "        print(\"#########################################################################\")\n",
 391 |     "        print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n",
 392 |     "        print(\"2. Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5\")\n",
 393 |     "        encodeOption = input(\"Enter option :\")\n",
 394 |     "\n",
 395 |     "        if encodeOption == \"1\":\n",
 396 |     "            Y = np.array(Y,dtype= object)\n",
 397 |     "            attackEncoding  = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories\n",
 398 |     "            Y[:] = [attackEncoding[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n",
 399 |     "    \n",
 400 |     "            return Y,encodeOption\n",
 401 |     "        \n",
 402 |     "        elif encodeOption == \"2\":\n",
 403 |     "            Y = np.array(Y,dtype= object)\n",
 404 |     "            attackType  = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories\n",
 405 |     "            attackEncoding = {'normal': 0, 'abnormal': 1}\n",
 406 |     "            \n",
 407 |     "            Y[:] = [attackType[item] for item in Y[:]]# Changing the names of attacks into binary categories\n",
 408 |     "            Y[:] = [attackEncoding[item] for item in Y[:]]# Changing the names of attacks into binary categories\n",
 409 |     "            return Y,encodeOption\n",
 410 |     "        \n",
 411 |     "        else:\n",
 412 |     "            return Y"
 413 |    ]
 414 |   },
 415 |   {
 416 |    "cell_type": "markdown",
 417 |    "metadata": {},
 418 |    "source": [
 419 |     "# One Hot Encoding"
 420 |    ]
 421 |   },
 422 |   {
 423 |    "cell_type": "code",
 424 |    "execution_count": null,
 425 |    "metadata": {},
 426 |    "outputs": [],
 427 |    "source": [
 428 |     "#Encoding the categorical features using one hot encoding and using Main attacks categories or binary categories\n",
 429 |     "def oneHotEncodingData(X,dataOption):\n",
 430 |     "        \n",
 431 |     "    from sklearn.preprocessing import OneHotEncoder\n",
 432 |     "    from sklearn.compose import ColumnTransformer\n",
 433 |     "    #We use One hot encoding to pervent the machine learning to atribute the categorical data in order. \n",
 434 |     "    #What one hot encoding(ColumnTransformer) does is, it takes a column which has categorical data, \n",
 435 |     "    #which has been label encoded, and then splits the column into multiple columns.\n",
 436 |     "    #The numbers are replaced by 1s and 0s, depending on which column has what value\n",
 437 |     "    #We don't need to do a label encoded step because ColumnTransformer do one hot encode and label encode!\n",
 438 |     "    #Encoding the Independient Variable\n",
 439 |     "    if dataOption == \"1\": #Only for dataset with Categorical Data\n",
 440 |     "        transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1,2,3])], remainder=\"passthrough\")\n",
 441 |     "        X = transform.fit_transform(X)\n",
 442 |     "        print(\"\\n\\n#########################################################################\")\n",
 443 |     "        print(\"Data has been successfully One Hot Encoded\")\n",
 444 |     "        print(\"#########################################################################\")\n",
 445 |     "\n",
 446 |     "        return X\n",
 447 |     "    elif dataOption == \"3\": #Only for risk data, because we don't have risk values for protocol feature we do one hot encoding for only that feature and the other ones we do risk value encoding\n",
 448 |     "        transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1])], remainder=\"passthrough\")\n",
 449 |     "        X = transform.fit_transform(X)\n",
 450 |     "        print(\"\\n\\n#########################################################################\")\n",
 451 |     "        print(\"Data has been successfully One Hot Encoded\")\n",
 452 |     "        print(\"#########################################################################\")\n",
 453 |     "        return X\n",
 454 |     "        \n",
 455 |     "    else:\n",
 456 |     "        return X #return data with no changes"
 457 |    ]
 458 |   },
 459 |   {
 460 |    "cell_type": "markdown",
 461 |    "metadata": {},
 462 |    "source": [
 463 |     "# Risk Encoding"
 464 |    ]
 465 |   },
 466 |   {
 467 |    "cell_type": "code",
 468 |    "execution_count": null,
 469 |    "metadata": {},
 470 |    "outputs": [],
 471 |    "source": [
 472 |     "def riskEncodingData(X,dataOption):#Risk encoding categorical features\n",
 473 |     "    #Manually Encoding for the attacks types only\n",
 474 |     "    if dataOption == \"3\": #if data option is risk Value\n",
 475 |     "        X = pd.DataFrame(X)\n",
 476 |     "        servers  = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1}\n",
 477 |     "        X[2] = [servers[item] for item in X[2]]\n",
 478 |     "\n",
 479 |     "        servers_Error  = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} \n",
 480 |     "        X[3] = [servers_Error[item] for item in X[3]]\n",
 481 |     "\n",
 482 |     "        print(\"\\n\\n#########################################################################\")\n",
 483 |     "        print(\"Data has been successfully risk Encoded\")\n",
 484 |     "        print(\"#########################################################################\")\n",
 485 |     "\n",
 486 |     "        return X\n",
 487 |     "        \n",
 488 |     "    else:\n",
 489 |     "        \n",
 490 |     "        return X #return data with no changes"
 491 |    ]
 492 |   },
 493 |   {
 494 |    "cell_type": "markdown",
 495 |    "metadata": {},
 496 |    "source": [
 497 |     "# Scaling "
 498 |    ]
 499 |   },
 500 |   {
 501 |    "cell_type": "code",
 502 |    "execution_count": null,
 503 |    "metadata": {},
 504 |    "outputs": [],
 505 |    "source": [
 506 |     "def scaling(X):#Scalign the data with the normalize method, we scale the data to have it in the same range for the experiments\n",
 507 |     "    \n",
 508 |     "    \n",
 509 |     "\n",
 510 |     "    while True:\n",
 511 |     "            \n",
 512 |     "            decision = input(\"Scale data [y/n]:\")\n",
 513 |     "            \n",
 514 |     "            if decision == \"y\" or  decision == \"n\":\n",
 515 |     "                break\n",
 516 |     "            else:\n",
 517 |     "                \n",
 518 |     "                print(\"Error\\n\\n\")\n",
 519 |     "    \n",
 520 |     "    if decision == \"y\":\n",
 521 |     "        \n",
 522 |     "            from sklearn.preprocessing import MinMaxScaler\n",
 523 |     "            #Transforms features by scaling each feature to a given range.\n",
 524 |     "            X =  MinMaxScaler(feature_range=(0, 1)).fit_transform(X)\n",
 525 |     "            print(\"\\n\\n#########################################################################\")\n",
 526 |     "            print(\"Data has been successfully scaled.\")\n",
 527 |     "            print(\"#########################################################################\")\n",
 528 |     "            return X\n",
 529 |     "        \n",
 530 |     "    else:\n",
 531 |     "        return X\n"
 532 |    ]
 533 |   },
 534 |   {
 535 |    "cell_type": "markdown",
 536 |    "metadata": {},
 537 |    "source": [
 538 |     "# Shuffle"
 539 |    ]
 540 |   },
 541 |   {
 542 |    "cell_type": "code",
 543 |    "execution_count": null,
 544 |    "metadata": {},
 545 |    "outputs": [],
 546 |    "source": [
 547 |     "def shuffleData(X):# currently a bug, if we do shuffleling the experiments resutls are not good, the order of the data does not affect the results\n",
 548 |     "\n",
 549 |     "    from sklearn.utils import shuffle\n",
 550 |     "    while True:\n",
 551 |     "        option = input(\"Shuffle data [y]/[n]:\")\n",
 552 |     "        \n",
 553 |     "        if option == \"y\" or option == \"n\":\n",
 554 |     "            break\n",
 555 |     "        else:\n",
 556 |     "            \n",
 557 |     "            print(\"Error\\n\\n\")\n",
 558 |     "    \n",
 559 |     "    if option == \"y\":\n",
 560 |     "        \n",
 561 |     "        X = pd.DataFrame(X)\n",
 562 |     "        X = shuffle(X)\n",
 563 |     "        X.reset_index(inplace=True,drop=True)\n",
 564 |     "        X = np.array(X)\n",
 565 |     "        \n",
 566 |     "        print(\"\\n\\n#########################################################################\")\n",
 567 |     "        print(\"Data has been successfully shuffled.\")\n",
 568 |     "        print(\"#########################################################################\")\n",
 569 |     "        return X\n",
 570 |     "    else:\n",
 571 |     "        \n",
 572 |     "        return X"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "markdown",
 577 |    "metadata": {},
 578 |    "source": [
 579 |     "# KMEANS"
 580 |    ]
 581 |   },
 582 |   {
 583 |    "cell_type": "code",
 584 |    "execution_count": null,
 585 |    "metadata": {},
 586 |    "outputs": [],
 587 |    "source": [
 588 |     "def kmeansClustering(X,Y):#K-means algorithm \n",
 589 |     "    from sklearn.cluster import KMeans\n",
 590 |     "\n",
 591 |     "    while True:\n",
 592 |     "        print(\"\\n\\n#########################################################################\")\n",
 593 |     "        print(\"KMEANS ALGORITHM\")\n",
 594 |     "        print(\"#########################################################################\")\n",
 595 |     "              \n",
 596 |     "        nClusters = input(\"Number of clusters:\")\n",
 597 |     "        \n",
 598 |     "        try:\n",
 599 |     "            nClusters = int(nClusters)\n",
 600 |     "            \n",
 601 |     "        except ValueError:\n",
 602 |     "            \n",
 603 |     "            print(\"Error\\n\\n\")\n",
 604 |     "            \n",
 605 |     "        if type(nClusters) == int:\n",
 606 |     "            n = 0\n",
 607 |     "            clusters = []\n",
 608 |     "            \n",
 609 |     "            while n < nClusters:#Converting nCluster into an array of n clusters [n] for use it later\n",
 610 |     "                clusters.append(n)\n",
 611 |     "                n+=1\n",
 612 |     "            break\n",
 613 |     "        \n",
 614 |     "    while True:\n",
 615 |     "        init = input(\"Initialization method [k-means++,random]:\")\n",
 616 |     "        \n",
 617 |     "        if init == \"k-means++\" or init == \"random\":\n",
 618 |     "            break\n",
 619 |     "\n",
 620 |     "    print(\"\\nClustering...\\n\")\n",
 621 |     "    \n",
 622 |     "    start_time = time.time()\n",
 623 |     "    KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0)\n",
 624 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
 625 |     "    print(\"Data Successfully Clustered\")\n",
 626 |     "    kmeans = KMEANS.fit(X)\n",
 627 |     "    Z = kmeans.labels_\n",
 628 |     "    inertia = KMEANS.inertia_\n",
 629 |     "    #Kmeans Results\n",
 630 |     "    kmeansR = pd.crosstab(Y,Z)\n",
 631 |     "    maxVal = kmeansR.idxmax()\n",
 632 |     "    \n",
 633 |     "    return Z,clusters,kmeansR,maxVal,inertia\n"
 634 |    ]
 635 |   },
 636 |   {
 637 |    "cell_type": "markdown",
 638 |    "metadata": {},
 639 |    "source": [
 640 |     "# Kmeans F1 Score"
 641 |    ]
 642 |   },
 643 |   {
 644 |    "cell_type": "code",
 645 |    "execution_count": null,
 646 |    "metadata": {},
 647 |    "outputs": [],
 648 |    "source": [
 649 |     "def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans\n",
 650 |     "    from sklearn.metrics import f1_score\n",
 651 |     "    #Encoding data to F-score\n",
 652 |     "    \n",
 653 |     "    \n",
 654 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 655 |     "    n = 0 # counter\n",
 656 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 657 |     "    f1 = 0 #f1score\n",
 658 |     "    average = ''\n",
 659 |     "    \n",
 660 |     "    while n < len(clusters):# while counter < number of clusters\n",
 661 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 662 |     "        n+=1\n",
 663 |     "        \n",
 664 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 665 |     "            \n",
 666 |     "    Y = np.array(Y,dtype = int) # Converting labels into a int array\n",
 667 |     "    \n",
 668 |     "    while True:\n",
 669 |     "        \n",
 670 |     "        average = input(\"Average Method[weighted,micro,macro,binary]:\")\n",
 671 |     "        \n",
 672 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == 'binary':\n",
 673 |     "            break\n",
 674 |     "    #score metric   \n",
 675 |     "    f1 = f1_score(Y,Z, average = average) #Forget the labels that where not predicted and gives lables that were predicted at least once\n",
 676 |     "    \n",
 677 |     "    return f1,dictionaryCluster"
 678 |    ]
 679 |   },
 680 |   {
 681 |    "cell_type": "markdown",
 682 |    "metadata": {},
 683 |    "source": [
 684 |     "# KMEANS Normal Mutial Info"
 685 |    ]
 686 |   },
 687 |   {
 688 |    "cell_type": "code",
 689 |    "execution_count": null,
 690 |    "metadata": {},
 691 |    "outputs": [],
 692 |    "source": [
 693 |     "\n",
 694 |     "def kNMI(Z,Y,maxVal,clusters):\n",
 695 |     "    from sklearn.metrics import normalized_mutual_info_score\n",
 696 |     "    \n",
 697 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 698 |     "    n = 0 # counter\n",
 699 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 700 |     "    NMI = 0\n",
 701 |     "    average = ''\n",
 702 |     "    \n",
 703 |     "    while n < len(clusters):# while counter < number of clusters\n",
 704 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 705 |     "        n+=1\n",
 706 |     "        \n",
 707 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 708 |     "    \n",
 709 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 710 |     "    \n",
 711 |     "    while True:\n",
 712 |     "        \n",
 713 |     "        average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n",
 714 |     "        \n",
 715 |     "        if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n",
 716 |     "            break\n",
 717 |     "    #Score metric \n",
 718 |     "    NMI = normalized_mutual_info_score(Y, Z, average_method = average)\n",
 719 |     "    \n",
 720 |     "    return NMI,dictionaryCluster\n"
 721 |    ]
 722 |   },
 723 |   {
 724 |    "cell_type": "markdown",
 725 |    "metadata": {},
 726 |    "source": [
 727 |     "# KMEANS Adjusted Random Score"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "code",
 732 |    "execution_count": null,
 733 |    "metadata": {},
 734 |    "outputs": [],
 735 |    "source": [
 736 |     "def kARS(Z,Y,maxVal,clusters):\n",
 737 |     "    from sklearn.metrics import adjusted_rand_score\n",
 738 |     "    \n",
 739 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 740 |     "    n = 0 # counter\n",
 741 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 742 |     "    ars = 0\n",
 743 |     "    \n",
 744 |     "    while n < len(clusters):# while counter < number of clusters\n",
 745 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 746 |     "        n+=1\n",
 747 |     "        \n",
 748 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 749 |     "    \n",
 750 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 751 |     "    \n",
 752 |     "    #score metric\n",
 753 |     "    ars = adjusted_rand_score(Y, Z)\n",
 754 |     "    \n",
 755 |     "    return ars,dictionaryCluster"
 756 |    ]
 757 |   },
 758 |   {
 759 |    "cell_type": "markdown",
 760 |    "metadata": {},
 761 |    "source": [
 762 |     "# DBSCAN"
 763 |    ]
 764 |   },
 765 |   {
 766 |    "cell_type": "code",
 767 |    "execution_count": null,
 768 |    "metadata": {},
 769 |    "outputs": [],
 770 |    "source": [
 771 |     "def dbscanClustering(X,Y):#DBSCAN algorithm\n",
 772 |     "    from sklearn.cluster import DBSCAN\n",
 773 |     "    \n",
 774 |     "    while True:\n",
 775 |     "        \n",
 776 |     "        print(\"\\n\\n#########################################################################\")\n",
 777 |     "        print(\"DBSCAN ALGORITHM\")\n",
 778 |     "        print(\"#########################################################################\")\n",
 779 |     "              \n",
 780 |     "        epsilon = input(\"epsilon[Decimal]:\")\n",
 781 |     "        \n",
 782 |     "        try:\n",
 783 |     "            epsilon = float(epsilon)\n",
 784 |     "            \n",
 785 |     "        except ValueError:\n",
 786 |     "            \n",
 787 |     "            print(\"Enter a Decimal number\")\n",
 788 |     "            \n",
 789 |     "            \n",
 790 |     "        if type(epsilon) == float:\n",
 791 |     "            break\n",
 792 |     "        \n",
 793 |     "    while True:\n",
 794 |     "        minSamples = input(\"Min Samples[Integer]:\")\n",
 795 |     "        \n",
 796 |     "        try:\n",
 797 |     "            minSamples = int(minSamples)\n",
 798 |     "            \n",
 799 |     "        except ValueError:\n",
 800 |     "            \n",
 801 |     "            print(\"Enter a Integer Number\")\n",
 802 |     "            \n",
 803 |     "        if type(minSamples) == int:\n",
 804 |     "            break\n",
 805 |     "        \n",
 806 |     "    while True:\n",
 807 |     "        algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n",
 808 |     "            \n",
 809 |     "        if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n",
 810 |     "            break\n",
 811 |     "        \n",
 812 |     "        else:\n",
 813 |     "            print(\"Error\\n\\n\")\n",
 814 |     "            \n",
 815 |     "    \n",
 816 |     "    print(\"\\nClustering...\\n\")\n",
 817 |     "\n",
 818 |     "    #Compute DBSCAN\n",
 819 |     "    start_time = time.time() \n",
 820 |     "    db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X)\n",
 821 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
 822 |     "    print(\"Data Successfully Clustered\")\n",
 823 |     "    \n",
 824 |     "    \n",
 825 |     "    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)\n",
 826 |     "    core_samples_mask[db.core_sample_indices_] = True\n",
 827 |     "    \n",
 828 |     "    Z = db.labels_\n",
 829 |     "    # Number of clusters in labels, ignoring noise if present.\n",
 830 |     "    n_clusters = len(set(Z))\n",
 831 |     "    n_noise_ = list(Z).count(-1)\n",
 832 |     "    \n",
 833 |     "    n = -1  # DBSCAN return index -1 cluster\n",
 834 |     "    clusters = []\n",
 835 |     "    while n + 1 < n_clusters:\n",
 836 |     "        clusters.append(n)\n",
 837 |     "        n += 1\n",
 838 |     "    \n",
 839 |     "    #DBSCAN Results\n",
 840 |     "    dbscanR = pd.crosstab(Y,Z)\n",
 841 |     "    maxVal = dbscanR.idxmax()\n",
 842 |     "    \n",
 843 |     "    return Z,clusters,n_noise_,dbscanR,maxVal"
 844 |    ]
 845 |   },
 846 |   {
 847 |    "cell_type": "markdown",
 848 |    "metadata": {},
 849 |    "source": [
 850 |     "# DBSCAN F1 Score"
 851 |    ]
 852 |   },
 853 |   {
 854 |    "cell_type": "code",
 855 |    "execution_count": null,
 856 |    "metadata": {},
 857 |    "outputs": [],
 858 |    "source": [
 859 |     "def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN\n",
 860 |     "    from sklearn.metrics import f1_score\n",
 861 |     "    #Encoding data to F-score\n",
 862 |     "    \n",
 863 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 864 |     "    n = 0 # counter\n",
 865 |     "    c = -1 # - counter max Value has negative index\n",
 866 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 867 |     "    f1 = 0\n",
 868 |     "    average = ''\n",
 869 |     "    \n",
 870 |     "    while n < len(clusters):# while counter < number of clusters\n",
 871 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 872 |     "        n+=1\n",
 873 |     "        c+=1\n",
 874 |     "    \n",
 875 |     "        \n",
 876 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 877 |     "    \n",
 878 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 879 |     "    while True:\n",
 880 |     "        \n",
 881 |     "        average = input(\"Average Method[weighted,micro,macro]:\")\n",
 882 |     "        \n",
 883 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n",
 884 |     "            break\n",
 885 |     "        \n",
 886 |     "        else:\n",
 887 |     "            \n",
 888 |     "            print(\"Error\\n\\n\")\n",
 889 |     "    #score metric\n",
 890 |     "    f1 = f1_score(Y,Z, average = average)\n",
 891 |     "    return f1,dictionaryCluster"
 892 |    ]
 893 |   },
 894 |   {
 895 |    "cell_type": "markdown",
 896 |    "metadata": {},
 897 |    "source": [
 898 |     "# DBSCAN Mutual Info Score"
 899 |    ]
 900 |   },
 901 |   {
 902 |    "cell_type": "code",
 903 |    "execution_count": null,
 904 |    "metadata": {},
 905 |    "outputs": [],
 906 |    "source": [
 907 |     "def dbNMI(Z,Y,clusters,maxVal):# Mutual info score for dbscan\n",
 908 |     "    from sklearn.metrics import normalized_mutual_info_score\n",
 909 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 910 |     "    n = 0 # counter\n",
 911 |     "    c = -1 # - counter max Value has negative index\n",
 912 |     "    NMI = 0\n",
 913 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 914 |     "    average = ''\n",
 915 |     "    \n",
 916 |     "    while n < len(clusters):# while counter < number of clusters\n",
 917 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 918 |     "        n+=1\n",
 919 |     "        c+=1\n",
 920 |     "    \n",
 921 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 922 |     "\n",
 923 |     "    while True:\n",
 924 |     "        \n",
 925 |     "        average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n",
 926 |     "        \n",
 927 |     "        if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n",
 928 |     "            break\n",
 929 |     "        else:\n",
 930 |     "            \n",
 931 |     "            print(\"Error\\n\\n\")\n",
 932 |     "    #score metric\n",
 933 |     "    NMI = normalized_mutual_info_score(Y, Z, average_method= average)\n",
 934 |     "    \n",
 935 |     "    return NMI,dictionaryCluster"
 936 |    ]
 937 |   },
 938 |   {
 939 |    "cell_type": "markdown",
 940 |    "metadata": {},
 941 |    "source": [
 942 |     "# DBSCAN Adjusted Random Score"
 943 |    ]
 944 |   },
 945 |   {
 946 |    "cell_type": "code",
 947 |    "execution_count": null,
 948 |    "metadata": {},
 949 |    "outputs": [],
 950 |    "source": [
 951 |     "def dbARS(Z,Y,clusters,maxVal): # adjusted rand score for dbscan\n",
 952 |     "    from sklearn.metrics import adjusted_rand_score\n",
 953 |     "    \n",
 954 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 955 |     "    n = 0 # counter\n",
 956 |     "    c = -1 # - counter max Value has negative index\n",
 957 |     "    ars = 0\n",
 958 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 959 |     "    \n",
 960 |     "    while n < len(clusters):# while counter < number of clusters\n",
 961 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 962 |     "        n+=1\n",
 963 |     "        c+=1\n",
 964 |     "    #score metric\n",
 965 |     "    ars = adjusted_rand_score(Y,Z)\n",
 966 |     "    \n",
 967 |     "    return ars,dictionaryCluster"
 968 |    ]
 969 |   },
 970 |   {
 971 |    "cell_type": "markdown",
 972 |    "metadata": {},
 973 |    "source": [
 974 |     "# Isolation Forest"
 975 |    ]
 976 |   },
 977 |   {
 978 |    "cell_type": "code",
 979 |    "execution_count": null,
 980 |    "metadata": {},
 981 |    "outputs": [],
 982 |    "source": [
 983 |     "def isolationForest(X,Y):# isolation forest algorithm\n",
 984 |     "    from sklearn.ensemble import IsolationForest\n",
 985 |     "    \n",
 986 |     "    while True:\n",
 987 |     "        contamination = input(\"Contamination[Float 0 to 0.5]: \")\n",
 988 |     "        \n",
 989 |     "        try:\n",
 990 |     "            contamination = float(contamination)\n",
 991 |     "            \n",
 992 |     "        except ValueError:\n",
 993 |     "            \n",
 994 |     "            print(\"Enter a Number\")\n",
 995 |     "            \n",
 996 |     "        if type(contamination) == float and (contamination >= 0 and contamination <= 0.5):\n",
 997 |     "            break\n",
 998 |     "    \n",
 999 |     "    print(\"\\nClustering...\\n\")   \n",
1000 |     "    \n",
1001 |     "    start_time = time.time() \n",
1002 |     "    Z = IsolationForest(max_samples = \"auto\",behaviour = \"new\",contamination = contamination).fit_predict(X)\n",
1003 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
1004 |     "    \n",
1005 |     "    Z = np.array(Z,dtype = object)\n",
1006 |     "    \n",
1007 |     "    ifR = pd.crosstab(Y,Z)\n",
1008 |     "    ifR = pd.DataFrame(ifR)\n",
1009 |     "    maxVal = ifR.idxmax()\n",
1010 |     "    \n",
1011 |     "    n = -1  # Isolation Forest return index -1 and 1 cluster\n",
1012 |     "    clusters = []\n",
1013 |     "    while n < len(ifR.columns):\n",
1014 |     "        clusters.append(n)\n",
1015 |     "        n += 2\n",
1016 |     "        \n",
1017 |     "    return Z,ifR,maxVal,clusters"
1018 |    ]
1019 |   },
1020 |   {
1021 |    "cell_type": "markdown",
1022 |    "metadata": {},
1023 |    "source": [
1024 |     "# Isolation Forest F1 Score"
1025 |    ]
1026 |   },
1027 |   {
1028 |    "cell_type": "code",
1029 |    "execution_count": null,
1030 |    "metadata": {},
1031 |    "outputs": [],
1032 |    "source": [
1033 |     "def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest\n",
1034 |     "    from sklearn.metrics import f1_score\n",
1035 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
1036 |     "    \n",
1037 |     "    n = 0 # counter\n",
1038 |     "    c = -1 # - counter max Value has negative index\n",
1039 |     "    f1 = 0\n",
1040 |     "    average = ''\n",
1041 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
1042 |     "\n",
1043 |     "    \n",
1044 |     "    while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n",
1045 |     "        dictionaryCluster[clusters[n]] = maxVal[c] \n",
1046 |     "        n+=1\n",
1047 |     "        c+=2\n",
1048 |     "        \n",
1049 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
1050 |     "    \n",
1051 |     "    Y = np.array(Y,dtype = int)\n",
1052 |     "    Z = np.array(Z,dtype = int)\n",
1053 |     "    \n",
1054 |     "    while True:\n",
1055 |     "        \n",
1056 |     "        average = input(\"Average Method[weighted,micro,macro]:\")\n",
1057 |     "        \n",
1058 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n",
1059 |     "            break\n",
1060 |     "        \n",
1061 |     "        else:\n",
1062 |     "            \n",
1063 |     "            print(\"Error\\n\\n\")\n",
1064 |     "    # score metric\n",
1065 |     "    f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n",
1066 |     "    \n",
1067 |     "    return f1,dictionaryCluster"
1068 |    ]
1069 |   },
1070 |   {
1071 |    "cell_type": "markdown",
1072 |    "metadata": {},
1073 |    "source": [
1074 |     "# Local Outlier Factor"
1075 |    ]
1076 |   },
1077 |   {
1078 |    "cell_type": "code",
1079 |    "execution_count": null,
1080 |    "metadata": {},
1081 |    "outputs": [],
1082 |    "source": [
1083 |     "def LOF(X,Y):# Local outlier factor algorithm\n",
1084 |     "    from sklearn.neighbors import LocalOutlierFactor \n",
1085 |     "    \n",
1086 |     "    while True:\n",
1087 |     "        contamination = input(\"Contamination[Float 0 to 0.5]: \")\n",
1088 |     "        \n",
1089 |     "        try:\n",
1090 |     "            contamination = float(contamination)\n",
1091 |     "            \n",
1092 |     "        except ValueError:\n",
1093 |     "            \n",
1094 |     "            print(\"Enter a Number\")\n",
1095 |     "            \n",
1096 |     "        if type(contamination) == float and (contamination > 0 and contamination <= 0.5):\n",
1097 |     "            break\n",
1098 |     "        \n",
1099 |     "    while True:\n",
1100 |     "        algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n",
1101 |     "            \n",
1102 |     "        if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n",
1103 |     "            break\n",
1104 |     "        else:\n",
1105 |     "            \n",
1106 |     "            print(\"Error\\n\\n\")\n",
1107 |     "            \n",
1108 |     "    print(\"\\nClustering...\\n\")\n",
1109 |     "    \n",
1110 |     "    start_time = time.time() \n",
1111 |     "    lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X)\n",
1112 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
1113 |     "    \n",
1114 |     "    lofR = pd.crosstab(Y,lof)\n",
1115 |     "    maxVal = lofR.idxmax()\n",
1116 |     "    \n",
1117 |     "    \n",
1118 |     "    n = -1  # LOF return index -1 and 1 cluster\n",
1119 |     "    clusters = []\n",
1120 |     "    while n < len(lofR.columns):\n",
1121 |     "        clusters.append(n)\n",
1122 |     "        n += 2\n",
1123 |     "    \n",
1124 |     "    \n",
1125 |     "    \n",
1126 |     "    return lof,lofR,maxVal,clusters"
1127 |    ]
1128 |   },
1129 |   {
1130 |    "cell_type": "markdown",
1131 |    "metadata": {},
1132 |    "source": [
1133 |     "# Local Outlier Factor F1 Score"
1134 |    ]
1135 |   },
1136 |   {
1137 |    "cell_type": "code",
1138 |    "execution_count": null,
1139 |    "metadata": {},
1140 |    "outputs": [],
1141 |    "source": [
1142 |     "def lofF1(Z,Y,clusters,maxVal): # f1 score for local outlier factor\n",
1143 |     "    from sklearn.metrics import f1_score\n",
1144 |     "    \n",
1145 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
1146 |     "    n = 0 # counter\n",
1147 |     "    c = -1 # - counter max Value has negative index\n",
1148 |     "    f1 = 0\n",
1149 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
1150 |     "    \n",
1151 |     "    while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n",
1152 |     "        dictionaryCluster[clusters[n]] = maxVal[c] \n",
1153 |     "        n+=1\n",
1154 |     "        c+=2\n",
1155 |     "        \n",
1156 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
1157 |     "    Y = np.array(Y,dtype = int)\n",
1158 |     "    Z = np.array(Z,dtype = int)\n",
1159 |     "    while True:\n",
1160 |     "        \n",
1161 |     "        average = input(\"Average Method[weighted,None,micro,macro]:\")\n",
1162 |     "        \n",
1163 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == \"None\":\n",
1164 |     "            break\n",
1165 |     "        \n",
1166 |     "        else:\n",
1167 |     "            \n",
1168 |     "            print(\"Error\\n\\n\")\n",
1169 |     "    f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n",
1170 |     "    \n",
1171 |     "    return f1,dictionaryCluster"
1172 |    ]
1173 |   },
1174 |   {
1175 |    "cell_type": "markdown",
1176 |    "metadata": {},
1177 |    "source": [
1178 |     "# Calling Functions"
1179 |    ]
1180 |   },
1181 |   {
1182 |    "cell_type": "code",
1183 |    "execution_count": null,
1184 |    "metadata": {},
1185 |    "outputs": [],
1186 |    "source": [
1187 |     "clear()\n",
1188 |     "#Calling the functions\n",
1189 |     "\n",
1190 |     "##########################################################################\n",
1191 |     "path,dataSetOption = getDataSet()\n",
1192 |     "#########################################################################\n",
1193 |     "#########################################################################\n",
1194 |     "dataSet = readingData(path)\n",
1195 |     "#########################################################################\n",
1196 |     "#########################################################################\n",
1197 |     "dataSet = checkMissing(dataSet)\n",
1198 |     "#########################################################################\n",
1199 |     "#########################################################################\n",
1200 |     "data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms\n",
1201 |     "#########################################################################\n",
1202 |     "#########################################################################\n",
1203 |     "try:\n",
1204 |     "    labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n",
1205 |     "except ValueError:\n",
1206 |     "    labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n",
1207 |     "#########################################################################\n",
1208 |     "#########################################################################\n",
1209 |     "data = riskEncodingData(data,dataOption)\n",
1210 |     "#########################################################################\n",
1211 |     "#########################################################################\n",
1212 |     "data = oneHotEncodingData(data,dataOption) #One hot Encode with the complete data\n",
1213 |     "#########################################################################\n",
1214 |     "#########################################################################\n",
1215 |     "data = scaling(data)\n",
1216 |     "#########################################################################\n",
1217 |     "#########################################################################\n",
1218 |     "data = shuffleData(data)\n",
1219 |     "#########################################################################\n",
1220 |     "\n",
1221 |     "#This menu is a option to run diferrent algorithms with the same preproceced data witouth the need of running all the code from 0 to make another experiment.\n",
1222 |     "while True:  \n",
1223 |     "    while True:\n",
1224 |     "        print(\"\\n\\n#########################################################################\")\n",
1225 |     "        print(\"Algorithm Menu\")\n",
1226 |     "        print(\"#########################################################################\")\n",
1227 |     "        \n",
1228 |     "        print(\"1.Kmeans\")\n",
1229 |     "        print(\"2.Dbscan\")\n",
1230 |     "        print(\"3.Isolation Forest\")\n",
1231 |     "        print(\"4.Local Factor Outlier\")\n",
1232 |     "        \n",
1233 |     "        algorithmOption = input(\"option:\")\n",
1234 |     "        \n",
1235 |     "        if algorithmOption == \"1\" or algorithmOption == \"2\" or algorithmOption == \"3\" or algorithmOption == \"4\":\n",
1236 |     "                break\n",
1237 |     "        else:\n",
1238 |     "            \n",
1239 |     "            print(\"Error\\n\\n\")\n",
1240 |     "\n",
1241 |     "    \n",
1242 |     "    if algorithmOption == \"1\":\n",
1243 |     "        #########################################################################\n",
1244 |     "        #KMEANS\n",
1245 |     "        klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels)\n",
1246 |     "        print(\"#########################################################################\")\n",
1247 |     "        print(\"KMEANS RESULTS\\n\\n\")\n",
1248 |     "        print(\"Clusters -> \",kClusters,\"\\n\")\n",
1249 |     "        print(\"Inertia -> \",inertia)\n",
1250 |     "        print(kmeansR,\"\\n\\n\")\n",
1251 |     "        print(\"Max True Label\",\"\\n\\n\",maxKvalue)\n",
1252 |     "        print(\"#########################################################################\")\n",
1253 |     "        #########################################################################\n",
1254 |     "        print(\"\\n\\n#########################################################################\")\n",
1255 |     "        print(\"Kmeans Score Metrics Menu\")\n",
1256 |     "        print(\"#########################################################################\")\n",
1257 |     "        \n",
1258 |     "        while True:\n",
1259 |     "            print(\"1.F1 Score\")\n",
1260 |     "            print(\"2.Normalized Mutual Info Score\")\n",
1261 |     "            print(\"3.Adjusted Rand Score\")\n",
1262 |     "        \n",
1263 |     "            kScoreOption = input(\"option:\")\n",
1264 |     "            \n",
1265 |     "            if kScoreOption == \"1\" or kScoreOption == \"2\" or kScoreOption == \"3\":\n",
1266 |     "                break\n",
1267 |     "            else:\n",
1268 |     "                \n",
1269 |     "                print(\"Error\\n\\n\")\n",
1270 |     "     \n",
1271 |     "        if kScoreOption == \"1\":\n",
1272 |     "            #########################################################################\n",
1273 |     "            #F1 Score\n",
1274 |     "            kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters)\n",
1275 |     "            print(\"\\n\\n#########################################################################\")\n",
1276 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1277 |     "            print(\"KMEANS F1 Score -> \",kmeansF1)\n",
1278 |     "            print(\"#########################################################################\")\n",
1279 |     "            #########################################################################\n",
1280 |     "        \n",
1281 |     "        elif kScoreOption == \"2\":\n",
1282 |     "            #########################################################################\n",
1283 |     "            kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters)\n",
1284 |     "            print(\"\\n\\n#########################################################################\")\n",
1285 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1286 |     "            print(\"KMEANS Normalized Mutual Info Score -> \",kmeansNMI)\n",
1287 |     "            print(\"#########################################################################\")\n",
1288 |     "            #########################################################################\n",
1289 |     "    \n",
1290 |     "        elif kScoreOption == \"3\":\n",
1291 |     "            \n",
1292 |     "            #########################################################################\n",
1293 |     "            kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters)\n",
1294 |     "            print(\"\\n\\n#########################################################################\")\n",
1295 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1296 |     "            print(\"KMEANS Adjusted Rand Score -> \",kmeansARS)\n",
1297 |     "            print(\"#########################################################################\")\n",
1298 |     "            #########################################################################\n",
1299 |     "            \n",
1300 |     "    elif algorithmOption == \"2\":\n",
1301 |     "        #########################################################################\n",
1302 |     "        #DBSCAN\n",
1303 |     "        dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) \n",
1304 |     "        print(\"#########################################################################\")\n",
1305 |     "        print(\"DBSCAN RESULTS\\n\\n\")\n",
1306 |     "        print(\"Clusters -> \",dbClusters,\"\\n\")\n",
1307 |     "        print(dbscanR,\"\\n\\n\")\n",
1308 |     "        print(\"Noise -> \",nNoises)\n",
1309 |     "        print(\"Max True Label\",\"\\n\\n\",maxDBvalue)\n",
1310 |     "        print(\"#########################################################################\")\n",
1311 |     "        #########################################################################\n",
1312 |     "        print(\"\\n\\n#########################################################################\")\n",
1313 |     "        print(\"Dscan Score Metrics Menu\")\n",
1314 |     "        print(\"#########################################################################\")\n",
1315 |     "        print(\"1.F1 Score\")\n",
1316 |     "        print(\"2.Normalized Mutual Info Score\")\n",
1317 |     "        print(\"3.Adjusted Rand Score\")\n",
1318 |     "        \n",
1319 |     "        while True:\n",
1320 |     "            \n",
1321 |     "            dbScoreOption = input(\"option:\")\n",
1322 |     "            \n",
1323 |     "            if dbScoreOption == \"1\" or dbScoreOption == \"2\" or dbScoreOption == \"3\":\n",
1324 |     "                break\n",
1325 |     "            else:\n",
1326 |     "                \n",
1327 |     "                print(\"Error\\n\\n\")\n",
1328 |     "    \n",
1329 |     "        if dbScoreOption == \"1\":\n",
1330 |     "            #########################################################################\n",
1331 |     "            #F1 Score dbscan\n",
1332 |     "            dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue)\n",
1333 |     "            print(\"\\n\\n#########################################################################\")\n",
1334 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1335 |     "            print(\"DBSCAN F1 Score -> \",dbscanF1)\n",
1336 |     "            print(\"#########################################################################\")\n",
1337 |     "            #########################################################################\n",
1338 |     "            \n",
1339 |     "        elif dbScoreOption == \"2\":\n",
1340 |     "            #########################################################################\n",
1341 |     "            dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue)\n",
1342 |     "            print(\"\\n\\n#########################################################################\")\n",
1343 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1344 |     "            print(\"DBSCAN Normalized Mutual Info Score -> \",dbscanNMI)\n",
1345 |     "            print(\"#########################################################################\")\n",
1346 |     "            #########################################################################\n",
1347 |     "            \n",
1348 |     "        elif dbScoreOption == \"3\":\n",
1349 |     "            #########################################################################\n",
1350 |     "            dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue)\n",
1351 |     "            print(\"\\n\\n#########################################################################\")\n",
1352 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1353 |     "            print(\"DBSCAN Adjusted Rand Score -> \",dbscanARS)\n",
1354 |     "            print(\"#########################################################################\")\n",
1355 |     "            #########################################################################\n",
1356 |     "        \n",
1357 |     "        \n",
1358 |     "    elif algorithmOption == \"3\":\n",
1359 |     "        #########################################################################\n",
1360 |     "        ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels)\n",
1361 |     "        print(\"#########################################################################\")\n",
1362 |     "        print(\"Isolation Forest RESULTS\\n\\n\")\n",
1363 |     "        print(\"Clusters -> \",ifNclusters,\"\\n\")\n",
1364 |     "        print(ifR,\"\\n\\n\")\n",
1365 |     "        print(\"Max True Label\",\"\\n\\n\",MaxIfVal)\n",
1366 |     "        print(\"#########################################################################\")\n",
1367 |     "        #########################################################################\n",
1368 |     "        print(\"\\n\\n#########################################################################\")\n",
1369 |     "        print(\"Isolation Forest Score Metrics Menu\")\n",
1370 |     "        print(\"#########################################################################\")\n",
1371 |     "        print(\"1.F1 Score\")\n",
1372 |     "        \n",
1373 |     "        while True:\n",
1374 |     "            \n",
1375 |     "            ifScoreOption = input(\"option:\")\n",
1376 |     "            \n",
1377 |     "            if ifScoreOption == \"1\":\n",
1378 |     "                break\n",
1379 |     "            else:\n",
1380 |     "                \n",
1381 |     "                print(\"Error\\n\\n\")\n",
1382 |     "        \n",
1383 |     "        if ifScoreOption == \"1\":\n",
1384 |     "            \n",
1385 |     "            ##########################################################################\n",
1386 |     "            isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal)\n",
1387 |     "            print(\"\\n\\n#########################################################################\")\n",
1388 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1389 |     "            print(\"Isolation Forest F1 Score -> \",isolationForestF1)\n",
1390 |     "            print(\"#########################################################################\")\n",
1391 |     "            ##########################################################################\n",
1392 |     "        \n",
1393 |     "    elif algorithmOption == \"4\":\n",
1394 |     "        #########################################################################\n",
1395 |     "        LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels)\n",
1396 |     "        print(\"#########################################################################\")\n",
1397 |     "        print(\"Local Outlier Factor RESULTS\\n\\n\")\n",
1398 |     "        print(\"Clusters -> \",lofClusters,\"\\n\")\n",
1399 |     "        print(lofR,\"\\n\\n\")\n",
1400 |     "        print(\"Max True Label\",\"\\n\\n\",maxLOFvalue)\n",
1401 |     "        print(\"#########################################################################\")\n",
1402 |     "        #########################################################################\n",
1403 |     "        print(\"\\n\\n#########################################################################\")\n",
1404 |     "        print(\"LOF Score Metrics Menu\")\n",
1405 |     "        print(\"#########################################################################\")\n",
1406 |     "        print(\"1.F1 Score\")\n",
1407 |     "        \n",
1408 |     "        while True:\n",
1409 |     "            \n",
1410 |     "            lofScoreOption = input(\"option:\")\n",
1411 |     "            \n",
1412 |     "            if lofScoreOption == \"1\":\n",
1413 |     "                break\n",
1414 |     "            else:\n",
1415 |     "                \n",
1416 |     "                print(\"Error\\n\\n\")\n",
1417 |     "        \n",
1418 |     "        if lofScoreOption == \"1\":\n",
1419 |     "            \n",
1420 |     "            ##########################################################################\n",
1421 |     "            LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue)\n",
1422 |     "            print(\"\\n\\n#########################################################################\")\n",
1423 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1424 |     "            print(\"LOF F1 Score -> \",LOFf1)\n",
1425 |     "            print(\"#########################################################################\")\n",
1426 |     "            ##########################################################################\n",
1427 |     "                \n",
1428 |     "    while True: # If the user want to Make a new clustering algorithm test\n",
1429 |     "        \n",
1430 |     "        decision = input(\"Try another Clustering Algorithm[y/n]:\")\n",
1431 |     "        \n",
1432 |     "        if decision == \"y\" or  decision == \"n\":\n",
1433 |     "            break\n",
1434 |     "        else:\n",
1435 |     "            \n",
1436 |     "            print(\"Error\\n\\n\")\n",
1437 |     "    \n",
1438 |     "    \n",
1439 |     "    if decision == \"n\":\n",
1440 |     "        break\n",
1441 |     "    \n",
1442 |     "    else:\n",
1443 |     "        clear()"
1444 |    ]
1445 |   }
1446 |  ],
1447 |  "metadata": {
1448 |   "kernelspec": {
1449 |    "display_name": "Python 3",
1450 |    "language": "python",
1451 |    "name": "python3"
1452 |   },
1453 |   "language_info": {
1454 |    "codemirror_mode": {
1455 |     "name": "ipython",
1456 |     "version": 3
1457 |    },
1458 |    "file_extension": ".py",
1459 |    "mimetype": "text/x-python",
1460 |    "name": "python",
1461 |    "nbconvert_exporter": "python",
1462 |    "pygments_lexer": "ipython3",
1463 |    "version": "3.7.3"
1464 |   },
1465 |   "varInspector": {
1466 |    "cols": {
1467 |     "lenName": 16,
1468 |     "lenType": 16,
1469 |     "lenVar": 40
1470 |    },
1471 |    "kernels_config": {
1472 |     "python": {
1473 |      "delete_cmd_postfix": "",
1474 |      "delete_cmd_prefix": "del ",
1475 |      "library": "var_list.py",
1476 |      "varRefreshCmd": "print(var_dic_list())"
1477 |     },
1478 |     "r": {
1479 |      "delete_cmd_postfix": ") ",
1480 |      "delete_cmd_prefix": "rm(",
1481 |      "library": "var_list.r",
1482 |      "varRefreshCmd": "cat(var_dic_list()) "
1483 |     }
1484 |    },
1485 |    "position": {
1486 |     "height": "923px",
1487 |     "left": "328px",
1488 |     "right": "20px",
1489 |     "top": "9px",
1490 |     "width": "800px"
1491 |    },
1492 |    "types_to_exclude": [
1493 |     "module",
1494 |     "function",
1495 |     "builtin_function_or_method",
1496 |     "instance",
1497 |     "_Feature"
1498 |    ],
1499 |    "window_display": false
1500 |   }
1501 |  },
1502 |  "nbformat": 4,
1503 |  "nbformat_minor": 2
1504 | }
1505 | 


--------------------------------------------------------------------------------
/CBAD OUTLINE.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/CBAD OUTLINE.pdf


--------------------------------------------------------------------------------
/CBAD-Poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/CBAD-Poster.pdf


--------------------------------------------------------------------------------
/CBAD.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Main Libraries"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": 1,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": [
  16 |     "import numpy as np\n",
  17 |     "import pandas as pd \n",
  18 |     "import time\n",
  19 |     "import os"
  20 |    ]
  21 |   },
  22 |   {
  23 |    "cell_type": "markdown",
  24 |    "metadata": {},
  25 |    "source": [
  26 |     "# Clear"
  27 |    ]
  28 |   },
  29 |   {
  30 |    "cell_type": "code",
  31 |    "execution_count": 2,
  32 |    "metadata": {},
  33 |    "outputs": [],
  34 |    "source": [
  35 |     "clear = lambda:os.system('clear')"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "markdown",
  40 |    "metadata": {},
  41 |    "source": [
  42 |     "# Getting the dataset"
  43 |    ]
  44 |   },
  45 |   {
  46 |    "cell_type": "code",
  47 |    "execution_count": 3,
  48 |    "metadata": {},
  49 |    "outputs": [],
  50 |    "source": [
  51 |     "def getDataSet():# Getting the path of the dataset\n",
  52 |     "    \n",
  53 |     "    while True:\n",
  54 |     "        print(\"**************************************************\")\n",
  55 |     "        print(\"DATA SET MENU\")\n",
  56 |     "        print(\"**************************************************\")\n",
  57 |     "        print(\"1.NSL-KDD\")\n",
  58 |     "        print(\"2.IDS 2017\")\n",
  59 |     "        \n",
  60 |     "        option = input(\"Option:\")\n",
  61 |     "        \n",
  62 |     "        if option == \"1\" or option == \"2\":\n",
  63 |     "            break\n",
  64 |     "    \n",
  65 |     "    path = input(\"Path of the File:\")\n",
  66 |     "    \n",
  67 |     "    return path,option"
  68 |    ]
  69 |   },
  70 |   {
  71 |    "cell_type": "markdown",
  72 |    "metadata": {},
  73 |    "source": [
  74 |     "# Reading the dataset"
  75 |    ]
  76 |   },
  77 |   {
  78 |    "cell_type": "code",
  79 |    "execution_count": 4,
  80 |    "metadata": {},
  81 |    "outputs": [],
  82 |    "source": [
  83 |     "def readingData(path): #Reading the Dataset\n",
  84 |     "    \n",
  85 |     "    while True:\n",
  86 |     "        \n",
  87 |     "        option = input(\"Dataset has feature names[y/n]:\") \n",
  88 |     "        \n",
  89 |     "        if option == \"y\" or option == \"n\":\n",
  90 |     "            break\n",
  91 |     "            \n",
  92 |     "    print(\"\\nReading Dataset...\") \n",
  93 |     "        \n",
  94 |     "    if option == \"y\":\n",
  95 |     "        dataSet = pd.read_csv(path,low_memory=False)\n",
  96 |     "    \n",
  97 |     "    elif option == \"n\":\n",
  98 |     "        dataSet = pd.read_csv(path, header = None,low_memory=False)\n",
  99 |     "            \n",
 100 |     "    return dataSet\n"
 101 |    ]
 102 |   },
 103 |   {
 104 |    "cell_type": "markdown",
 105 |    "metadata": {},
 106 |    "source": [
 107 |     "# Check if missing data"
 108 |    ]
 109 |   },
 110 |   {
 111 |    "cell_type": "code",
 112 |    "execution_count": 5,
 113 |    "metadata": {},
 114 |    "outputs": [],
 115 |    "source": [
 116 |     "def checkMissing(X):#This check if the dataset given has missing values.\n",
 117 |     "    isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because (\"cannot unpack non-iterable numpy.bool object\")\n",
 118 |     "    \n",
 119 |     "    if isMissing == \"True\":\n",
 120 |     "        #if data set has infinity values replace them with none\n",
 121 |     "        X = X.replace('Infinity', np.nan) #Replacing Infinity values with nan values\n",
 122 |     "           \n",
 123 |     "        missingValIndex = []\n",
 124 |     "        total = X.isnull().sum().sum()\n",
 125 |     "        percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100\n",
 126 |     "            \n",
 127 |     "        for rows in X:\n",
 128 |     "                    \n",
 129 |     "            if X[rows].isnull().sum() != 0:\n",
 130 |     "                missingValIndex.append(rows)\n",
 131 |     "        print(\"\\n\\n**************************************************\")\n",
 132 |     "        print(\"Data has missing values\")\n",
 133 |     "        print(\"**************************************************\")\n",
 134 |     "        print(\"Features with missing values:\",missingValIndex)\n",
 135 |     "        print(\"Total missing Values -> \" , total)\n",
 136 |     "        print(percent,\"%\")\n",
 137 |     "        \n",
 138 |     "        return X\n",
 139 |     "    \n",
 140 |     "    else:\n",
 141 |     "        \n",
 142 |     "        return X\n"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "metadata": {},
 148 |    "source": [
 149 |     "# Getting the features"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "code",
 154 |    "execution_count": 7,
 155 |    "metadata": {},
 156 |    "outputs": [
 157 |     {
 158 |      "ename": "IndentationError",
 159 |      "evalue": "unindent does not match any outer indentation level (<tokenize>, line 47)",
 160 |      "output_type": "error",
 161 |      "traceback": [
 162 |       "\u001b[0;36m  File \u001b[0;32m\"<tokenize>\"\u001b[0;36m, line \u001b[0;32m47\u001b[0m\n\u001b[0;31m    for rows in dataSet: #Getting features index with missing values\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n"
 163 |      ]
 164 |     }
 165 |    ],
 166 |    "source": [
 167 |     "#Getting The data we want to test for the clustering algorithms\n",
 168 |     "def gettingVariables(dataSet,dataSetOption):# If the dataset is NSL-KDD it would get the features and the labels for it and if its IDS 2017 it would take the features and the labels for it and take careof missing values.\n",
 169 |     "   \n",
 170 |     "    if dataSetOption == \"1\":\n",
 171 |     "        while True:\n",
 172 |     "            print(\"\\n\\n**************************************************\")\n",
 173 |     "            print(\"Variables Menu\")\n",
 174 |     "            print(\"**************************************************\")\n",
 175 |     "            print(\"1.Data set with categorical data oneHot encoded\")\n",
 176 |     "            print(\"2.Data set with categorical data removed\")\n",
 177 |     "            print(\"3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded\")\n",
 178 |     "            option = input(\"Enter option :\")\n",
 179 |     "            \n",
 180 |     "            \n",
 181 |     "            if option == \"1\" or option == \"2\" or option == \"3\":\n",
 182 |     "                break\n",
 183 |     "            else:\n",
 184 |     "                \n",
 185 |     "                print(\"Error\\n\\n\")\n",
 186 |     "            \n",
 187 |     "        \n",
 188 |     "        if option == \"1\":\n",
 189 |     "            #Getting the Dependent and independent Variables\n",
 190 |     "            #In all the option we remove the dificulty level feature because we don't need it in our experiments\n",
 191 |     "            \n",
 192 |     "            \n",
 193 |     "            X = dataSet.iloc[:,:-2].values # Data, Get all the rows and all the clums except all the colums - 2\n",
 194 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 195 |     "            return X,Y,option\n",
 196 |     "        \n",
 197 |     "        elif option == \"2\":\n",
 198 |     "            #Removing Categorical data from the data set\n",
 199 |     "            X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values\n",
 200 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 201 |     "            \n",
 202 |     "            return X,Y,option\n",
 203 |     "        \n",
 204 |     "        elif option == \"3\":\n",
 205 |     "            #for later Risk Encode - Categorical features\n",
 206 |     "            X = dataSet.iloc[:,:-2].values\n",
 207 |     "            Y = dataSet.iloc[:,42].values# Labels\n",
 208 |     "            \n",
 209 |     "            return X,Y,option\n",
 210 |     "    \n",
 211 |     "\n",
 212 |     "    elif dataSetOption == \"2\":\n",
 213 |     "        #############################################################################\n",
 214 |     "        #GETTING VARIABLES\n",
 215 |     "        #############################################################################\n",
 216 |     "        missingValIndex = []\n",
 217 |     "        for rows in dataSet: #Getting features index with missing values\n",
 218 |     "            if dataSet[rows].isnull().sum() != 0:\n",
 219 |     "                    missingValIndex.append(rows)\n",
 220 |     "                \n",
 221 |     "        X = dataSet.iloc[:,:-1].values#data\n",
 222 |     "        #if names are not especified it will assign 0,1,2...n for the features name\n",
 223 |     "        X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward Packets','Total Length of Fwd Packets',\n",
 224 |     "                                      ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std',\n",
 225 |     "                                      'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean',\n",
 226 |     "                                      ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean',\n",
 227 |     "                                      ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s',\n",
 228 |     "                                      ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count',\n",
 229 |     "                                      ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length',\n",
 230 |     "                                      'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes',\n",
 231 |     "                                      'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min'])\n",
 232 |     "        Y = dataSet.iloc[:,78].values#Labels\n",
 233 |     "        \n",
 234 |     "        #############################################################################\n",
 235 |     "        #Variables Got \n",
 236 |     "        #############################################################################\n",
 237 |     "        \n",
 238 |     "    #############################################################################\n",
 239 |     "    #MANAGE MISSING DATA\n",
 240 |     "    #############################################################################   \n",
 241 |     "     \n",
 242 |     "        while True:\n",
 243 |     "            print(\"\\n\\n**************************************************\")\n",
 244 |     "            print(\"Manage Missing Values \")\n",
 245 |     "            print(\"**************************************************\")\n",
 246 |     "            print(\"1.Eliminate Catg. w/ Missing Values\")\n",
 247 |     "            print(\"2.Impute 0 for Missing Values\")\n",
 248 |     "            print(\"3.Impute Mean for Missing Values\")\n",
 249 |     "            print(\"4.Impute Median for Missing Values\")\n",
 250 |     "            print(\"5.Impute Mode for Missing Values\")\n",
 251 |     "            print(\"6.Simple Imputer\")\n",
 252 |     "            missingDataOption = input(\"Option:\")\n",
 253 |     "    \n",
 254 |     "            if missingDataOption == \"1\" or missingDataOption == \"2\" or missingDataOption == \"3\" or missingDataOption == \"4\" or missingDataOption == \"5\" or missingDataOption == \"6\":\n",
 255 |     "                break\n",
 256 |     "    \n",
 257 |     "    \n",
 258 |     "        if missingDataOption == \"1\":\n",
 259 |     "            deletedColumns = []\n",
 260 |     "            numColumns = len(X.columns)\n",
 261 |     "            #removing features with missing values\n",
 262 |     "            for row in missingValIndex:\n",
 263 |     "                deletedColumns.append(row)\n",
 264 |     "                del X[row]\n",
 265 |     "        \n",
 266 |     "            print(\"#\\n\\n########################################################################\")\n",
 267 |     "            print(\"Columns Succesfully Removed\")\n",
 268 |     "            print(len(deletedColumns),\"of\",numColumns,\"were deleted\")\n",
 269 |     "            print(\"Columns Names -> \",deletedColumns)\n",
 270 |     "            print(\"#########################################################################\")\n",
 271 |     "    \n",
 272 |     "        elif missingDataOption == \"2\":\n",
 273 |     "            #fill with 0\n",
 274 |     "            for row in missingValIndex:\n",
 275 |     "                X[row] = X[row].fillna(0)\n",
 276 |     "        \n",
 277 |     "            print(\"\\n\\n#########################################################################\")\n",
 278 |     "            print(\"Sucessfully Filled Missing Values with 0\")\n",
 279 |     "            print(\"#########################################################################\")\n",
 280 |     "    \n",
 281 |     "    \n",
 282 |     "        elif missingDataOption == \"3\":\n",
 283 |     "            #mean imputer\n",
 284 |     "            for row in missingValIndex:\n",
 285 |     "                X[row] = X[row].astype(float)\n",
 286 |     "                X[row] = X[row].fillna(X[row].mean())\n",
 287 |     "        \n",
 288 |     "            print(\"\\n\\n#########################################################################\")\n",
 289 |     "            print(\"Sucessfully Filled Missing Values with Mean\")\n",
 290 |     "            print(\"#########################################################################\")\n",
 291 |     "    \n",
 292 |     "        elif missingDataOption == \"4\":\n",
 293 |     "            #median imputer\n",
 294 |     "            for row in missingValIndex:\n",
 295 |     "                median = X[row].median()\n",
 296 |     "                X[row].fillna(median, inplace=True)\n",
 297 |     "            print(\"\\n\\n#########################################################################\")\n",
 298 |     "            print(\"Sucessfully Filled Missing Values with Median\")\n",
 299 |     "            print(\"#########################################################################\")\n",
 300 |     "    \n",
 301 |     "        elif missingDataOption == \"5\":\n",
 302 |     "            #Mode imputer\n",
 303 |     "            for row in missingValIndex:\n",
 304 |     "                X[row] = X[row].fillna(X[row].mode()[0])\n",
 305 |     "    \n",
 306 |     "            print(\"\\n\\n#########################################################################\")\n",
 307 |     "            print(\"Sucessfully Filled Missing Values with Mode \")\n",
 308 |     "            print(\"#########################################################################\")\n",
 309 |     "        \n",
 310 |     "        elif missingDataOption == \"6\": \n",
 311 |     "            from sklearn.impute import SimpleImputer\n",
 312 |     "            #\"Imputation transformer for completing missing values.\"(Univariate)\n",
 313 |     "            X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X)          \n",
 314 |     "            print(\"\\n\\n#########################################################################\")\n",
 315 |     "            print(\"Sucessfully Imputed Simple Imputer \")\n",
 316 |     "            print(\"#########################################################################\")\n",
 317 |     "                  \n",
 318 |     "                  \n",
 319 |     "        option = \"None\" #This data does not have categorical features so dataOption is none      \n",
 320 |     "        return X,Y,option\n",
 321 |     "       \n",
 322 |     "#############################################################################\n",
 323 |     "#END OF MISSING DATA\n",
 324 |     "#############################################################################"
 325 |    ]
 326 |   },
 327 |   {
 328 |    "cell_type": "markdown",
 329 |    "metadata": {},
 330 |    "source": [
 331 |     "# Encoding Labels"
 332 |    ]
 333 |   },
 334 |   {
 335 |    "cell_type": "code",
 336 |    "execution_count": null,
 337 |    "metadata": {},
 338 |    "outputs": [],
 339 |    "source": [
 340 |     "def encodingLabels(Y,dataOption,datasetOption):# Encoding the labels with multi class or binary\n",
 341 |     "    \n",
 342 |     "    if datasetOption == \"1\": #Check if the data set choosen is NSL-KDD or IDS2017\n",
 343 |     "        \n",
 344 |     "        if dataOption == \"1\" or dataOption == \"2\" or dataOption == \"3\":\n",
 345 |     "            \n",
 346 |     "            while True:\n",
 347 |     "                print(\"\\n\\n#########################################################################\")\n",
 348 |     "                print(\"Encoding Menu\")\n",
 349 |     "                print(\"#########################################################################\")\n",
 350 |     "                print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n",
 351 |     "                print(\"2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4\")\n",
 352 |     "                encodeOption = input(\"Enter option :\") \n",
 353 |     "    \n",
 354 |     "                if encodeOption == \"1\" or encodeOption == \"2\":\n",
 355 |     "                    break\n",
 356 |     "                else:\n",
 357 |     "                    \n",
 358 |     "                    print(\"Error\\n\\n\")\n",
 359 |     "    \n",
 360 |     "    \n",
 361 |     "            if encodeOption == \"1\":\n",
 362 |     "                #Binary Categories\n",
 363 |     "                attackType  = {'normal':\"normal\", 'neptune':\"abnormal\", 'warezclient':\"abnormal\", 'ipsweep':\"abnormal\",'back':\"abnormal\", 'smurf':\"abnormal\", 'rootkit':\"abnormal\",'satan':\"abnormal\", 'guess_passwd':\"abnormal\",'portsweep':\"abnormal\",'teardrop':\"abnormal\",'nmap':\"abnormal\",'pod':\"abnormal\",'ftp_write':\"abnormal\",'multihop':\"abnormal\",'buffer_overflow':\"abnormal\",'imap':\"abnormal\",'warezmaster':\"abnormal\",'phf':\"abnormal\",'land':\"abnormal\",'loadmodule':\"abnormal\",'spy':\"abnormal\",'perl':\"abnormal\"} \n",
 364 |     "                attackEncodingCluster  = {'normal':0,'abnormal':1}\n",
 365 |     "    \n",
 366 |     "                Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data\n",
 367 |     "                Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal\n",
 368 |     "                return Y,encodeOption\n",
 369 |     "    \n",
 370 |     "            elif encodeOption == \"2\":\n",
 371 |     "                #4 Main Categories\n",
 372 |     "                #normal = 0\n",
 373 |     "                #DoS = 1\n",
 374 |     "                #Probe = 2\n",
 375 |     "                #R2L = 3\n",
 376 |     "                #U2R = 4\n",
 377 |     "                attackType  = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} \n",
 378 |     "                attackEncodingCluster  = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main Categories\n",
 379 |     "    \n",
 380 |     "                Y[:] = [attackType[item] for item in Y[:]] #Encoding the main 4 categories\n",
 381 |     "                Y[:] = [attackEncodingCluster[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n",
 382 |     "                return Y,encodeOption\n",
 383 |     "        else:\n",
 384 |     "            return Y\n",
 385 |     "    \n",
 386 |     "    \n",
 387 |     "    elif datasetOption == \"2\":#Check if the data set choosen is NSL-KDD or IDS2017\n",
 388 |     "        print(\"\\n\\n#########################################################################\")\n",
 389 |     "        print(\"Encoding Menu\")\n",
 390 |     "        print(\"#########################################################################\")\n",
 391 |     "        print(\"1.Binary true labels: normal = 0, abnormal = 1\")\n",
 392 |     "        print(\"2. Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5\")\n",
 393 |     "        encodeOption = input(\"Enter option :\")\n",
 394 |     "\n",
 395 |     "        if encodeOption == \"1\":\n",
 396 |     "            Y = np.array(Y,dtype= object)\n",
 397 |     "            attackEncoding  = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories\n",
 398 |     "            Y[:] = [attackEncoding[item] for item in Y[:]]# Changing the names of attacks into 4 main categories\n",
 399 |     "    \n",
 400 |     "            return Y,encodeOption\n",
 401 |     "        \n",
 402 |     "        elif encodeOption == \"2\":\n",
 403 |     "            Y = np.array(Y,dtype= object)\n",
 404 |     "            attackType  = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories\n",
 405 |     "            attackEncoding = {'normal': 0, 'abnormal': 1}\n",
 406 |     "            \n",
 407 |     "            Y[:] = [attackType[item] for item in Y[:]]# Changing the names of attacks into binary categories\n",
 408 |     "            Y[:] = [attackEncoding[item] for item in Y[:]]# Changing the names of attacks into binary categories\n",
 409 |     "            return Y,encodeOption\n",
 410 |     "        \n",
 411 |     "        else:\n",
 412 |     "            return Y"
 413 |    ]
 414 |   },
 415 |   {
 416 |    "cell_type": "markdown",
 417 |    "metadata": {},
 418 |    "source": [
 419 |     "# One Hot Encoding"
 420 |    ]
 421 |   },
 422 |   {
 423 |    "cell_type": "code",
 424 |    "execution_count": null,
 425 |    "metadata": {},
 426 |    "outputs": [],
 427 |    "source": [
 428 |     "#Encoding the categorical features using one hot encoding and using Main attacks categories or binary categories\n",
 429 |     "def oneHotEncodingData(X,dataOption):\n",
 430 |     "        \n",
 431 |     "    from sklearn.preprocessing import OneHotEncoder\n",
 432 |     "    from sklearn.compose import ColumnTransformer\n",
 433 |     "    #We use One hot encoding to pervent the machine learning to atribute the categorical data in order. \n",
 434 |     "    #What one hot encoding(ColumnTransformer) does is, it takes a column which has categorical data, \n",
 435 |     "    #which has been label encoded, and then splits the column into multiple columns.\n",
 436 |     "    #The numbers are replaced by 1s and 0s, depending on which column has what value\n",
 437 |     "    #We don't need to do a label encoded step because ColumnTransformer do one hot encode and label encode!\n",
 438 |     "    #Encoding the Independient Variable\n",
 439 |     "    if dataOption == \"1\": #Only for dataset with Categorical Data\n",
 440 |     "        transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1,2,3])], remainder=\"passthrough\")\n",
 441 |     "        X = transform.fit_transform(X)\n",
 442 |     "        print(\"\\n\\n#########################################################################\")\n",
 443 |     "        print(\"Data has been successfully One Hot Encoded\")\n",
 444 |     "        print(\"#########################################################################\")\n",
 445 |     "\n",
 446 |     "        return X\n",
 447 |     "    elif dataOption == \"3\": #Only for risk data, because we don't have risk values for protocol feature we do one hot encoding for only that feature and the other ones we do risk value encoding\n",
 448 |     "        transform = ColumnTransformer([(\"Servers\", OneHotEncoder(categories = \"auto\"), [1])], remainder=\"passthrough\")\n",
 449 |     "        X = transform.fit_transform(X)\n",
 450 |     "        print(\"\\n\\n#########################################################################\")\n",
 451 |     "        print(\"Data has been successfully One Hot Encoded\")\n",
 452 |     "        print(\"#########################################################################\")\n",
 453 |     "        return X\n",
 454 |     "        \n",
 455 |     "    else:\n",
 456 |     "        return X #return data with no changes"
 457 |    ]
 458 |   },
 459 |   {
 460 |    "cell_type": "markdown",
 461 |    "metadata": {},
 462 |    "source": [
 463 |     "# Risk Encoding"
 464 |    ]
 465 |   },
 466 |   {
 467 |    "cell_type": "code",
 468 |    "execution_count": null,
 469 |    "metadata": {},
 470 |    "outputs": [],
 471 |    "source": [
 472 |     "def riskEncodingData(X,dataOption):#Risk encoding categorical features\n",
 473 |     "    #Manually Encoding for the attacks types only\n",
 474 |     "    if dataOption == \"3\": #if data option is risk Value\n",
 475 |     "        X = pd.DataFrame(X)\n",
 476 |     "        servers  = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1}\n",
 477 |     "        X[2] = [servers[item] for item in X[2]]\n",
 478 |     "\n",
 479 |     "        servers_Error  = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} \n",
 480 |     "        X[3] = [servers_Error[item] for item in X[3]]\n",
 481 |     "\n",
 482 |     "        print(\"\\n\\n#########################################################################\")\n",
 483 |     "        print(\"Data has been successfully risk Encoded\")\n",
 484 |     "        print(\"#########################################################################\")\n",
 485 |     "\n",
 486 |     "        return X\n",
 487 |     "        \n",
 488 |     "    else:\n",
 489 |     "        \n",
 490 |     "        return X #return data with no changes"
 491 |    ]
 492 |   },
 493 |   {
 494 |    "cell_type": "markdown",
 495 |    "metadata": {},
 496 |    "source": [
 497 |     "# Scaling "
 498 |    ]
 499 |   },
 500 |   {
 501 |    "cell_type": "code",
 502 |    "execution_count": null,
 503 |    "metadata": {},
 504 |    "outputs": [],
 505 |    "source": [
 506 |     "def scaling(X):#Scalign the data with the normalize method, we scale the data to have it in the same range for the experiments\n",
 507 |     "    \n",
 508 |     "    \n",
 509 |     "\n",
 510 |     "    while True:\n",
 511 |     "            \n",
 512 |     "            decision = input(\"Scale data [y/n]:\")\n",
 513 |     "            \n",
 514 |     "            if decision == \"y\" or  decision == \"n\":\n",
 515 |     "                break\n",
 516 |     "            else:\n",
 517 |     "                \n",
 518 |     "                print(\"Error\\n\\n\")\n",
 519 |     "    \n",
 520 |     "    if decision == \"y\":\n",
 521 |     "        \n",
 522 |     "            from sklearn.preprocessing import MinMaxScaler\n",
 523 |     "            #Transforms features by scaling each feature to a given range.\n",
 524 |     "            X =  MinMaxScaler(feature_range=(0, 1)).fit_transform(X)\n",
 525 |     "            print(\"\\n\\n#########################################################################\")\n",
 526 |     "            print(\"Data has been successfully scaled.\")\n",
 527 |     "            print(\"#########################################################################\")\n",
 528 |     "            return X\n",
 529 |     "        \n",
 530 |     "    else:\n",
 531 |     "        return X\n"
 532 |    ]
 533 |   },
 534 |   {
 535 |    "cell_type": "markdown",
 536 |    "metadata": {},
 537 |    "source": [
 538 |     "# Shuffle"
 539 |    ]
 540 |   },
 541 |   {
 542 |    "cell_type": "code",
 543 |    "execution_count": null,
 544 |    "metadata": {},
 545 |    "outputs": [],
 546 |    "source": [
 547 |     "def shuffleData(X):# currently a bug, if we do shuffleling the experiments resutls are not good, the order of the data does not affect the results\n",
 548 |     "\n",
 549 |     "    from sklearn.utils import shuffle\n",
 550 |     "    while True:\n",
 551 |     "        option = input(\"Shuffle data [y]/[n]:\")\n",
 552 |     "        \n",
 553 |     "        if option == \"y\" or option == \"n\":\n",
 554 |     "            break\n",
 555 |     "        else:\n",
 556 |     "            \n",
 557 |     "            print(\"Error\\n\\n\")\n",
 558 |     "    \n",
 559 |     "    if option == \"y\":\n",
 560 |     "        \n",
 561 |     "        X = pd.DataFrame(X)\n",
 562 |     "        X = shuffle(X)\n",
 563 |     "        X.reset_index(inplace=True,drop=True)\n",
 564 |     "        X = np.array(X)\n",
 565 |     "        \n",
 566 |     "        print(\"\\n\\n#########################################################################\")\n",
 567 |     "        print(\"Data has been successfully shuffled.\")\n",
 568 |     "        print(\"#########################################################################\")\n",
 569 |     "        return X\n",
 570 |     "    else:\n",
 571 |     "        \n",
 572 |     "        return X"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "markdown",
 577 |    "metadata": {},
 578 |    "source": [
 579 |     "# KMEANS"
 580 |    ]
 581 |   },
 582 |   {
 583 |    "cell_type": "code",
 584 |    "execution_count": null,
 585 |    "metadata": {},
 586 |    "outputs": [],
 587 |    "source": [
 588 |     "def kmeansClustering(X,Y):#K-means algorithm \n",
 589 |     "    from sklearn.cluster import KMeans\n",
 590 |     "\n",
 591 |     "    while True:\n",
 592 |     "        print(\"\\n\\n#########################################################################\")\n",
 593 |     "        print(\"KMEANS ALGORITHM\")\n",
 594 |     "        print(\"#########################################################################\")\n",
 595 |     "              \n",
 596 |     "        nClusters = input(\"Number of clusters:\")\n",
 597 |     "        \n",
 598 |     "        try:\n",
 599 |     "            nClusters = int(nClusters)\n",
 600 |     "            \n",
 601 |     "        except ValueError:\n",
 602 |     "            \n",
 603 |     "            print(\"Error\\n\\n\")\n",
 604 |     "            \n",
 605 |     "        if type(nClusters) == int:\n",
 606 |     "            n = 0\n",
 607 |     "            clusters = []\n",
 608 |     "            \n",
 609 |     "            while n < nClusters:#Converting nCluster into an array of n clusters [n] for use it later\n",
 610 |     "                clusters.append(n)\n",
 611 |     "                n+=1\n",
 612 |     "            break\n",
 613 |     "        \n",
 614 |     "    while True:\n",
 615 |     "        init = input(\"Initialization method [k-means++,random]:\")\n",
 616 |     "        \n",
 617 |     "        if init == \"k-means++\" or init == \"random\":\n",
 618 |     "            break\n",
 619 |     "\n",
 620 |     "    print(\"\\nClustering...\\n\")\n",
 621 |     "    \n",
 622 |     "    start_time = time.time()\n",
 623 |     "    KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0)\n",
 624 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
 625 |     "    print(\"Data Successfully Clustered\")\n",
 626 |     "    kmeans = KMEANS.fit(X)\n",
 627 |     "    Z = kmeans.labels_\n",
 628 |     "    inertia = KMEANS.inertia_\n",
 629 |     "    #Kmeans Results\n",
 630 |     "    kmeansR = pd.crosstab(Y,Z)\n",
 631 |     "    maxVal = kmeansR.idxmax()\n",
 632 |     "    \n",
 633 |     "    return Z,clusters,kmeansR,maxVal,inertia\n"
 634 |    ]
 635 |   },
 636 |   {
 637 |    "cell_type": "markdown",
 638 |    "metadata": {},
 639 |    "source": [
 640 |     "# Kmeans F1 Score"
 641 |    ]
 642 |   },
 643 |   {
 644 |    "cell_type": "code",
 645 |    "execution_count": null,
 646 |    "metadata": {},
 647 |    "outputs": [],
 648 |    "source": [
 649 |     "def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans\n",
 650 |     "    from sklearn.metrics import f1_score\n",
 651 |     "    #Encoding data to F-score\n",
 652 |     "    \n",
 653 |     "    \n",
 654 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 655 |     "    n = 0 # counter\n",
 656 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 657 |     "    f1 = 0 #f1score\n",
 658 |     "    average = ''\n",
 659 |     "    \n",
 660 |     "    while n < len(clusters):# while counter < number of clusters\n",
 661 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 662 |     "        n+=1\n",
 663 |     "        \n",
 664 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 665 |     "            \n",
 666 |     "    Y = np.array(Y,dtype = int) # Converting labels into a int array\n",
 667 |     "    \n",
 668 |     "    while True:\n",
 669 |     "        \n",
 670 |     "        average = input(\"Average Method[weighted,micro,macro,binary]:\")\n",
 671 |     "        \n",
 672 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == 'binary':\n",
 673 |     "            break\n",
 674 |     "    #score metric   \n",
 675 |     "    f1 = f1_score(Y,Z, average = average) #Forget the labels that where not predicted and gives lables that were predicted at least once\n",
 676 |     "    \n",
 677 |     "    return f1,dictionaryCluster"
 678 |    ]
 679 |   },
 680 |   {
 681 |    "cell_type": "markdown",
 682 |    "metadata": {},
 683 |    "source": [
 684 |     "# KMEANS Normal Mutial Info"
 685 |    ]
 686 |   },
 687 |   {
 688 |    "cell_type": "code",
 689 |    "execution_count": null,
 690 |    "metadata": {},
 691 |    "outputs": [],
 692 |    "source": [
 693 |     "\n",
 694 |     "def kNMI(Z,Y,maxVal,clusters):\n",
 695 |     "    from sklearn.metrics import normalized_mutual_info_score\n",
 696 |     "    \n",
 697 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 698 |     "    n = 0 # counter\n",
 699 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 700 |     "    NMI = 0\n",
 701 |     "    average = ''\n",
 702 |     "    \n",
 703 |     "    while n < len(clusters):# while counter < number of clusters\n",
 704 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 705 |     "        n+=1\n",
 706 |     "        \n",
 707 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 708 |     "    \n",
 709 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 710 |     "    \n",
 711 |     "    while True:\n",
 712 |     "        \n",
 713 |     "        average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n",
 714 |     "        \n",
 715 |     "        if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n",
 716 |     "            break\n",
 717 |     "    #Score metric \n",
 718 |     "    NMI = normalized_mutual_info_score(Y, Z, average_method = average)\n",
 719 |     "    \n",
 720 |     "    return NMI,dictionaryCluster\n"
 721 |    ]
 722 |   },
 723 |   {
 724 |    "cell_type": "markdown",
 725 |    "metadata": {},
 726 |    "source": [
 727 |     "# KMEANS Adjusted Random Score"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "code",
 732 |    "execution_count": null,
 733 |    "metadata": {},
 734 |    "outputs": [],
 735 |    "source": [
 736 |     "def kARS(Z,Y,maxVal,clusters):\n",
 737 |     "    from sklearn.metrics import adjusted_rand_score\n",
 738 |     "    \n",
 739 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 740 |     "    n = 0 # counter\n",
 741 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 742 |     "    ars = 0\n",
 743 |     "    \n",
 744 |     "    while n < len(clusters):# while counter < number of clusters\n",
 745 |     "        dictionaryCluster[clusters[n]] = maxVal[n] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 746 |     "        n+=1\n",
 747 |     "        \n",
 748 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 749 |     "    \n",
 750 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 751 |     "    \n",
 752 |     "    #score metric\n",
 753 |     "    ars = adjusted_rand_score(Y, Z)\n",
 754 |     "    \n",
 755 |     "    return ars,dictionaryCluster"
 756 |    ]
 757 |   },
 758 |   {
 759 |    "cell_type": "markdown",
 760 |    "metadata": {},
 761 |    "source": [
 762 |     "# DBSCAN"
 763 |    ]
 764 |   },
 765 |   {
 766 |    "cell_type": "code",
 767 |    "execution_count": null,
 768 |    "metadata": {},
 769 |    "outputs": [],
 770 |    "source": [
 771 |     "def dbscanClustering(X,Y):#DBSCAN algorithm\n",
 772 |     "    from sklearn.cluster import DBSCAN\n",
 773 |     "    \n",
 774 |     "    while True:\n",
 775 |     "        \n",
 776 |     "        print(\"\\n\\n#########################################################################\")\n",
 777 |     "        print(\"DBSCAN ALGORITHM\")\n",
 778 |     "        print(\"#########################################################################\")\n",
 779 |     "              \n",
 780 |     "        epsilon = input(\"epsilon[Decimal]:\")\n",
 781 |     "        \n",
 782 |     "        try:\n",
 783 |     "            epsilon = float(epsilon)\n",
 784 |     "            \n",
 785 |     "        except ValueError:\n",
 786 |     "            \n",
 787 |     "            print(\"Enter a Decimal number\")\n",
 788 |     "            \n",
 789 |     "            \n",
 790 |     "        if type(epsilon) == float:\n",
 791 |     "            break\n",
 792 |     "        \n",
 793 |     "    while True:\n",
 794 |     "        minSamples = input(\"Min Samples[Integer]:\")\n",
 795 |     "        \n",
 796 |     "        try:\n",
 797 |     "            minSamples = int(minSamples)\n",
 798 |     "            \n",
 799 |     "        except ValueError:\n",
 800 |     "            \n",
 801 |     "            print(\"Enter a Integer Number\")\n",
 802 |     "            \n",
 803 |     "        if type(minSamples) == int:\n",
 804 |     "            break\n",
 805 |     "        \n",
 806 |     "    while True:\n",
 807 |     "        algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n",
 808 |     "            \n",
 809 |     "        if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n",
 810 |     "            break\n",
 811 |     "        \n",
 812 |     "        else:\n",
 813 |     "            print(\"Error\\n\\n\")\n",
 814 |     "            \n",
 815 |     "    \n",
 816 |     "    print(\"\\nClustering...\\n\")\n",
 817 |     "\n",
 818 |     "    #Compute DBSCAN\n",
 819 |     "    start_time = time.time() \n",
 820 |     "    db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X)\n",
 821 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
 822 |     "    print(\"Data Successfully Clustered\")\n",
 823 |     "    \n",
 824 |     "    \n",
 825 |     "    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)\n",
 826 |     "    core_samples_mask[db.core_sample_indices_] = True\n",
 827 |     "    \n",
 828 |     "    Z = db.labels_\n",
 829 |     "    # Number of clusters in labels, ignoring noise if present.\n",
 830 |     "    n_clusters = len(set(Z))\n",
 831 |     "    n_noise_ = list(Z).count(-1)\n",
 832 |     "    \n",
 833 |     "    n = -1  # DBSCAN return index -1 cluster\n",
 834 |     "    clusters = []\n",
 835 |     "    while n + 1 < n_clusters:\n",
 836 |     "        clusters.append(n)\n",
 837 |     "        n += 1\n",
 838 |     "    \n",
 839 |     "    #DBSCAN Results\n",
 840 |     "    dbscanR = pd.crosstab(Y,Z)\n",
 841 |     "    maxVal = dbscanR.idxmax()\n",
 842 |     "    \n",
 843 |     "    return Z,clusters,n_noise_,dbscanR,maxVal"
 844 |    ]
 845 |   },
 846 |   {
 847 |    "cell_type": "markdown",
 848 |    "metadata": {},
 849 |    "source": [
 850 |     "# DBSCAN F1 Score"
 851 |    ]
 852 |   },
 853 |   {
 854 |    "cell_type": "code",
 855 |    "execution_count": null,
 856 |    "metadata": {},
 857 |    "outputs": [],
 858 |    "source": [
 859 |     "def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN\n",
 860 |     "    from sklearn.metrics import f1_score\n",
 861 |     "    #Encoding data to F-score\n",
 862 |     "    \n",
 863 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 864 |     "    n = 0 # counter\n",
 865 |     "    c = -1 # - counter max Value has negative index\n",
 866 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 867 |     "    f1 = 0\n",
 868 |     "    average = ''\n",
 869 |     "    \n",
 870 |     "    while n < len(clusters):# while counter < number of clusters\n",
 871 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 872 |     "        n+=1\n",
 873 |     "        c+=1\n",
 874 |     "    \n",
 875 |     "        \n",
 876 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
 877 |     "    \n",
 878 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 879 |     "    while True:\n",
 880 |     "        \n",
 881 |     "        average = input(\"Average Method[weighted,micro,macro]:\")\n",
 882 |     "        \n",
 883 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n",
 884 |     "            break\n",
 885 |     "        \n",
 886 |     "        else:\n",
 887 |     "            \n",
 888 |     "            print(\"Error\\n\\n\")\n",
 889 |     "    #score metric\n",
 890 |     "    f1 = f1_score(Y,Z, average = average)\n",
 891 |     "    return f1,dictionaryCluster"
 892 |    ]
 893 |   },
 894 |   {
 895 |    "cell_type": "markdown",
 896 |    "metadata": {},
 897 |    "source": [
 898 |     "# DBSCAN Mutual Info Score"
 899 |    ]
 900 |   },
 901 |   {
 902 |    "cell_type": "code",
 903 |    "execution_count": null,
 904 |    "metadata": {},
 905 |    "outputs": [],
 906 |    "source": [
 907 |     "def dbNMI(Z,Y,clusters,maxVal):# Mutual info score for dbscan\n",
 908 |     "    from sklearn.metrics import normalized_mutual_info_score\n",
 909 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 910 |     "    n = 0 # counter\n",
 911 |     "    c = -1 # - counter max Value has negative index\n",
 912 |     "    NMI = 0\n",
 913 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 914 |     "    average = ''\n",
 915 |     "    \n",
 916 |     "    while n < len(clusters):# while counter < number of clusters\n",
 917 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 918 |     "        n+=1\n",
 919 |     "        c+=1\n",
 920 |     "    \n",
 921 |     "    Y = np.array(Y,dtype = int) #Making sure that labels are in a int array\n",
 922 |     "\n",
 923 |     "    while True:\n",
 924 |     "        \n",
 925 |     "        average = input(\"Average Method[geometric,min,arithmetic,max]:\")\n",
 926 |     "        \n",
 927 |     "        if average == \"geometric\" or average == \"min\" or average == \"arithmetic\" or average == \"max\":\n",
 928 |     "            break\n",
 929 |     "        else:\n",
 930 |     "            \n",
 931 |     "            print(\"Error\\n\\n\")\n",
 932 |     "    #score metric\n",
 933 |     "    NMI = normalized_mutual_info_score(Y, Z, average_method= average)\n",
 934 |     "    \n",
 935 |     "    return NMI,dictionaryCluster"
 936 |    ]
 937 |   },
 938 |   {
 939 |    "cell_type": "markdown",
 940 |    "metadata": {},
 941 |    "source": [
 942 |     "# DBSCAN Adjusted Random Score"
 943 |    ]
 944 |   },
 945 |   {
 946 |    "cell_type": "code",
 947 |    "execution_count": null,
 948 |    "metadata": {},
 949 |    "outputs": [],
 950 |    "source": [
 951 |     "def dbARS(Z,Y,clusters,maxVal): # adjusted rand score for dbscan\n",
 952 |     "    from sklearn.metrics import adjusted_rand_score\n",
 953 |     "    \n",
 954 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
 955 |     "    n = 0 # counter\n",
 956 |     "    c = -1 # - counter max Value has negative index\n",
 957 |     "    ars = 0\n",
 958 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
 959 |     "    \n",
 960 |     "    while n < len(clusters):# while counter < number of clusters\n",
 961 |     "        dictionaryCluster[clusters[n]] = maxVal[c] #creating key(cluster index) with value (max number of the clustering results) for every iteration\n",
 962 |     "        n+=1\n",
 963 |     "        c+=1\n",
 964 |     "    #score metric\n",
 965 |     "    ars = adjusted_rand_score(Y,Z)\n",
 966 |     "    \n",
 967 |     "    return ars,dictionaryCluster"
 968 |    ]
 969 |   },
 970 |   {
 971 |    "cell_type": "markdown",
 972 |    "metadata": {},
 973 |    "source": [
 974 |     "# Isolation Forest"
 975 |    ]
 976 |   },
 977 |   {
 978 |    "cell_type": "code",
 979 |    "execution_count": null,
 980 |    "metadata": {},
 981 |    "outputs": [],
 982 |    "source": [
 983 |     "def isolationForest(X,Y):# isolation forest algorithm\n",
 984 |     "    from sklearn.ensemble import IsolationForest\n",
 985 |     "    \n",
 986 |     "    while True:\n",
 987 |     "        contamination = input(\"Contamination[Float 0 to 0.5]: \")\n",
 988 |     "        \n",
 989 |     "        try:\n",
 990 |     "            contamination = float(contamination)\n",
 991 |     "            \n",
 992 |     "        except ValueError:\n",
 993 |     "            \n",
 994 |     "            print(\"Enter a Number\")\n",
 995 |     "            \n",
 996 |     "        if type(contamination) == float and (contamination >= 0 and contamination <= 0.5):\n",
 997 |     "            break\n",
 998 |     "    \n",
 999 |     "    print(\"\\nClustering...\\n\")   \n",
1000 |     "    \n",
1001 |     "    start_time = time.time() \n",
1002 |     "    Z = IsolationForest(max_samples = \"auto\",behaviour = \"new\",contamination = contamination).fit_predict(X)\n",
1003 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
1004 |     "    \n",
1005 |     "    Z = np.array(Z,dtype = object)\n",
1006 |     "    \n",
1007 |     "    ifR = pd.crosstab(Y,Z)\n",
1008 |     "    ifR = pd.DataFrame(ifR)\n",
1009 |     "    maxVal = ifR.idxmax()\n",
1010 |     "    \n",
1011 |     "    n = -1  # Isolation Forest return index -1 and 1 cluster\n",
1012 |     "    clusters = []\n",
1013 |     "    while n < len(ifR.columns):\n",
1014 |     "        clusters.append(n)\n",
1015 |     "        n += 2\n",
1016 |     "        \n",
1017 |     "    return Z,ifR,maxVal,clusters"
1018 |    ]
1019 |   },
1020 |   {
1021 |    "cell_type": "markdown",
1022 |    "metadata": {},
1023 |    "source": [
1024 |     "# Isolation Forest F1 Score"
1025 |    ]
1026 |   },
1027 |   {
1028 |    "cell_type": "code",
1029 |    "execution_count": null,
1030 |    "metadata": {},
1031 |    "outputs": [],
1032 |    "source": [
1033 |     "def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest\n",
1034 |     "    from sklearn.metrics import f1_score\n",
1035 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
1036 |     "    \n",
1037 |     "    n = 0 # counter\n",
1038 |     "    c = -1 # - counter max Value has negative index\n",
1039 |     "    f1 = 0\n",
1040 |     "    average = ''\n",
1041 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
1042 |     "\n",
1043 |     "    \n",
1044 |     "    while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n",
1045 |     "        dictionaryCluster[clusters[n]] = maxVal[c] \n",
1046 |     "        n+=1\n",
1047 |     "        c+=2\n",
1048 |     "        \n",
1049 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
1050 |     "    \n",
1051 |     "    Y = np.array(Y,dtype = int)\n",
1052 |     "    Z = np.array(Z,dtype = int)\n",
1053 |     "    \n",
1054 |     "    while True:\n",
1055 |     "        \n",
1056 |     "        average = input(\"Average Method[weighted,micro,macro]:\")\n",
1057 |     "        \n",
1058 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\":\n",
1059 |     "            break\n",
1060 |     "        \n",
1061 |     "        else:\n",
1062 |     "            \n",
1063 |     "            print(\"Error\\n\\n\")\n",
1064 |     "    # score metric\n",
1065 |     "    f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n",
1066 |     "    \n",
1067 |     "    return f1,dictionaryCluster"
1068 |    ]
1069 |   },
1070 |   {
1071 |    "cell_type": "markdown",
1072 |    "metadata": {},
1073 |    "source": [
1074 |     "# Local Outlier Factor"
1075 |    ]
1076 |   },
1077 |   {
1078 |    "cell_type": "code",
1079 |    "execution_count": null,
1080 |    "metadata": {},
1081 |    "outputs": [],
1082 |    "source": [
1083 |     "def LOF(X,Y):# Local outlier factor algorithm\n",
1084 |     "    from sklearn.neighbors import LocalOutlierFactor \n",
1085 |     "    \n",
1086 |     "    while True:\n",
1087 |     "        contamination = input(\"Contamination[Float 0 to 0.5]: \")\n",
1088 |     "        \n",
1089 |     "        try:\n",
1090 |     "            contamination = float(contamination)\n",
1091 |     "            \n",
1092 |     "        except ValueError:\n",
1093 |     "            \n",
1094 |     "            print(\"Enter a Number\")\n",
1095 |     "            \n",
1096 |     "        if type(contamination) == float and (contamination > 0 and contamination <= 0.5):\n",
1097 |     "            break\n",
1098 |     "        \n",
1099 |     "    while True:\n",
1100 |     "        algorithm = input(\"Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:\")\n",
1101 |     "            \n",
1102 |     "        if algorithm == \"auto\" or algorithm == \"ball_tree\" or algorithm == \"kd_tree\" or algorithm == \"brute\":\n",
1103 |     "            break\n",
1104 |     "        else:\n",
1105 |     "            \n",
1106 |     "            print(\"Error\\n\\n\")\n",
1107 |     "            \n",
1108 |     "    print(\"\\nClustering...\\n\")\n",
1109 |     "    \n",
1110 |     "    start_time = time.time() \n",
1111 |     "    lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X)\n",
1112 |     "    print(\"\\n\\nRun Time ->\",\"--- %s seconds ---\" % (time.time() - start_time))\n",
1113 |     "    \n",
1114 |     "    lofR = pd.crosstab(Y,lof)\n",
1115 |     "    maxVal = lofR.idxmax()\n",
1116 |     "    \n",
1117 |     "    \n",
1118 |     "    n = -1  # LOF return index -1 and 1 cluster\n",
1119 |     "    clusters = []\n",
1120 |     "    while n < len(lofR.columns):\n",
1121 |     "        clusters.append(n)\n",
1122 |     "        n += 2\n",
1123 |     "    \n",
1124 |     "    \n",
1125 |     "    \n",
1126 |     "    return lof,lofR,maxVal,clusters"
1127 |    ]
1128 |   },
1129 |   {
1130 |    "cell_type": "markdown",
1131 |    "metadata": {},
1132 |    "source": [
1133 |     "# Local Outlier Factor F1 Score"
1134 |    ]
1135 |   },
1136 |   {
1137 |    "cell_type": "code",
1138 |    "execution_count": null,
1139 |    "metadata": {},
1140 |    "outputs": [],
1141 |    "source": [
1142 |     "def lofF1(Z,Y,clusters,maxVal): # f1 score for local outlier factor\n",
1143 |     "    from sklearn.metrics import f1_score\n",
1144 |     "    \n",
1145 |     "    # This part of the code automatically assign the max-ocurring instance in each found cluster to that specific found cluster,in order to evaluate the clustering with greater ease.\n",
1146 |     "    n = 0 # counter\n",
1147 |     "    c = -1 # - counter max Value has negative index\n",
1148 |     "    f1 = 0\n",
1149 |     "    dictionaryCluster  = {} # creating an empty dictionary \n",
1150 |     "    \n",
1151 |     "    while n < len(clusters): # Since we got -1 and 1 clusters , in order to assing the corrects result counter starts at -1 and it increments by 2 so it can have the 1 index of maxLOFvalue\n",
1152 |     "        dictionaryCluster[clusters[n]] = maxVal[c] \n",
1153 |     "        n+=1\n",
1154 |     "        c+=2\n",
1155 |     "        \n",
1156 |     "    Z[:] = [dictionaryCluster[item] for item in Z[:]] # match key with the index of klabels and replace it with key value\n",
1157 |     "    Y = np.array(Y,dtype = int)\n",
1158 |     "    Z = np.array(Z,dtype = int)\n",
1159 |     "    while True:\n",
1160 |     "        \n",
1161 |     "        average = input(\"Average Method[weighted,None,micro,macro]:\")\n",
1162 |     "        \n",
1163 |     "        if average == \"weighted\" or average == \"micro\" or average == \"macro\" or average == \"None\":\n",
1164 |     "            break\n",
1165 |     "        \n",
1166 |     "        else:\n",
1167 |     "            \n",
1168 |     "            print(\"Error\\n\\n\")\n",
1169 |     "    f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']\n",
1170 |     "    \n",
1171 |     "    return f1,dictionaryCluster"
1172 |    ]
1173 |   },
1174 |   {
1175 |    "cell_type": "markdown",
1176 |    "metadata": {},
1177 |    "source": [
1178 |     "# Calling Functions"
1179 |    ]
1180 |   },
1181 |   {
1182 |    "cell_type": "code",
1183 |    "execution_count": null,
1184 |    "metadata": {},
1185 |    "outputs": [],
1186 |    "source": [
1187 |     "clear()\n",
1188 |     "#Calling the functions\n",
1189 |     "\n",
1190 |     "##########################################################################\n",
1191 |     "path,dataSetOption = getDataSet()\n",
1192 |     "#########################################################################\n",
1193 |     "#########################################################################\n",
1194 |     "dataSet = readingData(path)\n",
1195 |     "#########################################################################\n",
1196 |     "#########################################################################\n",
1197 |     "dataSet = checkMissing(dataSet)\n",
1198 |     "#########################################################################\n",
1199 |     "#########################################################################\n",
1200 |     "data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms\n",
1201 |     "#########################################################################\n",
1202 |     "#########################################################################\n",
1203 |     "try:\n",
1204 |     "    labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n",
1205 |     "except ValueError:\n",
1206 |     "    labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels\n",
1207 |     "#########################################################################\n",
1208 |     "#########################################################################\n",
1209 |     "data = riskEncodingData(data,dataOption)\n",
1210 |     "#########################################################################\n",
1211 |     "#########################################################################\n",
1212 |     "data = oneHotEncodingData(data,dataOption) #One hot Encode with the complete data\n",
1213 |     "#########################################################################\n",
1214 |     "#########################################################################\n",
1215 |     "data = scaling(data)\n",
1216 |     "#########################################################################\n",
1217 |     "#########################################################################\n",
1218 |     "data = shuffleData(data)\n",
1219 |     "#########################################################################\n",
1220 |     "\n",
1221 |     "#This menu is a option to run diferrent algorithms with the same preproceced data witouth the need of running all the code from 0 to make another experiment.\n",
1222 |     "while True:  \n",
1223 |     "    while True:\n",
1224 |     "        print(\"\\n\\n#########################################################################\")\n",
1225 |     "        print(\"Algorithm Menu\")\n",
1226 |     "        print(\"#########################################################################\")\n",
1227 |     "        \n",
1228 |     "        print(\"1.Kmeans\")\n",
1229 |     "        print(\"2.Dbscan\")\n",
1230 |     "        print(\"3.Isolation Forest\")\n",
1231 |     "        print(\"4.Local Factor Outlier\")\n",
1232 |     "        \n",
1233 |     "        algorithmOption = input(\"option:\")\n",
1234 |     "        \n",
1235 |     "        if algorithmOption == \"1\" or algorithmOption == \"2\" or algorithmOption == \"3\" or algorithmOption == \"4\":\n",
1236 |     "                break\n",
1237 |     "        else:\n",
1238 |     "            \n",
1239 |     "            print(\"Error\\n\\n\")\n",
1240 |     "\n",
1241 |     "    \n",
1242 |     "    if algorithmOption == \"1\":\n",
1243 |     "        #########################################################################\n",
1244 |     "        #KMEANS\n",
1245 |     "        klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels)\n",
1246 |     "        print(\"#########################################################################\")\n",
1247 |     "        print(\"KMEANS RESULTS\\n\\n\")\n",
1248 |     "        print(\"Clusters -> \",kClusters,\"\\n\")\n",
1249 |     "        print(\"Inertia -> \",inertia)\n",
1250 |     "        print(kmeansR,\"\\n\\n\")\n",
1251 |     "        print(\"Max True Label\",\"\\n\\n\",maxKvalue)\n",
1252 |     "        print(\"#########################################################################\")\n",
1253 |     "        #########################################################################\n",
1254 |     "        print(\"\\n\\n#########################################################################\")\n",
1255 |     "        print(\"Kmeans Score Metrics Menu\")\n",
1256 |     "        print(\"#########################################################################\")\n",
1257 |     "        \n",
1258 |     "        while True:\n",
1259 |     "            print(\"1.F1 Score\")\n",
1260 |     "            print(\"2.Normalized Mutual Info Score\")\n",
1261 |     "            print(\"3.Adjusted Rand Score\")\n",
1262 |     "        \n",
1263 |     "            kScoreOption = input(\"option:\")\n",
1264 |     "            \n",
1265 |     "            if kScoreOption == \"1\" or kScoreOption == \"2\" or kScoreOption == \"3\":\n",
1266 |     "                break\n",
1267 |     "            else:\n",
1268 |     "                \n",
1269 |     "                print(\"Error\\n\\n\")\n",
1270 |     "     \n",
1271 |     "        if kScoreOption == \"1\":\n",
1272 |     "            #########################################################################\n",
1273 |     "            #F1 Score\n",
1274 |     "            kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters)\n",
1275 |     "            print(\"\\n\\n#########################################################################\")\n",
1276 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1277 |     "            print(\"KMEANS F1 Score -> \",kmeansF1)\n",
1278 |     "            print(\"#########################################################################\")\n",
1279 |     "            #########################################################################\n",
1280 |     "        \n",
1281 |     "        elif kScoreOption == \"2\":\n",
1282 |     "            #########################################################################\n",
1283 |     "            kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters)\n",
1284 |     "            print(\"\\n\\n#########################################################################\")\n",
1285 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1286 |     "            print(\"KMEANS Normalized Mutual Info Score -> \",kmeansNMI)\n",
1287 |     "            print(\"#########################################################################\")\n",
1288 |     "            #########################################################################\n",
1289 |     "    \n",
1290 |     "        elif kScoreOption == \"3\":\n",
1291 |     "            \n",
1292 |     "            #########################################################################\n",
1293 |     "            kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters)\n",
1294 |     "            print(\"\\n\\n#########################################################################\")\n",
1295 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1296 |     "            print(\"KMEANS Adjusted Rand Score -> \",kmeansARS)\n",
1297 |     "            print(\"#########################################################################\")\n",
1298 |     "            #########################################################################\n",
1299 |     "            \n",
1300 |     "    elif algorithmOption == \"2\":\n",
1301 |     "        #########################################################################\n",
1302 |     "        #DBSCAN\n",
1303 |     "        dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) \n",
1304 |     "        print(\"#########################################################################\")\n",
1305 |     "        print(\"DBSCAN RESULTS\\n\\n\")\n",
1306 |     "        print(\"Clusters -> \",dbClusters,\"\\n\")\n",
1307 |     "        print(dbscanR,\"\\n\\n\")\n",
1308 |     "        print(\"Noise -> \",nNoises)\n",
1309 |     "        print(\"Max True Label\",\"\\n\\n\",maxDBvalue)\n",
1310 |     "        print(\"#########################################################################\")\n",
1311 |     "        #########################################################################\n",
1312 |     "        print(\"\\n\\n#########################################################################\")\n",
1313 |     "        print(\"Dscan Score Metrics Menu\")\n",
1314 |     "        print(\"#########################################################################\")\n",
1315 |     "        print(\"1.F1 Score\")\n",
1316 |     "        print(\"2.Normalized Mutual Info Score\")\n",
1317 |     "        print(\"3.Adjusted Rand Score\")\n",
1318 |     "        \n",
1319 |     "        while True:\n",
1320 |     "            \n",
1321 |     "            dbScoreOption = input(\"option:\")\n",
1322 |     "            \n",
1323 |     "            if dbScoreOption == \"1\" or dbScoreOption == \"2\" or dbScoreOption == \"3\":\n",
1324 |     "                break\n",
1325 |     "            else:\n",
1326 |     "                \n",
1327 |     "                print(\"Error\\n\\n\")\n",
1328 |     "    \n",
1329 |     "        if dbScoreOption == \"1\":\n",
1330 |     "            #########################################################################\n",
1331 |     "            #F1 Score dbscan\n",
1332 |     "            dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue)\n",
1333 |     "            print(\"\\n\\n#########################################################################\")\n",
1334 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1335 |     "            print(\"DBSCAN F1 Score -> \",dbscanF1)\n",
1336 |     "            print(\"#########################################################################\")\n",
1337 |     "            #########################################################################\n",
1338 |     "            \n",
1339 |     "        elif dbScoreOption == \"2\":\n",
1340 |     "            #########################################################################\n",
1341 |     "            dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue)\n",
1342 |     "            print(\"\\n\\n#########################################################################\")\n",
1343 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1344 |     "            print(\"DBSCAN Normalized Mutual Info Score -> \",dbscanNMI)\n",
1345 |     "            print(\"#########################################################################\")\n",
1346 |     "            #########################################################################\n",
1347 |     "            \n",
1348 |     "        elif dbScoreOption == \"3\":\n",
1349 |     "            #########################################################################\n",
1350 |     "            dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue)\n",
1351 |     "            print(\"\\n\\n#########################################################################\")\n",
1352 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1353 |     "            print(\"DBSCAN Adjusted Rand Score -> \",dbscanARS)\n",
1354 |     "            print(\"#########################################################################\")\n",
1355 |     "            #########################################################################\n",
1356 |     "        \n",
1357 |     "        \n",
1358 |     "    elif algorithmOption == \"3\":\n",
1359 |     "        #########################################################################\n",
1360 |     "        ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels)\n",
1361 |     "        print(\"#########################################################################\")\n",
1362 |     "        print(\"Isolation Forest RESULTS\\n\\n\")\n",
1363 |     "        print(\"Clusters -> \",ifNclusters,\"\\n\")\n",
1364 |     "        print(ifR,\"\\n\\n\")\n",
1365 |     "        print(\"Max True Label\",\"\\n\\n\",MaxIfVal)\n",
1366 |     "        print(\"#########################################################################\")\n",
1367 |     "        #########################################################################\n",
1368 |     "        print(\"\\n\\n#########################################################################\")\n",
1369 |     "        print(\"Isolation Forest Score Metrics Menu\")\n",
1370 |     "        print(\"#########################################################################\")\n",
1371 |     "        print(\"1.F1 Score\")\n",
1372 |     "        \n",
1373 |     "        while True:\n",
1374 |     "            \n",
1375 |     "            ifScoreOption = input(\"option:\")\n",
1376 |     "            \n",
1377 |     "            if ifScoreOption == \"1\":\n",
1378 |     "                break\n",
1379 |     "            else:\n",
1380 |     "                \n",
1381 |     "                print(\"Error\\n\\n\")\n",
1382 |     "        \n",
1383 |     "        if ifScoreOption == \"1\":\n",
1384 |     "            \n",
1385 |     "            ##########################################################################\n",
1386 |     "            isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal)\n",
1387 |     "            print(\"\\n\\n#########################################################################\")\n",
1388 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1389 |     "            print(\"Isolation Forest F1 Score -> \",isolationForestF1)\n",
1390 |     "            print(\"#########################################################################\")\n",
1391 |     "            ##########################################################################\n",
1392 |     "        \n",
1393 |     "    elif algorithmOption == \"4\":\n",
1394 |     "        #########################################################################\n",
1395 |     "        LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels)\n",
1396 |     "        print(\"#########################################################################\")\n",
1397 |     "        print(\"Local Outlier Factor RESULTS\\n\\n\")\n",
1398 |     "        print(\"Clusters -> \",lofClusters,\"\\n\")\n",
1399 |     "        print(lofR,\"\\n\\n\")\n",
1400 |     "        print(\"Max True Label\",\"\\n\\n\",maxLOFvalue)\n",
1401 |     "        print(\"#########################################################################\")\n",
1402 |     "        #########################################################################\n",
1403 |     "        print(\"\\n\\n#########################################################################\")\n",
1404 |     "        print(\"LOF Score Metrics Menu\")\n",
1405 |     "        print(\"#########################################################################\")\n",
1406 |     "        print(\"1.F1 Score\")\n",
1407 |     "        \n",
1408 |     "        while True:\n",
1409 |     "            \n",
1410 |     "            lofScoreOption = input(\"option:\")\n",
1411 |     "            \n",
1412 |     "            if lofScoreOption == \"1\":\n",
1413 |     "                break\n",
1414 |     "            else:\n",
1415 |     "                \n",
1416 |     "                print(\"Error\\n\\n\")\n",
1417 |     "        \n",
1418 |     "        if lofScoreOption == \"1\":\n",
1419 |     "            \n",
1420 |     "            ##########################################################################\n",
1421 |     "            LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue)\n",
1422 |     "            print(\"\\n\\n#########################################################################\")\n",
1423 |     "            print(\"Cluster Matchings by Maximun Intersection[Found: True] -> \",clusterAssigned)\n",
1424 |     "            print(\"LOF F1 Score -> \",LOFf1)\n",
1425 |     "            print(\"#########################################################################\")\n",
1426 |     "            ##########################################################################\n",
1427 |     "                \n",
1428 |     "    while True: # If the user want to Make a new clustering algorithm test\n",
1429 |     "        \n",
1430 |     "        decision = input(\"Try another Clustering Algorithm[y/n]:\")\n",
1431 |     "        \n",
1432 |     "        if decision == \"y\" or  decision == \"n\":\n",
1433 |     "            break\n",
1434 |     "        else:\n",
1435 |     "            \n",
1436 |     "            print(\"Error\\n\\n\")\n",
1437 |     "    \n",
1438 |     "    \n",
1439 |     "    if decision == \"n\":\n",
1440 |     "        break\n",
1441 |     "    \n",
1442 |     "    else:\n",
1443 |     "        clear()"
1444 |    ]
1445 |   }
1446 |  ],
1447 |  "metadata": {
1448 |   "kernelspec": {
1449 |    "display_name": "Python 3",
1450 |    "language": "python",
1451 |    "name": "python3"
1452 |   },
1453 |   "language_info": {
1454 |    "codemirror_mode": {
1455 |     "name": "ipython",
1456 |     "version": 3
1457 |    },
1458 |    "file_extension": ".py",
1459 |    "mimetype": "text/x-python",
1460 |    "name": "python",
1461 |    "nbconvert_exporter": "python",
1462 |    "pygments_lexer": "ipython3",
1463 |    "version": "3.7.3"
1464 |   },
1465 |   "varInspector": {
1466 |    "cols": {
1467 |     "lenName": 16,
1468 |     "lenType": 16,
1469 |     "lenVar": 40
1470 |    },
1471 |    "kernels_config": {
1472 |     "python": {
1473 |      "delete_cmd_postfix": "",
1474 |      "delete_cmd_prefix": "del ",
1475 |      "library": "var_list.py",
1476 |      "varRefreshCmd": "print(var_dic_list())"
1477 |     },
1478 |     "r": {
1479 |      "delete_cmd_postfix": ") ",
1480 |      "delete_cmd_prefix": "rm(",
1481 |      "library": "var_list.r",
1482 |      "varRefreshCmd": "cat(var_dic_list()) "
1483 |     }
1484 |    },
1485 |    "position": {
1486 |     "height": "923px",
1487 |     "left": "328px",
1488 |     "right": "20px",
1489 |     "top": "9px",
1490 |     "width": "800px"
1491 |    },
1492 |    "types_to_exclude": [
1493 |     "module",
1494 |     "function",
1495 |     "builtin_function_or_method",
1496 |     "instance",
1497 |     "_Feature"
1498 |    ],
1499 |    "window_display": false
1500 |   }
1501 |  },
1502 |  "nbformat": 4,
1503 |  "nbformat_minor": 2
1504 | }
1505 | 


--------------------------------------------------------------------------------
/CBAD.py:
--------------------------------------------------------------------------------
   1 | #@authors:jeremyperez,bethanydanner
   2 | 
   3 | #reset -f
   4 | 
   5 | import numpy as np
   6 | import pandas as pd 
   7 | import time
   8 | import os
   9 | 
  10 | clear = lambda:os.system('clear')
  11 | 
  12 | def getDataSet():
  13 |     
  14 |     while True:
  15 |         print("**************************************************")
  16 |         print("DATA SET MENU")
  17 |         print("**************************************************")
  18 |         print("1.NSL-KDD")
  19 |         print("2.IDS 2017")
  20 |         
  21 |         option = input("Option:")
  22 |         
  23 |         if option == "1" or option == "2":
  24 |             break
  25 |     
  26 |     path = input("Path of the File:")
  27 |     
  28 |     return path,option
  29 | 
  30 | def readingData(path): #Reading the Dataset
  31 |     
  32 |     while True:
  33 |         
  34 |         option = input("Dataset has feature names[y/n]:") 
  35 |         
  36 |         if option == "y" or option == "n":
  37 |             break
  38 |             
  39 |     print("\nReading Dataset...") 
  40 |         
  41 |     if option == "y":
  42 |         dataSet = pd.read_csv(path,low_memory=False)
  43 |     
  44 |     elif option == "n":
  45 |         dataSet = pd.read_csv(path, header = None,low_memory=False)
  46 |             
  47 |     return dataSet
  48 | 
  49 | 
  50 | def checkMissing(X):#Checking if the dataset given has missing values.
  51 |     isMissing = str(X.isnull().values.any()) #Using String instead of Boolean because ("cannot unpack non-iterable numpy.bool object")
  52 |     
  53 |     if isMissing == "True":
  54 |         #Replacing vales = "Infinity" with "nan" values, if any such values exist in dataset
  55 |         X = X.replace('Infinity', np.nan) 
  56 |            
  57 |         missingValIndex = []
  58 |         total = X.isnull().sum().sum()
  59 |         percent = (total / (X.count().sum() + X.isnull().sum().sum())) * 100
  60 |             
  61 |         for rows in X: #Reporting percentages of missing values in dataset
  62 |                     
  63 |             if X[rows].isnull().sum() != 0:
  64 |                 missingValIndex.append(rows)
  65 |         print("\n\n**************************************************")
  66 |         print("Data has missing values")
  67 |         print("**************************************************")
  68 |         print("Features with missing values:",missingValIndex)
  69 |         print("Total missing Values -> " , total)
  70 |         print(percent,"%")
  71 |         
  72 |         return X
  73 |     
  74 |     else:
  75 |         
  76 |         return X
  77 | 
  78 | 
  79 | #Getting the data we want to test for the clustering algorithms
  80 | def gettingVariables(dataSet,dataSetOption):
  81 |    #Obtaining features and labels for either NSL-KDD or IDS 2017 dataset.
  82 |    #Handling categorical data if NSL-KDD dataset is chosen. 
  83 |    #and handling missing values if IDS 2017 dataset is chosen. 
  84 |    
  85 |     if dataSetOption == "1":
  86 |         while True:
  87 |             print("\n\n**************************************************")
  88 |             print("Variables Menu")
  89 |             print("**************************************************")
  90 |             print("1.Data set with categorical data oneHot encoded")
  91 |             print("2.Data set with categorical data removed")
  92 |             print("3.Data set with Risk Values replacing Server Type and Flag Features; Protocol Data oneHot encoded")
  93 |             option = input("Enter option :")
  94 |             
  95 |             
  96 |             if option == "1" or option == "2" or option == "3":
  97 |                 break
  98 |             else:
  99 |                 
 100 |                 print("Error\n\n")
 101 |             
 102 |        #Getting the dependent and independent Variables
 103 |        #Removing the dificulty level feature from NSL-KDD dataset because we are not using supervised learning in this project 
 104 |        
 105 |         if option == "1":
 106 |             #Keeping categorical features in dataset in order to One Hot Encode later on 
 107 |             X = dataSet.iloc[:,:-2].values #Getting all data except for the last two columns (namely difficulty level and labels)
 108 |             Y = dataSet.iloc[:,42].values #Labels
 109 |             return X,Y,option
 110 |         
 111 |         elif option == "2":
 112 |             #Removing categorical data from the data set
 113 |             X = dataSet.iloc[:,[0,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40]].values
 114 |             Y = dataSet.iloc[:,42].values #Labels
 115 |             
 116 |             return X,Y,option
 117 |         
 118 |         elif option == "3":
 119 |             #Keeping categorical features in order to encode with risk values later on
 120 |             X = dataSet.iloc[:,:-2].values
 121 |             Y = dataSet.iloc[:,42].values #Labels
 122 |             
 123 |             return X,Y,option
 124 |     
 125 | 
 126 |     elif dataSetOption == "2":
 127 |         #############################################################################
 128 |         #GETTING VARIABLES
 129 |         #############################################################################
 130 |         missingValIndex = []
 131 |         for rows in dataSet: #Getting features index with missing values
 132 |             if dataSet[rows].isnull().sum() != 0:
 133 |                     missingValIndex.append(rows)
 134 |                 
 135 |         X = dataSet.iloc[:,:-1].values#data
 136 |         #Assigning 0,1,2...n for the feature names if names are not specified
 137 |         X = pd.DataFrame(X,columns = [' Destination Port',' Flow Duration',' Total Fwd Packets',' Total Backward Packets','Total Length of Fwd Packets',
 138 |                                       ' Total Length of Bwd Packets',' Fwd Packet Length Max',' Fwd Packet Length Min',' Fwd Packet Length Mean',' Fwd Packet Length Std',
 139 |                                       'Bwd Packet Length Max',' Bwd Packet Length Min',' Bwd Packet Length Mean',' Bwd Packet Length Std','Flow Bytes/s',' Flow Packets/s',' Flow IAT Mean',
 140 |                                       ' Flow IAT Std',' Flow IAT Max',' Flow IAT Min','Fwd IAT Total',' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min','Bwd IAT Total',' Bwd IAT Mean',
 141 |                                       ' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min','Fwd PSH Flags',' Bwd PSH Flags',' Fwd URG Flags',' Bwd URG Flags',' Fwd Header Length',' Bwd Header Length','Fwd Packets/s',
 142 |                                       ' Bwd Packets/s',' Min Packet Length',' Max Packet Length',' Packet Length Mean',' Packet Length Std',' Packet Length Variance','FIN Flag Count',' SYN Flag Count',' RST Flag Count',
 143 |                                       ' PSH Flag Count',' ACK Flag Count',' URG Flag Count',' CWE Flag Count',' ECE Flag Count',' Down/Up Ratio',' Average Packet Size',' Avg Fwd Segment Size',' Avg Bwd Segment Size',' Fwd Header Length',
 144 |                                       'Fwd Avg Bytes/Bulk',' Fwd Avg Packets/Bulk',' Fwd Avg Bulk Rate',' Bwd Avg Bytes/Bulk',' Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate','Subflow Fwd Packets',' Subflow Fwd Bytes',' Subflow Bwd Packets',' Subflow Bwd Bytes',
 145 |                                       'Init_Win_bytes_forward',' Init_Win_bytes_backward',' act_data_pkt_fwd',' min_seg_size_forward','Active Mean',' Active Std',' Active Max',' Active Min','Idle Mean',' Idle Std',' Idle Max',' Idle Min'])
 146 |         Y = dataSet.iloc[:,78].values#Labels
 147 |         
 148 |         #############################################################################
 149 |         #Variables successfully obtained 
 150 |         #############################################################################
 151 |         
 152 |     #############################################################################
 153 |     #MANAGE MISSING DATA
 154 |     #############################################################################   
 155 |      
 156 |         while True:
 157 |             print("\n\n**************************************************")
 158 |             print("Manage Missing Values ")
 159 |             print("**************************************************")
 160 |             print("1.Eliminate Catg. w/ Missing Values")
 161 |             print("2.Impute 0 for Missing Values")
 162 |             print("3.Impute Mean for Missing Values")
 163 |             print("4.Impute Median for Missing Values")
 164 |             print("5.Impute Mode for Missing Values")
 165 |             print("6.Simple Imputer")
 166 |             missingDataOption = input("Option:")
 167 |     
 168 |             if missingDataOption == "1" or missingDataOption == "2" or missingDataOption == "3" or missingDataOption == "4" or missingDataOption == "5" or missingDataOption == "6":
 169 |                 break
 170 |     
 171 |     
 172 |         if missingDataOption == "1":
 173 |             deletedColumns = []
 174 |             numColumns = len(X.columns)
 175 |             #Removing features with missing values
 176 |             for row in missingValIndex:
 177 |                 deletedColumns.append(row)
 178 |                 del X[row]
 179 |         
 180 |             print("#\n\n########################################################################")
 181 |             print("Columns Succesfully Removed")
 182 |             print(len(deletedColumns),"of",numColumns,"were deleted")
 183 |             print("Columns Names -> ",deletedColumns)
 184 |             print("#########################################################################")
 185 |     
 186 |         elif missingDataOption == "2":
 187 |             #Impute 0 for missing values
 188 |             for row in missingValIndex:
 189 |                 X[row] = X[row].fillna(0)
 190 |         
 191 |             print("\n\n#########################################################################")
 192 |             print("Sucessfully Filled Missing Values with 0")
 193 |             print("#########################################################################")
 194 |     
 195 |     
 196 |         elif missingDataOption == "3":
 197 |             #Impute mean for missing values
 198 |             for row in missingValIndex:
 199 |                 X[row] = X[row].astype(float)
 200 |                 X[row] = X[row].fillna(X[row].mean())
 201 |         
 202 |             print("\n\n#########################################################################")
 203 |             print("Sucessfully Filled Missing Values with Mean")
 204 |             print("#########################################################################")
 205 |     
 206 |         elif missingDataOption == "4":
 207 |             #Impute median for missing values
 208 |             for row in missingValIndex:
 209 |                 median = X[row].median()
 210 |                 X[row].fillna(median, inplace=True)
 211 |             print("\n\n#########################################################################")
 212 |             print("Sucessfully Filled Missing Values with Median")
 213 |             print("#########################################################################")
 214 |     
 215 |         elif missingDataOption == "5":
 216 |             #Impute mode for missing values
 217 |             for row in missingValIndex:
 218 |                 X[row] = X[row].fillna(X[row].mode()[0])
 219 |     
 220 |             print("\n\n#########################################################################")
 221 |             print("Sucessfully Filled Missing Values with Mode ")
 222 |             print("#########################################################################")
 223 |         
 224 |         elif missingDataOption == "6": 
 225 |             from sklearn.impute import SimpleImputer
 226 |             #"Imputation transformer for completing missing values."(Univariate)
 227 |             X = SimpleImputer(missing_values = np.nan, strategy='mean', fill_value=None, verbose=0, copy=True).fit_transform(X)          
 228 |             print("\n\n#########################################################################")
 229 |             print("Sucessfully Imputed Simple Imputer ")
 230 |             print("#########################################################################")
 231 |                   
 232 |                   
 233 |         option = "None" #This data does not have categorical features so dataOption is none      
 234 |         return X,Y,option
 235 |        
 236 | #############################################################################
 237 | #END OF MISSING DATA
 238 | #############################################################################
 239 |     
 240 | 
 241 | 
 242 | 
 243 | 
 244 |     
 245 | def encodingLabels(Y,dataOption,datasetOption):#Encoding the labels with multiclass or binary labels
 246 |     
 247 |     if datasetOption == "1": #Checking if the data set chosen is NSL-KDD
 248 |         
 249 |         if dataOption == "1" or dataOption == "2" or dataOption == "3":
 250 |             
 251 |             while True:
 252 |                 print("\n\n#########################################################################")
 253 |                 print("Encoding Menu")
 254 |                 print("#########################################################################")
 255 |                 print("1.Binary true labels: normal = 0, abnormal = 1")
 256 |                 print("2.Multiclass true labels: normal = 0, DoS = 1, Probe = 2, R2L = 3, U2R = 4")
 257 |                 encodeOption = input("Enter option :") 
 258 |     
 259 |                 if encodeOption == "1" or encodeOption == "2":
 260 |                     break
 261 |                 else:
 262 |                     
 263 |                     print("Error\n\n")
 264 |     
 265 |     
 266 |             if encodeOption == "1":
 267 |                 #Binary Categories
 268 |                 attackType  = {'normal':"normal", 'neptune':"abnormal", 'warezclient':"abnormal", 'ipsweep':"abnormal",'back':"abnormal", 'smurf':"abnormal", 'rootkit':"abnormal",'satan':"abnormal", 'guess_passwd':"abnormal",'portsweep':"abnormal",'teardrop':"abnormal",'nmap':"abnormal",'pod':"abnormal",'ftp_write':"abnormal",'multihop':"abnormal",'buffer_overflow':"abnormal",'imap':"abnormal",'warezmaster':"abnormal",'phf':"abnormal",'land':"abnormal",'loadmodule':"abnormal",'spy':"abnormal",'perl':"abnormal"} 
 269 |                 attackEncodingCluster  = {'normal':0,'abnormal':1}
 270 |     
 271 |                 Y[:] = [attackType[item] for item in Y[:]] #Encoding the binary data
 272 |                 Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of the labels to binary labels normal and abnormal
 273 |                 return Y,encodeOption
 274 |     
 275 |             elif encodeOption == "2":
 276 |                 #Multiclass Categories
 277 |                 #normal = 0
 278 |                 #DoS = 1
 279 |                 #Probe = 2
 280 |                 #R2L = 3
 281 |                 #U2R = 4
 282 |                 attackType  = {'normal': 'normal', 'neptune':'DoS', 'warezclient': 'R2L', 'ipsweep': 'Probe','back': 'DoS', 'smurf': 'DoS', 'rootkit': 'U2R','satan': 'Probe', 'guess_passwd': 'R2L','portsweep': 'Probe','teardrop': 'DoS','nmap': 'Probe','pod': 'DoS','ftp_write': 'R2L','multihop': 'R2L','buffer_overflow': 'U2R','imap': 'R2L','warezmaster': 'R2L','phf': 'R2L','land': 'DoS','loadmodule': 'U2R','spy': 'R2L','perl': 'U2R'} 
 283 |                 attackEncodingCluster  = {'normal':0,'DoS':1,'Probe':2,'R2L':3, 'U2R':4} #Main Categories
 284 |     
 285 |                 Y[:] = [attackType[item] for item in Y[:]] #Encoding the 22 fine-grain attack labels into the 4 main types of attacks, and leaving 'normal' as 'normal'
 286 |                 Y[:] = [attackEncodingCluster[item] for item in Y[:]]#Changing the names of attacks into numerical data
 287 |                 return Y,encodeOption
 288 |         else:
 289 |             return Y
 290 |     
 291 |     
 292 |     elif datasetOption == "2":#Checking if the data set chosen is IDS2017
 293 |         print("\n\n#########################################################################")
 294 |         print("Encoding Menu")
 295 |         print("#########################################################################")
 296 |         print("1.Binary true labels: normal = 0, abnormal = 1")
 297 |         print("2. Multiclass true labels: BENIGN= 0, DoS slowloris= 1, DoS Slowhttptest= 2, DoS Hulk= 3, DoS GoldenEye= 4, Heartbleed= 5")
 298 |         encodeOption = input("Enter option :")
 299 | 
 300 |         if encodeOption == "1":
 301 |             Y = np.array(Y,dtype= object)
 302 |             attackEncoding  = {'BENIGN': 0,'DoS slowloris': 1,'DoS Slowhttptest': 2,'DoS Hulk': 3, 'DoS GoldenEye': 4, 'Heartbleed': 5} #Main Categories
 303 |             Y[:] = [attackEncoding[item] for item in Y[:]]#Changing the names of attacks into categorical data
 304 |     
 305 |             return Y,encodeOption
 306 |         
 307 |         elif encodeOption == "2":
 308 |             Y = np.array(Y,dtype= object)
 309 |             attackType  = {'BENIGN': 'normal','DoS slowloris': 'abnormal','DoS Slowhttptest': 'abnormal','DoS Hulk': 'abnormal', 'DoS GoldenEye': 'abnormal', 'Heartbleed': 'abnormal'} #Binary Categories
 310 |             attackEncoding = {'normal': 0, 'abnormal': 1}
 311 |             
 312 |             Y[:] = [attackType[item] for item in Y[:]]#Changing the names of attacks into binary categories
 313 |             Y[:] = [attackEncoding[item] for item in Y[:]]#Changing the names of attacks into binary categories
 314 |             return Y,encodeOption
 315 |         
 316 |         else:
 317 |             return Y
 318 | 
 319 | 
 320 | 
 321 | 
 322 | #Encoding the categorical features using one hot encoding and using Main attacks categories or binary categories
 323 | def oneHotEncodingData(X,dataOption):
 324 |         
 325 |     from sklearn.preprocessing import OneHotEncoder
 326 |     from sklearn.compose import ColumnTransformer
 327 |     
 328 |     #Label encoding step is unnecessary because ColumnTransformer performs both one hot encoding and label encoding
 329 |     #Encoding the Independient Variable
 330 |     if dataOption == "1": #For One Hot Encoding all categorical data
 331 |         transform = ColumnTransformer([("Servers", OneHotEncoder(categories = "auto"), [1,2,3])], remainder="passthrough")
 332 |         X = transform.fit_transform(X)
 333 |         print("\n\n#########################################################################")
 334 |         print("Data has been successfully One Hot Encoded")
 335 |         print("#########################################################################")
 336 | 
 337 |         return X
 338 |     elif dataOption == "3": #For risk encoding categorical data: One Hot Encoding Protocol Feature because there is no risk value data for that feature, and it only has 3 attributes, limiting the number of added features by One Hot Encoding
 339 |         transform = ColumnTransformer([("Servers", OneHotEncoder(categories = "auto"), [1])], remainder="passthrough")
 340 |         X = transform.fit_transform(X)
 341 |         print("\n\n#########################################################################")
 342 |         print("Data has been successfully One Hot Encoded")
 343 |         print("#########################################################################")
 344 |         return X
 345 |         
 346 |     else:
 347 |         return X #Returning data with no changes
 348 | 
 349 | 
 350 | def riskEncodingData(X,dataOption):#Assinging risk values to categorical features "Servers" and "Server Errors"
 351 |     #Manually Encoding for the attacks types only
 352 |     if dataOption == "3": #if data option is risk Value
 353 |         X = pd.DataFrame(X)
 354 |         servers  = {'http':0.01, 'domain_u':0, 'sunrpc':1, 'smtp':0.01, 'ecr_i':0.87, 'iso_tsap':1, 'private':0.97, 'finger':0.27, 'ftp':0.26, 'telnet':0.48,'other':0.12,'discard':1, 'courier':1, 'pop_3':0.53, 'ldap':1, 'eco_i':0.8, 'ftp_data':0.06, 'klogin':1, 'auth':0.31, 'mtp':1, 'name':1, 'netbios_ns':1,'remote_job':1,'supdup':1,'uucp_path':1,'Z39_50':1,'csnet_ns':1,'uucp':1,'netbios_dgm':1,'urp_i':0,'domain':0.96,'bgp':1,'gopher':1,'vmnet':1,'systat':1,'http_443':1,'efs':1,'whois':1,'imap4':1,'echo':1,'link':1,'login':1,'kshell':1,'sql_net':1,'time':0.88,'hostnames':1,'exec':1,'ntp_u':0,'nntp':1,'ctf':1,'ssh':1,'daytime':1,'shell':1,'netstat':1,'nnsp':1,'IRC':0,'pop_2':1,'printer':1,'tim_i':0.33,'pm_dump':1,'red_i':0,'netbios_ssn':1,'rje':1,'X11':0.04,'urh_i':0,'http_8001':1,'aol':1,'http_2784':1,'tftp_u':0,'harvest':1}
 355 |         X[2] = [servers[item] for item in X[2]]
 356 | 
 357 |         servers_Error  = {'REJ':0.519, 'SF':0.016, 'S0':0.998, 'RSTR':0.882, 'RSTO':0.886,'SH':0.993,'S1':0.008,'RSTOS0':1,'S3':0.08,'S2':0.05,'OTH':0.729} 
 358 |         X[3] = [servers_Error[item] for item in X[3]]
 359 | 
 360 |         print("\n\n#########################################################################")
 361 |         print("Data has been successfully risk Encoded")
 362 |         print("#########################################################################")
 363 | 
 364 |         return X
 365 |         
 366 |     else:
 367 |         
 368 |         return X #Returning the data with no changes
 369 |             
 370 |     
 371 | 
 372 | 
 373 | def scaling(X):#Scaling the data with the MinMaxScaler method so that values in each feature are in the same range for experiments.
 374 |     
 375 |     
 376 | 
 377 |     while True:
 378 |             
 379 |             decision = input("Scale data [y/n]:")
 380 |             
 381 |             if decision == "y" or  decision == "n":
 382 |                 break
 383 |             else:
 384 |                 
 385 |                 print("Error\n\n")
 386 |     
 387 |     if decision == "y":
 388 |         
 389 |             from sklearn.preprocessing import MinMaxScaler
 390 |             #Transforming features by scaling each feature to the given range, (0,1)
 391 |             X =  MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
 392 |             print("\n\n#########################################################################")
 393 |             print("Data has been successfully scaled.")
 394 |             print("#########################################################################")
 395 |             return X
 396 |         
 397 |     else:
 398 |         return X
 399 | 
 400 |     
 401 | def shuffleData(X):#Shuffling the order of data instances. Currently this is a bug in code. If we experiment on shuffled data, the algorithms return nonsense results.
 402 | 
 403 |     from sklearn.utils import shuffle
 404 |     while True:
 405 |         option = input("Shuffle data [y]/[n]:")
 406 |         
 407 |         if option == "y" or option == "n":
 408 |             break
 409 |         else:
 410 |             
 411 |             print("Error\n\n")
 412 |     
 413 |     if option == "y":
 414 |         
 415 |         X = pd.DataFrame(X)
 416 |         X = shuffle(X)
 417 |         X.reset_index(inplace=True,drop=True)
 418 |         X = np.array(X)
 419 |         
 420 |         print("\n\n#########################################################################")
 421 |         print("Data has been successfully shuffled.")
 422 |         print("#########################################################################")
 423 |         return X
 424 |     else:
 425 |         
 426 |         return X
 427 | 
 428 | 
 429 | 
 430 | 
 431 | #K-Means Algorithm
 432 | def kmeansClustering(X,Y): 
 433 |     from sklearn.cluster import KMeans
 434 | 
 435 |     while True:
 436 |         print("\n\n#########################################################################")
 437 |         print("KMEANS ALGORITHM")
 438 |         print("#########################################################################")
 439 |               
 440 |         nClusters = input("Number of clusters:")
 441 |         
 442 |         try:
 443 |             nClusters = int(nClusters)
 444 |             
 445 |         except ValueError:
 446 |             
 447 |             print("Error\n\n")
 448 |             
 449 |         if type(nClusters) == int:
 450 |             n = 0
 451 |             clusters = []
 452 |             
 453 |             while n < nClusters:#Converting nClusters into an array of n clusters [n] for use it later
 454 |                 clusters.append(n)
 455 |                 n+=1
 456 |             break
 457 |         
 458 |     while True:
 459 |         init = input("Initialization method [k-means++,random]:")
 460 |         
 461 |         if init == "k-means++" or init == "random":
 462 |             break
 463 | 
 464 |     print("\nClustering...\n")
 465 |     
 466 |     start_time = time.time()
 467 |     KMEANS = KMeans(n_clusters = nClusters, init = init,max_iter = 300,n_init = 10,random_state = 0)
 468 |     print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time))
 469 |     print("Data Successfully Clustered")
 470 |     kmeans = KMEANS.fit(X)
 471 |     Z = kmeans.labels_
 472 |     inertia = KMEANS.inertia_
 473 |     #Kmeans Results
 474 |     kmeansR = pd.crosstab(Y,Z)
 475 |     maxVal = kmeansR.idxmax()
 476 |     
 477 |     return Z,clusters,kmeansR,maxVal,inertia
 478 | 
 479 | 
 480 | 
 481 | 
 482 | def kF1(Z,Y,maxVal,clusters):#F1 Score for Kmeans
 483 |     from sklearn.metrics import f1_score
 484 |     #Encoding data to F-score
 485 |     
 486 |     
 487 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 488 |     n = 0 #counter
 489 |     dictionaryCluster  = {} #creating an empty dictionary 
 490 |     f1 = 0 #f1score
 491 |     average = ''
 492 |     
 493 |     while n < len(clusters):#while counter < number of clusters
 494 |         dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 495 |         n+=1
 496 |         
 497 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 498 |             
 499 |     Y = np.array(Y,dtype = int) #Converting labels into an int array
 500 |     
 501 |     while True:
 502 |         
 503 |         average = input("Average Method[weighted,micro,macro,binary]:")
 504 |         
 505 |         if average == "weighted" or average == "micro" or average == "macro" or average == 'binary':
 506 |             break
 507 |     #score metric   
 508 |     f1 = f1_score(Y,Z, average = average)
 509 |     
 510 |     return f1,dictionaryCluster
 511 | 
 512 | 
 513 | 
 514 | def kNMI(Z,Y,maxVal,clusters):
 515 |     from sklearn.metrics import normalized_mutual_info_score
 516 |     
 517 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 518 |     n = 0 # counter
 519 |     dictionaryCluster  = {} #creating an empty dictionary 
 520 |     NMI = 0
 521 |     average = ''
 522 |     
 523 |     while n < len(clusters):#while counter < number of clusters
 524 |         dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 525 |         n+=1
 526 |         
 527 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 528 |     
 529 |     Y = np.array(Y,dtype = int) #Making sure that labels are in an int array
 530 |     
 531 |     while True:
 532 |         
 533 |         average = input("Average Method[geometric,min,arithmetic,max]:")
 534 |         
 535 |         if average == "geometric" or average == "min" or average == "arithmetic" or average == "max":
 536 |             break
 537 |     #Score metric 
 538 |     NMI = normalized_mutual_info_score(Y, Z, average_method = average)
 539 |     
 540 |     return NMI,dictionaryCluster
 541 | 
 542 | 
 543 | 
 544 | def kARS(Z,Y,maxVal,clusters):
 545 |     from sklearn.metrics import adjusted_rand_score
 546 |     
 547 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 548 |     n = 0 # counter
 549 |     dictionaryCluster  = {} #Creating an empty dictionary 
 550 |     ars = 0
 551 |     
 552 |     while n < len(clusters):# while counter < number of clusters
 553 |         dictionaryCluster[clusters[n]] = maxVal[n] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 554 |         n+=1
 555 |         
 556 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 557 |     
 558 |     Y = np.array(Y,dtype = int) #Making sure that labels are in an int array
 559 |     
 560 |     #score metric
 561 |     ars = adjusted_rand_score(Y, Z)
 562 |     
 563 |     return ars,dictionaryCluster
 564 | 
 565 | 
 566 | #DBSCAN Algorithm
 567 | def dbscanClustering(X,Y):
 568 |     from sklearn.cluster import DBSCAN
 569 |     
 570 |     while True:
 571 |         
 572 |         print("\n\n#########################################################################")
 573 |         print("DBSCAN ALGORITHM")
 574 |         print("#########################################################################")
 575 |               
 576 |         epsilon = input("epsilon[Decimal]:")
 577 |         
 578 |         try:
 579 |             epsilon = float(epsilon)
 580 |             
 581 |         except ValueError:
 582 |             
 583 |             print("Enter a Decimal number")
 584 |             
 585 |             
 586 |         if type(epsilon) == float:
 587 |             break
 588 |         
 589 |     while True:
 590 |         minSamples = input("Min Samples[Integer]:")
 591 |         
 592 |         try:
 593 |             minSamples = int(minSamples)
 594 |             
 595 |         except ValueError:
 596 |             
 597 |             print("Enter a Integer Number")
 598 |             
 599 |         if type(minSamples) == int:
 600 |             break
 601 |         
 602 |     while True:
 603 |         algorithm = input("Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:")
 604 |             
 605 |         if algorithm == "auto" or algorithm == "ball_tree" or algorithm == "kd_tree" or algorithm == "brute":
 606 |             break
 607 |         
 608 |         else:
 609 |             print("Error\n\n")
 610 |             
 611 |     
 612 |     print("\nClustering...\n")
 613 | 
 614 |     #Computing DBSCAN
 615 |     start_time = time.time() 
 616 |     db = DBSCAN(eps= epsilon, min_samples = minSamples,algorithm = algorithm).fit(X)
 617 |     print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time))
 618 |     print("Data Successfully Clustered")
 619 |     
 620 |     
 621 |     core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
 622 |     core_samples_mask[db.core_sample_indices_] = True
 623 |     
 624 |     Z = db.labels_
 625 |     # Number of clusters in labels, ignoring noise if present.
 626 |     n_clusters = len(set(Z))
 627 |     n_noise_ = list(Z).count(-1)
 628 |     
 629 |     n = -1  #DBSCAN returns cluster with index -1 (anomalies)
 630 |     clusters = []
 631 |     while n + 1 < n_clusters:
 632 |         clusters.append(n)
 633 |         n += 1
 634 |     
 635 |     #DBSCAN Results
 636 |     dbscanR = pd.crosstab(Y,Z)
 637 |     maxVal = dbscanR.idxmax()
 638 |     
 639 |     return Z,clusters,n_noise_,dbscanR,maxVal
 640 | 
 641 | 
 642 | 
 643 | 
 644 | def dbF1(Z,Y,clusters,maxVal):#F1 score for DBSCAN
 645 |     from sklearn.metrics import f1_score
 646 |     #Encoding data to F-score
 647 |     
 648 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 649 |     n = 0 # counter
 650 |     c = -1 # - counter for when max Value has negative index
 651 |     dictionaryCluster  = {} #Creating an empty dictionary 
 652 |     f1 = 0
 653 |     average = ''
 654 |     
 655 |     while n < len(clusters):#while counter < number of clusters
 656 |         dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 657 |         n+=1
 658 |         c+=1
 659 |     
 660 |         
 661 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 662 |     
 663 |     Y = np.array(Y,dtype = int) #Making sure that labels are in an int array
 664 |     while True:
 665 |         
 666 |         average = input("Average Method[weighted,micro,macro]:")
 667 |         
 668 |         if average == "weighted" or average == "micro" or average == "macro":
 669 |             break
 670 |         
 671 |         else:
 672 |             
 673 |             print("Error\n\n")
 674 |     #score metric
 675 |     f1 = f1_score(Y,Z, average = average)
 676 |     return f1,dictionaryCluster
 677 | 
 678 | 
 679 | def dbNMI(Z,Y,clusters,maxVal):#Normalized Mutual Information score for DBSCAN
 680 |     from sklearn.metrics import normalized_mutual_info_score
 681 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 682 |     n = 0 # counter
 683 |     c = -1 # - counter max Value has negative index
 684 |     NMI = 0
 685 |     dictionaryCluster  = {} #Creating an empty dictionary 
 686 |     average = ''
 687 |     
 688 |     while n < len(clusters):#while counter < number of clusters
 689 |         dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 690 |         n+=1
 691 |         c+=1
 692 |     
 693 |     Y = np.array(Y,dtype = int) #Making sure that labels are in an int array
 694 | 
 695 |     while True:
 696 |         
 697 |         average = input("Average Method[geometric,min,arithmetic,max]:")
 698 |         
 699 |         if average == "geometric" or average == "min" or average == "arithmetic" or average == "max":
 700 |             break
 701 |         else:
 702 |             
 703 |             print("Error\n\n")
 704 |     #score metric
 705 |     NMI = normalized_mutual_info_score(Y, Z, average_method= average)
 706 |     
 707 |     return NMI,dictionaryCluster
 708 | 
 709 | def dbARS(Z,Y,clusters,maxVal): #Adjusted Rand Index score for DBSCAN
 710 |     from sklearn.metrics import adjusted_rand_score
 711 |     
 712 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 713 |     n = 0 # counter
 714 |     c = -1 # - counter max Value has negative index
 715 |     ars = 0
 716 |     dictionaryCluster  = {} #Creating an empty dictionary 
 717 |     
 718 |     while n < len(clusters):#while counter < number of clusters
 719 |         dictionaryCluster[clusters[n]] = maxVal[c] #Creating key(cluster index) with value (max number of the clustering results) for every iteration
 720 |         n+=1
 721 |         c+=1
 722 |     #score metric
 723 |     ars = adjusted_rand_score(Y,Z)
 724 |     
 725 |     return ars,dictionaryCluster
 726 | 
 727 | 
 728 | def isolationForest(X,Y):#Isolation Forest algorithm
 729 |     from sklearn.ensemble import IsolationForest
 730 |     
 731 |     while True:
 732 |         contamination = input("Contamination[Float 0 to 0.5]: ")
 733 |         
 734 |         try:
 735 |             contamination = float(contamination)
 736 |             
 737 |         except ValueError:
 738 |             
 739 |             print("Enter a Number")
 740 |             
 741 |         if type(contamination) == float and (contamination >= 0 and contamination <= 0.5):
 742 |             break
 743 |     
 744 |     print("\nClustering...\n")   
 745 |     
 746 |     start_time = time.time() 
 747 |     Z = IsolationForest(max_samples = "auto",behaviour = "new",contamination = contamination).fit_predict(X)
 748 |     print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time))
 749 |     
 750 |     Z = np.array(Z,dtype = object)
 751 |     
 752 |     ifR = pd.crosstab(Y,Z)
 753 |     ifR = pd.DataFrame(ifR)
 754 |     maxVal = ifR.idxmax()
 755 |     
 756 |     n = -1  #Isolation Forest returns clusters with indicies -1 (outlier) and 1 (normal)
 757 |     clusters = []
 758 |     while n < len(ifR.columns):
 759 |         clusters.append(n)
 760 |         n += 2
 761 |         
 762 |     return Z,ifR,maxVal,clusters
 763 | 
 764 | def ifF1(Z,Y,clusters,maxVal): #f1 score for isolation forest
 765 |     from sklearn.metrics import f1_score
 766 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 767 |     
 768 |     n = 0 # counter
 769 |     c = -1 # - counter max Value has negative index
 770 |     f1 = 0
 771 |     average = ''
 772 |     dictionaryCluster  = {} #Creating an empty dictionary 
 773 | 
 774 |     
 775 |     while n < len(clusters): #Starting counter at -1 and incrementing by 2, because Isolation Forest returns -1 and 1 clusters 
 776 |         dictionaryCluster[clusters[n]] = maxVal[c] 
 777 |         n+=1
 778 |         c+=2
 779 |         
 780 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 781 |     
 782 |     Y = np.array(Y,dtype = int)
 783 |     Z = np.array(Z,dtype = int)
 784 |     
 785 |     while True:
 786 |         
 787 |         average = input("Average Method[weighted,micro,macro]:")
 788 |         
 789 |         if average == "weighted" or average == "micro" or average == "macro":
 790 |             break
 791 |         
 792 |         else:
 793 |             
 794 |             print("Error\n\n")
 795 |     #score metric
 796 |     f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']
 797 |     
 798 |     return f1,dictionaryCluster
 799 |     
 800 | 
 801 | def LOF(X,Y):#Local Outlier Factor algorithm
 802 |     from sklearn.neighbors import LocalOutlierFactor 
 803 |     
 804 |     while True:
 805 |         contamination = input("Contamination[Float 0 to 0.5]: ")
 806 |         
 807 |         try:
 808 |             contamination = float(contamination)
 809 |             
 810 |         except ValueError:
 811 |             
 812 |             print("Enter a Number")
 813 |             
 814 |         if type(contamination) == float and (contamination > 0 and contamination <= 0.5):
 815 |             break
 816 |         
 817 |     while True:
 818 |         algorithm = input("Algorithm['auto’, ‘ball_tree’, ‘kd_tree’, 'brute']:")
 819 |             
 820 |         if algorithm == "auto" or algorithm == "ball_tree" or algorithm == "kd_tree" or algorithm == "brute":
 821 |             break
 822 |         else:
 823 |             
 824 |             print("Error\n\n")
 825 |             
 826 |     print("\nClustering...\n")
 827 |     
 828 |     start_time = time.time() 
 829 |     lof = LocalOutlierFactor(contamination = contamination,algorithm = algorithm).fit_predict(X)
 830 |     print("\n\nRun Time ->","--- %s seconds ---" % (time.time() - start_time))
 831 |     
 832 |     lofR = pd.crosstab(Y,lof)
 833 |     maxVal = lofR.idxmax()
 834 |     
 835 |     
 836 |     n = -1  #LOF returns index -1 and 1 cluster
 837 |     clusters = []
 838 |     while n < len(lofR.columns):
 839 |         clusters.append(n)
 840 |         n += 2
 841 |     
 842 |     
 843 |     
 844 |     return lof,lofR,maxVal,clusters
 845 |     
 846 | 
 847 | def lofF1(Z,Y,clusters,maxVal): #f1 score for local outlier factor
 848 |     from sklearn.metrics import f1_score
 849 |     
 850 |     #Automatically assigning the max-ocurring instance in each found cluster to that specific found cluster, in order to evaluate clustering with greater ease.
 851 |     n = 0 # counter
 852 |     c = -1 # - counter max Value has negative index
 853 |     f1 = 0
 854 |     dictionaryCluster  = {} # creating an empty dictionary 
 855 |     
 856 |     while n < len(clusters): # Starting counter at -1 and incrementing by 2, because Isolation Forest returns -1 and 1 clusters
 857 |         dictionaryCluster[clusters[n]] = maxVal[c] 
 858 |         n+=1
 859 |         c+=2
 860 |         
 861 |     Z[:] = [dictionaryCluster[item] for item in Z[:]] #Matching key with the index of klabels and replacing it with key value
 862 |     Y = np.array(Y,dtype = int)
 863 |     Z = np.array(Z,dtype = int)
 864 |     while True:
 865 |         
 866 |         average = input("Average Method[weighted,None,micro,macro]:")
 867 |         
 868 |         if average == "weighted" or average == "micro" or average == "macro" or average == "None":
 869 |             break
 870 |         
 871 |         else:
 872 |             
 873 |             print("Error\n\n")
 874 |     f1 = f1_score(Y,Z, average = average) #[None, 'micro', 'macro', 'weighted']
 875 |     
 876 |     return f1,dictionaryCluster
 877 | 
 878 | clear()
 879 | #Calling the functions
 880 | 
 881 | ##########################################################################
 882 | path,dataSetOption = getDataSet()
 883 | #########################################################################
 884 | #########################################################################
 885 | dataSet = readingData(path)
 886 | #########################################################################
 887 | #########################################################################
 888 | dataSet = checkMissing(dataSet)
 889 | #########################################################################
 890 | #########################################################################
 891 | data,labels,dataOption = gettingVariables(dataSet,dataSetOption) #Getting the Data we want to use for the algorithms
 892 | #########################################################################
 893 | #########################################################################
 894 | try:
 895 |     labels,encodeOption = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels
 896 | except ValueError:
 897 |     labels = encodingLabels(labels,dataOption,dataSetOption) #Encoding the true labels
 898 | #########################################################################
 899 | #########################################################################
 900 | data = riskEncodingData(data,dataOption)
 901 | #########################################################################
 902 | #########################################################################
 903 | data = oneHotEncodingData(data,dataOption) #Applying One Hot Encoding with the complete data
 904 | #########################################################################
 905 | #########################################################################
 906 | data = scaling(data)
 907 | #########################################################################
 908 | #########################################################################
 909 | data = shuffleData(data)
 910 | #########################################################################
 911 | 
 912 | #This menu is a option to run diferrent algorithms with the same preproceced data without needing to run all the code from the start to make another experiment.
 913 | while True:  
 914 |     while True:
 915 |         print("\n\n#########################################################################")
 916 |         print("Algorithm Menu")
 917 |         print("#########################################################################")
 918 |         
 919 |         print("1.Kmeans")
 920 |         print("2.Dbscan")
 921 |         print("3.Isolation Forest")
 922 |         print("4.Local Factor Outlier")
 923 |         
 924 |         algorithmOption = input("option:")
 925 |         
 926 |         if algorithmOption == "1" or algorithmOption == "2" or algorithmOption == "3" or algorithmOption == "4":
 927 |                 break
 928 |         else:
 929 |             
 930 |             print("Error\n\n")
 931 | 
 932 |     
 933 |     if algorithmOption == "1":
 934 |         #########################################################################
 935 |         #KMEANS
 936 |         klabels,kClusters,kmeansR,maxKvalue,inertia = kmeansClustering(data,labels)
 937 |         print("#########################################################################")
 938 |         print("KMEANS RESULTS\n\n")
 939 |         print("Clusters -> ",kClusters,"\n")
 940 |         print("Inertia -> ",inertia)
 941 |         print(kmeansR,"\n\n")
 942 |         print("Max True Label","\n\n",maxKvalue)
 943 |         print("#########################################################################")
 944 |         #########################################################################
 945 |         print("\n\n#########################################################################")
 946 |         print("Kmeans Score Metrics Menu")
 947 |         print("#########################################################################")
 948 |         
 949 |         while True:
 950 |             print("1.F1 Score")
 951 |             print("2.Normalized Mutual Info Score")
 952 |             print("3.Adjusted Rand Score")
 953 |         
 954 |             kScoreOption = input("option:")
 955 |             
 956 |             if kScoreOption == "1" or kScoreOption == "2" or kScoreOption == "3":
 957 |                 break
 958 |             else:
 959 |                 
 960 |                 print("Error\n\n")
 961 |      
 962 |         if kScoreOption == "1":
 963 |             #########################################################################
 964 |             #F1 Score
 965 |             kmeansF1,clusterAssigned = kF1(klabels,labels,maxKvalue,kClusters)
 966 |             print("\n\n#########################################################################")
 967 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
 968 |             print("KMEANS F1 Score -> ",kmeansF1)
 969 |             print("#########################################################################")
 970 |             #########################################################################
 971 |         
 972 |         elif kScoreOption == "2":
 973 |             #########################################################################
 974 |             kmeansNMI,clusterAssigned = kNMI(klabels,labels,maxKvalue,kClusters)
 975 |             print("\n\n#########################################################################")
 976 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
 977 |             print("KMEANS Normalized Mutual Info Score -> ",kmeansNMI)
 978 |             print("#########################################################################")
 979 |             #########################################################################
 980 |     
 981 |         elif kScoreOption == "3":
 982 |             
 983 |             #########################################################################
 984 |             kmeansARS,clusterAssigned = kARS(klabels,labels,maxKvalue,kClusters)
 985 |             print("\n\n#########################################################################")
 986 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
 987 |             print("KMEANS Adjusted Rand Score -> ",kmeansARS)
 988 |             print("#########################################################################")
 989 |             #########################################################################
 990 |             
 991 |     elif algorithmOption == "2":
 992 |         #########################################################################
 993 |         #DBSCAN
 994 |         dblabels,dbClusters,nNoises,dbscanR,maxDBvalue = dbscanClustering(data,labels) 
 995 |         print("#########################################################################")
 996 |         print("DBSCAN RESULTS\n\n")
 997 |         print("Clusters -> ",dbClusters,"\n")
 998 |         print(dbscanR,"\n\n")
 999 |         print("Noise -> ",nNoises)
1000 |         print("Max True Label","\n\n",maxDBvalue)
1001 |         print("#########################################################################")
1002 |         #########################################################################
1003 |         print("\n\n#########################################################################")
1004 |         print("Dscan Score Metrics Menu")
1005 |         print("#########################################################################")
1006 |         print("1.F1 Score")
1007 |         print("2.Normalized Mutual Info Score")
1008 |         print("3.Adjusted Rand Score")
1009 |         
1010 |         while True:
1011 |             
1012 |             dbScoreOption = input("option:")
1013 |             
1014 |             if dbScoreOption == "1" or dbScoreOption == "2" or dbScoreOption == "3":
1015 |                 break
1016 |             else:
1017 |                 
1018 |                 print("Error\n\n")
1019 |     
1020 |         if dbScoreOption == "1":
1021 |             #########################################################################
1022 |             #F1 Score DBSCAN
1023 |             dbscanF1,clusterAssigned = dbF1(dblabels,labels,dbClusters,maxDBvalue)
1024 |             print("\n\n#########################################################################")
1025 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
1026 |             print("DBSCAN F1 Score -> ",dbscanF1)
1027 |             print("#########################################################################")
1028 |             #########################################################################
1029 |             
1030 |         elif dbScoreOption == "2":
1031 |             #########################################################################
1032 |             dbscanNMI,clusterAssigned = dbNMI(dblabels,labels,dbClusters,maxDBvalue)
1033 |             print("\n\n#########################################################################")
1034 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
1035 |             print("DBSCAN Normalized Mutual Info Score -> ",dbscanNMI)
1036 |             print("#########################################################################")
1037 |             #########################################################################
1038 |             
1039 |         elif dbScoreOption == "3":
1040 |             #########################################################################
1041 |             dbscanARS,clusterAssigned = dbARS(dblabels,labels,dbClusters,maxDBvalue)
1042 |             print("\n\n#########################################################################")
1043 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
1044 |             print("DBSCAN Adjusted Rand Score -> ",dbscanARS)
1045 |             print("#########################################################################")
1046 |             #########################################################################
1047 |         
1048 |         
1049 |     elif algorithmOption == "3":
1050 |         #########################################################################
1051 |         ifLabels,ifR,MaxIfVal,ifNclusters = isolationForest(data,labels)
1052 |         print("#########################################################################")
1053 |         print("Isolation Forest RESULTS\n\n")
1054 |         print("Clusters -> ",ifNclusters,"\n")
1055 |         print(ifR,"\n\n")
1056 |         print("Max True Label","\n\n",MaxIfVal)
1057 |         print("#########################################################################")
1058 |         #########################################################################
1059 |         print("\n\n#########################################################################")
1060 |         print("Isolation Forest Score Metrics Menu")
1061 |         print("#########################################################################")
1062 |         print("1.F1 Score")
1063 |         
1064 |         while True:
1065 |             
1066 |             ifScoreOption = input("option:")
1067 |             
1068 |             if ifScoreOption == "1":
1069 |                 break
1070 |             else:
1071 |                 
1072 |                 print("Error\n\n")
1073 |         
1074 |         if ifScoreOption == "1":
1075 |             
1076 |             ##########################################################################
1077 |             isolationForestF1,clusterAssigned = ifF1(ifLabels,labels,ifNclusters,MaxIfVal)
1078 |             print("\n\n#########################################################################")
1079 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
1080 |             print("Isolation Forest F1 Score -> ",isolationForestF1)
1081 |             print("#########################################################################")
1082 |             ##########################################################################
1083 |         
1084 |     elif algorithmOption == "4":
1085 |         #########################################################################
1086 |         LOFlabels,lofR,maxLOFvalue,lofClusters = LOF(data,labels)
1087 |         print("#########################################################################")
1088 |         print("Local Outlier Factor RESULTS\n\n")
1089 |         print("Clusters -> ",lofClusters,"\n")
1090 |         print(lofR,"\n\n")
1091 |         print("Max True Label","\n\n",maxLOFvalue)
1092 |         print("#########################################################################")
1093 |         #########################################################################
1094 |         print("\n\n#########################################################################")
1095 |         print("LOF Score Metrics Menu")
1096 |         print("#########################################################################")
1097 |         print("1.F1 Score")
1098 |         
1099 |         while True:
1100 |             
1101 |             lofScoreOption = input("option:")
1102 |             
1103 |             if lofScoreOption == "1":
1104 |                 break
1105 |             else:
1106 |                 
1107 |                 print("Error\n\n")
1108 |         
1109 |         if lofScoreOption == "1":
1110 |             
1111 |             ##########################################################################
1112 |             LOFf1,clusterAssigned = lofF1(LOFlabels,labels,lofClusters,maxLOFvalue)
1113 |             print("\n\n#########################################################################")
1114 |             print("Cluster Matchings by Maximun Intersection[Found: True] -> ",clusterAssigned)
1115 |             print("LOF F1 Score -> ",LOFf1)
1116 |             print("#########################################################################")
1117 |             ##########################################################################
1118 |                 
1119 |     while True: #Asking if the user wants to run a new clustering algorithm test on the same data preprocessed in the same way
1120 |         
1121 |         decision = input("Try another Clustering Algorithm[y/n]:")
1122 |         
1123 |         if decision == "y" or  decision == "n":
1124 |             break
1125 |         else:
1126 |             
1127 |             print("Error\n\n")
1128 |     
1129 |     
1130 |     if decision == "n":
1131 |         break
1132 |     
1133 |     else:
1134 |         clear()


--------------------------------------------------------------------------------
/Dataset/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremy191/clustering-based-anomaly-detection/7df019978b9350920ac2e21bb53663f8709f6ab1/Dataset/.DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jeremy Perez
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Clustering Based Anomaly Detection
  2 | 
  3 | ## Description
  4 | This clustering based anomaly detection project implements unsupervised clustering algorithms on the [NSL-KDD](https://pdfs.semanticscholar.org/1b34/80021c4ab0f632efa99e01a9b073903c5554.pdf) and [IDS 2017](https://www.unb.ca/cic/datasets/ids-2017.html) datasets. The project includes options for preprocessing the datasets. It then clusters the datasets, mainly using the K-means and DBSCAN algorithms. Finally, it evaluates the clustering performed by the algorithms using standard metrics such as F-Score.
  5 | 
  6 | ## Requirements
  7 | 
  8 | * [Python >= 3.5](https://www.python.org/)
  9 | * [Anaconda](https://www.anaconda.com/distribution/)
 10 | * [Scikit](https://scikit-learn.org/stable/install.html)
 11 | * [SciPy](https://www.scipy.org/#)
 12 | * [NumPy](http://numpy.org/)
 13 | * [joblib](https://joblib.readthedocs.io/en/latest/#)
 14 | * [pandas](https://pandas.pydata.org/)
 15 | * [Spyder environment](https://www.spyder-ide.org/)
 16 | 
 17 | ## Installation
 18 | 
 19 | For this project, we installed Anaconda-Navigator to use as our package and environment manager. Under the Environments tab in Anaconda, we created an environment and downloaded the libraries listed in the prerequisites for this project.
 20 | This [guide](https://docs.anaconda.com/_downloads/9ee215ff15fde24bf01791d719084950/Anaconda-Starter-Guide.pdf) can help use Anaconda
 21 | 
 22 | 
 23 | ## Code Details
 24 | After you install all the requirements you should be able to run the code without any problems. This code is implemented to be user friendly and the steps will be briefly explained below:
 25 | 
 26 | ##### 1. Dataset option
 27 | * ![image](https://user-images.githubusercontent.com/31083873/62171123-263b7400-b2eb-11e9-92ea-27dd3511b052.png)
 28 | The user is asked to input which dataset will be analyzed in this run of the anomaly detetion algorithms. The two datasets that this project used contain different types of data and therefore require different types of preprocessing; thus, the user must choose which dataset to preprocess before beginning anomaly detection.
 29 | 
 30 | ##### 2. Path
 31 | * ![image](https://user-images.githubusercontent.com/31083873/62171230-816d6680-b2eb-11e9-814b-d6d2d2f819dd.png)
 32 | The user is asked to input the path of the data set. After [downloading the dataset](https://www.unb.ca/cic/datasets/index.html) to your computer, copy the path to that dataset and input the path here.
 33 | 
 34 | ##### 3. Variable Menu
 35 | * ![image](https://user-images.githubusercontent.com/31083873/62171295-afeb4180-b2eb-11e9-8958-317cc71b9e43.png)
 36 | The user is asked to choose the variables he wants to be working on.
 37 | As explained in step 1, the two data sets have different types of features. Specifically, the NSL-KDD Dataset has categorical data that must either be converted into numerical data or eliminated. The user can choose between three options for dealing with the categorical features on the NSL-KDD Dataset:
 38 | 
 39 |   1. The data will have categorical features(protocols,service type,attack types,service error) and the data within those features will be [one hot  encoded](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)(encode categorical features into numerical features)
 40 | 
 41 |   2. The categorical features are removed from the data.
 42 | 
 43 |   3. The Categorical features (service type, attack types, and service error/flag) are encoded with [Risk Values](http://www.jatit.org/volumes/Vol65No3/13Vol65No3.pdf). Since protocols do not have associated risk values, they are one hot encoded
 44 | 
 45 | ##### 4. Encoding Menu
 46 | * ![image](https://user-images.githubusercontent.com/31083873/62171931-ed50ce80-b2ed-11e9-9963-45de4cc4301e.png)
 47 | The user is asked to encode the labels. The NSL-KDD Dataset contains 22 usual attack types plus the normal category for a total of 23 possible labels.
 48 |   1. The labels are converted in binary labels (normal and abnormal). Every attack name that is not normal - in other words, that is an attack - is renamed with the label abnormal. After that, the labels are encoded into binary numbers where 0 is normal and 1 is abnormal. Because we can't calculate a metric score with categorical features, so the normal and abnormal labels must be converted to numeric data.
 49 | 
 50 |   2. The labels are converted into a 5 main categoires (normal,DoS,Probe,U2R,R2L) using the information provided in [this analysis of the dataset](https://pdfs.semanticscholar.org/1b34/80021c4ab0f632efa99e01a9b073903c5554.pdf). After that, each attack is encoded into one of 5 numbers where normal is 0, Dos is 1, Probe is 2, R2L is 3 and U2R is 4.
 51 | 
 52 | ##### 5. Scale the data
 53 | * ![image](https://user-images.githubusercontent.com/31083873/62172317-1756c080-b2ef-11e9-873b-3c4a0f8fb0e9.png)
 54 | The user is asked if he or she wants to Scale the data. We use [Min Max Scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html). We do this because we want our data instances to be at the same range, and Min Max Scaler puts the data in a range of [0,1] by feature. This allows the distance-based anomaly detection algorithms to accurately analyze the data.
 55 | 
 56 | ##### 6. Shuffle the data
 57 | * ![image](https://user-images.githubusercontent.com/31083873/62183286-db375600-b316-11e9-97e4-71f1440ee1ed.png)
 58 | The user is asked if he or she wants to suffle the data. Because one of the clustering algorithms, namely DBSCAN, could potentially return a different clustering depending on the order of the dataset, we attempted to shuffle the data and compare results. Unfortunately, the shuffled data returned clusters vastly different from the unshuffled data, with enough reason to believe that the shuffling algorithm was not working properly. Users are welcome to attempt shuffling the data but are forewarned that the shuffling may not return desired results.
 59 | 
 60 | ##### 7. Algorithm Menu
 61 | * ![image](https://user-images.githubusercontent.com/31083873/62183597-0ff7dd00-b318-11e9-9bcf-d26b4f6ae0ac.png)
 62 | The user is asked which anomaly detection algorithm he or she wants to use on the data. Each algorithm is discussed in greater detail in the Analyzing Dataset section.
 63 | 
 64 | Each algorithm requires user-input parameters.
 65 | 
 66 |   ###### K-Means
 67 |      ###### Initialization method
 68 | * ![image](https://user-images.githubusercontent.com/31083873/62186624-2b68e500-b324-11e9-9fdb-c700ee87ee4c.png)
 69 | K-Means provides different options for choosing the initial cluster centers. In this project, the user can choose either the random method or SciKitLearn's more sophisticated [k-means++](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) method.
 70 |       ###### Clusters
 71 | * ![image](https://user-images.githubusercontent.com/31083873/62186784-97e3e400-b324-11e9-8505-d35d78ee9fc1.png)
 72 | Users must choose the number of clusters for K-Means. The elbow method is popular for choosing the number of clusters. Read  more below in the Analyzing Dataset section.
 73 |   
 74 |   ###### DBSCAN  
 75 |   * ![image](https://user-images.githubusercontent.com/31083873/62664174-cfabe680-b937-11e9-8352-d9cd5550c7f3.png)
 76 |   DBSCAN need 2 main parameter epsilon and min samples , the algorithm parameter will affect the run time, we concluded that    brute is the fastest one for the NSL-KDD dataset.
 77 |   
 78 |   ###### Local Outlier Factor
 79 |   * ![image](https://user-images.githubusercontent.com/31083873/62664862-65487580-b93a-11e9-80e5-32dcff8b0ac1.png)
 80 |   Users must choose the ratio of anomalies in the dataset. This is called the contamination factor.
 81 |  
 82 |   ###### Isolation Forest 
 83 |   * ![image](https://user-images.githubusercontent.com/51713553/62648301-c149d480-b90f-11e9-848f-1fbe843099cb.png)
 84 |   Users must choose the ratio of anomalies in the dataset. This is called the contamination factor.
 85 |   
 86 | ##### 8. Scoring Metrics
 87 | * ![image](https://user-images.githubusercontent.com/31083873/62186832-be098400-b324-11e9-9036-ae5413a4535e.png)
 88 |   
 89 | * ![image](https://user-images.githubusercontent.com/51713553/62640889-bdae5180-b8ff-11e9-975d-f2c356561180.png)
 90 | Kmeans F1-score
 91 |   
 92 |   
 93 | * ![image](https://user-images.githubusercontent.com/31083873/62664455-cb33fd80-b938-11e9-8032-72bb83af578d.png)
 94 | DBSCAN F1-score
 95 |  
 96 | 
 97 | 
 98 | ### Preprocessing Dataset
 99 | 
100 | This project was designed to be used with the NSL-KDD and IDS 2017 datasets, available for download [here](https://www.unb.ca/cic/datasets/index.html). The preprocessing options thus are specific for each dataset. 
101 | 
102 | The NSL-KDD dataset has [categorical data](https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/) that must be omitted or encoded as numerical data to be clustered. The options in this project for dealing with categorical data include omitting categorical features, One Hot Encoding catgorical features, and [assigning risk values](http://www.jatit.org/volumes/Vol65No3/13Vol65No3.pdf) to Server Type and Flag features while One Hot Encoding Protocol data. One Hot Encoding is a process that takes each categorical option in a feature and makes that option a feature itself, assinging each data instance a "0" if it does not contain that category and a "1" if it does. While this option allows the user to keep the structure of the categorical data without assigning arbitrary hierarchical ordering to categories, this option also increases the number of features and thus is not always optimal for already-large datasets. For this reason, the code offers three different methods of dealing with categorical data.
103 | 
104 | The IDS-2017 dataset has missing values that must be dealt with as well. The code offers the user the option of deleting the columns with missing values, imputing "0", imputing the mean, median, or mode of the feature, or using the Iterative Imputer method offered by Python.
105 | 
106 | The interactive code asks the user to specify which of the two datasets he or she is using.
107 | 
108 | ### Analyzing Dataset
109 | 
110 | The code offers four different anomaly detection algorithms, namely K-Means, DBSCAN, Local Outlier Factor (LOF), and Isolation Forest. K-Means and DBSCAN are clustering algorithms, while LOF is a K-Nearest-Neighbor algorithm and Isolation Forest is a decision tree algorithm, both using a contamination factor to classify data as normal or anomaly.
111 | 
112 | [K-Means](https://www.youtube.com/watch?v=_aWzGGNrcic) clusters data by starting with user-specified K initial cluster centroids, and assigning all points to the nearest centroid. Based on the assignments, the algorithm recalculates the cluster centers and reassigns all points to the nearest cluster center. The algorithm repeats this process for a default of 300 iterations. When the process ends, K-Means has clustered data into K clusters. [SciKitLearn's K-Means algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) offers the option for the user to also specify the method for initialization, the way that the algorithm chooses which points to use as initial cluster centroids. In this project, the user specifies K, the number of initial cluster centroids and eventual clusters. A typical way of choosing K is often by the [elbow method](https://www.scikit-yb.org/en/latest/api/cluster/elbow.html). The implementation of K-Means in this project reports the sum of squared distances to cluster centers (or squared sum of errors, SSE) needed in the elbow plot, so a user can run tests with different values of K and plot that against the SSE for each K value. A user can then subjectively choose the elbow point on such a plot to determine the best K, and can then conduct tests with this K. The researchers suggest using a few values of K around the elbow and comparing the evaluation metric scores generated for each K in order to determine the best value of K.
113 | 
114 | [Density-Based Spacial Clustering of Applications with Noise](https://medium.com/@elutins/dbscan-what-is-it-when-to-use-it-how-to-use-it-8bd506293818), or DBSCAN, relies on two user-input parameters, namely epsilon and minimum samples. Epsilon denotes the neighborhood of density to be explored for each data point, and minimum samples denote the minimum number of samples needed to be within a point’s epsilon neighborhood for said point to be considered a core point. Points within another core point’s epsilon neighborhood, but not core points themselves, are considered border points. Meanwhile, points that are not within another core point’s epsilon neighborhood, and that are not core points themselves, are considered anomalous points or noise. DBSCAN finds clusters of core points and border points and reports those clusters along with a group of all of the anomalous points. [SciKitLearn's DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) offers the user other parameters to manipulate the specific way that DBSCAN calculates the clusters; this project uses all default parameters except for the algorithm parameter, for which the project specifies the 'brute' option in order to reduce run time.
115 | **DBSCAN run time will depend of how big the dataset is and what resources your computer has. Since "DBSCAN groups together points that are close to each other based on a distance measurement," it is slower than K-means algorithm (Salton do Prado, 2017). The experiments on DBSCAN were made on a Macbook Pro 2.6 GHz i7 with 16 GB of RAM memory and using the Brute parameter for the algorithm. The average time for these experiments was 3 minutes. DBSCAN tests were attempted on a Macbook air 1.6 GHz i5 with 8GB of RAM, but after 30 minutes never finished due to the processing capacity of the computer. Before running experiments with DBSCAN make sure the computer can handle it.**
116 | 
117 | [Local Outlier Factor](https://towardsdatascience.com/local-outlier-factor-for-anomaly-detection-cc0c770d2ebe), or LOF, begins with the parameter K, a default-set or user-chosen integer. For a specific point, the algorithm calculates the reach-distance to each point, which is essentially the distance from a specific point to another point with a small smoothing caveat for close points. The algorithm then takes the average of the reach-distances for a specific point to each of that point's k-nearest neighbors. The inverse of this average is called the Local Reachability Distance, or LRD. A point's high LRD indicates that the point exists in a highly dense neighborhood and does not have to travel far to encounter all K nearest neighbors, and a point's low LRD indicates the opposite, a low-density neighborhood. The algorithm calculates the LRDs for each point, and finds the average of all LRDs. Finally, the algoirthm calculates the Local Outlier Factor for each point by dividing that point's LRD by the average LRD of all points. An LRD around 1 indicates a point with average density, and an LRD much greater than 1 indicates a point in a much lower-density neighborhood than the average point, and therefore a point that is likely an anomaly. [SciKitLearn's LOF algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html) returns the negative of each point's local outlier factor. In this code, one can choose an Offset value such that all points with an LOF more negative than that Offset value are labeled as anomalous points, and all points equal to or more positive than that Offset value are labeled as normal points. 
118 | 
119 | Similarly to Local Outlier Factor, [Isolation Forest](https://towardsdatascience.com/outlier-detection-with-isolation-forest-3d190448d45e) returns for each point a score representing the probability of that particular point being an anomaly, and the user must choose a threshold for which scores will indicate an anomaly and which will indicate a normal instance. The algorithm generates the probability scores for each instance by the following process: _First, randomly choose a feature (column). Next, randomly choose a value between the min and max of that feature. Partition, or split the data into two groups: those with values in that feature above the randomly chosen value, and those with values below. Now, choose one of the two groups again and split the data on a random point. Repeat until a single point is isolated. Obtain the number of splits required to isolate that point. Repeat this process, eventually isolating all points across many features, and obtain for each specific point the average number of splits required for that point to be isolated_. The theory behind Isolation Forest states that anomalies occur less frequently and differ more greatly than normal points, and therefore will require fewer partitions, or splits, to isolate them than normal points would require. Thus, a score based on the average number of splits, also known as the average path length, denotes the probability that a particular point is an anomaly. The score is adjusted such that a a score near 1 denotes a likely anomaly, and a score near 0.5 denotes a likely normal point. Again, the user can set the contamination factor to indicate the threshold for scores labeled as anomaly and as normal.
120 | 
121 | 
122 | ### Evaluating Clusters
123 | 
124 | The code also offers multiple evaluation metrics for the user to choose from. Each metric depends on comparing the labels of the actual dataset with the labels given by the clustering, or the "true labels" with the "found labels". For both the NSL-KDD and the IDS 2017 datasets, both binary and multiclass labels are available to compare with as "true labels." Users can specify their preference in the interactive code. In this code, users can verify the clustering on their data by using one of three different metrics, namely F-1 Score, Normalized Mutual Information Score (NMI), and Adjusted Rand Score (ARS).  
125 | 
126 | [F-Score](https://deepai.org/machine-learning-glossary-and-terms/f-score) is the harmonic mean between precision and recall. Precision is the ratio of correctly predicted positive values to all values predicted to be positive. In other words, precision indicates how sure the algorithm is that the found positive values are actually positive. Meanwhile, recall is the ratio of correctly predicted positive values to all values that are actually positive. In other words, recall indicates how sure the algorithm is that it did not miss any positive values in its positive labelings. One can weight either precision or recall to have more influence in the F-Score by changing the beta value in the F-beta function; however, this project opts to keep the weight between precision and recall equal by using the F-1 score. 
127 | 
128 | The [Normalized Mutual Information Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html) is based on the probability function of entropy and uncertainty between the true and the found labels. 
129 | 
130 | Instead of measuring entropy as the NMI score does, the [Adjusted Rand Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html) measures the similarity between the true labels and the found labels. Furthermore, the ARS is adjusted for chance, while the NMI is not.
131 | 
132 | 
133 | ## Roadmap
134 | 
135 | * Implement Dimensionality Reduction- The main idea to implement this is because both datasets are considerably big  and they consume a lot of processing resources also we want to implement this because we can run DBSCAN algorithm on a bigger data set than IDS2017 and we want to know if we are going to be able to implement the algorithm.
136 | 
137 | ## Poster
138 | ![CBAD-Poster](https://user-images.githubusercontent.com/31083873/70267654-41c0fa80-1775-11ea-9fa4-2bc85b1a57a3.png)
139 | 
140 | 
141 | 
142 | ## Authors and acknowledgment
143 | * Jeremy Perez
144 | * Bethany Danner
145 | * **Special thanks to Dr. Veronika Neeley for mentoring us throughout this project, and for Dr. Clem Izurieta for organizing the REU program at Montana State University. This work was funded by the [National Science Foundation](https://www.nsf.gov/)**.
146 | 
147 | ## License
148 | 
149 | MIT License
150 | 
151 | Copyright (c) 2019 Jeremy Perez
152 | 
153 | Permission is hereby granted, free of charge, to any person obtaining a copy
154 | of this software and associated documentation files (the "Software"), to deal
155 | in the Software without restriction, including without limitation the rights
156 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
157 | copies of the Software, and to permit persons to whom the Software is
158 | furnished to do so, subject to the following conditions:
159 | 
160 | The above copyright notice and this permission notice shall be included in all
161 | copies or substantial portions of the Software.
162 | 
163 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
164 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
166 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
167 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
168 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
169 | SOFTWARE.
170 | 
171 | ## Project status
172 | 
173 | Current Bugs: After shuflleling the data results are not as excpected
174 | 


--------------------------------------------------------------------------------