├── Datasets ├── KDDCup99 │ ├── kddcup.data.gz │ ├── kddcup.data_10_percent.gz │ ├── kddcup.newtestdata_10_percent_unlabeled.gz │ ├── kddcup.testdata.unlabeled.gz │ └── kddcup.testdata.unlabeled_10_percent.gz └── NSL-KDD │ ├── KDDTest-21.csv │ ├── KDDTrain+_20Percent.csv │ └── KDDTrain+_20Percent_Description.xlsx ├── IDSUsingAutoEnoderNeuralNetwork.ipynb ├── IDSUsingSimpleDeepNeuralNetwork.ipynb ├── IDSUsingTraditionalMLTechniques.ipynb └── Project-UtilityFunctions ├── __pycache__ └── lstm.cpython-37.pyc ├── classificationlibrary.py ├── dataformatinglibrary.py ├── datainspectionlibrary.py ├── dataloadinglibrary.py ├── datapreprocessinglibrary.py ├── defineInputs.py ├── featureencodinglibrary.py ├── featurescalinglibrary.py ├── featureselectionlibrary.py ├── findcombinations.py ├── lstm.py └── util.py /Datasets/KDDCup99/kddcup.data.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/KDDCup99/kddcup.data.gz -------------------------------------------------------------------------------- /Datasets/KDDCup99/kddcup.data_10_percent.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/KDDCup99/kddcup.data_10_percent.gz -------------------------------------------------------------------------------- /Datasets/KDDCup99/kddcup.newtestdata_10_percent_unlabeled.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/KDDCup99/kddcup.newtestdata_10_percent_unlabeled.gz -------------------------------------------------------------------------------- /Datasets/KDDCup99/kddcup.testdata.unlabeled.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/KDDCup99/kddcup.testdata.unlabeled.gz -------------------------------------------------------------------------------- /Datasets/KDDCup99/kddcup.testdata.unlabeled_10_percent.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/KDDCup99/kddcup.testdata.unlabeled_10_percent.gz -------------------------------------------------------------------------------- /Datasets/NSL-KDD/KDDTrain+_20Percent_Description.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Datasets/NSL-KDD/KDDTrain+_20Percent_Description.xlsx -------------------------------------------------------------------------------- /IDSUsingSimpleDeepNeuralNetwork.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import the required libraries and the utility modules" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 12, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "from sklearn import metrics\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.preprocessing import LabelEncoder\n", 22 | "\n", 23 | "from tensorflow.keras.models import Sequential\n", 24 | "from tensorflow.keras.models import load_model\n", 25 | "from tensorflow.keras.layers import Dense, Activation, Dropout\n", 26 | "from tensorflow.keras.callbacks 
import EarlyStopping\n", 27 | "from tensorflow.keras.callbacks import ModelCheckpoint\n", 28 | "from tensorflow.keras.utils import plot_model\n", 29 | "from tensorflow.python.keras.utils.np_utils import to_categorical\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "\n", 33 | "#Custom libraries\n", 34 | "#Data formating library\n", 35 | "from dataloadinglibrary import loadCSV\n", 36 | "\n", 37 | "from datainspectionlibrary import getStatisticsOfData\n", 38 | "\n", 39 | "from dataformatinglibrary import createExcelFromArray\n", 40 | "\n", 41 | "from defineInputs import getLabelName\n", 42 | "from defineInputs import getPathToTrainingAndTestingDataSets\n", 43 | "from defineInputs import modelPerformanceReport\n", 44 | "from defineInputs import defineArrayForPreProcessing\n", 45 | "from defineInputs import getPathToGenerateModels\n", 46 | "\n", 47 | "from util import performPreprocessing" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Load the training dataset and check the statistics" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "scrolled": true 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "***** Start checking the statistics of the dataSet *****\n", 69 | "\n", 70 | "***** Shape (number of rows and columns) in the dataset: (25191, 42)\n", 71 | "***** Total number of features in the dataset: 41\n", 72 | "***** Number of categorical features in the dataset: 3\n", 73 | "***** Number of numerical features in the dataset: 38\n", 74 | "\n", 75 | "***** Names of categorical features in dataset *****\n", 76 | "\n", 77 | "| Categorical features in dataset |\n", 78 | "|-----------------------------------|\n", 79 | "| Protocol_type |\n", 80 | "| Service |\n", 81 | "| Flag |\n", 82 | "\n", 83 | "\n", 84 | "***** Names of numerical features in dataset *****\n", 85 | "\n", 86 | "| Numerical features in 
the dataset |\n", 87 | "|-------------------------------------|\n", 88 | "| Duration |\n", 89 | "| Src_bytes |\n", 90 | "| Dst_bytes |\n", 91 | "| Land |\n", 92 | "| Wrong_fragment |\n", 93 | "| Urgent |\n", 94 | "| Hot |\n", 95 | "| Num_failed_logins |\n", 96 | "| Logged_in |\n", 97 | "| Num_compromised |\n", 98 | "| Root_shell |\n", 99 | "| Su_attempted |\n", 100 | "| Num_root |\n", 101 | "| Num_file_creations |\n", 102 | "| Num_shells |\n", 103 | "| Num_access_files |\n", 104 | "| Num_outbound_cmds |\n", 105 | "| Is_hot_login |\n", 106 | "| Is_guest_login |\n", 107 | "| Count |\n", 108 | "| Srv_count |\n", 109 | "| Serror_rate |\n", 110 | "| Srv_serror_rate |\n", 111 | "| Rerror_rate |\n", 112 | "| Srv_rerror_rate |\n", 113 | "| Same_srv_rate |\n", 114 | "| Diff_srv_rate |\n", 115 | "| Srv_diff_host_rate |\n", 116 | "| Dst_host_count |\n", 117 | "| Dst_host_srv_count |\n", 118 | "| Dst_host_same_srv_rate |\n", 119 | "| Dst_host_diff_srv_rate |\n", 120 | "| Dst_host_same_src_port_rate |\n", 121 | "| Dst_host_srv_diff_host_rate |\n", 122 | "| Dst_host_serror_rate |\n", 123 | "| Dst_host_srv_serror_rate |\n", 124 | "| Dst_host_rerror_rate |\n", 125 | "| Dst_host_srv_rerror_rate |\n", 126 | "\n", 127 | "\n", 128 | "***** Are there any missing values in the data set: False\n", 129 | "Total number of records in the dataset: 25191\n", 130 | "Unique records in the dataset: 25191\n", 131 | "\n", 132 | "***** Are there any duplicate records in the data set: False\n", 133 | "\n", 134 | "****** Number of different values for label that are present in the dataset: 22\n", 135 | "\n", 136 | "****** Here is the list of unique label types present in the dataset ***** \n", 137 | "\n", 138 | "| Unique label types in the dataset |\n", 139 | "|-------------------------------------|\n", 140 | "| normal |\n", 141 | "| neptune |\n", 142 | "| warezclient |\n", 143 | "| ipsweep |\n", 144 | "| portsweep |\n", 145 | "| teardrop |\n", 146 | "| nmap |\n", 147 | "| satan |\n", 148 | "| smurf 
|\n", 149 | "| pod |\n", 150 | "| back |\n", 151 | "| guess_passwd |\n", 152 | "| ftp_write |\n", 153 | "| multihop |\n", 154 | "| rootkit |\n", 155 | "| buffer_overflow |\n", 156 | "| imap |\n", 157 | "| warezmaster |\n", 158 | "| phf |\n", 159 | "| land |\n", 160 | "| loadmodule |\n", 161 | "| spy |\n", 162 | "\n", 163 | "\n", 164 | "****** Here is the list of unique values present in each categorical feature in the dataset *****\n", 165 | "\n", 166 | "\n", 167 | "attack_type: 22 \n", 168 | "| distinct values |\n", 169 | "|-------------------|\n", 170 | "| normal |\n", 171 | "| neptune |\n", 172 | "| warezclient |\n", 173 | "| ipsweep |\n", 174 | "| portsweep |\n", 175 | "| teardrop |\n", 176 | "| nmap |\n", 177 | "| satan |\n", 178 | "| smurf |\n", 179 | "| pod |\n", 180 | "| back |\n", 181 | "| guess_passwd |\n", 182 | "| ftp_write |\n", 183 | "| multihop |\n", 184 | "| rootkit |\n", 185 | "| buffer_overflow |\n", 186 | "| imap |\n", 187 | "| warezmaster |\n", 188 | "| phf |\n", 189 | "| land |\n", 190 | "| loadmodule |\n", 191 | "| spy |\n", 192 | "\n", 193 | "\n", 194 | "Protocol_type: 3 \n", 195 | "| distinct values |\n", 196 | "|-------------------|\n", 197 | "| udp |\n", 198 | "| tcp |\n", 199 | "| icmp |\n", 200 | "\n", 201 | "\n", 202 | "Service: 66 \n", 203 | "| distinct values |\n", 204 | "|-------------------|\n", 205 | "| other |\n", 206 | "| private |\n", 207 | "| http |\n", 208 | "| remote_job |\n", 209 | "| ftp_data |\n", 210 | "| name |\n", 211 | "| netbios_ns |\n", 212 | "| eco_i |\n", 213 | "| mtp |\n", 214 | "| telnet |\n", 215 | "| finger |\n", 216 | "| domain_u |\n", 217 | "| supdup |\n", 218 | "| uucp_path |\n", 219 | "| Z39_50 |\n", 220 | "| smtp |\n", 221 | "| csnet_ns |\n", 222 | "| uucp |\n", 223 | "| netbios_dgm |\n", 224 | "| urp_i |\n", 225 | "| auth |\n", 226 | "| domain |\n", 227 | "| ftp |\n", 228 | "| bgp |\n", 229 | "| ldap |\n", 230 | "| ecr_i |\n", 231 | "| gopher |\n", 232 | "| vmnet |\n", 233 | "| systat |\n", 234 | "| 
http_443 |\n", 235 | "| efs |\n", 236 | "| whois |\n", 237 | "| imap4 |\n", 238 | "| iso_tsap |\n", 239 | "| echo |\n", 240 | "| klogin |\n", 241 | "| link |\n", 242 | "| sunrpc |\n", 243 | "| login |\n", 244 | "| kshell |\n", 245 | "| sql_net |\n", 246 | "| time |\n", 247 | "| hostnames |\n", 248 | "| exec |\n", 249 | "| ntp_u |\n", 250 | "| discard |\n", 251 | "| nntp |\n", 252 | "| courier |\n", 253 | "| ctf |\n", 254 | "| ssh |\n", 255 | "| daytime |\n", 256 | "| shell |\n", 257 | "| netstat |\n", 258 | "| pop_3 |\n", 259 | "| nnsp |\n", 260 | "| IRC |\n", 261 | "| pop_2 |\n", 262 | "| printer |\n", 263 | "| tim_i |\n", 264 | "| pm_dump |\n", 265 | "| red_i |\n", 266 | "| netbios_ssn |\n", 267 | "| rje |\n", 268 | "| X11 |\n", 269 | "| urh_i |\n", 270 | "| http_8001 |\n", 271 | "\n", 272 | "\n", 273 | "Flag: 11 \n", 274 | "| distinct values |\n", 275 | "|-------------------|\n", 276 | "| SF |\n", 277 | "| S0 |\n", 278 | "| REJ |\n", 279 | "| RSTR |\n", 280 | "| SH |\n", 281 | "| RSTO |\n", 282 | "| S1 |\n", 283 | "| RSTOS0 |\n", 284 | "| S3 |\n", 285 | "| S2 |\n", 286 | "| OTH |\n", 287 | "\n", 288 | "\n", 289 | "****** Label distribution in the dataset *****\n", 290 | "\n", 291 | "normal 13448\n", 292 | "neptune 8282\n", 293 | "ipsweep 710\n", 294 | "satan 691\n", 295 | "portsweep 587\n", 296 | "smurf 529\n", 297 | "nmap 301\n", 298 | "back 196\n", 299 | "teardrop 188\n", 300 | "warezclient 181\n", 301 | "pod 38\n", 302 | "guess_passwd 10\n", 303 | "warezmaster 7\n", 304 | "buffer_overflow 6\n", 305 | "imap 5\n", 306 | "rootkit 4\n", 307 | "multihop 2\n", 308 | "phf 2\n", 309 | "loadmodule 1\n", 310 | "ftp_write 1\n", 311 | "land 1\n", 312 | "spy 1\n", 313 | "Name: attack_type, dtype: int64\n", 314 | "\n", 315 | "\n", 316 | "***** End checking the statistics of the dataSet *****\n", 317 | "\n", 318 | "***** Here is how to training dataset looks like before performing any pre-processing *****\n" 319 | ] 320 | }, 321 | { 322 | "data": { 323 | "text/html": [ 324 
| "
\n", 325 | "\n", 338 | "\n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | "
DurationProtocol_typeServiceFlagSrc_bytesDst_bytesLandWrong_fragmentUrgentHot...Dst_host_srv_countDst_host_same_srv_rateDst_host_diff_srv_rateDst_host_same_src_port_rateDst_host_srv_diff_host_rateDst_host_serror_rateDst_host_srv_serror_rateDst_host_rerror_rateDst_host_srv_rerror_rateattack_type
00udpotherSF14600000...10.000.600.880.000.000.000.00.00normal
10tcpprivateS0000000...260.100.050.000.001.001.000.00.00neptune
20tcphttpSF23281530000...2551.000.000.030.040.030.010.00.01normal
30tcphttpSF1994200000...2551.000.000.000.000.000.000.00.00normal
40tcpprivateREJ000000...190.070.070.000.000.000.001.01.00neptune
\n", 488 | "

5 rows × 42 columns

\n", 489 | "
" 490 | ], 491 | "text/plain": [ 492 | " Duration Protocol_type Service Flag Src_bytes Dst_bytes Land \\\n", 493 | "0 0 udp other SF 146 0 0 \n", 494 | "1 0 tcp private S0 0 0 0 \n", 495 | "2 0 tcp http SF 232 8153 0 \n", 496 | "3 0 tcp http SF 199 420 0 \n", 497 | "4 0 tcp private REJ 0 0 0 \n", 498 | "\n", 499 | " Wrong_fragment Urgent Hot ... Dst_host_srv_count \\\n", 500 | "0 0 0 0 ... 1 \n", 501 | "1 0 0 0 ... 26 \n", 502 | "2 0 0 0 ... 255 \n", 503 | "3 0 0 0 ... 255 \n", 504 | "4 0 0 0 ... 19 \n", 505 | "\n", 506 | " Dst_host_same_srv_rate Dst_host_diff_srv_rate \\\n", 507 | "0 0.00 0.60 \n", 508 | "1 0.10 0.05 \n", 509 | "2 1.00 0.00 \n", 510 | "3 1.00 0.00 \n", 511 | "4 0.07 0.07 \n", 512 | "\n", 513 | " Dst_host_same_src_port_rate Dst_host_srv_diff_host_rate \\\n", 514 | "0 0.88 0.00 \n", 515 | "1 0.00 0.00 \n", 516 | "2 0.03 0.04 \n", 517 | "3 0.00 0.00 \n", 518 | "4 0.00 0.00 \n", 519 | "\n", 520 | " Dst_host_serror_rate Dst_host_srv_serror_rate Dst_host_rerror_rate \\\n", 521 | "0 0.00 0.00 0.0 \n", 522 | "1 1.00 1.00 0.0 \n", 523 | "2 0.03 0.01 0.0 \n", 524 | "3 0.00 0.00 0.0 \n", 525 | "4 0.00 0.00 1.0 \n", 526 | "\n", 527 | " Dst_host_srv_rerror_rate attack_type \n", 528 | "0 0.00 normal \n", 529 | "1 0.00 neptune \n", 530 | "2 0.01 normal \n", 531 | "3 0.00 normal \n", 532 | "4 1.00 neptune \n", 533 | "\n", 534 | "[5 rows x 42 columns]" 535 | ] 536 | }, 537 | "execution_count": 2, 538 | "metadata": {}, 539 | "output_type": "execute_result" 540 | } 541 | ], 542 | "source": [ 543 | "#Define file names and call loadCSV to load the CSV files\n", 544 | "trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()\n", 545 | "trainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)\n", 546 | "difficultyLevel = trainingDataSet.pop('difficulty_level')\n", 547 | "labelName = getLabelName()\n", 548 | "label = trainingDataSet[labelName]\n", 549 | "\n", 550 | "#Look at the statistics of the dataSet\n", 551 | 
"getStatisticsOfData(trainingDataSet)\n", 552 | "print(\"\\n***** Here is how to training dataset looks like before performing any pre-processing *****\")\n", 553 | "trainingDataSet.head()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "### Load the testing dataset and check the statistics" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 3, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "***** Start checking the statistics of the dataSet *****\n", 573 | "\n", 574 | "***** Shape (number of rows and columns) in the dataset: (11850, 42)\n", 575 | "***** Total number of features in the dataset: 41\n", 576 | "***** Number of categorical features in the dataset: 3\n", 577 | "***** Number of numerical features in the dataset: 38\n", 578 | "\n", 579 | "***** Names of categorical features in dataset *****\n", 580 | "\n", 581 | "| Categorical features in dataset |\n", 582 | "|-----------------------------------|\n", 583 | "| Protocol_type |\n", 584 | "| Service |\n", 585 | "| Flag |\n", 586 | "\n", 587 | "\n", 588 | "***** Names of numerical features in dataset *****\n", 589 | "\n", 590 | "| Numerical features in the dataset |\n", 591 | "|-------------------------------------|\n", 592 | "| Duration |\n", 593 | "| Src_bytes |\n", 594 | "| Dst_bytes |\n", 595 | "| Land |\n", 596 | "| Wrong_fragment |\n", 597 | "| Urgent |\n", 598 | "| Hot |\n", 599 | "| Num_failed_logins |\n", 600 | "| Logged_in |\n", 601 | "| Num_compromised |\n", 602 | "| Root_shell |\n", 603 | "| Su_attempted |\n", 604 | "| Num_root |\n", 605 | "| Num_file_creations |\n", 606 | "| Num_shells |\n", 607 | "| Num_access_files |\n", 608 | "| Num_outbound_cmds |\n", 609 | "| Is_hot_login |\n", 610 | "| Is_guest_login |\n", 611 | "| Count |\n", 612 | "| Srv_count |\n", 613 | "| Serror_rate |\n", 614 | "| Srv_serror_rate |\n", 615 | "| Rerror_rate |\n", 
616 | "| Srv_rerror_rate |\n", 617 | "| Same_srv_rate |\n", 618 | "| Diff_srv_rate |\n", 619 | "| Srv_diff_host_rate |\n", 620 | "| Dst_host_count |\n", 621 | "| Dst_host_srv_count |\n", 622 | "| Dst_host_same_srv_rate |\n", 623 | "| Dst_host_diff_srv_rate |\n", 624 | "| Dst_host_same_src_port_rate |\n", 625 | "| Dst_host_srv_diff_host_rate |\n", 626 | "| Dst_host_serror_rate |\n", 627 | "| Dst_host_srv_serror_rate |\n", 628 | "| Dst_host_rerror_rate |\n", 629 | "| Dst_host_srv_rerror_rate |\n", 630 | "\n", 631 | "\n", 632 | "***** Are there any missing values in the data set: False\n", 633 | "Total number of records in the dataset: 11850\n", 634 | "Unique records in the dataset: 11850\n", 635 | "\n", 636 | "***** Are there any duplicate records in the data set: False\n", 637 | "\n", 638 | "****** Number of different values for label that are present in the dataset: 38\n", 639 | "\n", 640 | "****** Here is the list of unique label types present in the dataset ***** \n", 641 | "\n", 642 | "| Unique label types in the dataset |\n", 643 | "|-------------------------------------|\n", 644 | "| guess_passwd |\n", 645 | "| snmpguess |\n", 646 | "| processtable |\n", 647 | "| normal |\n", 648 | "| nmap |\n", 649 | "| back |\n", 650 | "| neptune |\n", 651 | "| satan |\n", 652 | "| saint |\n", 653 | "| mscan |\n", 654 | "| apache2 |\n", 655 | "| httptunnel |\n", 656 | "| warezmaster |\n", 657 | "| ipsweep |\n", 658 | "| smurf |\n", 659 | "| mailbomb |\n", 660 | "| teardrop |\n", 661 | "| portsweep |\n", 662 | "| snmpgetattack |\n", 663 | "| multihop |\n", 664 | "| worm |\n", 665 | "| land |\n", 666 | "| sendmail |\n", 667 | "| buffer_overflow |\n", 668 | "| pod |\n", 669 | "| rootkit |\n", 670 | "| xlock |\n", 671 | "| xterm |\n", 672 | "| xsnoop |\n", 673 | "| ps |\n", 674 | "| named |\n", 675 | "| ftp_write |\n", 676 | "| loadmodule |\n", 677 | "| phf |\n", 678 | "| udpstorm |\n", 679 | "| perl |\n", 680 | "| sqlattack |\n", 681 | "| imap |\n", 682 | "\n", 683 | "\n", 684 
| "****** Here is the list of unique values present in each categorical feature in the dataset *****\n", 685 | "\n", 686 | "\n", 687 | "attack_type: 38 \n", 688 | "| distinct values |\n", 689 | "|-------------------|\n", 690 | "| guess_passwd |\n", 691 | "| snmpguess |\n", 692 | "| processtable |\n", 693 | "| normal |\n", 694 | "| nmap |\n", 695 | "| back |\n", 696 | "| neptune |\n", 697 | "| satan |\n", 698 | "| saint |\n", 699 | "| mscan |\n", 700 | "| apache2 |\n", 701 | "| httptunnel |\n", 702 | "| warezmaster |\n", 703 | "| ipsweep |\n", 704 | "| smurf |\n", 705 | "| mailbomb |\n", 706 | "| teardrop |\n", 707 | "| portsweep |\n", 708 | "| snmpgetattack |\n", 709 | "| multihop |\n", 710 | "| worm |\n", 711 | "| land |\n", 712 | "| sendmail |\n", 713 | "| buffer_overflow |\n", 714 | "| pod |\n", 715 | "| rootkit |\n", 716 | "| xlock |\n", 717 | "| xterm |\n", 718 | "| xsnoop |\n", 719 | "| ps |\n", 720 | "| named |\n", 721 | "| ftp_write |\n", 722 | "| loadmodule |\n", 723 | "| phf |\n", 724 | "| udpstorm |\n", 725 | "| perl |\n", 726 | "| sqlattack |\n", 727 | "| imap |\n", 728 | "\n", 729 | "\n", 730 | "Protocol_type: 3 \n", 731 | "| distinct values |\n", 732 | "|-------------------|\n", 733 | "| tcp |\n", 734 | "| udp |\n", 735 | "| icmp |\n", 736 | "\n", 737 | "\n", 738 | "Service: 62 \n", 739 | "| distinct values |\n", 740 | "|-------------------|\n", 741 | "| telnet |\n", 742 | "| private |\n", 743 | "| http |\n", 744 | "| imap4 |\n", 745 | "| ftp_data |\n", 746 | "| other |\n", 747 | "| ctf |\n", 748 | "| pop_3 |\n", 749 | "| ftp |\n", 750 | "| domain_u |\n", 751 | "| domain |\n", 752 | "| eco_i |\n", 753 | "| ecr_i |\n", 754 | "| finger |\n", 755 | "| name |\n", 756 | "| smtp |\n", 757 | "| vmnet |\n", 758 | "| mtp |\n", 759 | "| bgp |\n", 760 | "| exec |\n", 761 | "| sunrpc |\n", 762 | "| uucp_path |\n", 763 | "| iso_tsap |\n", 764 | "| echo |\n", 765 | "| auth |\n", 766 | "| hostnames |\n", 767 | "| courier |\n", 768 | "| uucp |\n", 769 | "| daytime 
|\n", 770 | "| nntp |\n", 771 | "| netstat |\n", 772 | "| urp_i |\n", 773 | "| http_443 |\n", 774 | "| csnet_ns |\n", 775 | "| login |\n", 776 | "| klogin |\n", 777 | "| whois |\n", 778 | "| time |\n", 779 | "| link |\n", 780 | "| discard |\n", 781 | "| gopher |\n", 782 | "| supdup |\n", 783 | "| netbios_ns |\n", 784 | "| systat |\n", 785 | "| netbios_dgm |\n", 786 | "| kshell |\n", 787 | "| efs |\n", 788 | "| nnsp |\n", 789 | "| ssh |\n", 790 | "| netbios_ssn |\n", 791 | "| Z39_50 |\n", 792 | "| IRC |\n", 793 | "| ntp_u |\n", 794 | "| X11 |\n", 795 | "| pm_dump |\n", 796 | "| ldap |\n", 797 | "| remote_job |\n", 798 | "| sql_net |\n", 799 | "| shell |\n", 800 | "| tim_i |\n", 801 | "| pop_2 |\n", 802 | "| tftp_u |\n", 803 | "\n", 804 | "\n", 805 | "Flag: 11 \n", 806 | "| distinct values |\n", 807 | "|-------------------|\n", 808 | "| SF |\n", 809 | "| S3 |\n", 810 | "| SH |\n", 811 | "| REJ |\n", 812 | "| S0 |\n", 813 | "| RSTO |\n", 814 | "| RSTR |\n", 815 | "| RSTOS0 |\n", 816 | "| S1 |\n", 817 | "| S2 |\n", 818 | "| OTH |\n", 819 | "\n", 820 | "\n", 821 | "****** Label distribution in the dataset *****\n", 822 | "\n", 823 | "normal 2152\n", 824 | "neptune 1579\n", 825 | "guess_passwd 1231\n", 826 | "mscan 996\n", 827 | "warezmaster 944\n", 828 | "apache2 737\n", 829 | "satan 727\n", 830 | "processtable 685\n", 831 | "smurf 627\n", 832 | "back 359\n", 833 | "snmpguess 331\n", 834 | "saint 309\n", 835 | "mailbomb 293\n", 836 | "snmpgetattack 178\n", 837 | "portsweep 156\n", 838 | "ipsweep 141\n", 839 | "httptunnel 133\n", 840 | "nmap 73\n", 841 | "pod 41\n", 842 | "buffer_overflow 20\n", 843 | "multihop 18\n", 844 | "named 17\n", 845 | "ps 15\n", 846 | "sendmail 14\n", 847 | "rootkit 13\n", 848 | "xterm 13\n", 849 | "teardrop 12\n", 850 | "xlock 9\n", 851 | "land 7\n", 852 | "xsnoop 4\n", 853 | "ftp_write 3\n", 854 | "worm 2\n", 855 | "perl 2\n", 856 | "phf 2\n", 857 | "loadmodule 2\n", 858 | "sqlattack 2\n", 859 | "udpstorm 2\n", 860 | "imap 1\n", 861 | "Name: 
attack_type, dtype: int64\n", 862 | "\n", 863 | "\n", 864 | "***** End checking the statistics of the dataSet *****\n", 865 | "\n", 866 | "***** Here is how to testing dataset looks like before performing any pre-processing *****\n" 867 | ] 868 | }, 869 | { 870 | "data": { 871 | "text/html": [ 872 | "
\n", 873 | "\n", 886 | "\n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | "
DurationProtocol_typeServiceFlagSrc_bytesDst_bytesLandWrong_fragmentUrgentHot...Dst_host_srv_countDst_host_same_srv_rateDst_host_diff_srv_rateDst_host_same_src_port_rateDst_host_srv_diff_host_rateDst_host_serror_rateDst_host_srv_serror_rateDst_host_rerror_rateDst_host_srv_rerror_rateattack_type
013tcptelnetSF11824250000...100.380.120.040.00.000.000.120.3guess_passwd
10udpprivateSF4400000...2541.000.010.010.00.000.000.000.0snmpguess
20tcptelnetS30440000...790.310.610.000.00.210.680.600.0processtable
30udpprivateSF53550000...2551.000.000.870.00.000.000.000.0normal
40tcpprivateSH000000...10.061.001.000.01.001.000.000.0nmap
\n", 1036 | "

5 rows × 42 columns

\n", 1037 | "
" 1038 | ], 1039 | "text/plain": [ 1040 | " Duration Protocol_type Service Flag Src_bytes Dst_bytes Land \\\n", 1041 | "0 13 tcp telnet SF 118 2425 0 \n", 1042 | "1 0 udp private SF 44 0 0 \n", 1043 | "2 0 tcp telnet S3 0 44 0 \n", 1044 | "3 0 udp private SF 53 55 0 \n", 1045 | "4 0 tcp private SH 0 0 0 \n", 1046 | "\n", 1047 | " Wrong_fragment Urgent Hot ... Dst_host_srv_count \\\n", 1048 | "0 0 0 0 ... 10 \n", 1049 | "1 0 0 0 ... 254 \n", 1050 | "2 0 0 0 ... 79 \n", 1051 | "3 0 0 0 ... 255 \n", 1052 | "4 0 0 0 ... 1 \n", 1053 | "\n", 1054 | " Dst_host_same_srv_rate Dst_host_diff_srv_rate \\\n", 1055 | "0 0.38 0.12 \n", 1056 | "1 1.00 0.01 \n", 1057 | "2 0.31 0.61 \n", 1058 | "3 1.00 0.00 \n", 1059 | "4 0.06 1.00 \n", 1060 | "\n", 1061 | " Dst_host_same_src_port_rate Dst_host_srv_diff_host_rate \\\n", 1062 | "0 0.04 0.0 \n", 1063 | "1 0.01 0.0 \n", 1064 | "2 0.00 0.0 \n", 1065 | "3 0.87 0.0 \n", 1066 | "4 1.00 0.0 \n", 1067 | "\n", 1068 | " Dst_host_serror_rate Dst_host_srv_serror_rate Dst_host_rerror_rate \\\n", 1069 | "0 0.00 0.00 0.12 \n", 1070 | "1 0.00 0.00 0.00 \n", 1071 | "2 0.21 0.68 0.60 \n", 1072 | "3 0.00 0.00 0.00 \n", 1073 | "4 1.00 1.00 0.00 \n", 1074 | "\n", 1075 | " Dst_host_srv_rerror_rate attack_type \n", 1076 | "0 0.3 guess_passwd \n", 1077 | "1 0.0 snmpguess \n", 1078 | "2 0.0 processtable \n", 1079 | "3 0.0 normal \n", 1080 | "4 0.0 nmap \n", 1081 | "\n", 1082 | "[5 rows x 42 columns]" 1083 | ] 1084 | }, 1085 | "execution_count": 3, 1086 | "metadata": {}, 1087 | "output_type": "execute_result" 1088 | } 1089 | ], 1090 | "source": [ 1091 | "#Define file names and call loadCSV to load the CSV files\n", 1092 | "testingDataSet = loadCSV(testingFileNameWithAbsolutePath)\n", 1093 | "difficultyLevel = testingDataSet.pop('difficulty_level')\n", 1094 | "\n", 1095 | "#Look at the statistics of the dataSet\n", 1096 | "getStatisticsOfData(testingDataSet)\n", 1097 | "print(\"\\n***** Here is how to testing dataset looks like before performing any 
pre-processing *****\")\n", 1098 | "testingDataSet.head()" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "markdown", 1103 | "metadata": {}, 1104 | "source": [ 1105 | "### Perform pre-processing" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": 4, 1111 | "metadata": { 1112 | "scrolled": true 1113 | }, 1114 | "outputs": [ 1115 | { 1116 | "name": "stdout", 1117 | "output_type": "stream", 1118 | "text": [ 1119 | "[['ExtraTreesClassifier', 'OneHotEncoder', 'Standardization']]\n", 1120 | "***************************************************************************************************************************\n", 1121 | "********************************************* Building Model- 0 As Below *************************************************\n", 1122 | "\t -- Feature Selection: \t ExtraTreesClassifier \n", 1123 | "\t -- Feature Encoding: \t OneHotEncoder \n", 1124 | "\t -- Feature Scaling: \t Standardization \n", 1125 | "\n", 1126 | "completeDataSet.shape: (37041, 43)\n", 1127 | "completeDataSet.head: Duration Protocol_type Service Flag Src_bytes Dst_bytes Land \\\n", 1128 | "0 0 udp other SF 146 0 0 \n", 1129 | "1 0 tcp private S0 0 0 0 \n", 1130 | "2 0 tcp http SF 232 8153 0 \n", 1131 | "3 0 tcp http SF 199 420 0 \n", 1132 | "4 0 tcp private REJ 0 0 0 \n", 1133 | "\n", 1134 | " Wrong_fragment Urgent Hot ... Dst_host_same_srv_rate \\\n", 1135 | "0 0 0 0 ... 0.00 \n", 1136 | "1 0 0 0 ... 0.10 \n", 1137 | "2 0 0 0 ... 1.00 \n", 1138 | "3 0 0 0 ... 1.00 \n", 1139 | "4 0 0 0 ... 
0.07 \n", 1140 | "\n", 1141 | " Dst_host_diff_srv_rate Dst_host_same_src_port_rate \\\n", 1142 | "0 0.60 0.88 \n", 1143 | "1 0.05 0.00 \n", 1144 | "2 0.00 0.03 \n", 1145 | "3 0.00 0.00 \n", 1146 | "4 0.07 0.00 \n", 1147 | "\n", 1148 | " Dst_host_srv_diff_host_rate Dst_host_serror_rate \\\n", 1149 | "0 0.00 0.00 \n", 1150 | "1 0.00 1.00 \n", 1151 | "2 0.04 0.03 \n", 1152 | "3 0.00 0.00 \n", 1153 | "4 0.00 0.00 \n", 1154 | "\n", 1155 | " Dst_host_srv_serror_rate Dst_host_rerror_rate Dst_host_srv_rerror_rate \\\n", 1156 | "0 0.00 0.0 0.00 \n", 1157 | "1 1.00 0.0 0.00 \n", 1158 | "2 0.01 0.0 0.01 \n", 1159 | "3 0.00 0.0 0.00 \n", 1160 | "4 0.00 1.0 1.00 \n", 1161 | "\n", 1162 | " attack_type difficulty_level \n", 1163 | "0 normal 15 \n", 1164 | "1 neptune 19 \n", 1165 | "2 normal 21 \n", 1166 | "3 normal 21 \n", 1167 | "4 neptune 21 \n", 1168 | "\n", 1169 | "[5 rows x 43 columns]\n", 1170 | "\n", 1171 | "****** Start performing feature selection using ExtraTreesClassifier *****\n", 1172 | "****** Falls under wrapper methods (feature importance) *****\n", 1173 | "****** Start label encoding on the categorical features in the given dataset *****\n", 1174 | "****** Number of features before label encoding: 43\n", 1175 | "****** Number of categorical features in the dataset: 3\n", 1176 | "****** Categorical feature names in the dataset: ['Protocol_type' 'Service' 'Flag']\n", 1177 | "\n", 1178 | "****** Here is the list of unique values present in each categorical feature in the dataset *****\n", 1179 | "\n", 1180 | "\n", 1181 | "Protocol_type: 3 \n", 1182 | "| distinct values |\n", 1183 | "|-------------------|\n", 1184 | "| udp |\n", 1185 | "| tcp |\n", 1186 | "| icmp |\n", 1187 | "\n", 1188 | "\n", 1189 | "Service: 67 \n", 1190 | "| distinct values |\n", 1191 | "|-------------------|\n", 1192 | "| other |\n", 1193 | "| private |\n", 1194 | "| http |\n", 1195 | "| remote_job |\n", 1196 | "| ftp_data |\n", 1197 | "| name |\n", 1198 | "| netbios_ns |\n", 1199 | "| eco_i 
|\n", 1200 | "| mtp |\n", 1201 | "| telnet |\n", 1202 | "| finger |\n", 1203 | "| domain_u |\n", 1204 | "| supdup |\n", 1205 | "| uucp_path |\n", 1206 | "| Z39_50 |\n", 1207 | "| smtp |\n", 1208 | "| csnet_ns |\n", 1209 | "| uucp |\n", 1210 | "| netbios_dgm |\n", 1211 | "| urp_i |\n", 1212 | "| auth |\n", 1213 | "| domain |\n", 1214 | "| ftp |\n", 1215 | "| bgp |\n", 1216 | "| ldap |\n", 1217 | "| ecr_i |\n", 1218 | "| gopher |\n", 1219 | "| vmnet |\n", 1220 | "| systat |\n", 1221 | "| http_443 |\n", 1222 | "| efs |\n", 1223 | "| whois |\n", 1224 | "| imap4 |\n", 1225 | "| iso_tsap |\n", 1226 | "| echo |\n", 1227 | "| klogin |\n", 1228 | "| link |\n", 1229 | "| sunrpc |\n", 1230 | "| login |\n", 1231 | "| kshell |\n", 1232 | "| sql_net |\n", 1233 | "| time |\n", 1234 | "| hostnames |\n", 1235 | "| exec |\n", 1236 | "| ntp_u |\n", 1237 | "| discard |\n", 1238 | "| nntp |\n", 1239 | "| courier |\n", 1240 | "| ctf |\n", 1241 | "| ssh |\n", 1242 | "| daytime |\n", 1243 | "| shell |\n", 1244 | "| netstat |\n", 1245 | "| pop_3 |\n", 1246 | "| nnsp |\n", 1247 | "| IRC |\n", 1248 | "| pop_2 |\n", 1249 | "| printer |\n", 1250 | "| tim_i |\n", 1251 | "| pm_dump |\n", 1252 | "| red_i |\n", 1253 | "| netbios_ssn |\n", 1254 | "| rje |\n", 1255 | "| X11 |\n", 1256 | "| urh_i |\n", 1257 | "| http_8001 |\n", 1258 | "| tftp_u |\n", 1259 | "\n", 1260 | "\n", 1261 | "Flag: 11 \n", 1262 | "| distinct values |\n", 1263 | "|-------------------|\n", 1264 | "| SF |\n", 1265 | "| S0 |\n", 1266 | "| REJ |\n", 1267 | "| RSTR |\n", 1268 | "| SH |\n", 1269 | "| RSTO |\n", 1270 | "| S1 |\n", 1271 | "| RSTOS0 |\n", 1272 | "| S3 |\n", 1273 | "| S2 |\n", 1274 | "| OTH |\n", 1275 | "\n", 1276 | "****** Number of features after label encoding: 43\n", 1277 | "****** End label encoding on the categorical features in the given dataset *****\n", 1278 | "\n", 1279 | "****** ExtraTreesClassification is in progress *****\n", 1280 | "\n", 1281 | " selectedFeatures after ExtraTreesClassification: 
difficulty_level 0.076128\n", 1282 | "Same_srv_rate 0.071428\n", 1283 | "Dst_host_srv_serror_rate 0.049446\n", 1284 | "Service 0.046810\n", 1285 | "Dst_host_serror_rate 0.046286\n", 1286 | "Flag 0.044061\n", 1287 | "Dst_host_same_srv_rate 0.043586\n", 1288 | "Serror_rate 0.042794\n", 1289 | "Protocol_type 0.041901\n", 1290 | "Dst_host_srv_count 0.041828\n", 1291 | "Srv_serror_rate 0.040107\n", 1292 | "Dst_host_same_src_port_rate 0.037406\n", 1293 | "Count 0.036696\n", 1294 | "Logged_in 0.035569\n", 1295 | "Dst_host_rerror_rate 0.030801\n", 1296 | "Dst_host_diff_srv_rate 0.029853\n", 1297 | "Src_bytes 0.028388\n", 1298 | "Diff_srv_rate 0.027244\n", 1299 | "Dst_host_count 0.027063\n", 1300 | "Rerror_rate 0.024310\n", 1301 | "dtype: float64\n", 1302 | "****** Completed ExtraTreesClassification *****\n", 1303 | "\n", 1304 | "***** Number of columns in the dataSet after feature selection: 21\n", 1305 | "***** Columns in the dataSet after feature selection: \n", 1306 | " Index(['Protocol_type', 'Service', 'Flag', 'Src_bytes', 'Logged_in', 'Count',\n", 1307 | " 'Serror_rate', 'Srv_serror_rate', 'Rerror_rate', 'Same_srv_rate',\n", 1308 | " 'Diff_srv_rate', 'Dst_host_count', 'Dst_host_srv_count',\n", 1309 | " 'Dst_host_same_srv_rate', 'Dst_host_diff_srv_rate',\n", 1310 | " 'Dst_host_same_src_port_rate', 'Dst_host_serror_rate',\n", 1311 | " 'Dst_host_srv_serror_rate', 'Dst_host_rerror_rate', 'difficulty_level',\n", 1312 | " 'attack_type'],\n", 1313 | " dtype='object')\n", 1314 | "****** End performing feature selection using ExtraTreesClassifier *****\n", 1315 | "****** Start one hot encoding on the categorical features in the given dataset *****\n", 1316 | "****** Number of features before one hot encoding: 21\n", 1317 | "****** Number of categorical features in the dataset: 0\n", 1318 | "****** Categorical feature names in the dataset: []\n", 1319 | "\n", 1320 | "****** Here is the list of unique values present in each categorical feature in the dataset *****\n", 1321 | 
"\n", 1322 | "\n", 1323 | "attack_type: 40 \n", 1324 | "| distinct values |\n", 1325 | "|-------------------|\n", 1326 | "| normal |\n", 1327 | "| neptune |\n", 1328 | "| warezclient |\n", 1329 | "| ipsweep |\n", 1330 | "| portsweep |\n", 1331 | "| teardrop |\n", 1332 | "| nmap |\n", 1333 | "| satan |\n", 1334 | "| smurf |\n", 1335 | "| pod |\n", 1336 | "| back |\n", 1337 | "| guess_passwd |\n", 1338 | "| ftp_write |\n", 1339 | "| multihop |\n", 1340 | "| rootkit |\n", 1341 | "| buffer_overflow |\n", 1342 | "| imap |\n", 1343 | "| warezmaster |\n", 1344 | "| phf |\n", 1345 | "| land |\n", 1346 | "| loadmodule |\n", 1347 | "| spy |\n", 1348 | "| snmpguess |\n", 1349 | "| processtable |\n", 1350 | "| saint |\n", 1351 | "| mscan |\n", 1352 | "| apache2 |\n", 1353 | "| httptunnel |\n", 1354 | "| mailbomb |\n", 1355 | "| snmpgetattack |\n", 1356 | "| worm |\n", 1357 | "| sendmail |\n", 1358 | "| xlock |\n", 1359 | "| xterm |\n", 1360 | "| xsnoop |\n", 1361 | "| ps |\n", 1362 | "| named |\n", 1363 | "| udpstorm |\n", 1364 | "| perl |\n", 1365 | "| sqlattack |\n", 1366 | "\n", 1367 | "****** Number of features after one hot encoding: 21\n", 1368 | "****** End one hot encoding on the categorical features in the given dataset *****\n", 1369 | "\n", 1370 | "****** Start feature scaling of the features present in the dataset using StandardScalar *****\n", 1371 | "[[2 41 9 ... 0.0 15 'normal']\n", 1372 | " [1 46 5 ... 0.0 19 'neptune']\n", 1373 | " [1 22 9 ... 0.0 21 'normal']\n", 1374 | " ...\n", 1375 | " [1 57 2 ... 0.85 13 'mscan']\n", 1376 | " [1 54 1 ... 0.88 15 'mscan']\n", 1377 | " [2 46 9 ... 0.0 17 'snmpguess']]\n", 1378 | "\n", 1379 | "****** Number of features in the dataset before performing scaling: 20\n", 1380 | "\n", 1381 | "****** Features in the dataset before performing scaling ***** \n", 1382 | " [[2 41 9 ... 0.0 0.0 15]\n", 1383 | " [1 46 5 ... 1.0 0.0 19]\n", 1384 | " [1 22 9 ... 0.01 0.0 21]\n", 1385 | " ...\n", 1386 | " [1 57 2 ... 
0.08 0.85 13]\n", 1387 | " [1 54 1 ... 0.0 0.88 15]\n", 1388 | " [2 46 9 ... 0.0 0.0 17]]\n", 1389 | "\n", 1390 | "****** Number of features in the dataset after performing scaling: 20\n", 1391 | "\n", 1392 | "****** Features in the dataset after performing scaling ***** \n", 1393 | " [[ 2.03857058 0.6299765 0.73536923 ... -0.54981386 -0.48776502\n", 1394 | " -0.85380057]\n", 1395 | " [-0.15478617 0.93890397 -0.66099165 ... 1.89967409 -0.48776502\n", 1396 | " 0.22813874]\n", 1397 | " [-0.15478617 -0.54394786 0.73536923 ... -0.52531898 -0.48776502\n", 1398 | " 0.76910839]\n", 1399 | " ...\n", 1400 | " [-0.15478617 1.61854439 -1.70826232 ... -0.35385482 2.03171007\n", 1401 | " -1.39477022]\n", 1402 | " [-0.15478617 1.43318791 -2.05735254 ... -0.54981386 2.12063272\n", 1403 | " -0.85380057]\n", 1404 | " [ 2.03857058 0.93890397 0.73536923 ... -0.54981386 -0.48776502\n", 1405 | " -0.31283092]]\n", 1406 | "scaledFeatures.head(): Protocol_type Service Flag Src_bytes Logged_in Count \\\n", 1407 | "0 2.038571 0.629977 0.735369 -0.011190 -0.732914 -0.581217 \n", 1408 | "1 -0.154786 0.938904 -0.660992 -0.011262 -0.732914 0.275339 \n", 1409 | "2 -0.154786 -0.543948 0.735369 -0.011147 1.364417 -0.643512 \n", 1410 | "3 -0.154786 -0.543948 0.735369 -0.011163 1.364417 -0.448840 \n", 1411 | "4 -0.154786 0.938904 -2.057353 -0.011262 -0.732914 0.259766 \n", 1412 | "\n", 1413 | " Serror_rate Srv_serror_rate Rerror_rate Same_srv_rate ... \\\n", 1414 | "0 -0.556584 -0.552030 -0.460806 -1.421427 ... \n", 1415 | "1 1.851192 1.851769 -0.460806 -1.491319 ... \n", 1416 | "2 -0.075029 -0.071270 -0.460806 0.721924 ... \n", 1417 | "3 -0.556584 -0.552030 -0.460806 0.721924 ... \n", 1418 | "4 -0.556584 -0.552030 2.274941 -1.235049 ... 
\n", 1419 | "\n", 1420 | " Dst_host_count Dst_host_srv_count Dst_host_same_srv_rate \\\n", 1421 | "0 0.656445 -1.050270 -1.193023 \n", 1422 | "1 0.656445 -0.821669 -0.966271 \n", 1423 | "2 -1.709884 1.272317 1.074493 \n", 1424 | "3 0.656445 1.272317 1.074493 \n", 1425 | "4 0.656445 -0.885678 -1.034297 \n", 1426 | "\n", 1427 | " Dst_host_diff_srv_rate Dst_host_same_src_port_rate Dst_host_serror_rate \\\n", 1428 | "0 2.187298 2.137976 -0.561390 \n", 1429 | "1 -0.237144 -0.498320 1.904034 \n", 1430 | "2 -0.457548 -0.408446 -0.487427 \n", 1431 | "3 -0.457548 -0.498320 -0.561390 \n", 1432 | "4 -0.148983 -0.498320 -0.561390 \n", 1433 | "\n", 1434 | " Dst_host_srv_serror_rate Dst_host_rerror_rate difficulty_level \\\n", 1435 | "0 -0.549814 -0.487765 -0.853801 \n", 1436 | "1 1.899674 -0.487765 0.228139 \n", 1437 | "2 -0.525319 -0.487765 0.769108 \n", 1438 | "3 -0.549814 -0.487765 0.769108 \n", 1439 | "4 -0.549814 2.476323 0.769108 \n", 1440 | "\n", 1441 | " attack_type \n", 1442 | "0 normal \n", 1443 | "1 neptune \n", 1444 | "2 normal \n", 1445 | "3 normal \n", 1446 | "4 neptune \n", 1447 | "\n", 1448 | "[5 rows x 21 columns]\n", 1449 | "scaledFeatures.shape: (37041, 21)\n", 1450 | "\n", 1451 | "****** End of feature scaling of the features present in the dataset using StandardScalar *****\n", 1452 | "\n", 1453 | "features.shape: (37041, 20)\n", 1454 | "label.shape: (37041,)\n" 1455 | ] 1456 | }, 1457 | { 1458 | "data": { 1459 | "text/html": [ 1460 | "
\n", 1461 | "\n", 1474 | "\n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " 
\n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | "
Protocol_typeServiceFlagSrc_bytesLogged_inCountSerror_rateSrv_serror_rateRerror_rateSame_srv_rate...Dst_host_countDst_host_srv_countDst_host_same_srv_rateDst_host_diff_srv_rateDst_host_same_src_port_rateDst_host_serror_rateDst_host_srv_serror_rateDst_host_rerror_ratedifficulty_levelattack_type
02.0385710.6299770.735369-0.011190-0.732914-0.581217-0.556584-0.552030-0.460806-1.421427...0.656445-1.050270-1.1930232.1872982.137976-0.561390-0.549814-0.487765-0.853801normal
1-0.1547860.938904-0.660992-0.011262-0.7329140.2753391.8511921.851769-0.460806-1.491319...0.656445-0.821669-0.966271-0.237144-0.4983201.9040341.899674-0.4877650.228139neptune
2-0.154786-0.5439480.735369-0.0111471.364417-0.643512-0.075029-0.071270-0.4608060.721924...-1.7098841.2723171.074493-0.457548-0.408446-0.487427-0.525319-0.4877650.769108normal
3-0.154786-0.5439480.735369-0.0111631.364417-0.448840-0.556584-0.552030-0.4608060.721924...0.6564451.2723171.074493-0.457548-0.498320-0.561390-0.549814-0.4877650.769108normal
4-0.1547860.938904-2.057353-0.011262-0.7329140.259766-0.556584-0.5520302.274941-1.235049...0.656445-0.885678-1.034297-0.148983-0.498320-0.561390-0.5498142.4763230.769108neptune
\n", 1624 | "

5 rows × 21 columns

\n", 1625 | "
" 1626 | ], 1627 | "text/plain": [ 1628 | " Protocol_type Service Flag Src_bytes Logged_in Count \\\n", 1629 | "0 2.038571 0.629977 0.735369 -0.011190 -0.732914 -0.581217 \n", 1630 | "1 -0.154786 0.938904 -0.660992 -0.011262 -0.732914 0.275339 \n", 1631 | "2 -0.154786 -0.543948 0.735369 -0.011147 1.364417 -0.643512 \n", 1632 | "3 -0.154786 -0.543948 0.735369 -0.011163 1.364417 -0.448840 \n", 1633 | "4 -0.154786 0.938904 -2.057353 -0.011262 -0.732914 0.259766 \n", 1634 | "\n", 1635 | " Serror_rate Srv_serror_rate Rerror_rate Same_srv_rate ... \\\n", 1636 | "0 -0.556584 -0.552030 -0.460806 -1.421427 ... \n", 1637 | "1 1.851192 1.851769 -0.460806 -1.491319 ... \n", 1638 | "2 -0.075029 -0.071270 -0.460806 0.721924 ... \n", 1639 | "3 -0.556584 -0.552030 -0.460806 0.721924 ... \n", 1640 | "4 -0.556584 -0.552030 2.274941 -1.235049 ... \n", 1641 | "\n", 1642 | " Dst_host_count Dst_host_srv_count Dst_host_same_srv_rate \\\n", 1643 | "0 0.656445 -1.050270 -1.193023 \n", 1644 | "1 0.656445 -0.821669 -0.966271 \n", 1645 | "2 -1.709884 1.272317 1.074493 \n", 1646 | "3 0.656445 1.272317 1.074493 \n", 1647 | "4 0.656445 -0.885678 -1.034297 \n", 1648 | "\n", 1649 | " Dst_host_diff_srv_rate Dst_host_same_src_port_rate Dst_host_serror_rate \\\n", 1650 | "0 2.187298 2.137976 -0.561390 \n", 1651 | "1 -0.237144 -0.498320 1.904034 \n", 1652 | "2 -0.457548 -0.408446 -0.487427 \n", 1653 | "3 -0.457548 -0.498320 -0.561390 \n", 1654 | "4 -0.148983 -0.498320 -0.561390 \n", 1655 | "\n", 1656 | " Dst_host_srv_serror_rate Dst_host_rerror_rate difficulty_level \\\n", 1657 | "0 -0.549814 -0.487765 -0.853801 \n", 1658 | "1 1.899674 -0.487765 0.228139 \n", 1659 | "2 -0.525319 -0.487765 0.769108 \n", 1660 | "3 -0.549814 -0.487765 0.769108 \n", 1661 | "4 -0.549814 2.476323 0.769108 \n", 1662 | "\n", 1663 | " attack_type \n", 1664 | "0 normal \n", 1665 | "1 neptune \n", 1666 | "2 normal \n", 1667 | "3 normal \n", 1668 | "4 neptune \n", 1669 | "\n", 1670 | "[5 rows x 21 columns]" 1671 | ] 1672 | }, 
1673 | "execution_count": 4, 1674 | "metadata": {}, 1675 | "output_type": "execute_result" 1676 | } 1677 | ], 1678 | "source": [ 1679 | "arrayOfModels = defineArrayForPreProcessing()\n", 1680 | "completeEncodedAndScaledDataset = performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels)\n", 1681 | "completeEncodedAndScaledDataset.head()" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "markdown", 1686 | "metadata": {}, 1687 | "source": [ 1688 | "### After preprocessing, check the shape of the dataset" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": 5, 1694 | "metadata": {}, 1695 | "outputs": [ 1696 | { 1697 | "name": "stdout", 1698 | "output_type": "stream", 1699 | "text": [ 1700 | "(37041, 20) (37041,)\n", 1701 | "Number of unique values in label: 40\n", 1702 | "Unique values in label: ['apache2' 'back' 'buffer_overflow' 'ftp_write' 'guess_passwd'\n", 1703 | " 'httptunnel' 'imap' 'ipsweep' 'land' 'loadmodule' 'mailbomb' 'mscan'\n", 1704 | " 'multihop' 'named' 'neptune' 'nmap' 'normal' 'perl' 'phf' 'pod'\n", 1705 | " 'portsweep' 'processtable' 'ps' 'rootkit' 'saint' 'satan' 'sendmail'\n", 1706 | " 'smurf' 'snmpgetattack' 'snmpguess' 'spy' 'sqlattack' 'teardrop'\n", 1707 | " 'udpstorm' 'warezclient' 'warezmaster' 'worm' 'xlock' 'xsnoop' 'xterm']\n" 1708 | ] 1709 | } 1710 | ], 1711 | "source": [ 1712 | "x = completeEncodedAndScaledDataset.drop('attack_type',axis=1)\n", 1713 | "y = completeEncodedAndScaledDataset['attack_type']\n", 1714 | "print(x.shape, y.shape)\n", 1715 | "print('Number of unique values in label: ',len(np.unique(y)))\n", 1716 | "print('Unique values in label: ',np.unique(y))\n", 1717 | "#print(y.value_counts())" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "markdown", 1722 | "metadata": {}, 1723 | "source": [ 1724 | "### Encode the categorical label values" 1725 | ] 1726 | }, 1727 | { 1728 | "cell_type": "code", 1729 | "execution_count": 6, 1730 | "metadata": {}, 1731 | "outputs": [ 1732 | { 1733 | 
"name": "stdout", 1734 | "output_type": "stream", 1735 | "text": [ 1736 | "(37041, 20) (37041, 40)\n" 1737 | ] 1738 | }, 1739 | { 1740 | "name": "stderr", 1741 | "output_type": "stream", 1742 | "text": [ 1743 | "D:\\Anaconda3\\envs\\tf_gpu\\lib\\site-packages\\ipykernel_launcher.py:2: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", 1744 | " \n" 1745 | ] 1746 | } 1747 | ], 1748 | "source": [ 1749 | "onehot = pd.get_dummies(y)\n", 1750 | "y = onehot.as_matrix()\n", 1751 | "print(x.shape, y.shape)" 1752 | ] 1753 | }, 1754 | { 1755 | "cell_type": "markdown", 1756 | "metadata": {}, 1757 | "source": [ 1758 | "## Build a neural Network model" 1759 | ] 1760 | }, 1761 | { 1762 | "cell_type": "code", 1763 | "execution_count": 7, 1764 | "metadata": {}, 1765 | "outputs": [], 1766 | "source": [ 1767 | "'''\n", 1768 | "This function is used to define, compile and fit a neural network\n", 1769 | "'''\n", 1770 | "def nn_model(trainx, trainy, valx,valy,bt_size,epochs, layers):\n", 1771 | " model = Sequential()\n", 1772 | " model.add(Dense(layers[0],activation='relu', input_shape=(trainx.shape[1],)))\n", 1773 | " for l in layers[1:]:\n", 1774 | " model.add(Dense(l, activation='relu' ))\n", 1775 | " model.add(Dropout(0.30))\n", 1776 | " model.add(Dense(trainy.shape[1], activation='softmax'))\n", 1777 | " model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n", 1778 | " hist=model.fit(trainx, trainy, batch_size=bt_size, epochs=epochs, shuffle=True, validation_data=(valx,valy), verbose=True)\n", 1779 | " return hist" 1780 | ] 1781 | }, 1782 | { 1783 | "cell_type": "code", 1784 | "execution_count": 8, 1785 | "metadata": { 1786 | "scrolled": true 1787 | }, 1788 | "outputs": [ 1789 | { 1790 | "name": "stdout", 1791 | "output_type": "stream", 1792 | "text": [ 1793 | "WARNING:tensorflow:From D:\\Anaconda3\\envs\\tf_gpu\\lib\\site-packages\\tensorflow\\python\\ops\\resource_variable_ops.py:435: colocate_with 
(from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 1794 | "Instructions for updating:\n", 1795 | "Colocations handled automatically by placer.\n", 1796 | "WARNING:tensorflow:From D:\\Anaconda3\\envs\\tf_gpu\\lib\\site-packages\\tensorflow\\python\\keras\\layers\\core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 1797 | "Instructions for updating:\n", 1798 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 1799 | "Train on 27780 samples, validate on 9261 samples\n", 1800 | "WARNING:tensorflow:From D:\\Anaconda3\\envs\\tf_gpu\\lib\\site-packages\\tensorflow\\python\\ops\\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 1801 | "Instructions for updating:\n", 1802 | "Use tf.cast instead.\n", 1803 | "Epoch 1/100\n", 1804 | "27780/27780 [==============================] - 12s 420us/sample - loss: 1.0481 - acc: 0.7006 - val_loss: 0.6489 - val_acc: 0.7742\n", 1805 | "Epoch 2/100\n", 1806 | "27780/27780 [==============================] - 11s 381us/sample - loss: 0.7296 - acc: 0.7751 - val_loss: 0.5398 - val_acc: 0.8213\n", 1807 | "Epoch 3/100\n", 1808 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.6316 - acc: 0.7992 - val_loss: 0.4577 - val_acc: 0.8402\n", 1809 | "Epoch 4/100\n", 1810 | "27780/27780 [==============================] - 11s 392us/sample - loss: 0.5731 - acc: 0.8157 - val_loss: 0.4002 - val_acc: 0.8790\n", 1811 | "Epoch 5/100\n", 1812 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5312 - acc: 0.8364 - val_loss: 0.3762 - val_acc: 0.8743\n", 1813 | "Epoch 6/100\n", 1814 | "27780/27780 [==============================] - 11s 382us/sample - loss: 0.5000 - acc: 0.8528 - val_loss: 0.3834 - val_acc: 0.8959\n", 1815 | "Epoch 7/100\n", 1816 | "27780/27780 
[==============================] - 11s 385us/sample - loss: 0.5053 - acc: 0.8568 - val_loss: 0.3448 - val_acc: 0.9031\n", 1817 | "Epoch 8/100\n", 1818 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.4810 - acc: 0.8643 - val_loss: 0.3433 - val_acc: 0.9037\n", 1819 | "Epoch 9/100\n", 1820 | "27780/27780 [==============================] - 11s 395us/sample - loss: 0.4779 - acc: 0.8639 - val_loss: 0.3262 - val_acc: 0.9022\n", 1821 | "Epoch 10/100\n", 1822 | "27780/27780 [==============================] - 37s 1ms/sample - loss: 0.4537 - acc: 0.8704 - val_loss: 0.3192 - val_acc: 0.9094\n", 1823 | "Epoch 11/100\n", 1824 | "27780/27780 [==============================] - 40s 1ms/sample - loss: 0.4726 - acc: 0.8733 - val_loss: 0.3233 - val_acc: 0.9058\n", 1825 | "Epoch 12/100\n", 1826 | "27780/27780 [==============================] - 42s 2ms/sample - loss: 0.4438 - acc: 0.8746 - val_loss: 0.2998 - val_acc: 0.9097\n", 1827 | "Epoch 13/100\n", 1828 | "27780/27780 [==============================] - 11s 413us/sample - loss: 0.4329 - acc: 0.8796 - val_loss: 0.3063 - val_acc: 0.9161\n", 1829 | "Epoch 14/100\n", 1830 | "27780/27780 [==============================] - 10s 361us/sample - loss: 0.4481 - acc: 0.8753 - val_loss: 0.2901 - val_acc: 0.9099\n", 1831 | "Epoch 15/100\n", 1832 | "27780/27780 [==============================] - 10s 364us/sample - loss: 0.4404 - acc: 0.8772 - val_loss: 0.3386 - val_acc: 0.9068\n", 1833 | "Epoch 16/100\n", 1834 | "27780/27780 [==============================] - 10s 367us/sample - loss: 0.4451 - acc: 0.8808 - val_loss: 0.2942 - val_acc: 0.9142\n", 1835 | "Epoch 17/100\n", 1836 | "27780/27780 [==============================] - 10s 370us/sample - loss: 0.4482 - acc: 0.8839 - val_loss: 0.2815 - val_acc: 0.9166\n", 1837 | "Epoch 18/100\n", 1838 | "27780/27780 [==============================] - 10s 372us/sample - loss: 0.4619 - acc: 0.8762 - val_loss: 0.3048 - val_acc: 0.9119\n", 1839 | "Epoch 19/100\n", 1840 | "27780/27780 
[==============================] - 10s 373us/sample - loss: 0.4768 - acc: 0.8789 - val_loss: 0.2868 - val_acc: 0.9173\n", 1841 | "Epoch 20/100\n", 1842 | "27780/27780 [==============================] - 10s 374us/sample - loss: 0.4557 - acc: 0.8812 - val_loss: 0.2839 - val_acc: 0.9148\n", 1843 | "Epoch 21/100\n", 1844 | "27780/27780 [==============================] - 10s 373us/sample - loss: 0.4341 - acc: 0.8857 - val_loss: 0.2716 - val_acc: 0.9219\n", 1845 | "Epoch 22/100\n", 1846 | "27780/27780 [==============================] - 10s 377us/sample - loss: 0.4442 - acc: 0.8855 - val_loss: 0.2975 - val_acc: 0.9151\n", 1847 | "Epoch 23/100\n", 1848 | "27780/27780 [==============================] - 10s 377us/sample - loss: 0.4499 - acc: 0.8844 - val_loss: 0.2735 - val_acc: 0.9175\n", 1849 | "Epoch 24/100\n", 1850 | "27780/27780 [==============================] - 10s 377us/sample - loss: 0.4499 - acc: 0.8874 - val_loss: 0.2904 - val_acc: 0.9228\n", 1851 | "Epoch 25/100\n", 1852 | "27780/27780 [==============================] - 10s 378us/sample - loss: 0.4782 - acc: 0.8844 - val_loss: 0.3537 - val_acc: 0.9121\n", 1853 | "Epoch 26/100\n", 1854 | "27780/27780 [==============================] - 10s 377us/sample - loss: 0.5170 - acc: 0.8828 - val_loss: 0.2650 - val_acc: 0.9230\n", 1855 | "Epoch 27/100\n", 1856 | "27780/27780 [==============================] - 10s 377us/sample - loss: 0.4159 - acc: 0.8907 - val_loss: 0.2711 - val_acc: 0.9246\n", 1857 | "Epoch 28/100\n", 1858 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.4607 - acc: 0.8807 - val_loss: 0.3042 - val_acc: 0.9160\n", 1859 | "Epoch 29/100\n", 1860 | "27780/27780 [==============================] - 11s 381us/sample - loss: 0.4366 - acc: 0.8857 - val_loss: 0.2859 - val_acc: 0.9171\n", 1861 | "Epoch 30/100\n", 1862 | "27780/27780 [==============================] - 11s 382us/sample - loss: 0.4303 - acc: 0.8923 - val_loss: 0.2832 - val_acc: 0.9194\n", 1863 | "Epoch 31/100\n", 1864 | 
"27780/27780 [==============================] - 11s 380us/sample - loss: 0.5191 - acc: 0.8786 - val_loss: 0.4336 - val_acc: 0.8996\n", 1865 | "Epoch 32/100\n", 1866 | "27780/27780 [==============================] - 11s 381us/sample - loss: 0.5173 - acc: 0.8720 - val_loss: 0.3143 - val_acc: 0.9112\n", 1867 | "Epoch 33/100\n", 1868 | "27780/27780 [==============================] - 11s 382us/sample - loss: 0.4416 - acc: 0.8854 - val_loss: 0.2853 - val_acc: 0.9193\n", 1869 | "Epoch 34/100\n", 1870 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.4670 - acc: 0.8879 - val_loss: 0.2971 - val_acc: 0.9171\n", 1871 | "Epoch 35/100\n", 1872 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.4538 - acc: 0.8838 - val_loss: 0.2842 - val_acc: 0.9210\n", 1873 | "Epoch 36/100\n", 1874 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.4627 - acc: 0.8860 - val_loss: 0.3164 - val_acc: 0.9176\n", 1875 | "Epoch 37/100\n", 1876 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.4645 - acc: 0.8835 - val_loss: 0.3689 - val_acc: 0.9066\n", 1877 | "Epoch 38/100\n", 1878 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.4621 - acc: 0.8809 - val_loss: 0.3259 - val_acc: 0.9129\n", 1879 | "Epoch 39/100\n", 1880 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.4537 - acc: 0.8845 - val_loss: 0.2831 - val_acc: 0.9199\n", 1881 | "Epoch 40/100\n", 1882 | "27780/27780 [==============================] - 11s 394us/sample - loss: 0.4641 - acc: 0.8784 - val_loss: 0.2876 - val_acc: 0.9129\n", 1883 | "Epoch 41/100\n", 1884 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.4442 - acc: 0.8835 - val_loss: 0.2862 - val_acc: 0.9189\n", 1885 | "Epoch 42/100\n", 1886 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.4541 - acc: 0.8864 - val_loss: 0.2863 - val_acc: 0.9191\n", 1887 | "Epoch 43/100\n", 
1888 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.4861 - acc: 0.8815 - val_loss: 0.3277 - val_acc: 0.9136\n", 1889 | "Epoch 44/100\n", 1890 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.4576 - acc: 0.8830 - val_loss: 0.3660 - val_acc: 0.9008\n", 1891 | "Epoch 45/100\n", 1892 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5172 - acc: 0.8793 - val_loss: 0.3160 - val_acc: 0.9036\n", 1893 | "Epoch 46/100\n", 1894 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5235 - acc: 0.8769 - val_loss: 0.3028 - val_acc: 0.9205\n", 1895 | "Epoch 47/100\n", 1896 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.4917 - acc: 0.8840 - val_loss: 0.2869 - val_acc: 0.9187\n", 1897 | "Epoch 48/100\n", 1898 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.4863 - acc: 0.8767 - val_loss: 0.3165 - val_acc: 0.9057\n", 1899 | "Epoch 49/100\n", 1900 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.4999 - acc: 0.8771 - val_loss: 0.2654 - val_acc: 0.9218\n", 1901 | "Epoch 50/100\n", 1902 | "27780/27780 [==============================] - 11s 405us/sample - loss: 0.4832 - acc: 0.8861 - val_loss: 0.2689 - val_acc: 0.9221\n", 1903 | "Epoch 51/100\n", 1904 | "27780/27780 [==============================] - 11s 390us/sample - loss: 0.4703 - acc: 0.8832 - val_loss: 0.2785 - val_acc: 0.9167\n" 1905 | ] 1906 | }, 1907 | { 1908 | "name": "stdout", 1909 | "output_type": "stream", 1910 | "text": [ 1911 | "Epoch 52/100\n", 1912 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.5021 - acc: 0.8807 - val_loss: 0.2870 - val_acc: 0.9158\n", 1913 | "Epoch 53/100\n", 1914 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.4962 - acc: 0.8730 - val_loss: 0.2757 - val_acc: 0.9199\n", 1915 | "Epoch 54/100\n", 1916 | "27780/27780 [==============================] - 11s 
383us/sample - loss: 0.4674 - acc: 0.8850 - val_loss: 0.2959 - val_acc: 0.9132\n", 1917 | "Epoch 55/100\n", 1918 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.5025 - acc: 0.8792 - val_loss: 0.3102 - val_acc: 0.9173\n", 1919 | "Epoch 56/100\n", 1920 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5713 - acc: 0.8688 - val_loss: 0.2990 - val_acc: 0.9135\n", 1921 | "Epoch 57/100\n", 1922 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.4880 - acc: 0.8781 - val_loss: 0.3754 - val_acc: 0.9047\n", 1923 | "Epoch 58/100\n", 1924 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.5401 - acc: 0.8740 - val_loss: 0.3097 - val_acc: 0.9072\n", 1925 | "Epoch 59/100\n", 1926 | "27780/27780 [==============================] - 11s 382us/sample - loss: 0.4993 - acc: 0.8759 - val_loss: 0.3168 - val_acc: 0.9186\n", 1927 | "Epoch 60/100\n", 1928 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.4789 - acc: 0.8816 - val_loss: 0.2970 - val_acc: 0.9136\n", 1929 | "Epoch 61/100\n", 1930 | "27780/27780 [==============================] - 11s 382us/sample - loss: 0.5057 - acc: 0.8786 - val_loss: 0.3339 - val_acc: 0.9128\n", 1931 | "Epoch 62/100\n", 1932 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.5387 - acc: 0.8769 - val_loss: 0.3822 - val_acc: 0.9097\n", 1933 | "Epoch 63/100\n", 1934 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5181 - acc: 0.8760 - val_loss: 0.3005 - val_acc: 0.9182\n", 1935 | "Epoch 64/100\n", 1936 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5243 - acc: 0.8725 - val_loss: 0.3749 - val_acc: 0.9021\n", 1937 | "Epoch 65/100\n", 1938 | "27780/27780 [==============================] - 11s 383us/sample - loss: 0.5467 - acc: 0.8649 - val_loss: 0.4062 - val_acc: 0.8889\n", 1939 | "Epoch 66/100\n", 1940 | "27780/27780 [==============================] - 
11s 384us/sample - loss: 0.5542 - acc: 0.8632 - val_loss: 0.3906 - val_acc: 0.9081\n", 1941 | "Epoch 67/100\n", 1942 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5404 - acc: 0.8688 - val_loss: 0.3076 - val_acc: 0.9142\n", 1943 | "Epoch 68/100\n", 1944 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5097 - acc: 0.8747 - val_loss: 0.3242 - val_acc: 0.9136\n", 1945 | "Epoch 69/100\n", 1946 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5479 - acc: 0.8698 - val_loss: 0.3221 - val_acc: 0.9144\n", 1947 | "Epoch 70/100\n", 1948 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5271 - acc: 0.8699 - val_loss: 0.3466 - val_acc: 0.9121\n", 1949 | "Epoch 71/100\n", 1950 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.5463 - acc: 0.8725 - val_loss: 0.3034 - val_acc: 0.9151\n", 1951 | "Epoch 72/100\n", 1952 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5350 - acc: 0.8666 - val_loss: 0.3085 - val_acc: 0.9122\n", 1953 | "Epoch 73/100\n", 1954 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5153 - acc: 0.8709 - val_loss: 0.2951 - val_acc: 0.9156\n", 1955 | "Epoch 74/100\n", 1956 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5131 - acc: 0.8686 - val_loss: 0.3406 - val_acc: 0.8987\n", 1957 | "Epoch 75/100\n", 1958 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.6295 - acc: 0.8508 - val_loss: 0.3472 - val_acc: 0.9013\n", 1959 | "Epoch 76/100\n", 1960 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5793 - acc: 0.8554 - val_loss: 0.3403 - val_acc: 0.9096\n", 1961 | "Epoch 77/100\n", 1962 | "27780/27780 [==============================] - 11s 385us/sample - loss: 0.5163 - acc: 0.8666 - val_loss: 0.3244 - val_acc: 0.9090\n", 1963 | "Epoch 78/100\n", 1964 | "27780/27780 
[==============================] - 11s 387us/sample - loss: 0.5066 - acc: 0.8669 - val_loss: 0.3062 - val_acc: 0.9177\n", 1965 | "Epoch 79/100\n", 1966 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5329 - acc: 0.8664 - val_loss: 0.3231 - val_acc: 0.8989\n", 1967 | "Epoch 80/100\n", 1968 | "27780/27780 [==============================] - 11s 388us/sample - loss: 0.5266 - acc: 0.8626 - val_loss: 0.3232 - val_acc: 0.9050\n", 1969 | "Epoch 81/100\n", 1970 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5326 - acc: 0.8676 - val_loss: 0.3178 - val_acc: 0.9121\n", 1971 | "Epoch 82/100\n", 1972 | "27780/27780 [==============================] - 11s 384us/sample - loss: 0.5326 - acc: 0.8677 - val_loss: 0.2966 - val_acc: 0.9139\n", 1973 | "Epoch 83/100\n", 1974 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.5424 - acc: 0.8681 - val_loss: 0.3101 - val_acc: 0.9135\n", 1975 | "Epoch 84/100\n", 1976 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5325 - acc: 0.8724 - val_loss: 0.2976 - val_acc: 0.9140\n", 1977 | "Epoch 85/100\n", 1978 | "27780/27780 [==============================] - 11s 388us/sample - loss: 0.5753 - acc: 0.8688 - val_loss: 0.2812 - val_acc: 0.9152\n", 1979 | "Epoch 86/100\n", 1980 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5258 - acc: 0.8711 - val_loss: 0.3018 - val_acc: 0.9160\n", 1981 | "Epoch 87/100\n", 1982 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5738 - acc: 0.8590 - val_loss: 0.3412 - val_acc: 0.9003\n", 1983 | "Epoch 88/100\n", 1984 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5871 - acc: 0.8603 - val_loss: 0.3327 - val_acc: 0.9085\n", 1985 | "Epoch 89/100\n", 1986 | "27780/27780 [==============================] - 11s 386us/sample - loss: 0.5534 - acc: 0.8593 - val_loss: 0.3462 - val_acc: 0.9055\n", 1987 | "Epoch 90/100\n", 1988 | 
"27780/27780 [==============================] - 11s 387us/sample - loss: 0.5189 - acc: 0.8657 - val_loss: 0.3108 - val_acc: 0.9120\n", 1989 | "Epoch 91/100\n", 1990 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5696 - acc: 0.8617 - val_loss: 0.3282 - val_acc: 0.9000\n", 1991 | "Epoch 92/100\n", 1992 | "27780/27780 [==============================] - 11s 390us/sample - loss: 0.5057 - acc: 0.8669 - val_loss: 0.3007 - val_acc: 0.9076\n", 1993 | "Epoch 93/100\n", 1994 | "27780/27780 [==============================] - 11s 387us/sample - loss: 0.5720 - acc: 0.8626 - val_loss: 0.3421 - val_acc: 0.9071\n", 1995 | "Epoch 94/100\n", 1996 | "27780/27780 [==============================] - 11s 390us/sample - loss: 0.5298 - acc: 0.8645 - val_loss: 0.3073 - val_acc: 0.9088\n", 1997 | "Epoch 95/100\n", 1998 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.5363 - acc: 0.8654 - val_loss: 0.3450 - val_acc: 0.9090\n", 1999 | "Epoch 96/100\n", 2000 | "27780/27780 [==============================] - 11s 391us/sample - loss: 0.5624 - acc: 0.8659 - val_loss: 0.3341 - val_acc: 0.9159\n", 2001 | "Epoch 97/100\n", 2002 | "27780/27780 [==============================] - 11s 391us/sample - loss: 0.5754 - acc: 0.8666 - val_loss: 0.3329 - val_acc: 0.9079\n", 2003 | "Epoch 98/100\n", 2004 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.6255 - acc: 0.8611 - val_loss: 0.4500 - val_acc: 0.9037\n", 2005 | "Epoch 99/100\n", 2006 | "27780/27780 [==============================] - 11s 388us/sample - loss: 0.6341 - acc: 0.8553 - val_loss: 0.4839 - val_acc: 0.8996\n", 2007 | "Epoch 100/100\n", 2008 | "27780/27780 [==============================] - 11s 389us/sample - loss: 0.7025 - acc: 0.8531 - val_loss: 0.5064 - val_acc: 0.8945\n" 2009 | ] 2010 | } 2011 | ], 2012 | "source": [ 2013 | "trainx, testx, trainy, testy = train_test_split(x,y, test_size=0.25, random_state=42)\n", 2014 | 
"layers=[trainx.shape[1],800,500,400,300,200,100,50,10]\n", 2015 | "hist = nn_model(trainx, trainy, testx, testy,16,100,layers)" 2016 | ] 2017 | }, 2018 | { 2019 | "cell_type": "code", 2020 | "execution_count": 9, 2021 | "metadata": {}, 2022 | "outputs": [ 2023 | { 2024 | "name": "stdout", 2025 | "output_type": "stream", 2026 | "text": [ 2027 | "MAX Accuracy during training: 89.22966122627258\n", 2028 | "MAX Accuracy during validation: 92.46301651000977\n" 2029 | ] 2030 | } 2031 | ], 2032 | "source": [ 2033 | "print('MAX Accuracy during training: ',max(hist.history['acc'])*100)\n", 2034 | "print('MAX Accuracy during validation: ',max(hist.history['val_acc'])*100)" 2035 | ] 2036 | }, 2037 | { 2038 | "cell_type": "markdown", 2039 | "metadata": {}, 2040 | "source": [ 2041 | "## Plot the training accuracy and testing accuracy" 2042 | ] 2043 | }, 2044 | { 2045 | "cell_type": "code", 2046 | "execution_count": 13, 2047 | "metadata": {}, 2048 | "outputs": [ 2049 | { 2050 | "data": { 2051 | "text/plain": [ 2052 | "[]" 2053 | ] 2054 | }, 2055 | "execution_count": 13, 2056 | "metadata": {}, 2057 | "output_type": "execute_result" 2058 | }, 2059 | { 2060 | "data": { 2061 | "image/png": "\n", 2062 | "text/plain": [ 2063 | "
" 2064 | ] 2065 | }, 2066 | "metadata": { 2067 | "needs_background": "light" 2068 | }, 2069 | "output_type": "display_data" 2070 | } 2071 | ], 2072 | "source": [ 2073 | "plt.plot(range(100), hist.history['acc'], 'r', label='Train acc')\n", 2074 | "plt.plot(range(100), hist.history['val_acc'], 'b', label='Test acc')" 2075 | ] 2076 | }, 2077 | { 2078 | "cell_type": "markdown", 2079 | "metadata": {}, 2080 | "source": [ 2081 | "## Plot the training loss and testing loss" 2082 | ] 2083 | }, 2084 | { 2085 | "cell_type": "code", 2086 | "execution_count": 14, 2087 | "metadata": {}, 2088 | "outputs": [ 2089 | { 2090 | "data": { 2091 | "text/plain": [ 2092 | "" 2093 | ] 2094 | }, 2095 | "execution_count": 14, 2096 | "metadata": {}, 2097 | "output_type": "execute_result" 2098 | }, 2099 | { 2100 | "data": { 2101 | "image/png": "\n", 2102 | "text/plain": [ 2103 | "
" 2104 | ] 2105 | }, 2106 | "metadata": { 2107 | "needs_background": "light" 2108 | }, 2109 | "output_type": "display_data" 2110 | } 2111 | ], 2112 | "source": [ 2113 | "plt.plot(range(100), hist.history['loss'], 'r', label='Train Loss')\n", 2114 | "plt.plot(range(100), hist.history['val_loss'], 'b', label='Test Loss')\n", 2115 | "plt.title(\"Dataset1: Neural Network Model on Latent Features: Train-Test Loss \")\n", 2116 | "plt.legend()" 2117 | ] 2118 | }, 2119 | { 2120 | "cell_type": "code", 2121 | "execution_count": null, 2122 | "metadata": {}, 2123 | "outputs": [], 2124 | "source": [] 2125 | } 2126 | ], 2127 | "metadata": { 2128 | "kernelspec": { 2129 | "display_name": "Python 3", 2130 | "language": "python", 2131 | "name": "python3" 2132 | }, 2133 | "language_info": { 2134 | "codemirror_mode": { 2135 | "name": "ipython", 2136 | "version": 3 2137 | }, 2138 | "file_extension": ".py", 2139 | "mimetype": "text/x-python", 2140 | "name": "python", 2141 | "nbconvert_exporter": "python", 2142 | "pygments_lexer": "ipython3", 2143 | "version": "3.7.1" 2144 | } 2145 | }, 2146 | "nbformat": 4, 2147 | "nbformat_minor": 2 2148 | } 2149 | -------------------------------------------------------------------------------- /Project-UtilityFunctions/__pycache__/lstm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LearnDeepLearningOrg/NetworkIntrusionDetection/11e638a3ad91dff8d343ddbab624a1e5f2eb66d7/Project-UtilityFunctions/__pycache__/lstm.cpython-37.pyc -------------------------------------------------------------------------------- /Project-UtilityFunctions/classificationlibrary.py: -------------------------------------------------------------------------------- 1 | #Libraries for feature encoding 2 | from sklearn.preprocessing import LabelEncoder 3 | 4 | #Libraries for classification 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.tree import DecisionTreeClassifier 7 | 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier #RandomForestClassifier: Falls under wrapper methods (feature importance)
from sklearn.ensemble import ExtraTreesClassifier #ExtraTreesClassifier: Falls under wrapper methods (feature importance)
from sklearn.neighbors import KNeighborsClassifier

#Libraries to measure the accuracy
from sklearn import metrics
from sklearn.metrics import accuracy_score

#import pandas library
import pandas as pd


#Shared implementation behind every classifyUsing* function of this module.
#  classifier         : un-fitted estimator exposing fit(x, y) and predict(x)
#  techniqueName      : text inserted into the start/end banner lines
#  training/testing...: DataFrames whose LAST column is the label and whose other columns are the features
#  printTrainingShape : also print the shape of the training DataFrame
#  printTestingShape  : also print the shape of the testing DataFrame
#Returns the fitted classifier, the accuracy on the training data and the accuracy on the testing data.
def _trainAndScoreClassifier(classifier, techniqueName, trainingEncodedAndScaledDataset,
                             testingEncodedAndScaledDataset, printTrainingShape=False,
                             printTestingShape=False):
    print("****** Start classification training using {} *****".format(techniqueName))
    xtrain = trainingEncodedAndScaledDataset.iloc[:, :-1].values
    ytrain = trainingEncodedAndScaledDataset.iloc[:, -1].values

    if printTrainingShape:
        print("trainingEncodedAndScaledDataset.shape: ",trainingEncodedAndScaledDataset.shape)

    labelEncoder = LabelEncoder()
    ytrain = labelEncoder.fit_transform(ytrain)

    classifier.fit(xtrain,ytrain)

    trainingAccuracy = metrics.accuracy_score(ytrain, classifier.predict(xtrain))
    print("\n*** Classification accuracy score during model training: ", trainingAccuracy)

    xtest = testingEncodedAndScaledDataset.iloc[:, :-1].values
    ytest = testingEncodedAndScaledDataset.iloc[:, -1].values

    if printTestingShape:
        print("testingEncodedAndScaledDataset.shape: ",testingEncodedAndScaledDataset.shape)

    #The test labels must use the SAME label -> integer mapping as the training labels.
    #Fitting a second encoder on the test labels (as was done before) assigns different integers
    #as soon as the two datasets do not contain exactly the same set of classes, which makes the
    #reported test accuracy meaningless.
    #A label that never occurred during training gets -1; the classifier can never predict -1,
    #so such a record is always counted as misclassified.
    codeOfLabel = {label: code for code, label in enumerate(labelEncoder.classes_)}
    ytest = [codeOfLabel.get(label, -1) for label in ytest]

    # Predicting the Test set results
    testingAccuracy = metrics.accuracy_score(ytest, classifier.predict(xtest))
    print("*** Classification accuracy score during model testing: ", testingAccuracy)
    print("\n****** End classification training using {} *****\n".format(techniqueName))
    return classifier, trainingAccuracy, testingAccuracy


#This function is used to perform classification using DecisionTreeClassifier
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingDecisionTreeClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(DecisionTreeClassifier(), "DecisionTreeClassifier",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)


#This function is used to perform classification using LogisticRegression
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingLogisticRegression(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(LogisticRegression(), "LogisticRegression",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)


#This function is used to perform classification using LinearDiscriminantAnalysis
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingLinearDiscriminantAnalysis(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(LinearDiscriminantAnalysis(), "LinearDiscriminantAnalysis",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)


#This function is used to perform classification using Gaussian naive Bayes
#(the banner text keeps its historical spelling so that the printed output does not change)
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingGaussianNB(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(GaussianNB(), "GuassianNaiveBayes",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)


#This function is used to perform classification using RandomForestClassifier (100 trees)
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingRandomForestClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(RandomForestClassifier(n_estimators=100), "RandomForestClassifier",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)


#This function is used to perform classification using ExtraTreesClassifier (100 trees)
#It additionally prints the shapes of the training and of the testing DataFrame
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingExtraTreesClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(ExtraTreesClassifier(n_estimators=100), "ExtraTreesClassifier",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset,
                                    printTrainingShape=True, printTestingShape=True)


#This function is used to perform classification using KNeighborsClassifier with a single neighbour
#It additionally prints the shape of the testing DataFrame
#Returns (fitted classifier, training accuracy, testing accuracy)
def classifyUsingKNNClassifier(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset):
    return _trainAndScoreClassifier(KNeighborsClassifier(n_neighbors=1), "KNeighborsClassifier",
                                    trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset,
                                    printTestingShape=True)


#This function cross validates KNeighborsClassifier (10 folds, accuracy) for every odd number of
#neighbours from 1 to 149, prints the mean score of each candidate, plots the misclassification
#error against the number of neighbours and returns the number of neighbours with the lowest error
def findingOptimumNumberOfNeighboursForKNN(trainingEncodedAndScaledDataset):
    #Imported here because the module-level imports of this file provide neither of these names;
    #without them the function failed with a NameError
    from sklearn.model_selection import cross_val_score
    import matplotlib.pyplot as plt

    print("****** Start finding optimum number of neighbours for KNN *****")
    xtrain = trainingEncodedAndScaledDataset.iloc[:, :-1].values
    ytrain = trainingEncodedAndScaledDataset.iloc[:, -1].values

    ytrain = LabelEncoder().fit_transform(ytrain)

    # creating odd list of K for KNN
    neighbors = list(range(1, 150, 2))

    # list that will hold the mean cv score of every candidate
    cv_scores = []

    # perform 10-fold cross validation
    for k in neighbors:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, xtrain, ytrain, cv=10, scoring='accuracy')
        meanScore = scores.mean()
        cv_scores.append(meanScore)
        print("With number of neighbours as {}, average score is {}".format(k,meanScore))

    # changing to misclassification error
    misclassificationErrors = [1 - score for score in cv_scores]

    # determining best k (the first one when several candidates share the lowest error)
    optimal_k = neighbors[misclassificationErrors.index(min(misclassificationErrors))]
    print("The optimal number of neighbors is {}".format(optimal_k))

    # plot misclassification error vs k
    plt.plot(neighbors, misclassificationErrors)
    plt.xlabel("Number of Neighbors K")
    plt.ylabel("Misclassification Error")
    plt.show()

    print("****** End finding optimum number of neighbours for KNN *****")
    #Previously nothing was returned, so the result was only visible in the printed output
    return optimal_k

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/dataformatinglibrary.py: --------------------------------------------------------------------------------
import numpy as np

#Libraries for printing tables in readable format
from tabulate import tabulate

#Library for creating an excel sheet
import xlsxwriter

#This function writes a two dimensional array into the first worksheet of a new excel file:
#element i of the array becomes row i of the sheet, starting at the first column
def createExcelFromArray(array, fileName):
    workbook = xlsxwriter.Workbook(fileName)
    worksheet = workbook.add_worksheet()

    firstColumn = 0
    for rowIndex, rowData in enumerate(array):
        worksheet.write_row(rowIndex, firstColumn, rowData)

    workbook.close()

#This function prints the given items as a table with the given heading
#Every item is converted to text, stripped of surrounding square brackets and split on ", ",
#so an item that is itself a list spreads over several columns
#Nothing is printed for an empty input; the input itself is left untouched
def printList (list,heading):
    #Build new rows instead of overwriting the caller's elements with their string form;
    #the in-place conversion also broke on numeric numpy arrays, whose elements cannot hold text
    rows = [str(item).strip("[]").split(", ") for item in list]
    if rows:
        print(tabulate(rows, headers=[heading], tablefmt='orgtbl')+"\n")

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/datainspectionlibrary.py:
# --------------------------------------------------------------------------------
#Data formatting library
from dataformatinglibrary import printList

#Data pre-processing library
from datapreprocessinglibrary import checkForMissingValues
from datapreprocessinglibrary import checkForDulicateRecords

#Utility functions
from defineInputs import getLabelName

#Libraries for feature selection
#SelectKBest, Chi2: Falls under filter methods (univariate selection)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier #RandomForestClassifier: Falls under wrapper methods (feature importance)
from sklearn.ensemble import ExtraTreesClassifier #ExtraTreesClassifier: Falls under wrapper methods (feature importance)

import numpy as np

#This function is used to check the statistics of a given dataSet
#It only prints a report (shape, feature counts and names, missing values, duplicates,
#label values, distinct values of every non-numeric column, label distribution) and returns nothing.
#Duplicates are dropped from a local copy only, the DataFrame of the caller is not modified.
def getStatisticsOfData (dataSet):
    print("***** Start checking the statistics of the dataSet *****\n")

    labelName = getLabelName()
    #Number of rows and columns in the dataset
    print("***** Shape (number of rows and columns) in the dataset: ", dataSet.shape)

    #The feature statistics below do not take the label column into account
    features = dataSet.drop([labelName],axis=1)
    numericalFeaturesInTheDataset = list(features._get_numeric_data().columns)
    categoricalFeaturesInTheDataset = list(set(features.columns) - set(numericalFeaturesInTheDataset))

    #Total number of features in the dataset
    print("***** Total number of features in the dataset: ",len(features.columns))

    #Total number of categorical features in the dataset
    print("***** Number of categorical features in the dataset: ",len(categoricalFeaturesInTheDataset))

    #Total number of numerical features in the dataset
    print("***** Number of numerical features in the dataset: ",len(numericalFeaturesInTheDataset))

    #Names of categorical features in the dataset
    print("\n***** Names of categorical features in dataset *****\n")
    printList(categoricalFeaturesInTheDataset,'Categorical features in dataset')

    #Names of numerical features in the dataset
    print("\n***** Names of numerical features in dataset *****\n")
    printList(numericalFeaturesInTheDataset,'Numerical features in the dataset')

    #Checking for any missing values in the data set
    anyMissingValuesInTheDataset = checkForMissingValues(dataSet)
    print("\n***** Are there any missing values in the data set: ", anyMissingValuesInTheDataset)

    #Check if there are any duplicate records in the data set
    anyDuplicateRecordsInTheDataset = checkForDulicateRecords(dataSet)
    print("\n***** Are there any duplicate records in the data set: ", anyDuplicateRecordsInTheDataset)
    if anyDuplicateRecordsInTheDataset:
        dataSet = dataSet.drop_duplicates()
        print("Number of records in the dataSet after removing the duplicates: ", len(dataSet.index))

    #How many number of different values for label that are present in the dataset
    print('\n****** Number of different values for label that are present in the dataset: ',dataSet[labelName].nunique())
    #What are the different values for label in the dataset
    print('\n****** Here is the list of unique label types present in the dataset ***** \n')
    printList(list(dataSet[labelName].unique()),'Unique label types in the dataset')

    #What are the different values in each of the categorical features in the dataset
    #(this time the label column is included when it is not numeric)
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    nonNumericColumns = list(set(dataSet.columns) - set(dataSet._get_numeric_data().columns))
    for feature in nonNumericColumns:
        #One unique() call serves both the count and the listing; the former np.unique() call
        #sorted the values and therefore failed on columns mixing text with missing values
        distinctValues = dataSet[feature].unique()
        print('\n{}: {} '.format(feature,len(distinctValues)))
        printList(distinctValues,'distinct values')

    print('\n****** Label distribution in the dataset *****\n')
    print(dataSet[labelName].value_counts())
    print()

    print("\n***** End checking the statistics of the dataSet *****")

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/dataloadinglibrary.py: --------------------------------------------------------------------------------
import pandas as pd

#This function is used to load the CSV file found at the given path
#and to return its content as a pandas DataFrame
def loadCSV (fileNameWithAbsolutePath):
    return pd.read_csv(fileNameWithAbsolutePath)

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/datapreprocessinglibrary.py: --------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from defineInputs import getLabelName

#This function is used to check for missing values in a given dataSet
#Returns a truthy value when at least one cell of the dataSet is null
def checkForMissingValues (dataSet):
    return dataSet.isnull().values.any()

#This function is used to check for duplicate records in a given dataSet
#Prints the total and the unique number of records and returns True when they differ
def checkForDulicateRecords (dataSet):
    totalRecordsInDataset = len(dataSet.index)
    #duplicated() flags every repetition of an earlier row, so subtracting the flagged rows gives
    #the number of rows drop_duplicates() would keep, without building a de-duplicated copy
    numberOfUniqueRecordsInDataset = totalRecordsInDataset - int(dataSet.duplicated().sum())
    print('Total number of records in the dataset: {}\nUnique records in the dataset: {}'.format(totalRecordsInDataset,numberOfUniqueRecordsInDataset))
    return totalRecordsInDataset != numberOfUniqueRecordsInDataset

#Split the complete dataSet into training dataSet and testing dataSet
#  testSize    : share of the records that goes into the testing dataSet (0.4 as before by default)
#  randomState : seed of the shuffling, keeps the split reproducible (42 as before by default)
#Returns training features, testing features, training label, testing label (in this order)
def splitCompleteDataSetIntoTrainingSetAndTestingSet(completeDataSet, testSize=0.4, randomState=42):
    labelName = getLabelName()
    label = completeDataSet[labelName]
    features = completeDataSet.drop(labelName,axis=1)
    trainingFeatures, testingFeatures, trainingLabel, testingLabel = train_test_split(
        features, label, test_size=testSize, random_state=randomState)
    print("features.shape: ",features.shape)
    print("label.shape: ",label.shape)
    return trainingFeatures, testingFeatures, trainingLabel, testingLabel

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/defineInputs.py: --------------------------------------------------------------------------------
#This function is to maintain the name of the label at a single place
def getLabelName():
    return 'attack_type'

#Returns the paths of the training and of the testing CSV file (in this order)
#NOTE(review): these are absolute paths of one particular Windows machine - adjust before running elsewhere
def getPathToTrainingAndTestingDataSets():
    trainingFileNameWithAbsolutePath = "D:\\Learning\\DeepLearning\\Project-AttackDetectionSystem\\Datasets\\NSL-KDD\\KDDTrain+_20Percent.csv"
    testingFileNameWithAbsolutePath = "D:\\Learning\\DeepLearning\\Project-AttackDetectionSystem\\Datasets\\NSL-KDD\\KDDTest-21.csv"
    return trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath

#Returns the path of the excel file for the model performance report
#NOTE(review): absolute path of one particular Windows machine - adjust before running elsewhere
def modelPerformanceReport():
    #The local no longer reuses the name of the function itself
    reportFileNameWithAbsolutePath = 'D:\\Learning\\DeepLearning\\Project-AttackDetectionSystem\\ModelsAndTheirPerformanceReports\\ModelsPerformance031442020.1.xlsx'
    return reportFileNameWithAbsolutePath

#Returns the path of the directory for the generated models
#NOTE(review): absolute path of one particular Windows machine - adjust before running elsewhere
def getPathToGenerateModels():
    generatedModelsPath = 'D:\\Learning\\DeepLearning\\Project-AttackDetectionSystem\\ModelsAndTheirPerformanceReports\\'
    return generatedModelsPath

### Models with the below configuration will be generated
#The first row holds the column headings, every following row names the feature selection,
#feature encoding, feature normalization and classification technique of one model
#The array is printed before it is returned
def defineArrayOfModels():
    arrayOfModels = [
        [
            "FeatureSelectionTechnique",
            "FeatureEncodingTechnique",
            "FeatureNormalizationTechnique",
            "ClassificationTechnique",
            "TrainAccuraccy",
            "TestAccuraccy",
            "ModelName",
            "ModelFileName"
        ],
        [
            "ExtraTreesClassifier",
            "OneHotEncoder",
            "Standardization",
            "DecisonTree"
        ],
        [
            "ExtraTreesClassifier",
            "OneHotEncoder",
            "Standardization",
            "RandomForestClassifier"
        ],
        [
            "ExtraTreesClassifier",
            "OneHotEncoder",
            "Standardization",
            "ExtraTreesClassifier"
        ],
        [
            "ExtraTreesClassifier",
            "OneHotEncoder",
            "Standardization",
            "KNN"
        ]
    ]
    print(arrayOfModels)
    return arrayOfModels

#Returns (and prints) a single row naming the feature selection, feature encoding and
#feature normalization technique, without a heading row and without a classification technique
def defineArrayForPreProcessing():
    arrayOfModels = [
        [
            "ExtraTreesClassifier",
            "OneHotEncoder",
            "Standardization",
        ]
    ]
    print(arrayOfModels)
    return arrayOfModels

# -------------------------------------------------------------------------------- /Project-UtilityFunctions/featureencodinglibrary.py: --------------------------------------------------------------------------------
import pandas as pd
import numpy as np

#Libraries for feature encoding
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

#Utility functions
from defineInputs import getLabelName
from dataformatinglibrary import printList

#This function is used to perform one hot encoding on the categorical features in the given dataset
#Every object-typed feature column is replaced by its dummy columns and the label column is moved
#to the end; the result is a new DataFrame
def featureEncodingUsingOneHotEncoder(dataSetForFeatureEncoding):
    print("****** Start one hot encoding on the categorical features in the given dataset *****")

    labelName = getLabelName()
    #Extract the categorical features, leave the label
    categoricalColumnsInTheDataSet = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object'])
    #Get the names of the categorical features
    categoricalColumnNames = categoricalColumnsInTheDataSet.columns.values

    print("****** Number of features before one hot encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    #The listing covers every non-numeric column, so a non-numeric label column is listed as well
    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    nonNumericColumns = list(set(dataSetForFeatureEncoding.columns) - set(dataSetForFeatureEncoding._get_numeric_data().columns))
    for feature in nonNumericColumns:
        #One unique() call serves both the count and the listing; the former np.unique() call
        #sorted the values and therefore failed on columns mixing text with missing values
        distinctValues = dataSetForFeatureEncoding[feature].unique()
        print('\n{}: {} '.format(feature,len(distinctValues)))
        printList(distinctValues,'distinct values')

    #Using get_dummies function to get the dummy variables for the categorical columns
    onHotEncodedDataSet=pd.get_dummies(dataSetForFeatureEncoding, columns=categoricalColumnNames, prefix=categoricalColumnNames)

    #Move the label column to the end
    label = onHotEncodedDataSet.pop(labelName)
    onHotEncodedDataSet[labelName] = label
    print("****** Number of features after one hot encoding: ",len(onHotEncodedDataSet.columns))

    print("****** End one hot encoding on the categorical features in the given dataset *****\n")
    return onHotEncodedDataSet
#This function is used to perform label encoding on the categorical features in the given dataset
def featureEncodingUsingLabelEncoder(dataSetForFeatureEncoding):
    """Replace every object-typed feature column by integer codes from LabelEncoder.

    The label column (name taken from getLabelName()) is not encoded.

    Args:
        dataSetForFeatureEncoding: pandas DataFrame containing the label column.

    Returns:
        A new DataFrame with the same columns; the argument is not modified.
    """
    print("****** Start label encoding on the categorical features in the given dataset *****")

    labelName = getLabelName()
    #Names of the categorical (object dtype) features; the label is left out
    categoricalColumnNames = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before label encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    #Encode a copy so that the caller's DataFrame keeps its original values
    encodedDataSet = dataSetForFeatureEncoding.copy()
    labelEncoder = LabelEncoder()
    for feature in categoricalColumnNames:
        distinctValues = encodedDataSet[feature].unique()
        print('\n{}: {} '.format(feature,len(distinctValues)))
        printList(distinctValues,'distinct values')
        #fit_transform re-fits the encoder, so one instance serves all columns
        encodedDataSet[feature] = labelEncoder.fit_transform(encodedDataSet[feature])
    print("****** Number of features after label encoding: ",len(encodedDataSet.columns))

    print("****** End label encoding on the categorical features in the given dataset *****\n")
    return encodedDataSet

#This function is used to perform binary encoding on the categorical features in the given dataset
def featureEncodingUsingBinaryEncoder(dataSetForFeatureEncoding):
    """Replace every object-typed feature column by its category_encoders binary encoding.

    The label column (name taken from getLabelName()) is not encoded and is
    moved to the last position of the result.

    Args:
        dataSetForFeatureEncoding: pandas DataFrame containing the label column.

    Returns:
        A new DataFrame; the argument is not modified.
    """
    print("****** Start binary encoding on the categorical features in the given dataset *****")

    labelName = getLabelName()
    #Names of the categorical (object dtype) features; the label is left out
    categoricalColumnNames = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before binary encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    #Single-column DataFrame holding the label, handed to the encoder as target
    label = dataSetForFeatureEncoding[[labelName]]
    encodedDataSet = dataSetForFeatureEncoding
    for feature in categoricalColumnNames:
        distinctValues = encodedDataSet[feature].unique()
        print('\n{}: {} '.format(feature,len(distinctValues)))
        printList(distinctValues,'distinct values')
        binaryEncoder = ce.BinaryEncoder(cols = [feature])
        binaryEncodedFeature = binaryEncoder.fit_transform(encodedDataSet[[feature]], label)
        #drop() returns a new DataFrame, so the caller's object is never touched
        encodedDataSet = encodedDataSet.drop(feature, axis=1)
        #Attach the encoded columns by position rather than by index join: a join
        #multiplies rows when the index contains repeated labels
        #NOTE(review): assumes the encoder returns rows in input order - confirm
        for encodedColumnName in binaryEncodedFeature.columns:
            encodedDataSet[encodedColumnName] = binaryEncodedFeature[encodedColumnName].to_numpy()

    #Move the label column to the end
    if encodedDataSet is dataSetForFeatureEncoding:
        #No categorical feature was encoded; copy before re-ordering columns
        encodedDataSet = encodedDataSet.copy()
    encodedDataSet[labelName] = encodedDataSet.pop(labelName)
    print("****** Number of features after binary encoding: ",len(encodedDataSet.columns))

    print("****** End binary encoding on the categorical features in the given dataset *****\n")
    return encodedDataSet
#This function is used to perform frequency encoding on the categorical features in the given dataset
def featureEncodingUsingFrequencyEncoder(dataSetForFeatureEncoding):
    """Replace every object-typed feature by the relative frequency of its values.

    For a feature "f" a new column "f_Encoded" is added whose value is the
    number of rows sharing that row's category divided by the total number of
    rows; the original column is dropped. The label column (name taken from
    getLabelName()) is not encoded and is moved to the last position.

    Args:
        dataSetForFeatureEncoding: pandas DataFrame containing the label column.

    Returns:
        A new DataFrame; the argument is not modified.
    """
    print("****** Start frequency encoding on the categorical features in the given dataset *****")

    labelName = getLabelName()
    #Names of the categorical (object dtype) features; the label is left out
    categoricalColumnNames = dataSetForFeatureEncoding.drop([labelName],axis=1).select_dtypes(['object']).columns.values

    print("****** Number of features before frequency encoding: ",len(dataSetForFeatureEncoding.columns))
    print("****** Number of categorical features in the dataset: ",len(categoricalColumnNames))
    print("****** Categorical feature names in the dataset: ",categoricalColumnNames)

    print('\n****** Here is the list of unique values present in each categorical feature in the dataset *****\n')
    #Work on a copy: adding the "_Encoded" columns must not leak into the caller's DataFrame
    encodedDataSet = dataSetForFeatureEncoding.copy()
    numberOfRows = len(encodedDataSet)
    for feature in categoricalColumnNames:
        distinctValues = encodedDataSet[feature].unique()
        print('\n{}: {} '.format(feature,len(distinctValues)))
        printList(distinctValues,'distinct values')
        #Series mapping each category to its share of the rows
        relativeFrequencies = encodedDataSet.groupby(feature).size()/numberOfRows
        encodedDataSet[feature+"_Encoded"] = encodedDataSet[feature].map(relativeFrequencies)
        encodedDataSet = encodedDataSet.drop(feature, axis=1)

    #Move the label column to the end
    encodedDataSet[labelName] = encodedDataSet.pop(labelName)
    print("****** Number of features after frequency encoding: ",len(encodedDataSet.columns))

    print("****** End frequency encoding on the categorical features in the given dataset *****\n")
    return encodedDataSet
#Shared implementation of the four feature scaling functions below
def _scaleFeaturesAndReattachLabel(dataSetForFeatureScaling, scaler, scalerName, printScaledDataSetSummary=False):
    """Scale every column except the label with `scaler` and re-attach the label.

    Args:
        dataSetForFeatureScaling: pandas DataFrame containing the label column
            (name taken from getLabelName()); it is not modified.
        scaler: object offering fit_transform(features).
        scalerName: name used in the progress messages.
        printScaledDataSetSummary: when True, also print head() and shape of
            the result.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the scaled feature
        columns followed by the unscaled label column.
    """
    print("****** Start feature scaling of the features present in the dataset using " + scalerName + " *****")

    labelName = getLabelName()
    print(dataSetForFeatureScaling.values)

    #Separate the label by name, so that the result does not depend on the label
    #being the last column and the caller's DataFrame keeps its label
    label = dataSetForFeatureScaling[labelName].reset_index(drop=True)
    featureColumns = dataSetForFeatureScaling.drop([labelName], axis=1)
    features = featureColumns.values
    print("\n****** Number of features in the dataset before performing scaling: ",features.shape[1])
    print("\n****** Features in the dataset before performing scaling ***** \n",features)

    #Perform feature scaling
    scaledFeatures = scaler.fit_transform(features)
    print("\n****** Number of features in the dataset after performing scaling: ",scaledFeatures.shape[1])
    print("\n****** Features in the dataset after performing scaling ***** \n",scaledFeatures)

    #Convert from array format to dataframe
    scaledFeatures = pd.DataFrame(scaledFeatures, columns=featureColumns.columns)
    scaledFeatures[labelName] = label
    if printScaledDataSetSummary:
        print("scaledFeatures.head(): ",scaledFeatures.head())
        print("scaledFeatures.shape: ",scaledFeatures.shape)

    print("\n****** End of feature scaling of the features present in the dataset using " + scalerName + " *****\n")
    return scaledFeatures

#This function is used to perform min-max feature scaling on the features in the given dataset
#Formula for Min-Max scalar feature scaling is (Xi-Xmin)/(Xmax-Xmin)
def featureScalingUsingMinMaxScaler(dataSetForFeatureScaling):
    """Rescale every feature column to the range [0, 1]; the label is left as is."""
    return _scaleFeaturesAndReattachLabel(dataSetForFeatureScaling, MinMaxScaler(feature_range=(0,1)), "MinMaxScaler")

#This function is used to perform StandardScalar feature scaling on the features in the given dataset
#This is also called as Z-score normalization
#Formula for StandardScalar scalar feature scaling is z = (x - mean) / standard-deviation.
def featureScalingUsingStandardScalar(dataSetForFeatureScaling):
    """Standardize every feature column with StandardScaler; the label is left as is."""
    return _scaleFeaturesAndReattachLabel(dataSetForFeatureScaling, StandardScaler(), "StandardScalar", printScaledDataSetSummary=True)

#This function is used to perform Binarizing feature scaling on the features in the given dataset
#It is used for binary thresholding of an array like matrix.
def featureScalingUsingBinarizer(dataSetForFeatureScaling):
    """Threshold every feature column at 0.0 with Binarizer; the label is left as is."""
    #threshold is passed by keyword: it is keyword-only in current scikit-learn
    return _scaleFeaturesAndReattachLabel(dataSetForFeatureScaling, Binarizer(threshold=0.0), "Binarizer")

#This function is used to perform Normalizing feature scaling on the features in the given dataset
#It is used to rescale each sample.
#Each sample (i.e. each row of the data matrix) with at least one non zero component
#is rescaled independently of other samples so that its norm (l1 or l2) equals one.
def featureScalingUsingNormalizer(dataSetForFeatureScaling):
    """Rescale every row of the feature columns with Normalizer; the label is left as is."""
    return _scaleFeaturesAndReattachLabel(dataSetForFeatureScaling, Normalizer(), "Normalizer")
#This function is used to calculate the conditional entropy between a given feature and the target
def conditional_entropy(x,y):
    """Return the conditional entropy of x given y (natural logarithm).

    x and y are paired position by position; the number of observations is
    taken from y.
    """
    given_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    sample_count = sum(given_counts.values())
    result = 0
    for pair, joint_count in joint_counts.items():
        joint_probability = joint_count / sample_count
        given_probability = given_counts[pair[1]] / sample_count
        result += joint_probability * math.log(given_probability / joint_probability)
    return result

#This function is used to perform feature selection using TheilU
#In TheilU we calculate the uncertainty coefficient between the given feature and the target
def theil_u(x,y):
    """Return Theil's U of x given y: (S(x) - S(x|y)) / S(x), or 1 when S(x) is 0."""
    entropy_of_x_given_y = conditional_entropy(x, y)
    value_counts = Counter(x)
    sample_count = sum(value_counts.values())
    distribution_of_x = [count / sample_count for count in value_counts.values()]
    entropy_of_x = ss.entropy(distribution_of_x)
    if entropy_of_x != 0:
        return (entropy_of_x - entropy_of_x_given_y) / entropy_of_x
    #x takes a single value, so there is no uncertainty left to explain
    return 1
def featureSelectionUsingTheilU(dataSetForFeatureSelection, minimumUncertaintyCoefficient=0.50):
    """Keep the columns whose Theil's U with respect to the label reaches a threshold.

    For every column (the label included) theil_u(label, column) is computed;
    columns scoring below the threshold are removed. The scores are shown as a
    one-row heatmap.

    Args:
        dataSetForFeatureSelection: pandas DataFrame containing the label
            column (name taken from getLabelName()); it is not modified.
        minimumUncertaintyCoefficient: columns scoring below this value are dropped.

    Returns:
        A new DataFrame holding the retained columns in their original order.
    """
    print("\n****** Start performing feature selection using TheilU *****")
    print("****** Falls under the group of techniques that use correlation matrix with Heatmap *****")

    labelName = getLabelName()
    #The label values are the same for every column, so convert them only once
    labelValues = dataSetForFeatureSelection[labelName].tolist()
    columns = dataSetForFeatureSelection.columns

    coefficients = [theil_u(labelValues, dataSetForFeatureSelection[column].tolist()) for column in columns]
    #"not (u < threshold)" keeps a column whose score is NaN
    retainedColumns = [column for column, u in zip(columns, coefficients) if not (u < minimumUncertaintyCoefficient)]
    dataSetAfterFeatureSelection = dataSetForFeatureSelection[retainedColumns].copy()

    print('***** Ploting the uncertainty coefficient between the target and each feature *****')
    #One numeric row, as needed by the heatmap
    theilu = pd.DataFrame([coefficients], index=[labelName], columns=columns).astype(float)
    plt.figure(figsize=(30,1))
    sns.heatmap(theilu,annot=True,fmt='.2f')
    plt.show()

    print('***** Number of columns in the dataSet after feature selection: ', len(dataSetAfterFeatureSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n', dataSetAfterFeatureSelection.columns)
    print("****** End performing feature selection using TheilU *****")
    return dataSetAfterFeatureSelection

#This function is used to perform feature selection using Chi-squared test
def featureSelectionUsingChisquaredTest(dataSetForFeatureSelection, numberOfFeatureToBeSelected=10):
    """Keep the features with the highest chi-squared score against the label.

    Categorical features are label encoded first, so the returned dataset
    contains the encoded values.

    Args:
        dataSetForFeatureSelection: pandas DataFrame containing the label
            column (name taken from getLabelName()); it is not modified.
        numberOfFeatureToBeSelected: number of features to keep; capped at the
            number of available features.

    Returns:
        A new DataFrame with the selected features and the label, in their
        original column order.
    """
    print("\n****** Start performing feature selection using ChisquaredTest *****")
    print("****** Falls under filter methods (univariate selection) *****")

    labelName = getLabelName()

    #To be able to apply Chi-squared test
    #A copy is handed over so that the caller's DataFrame cannot be altered
    encodedDataSet = featureEncodingUsingLabelEncoder(dataSetForFeatureSelection.copy())

    features = encodedDataSet.drop([labelName],axis=1)
    label = encodedDataSet[labelName]

    #Apply SelectKBest class to extract the best features
    #SelectKBest rejects a k larger than the number of features
    numberOfFeatureToBeSelected = min(numberOfFeatureToBeSelected, features.shape[1])
    fitBestfeatures = SelectKBest(score_func=chi2, k=numberOfFeatureToBeSelected).fit(features,label)
    scoresOfBestFeatures = pd.DataFrame({'Features': features.columns, 'Score': fitBestfeatures.scores_})
    print("\n***** Scores for each feature in the dataset are *****")
    print(scoresOfBestFeatures.nlargest(numberOfFeatureToBeSelected,'Score'))

    #get_support() is a boolean mask aligned with features.columns
    selectedFeatureNames = set(features.columns[fitBestfeatures.get_support()])
    retainedColumns = [column for column in encodedDataSet.columns if column == labelName or column in selectedFeatureNames]
    dataSetAfterFeatureSelection = encodedDataSet[retainedColumns].copy()

    print('***** Number of columns in the dataSet after feature selection: ', len(dataSetAfterFeatureSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n', dataSetAfterFeatureSelection.columns)
    print("****** End performing feature selection using ChisquaredTest *****")

    return dataSetAfterFeatureSelection

#Shared implementation of the two tree based feature selection functions below
def _selectFeaturesUsingTreeImportances(dataSetForFeatureSelection, classifier, classifierName, trainingStepName):
    """Keep the features whose importance in a fitted tree ensemble exceeds the mean importance.

    Args:
        dataSetForFeatureSelection: pandas DataFrame containing the label
            column (name taken from getLabelName()); it is not modified.
        classifier: unfitted estimator exposing fit() and feature_importances_.
        classifierName: name used in the start/end messages.
        trainingStepName: name used in the training progress messages.

    Returns:
        A new DataFrame with the selected (label encoded) features in their
        original order, followed by the label column.
    """
    print("\n****** Start performing feature selection using " + classifierName + " *****")
    print("****** Falls under wrapper methods (feature importance) *****")

    labelName = getLabelName()

    #Applying feature encoding before training the classifier
    #A copy is handed over so that the caller's DataFrame cannot be altered
    encodedDataSet = featureEncodingUsingLabelEncoder(dataSetForFeatureSelection.copy())
    features = encodedDataSet.drop([labelName],axis=1)
    label = encodedDataSet[labelName]
    labelTransformed = LabelEncoder().fit_transform(label)

    print("****** " + trainingStepName + " is in progress *****")
    trainedforest = classifier.fit(features,labelTransformed)
    featureImportances = pd.Series(trainedforest.feature_importances_, index=features.columns)
    #Only keep features whose importance is greater than the mean importance
    numberOfFeaturesToKeep = int((featureImportances > featureImportances.mean()).sum())
    selectedFeatures = featureImportances.nlargest(numberOfFeaturesToKeep)
    print("\n selectedFeatures after " + trainingStepName + ": ", selectedFeatures)
    print("****** Completed " + trainingStepName + " *****")

    selectedFeatureNames = set(selectedFeatures.index)
    retainedColumns = [column for column in features.columns if column in selectedFeatureNames]
    dataSetAfterFeatureSelection = features[retainedColumns].copy()
    dataSetAfterFeatureSelection[labelName] = label

    print('\n***** Number of columns in the dataSet after feature selection: ', len(dataSetAfterFeatureSelection.columns))
    print('***** Columns in the dataSet after feature selection: \n', dataSetAfterFeatureSelection.columns)
    print("****** End performing feature selection using " + classifierName + " *****")
    return dataSetAfterFeatureSelection

#This function is used to perform feature selection using RandomForestClassifier
def featureSelectionUsingRandomForestClassifier(dataSetForFeatureSelection, numberOfEstimators=700):
    """Select features by RandomForestClassifier importance (above-mean importance is kept).

    numberOfEstimators is passed to the classifier as n_estimators.
    """
    return _selectFeaturesUsingTreeImportances(
        dataSetForFeatureSelection,
        RandomForestClassifier(n_estimators=numberOfEstimators),
        "RandomForestClassifier",
        "RandomForestClassification")

#This function is used to perform feature selection using ExtraTreesClassifier
def featureSelectionUsingExtraTreesClassifier(dataSetForFeatureSelection, numberOfEstimators=700):
    """Select features by ExtraTreesClassifier importance (above-mean importance is kept).

    numberOfEstimators is passed to the classifier as n_estimators.
    """
    return _selectFeaturesUsingTreeImportances(
        dataSetForFeatureSelection,
        ExtraTreesClassifier(n_estimators=numberOfEstimators),
        "ExtraTreesClassifier",
        "ExtraTreesClassification")
# function to print combinations that contain
# one element from each of the given arrays
def print1(arr):
    """Print every combination that takes one element from each list in arr.

    Each combination is printed as a "[" line followed by a line of quoted,
    comma-terminated elements; a "]," line is printed between combinations.
    """
    # number of arrays
    count = len(arr)

    # position of the current element in each of the arrays
    cursor = [0] * count

    while True:
        print("[")

        # print current combination
        for position, choices in enumerate(arr):
            print("'" + choices[cursor[position]], end="',")
        print()

        # find the rightmost array that has more
        # elements left after its current element
        for pivot in range(count - 1, -1, -1):
            if cursor[pivot] + 1 < len(arr[pivot]):
                break
        else:
            # no such array is found so no more
            # combinations left
            return

        # move to the next element in that array
        cursor[pivot] += 1

        # all arrays to the right of it start
        # again from their first element
        cursor[pivot + 1:] = [0] * (count - pivot - 1)
        print("],")


# Driver Code

# one list of candidate techniques per pipeline stage
arr = [
    ['TheilsU', 'Chi-SquaredTest', 'RandomForestClassifier', 'ExtraTreesClassifier'],
    ['OneHotEncoder', 'LabelEncoder', 'BinaryEncoder', 'FrequencyEncoder'],
    ['Min-Max', 'Standardization', 'Binarizing', 'Normalizing'],
    ['DecisonTree', 'RandomForestClassifier', 'ExtraTreesClassifier',
     'LogisticRegressionRegression', 'LinearDiscriminantAnalysis', 'GuassianNaiveBayes'],
]

print1(arr)

# This code is contributed by mohit kumar
def plot_results_multiple(predicted_data, true_data, prediction_len):
    """Plot the true series together with several predicted sequences.

    Args:
        predicted_data: iterable of lists, one list of predictions per sequence.
        true_data: the true series.
        prediction_len: number of steps by which consecutive sequences are offset.
    """
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    #Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(predicted_data):
        padding = [None] * (i * prediction_len)
        plt.plot(padding + data, label='Prediction')
        plt.legend()
    plt.show()

def load_data(filename, seq_len, normalise_window):
    """Read a newline separated series and cut it into train and test windows.

    Windows of seq_len + 1 consecutive lines are built; the first 90% (rounded)
    of them are shuffled and used for training, the rest for testing. The last
    element of each window is the target, the others are the inputs.

    Args:
        filename: path of the text file holding one value per line.
        seq_len: number of input steps per window.
        normalise_window: when true, the windows are passed through normalise_windows.

    Returns:
        [x_train, y_train, x_test, y_test]; the x arrays have shape
        (windows, seq_len, 1).
    """
    #The with block closes the file even if reading fails
    with open(filename, 'r') as series_file:
        data = series_file.read().split('\n')

    sequence_length = seq_len + 1
    result = [data[index: index + sequence_length]
              for index in range(len(data) - sequence_length)]

    if normalise_window:
        result = normalise_windows(result)

    result = np.array(result)

    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    #Shuffles the training rows in place
    np.random.shuffle(train)
    x_train = train[:, :-1]
    y_train = train[:, -1]
    x_test = result[row:, :-1]
    y_test = result[row:, -1]

    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    return [x_train, y_train, x_test, y_test]

def normalise_windows(window_data):
    """Express every value of a window as its relative change from the window's first value.

    Each value p becomes float(p) / float(window[0]) - 1, so a window whose
    first value is zero raises ZeroDivisionError.
    """
    normalised_data = []
    for window in window_data:
        normalised_window = [((float(p) / float(window[0])) - 1) for p in window]
        normalised_data.append(normalised_window)
    return normalised_data

def build_model(layers):
    """Build and compile a Sequential model of two LSTM layers and a linear Dense layer.

    Args:
        layers: sequence of four sizes; layers[0] and layers[1] are given to
            the first LSTM as input_dim and output_dim, layers[2] is the size
            of the second LSTM and layers[3] the output_dim of the Dense layer.

    Returns:
        The model compiled with mse loss and the rmsprop optimizer.
    """
    #NOTE(review): input_dim/output_dim look like legacy Keras argument names - confirm
    #against the installed Keras version
    model = Sequential()

    model.add(LSTM(
        input_dim=layers[0],
        output_dim=layers[1],
        return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(
        layers[2],
        return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(
        output_dim=layers[3]))
    model.add(Activation("linear"))

    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print ("Compilation Time : ", time.time() - start)
    return model

def predict_point_by_point(model, data):
    """Return model.predict(data) flattened to one dimension."""
    #Predict each timestep given the last sequence of true data, in effect only predicting 1 step ahead each time
    predicted = model.predict(data)
    predicted = np.reshape(predicted, (predicted.size,))
    return predicted

def predict_sequence_full(model, data, window_size):
    """Predict len(data) steps, feeding each prediction back into the input window.

    Args:
        model: object whose predict() accepts a batch holding one window.
        data: array of windows; only data[0] seeds the prediction.
        window_size: number of steps in a window.

    Returns:
        List of len(data) predicted values.
    """
    #Shift the window by 1 new prediction each time, re-run predictions on new window
    curr_frame = data[0]
    predicted = []
    for _ in range(len(data)):
        predicted.append(model.predict(curr_frame[newaxis,:,:])[0,0])
        curr_frame = curr_frame[1:]
        curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
    return predicted

def predict_sequences_multiple(model, data, window_size, prediction_len):
    """Predict consecutive sequences of prediction_len steps each.

    Sequence i is seeded with data[i * prediction_len]; within a sequence each
    prediction is fed back into the input window.

    Args:
        model: object whose predict() accepts a batch holding one window.
        data: array of windows.
        window_size: number of steps in a window.
        prediction_len: number of steps predicted per sequence.

    Returns:
        List of len(data) // prediction_len lists of predicted values.
    """
    #Predict a sequence of prediction_len steps before shifting the prediction run forward by prediction_len steps
    prediction_seqs = []
    #Floor division: range() needs an integer number of sequences
    for i in range(len(data) // prediction_len):
        curr_frame = data[i*prediction_len]
        predicted = []
        for _ in range(prediction_len):
            predicted.append(model.predict(curr_frame[newaxis,:,:])[0,0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    return prediction_seqs
| import matplotlib.pyplot as plt 9 | 10 | #library for saving the trained models to files 11 | import joblib 12 | 13 | from defineInputs import getPathToTrainingAndTestingDataSets 14 | from defineInputs import getPathToGenerateModels 15 | 16 | #Data loading library 17 | from dataloadinglibrary import loadCSV 18 | 19 | from defineInputs import getLabelName 20 | 21 | #Data pre-processing library 22 | from datapreprocessinglibrary import splitCompleteDataSetIntoTrainingSetAndTestingSet 23 | 24 | #Feature selection library 25 | from featureselectionlibrary import featureSelectionUsingTheilU 26 | from featureselectionlibrary import featureSelectionUsingChisquaredTest 27 | from featureselectionlibrary import featureSelectionUsingRandomForestClassifier 28 | from featureselectionlibrary import featureSelectionUsingExtraTreesClassifier 29 | 30 | #feature encoding library 31 | from featureencodinglibrary import featureEncodingUsingOneHotEncoder 32 | from featureencodinglibrary import featureEncodingUsingLabelEncoder 33 | from featureencodinglibrary import featureEncodingUsingBinaryEncoder 34 | from featureencodinglibrary import featureEncodingUsingFrequencyEncoder 35 | 36 | #feature scaling library 37 | from featurescalinglibrary import featureScalingUsingMinMaxScaler 38 | from featurescalinglibrary import featureScalingUsingStandardScalar 39 | from featurescalinglibrary import featureScalingUsingBinarizer 40 | from featurescalinglibrary import featureScalingUsingNormalizer 41 | 42 | from classificationlibrary import classifyUsingDecisionTreeClassifier 43 | from classificationlibrary import classifyUsingLogisticRegression 44 | from classificationlibrary import classifyUsingLinearDiscriminantAnalysis 45 | from classificationlibrary import classifyUsingGaussianNB 46 | from classificationlibrary import classifyUsingRandomForestClassifier 47 | from classificationlibrary import classifyUsingExtraTreesClassifier 48 | from classificationlibrary import classifyUsingKNNClassifier 49 
from classificationlibrary import findingOptimumNumberOfNeighboursForKNN


def compareModels(arrayOfModels):
    """Plot the accuracy recorded for each model and return the best one.

    Every entry of ``arrayOfModels`` from index 1 onwards is read
    positionally: element 3 becomes the bar label / dictionary key and
    element 5 the plotted value. When the entries were filled in by
    ``performPreprocessingBuildModelsAndEvaluateAccuracy`` (and started with
    four elements) these are the classification name and the testing
    accuracy score.

    Entries that share the same element 3 overwrite each other; only the
    last one is plotted and compared.

    Args:
        arrayOfModels: sequence of per-model lists as described above.

    Returns:
        A one-item dict mapping the best entry's element 3 to its element 5.
        On ties the first entry with the highest value wins.

    Raises:
        ValueError: if there is no entry after index 0 to compare.
    """
    # Entry 0 is skipped, exactly as in the model-building function below.
    # NOTE(review): presumably entry 0 is a header/placeholder row - confirm.
    modelsAndAccuracies = {data[3]: data[5] for data in arrayOfModels[1:]}
    if not modelsAndAccuracies:
        raise ValueError(
            "compareModels needs at least one evaluated model after index 0"
        )

    # Look the winner up once instead of evaluating max() twice.
    bestModel = max(modelsAndAccuracies, key=modelsAndAccuracies.get)
    bestModelAndItsAccuracy = {bestModel: modelsAndAccuracies[bestModel]}

    sns.set_style("whitegrid")
    plt.figure(figsize=(5,5))
    plt.ylabel("Algorithms",fontsize=10)
    plt.xlabel("Accuracy %",fontsize=10)
    plt.title("Comparing the models based on the accuries achieved",fontsize=15)
    sns.barplot(x=list(modelsAndAccuracies.values()), y=list(modelsAndAccuracies.keys()))
    plt.show()
    return bestModelAndItsAccuracy


def _resolvePipelineStep(stepDescription, requestedOption, availableOptions):
    """Return the function registered under ``requestedOption``.

    Raises:
        ValueError: if ``requestedOption`` is not a key of
            ``availableOptions``; the message lists the valid keys.
    """
    try:
        return availableOptions[requestedOption]
    except KeyError:
        raise ValueError(
            "Unknown {} option {!r}; expected one of: {}".format(
                stepDescription, requestedOption, ", ".join(availableOptions)
            )
        ) from None


### Below function is responsible for performing pre-processing, training, evaluation, persisting model
def performPreprocessingBuildModelsAndEvaluateAccuracy(trainingDataSet, testingDataSet, arrayOfModels):
    """Pre-process the data, train, evaluate and persist every requested model.

    Each entry of ``arrayOfModels`` from index 1 onwards is a list whose
    first four elements name, in order, the feature selection, feature
    encoding, feature scaling and classification technique to use. Entry 0
    is skipped (NOTE(review): presumably a header row - confirm).

    For every processed entry the function appends, in this order: the
    training accuracy score, the testing accuracy score, the model name
    (the four option names joined with ``_``) and the file name the
    classifier was dumped to with ``joblib``.

    All option names are validated before any model is built, so a
    misspelled option no longer fails half-way through a run or silently
    reuses the data set / classifier of the previous model.

    Args:
        trainingDataSet: not used. The training data is re-read from the
            path returned by ``getPathToTrainingAndTestingDataSets`` for
            every model; the parameter is kept for backward compatibility.
        testingDataSet: not used, for the same reason.
        arrayOfModels: sequence of model specifications; mutated in place.

    Raises:
        ValueError: if any specification names an unknown option.
    """
    featureSelectors = {
        'TheilsU': featureSelectionUsingTheilU,
        'Chi-SquaredTest': featureSelectionUsingChisquaredTest,
        'RandomForestClassifier': featureSelectionUsingRandomForestClassifier,
        'ExtraTreesClassifier': featureSelectionUsingExtraTreesClassifier,
    }
    featureEncoders = {
        'LabelEncoder': featureEncodingUsingLabelEncoder,
        'OneHotEncoder': featureEncodingUsingOneHotEncoder,
        'FrequencyEncoder': featureEncodingUsingFrequencyEncoder,
        'BinaryEncoder': featureEncodingUsingBinaryEncoder,
    }
    featureScalers = {
        'Min-Max': featureScalingUsingMinMaxScaler,
        'Binarizing': featureScalingUsingBinarizer,
        'Normalizing': featureScalingUsingNormalizer,
        'Standardization': featureScalingUsingStandardScalar,
    }
    # The misspelled keys are the names existing callers pass in, so they
    # are kept exactly as they are.
    classifiers = {
        'DecisonTree': classifyUsingDecisionTreeClassifier,
        'RandomForestClassifier': classifyUsingRandomForestClassifier,
        'ExtraTreesClassifier': classifyUsingExtraTreesClassifier,
        'LogisticRegressionRegression': classifyUsingLogisticRegression,
        'LinearDiscriminantAnalysis': classifyUsingLinearDiscriminantAnalysis,
        'GuassianNaiveBayes': classifyUsingGaussianNB,
        'KNN': classifyUsingKNNClassifier,
    }

    # Slicing copies only the outer sequence; the inner lists are the
    # caller's objects, so the appends below remain visible to the caller.
    pendingModels = list(enumerate(arrayOfModels[1:], start=1))

    # Resolve every step before doing any work so that a bad option is
    # reported before time is spent on training.
    pipelines = [
        (
            _resolvePipelineStep('feature selection', modelSpec[0], featureSelectors),
            _resolvePipelineStep('feature encoding', modelSpec[1], featureEncoders),
            _resolvePipelineStep('feature scaling', modelSpec[2], featureScalers),
            _resolvePipelineStep('classification', modelSpec[3], classifiers),
        )
        for _, modelSpec in pendingModels
    ]

    for (i, modelSpec), pipeline in zip(pendingModels, pipelines):
        selectFeatures, encodeFeatures, scaleFeatures, classify = pipeline

        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', modelSpec[0], ' \n\t -- Feature Encoding: \t ', modelSpec[1], ' \n\t -- Feature Scaling: \t ', modelSpec[2], ' \n\t -- Classification: \t ', modelSpec[3], '\n')

        # The data is re-read for every model.
        # NOTE(review): presumably so each pipeline starts from untouched
        # data - confirm before hoisting this out of the loop.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        loadedTrainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        loadedTestingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Combine the training and testing data sets and pre-process them
        # together: in some data sets the categorical columns hold different
        # values in the two files, which causes issues when the
        # classification techniques are applied.
        completeDataSet = pd.concat((loadedTrainingDataSet, loadedTestingDataSet))

        #difficultyLevel = completeDataSet.pop('difficulty_level')

        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        # Feature selection -> feature encoding -> feature scaling
        completeDataSetAfterFeatureSelection = selectFeatures(completeDataSet)
        completeEncodedDataSet = encodeFeatures(completeDataSetAfterFeatureSelection)
        completeEncodedAndScaledDataset = scaleFeatures(completeEncodedDataSet)

        # Split the complete data set into training and testing data sets
        (featuresInPreProcessedTrainingDataSet,
         featuresInPreProcessedTestingDataSet,
         labelInPreProcessedTrainingDataSet,
         labelInPreProcessedTestingDataSet) = splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

        trainingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTrainingDataSet, labelInPreProcessedTrainingDataSet], axis=1, sort=False)
        testingEncodedAndScaledDataset = pd.concat([featuresInPreProcessedTestingDataSet, labelInPreProcessedTestingDataSet], axis=1, sort=False)

        # Classification
        classifier, trainingAccuracyScore, testingAccuracyScore = classify(trainingEncodedAndScaledDataset, testingEncodedAndScaledDataset)

        # compareModels reads the testing accuracy positionally, so the
        # order of these appends must not change.
        modelSpec.append(trainingAccuracyScore)
        modelSpec.append(testingAccuracyScore)

        modelName = modelSpec[0]+"_"+modelSpec[1]+"_"+modelSpec[2]+"_"+modelSpec[3]
        # Plain concatenation: assumes the configured path already ends
        # with a path separator - TODO confirm.
        modelFileName = getPathToGenerateModels() + modelName+".pkl"
        modelSpec.append(modelName)
        modelSpec.append(modelFileName)
        #Save the model to file
        joblib.dump(classifier, modelFileName)
def performPreprocessing(trainingDataSet, testingDataSet, arrayOfModels):
    """Run feature selection, encoding and scaling for each requested model.

    Each entry of ``arrayOfModels`` (starting at index 0) is a sequence
    whose first three elements name, in order, the feature selection,
    feature encoding and feature scaling technique to apply. For every
    entry the training and testing CSV files are loaded, concatenated and
    passed through the three steps.

    All option names are validated before any data is loaded, so an
    unknown option can no longer silently reuse the result of a previous
    entry.

    Args:
        trainingDataSet: not used. The training data is re-read from the
            path returned by ``getPathToTrainingAndTestingDataSets`` for
            every entry; the parameter is kept for backward compatibility.
        testingDataSet: not used, for the same reason.
        arrayOfModels: sequence of preprocessing specifications.

    Returns:
        The encoded and scaled complete data set produced for the last
        entry of ``arrayOfModels``.

    Raises:
        ValueError: if ``arrayOfModels`` is empty or names an unknown
            option.
    """
    modelSpecs = list(arrayOfModels)
    if not modelSpecs:
        raise ValueError(
            "performPreprocessing needs at least one model specification"
        )

    # (description, position inside a specification, option name -> function)
    preprocessingStages = (
        ('feature selection', 0, {
            'TheilsU': featureSelectionUsingTheilU,
            'Chi-SquaredTest': featureSelectionUsingChisquaredTest,
            'RandomForestClassifier': featureSelectionUsingRandomForestClassifier,
            'ExtraTreesClassifier': featureSelectionUsingExtraTreesClassifier,
        }),
        ('feature encoding', 1, {
            'LabelEncoder': featureEncodingUsingLabelEncoder,
            'OneHotEncoder': featureEncodingUsingOneHotEncoder,
            'FrequencyEncoder': featureEncodingUsingFrequencyEncoder,
            'BinaryEncoder': featureEncodingUsingBinaryEncoder,
        }),
        ('feature scaling', 2, {
            'Min-Max': featureScalingUsingMinMaxScaler,
            'Binarizing': featureScalingUsingBinarizer,
            'Normalizing': featureScalingUsingNormalizer,
            'Standardization': featureScalingUsingStandardScalar,
        }),
    )

    # Resolve every step up front so that a bad option is reported before
    # any file is read.
    pipelines = []
    for modelSpec in modelSpecs:
        steps = []
        for stageDescription, position, options in preprocessingStages:
            requestedOption = modelSpec[position]
            if requestedOption not in options:
                raise ValueError(
                    "Unknown {} option {!r}; expected one of: {}".format(
                        stageDescription, requestedOption, ", ".join(options)
                    )
                )
            steps.append(options[requestedOption])
        pipelines.append(steps)

    for i, (modelSpec, steps) in enumerate(zip(modelSpecs, pipelines)):
        print('***************************************************************************************************************************')
        print('********************************************* Building Model-', i ,' As Below *************************************************')
        print('\t -- Feature Selection: \t ', modelSpec[0], ' \n\t -- Feature Encoding: \t ', modelSpec[1], ' \n\t -- Feature Scaling: \t ', modelSpec[2], '\n')

        # The data is re-read for every entry.
        # NOTE(review): presumably so each pipeline starts from untouched
        # data - confirm before hoisting this out of the loop.
        trainingFileNameWithAbsolutePath, testingFileNameWithAbsolutePath = getPathToTrainingAndTestingDataSets()
        loadedTrainingDataSet = loadCSV(trainingFileNameWithAbsolutePath)
        loadedTestingDataSet = loadCSV(testingFileNameWithAbsolutePath)

        # Combine the training and testing data sets and pre-process them
        # together: in some data sets the categorical columns hold different
        # values in the two files, which causes issues when the
        # classification techniques are applied.
        completeDataSet = pd.concat((loadedTrainingDataSet, loadedTestingDataSet))

        #difficultyLevel = completeDataSet.pop('difficulty_level')

        print("completeDataSet.shape: ",completeDataSet.shape)
        print("completeDataSet.head: ",completeDataSet.head())

        # Feature selection -> feature encoding -> feature scaling
        completeEncodedAndScaledDataset = completeDataSet
        for applyStep in steps:
            completeEncodedAndScaledDataset = applyStep(completeEncodedAndScaledDataset)

        # The split results are not needed here, but the call is kept.
        # NOTE(review): it may print or modify the data set in place -
        # confirm before removing it.
        splitCompleteDataSetIntoTrainingSetAndTestingSet(completeEncodedAndScaledDataset)

    # NOTE(review): the indentation of this return was lost in the source;
    # it is placed after the loop, so the result of the last entry is
    # returned - confirm.
    return completeEncodedAndScaledDataset