├── .gitattributes ├── Data_Set_Generator .ipynb ├── Decision_Tree.ipynb ├── K_Nearest_Neighbors.ipynb ├── LICENSE ├── README.md ├── Random_Forest.ipynb └── data-set ├── MalwareDataSet.csv └── dataset.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /Data_Set_Generator .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f51341ce", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data Set Generator " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "38f87734", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pefile\n", 19 | "import csv\n", 20 | "import glob" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "93abc748", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# pefile : It is used to extract pe information of exe files.\n", 31 | "# csv : Used to create csv file.\n", 32 | "# glob : It is used to access folders." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "3de03b84", 38 | "metadata": {}, 39 | "source": [ 40 | "First create two folders: put your malware samples in one and your safe (benign) executables in the other. Then access the folders with glob as follows." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "26392386", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "malware = glob.glob('../malwares_folder//*.exe') # change path" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "e0833f40", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "secure = glob.glob('../secures_folder//*.exe') # change path" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "70f615e1", 66 | "metadata": {}, 67 | "source": [ 68 | "Define header information in dataset" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "f2111f89", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "header =[\"AddressOfEntryPoint\",\"MajorLinkerVersion\",\"MajorImageVersion\",\"MajorOperatingSystemVersion\",\"DllCharacteristics\",\"SizeOfStackReserve\",\"NumberOfSections\",\"ResourceSize\",\"IfMalware\"]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "9726a7e4", 84 | "metadata": {}, 85 | "source": [ 86 | "Use the following code structure to generate dataset in csv file format:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "a098d0b9", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "with open('dataset.csv', 'w', encoding='UTF8', newline='') as f:\n", 97 | " writer = csv.writer(f)\n", 98 | "\n", 99 | " writer.writerow(header) # header info added\n", 100 | "\n", 101 | " # We added the pe information for Malware folder:\n", 102 | " \n", 103 | " pe = pefile.PE(file)\n", 104 | " a = str(pe.OPTIONAL_HEADER.AddressOfEntryPoint)\n", 105 | " b = str(pe.OPTIONAL_HEADER.MajorLinkerVersion)\n", 106 | " c = str(pe.OPTIONAL_HEADER.MajorImageVersion)\n", 107 | " d = str(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)\n", 108 | " e = str(pe.OPTIONAL_HEADER.DllCharacteristics)\n", 109 | " f = str(pe.OPTIONAL_HEADER.SizeOfStackReserve)\n", 
110 | " g = str(pe.FILE_HEADER.NumberOfSections)\n", 111 | " h = str(pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size)\n", 112 | " i = \"1\" # dengerous exe file value\n", 113 | " \n", 114 | " data = [a,b,c,d,e,f,g,h,i]\n", 115 | " writer.writerow(data)\n", 116 | " \n", 117 | " # We added the pe information for safe software folder:\n", 118 | " \n", 119 | " for file in secure:\n", 120 | " pe = pefile.PE(file)\n", 121 | " a = str(pe.OPTIONAL_HEADER.AddressOfEntryPoint)\n", 122 | " b = str(pe.OPTIONAL_HEADER.MajorLinkerVersion)\n", 123 | " c = str(pe.OPTIONAL_HEADER.MajorImageVersion)\n", 124 | " d = str(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)\n", 125 | " e = str(pe.OPTIONAL_HEADER.DllCharacteristics)\n", 126 | " f = str(pe.OPTIONAL_HEADER.SizeOfStackReserve)\n", 127 | " g = str(pe.FILE_HEADER.NumberOfSections)\n", 128 | " h = str(pe.OPTIONAL_HEADER.DATA_DIRECTORY[2].Size)\n", 129 | " i = \"0\" # safe exe file value\n", 130 | " \n", 131 | " data = [a,b,c,d,e,f,g,h,i]\n", 132 | " writer.writerow(data)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "15004a13", 138 | "metadata": {}, 139 | "source": [ 140 | "We created our own dataset by extracting the PE information of the exe files and writing them to the csv file." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "30181e79", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.9.10" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 5 173 | } 174 | -------------------------------------------------------------------------------- /Decision_Tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3cec636e", 6 | "metadata": {}, 7 | "source": [ 8 | "# Malware Analysis With Machine Learning - Decision Tree" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "c2240e7d", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn import tree\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "from sklearn.metrics import plot_confusion_matrix,plot_precision_recall_curve,plot_roc_curve" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "f2253395", 29 | "metadata": {}, 30 | "source": [ 31 | "Data set reading :" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "7569420f", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "data = pd.read_csv('../Malware-Detection-Using-Machine-Learning/data-set/MalwareDataSet.csv') " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 
47 | "id": "206b4c35", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
0104079663308826214449521
153549663308826214449521
2588079663308826214441364901
32516696633088262144419401
470387966330882621444830981
..............................
13743912329111053308810485765816540
137440400002613276810485768676240
1374415961010053308810485765226480
1374425121620101048576822160
13744322731110533088104857653184640
\n", 217 | "

137444 rows × 9 columns

\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 222 | "0 10407 9 6 \n", 223 | "1 5354 9 6 \n", 224 | "2 58807 9 6 \n", 225 | "3 25166 9 6 \n", 226 | "4 70387 9 6 \n", 227 | "... ... ... ... \n", 228 | "137439 123291 11 0 \n", 229 | "137440 40000 2 6 \n", 230 | "137441 59610 10 0 \n", 231 | "137442 51216 2 0 \n", 232 | "137443 22731 11 0 \n", 233 | "\n", 234 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 235 | "0 6 33088 262144 \n", 236 | "1 6 33088 262144 \n", 237 | "2 6 33088 262144 \n", 238 | "3 6 33088 262144 \n", 239 | "4 6 33088 262144 \n", 240 | "... ... ... ... \n", 241 | "137439 5 33088 1048576 \n", 242 | "137440 1 32768 1048576 \n", 243 | "137441 5 33088 1048576 \n", 244 | "137442 1 0 1048576 \n", 245 | "137443 5 33088 1048576 \n", 246 | "\n", 247 | " NumberOfSections ResourceSize legitimate \n", 248 | "0 4 952 1 \n", 249 | "1 4 952 1 \n", 250 | "2 4 136490 1 \n", 251 | "3 4 1940 1 \n", 252 | "4 4 83098 1 \n", 253 | "... ... ... ... \n", 254 | "137439 5 81654 0 \n", 255 | "137440 8 67624 0 \n", 256 | "137441 5 22648 0 \n", 257 | "137442 8 2216 0 \n", 258 | "137443 5 318464 0 \n", 259 | "\n", 260 | "[137444 rows x 9 columns]" 261 | ] 262 | }, 263 | "execution_count": 3, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "data" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 4, 275 | "id": "4b8b8b6c", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "(137444, 9)" 282 | ] 283 | }, 284 | "execution_count": 4, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "data.shape" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 5, 296 | "id": "65dbf455", 297 | "metadata": { 298 | "scrolled": true 299 | }, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/html": [ 304 | "
\n", 305 | "\n", 318 | "\n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
count1.374440e+05137444.000000137444.000000137444.000000137444.0000001.374440e+05137444.0000001.374440e+05137444.000000
mean1.722186e+058.62078468.7318765.09873822301.0434369.306841e+054.9971192.474766e+050.297707
std3.438014e+064.0956351185.70987399.43758415444.7532195.553175e+051.9172372.129516e+070.457252
min0.000000e+000.0000000.0000000.0000000.0000000.000000e+001.0000000.000000e+000.000000
25%1.272100e+048.0000000.0000004.000000320.0000001.048576e+064.0000002.216000e+030.000000
50%5.300800e+049.0000000.0000005.00000033088.0000001.048576e+065.0000009.640000e+030.000000
75%6.157800e+0410.0000006.0000005.00000033088.0000001.048576e+065.0000002.376250e+041.000000
max1.074484e+09255.00000028619.00000036868.00000049504.0000003.355443e+0740.0000004.294903e+091.000000
\n", 432 | "
" 433 | ], 434 | "text/plain": [ 435 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 436 | "count 1.374440e+05 137444.000000 137444.000000 \n", 437 | "mean 1.722186e+05 8.620784 68.731876 \n", 438 | "std 3.438014e+06 4.095635 1185.709873 \n", 439 | "min 0.000000e+00 0.000000 0.000000 \n", 440 | "25% 1.272100e+04 8.000000 0.000000 \n", 441 | "50% 5.300800e+04 9.000000 0.000000 \n", 442 | "75% 6.157800e+04 10.000000 6.000000 \n", 443 | "max 1.074484e+09 255.000000 28619.000000 \n", 444 | "\n", 445 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 446 | "count 137444.000000 137444.000000 1.374440e+05 \n", 447 | "mean 5.098738 22301.043436 9.306841e+05 \n", 448 | "std 99.437584 15444.753219 5.553175e+05 \n", 449 | "min 0.000000 0.000000 0.000000e+00 \n", 450 | "25% 4.000000 320.000000 1.048576e+06 \n", 451 | "50% 5.000000 33088.000000 1.048576e+06 \n", 452 | "75% 5.000000 33088.000000 1.048576e+06 \n", 453 | "max 36868.000000 49504.000000 3.355443e+07 \n", 454 | "\n", 455 | " NumberOfSections ResourceSize legitimate \n", 456 | "count 137444.000000 1.374440e+05 137444.000000 \n", 457 | "mean 4.997119 2.474766e+05 0.297707 \n", 458 | "std 1.917237 2.129516e+07 0.457252 \n", 459 | "min 1.000000 0.000000e+00 0.000000 \n", 460 | "25% 4.000000 2.216000e+03 0.000000 \n", 461 | "50% 5.000000 9.640000e+03 0.000000 \n", 462 | "75% 5.000000 2.376250e+04 1.000000 \n", 463 | "max 40.000000 4.294903e+09 1.000000 " 464 | ] 465 | }, 466 | "execution_count": 5, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "data.describe()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 6, 478 | "id": "7378e7cf", 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "legitimate\n", 485 | "0 96526\n", 486 | "1 40918\n", 487 | "dtype: int64" 488 | ] 489 | }, 490 | "execution_count": 6, 491 | "metadata": {}, 492 | "output_type": 
"execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "data.groupby(data['legitimate']).size()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "id": "2eab99e9", 502 | "metadata": {}, 503 | "source": [ 504 | "We have a total of 137,444 samples. The `legitimate` column is 1 for safe files and 0 for malware, so there are 40,918 safe files and 96,526 malware samples." 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 7, 510 | "id": "cea0ed83", 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "features = data.iloc[:,[0,1,2,3,4,5,6,7]].values # extracting the first 8 columns from the dataset - features" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 8, 520 | "id": "0d9d18ee", 521 | "metadata": {}, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [ 526 | "array([[ 10407, 9, 6, ..., 262144, 4, 952],\n", 527 | " [ 5354, 9, 6, ..., 262144, 4, 952],\n", 528 | " [ 58807, 9, 6, ..., 262144, 4, 136490],\n", 529 | " ...,\n", 530 | " [ 59610, 10, 0, ..., 1048576, 5, 22648],\n", 531 | " [ 51216, 2, 0, ..., 1048576, 8, 2216],\n", 532 | " [ 22731, 11, 0, ..., 1048576, 5, 318464]])" 533 | ] 534 | }, 535 | "execution_count": 8, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "features" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 9, 547 | "id": "dfb50f62", 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "ifMalware = data.iloc[:,8].values # extracting the legitimate column of the dataset - safe & malware results" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 10, 557 | "id": "f323a0b1", 558 | "metadata": {}, 559 | "outputs": [ 560 | { 561 | "data": { 562 | "text/plain": [ 563 | "array([1, 1, 1, ..., 0, 0, 0])" 564 | ] 565 | }, 566 | "execution_count": 10, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "ifMalware" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | 
"id": "9006bce3", 578 | "metadata": {}, 579 | "source": [ 580 | "We will use 75% of our dataset for training and 25% for testing : " 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 11, 586 | "id": "0657650f", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "features_train, features_test, ifMalware_train, ifMalware_test = train_test_split(features, ifMalware, test_size=0.25)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "id": "4edcf3e5", 596 | "metadata": {}, 597 | "source": [ 598 | "Learning and Predict :" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 12, 604 | "id": "ab2b2a4a", 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "dtModel = tree.DecisionTreeClassifier() # Defined the model." 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 13, 614 | "id": "d458930d", 615 | "metadata": {}, 616 | "outputs": [ 617 | { 618 | "data": { 619 | "text/plain": [ 620 | "DecisionTreeClassifier()" 621 | ] 622 | }, 623 | "execution_count": 13, 624 | "metadata": {}, 625 | "output_type": "execute_result" 626 | } 627 | ], 628 | "source": [ 629 | "dtModel.fit(features_train, ifMalware_train) # Provided training data." 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "id": "5ede2da4", 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "dtPredict = dtModel.predict(features_test) # Give the test data then call predict." 
640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "id": "9d67adc3", 645 | "metadata": {}, 646 | "source": [ 647 | "Result :" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 15, 653 | "id": "1faa7928", 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "Number of mislabeled out of a total of 34361 test entries: 483\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "print(\"Number of mislabeled out of a total of %d test entries: %d\" % (features_test.shape[0], \n", 666 | " (ifMalware_test != dtPredict).sum()))" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 16, 672 | "id": "14c7dc90", 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "successRate = 100 * f1_score(ifMalware_test, dtPredict, average='micro') # Success rate calculation" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 17, 682 | "id": "85d3f74a", 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "name": "stdout", 687 | "output_type": "stream", 688 | "text": [ 689 | "The Success Rate was calculated as % : 98.59433660254359 with the Decision Tree.\n" 690 | ] 691 | } 692 | ], 693 | "source": [ 694 | "print(\"The Success Rate was calculated as % : \" + str(successRate) + \" with the Decision Tree.\")" 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "id": "ffda807e", 700 | "metadata": {}, 701 | "source": [ 702 | "Visualization :" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 18, 708 | "id": "d6799629", 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "" 715 | ] 716 | }, 717 | "execution_count": 18, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | }, 721 | { 722 | "data": { 723 | "image/png": "\n", 724 | "text/plain": [ 725 | "
" 726 | ] 727 | }, 728 | "metadata": { 729 | "needs_background": "light" 730 | }, 731 | "output_type": "display_data" 732 | } 733 | ], 734 | "source": [ 735 | "plot_confusion_matrix(dtModel,features_test,ifMalware_test)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": 19, 741 | "id": "caaf81cf", 742 | "metadata": {}, 743 | "outputs": [ 744 | { 745 | "data": { 746 | "text/plain": [ 747 | "" 748 | ] 749 | }, 750 | "execution_count": 19, 751 | "metadata": {}, 752 | "output_type": "execute_result" 753 | }, 754 | { 755 | "data": { 756 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAmgklEQVR4nO3de5xWZb338c83xEBUUhj3qzgEuPEAAhMMKrK3oIgHUkjNYx5L3bg9lYXpU6Gi9WQecnvYAnnAegwPVIJKQmqamQfAEAVPSKQQ5YgcNAQBf88fa810M8zMvYaZ+x5n7u/79ZrXrMO11vpdN46/e13XWteliMDMzErXZ5o7ADMza15OBGZmJc6JwMysxDkRmJmVOCcCM7MSt11zB9BQnTt3jh49ejR3GGZmLcq8efPei4iy2va1uETQo0cP5s6d29xhmJm1KJL+Wtc+Nw2ZmZU4JwIzsxLnRGBmVuKcCMzMSpwTgZlZiStYIpB0p6R3Jb1Sx35JuknSYkkLJA0sVCxmZla3Qt4RTAEOr2f/EUDv9Occ4LYCxmJmZnUoWCKIiD8A79dTZAzw80g8B3xO0ucLFc+VDy3kyocWFur0ZmYtVnO+UNYFeCdnfVm6bUXNgpLOIblroHv37tt0sUV/W7tNx5mZtXYtorM4IiZHREVEVJSV1fqGtJmZbaPmTATLgW45613TbWZmVkTNmQhmAKelTw/tD6yJiK2ahczMrLAK1kcgaSowHOgsaRlwOdAWICImAjOBUcBiYB1wZqFiMTOzuhUsEUTESXn2B3Beoa5vZmbZtLhhqBtj0Yq1nDDp2eYOw8ysTmPKu3Dyftv2dOS2KplEMKa8S3OHYGZWr+f/krx65URQICfv173oH66ZWUM0V4tFi3iPwMzMCseJwMysxDkRmJmVOCcCM7MS50RgZlbinAjMzEqcE4GZWYlzIjAzK3FOBGZmJc6JwMysxDkRmJmVOCcCM7MS50RgZlbinAjMzEqcE4GZWYlzIjAzK3EFTQSSDpf0uqTFki6tZf8XJT0uaYGkJyV1LWQ8Zma2tYIlAkltgFuBI4A+wEmS+tQodh3w84joD0wA/m+h4jEzs9oV8o5gX2BxRCyJiI+Be4ExNcr0AZ5Il39fy34zMyuwQiaCLsA7OevL0m25XgKOSZePBnaS1KnmiSSdI2mupLmVlZUFCdbMrFQ1d2fxd4Bhkv4MDAOWA5trFoqIyRFREREVZWVlxY7RzKxV266A514OdMtZ75puqxYRfyO9I5C0I3BsRKwuYExmZlZDIe8I5gC9JfWUtD1wIjAjt4CkzpKqYrgMuLOA8ZiZWS0KlggiYhNwPjALeBW4PyIWSpogaXRabDjwuqQ3gH8DflioeMzMrHaFbBoiImYCM2tsG5+zPA2YVsgYzMysfpkSgaTdgKH
AF4CPgFeAuRHxSQFjMzOzIqg3EUg6CLgU2BX4M/Au0A74CrC7pGnA9RGxtsBxmplZgeS7IxgFnB0Rb9fcIWk74EhgJPCrAsRmZmZFUG8iiIhx9ezbBDzY1AGZmVlxbfNTQ5LObMpAzMyseTTm8dErmywKMzNrNvk6ixfUtYvkuX8zM2vh8nUW/xtwGLCqxnYBfypIRGZmVlT5EsHDwI4RMb/mDklPFiIgMzMrrnxPDX2jnn0nN304ZmZWbM09DLWZmTUzJwIzsxLnRGBmVuKcCMzMSlzmRCBpcn3rZmbWMjXkjmBSnnUzM2uBMieCiJhX37qZmbVM+YaYeAiIuvZHxOi69pmZWcuQ783i64oShZmZNZt8bxY/VbUsqT3QPSJez3pySYcD/wO0AW6PiB/X2N8duBv4XFrm0nSeYzMzK5JMfQSSjgLmA4+m6+WSZuQ5pg1wK3AE0Ac4SVKfGsW+D9wfEV8CTgT+t0HRm5lZo2XtLL4C2BdYDZAOQtczzzH7AosjYklEfAzcC4ypUSaAndPljsDfMsZjZmZNJF8fQZWNEbFGUu62OjuRU12Ad3LWlwH71ShzBTBb0gVAB+CQjPGYmVkTyXpHsFDSyUAbSb0l3UzTzEdwEjAlIroCo4BfSNoqJknnSJoraW5lZWUTXNbMzKpkTQQXAH2BDcBUYC3wzTzHLAe65ax3Tbfl+gZwP0BEPAu0AzrXPFFETI6IioioKCsryxiymZllkalpKCLWAd+TdE2yGh9kOGwO0FtST5IEcCJQcw6Dt4ERwBRJe5MkAn/lNzMroqxPDQ2W9DKwAHhZ0kuSBtV3TERsAs4HZgGvkjwdtFDSBElVL6J9Gzhb0kskdxpnRES+vgczM2tCWTuL7wD+OyKeBpD0H8BdQP/6DkrfCZhZY9v4nOVFwNCGBGxmZk0rax/B5qokABARfwQ2FSYkMzMrpnxjDQ1MF5+SNImk+SaAE4AnCxuamZkVQ76moetrrF+es+y2fDOzViDfWEMHFSsQMzNrHlk7i5H0ZZJ3CdpVbYuICYUIyszMiifr46MTSfoFLgAEHAd8sYBxmZlZkWR9auiAiDgNWBURVwJDgD0KF5aZmRVL1kTwUfp7naQvABuBzxcmJDMzK6asfQQPS/occC3wIskTQ7cXKigzMyuerGMNXZUu/krSw0C7iFhTuLDMzKxY8r1Qdkw9+4iIXzd9SGZmVkz57giOqmdfAE4EZmYtXL4Xys4sViBmZtY8sj41ZGZmrZQTgZlZiXMiMDMrcVmHmNhB0g8k/Sxd7y3pyMKGZmZmxZD1juAukonrh6Try4GrCxKRmZkVVdZEsHtE/IRkaImqyexVsKjMzKxosiaCjyW1J52MRtLuJHcIZmbWwmVNBFcAjwLdJN0DPA5cku8gSYdLel3SYkmX1rL/p5Lmpz9vSFrdgNjNzKwJZB1raLakecD+JE1CF0XEe/UdI6kNcCswElgGzJE0IyIW5Zz3WznlLwC+1PAqmJlZY2R9augh4FDgyYh4OF8SSO0LLI6IJRHxMXAvMKae8icBU7PEY2ZmTSdr09B1wH8CiyRNk/RVSe3yHNMFeCdnfVm6bSuSvgj0BJ6oY/85kuZKmltZWZkxZDMzyyJTIoiIpyLiv4FewCTgeODdJozjRGBaRGyu4/qTI6IiIirKysqa8LJmZtaQyevbk4xGegIwELg7zyHLgW45613TbbU5ETgvayxmZtZ0MiUCSfeTtPk/CtwCPBURn+Q5bA7QW1JPkgRwInByLefeC9gFeLYBcZuZWRPJekdwB3BSXU03tYmITZLOB2YBbYA7I2KhpAnA3IiYkRY9Ebg3IqIhgZuZWdPIN0PZwRHxBNABGCNt+TJxvhnKImImMLPGtvE11q9oQLxmZtbE8t0RDCN5kqe2mco8Q5mZWSuQb4ayy9PFCRHxl9x9adu/mZm1cFnfI/hVLdumNWUgZmbWPPL1EewF9AU6SjomZ9fOQL4XyszMrAXI10e
wJ3Ak8Dm27Cf4ADi7QDGZmVkR5esjmA5MlzQkIvycv5lZK5SvaeiSdEKakyWdVHN/RFxYsMjMzKwo8jUNvZr+nlvoQMzMrHnkaxp6KP1dPa6QpM8AO0bE2gLHZmZmRZB1PoJfStpZUgfgFZLhqMcVNjQzMyuGrO8R9EnvAL4C/JZk7oBTCxWUmZkVT9ZE0FZSW5JEMCMiNpJOZG9mZi1b1kQwCVhKMvjcH9IZxdxHYGbWCmSdvP4m4KacTX+VdFBhQjIzs2LK2lncUdINVfMGS7qe5O7AzMxauKxNQ3eSDCtxfPqzFrirUEGZmVnxZJ2hbPeIODZn/UpJ8wsQj5mZFVnWO4KPJP1H1YqkocBHhQnJzMyKKesdwVjg55I6puurgNMLE5KZmRVT3jsCSeVAb5JJ5vsD/SPiSxGxIMOxh0t6XdJiSZfWUeZ4SYskLZT0ywbGb2ZmjVRvIpA0HrgfOBZ4BDgh6xhDktoAtwJHAH2AkyT1qVGmN3AZMDQi+gLfbGgFzMyscfI1DZ0AlEfEOkmdgEeBn2U8977A4ohYAiDpXmAMsCinzNnArRGxCiAi3m1I8GZm1nj5moY2RMQ6gIhYmaF8ri7AOznry9JtufYA9pD0jKTnJB1e24kknVP1DkNlZWUDQjAzs3zy3RH0kjQjXRawe846ETG6Ca7fGxgOdCUZvqJfRKzOLRQRk4HJABUVFR7jyMysCeVLBGNqrF/XgHMvB7rlrHdNt+VaBjyfDmL3F0lvkCSGOQ24jpmZNUK+iWmeasS55wC9JfUkSQAnAifXKPMgcBJwl6TOJE1FSxpxTTMza6B8Tw09JOmodAjqmvt6SZog6eu1HRsRm4DzgVkkU17eHxEL02OqmpRmASslLQJ+D4xL+yLMzKxI8jUNnQ1cDNwo6X2gEmgH9ADeAm6JiOl1HRwRM4GZNbaNz1mO9PwXb0vwZmbWePmahv4OXAJcIqkH8HmSoSXeqHqayMzMWrasQ0wQEUtJJqcxM7NWpCHvBZiZWSvkRGBmVuKcCMzMSlymPoJ0/oErgC+mx4jkoZ9ehQvNzMyKIWtn8R3At4B5wObChWNmZsWWNRGsiYjfFjQSMzNrFlkTwe8lXQv8GthQtTEiXixIVGZmVjRZE8F+6e+KnG0BHNy04ZiZWbFlSgQRcVChAzEzs+aR6fFRSR0l3VA1OYyk63MmsjczsxYs63sEdwIfAMenP2uBuwoVlJmZFU/WPoLdI+LYnPUrJc0vQDxmZlZkWe8IPpL0H1Ur6QtmHxUmJDMzK6asdwTnAnen/QIC3gfOKFRQZmZWPFmfGpoPDJC0c7q+tpBBmZlZ8dSbCCSdEhH/T9LFNbYDEBE3FDA2MzMrgnx3BB3S3zsVOhAzM2se+aaqnJT+vnJbTi7pcOB/gDbA7RHx4xr7zwCuBZanm26JiNu35VpmZrZtsr5Q9hNJO0tqK+lxSZWSTslzTBvgVuAIoA9wkqQ+tRS9LyLK0x8nATOzIsv6+OihaQfxkSTzFv87MC7PMfsCiyNiSUR8DNwLjNnWQM3MrDCyJoKqJqQvAw9ExJoMx3QB3slZX5Zuq+lYSQskTZPUrbYTSTqnaniLysrKjCGbmVkWWRPBw5JeAwYBj0sqA9Y3wfUfAnpERH/gd8DdtRWKiMkRURERFWVlZU1wWTMzq5IpEUTEpcABQEVEbAT+Sf5mnuVA7jf8rvyrU7jqvCsjomp+g9tJEo2ZmRVRvvcIDo6IJyQdk7Mtt8iv6zl8DtBbUk+SBHAicHKN838+Ilakq6OBVxsQu5mZNYF87xEMA54AjqplX1BPIoiITZLOB2aRPD56Z0QslDQBmBsRM4ALJY0GNuFhK8zMmkW+9wguT3+fuS0nj4iZwMwa28bnLF8GXLYt5zYzs6aR9T2CH0n6XM76LpKuLlhUZmZWNFmfGjoiIlZXrUTEKmBUQSIyM7OiypoI2kj6bNWKpPbAZ+spb2ZmLUTW+QjuIXl/oGp6yjOp45l
/MzNrWbLOR3CNpJeAQ9JNV0XErMKFZWZmxZL1jgCSZ/w3RcRjknaQtFNEfFCowMzMrDiyPjV0NjANmJRu6gI8WKCYzMysiLJ2Fp8HDAXWAkTEm8BuhQrKzMyKJ2si2JAOJQ2ApO1I3iw2M7MWLmsieErS/wHaSxoJPEAycqiZmbVwWRPBd4FK4GXgv0iGjfh+oYIyM7PiyfvUUDrl5MKI2Av4WeFDMjOzYsp7RxARm4HXJXUvQjxmZlZkWd8j2AVYKOkFkklpAIiI0QWJyszMiiZrIvhBQaMwM7Nmk2+GsnbAWODfSTqK74iITcUIzMzMiiNfH8HdQAVJEjgCuL7gEZmZWVHlaxrqExH9ACTdAbxQ+JDMzKyY8t0RbKxacJOQmVnrlC8RDJC0Nv35AOhftSxpbb6TSzpc0uuSFku6tJ5yx0oKSRUNrYCZmTVOvsnr22zridMX0W4FRgLLgDmSZkTEohrldgIuAp7f1muZmdm2yzrExLbYF1gcEUvSAevuBcbUUu4q4BpgfQFjMTOzOhQyEXQB3slZX5ZuqyZpINAtIh6p70SSzpE0V9LcysrKpo/UzKyEFTIR1EvSZ4AbgG/nKxsRkyOiIiIqysrKCh+cmVkJKWQiWA50y1nvmm6rshOwD/CkpKXA/sAMdxibmRVXIRPBHKC3pJ6StgdOBGZU7YyINRHROSJ6REQP4DlgdETMLWBMZmZWQ8ESQfrewfnALJKJ7++PiIWSJkjyYHVmZp8SWQed2yYRMZNkEpvcbePrKDu8kLGYmVntmq2z2MzMPh2cCMzMSpwTgZlZiXMiMDMrcU4EZmYlzonAzKzEORGYmZU4JwIzsxLnRGBmVuKcCMzMSlxBh5golo0bN7Js2TLWr/fcNmZZtGvXjq5du9K2bdvmDsU+BVpFIli2bBk77bQTPXr0QFJzh2P2qRYRrFy5kmXLltGzZ8/mDsc+BVpF09D69evp1KmTk4BZBpLo1KmT76CtWqtIBICTgFkD+O/FcrWaRGBmZtvGiaCJtGnThvLycvr27cuAAQO4/vrr+eSTT7bpXOPHj+exxx6rc//EiRP5+c9/3uDzzpo1i/LycsrLy9lxxx3Zc889KS8v57TTTtumOHNdd9117LXXXpSXlzN48ODq+IYPH87cuU0z6dzcuXO58MILAdiwYQOHHHII5eXl3HfffZx11lksWrSoUee/8cYbt/hcN23aRFlZGZdeeukW5YYPH86ee+7JgAEDGDp0KK+//nqjrgtw991307t3b3r37s3dd99da5mXXnqJIUOG0K9fP4466ijWrl1bvW/BggUMGTKEvn370q9fv+pmn0MOOYRVq1Y1Oj5r5SKiRf0MGjQoalq0aNFW24qtQ4cO1cv/+Mc/YsSIETF+/PhmjKh+w4YNizlz5my1fdOmTQ0+12233RaHHnporFmzJiIi1qxZE1OmTKn3Oo317LPPxogRI7b5+Jr13LhxY/Tr1y82btxYvW3mzJlxwAEHRK9eveKTTz6p3p5bp0mTJsVRRx21zXFERKxcuTJ69uwZK1eujPfffz969uwZ77///lblKioq4sknn4yIiDvuuCO+//3vbxH7/PnzIyLivffeq67flClT4uqrr671up+Gvxvb0vET/xTHT/xTQc4NzI06/r/a6u4IrnxoISdMerZJf658aGGDYthtt92YPHkyt9xyCxHB5s2bGTduHIMHD6Z///5MmjSpuuw111xDv379GDBgQPU3zzPOOINp06YBcOmll9KnTx/69+/Pd77zHQCuuOIKrrvuOgDmz5/P/vvvT//+/Tn66KOrv/0NHz6c7373u+y7777ssccePP3003XG26NHD7773e8ycOBAHnjgAWbPns2QIUMYOHAgxx13HB9++CEA8+bNY9iwYQwaNIjDDjuMFStWAPCjH/2I2267jZ133hmAnXfemdNPP32r65x77rlUVFTQt29fLr/88urttdXxgQceYJ999mHAgAEceOCBADz
55JMceeSRvPvuu5xyyinMmTOH8vJy3nrrrS3uPOqKv2Y9cz3xxBMMHDiQ7bb714N0U6dO5aKLLqJ79+48++yztX52Bx54IIsXL67zs81i1qxZjBw5kl133ZVddtmFkSNH8uijj25V7o033qj+LEaOHMmvfvWr6vr279+fAQMGANCpUyfatGkDwOjRo5k6dWqj4rPWr1U8Pvpp1KtXLzZv3sy7777L9OnT6dixI3PmzGHDhg0MHTqUQw89lNdee43p06fz/PPPs8MOO/D+++9vcY6VK1fym9/8htdeew1JrF69eqvrnHbaadx8880MGzaM8ePHc+WVV3LjjTcCSdPGCy+8wMyZM7nyyivrbW7q1KkTL774Iu+99x7HHHMMjz32GB06dOCaa67hhhtu4LLLLuOCCy5g+vTplJWVcd999/G9732PG2+8kQ8++IBevXrl/Ux++MMfsuuuu7J582ZGjBjBggUL6NKlS611nDBhArNmzaJLly5b1Xu33Xbj9ttv57rrruPhhx/eYt97773H1VdfvVX848eP36KeNT3zzDMMGjSoen39+vU89thjTJo0idWrVzN16lQOOOCArY576KGH6Nev31bbr732Wu65556tth944IHcdNNNW2xbvnw53bp1q17v2rUry5cv3+rYvn37Mn36dL7yla/wwAMP8M477wBJgpDEYYcdRmVlJSeeeCKXXHIJALvssgsbNmxg5cqVdOrUaatzmkGBE4Gkw4H/AdoAt0fEj2vsHwucB2wGPgTOiYhGNfReflTfxhxeELNnz2bBggXV3/LXrFnDm2++yWOPPcaZZ57JDjvsAMCuu+66xXEdO3akXbt2fOMb3+DII4/kyCOP3GL/mjVrWL16NcOGDQPg9NNP57jjjqvef8wxxwAwaNAgli5dWm+MJ5xwAgDPPfccixYtYujQoQB8/PHHDBkyhNdff51XXnmFkSNHArB582Y+//nPN+hzuP/++5k8eTKbNm1ixYoVLFq0iD59+tRax6FDh3LGGWdw/PHHV9cji7rir1nPmlasWMHee+9dvf7www9z0EEH0b59e4499liuuuoqbrzxxupv2l/72tdo3749PXr04Oabb97qfOPGjWPcuHGZ487izjvv5MILL+Sqq65i9OjRbL/99kCS8P/4xz8yZ84cdthhB0aMGMGgQYMYMWIEkCTOv/3tb04EVqeCJQJJbYBbgZHAMmCOpBk1/kf/y4iYmJYfDdwAHF6omIppyZIltGnTht12242I4Oabb+awww7bosysWbPqPcd2223HCy+8wOOPP860adO45ZZbeOKJJzLH8NnPfhZIOrI3bdpUb9kOHToASZ/RyJEjt2pOePnll+nbt2+tTSQ77rgjS5Ysqfeu4C9/+QvXXXcdc+bMYZddduGMM85g/fr1ddZx4sSJPP/88zzyyCMMGjSIefPmZapzXfHXrGdN7du33+K5+qlTp/LHP/6RHj16AMnd2RNPPFGdCO+55x4qKirqjKMhdwRdunThySefrF5ftmwZw4cP3+rYvfbai9mzZwPJXcAjjzwCJHcQBx54IJ07dwZg1KhRvPjii9WJYP369bRv377OWM0K2UewL7A4IpZExMfAvcCY3AIRsTZntQMQBYynaCorKxk7diznn39+9S37bbfdxsaNG4Hkj/if//wnI0eO5K677mLdunUAWzUNffjhh6xZs4ZRo0bx05/+lJdeemmL/R07dmSXXXapbv//xS9+UX13sK32339/nnnmmep273/+85+88cYb7LnnnlRWVlYngo0bN7JwYdJ3ctlll3HeeedVP8Xy4YcfbvVU09q1a+nQoQMdO3bkH//4B7/97W/rreNbb73Ffvvtx4QJEygrK6tuBtnW+PPZe++9q49Zu3YtTz/9NG+//TZLly5l6dKl3HrrrQ1qax83bhzz58/f6qdmEgA47LDDmD17NqtWrWLVqlXMnj17qy8NAO+++y4An3zyCVdffTVjx46tPv7
ll19m3bp1bNq0iaeeeoo+ffoASWL8+9//Xp3QzGpTyKahLkDuX+8yYL+ahSSdB1wMbA8cXMB4Cuqjjz6ivLycjRs3st1223Hqqady8cUXA3DWWWexdOlSBg4cSERQVlbGgw8+yOGHH878+fOpqKhg++23Z9SoUfzoRz+qPucHH3zAmDFjWL9+PRHBDTfcsNV17777bsaOHcu6devo1asXd911V6PqUVZWxpQpUzjppJPYsGEDAFdffTV77LEH06ZN48ILL2TNmjVs2rSJb37zm/Tt25dzzz2XDz/8kMGDB9O2bVvatm3Lt7/97S3OO2DAAL70pS+x11570a1bt+qmm7rqOG7cON58800ighEjRjBgwACeeuqpRsVfnyOOOIJTTz0VgN/85jccfPDB1XdUAGPGjOGSSy6pPmdT2nXXXfnBD37A4MGDgeTx4apmwrPOOouxY8dSUVHB1KlTufXWW4Gk2e/MM88Ekn6Aiy++mMGDByOJUaNG8eUvfxlIOvj333//LTrB7dNt0Yq1nDCp9ocT+nxh54I0fyt5qqjpSfoqcHhEnJWunwrsFxHn11H+ZOCwiNjqcRNJ5wDnAHTv3n3QX//61y32v/rqq1u075pti6OPPpqf/OQn9O7du7lDaTIXXXQRo0ePrm4myuW/m0+fXz7/NtPnb/2gQJXGJAJJ8yKi1vbMQn5NWA50y1nvmm6ry73AbbXtiIjJwGSAioqKVtF8ZJ8+P/7xj1mxYkWrSgT77LNPrUnAPp1O3q87J+/XvejXLWQfwRygt6SekrYHTgRm5BaQlPsX92XgzQLGY1avPffcs/o5/dbi7LPPbu4QrAUo2B1BRGySdD4wi+Tx0TsjYqGkCSRvuM0Azpd0CLARWAVs/RZS9ut5IC2zjArVJGwtU0F7kCJiJjCzxrbxOcsXNcV12rVrV/3CjJOBWf0inY+gXbt2zR2KfUq0ikcJunbtyrJly6isrGzuUMxahKoZysyglSSCtm3beqYlM7Nt1OoGnTMzs4ZxIjAzK3FOBGZmJa5gbxYXiqRK4K95C9auM/BeE4bTErjOpcF1Lg2NqfMXI6Ksth0tLhE0hqS5db1i3Vq5zqXBdS4Nhaqzm4bMzEqcE4GZWYkrtUQwubkDaAauc2lwnUtDQepcUn0EZma2tVK7IzAzsxqcCMzMSlyrTASSDpf0uqTFki6tZf9nJd2X7n9eUo9mCLNJZajzxZIWSVog6XFJX2yOOJtSvjrnlDtWUkhq8Y8aZqmzpOPTf+uFkn5Z7BibWob/trtL+r2kP6f/fY9qjjibiqQ7Jb0r6ZU69kvSTennsUDSwEZfNCJa1Q/J3AdvAb1I5kF+CehTo8x/AxPT5ROB+5o77iLU+SBgh3T53FKoc1puJ+APwHNARXPHXYR/597An4Fd0vXdmjvuItR5MnBuutwHWNrccTeyzgcCA4FX6tg/CvgtIGB/4PnGXrM13hHsCyyOiCUR8THJFJhjapQZA9ydLk8DRqhlT2SQt84R8fuIWJeuPkcydWhLluXfGeAq4BpgfTGDK5AsdT4buDUiVgFExLtFjrGpZalzADunyx2BvxUxviYXEX8A3q+nyBjg55F4DvicpM835pqtMRF0Ad7JWV+Wbqu1TERsAtYAnYoSXWFkqXOub5B8o2jJ8tY5vWXuFhGPFDOwAsry77wHsIekZyQ9J+nwokVXGFnqfAVwiqRlJBNhXVCc0JpNQ//e82oV8xFYdpJOASqAYc0dSyFJ+gxwA3BGM4dSbNuRNA8NJ7nr+4OkfhGxujmDKrCTgCkRcb2kIcAvJO0TEZ80d2AtRWu8I1gOdMtZ75puq7WMpO1IbidXFiW6wshSZ9L5ob8HjI6IDUWKrVDy1XknYB/gSUlLSdpSZ7TwDuMs/87LgBkRsTEi/gK8QZIYWqosdf4GcD9ARDwLtCMZnK21yvT33hCtMRHMAXpL6ilpe5LO4Bk1yswATk+Xvwo8EWkvTAuVt86SvgRMIkk
CLb3dGPLUOSLWRETniOgRET1I+kVGR8Tc5gm3SWT5b/tBkrsBJHUmaSpaUsQYm1qWOr8NjACQtDdJImjN89bOAE5Lnx7aH1gTESsac8JW1zQUEZsknQ/MInni4M6IWChpAjA3ImYAd5DcPi4m6ZQ5sfkibryMdb4W2BF4IO0XfzsiRjdb0I2Usc6tSsY6zwIOlbQI2AyMi4gWe7ebsc7fBn4m6VskHcdntOQvdpKmkiTzzmm/x+VAW4CImEjSDzIKWAysA85s9DVb8OdlZmZNoDU2DZmZWQM4EZiZlTgnAjOzEudEYGZW4pwIzMxKnBOBFZykzZLmS3pF0kOSPtfE51+aPjOPpA/rKNNe0lOS2kjqIemjNKZFkiambyI35JoVkm5Kl4dLOiBn31hJpzWmTul5rpD0nTxlpkj6agPO2aOuUS1rlPuhpHdqfp6Szpf09azXs5bBicCK4aOIKI+IfUje2zivGWL4OvDriNicrr8VEeVAf5IRK7/SkJNFxNyIuDBdHQ4ckLNvYkT8vLEBN7OHSAZ8q+lOWv9YPiXHicCK7VnSAbIk7S7pUUnzJD0taa90+79J+o2kl9KfA9LtD6ZlF0o6p4HX/RowvebGdNDBPwH/nn5bfkL/mrOhe3rd49K7mZck/SHdNlzSw0rmshgLfCu9w/jPqm/ykvaS9ELVtdLzv5wuD0rvUOZJmqU8o0dKOlvSnDSGX0naIWf3IZLmSnpD0pFp+TaSrk2PWSDpvxryYUXEc7W9rZqOYLtUUm1JwlooJwIrGkltSIYCqHrrdzJwQUQMAr4D/G+6/SbgqYgYQDIu+8J0+9fTshXAhZIyjRibDk3QKyKW1rJvhzSml4Gbgbsjoj9wTxoHwHjgsDSeLd7GTs85EfhpetfzdM6+14DtJfVMN50A3CepbXqtr6b1uRP4YZ5q/DoiBqcxvEoyvk6VHiTf3r8MTJTULt2/JiIGA4OBs3PiqKr7FyTNzHPd2swF/nMbjrNPqVY3xIR9KrWXNJ/kTuBV4HeSdiRpTqka8gLgs+nvg4HTANKmnDXp9gslHZ0udyMZTC3L8AmdgdU1tu2exhTA9Ij4raRfAMek+38B/CRdfgaYIul+4NcZrpfrfpIE8OP09wnAniQD4v0urXsbIN9YMftIuhr4HMlQIbNyr5GOtPmmpCXAXsChQP+c/oOOJJ/XG1UHRcTfSIYqaKh302tYK+FEYMXwUUSUp9++Z5H0EUwBVqft9HlJGg4cAgyJiHWSniQZXCzT9Wsp+1bWa0fEWEn7kXzjnidpUMbrAtxHkux+nZwq3pTUD1gYEUMacJ4pwFci4iVJZ5AOLFcVYs2QSWavuiAichMGapppWduRfKbWSrhpyIombV++kGSQsHXAXyQdB9XzsA5Iiz5OMp1mVVt3R5JvtKvSJLAXybDSWa+7CmiTNpnU50/8awDCrwFPpzHsHhHPR8R4klEtu9U47gOSYa9ru/ZbJIO//YAkKQC8DpQpGTsfSW0l9c0T207AirRZ6Ws19h0n6TOSdieZ0vF1koR7bloeSXtI6pDnGlntAeR98shaDicCK6qI+DOwgGQyka8B35D0Ekk/QNUUhBcBB6Udq/NInup5FNhO0qskzSzPNfDSs4H/yFPmAuBMSQuAU9M4AK6V9HL62OWfSObNzfUQcHRVZ3Et570POIV/jZn/Mcnw59ekdZ9PzlNHdfgB8DxJM9VrNfa9DbxAMuvc2IhYD9wOLAJeTOOeRI0WgPr6CCT9RMnIlztIWibpipzdQ4Hf5YnXWhCPPmolQcm0ld+KiFObO5aWTMm8Fhf7c2xdfEdgJSEiXgR+nz65ZNuuM8ndibUiviMwMytxviMwMytxTgRmZiXOicDMrMQ5EZiZlTgnAjOzEvf/AcoK9QOeV+4tAAAAAElFTkSuQmCC\n", 757 | "text/plain": [ 758 | "
" 759 | ] 760 | }, 761 | "metadata": { 762 | "needs_background": "light" 763 | }, 764 | "output_type": "display_data" 765 | } 766 | ], 767 | "source": [ 768 | "plot_precision_recall_curve(dtModel,features_test,ifMalware_test)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 20, 774 | "id": "a7d9cfc4", 775 | "metadata": {}, 776 | "outputs": [ 777 | { 778 | "data": { 779 | "text/plain": [ 780 | "" 781 | ] 782 | }, 783 | "execution_count": 20, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | }, 787 | { 788 | "data": { 789 | "image/png": "\n", 790 | "text/plain": [ 791 | "
" 792 | ] 793 | }, 794 | "metadata": { 795 | "needs_background": "light" 796 | }, 797 | "output_type": "display_data" 798 | } 799 | ], 800 | "source": [ 801 | "plot_roc_curve(dtModel,features_test,ifMalware_test)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": null, 807 | "id": "7cc50825", 808 | "metadata": {}, 809 | "outputs": [], 810 | "source": [] 811 | } 812 | ], 813 | "metadata": { 814 | "kernelspec": { 815 | "display_name": "Python 3 (ipykernel)", 816 | "language": "python", 817 | "name": "python3" 818 | }, 819 | "language_info": { 820 | "codemirror_mode": { 821 | "name": "ipython", 822 | "version": 3 823 | }, 824 | "file_extension": ".py", 825 | "mimetype": "text/x-python", 826 | "name": "python", 827 | "nbconvert_exporter": "python", 828 | "pygments_lexer": "ipython3", 829 | "version": "3.9.10" 830 | } 831 | }, 832 | "nbformat": 4, 833 | "nbformat_minor": 5 834 | } 835 | -------------------------------------------------------------------------------- /K_Nearest_Neighbors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75878b0c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from sklearn.model_selection import train_test_split\n", 13 | "from sklearn.neighbors import KNeighborsClassifier\n", 14 | "from sklearn.metrics import f1_score\n", 15 | "from sklearn.metrics import plot_confusion_matrix,plot_precision_recall_curve,plot_roc_curve" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "69b25522", 21 | "metadata": {}, 22 | "source": [ 23 | "Data set reading :" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "b54e1b40", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "data = pd.read_csv('../Malware-Detection-Using-Machine-Learning/data-set/MalwareDataSet.csv') " 34 | 
] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "47c9ba1b", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
0104079663308826214449521
153549663308826214449521
2588079663308826214441364901
32516696633088262144419401
470387966330882621444830981
..............................
13743912329111053308810485765816540
137440400002613276810485768676240
1374415961010053308810485765226480
1374425121620101048576822160
13744322731110533088104857653184640
\n", 209 | "

137444 rows × 9 columns

\n", 210 | "
" 211 | ], 212 | "text/plain": [ 213 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 214 | "0 10407 9 6 \n", 215 | "1 5354 9 6 \n", 216 | "2 58807 9 6 \n", 217 | "3 25166 9 6 \n", 218 | "4 70387 9 6 \n", 219 | "... ... ... ... \n", 220 | "137439 123291 11 0 \n", 221 | "137440 40000 2 6 \n", 222 | "137441 59610 10 0 \n", 223 | "137442 51216 2 0 \n", 224 | "137443 22731 11 0 \n", 225 | "\n", 226 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 227 | "0 6 33088 262144 \n", 228 | "1 6 33088 262144 \n", 229 | "2 6 33088 262144 \n", 230 | "3 6 33088 262144 \n", 231 | "4 6 33088 262144 \n", 232 | "... ... ... ... \n", 233 | "137439 5 33088 1048576 \n", 234 | "137440 1 32768 1048576 \n", 235 | "137441 5 33088 1048576 \n", 236 | "137442 1 0 1048576 \n", 237 | "137443 5 33088 1048576 \n", 238 | "\n", 239 | " NumberOfSections ResourceSize legitimate \n", 240 | "0 4 952 1 \n", 241 | "1 4 952 1 \n", 242 | "2 4 136490 1 \n", 243 | "3 4 1940 1 \n", 244 | "4 4 83098 1 \n", 245 | "... ... ... ... \n", 246 | "137439 5 81654 0 \n", 247 | "137440 8 67624 0 \n", 248 | "137441 5 22648 0 \n", 249 | "137442 8 2216 0 \n", 250 | "137443 5 318464 0 \n", 251 | "\n", 252 | "[137444 rows x 9 columns]" 253 | ] 254 | }, 255 | "execution_count": 3, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "data" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 4, 267 | "id": "b13348ab", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/plain": [ 273 | "(137444, 9)" 274 | ] 275 | }, 276 | "execution_count": 4, 277 | "metadata": {}, 278 | "output_type": "execute_result" 279 | } 280 | ], 281 | "source": [ 282 | "data.shape" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 5, 288 | "id": "33292bb5", 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/html": [ 294 | "
\n", 295 | "\n", 308 | "\n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
count1.374440e+05137444.000000137444.000000137444.000000137444.0000001.374440e+05137444.0000001.374440e+05137444.000000
mean1.722186e+058.62078468.7318765.09873822301.0434369.306841e+054.9971192.474766e+050.297707
std3.438014e+064.0956351185.70987399.43758415444.7532195.553175e+051.9172372.129516e+070.457252
min0.000000e+000.0000000.0000000.0000000.0000000.000000e+001.0000000.000000e+000.000000
25%1.272100e+048.0000000.0000004.000000320.0000001.048576e+064.0000002.216000e+030.000000
50%5.300800e+049.0000000.0000005.00000033088.0000001.048576e+065.0000009.640000e+030.000000
75%6.157800e+0410.0000006.0000005.00000033088.0000001.048576e+065.0000002.376250e+041.000000
max1.074484e+09255.00000028619.00000036868.00000049504.0000003.355443e+0740.0000004.294903e+091.000000
\n", 422 | "
" 423 | ], 424 | "text/plain": [ 425 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 426 | "count 1.374440e+05 137444.000000 137444.000000 \n", 427 | "mean 1.722186e+05 8.620784 68.731876 \n", 428 | "std 3.438014e+06 4.095635 1185.709873 \n", 429 | "min 0.000000e+00 0.000000 0.000000 \n", 430 | "25% 1.272100e+04 8.000000 0.000000 \n", 431 | "50% 5.300800e+04 9.000000 0.000000 \n", 432 | "75% 6.157800e+04 10.000000 6.000000 \n", 433 | "max 1.074484e+09 255.000000 28619.000000 \n", 434 | "\n", 435 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 436 | "count 137444.000000 137444.000000 1.374440e+05 \n", 437 | "mean 5.098738 22301.043436 9.306841e+05 \n", 438 | "std 99.437584 15444.753219 5.553175e+05 \n", 439 | "min 0.000000 0.000000 0.000000e+00 \n", 440 | "25% 4.000000 320.000000 1.048576e+06 \n", 441 | "50% 5.000000 33088.000000 1.048576e+06 \n", 442 | "75% 5.000000 33088.000000 1.048576e+06 \n", 443 | "max 36868.000000 49504.000000 3.355443e+07 \n", 444 | "\n", 445 | " NumberOfSections ResourceSize legitimate \n", 446 | "count 137444.000000 1.374440e+05 137444.000000 \n", 447 | "mean 4.997119 2.474766e+05 0.297707 \n", 448 | "std 1.917237 2.129516e+07 0.457252 \n", 449 | "min 1.000000 0.000000e+00 0.000000 \n", 450 | "25% 4.000000 2.216000e+03 0.000000 \n", 451 | "50% 5.000000 9.640000e+03 0.000000 \n", 452 | "75% 5.000000 2.376250e+04 1.000000 \n", 453 | "max 40.000000 4.294903e+09 1.000000 " 454 | ] 455 | }, 456 | "execution_count": 5, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "data.describe()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 6, 468 | "id": "65dbf455", 469 | "metadata": { 470 | "scrolled": true 471 | }, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/plain": [ 476 | "legitimate\n", 477 | "0 96526\n", 478 | "1 40918\n", 479 | "dtype: int64" 480 | ] 481 | }, 482 | "execution_count": 6, 483 | "metadata": {}, 
484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "data.groupby(data['legitimate']).size()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "id": "2eab99e9", 494 | "metadata": {}, 495 | "source": [ 496 | "The dataset contains 137,444 samples in total: 40,918 legitimate (safe) files (legitimate = 1) and 96,526 malware files (legitimate = 0)." 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 7, 502 | "id": "8592b076", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "features = data.iloc[:,[0,1,2,3,4,5,6,7]].values # extracting the first 8 columns from the dataset - features" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 8, 512 | "id": "3da1b6cd", 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "array([[ 10407, 9, 6, ..., 262144, 4, 952],\n", 519 | " [ 5354, 9, 6, ..., 262144, 4, 952],\n", 520 | " [ 58807, 9, 6, ..., 262144, 4, 136490],\n", 521 | " ...,\n", 522 | " [ 59610, 10, 0, ..., 1048576, 5, 22648],\n", 523 | " [ 51216, 2, 0, ..., 1048576, 8, 2216],\n", 524 | " [ 22731, 11, 0, ..., 1048576, 5, 318464]])" 525 | ] 526 | }, 527 | "execution_count": 8, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "features" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 9, 539 | "id": "46db9f27", 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "ifMalware = data.iloc[:,8].values # extracting the legitimate column of the dataset - safe & malware results" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 10, 549 | "id": "c0e0d3db", 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/plain": [ 555 | "array([1, 1, 1, ..., 0, 0, 0])" 556 | ] 557 | }, 558 | "execution_count": 10, 559 | "metadata": {}, 560 | "output_type": "execute_result" 561 | } 562 | ], 563 | "source": [ 564 | "ifMalware" 565 | ] 566 | }, 567 | { 568 | "cell_type": 
"markdown", 569 | "id": "773b9a86", 570 | "metadata": {}, 571 | "source": [ 572 | "We will use 75% of our dataset for training and 25% for testing :" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 11, 578 | "id": "0657650f", 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "features_train, features_test, ifMalware_train, ifMalware_test = train_test_split(features, ifMalware, test_size=0.25)" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "id": "4edcf3e5", 588 | "metadata": {}, 589 | "source": [ 590 | "Learning and Predict :" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 12, 596 | "id": "30274115", 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "knModel = KNeighborsClassifier(n_neighbors=1) # Defined the model." 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 13, 606 | "id": "b6993294", 607 | "metadata": {}, 608 | "outputs": [ 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "KNeighborsClassifier(n_neighbors=1)" 613 | ] 614 | }, 615 | "execution_count": 13, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "knModel.fit(features_train, ifMalware_train) # Provided training data." 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 14, 627 | "id": "0a413829", 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "knPredict = knModel.predict(features_test) # Give the test data then call predict." 
632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "id": "9d67adc3", 637 | "metadata": {}, 638 | "source": [ 639 | "Results :" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 15, 645 | "id": "42f2a30d", 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "name": "stdout", 650 | "output_type": "stream", 651 | "text": [ 652 | "Number of mislabeled out of a total of 34361 test entries: 838\n" 653 | ] 654 | } 655 | ], 656 | "source": [ 657 | "print(\"Number of mislabeled out of a total of %d test entries: %d\" % (features_test.shape[0], \n", 658 | " (ifMalware_test != knPredict).sum()))" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 16, 664 | "id": "cbbfc4fb", 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "successRate = 100 * f1_score(ifMalware_test, knPredict, average='micro') # Success rate calculation" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 17, 674 | "id": "894b2b35", 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "name": "stdout", 679 | "output_type": "stream", 680 | "text": [ 681 | "The Success Rate was calculated as % : 97.56118855679404 with the K-Nearest-Neighbors\n" 682 | ] 683 | } 684 | ], 685 | "source": [ 686 | "print(\"The Success Rate was calculated as % : \" + str(successRate) + \" with the K-Nearest-Neighbors\")" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "id": "ffda807e", 692 | "metadata": {}, 693 | "source": [ 694 | "Visualization :" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 18, 700 | "id": "a1af503a", 701 | "metadata": {}, 702 | "outputs": [ 703 | { 704 | "data": { 705 | "text/plain": [ 706 | "" 707 | ] 708 | }, 709 | "execution_count": 18, 710 | "metadata": {}, 711 | "output_type": "execute_result" 712 | }, 713 | { 714 | "data": { 715 | "image/png": "\n", 716 | "text/plain": [ 717 | "
" 718 | ] 719 | }, 720 | "metadata": { 721 | "needs_background": "light" 722 | }, 723 | "output_type": "display_data" 724 | } 725 | ], 726 | "source": [ 727 | "plot_confusion_matrix(knModel,features_test,ifMalware_test)" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 19, 733 | "id": "396f5630", 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "data": { 738 | "text/plain": [ 739 | "" 740 | ] 741 | }, 742 | "execution_count": 19, 743 | "metadata": {}, 744 | "output_type": "execute_result" 745 | }, 746 | { 747 | "data": { 748 | "image/png": "\n", 749 | "text/plain": [ 750 | "
" 751 | ] 752 | }, 753 | "metadata": { 754 | "needs_background": "light" 755 | }, 756 | "output_type": "display_data" 757 | } 758 | ], 759 | "source": [ 760 | "plot_precision_recall_curve(knModel,features_test,ifMalware_test)" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 20, 766 | "id": "6aced8b7", 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "data": { 771 | "text/plain": [ 772 | "" 773 | ] 774 | }, 775 | "execution_count": 20, 776 | "metadata": {}, 777 | "output_type": "execute_result" 778 | }, 779 | { 780 | "data": { 781 | "image/png": "\n", 782 | "text/plain": [ 783 | "
" 784 | ] 785 | }, 786 | "metadata": { 787 | "needs_background": "light" 788 | }, 789 | "output_type": "display_data" 790 | } 791 | ], 792 | "source": [ 793 | "plot_roc_curve(knModel,features_test,ifMalware_test)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "038c4e87", 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": "Python 3 (ipykernel)", 808 | "language": "python", 809 | "name": "python3" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 3 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython3", 821 | "version": "3.9.10" 822 | } 823 | }, 824 | "nbformat": 4, 825 | "nbformat_minor": 5 826 | } 827 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Emrah Yıldırım 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Malware-Detection-Using-Machine-Learning 2 | 3 | 4 | ## About The Project 5 | 6 | * This project analyzes PE information of exe files to detect malware. 7 | * In this repository you will learn how to create your own dataset and will be able to see the use of machine learning models using the dataset. 8 | * We use machine learning to detect malware. 9 | 10 | 11 | ## Getting Started 12 | The PE information was extracted using pefile. 13 | 14 | If you want to create your own dataset, check out ```Data_Set_Generator.ipynb``` 15 | 16 | or you can use ```data-set/MalwareDataSet.csv``` 17 | 18 | ## Dataset 19 | 20 | This dataset (MalwareDataSet.csv) contains a total of ```137,444 samples```. 21 | There are ```40,918 safe (legitimate = 1) and 96,526 malware (legitimate = 0)```. 22 | 23 | It has a total of 9 columns. The information of these columns is as follows: 24 | ``` 25 | * AddressOfEntryPoint 26 | * MajorLinkerVersion 27 | * MajorImageVersion 28 | * MajorOperatingSystemVersion 29 | * DllCharacteristics 30 | * SizeOfStackReserve 31 | * NumberOfSections 32 | * ResourceSize 33 | * legitimate 34 | ``` 35 | 36 | 37 | ## Classification 38 | 39 | We used 3 different classification algorithms. 
These are: 40 | ``` 41 | * K Nearest Neighbors (KNN) 42 | * Decision Tree 43 | * Random Forest 44 | ``` 45 | 46 | 47 | ## Results 48 | ``` 49 | The Success Rate was calculated as % 97.56118855679404 with the K-Nearest-Neighbors 50 | 51 | The Success Rate was calculated as % 98.59433660254359 with the Decision Tree 52 | 53 | The Success Rate was calculated as % 99.1240068682518 with the Random Forest 54 | ``` 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /Random_Forest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "918bf7bd", 6 | "metadata": {}, 7 | "source": [ 8 | "# Malware Analysis With Machine Learning - Random Forest" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "034fde8d", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.ensemble import RandomForestClassifier\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "from sklearn.metrics import plot_confusion_matrix,plot_precision_recall_curve,plot_roc_curve" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "5c6b661b", 29 | "metadata": {}, 30 | "source": [ 31 | "Reading the data set:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "c4e18ea4", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "data = pd.read_csv('../Malware-Detection-Using-Machine-Learning/data-set/MalwareDataSet.csv') " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "386e250f", 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
0104079663308826214449521
153549663308826214449521
2588079663308826214441364901
32516696633088262144419401
470387966330882621444830981
..............................
13743912329111053308810485765816540
137440400002613276810485768676240
1374415961010053308810485765226480
1374425121620101048576822160
13744322731110533088104857653184640
\n", 217 | "

137444 rows × 9 columns

\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 222 | "0 10407 9 6 \n", 223 | "1 5354 9 6 \n", 224 | "2 58807 9 6 \n", 225 | "3 25166 9 6 \n", 226 | "4 70387 9 6 \n", 227 | "... ... ... ... \n", 228 | "137439 123291 11 0 \n", 229 | "137440 40000 2 6 \n", 230 | "137441 59610 10 0 \n", 231 | "137442 51216 2 0 \n", 232 | "137443 22731 11 0 \n", 233 | "\n", 234 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 235 | "0 6 33088 262144 \n", 236 | "1 6 33088 262144 \n", 237 | "2 6 33088 262144 \n", 238 | "3 6 33088 262144 \n", 239 | "4 6 33088 262144 \n", 240 | "... ... ... ... \n", 241 | "137439 5 33088 1048576 \n", 242 | "137440 1 32768 1048576 \n", 243 | "137441 5 33088 1048576 \n", 244 | "137442 1 0 1048576 \n", 245 | "137443 5 33088 1048576 \n", 246 | "\n", 247 | " NumberOfSections ResourceSize legitimate \n", 248 | "0 4 952 1 \n", 249 | "1 4 952 1 \n", 250 | "2 4 136490 1 \n", 251 | "3 4 1940 1 \n", 252 | "4 4 83098 1 \n", 253 | "... ... ... ... \n", 254 | "137439 5 81654 0 \n", 255 | "137440 8 67624 0 \n", 256 | "137441 5 22648 0 \n", 257 | "137442 8 2216 0 \n", 258 | "137443 5 318464 0 \n", 259 | "\n", 260 | "[137444 rows x 9 columns]" 261 | ] 262 | }, 263 | "execution_count": 3, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "data" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 4, 275 | "id": "dd389773", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "(137444, 9)" 282 | ] 283 | }, 284 | "execution_count": 4, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "data.shape" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 5, 296 | "id": "5c146794", 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/html": [ 302 | "
\n", 303 | "\n", 316 | "\n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | "
AddressOfEntryPointMajorLinkerVersionMajorImageVersionMajorOperatingSystemVersionDllCharacteristicsSizeOfStackReserveNumberOfSectionsResourceSizelegitimate
count1.374440e+05137444.000000137444.000000137444.000000137444.0000001.374440e+05137444.0000001.374440e+05137444.000000
mean1.722186e+058.62078468.7318765.09873822301.0434369.306841e+054.9971192.474766e+050.297707
std3.438014e+064.0956351185.70987399.43758415444.7532195.553175e+051.9172372.129516e+070.457252
min0.000000e+000.0000000.0000000.0000000.0000000.000000e+001.0000000.000000e+000.000000
25%1.272100e+048.0000000.0000004.000000320.0000001.048576e+064.0000002.216000e+030.000000
50%5.300800e+049.0000000.0000005.00000033088.0000001.048576e+065.0000009.640000e+030.000000
75%6.157800e+0410.0000006.0000005.00000033088.0000001.048576e+065.0000002.376250e+041.000000
max1.074484e+09255.00000028619.00000036868.00000049504.0000003.355443e+0740.0000004.294903e+091.000000
\n", 430 | "
" 431 | ], 432 | "text/plain": [ 433 | " AddressOfEntryPoint MajorLinkerVersion MajorImageVersion \\\n", 434 | "count 1.374440e+05 137444.000000 137444.000000 \n", 435 | "mean 1.722186e+05 8.620784 68.731876 \n", 436 | "std 3.438014e+06 4.095635 1185.709873 \n", 437 | "min 0.000000e+00 0.000000 0.000000 \n", 438 | "25% 1.272100e+04 8.000000 0.000000 \n", 439 | "50% 5.300800e+04 9.000000 0.000000 \n", 440 | "75% 6.157800e+04 10.000000 6.000000 \n", 441 | "max 1.074484e+09 255.000000 28619.000000 \n", 442 | "\n", 443 | " MajorOperatingSystemVersion DllCharacteristics SizeOfStackReserve \\\n", 444 | "count 137444.000000 137444.000000 1.374440e+05 \n", 445 | "mean 5.098738 22301.043436 9.306841e+05 \n", 446 | "std 99.437584 15444.753219 5.553175e+05 \n", 447 | "min 0.000000 0.000000 0.000000e+00 \n", 448 | "25% 4.000000 320.000000 1.048576e+06 \n", 449 | "50% 5.000000 33088.000000 1.048576e+06 \n", 450 | "75% 5.000000 33088.000000 1.048576e+06 \n", 451 | "max 36868.000000 49504.000000 3.355443e+07 \n", 452 | "\n", 453 | " NumberOfSections ResourceSize legitimate \n", 454 | "count 137444.000000 1.374440e+05 137444.000000 \n", 455 | "mean 4.997119 2.474766e+05 0.297707 \n", 456 | "std 1.917237 2.129516e+07 0.457252 \n", 457 | "min 1.000000 0.000000e+00 0.000000 \n", 458 | "25% 4.000000 2.216000e+03 0.000000 \n", 459 | "50% 5.000000 9.640000e+03 0.000000 \n", 460 | "75% 5.000000 2.376250e+04 1.000000 \n", 461 | "max 40.000000 4.294903e+09 1.000000 " 462 | ] 463 | }, 464 | "execution_count": 5, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "data.describe()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 6, 476 | "id": "c4294347", 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "legitimate\n", 483 | "0 96526\n", 484 | "1 40918\n", 485 | "dtype: int64" 486 | ] 487 | }, 488 | "execution_count": 6, 489 | "metadata": {}, 490 | "output_type": 
"execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "data.groupby(data['legitimate']).size()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "2eab99e9", 500 | "metadata": {}, 501 | "source": [ 502 | "We have a total of 137.444 data. There are 96.526 safe and 40.918 malware." 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 7, 508 | "id": "b87ac400", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "features = data.iloc[:,[0,1,2,3,4,5,6,7]].values # extracting the first 8 columns from the dataset - features" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 8, 518 | "id": "7dcb27ec", 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "array([[ 10407, 9, 6, ..., 262144, 4, 952],\n", 525 | " [ 5354, 9, 6, ..., 262144, 4, 952],\n", 526 | " [ 58807, 9, 6, ..., 262144, 4, 136490],\n", 527 | " ...,\n", 528 | " [ 59610, 10, 0, ..., 1048576, 5, 22648],\n", 529 | " [ 51216, 2, 0, ..., 1048576, 8, 2216],\n", 530 | " [ 22731, 11, 0, ..., 1048576, 5, 318464]])" 531 | ] 532 | }, 533 | "execution_count": 8, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "features" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 9, 545 | "id": "137e5f7f", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "ifMalware = data.iloc[:,8].values # extracting the legitimate column of the dataset - safe & malware results" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 10, 555 | "id": "52df4a9c", 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/plain": [ 561 | "array([1, 1, 1, ..., 0, 0, 0])" 562 | ] 563 | }, 564 | "execution_count": 10, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "ifMalware" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | 
"id": "bfc0b019", 576 | "metadata": {}, 577 | "source": [ 578 | "We will use 75% of our dataset for training and 25% for testing :" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 11, 584 | "id": "8a988b38", 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "features_train, features_test, ifMalware_train, ifMalware_test = train_test_split(features, ifMalware, test_size=0.25)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "id": "834ffef4", 594 | "metadata": {}, 595 | "source": [ 596 | "Learning and Predict :" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 12, 602 | "id": "f18cafe1", 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "rfModel = RandomForestClassifier() # Defined the model." 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 13, 612 | "id": "a8b11bd3", 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "data": { 617 | "text/plain": [ 618 | "RandomForestClassifier()" 619 | ] 620 | }, 621 | "execution_count": 13, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "rfModel.fit(features_train, ifMalware_train) # Provided training data." 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 14, 633 | "id": "4e41403c", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "rfPredict = rfModel.predict(features_test) # Give the test data then call predict." 
638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "id": "42a29bb3", 643 | "metadata": {}, 644 | "source": [ 645 | "Results :" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 15, 651 | "id": "310395c3", 652 | "metadata": {}, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "Number of mislabeled out of a total of 34361 test entries: 301\n" 659 | ] 660 | } 661 | ], 662 | "source": [ 663 | "print(\"Number of mislabeled out of a total of %d test entries: %d\" % (features_test.shape[0], \n", 664 | " (ifMalware_test != rfPredict).sum()))" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 16, 670 | "id": "3da4e570", 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "successRate = 100 * f1_score(ifMalware_test, rfPredict, average='micro') # Success rate calculation" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 17, 680 | "id": "9a79bdd6", 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "name": "stdout", 685 | "output_type": "stream", 686 | "text": [ 687 | "The Success Rate was calculated as % : 99.1240068682518 with the Random Forest\n" 688 | ] 689 | } 690 | ], 691 | "source": [ 692 | "print(\"The Success Rate was calculated as % : \" + str(successRate) + \" with the Random Forest\")" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "id": "fc87041f", 698 | "metadata": {}, 699 | "source": [ 700 | "Visualization :" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 18, 706 | "id": "b5fb046a", 707 | "metadata": {}, 708 | "outputs": [ 709 | { 710 | "data": { 711 | "text/plain": [ 712 | "" 713 | ] 714 | }, 715 | "execution_count": 18, 716 | "metadata": {}, 717 | "output_type": "execute_result" 718 | }, 719 | { 720 | "data": { 721 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAUUAAAEGCAYAAADyuIefAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAceUlEQVR4nO3de5xWZb338c+XAUTkDIoIqKh4IAs0Q7QyRbeAT3loW2m5JbUwD2XZbqftnjDJ56mXpWal5YHtqTymqWUiUaZmqGiKipmIclbkpJyEYea3/1jXwBpgZu4b5mbuuef7fr3Wa9Z93dda67cG/c11rWutaykiMDOzTLuWDsDMrJw4KZqZ5TgpmpnlOCmameU4KZqZ5bRv6QDy+vSqij0HdmjpMKwI/5reuaVDsCK8zyrWxVptyz5GHbVTLFlaU1DdZ6evnRQRo7fleNtbWSXFPQd24OlJA1s6DCvCqN2GtXQIVoSnYso272Px0hqemjSgoLod+r3eZ5sPuJ2VVVI0s9YgqInalg6iZJwUzawoAdRSuQ99OCmaWdFqcUvRzAyAIKh299nMLBNAjbvPZmYb+ZqimVkSQE0Fz67lpGhmRavcK4pOimZWpCB8TdHMrE4EVFduTnRSNLNiiRq26fHpsuakaGZFCaDWLUUzs43cUjQzS7Kbt50UzcyALClWR+XOT+2kaGZFCURNBU/a76RoZkWrDXefzcwAX1M0M9uEqPE1RTOzTDbztpOimRkAEWJdVLV0GCXjpGhmRav1NUUzs0w20OLus5lZ4oEWM7MNPNBiZraJGt+8bWaWCUR1VG7qqNwzM7OS8ECLmVlOIHefzczyPNBiZpZE4FtyzMzqZAMtfszPzGyDSh5oqdwzM7OSCERtFLY0RtJASX+RNEPSy5IuSOW9JE2W9Fr62TOVS9LVkmZKmi7p4Ny+xqb6r0kamyv/sKQX0zZXS2pyhMhJ0cyKVkO7gpYmrAe+GRFDgBHAeZKGABcBUyJiMDAlfQYYAwxOyzjgWsiSKDAeOBQYDoyvS6Spzpdz241uKignRTMrSvbe53YFLY3uJ2JhRDyX1lcArwD9gROAm1O1m4ET0/oJwC2RmQr0kNQPGAVMjoilEbEMmAyMTt91i4ipERHALbl9NcjXFM2sSCrmdQR9JE3Lfb4uIq7bbI/SnsBBwFNA34hYmL56C+ib1vsDc3ObzUtljZXP20J5o5wUzawo2StOCx59XhwRhzRWQVIX4LfA1yPivfxlv4gISbG1sW4Nd5/NrCgRapbuM4CkDmQJ8dcRcW8qfjt1fUk/F6Xy+cDA3OYDUllj5QO2UN4oJ0UzK1pNtCtoaUwaCb4ReCUirsh99QBQN4I8Frg/V356GoUeAbybutmTgGMl9UwDLMcCk9J370kakY51em5fDXL32cyKks2n2CzPPn8U+A/gRUnPp7LvAD8E7pJ0FjAb+Gz67iHgOGAmsBo4AyAilkqaADyT6l0aEUvT+rnATcCOwB/T0ignRTMrUvPMvB0RT0CD2fXoLdQP4LwG9jURmLiF8mnAgcXE5aRoZkXJbsnxLDlmZoCffTYz24ynDjMzS7Kpw9x9NjPbwNcUzcySbJYcd5/NzIC6x/ycFNu8RfM7cPkFu7P8nQ6g4LjTlnDSlxZv+P6eX+7M9Zf2564XX6R77xrmvLYDV1y4OzNf3JGx317IZ855p97+amrgq6P3pXe/aibc8gYA/3i8CzdM2I3aWrHjTjV886o59B+0brueZ1tw4RVzOPSYFSxf3J6zR+63ofz4M9/h+C8uobYGnprSjRt/sBtHnbSMz5y7aEOdQQe8z3mj9mXWyzu2ROhlwi3FrSZpNPBToAq4ISJ+WMrjlVJV+2Dc9xYw+ENrWL2yHeeP3peDj1jBHvuuZdH8Djz3167s0n9jAuvWs4ZzJszjyYe7b3F/v7thZwYOXsvqlRv/4/rZxQO45H/eYPfBa3nwpt7c/tNd+c+r5pT83NqaR+7sxQP/04d
v/XTjxCpDD1/J4aPe45xj9qV6XTu6964G4C/39eQv92VT8+25/xrGT3yzjSfETDM90VKWSpbuJVUBvyCbGHIIcGqaQLJV6t13PYM/tAaAzl1qGbjPWhYv7ADAry7pz1nfXUB+Tt8efdaz37A1tN/Cn513FnTg6SndGPP5JfXKBaxekd3/tWpFFb36VpfkXNq6l57qwopl9f9hPnn6Yu78+S5Ur8v+l3h3SYfNtjvqxOX89f4e2yPEslY3+lzI0hqVsqU4HJgZEbMAJN1BNknkjBIec7t4a25HXn9pR/Y/eDVPPtyNPrtWs/cH3i94+1+O78+XvruA1Svr3wD79Z/M5bv/sRc7dKqlc5darvr9v5o7dGtA/73XcuChq/jit99i3Vpx/aW78a8XOterc8Txy7nkjD1bJsAyU8nd51KeWUMTP9YjaZykaZKmvbOkpoThNI81q9ox4Ut78pVL51NVFdzxs76c/q2FTW+YTJ3cjR59NrY68+67bmd+cOssfv3sDI793BKuu6TJ+TCtmVRVQdce67ngk/tww4Td+O9fzSYbUsjsd9Aq1q5px+xX3XVurne0lKsWT/cRcV1EHBIRh+zcu7wfHVpfDRO+tCcjP72Mjx33Lgtn78BbczpyzjH7c/rwIbyzsAPnjdqPpYsaboDPeGYnpj7SjdOHD+H/n7MHLzzRlR+dvzvLl1Qxa0bW+gT4xPHLmTFtp+11am3e4oUd+NtDPQDx6vOdqa2F7r02/pE+8oTlPPq7Hi0VXlkJYH20K2hpjUrZfW5o4sdWKQKu+ObuDBy8ln8/OxtJHnTA+9z14ssb6pw+fAg/++OrdO/dcIv3zO8s5MzvZC3LF57swj2/3Jlv/3wONeth1XtVzHt9BwbsvZbnHuvKwMGFd8lt2zz5cDeGfnQlLzzZhf57raVDx+DdpdkfaSk44lPL+eZJ+7RwlOWjkrvPpUyKzwCDJQ0iS4anAJ8v4fFK6uWnd2LKPb0YdMAazjkmu43jjIsXMPzoFVusv3RRe746Zl9Wr6hC7bLR5use/Sc7da3dYv2q9vD1H89lwpf3RO2ga/caLrzCI8+lcNE1s/nQYSvp3ms9t02bwa0/6cukO3px4RVz+dWfX6W6Wlx+wUDqZrX64IhVvLOgI2/N2aFlAy8XrbhrXAhlU5SVaOfSccBVZLfkTIyIyxqrf8jQTvH0pIGNVbEyM2q3YS0dghXhqZjCe7F0mzJaz/13iZETTy6o7r0fvfbZpt7RUm5Kep9iRDxENluumVWQSm4p+okWMyuKJ5k1M8sJxPpaD7SYmW1QyY/5OSmaWXHC3Wczsw18TdHMbBNOimZmSSBqPNBiZraRB1rMzJLwQIuZWX3hpGhmVqeyJ4RwUjSzormlaGaWREBNrZOimdkGHn02M0sCd5/NzHI80GJmVk8JJ+xvcU6KZlY0d5/NzJJs9Llyn32u3DMzs5KJKGxpiqSJkhZJeilXdomk+ZKeT8txue8uljRT0quSRuXKR6eymZIuypUPkvRUKr9TUsemYnJSNLOiRaigpQA3AaO3UH5lRAxLy0MAkoaQvSr5A2mbayRVSaoCfgGMAYYAp6a6AD9K+9oHWAac1VRATopmVpSgsIRYSFKMiMeApQUe+gTgjohYGxFvADOB4WmZGRGzImIdcAdwgiQBI4F70vY3Ayc2dRAnRTMrWhS4AH0kTcst4wo8xPmSpqfudc9U1h+Ym6szL5U1VN4bWB4R6zcpb5QHWsysOAFR+GN+iyPikCKPcC0wITsSE4CfAGcWuY+t5qRoZkUr5S05EfF23bqk64Hfp4/zgYG5qgNSGQ2ULwF6SGqfWov5+g1y99nMitZco89bIqlf7uNJQN3I9APAKZJ2kDQIGAw8DTwDDE4jzR3JBmMeiIgA/gKcnLYfC9zf1PEbbClK+hkbLgtsLiK+1tTOzazyNOezz5JuB44ku/Y4DxgPHClpWDrUm8DZABHxsqS7gBnAeuC8iKhJ+zkfmAR
UARMj4uV0iG8Dd0j6AfAP4MamYmqs+zytyPMzs7YggGZKihFx6haKG0xcEXEZcNkWyh8CHtpC+Syy0emCNZgUI+Lm/GdJnSNidTE7N7PKVMnPPjd5TVHSYZJmAP9Mn4dKuqbkkZlZmRJRW9jSGhUy0HIVMIpsJIeIeAE4ooQxmVm5K+JGxdamoFtyImJudnP4BjWlCcfMyl54lpy5kg4HQlIH4ALgldKGZWZlrZW2AgtRSPf5K8B5ZI/HLACGpc9m1mapwKX1abKlGBGLgS9sh1jMrLWobekASqeQ0ee9JD0o6Z0079n9kvbaHsGZWRmqu0+xkKUVKqT7/BvgLqAfsBtwN3B7KYMys/JWysf8WlohSbFzRNwaEevTchvQqdSBmVkZa4u35EjqlVb/mKb3voPsND/HFh6nMbM2pJV2jQvR2EDLs2RJsO7sz859F8DFpQrKzMqbWmkrsBCNPfs8aHsGYmatRAha6SN8hSjoiRZJB5K9EGbDtcSIuKVUQZlZmWuLLcU6ksaTzXc2hOxa4hjgCcBJ0aytquCkWMjo88nA0cBbEXEGMBToXtKozKy8tcXR55w1EVErab2kbsAi6r8PwczakmacZLYcFZIUp0nqAVxPNiK9Evh7KYMys/LWJkef60TEuWn1l5IeBrpFxPTShmVmZa0tJkVJBzf2XUQ8V5qQzKzctdWW4k8a+S6Akc0cC/+a3plRuw1r7t1aCc27+PCWDsGKUD1xavPsqC1eU4yIo7ZnIGbWSrTikeVCFHTztplZPU6KZmYbqYInmXVSNLPiVXBLsZCZtyXpNEnfS593lzS89KGZWTlSFL60RoU85ncNcBhwavq8AvhFySIys/JXwa8jKKT7fGhEHCzpHwARsUxSxxLHZWblrJW2AgtRSFKsllRF+jVI2pmKfpeXmTWltXaNC1FIUrwauA/YRdJlZLPmfLekUZlZ+Yo2PvocEb+W9CzZ9GECToyIV0oemZmVr7bcUpS0O7AaeDBfFhFzShmYmZWxtpwUgT+w8QVWnYBBwKvAB0oYl5mVsTZ9TTEiPpj/nGbPObeB6mZmrVrRT7RExHOSDi1FMGbWSrTllqKkC3Mf2wEHAwtKFpGZlbcKH30u5ImWrrllB7JrjCeUMigzK3PN9OIqSRMlLZL0Uq6sl6TJkl5LP3umckm6WtJMSdPzE2FLGpvqvyZpbK78w5JeTNtcLanJx2waTYrppu2uEfH9tFwWEb+OiPebPl0zq0SiWZ99vgkYvUnZRcCUiBgMTEmfIXu98uC0jAOuhSyJAuOBQ4HhwPi6RJrqfDm33abH2kyDSVFS+4ioAT5awImZWVvSTC3FiHgMWLpJ8QnAzWn9ZuDEXPktkZkK9JDUDxgFTI6IpRGxDJgMjE7fdYuIqRERZO+qP5EmNHZN8Wmy64fPS3oAuBtYlTuZe5vauZlVoOJmwOkjaVru83URcV0T2/SNiIVp/S2gb1rvD8zN1ZuXyhorn7eF8kYVMvrcCVhC9k6WuvsVA3BSNGurCh9oWRwRh2ztYSIipO17V2RjSXGXNPL8EhuTYZ0KHpA3s6aUOE29LalfRCxMXeBFqXw+MDBXb0Aqmw8cuUn5o6l8wBbqN6qxgZYqoEtauubW6xYza6ua6ZpiAx4A6kaQxwL358pPT6PQI4B3Uzd7EnCspJ5pgOVYYFL67j1JI9Ko8+m5fTWosZbiwoi4dOvOycwqVjO+zU/S7WStvD6S5pGNIv8QuEvSWcBs4LOp+kPAccBMsvkYzgCIiKWSJgDPpHqXRkTd4M25ZCPcOwJ/TEujGkuKrXPaXDMruebqPkfEqQ18dfQW6gZwXgP7mQhM3EL5NODAYmJqLCluFpSZGVDRowoNJsVc89PMrJ5KfszPrzg1s+I04zXFcuSkaGZFEZU94OCkaGbFc0vRzGyjNj3ztpnZZpwUzcySCp9k1knRzIrnlqKZ2Ua+pmhmluekaGa2kVuKZmZ1gmImmW11nBTNrCh1L66
qVE6KZlY8J0Uzs40UlZsVnRTNrDieJcfMrD5fUzQzy/FjfmZmeW4pmpkl4e6zmVl9TopmZhnfvG1mtgnVVm5WdFI0s+L4PkVryoVXzOHQY1awfHF7zh65HwCnf2shh416jwhYvrg9P/767ix9u8OGbfYdupqrHnyN/3fOHjzxhx4tFHnbctrQ6Zz8gRkIuOflA7j1haHs12cx3zvyMTp3qGbBiq7816RjWFXdke6d3ueqMZM4cJdF/O6f+3PZXz8OQKf21Vwx5hEGdn+P2lrx6Jt7cuWTI1r2xFpAJd+S065UO5Y0UdIiSS+V6hjl4pE7e/HfXxhUr+yea3fhnGP249x/24+n/tSN077x9obv2rULzvrvhTz7167bO9Q2a59eSzj5AzM45a5/59O3f5ZPDJrN7t3f5dKRj3LlkyM46fbP8afXB3Hmwc8DsG59FT+bOpzL/3b4Zvu66blhfOq2Uzn5js9wUL+FfGyP2dv5bMpAFLi0QiVLisBNwOgS7r9svPRUF1Ysq9/oXr2yasN6px1ryT8qesKZi3nioe4sX+yG+vayV6/lTH+rL++v70BNtGPa/N04Zu9Z7NHjXaYt6AfA3+cO5N/2mQXAmvUdeG5hP9atr6q3n/fXd+Dp+f0BqK6tYsaindm1y6rtezJlQFHY0hqVLClGxGPA0lLtvzX44rcXctu0GYz89HJuuXxXAHrvWs3hY97l9zf3buHo2paZS3rx4d0W0r3T+3RqX83H95jDrl1WMnNpT0bu9SYAo/Z5nV27rCx4n107ruXIQW8yde6AEkVdpgKIKGxphUrZUiyIpHGSpkmaVs3alg6nWd30o36cdsgQ/nxvD44/czEAX/n+fG68rB8RauHo2pZZy3py43MHcf0JD/Kr4//APxf3pjbE/51yFKd88CXu+tzddO64juqawv6XqFItl4+ezK9f+CDz3utW4ujLj2oLW1qjFu+/RcR1wHUA3dSrdf5pacKf7+vJD259g1t/vCv7Dl3Dxddm16C696ph+NErqKkRf3+4ewtHWfnunXEA9844AIALDpvK2yu78Maynoy7/1MA7NFjOZ/Yc05B+7pk5F+ZvbwHt74wtGTxlivfp2hbZbdBa1nwxg4AHDbqXebOzNbHjjhgQ51vXjmHp/7UzQlxO+m142qWrulMvy4rOGbvN/j8XZ/eUCaCsz/yLHe+OKTJ/XxtxFN07biW7005svRBl6NW3DUuhJNiM7jomtl86LCVdO+1ntumzeDWn/Rl+MgVDNh7LbW1sGh+R67+dhu77lSGrjpuEj06rWV9bTt+8OjHWbFuB04bOp1TP5jdIPGnWXtx3yv7b6j/yNjb6NJxHR3a1TByrzcY97tPsnJdR87+yHO8vrQH95xyNwC/mX4gv53RdDKtJJXcUlSUKONLuh04EugDvA2Mj4gbG9umm3rFoTq6JPFYacy7ePNbVqx8vTnxCtYsnLtNF7S79hgQBx1xQUF1H3/wv56NiEO25XjbW8laihFxaqn2bWYtq5Jbiu4+m1lxAqip3KzopGhmRavklmKL36doZq1QM928LelNSS9Kel7StFTWS9JkSa+lnz1TuSRdLWmmpOmSDs7tZ2yq/5qksdtyak6KZla0Zn7M76iIGJYbkLkImBIRg4Ep6TPAGGBwWsYB10KWRIHxwKHAcGB8XSLdGk6KZlacQieD2Pou9gnAzWn9ZuDEXPktkZkK9JDUDxgFTI6IpRGxDJjMNsy74KRoZkURoJooaAH61D3Gm5Zxm+wugEckPZv7rm9ELEzrbwF903p/YG5u23mprKHyreKBFjMrmgq/v3lxE/cpfiwi5kvaBZgs6Z/5LyMipO07rOOWopkVpxm7zxExP/1cBNxHdk3w7dQtJv1clKrPBwbmNh+Qyhoq3ypOimZWpAJHnptoTUraSVLXunXgWOAl4AGgbgR5LHB/Wn8AOD2NQo8A3k3d7EnAsZJ6pgGWY1PZVnH32cyK1kwd2r7AfZIgy0W
/iYiHJT0D3CXpLGA28NlU/yHgOGAmsBo4AyAilkqaADyT6l0aEVs9l6uTopkVrxnmTIiIWcBmc69FxBJgs0kQIpuo4bwG9jURmLjNQeGkaGbFCupGliuSk6KZFa9yc6KTopkVr4hbclodJ0UzK56ToplZEkArfSlVIZwUzawoItx9NjOrp7Zym4pOimZWHHefzczqc/fZzCzPSdHMrE5hrxporZwUzaw4fpufmVl9vqZoZpbnpGhmlgRQ66RoZpZ4oMXMrD4nRTOzJICayn2kxUnRzIoUEE6KZmYbuftsZpZ49NnMbBNuKZqZ5TgpmpklEVBT09JRlIyTopkVzy1FM7McJ0Uzszrh0Wczsw0Cwjdvm5nl+DE/M7Mkwq84NTOrxwMtZmYbhVuKZmZ1PMmsmdlGnhDCzGyjAMKP+ZmZJeFJZs3M6gl3n83Mciq4pagoo1EkSe8As1s6jhLoAyxu6SCsKJX6b7ZHROy8LTuQ9DDZ76cQiyNi9LYcb3srq6RYqSRNi4hDWjoOK5z/zdqudi0dgJlZOXFSNDPLcVLcPq5r6QCsaP43a6N8TdHMLMctRTOzHCdFM7McJ8USkjRa0quSZkq6qKXjsaZJmihpkaSXWjoWaxlOiiUiqQr4BTAGGAKcKmlIy0ZlBbgJaFU3G1vzclIsneHAzIiYFRHrgDuAE1o4JmtCRDwGLG3pOKzlOCmWTn9gbu7zvFRmZmXMSdHMLMdJsXTmAwNznwekMjMrY06KpfMMMFjSIEkdgVOAB1o4JjNrgpNiiUTEeuB8YBLwCnBXRLzcslFZUyTdDvwd2E/SPElntXRMtn35MT8zsxy3FM3McpwUzcxynBTNzHKcFM3McpwUzcxynBRbEUk1kp6X9JKkuyV13oZ93STp5LR+Q2OTVUg6UtLhW3GMNyVt9ta3hso3qbOyyGNdIuk/i43RbFNOiq3LmogYFhEHAuuAr+S/lLRV7/GOiC9FxIxGqhwJFJ0UzVojJ8XW63Fgn9SKe1zSA8AMSVWSLpf0jKTpks4GUObnaX7HPwG71O1I0qOSDknroyU9J+kFSVMk7UmWfL+RWqkfl7SzpN+mYzwj6aNp296SHpH0sqQbADV1EpJ+J+nZtM24Tb67MpVPkbRzKttb0sNpm8cl7d8sv02zZKtaFtayUotwDPBwKjoYODAi3kiJ5d2I+IikHYC/SXoEOAjYj2xux77ADGDiJvvdGbgeOCLtq1dELJX0S2BlRPw41fsNcGVEPCFpd7Kndg4AxgNPRMSlkv4PUMjTIGemY+wIPCPptxGxBNgJmBYR35D0vbTv88leKPWViHhN0qHANcDIrfg1mm2Rk2LrsqOk59P648CNZN3apyPijVR+LPChuuuFQHdgMHAEcHtE1AALJP15C/sfATxWt6+IaGhewWOAIdKGhmA3SV3SMT6dtv2DpGUFnNPXJJ2U1gemWJcAtcCdqfw24N50jMOBu3PH3qGAY5gVzEmxdVkTEcPyBSk5rMoXAV+NiEmb1DuuGeNoB4yIiPe3EEvBJB1JlmAPi4jVkh4FOjVQPdJxl2/6OzBrTr6mWHkmAedI6gAgaV9JOwGPAZ9L1xz7AUdtYdupwBGSBqVte6XyFUDXXL1HgK/WfZA0LK0+Bnw+lY0BejYRa3dgWUqI+5O1VOu0A+pau58n65a/B7wh6TPpGJI0tIljmBXFSbHy3EB2vfC59PKlX5H1CO4DXkvf3UI2E0w9EfEOMI6sq/oCG7uvDwIn1Q20AF8DDkkDOTPYOAr+fbKk+jJZN3pOE7E+DLSX9ArwQ7KkXGcVMDydw0jg0lT+BeCsFN/L+BUP1sw8S46ZWY5bimZmOU6KZmY5TopmZjlOimZmOU6KZmY5TopmZjlOimZmOf8LVxz/n6yM+wcAAAAASUVORK5CYII=\n", 722 | "text/plain": [ 723 | "
" 724 | ] 725 | }, 726 | "metadata": { 727 | "needs_background": "light" 728 | }, 729 | "output_type": "display_data" 730 | } 731 | ], 732 | "source": [ 733 | "plot_confusion_matrix(rfModel,features_test,ifMalware_test)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 19, 739 | "id": "326f5f69", 740 | "metadata": {}, 741 | "outputs": [ 742 | { 743 | "data": { 744 | "text/plain": [ 745 | "" 746 | ] 747 | }, 748 | "execution_count": 19, 749 | "metadata": {}, 750 | "output_type": "execute_result" 751 | }, 752 | { 753 | "data": { 754 | "image/png": "\n", 755 | "text/plain": [ 756 | "
" 757 | ] 758 | }, 759 | "metadata": { 760 | "needs_background": "light" 761 | }, 762 | "output_type": "display_data" 763 | } 764 | ], 765 | "source": [ 766 | "plot_precision_recall_curve(rfModel,features_test,ifMalware_test)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 20, 772 | "id": "61719e65", 773 | "metadata": {}, 774 | "outputs": [ 775 | { 776 | "data": { 777 | "text/plain": [ 778 | "" 779 | ] 780 | }, 781 | "execution_count": 20, 782 | "metadata": {}, 783 | "output_type": "execute_result" 784 | }, 785 | { 786 | "data": { 787 | "image/png": "\n", 788 | "text/plain": [ 789 | "
" 790 | ] 791 | }, 792 | "metadata": { 793 | "needs_background": "light" 794 | }, 795 | "output_type": "display_data" 796 | } 797 | ], 798 | "source": [ 799 | "plot_roc_curve(rfModel,features_test,ifMalware_test)" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "id": "8ec51c31", 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | } 810 | ], 811 | "metadata": { 812 | "kernelspec": { 813 | "display_name": "Python 3 (ipykernel)", 814 | "language": "python", 815 | "name": "python3" 816 | }, 817 | "language_info": { 818 | "codemirror_mode": { 819 | "name": "ipython", 820 | "version": 3 821 | }, 822 | "file_extension": ".py", 823 | "mimetype": "text/x-python", 824 | "name": "python", 825 | "nbconvert_exporter": "python", 826 | "pygments_lexer": "ipython3", 827 | "version": "3.9.10" 828 | } 829 | }, 830 | "nbformat": 4, 831 | "nbformat_minor": 5 832 | } 833 | -------------------------------------------------------------------------------- /data-set/dataset.csv: -------------------------------------------------------------------------------- 1 | AddressOfEntryPoint,MajorLinkerVersion,MajorImageVersion,MajorOperatingSystemVersion,DllCharacteristics,SizeOfStackReserve,NumberOfSections,ResourceSize,IfMalware 2 | 47184,5,0,4,0,1048576,3,0,1 3 | 257184,6,0,4,0,1048576,4,1720,1 4 | 443086,11,0,4,34112,1048576,3,24192,1 5 | 29959,6,0,4,0,1048576,5,7816,1 6 | 636583,9,0,5,32768,1048576,3,0,1 7 | 11344,14,0,6,33088,1048576,5,0,1 8 | 33303,11,0,5,33088,1048576,5,2204424,1 9 | 114288,6,0,4,0,1048576,3,928,1 10 | 15803,2,1,4,0,1048576,8,76424,1 11 | 662982,48,0,4,34144,1048576,3,1452,1 12 | 265614,8,0,4,34112,1048576,3,2560,1 13 | 261598,8,0,4,34112,1048576,3,2560,1 14 | 41910,10,0,5,33792,1048576,6,22770,1 15 | 7168,6,1,4,0,1048576,3,1432,1 16 | 226176,6,0,4,0,1048576,6,12928,1 17 | 32057,10,0,5,320,1048576,5,247608,1 18 | 176272,9,0,5,33024,1048576,3,4156,1 19 | 660302,48,0,4,34112,1048576,3,21188,1 20 | 
14124,6,0,4,0,1048576,3,0,1 21 | 21982,6,0,4,0,1048576,4,17744,1 22 | 14764,2,6,4,32768,2097152,7,17800,1 23 | 609738,8,0,4,34112,1048576,3,118113,1 24 | 5936,10,0,5,34112,1048576,5,1628,1 25 | 9072,10,0,5,34112,1048576,4,265392,1 26 | 100370,7,5,5,32768,262144,2,0,1 27 | 5269,8,6,4,34112,1048576,7,8288,1 28 | 6264,14,6,6,33120,1048576,6,209496,1 29 | 924368,10,0,5,33088,1048576,3,1324,1 30 | 30650,6,0,4,0,1048576,4,3448736,1 31 | 324832,9,0,5,32768,1048576,4,50088,1 32 | 19001,14,0,6,320,1048576,6,856,1 33 | 664,6,0,4,0,1048576,4,26536,1 34 | 2564766,8,0,4,34112,1048576,3,371870,1 35 | 16896,5,0,4,0,1048576,4,2400,1 36 | 13199,6,6,4,34112,1048576,5,131952,0 37 | 27506,8,6,6,33088,262144,4,25695624,0 38 | 301411,14,0,6,33088,1048576,5,29024,0 39 | 30100,6,0,4,256,1048576,4,4072,0 40 | 40594,14,0,6,33088,1048576,5,25257092,0 41 | 15920,2,6,4,34144,2097152,9,38168,0 42 | 13784,6,6,4,34112,1048576,5,295776,0 43 | 4096,0,0,4,0,1048576,5,15104,0 44 | 13238,6,6,4,34112,1048576,5,75880,0 45 | 745196,2,6,6,33088,1048576,10,51196,0 46 | 43787,14,0,6,33088,1048576,5,135743656,0 47 | 261455,14,0,6,33024,1048576,4,52160,0 48 | 745196,2,6,6,33088,1048576,10,65828,0 49 | 4768,2,1,4,1344,2097152,11,174736,0 50 | 46622,14,0,6,33088,1048576,7,82774692,0 51 | 108448,14,0,6,49472,1048576,5,37936,0 52 | --------------------------------------------------------------------------------