├── README.md └── RansomwareD.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Ransomware Detection Using ML 2 | 3 | ### Machine Learning Algorithms used are: 4 | 5 | 1. Random Forest 6 | 2. Decision Tree 7 | 3. Logistic Regression 8 | 9 | ### Additional Libraries Used: 10 | 11 | * pefile 12 | * pickle 13 | * joblib 14 | * mlxtend 15 | * statsmodels 16 | * sklearn 17 | 18 | ### Concepts Used: 19 | 20 | * Multicollinearity 21 | * Ensemble Technique 22 | * Extra Tree Classifier 23 | -------------------------------------------------------------------------------- /RansomwareD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "2bca8f3e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Collecting pefile\n", 14 | " Using cached pefile-2021.5.24.tar.gz (66 kB)\n", 15 | "Requirement already satisfied: future in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pefile) (0.18.2)\n", 16 | "Building wheels for collected packages: pefile\n", 17 | " Building wheel for pefile (setup.py): started\n", 18 | " Building wheel for pefile (setup.py): finished with status 'done'\n", 19 | " Created wheel for pefile: filename=pefile-2021.5.24-py3-none-any.whl size=62578 sha256=cf20a74be7fc5f7210d0a6f4e3714ed405c5d8da883caaeb0d173f1927786bf5\n", 20 | " Stored in directory: c:\\users\\vajha\\appdata\\local\\pip\\cache\\wheels\\43\\04\\fc\\d9305103f7d512f2df35b1878e1009e8217e713b767aee8f13\n", 21 | "Successfully built pefile\n", 22 | "Installing collected packages: pefile\n", 23 | "Successfully installed pefile-2021.5.24\n", 24 | "Note: you may need to restart the kernel to use updated packages.\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "pip install pefile" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | 
"id": "817480c1", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Collecting mlxtend\n", 43 | " Using cached mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)\n", 44 | "Requirement already satisfied: setuptools in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (52.0.0.post20210125)\n", 45 | "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.0.1)\n", 46 | "Requirement already satisfied: scikit-learn>=0.20.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (0.24.1)\n", 47 | "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.6.2)\n", 48 | "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.2.4)\n", 49 | "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (3.3.4)\n", 50 | "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.20.1)\n", 51 | "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.1)\n", 52 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.4.7)\n", 53 | "Requirement already satisfied: cycler>=0.10 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.10.0)\n", 54 | "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (8.2.0)\n", 55 | "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.3.1)\n", 56 | "Requirement already satisfied: six in 
c:\\users\\vajha\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib>=3.0.0->mlxtend) (1.15.0)\n", 57 | "Requirement already satisfied: pytz>=2017.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2021.1)\n", 58 | "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from scikit-learn>=0.20.3->mlxtend) (2.1.0)\n", 59 | "Installing collected packages: mlxtend\n", 60 | "Successfully installed mlxtend-0.18.0\n", 61 | "Note: you may need to restart the kernel to use updated packages.\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "pip install mlxtend" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "id": "b94512f2", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "import os\n", 77 | "import pandas as pd\n", 78 | "import numpy as np\n", 79 | "from matplotlib import pyplot as plt\n", 80 | "import pickle\n", 81 | "import pefile\n", 82 | "import sklearn.ensemble as ek\n", 83 | "from sklearn import tree, linear_model\n", 84 | "from sklearn.feature_selection import SelectFromModel\n", 85 | "import joblib\n", 86 | "from sklearn.naive_bayes import GaussianNB\n", 87 | "from sklearn.metrics import confusion_matrix\n", 88 | "from sklearn.pipeline import make_pipeline\n", 89 | "from sklearn import preprocessing\n", 90 | "from sklearn import svm\n", 91 | "from sklearn.linear_model import LogisticRegression\n", 92 | "from statsmodels.stats.outliers_influence import variance_inflation_factor as vif\n", 93 | "from sklearn.model_selection import train_test_split\n", 94 | "from mlxtend.plotting import plot_confusion_matrix\n", 95 | "dataset=pd.read_csv(\"Ransomware.csv\",sep='|')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "cde9b494", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/html": [ 107 | "
\n", 108 | "\n", 121 | "\n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " 
\n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
0memtest.exe631ea355665f28d4707448e442fbf5b8332224258903619841157120...43.2628232.5688443.5379398797.000000216180320161
1ose.exe9d10f99a6712e28f8acd5641e3a7ea6b332224333090130560199680...24.2504613.4207445.080177837.000000518115672181
2setup.exe4d92f518527353c0db88a70fddcfd3903322243330905171206215680...114.4263242.8464495.27181331102.27272710427037672181
3DW20.EXEa41e524f8d45f0074fd07805ff0c9b12332224258905857283691520...104.3642912.6693146.4007201457.00000090426472181
4dwtrig20.exec87e561258f2f8650cef999bf643a731332224258902949122472960...24.3061003.4215985.1906031074.500000849130072181
..................................................................
138042VirusShare_8e292b418568d6e7b87f2a32aee7074b8e292b418568d6e7b87f2a32aee7074b3322242581102058242237440...74.1227361.3702607.67709114900.71428616816547200
138043VirusShare_260d9e2258aed4c8a3bbd703ec895822260d9e2258aed4c8a3bbd703ec89582233222433167225378881853440...263.3776632.0316195.0500746905.84615444676240150
138044VirusShare_8d088a51b7d225c9f5d11d239791ec3f8d088a51b7d225c9f5d11d239791ec3f3322242581001182723804160...226.8254062.6170267.99048714981.909091482264872140
138045VirusShare_4286dccf67ca220fe67635388229a9f34286dccf67ca220fe67635388229a9f33322243316622549152168960...103.4216272.0609644.739744601.600000162216000
138046VirusShare_d7648eae45f09b3adb75127f43be6d11d7648eae45f09b3adb75127f43be6d113322242581101116164684800...44.4072521.9804826.11537496625.000000203184647200
\n", 415 | "

138047 rows × 57 columns

\n", 416 | "
" 417 | ], 418 | "text/plain": [ 419 | " Name \\\n", 420 | "0 memtest.exe \n", 421 | "1 ose.exe \n", 422 | "2 setup.exe \n", 423 | "3 DW20.EXE \n", 424 | "4 dwtrig20.exe \n", 425 | "... ... \n", 426 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n", 427 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n", 428 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n", 429 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n", 430 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n", 431 | "\n", 432 | " md5 Machine SizeOfOptionalHeader \\\n", 433 | "0 631ea355665f28d4707448e442fbf5b8 332 224 \n", 434 | "1 9d10f99a6712e28f8acd5641e3a7ea6b 332 224 \n", 435 | "2 4d92f518527353c0db88a70fddcfd390 332 224 \n", 436 | "3 a41e524f8d45f0074fd07805ff0c9b12 332 224 \n", 437 | "4 c87e561258f2f8650cef999bf643a731 332 224 \n", 438 | "... ... ... ... \n", 439 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n", 440 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n", 441 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n", 442 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n", 443 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n", 444 | "\n", 445 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 446 | "0 258 9 0 361984 \n", 447 | "1 3330 9 0 130560 \n", 448 | "2 3330 9 0 517120 \n", 449 | "3 258 9 0 585728 \n", 450 | "4 258 9 0 294912 \n", 451 | "... ... ... ... ... \n", 452 | "138042 258 11 0 205824 \n", 453 | "138043 33167 2 25 37888 \n", 454 | "138044 258 10 0 118272 \n", 455 | "138045 33166 2 25 49152 \n", 456 | "138046 258 11 0 111616 \n", 457 | "\n", 458 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n", 459 | "0 115712 0 ... 4 \n", 460 | "1 19968 0 ... 2 \n", 461 | "2 621568 0 ... 11 \n", 462 | "3 369152 0 ... 10 \n", 463 | "4 247296 0 ... 2 \n", 464 | "... ... ... ... ... \n", 465 | "138042 223744 0 ... 7 \n", 466 | "138043 185344 0 ... 26 \n", 467 | "138044 380416 0 ... 
22 \n", 468 | "138045 16896 0 ... 10 \n", 469 | "138046 468480 0 ... 4 \n", 470 | "\n", 471 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n", 472 | "0 3.262823 2.568844 3.537939 \n", 473 | "1 4.250461 3.420744 5.080177 \n", 474 | "2 4.426324 2.846449 5.271813 \n", 475 | "3 4.364291 2.669314 6.400720 \n", 476 | "4 4.306100 3.421598 5.190603 \n", 477 | "... ... ... ... \n", 478 | "138042 4.122736 1.370260 7.677091 \n", 479 | "138043 3.377663 2.031619 5.050074 \n", 480 | "138044 6.825406 2.617026 7.990487 \n", 481 | "138045 3.421627 2.060964 4.739744 \n", 482 | "138046 4.407252 1.980482 6.115374 \n", 483 | "\n", 484 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n", 485 | "0 8797.000000 216 18032 \n", 486 | "1 837.000000 518 1156 \n", 487 | "2 31102.272727 104 270376 \n", 488 | "3 1457.000000 90 4264 \n", 489 | "4 1074.500000 849 1300 \n", 490 | "... ... ... ... \n", 491 | "138042 14900.714286 16 81654 \n", 492 | "138043 6905.846154 44 67624 \n", 493 | "138044 14981.909091 48 22648 \n", 494 | "138045 601.600000 16 2216 \n", 495 | "138046 96625.000000 20 318464 \n", 496 | "\n", 497 | " LoadConfigurationSize VersionInformationSize legitimate \n", 498 | "0 0 16 1 \n", 499 | "1 72 18 1 \n", 500 | "2 72 18 1 \n", 501 | "3 72 18 1 \n", 502 | "4 72 18 1 \n", 503 | "... ... ... ... \n", 504 | "138042 72 0 0 \n", 505 | "138043 0 15 0 \n", 506 | "138044 72 14 0 \n", 507 | "138045 0 0 0 \n", 508 | "138046 72 0 0 \n", 509 | "\n", 510 | "[138047 rows x 57 columns]" 511 | ] 512 | }, 513 | "execution_count": 4, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "dataset" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 5, 525 | "id": "9e106963", 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/html": [ 531 | "
\n", 532 | "\n", 545 | "\n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " 
\n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedDataAddressOfEntryPointBaseOfCode...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
count138047.000000138047.000000138047.000000138047.000000138047.0000001.380470e+051.380470e+051.380470e+051.380470e+051.380470e+05...138047.000000138047.000000138047.000000138047.0000001.380470e+051.380470e+051.380470e+051.380470e+05138047.000000138047.000000
mean4259.069274225.8456324444.1459948.6197743.8192862.425956e+054.504867e+051.009525e+051.719561e+055.779845e+04...22.0507004.0001272.4345415.5216105.545093e+041.818082e+042.465903e+054.656750e+0512.3631150.299340
std10880.3472455.1213998186.7825244.08875711.8626755.754485e+062.101599e+071.635288e+073.430553e+065.527658e+06...136.4942441.1129810.8155771.5974037.799163e+066.502369e+062.124860e+072.608987e+076.7988780.457971
min332.000000224.0000002.0000000.0000000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.0000000.0000000.0000000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.000000
25%332.000000224.000000258.0000008.0000000.0000003.020800e+042.457600e+040.000000e+001.272100e+044.096000e+03...5.0000003.4585052.1787484.8287069.560000e+024.800000e+012.216000e+030.000000e+0013.0000000.000000
50%332.000000224.000000258.0000009.0000000.0000001.136640e+052.631680e+050.000000e+005.288300e+044.096000e+03...6.0000003.7298242.4584925.3175522.708154e+034.800000e+019.640000e+037.200000e+0115.0000000.000000
75%332.000000224.0000008226.00000010.0000000.0000001.203200e+053.850240e+050.000000e+006.157800e+044.096000e+03...13.0000004.2330512.6968336.5022396.558429e+031.320000e+022.378000e+047.200000e+0116.0000001.000000
max34404.000000352.00000049551.000000255.000000255.0000001.818587e+094.294966e+094.294941e+091.074484e+092.028711e+09...7694.0000007.9997237.9997238.0000002.415919e+092.415919e+094.294903e+094.294967e+0926.0000001.000000
\n", 767 | "

8 rows × 55 columns

\n", 768 | "
" 769 | ], 770 | "text/plain": [ 771 | " Machine SizeOfOptionalHeader Characteristics \\\n", 772 | "count 138047.000000 138047.000000 138047.000000 \n", 773 | "mean 4259.069274 225.845632 4444.145994 \n", 774 | "std 10880.347245 5.121399 8186.782524 \n", 775 | "min 332.000000 224.000000 2.000000 \n", 776 | "25% 332.000000 224.000000 258.000000 \n", 777 | "50% 332.000000 224.000000 258.000000 \n", 778 | "75% 332.000000 224.000000 8226.000000 \n", 779 | "max 34404.000000 352.000000 49551.000000 \n", 780 | "\n", 781 | " MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 782 | "count 138047.000000 138047.000000 1.380470e+05 \n", 783 | "mean 8.619774 3.819286 2.425956e+05 \n", 784 | "std 4.088757 11.862675 5.754485e+06 \n", 785 | "min 0.000000 0.000000 0.000000e+00 \n", 786 | "25% 8.000000 0.000000 3.020800e+04 \n", 787 | "50% 9.000000 0.000000 1.136640e+05 \n", 788 | "75% 10.000000 0.000000 1.203200e+05 \n", 789 | "max 255.000000 255.000000 1.818587e+09 \n", 790 | "\n", 791 | " SizeOfInitializedData SizeOfUninitializedData AddressOfEntryPoint \\\n", 792 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n", 793 | "mean 4.504867e+05 1.009525e+05 1.719561e+05 \n", 794 | "std 2.101599e+07 1.635288e+07 3.430553e+06 \n", 795 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", 796 | "25% 2.457600e+04 0.000000e+00 1.272100e+04 \n", 797 | "50% 2.631680e+05 0.000000e+00 5.288300e+04 \n", 798 | "75% 3.850240e+05 0.000000e+00 6.157800e+04 \n", 799 | "max 4.294966e+09 4.294941e+09 1.074484e+09 \n", 800 | "\n", 801 | " BaseOfCode ... ResourcesNb ResourcesMeanEntropy \\\n", 802 | "count 1.380470e+05 ... 138047.000000 138047.000000 \n", 803 | "mean 5.779845e+04 ... 22.050700 4.000127 \n", 804 | "std 5.527658e+06 ... 136.494244 1.112981 \n", 805 | "min 0.000000e+00 ... 0.000000 0.000000 \n", 806 | "25% 4.096000e+03 ... 5.000000 3.458505 \n", 807 | "50% 4.096000e+03 ... 6.000000 3.729824 \n", 808 | "75% 4.096000e+03 ... 13.000000 4.233051 \n", 809 | "max 2.028711e+09 ... 
7694.000000 7.999723 \n", 810 | "\n", 811 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n", 812 | "count 138047.000000 138047.000000 1.380470e+05 \n", 813 | "mean 2.434541 5.521610 5.545093e+04 \n", 814 | "std 0.815577 1.597403 7.799163e+06 \n", 815 | "min 0.000000 0.000000 0.000000e+00 \n", 816 | "25% 2.178748 4.828706 9.560000e+02 \n", 817 | "50% 2.458492 5.317552 2.708154e+03 \n", 818 | "75% 2.696833 6.502239 6.558429e+03 \n", 819 | "max 7.999723 8.000000 2.415919e+09 \n", 820 | "\n", 821 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n", 822 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n", 823 | "mean 1.818082e+04 2.465903e+05 4.656750e+05 \n", 824 | "std 6.502369e+06 2.124860e+07 2.608987e+07 \n", 825 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", 826 | "25% 4.800000e+01 2.216000e+03 0.000000e+00 \n", 827 | "50% 4.800000e+01 9.640000e+03 7.200000e+01 \n", 828 | "75% 1.320000e+02 2.378000e+04 7.200000e+01 \n", 829 | "max 2.415919e+09 4.294903e+09 4.294967e+09 \n", 830 | "\n", 831 | " VersionInformationSize legitimate \n", 832 | "count 138047.000000 138047.000000 \n", 833 | "mean 12.363115 0.299340 \n", 834 | "std 6.798878 0.457971 \n", 835 | "min 0.000000 0.000000 \n", 836 | "25% 13.000000 0.000000 \n", 837 | "50% 15.000000 0.000000 \n", 838 | "75% 16.000000 1.000000 \n", 839 | "max 26.000000 1.000000 \n", 840 | "\n", 841 | "[8 rows x 55 columns]" 842 | ] 843 | }, 844 | "execution_count": 5, 845 | "metadata": {}, 846 | "output_type": "execute_result" 847 | } 848 | ], 849 | "source": [ 850 | "dataset.describe()" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 6, 856 | "id": "f3db099c", 857 | "metadata": {}, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "Name 0\n", 863 | "md5 0\n", 864 | "Machine 0\n", 865 | "SizeOfOptionalHeader 0\n", 866 | "Characteristics 0\n", 867 | "MajorLinkerVersion 0\n", 868 | "MinorLinkerVersion 0\n", 869 | "SizeOfCode 0\n", 870 | 
"SizeOfInitializedData 0\n", 871 | "SizeOfUninitializedData 0\n", 872 | "AddressOfEntryPoint 0\n", 873 | "BaseOfCode 0\n", 874 | "BaseOfData 0\n", 875 | "ImageBase 0\n", 876 | "SectionAlignment 0\n", 877 | "FileAlignment 0\n", 878 | "MajorOperatingSystemVersion 0\n", 879 | "MinorOperatingSystemVersion 0\n", 880 | "MajorImageVersion 0\n", 881 | "MinorImageVersion 0\n", 882 | "MajorSubsystemVersion 0\n", 883 | "MinorSubsystemVersion 0\n", 884 | "SizeOfImage 0\n", 885 | "SizeOfHeaders 0\n", 886 | "CheckSum 0\n", 887 | "Subsystem 0\n", 888 | "DllCharacteristics 0\n", 889 | "SizeOfStackReserve 0\n", 890 | "SizeOfStackCommit 0\n", 891 | "SizeOfHeapReserve 0\n", 892 | "SizeOfHeapCommit 0\n", 893 | "LoaderFlags 0\n", 894 | "NumberOfRvaAndSizes 0\n", 895 | "SectionsNb 0\n", 896 | "SectionsMeanEntropy 0\n", 897 | "SectionsMinEntropy 0\n", 898 | "SectionsMaxEntropy 0\n", 899 | "SectionsMeanRawsize 0\n", 900 | "SectionsMinRawsize 0\n", 901 | "SectionMaxRawsize 0\n", 902 | "SectionsMeanVirtualsize 0\n", 903 | "SectionsMinVirtualsize 0\n", 904 | "SectionMaxVirtualsize 0\n", 905 | "ImportsNbDLL 0\n", 906 | "ImportsNb 0\n", 907 | "ImportsNbOrdinal 0\n", 908 | "ExportNb 0\n", 909 | "ResourcesNb 0\n", 910 | "ResourcesMeanEntropy 0\n", 911 | "ResourcesMinEntropy 0\n", 912 | "ResourcesMaxEntropy 0\n", 913 | "ResourcesMeanSize 0\n", 914 | "ResourcesMinSize 0\n", 915 | "ResourcesMaxSize 0\n", 916 | "LoadConfigurationSize 0\n", 917 | "VersionInformationSize 0\n", 918 | "legitimate 0\n", 919 | "dtype: int64" 920 | ] 921 | }, 922 | "execution_count": 6, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "dataset.isnull().sum()" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 7, 934 | "id": "48a57329", 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "#Classifying Data Based on - Legitimate OR Malware" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 8, 944 | "id": 
"52e76632", 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "data": { 949 | "text/plain": [ 950 | "legitimate\n", 951 | "0 96724\n", 952 | "1 41323\n", 953 | "dtype: int64" 954 | ] 955 | }, 956 | "execution_count": 8, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "dataset.groupby(dataset['legitimate']).size()\n", 963 | "#1 means legitimate, 0 means malware" 964 | ] 965 | }, 966 | { 967 | "cell_type": "code", 968 | "execution_count": 9, 969 | "id": "77eefc0b", 970 | "metadata": {}, 971 | "outputs": [ 972 | { 973 | "data": { 974 | "text/plain": [ 975 | "([,\n", 976 | " ],\n", 977 | " [Text(0.6484073958497663, 0.8885763045497695, 'Legitimate'),\n", 978 | " Text(-0.6484073958497659, -0.8885763045497698, 'Malware')],\n", 979 | " [Text(0.35367676137259974, 0.4846779842998742, '30%'),\n", 980 | " Text(-0.35367676137259957, -0.48467798429987435, '70%')])" 981 | ] 982 | }, 983 | "execution_count": 9, 984 | "metadata": {}, 985 | "output_type": "execute_result" 986 | }, 987 | { 988 | "data": { 989 | "image/png": "\n", 990 | "text/plain": [ 991 | "
" 992 | ] 993 | }, 994 | "metadata": {}, 995 | "output_type": "display_data" 996 | } 997 | ], 998 | "source": [ 999 | "type_classify=['Legitimate','Malware']\n", 1000 | "count_classify=[41323,96724]\n", 1001 | "plt.pie(count_classify, labels=type_classify, autopct='%0.f%%')" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": 10, 1007 | "id": "4f846ba0", 1008 | "metadata": {}, 1009 | "outputs": [], 1010 | "source": [ 1011 | "# Total Number of Columns in Dataset" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 11, 1017 | "id": "cb1a7785", 1018 | "metadata": {}, 1019 | "outputs": [ 1020 | { 1021 | "data": { 1022 | "text/plain": [ 1023 | "57" 1024 | ] 1025 | }, 1026 | "execution_count": 11, 1027 | "metadata": {}, 1028 | "output_type": "execute_result" 1029 | } 1030 | ], 1031 | "source": [ 1032 | "dataset.shape[1]" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 12, 1038 | "id": "7c62eedf", 1039 | "metadata": {}, 1040 | "outputs": [], 1041 | "source": [ 1042 | "# Creating Legitimate and Malware Dataset from Main Dataset" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "code", 1047 | "execution_count": 13, 1048 | "id": "bda12c84", 1049 | "metadata": {}, 1050 | "outputs": [ 1051 | { 1052 | "data": { 1053 | "text/html": [ 1054 | "
\n", 1055 | "\n", 1068 | "\n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " 
\n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | 
" \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ExportNbResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSize
0memtest.exe631ea355665f28d4707448e442fbf5b8332224258903619841157120...043.2628232.5688443.5379398797.00000021618032016
1ose.exe9d10f99a6712e28f8acd5641e3a7ea6b332224333090130560199680...024.2504613.4207445.080177837.00000051811567218
2setup.exe4d92f518527353c0db88a70fddcfd3903322243330905171206215680...1114.4263242.8464495.27181331102.2727271042703767218
3DW20.EXEa41e524f8d45f0074fd07805ff0c9b12332224258905857283691520...1104.3642912.6693146.4007201457.0000009042647218
4dwtrig20.exec87e561258f2f8650cef999bf643a731332224258902949122472960...124.3061003.4215985.1906031074.50000084913007218
..................................................................
41318mfc80.dll1f5afd468eb5e09e9ed75a087529eab53322248450809461761597440...01232.6072510.9609535.130762327.1707322015927216
41319mfc80u.dlle2c48cd0132d4d1dc7d0df9a6bef686a3322248450809461761546240...01232.6072320.9609535.130762327.2357722015927216
41320mfcm80.dll83362ee950ad18adb85b54409155c37833222484508053248163840...2513.5242683.5242683.524268892.0000008928927216
41321mfcm80u.dll26aafee5c30020c99120ee113d751f7e33222484508052736112640...2513.5420713.5420713.542071892.0000008928927216
41322vcomp.dll73dbaa64d589f3262615550dd6881fee33222484508040960204800...11263.0043832.4065123.592623610.33333312414127216
\n", 1362 | "

41323 rows × 56 columns

\n", 1363 | "
" 1364 | ], 1365 | "text/plain": [ 1366 | " Name md5 Machine \\\n", 1367 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n", 1368 | "1 ose.exe 9d10f99a6712e28f8acd5641e3a7ea6b 332 \n", 1369 | "2 setup.exe 4d92f518527353c0db88a70fddcfd390 332 \n", 1370 | "3 DW20.EXE a41e524f8d45f0074fd07805ff0c9b12 332 \n", 1371 | "4 dwtrig20.exe c87e561258f2f8650cef999bf643a731 332 \n", 1372 | "... ... ... ... \n", 1373 | "41318 mfc80.dll 1f5afd468eb5e09e9ed75a087529eab5 332 \n", 1374 | "41319 mfc80u.dll e2c48cd0132d4d1dc7d0df9a6bef686a 332 \n", 1375 | "41320 mfcm80.dll 83362ee950ad18adb85b54409155c378 332 \n", 1376 | "41321 mfcm80u.dll 26aafee5c30020c99120ee113d751f7e 332 \n", 1377 | "41322 vcomp.dll 73dbaa64d589f3262615550dd6881fee 332 \n", 1378 | "\n", 1379 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n", 1380 | "0 224 258 9 \n", 1381 | "1 224 3330 9 \n", 1382 | "2 224 3330 9 \n", 1383 | "3 224 258 9 \n", 1384 | "4 224 258 9 \n", 1385 | "... ... ... ... \n", 1386 | "41318 224 8450 8 \n", 1387 | "41319 224 8450 8 \n", 1388 | "41320 224 8450 8 \n", 1389 | "41321 224 8450 8 \n", 1390 | "41322 224 8450 8 \n", 1391 | "\n", 1392 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n", 1393 | "0 0 361984 115712 \n", 1394 | "1 0 130560 19968 \n", 1395 | "2 0 517120 621568 \n", 1396 | "3 0 585728 369152 \n", 1397 | "4 0 294912 247296 \n", 1398 | "... ... ... ... \n", 1399 | "41318 0 946176 159744 \n", 1400 | "41319 0 946176 154624 \n", 1401 | "41320 0 53248 16384 \n", 1402 | "41321 0 52736 11264 \n", 1403 | "41322 0 40960 20480 \n", 1404 | "\n", 1405 | " SizeOfUninitializedData ... ExportNb ResourcesNb \\\n", 1406 | "0 0 ... 0 4 \n", 1407 | "1 0 ... 0 2 \n", 1408 | "2 0 ... 1 11 \n", 1409 | "3 0 ... 1 10 \n", 1410 | "4 0 ... 1 2 \n", 1411 | "... ... ... ... ... \n", 1412 | "41318 0 ... 0 123 \n", 1413 | "41319 0 ... 0 123 \n", 1414 | "41320 0 ... 25 1 \n", 1415 | "41321 0 ... 25 1 \n", 1416 | "41322 0 ... 
112 6 \n", 1417 | "\n", 1418 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n", 1419 | "0 3.262823 2.568844 3.537939 \n", 1420 | "1 4.250461 3.420744 5.080177 \n", 1421 | "2 4.426324 2.846449 5.271813 \n", 1422 | "3 4.364291 2.669314 6.400720 \n", 1423 | "4 4.306100 3.421598 5.190603 \n", 1424 | "... ... ... ... \n", 1425 | "41318 2.607251 0.960953 5.130762 \n", 1426 | "41319 2.607232 0.960953 5.130762 \n", 1427 | "41320 3.524268 3.524268 3.524268 \n", 1428 | "41321 3.542071 3.542071 3.542071 \n", 1429 | "41322 3.004383 2.406512 3.592623 \n", 1430 | "\n", 1431 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n", 1432 | "0 8797.000000 216 18032 \n", 1433 | "1 837.000000 518 1156 \n", 1434 | "2 31102.272727 104 270376 \n", 1435 | "3 1457.000000 90 4264 \n", 1436 | "4 1074.500000 849 1300 \n", 1437 | "... ... ... ... \n", 1438 | "41318 327.170732 20 1592 \n", 1439 | "41319 327.235772 20 1592 \n", 1440 | "41320 892.000000 892 892 \n", 1441 | "41321 892.000000 892 892 \n", 1442 | "41322 610.333333 124 1412 \n", 1443 | "\n", 1444 | " LoadConfigurationSize VersionInformationSize \n", 1445 | "0 0 16 \n", 1446 | "1 72 18 \n", 1447 | "2 72 18 \n", 1448 | "3 72 18 \n", 1449 | "4 72 18 \n", 1450 | "... ... ... \n", 1451 | "41318 72 16 \n", 1452 | "41319 72 16 \n", 1453 | "41320 72 16 \n", 1454 | "41321 72 16 \n", 1455 | "41322 72 16 \n", 1456 | "\n", 1457 | "[41323 rows x 56 columns]" 1458 | ] 1459 | }, 1460 | "execution_count": 13, 1461 | "metadata": {}, 1462 | "output_type": "execute_result" 1463 | } 1464 | ], 1465 | "source": [ 1466 | "legit=dataset[0:41323].drop([\"legitimate\"],axis=1) # here axis =1 means vertical \n", 1467 | "legit" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 14, 1473 | "id": "5adc7421", 1474 | "metadata": {}, 1475 | "outputs": [ 1476 | { 1477 | "data": { 1478 | "text/html": [ 1479 | "
\n", 1480 | "\n", 1493 | "\n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " 
\n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | 
" \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
41323VirusShare_4a400b747afe6547e09ce0b02dae7f1c4a400b747afe6547e09ce0b02dae7f1c3322242581103548162570240...73.9144151.4416887.6770917298.42857116284387200
41324VirusShare_9bd57c8252948bd2fa651ad372bd4f139bd57c8252948bd2fa651ad372bd4f1333222427160240641648641024...63.1991071.9713355.214816452.000000349580150
41325VirusShare_d1456165e9358b8f61f93a5f2042f39cd1456165e9358b8f61f93a5f2042f39c3322242581001187843819520...186.5309462.4584927.99268818523.444444483394572140
41326VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1e4214cc73afbba0f52bb72d5db8f8bb13322242581001745923000320...155.7323932.8523647.98772612706.1333331186050072140
41327VirusShare_710890c07b3f93b90635f8bff6c34605710890c07b3f93b90635f8bff6c34605332224258904756483486720...592.8278260.9609537.2123292637.03389820676247200
..................................................................
138042VirusShare_8e292b418568d6e7b87f2a32aee7074b8e292b418568d6e7b87f2a32aee7074b3322242581102058242237440...74.1227361.3702607.67709114900.71428616816547200
138043VirusShare_260d9e2258aed4c8a3bbd703ec895822260d9e2258aed4c8a3bbd703ec89582233222433167225378881853440...263.3776632.0316195.0500746905.84615444676240150
138044VirusShare_8d088a51b7d225c9f5d11d239791ec3f8d088a51b7d225c9f5d11d239791ec3f3322242581001182723804160...226.8254062.6170267.99048714981.909091482264872140
138045VirusShare_4286dccf67ca220fe67635388229a9f34286dccf67ca220fe67635388229a9f33322243316622549152168960...103.4216272.0609644.739744601.600000162216000
138046VirusShare_d7648eae45f09b3adb75127f43be6d11d7648eae45f09b3adb75127f43be6d113322242581101116164684800...44.4072521.9804826.11537496625.000000203184647200
\n", 1787 | "

96724 rows × 57 columns

\n", 1788 | "
" 1789 | ], 1790 | "text/plain": [ 1791 | " Name \\\n", 1792 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n", 1793 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n", 1794 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n", 1795 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n", 1796 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n", 1797 | "... ... \n", 1798 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n", 1799 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n", 1800 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n", 1801 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n", 1802 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n", 1803 | "\n", 1804 | " md5 Machine SizeOfOptionalHeader \\\n", 1805 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n", 1806 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n", 1807 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n", 1808 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n", 1809 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n", 1810 | "... ... ... ... \n", 1811 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n", 1812 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n", 1813 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n", 1814 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n", 1815 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n", 1816 | "\n", 1817 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 1818 | "41323 258 11 0 354816 \n", 1819 | "41324 271 6 0 24064 \n", 1820 | "41325 258 10 0 118784 \n", 1821 | "41326 258 10 0 174592 \n", 1822 | "41327 258 9 0 475648 \n", 1823 | "... ... ... ... ... \n", 1824 | "138042 258 11 0 205824 \n", 1825 | "138043 33167 2 25 37888 \n", 1826 | "138044 258 10 0 118272 \n", 1827 | "138045 33166 2 25 49152 \n", 1828 | "138046 258 11 0 111616 \n", 1829 | "\n", 1830 | " SizeOfInitializedData SizeOfUninitializedData ... 
ResourcesNb \\\n", 1831 | "41323 257024 0 ... 7 \n", 1832 | "41324 164864 1024 ... 6 \n", 1833 | "41325 381952 0 ... 18 \n", 1834 | "41326 300032 0 ... 15 \n", 1835 | "41327 348672 0 ... 59 \n", 1836 | "... ... ... ... ... \n", 1837 | "138042 223744 0 ... 7 \n", 1838 | "138043 185344 0 ... 26 \n", 1839 | "138044 380416 0 ... 22 \n", 1840 | "138045 16896 0 ... 10 \n", 1841 | "138046 468480 0 ... 4 \n", 1842 | "\n", 1843 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n", 1844 | "41323 3.914415 1.441688 7.677091 \n", 1845 | "41324 3.199107 1.971335 5.214816 \n", 1846 | "41325 6.530946 2.458492 7.992688 \n", 1847 | "41326 5.732393 2.852364 7.987726 \n", 1848 | "41327 2.827826 0.960953 7.212329 \n", 1849 | "... ... ... ... \n", 1850 | "138042 4.122736 1.370260 7.677091 \n", 1851 | "138043 3.377663 2.031619 5.050074 \n", 1852 | "138044 6.825406 2.617026 7.990487 \n", 1853 | "138045 3.421627 2.060964 4.739744 \n", 1854 | "138046 4.407252 1.980482 6.115374 \n", 1855 | "\n", 1856 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n", 1857 | "41323 7298.428571 16 28438 \n", 1858 | "41324 452.000000 34 958 \n", 1859 | "41325 18523.444444 48 33945 \n", 1860 | "41326 12706.133333 118 60500 \n", 1861 | "41327 2637.033898 20 67624 \n", 1862 | "... ... ... ... \n", 1863 | "138042 14900.714286 16 81654 \n", 1864 | "138043 6905.846154 44 67624 \n", 1865 | "138044 14981.909091 48 22648 \n", 1866 | "138045 601.600000 16 2216 \n", 1867 | "138046 96625.000000 20 318464 \n", 1868 | "\n", 1869 | " LoadConfigurationSize VersionInformationSize legitimate \n", 1870 | "41323 72 0 0 \n", 1871 | "41324 0 15 0 \n", 1872 | "41325 72 14 0 \n", 1873 | "41326 72 14 0 \n", 1874 | "41327 72 0 0 \n", 1875 | "... ... ... ... 
\n", 1876 | "138042 72 0 0 \n", 1877 | "138043 0 15 0 \n", 1878 | "138044 72 14 0 \n", 1879 | "138045 0 0 0 \n", 1880 | "138046 72 0 0 \n", 1881 | "\n", 1882 | "[96724 rows x 57 columns]" 1883 | ] 1884 | }, 1885 | "execution_count": 14, 1886 | "metadata": {}, 1887 | "output_type": "execute_result" 1888 | } 1889 | ], 1890 | "source": [ 1891 | "mal=dataset[41323::]\n", 1892 | "maldata=dataset[41323::].drop([\"legitimate\"],axis=1)\n", 1893 | "mal" 1894 | ] 1895 | }, 1896 | { 1897 | "cell_type": "code", 1898 | "execution_count": 15, 1899 | "id": "325f7c48", 1900 | "metadata": {}, 1901 | "outputs": [ 1902 | { 1903 | "name": "stdout", 1904 | "output_type": "stream", 1905 | "text": [ 1906 | "The shape of legit database is 41323 samples and 56 features\n", 1907 | "The shape of malware database is 96724 samples and 57 features\n" 1908 | ] 1909 | } 1910 | ], 1911 | "source": [ 1912 | "print(\"The shape of legit database is %s samples and %s features\"%(legit.shape[0],legit.shape[1])) \n", 1913 | "print(\"The shape of malware database is %s samples and %s features\"%(mal.shape[0],mal.shape[1])) " 1914 | ] 1915 | }, 1916 | { 1917 | "cell_type": "code", 1918 | "execution_count": 16, 1919 | "id": "09e246a0", 1920 | "metadata": {}, 1921 | "outputs": [ 1922 | { 1923 | "name": "stdout", 1924 | "output_type": "stream", 1925 | "text": [ 1926 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n", 1927 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n", 1928 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n", 1929 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n", 1930 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n", 1931 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n", 1932 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n", 1933 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n", 1934 | " 'SizeOfStackReserve', 
'SizeOfStackCommit', 'SizeOfHeapReserve',\n", 1935 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n", 1936 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n", 1937 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n", 1938 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n", 1939 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n", 1940 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n", 1941 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n", 1942 | " 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n", 1943 | " 'VersionInformationSize', 'legitimate'],\n", 1944 | " dtype='object')\n", 1945 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n", 1946 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n", 1947 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n", 1948 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n", 1949 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n", 1950 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n", 1951 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n", 1952 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n", 1953 | " 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',\n", 1954 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n", 1955 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n", 1956 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n", 1957 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n", 1958 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n", 1959 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n", 1960 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n", 1961 | " 
'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n", 1962 | " 'VersionInformationSize', 'legitimate'],\n", 1963 | " dtype='object')\n" 1964 | ] 1965 | } 1966 | ], 1967 | "source": [ 1968 | "#list the features, i.e. the column names\n", 1969 | "print(dataset.columns) #note: legit and maldata had the legitimate column dropped, but mal (printed next) still contains it\n", 1970 | "print(mal.columns)" 1971 | ] 1972 | }, 1973 | { 1974 | "cell_type": "code", 1975 | "execution_count": 17, 1976 | "id": "55644b80", 1977 | "metadata": {}, 1978 | "outputs": [ 1979 | { 1980 | "data": { 1981 | "text/html": [ 1982 | "
\n", 1983 | "\n", 1996 | "\n", 1997 | " \n", 1998 | " \n", 1999 | " \n", 2000 | " \n", 2001 | " \n", 2002 | " \n", 2003 | " \n", 2004 | " \n", 2005 | " \n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | " \n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " 
\n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | " \n", 2179 | " \n", 2180 | " \n", 2181 | " \n", 2182 | " \n", 2183 | " \n", 2184 | " \n", 2185 | " \n", 2186 | " \n", 2187 | " \n", 2188 | " \n", 2189 | " \n", 2190 | " \n", 2191 | " \n", 2192 | " \n", 2193 | " \n", 2194 | " \n", 2195 | " \n", 2196 | " \n", 2197 | " \n", 2198 | " \n", 2199 | " \n", 2200 | " \n", 2201 | " \n", 2202 | " \n", 2203 | " \n", 2204 | " \n", 2205 | " \n", 2206 | " \n", 2207 | " \n", 2208 | " \n", 2209 | " \n", 2210 | " \n", 2211 | " \n", 2212 | " \n", 2213 | " \n", 2214 | " \n", 2215 | " \n", 2216 | " \n", 2217 | " \n", 2218 | " \n", 2219 | " \n", 2220 | " \n", 2221 | " \n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 | " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
41323VirusShare_4a400b747afe6547e09ce0b02dae7f1c4a400b747afe6547e09ce0b02dae7f1c3322242581103548162570240...73.9144151.4416887.6770917298.42857116284387200
41324VirusShare_9bd57c8252948bd2fa651ad372bd4f139bd57c8252948bd2fa651ad372bd4f1333222427160240641648641024...63.1991071.9713355.214816452.000000349580150
41325VirusShare_d1456165e9358b8f61f93a5f2042f39cd1456165e9358b8f61f93a5f2042f39c3322242581001187843819520...186.5309462.4584927.99268818523.444444483394572140
41326VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1e4214cc73afbba0f52bb72d5db8f8bb13322242581001745923000320...155.7323932.8523647.98772612706.1333331186050072140
41327VirusShare_710890c07b3f93b90635f8bff6c34605710890c07b3f93b90635f8bff6c34605332224258904756483486720...592.8278260.9609537.2123292637.03389820676247200
41328VirusShare_3c2eb01508703752dca01957ea451a403c2eb01508703752dca01957ea451a4033222425990157696624640...133.9432961.8144436.1220452708.153846132964072140
41329VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f3fb2d0ac00c5dff6c4fd5dfe6ba52c3f332224259838272499223060480...213.9874632.6421596.47370014288.00000076270376000
41330VirusShare_ad1ca9a4d572c0a2793c4cea29b20887ad1ca9a4d572c0a2793c4cea29b208873322242581001203203850240...63.7298242.4584925.3175522739.50000048964072150
41331VirusShare_7414edb3d0be66aa0816e6ed4b6b0a217414edb3d0be66aa0816e6ed4b6b0a2133222425910023398413777920...184.3283222.3232207.06841376158.2777789134273572190
41332VirusShare_e57b4f294c142d050a784b67e2cf1f2ee57b4f294c142d050a784b67e2cf1f2e33222427160491525611520...00.0000000.0000000.0000000.00000000000
\n", 2266 | "

10 rows × 57 columns

\n", 2267 | "
" 2268 | ], 2269 | "text/plain": [ 2270 | " Name \\\n", 2271 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n", 2272 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n", 2273 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n", 2274 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n", 2275 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n", 2276 | "41328 VirusShare_3c2eb01508703752dca01957ea451a40 \n", 2277 | "41329 VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f \n", 2278 | "41330 VirusShare_ad1ca9a4d572c0a2793c4cea29b20887 \n", 2279 | "41331 VirusShare_7414edb3d0be66aa0816e6ed4b6b0a21 \n", 2280 | "41332 VirusShare_e57b4f294c142d050a784b67e2cf1f2e \n", 2281 | "\n", 2282 | " md5 Machine SizeOfOptionalHeader \\\n", 2283 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n", 2284 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n", 2285 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n", 2286 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n", 2287 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n", 2288 | "41328 3c2eb01508703752dca01957ea451a40 332 224 \n", 2289 | "41329 3fb2d0ac00c5dff6c4fd5dfe6ba52c3f 332 224 \n", 2290 | "41330 ad1ca9a4d572c0a2793c4cea29b20887 332 224 \n", 2291 | "41331 7414edb3d0be66aa0816e6ed4b6b0a21 332 224 \n", 2292 | "41332 e57b4f294c142d050a784b67e2cf1f2e 332 224 \n", 2293 | "\n", 2294 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 2295 | "41323 258 11 0 354816 \n", 2296 | "41324 271 6 0 24064 \n", 2297 | "41325 258 10 0 118784 \n", 2298 | "41326 258 10 0 174592 \n", 2299 | "41327 258 9 0 475648 \n", 2300 | "41328 259 9 0 157696 \n", 2301 | "41329 259 83 82 724992 \n", 2302 | "41330 258 10 0 120320 \n", 2303 | "41331 259 10 0 233984 \n", 2304 | "41332 271 6 0 49152 \n", 2305 | "\n", 2306 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n", 2307 | "41323 257024 0 ... 7 \n", 2308 | "41324 164864 1024 ... 6 \n", 2309 | "41325 381952 0 ... 
18 \n", 2310 | "41326 300032 0 ... 15 \n", 2311 | "41327 348672 0 ... 59 \n", 2312 | "41328 62464 0 ... 13 \n", 2313 | "41329 2306048 0 ... 21 \n", 2314 | "41330 385024 0 ... 6 \n", 2315 | "41331 1377792 0 ... 18 \n", 2316 | "41332 561152 0 ... 0 \n", 2317 | "\n", 2318 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n", 2319 | "41323 3.914415 1.441688 7.677091 \n", 2320 | "41324 3.199107 1.971335 5.214816 \n", 2321 | "41325 6.530946 2.458492 7.992688 \n", 2322 | "41326 5.732393 2.852364 7.987726 \n", 2323 | "41327 2.827826 0.960953 7.212329 \n", 2324 | "41328 3.943296 1.814443 6.122045 \n", 2325 | "41329 3.987463 2.642159 6.473700 \n", 2326 | "41330 3.729824 2.458492 5.317552 \n", 2327 | "41331 4.328322 2.323220 7.068413 \n", 2328 | "41332 0.000000 0.000000 0.000000 \n", 2329 | "\n", 2330 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n", 2331 | "41323 7298.428571 16 28438 \n", 2332 | "41324 452.000000 34 958 \n", 2333 | "41325 18523.444444 48 33945 \n", 2334 | "41326 12706.133333 118 60500 \n", 2335 | "41327 2637.033898 20 67624 \n", 2336 | "41328 2708.153846 132 9640 \n", 2337 | "41329 14288.000000 76 270376 \n", 2338 | "41330 2739.500000 48 9640 \n", 2339 | "41331 76158.277778 9 1342735 \n", 2340 | "41332 0.000000 0 0 \n", 2341 | "\n", 2342 | " LoadConfigurationSize VersionInformationSize legitimate \n", 2343 | "41323 72 0 0 \n", 2344 | "41324 0 15 0 \n", 2345 | "41325 72 14 0 \n", 2346 | "41326 72 14 0 \n", 2347 | "41327 72 0 0 \n", 2348 | "41328 72 14 0 \n", 2349 | "41329 0 0 0 \n", 2350 | "41330 72 15 0 \n", 2351 | "41331 72 19 0 \n", 2352 | "41332 0 0 0 \n", 2353 | "\n", 2354 | "[10 rows x 57 columns]" 2355 | ] 2356 | }, 2357 | "execution_count": 17, 2358 | "metadata": {}, 2359 | "output_type": "execute_result" 2360 | } 2361 | ], 2362 | "source": [ 2363 | "#first 10 data points from malware database:\n", 2364 | "mal.head(10)" 2365 | ] 2366 | }, 2367 | { 2368 | "cell_type": "code", 2369 | "execution_count": 18, 2370 | "id": 
"d243486f", 2371 | "metadata": {}, 2372 | "outputs": [ 2373 | { 2374 | "data": { 2375 | "text/html": [ 2376 | "
\n", 2377 | "\n", 2390 | "\n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ExportNbResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSize
0memtest.exe631ea355665f28d4707448e442fbf5b8332224258903619841157120...043.2628232.5688443.5379398797.021618032016
\n", 2444 | "

1 rows × 56 columns

\n", 2445 | "
" 2446 | ], 2447 | "text/plain": [ 2448 | " Name md5 Machine \\\n", 2449 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n", 2450 | "\n", 2451 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n", 2452 | "0 224 258 9 \n", 2453 | "\n", 2454 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n", 2455 | "0 0 361984 115712 \n", 2456 | "\n", 2457 | " SizeOfUninitializedData ... ExportNb ResourcesNb ResourcesMeanEntropy \\\n", 2458 | "0 0 ... 0 4 3.262823 \n", 2459 | "\n", 2460 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n", 2461 | "0 2.568844 3.537939 8797.0 \n", 2462 | "\n", 2463 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n", 2464 | "0 216 18032 0 \n", 2465 | "\n", 2466 | " VersionInformationSize \n", 2467 | "0 16 \n", 2468 | "\n", 2469 | "[1 rows x 56 columns]" 2470 | ] 2471 | }, 2472 | "execution_count": 18, 2473 | "metadata": {}, 2474 | "output_type": "execute_result" 2475 | } 2476 | ], 2477 | "source": [ 2478 | "#datapoint of legit to have a good comparison \n", 2479 | "legit.take([0]) #1st datapoint" 2480 | ] 2481 | }, 2482 | { 2483 | "cell_type": "code", 2484 | "execution_count": 19, 2485 | "id": "fd741615", 2486 | "metadata": {}, 2487 | "outputs": [ 2488 | { 2489 | "data": { 2490 | "text/html": [ 2491 | "
\n", 2492 | "\n", 2505 | "\n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | " \n", 2527 | " \n", 2528 | " \n", 2529 | " \n", 2530 | " \n", 2531 | " \n", 2532 | " \n", 2533 | " \n", 2534 | " \n", 2535 | " \n", 2536 | " \n", 2537 | " \n", 2538 | " \n", 2539 | " \n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " \n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
41323VirusShare_4a400b747afe6547e09ce0b02dae7f1c4a400b747afe6547e09ce0b02dae7f1c3322242581103548162570240...73.9144151.4416887.6770917298.42857116284387200
\n", 2559 | "

1 rows × 57 columns

\n", 2560 | "
" 2561 | ], 2562 | "text/plain": [ 2563 | " Name \\\n", 2564 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n", 2565 | "\n", 2566 | " md5 Machine SizeOfOptionalHeader \\\n", 2567 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n", 2568 | "\n", 2569 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 2570 | "41323 258 11 0 354816 \n", 2571 | "\n", 2572 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n", 2573 | "41323 257024 0 ... 7 \n", 2574 | "\n", 2575 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n", 2576 | "41323 3.914415 1.441688 7.677091 \n", 2577 | "\n", 2578 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n", 2579 | "41323 7298.428571 16 28438 \n", 2580 | "\n", 2581 | " LoadConfigurationSize VersionInformationSize legitimate \n", 2582 | "41323 72 0 0 \n", 2583 | "\n", 2584 | "[1 rows x 57 columns]" 2585 | ] 2586 | }, 2587 | "execution_count": 19, 2588 | "metadata": {}, 2589 | "output_type": "execute_result" 2590 | } 2591 | ], 2592 | "source": [ 2593 | "#datapoint of malware to have a good comparison \n", 2594 | "mal.take([0]) #1st datapoint" 2595 | ] 2596 | }, 2597 | { 2598 | "cell_type": "code", 2599 | "execution_count": 20, 2600 | "id": "4dd1e87b", 2601 | "metadata": {}, 2602 | "outputs": [], 2603 | "source": [ 2604 | "# Feature Extraction" 2605 | ] 2606 | }, 2607 | { 2608 | "cell_type": "code", 2609 | "execution_count": 21, 2610 | "id": "d1ecc40f", 2611 | "metadata": {}, 2612 | "outputs": [], 2613 | "source": [ 2614 | "x=dataset.drop(['Name','md5','legitimate'],axis=1).values #independent features\n", 2615 | "y=dataset['legitimate'].values #dependent variable" 2616 | ] 2617 | }, 2618 | { 2619 | "cell_type": "code", 2620 | "execution_count": 22, 2621 | "id": "e26ddd4d", 2622 | "metadata": {}, 2623 | "outputs": [], 2624 | "source": [ 2625 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n", 2626 | "model=SelectFromModel(extratrees,prefit=True)\n", 2627 | 
"x_new=model.transform(x)\n", 2628 | "nbfeatures=x_new.shape[1]" 2629 | ] 2630 | }, 2631 | { 2632 | "cell_type": "code", 2633 | "execution_count": 23, 2634 | "id": "3306769b", 2635 | "metadata": {}, 2636 | "outputs": [ 2637 | { 2638 | "data": { 2639 | "text/plain": [ 2640 | "14" 2641 | ] 2642 | }, 2643 | "execution_count": 23, 2644 | "metadata": {}, 2645 | "output_type": "execute_result" 2646 | } 2647 | ], 2648 | "source": [ 2649 | "nbfeatures" 2650 | ] 2651 | }, 2652 | { 2653 | "cell_type": "code", 2654 | "execution_count": 24, 2655 | "id": "a1bc47cc", 2656 | "metadata": {}, 2657 | "outputs": [ 2658 | { 2659 | "data": { 2660 | "text/plain": [ 2661 | "([,\n", 2662 | " ],\n", 2663 | " [Text(0.7884607600756525, 0.7670264857362649, 'Important Features'),\n", 2664 | " Text(-0.7884607959827531, -0.7670264488257517, 'Not Important Features')],\n", 2665 | " [Text(0.4300695054958104, 0.4183780831288717, '25%'),\n", 2666 | " Text(-0.43006952508150165, -0.4183780629958645, '75%')])" 2667 | ] 2668 | }, 2669 | "execution_count": 24, 2670 | "metadata": {}, 2671 | "output_type": "execute_result" 2672 | }, 2673 | { 2674 | "data": { 2675 | "image/png": "\n", 2676 | "text/plain": [ 2677 | "
" 2678 | ] 2679 | }, 2680 | "metadata": {}, 2681 | "output_type": "display_data" 2682 | } 2683 | ], 2684 | "source": [ 2685 | "dataset.columns.size\n", 2686 | "imp_features_visual=['Important Features','Not Important Features']\n", 2687 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n", 2688 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')" 2689 | ] 2690 | }, 2691 | { 2692 | "cell_type": "code", 2693 | "execution_count": 25, 2694 | "id": "65c60c3c", 2695 | "metadata": {}, 2696 | "outputs": [], 2697 | "source": [ 2698 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)" 2699 | ] 2700 | }, 2701 | { 2702 | "cell_type": "code", 2703 | "execution_count": 26, 2704 | "id": "bc4902c0", 2705 | "metadata": {}, 2706 | "outputs": [], 2707 | "source": [ 2708 | "features=[]\n", 2709 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]" 2710 | ] 2711 | }, 2712 | { 2713 | "cell_type": "code", 2714 | "execution_count": 27, 2715 | "id": "d12c9fbb", 2716 | "metadata": {}, 2717 | "outputs": [ 2718 | { 2719 | "name": "stdout", 2720 | "output_type": "stream", 2721 | "text": [ 2722 | "1. feature LoaderFlags (0.000003)\n", 2723 | "2. feature NumberOfRvaAndSizes (0.000049)\n", 2724 | "3. feature SizeOfHeapCommit (0.000331)\n", 2725 | "4. feature BaseOfCode (0.000807)\n", 2726 | "5. feature SizeOfUninitializedData (0.000878)\n", 2727 | "6. feature ResourcesMeanSize (0.001154)\n", 2728 | "7. feature BaseOfData (0.001165)\n", 2729 | "8. feature ResourcesMaxSize (0.001197)\n", 2730 | "9. feature SectionsMeanVirtualsize (0.001212)\n", 2731 | "10. feature SizeOfImage (0.001226)\n", 2732 | "11. feature SectionMaxRawsize (0.001275)\n", 2733 | "12. feature SizeOfInitializedData (0.001280)\n", 2734 | "13. feature SectionMaxVirtualsize (0.001295)\n", 2735 | "14. feature SectionsMeanRawsize (0.001400)\n" 2736 | ] 2737 | } 2738 | ], 2739 | "source": [ 2740 | "for f in range(nbfeatures):\n", 2741 | " print(\"%d. 
feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n", 2742 | " features.append(dataset.columns[2+f])" 2743 | ] 2744 | }, 2745 | { 2746 | "cell_type": "code", 2747 | "execution_count": 28, 2748 | "id": "ffe54b76", 2749 | "metadata": {}, 2750 | "outputs": [], 2751 | "source": [ 2752 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n", 2753 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n", 2754 | " \"LogisticRegression\":LogisticRegression()\n", 2755 | " }" 2756 | ] 2757 | }, 2758 | { 2759 | "cell_type": "code", 2760 | "execution_count": 29, 2761 | "id": "0ab0113e", 2762 | "metadata": {}, 2763 | "outputs": [ 2764 | { 2765 | "name": "stdout", 2766 | "output_type": "stream", 2767 | "text": [ 2768 | "RandomForest : 0.9940963419051069\n", 2769 | "DecisionTree : 0.9900760593987685\n", 2770 | "LogisticRegression : 0.6964505613908004\n" 2771 | ] 2772 | }, 2773 | { 2774 | "name": "stderr", 2775 | "output_type": "stream", 2776 | "text": [ 2777 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n", 2778 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n", 2779 | "\n", 2780 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 2781 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 2782 | "Please also refer to the documentation for alternative solver options:\n", 2783 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 2784 | " n_iter_i = _check_optimize_result(\n" 2785 | ] 2786 | } 2787 | ], 2788 | "source": [ 2789 | "results={}\n", 2790 | "for algo in model:\n", 2791 | " clf=model[algo]\n", 2792 | " clf.fit(x_train,y_train)\n", 2793 | " score=clf.score(x_test,y_test)\n", 2794 | " print(\"%s : %s\"%(algo,score))\n", 2795 | " results[algo]=score" 2796 | ] 2797 | }, 2798 | { 2799 | "cell_type": "code", 2800 | "execution_count": 30, 
2801 | "id": "4189adb6", 2802 | "metadata": {}, 2803 | "outputs": [ 2804 | { 2805 | "data": { 2806 | "text/plain": [ 2807 | "'RandomForest'" 2808 | ] 2809 | }, 2810 | "execution_count": 30, 2811 | "metadata": {}, 2812 | "output_type": "execute_result" 2813 | } 2814 | ], 2815 | "source": [ 2816 | "winner=max(results,key=results.get)\n", 2817 | "winner" 2818 | ] 2819 | }, 2820 | { 2821 | "cell_type": "code", 2822 | "execution_count": 31, 2823 | "id": "4fb1c869", 2824 | "metadata": {}, 2825 | "outputs": [ 2826 | { 2827 | "name": "stdout", 2828 | "output_type": "stream", 2829 | "text": [ 2830 | "False positive rate : 0.114760 %\n", 2831 | "False negative rate : 0.162137 %\n" 2832 | ] 2833 | } 2834 | ], 2835 | "source": [ 2836 | "clf=model[winner]\n", 2837 | "res=clf.predict(x_new)\n", 2838 | "mt=confusion_matrix(y,res)\n", 2839 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n", 2840 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))" 2841 | ] 2842 | }, 2843 | { 2844 | "cell_type": "code", 2845 | "execution_count": 32, 2846 | "id": "93c1012f", 2847 | "metadata": {}, 2848 | "outputs": [], 2849 | "source": [ 2850 | "# Check for Multicollinearity" 2851 | ] 2852 | }, 2853 | { 2854 | "cell_type": "code", 2855 | "execution_count": 33, 2856 | "id": "978fd992", 2857 | "metadata": {}, 2858 | "outputs": [ 2859 | { 2860 | "data": { 2861 | "text/html": [ 2862 | "
\n", 2863 | "\n", 2876 | "\n", 2877 | " \n", 2878 | " \n", 2879 | " \n", 2880 | " \n", 2881 | " \n", 2882 | " \n", 2883 | " \n", 2884 | " \n", 2885 | " \n", 2886 | " \n", 2887 | " \n", 2888 | " \n", 2889 | " \n", 2890 | " \n", 2891 | " \n", 2892 | " \n", 2893 | " \n", 2894 | " \n", 2895 | " \n", 2896 | " \n", 2897 | " \n", 2898 | " \n", 2899 | " \n", 2900 | " \n", 2901 | " \n", 2902 | " \n", 2903 | " \n", 2904 | " \n", 2905 | " \n", 2906 | " \n", 2907 | " \n", 2908 | " \n", 2909 | " \n", 2910 | " \n", 2911 | " \n", 2912 | " \n", 2913 | " \n", 2914 | " \n", 2915 | " \n", 2916 | " \n", 2917 | " \n", 2918 | " \n", 2919 | " \n", 2920 | " \n", 2921 | " \n", 2922 | " \n", 2923 | " \n", 2924 | " \n", 2925 | " \n", 2926 | " \n", 2927 | " \n", 2928 | " \n", 2929 | " \n", 2930 | " \n", 2931 | " \n", 2932 | " \n", 2933 | " \n", 2934 | " \n", 2935 | " \n", 2936 | " \n", 2937 | " \n", 2938 | " \n", 2939 | " \n", 2940 | " \n", 2941 | " \n", 2942 | " \n", 2943 | " \n", 2944 | " \n", 2945 | " \n", 2946 | " \n", 2947 | " \n", 2948 | " \n", 2949 | " \n", 2950 | " \n", 2951 | " \n", 2952 | " \n", 2953 | " \n", 2954 | " \n", 2955 | " \n", 2956 | " \n", 2957 | " \n", 2958 | " \n", 2959 | " \n", 2960 | " \n", 2961 | " \n", 2962 | " \n", 2963 | " \n", 2964 | " \n", 2965 | " \n", 2966 | " \n", 2967 | " \n", 2968 | " \n", 2969 | " \n", 2970 | " \n", 2971 | " \n", 2972 | " \n", 2973 | " \n", 2974 | " \n", 2975 | " \n", 2976 | " \n", 2977 | " \n", 2978 | " \n", 2979 | " \n", 2980 | " \n", 2981 | " \n", 2982 | " \n", 2983 | " \n", 2984 | " \n", 2985 | " \n", 2986 | " \n", 2987 | " \n", 2988 | " \n", 2989 | " \n", 2990 | " \n", 2991 | " \n", 2992 | " \n", 2993 | " \n", 2994 | " \n", 2995 | " \n", 2996 | " \n", 2997 | " \n", 2998 | " \n", 2999 | " \n", 3000 | " \n", 3001 | " \n", 3002 | " \n", 3003 | " \n", 3004 | " \n", 3005 | " \n", 3006 | " \n", 3007 | " \n", 3008 | " \n", 3009 | " \n", 3010 | " \n", 3011 | " \n", 3012 | " \n", 3013 | " \n", 3014 | " \n", 3015 | " \n", 3016 | " \n", 3017 | " 
\n", 3018 | " \n", 3019 | " \n", 3020 | " \n", 3021 | " \n", 3022 | " \n", 3023 | " \n", 3024 | " \n", 3025 | " \n", 3026 | " \n", 3027 | " \n", 3028 | " \n", 3029 | " \n", 3030 | " \n", 3031 | " \n", 3032 | " \n", 3033 | " \n", 3034 | " \n", 3035 | " \n", 3036 | " \n", 3037 | " \n", 3038 | " \n", 3039 | " \n", 3040 | " \n", 3041 | " \n", 3042 | " \n", 3043 | " \n", 3044 | " \n", 3045 | " \n", 3046 | " \n", 3047 | " \n", 3048 | " \n", 3049 | " \n", 3050 | " \n", 3051 | " \n", 3052 | " \n", 3053 | " \n", 3054 | " \n", 3055 | " \n", 3056 | " \n", 3057 | " \n", 3058 | " \n", 3059 | " \n", 3060 | " \n", 3061 | " \n", 3062 | " \n", 3063 | " \n", 3064 | " \n", 3065 | " \n", 3066 | " \n", 3067 | " \n", 3068 | " \n", 3069 | " \n", 3070 | " \n", 3071 | " \n", 3072 | " \n", 3073 | " \n", 3074 | " \n", 3075 | " \n", 3076 | " \n", 3077 | " \n", 3078 | " \n", 3079 | " \n", 3080 | " \n", 3081 | " \n", 3082 | " \n", 3083 | " \n", 3084 | " \n", 3085 | " \n", 3086 | " \n", 3087 | " \n", 3088 | " \n", 3089 | " \n", 3090 | " \n", 3091 | " \n", 3092 | " \n", 3093 | " \n", 3094 | " \n", 3095 | " \n", 3096 | " \n", 3097 | " \n", 3098 | " \n", 3099 | " \n", 3100 | " \n", 3101 | " \n", 3102 | " \n", 3103 | " \n", 3104 | " \n", 3105 | " \n", 3106 | " \n", 3107 | " \n", 3108 | " \n", 3109 | " \n", 3110 | " \n", 3111 | " \n", 3112 | " \n", 3113 | " \n", 3114 | " \n", 3115 | " \n", 3116 | " \n", 3117 | " \n", 3118 | " \n", 3119 | " \n", 3120 | " \n", 3121 | " \n", 3122 | " \n", 3123 | " \n", 3124 | " \n", 3125 | " \n", 3126 | " \n", 3127 | " \n", 3128 | " \n", 3129 | " \n", 3130 | " \n", 3131 | " \n", 3132 | " \n", 3133 | " \n", 3134 | " \n", 3135 | " \n", 3136 | " \n", 3137 | " \n", 3138 | " \n", 3139 | " \n", 3140 | " \n", 3141 | " \n", 3142 | " \n", 3143 | " \n", 3144 | " \n", 3145 | " \n", 3146 | " \n", 3147 | " \n", 3148 | " \n", 3149 | " \n", 3150 | " \n", 3151 | " \n", 3152 | " \n", 3153 | " \n", 3154 | " \n", 3155 | " \n", 3156 | " \n", 3157 | " \n", 3158 | " \n", 3159 | " \n", 3160 | 
" \n", 3161 | " \n", 3162 | " \n", 3163 | " \n", 3164 | " \n", 3165 | " \n", 3166 | " \n", 3167 | " \n", 3168 | " \n", 3169 | "
MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedDataAddressOfEntryPointBaseOfCode...ExportNbResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSize
033222425890361984115712061354096...043.2628232.5688443.5379398797.00000021618032016
1332224333090130560199680817784096...024.2504613.4207445.080177837.00000051811567218
233222433309051712062156803508964096...1114.4263242.8464495.27181331102.2727271042703767218
33322242589058572836915204512584096...1104.3642912.6693146.4007201457.0000009042647218
43322242589029491224729602173814096...124.3061003.4215985.1906031074.50000084913007218
..................................................................
13804233222425811020582422374401232914096...074.1227361.3702607.67709114900.7142861681654720
13804333222433167225378881853440400004096...0263.3776632.0316195.0500746905.8461544467624015
1380443322242581001182723804160596104096...0226.8254062.6170267.99048714981.90909148226487214
1380453322243316622549152168960512164096...0103.4216272.0609644.739744601.60000016221600
1380463322242581101116164684800227314096...044.4072521.9804826.11537496625.00000020318464720
\n", 3170 | "

138047 rows × 54 columns

\n", 3171 | "
" 3172 | ], 3173 | "text/plain": [ 3174 | " Machine SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n", 3175 | "0 332 224 258 9 \n", 3176 | "1 332 224 3330 9 \n", 3177 | "2 332 224 3330 9 \n", 3178 | "3 332 224 258 9 \n", 3179 | "4 332 224 258 9 \n", 3180 | "... ... ... ... ... \n", 3181 | "138042 332 224 258 11 \n", 3182 | "138043 332 224 33167 2 \n", 3183 | "138044 332 224 258 10 \n", 3184 | "138045 332 224 33166 2 \n", 3185 | "138046 332 224 258 11 \n", 3186 | "\n", 3187 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n", 3188 | "0 0 361984 115712 \n", 3189 | "1 0 130560 19968 \n", 3190 | "2 0 517120 621568 \n", 3191 | "3 0 585728 369152 \n", 3192 | "4 0 294912 247296 \n", 3193 | "... ... ... ... \n", 3194 | "138042 0 205824 223744 \n", 3195 | "138043 25 37888 185344 \n", 3196 | "138044 0 118272 380416 \n", 3197 | "138045 25 49152 16896 \n", 3198 | "138046 0 111616 468480 \n", 3199 | "\n", 3200 | " SizeOfUninitializedData AddressOfEntryPoint BaseOfCode ... \\\n", 3201 | "0 0 6135 4096 ... \n", 3202 | "1 0 81778 4096 ... \n", 3203 | "2 0 350896 4096 ... \n", 3204 | "3 0 451258 4096 ... \n", 3205 | "4 0 217381 4096 ... \n", 3206 | "... ... ... ... ... \n", 3207 | "138042 0 123291 4096 ... \n", 3208 | "138043 0 40000 4096 ... \n", 3209 | "138044 0 59610 4096 ... \n", 3210 | "138045 0 51216 4096 ... \n", 3211 | "138046 0 22731 4096 ... \n", 3212 | "\n", 3213 | " ExportNb ResourcesNb ResourcesMeanEntropy ResourcesMinEntropy \\\n", 3214 | "0 0 4 3.262823 2.568844 \n", 3215 | "1 0 2 4.250461 3.420744 \n", 3216 | "2 1 11 4.426324 2.846449 \n", 3217 | "3 1 10 4.364291 2.669314 \n", 3218 | "4 1 2 4.306100 3.421598 \n", 3219 | "... ... ... ... ... 
\n", 3220 | "138042 0 7 4.122736 1.370260 \n", 3221 | "138043 0 26 3.377663 2.031619 \n", 3222 | "138044 0 22 6.825406 2.617026 \n", 3223 | "138045 0 10 3.421627 2.060964 \n", 3224 | "138046 0 4 4.407252 1.980482 \n", 3225 | "\n", 3226 | " ResourcesMaxEntropy ResourcesMeanSize ResourcesMinSize \\\n", 3227 | "0 3.537939 8797.000000 216 \n", 3228 | "1 5.080177 837.000000 518 \n", 3229 | "2 5.271813 31102.272727 104 \n", 3230 | "3 6.400720 1457.000000 90 \n", 3231 | "4 5.190603 1074.500000 849 \n", 3232 | "... ... ... ... \n", 3233 | "138042 7.677091 14900.714286 16 \n", 3234 | "138043 5.050074 6905.846154 44 \n", 3235 | "138044 7.990487 14981.909091 48 \n", 3236 | "138045 4.739744 601.600000 16 \n", 3237 | "138046 6.115374 96625.000000 20 \n", 3238 | "\n", 3239 | " ResourcesMaxSize LoadConfigurationSize VersionInformationSize \n", 3240 | "0 18032 0 16 \n", 3241 | "1 1156 72 18 \n", 3242 | "2 270376 72 18 \n", 3243 | "3 4264 72 18 \n", 3244 | "4 1300 72 18 \n", 3245 | "... ... ... ... \n", 3246 | "138042 81654 72 0 \n", 3247 | "138043 67624 0 15 \n", 3248 | "138044 22648 72 14 \n", 3249 | "138045 2216 0 0 \n", 3250 | "138046 318464 72 0 \n", 3251 | "\n", 3252 | "[138047 rows x 54 columns]" 3253 | ] 3254 | }, 3255 | "execution_count": 33, 3256 | "metadata": {}, 3257 | "output_type": "execute_result" 3258 | } 3259 | ], 3260 | "source": [ 3261 | "mc=dataset.drop([\"Name\",'md5','legitimate'],axis=1) #independent features\n", 3262 | "mc" 3263 | ] 3264 | }, 3265 | { 3266 | "cell_type": "code", 3267 | "execution_count": 34, 3268 | "id": "6adb0606", 3269 | "metadata": {}, 3270 | "outputs": [ 3271 | { 3272 | "name": "stdout", 3273 | "output_type": "stream", 3274 | "text": [ 3275 | "Variance Inflation Factor for Machine: 1.19\n", 3276 | "Variance Inflation Factor for SizeOfOptionalHeader: 0.02\n", 3277 | "Variance Inflation Factor for Characteristics: 1.43\n", 3278 | "Variance Inflation Factor for MajorLinkerVersion: 1.19\n", 3279 | "Variance Inflation Factor for 
MinorLinkerVersion: 1.5\n", 3280 | "Variance Inflation Factor for SizeOfCode: 5.13\n", 3281 | "Variance Inflation Factor for SizeOfInitializedData: 1.57\n", 3282 | "Variance Inflation Factor for SizeOfUninitializedData: 1.0\n", 3283 | "Variance Inflation Factor for AddressOfEntryPoint: 1.07\n", 3284 | "Variance Inflation Factor for BaseOfCode: 4.27\n", 3285 | "Variance Inflation Factor for BaseOfData: 1.92\n", 3286 | "Variance Inflation Factor for ImageBase: 1.0\n", 3287 | "Variance Inflation Factor for SectionAlignment: 2.06\n", 3288 | "Variance Inflation Factor for FileAlignment: 1.09\n", 3289 | "Variance Inflation Factor for MajorOperatingSystemVersion: 1.0\n", 3290 | "Variance Inflation Factor for MinorOperatingSystemVersion: 4.16\n", 3291 | "Variance Inflation Factor for MajorImageVersion: 203.26\n", 3292 | "Variance Inflation Factor for MinorImageVersion: 186.8\n", 3293 | "Variance Inflation Factor for MajorSubsystemVersion: 0.6\n", 3294 | "Variance Inflation Factor for MinorSubsystemVersion: 17345.88\n", 3295 | "Variance Inflation Factor for SizeOfImage: 2.86\n", 3296 | "Variance Inflation Factor for SizeOfHeaders: 1.05\n", 3297 | "Variance Inflation Factor for CheckSum: 1.04\n", 3298 | "Variance Inflation Factor for Subsystem: 0.65\n", 3299 | "Variance Inflation Factor for DllCharacteristics: 1.63\n", 3300 | "Variance Inflation Factor for SizeOfStackReserve: 1.31\n", 3301 | "Variance Inflation Factor for SizeOfStackCommit: 1.03\n", 3302 | "Variance Inflation Factor for SizeOfHeapReserve: 0.57\n", 3303 | "Variance Inflation Factor for SizeOfHeapCommit: 140.51\n", 3304 | "Variance Inflation Factor for LoaderFlags: 143.64\n", 3305 | "Variance Inflation Factor for NumberOfRvaAndSizes: 4.65\n", 3306 | "Variance Inflation Factor for SectionsNb: 1.15\n", 3307 | "Variance Inflation Factor for SectionsMeanEntropy: 1.03\n", 3308 | "Variance Inflation Factor for SectionsMinEntropy: 1.18\n", 3309 | "Variance Inflation Factor for SectionsMaxEntropy: 0.7\n", 3310 | 
"Variance Inflation Factor for SectionsMeanRawsize: 30.3\n", 3311 | "Variance Inflation Factor for SectionsMinRawsize: 619.0\n", 3312 | "Variance Inflation Factor for SectionMaxRawsize: 26.68\n", 3313 | "Variance Inflation Factor for SectionsMeanVirtualsize: 138.58\n", 3314 | "Variance Inflation Factor for SectionsMinVirtualsize: 622.11\n", 3315 | "Variance Inflation Factor for SectionMaxVirtualsize: 146.14\n", 3316 | "Variance Inflation Factor for ImportsNbDLL: 1.42\n", 3317 | "Variance Inflation Factor for ImportsNb: 1.2\n", 3318 | "Variance Inflation Factor for ImportsNbOrdinal: 1.28\n", 3319 | "Variance Inflation Factor for ExportNb: 1.06\n", 3320 | "Variance Inflation Factor for ResourcesNb: 1.24\n", 3321 | "Variance Inflation Factor for ResourcesMeanEntropy: 0.89\n", 3322 | "Variance Inflation Factor for ResourcesMinEntropy: 0.88\n", 3323 | "Variance Inflation Factor for ResourcesMaxEntropy: 1.16\n", 3324 | "Variance Inflation Factor for ResourcesMeanSize: 13.04\n", 3325 | "Variance Inflation Factor for ResourcesMinSize: 7.14\n", 3326 | "Variance Inflation Factor for ResourcesMaxSize: 4.39\n", 3327 | "Variance Inflation Factor for LoadConfigurationSize: 1.0\n" 3328 | ] 3329 | } 3330 | ], 3331 | "source": [ 3332 | "for i in range(len(mc.columns[:-1])):\n", 3333 | " v=vif(np.matrix(mc[:-1]),i)\n", 3334 | " print(\"Variance Inflation Factor for {}: {}\".format(mc.columns[i],round(v,2)))" 3335 | ] 3336 | }, 3337 | { 3338 | "cell_type": "code", 3339 | "execution_count": 35, 3340 | "id": "5cc83368", 3341 | "metadata": {}, 3342 | "outputs": [ 3343 | { 3344 | "name": "stdout", 3345 | "output_type": "stream", 3346 | "text": [ 3347 | "Variance Inflation Factor for MajorImageVersion : 203.26\n", 3348 | "Variance Inflation Factor for MinorImageVersion : 186.8\n", 3349 | "Variance Inflation Factor for MinorSubsystemVersion : 17345.88\n", 3350 | "Variance Inflation Factor for SizeOfHeapCommit : 140.51\n", 3351 | "Variance Inflation Factor for LoaderFlags : 143.64\n", 3352 
| "Variance Inflation Factor for SectionsMeanRawsize : 30.3\n", 3353 | "Variance Inflation Factor for SectionsMinRawsize : 619.0\n", 3354 | "Variance Inflation Factor for SectionMaxRawsize : 26.68\n", 3355 | "Variance Inflation Factor for SectionsMeanVirtualsize : 138.58\n", 3356 | "Variance Inflation Factor for SectionsMinVirtualsize : 622.11\n", 3357 | "Variance Inflation Factor for SectionMaxVirtualsize : 146.14\n", 3358 | "Variance Inflation Factor for ResourcesMeanSize : 13.04\n", 3359 | "12\n" 3360 | ] 3361 | } 3362 | ], 3363 | "source": [ 3364 | "count=0\n", 3365 | "for i in range(len(mc.columns[:-1])):\n", 3366 | " v=vif(np.matrix(mc[:-1]),i)\n", 3367 | " if v>10:\n", 3368 | " print(\"Variance Inflation Factor for {} : {}\".format(mc.columns[i],round(v,2)))\n", 3369 | " count=count+1\n", 3370 | "print(count) " 3371 | ] 3372 | }, 3373 | { 3374 | "cell_type": "code", 3375 | "execution_count": 38, 3376 | "id": "28be04aa", 3377 | "metadata": {}, 3378 | "outputs": [], 3379 | "source": [ 3380 | "# Remove Multicollinearity" 3381 | ] 3382 | }, 3383 | { 3384 | "cell_type": "code", 3385 | "execution_count": 39, 3386 | "id": "57f8e502", 3387 | "metadata": {}, 3388 | "outputs": [], 3389 | "source": [ 3390 | "x=dataset.drop(['Name','md5','legitimate','MajorImageVersion','MinorImageVersion','MinorSubsystemVersion','SizeOfHeapCommit','LoaderFlags','SectionsMeanRawsize','SectionsMeanVirtualsize','ResourcesMeanSize'],axis=1).values\n", 3391 | "y=dataset['legitimate'].values #dependent variable" 3392 | ] 3393 | }, 3394 | { 3395 | "cell_type": "code", 3396 | "execution_count": 40, 3397 | "id": "74d1f980", 3398 | "metadata": {}, 3399 | "outputs": [], 3400 | "source": [ 3401 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n", 3402 | "model=SelectFromModel(extratrees,prefit=True)\n", 3403 | "x_new=model.transform(x)\n", 3404 | "nbfeatures=x_new.shape[1]" 3405 | ] 3406 | }, 3407 | { 3408 | "cell_type": "code", 3409 | "execution_count": 41, 3410 | "id": "9a959102", 3411 | 
"metadata": {}, 3412 | "outputs": [ 3413 | { 3414 | "data": { 3415 | "text/plain": [ 3416 | "12" 3417 | ] 3418 | }, 3419 | "execution_count": 41, 3420 | "metadata": {}, 3421 | "output_type": "execute_result" 3422 | } 3423 | ], 3424 | "source": [ 3425 | "nbfeatures" 3426 | ] 3427 | }, 3428 | { 3429 | "cell_type": "code", 3430 | "execution_count": 42, 3431 | "id": "3cbf22c4", 3432 | "metadata": {}, 3433 | "outputs": [ 3434 | { 3435 | "data": { 3436 | "text/plain": [ 3437 | "([,\n", 3438 | " ],\n", 3439 | " [Text(0.8680545570066952, 0.675633988236168, 'Important Features'),\n", 3440 | " Text(-0.8680544937492721, -0.675634069509298, 'Not Important Features')],\n", 3441 | " [Text(0.4734843038218337, 0.3685276299470007, '21%'),\n", 3442 | " Text(-0.47348426931778476, -0.36852767427779887, '79%')])" 3443 | ] 3444 | }, 3445 | "execution_count": 42, 3446 | "metadata": {}, 3447 | "output_type": "execute_result" 3448 | }, 3449 | { 3450 | "data": { 3451 | "image/png": "\n", 3452 | "text/plain": [ 3453 | "
" 3454 | ] 3455 | }, 3456 | "metadata": {}, 3457 | "output_type": "display_data" 3458 | } 3459 | ], 3460 | "source": [ 3461 | "dataset.columns.size\n", 3462 | "imp_features_visual=['Important Features','Not Important Features']\n", 3463 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n", 3464 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')" 3465 | ] 3466 | }, 3467 | { 3468 | "cell_type": "code", 3469 | "execution_count": 43, 3470 | "id": "b018fe70", 3471 | "metadata": {}, 3472 | "outputs": [], 3473 | "source": [ 3474 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)" 3475 | ] 3476 | }, 3477 | { 3478 | "cell_type": "code", 3479 | "execution_count": 44, 3480 | "id": "3d0336b6", 3481 | "metadata": {}, 3482 | "outputs": [], 3483 | "source": [ 3484 | "features=[]\n", 3485 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]" 3486 | ] 3487 | }, 3488 | { 3489 | "cell_type": "code", 3490 | "execution_count": 45, 3491 | "id": "bc732e5b", 3492 | "metadata": {}, 3493 | "outputs": [ 3494 | { 3495 | "name": "stdout", 3496 | "output_type": "stream", 3497 | "text": [ 3498 | "1. feature SizeOfStackReserve (0.000042)\n", 3499 | "2. feature SizeOfUninitializedData (0.000998)\n", 3500 | "3. feature BaseOfCode (0.001076)\n", 3501 | "4. feature SizeOfInitializedData (0.001339)\n", 3502 | "5. feature MinorImageVersion (0.001343)\n", 3503 | "6. feature SectionsNb (0.001391)\n", 3504 | "7. feature SectionsMinEntropy (0.001397)\n", 3505 | "8. feature BaseOfData (0.001520)\n", 3506 | "9. feature ImportsNbOrdinal (0.001540)\n", 3507 | "10. feature DllCharacteristics (0.001577)\n", 3508 | "11. feature SizeOfCode (0.001702)\n", 3509 | "12. feature AddressOfEntryPoint (0.002303)\n" 3510 | ] 3511 | } 3512 | ], 3513 | "source": [ 3514 | "for f in range(nbfeatures):\n", 3515 | " print(\"%d. 
feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n", 3516 | " features.append(dataset.columns[2+f])" 3517 | ] 3518 | }, 3519 | { 3520 | "cell_type": "code", 3521 | "execution_count": 46, 3522 | "id": "8019777c", 3523 | "metadata": {}, 3524 | "outputs": [], 3525 | "source": [ 3526 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n", 3527 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n", 3528 | " \"LogisticRegression\":LogisticRegression()\n", 3529 | " }" 3530 | ] 3531 | }, 3532 | { 3533 | "cell_type": "code", 3534 | "execution_count": 47, 3535 | "id": "26424604", 3536 | "metadata": {}, 3537 | "outputs": [ 3538 | { 3539 | "name": "stdout", 3540 | "output_type": "stream", 3541 | "text": [ 3542 | "RandomForest : 0.9942774357116987\n", 3543 | "DecisionTree : 0.9903295907279971\n", 3544 | "LogisticRegression : 0.6968489677653025\n" 3545 | ] 3546 | }, 3547 | { 3548 | "name": "stderr", 3549 | "output_type": "stream", 3550 | "text": [ 3551 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n", 3552 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n", 3553 | "\n", 3554 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n", 3555 | " https://scikit-learn.org/stable/modules/preprocessing.html\n", 3556 | "Please also refer to the documentation for alternative solver options:\n", 3557 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", 3558 | " n_iter_i = _check_optimize_result(\n" 3559 | ] 3560 | } 3561 | ], 3562 | "source": [ 3563 | "results={}\n", 3564 | "for algo in model:\n", 3565 | " clf=model[algo]\n", 3566 | " clf.fit(x_train,y_train)\n", 3567 | " score=clf.score(x_test,y_test)\n", 3568 | " print(\"%s : %s\"%(algo,score))\n", 3569 | " results[algo]=score" 3570 | ] 3571 | }, 3572 | { 3573 | "cell_type": "code", 3574 | "execution_count": 48, 
3575 | "id": "295df33e", 3576 | "metadata": {}, 3577 | "outputs": [ 3578 | { 3579 | "data": { 3580 | "text/plain": [ 3581 | "'RandomForest'" 3582 | ] 3583 | }, 3584 | "execution_count": 48, 3585 | "metadata": {}, 3586 | "output_type": "execute_result" 3587 | } 3588 | ], 3589 | "source": [ 3590 | "winner=max(results,key=results.get)\n", 3591 | "winner" 3592 | ] 3593 | }, 3594 | { 3595 | "cell_type": "code", 3596 | "execution_count": 49, 3597 | "id": "b4203503", 3598 | "metadata": {}, 3599 | "outputs": [ 3600 | { 3601 | "name": "stdout", 3602 | "output_type": "stream", 3603 | "text": [ 3604 | "False positive rate : 0.102353 %\n", 3605 | "False negative rate : 0.174237 %\n" 3606 | ] 3607 | } 3608 | ], 3609 | "source": [ 3610 | "clf=model[winner]\n", 3611 | "res=clf.predict(x_new)\n", 3612 | "mt=confusion_matrix(y,res)\n", 3613 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n", 3614 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))" 3615 | ] 3616 | }, 3617 | { 3618 | "cell_type": "code", 3619 | "execution_count": 50, 3620 | "id": "395ebe6a", 3621 | "metadata": {}, 3622 | "outputs": [], 3623 | "source": [ 3624 | "# Confusion Matrix" 3625 | ] 3626 | }, 3627 | { 3628 | "cell_type": "code", 3629 | "execution_count": 51, 3630 | "id": "619f2146", 3631 | "metadata": {}, 3632 | "outputs": [ 3633 | { 3634 | "data": { 3635 | "text/plain": [ 3636 | "array([[96625, 99],\n", 3637 | " [ 72, 41251]], dtype=int64)" 3638 | ] 3639 | }, 3640 | "execution_count": 51, 3641 | "metadata": {}, 3642 | "output_type": "execute_result" 3643 | } 3644 | ], 3645 | "source": [ 3646 | "cf=confusion_matrix(y,res)\n", 3647 | "cf" 3648 | ] 3649 | }, 3650 | { 3651 | "cell_type": "code", 3652 | "execution_count": 52, 3653 | "id": "87bbf42c", 3654 | "metadata": {}, 3655 | "outputs": [ 3656 | { 3657 | "data": { 3658 | "image/png": "\n", 3659 | "text/plain": [ 3660 | "
" 3661 | ] 3662 | }, 3663 | "metadata": { 3664 | "needs_background": "light" 3665 | }, 3666 | "output_type": "display_data" 3667 | } 3668 | ], 3669 | "source": [ 3670 | "plot_confusion_matrix(conf_mat=cf)\n", 3671 | "plt.xlabel(\"Actual\")\n", 3672 | "plt.ylabel(\"Predicted\")\n", 3673 | "plt.title(\"Confusion Matrix - Key: 0 is Legitimate & 1 is Malware\")\n", 3674 | "plt.show()" 3675 | ] 3676 | }, 3677 | { 3678 | "cell_type": "code", 3679 | "execution_count": null, 3680 | "id": "eadbd20c", 3681 | "metadata": {}, 3682 | "outputs": [], 3683 | "source": [] 3684 | } 3685 | ], 3686 | "metadata": { 3687 | "kernelspec": { 3688 | "display_name": "Python 3", 3689 | "language": "python", 3690 | "name": "python3" 3691 | }, 3692 | "language_info": { 3693 | "codemirror_mode": { 3694 | "name": "ipython", 3695 | "version": 3 3696 | }, 3697 | "file_extension": ".py", 3698 | "mimetype": "text/x-python", 3699 | "name": "python", 3700 | "nbconvert_exporter": "python", 3701 | "pygments_lexer": "ipython3", 3702 | "version": "3.8.8" 3703 | } 3704 | }, 3705 | "nbformat": 4, 3706 | "nbformat_minor": 5 3707 | } 3708 | --------------------------------------------------------------------------------