├── README.md ├── data.zip └── malware-classification.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-approach-for-Malware-Detection 2 | A Machine Learning approach for classifying a file as Malicious or Legitimate. 3 | 4 | This approach tries out 6 different classification algorithms before deciding which one to use for prediction by comparing their results. 5 | The different Machine Learning models tried are: Linear Regression, Random Forest, Decision Tree, AdaBoost, Gaussian Naive Bayes, and Gradient Boosting. 6 | 7 | In order to test the model on an unseen file, it's required to extract the characteristics of the given file. Python's pefile library is used to build the feature vector and a ML model is used to predict the class for the given file based on the already trained model. 8 | 9 | Dependencies 10 | ============ 11 | 12 | * pandas ```pip install pandas``` 13 | * numpy ```pip install numpy``` 14 | * pickle (part of the Python standard library; no installation required) 15 | * scipy ```pip install scipy``` 16 | * scikit-learn ```pip install -U scikit-learn``` 17 | -------------------------------------------------------------------------------- /data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/surajr/Machine-Learning-approach-for-Malware-Detection/a6229eea45f962245ef2e8d139e0b68e59823972/data.zip -------------------------------------------------------------------------------- /malware-classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## A Machine Learning approach for Malware Detection" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Importing all the required libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 
| "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import pandas\n", 27 | "import numpy\n", 28 | "import pickle\n", 29 | "import pefile\n", 30 | "import sklearn.ensemble as ek\n", 31 | "from sklearn import cross_validation, tree, linear_model\n", 32 | "from sklearn.feature_selection import SelectFromModel\n", 33 | "from sklearn.externals import joblib\n", 34 | "from sklearn.naive_bayes import GaussianNB\n", 35 | "from sklearn.metrics import confusion_matrix\n", 36 | "from sklearn.pipeline import make_pipeline\n", 37 | "from sklearn import preprocessing\n", 38 | "from sklearn import svm\n", 39 | "from sklearn.linear_model import LinearRegression" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Loading the initial dataset delimited by | " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "dataset = pandas.read_csv('/home/surajr/Downloads/data.csv',sep='|', low_memory=False)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/html": [ 70 | "
\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | "
Namemd5MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedData...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
0memtest.exe631ea355665f28d4707448e442fbf5b8332224258903619841157120...43.2628232.5688443.5379398797.000000216180320161
1ose.exe9d10f99a6712e28f8acd5641e3a7ea6b332224333090130560199680...24.2504613.4207445.080177837.000000518115672181
2setup.exe4d92f518527353c0db88a70fddcfd3903322243330905171206215680...114.4263242.8464495.27181331102.27272710427037672181
3DW20.EXEa41e524f8d45f0074fd07805ff0c9b12332224258905857283691520...104.3642912.6693146.4007201457.00000090426472181
4dwtrig20.exec87e561258f2f8650cef999bf643a731332224258902949122472960...24.3061003.4215985.1906031074.500000849130072181
\n", 221 | "

5 rows × 57 columns

\n", 222 | "
" 223 | ], 224 | "text/plain": [ 225 | " Name md5 Machine \\\n", 226 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n", 227 | "1 ose.exe 9d10f99a6712e28f8acd5641e3a7ea6b 332 \n", 228 | "2 setup.exe 4d92f518527353c0db88a70fddcfd390 332 \n", 229 | "3 DW20.EXE a41e524f8d45f0074fd07805ff0c9b12 332 \n", 230 | "4 dwtrig20.exe c87e561258f2f8650cef999bf643a731 332 \n", 231 | "\n", 232 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n", 233 | "0 224 258 9 \n", 234 | "1 224 3330 9 \n", 235 | "2 224 3330 9 \n", 236 | "3 224 258 9 \n", 237 | "4 224 258 9 \n", 238 | "\n", 239 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n", 240 | "0 0 361984 115712 \n", 241 | "1 0 130560 19968 \n", 242 | "2 0 517120 621568 \n", 243 | "3 0 585728 369152 \n", 244 | "4 0 294912 247296 \n", 245 | "\n", 246 | " SizeOfUninitializedData ... ResourcesNb ResourcesMeanEntropy \\\n", 247 | "0 0 ... 4 3.262823 \n", 248 | "1 0 ... 2 4.250461 \n", 249 | "2 0 ... 11 4.426324 \n", 250 | "3 0 ... 10 4.364291 \n", 251 | "4 0 ... 
2 4.306100 \n", 252 | "\n", 253 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n", 254 | "0 2.568844 3.537939 8797.000000 \n", 255 | "1 3.420744 5.080177 837.000000 \n", 256 | "2 2.846449 5.271813 31102.272727 \n", 257 | "3 2.669314 6.400720 1457.000000 \n", 258 | "4 3.421598 5.190603 1074.500000 \n", 259 | "\n", 260 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n", 261 | "0 216 18032 0 \n", 262 | "1 518 1156 72 \n", 263 | "2 104 270376 72 \n", 264 | "3 90 4264 72 \n", 265 | "4 849 1300 72 \n", 266 | "\n", 267 | " VersionInformationSize legitimate \n", 268 | "0 16 1 \n", 269 | "1 18 1 \n", 270 | "2 18 1 \n", 271 | "3 18 1 \n", 272 | "4 18 1 \n", 273 | "\n", 274 | "[5 rows x 57 columns]" 275 | ] 276 | }, 277 | "execution_count": 3, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "dataset.head()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 4, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/html": [ 296 | "
\n", 297 | "\n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | 
" \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | "
MachineSizeOfOptionalHeaderCharacteristicsMajorLinkerVersionMinorLinkerVersionSizeOfCodeSizeOfInitializedDataSizeOfUninitializedDataAddressOfEntryPointBaseOfCode...ResourcesNbResourcesMeanEntropyResourcesMinEntropyResourcesMaxEntropyResourcesMeanSizeResourcesMinSizeResourcesMaxSizeLoadConfigurationSizeVersionInformationSizelegitimate
count138047.000000138047.000000138047.000000138047.000000138047.0000001.380470e+051.380470e+051.380470e+051.380470e+051.380470e+05...138047.000000138047.000000138047.000000138047.0000001.380470e+051.380470e+051.380470e+051.380470e+05138047.000000138047.000000
mean4259.069274225.8456324444.1459948.6197743.8192862.425956e+054.504867e+051.009525e+051.719561e+055.779845e+04...22.0507004.0001272.4345415.5216105.545093e+041.818082e+042.465903e+054.656750e+0512.3631150.299340
std10880.3472455.1213998186.7825244.08875711.8626755.754485e+062.101599e+071.635288e+073.430553e+065.527658e+06...136.4942441.1129810.8155771.5974037.799163e+066.502369e+062.124860e+072.608987e+076.7988780.457971
min332.000000224.0000002.0000000.0000000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00...0.0000000.0000000.0000000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.000000
25%332.000000224.000000258.0000008.0000000.0000003.020800e+042.457600e+040.000000e+001.272100e+044.096000e+03...5.0000003.4585052.1787484.8287069.560000e+024.800000e+012.216000e+030.000000e+0013.0000000.000000
50%332.000000224.000000258.0000009.0000000.0000001.136640e+052.631680e+050.000000e+005.288300e+044.096000e+03...6.0000003.7298242.4584925.3175522.708154e+034.800000e+019.640000e+037.200000e+0115.0000000.000000
75%332.000000224.0000008226.00000010.0000000.0000001.203200e+053.850240e+050.000000e+006.157800e+044.096000e+03...13.0000004.2330512.6968336.5022396.558429e+031.320000e+022.378000e+047.200000e+0116.0000001.000000
max34404.000000352.00000049551.000000255.000000255.0000001.818587e+094.294966e+094.294941e+091.074484e+092.028711e+09...7694.0000007.9997237.9997238.0000002.415919e+092.415919e+094.294903e+094.294967e+0926.0000001.000000
\n", 519 | "

8 rows × 54 columns

\n", 520 | "
" 521 | ], 522 | "text/plain": [ 523 | " Machine SizeOfOptionalHeader Characteristics \\\n", 524 | "count 138047.000000 138047.000000 138047.000000 \n", 525 | "mean 4259.069274 225.845632 4444.145994 \n", 526 | "std 10880.347245 5.121399 8186.782524 \n", 527 | "min 332.000000 224.000000 2.000000 \n", 528 | "25% 332.000000 224.000000 258.000000 \n", 529 | "50% 332.000000 224.000000 258.000000 \n", 530 | "75% 332.000000 224.000000 8226.000000 \n", 531 | "max 34404.000000 352.000000 49551.000000 \n", 532 | "\n", 533 | " MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n", 534 | "count 138047.000000 138047.000000 1.380470e+05 \n", 535 | "mean 8.619774 3.819286 2.425956e+05 \n", 536 | "std 4.088757 11.862675 5.754485e+06 \n", 537 | "min 0.000000 0.000000 0.000000e+00 \n", 538 | "25% 8.000000 0.000000 3.020800e+04 \n", 539 | "50% 9.000000 0.000000 1.136640e+05 \n", 540 | "75% 10.000000 0.000000 1.203200e+05 \n", 541 | "max 255.000000 255.000000 1.818587e+09 \n", 542 | "\n", 543 | " SizeOfInitializedData SizeOfUninitializedData AddressOfEntryPoint \\\n", 544 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n", 545 | "mean 4.504867e+05 1.009525e+05 1.719561e+05 \n", 546 | "std 2.101599e+07 1.635288e+07 3.430553e+06 \n", 547 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", 548 | "25% 2.457600e+04 0.000000e+00 1.272100e+04 \n", 549 | "50% 2.631680e+05 0.000000e+00 5.288300e+04 \n", 550 | "75% 3.850240e+05 0.000000e+00 6.157800e+04 \n", 551 | "max 4.294966e+09 4.294941e+09 1.074484e+09 \n", 552 | "\n", 553 | " BaseOfCode ... ResourcesNb ResourcesMeanEntropy \\\n", 554 | "count 1.380470e+05 ... 138047.000000 138047.000000 \n", 555 | "mean 5.779845e+04 ... 22.050700 4.000127 \n", 556 | "std 5.527658e+06 ... 136.494244 1.112981 \n", 557 | "min 0.000000e+00 ... 0.000000 0.000000 \n", 558 | "25% 4.096000e+03 ... 5.000000 3.458505 \n", 559 | "50% 4.096000e+03 ... 6.000000 3.729824 \n", 560 | "75% 4.096000e+03 ... 13.000000 4.233051 \n", 561 | "max 2.028711e+09 ... 
7694.000000 7.999723 \n", 562 | "\n", 563 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n", 564 | "count 138047.000000 138047.000000 1.380470e+05 \n", 565 | "mean 2.434541 5.521610 5.545093e+04 \n", 566 | "std 0.815577 1.597403 7.799163e+06 \n", 567 | "min 0.000000 0.000000 0.000000e+00 \n", 568 | "25% 2.178748 4.828706 9.560000e+02 \n", 569 | "50% 2.458492 5.317552 2.708154e+03 \n", 570 | "75% 2.696833 6.502239 6.558429e+03 \n", 571 | "max 7.999723 8.000000 2.415919e+09 \n", 572 | "\n", 573 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n", 574 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n", 575 | "mean 1.818082e+04 2.465903e+05 4.656750e+05 \n", 576 | "std 6.502369e+06 2.124860e+07 2.608987e+07 \n", 577 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n", 578 | "25% 4.800000e+01 2.216000e+03 0.000000e+00 \n", 579 | "50% 4.800000e+01 9.640000e+03 7.200000e+01 \n", 580 | "75% 1.320000e+02 2.378000e+04 7.200000e+01 \n", 581 | "max 2.415919e+09 4.294903e+09 4.294967e+09 \n", 582 | "\n", 583 | " VersionInformationSize legitimate \n", 584 | "count 138047.000000 138047.000000 \n", 585 | "mean 12.363115 0.299340 \n", 586 | "std 6.798878 0.457971 \n", 587 | "min 0.000000 0.000000 \n", 588 | "25% 13.000000 0.000000 \n", 589 | "50% 15.000000 0.000000 \n", 590 | "75% 16.000000 1.000000 \n", 591 | "max 26.000000 1.000000 \n", 592 | "\n", 593 | "[8 rows x 54 columns]" 594 | ] 595 | }, 596 | "execution_count": 4, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "dataset.describe()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "Number of malicious files vs Legitimate files in the training set" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 5, 615 | "metadata": { 616 | "collapsed": false 617 | }, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "legitimate\n", 623 | "0 96724\n", 624 | 
"1 41323\n", 625 | "dtype: int64" 626 | ] 627 | }, 628 | "execution_count": 5, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "dataset.groupby(dataset['legitimate']).size()" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "Dropping columns like Name of the file, MD5 (message digest) and label" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 6, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "X = dataset.drop(['Name','md5','legitimate'],axis=1).values\n", 653 | "y = dataset['legitimate'].values" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "##### ExtraTreesClassifier\n", 661 | "ExtraTreesClassifier fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and use averaging to improve the predictive accuracy and control over-fitting" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 7, 667 | "metadata": { 668 | "collapsed": false 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "extratrees = ek.ExtraTreesClassifier().fit(X,y)\n", 673 | "model = SelectFromModel(extratrees, prefit=True)\n", 674 | "X_new = model.transform(X)\n", 675 | "nbfeatures = X_new.shape[1]" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "ExtraTreesClassifier helps in selecting the required features useful for classifying a file as either Malicious or Legitimate\n", 683 | "\n", 684 | "14 features are identified as required by ExtraTreesClassifier" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 8, 690 | "metadata": { 691 | "collapsed": false 692 | }, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | "14" 698 | ] 699 | }, 700 | "execution_count": 8, 701 | "metadata": {}, 702 | 
"output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "nbfeatures" 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "###### Cross Validation\n", 714 | "Cross validation is applied to divide the dataset into random train and test subsets.\n", 715 | "test_size = 0.2 represent the proportion of the dataset to include in the test split " 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 14, 721 | "metadata": { 722 | "collapsed": true 723 | }, 724 | "outputs": [], 725 | "source": [ 726 | "X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 9, 732 | "metadata": { 733 | "collapsed": true 734 | }, 735 | "outputs": [], 736 | "source": [ 737 | "features = []\n", 738 | "index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": {}, 744 | "source": [ 745 | "The features identified by ExtraTreesClassifier" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 10, 751 | "metadata": { 752 | "collapsed": false 753 | }, 754 | "outputs": [ 755 | { 756 | "name": "stdout", 757 | "output_type": "stream", 758 | "text": [ 759 | "1. feature DllCharacteristics (0.141259)\n", 760 | "2. feature Characteristics (0.136174)\n", 761 | "3. feature Machine (0.102237)\n", 762 | "4. feature SectionsMaxEntropy (0.093866)\n", 763 | "5. feature MajorSubsystemVersion (0.076185)\n", 764 | "6. feature ResourcesMinEntropy (0.054568)\n", 765 | "7. feature ResourcesMaxEntropy (0.048843)\n", 766 | "8. feature ImageBase (0.047034)\n", 767 | "9. feature VersionInformationSize (0.046712)\n", 768 | "10. feature SizeOfOptionalHeader (0.041392)\n", 769 | "11. feature SectionsMeanEntropy (0.025279)\n", 770 | "12. feature Subsystem (0.022657)\n", 771 | "13. 
feature MajorOperatingSystemVersion (0.019587)\n", 772 | "14. feature CheckSum (0.019544)\n" 773 | ] 774 | } 775 | ], 776 | "source": [ 777 | "for f in range(nbfeatures):\n", 778 | "    print(\"%d. feature %s (%f)\" % (f + 1, dataset.columns[2+index[f]], extratrees.feature_importances_[index[f]]))\n", 779 | "    features.append(dataset.columns[2+index[f]])" 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "Building the below Machine Learning model" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 12, 792 | "metadata": { 793 | "collapsed": true 794 | }, 795 | "outputs": [], 796 | "source": [ 797 | "model = { \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n", 798 | "         \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n", 799 | "         \"Adaboost\":ek.AdaBoostClassifier(n_estimators=50),\n", 800 | "         \"GradientBoosting\":ek.GradientBoostingClassifier(n_estimators=50),\n", 801 | "         \"GNB\":GaussianNB(),\n", 802 | "         \"LinearRegression\":LinearRegression()   \n", 803 | "}" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "Training each of the model with the X_train and testing with X_test.\n", 811 | "The model with best accuracy will be ranked as winner" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 25, 817 | "metadata": { 818 | "collapsed": false 819 | }, 820 | "outputs": [ 821 | { 822 | "name": "stdout", 823 | "output_type": "stream", 824 | "text": [ 825 | "RandomForest : 0.994386091996 \n", 826 | "GradientBoosting : 0.988373777617 \n", 827 | "GNB : 0.702897500905 \n", 828 | "DecisionTree : 0.990981528432 \n", 829 | "LinearRegression : 0.54036008649 \n", 830 | "Adaboost : 0.986381745744 \n" 831 | ] 832 | } 833 | ], 834 | "source": [ 835 | "results = {}\n", 836 | "for algo in model:\n", 837 | "    clf = model[algo]\n", 838 | "    clf.fit(X_train,y_train)\n", 839 | "    score = clf.score(X_test,y_test)\n", 840 | "    
print (\"%s : %s \" %(algo, score))\n", 841 | " results[algo] = score" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": 26, 847 | "metadata": { 848 | "collapsed": true 849 | }, 850 | "outputs": [], 851 | "source": [ 852 | "winner = max(results, key=results.get)" 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "Saving the model" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 27, 865 | "metadata": { 866 | "collapsed": false 867 | }, 868 | "outputs": [ 869 | { 870 | "data": { 871 | "text/plain": [ 872 | "['classifier/classifier.pkl',\n", 873 | " 'classifier/classifier.pkl_01.npy',\n", 874 | " 'classifier/classifier.pkl_02.npy',\n", 875 | " 'classifier/classifier.pkl_03.npy',\n", 876 | " 'classifier/classifier.pkl_04.npy',\n", 877 | " 'classifier/classifier.pkl_05.npy',\n", 878 | " 'classifier/classifier.pkl_06.npy',\n", 879 | " 'classifier/classifier.pkl_07.npy',\n", 880 | " 'classifier/classifier.pkl_08.npy',\n", 881 | " 'classifier/classifier.pkl_09.npy',\n", 882 | " 'classifier/classifier.pkl_10.npy',\n", 883 | " 'classifier/classifier.pkl_11.npy',\n", 884 | " 'classifier/classifier.pkl_12.npy',\n", 885 | " 'classifier/classifier.pkl_13.npy',\n", 886 | " 'classifier/classifier.pkl_14.npy',\n", 887 | " 'classifier/classifier.pkl_15.npy',\n", 888 | " 'classifier/classifier.pkl_16.npy',\n", 889 | " 'classifier/classifier.pkl_17.npy',\n", 890 | " 'classifier/classifier.pkl_18.npy',\n", 891 | " 'classifier/classifier.pkl_19.npy',\n", 892 | " 'classifier/classifier.pkl_20.npy',\n", 893 | " 'classifier/classifier.pkl_21.npy',\n", 894 | " 'classifier/classifier.pkl_22.npy',\n", 895 | " 'classifier/classifier.pkl_23.npy',\n", 896 | " 'classifier/classifier.pkl_24.npy',\n", 897 | " 'classifier/classifier.pkl_25.npy',\n", 898 | " 'classifier/classifier.pkl_26.npy',\n", 899 | " 'classifier/classifier.pkl_27.npy',\n", 900 | " 'classifier/classifier.pkl_28.npy',\n", 
901 | " 'classifier/classifier.pkl_29.npy',\n", 902 | " 'classifier/classifier.pkl_30.npy',\n", 903 | " 'classifier/classifier.pkl_31.npy',\n", 904 | " 'classifier/classifier.pkl_32.npy',\n", 905 | " 'classifier/classifier.pkl_33.npy',\n", 906 | " 'classifier/classifier.pkl_34.npy',\n", 907 | " 'classifier/classifier.pkl_35.npy',\n", 908 | " 'classifier/classifier.pkl_36.npy',\n", 909 | " 'classifier/classifier.pkl_37.npy',\n", 910 | " 'classifier/classifier.pkl_38.npy',\n", 911 | " 'classifier/classifier.pkl_39.npy',\n", 912 | " 'classifier/classifier.pkl_40.npy',\n", 913 | " 'classifier/classifier.pkl_41.npy',\n", 914 | " 'classifier/classifier.pkl_42.npy',\n", 915 | " 'classifier/classifier.pkl_43.npy',\n", 916 | " 'classifier/classifier.pkl_44.npy',\n", 917 | " 'classifier/classifier.pkl_45.npy',\n", 918 | " 'classifier/classifier.pkl_46.npy',\n", 919 | " 'classifier/classifier.pkl_47.npy',\n", 920 | " 'classifier/classifier.pkl_48.npy',\n", 921 | " 'classifier/classifier.pkl_49.npy',\n", 922 | " 'classifier/classifier.pkl_50.npy',\n", 923 | " 'classifier/classifier.pkl_51.npy',\n", 924 | " 'classifier/classifier.pkl_52.npy',\n", 925 | " 'classifier/classifier.pkl_53.npy',\n", 926 | " 'classifier/classifier.pkl_54.npy',\n", 927 | " 'classifier/classifier.pkl_55.npy',\n", 928 | " 'classifier/classifier.pkl_56.npy',\n", 929 | " 'classifier/classifier.pkl_57.npy',\n", 930 | " 'classifier/classifier.pkl_58.npy',\n", 931 | " 'classifier/classifier.pkl_59.npy',\n", 932 | " 'classifier/classifier.pkl_60.npy',\n", 933 | " 'classifier/classifier.pkl_61.npy',\n", 934 | " 'classifier/classifier.pkl_62.npy',\n", 935 | " 'classifier/classifier.pkl_63.npy',\n", 936 | " 'classifier/classifier.pkl_64.npy',\n", 937 | " 'classifier/classifier.pkl_65.npy',\n", 938 | " 'classifier/classifier.pkl_66.npy',\n", 939 | " 'classifier/classifier.pkl_67.npy',\n", 940 | " 'classifier/classifier.pkl_68.npy',\n", 941 | " 'classifier/classifier.pkl_69.npy',\n", 942 | " 
'classifier/classifier.pkl_70.npy',\n", 943 | " 'classifier/classifier.pkl_71.npy',\n", 944 | " 'classifier/classifier.pkl_72.npy',\n", 945 | " 'classifier/classifier.pkl_73.npy',\n", 946 | " 'classifier/classifier.pkl_74.npy',\n", 947 | " 'classifier/classifier.pkl_75.npy',\n", 948 | " 'classifier/classifier.pkl_76.npy',\n", 949 | " 'classifier/classifier.pkl_77.npy',\n", 950 | " 'classifier/classifier.pkl_78.npy',\n", 951 | " 'classifier/classifier.pkl_79.npy',\n", 952 | " 'classifier/classifier.pkl_80.npy',\n", 953 | " 'classifier/classifier.pkl_81.npy',\n", 954 | " 'classifier/classifier.pkl_82.npy',\n", 955 | " 'classifier/classifier.pkl_83.npy',\n", 956 | " 'classifier/classifier.pkl_84.npy',\n", 957 | " 'classifier/classifier.pkl_85.npy',\n", 958 | " 'classifier/classifier.pkl_86.npy',\n", 959 | " 'classifier/classifier.pkl_87.npy',\n", 960 | " 'classifier/classifier.pkl_88.npy',\n", 961 | " 'classifier/classifier.pkl_89.npy',\n", 962 | " 'classifier/classifier.pkl_90.npy',\n", 963 | " 'classifier/classifier.pkl_91.npy',\n", 964 | " 'classifier/classifier.pkl_92.npy',\n", 965 | " 'classifier/classifier.pkl_93.npy',\n", 966 | " 'classifier/classifier.pkl_94.npy',\n", 967 | " 'classifier/classifier.pkl_95.npy',\n", 968 | " 'classifier/classifier.pkl_96.npy',\n", 969 | " 'classifier/classifier.pkl_97.npy',\n", 970 | " 'classifier/classifier.pkl_98.npy',\n", 971 | " 'classifier/classifier.pkl_99.npy',\n", 972 | " 'classifier/classifier.pkl_100.npy',\n", 973 | " 'classifier/classifier.pkl_101.npy',\n", 974 | " 'classifier/classifier.pkl_102.npy',\n", 975 | " 'classifier/classifier.pkl_103.npy',\n", 976 | " 'classifier/classifier.pkl_104.npy',\n", 977 | " 'classifier/classifier.pkl_105.npy',\n", 978 | " 'classifier/classifier.pkl_106.npy',\n", 979 | " 'classifier/classifier.pkl_107.npy',\n", 980 | " 'classifier/classifier.pkl_108.npy',\n", 981 | " 'classifier/classifier.pkl_109.npy',\n", 982 | " 'classifier/classifier.pkl_110.npy',\n", 983 | " 
'classifier/classifier.pkl_111.npy',\n", 984 | " 'classifier/classifier.pkl_112.npy',\n", 985 | " 'classifier/classifier.pkl_113.npy',\n", 986 | " 'classifier/classifier.pkl_114.npy',\n", 987 | " 'classifier/classifier.pkl_115.npy',\n", 988 | " 'classifier/classifier.pkl_116.npy',\n", 989 | " 'classifier/classifier.pkl_117.npy',\n", 990 | " 'classifier/classifier.pkl_118.npy',\n", 991 | " 'classifier/classifier.pkl_119.npy',\n", 992 | " 'classifier/classifier.pkl_120.npy',\n", 993 | " 'classifier/classifier.pkl_121.npy',\n", 994 | " 'classifier/classifier.pkl_122.npy',\n", 995 | " 'classifier/classifier.pkl_123.npy',\n", 996 | " 'classifier/classifier.pkl_124.npy',\n", 997 | " 'classifier/classifier.pkl_125.npy',\n", 998 | " 'classifier/classifier.pkl_126.npy',\n", 999 | " 'classifier/classifier.pkl_127.npy',\n", 1000 | " 'classifier/classifier.pkl_128.npy',\n", 1001 | " 'classifier/classifier.pkl_129.npy',\n", 1002 | " 'classifier/classifier.pkl_130.npy',\n", 1003 | " 'classifier/classifier.pkl_131.npy',\n", 1004 | " 'classifier/classifier.pkl_132.npy',\n", 1005 | " 'classifier/classifier.pkl_133.npy',\n", 1006 | " 'classifier/classifier.pkl_134.npy',\n", 1007 | " 'classifier/classifier.pkl_135.npy',\n", 1008 | " 'classifier/classifier.pkl_136.npy',\n", 1009 | " 'classifier/classifier.pkl_137.npy',\n", 1010 | " 'classifier/classifier.pkl_138.npy',\n", 1011 | " 'classifier/classifier.pkl_139.npy',\n", 1012 | " 'classifier/classifier.pkl_140.npy',\n", 1013 | " 'classifier/classifier.pkl_141.npy',\n", 1014 | " 'classifier/classifier.pkl_142.npy',\n", 1015 | " 'classifier/classifier.pkl_143.npy',\n", 1016 | " 'classifier/classifier.pkl_144.npy',\n", 1017 | " 'classifier/classifier.pkl_145.npy',\n", 1018 | " 'classifier/classifier.pkl_146.npy',\n", 1019 | " 'classifier/classifier.pkl_147.npy',\n", 1020 | " 'classifier/classifier.pkl_148.npy',\n", 1021 | " 'classifier/classifier.pkl_149.npy',\n", 1022 | " 'classifier/classifier.pkl_150.npy',\n", 1023 | " 
'classifier/classifier.pkl_151.npy',\n", 1024 | " 'classifier/classifier.pkl_152.npy',\n", 1025 | " 'classifier/classifier.pkl_153.npy',\n", 1026 | " 'classifier/classifier.pkl_154.npy',\n", 1027 | " 'classifier/classifier.pkl_155.npy',\n", 1028 | " 'classifier/classifier.pkl_156.npy',\n", 1029 | " 'classifier/classifier.pkl_157.npy',\n", 1030 | " 'classifier/classifier.pkl_158.npy',\n", 1031 | " 'classifier/classifier.pkl_159.npy',\n", 1032 | " 'classifier/classifier.pkl_160.npy',\n", 1033 | " 'classifier/classifier.pkl_161.npy',\n", 1034 | " 'classifier/classifier.pkl_162.npy',\n", 1035 | " 'classifier/classifier.pkl_163.npy',\n", 1036 | " 'classifier/classifier.pkl_164.npy',\n", 1037 | " 'classifier/classifier.pkl_165.npy',\n", 1038 | " 'classifier/classifier.pkl_166.npy',\n", 1039 | " 'classifier/classifier.pkl_167.npy',\n", 1040 | " 'classifier/classifier.pkl_168.npy',\n", 1041 | " 'classifier/classifier.pkl_169.npy',\n", 1042 | " 'classifier/classifier.pkl_170.npy',\n", 1043 | " 'classifier/classifier.pkl_171.npy',\n", 1044 | " 'classifier/classifier.pkl_172.npy',\n", 1045 | " 'classifier/classifier.pkl_173.npy',\n", 1046 | " 'classifier/classifier.pkl_174.npy',\n", 1047 | " 'classifier/classifier.pkl_175.npy',\n", 1048 | " 'classifier/classifier.pkl_176.npy',\n", 1049 | " 'classifier/classifier.pkl_177.npy',\n", 1050 | " 'classifier/classifier.pkl_178.npy',\n", 1051 | " 'classifier/classifier.pkl_179.npy',\n", 1052 | " 'classifier/classifier.pkl_180.npy',\n", 1053 | " 'classifier/classifier.pkl_181.npy',\n", 1054 | " 'classifier/classifier.pkl_182.npy',\n", 1055 | " 'classifier/classifier.pkl_183.npy',\n", 1056 | " 'classifier/classifier.pkl_184.npy',\n", 1057 | " 'classifier/classifier.pkl_185.npy',\n", 1058 | " 'classifier/classifier.pkl_186.npy',\n", 1059 | " 'classifier/classifier.pkl_187.npy',\n", 1060 | " 'classifier/classifier.pkl_188.npy',\n", 1061 | " 'classifier/classifier.pkl_189.npy',\n", 1062 | " 'classifier/classifier.pkl_190.npy',\n", 1063 | " 
'classifier/classifier.pkl_191.npy',\n", 1064 | " 'classifier/classifier.pkl_192.npy',\n", 1065 | " 'classifier/classifier.pkl_193.npy',\n", 1066 | " 'classifier/classifier.pkl_194.npy',\n", 1067 | " 'classifier/classifier.pkl_195.npy',\n", 1068 | " 'classifier/classifier.pkl_196.npy',\n", 1069 | " 'classifier/classifier.pkl_197.npy',\n", 1070 | " 'classifier/classifier.pkl_198.npy',\n", 1071 | " 'classifier/classifier.pkl_199.npy',\n", 1072 | " 'classifier/classifier.pkl_200.npy',\n", 1073 | " 'classifier/classifier.pkl_201.npy']" 1074 | ] 1075 | }, 1076 | "execution_count": 27, 1077 | "metadata": {}, 1078 | "output_type": "execute_result" 1079 | } 1080 | ], 1081 | "source": [ 1082 | "joblib.dump(model[winner],'classifier/classifier.pkl')" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 28, 1088 | "metadata": { 1089 | "collapsed": false 1090 | }, 1091 | "outputs": [], 1092 | "source": [ 1093 | "open('classifier/features.pkl', 'w').write(pickle.dumps(features))" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "metadata": {}, 1099 | "source": [ 1100 | "Calculating the False positive and negative on the dataset" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 41, 1106 | "metadata": { 1107 | "collapsed": false 1108 | }, 1109 | "outputs": [ 1110 | { 1111 | "name": "stdout", 1112 | "output_type": "stream", 1113 | "text": [ 1114 | "False positive rate : 0.099251 %\n", 1115 | "False negative rate : 0.147618 %\n" 1116 | ] 1117 | } 1118 | ], 1119 | "source": [ 1120 | "clf = model[winner]\n", 1121 | "res = clf.predict(X_new)\n", 1122 | "mt = confusion_matrix(y, res)\n", 1123 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n", 1124 | "print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "code", 1129 | "execution_count": 36, 1130 | "metadata": { 1131 | "collapsed": false 1132 | }, 1133 | 
"outputs": [], 1134 | "source": [ 1135 | "# Load classifier\n", 1136 | "clf = joblib.load('classifier/classifier.pkl')\n", 1137 | "#load features\n", 1138 | "features = pickle.loads(open(os.path.join('classifier/features.pkl'),'r').read())" 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "markdown", 1143 | "metadata": {}, 1144 | "source": [ 1145 | "##### Testing with unseen file\n", 1146 | "Given any unseen test file, it's required to extract the characteristics of the given file. \n", 1147 | "\n", 1148 | "In order to test the model on an unseen file, it's required to extract the characteristics of the given file. Python's pefile.PE library is used to construct and build the feature vector and a ML model is used to predict the class for the given file based on the already trained model. " 1149 | ] 1150 | }, 1151 | { 1152 | "cell_type": "code", 1153 | "execution_count": null, 1154 | "metadata": { 1155 | "collapsed": true 1156 | }, 1157 | "outputs": [], 1158 | "source": [ 1159 | "# %load malware_test.py\n", 1160 | "\"\"\"\n", 1161 | "this file extracts the required information of a given file using the library PE \n", 1162 | "\n", 1163 | "\"\"\"\n", 1164 | "\n", 1165 | "import pefile\n", 1166 | "import os\n", 1167 | "import array\n", 1168 | "import math\n", 1169 | "import pickle\n", 1170 | "from sklearn.externals import joblib\n", 1171 | "import sys\n", 1172 | "import argparse\n", 1173 | "\n", 1174 | "\n", 1175 | "\n", 1176 | "def get_entropy(data):\n", 1177 | " if len(data) == 0:\n", 1178 | "\treturn 0.0\n", 1179 | " occurences = array.array('L', [0]*256)\n", 1180 | " for x in data:\n", 1181 | " \toccurences[x if isinstance(x, int) else ord(x)] += 1\n", 1182 | "\n", 1183 | " entropy = 0\n", 1184 | " for x in occurences:\n", 1185 | "\tif x:\n", 1186 | "\t p_x = float(x) / len(data)\n", 1187 | "\t entropy -= p_x*math.log(p_x, 2)\n", 1188 | "\n", 1189 | " return entropy\n", 1190 | "\n", 1191 | "\n", 1192 | "def get_resources(pe):\n", 1193 | " \"\"\"Extract resources 
:\n", 1194 | " [entropy, size]\"\"\"\n", 1195 | " resources = []\n", 1196 | " if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):\n", 1197 | "\ttry:\n", 1198 | " for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:\n", 1199 | " if hasattr(resource_type, 'directory'):\n", 1200 | " for resource_id in resource_type.directory.entries:\n", 1201 | " if hasattr(resource_id, 'directory'):\n", 1202 | " for resource_lang in resource_id.directory.entries:\n", 1203 | " data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)\n", 1204 | " size = resource_lang.data.struct.Size\n", 1205 | " entropy = get_entropy(data)\n", 1206 | "\n", 1207 | " resources.append([entropy, size])\n", 1208 | " except Exception as e:\n", 1209 | " return resources\n", 1210 | " return resources\n", 1211 | "\n", 1212 | "def get_version_info(pe):\n", 1213 | " \"\"\"Return version infos\"\"\"\n", 1214 | " res = {}\n", 1215 | " for fileinfo in pe.FileInfo:\n", 1216 | " if fileinfo.Key == 'StringFileInfo':\n", 1217 | " for st in fileinfo.StringTable:\n", 1218 | " for entry in st.entries.items():\n", 1219 | " res[entry[0]] = entry[1]\n", 1220 | " if fileinfo.Key == 'VarFileInfo':\n", 1221 | " for var in fileinfo.Var:\n", 1222 | " res[var.entry.items()[0][0]] = var.entry.items()[0][1]\n", 1223 | " if hasattr(pe, 'VS_FIXEDFILEINFO'):\n", 1224 | " res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags\n", 1225 | " res['os'] = pe.VS_FIXEDFILEINFO.FileOS\n", 1226 | " res['type'] = pe.VS_FIXEDFILEINFO.FileType\n", 1227 | " res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS\n", 1228 | " res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS\n", 1229 | " res['signature'] = pe.VS_FIXEDFILEINFO.Signature\n", 1230 | " res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion\n", 1231 | " return res\n", 1232 | "\n", 1233 | "#extract the info for a given file\n", 1234 | "def extract_infos(fpath):\n", 1235 | " res = {}\n", 1236 | " pe = pefile.PE(fpath)\n", 1237 | " res['Machine'] = 
pe.FILE_HEADER.Machine\n", 1238 | " res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader\n", 1239 | " res['Characteristics'] = pe.FILE_HEADER.Characteristics\n", 1240 | " res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion\n", 1241 | " res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion\n", 1242 | " res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode\n", 1243 | " res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData\n", 1244 | " res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData\n", 1245 | " res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint\n", 1246 | " res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode\n", 1247 | " try:\n", 1248 | " res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData\n", 1249 | " except AttributeError:\n", 1250 | " res['BaseOfData'] = 0\n", 1251 | " res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase\n", 1252 | " res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment\n", 1253 | " res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment\n", 1254 | " res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion\n", 1255 | " res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion\n", 1256 | " res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion\n", 1257 | " res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion\n", 1258 | " res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion\n", 1259 | " res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion\n", 1260 | " res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage\n", 1261 | " res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders\n", 1262 | " res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum\n", 1263 | " res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem\n", 1264 | " res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics\n", 1265 | " res['SizeOfStackReserve'] = 
pe.OPTIONAL_HEADER.SizeOfStackReserve\n", 1266 | " res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit\n", 1267 | " res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve\n", 1268 | " res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit\n", 1269 | " res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags\n", 1270 | " res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes\n", 1271 | "\n", 1272 | " # Sections\n", 1273 | " res['SectionsNb'] = len(pe.sections)\n", 1274 | " entropy = map(lambda x:x.get_entropy(), pe.sections)\n", 1275 | " res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))\n", 1276 | " res['SectionsMinEntropy'] = min(entropy)\n", 1277 | " res['SectionsMaxEntropy'] = max(entropy)\n", 1278 | " raw_sizes = map(lambda x:x.SizeOfRawData, pe.sections)\n", 1279 | " res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))\n", 1280 | " res['SectionsMinRawsize'] = min(raw_sizes)\n", 1281 | " res['SectionsMaxRawsize'] = max(raw_sizes)\n", 1282 | " virtual_sizes = map(lambda x:x.Misc_VirtualSize, pe.sections)\n", 1283 | " res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))\n", 1284 | " res['SectionsMinVirtualsize'] = min(virtual_sizes)\n", 1285 | " res['SectionMaxVirtualsize'] = max(virtual_sizes)\n", 1286 | "\n", 1287 | " #Imports\n", 1288 | " try:\n", 1289 | " res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)\n", 1290 | " imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])\n", 1291 | " res['ImportsNb'] = len(imports)\n", 1292 | " res['ImportsNbOrdinal'] = len(filter(lambda x:x.name is None, imports))\n", 1293 | " except AttributeError:\n", 1294 | " res['ImportsNbDLL'] = 0\n", 1295 | " res['ImportsNb'] = 0\n", 1296 | " res['ImportsNbOrdinal'] = 0\n", 1297 | "\n", 1298 | " #Exports\n", 1299 | " try:\n", 1300 | " res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)\n", 1301 | " except AttributeError:\n", 1302 | " # No export\n", 1303 | " res['ExportNb'] = 0\n", 
1304 | " #Resources\n", 1305 | " resources= get_resources(pe)\n", 1306 | " res['ResourcesNb'] = len(resources)\n", 1307 | " if len(resources)> 0:\n", 1308 | " entropy = map(lambda x:x[0], resources)\n", 1309 | " res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))\n", 1310 | " res['ResourcesMinEntropy'] = min(entropy)\n", 1311 | " res['ResourcesMaxEntropy'] = max(entropy)\n", 1312 | " sizes = map(lambda x:x[1], resources)\n", 1313 | " res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))\n", 1314 | " res['ResourcesMinSize'] = min(sizes)\n", 1315 | " res['ResourcesMaxSize'] = max(sizes)\n", 1316 | " else:\n", 1317 | " res['ResourcesNb'] = 0\n", 1318 | " res['ResourcesMeanEntropy'] = 0\n", 1319 | " res['ResourcesMinEntropy'] = 0\n", 1320 | " res['ResourcesMaxEntropy'] = 0\n", 1321 | " res['ResourcesMeanSize'] = 0\n", 1322 | " res['ResourcesMinSize'] = 0\n", 1323 | " res['ResourcesMaxSize'] = 0\n", 1324 | "\n", 1325 | " # Load configuration size\n", 1326 | " try:\n", 1327 | " res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size\n", 1328 | " except AttributeError:\n", 1329 | " res['LoadConfigurationSize'] = 0\n", 1330 | "\n", 1331 | "\n", 1332 | " # Version configuration size\n", 1333 | " try:\n", 1334 | " version_infos = get_version_info(pe)\n", 1335 | " res['VersionInformationSize'] = len(version_infos.keys())\n", 1336 | " except AttributeError:\n", 1337 | " res['VersionInformationSize'] = 0\n", 1338 | " return res\n", 1339 | "\n", 1340 | "\n", 1341 | "if __name__ == '__main__':\n", 1342 | "\t\n", 1343 | " clf = joblib.load('classifier/classifier.pkl')\n", 1344 | " features = pickle.loads(open(os.path.join('classifier/features.pkl'),'r').read())\n", 1345 | " data = extract_infos(sys.argv[1])\n", 1346 | " pe_features = map(lambda x:data[x], features)\n", 1347 | "\n", 1348 | " res= clf.predict([pe_features])[0] \n", 1349 | " print ('The file %s is %s' % (os.path.basename(sys.argv[1]),['malicious', 'legitimate'][res]))\n", 1350 | " \n", 
1351 | "\n", 1352 | "\n", 1353 | "\n", 1354 | "\n", 1355 | "\n", 1356 | "\n", 1357 | "\n", 1358 | "\n", 1359 | "\n", 1360 | "\n", 1361 | "\n" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "markdown", 1366 | "metadata": {}, 1367 | "source": [ 1368 | "Let's run the program to test the file - Skype.exe" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "code", 1373 | "execution_count": 40, 1374 | "metadata": { 1375 | "collapsed": false 1376 | }, 1377 | "outputs": [ 1378 | { 1379 | "name": "stdout", 1380 | "output_type": "stream", 1381 | "text": [ 1382 | "The file Skype.exe is legitimate\n" 1383 | ] 1384 | } 1385 | ], 1386 | "source": [ 1387 | "%run malware_test.py \"/home/surajr/Downloads/Skype.exe\"" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "markdown", 1392 | "metadata": {}, 1393 | "source": [ 1394 | "To test for the malicious file, an application has been downloaded from malwr.com" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 38, 1400 | "metadata": { 1401 | "collapsed": false 1402 | }, 1403 | "outputs": [ 1404 | { 1405 | "name": "stdout", 1406 | "output_type": "stream", 1407 | "text": [ 1408 | "The file BCN12ui49823.exe is malicious\n" 1409 | ] 1410 | } 1411 | ], 1412 | "source": [ 1413 | "%run malware_test.py \"/home/surajr/Downloads/BCN12ui49823.exe\"" 1414 | ] 1415 | } 1416 | ], 1417 | "metadata": { 1418 | "kernelspec": { 1419 | "display_name": "Python [Root]", 1420 | "language": "python", 1421 | "name": "Python [Root]" 1422 | }, 1423 | "language_info": { 1424 | "codemirror_mode": { 1425 | "name": "ipython", 1426 | "version": 2 1427 | }, 1428 | "file_extension": ".py", 1429 | "mimetype": "text/x-python", 1430 | "name": "python", 1431 | "nbconvert_exporter": "python", 1432 | "pygments_lexer": "ipython2", 1433 | "version": "2.7.12" 1434 | } 1435 | }, 1436 | "nbformat": 4, 1437 | "nbformat_minor": 0 1438 | } 1439 | --------------------------------------------------------------------------------