├── README.md
└── RansomwareD.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Ransomeware Detection Using ML
2 |
3 | ### Machine Learning Algorithms used are:
4 |
5 | 1. Random Forest
6 | 2. Decision Tree
7 | 3. Logistic Regression
8 |
9 | ### Additional Libraries Used:
10 |
11 | * pefile
12 | * pickle
13 | * joblib
14 | * mlxtend
15 | * statsmodel
16 | * sklearn
17 |
18 | ### Concepts Used:
19 |
20 | * Multicollinearity
21 | * Ensemble Technique
22 | * Extra Tree Classifier
23 |
--------------------------------------------------------------------------------
/RansomwareD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "id": "2bca8f3e",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Collecting pefile\n",
14 | " Using cached pefile-2021.5.24.tar.gz (66 kB)\n",
15 | "Requirement already satisfied: future in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pefile) (0.18.2)\n",
16 | "Building wheels for collected packages: pefile\n",
17 | " Building wheel for pefile (setup.py): started\n",
18 | " Building wheel for pefile (setup.py): finished with status 'done'\n",
19 | " Created wheel for pefile: filename=pefile-2021.5.24-py3-none-any.whl size=62578 sha256=cf20a74be7fc5f7210d0a6f4e3714ed405c5d8da883caaeb0d173f1927786bf5\n",
20 | " Stored in directory: c:\\users\\vajha\\appdata\\local\\pip\\cache\\wheels\\43\\04\\fc\\d9305103f7d512f2df35b1878e1009e8217e713b767aee8f13\n",
21 | "Successfully built pefile\n",
22 | "Installing collected packages: pefile\n",
23 | "Successfully installed pefile-2021.5.24\n",
24 | "Note: you may need to restart the kernel to use updated packages.\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "pip install pefile"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "id": "817480c1",
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "Collecting mlxtend\n",
43 | " Using cached mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)\n",
44 | "Requirement already satisfied: setuptools in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (52.0.0.post20210125)\n",
45 | "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.0.1)\n",
46 | "Requirement already satisfied: scikit-learn>=0.20.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (0.24.1)\n",
47 | "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.6.2)\n",
48 | "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.2.4)\n",
49 | "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (3.3.4)\n",
50 | "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.20.1)\n",
51 | "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.1)\n",
52 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.4.7)\n",
53 | "Requirement already satisfied: cycler>=0.10 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.10.0)\n",
54 | "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (8.2.0)\n",
55 | "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.3.1)\n",
56 | "Requirement already satisfied: six in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib>=3.0.0->mlxtend) (1.15.0)\n",
57 | "Requirement already satisfied: pytz>=2017.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2021.1)\n",
58 | "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from scikit-learn>=0.20.3->mlxtend) (2.1.0)\n",
59 | "Installing collected packages: mlxtend\n",
60 | "Successfully installed mlxtend-0.18.0\n",
61 | "Note: you may need to restart the kernel to use updated packages.\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "pip install mlxtend"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "id": "b94512f2",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "import os\n",
77 | "import pandas as pd\n",
78 | "import numpy as np\n",
79 | "from matplotlib import pyplot as plt\n",
80 | "import pickle\n",
81 | "import pefile\n",
82 | "import sklearn.ensemble as ek\n",
83 | "from sklearn import tree, linear_model\n",
84 | "from sklearn.feature_selection import SelectFromModel\n",
85 | "import joblib\n",
86 | "from sklearn.naive_bayes import GaussianNB\n",
87 | "from sklearn.metrics import confusion_matrix\n",
88 | "from sklearn.pipeline import make_pipeline\n",
89 | "from sklearn import preprocessing\n",
90 | "from sklearn import svm\n",
91 | "from sklearn.linear_model import LogisticRegression\n",
92 | "from statsmodels.stats.outliers_influence import variance_inflation_factor as vif\n",
93 | "from sklearn.model_selection import train_test_split\n",
94 | "from mlxtend.plotting import plot_confusion_matrix\n",
95 | "dataset=pd.read_csv(\"Ransomware.csv\",sep='|')"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 4,
101 | "id": "cde9b494",
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/html": [
107 | "
\n",
108 | "\n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " | \n",
125 | " Name | \n",
126 | " md5 | \n",
127 | " Machine | \n",
128 | " SizeOfOptionalHeader | \n",
129 | " Characteristics | \n",
130 | " MajorLinkerVersion | \n",
131 | " MinorLinkerVersion | \n",
132 | " SizeOfCode | \n",
133 | " SizeOfInitializedData | \n",
134 | " SizeOfUninitializedData | \n",
135 | " ... | \n",
136 | " ResourcesNb | \n",
137 | " ResourcesMeanEntropy | \n",
138 | " ResourcesMinEntropy | \n",
139 | " ResourcesMaxEntropy | \n",
140 | " ResourcesMeanSize | \n",
141 | " ResourcesMinSize | \n",
142 | " ResourcesMaxSize | \n",
143 | " LoadConfigurationSize | \n",
144 | " VersionInformationSize | \n",
145 | " legitimate | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " 0 | \n",
151 | " memtest.exe | \n",
152 | " 631ea355665f28d4707448e442fbf5b8 | \n",
153 | " 332 | \n",
154 | " 224 | \n",
155 | " 258 | \n",
156 | " 9 | \n",
157 | " 0 | \n",
158 | " 361984 | \n",
159 | " 115712 | \n",
160 | " 0 | \n",
161 | " ... | \n",
162 | " 4 | \n",
163 | " 3.262823 | \n",
164 | " 2.568844 | \n",
165 | " 3.537939 | \n",
166 | " 8797.000000 | \n",
167 | " 216 | \n",
168 | " 18032 | \n",
169 | " 0 | \n",
170 | " 16 | \n",
171 | " 1 | \n",
172 | "
\n",
173 | " \n",
174 | " 1 | \n",
175 | " ose.exe | \n",
176 | " 9d10f99a6712e28f8acd5641e3a7ea6b | \n",
177 | " 332 | \n",
178 | " 224 | \n",
179 | " 3330 | \n",
180 | " 9 | \n",
181 | " 0 | \n",
182 | " 130560 | \n",
183 | " 19968 | \n",
184 | " 0 | \n",
185 | " ... | \n",
186 | " 2 | \n",
187 | " 4.250461 | \n",
188 | " 3.420744 | \n",
189 | " 5.080177 | \n",
190 | " 837.000000 | \n",
191 | " 518 | \n",
192 | " 1156 | \n",
193 | " 72 | \n",
194 | " 18 | \n",
195 | " 1 | \n",
196 | "
\n",
197 | " \n",
198 | " 2 | \n",
199 | " setup.exe | \n",
200 | " 4d92f518527353c0db88a70fddcfd390 | \n",
201 | " 332 | \n",
202 | " 224 | \n",
203 | " 3330 | \n",
204 | " 9 | \n",
205 | " 0 | \n",
206 | " 517120 | \n",
207 | " 621568 | \n",
208 | " 0 | \n",
209 | " ... | \n",
210 | " 11 | \n",
211 | " 4.426324 | \n",
212 | " 2.846449 | \n",
213 | " 5.271813 | \n",
214 | " 31102.272727 | \n",
215 | " 104 | \n",
216 | " 270376 | \n",
217 | " 72 | \n",
218 | " 18 | \n",
219 | " 1 | \n",
220 | "
\n",
221 | " \n",
222 | " 3 | \n",
223 | " DW20.EXE | \n",
224 | " a41e524f8d45f0074fd07805ff0c9b12 | \n",
225 | " 332 | \n",
226 | " 224 | \n",
227 | " 258 | \n",
228 | " 9 | \n",
229 | " 0 | \n",
230 | " 585728 | \n",
231 | " 369152 | \n",
232 | " 0 | \n",
233 | " ... | \n",
234 | " 10 | \n",
235 | " 4.364291 | \n",
236 | " 2.669314 | \n",
237 | " 6.400720 | \n",
238 | " 1457.000000 | \n",
239 | " 90 | \n",
240 | " 4264 | \n",
241 | " 72 | \n",
242 | " 18 | \n",
243 | " 1 | \n",
244 | "
\n",
245 | " \n",
246 | " 4 | \n",
247 | " dwtrig20.exe | \n",
248 | " c87e561258f2f8650cef999bf643a731 | \n",
249 | " 332 | \n",
250 | " 224 | \n",
251 | " 258 | \n",
252 | " 9 | \n",
253 | " 0 | \n",
254 | " 294912 | \n",
255 | " 247296 | \n",
256 | " 0 | \n",
257 | " ... | \n",
258 | " 2 | \n",
259 | " 4.306100 | \n",
260 | " 3.421598 | \n",
261 | " 5.190603 | \n",
262 | " 1074.500000 | \n",
263 | " 849 | \n",
264 | " 1300 | \n",
265 | " 72 | \n",
266 | " 18 | \n",
267 | " 1 | \n",
268 | "
\n",
269 | " \n",
270 | " ... | \n",
271 | " ... | \n",
272 | " ... | \n",
273 | " ... | \n",
274 | " ... | \n",
275 | " ... | \n",
276 | " ... | \n",
277 | " ... | \n",
278 | " ... | \n",
279 | " ... | \n",
280 | " ... | \n",
281 | " ... | \n",
282 | " ... | \n",
283 | " ... | \n",
284 | " ... | \n",
285 | " ... | \n",
286 | " ... | \n",
287 | " ... | \n",
288 | " ... | \n",
289 | " ... | \n",
290 | " ... | \n",
291 | " ... | \n",
292 | "
\n",
293 | " \n",
294 | " 138042 | \n",
295 | " VirusShare_8e292b418568d6e7b87f2a32aee7074b | \n",
296 | " 8e292b418568d6e7b87f2a32aee7074b | \n",
297 | " 332 | \n",
298 | " 224 | \n",
299 | " 258 | \n",
300 | " 11 | \n",
301 | " 0 | \n",
302 | " 205824 | \n",
303 | " 223744 | \n",
304 | " 0 | \n",
305 | " ... | \n",
306 | " 7 | \n",
307 | " 4.122736 | \n",
308 | " 1.370260 | \n",
309 | " 7.677091 | \n",
310 | " 14900.714286 | \n",
311 | " 16 | \n",
312 | " 81654 | \n",
313 | " 72 | \n",
314 | " 0 | \n",
315 | " 0 | \n",
316 | "
\n",
317 | " \n",
318 | " 138043 | \n",
319 | " VirusShare_260d9e2258aed4c8a3bbd703ec895822 | \n",
320 | " 260d9e2258aed4c8a3bbd703ec895822 | \n",
321 | " 332 | \n",
322 | " 224 | \n",
323 | " 33167 | \n",
324 | " 2 | \n",
325 | " 25 | \n",
326 | " 37888 | \n",
327 | " 185344 | \n",
328 | " 0 | \n",
329 | " ... | \n",
330 | " 26 | \n",
331 | " 3.377663 | \n",
332 | " 2.031619 | \n",
333 | " 5.050074 | \n",
334 | " 6905.846154 | \n",
335 | " 44 | \n",
336 | " 67624 | \n",
337 | " 0 | \n",
338 | " 15 | \n",
339 | " 0 | \n",
340 | "
\n",
341 | " \n",
342 | " 138044 | \n",
343 | " VirusShare_8d088a51b7d225c9f5d11d239791ec3f | \n",
344 | " 8d088a51b7d225c9f5d11d239791ec3f | \n",
345 | " 332 | \n",
346 | " 224 | \n",
347 | " 258 | \n",
348 | " 10 | \n",
349 | " 0 | \n",
350 | " 118272 | \n",
351 | " 380416 | \n",
352 | " 0 | \n",
353 | " ... | \n",
354 | " 22 | \n",
355 | " 6.825406 | \n",
356 | " 2.617026 | \n",
357 | " 7.990487 | \n",
358 | " 14981.909091 | \n",
359 | " 48 | \n",
360 | " 22648 | \n",
361 | " 72 | \n",
362 | " 14 | \n",
363 | " 0 | \n",
364 | "
\n",
365 | " \n",
366 | " 138045 | \n",
367 | " VirusShare_4286dccf67ca220fe67635388229a9f3 | \n",
368 | " 4286dccf67ca220fe67635388229a9f3 | \n",
369 | " 332 | \n",
370 | " 224 | \n",
371 | " 33166 | \n",
372 | " 2 | \n",
373 | " 25 | \n",
374 | " 49152 | \n",
375 | " 16896 | \n",
376 | " 0 | \n",
377 | " ... | \n",
378 | " 10 | \n",
379 | " 3.421627 | \n",
380 | " 2.060964 | \n",
381 | " 4.739744 | \n",
382 | " 601.600000 | \n",
383 | " 16 | \n",
384 | " 2216 | \n",
385 | " 0 | \n",
386 | " 0 | \n",
387 | " 0 | \n",
388 | "
\n",
389 | " \n",
390 | " 138046 | \n",
391 | " VirusShare_d7648eae45f09b3adb75127f43be6d11 | \n",
392 | " d7648eae45f09b3adb75127f43be6d11 | \n",
393 | " 332 | \n",
394 | " 224 | \n",
395 | " 258 | \n",
396 | " 11 | \n",
397 | " 0 | \n",
398 | " 111616 | \n",
399 | " 468480 | \n",
400 | " 0 | \n",
401 | " ... | \n",
402 | " 4 | \n",
403 | " 4.407252 | \n",
404 | " 1.980482 | \n",
405 | " 6.115374 | \n",
406 | " 96625.000000 | \n",
407 | " 20 | \n",
408 | " 318464 | \n",
409 | " 72 | \n",
410 | " 0 | \n",
411 | " 0 | \n",
412 | "
\n",
413 | " \n",
414 | "
\n",
415 | "
138047 rows × 57 columns
\n",
416 | "
"
417 | ],
418 | "text/plain": [
419 | " Name \\\n",
420 | "0 memtest.exe \n",
421 | "1 ose.exe \n",
422 | "2 setup.exe \n",
423 | "3 DW20.EXE \n",
424 | "4 dwtrig20.exe \n",
425 | "... ... \n",
426 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n",
427 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n",
428 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n",
429 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n",
430 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n",
431 | "\n",
432 | " md5 Machine SizeOfOptionalHeader \\\n",
433 | "0 631ea355665f28d4707448e442fbf5b8 332 224 \n",
434 | "1 9d10f99a6712e28f8acd5641e3a7ea6b 332 224 \n",
435 | "2 4d92f518527353c0db88a70fddcfd390 332 224 \n",
436 | "3 a41e524f8d45f0074fd07805ff0c9b12 332 224 \n",
437 | "4 c87e561258f2f8650cef999bf643a731 332 224 \n",
438 | "... ... ... ... \n",
439 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n",
440 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n",
441 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n",
442 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n",
443 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n",
444 | "\n",
445 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
446 | "0 258 9 0 361984 \n",
447 | "1 3330 9 0 130560 \n",
448 | "2 3330 9 0 517120 \n",
449 | "3 258 9 0 585728 \n",
450 | "4 258 9 0 294912 \n",
451 | "... ... ... ... ... \n",
452 | "138042 258 11 0 205824 \n",
453 | "138043 33167 2 25 37888 \n",
454 | "138044 258 10 0 118272 \n",
455 | "138045 33166 2 25 49152 \n",
456 | "138046 258 11 0 111616 \n",
457 | "\n",
458 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
459 | "0 115712 0 ... 4 \n",
460 | "1 19968 0 ... 2 \n",
461 | "2 621568 0 ... 11 \n",
462 | "3 369152 0 ... 10 \n",
463 | "4 247296 0 ... 2 \n",
464 | "... ... ... ... ... \n",
465 | "138042 223744 0 ... 7 \n",
466 | "138043 185344 0 ... 26 \n",
467 | "138044 380416 0 ... 22 \n",
468 | "138045 16896 0 ... 10 \n",
469 | "138046 468480 0 ... 4 \n",
470 | "\n",
471 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
472 | "0 3.262823 2.568844 3.537939 \n",
473 | "1 4.250461 3.420744 5.080177 \n",
474 | "2 4.426324 2.846449 5.271813 \n",
475 | "3 4.364291 2.669314 6.400720 \n",
476 | "4 4.306100 3.421598 5.190603 \n",
477 | "... ... ... ... \n",
478 | "138042 4.122736 1.370260 7.677091 \n",
479 | "138043 3.377663 2.031619 5.050074 \n",
480 | "138044 6.825406 2.617026 7.990487 \n",
481 | "138045 3.421627 2.060964 4.739744 \n",
482 | "138046 4.407252 1.980482 6.115374 \n",
483 | "\n",
484 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
485 | "0 8797.000000 216 18032 \n",
486 | "1 837.000000 518 1156 \n",
487 | "2 31102.272727 104 270376 \n",
488 | "3 1457.000000 90 4264 \n",
489 | "4 1074.500000 849 1300 \n",
490 | "... ... ... ... \n",
491 | "138042 14900.714286 16 81654 \n",
492 | "138043 6905.846154 44 67624 \n",
493 | "138044 14981.909091 48 22648 \n",
494 | "138045 601.600000 16 2216 \n",
495 | "138046 96625.000000 20 318464 \n",
496 | "\n",
497 | " LoadConfigurationSize VersionInformationSize legitimate \n",
498 | "0 0 16 1 \n",
499 | "1 72 18 1 \n",
500 | "2 72 18 1 \n",
501 | "3 72 18 1 \n",
502 | "4 72 18 1 \n",
503 | "... ... ... ... \n",
504 | "138042 72 0 0 \n",
505 | "138043 0 15 0 \n",
506 | "138044 72 14 0 \n",
507 | "138045 0 0 0 \n",
508 | "138046 72 0 0 \n",
509 | "\n",
510 | "[138047 rows x 57 columns]"
511 | ]
512 | },
513 | "execution_count": 4,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "dataset"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 5,
525 | "id": "9e106963",
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/html": [
531 | "\n",
532 | "\n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " | \n",
549 | " Machine | \n",
550 | " SizeOfOptionalHeader | \n",
551 | " Characteristics | \n",
552 | " MajorLinkerVersion | \n",
553 | " MinorLinkerVersion | \n",
554 | " SizeOfCode | \n",
555 | " SizeOfInitializedData | \n",
556 | " SizeOfUninitializedData | \n",
557 | " AddressOfEntryPoint | \n",
558 | " BaseOfCode | \n",
559 | " ... | \n",
560 | " ResourcesNb | \n",
561 | " ResourcesMeanEntropy | \n",
562 | " ResourcesMinEntropy | \n",
563 | " ResourcesMaxEntropy | \n",
564 | " ResourcesMeanSize | \n",
565 | " ResourcesMinSize | \n",
566 | " ResourcesMaxSize | \n",
567 | " LoadConfigurationSize | \n",
568 | " VersionInformationSize | \n",
569 | " legitimate | \n",
570 | "
\n",
571 | " \n",
572 | " \n",
573 | " \n",
574 | " count | \n",
575 | " 138047.000000 | \n",
576 | " 138047.000000 | \n",
577 | " 138047.000000 | \n",
578 | " 138047.000000 | \n",
579 | " 138047.000000 | \n",
580 | " 1.380470e+05 | \n",
581 | " 1.380470e+05 | \n",
582 | " 1.380470e+05 | \n",
583 | " 1.380470e+05 | \n",
584 | " 1.380470e+05 | \n",
585 | " ... | \n",
586 | " 138047.000000 | \n",
587 | " 138047.000000 | \n",
588 | " 138047.000000 | \n",
589 | " 138047.000000 | \n",
590 | " 1.380470e+05 | \n",
591 | " 1.380470e+05 | \n",
592 | " 1.380470e+05 | \n",
593 | " 1.380470e+05 | \n",
594 | " 138047.000000 | \n",
595 | " 138047.000000 | \n",
596 | "
\n",
597 | " \n",
598 | " mean | \n",
599 | " 4259.069274 | \n",
600 | " 225.845632 | \n",
601 | " 4444.145994 | \n",
602 | " 8.619774 | \n",
603 | " 3.819286 | \n",
604 | " 2.425956e+05 | \n",
605 | " 4.504867e+05 | \n",
606 | " 1.009525e+05 | \n",
607 | " 1.719561e+05 | \n",
608 | " 5.779845e+04 | \n",
609 | " ... | \n",
610 | " 22.050700 | \n",
611 | " 4.000127 | \n",
612 | " 2.434541 | \n",
613 | " 5.521610 | \n",
614 | " 5.545093e+04 | \n",
615 | " 1.818082e+04 | \n",
616 | " 2.465903e+05 | \n",
617 | " 4.656750e+05 | \n",
618 | " 12.363115 | \n",
619 | " 0.299340 | \n",
620 | "
\n",
621 | " \n",
622 | " std | \n",
623 | " 10880.347245 | \n",
624 | " 5.121399 | \n",
625 | " 8186.782524 | \n",
626 | " 4.088757 | \n",
627 | " 11.862675 | \n",
628 | " 5.754485e+06 | \n",
629 | " 2.101599e+07 | \n",
630 | " 1.635288e+07 | \n",
631 | " 3.430553e+06 | \n",
632 | " 5.527658e+06 | \n",
633 | " ... | \n",
634 | " 136.494244 | \n",
635 | " 1.112981 | \n",
636 | " 0.815577 | \n",
637 | " 1.597403 | \n",
638 | " 7.799163e+06 | \n",
639 | " 6.502369e+06 | \n",
640 | " 2.124860e+07 | \n",
641 | " 2.608987e+07 | \n",
642 | " 6.798878 | \n",
643 | " 0.457971 | \n",
644 | "
\n",
645 | " \n",
646 | " min | \n",
647 | " 332.000000 | \n",
648 | " 224.000000 | \n",
649 | " 2.000000 | \n",
650 | " 0.000000 | \n",
651 | " 0.000000 | \n",
652 | " 0.000000e+00 | \n",
653 | " 0.000000e+00 | \n",
654 | " 0.000000e+00 | \n",
655 | " 0.000000e+00 | \n",
656 | " 0.000000e+00 | \n",
657 | " ... | \n",
658 | " 0.000000 | \n",
659 | " 0.000000 | \n",
660 | " 0.000000 | \n",
661 | " 0.000000 | \n",
662 | " 0.000000e+00 | \n",
663 | " 0.000000e+00 | \n",
664 | " 0.000000e+00 | \n",
665 | " 0.000000e+00 | \n",
666 | " 0.000000 | \n",
667 | " 0.000000 | \n",
668 | "
\n",
669 | " \n",
670 | " 25% | \n",
671 | " 332.000000 | \n",
672 | " 224.000000 | \n",
673 | " 258.000000 | \n",
674 | " 8.000000 | \n",
675 | " 0.000000 | \n",
676 | " 3.020800e+04 | \n",
677 | " 2.457600e+04 | \n",
678 | " 0.000000e+00 | \n",
679 | " 1.272100e+04 | \n",
680 | " 4.096000e+03 | \n",
681 | " ... | \n",
682 | " 5.000000 | \n",
683 | " 3.458505 | \n",
684 | " 2.178748 | \n",
685 | " 4.828706 | \n",
686 | " 9.560000e+02 | \n",
687 | " 4.800000e+01 | \n",
688 | " 2.216000e+03 | \n",
689 | " 0.000000e+00 | \n",
690 | " 13.000000 | \n",
691 | " 0.000000 | \n",
692 | "
\n",
693 | " \n",
694 | " 50% | \n",
695 | " 332.000000 | \n",
696 | " 224.000000 | \n",
697 | " 258.000000 | \n",
698 | " 9.000000 | \n",
699 | " 0.000000 | \n",
700 | " 1.136640e+05 | \n",
701 | " 2.631680e+05 | \n",
702 | " 0.000000e+00 | \n",
703 | " 5.288300e+04 | \n",
704 | " 4.096000e+03 | \n",
705 | " ... | \n",
706 | " 6.000000 | \n",
707 | " 3.729824 | \n",
708 | " 2.458492 | \n",
709 | " 5.317552 | \n",
710 | " 2.708154e+03 | \n",
711 | " 4.800000e+01 | \n",
712 | " 9.640000e+03 | \n",
713 | " 7.200000e+01 | \n",
714 | " 15.000000 | \n",
715 | " 0.000000 | \n",
716 | "
\n",
717 | " \n",
718 | " 75% | \n",
719 | " 332.000000 | \n",
720 | " 224.000000 | \n",
721 | " 8226.000000 | \n",
722 | " 10.000000 | \n",
723 | " 0.000000 | \n",
724 | " 1.203200e+05 | \n",
725 | " 3.850240e+05 | \n",
726 | " 0.000000e+00 | \n",
727 | " 6.157800e+04 | \n",
728 | " 4.096000e+03 | \n",
729 | " ... | \n",
730 | " 13.000000 | \n",
731 | " 4.233051 | \n",
732 | " 2.696833 | \n",
733 | " 6.502239 | \n",
734 | " 6.558429e+03 | \n",
735 | " 1.320000e+02 | \n",
736 | " 2.378000e+04 | \n",
737 | " 7.200000e+01 | \n",
738 | " 16.000000 | \n",
739 | " 1.000000 | \n",
740 | "
\n",
741 | " \n",
742 | " max | \n",
743 | " 34404.000000 | \n",
744 | " 352.000000 | \n",
745 | " 49551.000000 | \n",
746 | " 255.000000 | \n",
747 | " 255.000000 | \n",
748 | " 1.818587e+09 | \n",
749 | " 4.294966e+09 | \n",
750 | " 4.294941e+09 | \n",
751 | " 1.074484e+09 | \n",
752 | " 2.028711e+09 | \n",
753 | " ... | \n",
754 | " 7694.000000 | \n",
755 | " 7.999723 | \n",
756 | " 7.999723 | \n",
757 | " 8.000000 | \n",
758 | " 2.415919e+09 | \n",
759 | " 2.415919e+09 | \n",
760 | " 4.294903e+09 | \n",
761 | " 4.294967e+09 | \n",
762 | " 26.000000 | \n",
763 | " 1.000000 | \n",
764 | "
\n",
765 | " \n",
766 | "
\n",
767 | "
8 rows × 55 columns
\n",
768 | "
"
769 | ],
770 | "text/plain": [
771 | " Machine SizeOfOptionalHeader Characteristics \\\n",
772 | "count 138047.000000 138047.000000 138047.000000 \n",
773 | "mean 4259.069274 225.845632 4444.145994 \n",
774 | "std 10880.347245 5.121399 8186.782524 \n",
775 | "min 332.000000 224.000000 2.000000 \n",
776 | "25% 332.000000 224.000000 258.000000 \n",
777 | "50% 332.000000 224.000000 258.000000 \n",
778 | "75% 332.000000 224.000000 8226.000000 \n",
779 | "max 34404.000000 352.000000 49551.000000 \n",
780 | "\n",
781 | " MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
782 | "count 138047.000000 138047.000000 1.380470e+05 \n",
783 | "mean 8.619774 3.819286 2.425956e+05 \n",
784 | "std 4.088757 11.862675 5.754485e+06 \n",
785 | "min 0.000000 0.000000 0.000000e+00 \n",
786 | "25% 8.000000 0.000000 3.020800e+04 \n",
787 | "50% 9.000000 0.000000 1.136640e+05 \n",
788 | "75% 10.000000 0.000000 1.203200e+05 \n",
789 | "max 255.000000 255.000000 1.818587e+09 \n",
790 | "\n",
791 | " SizeOfInitializedData SizeOfUninitializedData AddressOfEntryPoint \\\n",
792 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n",
793 | "mean 4.504867e+05 1.009525e+05 1.719561e+05 \n",
794 | "std 2.101599e+07 1.635288e+07 3.430553e+06 \n",
795 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
796 | "25% 2.457600e+04 0.000000e+00 1.272100e+04 \n",
797 | "50% 2.631680e+05 0.000000e+00 5.288300e+04 \n",
798 | "75% 3.850240e+05 0.000000e+00 6.157800e+04 \n",
799 | "max 4.294966e+09 4.294941e+09 1.074484e+09 \n",
800 | "\n",
801 | " BaseOfCode ... ResourcesNb ResourcesMeanEntropy \\\n",
802 | "count 1.380470e+05 ... 138047.000000 138047.000000 \n",
803 | "mean 5.779845e+04 ... 22.050700 4.000127 \n",
804 | "std 5.527658e+06 ... 136.494244 1.112981 \n",
805 | "min 0.000000e+00 ... 0.000000 0.000000 \n",
806 | "25% 4.096000e+03 ... 5.000000 3.458505 \n",
807 | "50% 4.096000e+03 ... 6.000000 3.729824 \n",
808 | "75% 4.096000e+03 ... 13.000000 4.233051 \n",
809 | "max 2.028711e+09 ... 7694.000000 7.999723 \n",
810 | "\n",
811 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n",
812 | "count 138047.000000 138047.000000 1.380470e+05 \n",
813 | "mean 2.434541 5.521610 5.545093e+04 \n",
814 | "std 0.815577 1.597403 7.799163e+06 \n",
815 | "min 0.000000 0.000000 0.000000e+00 \n",
816 | "25% 2.178748 4.828706 9.560000e+02 \n",
817 | "50% 2.458492 5.317552 2.708154e+03 \n",
818 | "75% 2.696833 6.502239 6.558429e+03 \n",
819 | "max 7.999723 8.000000 2.415919e+09 \n",
820 | "\n",
821 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n",
822 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n",
823 | "mean 1.818082e+04 2.465903e+05 4.656750e+05 \n",
824 | "std 6.502369e+06 2.124860e+07 2.608987e+07 \n",
825 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
826 | "25% 4.800000e+01 2.216000e+03 0.000000e+00 \n",
827 | "50% 4.800000e+01 9.640000e+03 7.200000e+01 \n",
828 | "75% 1.320000e+02 2.378000e+04 7.200000e+01 \n",
829 | "max 2.415919e+09 4.294903e+09 4.294967e+09 \n",
830 | "\n",
831 | " VersionInformationSize legitimate \n",
832 | "count 138047.000000 138047.000000 \n",
833 | "mean 12.363115 0.299340 \n",
834 | "std 6.798878 0.457971 \n",
835 | "min 0.000000 0.000000 \n",
836 | "25% 13.000000 0.000000 \n",
837 | "50% 15.000000 0.000000 \n",
838 | "75% 16.000000 1.000000 \n",
839 | "max 26.000000 1.000000 \n",
840 | "\n",
841 | "[8 rows x 55 columns]"
842 | ]
843 | },
844 | "execution_count": 5,
845 | "metadata": {},
846 | "output_type": "execute_result"
847 | }
848 | ],
849 | "source": [
850 | "dataset.describe()"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 6,
856 | "id": "f3db099c",
857 | "metadata": {},
858 | "outputs": [
859 | {
860 | "data": {
861 | "text/plain": [
862 | "Name 0\n",
863 | "md5 0\n",
864 | "Machine 0\n",
865 | "SizeOfOptionalHeader 0\n",
866 | "Characteristics 0\n",
867 | "MajorLinkerVersion 0\n",
868 | "MinorLinkerVersion 0\n",
869 | "SizeOfCode 0\n",
870 | "SizeOfInitializedData 0\n",
871 | "SizeOfUninitializedData 0\n",
872 | "AddressOfEntryPoint 0\n",
873 | "BaseOfCode 0\n",
874 | "BaseOfData 0\n",
875 | "ImageBase 0\n",
876 | "SectionAlignment 0\n",
877 | "FileAlignment 0\n",
878 | "MajorOperatingSystemVersion 0\n",
879 | "MinorOperatingSystemVersion 0\n",
880 | "MajorImageVersion 0\n",
881 | "MinorImageVersion 0\n",
882 | "MajorSubsystemVersion 0\n",
883 | "MinorSubsystemVersion 0\n",
884 | "SizeOfImage 0\n",
885 | "SizeOfHeaders 0\n",
886 | "CheckSum 0\n",
887 | "Subsystem 0\n",
888 | "DllCharacteristics 0\n",
889 | "SizeOfStackReserve 0\n",
890 | "SizeOfStackCommit 0\n",
891 | "SizeOfHeapReserve 0\n",
892 | "SizeOfHeapCommit 0\n",
893 | "LoaderFlags 0\n",
894 | "NumberOfRvaAndSizes 0\n",
895 | "SectionsNb 0\n",
896 | "SectionsMeanEntropy 0\n",
897 | "SectionsMinEntropy 0\n",
898 | "SectionsMaxEntropy 0\n",
899 | "SectionsMeanRawsize 0\n",
900 | "SectionsMinRawsize 0\n",
901 | "SectionMaxRawsize 0\n",
902 | "SectionsMeanVirtualsize 0\n",
903 | "SectionsMinVirtualsize 0\n",
904 | "SectionMaxVirtualsize 0\n",
905 | "ImportsNbDLL 0\n",
906 | "ImportsNb 0\n",
907 | "ImportsNbOrdinal 0\n",
908 | "ExportNb 0\n",
909 | "ResourcesNb 0\n",
910 | "ResourcesMeanEntropy 0\n",
911 | "ResourcesMinEntropy 0\n",
912 | "ResourcesMaxEntropy 0\n",
913 | "ResourcesMeanSize 0\n",
914 | "ResourcesMinSize 0\n",
915 | "ResourcesMaxSize 0\n",
916 | "LoadConfigurationSize 0\n",
917 | "VersionInformationSize 0\n",
918 | "legitimate 0\n",
919 | "dtype: int64"
920 | ]
921 | },
922 | "execution_count": 6,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "dataset.isnull().sum()"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": 7,
934 | "id": "48a57329",
935 | "metadata": {},
936 | "outputs": [],
937 | "source": [
938 | "#Classifying Data Based on - Legitimate OR Malware"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": 8,
944 | "id": "52e76632",
945 | "metadata": {},
946 | "outputs": [
947 | {
948 | "data": {
949 | "text/plain": [
950 | "legitimate\n",
951 | "0 96724\n",
952 | "1 41323\n",
953 | "dtype: int64"
954 | ]
955 | },
956 | "execution_count": 8,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "dataset.groupby(dataset['legitimate']).size()\n",
963 | "#1 means legitimate, 0 means malware"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 9,
969 | "id": "77eefc0b",
970 | "metadata": {},
971 | "outputs": [
972 | {
973 | "data": {
974 | "text/plain": [
975 | "([,\n",
976 | " ],\n",
977 | " [Text(0.6484073958497663, 0.8885763045497695, 'Legitimate'),\n",
978 | " Text(-0.6484073958497659, -0.8885763045497698, 'Malware')],\n",
979 | " [Text(0.35367676137259974, 0.4846779842998742, '30%'),\n",
980 | " Text(-0.35367676137259957, -0.48467798429987435, '70%')])"
981 | ]
982 | },
983 | "execution_count": 9,
984 | "metadata": {},
985 | "output_type": "execute_result"
986 | },
987 | {
988 | "data": {
989 | "image/png": "\n",
990 | "text/plain": [
991 | ""
992 | ]
993 | },
994 | "metadata": {},
995 | "output_type": "display_data"
996 | }
997 | ],
998 | "source": [
999 | "type_classify=['Legitimate','Malware']\n",
1000 | "count_classify=[41323,96724]\n",
1001 | "plt.pie(count_classify, labels=type_classify, autopct='%0.f%%')"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": 10,
1007 | "id": "4f846ba0",
1008 | "metadata": {},
1009 | "outputs": [],
1010 | "source": [
1011 | "# Total Number of Columns in Dataset"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": 11,
1017 | "id": "cb1a7785",
1018 | "metadata": {},
1019 | "outputs": [
1020 | {
1021 | "data": {
1022 | "text/plain": [
1023 | "57"
1024 | ]
1025 | },
1026 | "execution_count": 11,
1027 | "metadata": {},
1028 | "output_type": "execute_result"
1029 | }
1030 | ],
1031 | "source": [
1032 | "dataset.shape[1]"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 12,
1038 | "id": "7c62eedf",
1039 | "metadata": {},
1040 | "outputs": [],
1041 | "source": [
1042 | "# Creating Legitimate and Malware Dataset from Main Dataset"
1043 | ]
1044 | },
1045 | {
1046 | "cell_type": "code",
1047 | "execution_count": 13,
1048 | "id": "bda12c84",
1049 | "metadata": {},
1050 | "outputs": [
1051 | {
1052 | "data": {
1053 | "text/html": [
1054 | "\n",
1055 | "\n",
1068 | "
\n",
1069 | " \n",
1070 | " \n",
1071 | " | \n",
1072 | " Name | \n",
1073 | " md5 | \n",
1074 | " Machine | \n",
1075 | " SizeOfOptionalHeader | \n",
1076 | " Characteristics | \n",
1077 | " MajorLinkerVersion | \n",
1078 | " MinorLinkerVersion | \n",
1079 | " SizeOfCode | \n",
1080 | " SizeOfInitializedData | \n",
1081 | " SizeOfUninitializedData | \n",
1082 | " ... | \n",
1083 | " ExportNb | \n",
1084 | " ResourcesNb | \n",
1085 | " ResourcesMeanEntropy | \n",
1086 | " ResourcesMinEntropy | \n",
1087 | " ResourcesMaxEntropy | \n",
1088 | " ResourcesMeanSize | \n",
1089 | " ResourcesMinSize | \n",
1090 | " ResourcesMaxSize | \n",
1091 | " LoadConfigurationSize | \n",
1092 | " VersionInformationSize | \n",
1093 | "
\n",
1094 | " \n",
1095 | " \n",
1096 | " \n",
1097 | " 0 | \n",
1098 | " memtest.exe | \n",
1099 | " 631ea355665f28d4707448e442fbf5b8 | \n",
1100 | " 332 | \n",
1101 | " 224 | \n",
1102 | " 258 | \n",
1103 | " 9 | \n",
1104 | " 0 | \n",
1105 | " 361984 | \n",
1106 | " 115712 | \n",
1107 | " 0 | \n",
1108 | " ... | \n",
1109 | " 0 | \n",
1110 | " 4 | \n",
1111 | " 3.262823 | \n",
1112 | " 2.568844 | \n",
1113 | " 3.537939 | \n",
1114 | " 8797.000000 | \n",
1115 | " 216 | \n",
1116 | " 18032 | \n",
1117 | " 0 | \n",
1118 | " 16 | \n",
1119 | "
\n",
1120 | " \n",
1121 | " 1 | \n",
1122 | " ose.exe | \n",
1123 | " 9d10f99a6712e28f8acd5641e3a7ea6b | \n",
1124 | " 332 | \n",
1125 | " 224 | \n",
1126 | " 3330 | \n",
1127 | " 9 | \n",
1128 | " 0 | \n",
1129 | " 130560 | \n",
1130 | " 19968 | \n",
1131 | " 0 | \n",
1132 | " ... | \n",
1133 | " 0 | \n",
1134 | " 2 | \n",
1135 | " 4.250461 | \n",
1136 | " 3.420744 | \n",
1137 | " 5.080177 | \n",
1138 | " 837.000000 | \n",
1139 | " 518 | \n",
1140 | " 1156 | \n",
1141 | " 72 | \n",
1142 | " 18 | \n",
1143 | "
\n",
1144 | " \n",
1145 | " 2 | \n",
1146 | " setup.exe | \n",
1147 | " 4d92f518527353c0db88a70fddcfd390 | \n",
1148 | " 332 | \n",
1149 | " 224 | \n",
1150 | " 3330 | \n",
1151 | " 9 | \n",
1152 | " 0 | \n",
1153 | " 517120 | \n",
1154 | " 621568 | \n",
1155 | " 0 | \n",
1156 | " ... | \n",
1157 | " 1 | \n",
1158 | " 11 | \n",
1159 | " 4.426324 | \n",
1160 | " 2.846449 | \n",
1161 | " 5.271813 | \n",
1162 | " 31102.272727 | \n",
1163 | " 104 | \n",
1164 | " 270376 | \n",
1165 | " 72 | \n",
1166 | " 18 | \n",
1167 | "
\n",
1168 | " \n",
1169 | " 3 | \n",
1170 | " DW20.EXE | \n",
1171 | " a41e524f8d45f0074fd07805ff0c9b12 | \n",
1172 | " 332 | \n",
1173 | " 224 | \n",
1174 | " 258 | \n",
1175 | " 9 | \n",
1176 | " 0 | \n",
1177 | " 585728 | \n",
1178 | " 369152 | \n",
1179 | " 0 | \n",
1180 | " ... | \n",
1181 | " 1 | \n",
1182 | " 10 | \n",
1183 | " 4.364291 | \n",
1184 | " 2.669314 | \n",
1185 | " 6.400720 | \n",
1186 | " 1457.000000 | \n",
1187 | " 90 | \n",
1188 | " 4264 | \n",
1189 | " 72 | \n",
1190 | " 18 | \n",
1191 | "
\n",
1192 | " \n",
1193 | " 4 | \n",
1194 | " dwtrig20.exe | \n",
1195 | " c87e561258f2f8650cef999bf643a731 | \n",
1196 | " 332 | \n",
1197 | " 224 | \n",
1198 | " 258 | \n",
1199 | " 9 | \n",
1200 | " 0 | \n",
1201 | " 294912 | \n",
1202 | " 247296 | \n",
1203 | " 0 | \n",
1204 | " ... | \n",
1205 | " 1 | \n",
1206 | " 2 | \n",
1207 | " 4.306100 | \n",
1208 | " 3.421598 | \n",
1209 | " 5.190603 | \n",
1210 | " 1074.500000 | \n",
1211 | " 849 | \n",
1212 | " 1300 | \n",
1213 | " 72 | \n",
1214 | " 18 | \n",
1215 | "
\n",
1216 | " \n",
1217 | " ... | \n",
1218 | " ... | \n",
1219 | " ... | \n",
1220 | " ... | \n",
1221 | " ... | \n",
1222 | " ... | \n",
1223 | " ... | \n",
1224 | " ... | \n",
1225 | " ... | \n",
1226 | " ... | \n",
1227 | " ... | \n",
1228 | " ... | \n",
1229 | " ... | \n",
1230 | " ... | \n",
1231 | " ... | \n",
1232 | " ... | \n",
1233 | " ... | \n",
1234 | " ... | \n",
1235 | " ... | \n",
1236 | " ... | \n",
1237 | " ... | \n",
1238 | " ... | \n",
1239 | "
\n",
1240 | " \n",
1241 | " 41318 | \n",
1242 | " mfc80.dll | \n",
1243 | " 1f5afd468eb5e09e9ed75a087529eab5 | \n",
1244 | " 332 | \n",
1245 | " 224 | \n",
1246 | " 8450 | \n",
1247 | " 8 | \n",
1248 | " 0 | \n",
1249 | " 946176 | \n",
1250 | " 159744 | \n",
1251 | " 0 | \n",
1252 | " ... | \n",
1253 | " 0 | \n",
1254 | " 123 | \n",
1255 | " 2.607251 | \n",
1256 | " 0.960953 | \n",
1257 | " 5.130762 | \n",
1258 | " 327.170732 | \n",
1259 | " 20 | \n",
1260 | " 1592 | \n",
1261 | " 72 | \n",
1262 | " 16 | \n",
1263 | "
\n",
1264 | " \n",
1265 | " 41319 | \n",
1266 | " mfc80u.dll | \n",
1267 | " e2c48cd0132d4d1dc7d0df9a6bef686a | \n",
1268 | " 332 | \n",
1269 | " 224 | \n",
1270 | " 8450 | \n",
1271 | " 8 | \n",
1272 | " 0 | \n",
1273 | " 946176 | \n",
1274 | " 154624 | \n",
1275 | " 0 | \n",
1276 | " ... | \n",
1277 | " 0 | \n",
1278 | " 123 | \n",
1279 | " 2.607232 | \n",
1280 | " 0.960953 | \n",
1281 | " 5.130762 | \n",
1282 | " 327.235772 | \n",
1283 | " 20 | \n",
1284 | " 1592 | \n",
1285 | " 72 | \n",
1286 | " 16 | \n",
1287 | "
\n",
1288 | " \n",
1289 | " 41320 | \n",
1290 | " mfcm80.dll | \n",
1291 | " 83362ee950ad18adb85b54409155c378 | \n",
1292 | " 332 | \n",
1293 | " 224 | \n",
1294 | " 8450 | \n",
1295 | " 8 | \n",
1296 | " 0 | \n",
1297 | " 53248 | \n",
1298 | " 16384 | \n",
1299 | " 0 | \n",
1300 | " ... | \n",
1301 | " 25 | \n",
1302 | " 1 | \n",
1303 | " 3.524268 | \n",
1304 | " 3.524268 | \n",
1305 | " 3.524268 | \n",
1306 | " 892.000000 | \n",
1307 | " 892 | \n",
1308 | " 892 | \n",
1309 | " 72 | \n",
1310 | " 16 | \n",
1311 | "
\n",
1312 | " \n",
1313 | " 41321 | \n",
1314 | " mfcm80u.dll | \n",
1315 | " 26aafee5c30020c99120ee113d751f7e | \n",
1316 | " 332 | \n",
1317 | " 224 | \n",
1318 | " 8450 | \n",
1319 | " 8 | \n",
1320 | " 0 | \n",
1321 | " 52736 | \n",
1322 | " 11264 | \n",
1323 | " 0 | \n",
1324 | " ... | \n",
1325 | " 25 | \n",
1326 | " 1 | \n",
1327 | " 3.542071 | \n",
1328 | " 3.542071 | \n",
1329 | " 3.542071 | \n",
1330 | " 892.000000 | \n",
1331 | " 892 | \n",
1332 | " 892 | \n",
1333 | " 72 | \n",
1334 | " 16 | \n",
1335 | "
\n",
1336 | " \n",
1337 | " 41322 | \n",
1338 | " vcomp.dll | \n",
1339 | " 73dbaa64d589f3262615550dd6881fee | \n",
1340 | " 332 | \n",
1341 | " 224 | \n",
1342 | " 8450 | \n",
1343 | " 8 | \n",
1344 | " 0 | \n",
1345 | " 40960 | \n",
1346 | " 20480 | \n",
1347 | " 0 | \n",
1348 | " ... | \n",
1349 | " 112 | \n",
1350 | " 6 | \n",
1351 | " 3.004383 | \n",
1352 | " 2.406512 | \n",
1353 | " 3.592623 | \n",
1354 | " 610.333333 | \n",
1355 | " 124 | \n",
1356 | " 1412 | \n",
1357 | " 72 | \n",
1358 | " 16 | \n",
1359 | "
\n",
1360 | " \n",
1361 | "
\n",
1362 | "
41323 rows × 56 columns
\n",
1363 | "
"
1364 | ],
1365 | "text/plain": [
1366 | " Name md5 Machine \\\n",
1367 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n",
1368 | "1 ose.exe 9d10f99a6712e28f8acd5641e3a7ea6b 332 \n",
1369 | "2 setup.exe 4d92f518527353c0db88a70fddcfd390 332 \n",
1370 | "3 DW20.EXE a41e524f8d45f0074fd07805ff0c9b12 332 \n",
1371 | "4 dwtrig20.exe c87e561258f2f8650cef999bf643a731 332 \n",
1372 | "... ... ... ... \n",
1373 | "41318 mfc80.dll 1f5afd468eb5e09e9ed75a087529eab5 332 \n",
1374 | "41319 mfc80u.dll e2c48cd0132d4d1dc7d0df9a6bef686a 332 \n",
1375 | "41320 mfcm80.dll 83362ee950ad18adb85b54409155c378 332 \n",
1376 | "41321 mfcm80u.dll 26aafee5c30020c99120ee113d751f7e 332 \n",
1377 | "41322 vcomp.dll 73dbaa64d589f3262615550dd6881fee 332 \n",
1378 | "\n",
1379 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
1380 | "0 224 258 9 \n",
1381 | "1 224 3330 9 \n",
1382 | "2 224 3330 9 \n",
1383 | "3 224 258 9 \n",
1384 | "4 224 258 9 \n",
1385 | "... ... ... ... \n",
1386 | "41318 224 8450 8 \n",
1387 | "41319 224 8450 8 \n",
1388 | "41320 224 8450 8 \n",
1389 | "41321 224 8450 8 \n",
1390 | "41322 224 8450 8 \n",
1391 | "\n",
1392 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
1393 | "0 0 361984 115712 \n",
1394 | "1 0 130560 19968 \n",
1395 | "2 0 517120 621568 \n",
1396 | "3 0 585728 369152 \n",
1397 | "4 0 294912 247296 \n",
1398 | "... ... ... ... \n",
1399 | "41318 0 946176 159744 \n",
1400 | "41319 0 946176 154624 \n",
1401 | "41320 0 53248 16384 \n",
1402 | "41321 0 52736 11264 \n",
1403 | "41322 0 40960 20480 \n",
1404 | "\n",
1405 | " SizeOfUninitializedData ... ExportNb ResourcesNb \\\n",
1406 | "0 0 ... 0 4 \n",
1407 | "1 0 ... 0 2 \n",
1408 | "2 0 ... 1 11 \n",
1409 | "3 0 ... 1 10 \n",
1410 | "4 0 ... 1 2 \n",
1411 | "... ... ... ... ... \n",
1412 | "41318 0 ... 0 123 \n",
1413 | "41319 0 ... 0 123 \n",
1414 | "41320 0 ... 25 1 \n",
1415 | "41321 0 ... 25 1 \n",
1416 | "41322 0 ... 112 6 \n",
1417 | "\n",
1418 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
1419 | "0 3.262823 2.568844 3.537939 \n",
1420 | "1 4.250461 3.420744 5.080177 \n",
1421 | "2 4.426324 2.846449 5.271813 \n",
1422 | "3 4.364291 2.669314 6.400720 \n",
1423 | "4 4.306100 3.421598 5.190603 \n",
1424 | "... ... ... ... \n",
1425 | "41318 2.607251 0.960953 5.130762 \n",
1426 | "41319 2.607232 0.960953 5.130762 \n",
1427 | "41320 3.524268 3.524268 3.524268 \n",
1428 | "41321 3.542071 3.542071 3.542071 \n",
1429 | "41322 3.004383 2.406512 3.592623 \n",
1430 | "\n",
1431 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
1432 | "0 8797.000000 216 18032 \n",
1433 | "1 837.000000 518 1156 \n",
1434 | "2 31102.272727 104 270376 \n",
1435 | "3 1457.000000 90 4264 \n",
1436 | "4 1074.500000 849 1300 \n",
1437 | "... ... ... ... \n",
1438 | "41318 327.170732 20 1592 \n",
1439 | "41319 327.235772 20 1592 \n",
1440 | "41320 892.000000 892 892 \n",
1441 | "41321 892.000000 892 892 \n",
1442 | "41322 610.333333 124 1412 \n",
1443 | "\n",
1444 | " LoadConfigurationSize VersionInformationSize \n",
1445 | "0 0 16 \n",
1446 | "1 72 18 \n",
1447 | "2 72 18 \n",
1448 | "3 72 18 \n",
1449 | "4 72 18 \n",
1450 | "... ... ... \n",
1451 | "41318 72 16 \n",
1452 | "41319 72 16 \n",
1453 | "41320 72 16 \n",
1454 | "41321 72 16 \n",
1455 | "41322 72 16 \n",
1456 | "\n",
1457 | "[41323 rows x 56 columns]"
1458 | ]
1459 | },
1460 | "execution_count": 13,
1461 | "metadata": {},
1462 | "output_type": "execute_result"
1463 | }
1464 | ],
1465 | "source": [
1466 | "legit=dataset[0:41323].drop([\"legitimate\"],axis=1) # here axis =1 means vertical \n",
1467 | "legit"
1468 | ]
1469 | },
1470 | {
1471 | "cell_type": "code",
1472 | "execution_count": 14,
1473 | "id": "5adc7421",
1474 | "metadata": {},
1475 | "outputs": [
1476 | {
1477 | "data": {
1478 | "text/html": [
1479 | "\n",
1480 | "\n",
1493 | "
\n",
1494 | " \n",
1495 | " \n",
1496 | " | \n",
1497 | " Name | \n",
1498 | " md5 | \n",
1499 | " Machine | \n",
1500 | " SizeOfOptionalHeader | \n",
1501 | " Characteristics | \n",
1502 | " MajorLinkerVersion | \n",
1503 | " MinorLinkerVersion | \n",
1504 | " SizeOfCode | \n",
1505 | " SizeOfInitializedData | \n",
1506 | " SizeOfUninitializedData | \n",
1507 | " ... | \n",
1508 | " ResourcesNb | \n",
1509 | " ResourcesMeanEntropy | \n",
1510 | " ResourcesMinEntropy | \n",
1511 | " ResourcesMaxEntropy | \n",
1512 | " ResourcesMeanSize | \n",
1513 | " ResourcesMinSize | \n",
1514 | " ResourcesMaxSize | \n",
1515 | " LoadConfigurationSize | \n",
1516 | " VersionInformationSize | \n",
1517 | " legitimate | \n",
1518 | "
\n",
1519 | " \n",
1520 | " \n",
1521 | " \n",
1522 | " 41323 | \n",
1523 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
1524 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
1525 | " 332 | \n",
1526 | " 224 | \n",
1527 | " 258 | \n",
1528 | " 11 | \n",
1529 | " 0 | \n",
1530 | " 354816 | \n",
1531 | " 257024 | \n",
1532 | " 0 | \n",
1533 | " ... | \n",
1534 | " 7 | \n",
1535 | " 3.914415 | \n",
1536 | " 1.441688 | \n",
1537 | " 7.677091 | \n",
1538 | " 7298.428571 | \n",
1539 | " 16 | \n",
1540 | " 28438 | \n",
1541 | " 72 | \n",
1542 | " 0 | \n",
1543 | " 0 | \n",
1544 | "
\n",
1545 | " \n",
1546 | " 41324 | \n",
1547 | " VirusShare_9bd57c8252948bd2fa651ad372bd4f13 | \n",
1548 | " 9bd57c8252948bd2fa651ad372bd4f13 | \n",
1549 | " 332 | \n",
1550 | " 224 | \n",
1551 | " 271 | \n",
1552 | " 6 | \n",
1553 | " 0 | \n",
1554 | " 24064 | \n",
1555 | " 164864 | \n",
1556 | " 1024 | \n",
1557 | " ... | \n",
1558 | " 6 | \n",
1559 | " 3.199107 | \n",
1560 | " 1.971335 | \n",
1561 | " 5.214816 | \n",
1562 | " 452.000000 | \n",
1563 | " 34 | \n",
1564 | " 958 | \n",
1565 | " 0 | \n",
1566 | " 15 | \n",
1567 | " 0 | \n",
1568 | "
\n",
1569 | " \n",
1570 | " 41325 | \n",
1571 | " VirusShare_d1456165e9358b8f61f93a5f2042f39c | \n",
1572 | " d1456165e9358b8f61f93a5f2042f39c | \n",
1573 | " 332 | \n",
1574 | " 224 | \n",
1575 | " 258 | \n",
1576 | " 10 | \n",
1577 | " 0 | \n",
1578 | " 118784 | \n",
1579 | " 381952 | \n",
1580 | " 0 | \n",
1581 | " ... | \n",
1582 | " 18 | \n",
1583 | " 6.530946 | \n",
1584 | " 2.458492 | \n",
1585 | " 7.992688 | \n",
1586 | " 18523.444444 | \n",
1587 | " 48 | \n",
1588 | " 33945 | \n",
1589 | " 72 | \n",
1590 | " 14 | \n",
1591 | " 0 | \n",
1592 | "
\n",
1593 | " \n",
1594 | " 41326 | \n",
1595 | " VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
1596 | " e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
1597 | " 332 | \n",
1598 | " 224 | \n",
1599 | " 258 | \n",
1600 | " 10 | \n",
1601 | " 0 | \n",
1602 | " 174592 | \n",
1603 | " 300032 | \n",
1604 | " 0 | \n",
1605 | " ... | \n",
1606 | " 15 | \n",
1607 | " 5.732393 | \n",
1608 | " 2.852364 | \n",
1609 | " 7.987726 | \n",
1610 | " 12706.133333 | \n",
1611 | " 118 | \n",
1612 | " 60500 | \n",
1613 | " 72 | \n",
1614 | " 14 | \n",
1615 | " 0 | \n",
1616 | "
\n",
1617 | " \n",
1618 | " 41327 | \n",
1619 | " VirusShare_710890c07b3f93b90635f8bff6c34605 | \n",
1620 | " 710890c07b3f93b90635f8bff6c34605 | \n",
1621 | " 332 | \n",
1622 | " 224 | \n",
1623 | " 258 | \n",
1624 | " 9 | \n",
1625 | " 0 | \n",
1626 | " 475648 | \n",
1627 | " 348672 | \n",
1628 | " 0 | \n",
1629 | " ... | \n",
1630 | " 59 | \n",
1631 | " 2.827826 | \n",
1632 | " 0.960953 | \n",
1633 | " 7.212329 | \n",
1634 | " 2637.033898 | \n",
1635 | " 20 | \n",
1636 | " 67624 | \n",
1637 | " 72 | \n",
1638 | " 0 | \n",
1639 | " 0 | \n",
1640 | "
\n",
1641 | " \n",
1642 | " ... | \n",
1643 | " ... | \n",
1644 | " ... | \n",
1645 | " ... | \n",
1646 | " ... | \n",
1647 | " ... | \n",
1648 | " ... | \n",
1649 | " ... | \n",
1650 | " ... | \n",
1651 | " ... | \n",
1652 | " ... | \n",
1653 | " ... | \n",
1654 | " ... | \n",
1655 | " ... | \n",
1656 | " ... | \n",
1657 | " ... | \n",
1658 | " ... | \n",
1659 | " ... | \n",
1660 | " ... | \n",
1661 | " ... | \n",
1662 | " ... | \n",
1663 | " ... | \n",
1664 | "
\n",
1665 | " \n",
1666 | " 138042 | \n",
1667 | " VirusShare_8e292b418568d6e7b87f2a32aee7074b | \n",
1668 | " 8e292b418568d6e7b87f2a32aee7074b | \n",
1669 | " 332 | \n",
1670 | " 224 | \n",
1671 | " 258 | \n",
1672 | " 11 | \n",
1673 | " 0 | \n",
1674 | " 205824 | \n",
1675 | " 223744 | \n",
1676 | " 0 | \n",
1677 | " ... | \n",
1678 | " 7 | \n",
1679 | " 4.122736 | \n",
1680 | " 1.370260 | \n",
1681 | " 7.677091 | \n",
1682 | " 14900.714286 | \n",
1683 | " 16 | \n",
1684 | " 81654 | \n",
1685 | " 72 | \n",
1686 | " 0 | \n",
1687 | " 0 | \n",
1688 | "
\n",
1689 | " \n",
1690 | " 138043 | \n",
1691 | " VirusShare_260d9e2258aed4c8a3bbd703ec895822 | \n",
1692 | " 260d9e2258aed4c8a3bbd703ec895822 | \n",
1693 | " 332 | \n",
1694 | " 224 | \n",
1695 | " 33167 | \n",
1696 | " 2 | \n",
1697 | " 25 | \n",
1698 | " 37888 | \n",
1699 | " 185344 | \n",
1700 | " 0 | \n",
1701 | " ... | \n",
1702 | " 26 | \n",
1703 | " 3.377663 | \n",
1704 | " 2.031619 | \n",
1705 | " 5.050074 | \n",
1706 | " 6905.846154 | \n",
1707 | " 44 | \n",
1708 | " 67624 | \n",
1709 | " 0 | \n",
1710 | " 15 | \n",
1711 | " 0 | \n",
1712 | "
\n",
1713 | " \n",
1714 | " 138044 | \n",
1715 | " VirusShare_8d088a51b7d225c9f5d11d239791ec3f | \n",
1716 | " 8d088a51b7d225c9f5d11d239791ec3f | \n",
1717 | " 332 | \n",
1718 | " 224 | \n",
1719 | " 258 | \n",
1720 | " 10 | \n",
1721 | " 0 | \n",
1722 | " 118272 | \n",
1723 | " 380416 | \n",
1724 | " 0 | \n",
1725 | " ... | \n",
1726 | " 22 | \n",
1727 | " 6.825406 | \n",
1728 | " 2.617026 | \n",
1729 | " 7.990487 | \n",
1730 | " 14981.909091 | \n",
1731 | " 48 | \n",
1732 | " 22648 | \n",
1733 | " 72 | \n",
1734 | " 14 | \n",
1735 | " 0 | \n",
1736 | "
\n",
1737 | " \n",
1738 | " 138045 | \n",
1739 | " VirusShare_4286dccf67ca220fe67635388229a9f3 | \n",
1740 | " 4286dccf67ca220fe67635388229a9f3 | \n",
1741 | " 332 | \n",
1742 | " 224 | \n",
1743 | " 33166 | \n",
1744 | " 2 | \n",
1745 | " 25 | \n",
1746 | " 49152 | \n",
1747 | " 16896 | \n",
1748 | " 0 | \n",
1749 | " ... | \n",
1750 | " 10 | \n",
1751 | " 3.421627 | \n",
1752 | " 2.060964 | \n",
1753 | " 4.739744 | \n",
1754 | " 601.600000 | \n",
1755 | " 16 | \n",
1756 | " 2216 | \n",
1757 | " 0 | \n",
1758 | " 0 | \n",
1759 | " 0 | \n",
1760 | "
\n",
1761 | " \n",
1762 | " 138046 | \n",
1763 | " VirusShare_d7648eae45f09b3adb75127f43be6d11 | \n",
1764 | " d7648eae45f09b3adb75127f43be6d11 | \n",
1765 | " 332 | \n",
1766 | " 224 | \n",
1767 | " 258 | \n",
1768 | " 11 | \n",
1769 | " 0 | \n",
1770 | " 111616 | \n",
1771 | " 468480 | \n",
1772 | " 0 | \n",
1773 | " ... | \n",
1774 | " 4 | \n",
1775 | " 4.407252 | \n",
1776 | " 1.980482 | \n",
1777 | " 6.115374 | \n",
1778 | " 96625.000000 | \n",
1779 | " 20 | \n",
1780 | " 318464 | \n",
1781 | " 72 | \n",
1782 | " 0 | \n",
1783 | " 0 | \n",
1784 | "
\n",
1785 | " \n",
1786 | "
\n",
1787 | "
96724 rows × 57 columns
\n",
1788 | "
"
1789 | ],
1790 | "text/plain": [
1791 | " Name \\\n",
1792 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
1793 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n",
1794 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n",
1795 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n",
1796 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n",
1797 | "... ... \n",
1798 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n",
1799 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n",
1800 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n",
1801 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n",
1802 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n",
1803 | "\n",
1804 | " md5 Machine SizeOfOptionalHeader \\\n",
1805 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
1806 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n",
1807 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n",
1808 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n",
1809 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n",
1810 | "... ... ... ... \n",
1811 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n",
1812 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n",
1813 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n",
1814 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n",
1815 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n",
1816 | "\n",
1817 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
1818 | "41323 258 11 0 354816 \n",
1819 | "41324 271 6 0 24064 \n",
1820 | "41325 258 10 0 118784 \n",
1821 | "41326 258 10 0 174592 \n",
1822 | "41327 258 9 0 475648 \n",
1823 | "... ... ... ... ... \n",
1824 | "138042 258 11 0 205824 \n",
1825 | "138043 33167 2 25 37888 \n",
1826 | "138044 258 10 0 118272 \n",
1827 | "138045 33166 2 25 49152 \n",
1828 | "138046 258 11 0 111616 \n",
1829 | "\n",
1830 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
1831 | "41323 257024 0 ... 7 \n",
1832 | "41324 164864 1024 ... 6 \n",
1833 | "41325 381952 0 ... 18 \n",
1834 | "41326 300032 0 ... 15 \n",
1835 | "41327 348672 0 ... 59 \n",
1836 | "... ... ... ... ... \n",
1837 | "138042 223744 0 ... 7 \n",
1838 | "138043 185344 0 ... 26 \n",
1839 | "138044 380416 0 ... 22 \n",
1840 | "138045 16896 0 ... 10 \n",
1841 | "138046 468480 0 ... 4 \n",
1842 | "\n",
1843 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
1844 | "41323 3.914415 1.441688 7.677091 \n",
1845 | "41324 3.199107 1.971335 5.214816 \n",
1846 | "41325 6.530946 2.458492 7.992688 \n",
1847 | "41326 5.732393 2.852364 7.987726 \n",
1848 | "41327 2.827826 0.960953 7.212329 \n",
1849 | "... ... ... ... \n",
1850 | "138042 4.122736 1.370260 7.677091 \n",
1851 | "138043 3.377663 2.031619 5.050074 \n",
1852 | "138044 6.825406 2.617026 7.990487 \n",
1853 | "138045 3.421627 2.060964 4.739744 \n",
1854 | "138046 4.407252 1.980482 6.115374 \n",
1855 | "\n",
1856 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
1857 | "41323 7298.428571 16 28438 \n",
1858 | "41324 452.000000 34 958 \n",
1859 | "41325 18523.444444 48 33945 \n",
1860 | "41326 12706.133333 118 60500 \n",
1861 | "41327 2637.033898 20 67624 \n",
1862 | "... ... ... ... \n",
1863 | "138042 14900.714286 16 81654 \n",
1864 | "138043 6905.846154 44 67624 \n",
1865 | "138044 14981.909091 48 22648 \n",
1866 | "138045 601.600000 16 2216 \n",
1867 | "138046 96625.000000 20 318464 \n",
1868 | "\n",
1869 | " LoadConfigurationSize VersionInformationSize legitimate \n",
1870 | "41323 72 0 0 \n",
1871 | "41324 0 15 0 \n",
1872 | "41325 72 14 0 \n",
1873 | "41326 72 14 0 \n",
1874 | "41327 72 0 0 \n",
1875 | "... ... ... ... \n",
1876 | "138042 72 0 0 \n",
1877 | "138043 0 15 0 \n",
1878 | "138044 72 14 0 \n",
1879 | "138045 0 0 0 \n",
1880 | "138046 72 0 0 \n",
1881 | "\n",
1882 | "[96724 rows x 57 columns]"
1883 | ]
1884 | },
1885 | "execution_count": 14,
1886 | "metadata": {},
1887 | "output_type": "execute_result"
1888 | }
1889 | ],
1890 | "source": [
1891 | "mal=dataset[41323::]\n",
1892 | "maldata=dataset[41323::].drop([\"legitimate\"],axis=1)\n",
1893 | "mal"
1894 | ]
1895 | },
1896 | {
1897 | "cell_type": "code",
1898 | "execution_count": 15,
1899 | "id": "325f7c48",
1900 | "metadata": {},
1901 | "outputs": [
1902 | {
1903 | "name": "stdout",
1904 | "output_type": "stream",
1905 | "text": [
1906 | "The shape of legit database is 41323 samples and 56 features\n",
1907 | "The shape of malware database is 96724 samples and 57 features\n"
1908 | ]
1909 | }
1910 | ],
1911 | "source": [
1912 | "print(\"The shape of legit database is %s samples and %s features\"%(legit.shape[0],legit.shape[1])) \n",
1913 | "print(\"The shape of malware database is %s samples and %s features\"%(mal.shape[0],mal.shape[1])) "
1914 | ]
1915 | },
1916 | {
1917 | "cell_type": "code",
1918 | "execution_count": 16,
1919 | "id": "09e246a0",
1920 | "metadata": {},
1921 | "outputs": [
1922 | {
1923 | "name": "stdout",
1924 | "output_type": "stream",
1925 | "text": [
1926 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n",
1927 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n",
1928 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n",
1929 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n",
1930 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n",
1931 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n",
1932 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n",
1933 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n",
1934 | " 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',\n",
1935 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n",
1936 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n",
1937 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n",
1938 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n",
1939 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n",
1940 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n",
1941 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n",
1942 | " 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n",
1943 | " 'VersionInformationSize', 'legitimate'],\n",
1944 | " dtype='object')\n",
1945 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n",
1946 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n",
1947 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n",
1948 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n",
1949 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n",
1950 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n",
1951 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n",
1952 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n",
1953 | " 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',\n",
1954 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n",
1955 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n",
1956 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n",
1957 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n",
1958 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n",
1959 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n",
1960 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n",
1961 | " 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n",
1962 | " 'VersionInformationSize', 'legitimate'],\n",
1963 | " dtype='object')\n"
1964 | ]
1965 | }
1966 | ],
1967 | "source": [
1968 | "#to find the features ie the column names\n",
1969 | "print(dataset.columns) #but in malware or legit there is no legitimate feature :)\n",
1970 | "print(mal.columns)"
1971 | ]
1972 | },
1973 | {
1974 | "cell_type": "code",
1975 | "execution_count": 17,
1976 | "id": "55644b80",
1977 | "metadata": {},
1978 | "outputs": [
1979 | {
1980 | "data": {
1981 | "text/html": [
1982 | "\n",
1983 | "\n",
1996 | "
\n",
1997 | " \n",
1998 | " \n",
1999 | " | \n",
2000 | " Name | \n",
2001 | " md5 | \n",
2002 | " Machine | \n",
2003 | " SizeOfOptionalHeader | \n",
2004 | " Characteristics | \n",
2005 | " MajorLinkerVersion | \n",
2006 | " MinorLinkerVersion | \n",
2007 | " SizeOfCode | \n",
2008 | " SizeOfInitializedData | \n",
2009 | " SizeOfUninitializedData | \n",
2010 | " ... | \n",
2011 | " ResourcesNb | \n",
2012 | " ResourcesMeanEntropy | \n",
2013 | " ResourcesMinEntropy | \n",
2014 | " ResourcesMaxEntropy | \n",
2015 | " ResourcesMeanSize | \n",
2016 | " ResourcesMinSize | \n",
2017 | " ResourcesMaxSize | \n",
2018 | " LoadConfigurationSize | \n",
2019 | " VersionInformationSize | \n",
2020 | " legitimate | \n",
2021 | "
\n",
2022 | " \n",
2023 | " \n",
2024 | " \n",
2025 | " 41323 | \n",
2026 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
2027 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
2028 | " 332 | \n",
2029 | " 224 | \n",
2030 | " 258 | \n",
2031 | " 11 | \n",
2032 | " 0 | \n",
2033 | " 354816 | \n",
2034 | " 257024 | \n",
2035 | " 0 | \n",
2036 | " ... | \n",
2037 | " 7 | \n",
2038 | " 3.914415 | \n",
2039 | " 1.441688 | \n",
2040 | " 7.677091 | \n",
2041 | " 7298.428571 | \n",
2042 | " 16 | \n",
2043 | " 28438 | \n",
2044 | " 72 | \n",
2045 | " 0 | \n",
2046 | " 0 | \n",
2047 | "
\n",
2048 | " \n",
2049 | " 41324 | \n",
2050 | " VirusShare_9bd57c8252948bd2fa651ad372bd4f13 | \n",
2051 | " 9bd57c8252948bd2fa651ad372bd4f13 | \n",
2052 | " 332 | \n",
2053 | " 224 | \n",
2054 | " 271 | \n",
2055 | " 6 | \n",
2056 | " 0 | \n",
2057 | " 24064 | \n",
2058 | " 164864 | \n",
2059 | " 1024 | \n",
2060 | " ... | \n",
2061 | " 6 | \n",
2062 | " 3.199107 | \n",
2063 | " 1.971335 | \n",
2064 | " 5.214816 | \n",
2065 | " 452.000000 | \n",
2066 | " 34 | \n",
2067 | " 958 | \n",
2068 | " 0 | \n",
2069 | " 15 | \n",
2070 | " 0 | \n",
2071 | "
\n",
2072 | " \n",
2073 | " 41325 | \n",
2074 | " VirusShare_d1456165e9358b8f61f93a5f2042f39c | \n",
2075 | " d1456165e9358b8f61f93a5f2042f39c | \n",
2076 | " 332 | \n",
2077 | " 224 | \n",
2078 | " 258 | \n",
2079 | " 10 | \n",
2080 | " 0 | \n",
2081 | " 118784 | \n",
2082 | " 381952 | \n",
2083 | " 0 | \n",
2084 | " ... | \n",
2085 | " 18 | \n",
2086 | " 6.530946 | \n",
2087 | " 2.458492 | \n",
2088 | " 7.992688 | \n",
2089 | " 18523.444444 | \n",
2090 | " 48 | \n",
2091 | " 33945 | \n",
2092 | " 72 | \n",
2093 | " 14 | \n",
2094 | " 0 | \n",
2095 | "
\n",
2096 | " \n",
2097 | " 41326 | \n",
2098 | " VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
2099 | " e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
2100 | " 332 | \n",
2101 | " 224 | \n",
2102 | " 258 | \n",
2103 | " 10 | \n",
2104 | " 0 | \n",
2105 | " 174592 | \n",
2106 | " 300032 | \n",
2107 | " 0 | \n",
2108 | " ... | \n",
2109 | " 15 | \n",
2110 | " 5.732393 | \n",
2111 | " 2.852364 | \n",
2112 | " 7.987726 | \n",
2113 | " 12706.133333 | \n",
2114 | " 118 | \n",
2115 | " 60500 | \n",
2116 | " 72 | \n",
2117 | " 14 | \n",
2118 | " 0 | \n",
2119 | "
\n",
2120 | " \n",
2121 | " 41327 | \n",
2122 | " VirusShare_710890c07b3f93b90635f8bff6c34605 | \n",
2123 | " 710890c07b3f93b90635f8bff6c34605 | \n",
2124 | " 332 | \n",
2125 | " 224 | \n",
2126 | " 258 | \n",
2127 | " 9 | \n",
2128 | " 0 | \n",
2129 | " 475648 | \n",
2130 | " 348672 | \n",
2131 | " 0 | \n",
2132 | " ... | \n",
2133 | " 59 | \n",
2134 | " 2.827826 | \n",
2135 | " 0.960953 | \n",
2136 | " 7.212329 | \n",
2137 | " 2637.033898 | \n",
2138 | " 20 | \n",
2139 | " 67624 | \n",
2140 | " 72 | \n",
2141 | " 0 | \n",
2142 | " 0 | \n",
2143 | "
\n",
2144 | " \n",
2145 | " 41328 | \n",
2146 | " VirusShare_3c2eb01508703752dca01957ea451a40 | \n",
2147 | " 3c2eb01508703752dca01957ea451a40 | \n",
2148 | " 332 | \n",
2149 | " 224 | \n",
2150 | " 259 | \n",
2151 | " 9 | \n",
2152 | " 0 | \n",
2153 | " 157696 | \n",
2154 | " 62464 | \n",
2155 | " 0 | \n",
2156 | " ... | \n",
2157 | " 13 | \n",
2158 | " 3.943296 | \n",
2159 | " 1.814443 | \n",
2160 | " 6.122045 | \n",
2161 | " 2708.153846 | \n",
2162 | " 132 | \n",
2163 | " 9640 | \n",
2164 | " 72 | \n",
2165 | " 14 | \n",
2166 | " 0 | \n",
2167 | "
\n",
2168 | " \n",
2169 | " 41329 | \n",
2170 | " VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f | \n",
2171 | " 3fb2d0ac00c5dff6c4fd5dfe6ba52c3f | \n",
2172 | " 332 | \n",
2173 | " 224 | \n",
2174 | " 259 | \n",
2175 | " 83 | \n",
2176 | " 82 | \n",
2177 | " 724992 | \n",
2178 | " 2306048 | \n",
2179 | " 0 | \n",
2180 | " ... | \n",
2181 | " 21 | \n",
2182 | " 3.987463 | \n",
2183 | " 2.642159 | \n",
2184 | " 6.473700 | \n",
2185 | " 14288.000000 | \n",
2186 | " 76 | \n",
2187 | " 270376 | \n",
2188 | " 0 | \n",
2189 | " 0 | \n",
2190 | " 0 | \n",
2191 | "
\n",
2192 | " \n",
2193 | " 41330 | \n",
2194 | " VirusShare_ad1ca9a4d572c0a2793c4cea29b20887 | \n",
2195 | " ad1ca9a4d572c0a2793c4cea29b20887 | \n",
2196 | " 332 | \n",
2197 | " 224 | \n",
2198 | " 258 | \n",
2199 | " 10 | \n",
2200 | " 0 | \n",
2201 | " 120320 | \n",
2202 | " 385024 | \n",
2203 | " 0 | \n",
2204 | " ... | \n",
2205 | " 6 | \n",
2206 | " 3.729824 | \n",
2207 | " 2.458492 | \n",
2208 | " 5.317552 | \n",
2209 | " 2739.500000 | \n",
2210 | " 48 | \n",
2211 | " 9640 | \n",
2212 | " 72 | \n",
2213 | " 15 | \n",
2214 | " 0 | \n",
2215 | "
\n",
2216 | " \n",
2217 | " 41331 | \n",
2218 | " VirusShare_7414edb3d0be66aa0816e6ed4b6b0a21 | \n",
2219 | " 7414edb3d0be66aa0816e6ed4b6b0a21 | \n",
2220 | " 332 | \n",
2221 | " 224 | \n",
2222 | " 259 | \n",
2223 | " 10 | \n",
2224 | " 0 | \n",
2225 | " 233984 | \n",
2226 | " 1377792 | \n",
2227 | " 0 | \n",
2228 | " ... | \n",
2229 | " 18 | \n",
2230 | " 4.328322 | \n",
2231 | " 2.323220 | \n",
2232 | " 7.068413 | \n",
2233 | " 76158.277778 | \n",
2234 | " 9 | \n",
2235 | " 1342735 | \n",
2236 | " 72 | \n",
2237 | " 19 | \n",
2238 | " 0 | \n",
2239 | "
\n",
2240 | " \n",
2241 | " 41332 | \n",
2242 | " VirusShare_e57b4f294c142d050a784b67e2cf1f2e | \n",
2243 | " e57b4f294c142d050a784b67e2cf1f2e | \n",
2244 | " 332 | \n",
2245 | " 224 | \n",
2246 | " 271 | \n",
2247 | " 6 | \n",
2248 | " 0 | \n",
2249 | " 49152 | \n",
2250 | " 561152 | \n",
2251 | " 0 | \n",
2252 | " ... | \n",
2253 | " 0 | \n",
2254 | " 0.000000 | \n",
2255 | " 0.000000 | \n",
2256 | " 0.000000 | \n",
2257 | " 0.000000 | \n",
2258 | " 0 | \n",
2259 | " 0 | \n",
2260 | " 0 | \n",
2261 | " 0 | \n",
2262 | " 0 | \n",
2263 | "
\n",
2264 | " \n",
2265 | "
\n",
2266 | "
10 rows × 57 columns
\n",
2267 | "
"
2268 | ],
2269 | "text/plain": [
2270 | " Name \\\n",
2271 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
2272 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n",
2273 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n",
2274 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n",
2275 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n",
2276 | "41328 VirusShare_3c2eb01508703752dca01957ea451a40 \n",
2277 | "41329 VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f \n",
2278 | "41330 VirusShare_ad1ca9a4d572c0a2793c4cea29b20887 \n",
2279 | "41331 VirusShare_7414edb3d0be66aa0816e6ed4b6b0a21 \n",
2280 | "41332 VirusShare_e57b4f294c142d050a784b67e2cf1f2e \n",
2281 | "\n",
2282 | " md5 Machine SizeOfOptionalHeader \\\n",
2283 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
2284 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n",
2285 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n",
2286 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n",
2287 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n",
2288 | "41328 3c2eb01508703752dca01957ea451a40 332 224 \n",
2289 | "41329 3fb2d0ac00c5dff6c4fd5dfe6ba52c3f 332 224 \n",
2290 | "41330 ad1ca9a4d572c0a2793c4cea29b20887 332 224 \n",
2291 | "41331 7414edb3d0be66aa0816e6ed4b6b0a21 332 224 \n",
2292 | "41332 e57b4f294c142d050a784b67e2cf1f2e 332 224 \n",
2293 | "\n",
2294 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
2295 | "41323 258 11 0 354816 \n",
2296 | "41324 271 6 0 24064 \n",
2297 | "41325 258 10 0 118784 \n",
2298 | "41326 258 10 0 174592 \n",
2299 | "41327 258 9 0 475648 \n",
2300 | "41328 259 9 0 157696 \n",
2301 | "41329 259 83 82 724992 \n",
2302 | "41330 258 10 0 120320 \n",
2303 | "41331 259 10 0 233984 \n",
2304 | "41332 271 6 0 49152 \n",
2305 | "\n",
2306 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
2307 | "41323 257024 0 ... 7 \n",
2308 | "41324 164864 1024 ... 6 \n",
2309 | "41325 381952 0 ... 18 \n",
2310 | "41326 300032 0 ... 15 \n",
2311 | "41327 348672 0 ... 59 \n",
2312 | "41328 62464 0 ... 13 \n",
2313 | "41329 2306048 0 ... 21 \n",
2314 | "41330 385024 0 ... 6 \n",
2315 | "41331 1377792 0 ... 18 \n",
2316 | "41332 561152 0 ... 0 \n",
2317 | "\n",
2318 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
2319 | "41323 3.914415 1.441688 7.677091 \n",
2320 | "41324 3.199107 1.971335 5.214816 \n",
2321 | "41325 6.530946 2.458492 7.992688 \n",
2322 | "41326 5.732393 2.852364 7.987726 \n",
2323 | "41327 2.827826 0.960953 7.212329 \n",
2324 | "41328 3.943296 1.814443 6.122045 \n",
2325 | "41329 3.987463 2.642159 6.473700 \n",
2326 | "41330 3.729824 2.458492 5.317552 \n",
2327 | "41331 4.328322 2.323220 7.068413 \n",
2328 | "41332 0.000000 0.000000 0.000000 \n",
2329 | "\n",
2330 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
2331 | "41323 7298.428571 16 28438 \n",
2332 | "41324 452.000000 34 958 \n",
2333 | "41325 18523.444444 48 33945 \n",
2334 | "41326 12706.133333 118 60500 \n",
2335 | "41327 2637.033898 20 67624 \n",
2336 | "41328 2708.153846 132 9640 \n",
2337 | "41329 14288.000000 76 270376 \n",
2338 | "41330 2739.500000 48 9640 \n",
2339 | "41331 76158.277778 9 1342735 \n",
2340 | "41332 0.000000 0 0 \n",
2341 | "\n",
2342 | " LoadConfigurationSize VersionInformationSize legitimate \n",
2343 | "41323 72 0 0 \n",
2344 | "41324 0 15 0 \n",
2345 | "41325 72 14 0 \n",
2346 | "41326 72 14 0 \n",
2347 | "41327 72 0 0 \n",
2348 | "41328 72 14 0 \n",
2349 | "41329 0 0 0 \n",
2350 | "41330 72 15 0 \n",
2351 | "41331 72 19 0 \n",
2352 | "41332 0 0 0 \n",
2353 | "\n",
2354 | "[10 rows x 57 columns]"
2355 | ]
2356 | },
2357 | "execution_count": 17,
2358 | "metadata": {},
2359 | "output_type": "execute_result"
2360 | }
2361 | ],
2362 | "source": [
2363 | "#first 10 data points from malware database:\n",
2364 | "mal.head(10)"
2365 | ]
2366 | },
2367 | {
2368 | "cell_type": "code",
2369 | "execution_count": 18,
2370 | "id": "d243486f",
2371 | "metadata": {},
2372 | "outputs": [
2373 | {
2374 | "data": {
2375 | "text/html": [
2376 | "\n",
2377 | "\n",
2390 | "
\n",
2391 | " \n",
2392 | " \n",
2393 | " | \n",
2394 | " Name | \n",
2395 | " md5 | \n",
2396 | " Machine | \n",
2397 | " SizeOfOptionalHeader | \n",
2398 | " Characteristics | \n",
2399 | " MajorLinkerVersion | \n",
2400 | " MinorLinkerVersion | \n",
2401 | " SizeOfCode | \n",
2402 | " SizeOfInitializedData | \n",
2403 | " SizeOfUninitializedData | \n",
2404 | " ... | \n",
2405 | " ExportNb | \n",
2406 | " ResourcesNb | \n",
2407 | " ResourcesMeanEntropy | \n",
2408 | " ResourcesMinEntropy | \n",
2409 | " ResourcesMaxEntropy | \n",
2410 | " ResourcesMeanSize | \n",
2411 | " ResourcesMinSize | \n",
2412 | " ResourcesMaxSize | \n",
2413 | " LoadConfigurationSize | \n",
2414 | " VersionInformationSize | \n",
2415 | "
\n",
2416 | " \n",
2417 | " \n",
2418 | " \n",
2419 | " 0 | \n",
2420 | " memtest.exe | \n",
2421 | " 631ea355665f28d4707448e442fbf5b8 | \n",
2422 | " 332 | \n",
2423 | " 224 | \n",
2424 | " 258 | \n",
2425 | " 9 | \n",
2426 | " 0 | \n",
2427 | " 361984 | \n",
2428 | " 115712 | \n",
2429 | " 0 | \n",
2430 | " ... | \n",
2431 | " 0 | \n",
2432 | " 4 | \n",
2433 | " 3.262823 | \n",
2434 | " 2.568844 | \n",
2435 | " 3.537939 | \n",
2436 | " 8797.0 | \n",
2437 | " 216 | \n",
2438 | " 18032 | \n",
2439 | " 0 | \n",
2440 | " 16 | \n",
2441 | "
\n",
2442 | " \n",
2443 | "
\n",
2444 | "
1 rows × 56 columns
\n",
2445 | "
"
2446 | ],
2447 | "text/plain": [
2448 | " Name md5 Machine \\\n",
2449 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n",
2450 | "\n",
2451 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
2452 | "0 224 258 9 \n",
2453 | "\n",
2454 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
2455 | "0 0 361984 115712 \n",
2456 | "\n",
2457 | " SizeOfUninitializedData ... ExportNb ResourcesNb ResourcesMeanEntropy \\\n",
2458 | "0 0 ... 0 4 3.262823 \n",
2459 | "\n",
2460 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n",
2461 | "0 2.568844 3.537939 8797.0 \n",
2462 | "\n",
2463 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n",
2464 | "0 216 18032 0 \n",
2465 | "\n",
2466 | " VersionInformationSize \n",
2467 | "0 16 \n",
2468 | "\n",
2469 | "[1 rows x 56 columns]"
2470 | ]
2471 | },
2472 | "execution_count": 18,
2473 | "metadata": {},
2474 | "output_type": "execute_result"
2475 | }
2476 | ],
2477 | "source": [
2478 | "#datapoint of legit to have a good comparison \n",
2479 | "legit.take([0]) #1st datapoint"
2480 | ]
2481 | },
2482 | {
2483 | "cell_type": "code",
2484 | "execution_count": 19,
2485 | "id": "fd741615",
2486 | "metadata": {},
2487 | "outputs": [
2488 | {
2489 | "data": {
2490 | "text/html": [
2491 | "\n",
2492 | "\n",
2505 | "
\n",
2506 | " \n",
2507 | " \n",
2508 | " | \n",
2509 | " Name | \n",
2510 | " md5 | \n",
2511 | " Machine | \n",
2512 | " SizeOfOptionalHeader | \n",
2513 | " Characteristics | \n",
2514 | " MajorLinkerVersion | \n",
2515 | " MinorLinkerVersion | \n",
2516 | " SizeOfCode | \n",
2517 | " SizeOfInitializedData | \n",
2518 | " SizeOfUninitializedData | \n",
2519 | " ... | \n",
2520 | " ResourcesNb | \n",
2521 | " ResourcesMeanEntropy | \n",
2522 | " ResourcesMinEntropy | \n",
2523 | " ResourcesMaxEntropy | \n",
2524 | " ResourcesMeanSize | \n",
2525 | " ResourcesMinSize | \n",
2526 | " ResourcesMaxSize | \n",
2527 | " LoadConfigurationSize | \n",
2528 | " VersionInformationSize | \n",
2529 | " legitimate | \n",
2530 | "
\n",
2531 | " \n",
2532 | " \n",
2533 | " \n",
2534 | " 41323 | \n",
2535 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
2536 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
2537 | " 332 | \n",
2538 | " 224 | \n",
2539 | " 258 | \n",
2540 | " 11 | \n",
2541 | " 0 | \n",
2542 | " 354816 | \n",
2543 | " 257024 | \n",
2544 | " 0 | \n",
2545 | " ... | \n",
2546 | " 7 | \n",
2547 | " 3.914415 | \n",
2548 | " 1.441688 | \n",
2549 | " 7.677091 | \n",
2550 | " 7298.428571 | \n",
2551 | " 16 | \n",
2552 | " 28438 | \n",
2553 | " 72 | \n",
2554 | " 0 | \n",
2555 | " 0 | \n",
2556 | "
\n",
2557 | " \n",
2558 | "
\n",
2559 | "
1 rows × 57 columns
\n",
2560 | "
"
2561 | ],
2562 | "text/plain": [
2563 | " Name \\\n",
2564 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
2565 | "\n",
2566 | " md5 Machine SizeOfOptionalHeader \\\n",
2567 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
2568 | "\n",
2569 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
2570 | "41323 258 11 0 354816 \n",
2571 | "\n",
2572 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
2573 | "41323 257024 0 ... 7 \n",
2574 | "\n",
2575 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
2576 | "41323 3.914415 1.441688 7.677091 \n",
2577 | "\n",
2578 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
2579 | "41323 7298.428571 16 28438 \n",
2580 | "\n",
2581 | " LoadConfigurationSize VersionInformationSize legitimate \n",
2582 | "41323 72 0 0 \n",
2583 | "\n",
2584 | "[1 rows x 57 columns]"
2585 | ]
2586 | },
2587 | "execution_count": 19,
2588 | "metadata": {},
2589 | "output_type": "execute_result"
2590 | }
2591 | ],
2592 | "source": [
2593 | "#datapoint of malware to have a good comparison \n",
2594 | "mal.take([0]) #1st datapoint"
2595 | ]
2596 | },
2597 | {
2598 | "cell_type": "code",
2599 | "execution_count": 20,
2600 | "id": "4dd1e87b",
2601 | "metadata": {},
2602 | "outputs": [],
2603 | "source": [
2604 | "# Feature Extraction"
2605 | ]
2606 | },
2607 | {
2608 | "cell_type": "code",
2609 | "execution_count": 21,
2610 | "id": "d1ecc40f",
2611 | "metadata": {},
2612 | "outputs": [],
2613 | "source": [
2614 | "x=dataset.drop(['Name','md5','legitimate'],axis=1).values #independent features\n",
2615 | "y=dataset['legitimate'].values #dependent variable"
2616 | ]
2617 | },
2618 | {
2619 | "cell_type": "code",
2620 | "execution_count": 22,
2621 | "id": "e26ddd4d",
2622 | "metadata": {},
2623 | "outputs": [],
2624 | "source": [
2625 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n",
2626 | "model=SelectFromModel(extratrees,prefit=True)\n",
2627 | "x_new=model.transform(x)\n",
2628 | "nbfeatures=x_new.shape[1]"
2629 | ]
2630 | },
2631 | {
2632 | "cell_type": "code",
2633 | "execution_count": 23,
2634 | "id": "3306769b",
2635 | "metadata": {},
2636 | "outputs": [
2637 | {
2638 | "data": {
2639 | "text/plain": [
2640 | "14"
2641 | ]
2642 | },
2643 | "execution_count": 23,
2644 | "metadata": {},
2645 | "output_type": "execute_result"
2646 | }
2647 | ],
2648 | "source": [
2649 | "nbfeatures"
2650 | ]
2651 | },
2652 | {
2653 | "cell_type": "code",
2654 | "execution_count": 24,
2655 | "id": "a1bc47cc",
2656 | "metadata": {},
2657 | "outputs": [
2658 | {
2659 | "data": {
2660 | "text/plain": [
2661 | "([,\n",
2662 | " ],\n",
2663 | " [Text(0.7884607600756525, 0.7670264857362649, 'Important Features'),\n",
2664 | " Text(-0.7884607959827531, -0.7670264488257517, 'Not Important Features')],\n",
2665 | " [Text(0.4300695054958104, 0.4183780831288717, '25%'),\n",
2666 | " Text(-0.43006952508150165, -0.4183780629958645, '75%')])"
2667 | ]
2668 | },
2669 | "execution_count": 24,
2670 | "metadata": {},
2671 | "output_type": "execute_result"
2672 | },
2673 | {
2674 | "data": {
2675 | "image/png": "\n",
2676 | "text/plain": [
2677 | ""
2678 | ]
2679 | },
2680 | "metadata": {},
2681 | "output_type": "display_data"
2682 | }
2683 | ],
2684 | "source": [
2685 | "dataset.columns.size\n",
2686 | "imp_features_visual=['Important Features','Not Important Features']\n",
2687 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n",
2688 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')"
2689 | ]
2690 | },
2691 | {
2692 | "cell_type": "code",
2693 | "execution_count": 25,
2694 | "id": "65c60c3c",
2695 | "metadata": {},
2696 | "outputs": [],
2697 | "source": [
2698 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)"
2699 | ]
2700 | },
2701 | {
2702 | "cell_type": "code",
2703 | "execution_count": 26,
2704 | "id": "bc4902c0",
2705 | "metadata": {},
2706 | "outputs": [],
2707 | "source": [
2708 | "features=[]\n",
2709 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]"
2710 | ]
2711 | },
2712 | {
2713 | "cell_type": "code",
2714 | "execution_count": 27,
2715 | "id": "d12c9fbb",
2716 | "metadata": {},
2717 | "outputs": [
2718 | {
2719 | "name": "stdout",
2720 | "output_type": "stream",
2721 | "text": [
2722 | "1. feature LoaderFlags (0.000003)\n",
2723 | "2. feature NumberOfRvaAndSizes (0.000049)\n",
2724 | "3. feature SizeOfHeapCommit (0.000331)\n",
2725 | "4. feature BaseOfCode (0.000807)\n",
2726 | "5. feature SizeOfUninitializedData (0.000878)\n",
2727 | "6. feature ResourcesMeanSize (0.001154)\n",
2728 | "7. feature BaseOfData (0.001165)\n",
2729 | "8. feature ResourcesMaxSize (0.001197)\n",
2730 | "9. feature SectionsMeanVirtualsize (0.001212)\n",
2731 | "10. feature SizeOfImage (0.001226)\n",
2732 | "11. feature SectionMaxRawsize (0.001275)\n",
2733 | "12. feature SizeOfInitializedData (0.001280)\n",
2734 | "13. feature SectionMaxVirtualsize (0.001295)\n",
2735 | "14. feature SectionsMeanRawsize (0.001400)\n"
2736 | ]
2737 | }
2738 | ],
2739 | "source": [
2740 | "for f in range(nbfeatures):\n",
2741 | " print(\"%d. feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n",
2742 | " features.append(dataset.columns[2+f])"
2743 | ]
2744 | },
2745 | {
2746 | "cell_type": "code",
2747 | "execution_count": 28,
2748 | "id": "ffe54b76",
2749 | "metadata": {},
2750 | "outputs": [],
2751 | "source": [
2752 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n",
2753 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n",
2754 | " \"LogisticRegression\":LogisticRegression()\n",
2755 | " }"
2756 | ]
2757 | },
2758 | {
2759 | "cell_type": "code",
2760 | "execution_count": 29,
2761 | "id": "0ab0113e",
2762 | "metadata": {},
2763 | "outputs": [
2764 | {
2765 | "name": "stdout",
2766 | "output_type": "stream",
2767 | "text": [
2768 | "RandomForest : 0.9940963419051069\n",
2769 | "DecisionTree : 0.9900760593987685\n",
2770 | "LogisticRegression : 0.6964505613908004\n"
2771 | ]
2772 | },
2773 | {
2774 | "name": "stderr",
2775 | "output_type": "stream",
2776 | "text": [
2777 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n",
2778 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n",
2779 | "\n",
2780 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
2781 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
2782 | "Please also refer to the documentation for alternative solver options:\n",
2783 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
2784 | " n_iter_i = _check_optimize_result(\n"
2785 | ]
2786 | }
2787 | ],
2788 | "source": [
2789 | "results={}\n",
2790 | "for algo in model:\n",
2791 | " clf=model[algo]\n",
2792 | " clf.fit(x_train,y_train)\n",
2793 | " score=clf.score(x_test,y_test)\n",
2794 | " print(\"%s : %s\"%(algo,score))\n",
2795 | " results[algo]=score"
2796 | ]
2797 | },
2798 | {
2799 | "cell_type": "code",
2800 | "execution_count": 30,
2801 | "id": "4189adb6",
2802 | "metadata": {},
2803 | "outputs": [
2804 | {
2805 | "data": {
2806 | "text/plain": [
2807 | "'RandomForest'"
2808 | ]
2809 | },
2810 | "execution_count": 30,
2811 | "metadata": {},
2812 | "output_type": "execute_result"
2813 | }
2814 | ],
2815 | "source": [
2816 | "winner=max(results,key=results.get)\n",
2817 | "winner"
2818 | ]
2819 | },
2820 | {
2821 | "cell_type": "code",
2822 | "execution_count": 31,
2823 | "id": "4fb1c869",
2824 | "metadata": {},
2825 | "outputs": [
2826 | {
2827 | "name": "stdout",
2828 | "output_type": "stream",
2829 | "text": [
2830 | "False positive rate : 0.114760 %\n",
2831 | "False negative rate : 0.162137 %\n"
2832 | ]
2833 | }
2834 | ],
2835 | "source": [
2836 | "clf=model[winner]\n",
2837 | "res=clf.predict(x_new)\n",
2838 | "mt=confusion_matrix(y,res)\n",
2839 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
2840 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))"
2841 | ]
2842 | },
2843 | {
2844 | "cell_type": "code",
2845 | "execution_count": 32,
2846 | "id": "93c1012f",
2847 | "metadata": {},
2848 | "outputs": [],
2849 | "source": [
2850 | "# Check for Multicollinearity"
2851 | ]
2852 | },
2853 | {
2854 | "cell_type": "code",
2855 | "execution_count": 33,
2856 | "id": "978fd992",
2857 | "metadata": {},
2858 | "outputs": [
2859 | {
2860 | "data": {
2861 | "text/html": [
2862 | "\n",
2863 | "\n",
2876 | "
\n",
2877 | " \n",
2878 | " \n",
2879 | " | \n",
2880 | " Machine | \n",
2881 | " SizeOfOptionalHeader | \n",
2882 | " Characteristics | \n",
2883 | " MajorLinkerVersion | \n",
2884 | " MinorLinkerVersion | \n",
2885 | " SizeOfCode | \n",
2886 | " SizeOfInitializedData | \n",
2887 | " SizeOfUninitializedData | \n",
2888 | " AddressOfEntryPoint | \n",
2889 | " BaseOfCode | \n",
2890 | " ... | \n",
2891 | " ExportNb | \n",
2892 | " ResourcesNb | \n",
2893 | " ResourcesMeanEntropy | \n",
2894 | " ResourcesMinEntropy | \n",
2895 | " ResourcesMaxEntropy | \n",
2896 | " ResourcesMeanSize | \n",
2897 | " ResourcesMinSize | \n",
2898 | " ResourcesMaxSize | \n",
2899 | " LoadConfigurationSize | \n",
2900 | " VersionInformationSize | \n",
2901 | "
\n",
2902 | " \n",
2903 | " \n",
2904 | " \n",
2905 | " 0 | \n",
2906 | " 332 | \n",
2907 | " 224 | \n",
2908 | " 258 | \n",
2909 | " 9 | \n",
2910 | " 0 | \n",
2911 | " 361984 | \n",
2912 | " 115712 | \n",
2913 | " 0 | \n",
2914 | " 6135 | \n",
2915 | " 4096 | \n",
2916 | " ... | \n",
2917 | " 0 | \n",
2918 | " 4 | \n",
2919 | " 3.262823 | \n",
2920 | " 2.568844 | \n",
2921 | " 3.537939 | \n",
2922 | " 8797.000000 | \n",
2923 | " 216 | \n",
2924 | " 18032 | \n",
2925 | " 0 | \n",
2926 | " 16 | \n",
2927 | "
\n",
2928 | " \n",
2929 | " 1 | \n",
2930 | " 332 | \n",
2931 | " 224 | \n",
2932 | " 3330 | \n",
2933 | " 9 | \n",
2934 | " 0 | \n",
2935 | " 130560 | \n",
2936 | " 19968 | \n",
2937 | " 0 | \n",
2938 | " 81778 | \n",
2939 | " 4096 | \n",
2940 | " ... | \n",
2941 | " 0 | \n",
2942 | " 2 | \n",
2943 | " 4.250461 | \n",
2944 | " 3.420744 | \n",
2945 | " 5.080177 | \n",
2946 | " 837.000000 | \n",
2947 | " 518 | \n",
2948 | " 1156 | \n",
2949 | " 72 | \n",
2950 | " 18 | \n",
2951 | "
\n",
2952 | " \n",
2953 | " 2 | \n",
2954 | " 332 | \n",
2955 | " 224 | \n",
2956 | " 3330 | \n",
2957 | " 9 | \n",
2958 | " 0 | \n",
2959 | " 517120 | \n",
2960 | " 621568 | \n",
2961 | " 0 | \n",
2962 | " 350896 | \n",
2963 | " 4096 | \n",
2964 | " ... | \n",
2965 | " 1 | \n",
2966 | " 11 | \n",
2967 | " 4.426324 | \n",
2968 | " 2.846449 | \n",
2969 | " 5.271813 | \n",
2970 | " 31102.272727 | \n",
2971 | " 104 | \n",
2972 | " 270376 | \n",
2973 | " 72 | \n",
2974 | " 18 | \n",
2975 | "
\n",
2976 | " \n",
2977 | " 3 | \n",
2978 | " 332 | \n",
2979 | " 224 | \n",
2980 | " 258 | \n",
2981 | " 9 | \n",
2982 | " 0 | \n",
2983 | " 585728 | \n",
2984 | " 369152 | \n",
2985 | " 0 | \n",
2986 | " 451258 | \n",
2987 | " 4096 | \n",
2988 | " ... | \n",
2989 | " 1 | \n",
2990 | " 10 | \n",
2991 | " 4.364291 | \n",
2992 | " 2.669314 | \n",
2993 | " 6.400720 | \n",
2994 | " 1457.000000 | \n",
2995 | " 90 | \n",
2996 | " 4264 | \n",
2997 | " 72 | \n",
2998 | " 18 | \n",
2999 | "
\n",
3000 | " \n",
3001 | " 4 | \n",
3002 | " 332 | \n",
3003 | " 224 | \n",
3004 | " 258 | \n",
3005 | " 9 | \n",
3006 | " 0 | \n",
3007 | " 294912 | \n",
3008 | " 247296 | \n",
3009 | " 0 | \n",
3010 | " 217381 | \n",
3011 | " 4096 | \n",
3012 | " ... | \n",
3013 | " 1 | \n",
3014 | " 2 | \n",
3015 | " 4.306100 | \n",
3016 | " 3.421598 | \n",
3017 | " 5.190603 | \n",
3018 | " 1074.500000 | \n",
3019 | " 849 | \n",
3020 | " 1300 | \n",
3021 | " 72 | \n",
3022 | " 18 | \n",
3023 | "
\n",
3024 | " \n",
3025 | " ... | \n",
3026 | " ... | \n",
3027 | " ... | \n",
3028 | " ... | \n",
3029 | " ... | \n",
3030 | " ... | \n",
3031 | " ... | \n",
3032 | " ... | \n",
3033 | " ... | \n",
3034 | " ... | \n",
3035 | " ... | \n",
3036 | " ... | \n",
3037 | " ... | \n",
3038 | " ... | \n",
3039 | " ... | \n",
3040 | " ... | \n",
3041 | " ... | \n",
3042 | " ... | \n",
3043 | " ... | \n",
3044 | " ... | \n",
3045 | " ... | \n",
3046 | " ... | \n",
3047 | "
\n",
3048 | " \n",
3049 | " 138042 | \n",
3050 | " 332 | \n",
3051 | " 224 | \n",
3052 | " 258 | \n",
3053 | " 11 | \n",
3054 | " 0 | \n",
3055 | " 205824 | \n",
3056 | " 223744 | \n",
3057 | " 0 | \n",
3058 | " 123291 | \n",
3059 | " 4096 | \n",
3060 | " ... | \n",
3061 | " 0 | \n",
3062 | " 7 | \n",
3063 | " 4.122736 | \n",
3064 | " 1.370260 | \n",
3065 | " 7.677091 | \n",
3066 | " 14900.714286 | \n",
3067 | " 16 | \n",
3068 | " 81654 | \n",
3069 | " 72 | \n",
3070 | " 0 | \n",
3071 | "
\n",
3072 | " \n",
3073 | " 138043 | \n",
3074 | " 332 | \n",
3075 | " 224 | \n",
3076 | " 33167 | \n",
3077 | " 2 | \n",
3078 | " 25 | \n",
3079 | " 37888 | \n",
3080 | " 185344 | \n",
3081 | " 0 | \n",
3082 | " 40000 | \n",
3083 | " 4096 | \n",
3084 | " ... | \n",
3085 | " 0 | \n",
3086 | " 26 | \n",
3087 | " 3.377663 | \n",
3088 | " 2.031619 | \n",
3089 | " 5.050074 | \n",
3090 | " 6905.846154 | \n",
3091 | " 44 | \n",
3092 | " 67624 | \n",
3093 | " 0 | \n",
3094 | " 15 | \n",
3095 | "
\n",
3096 | " \n",
3097 | " 138044 | \n",
3098 | " 332 | \n",
3099 | " 224 | \n",
3100 | " 258 | \n",
3101 | " 10 | \n",
3102 | " 0 | \n",
3103 | " 118272 | \n",
3104 | " 380416 | \n",
3105 | " 0 | \n",
3106 | " 59610 | \n",
3107 | " 4096 | \n",
3108 | " ... | \n",
3109 | " 0 | \n",
3110 | " 22 | \n",
3111 | " 6.825406 | \n",
3112 | " 2.617026 | \n",
3113 | " 7.990487 | \n",
3114 | " 14981.909091 | \n",
3115 | " 48 | \n",
3116 | " 22648 | \n",
3117 | " 72 | \n",
3118 | " 14 | \n",
3119 | "
\n",
3120 | " \n",
3121 | " 138045 | \n",
3122 | " 332 | \n",
3123 | " 224 | \n",
3124 | " 33166 | \n",
3125 | " 2 | \n",
3126 | " 25 | \n",
3127 | " 49152 | \n",
3128 | " 16896 | \n",
3129 | " 0 | \n",
3130 | " 51216 | \n",
3131 | " 4096 | \n",
3132 | " ... | \n",
3133 | " 0 | \n",
3134 | " 10 | \n",
3135 | " 3.421627 | \n",
3136 | " 2.060964 | \n",
3137 | " 4.739744 | \n",
3138 | " 601.600000 | \n",
3139 | " 16 | \n",
3140 | " 2216 | \n",
3141 | " 0 | \n",
3142 | " 0 | \n",
3143 | "
\n",
3144 | " \n",
3145 | " 138046 | \n",
3146 | " 332 | \n",
3147 | " 224 | \n",
3148 | " 258 | \n",
3149 | " 11 | \n",
3150 | " 0 | \n",
3151 | " 111616 | \n",
3152 | " 468480 | \n",
3153 | " 0 | \n",
3154 | " 22731 | \n",
3155 | " 4096 | \n",
3156 | " ... | \n",
3157 | " 0 | \n",
3158 | " 4 | \n",
3159 | " 4.407252 | \n",
3160 | " 1.980482 | \n",
3161 | " 6.115374 | \n",
3162 | " 96625.000000 | \n",
3163 | " 20 | \n",
3164 | " 318464 | \n",
3165 | " 72 | \n",
3166 | " 0 | \n",
3167 | "
\n",
3168 | " \n",
3169 | "
\n",
3170 | "
138047 rows × 54 columns
\n",
3171 | "
"
3172 | ],
3173 | "text/plain": [
3174 | " Machine SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
3175 | "0 332 224 258 9 \n",
3176 | "1 332 224 3330 9 \n",
3177 | "2 332 224 3330 9 \n",
3178 | "3 332 224 258 9 \n",
3179 | "4 332 224 258 9 \n",
3180 | "... ... ... ... ... \n",
3181 | "138042 332 224 258 11 \n",
3182 | "138043 332 224 33167 2 \n",
3183 | "138044 332 224 258 10 \n",
3184 | "138045 332 224 33166 2 \n",
3185 | "138046 332 224 258 11 \n",
3186 | "\n",
3187 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
3188 | "0 0 361984 115712 \n",
3189 | "1 0 130560 19968 \n",
3190 | "2 0 517120 621568 \n",
3191 | "3 0 585728 369152 \n",
3192 | "4 0 294912 247296 \n",
3193 | "... ... ... ... \n",
3194 | "138042 0 205824 223744 \n",
3195 | "138043 25 37888 185344 \n",
3196 | "138044 0 118272 380416 \n",
3197 | "138045 25 49152 16896 \n",
3198 | "138046 0 111616 468480 \n",
3199 | "\n",
3200 | " SizeOfUninitializedData AddressOfEntryPoint BaseOfCode ... \\\n",
3201 | "0 0 6135 4096 ... \n",
3202 | "1 0 81778 4096 ... \n",
3203 | "2 0 350896 4096 ... \n",
3204 | "3 0 451258 4096 ... \n",
3205 | "4 0 217381 4096 ... \n",
3206 | "... ... ... ... ... \n",
3207 | "138042 0 123291 4096 ... \n",
3208 | "138043 0 40000 4096 ... \n",
3209 | "138044 0 59610 4096 ... \n",
3210 | "138045 0 51216 4096 ... \n",
3211 | "138046 0 22731 4096 ... \n",
3212 | "\n",
3213 | " ExportNb ResourcesNb ResourcesMeanEntropy ResourcesMinEntropy \\\n",
3214 | "0 0 4 3.262823 2.568844 \n",
3215 | "1 0 2 4.250461 3.420744 \n",
3216 | "2 1 11 4.426324 2.846449 \n",
3217 | "3 1 10 4.364291 2.669314 \n",
3218 | "4 1 2 4.306100 3.421598 \n",
3219 | "... ... ... ... ... \n",
3220 | "138042 0 7 4.122736 1.370260 \n",
3221 | "138043 0 26 3.377663 2.031619 \n",
3222 | "138044 0 22 6.825406 2.617026 \n",
3223 | "138045 0 10 3.421627 2.060964 \n",
3224 | "138046 0 4 4.407252 1.980482 \n",
3225 | "\n",
3226 | " ResourcesMaxEntropy ResourcesMeanSize ResourcesMinSize \\\n",
3227 | "0 3.537939 8797.000000 216 \n",
3228 | "1 5.080177 837.000000 518 \n",
3229 | "2 5.271813 31102.272727 104 \n",
3230 | "3 6.400720 1457.000000 90 \n",
3231 | "4 5.190603 1074.500000 849 \n",
3232 | "... ... ... ... \n",
3233 | "138042 7.677091 14900.714286 16 \n",
3234 | "138043 5.050074 6905.846154 44 \n",
3235 | "138044 7.990487 14981.909091 48 \n",
3236 | "138045 4.739744 601.600000 16 \n",
3237 | "138046 6.115374 96625.000000 20 \n",
3238 | "\n",
3239 | " ResourcesMaxSize LoadConfigurationSize VersionInformationSize \n",
3240 | "0 18032 0 16 \n",
3241 | "1 1156 72 18 \n",
3242 | "2 270376 72 18 \n",
3243 | "3 4264 72 18 \n",
3244 | "4 1300 72 18 \n",
3245 | "... ... ... ... \n",
3246 | "138042 81654 72 0 \n",
3247 | "138043 67624 0 15 \n",
3248 | "138044 22648 72 14 \n",
3249 | "138045 2216 0 0 \n",
3250 | "138046 318464 72 0 \n",
3251 | "\n",
3252 | "[138047 rows x 54 columns]"
3253 | ]
3254 | },
3255 | "execution_count": 33,
3256 | "metadata": {},
3257 | "output_type": "execute_result"
3258 | }
3259 | ],
3260 | "source": [
3261 | "mc=dataset.drop([\"Name\",'md5','legitimate'],axis=1) #independent features\n",
3262 | "mc"
3263 | ]
3264 | },
3265 | {
3266 | "cell_type": "code",
3267 | "execution_count": 34,
3268 | "id": "6adb0606",
3269 | "metadata": {},
3270 | "outputs": [
3271 | {
3272 | "name": "stdout",
3273 | "output_type": "stream",
3274 | "text": [
3275 | "Variance Inflation Factor for Machine: 1.19\n",
3276 | "Variance Inflation Factor for SizeOfOptionalHeader: 0.02\n",
3277 | "Variance Inflation Factor for Characteristics: 1.43\n",
3278 | "Variance Inflation Factor for MajorLinkerVersion: 1.19\n",
3279 | "Variance Inflation Factor for MinorLinkerVersion: 1.5\n",
3280 | "Variance Inflation Factor for SizeOfCode: 5.13\n",
3281 | "Variance Inflation Factor for SizeOfInitializedData: 1.57\n",
3282 | "Variance Inflation Factor for SizeOfUninitializedData: 1.0\n",
3283 | "Variance Inflation Factor for AddressOfEntryPoint: 1.07\n",
3284 | "Variance Inflation Factor for BaseOfCode: 4.27\n",
3285 | "Variance Inflation Factor for BaseOfData: 1.92\n",
3286 | "Variance Inflation Factor for ImageBase: 1.0\n",
3287 | "Variance Inflation Factor for SectionAlignment: 2.06\n",
3288 | "Variance Inflation Factor for FileAlignment: 1.09\n",
3289 | "Variance Inflation Factor for MajorOperatingSystemVersion: 1.0\n",
3290 | "Variance Inflation Factor for MinorOperatingSystemVersion: 4.16\n",
3291 | "Variance Inflation Factor for MajorImageVersion: 203.26\n",
3292 | "Variance Inflation Factor for MinorImageVersion: 186.8\n",
3293 | "Variance Inflation Factor for MajorSubsystemVersion: 0.6\n",
3294 | "Variance Inflation Factor for MinorSubsystemVersion: 17345.88\n",
3295 | "Variance Inflation Factor for SizeOfImage: 2.86\n",
3296 | "Variance Inflation Factor for SizeOfHeaders: 1.05\n",
3297 | "Variance Inflation Factor for CheckSum: 1.04\n",
3298 | "Variance Inflation Factor for Subsystem: 0.65\n",
3299 | "Variance Inflation Factor for DllCharacteristics: 1.63\n",
3300 | "Variance Inflation Factor for SizeOfStackReserve: 1.31\n",
3301 | "Variance Inflation Factor for SizeOfStackCommit: 1.03\n",
3302 | "Variance Inflation Factor for SizeOfHeapReserve: 0.57\n",
3303 | "Variance Inflation Factor for SizeOfHeapCommit: 140.51\n",
3304 | "Variance Inflation Factor for LoaderFlags: 143.64\n",
3305 | "Variance Inflation Factor for NumberOfRvaAndSizes: 4.65\n",
3306 | "Variance Inflation Factor for SectionsNb: 1.15\n",
3307 | "Variance Inflation Factor for SectionsMeanEntropy: 1.03\n",
3308 | "Variance Inflation Factor for SectionsMinEntropy: 1.18\n",
3309 | "Variance Inflation Factor for SectionsMaxEntropy: 0.7\n",
3310 | "Variance Inflation Factor for SectionsMeanRawsize: 30.3\n",
3311 | "Variance Inflation Factor for SectionsMinRawsize: 619.0\n",
3312 | "Variance Inflation Factor for SectionMaxRawsize: 26.68\n",
3313 | "Variance Inflation Factor for SectionsMeanVirtualsize: 138.58\n",
3314 | "Variance Inflation Factor for SectionsMinVirtualsize: 622.11\n",
3315 | "Variance Inflation Factor for SectionMaxVirtualsize: 146.14\n",
3316 | "Variance Inflation Factor for ImportsNbDLL: 1.42\n",
3317 | "Variance Inflation Factor for ImportsNb: 1.2\n",
3318 | "Variance Inflation Factor for ImportsNbOrdinal: 1.28\n",
3319 | "Variance Inflation Factor for ExportNb: 1.06\n",
3320 | "Variance Inflation Factor for ResourcesNb: 1.24\n",
3321 | "Variance Inflation Factor for ResourcesMeanEntropy: 0.89\n",
3322 | "Variance Inflation Factor for ResourcesMinEntropy: 0.88\n",
3323 | "Variance Inflation Factor for ResourcesMaxEntropy: 1.16\n",
3324 | "Variance Inflation Factor for ResourcesMeanSize: 13.04\n",
3325 | "Variance Inflation Factor for ResourcesMinSize: 7.14\n",
3326 | "Variance Inflation Factor for ResourcesMaxSize: 4.39\n",
3327 | "Variance Inflation Factor for LoadConfigurationSize: 1.0\n"
3328 | ]
3329 | }
3330 | ],
3331 | "source": [
3332 | "for i in range(len(mc.columns[:-1])):\n",
3333 | " v=vif(np.matrix(mc[:-1]),i)\n",
3334 | " print(\"Variance Inflation Factor for {}: {}\".format(mc.columns[i],round(v,2)))"
3335 | ]
3336 | },
3337 | {
3338 | "cell_type": "code",
3339 | "execution_count": 35,
3340 | "id": "5cc83368",
3341 | "metadata": {},
3342 | "outputs": [
3343 | {
3344 | "name": "stdout",
3345 | "output_type": "stream",
3346 | "text": [
3347 | "Variance Inflation Factor for MajorImageVersion : 203.26\n",
3348 | "Variance Inflation Factor for MinorImageVersion : 186.8\n",
3349 | "Variance Inflation Factor for MinorSubsystemVersion : 17345.88\n",
3350 | "Variance Inflation Factor for SizeOfHeapCommit : 140.51\n",
3351 | "Variance Inflation Factor for LoaderFlags : 143.64\n",
3352 | "Variance Inflation Factor for SectionsMeanRawsize : 30.3\n",
3353 | "Variance Inflation Factor for SectionsMinRawsize : 619.0\n",
3354 | "Variance Inflation Factor for SectionMaxRawsize : 26.68\n",
3355 | "Variance Inflation Factor for SectionsMeanVirtualsize : 138.58\n",
3356 | "Variance Inflation Factor for SectionsMinVirtualsize : 622.11\n",
3357 | "Variance Inflation Factor for SectionMaxVirtualsize : 146.14\n",
3358 | "Variance Inflation Factor for ResourcesMeanSize : 13.04\n",
3359 | "12\n"
3360 | ]
3361 | }
3362 | ],
3363 | "source": [
3364 | "count=0\n",
3365 | "for i in range(len(mc.columns[:-1])):\n",
3366 | " v=vif(np.matrix(mc[:-1]),i)\n",
3367 | " if v>10:\n",
3368 | " print(\"Variance Inflation Factor for {} : {}\".format(mc.columns[i],round(v,2)))\n",
3369 | " count=count+1\n",
3370 | "print(count) "
3371 | ]
3372 | },
3373 | {
3374 | "cell_type": "code",
3375 | "execution_count": 38,
3376 | "id": "28be04aa",
3377 | "metadata": {},
3378 | "outputs": [],
3379 | "source": [
3380 | "# Remove Multicollinearity"
3381 | ]
3382 | },
3383 | {
3384 | "cell_type": "code",
3385 | "execution_count": 39,
3386 | "id": "57f8e502",
3387 | "metadata": {},
3388 | "outputs": [],
3389 | "source": [
3390 | "x=dataset.drop(['Name','md5','legitimate','MajorImageVersion','MinorImageVersion','MinorSubsystemVersion','SizeOfHeapCommit','LoaderFlags','SectionsMeanRawsize','SectionsMeanVirtualsize','ResourcesMeanSize'],axis=1).values\n",
3391 | "y=dataset['legitimate'].values #dependent variable"
3392 | ]
3393 | },
3394 | {
3395 | "cell_type": "code",
3396 | "execution_count": 40,
3397 | "id": "74d1f980",
3398 | "metadata": {},
3399 | "outputs": [],
3400 | "source": [
3401 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n",
3402 | "model=SelectFromModel(extratrees,prefit=True)\n",
3403 | "x_new=model.transform(x)\n",
3404 | "nbfeatures=x_new.shape[1]"
3405 | ]
3406 | },
3407 | {
3408 | "cell_type": "code",
3409 | "execution_count": 41,
3410 | "id": "9a959102",
3411 | "metadata": {},
3412 | "outputs": [
3413 | {
3414 | "data": {
3415 | "text/plain": [
3416 | "12"
3417 | ]
3418 | },
3419 | "execution_count": 41,
3420 | "metadata": {},
3421 | "output_type": "execute_result"
3422 | }
3423 | ],
3424 | "source": [
3425 | "nbfeatures"
3426 | ]
3427 | },
3428 | {
3429 | "cell_type": "code",
3430 | "execution_count": 42,
3431 | "id": "3cbf22c4",
3432 | "metadata": {},
3433 | "outputs": [
3434 | {
3435 | "data": {
3436 | "text/plain": [
3437 | "([,\n",
3438 | " ],\n",
3439 | " [Text(0.8680545570066952, 0.675633988236168, 'Important Features'),\n",
3440 | " Text(-0.8680544937492721, -0.675634069509298, 'Not Important Features')],\n",
3441 | " [Text(0.4734843038218337, 0.3685276299470007, '21%'),\n",
3442 | " Text(-0.47348426931778476, -0.36852767427779887, '79%')])"
3443 | ]
3444 | },
3445 | "execution_count": 42,
3446 | "metadata": {},
3447 | "output_type": "execute_result"
3448 | },
3449 | {
3450 | "data": {
3451 | "image/png": "\n",
3452 | "text/plain": [
3453 | ""
3454 | ]
3455 | },
3456 | "metadata": {},
3457 | "output_type": "display_data"
3458 | }
3459 | ],
3460 | "source": [
3461 | "dataset.columns.size\n",
3462 | "imp_features_visual=['Important Features','Not Important Features']\n",
3463 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n",
3464 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')"
3465 | ]
3466 | },
3467 | {
3468 | "cell_type": "code",
3469 | "execution_count": 43,
3470 | "id": "b018fe70",
3471 | "metadata": {},
3472 | "outputs": [],
3473 | "source": [
3474 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)"
3475 | ]
3476 | },
3477 | {
3478 | "cell_type": "code",
3479 | "execution_count": 44,
3480 | "id": "3d0336b6",
3481 | "metadata": {},
3482 | "outputs": [],
3483 | "source": [
3484 | "features=[]\n",
3485 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]"
3486 | ]
3487 | },
3488 | {
3489 | "cell_type": "code",
3490 | "execution_count": 45,
3491 | "id": "bc732e5b",
3492 | "metadata": {},
3493 | "outputs": [
3494 | {
3495 | "name": "stdout",
3496 | "output_type": "stream",
3497 | "text": [
3498 | "1. feature SizeOfStackReserve (0.000042)\n",
3499 | "2. feature SizeOfUninitializedData (0.000998)\n",
3500 | "3. feature BaseOfCode (0.001076)\n",
3501 | "4. feature SizeOfInitializedData (0.001339)\n",
3502 | "5. feature MinorImageVersion (0.001343)\n",
3503 | "6. feature SectionsNb (0.001391)\n",
3504 | "7. feature SectionsMinEntropy (0.001397)\n",
3505 | "8. feature BaseOfData (0.001520)\n",
3506 | "9. feature ImportsNbOrdinal (0.001540)\n",
3507 | "10. feature DllCharacteristics (0.001577)\n",
3508 | "11. feature SizeOfCode (0.001702)\n",
3509 | "12. feature AddressOfEntryPoint (0.002303)\n"
3510 | ]
3511 | }
3512 | ],
3513 | "source": [
3514 | "for f in range(nbfeatures):\n",
3515 | " print(\"%d. feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n",
3516 | " features.append(dataset.columns[2+f])"
3517 | ]
3518 | },
3519 | {
3520 | "cell_type": "code",
3521 | "execution_count": 46,
3522 | "id": "8019777c",
3523 | "metadata": {},
3524 | "outputs": [],
3525 | "source": [
3526 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n",
3527 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n",
3528 | " \"LogisticRegression\":LogisticRegression()\n",
3529 | " }"
3530 | ]
3531 | },
3532 | {
3533 | "cell_type": "code",
3534 | "execution_count": 47,
3535 | "id": "26424604",
3536 | "metadata": {},
3537 | "outputs": [
3538 | {
3539 | "name": "stdout",
3540 | "output_type": "stream",
3541 | "text": [
3542 | "RandomForest : 0.9942774357116987\n",
3543 | "DecisionTree : 0.9903295907279971\n",
3544 | "LogisticRegression : 0.6968489677653025\n"
3545 | ]
3546 | },
3547 | {
3548 | "name": "stderr",
3549 | "output_type": "stream",
3550 | "text": [
3551 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n",
3552 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n",
3553 | "\n",
3554 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
3555 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
3556 | "Please also refer to the documentation for alternative solver options:\n",
3557 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
3558 | " n_iter_i = _check_optimize_result(\n"
3559 | ]
3560 | }
3561 | ],
3562 | "source": [
3563 | "results={}\n",
3564 | "for algo in model:\n",
3565 | " clf=model[algo]\n",
3566 | " clf.fit(x_train,y_train)\n",
3567 | " score=clf.score(x_test,y_test)\n",
3568 | " print(\"%s : %s\"%(algo,score))\n",
3569 | " results[algo]=score"
3570 | ]
3571 | },
3572 | {
3573 | "cell_type": "code",
3574 | "execution_count": 48,
3575 | "id": "295df33e",
3576 | "metadata": {},
3577 | "outputs": [
3578 | {
3579 | "data": {
3580 | "text/plain": [
3581 | "'RandomForest'"
3582 | ]
3583 | },
3584 | "execution_count": 48,
3585 | "metadata": {},
3586 | "output_type": "execute_result"
3587 | }
3588 | ],
3589 | "source": [
3590 | "winner=max(results,key=results.get)\n",
3591 | "winner"
3592 | ]
3593 | },
3594 | {
3595 | "cell_type": "code",
3596 | "execution_count": 49,
3597 | "id": "b4203503",
3598 | "metadata": {},
3599 | "outputs": [
3600 | {
3601 | "name": "stdout",
3602 | "output_type": "stream",
3603 | "text": [
3604 | "False positive rate : 0.102353 %\n",
3605 | "False negative rate : 0.174237 %\n"
3606 | ]
3607 | }
3608 | ],
3609 | "source": [
3610 | "clf=model[winner]\n",
3611 | "res=clf.predict(x_new)\n",
3612 | "mt=confusion_matrix(y,res)\n",
3613 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
3614 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))"
3615 | ]
3616 | },
3617 | {
3618 | "cell_type": "code",
3619 | "execution_count": 50,
3620 | "id": "395ebe6a",
3621 | "metadata": {},
3622 | "outputs": [],
3623 | "source": [
3624 | "# Confusion Matrix"
3625 | ]
3626 | },
3627 | {
3628 | "cell_type": "code",
3629 | "execution_count": 51,
3630 | "id": "619f2146",
3631 | "metadata": {},
3632 | "outputs": [
3633 | {
3634 | "data": {
3635 | "text/plain": [
3636 | "array([[96625, 99],\n",
3637 | " [ 72, 41251]], dtype=int64)"
3638 | ]
3639 | },
3640 | "execution_count": 51,
3641 | "metadata": {},
3642 | "output_type": "execute_result"
3643 | }
3644 | ],
3645 | "source": [
3646 | "cf=confusion_matrix(y,res)\n",
3647 | "cf"
3648 | ]
3649 | },
3650 | {
3651 | "cell_type": "code",
3652 | "execution_count": 52,
3653 | "id": "87bbf42c",
3654 | "metadata": {},
3655 | "outputs": [
3656 | {
3657 | "data": {
3658 | "image/png": "\n",
3659 | "text/plain": [
3660 | ""
3661 | ]
3662 | },
3663 | "metadata": {
3664 | "needs_background": "light"
3665 | },
3666 | "output_type": "display_data"
3667 | }
3668 | ],
3669 | "source": [
3670 | "plot_confusion_matrix(conf_mat=cf)\n",
3671 | "plt.xlabel(\"Actual\")\n",
3672 | "plt.ylabel(\"Predicted\")\n",
3673 | "plt.title(\"Confusion Matrix - Key: 0 is Legitimate & 1 is Malware\")\n",
3674 | "plt.show()"
3675 | ]
3676 | },
3677 | {
3678 | "cell_type": "code",
3679 | "execution_count": null,
3680 | "id": "eadbd20c",
3681 | "metadata": {},
3682 | "outputs": [],
3683 | "source": []
3684 | }
3685 | ],
3686 | "metadata": {
3687 | "kernelspec": {
3688 | "display_name": "Python 3",
3689 | "language": "python",
3690 | "name": "python3"
3691 | },
3692 | "language_info": {
3693 | "codemirror_mode": {
3694 | "name": "ipython",
3695 | "version": 3
3696 | },
3697 | "file_extension": ".py",
3698 | "mimetype": "text/x-python",
3699 | "name": "python",
3700 | "nbconvert_exporter": "python",
3701 | "pygments_lexer": "ipython3",
3702 | "version": "3.8.8"
3703 | }
3704 | },
3705 | "nbformat": 4,
3706 | "nbformat_minor": 5
3707 | }
3708 |
--------------------------------------------------------------------------------