├── README.md
└── RansomwareD.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Ransomeware Detection Using ML
2 |
3 | ### Machine Learning Algorithms used are:
4 |
5 | 1. Random Forest
6 | 2. Decision Tree
7 | 3. Logistic Regression
8 |
9 | ### Additional Libraries Used:
10 |
11 | * pefile
12 | * pickle
13 | * joblib
14 | * mlxtend
15 | * statsmodel
16 | * sklearn
17 |
18 | ### Concepts Used:
19 |
20 | * Multicollinearity
21 | * Ensemble Technique
22 | * Extra Tree Classifier
23 |
--------------------------------------------------------------------------------
/RansomwareD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 6,
6 | "id": "2bca8f3e",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Collecting pefile\n",
14 | " Using cached pefile-2021.5.24.tar.gz (66 kB)\n",
15 | "Requirement already satisfied: future in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pefile) (0.18.2)\n",
16 | "Building wheels for collected packages: pefile\n",
17 | " Building wheel for pefile (setup.py): started\n",
18 | " Building wheel for pefile (setup.py): finished with status 'done'\n",
19 | " Created wheel for pefile: filename=pefile-2021.5.24-py3-none-any.whl size=62578 sha256=cf20a74be7fc5f7210d0a6f4e3714ed405c5d8da883caaeb0d173f1927786bf5\n",
20 | " Stored in directory: c:\\users\\vajha\\appdata\\local\\pip\\cache\\wheels\\43\\04\\fc\\d9305103f7d512f2df35b1878e1009e8217e713b767aee8f13\n",
21 | "Successfully built pefile\n",
22 | "Installing collected packages: pefile\n",
23 | "Successfully installed pefile-2021.5.24\n",
24 | "Note: you may need to restart the kernel to use updated packages.\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "pip install pefile"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "id": "817480c1",
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "Collecting mlxtend\n",
43 | " Using cached mlxtend-0.18.0-py2.py3-none-any.whl (1.3 MB)\n",
44 | "Requirement already satisfied: setuptools in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (52.0.0.post20210125)\n",
45 | "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.0.1)\n",
46 | "Requirement already satisfied: scikit-learn>=0.20.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (0.24.1)\n",
47 | "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.6.2)\n",
48 | "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.2.4)\n",
49 | "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (3.3.4)\n",
50 | "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from mlxtend) (1.20.1)\n",
51 | "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.1)\n",
52 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.4.7)\n",
53 | "Requirement already satisfied: cycler>=0.10 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.10.0)\n",
54 | "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (8.2.0)\n",
55 | "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.3.1)\n",
56 | "Requirement already satisfied: six in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib>=3.0.0->mlxtend) (1.15.0)\n",
57 | "Requirement already satisfied: pytz>=2017.3 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2021.1)\n",
58 | "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\vajha\\anaconda3\\lib\\site-packages (from scikit-learn>=0.20.3->mlxtend) (2.1.0)\n",
59 | "Installing collected packages: mlxtend\n",
60 | "Successfully installed mlxtend-0.18.0\n",
61 | "Note: you may need to restart the kernel to use updated packages.\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "pip install mlxtend"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "id": "b94512f2",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "import os\n",
77 | "import pandas as pd\n",
78 | "import numpy as np\n",
79 | "from matplotlib import pyplot as plt\n",
80 | "import pickle\n",
81 | "import pefile\n",
82 | "import sklearn.ensemble as ek\n",
83 | "from sklearn import tree, linear_model\n",
84 | "from sklearn.feature_selection import SelectFromModel\n",
85 | "import joblib\n",
86 | "from sklearn.naive_bayes import GaussianNB\n",
87 | "from sklearn.metrics import confusion_matrix\n",
88 | "from sklearn.pipeline import make_pipeline\n",
89 | "from sklearn import preprocessing\n",
90 | "from sklearn import svm\n",
91 | "from sklearn.linear_model import LogisticRegression\n",
92 | "from statsmodels.stats.outliers_influence import variance_inflation_factor as vif\n",
93 | "from sklearn.model_selection import train_test_split\n",
94 | "from mlxtend.plotting import plot_confusion_matrix\n",
95 | "dataset=pd.read_csv(\"Ransomware.csv\",sep='|')"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 4,
101 | "id": "cde9b494",
102 | "metadata": {},
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/html": [
107 | "
\n",
108 | "\n",
121 | "
\n",
122 | " \n",
123 | " \n",
124 | " | \n",
125 | " Name | \n",
126 | " md5 | \n",
127 | " Machine | \n",
128 | " SizeOfOptionalHeader | \n",
129 | " Characteristics | \n",
130 | " MajorLinkerVersion | \n",
131 | " MinorLinkerVersion | \n",
132 | " SizeOfCode | \n",
133 | " SizeOfInitializedData | \n",
134 | " SizeOfUninitializedData | \n",
135 | " ... | \n",
136 | " ResourcesNb | \n",
137 | " ResourcesMeanEntropy | \n",
138 | " ResourcesMinEntropy | \n",
139 | " ResourcesMaxEntropy | \n",
140 | " ResourcesMeanSize | \n",
141 | " ResourcesMinSize | \n",
142 | " ResourcesMaxSize | \n",
143 | " LoadConfigurationSize | \n",
144 | " VersionInformationSize | \n",
145 | " legitimate | \n",
146 | "
\n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " 0 | \n",
151 | " memtest.exe | \n",
152 | " 631ea355665f28d4707448e442fbf5b8 | \n",
153 | " 332 | \n",
154 | " 224 | \n",
155 | " 258 | \n",
156 | " 9 | \n",
157 | " 0 | \n",
158 | " 361984 | \n",
159 | " 115712 | \n",
160 | " 0 | \n",
161 | " ... | \n",
162 | " 4 | \n",
163 | " 3.262823 | \n",
164 | " 2.568844 | \n",
165 | " 3.537939 | \n",
166 | " 8797.000000 | \n",
167 | " 216 | \n",
168 | " 18032 | \n",
169 | " 0 | \n",
170 | " 16 | \n",
171 | " 1 | \n",
172 | "
\n",
173 | " \n",
174 | " 1 | \n",
175 | " ose.exe | \n",
176 | " 9d10f99a6712e28f8acd5641e3a7ea6b | \n",
177 | " 332 | \n",
178 | " 224 | \n",
179 | " 3330 | \n",
180 | " 9 | \n",
181 | " 0 | \n",
182 | " 130560 | \n",
183 | " 19968 | \n",
184 | " 0 | \n",
185 | " ... | \n",
186 | " 2 | \n",
187 | " 4.250461 | \n",
188 | " 3.420744 | \n",
189 | " 5.080177 | \n",
190 | " 837.000000 | \n",
191 | " 518 | \n",
192 | " 1156 | \n",
193 | " 72 | \n",
194 | " 18 | \n",
195 | " 1 | \n",
196 | "
\n",
197 | " \n",
198 | " 2 | \n",
199 | " setup.exe | \n",
200 | " 4d92f518527353c0db88a70fddcfd390 | \n",
201 | " 332 | \n",
202 | " 224 | \n",
203 | " 3330 | \n",
204 | " 9 | \n",
205 | " 0 | \n",
206 | " 517120 | \n",
207 | " 621568 | \n",
208 | " 0 | \n",
209 | " ... | \n",
210 | " 11 | \n",
211 | " 4.426324 | \n",
212 | " 2.846449 | \n",
213 | " 5.271813 | \n",
214 | " 31102.272727 | \n",
215 | " 104 | \n",
216 | " 270376 | \n",
217 | " 72 | \n",
218 | " 18 | \n",
219 | " 1 | \n",
220 | "
\n",
221 | " \n",
222 | " 3 | \n",
223 | " DW20.EXE | \n",
224 | " a41e524f8d45f0074fd07805ff0c9b12 | \n",
225 | " 332 | \n",
226 | " 224 | \n",
227 | " 258 | \n",
228 | " 9 | \n",
229 | " 0 | \n",
230 | " 585728 | \n",
231 | " 369152 | \n",
232 | " 0 | \n",
233 | " ... | \n",
234 | " 10 | \n",
235 | " 4.364291 | \n",
236 | " 2.669314 | \n",
237 | " 6.400720 | \n",
238 | " 1457.000000 | \n",
239 | " 90 | \n",
240 | " 4264 | \n",
241 | " 72 | \n",
242 | " 18 | \n",
243 | " 1 | \n",
244 | "
\n",
245 | " \n",
246 | " 4 | \n",
247 | " dwtrig20.exe | \n",
248 | " c87e561258f2f8650cef999bf643a731 | \n",
249 | " 332 | \n",
250 | " 224 | \n",
251 | " 258 | \n",
252 | " 9 | \n",
253 | " 0 | \n",
254 | " 294912 | \n",
255 | " 247296 | \n",
256 | " 0 | \n",
257 | " ... | \n",
258 | " 2 | \n",
259 | " 4.306100 | \n",
260 | " 3.421598 | \n",
261 | " 5.190603 | \n",
262 | " 1074.500000 | \n",
263 | " 849 | \n",
264 | " 1300 | \n",
265 | " 72 | \n",
266 | " 18 | \n",
267 | " 1 | \n",
268 | "
\n",
269 | " \n",
270 | " ... | \n",
271 | " ... | \n",
272 | " ... | \n",
273 | " ... | \n",
274 | " ... | \n",
275 | " ... | \n",
276 | " ... | \n",
277 | " ... | \n",
278 | " ... | \n",
279 | " ... | \n",
280 | " ... | \n",
281 | " ... | \n",
282 | " ... | \n",
283 | " ... | \n",
284 | " ... | \n",
285 | " ... | \n",
286 | " ... | \n",
287 | " ... | \n",
288 | " ... | \n",
289 | " ... | \n",
290 | " ... | \n",
291 | " ... | \n",
292 | "
\n",
293 | " \n",
294 | " 138042 | \n",
295 | " VirusShare_8e292b418568d6e7b87f2a32aee7074b | \n",
296 | " 8e292b418568d6e7b87f2a32aee7074b | \n",
297 | " 332 | \n",
298 | " 224 | \n",
299 | " 258 | \n",
300 | " 11 | \n",
301 | " 0 | \n",
302 | " 205824 | \n",
303 | " 223744 | \n",
304 | " 0 | \n",
305 | " ... | \n",
306 | " 7 | \n",
307 | " 4.122736 | \n",
308 | " 1.370260 | \n",
309 | " 7.677091 | \n",
310 | " 14900.714286 | \n",
311 | " 16 | \n",
312 | " 81654 | \n",
313 | " 72 | \n",
314 | " 0 | \n",
315 | " 0 | \n",
316 | "
\n",
317 | " \n",
318 | " 138043 | \n",
319 | " VirusShare_260d9e2258aed4c8a3bbd703ec895822 | \n",
320 | " 260d9e2258aed4c8a3bbd703ec895822 | \n",
321 | " 332 | \n",
322 | " 224 | \n",
323 | " 33167 | \n",
324 | " 2 | \n",
325 | " 25 | \n",
326 | " 37888 | \n",
327 | " 185344 | \n",
328 | " 0 | \n",
329 | " ... | \n",
330 | " 26 | \n",
331 | " 3.377663 | \n",
332 | " 2.031619 | \n",
333 | " 5.050074 | \n",
334 | " 6905.846154 | \n",
335 | " 44 | \n",
336 | " 67624 | \n",
337 | " 0 | \n",
338 | " 15 | \n",
339 | " 0 | \n",
340 | "
\n",
341 | " \n",
342 | " 138044 | \n",
343 | " VirusShare_8d088a51b7d225c9f5d11d239791ec3f | \n",
344 | " 8d088a51b7d225c9f5d11d239791ec3f | \n",
345 | " 332 | \n",
346 | " 224 | \n",
347 | " 258 | \n",
348 | " 10 | \n",
349 | " 0 | \n",
350 | " 118272 | \n",
351 | " 380416 | \n",
352 | " 0 | \n",
353 | " ... | \n",
354 | " 22 | \n",
355 | " 6.825406 | \n",
356 | " 2.617026 | \n",
357 | " 7.990487 | \n",
358 | " 14981.909091 | \n",
359 | " 48 | \n",
360 | " 22648 | \n",
361 | " 72 | \n",
362 | " 14 | \n",
363 | " 0 | \n",
364 | "
\n",
365 | " \n",
366 | " 138045 | \n",
367 | " VirusShare_4286dccf67ca220fe67635388229a9f3 | \n",
368 | " 4286dccf67ca220fe67635388229a9f3 | \n",
369 | " 332 | \n",
370 | " 224 | \n",
371 | " 33166 | \n",
372 | " 2 | \n",
373 | " 25 | \n",
374 | " 49152 | \n",
375 | " 16896 | \n",
376 | " 0 | \n",
377 | " ... | \n",
378 | " 10 | \n",
379 | " 3.421627 | \n",
380 | " 2.060964 | \n",
381 | " 4.739744 | \n",
382 | " 601.600000 | \n",
383 | " 16 | \n",
384 | " 2216 | \n",
385 | " 0 | \n",
386 | " 0 | \n",
387 | " 0 | \n",
388 | "
\n",
389 | " \n",
390 | " 138046 | \n",
391 | " VirusShare_d7648eae45f09b3adb75127f43be6d11 | \n",
392 | " d7648eae45f09b3adb75127f43be6d11 | \n",
393 | " 332 | \n",
394 | " 224 | \n",
395 | " 258 | \n",
396 | " 11 | \n",
397 | " 0 | \n",
398 | " 111616 | \n",
399 | " 468480 | \n",
400 | " 0 | \n",
401 | " ... | \n",
402 | " 4 | \n",
403 | " 4.407252 | \n",
404 | " 1.980482 | \n",
405 | " 6.115374 | \n",
406 | " 96625.000000 | \n",
407 | " 20 | \n",
408 | " 318464 | \n",
409 | " 72 | \n",
410 | " 0 | \n",
411 | " 0 | \n",
412 | "
\n",
413 | " \n",
414 | "
\n",
415 | "
138047 rows × 57 columns
\n",
416 | "
"
417 | ],
418 | "text/plain": [
419 | " Name \\\n",
420 | "0 memtest.exe \n",
421 | "1 ose.exe \n",
422 | "2 setup.exe \n",
423 | "3 DW20.EXE \n",
424 | "4 dwtrig20.exe \n",
425 | "... ... \n",
426 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n",
427 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n",
428 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n",
429 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n",
430 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n",
431 | "\n",
432 | " md5 Machine SizeOfOptionalHeader \\\n",
433 | "0 631ea355665f28d4707448e442fbf5b8 332 224 \n",
434 | "1 9d10f99a6712e28f8acd5641e3a7ea6b 332 224 \n",
435 | "2 4d92f518527353c0db88a70fddcfd390 332 224 \n",
436 | "3 a41e524f8d45f0074fd07805ff0c9b12 332 224 \n",
437 | "4 c87e561258f2f8650cef999bf643a731 332 224 \n",
438 | "... ... ... ... \n",
439 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n",
440 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n",
441 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n",
442 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n",
443 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n",
444 | "\n",
445 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
446 | "0 258 9 0 361984 \n",
447 | "1 3330 9 0 130560 \n",
448 | "2 3330 9 0 517120 \n",
449 | "3 258 9 0 585728 \n",
450 | "4 258 9 0 294912 \n",
451 | "... ... ... ... ... \n",
452 | "138042 258 11 0 205824 \n",
453 | "138043 33167 2 25 37888 \n",
454 | "138044 258 10 0 118272 \n",
455 | "138045 33166 2 25 49152 \n",
456 | "138046 258 11 0 111616 \n",
457 | "\n",
458 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
459 | "0 115712 0 ... 4 \n",
460 | "1 19968 0 ... 2 \n",
461 | "2 621568 0 ... 11 \n",
462 | "3 369152 0 ... 10 \n",
463 | "4 247296 0 ... 2 \n",
464 | "... ... ... ... ... \n",
465 | "138042 223744 0 ... 7 \n",
466 | "138043 185344 0 ... 26 \n",
467 | "138044 380416 0 ... 22 \n",
468 | "138045 16896 0 ... 10 \n",
469 | "138046 468480 0 ... 4 \n",
470 | "\n",
471 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
472 | "0 3.262823 2.568844 3.537939 \n",
473 | "1 4.250461 3.420744 5.080177 \n",
474 | "2 4.426324 2.846449 5.271813 \n",
475 | "3 4.364291 2.669314 6.400720 \n",
476 | "4 4.306100 3.421598 5.190603 \n",
477 | "... ... ... ... \n",
478 | "138042 4.122736 1.370260 7.677091 \n",
479 | "138043 3.377663 2.031619 5.050074 \n",
480 | "138044 6.825406 2.617026 7.990487 \n",
481 | "138045 3.421627 2.060964 4.739744 \n",
482 | "138046 4.407252 1.980482 6.115374 \n",
483 | "\n",
484 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
485 | "0 8797.000000 216 18032 \n",
486 | "1 837.000000 518 1156 \n",
487 | "2 31102.272727 104 270376 \n",
488 | "3 1457.000000 90 4264 \n",
489 | "4 1074.500000 849 1300 \n",
490 | "... ... ... ... \n",
491 | "138042 14900.714286 16 81654 \n",
492 | "138043 6905.846154 44 67624 \n",
493 | "138044 14981.909091 48 22648 \n",
494 | "138045 601.600000 16 2216 \n",
495 | "138046 96625.000000 20 318464 \n",
496 | "\n",
497 | " LoadConfigurationSize VersionInformationSize legitimate \n",
498 | "0 0 16 1 \n",
499 | "1 72 18 1 \n",
500 | "2 72 18 1 \n",
501 | "3 72 18 1 \n",
502 | "4 72 18 1 \n",
503 | "... ... ... ... \n",
504 | "138042 72 0 0 \n",
505 | "138043 0 15 0 \n",
506 | "138044 72 14 0 \n",
507 | "138045 0 0 0 \n",
508 | "138046 72 0 0 \n",
509 | "\n",
510 | "[138047 rows x 57 columns]"
511 | ]
512 | },
513 | "execution_count": 4,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "dataset"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 5,
525 | "id": "9e106963",
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/html": [
531 | "\n",
532 | "\n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " | \n",
549 | " Machine | \n",
550 | " SizeOfOptionalHeader | \n",
551 | " Characteristics | \n",
552 | " MajorLinkerVersion | \n",
553 | " MinorLinkerVersion | \n",
554 | " SizeOfCode | \n",
555 | " SizeOfInitializedData | \n",
556 | " SizeOfUninitializedData | \n",
557 | " AddressOfEntryPoint | \n",
558 | " BaseOfCode | \n",
559 | " ... | \n",
560 | " ResourcesNb | \n",
561 | " ResourcesMeanEntropy | \n",
562 | " ResourcesMinEntropy | \n",
563 | " ResourcesMaxEntropy | \n",
564 | " ResourcesMeanSize | \n",
565 | " ResourcesMinSize | \n",
566 | " ResourcesMaxSize | \n",
567 | " LoadConfigurationSize | \n",
568 | " VersionInformationSize | \n",
569 | " legitimate | \n",
570 | "
\n",
571 | " \n",
572 | " \n",
573 | " \n",
574 | " count | \n",
575 | " 138047.000000 | \n",
576 | " 138047.000000 | \n",
577 | " 138047.000000 | \n",
578 | " 138047.000000 | \n",
579 | " 138047.000000 | \n",
580 | " 1.380470e+05 | \n",
581 | " 1.380470e+05 | \n",
582 | " 1.380470e+05 | \n",
583 | " 1.380470e+05 | \n",
584 | " 1.380470e+05 | \n",
585 | " ... | \n",
586 | " 138047.000000 | \n",
587 | " 138047.000000 | \n",
588 | " 138047.000000 | \n",
589 | " 138047.000000 | \n",
590 | " 1.380470e+05 | \n",
591 | " 1.380470e+05 | \n",
592 | " 1.380470e+05 | \n",
593 | " 1.380470e+05 | \n",
594 | " 138047.000000 | \n",
595 | " 138047.000000 | \n",
596 | "
\n",
597 | " \n",
598 | " mean | \n",
599 | " 4259.069274 | \n",
600 | " 225.845632 | \n",
601 | " 4444.145994 | \n",
602 | " 8.619774 | \n",
603 | " 3.819286 | \n",
604 | " 2.425956e+05 | \n",
605 | " 4.504867e+05 | \n",
606 | " 1.009525e+05 | \n",
607 | " 1.719561e+05 | \n",
608 | " 5.779845e+04 | \n",
609 | " ... | \n",
610 | " 22.050700 | \n",
611 | " 4.000127 | \n",
612 | " 2.434541 | \n",
613 | " 5.521610 | \n",
614 | " 5.545093e+04 | \n",
615 | " 1.818082e+04 | \n",
616 | " 2.465903e+05 | \n",
617 | " 4.656750e+05 | \n",
618 | " 12.363115 | \n",
619 | " 0.299340 | \n",
620 | "
\n",
621 | " \n",
622 | " std | \n",
623 | " 10880.347245 | \n",
624 | " 5.121399 | \n",
625 | " 8186.782524 | \n",
626 | " 4.088757 | \n",
627 | " 11.862675 | \n",
628 | " 5.754485e+06 | \n",
629 | " 2.101599e+07 | \n",
630 | " 1.635288e+07 | \n",
631 | " 3.430553e+06 | \n",
632 | " 5.527658e+06 | \n",
633 | " ... | \n",
634 | " 136.494244 | \n",
635 | " 1.112981 | \n",
636 | " 0.815577 | \n",
637 | " 1.597403 | \n",
638 | " 7.799163e+06 | \n",
639 | " 6.502369e+06 | \n",
640 | " 2.124860e+07 | \n",
641 | " 2.608987e+07 | \n",
642 | " 6.798878 | \n",
643 | " 0.457971 | \n",
644 | "
\n",
645 | " \n",
646 | " min | \n",
647 | " 332.000000 | \n",
648 | " 224.000000 | \n",
649 | " 2.000000 | \n",
650 | " 0.000000 | \n",
651 | " 0.000000 | \n",
652 | " 0.000000e+00 | \n",
653 | " 0.000000e+00 | \n",
654 | " 0.000000e+00 | \n",
655 | " 0.000000e+00 | \n",
656 | " 0.000000e+00 | \n",
657 | " ... | \n",
658 | " 0.000000 | \n",
659 | " 0.000000 | \n",
660 | " 0.000000 | \n",
661 | " 0.000000 | \n",
662 | " 0.000000e+00 | \n",
663 | " 0.000000e+00 | \n",
664 | " 0.000000e+00 | \n",
665 | " 0.000000e+00 | \n",
666 | " 0.000000 | \n",
667 | " 0.000000 | \n",
668 | "
\n",
669 | " \n",
670 | " 25% | \n",
671 | " 332.000000 | \n",
672 | " 224.000000 | \n",
673 | " 258.000000 | \n",
674 | " 8.000000 | \n",
675 | " 0.000000 | \n",
676 | " 3.020800e+04 | \n",
677 | " 2.457600e+04 | \n",
678 | " 0.000000e+00 | \n",
679 | " 1.272100e+04 | \n",
680 | " 4.096000e+03 | \n",
681 | " ... | \n",
682 | " 5.000000 | \n",
683 | " 3.458505 | \n",
684 | " 2.178748 | \n",
685 | " 4.828706 | \n",
686 | " 9.560000e+02 | \n",
687 | " 4.800000e+01 | \n",
688 | " 2.216000e+03 | \n",
689 | " 0.000000e+00 | \n",
690 | " 13.000000 | \n",
691 | " 0.000000 | \n",
692 | "
\n",
693 | " \n",
694 | " 50% | \n",
695 | " 332.000000 | \n",
696 | " 224.000000 | \n",
697 | " 258.000000 | \n",
698 | " 9.000000 | \n",
699 | " 0.000000 | \n",
700 | " 1.136640e+05 | \n",
701 | " 2.631680e+05 | \n",
702 | " 0.000000e+00 | \n",
703 | " 5.288300e+04 | \n",
704 | " 4.096000e+03 | \n",
705 | " ... | \n",
706 | " 6.000000 | \n",
707 | " 3.729824 | \n",
708 | " 2.458492 | \n",
709 | " 5.317552 | \n",
710 | " 2.708154e+03 | \n",
711 | " 4.800000e+01 | \n",
712 | " 9.640000e+03 | \n",
713 | " 7.200000e+01 | \n",
714 | " 15.000000 | \n",
715 | " 0.000000 | \n",
716 | "
\n",
717 | " \n",
718 | " 75% | \n",
719 | " 332.000000 | \n",
720 | " 224.000000 | \n",
721 | " 8226.000000 | \n",
722 | " 10.000000 | \n",
723 | " 0.000000 | \n",
724 | " 1.203200e+05 | \n",
725 | " 3.850240e+05 | \n",
726 | " 0.000000e+00 | \n",
727 | " 6.157800e+04 | \n",
728 | " 4.096000e+03 | \n",
729 | " ... | \n",
730 | " 13.000000 | \n",
731 | " 4.233051 | \n",
732 | " 2.696833 | \n",
733 | " 6.502239 | \n",
734 | " 6.558429e+03 | \n",
735 | " 1.320000e+02 | \n",
736 | " 2.378000e+04 | \n",
737 | " 7.200000e+01 | \n",
738 | " 16.000000 | \n",
739 | " 1.000000 | \n",
740 | "
\n",
741 | " \n",
742 | " max | \n",
743 | " 34404.000000 | \n",
744 | " 352.000000 | \n",
745 | " 49551.000000 | \n",
746 | " 255.000000 | \n",
747 | " 255.000000 | \n",
748 | " 1.818587e+09 | \n",
749 | " 4.294966e+09 | \n",
750 | " 4.294941e+09 | \n",
751 | " 1.074484e+09 | \n",
752 | " 2.028711e+09 | \n",
753 | " ... | \n",
754 | " 7694.000000 | \n",
755 | " 7.999723 | \n",
756 | " 7.999723 | \n",
757 | " 8.000000 | \n",
758 | " 2.415919e+09 | \n",
759 | " 2.415919e+09 | \n",
760 | " 4.294903e+09 | \n",
761 | " 4.294967e+09 | \n",
762 | " 26.000000 | \n",
763 | " 1.000000 | \n",
764 | "
\n",
765 | " \n",
766 | "
\n",
767 | "
8 rows × 55 columns
\n",
768 | "
"
769 | ],
770 | "text/plain": [
771 | " Machine SizeOfOptionalHeader Characteristics \\\n",
772 | "count 138047.000000 138047.000000 138047.000000 \n",
773 | "mean 4259.069274 225.845632 4444.145994 \n",
774 | "std 10880.347245 5.121399 8186.782524 \n",
775 | "min 332.000000 224.000000 2.000000 \n",
776 | "25% 332.000000 224.000000 258.000000 \n",
777 | "50% 332.000000 224.000000 258.000000 \n",
778 | "75% 332.000000 224.000000 8226.000000 \n",
779 | "max 34404.000000 352.000000 49551.000000 \n",
780 | "\n",
781 | " MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
782 | "count 138047.000000 138047.000000 1.380470e+05 \n",
783 | "mean 8.619774 3.819286 2.425956e+05 \n",
784 | "std 4.088757 11.862675 5.754485e+06 \n",
785 | "min 0.000000 0.000000 0.000000e+00 \n",
786 | "25% 8.000000 0.000000 3.020800e+04 \n",
787 | "50% 9.000000 0.000000 1.136640e+05 \n",
788 | "75% 10.000000 0.000000 1.203200e+05 \n",
789 | "max 255.000000 255.000000 1.818587e+09 \n",
790 | "\n",
791 | " SizeOfInitializedData SizeOfUninitializedData AddressOfEntryPoint \\\n",
792 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n",
793 | "mean 4.504867e+05 1.009525e+05 1.719561e+05 \n",
794 | "std 2.101599e+07 1.635288e+07 3.430553e+06 \n",
795 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
796 | "25% 2.457600e+04 0.000000e+00 1.272100e+04 \n",
797 | "50% 2.631680e+05 0.000000e+00 5.288300e+04 \n",
798 | "75% 3.850240e+05 0.000000e+00 6.157800e+04 \n",
799 | "max 4.294966e+09 4.294941e+09 1.074484e+09 \n",
800 | "\n",
801 | " BaseOfCode ... ResourcesNb ResourcesMeanEntropy \\\n",
802 | "count 1.380470e+05 ... 138047.000000 138047.000000 \n",
803 | "mean 5.779845e+04 ... 22.050700 4.000127 \n",
804 | "std 5.527658e+06 ... 136.494244 1.112981 \n",
805 | "min 0.000000e+00 ... 0.000000 0.000000 \n",
806 | "25% 4.096000e+03 ... 5.000000 3.458505 \n",
807 | "50% 4.096000e+03 ... 6.000000 3.729824 \n",
808 | "75% 4.096000e+03 ... 13.000000 4.233051 \n",
809 | "max 2.028711e+09 ... 7694.000000 7.999723 \n",
810 | "\n",
811 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n",
812 | "count 138047.000000 138047.000000 1.380470e+05 \n",
813 | "mean 2.434541 5.521610 5.545093e+04 \n",
814 | "std 0.815577 1.597403 7.799163e+06 \n",
815 | "min 0.000000 0.000000 0.000000e+00 \n",
816 | "25% 2.178748 4.828706 9.560000e+02 \n",
817 | "50% 2.458492 5.317552 2.708154e+03 \n",
818 | "75% 2.696833 6.502239 6.558429e+03 \n",
819 | "max 7.999723 8.000000 2.415919e+09 \n",
820 | "\n",
821 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n",
822 | "count 1.380470e+05 1.380470e+05 1.380470e+05 \n",
823 | "mean 1.818082e+04 2.465903e+05 4.656750e+05 \n",
824 | "std 6.502369e+06 2.124860e+07 2.608987e+07 \n",
825 | "min 0.000000e+00 0.000000e+00 0.000000e+00 \n",
826 | "25% 4.800000e+01 2.216000e+03 0.000000e+00 \n",
827 | "50% 4.800000e+01 9.640000e+03 7.200000e+01 \n",
828 | "75% 1.320000e+02 2.378000e+04 7.200000e+01 \n",
829 | "max 2.415919e+09 4.294903e+09 4.294967e+09 \n",
830 | "\n",
831 | " VersionInformationSize legitimate \n",
832 | "count 138047.000000 138047.000000 \n",
833 | "mean 12.363115 0.299340 \n",
834 | "std 6.798878 0.457971 \n",
835 | "min 0.000000 0.000000 \n",
836 | "25% 13.000000 0.000000 \n",
837 | "50% 15.000000 0.000000 \n",
838 | "75% 16.000000 1.000000 \n",
839 | "max 26.000000 1.000000 \n",
840 | "\n",
841 | "[8 rows x 55 columns]"
842 | ]
843 | },
844 | "execution_count": 5,
845 | "metadata": {},
846 | "output_type": "execute_result"
847 | }
848 | ],
849 | "source": [
850 | "dataset.describe()"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 6,
856 | "id": "f3db099c",
857 | "metadata": {},
858 | "outputs": [
859 | {
860 | "data": {
861 | "text/plain": [
862 | "Name 0\n",
863 | "md5 0\n",
864 | "Machine 0\n",
865 | "SizeOfOptionalHeader 0\n",
866 | "Characteristics 0\n",
867 | "MajorLinkerVersion 0\n",
868 | "MinorLinkerVersion 0\n",
869 | "SizeOfCode 0\n",
870 | "SizeOfInitializedData 0\n",
871 | "SizeOfUninitializedData 0\n",
872 | "AddressOfEntryPoint 0\n",
873 | "BaseOfCode 0\n",
874 | "BaseOfData 0\n",
875 | "ImageBase 0\n",
876 | "SectionAlignment 0\n",
877 | "FileAlignment 0\n",
878 | "MajorOperatingSystemVersion 0\n",
879 | "MinorOperatingSystemVersion 0\n",
880 | "MajorImageVersion 0\n",
881 | "MinorImageVersion 0\n",
882 | "MajorSubsystemVersion 0\n",
883 | "MinorSubsystemVersion 0\n",
884 | "SizeOfImage 0\n",
885 | "SizeOfHeaders 0\n",
886 | "CheckSum 0\n",
887 | "Subsystem 0\n",
888 | "DllCharacteristics 0\n",
889 | "SizeOfStackReserve 0\n",
890 | "SizeOfStackCommit 0\n",
891 | "SizeOfHeapReserve 0\n",
892 | "SizeOfHeapCommit 0\n",
893 | "LoaderFlags 0\n",
894 | "NumberOfRvaAndSizes 0\n",
895 | "SectionsNb 0\n",
896 | "SectionsMeanEntropy 0\n",
897 | "SectionsMinEntropy 0\n",
898 | "SectionsMaxEntropy 0\n",
899 | "SectionsMeanRawsize 0\n",
900 | "SectionsMinRawsize 0\n",
901 | "SectionMaxRawsize 0\n",
902 | "SectionsMeanVirtualsize 0\n",
903 | "SectionsMinVirtualsize 0\n",
904 | "SectionMaxVirtualsize 0\n",
905 | "ImportsNbDLL 0\n",
906 | "ImportsNb 0\n",
907 | "ImportsNbOrdinal 0\n",
908 | "ExportNb 0\n",
909 | "ResourcesNb 0\n",
910 | "ResourcesMeanEntropy 0\n",
911 | "ResourcesMinEntropy 0\n",
912 | "ResourcesMaxEntropy 0\n",
913 | "ResourcesMeanSize 0\n",
914 | "ResourcesMinSize 0\n",
915 | "ResourcesMaxSize 0\n",
916 | "LoadConfigurationSize 0\n",
917 | "VersionInformationSize 0\n",
918 | "legitimate 0\n",
919 | "dtype: int64"
920 | ]
921 | },
922 | "execution_count": 6,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "dataset.isnull().sum()"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": 7,
934 | "id": "48a57329",
935 | "metadata": {},
936 | "outputs": [],
937 | "source": [
938 | "#Classifying Data Based on - Legitimate OR Malware"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": 8,
944 | "id": "52e76632",
945 | "metadata": {},
946 | "outputs": [
947 | {
948 | "data": {
949 | "text/plain": [
950 | "legitimate\n",
951 | "0 96724\n",
952 | "1 41323\n",
953 | "dtype: int64"
954 | ]
955 | },
956 | "execution_count": 8,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "dataset.groupby(dataset['legitimate']).size()\n",
963 | "#1 means legitimate, 0 means malware"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 9,
969 | "id": "77eefc0b",
970 | "metadata": {},
971 | "outputs": [
972 | {
973 | "data": {
974 | "text/plain": [
975 | "([,\n",
976 | " ],\n",
977 | " [Text(0.6484073958497663, 0.8885763045497695, 'Legitimate'),\n",
978 | " Text(-0.6484073958497659, -0.8885763045497698, 'Malware')],\n",
979 | " [Text(0.35367676137259974, 0.4846779842998742, '30%'),\n",
980 | " Text(-0.35367676137259957, -0.48467798429987435, '70%')])"
981 | ]
982 | },
983 | "execution_count": 9,
984 | "metadata": {},
985 | "output_type": "execute_result"
986 | },
987 | {
988 | "data": {
989 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOgAAADnCAYAAAAU/xqtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZPklEQVR4nO3deZwU5Z3H8c/TPcBwWYgI4tleQLglHhjFaOKR3TGaQ1EMAUUlmxizrmaTNiab0uzGMYd3YqIx0WhE1M2uaxqUrKIYXdBIRDSKUZgoiAoCBTPDnP3sH08Th8kMc9Ddv6e6f+/Xq18MM9NV34b5Tj1VXfWUsdailPJTQjqAUqpzWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjWlClPKYFVcpjFdIBVOdS6Uw/YBSwNzAM2KuDx47PB0AjUJd71Lf5uO2jFngbeAP4C/BOTXWV3qDHU0ZvnuSHVDozCJgMHAFMyT3GUvhfovXAm8ArwEvACuClmuqqtQVer+oGLaiA3JbxWOBIPizj4fi1y/EB8BSwEFhYU121TjhPWdKCFkkqnRkBVAGfBk4GBskm6rGV5MoKPFNTXdUsnKcsaEELKQxSwNl3tvzDuO+1fHEWYIQT5ctW4HE+3LrqcLhAtKD5FgYDgRnARcAxANts/1cmNN45TjRX4VjgMeAnwIKa6qqscJ6SogXNlzCYAnwJV87Bbb9kLfb4xpveXcfeI0WyFc8a4GfAnTXVVR8UaiXGmFpr7W7tIhhj9gVuttaeZYyZDOxrrV2Q+9oZwFhrbXUesl4G3G6tre/V87WguyEMkrhCXgZ8dFff+suWTy25pmXWCcWI5YEG4AHgJzXVVc/le+H5KGi75Z0PHGmt/Wq+ltlm2TW5ZW/s1fO1oL0QBn2B84FvAod05ymb7OAXpzT+fHIBU/nqj7jh77ya6qrGfCywo4IaYw7NrWdv3FtHF1trX8t9/jdAErfPfLm1dpAxJgX8DncE/Q2gP7AOuDb38ZHW2q8aY+4CtgNjgIOAC4DZuKPwy6y15+fWfxtwVO65D1lrv2uM+RrwI2AVsNFae5Ix5lTgaqAf7u2tC6y1tZ29Vp8O6/svDAYQBpcBq4Gf081yAuzJtglDiQo27PPYkcCvgFdT6cz0Aq7nduBSa+1Hga8DP819/ibgJmvtUcA77Z9krW0C/g2Yb62dbK2d38Gy9wQ+AfwL8AhwAzAOmJAbHgNcZa09EpgIfNwYM9Fae3NunSflyjkM+DZwsrV2Cu6X1+W7elFa0O4IgwRhMAf3G+8GYL+eLsIYkrMrFr2a92zxcTAwP5XOPJNKZ47O54KNMYOAjwEPGmNexP3y3LG/fyzwYO7j+3q5ikesG2quBN6z1q601mZxJ3ekct8z3RizHPgTrrxjO1jO1Nznn8nlnI3bKndKT/XrShh8HFfKI3Z3UdOTT/a7oeXs3c8Ubx8DlqbSmXlAuqa66u08LDMBbLHWTs7DsjqyY2iebfPxjr9XGGMOxm21j7LWbs4Niys7WI4Bfm+tndHdFesWtDNhcAhh8FvgSfJQToB92DxxEPVb87GsmDPAecCqVDrzH7nTHHvNWrsVWGOMORvAOJNyX14KfD738bmdLGIb7Y6899AeuPOcI2PMCOAfOln2UuA4Y8xhuZwDjDGjdrVgLWh7YZAkDP4VN3z5bD4XbQz9zk0ufjmfy4y5/sC3gDdS6cxFqXSmuz+PA4wxa9s8Lge+AFxojFmB+787M/e9lwGXG2Oeww17ow6WtxgYa4x50RhzTk9fhLV2BW5o+wrwS+CZNl++HVhojFlsrd2AO7g4zxjzEq6wY3a1bD2K21YYfAR3QOOYQq2iJjti6YlNN0wt1PJj7ilgVk111Vv5WqAxZgCw3VprjTHnAjOstWd29TxfaEFhx/uZ3wC+izv8XTDWUjem8a5kI3072kdRbgv31ZrqqnvzsTBjzDTgVtywegswx1r7Rj6WXQxa0DA4CJhPAbea7X2z+aLn5rd+Iq9HMkvQA8A/1VRXbZYOIqm890HD4B+B5RSxnADnJx/TK0G6Nh1YnkpnjpIOIqk8t6BuSHsNcCUCV5hkLVsOb7xnUCtJfZura03AFTXVVbdKB5FQfgUNg+HAPNyZIWIuafra8kx26hTJDDHzAHBRTXXVNukgxVReQ9wwGAMsQ7icABdWLKiTzhAz04GnUunMcOkgxVQ+BQ2D43HvT6WEkwAwybw52pDVayd75gjg6VQ6c6B0kGIpj4KGwVnA74Gh0lF2SBo7fFpi5SvSOWJoFPCHVDqzyzf4S0XpF9RdffIAHZ8bKeriZGaTdIaYOgC3Jd3lNbiloLQLGgZp3InuXs4FNDXxarcvV1N/ZxiwOJXOnCgdpJBKt6DufNprpWPsSh/TesAU8/oq6RwxNhhYmEpnzpAOUiilWdAwuBz4gXSM7phb8bt3pTPEXCXwn6l0ZpZ0kEIovYK6fc4fS8forhMTK/aXzlACKoBfpdKZ06WD5FtpFTQMLsDtc8ZGpWk+dLR5a410jhKQAOal0pnJ0kHyqXQKGgan4a69i525FZm8XV5V5gYBj6TSmX2lg+RLaRQ0DCbi5p2J5bmtpyWe31s6QwnZH1fSgdJB8iH+BQ2DEbiZ1nZnygpRg0zD2P3Nhr+bcU712hTgNz2YocFb8X4BYdAPeBiI/alfFyUzsbmIOCbOJCZH8ncl3gV1R2uLei1noZyZfDaQzlCCrkilM3OlQ+yO+BY0DD4PXCIdI1+GUDthGFs2SOcoQT9JpTMnSYforXgWNAwOAe6UjpFPxpA4v+Kx16RzlKAK4O5UOrOHdJDeiF9B3X1R5gMlNyQ8O/lUf+kMJeoA4HrpEL0Rv4LC93H3+yg5w9kycTB1Hc3bqnbfhal05jTpED0Vr4KGwTG4G9iUJGPoe17yCb1GtHDuiNtQNz4FdUPbO4lT5l44L/l4Sb8+YbEb6sbph+FK3F2jStqB5v2JlTRul85RwmI11I3HrH5hMBZ374u+0lGK4VvNc5bd13qyF+/v2pYm3r3vm9iWZshmGTD6OIZM+wKt27ex8eHraNn6HhV7jGDYZ9IkKwfRsPbPbFr0U0yyD8PO+Ff67Lkv2YZaNjx8HcOnX4MxXlw7/zYwvqa6yvsbWcVlC3o7ZVJOgNnJRS3SGf4m2YcR536ffefcysgLbmb7mhdoXPcaW5c+SGVqEvvNvYPK1CS2LnW34Nz6/H+x92euZMgJs9j2pwUAbHn2foJjp/tSTojRUNf/gobBdOA46RjFNMqsHV9BixezzxtjSPR17/7YbAtkW8EY6t9YxsDxnwRg4PhPUv+Xpe77ExXYliZsSyMmUUHz5vW0bvuAygMniL2GTlyYSme8/7nyu6DuwJDX05YUgjEE/5hY9pJ0jh1stpV3fnUpa2+ZSWVqMv32HU1r3RYqBrlJEisGDSVbtwWAYOrZfPDorWz948MMnnI6W5b8miHTZgqm36Vq6QBd8bug8BWgLCfWurBiYb10hh1MIsm+F9zC/l+5i8b1r9O0oabT7+074hBGzvox+8y4lpboXZK5Em94+Do2PvIjWuu8uhfS8al0pko6xK74W9AwGAJ8RzqGlAlmzRjfJrZOVA6i8oAJbF+9nOTAIbTUullDW2o3kRg4ZKfvtdYSPTuf4LgZbHnmPoYcfx4Dx53E1hceEUi+S9f6fFmat8Fwb6t4M9F0sSWM3fukxIvid+NurY/INtQCkG1upOGvL9Jnr/0ZcNgx1L38OAB1Lz/OgMN2Puhc9/Lj9D/0SJKVg7DNjWASYIz72C8TgPOkQ3TGz7dZwmAo8BZQElfF99b/tY5dMqP52ydIZmh6fw0bMzeAzYLNMmDMNIYcN4PW7VvZ+HA1LVs3ULHH3gw780qS/d0189nmBt5/6GpGTP8eJllBw9svs2nRbZhkBcPO+AZ9hu4n+ZI6sgoYW1Nd5dWIBfwt6Hdwtwcsay02se6wxnu9+2kuUefUVFc9IB2iPf+GuGHQH7hUOoYPKkx2v6PNq69K5ygTV6XSGW/eqN3Bv4LCHEAn0cqZW5F5XzpDmZgIfFo6RHt+FdTd+foK6Rg+mZZ4KfbzLcXI16UDtOdXQeEM4GDpED7pZ1oOHmtq3pTOUSampdIZr953962gF0oH8NHcit+tlc5QRry6x4s/BQ2DkcCnpGP46JTECyOkM5SRL/p0sMifgsJsICkdwkcDTeOYg8y7uhUtjkOA46VD7OBTQS+QDuCzi5MZ3Q8tHm+GuX4UNAyOA0ZJx/DZp5NLy/a0RwFnp9KZSukQ4EtB4SzpAL7bg7pxI9ik74kWR4C7dYQ4Xwpasrcwz5fcxNarpHOUES+GufIFDYPxlOk1nz11VnJJWV88UGSnptIZ8aPn8gX1ZCgRB8OIJgTUbpHOUSYqgFOlQ2hBY8QY+sxM/q9ObF0806QDyBY0DIZTordxKJQZFU/0kc5QRkSvxQXpgrrfUN6ctREH+7FxwgAa6qRzlInRqXRmuGQA6YJ6c8ZGXBhD/7OSS1ZK5ygjoj+j0gX1fl5SH81KLvJuao4SJrofKlfQMBgAHCG2/hg71Lwzvg8tTdI5yoTofqjkFnQq7lC26iFj2OP0xP95M7F1iZuUSmcGS61csqBe3BworuZUPNognaFMJIGPSa1csqDe3awjTsaZmjEJsq3SOcqE2H6oZEHHC6479hLGDvtEYrn4xNZlYozUimUK6iYHGy2y7hJyccWCSDpDmThIasVSW9AUZXS/z0L5qHn9cPBx5vGSIzazolRB9eLsPKgw2ZFTE3/Wia0Lb7jUBdxSBdWpNfNkbjKzQTpDmRDZikoVdB+h9Zac4xMvi+0flRktqOq5vqYlNd6sfkM6RxkQ+UUoVVDxK9VLydyKzDrpDGWgrLagWtA8OjnxwkjpDGWgrAqqQ9w8GmCaRh1s3nlLOkeJK6sh7hCh9Zasi5ML1khnKHFDJFYqVdB+QustWacnl+4lnaHEiUw1I1VQPYsozwZTP24fNr0nnaOEiVwaWfyChkFCZL0lzhjMnIqFOrF14ZTNFlS3ngXyueTTYhcWlwGRgkpstrWgBbIXWyev7veFTdI5SlEWsw02F329EgXVCa8KxBiMwepd0Aoggd0ms97iqwf0EikVNyKzVwgcJIqywPair1ep3SPyMyt1NLVWaL1K9ZbIvr0WVKnuKf4RIrSgSnVXWW1BPxBar1K9VVZb0LeF1qtUb5XVFlQvjVJxs15ipboFVap7RKaV0S2oUt3zpsRKtaBKdW07IDLvk1RBVyN06pRSvbCaMBI5PVXo3ixRA6DXLqq4EBneguyF0y8KrlupnlgptWLJgq4QXLdSPfGc1Ip1C6pU17SgSnlqLWH0rtTK5QoaRu8juPOtVDc9L7ly6dn1nhBev1JdERveghZUqa6I/oxqQZXq3Ebgj5IBZAvq9kNfEc2gVOcW5ebQEiO9BQX4X+kASnXiUekAPhT0v6UDKNUBCzwmHcKHgi4BxN5nUqoTL+R2wUTJF9SN8X8rHUOpduZJBwAfCuo8KB1AqTayaEF3sgTQe1sqXzxBGInMQdSeHwV1w9wHpGMolXOvdIAd/Cio8wvpAErhpjfx5piIPwUNo5eAZdIxVNl7kDASudVgR/wpqHObdABV9m6SDtCWbwWdjzv/USkJTxNGy6VDtOVXQd1kYndIx1Bl60bpAO35VVDnZvQGv6r41uDhaaf+FdRNL6FbUVVst0hfudIR/wrqXAc0Sodob9XGVib/rPZvjz2u3cqNSxvZtN1yyj11HH5LLafcU8fm7W6O42feamHibbUcdUctb2xy//dbGiyn3VuHtSLzIKuOrQd+Jh2iI8bbH5QwuBW4RDpGZ1qzlv2ur2XZRQP5yfNNDO1vSB/fj+o/NLJ5u+W6Uyr53Px6rju5HzVbLI++0cKPT6vkiscaOGN0BR9PVUi/BPWhSwijn0qH6IivW1CAaqBJOkRnHl/TyqFDExw0JMHDq1qYPakPALMn9eG/V7UA0CcJ21ugvtnSJwlvbsqybltWy+mXNXi8S+VvQcNoLZ4OOwDuf7mZGeNdKd+rzTJysPunHDk4wft1bjh75fH9mPtIAzcua+KrR/flqica+N5J/cQyqw5dTRg1S4fojL8FdUI8fF+0qdXyP6taOHvsrreEk/dJsvSigSyePZDVm7PsOziBBc55qJ6Zv93Oe7XeHZMoN68C90iH2BW/CxpGm4FvS8dob+FfWpgyMsGIQe6fb8SgBOu3ubKt35Zl+MCd/1mttfz7kka+c0I/rn6qkatP7MfMiX24eZm3I/hycYWPR27b8rugzh14Ngv9vDbDW4AzRlVw9wo3Srp7RTNnjt55y3r3imaqDq9gz/6G+mZIGPeo93ZgVRYeJowWSofoir9HcdsKg2m4a0bF1TdbDrihltVfG0RQaQD4oD7L9Ie281ZkOTAwPHj2AIb2N3/7/qr76lk0cwB9koan/9rCVxY00DcJ8z7fn1F7JSVfTrmqB8YRRjXSQboSj4IChME9wEzpGKokfJMw+oF0iO6IwxB3h3/GvaGs1O5YAVwvHaK74lPQMNoEXCwdQ8VaEzCHMGqRDtJd8SkoQBhlgF9Kx1Cx9R3fLifrSrwK6vwL8JZ0CBU7TwA/lA7RU/EraBhtBWYDrdJRVGxsAmYRRjE5Ivqh+BUUIIyeBL4lHUPFxsWE0TrpEL0Rz4ICucPk/ykdQ3nvesLIm1n6eiq+BXUuAF6TDqG89SjwDekQuyM+Jyp0JgzG4G5TPlg6ivLKKuAYwiiSDrI74r4FhTB6DTgXiM17W6rgtgBnxL2cUAoFBQijBehJDMppBs4hjF6XDpIPpVFQgDC6C7hSOoYSlQW+SBgtkg6SL6VTUIAwqsZN26nKjwXmEkbzpYPkU2kV1LkM+I10CFV0lxNGd0qHyLfSK6g7W2QWcJdwElU8IWF0o3SIQii9gsKO+43OAX4uHUUV3LcIo6ulQxRK/N8H7UoY3Ii7llSVlizwZcLodukghVSaW9C2wugy4FrpGCqvmoBzS72cUA5b0B3C4CLgp0Cfrr5Vea0O+Cxh9HvpIMVQPgUFCIOTgIeAodJRVK/UAJ8hjFZIBymW0h/ithVGi4GpQEmcZVJmngCOLKdyQrkVFCCM/oIr6WPSUVS33QCcShh9IB2k2MpriNtWGBjcqYHXADo5rZ/qgS8RRvdKB5FSvgXdwU2KfS9woHQUtZPngJm5EU/ZKr8hbnth9DQwCbhfOooC3GWDIXBcuZcTdAu6szA4C3ey/UjpKGXqddzVKM9JB/FFXregxhhrjLmnzd8rjDEbjDG/6+J5J3b1PUURRg8BH8Hdl1R/cxVPE/B94Agt587yfavnOmC8Maa/tXY7cApQ9NnUjDFJa23vpuV0V+F/OXcvmNuBcfnMpv7OY8ClOpztWCH2QRcCVbmPZwDzdnzBGHO0MeZZY8yfcn+Obv9kY8xKY8wQ43xgjJmV+/w9xpiTjTEpY8zTxpjlucfHcl8/0Riz2BhzH7DSGJM0xvzQGPO8MeYlY8yXevQqwuhZ4AjgCqDsDu8XwV+BzxFGn9Jydq4QBb0fONcYUwlMBJa1+dprwAnW2iOAf8MNa9p7BjgOt+VaDUzLfX4qsBR4HzjFWjsFOIedL9A+GrjKWjsWuBCIrLVHAUcBFxtjDu7RKwmjZsLoeuDQXNb6Hj1fdWQLcBXwEcLov4SzeC/fQ1ystS8ZY1K4reeCdl8OgLuNMYfj9vE6Oi/2aeAE3G/Y24C5xpj9gE3W2lpjTADcaoyZjJtdflSb5z5nrV2T+/hUYKIx5qw26z4cWENPuWHvVYTBrcB3ceXP+79didsG3AL8kDDaIpwlNgr1Q/Y/wI+AE4G92nz+e8Bia+1ncyV+soPnLgEuwb0veRXwWeAsXHHB3ZvlPdxbIwmgoc1z69p8bIBLrbX5O2MojNYD/0QY/CCX4wJgYN6WX5q24op5fe4OdaoHCvU+6C+Ba6y1K9t9PuDDg0bnd/REa+3bwDDgcGvtauAPwNf5sKABsN5a6yaI6vwsoMeALxtj+gAYY0YZY/JTpjBaTRhdyoe/RN7Ny3JLyyrga8D+hNG3tZy9U5CCWmvXWmtv6uBLPwCuNcY8w65Pr1vGhye0Pw3shysquEvGZhtjluKGt3V//3QAfgH8GVhujHkZN7tCfkcMYbSJMPo+cBBuBodlXTyj1GWBR4DTcPuYtxBG24QzxZqeqJBvYTAad/e1LwL7C6cpltdwBwd/TRj1fB9fdUoLWihhkAA+AczEve00TDZQ3tUA84H7CaMXZaOULi1oMbiyHgOcjivrJNlAvdKCG8IvAh7VM36KQwsqIQz2x51ldWzuMRb/LlxoxQ1dFwO/B57M3TxZFZEW1AdhsAfuJItjgcnAaOAwoF+REjQDrwLLgRdyf75IGOmJGcK0oL5yw+KDcEeqR+U+3gcYAeyNm1dpT2AAnW9964Go3WM9bv9xTZs/1xFGvTt3WRWUFrRUuEIn2zwaCSO9JWPMaUGV8phvByaUUm1oQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY9pQZXymBZUKY/9P8Iv7ysch+PKAAAAAElFTkSuQmCC\n",
990 | "text/plain": [
991 | ""
992 | ]
993 | },
994 | "metadata": {},
995 | "output_type": "display_data"
996 | }
997 | ],
998 | "source": [
999 | "type_classify=['Legitimate','Malware']\n",
1000 | "count_classify=[41323,96724]\n",
1001 | "plt.pie(count_classify, labels=type_classify, autopct='%0.f%%')"
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": 10,
1007 | "id": "4f846ba0",
1008 | "metadata": {},
1009 | "outputs": [],
1010 | "source": [
1011 | "# Total Number of Columns in Dataset"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": 11,
1017 | "id": "cb1a7785",
1018 | "metadata": {},
1019 | "outputs": [
1020 | {
1021 | "data": {
1022 | "text/plain": [
1023 | "57"
1024 | ]
1025 | },
1026 | "execution_count": 11,
1027 | "metadata": {},
1028 | "output_type": "execute_result"
1029 | }
1030 | ],
1031 | "source": [
1032 | "dataset.shape[1]"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 12,
1038 | "id": "7c62eedf",
1039 | "metadata": {},
1040 | "outputs": [],
1041 | "source": [
1042 | "# Creating Legitimate and Malware Dataset from Main Dataset"
1043 | ]
1044 | },
1045 | {
1046 | "cell_type": "code",
1047 | "execution_count": 13,
1048 | "id": "bda12c84",
1049 | "metadata": {},
1050 | "outputs": [
1051 | {
1052 | "data": {
1053 | "text/html": [
1054 | "\n",
1055 | "\n",
1068 | "
\n",
1069 | " \n",
1070 | " \n",
1071 | " | \n",
1072 | " Name | \n",
1073 | " md5 | \n",
1074 | " Machine | \n",
1075 | " SizeOfOptionalHeader | \n",
1076 | " Characteristics | \n",
1077 | " MajorLinkerVersion | \n",
1078 | " MinorLinkerVersion | \n",
1079 | " SizeOfCode | \n",
1080 | " SizeOfInitializedData | \n",
1081 | " SizeOfUninitializedData | \n",
1082 | " ... | \n",
1083 | " ExportNb | \n",
1084 | " ResourcesNb | \n",
1085 | " ResourcesMeanEntropy | \n",
1086 | " ResourcesMinEntropy | \n",
1087 | " ResourcesMaxEntropy | \n",
1088 | " ResourcesMeanSize | \n",
1089 | " ResourcesMinSize | \n",
1090 | " ResourcesMaxSize | \n",
1091 | " LoadConfigurationSize | \n",
1092 | " VersionInformationSize | \n",
1093 | "
\n",
1094 | " \n",
1095 | " \n",
1096 | " \n",
1097 | " 0 | \n",
1098 | " memtest.exe | \n",
1099 | " 631ea355665f28d4707448e442fbf5b8 | \n",
1100 | " 332 | \n",
1101 | " 224 | \n",
1102 | " 258 | \n",
1103 | " 9 | \n",
1104 | " 0 | \n",
1105 | " 361984 | \n",
1106 | " 115712 | \n",
1107 | " 0 | \n",
1108 | " ... | \n",
1109 | " 0 | \n",
1110 | " 4 | \n",
1111 | " 3.262823 | \n",
1112 | " 2.568844 | \n",
1113 | " 3.537939 | \n",
1114 | " 8797.000000 | \n",
1115 | " 216 | \n",
1116 | " 18032 | \n",
1117 | " 0 | \n",
1118 | " 16 | \n",
1119 | "
\n",
1120 | " \n",
1121 | " 1 | \n",
1122 | " ose.exe | \n",
1123 | " 9d10f99a6712e28f8acd5641e3a7ea6b | \n",
1124 | " 332 | \n",
1125 | " 224 | \n",
1126 | " 3330 | \n",
1127 | " 9 | \n",
1128 | " 0 | \n",
1129 | " 130560 | \n",
1130 | " 19968 | \n",
1131 | " 0 | \n",
1132 | " ... | \n",
1133 | " 0 | \n",
1134 | " 2 | \n",
1135 | " 4.250461 | \n",
1136 | " 3.420744 | \n",
1137 | " 5.080177 | \n",
1138 | " 837.000000 | \n",
1139 | " 518 | \n",
1140 | " 1156 | \n",
1141 | " 72 | \n",
1142 | " 18 | \n",
1143 | "
\n",
1144 | " \n",
1145 | " 2 | \n",
1146 | " setup.exe | \n",
1147 | " 4d92f518527353c0db88a70fddcfd390 | \n",
1148 | " 332 | \n",
1149 | " 224 | \n",
1150 | " 3330 | \n",
1151 | " 9 | \n",
1152 | " 0 | \n",
1153 | " 517120 | \n",
1154 | " 621568 | \n",
1155 | " 0 | \n",
1156 | " ... | \n",
1157 | " 1 | \n",
1158 | " 11 | \n",
1159 | " 4.426324 | \n",
1160 | " 2.846449 | \n",
1161 | " 5.271813 | \n",
1162 | " 31102.272727 | \n",
1163 | " 104 | \n",
1164 | " 270376 | \n",
1165 | " 72 | \n",
1166 | " 18 | \n",
1167 | "
\n",
1168 | " \n",
1169 | " 3 | \n",
1170 | " DW20.EXE | \n",
1171 | " a41e524f8d45f0074fd07805ff0c9b12 | \n",
1172 | " 332 | \n",
1173 | " 224 | \n",
1174 | " 258 | \n",
1175 | " 9 | \n",
1176 | " 0 | \n",
1177 | " 585728 | \n",
1178 | " 369152 | \n",
1179 | " 0 | \n",
1180 | " ... | \n",
1181 | " 1 | \n",
1182 | " 10 | \n",
1183 | " 4.364291 | \n",
1184 | " 2.669314 | \n",
1185 | " 6.400720 | \n",
1186 | " 1457.000000 | \n",
1187 | " 90 | \n",
1188 | " 4264 | \n",
1189 | " 72 | \n",
1190 | " 18 | \n",
1191 | "
\n",
1192 | " \n",
1193 | " 4 | \n",
1194 | " dwtrig20.exe | \n",
1195 | " c87e561258f2f8650cef999bf643a731 | \n",
1196 | " 332 | \n",
1197 | " 224 | \n",
1198 | " 258 | \n",
1199 | " 9 | \n",
1200 | " 0 | \n",
1201 | " 294912 | \n",
1202 | " 247296 | \n",
1203 | " 0 | \n",
1204 | " ... | \n",
1205 | " 1 | \n",
1206 | " 2 | \n",
1207 | " 4.306100 | \n",
1208 | " 3.421598 | \n",
1209 | " 5.190603 | \n",
1210 | " 1074.500000 | \n",
1211 | " 849 | \n",
1212 | " 1300 | \n",
1213 | " 72 | \n",
1214 | " 18 | \n",
1215 | "
\n",
1216 | " \n",
1217 | " ... | \n",
1218 | " ... | \n",
1219 | " ... | \n",
1220 | " ... | \n",
1221 | " ... | \n",
1222 | " ... | \n",
1223 | " ... | \n",
1224 | " ... | \n",
1225 | " ... | \n",
1226 | " ... | \n",
1227 | " ... | \n",
1228 | " ... | \n",
1229 | " ... | \n",
1230 | " ... | \n",
1231 | " ... | \n",
1232 | " ... | \n",
1233 | " ... | \n",
1234 | " ... | \n",
1235 | " ... | \n",
1236 | " ... | \n",
1237 | " ... | \n",
1238 | " ... | \n",
1239 | "
\n",
1240 | " \n",
1241 | " 41318 | \n",
1242 | " mfc80.dll | \n",
1243 | " 1f5afd468eb5e09e9ed75a087529eab5 | \n",
1244 | " 332 | \n",
1245 | " 224 | \n",
1246 | " 8450 | \n",
1247 | " 8 | \n",
1248 | " 0 | \n",
1249 | " 946176 | \n",
1250 | " 159744 | \n",
1251 | " 0 | \n",
1252 | " ... | \n",
1253 | " 0 | \n",
1254 | " 123 | \n",
1255 | " 2.607251 | \n",
1256 | " 0.960953 | \n",
1257 | " 5.130762 | \n",
1258 | " 327.170732 | \n",
1259 | " 20 | \n",
1260 | " 1592 | \n",
1261 | " 72 | \n",
1262 | " 16 | \n",
1263 | "
\n",
1264 | " \n",
1265 | " 41319 | \n",
1266 | " mfc80u.dll | \n",
1267 | " e2c48cd0132d4d1dc7d0df9a6bef686a | \n",
1268 | " 332 | \n",
1269 | " 224 | \n",
1270 | " 8450 | \n",
1271 | " 8 | \n",
1272 | " 0 | \n",
1273 | " 946176 | \n",
1274 | " 154624 | \n",
1275 | " 0 | \n",
1276 | " ... | \n",
1277 | " 0 | \n",
1278 | " 123 | \n",
1279 | " 2.607232 | \n",
1280 | " 0.960953 | \n",
1281 | " 5.130762 | \n",
1282 | " 327.235772 | \n",
1283 | " 20 | \n",
1284 | " 1592 | \n",
1285 | " 72 | \n",
1286 | " 16 | \n",
1287 | "
\n",
1288 | " \n",
1289 | " 41320 | \n",
1290 | " mfcm80.dll | \n",
1291 | " 83362ee950ad18adb85b54409155c378 | \n",
1292 | " 332 | \n",
1293 | " 224 | \n",
1294 | " 8450 | \n",
1295 | " 8 | \n",
1296 | " 0 | \n",
1297 | " 53248 | \n",
1298 | " 16384 | \n",
1299 | " 0 | \n",
1300 | " ... | \n",
1301 | " 25 | \n",
1302 | " 1 | \n",
1303 | " 3.524268 | \n",
1304 | " 3.524268 | \n",
1305 | " 3.524268 | \n",
1306 | " 892.000000 | \n",
1307 | " 892 | \n",
1308 | " 892 | \n",
1309 | " 72 | \n",
1310 | " 16 | \n",
1311 | "
\n",
1312 | " \n",
1313 | " 41321 | \n",
1314 | " mfcm80u.dll | \n",
1315 | " 26aafee5c30020c99120ee113d751f7e | \n",
1316 | " 332 | \n",
1317 | " 224 | \n",
1318 | " 8450 | \n",
1319 | " 8 | \n",
1320 | " 0 | \n",
1321 | " 52736 | \n",
1322 | " 11264 | \n",
1323 | " 0 | \n",
1324 | " ... | \n",
1325 | " 25 | \n",
1326 | " 1 | \n",
1327 | " 3.542071 | \n",
1328 | " 3.542071 | \n",
1329 | " 3.542071 | \n",
1330 | " 892.000000 | \n",
1331 | " 892 | \n",
1332 | " 892 | \n",
1333 | " 72 | \n",
1334 | " 16 | \n",
1335 | "
\n",
1336 | " \n",
1337 | " 41322 | \n",
1338 | " vcomp.dll | \n",
1339 | " 73dbaa64d589f3262615550dd6881fee | \n",
1340 | " 332 | \n",
1341 | " 224 | \n",
1342 | " 8450 | \n",
1343 | " 8 | \n",
1344 | " 0 | \n",
1345 | " 40960 | \n",
1346 | " 20480 | \n",
1347 | " 0 | \n",
1348 | " ... | \n",
1349 | " 112 | \n",
1350 | " 6 | \n",
1351 | " 3.004383 | \n",
1352 | " 2.406512 | \n",
1353 | " 3.592623 | \n",
1354 | " 610.333333 | \n",
1355 | " 124 | \n",
1356 | " 1412 | \n",
1357 | " 72 | \n",
1358 | " 16 | \n",
1359 | "
\n",
1360 | " \n",
1361 | "
\n",
1362 | "
41323 rows × 56 columns
\n",
1363 | "
"
1364 | ],
1365 | "text/plain": [
1366 | " Name md5 Machine \\\n",
1367 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n",
1368 | "1 ose.exe 9d10f99a6712e28f8acd5641e3a7ea6b 332 \n",
1369 | "2 setup.exe 4d92f518527353c0db88a70fddcfd390 332 \n",
1370 | "3 DW20.EXE a41e524f8d45f0074fd07805ff0c9b12 332 \n",
1371 | "4 dwtrig20.exe c87e561258f2f8650cef999bf643a731 332 \n",
1372 | "... ... ... ... \n",
1373 | "41318 mfc80.dll 1f5afd468eb5e09e9ed75a087529eab5 332 \n",
1374 | "41319 mfc80u.dll e2c48cd0132d4d1dc7d0df9a6bef686a 332 \n",
1375 | "41320 mfcm80.dll 83362ee950ad18adb85b54409155c378 332 \n",
1376 | "41321 mfcm80u.dll 26aafee5c30020c99120ee113d751f7e 332 \n",
1377 | "41322 vcomp.dll 73dbaa64d589f3262615550dd6881fee 332 \n",
1378 | "\n",
1379 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
1380 | "0 224 258 9 \n",
1381 | "1 224 3330 9 \n",
1382 | "2 224 3330 9 \n",
1383 | "3 224 258 9 \n",
1384 | "4 224 258 9 \n",
1385 | "... ... ... ... \n",
1386 | "41318 224 8450 8 \n",
1387 | "41319 224 8450 8 \n",
1388 | "41320 224 8450 8 \n",
1389 | "41321 224 8450 8 \n",
1390 | "41322 224 8450 8 \n",
1391 | "\n",
1392 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
1393 | "0 0 361984 115712 \n",
1394 | "1 0 130560 19968 \n",
1395 | "2 0 517120 621568 \n",
1396 | "3 0 585728 369152 \n",
1397 | "4 0 294912 247296 \n",
1398 | "... ... ... ... \n",
1399 | "41318 0 946176 159744 \n",
1400 | "41319 0 946176 154624 \n",
1401 | "41320 0 53248 16384 \n",
1402 | "41321 0 52736 11264 \n",
1403 | "41322 0 40960 20480 \n",
1404 | "\n",
1405 | " SizeOfUninitializedData ... ExportNb ResourcesNb \\\n",
1406 | "0 0 ... 0 4 \n",
1407 | "1 0 ... 0 2 \n",
1408 | "2 0 ... 1 11 \n",
1409 | "3 0 ... 1 10 \n",
1410 | "4 0 ... 1 2 \n",
1411 | "... ... ... ... ... \n",
1412 | "41318 0 ... 0 123 \n",
1413 | "41319 0 ... 0 123 \n",
1414 | "41320 0 ... 25 1 \n",
1415 | "41321 0 ... 25 1 \n",
1416 | "41322 0 ... 112 6 \n",
1417 | "\n",
1418 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
1419 | "0 3.262823 2.568844 3.537939 \n",
1420 | "1 4.250461 3.420744 5.080177 \n",
1421 | "2 4.426324 2.846449 5.271813 \n",
1422 | "3 4.364291 2.669314 6.400720 \n",
1423 | "4 4.306100 3.421598 5.190603 \n",
1424 | "... ... ... ... \n",
1425 | "41318 2.607251 0.960953 5.130762 \n",
1426 | "41319 2.607232 0.960953 5.130762 \n",
1427 | "41320 3.524268 3.524268 3.524268 \n",
1428 | "41321 3.542071 3.542071 3.542071 \n",
1429 | "41322 3.004383 2.406512 3.592623 \n",
1430 | "\n",
1431 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
1432 | "0 8797.000000 216 18032 \n",
1433 | "1 837.000000 518 1156 \n",
1434 | "2 31102.272727 104 270376 \n",
1435 | "3 1457.000000 90 4264 \n",
1436 | "4 1074.500000 849 1300 \n",
1437 | "... ... ... ... \n",
1438 | "41318 327.170732 20 1592 \n",
1439 | "41319 327.235772 20 1592 \n",
1440 | "41320 892.000000 892 892 \n",
1441 | "41321 892.000000 892 892 \n",
1442 | "41322 610.333333 124 1412 \n",
1443 | "\n",
1444 | " LoadConfigurationSize VersionInformationSize \n",
1445 | "0 0 16 \n",
1446 | "1 72 18 \n",
1447 | "2 72 18 \n",
1448 | "3 72 18 \n",
1449 | "4 72 18 \n",
1450 | "... ... ... \n",
1451 | "41318 72 16 \n",
1452 | "41319 72 16 \n",
1453 | "41320 72 16 \n",
1454 | "41321 72 16 \n",
1455 | "41322 72 16 \n",
1456 | "\n",
1457 | "[41323 rows x 56 columns]"
1458 | ]
1459 | },
1460 | "execution_count": 13,
1461 | "metadata": {},
1462 | "output_type": "execute_result"
1463 | }
1464 | ],
1465 | "source": [
1466 | "legit=dataset[0:41323].drop([\"legitimate\"],axis=1) # here axis =1 means vertical \n",
1467 | "legit"
1468 | ]
1469 | },
1470 | {
1471 | "cell_type": "code",
1472 | "execution_count": 14,
1473 | "id": "5adc7421",
1474 | "metadata": {},
1475 | "outputs": [
1476 | {
1477 | "data": {
1478 | "text/html": [
1479 | "\n",
1480 | "\n",
1493 | "
\n",
1494 | " \n",
1495 | " \n",
1496 | " | \n",
1497 | " Name | \n",
1498 | " md5 | \n",
1499 | " Machine | \n",
1500 | " SizeOfOptionalHeader | \n",
1501 | " Characteristics | \n",
1502 | " MajorLinkerVersion | \n",
1503 | " MinorLinkerVersion | \n",
1504 | " SizeOfCode | \n",
1505 | " SizeOfInitializedData | \n",
1506 | " SizeOfUninitializedData | \n",
1507 | " ... | \n",
1508 | " ResourcesNb | \n",
1509 | " ResourcesMeanEntropy | \n",
1510 | " ResourcesMinEntropy | \n",
1511 | " ResourcesMaxEntropy | \n",
1512 | " ResourcesMeanSize | \n",
1513 | " ResourcesMinSize | \n",
1514 | " ResourcesMaxSize | \n",
1515 | " LoadConfigurationSize | \n",
1516 | " VersionInformationSize | \n",
1517 | " legitimate | \n",
1518 | "
\n",
1519 | " \n",
1520 | " \n",
1521 | " \n",
1522 | " 41323 | \n",
1523 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
1524 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
1525 | " 332 | \n",
1526 | " 224 | \n",
1527 | " 258 | \n",
1528 | " 11 | \n",
1529 | " 0 | \n",
1530 | " 354816 | \n",
1531 | " 257024 | \n",
1532 | " 0 | \n",
1533 | " ... | \n",
1534 | " 7 | \n",
1535 | " 3.914415 | \n",
1536 | " 1.441688 | \n",
1537 | " 7.677091 | \n",
1538 | " 7298.428571 | \n",
1539 | " 16 | \n",
1540 | " 28438 | \n",
1541 | " 72 | \n",
1542 | " 0 | \n",
1543 | " 0 | \n",
1544 | "
\n",
1545 | " \n",
1546 | " 41324 | \n",
1547 | " VirusShare_9bd57c8252948bd2fa651ad372bd4f13 | \n",
1548 | " 9bd57c8252948bd2fa651ad372bd4f13 | \n",
1549 | " 332 | \n",
1550 | " 224 | \n",
1551 | " 271 | \n",
1552 | " 6 | \n",
1553 | " 0 | \n",
1554 | " 24064 | \n",
1555 | " 164864 | \n",
1556 | " 1024 | \n",
1557 | " ... | \n",
1558 | " 6 | \n",
1559 | " 3.199107 | \n",
1560 | " 1.971335 | \n",
1561 | " 5.214816 | \n",
1562 | " 452.000000 | \n",
1563 | " 34 | \n",
1564 | " 958 | \n",
1565 | " 0 | \n",
1566 | " 15 | \n",
1567 | " 0 | \n",
1568 | "
\n",
1569 | " \n",
1570 | " 41325 | \n",
1571 | " VirusShare_d1456165e9358b8f61f93a5f2042f39c | \n",
1572 | " d1456165e9358b8f61f93a5f2042f39c | \n",
1573 | " 332 | \n",
1574 | " 224 | \n",
1575 | " 258 | \n",
1576 | " 10 | \n",
1577 | " 0 | \n",
1578 | " 118784 | \n",
1579 | " 381952 | \n",
1580 | " 0 | \n",
1581 | " ... | \n",
1582 | " 18 | \n",
1583 | " 6.530946 | \n",
1584 | " 2.458492 | \n",
1585 | " 7.992688 | \n",
1586 | " 18523.444444 | \n",
1587 | " 48 | \n",
1588 | " 33945 | \n",
1589 | " 72 | \n",
1590 | " 14 | \n",
1591 | " 0 | \n",
1592 | "
\n",
1593 | " \n",
1594 | " 41326 | \n",
1595 | " VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
1596 | " e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
1597 | " 332 | \n",
1598 | " 224 | \n",
1599 | " 258 | \n",
1600 | " 10 | \n",
1601 | " 0 | \n",
1602 | " 174592 | \n",
1603 | " 300032 | \n",
1604 | " 0 | \n",
1605 | " ... | \n",
1606 | " 15 | \n",
1607 | " 5.732393 | \n",
1608 | " 2.852364 | \n",
1609 | " 7.987726 | \n",
1610 | " 12706.133333 | \n",
1611 | " 118 | \n",
1612 | " 60500 | \n",
1613 | " 72 | \n",
1614 | " 14 | \n",
1615 | " 0 | \n",
1616 | "
\n",
1617 | " \n",
1618 | " 41327 | \n",
1619 | " VirusShare_710890c07b3f93b90635f8bff6c34605 | \n",
1620 | " 710890c07b3f93b90635f8bff6c34605 | \n",
1621 | " 332 | \n",
1622 | " 224 | \n",
1623 | " 258 | \n",
1624 | " 9 | \n",
1625 | " 0 | \n",
1626 | " 475648 | \n",
1627 | " 348672 | \n",
1628 | " 0 | \n",
1629 | " ... | \n",
1630 | " 59 | \n",
1631 | " 2.827826 | \n",
1632 | " 0.960953 | \n",
1633 | " 7.212329 | \n",
1634 | " 2637.033898 | \n",
1635 | " 20 | \n",
1636 | " 67624 | \n",
1637 | " 72 | \n",
1638 | " 0 | \n",
1639 | " 0 | \n",
1640 | "
\n",
1641 | " \n",
1642 | " ... | \n",
1643 | " ... | \n",
1644 | " ... | \n",
1645 | " ... | \n",
1646 | " ... | \n",
1647 | " ... | \n",
1648 | " ... | \n",
1649 | " ... | \n",
1650 | " ... | \n",
1651 | " ... | \n",
1652 | " ... | \n",
1653 | " ... | \n",
1654 | " ... | \n",
1655 | " ... | \n",
1656 | " ... | \n",
1657 | " ... | \n",
1658 | " ... | \n",
1659 | " ... | \n",
1660 | " ... | \n",
1661 | " ... | \n",
1662 | " ... | \n",
1663 | " ... | \n",
1664 | "
\n",
1665 | " \n",
1666 | " 138042 | \n",
1667 | " VirusShare_8e292b418568d6e7b87f2a32aee7074b | \n",
1668 | " 8e292b418568d6e7b87f2a32aee7074b | \n",
1669 | " 332 | \n",
1670 | " 224 | \n",
1671 | " 258 | \n",
1672 | " 11 | \n",
1673 | " 0 | \n",
1674 | " 205824 | \n",
1675 | " 223744 | \n",
1676 | " 0 | \n",
1677 | " ... | \n",
1678 | " 7 | \n",
1679 | " 4.122736 | \n",
1680 | " 1.370260 | \n",
1681 | " 7.677091 | \n",
1682 | " 14900.714286 | \n",
1683 | " 16 | \n",
1684 | " 81654 | \n",
1685 | " 72 | \n",
1686 | " 0 | \n",
1687 | " 0 | \n",
1688 | "
\n",
1689 | " \n",
1690 | " 138043 | \n",
1691 | " VirusShare_260d9e2258aed4c8a3bbd703ec895822 | \n",
1692 | " 260d9e2258aed4c8a3bbd703ec895822 | \n",
1693 | " 332 | \n",
1694 | " 224 | \n",
1695 | " 33167 | \n",
1696 | " 2 | \n",
1697 | " 25 | \n",
1698 | " 37888 | \n",
1699 | " 185344 | \n",
1700 | " 0 | \n",
1701 | " ... | \n",
1702 | " 26 | \n",
1703 | " 3.377663 | \n",
1704 | " 2.031619 | \n",
1705 | " 5.050074 | \n",
1706 | " 6905.846154 | \n",
1707 | " 44 | \n",
1708 | " 67624 | \n",
1709 | " 0 | \n",
1710 | " 15 | \n",
1711 | " 0 | \n",
1712 | "
\n",
1713 | " \n",
1714 | " 138044 | \n",
1715 | " VirusShare_8d088a51b7d225c9f5d11d239791ec3f | \n",
1716 | " 8d088a51b7d225c9f5d11d239791ec3f | \n",
1717 | " 332 | \n",
1718 | " 224 | \n",
1719 | " 258 | \n",
1720 | " 10 | \n",
1721 | " 0 | \n",
1722 | " 118272 | \n",
1723 | " 380416 | \n",
1724 | " 0 | \n",
1725 | " ... | \n",
1726 | " 22 | \n",
1727 | " 6.825406 | \n",
1728 | " 2.617026 | \n",
1729 | " 7.990487 | \n",
1730 | " 14981.909091 | \n",
1731 | " 48 | \n",
1732 | " 22648 | \n",
1733 | " 72 | \n",
1734 | " 14 | \n",
1735 | " 0 | \n",
1736 | "
\n",
1737 | " \n",
1738 | " 138045 | \n",
1739 | " VirusShare_4286dccf67ca220fe67635388229a9f3 | \n",
1740 | " 4286dccf67ca220fe67635388229a9f3 | \n",
1741 | " 332 | \n",
1742 | " 224 | \n",
1743 | " 33166 | \n",
1744 | " 2 | \n",
1745 | " 25 | \n",
1746 | " 49152 | \n",
1747 | " 16896 | \n",
1748 | " 0 | \n",
1749 | " ... | \n",
1750 | " 10 | \n",
1751 | " 3.421627 | \n",
1752 | " 2.060964 | \n",
1753 | " 4.739744 | \n",
1754 | " 601.600000 | \n",
1755 | " 16 | \n",
1756 | " 2216 | \n",
1757 | " 0 | \n",
1758 | " 0 | \n",
1759 | " 0 | \n",
1760 | "
\n",
1761 | " \n",
1762 | " 138046 | \n",
1763 | " VirusShare_d7648eae45f09b3adb75127f43be6d11 | \n",
1764 | " d7648eae45f09b3adb75127f43be6d11 | \n",
1765 | " 332 | \n",
1766 | " 224 | \n",
1767 | " 258 | \n",
1768 | " 11 | \n",
1769 | " 0 | \n",
1770 | " 111616 | \n",
1771 | " 468480 | \n",
1772 | " 0 | \n",
1773 | " ... | \n",
1774 | " 4 | \n",
1775 | " 4.407252 | \n",
1776 | " 1.980482 | \n",
1777 | " 6.115374 | \n",
1778 | " 96625.000000 | \n",
1779 | " 20 | \n",
1780 | " 318464 | \n",
1781 | " 72 | \n",
1782 | " 0 | \n",
1783 | " 0 | \n",
1784 | "
\n",
1785 | " \n",
1786 | "
\n",
1787 | "
96724 rows × 57 columns
\n",
1788 | "
"
1789 | ],
1790 | "text/plain": [
1791 | " Name \\\n",
1792 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
1793 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n",
1794 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n",
1795 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n",
1796 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n",
1797 | "... ... \n",
1798 | "138042 VirusShare_8e292b418568d6e7b87f2a32aee7074b \n",
1799 | "138043 VirusShare_260d9e2258aed4c8a3bbd703ec895822 \n",
1800 | "138044 VirusShare_8d088a51b7d225c9f5d11d239791ec3f \n",
1801 | "138045 VirusShare_4286dccf67ca220fe67635388229a9f3 \n",
1802 | "138046 VirusShare_d7648eae45f09b3adb75127f43be6d11 \n",
1803 | "\n",
1804 | " md5 Machine SizeOfOptionalHeader \\\n",
1805 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
1806 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n",
1807 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n",
1808 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n",
1809 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n",
1810 | "... ... ... ... \n",
1811 | "138042 8e292b418568d6e7b87f2a32aee7074b 332 224 \n",
1812 | "138043 260d9e2258aed4c8a3bbd703ec895822 332 224 \n",
1813 | "138044 8d088a51b7d225c9f5d11d239791ec3f 332 224 \n",
1814 | "138045 4286dccf67ca220fe67635388229a9f3 332 224 \n",
1815 | "138046 d7648eae45f09b3adb75127f43be6d11 332 224 \n",
1816 | "\n",
1817 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
1818 | "41323 258 11 0 354816 \n",
1819 | "41324 271 6 0 24064 \n",
1820 | "41325 258 10 0 118784 \n",
1821 | "41326 258 10 0 174592 \n",
1822 | "41327 258 9 0 475648 \n",
1823 | "... ... ... ... ... \n",
1824 | "138042 258 11 0 205824 \n",
1825 | "138043 33167 2 25 37888 \n",
1826 | "138044 258 10 0 118272 \n",
1827 | "138045 33166 2 25 49152 \n",
1828 | "138046 258 11 0 111616 \n",
1829 | "\n",
1830 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
1831 | "41323 257024 0 ... 7 \n",
1832 | "41324 164864 1024 ... 6 \n",
1833 | "41325 381952 0 ... 18 \n",
1834 | "41326 300032 0 ... 15 \n",
1835 | "41327 348672 0 ... 59 \n",
1836 | "... ... ... ... ... \n",
1837 | "138042 223744 0 ... 7 \n",
1838 | "138043 185344 0 ... 26 \n",
1839 | "138044 380416 0 ... 22 \n",
1840 | "138045 16896 0 ... 10 \n",
1841 | "138046 468480 0 ... 4 \n",
1842 | "\n",
1843 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
1844 | "41323 3.914415 1.441688 7.677091 \n",
1845 | "41324 3.199107 1.971335 5.214816 \n",
1846 | "41325 6.530946 2.458492 7.992688 \n",
1847 | "41326 5.732393 2.852364 7.987726 \n",
1848 | "41327 2.827826 0.960953 7.212329 \n",
1849 | "... ... ... ... \n",
1850 | "138042 4.122736 1.370260 7.677091 \n",
1851 | "138043 3.377663 2.031619 5.050074 \n",
1852 | "138044 6.825406 2.617026 7.990487 \n",
1853 | "138045 3.421627 2.060964 4.739744 \n",
1854 | "138046 4.407252 1.980482 6.115374 \n",
1855 | "\n",
1856 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
1857 | "41323 7298.428571 16 28438 \n",
1858 | "41324 452.000000 34 958 \n",
1859 | "41325 18523.444444 48 33945 \n",
1860 | "41326 12706.133333 118 60500 \n",
1861 | "41327 2637.033898 20 67624 \n",
1862 | "... ... ... ... \n",
1863 | "138042 14900.714286 16 81654 \n",
1864 | "138043 6905.846154 44 67624 \n",
1865 | "138044 14981.909091 48 22648 \n",
1866 | "138045 601.600000 16 2216 \n",
1867 | "138046 96625.000000 20 318464 \n",
1868 | "\n",
1869 | " LoadConfigurationSize VersionInformationSize legitimate \n",
1870 | "41323 72 0 0 \n",
1871 | "41324 0 15 0 \n",
1872 | "41325 72 14 0 \n",
1873 | "41326 72 14 0 \n",
1874 | "41327 72 0 0 \n",
1875 | "... ... ... ... \n",
1876 | "138042 72 0 0 \n",
1877 | "138043 0 15 0 \n",
1878 | "138044 72 14 0 \n",
1879 | "138045 0 0 0 \n",
1880 | "138046 72 0 0 \n",
1881 | "\n",
1882 | "[96724 rows x 57 columns]"
1883 | ]
1884 | },
1885 | "execution_count": 14,
1886 | "metadata": {},
1887 | "output_type": "execute_result"
1888 | }
1889 | ],
1890 | "source": [
1891 | "mal=dataset[41323::]\n",
1892 | "maldata=dataset[41323::].drop([\"legitimate\"],axis=1)\n",
1893 | "mal"
1894 | ]
1895 | },
1896 | {
1897 | "cell_type": "code",
1898 | "execution_count": 15,
1899 | "id": "325f7c48",
1900 | "metadata": {},
1901 | "outputs": [
1902 | {
1903 | "name": "stdout",
1904 | "output_type": "stream",
1905 | "text": [
1906 | "The shape of legit database is 41323 samples and 56 features\n",
1907 | "The shape of malware database is 96724 samples and 57 features\n"
1908 | ]
1909 | }
1910 | ],
1911 | "source": [
1912 | "print(\"The shape of legit database is %s samples and %s features\"%(legit.shape[0],legit.shape[1])) \n",
1913 | "print(\"The shape of malware database is %s samples and %s features\"%(mal.shape[0],mal.shape[1])) "
1914 | ]
1915 | },
1916 | {
1917 | "cell_type": "code",
1918 | "execution_count": 16,
1919 | "id": "09e246a0",
1920 | "metadata": {},
1921 | "outputs": [
1922 | {
1923 | "name": "stdout",
1924 | "output_type": "stream",
1925 | "text": [
1926 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n",
1927 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n",
1928 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n",
1929 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n",
1930 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n",
1931 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n",
1932 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n",
1933 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n",
1934 | " 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',\n",
1935 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n",
1936 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n",
1937 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n",
1938 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n",
1939 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n",
1940 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n",
1941 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n",
1942 | " 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n",
1943 | " 'VersionInformationSize', 'legitimate'],\n",
1944 | " dtype='object')\n",
1945 | "Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',\n",
1946 | " 'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',\n",
1947 | " 'SizeOfInitializedData', 'SizeOfUninitializedData',\n",
1948 | " 'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',\n",
1949 | " 'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',\n",
1950 | " 'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',\n",
1951 | " 'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',\n",
1952 | " 'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',\n",
1953 | " 'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',\n",
1954 | " 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',\n",
1955 | " 'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',\n",
1956 | " 'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',\n",
1957 | " 'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',\n",
1958 | " 'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',\n",
1959 | " 'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',\n",
1960 | " 'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',\n",
1961 | " 'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',\n",
1962 | " 'VersionInformationSize', 'legitimate'],\n",
1963 | " dtype='object')\n"
1964 | ]
1965 | }
1966 | ],
1967 | "source": [
1968 | "#to find the features ie the column names\n",
1969 | "print(dataset.columns) #but in malware or legit there is no legitimate feature :)\n",
1970 | "print(mal.columns)"
1971 | ]
1972 | },
1973 | {
1974 | "cell_type": "code",
1975 | "execution_count": 17,
1976 | "id": "55644b80",
1977 | "metadata": {},
1978 | "outputs": [
1979 | {
1980 | "data": {
1981 | "text/html": [
1982 | "\n",
1983 | "\n",
1996 | "
\n",
1997 | " \n",
1998 | " \n",
1999 | " | \n",
2000 | " Name | \n",
2001 | " md5 | \n",
2002 | " Machine | \n",
2003 | " SizeOfOptionalHeader | \n",
2004 | " Characteristics | \n",
2005 | " MajorLinkerVersion | \n",
2006 | " MinorLinkerVersion | \n",
2007 | " SizeOfCode | \n",
2008 | " SizeOfInitializedData | \n",
2009 | " SizeOfUninitializedData | \n",
2010 | " ... | \n",
2011 | " ResourcesNb | \n",
2012 | " ResourcesMeanEntropy | \n",
2013 | " ResourcesMinEntropy | \n",
2014 | " ResourcesMaxEntropy | \n",
2015 | " ResourcesMeanSize | \n",
2016 | " ResourcesMinSize | \n",
2017 | " ResourcesMaxSize | \n",
2018 | " LoadConfigurationSize | \n",
2019 | " VersionInformationSize | \n",
2020 | " legitimate | \n",
2021 | "
\n",
2022 | " \n",
2023 | " \n",
2024 | " \n",
2025 | " 41323 | \n",
2026 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
2027 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
2028 | " 332 | \n",
2029 | " 224 | \n",
2030 | " 258 | \n",
2031 | " 11 | \n",
2032 | " 0 | \n",
2033 | " 354816 | \n",
2034 | " 257024 | \n",
2035 | " 0 | \n",
2036 | " ... | \n",
2037 | " 7 | \n",
2038 | " 3.914415 | \n",
2039 | " 1.441688 | \n",
2040 | " 7.677091 | \n",
2041 | " 7298.428571 | \n",
2042 | " 16 | \n",
2043 | " 28438 | \n",
2044 | " 72 | \n",
2045 | " 0 | \n",
2046 | " 0 | \n",
2047 | "
\n",
2048 | " \n",
2049 | " 41324 | \n",
2050 | " VirusShare_9bd57c8252948bd2fa651ad372bd4f13 | \n",
2051 | " 9bd57c8252948bd2fa651ad372bd4f13 | \n",
2052 | " 332 | \n",
2053 | " 224 | \n",
2054 | " 271 | \n",
2055 | " 6 | \n",
2056 | " 0 | \n",
2057 | " 24064 | \n",
2058 | " 164864 | \n",
2059 | " 1024 | \n",
2060 | " ... | \n",
2061 | " 6 | \n",
2062 | " 3.199107 | \n",
2063 | " 1.971335 | \n",
2064 | " 5.214816 | \n",
2065 | " 452.000000 | \n",
2066 | " 34 | \n",
2067 | " 958 | \n",
2068 | " 0 | \n",
2069 | " 15 | \n",
2070 | " 0 | \n",
2071 | "
\n",
2072 | " \n",
2073 | " 41325 | \n",
2074 | " VirusShare_d1456165e9358b8f61f93a5f2042f39c | \n",
2075 | " d1456165e9358b8f61f93a5f2042f39c | \n",
2076 | " 332 | \n",
2077 | " 224 | \n",
2078 | " 258 | \n",
2079 | " 10 | \n",
2080 | " 0 | \n",
2081 | " 118784 | \n",
2082 | " 381952 | \n",
2083 | " 0 | \n",
2084 | " ... | \n",
2085 | " 18 | \n",
2086 | " 6.530946 | \n",
2087 | " 2.458492 | \n",
2088 | " 7.992688 | \n",
2089 | " 18523.444444 | \n",
2090 | " 48 | \n",
2091 | " 33945 | \n",
2092 | " 72 | \n",
2093 | " 14 | \n",
2094 | " 0 | \n",
2095 | "
\n",
2096 | " \n",
2097 | " 41326 | \n",
2098 | " VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
2099 | " e4214cc73afbba0f52bb72d5db8f8bb1 | \n",
2100 | " 332 | \n",
2101 | " 224 | \n",
2102 | " 258 | \n",
2103 | " 10 | \n",
2104 | " 0 | \n",
2105 | " 174592 | \n",
2106 | " 300032 | \n",
2107 | " 0 | \n",
2108 | " ... | \n",
2109 | " 15 | \n",
2110 | " 5.732393 | \n",
2111 | " 2.852364 | \n",
2112 | " 7.987726 | \n",
2113 | " 12706.133333 | \n",
2114 | " 118 | \n",
2115 | " 60500 | \n",
2116 | " 72 | \n",
2117 | " 14 | \n",
2118 | " 0 | \n",
2119 | "
\n",
2120 | " \n",
2121 | " 41327 | \n",
2122 | " VirusShare_710890c07b3f93b90635f8bff6c34605 | \n",
2123 | " 710890c07b3f93b90635f8bff6c34605 | \n",
2124 | " 332 | \n",
2125 | " 224 | \n",
2126 | " 258 | \n",
2127 | " 9 | \n",
2128 | " 0 | \n",
2129 | " 475648 | \n",
2130 | " 348672 | \n",
2131 | " 0 | \n",
2132 | " ... | \n",
2133 | " 59 | \n",
2134 | " 2.827826 | \n",
2135 | " 0.960953 | \n",
2136 | " 7.212329 | \n",
2137 | " 2637.033898 | \n",
2138 | " 20 | \n",
2139 | " 67624 | \n",
2140 | " 72 | \n",
2141 | " 0 | \n",
2142 | " 0 | \n",
2143 | "
\n",
2144 | " \n",
2145 | " 41328 | \n",
2146 | " VirusShare_3c2eb01508703752dca01957ea451a40 | \n",
2147 | " 3c2eb01508703752dca01957ea451a40 | \n",
2148 | " 332 | \n",
2149 | " 224 | \n",
2150 | " 259 | \n",
2151 | " 9 | \n",
2152 | " 0 | \n",
2153 | " 157696 | \n",
2154 | " 62464 | \n",
2155 | " 0 | \n",
2156 | " ... | \n",
2157 | " 13 | \n",
2158 | " 3.943296 | \n",
2159 | " 1.814443 | \n",
2160 | " 6.122045 | \n",
2161 | " 2708.153846 | \n",
2162 | " 132 | \n",
2163 | " 9640 | \n",
2164 | " 72 | \n",
2165 | " 14 | \n",
2166 | " 0 | \n",
2167 | "
\n",
2168 | " \n",
2169 | " 41329 | \n",
2170 | " VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f | \n",
2171 | " 3fb2d0ac00c5dff6c4fd5dfe6ba52c3f | \n",
2172 | " 332 | \n",
2173 | " 224 | \n",
2174 | " 259 | \n",
2175 | " 83 | \n",
2176 | " 82 | \n",
2177 | " 724992 | \n",
2178 | " 2306048 | \n",
2179 | " 0 | \n",
2180 | " ... | \n",
2181 | " 21 | \n",
2182 | " 3.987463 | \n",
2183 | " 2.642159 | \n",
2184 | " 6.473700 | \n",
2185 | " 14288.000000 | \n",
2186 | " 76 | \n",
2187 | " 270376 | \n",
2188 | " 0 | \n",
2189 | " 0 | \n",
2190 | " 0 | \n",
2191 | "
\n",
2192 | " \n",
2193 | " 41330 | \n",
2194 | " VirusShare_ad1ca9a4d572c0a2793c4cea29b20887 | \n",
2195 | " ad1ca9a4d572c0a2793c4cea29b20887 | \n",
2196 | " 332 | \n",
2197 | " 224 | \n",
2198 | " 258 | \n",
2199 | " 10 | \n",
2200 | " 0 | \n",
2201 | " 120320 | \n",
2202 | " 385024 | \n",
2203 | " 0 | \n",
2204 | " ... | \n",
2205 | " 6 | \n",
2206 | " 3.729824 | \n",
2207 | " 2.458492 | \n",
2208 | " 5.317552 | \n",
2209 | " 2739.500000 | \n",
2210 | " 48 | \n",
2211 | " 9640 | \n",
2212 | " 72 | \n",
2213 | " 15 | \n",
2214 | " 0 | \n",
2215 | "
\n",
2216 | " \n",
2217 | " 41331 | \n",
2218 | " VirusShare_7414edb3d0be66aa0816e6ed4b6b0a21 | \n",
2219 | " 7414edb3d0be66aa0816e6ed4b6b0a21 | \n",
2220 | " 332 | \n",
2221 | " 224 | \n",
2222 | " 259 | \n",
2223 | " 10 | \n",
2224 | " 0 | \n",
2225 | " 233984 | \n",
2226 | " 1377792 | \n",
2227 | " 0 | \n",
2228 | " ... | \n",
2229 | " 18 | \n",
2230 | " 4.328322 | \n",
2231 | " 2.323220 | \n",
2232 | " 7.068413 | \n",
2233 | " 76158.277778 | \n",
2234 | " 9 | \n",
2235 | " 1342735 | \n",
2236 | " 72 | \n",
2237 | " 19 | \n",
2238 | " 0 | \n",
2239 | "
\n",
2240 | " \n",
2241 | " 41332 | \n",
2242 | " VirusShare_e57b4f294c142d050a784b67e2cf1f2e | \n",
2243 | " e57b4f294c142d050a784b67e2cf1f2e | \n",
2244 | " 332 | \n",
2245 | " 224 | \n",
2246 | " 271 | \n",
2247 | " 6 | \n",
2248 | " 0 | \n",
2249 | " 49152 | \n",
2250 | " 561152 | \n",
2251 | " 0 | \n",
2252 | " ... | \n",
2253 | " 0 | \n",
2254 | " 0.000000 | \n",
2255 | " 0.000000 | \n",
2256 | " 0.000000 | \n",
2257 | " 0.000000 | \n",
2258 | " 0 | \n",
2259 | " 0 | \n",
2260 | " 0 | \n",
2261 | " 0 | \n",
2262 | " 0 | \n",
2263 | "
\n",
2264 | " \n",
2265 | "
\n",
2266 | "
10 rows × 57 columns
\n",
2267 | "
"
2268 | ],
2269 | "text/plain": [
2270 | " Name \\\n",
2271 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
2272 | "41324 VirusShare_9bd57c8252948bd2fa651ad372bd4f13 \n",
2273 | "41325 VirusShare_d1456165e9358b8f61f93a5f2042f39c \n",
2274 | "41326 VirusShare_e4214cc73afbba0f52bb72d5db8f8bb1 \n",
2275 | "41327 VirusShare_710890c07b3f93b90635f8bff6c34605 \n",
2276 | "41328 VirusShare_3c2eb01508703752dca01957ea451a40 \n",
2277 | "41329 VirusShare_3fb2d0ac00c5dff6c4fd5dfe6ba52c3f \n",
2278 | "41330 VirusShare_ad1ca9a4d572c0a2793c4cea29b20887 \n",
2279 | "41331 VirusShare_7414edb3d0be66aa0816e6ed4b6b0a21 \n",
2280 | "41332 VirusShare_e57b4f294c142d050a784b67e2cf1f2e \n",
2281 | "\n",
2282 | " md5 Machine SizeOfOptionalHeader \\\n",
2283 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
2284 | "41324 9bd57c8252948bd2fa651ad372bd4f13 332 224 \n",
2285 | "41325 d1456165e9358b8f61f93a5f2042f39c 332 224 \n",
2286 | "41326 e4214cc73afbba0f52bb72d5db8f8bb1 332 224 \n",
2287 | "41327 710890c07b3f93b90635f8bff6c34605 332 224 \n",
2288 | "41328 3c2eb01508703752dca01957ea451a40 332 224 \n",
2289 | "41329 3fb2d0ac00c5dff6c4fd5dfe6ba52c3f 332 224 \n",
2290 | "41330 ad1ca9a4d572c0a2793c4cea29b20887 332 224 \n",
2291 | "41331 7414edb3d0be66aa0816e6ed4b6b0a21 332 224 \n",
2292 | "41332 e57b4f294c142d050a784b67e2cf1f2e 332 224 \n",
2293 | "\n",
2294 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
2295 | "41323 258 11 0 354816 \n",
2296 | "41324 271 6 0 24064 \n",
2297 | "41325 258 10 0 118784 \n",
2298 | "41326 258 10 0 174592 \n",
2299 | "41327 258 9 0 475648 \n",
2300 | "41328 259 9 0 157696 \n",
2301 | "41329 259 83 82 724992 \n",
2302 | "41330 258 10 0 120320 \n",
2303 | "41331 259 10 0 233984 \n",
2304 | "41332 271 6 0 49152 \n",
2305 | "\n",
2306 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
2307 | "41323 257024 0 ... 7 \n",
2308 | "41324 164864 1024 ... 6 \n",
2309 | "41325 381952 0 ... 18 \n",
2310 | "41326 300032 0 ... 15 \n",
2311 | "41327 348672 0 ... 59 \n",
2312 | "41328 62464 0 ... 13 \n",
2313 | "41329 2306048 0 ... 21 \n",
2314 | "41330 385024 0 ... 6 \n",
2315 | "41331 1377792 0 ... 18 \n",
2316 | "41332 561152 0 ... 0 \n",
2317 | "\n",
2318 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
2319 | "41323 3.914415 1.441688 7.677091 \n",
2320 | "41324 3.199107 1.971335 5.214816 \n",
2321 | "41325 6.530946 2.458492 7.992688 \n",
2322 | "41326 5.732393 2.852364 7.987726 \n",
2323 | "41327 2.827826 0.960953 7.212329 \n",
2324 | "41328 3.943296 1.814443 6.122045 \n",
2325 | "41329 3.987463 2.642159 6.473700 \n",
2326 | "41330 3.729824 2.458492 5.317552 \n",
2327 | "41331 4.328322 2.323220 7.068413 \n",
2328 | "41332 0.000000 0.000000 0.000000 \n",
2329 | "\n",
2330 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
2331 | "41323 7298.428571 16 28438 \n",
2332 | "41324 452.000000 34 958 \n",
2333 | "41325 18523.444444 48 33945 \n",
2334 | "41326 12706.133333 118 60500 \n",
2335 | "41327 2637.033898 20 67624 \n",
2336 | "41328 2708.153846 132 9640 \n",
2337 | "41329 14288.000000 76 270376 \n",
2338 | "41330 2739.500000 48 9640 \n",
2339 | "41331 76158.277778 9 1342735 \n",
2340 | "41332 0.000000 0 0 \n",
2341 | "\n",
2342 | " LoadConfigurationSize VersionInformationSize legitimate \n",
2343 | "41323 72 0 0 \n",
2344 | "41324 0 15 0 \n",
2345 | "41325 72 14 0 \n",
2346 | "41326 72 14 0 \n",
2347 | "41327 72 0 0 \n",
2348 | "41328 72 14 0 \n",
2349 | "41329 0 0 0 \n",
2350 | "41330 72 15 0 \n",
2351 | "41331 72 19 0 \n",
2352 | "41332 0 0 0 \n",
2353 | "\n",
2354 | "[10 rows x 57 columns]"
2355 | ]
2356 | },
2357 | "execution_count": 17,
2358 | "metadata": {},
2359 | "output_type": "execute_result"
2360 | }
2361 | ],
2362 | "source": [
2363 | "#first 10 data points from malware database:\n",
2364 | "mal.head(10)"
2365 | ]
2366 | },
2367 | {
2368 | "cell_type": "code",
2369 | "execution_count": 18,
2370 | "id": "d243486f",
2371 | "metadata": {},
2372 | "outputs": [
2373 | {
2374 | "data": {
2375 | "text/html": [
2376 | "\n",
2377 | "\n",
2390 | "
\n",
2391 | " \n",
2392 | " \n",
2393 | " | \n",
2394 | " Name | \n",
2395 | " md5 | \n",
2396 | " Machine | \n",
2397 | " SizeOfOptionalHeader | \n",
2398 | " Characteristics | \n",
2399 | " MajorLinkerVersion | \n",
2400 | " MinorLinkerVersion | \n",
2401 | " SizeOfCode | \n",
2402 | " SizeOfInitializedData | \n",
2403 | " SizeOfUninitializedData | \n",
2404 | " ... | \n",
2405 | " ExportNb | \n",
2406 | " ResourcesNb | \n",
2407 | " ResourcesMeanEntropy | \n",
2408 | " ResourcesMinEntropy | \n",
2409 | " ResourcesMaxEntropy | \n",
2410 | " ResourcesMeanSize | \n",
2411 | " ResourcesMinSize | \n",
2412 | " ResourcesMaxSize | \n",
2413 | " LoadConfigurationSize | \n",
2414 | " VersionInformationSize | \n",
2415 | "
\n",
2416 | " \n",
2417 | " \n",
2418 | " \n",
2419 | " 0 | \n",
2420 | " memtest.exe | \n",
2421 | " 631ea355665f28d4707448e442fbf5b8 | \n",
2422 | " 332 | \n",
2423 | " 224 | \n",
2424 | " 258 | \n",
2425 | " 9 | \n",
2426 | " 0 | \n",
2427 | " 361984 | \n",
2428 | " 115712 | \n",
2429 | " 0 | \n",
2430 | " ... | \n",
2431 | " 0 | \n",
2432 | " 4 | \n",
2433 | " 3.262823 | \n",
2434 | " 2.568844 | \n",
2435 | " 3.537939 | \n",
2436 | " 8797.0 | \n",
2437 | " 216 | \n",
2438 | " 18032 | \n",
2439 | " 0 | \n",
2440 | " 16 | \n",
2441 | "
\n",
2442 | " \n",
2443 | "
\n",
2444 | "
1 rows × 56 columns
\n",
2445 | "
"
2446 | ],
2447 | "text/plain": [
2448 | " Name md5 Machine \\\n",
2449 | "0 memtest.exe 631ea355665f28d4707448e442fbf5b8 332 \n",
2450 | "\n",
2451 | " SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
2452 | "0 224 258 9 \n",
2453 | "\n",
2454 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
2455 | "0 0 361984 115712 \n",
2456 | "\n",
2457 | " SizeOfUninitializedData ... ExportNb ResourcesNb ResourcesMeanEntropy \\\n",
2458 | "0 0 ... 0 4 3.262823 \n",
2459 | "\n",
2460 | " ResourcesMinEntropy ResourcesMaxEntropy ResourcesMeanSize \\\n",
2461 | "0 2.568844 3.537939 8797.0 \n",
2462 | "\n",
2463 | " ResourcesMinSize ResourcesMaxSize LoadConfigurationSize \\\n",
2464 | "0 216 18032 0 \n",
2465 | "\n",
2466 | " VersionInformationSize \n",
2467 | "0 16 \n",
2468 | "\n",
2469 | "[1 rows x 56 columns]"
2470 | ]
2471 | },
2472 | "execution_count": 18,
2473 | "metadata": {},
2474 | "output_type": "execute_result"
2475 | }
2476 | ],
2477 | "source": [
2478 | "#datapoint of legit to have a good comparison \n",
2479 | "legit.take([0]) #1st datapoint"
2480 | ]
2481 | },
2482 | {
2483 | "cell_type": "code",
2484 | "execution_count": 19,
2485 | "id": "fd741615",
2486 | "metadata": {},
2487 | "outputs": [
2488 | {
2489 | "data": {
2490 | "text/html": [
2491 | "\n",
2492 | "\n",
2505 | "
\n",
2506 | " \n",
2507 | " \n",
2508 | " | \n",
2509 | " Name | \n",
2510 | " md5 | \n",
2511 | " Machine | \n",
2512 | " SizeOfOptionalHeader | \n",
2513 | " Characteristics | \n",
2514 | " MajorLinkerVersion | \n",
2515 | " MinorLinkerVersion | \n",
2516 | " SizeOfCode | \n",
2517 | " SizeOfInitializedData | \n",
2518 | " SizeOfUninitializedData | \n",
2519 | " ... | \n",
2520 | " ResourcesNb | \n",
2521 | " ResourcesMeanEntropy | \n",
2522 | " ResourcesMinEntropy | \n",
2523 | " ResourcesMaxEntropy | \n",
2524 | " ResourcesMeanSize | \n",
2525 | " ResourcesMinSize | \n",
2526 | " ResourcesMaxSize | \n",
2527 | " LoadConfigurationSize | \n",
2528 | " VersionInformationSize | \n",
2529 | " legitimate | \n",
2530 | "
\n",
2531 | " \n",
2532 | " \n",
2533 | " \n",
2534 | " 41323 | \n",
2535 | " VirusShare_4a400b747afe6547e09ce0b02dae7f1c | \n",
2536 | " 4a400b747afe6547e09ce0b02dae7f1c | \n",
2537 | " 332 | \n",
2538 | " 224 | \n",
2539 | " 258 | \n",
2540 | " 11 | \n",
2541 | " 0 | \n",
2542 | " 354816 | \n",
2543 | " 257024 | \n",
2544 | " 0 | \n",
2545 | " ... | \n",
2546 | " 7 | \n",
2547 | " 3.914415 | \n",
2548 | " 1.441688 | \n",
2549 | " 7.677091 | \n",
2550 | " 7298.428571 | \n",
2551 | " 16 | \n",
2552 | " 28438 | \n",
2553 | " 72 | \n",
2554 | " 0 | \n",
2555 | " 0 | \n",
2556 | "
\n",
2557 | " \n",
2558 | "
\n",
2559 | "
1 rows × 57 columns
\n",
2560 | "
"
2561 | ],
2562 | "text/plain": [
2563 | " Name \\\n",
2564 | "41323 VirusShare_4a400b747afe6547e09ce0b02dae7f1c \n",
2565 | "\n",
2566 | " md5 Machine SizeOfOptionalHeader \\\n",
2567 | "41323 4a400b747afe6547e09ce0b02dae7f1c 332 224 \n",
2568 | "\n",
2569 | " Characteristics MajorLinkerVersion MinorLinkerVersion SizeOfCode \\\n",
2570 | "41323 258 11 0 354816 \n",
2571 | "\n",
2572 | " SizeOfInitializedData SizeOfUninitializedData ... ResourcesNb \\\n",
2573 | "41323 257024 0 ... 7 \n",
2574 | "\n",
2575 | " ResourcesMeanEntropy ResourcesMinEntropy ResourcesMaxEntropy \\\n",
2576 | "41323 3.914415 1.441688 7.677091 \n",
2577 | "\n",
2578 | " ResourcesMeanSize ResourcesMinSize ResourcesMaxSize \\\n",
2579 | "41323 7298.428571 16 28438 \n",
2580 | "\n",
2581 | " LoadConfigurationSize VersionInformationSize legitimate \n",
2582 | "41323 72 0 0 \n",
2583 | "\n",
2584 | "[1 rows x 57 columns]"
2585 | ]
2586 | },
2587 | "execution_count": 19,
2588 | "metadata": {},
2589 | "output_type": "execute_result"
2590 | }
2591 | ],
2592 | "source": [
2593 | "#datapoint of malware to have a good comparison \n",
2594 | "mal.take([0]) #1st datapoint"
2595 | ]
2596 | },
2597 | {
2598 | "cell_type": "code",
2599 | "execution_count": 20,
2600 | "id": "4dd1e87b",
2601 | "metadata": {},
2602 | "outputs": [],
2603 | "source": [
2604 | "# Feature Extraction"
2605 | ]
2606 | },
2607 | {
2608 | "cell_type": "code",
2609 | "execution_count": 21,
2610 | "id": "d1ecc40f",
2611 | "metadata": {},
2612 | "outputs": [],
2613 | "source": [
2614 | "x=dataset.drop(['Name','md5','legitimate'],axis=1).values #independent features\n",
2615 | "y=dataset['legitimate'].values #dependent variable"
2616 | ]
2617 | },
2618 | {
2619 | "cell_type": "code",
2620 | "execution_count": 22,
2621 | "id": "e26ddd4d",
2622 | "metadata": {},
2623 | "outputs": [],
2624 | "source": [
2625 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n",
2626 | "model=SelectFromModel(extratrees,prefit=True)\n",
2627 | "x_new=model.transform(x)\n",
2628 | "nbfeatures=x_new.shape[1]"
2629 | ]
2630 | },
2631 | {
2632 | "cell_type": "code",
2633 | "execution_count": 23,
2634 | "id": "3306769b",
2635 | "metadata": {},
2636 | "outputs": [
2637 | {
2638 | "data": {
2639 | "text/plain": [
2640 | "14"
2641 | ]
2642 | },
2643 | "execution_count": 23,
2644 | "metadata": {},
2645 | "output_type": "execute_result"
2646 | }
2647 | ],
2648 | "source": [
2649 | "nbfeatures"
2650 | ]
2651 | },
2652 | {
2653 | "cell_type": "code",
2654 | "execution_count": 24,
2655 | "id": "a1bc47cc",
2656 | "metadata": {},
2657 | "outputs": [
2658 | {
2659 | "data": {
2660 | "text/plain": [
2661 | "([,\n",
2662 | " ],\n",
2663 | " [Text(0.7884607600756525, 0.7670264857362649, 'Important Features'),\n",
2664 | " Text(-0.7884607959827531, -0.7670264488257517, 'Not Important Features')],\n",
2665 | " [Text(0.4300695054958104, 0.4183780831288717, '25%'),\n",
2666 | " Text(-0.43006952508150165, -0.4183780629958645, '75%')])"
2667 | ]
2668 | },
2669 | "execution_count": 24,
2670 | "metadata": {},
2671 | "output_type": "execute_result"
2672 | },
2673 | {
2674 | "data": {
2675 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAADnCAYAAAA3gRxRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAdi0lEQVR4nO3deZgU5b328e9vVlYbF1yISysquKOixl2M0Si8HjUqRzFqjkmOGjUalzM55k1K41GMr0uMIW4nLtEYlehxGReMwQ1U3FARBY84cQ0RkJZl9nneP6pGB5wZZunup6r6/lxXXzP0VFfdPczc83RV11PmnENEROKrzHcAERHpnopaRCTmVNQiIjGnohYRiTkVtYhIzKmoRURiTkUtIhJzKmoRkZhTUYuIxJyKWkQk5lTUIiIxp6IWEYk5FbWISMypqEVEYk5FLSIScypqEZGYU1GLiMScilpEJOZU1CIiMaeiFhGJORW1iEjMqahFRGJORS0iEnMqahGRmFNRi4jEnIpaRCTmVNQiIjFX4TuASN4EmUHApsBmHW4jgKHAkOg2uMNHgNZmV/7ZVo1/HAgsB1ZEt8VA3Wq3D+omj28pynMR6cCcc74ziPROkCkHtgV2AXaNPm4NDO/L6lpc2cdbNt7xjR4s2gp8Qlja/wu8BMwE3qybPL6tL9sW6QkVtcRfkBkGjItuuwM7AgPztfomV1G3dePt2X6sYhkwi7C0ZwIv1E0evzQP0UQAFbXEUZApA3YDDgO+A4ylgMdTGlzlu6Mbb9sqj6t0wFvAQ8DUusnjX83juqUEqaglPoLMnsAk4Bhg/WJtdoUb8PZ2jX/YpoCbWABMJSztlwq4HUkpFbX4FWRGAScAxwNb+IiQc4Pe3Knx5h2KtLm/A38B7q6bPH5WkbYpCaeiluILMlWExfxjwt0aXi12Q1/btfGGnT1s+mXgN4Sl3exh+5IQKmopniCzHnAacDqwoec0X1rohr28R+MUn38wPgGmANfXTR6/2GMOiSkVtRRekNkGOAf4HjDAc5qv+cit9+I+jdfu4TsHUA/cAVxTN3n8XN9hJD5U1FI4QSYL/IpwN0dsz4Jd0Lbh8wc2XbWn7xyreRC4oG7y+Hm+g4h/KmrJvyAzHPg5cCpQ5TnNGs1r23jGIU2/3tt3jk60ADcAQd3k8Yt8hxF/VNSSP0FmCHBudBvqOU2PvdG2+bOHN/3Xvr5zdCMHXAr8pm7y+EbfYaT4YvtyVBImyBwDvAMEJKikARqpjPtoJQNcDryTran9V99hpPg0KZP0T7gfegpwqOckfdboquJe1O2ywF3ZmtozgZPrJo9/13MeKRKNqKVvgkwZQeYsYA4JLmmAhviPqFe3F/Batqb2NN9BpDhU1NJ7QWYz4BnCkzUGr2Hp2GuI//HOzgwGpmRrah/J1tRu5DuMFJaKWnonyBwJzAbi+C6JPmmg2neE/jgUeDNbU3tMvlZoZsvzta4ebi9rZsf3cx1nm9mgLr72lJnNM7PZ0e3oPqx/jJkd1p+M/aGilp4JMtUEmd8C9wHDPKfJq3pXZb4z9NO6wD3Zmto7sjW1w3yH6Q0zqyDc996vogbOBjot6sgk59yY6Da1D+sfQzibY49ZKC8dq6KWNQsyWxLOs3yG7yiFUE/ii7rdJODlbE3t6HyszMwOMLOnzeweM5tvZpPNbJKZzTKzN81sZLTcrWZ2vZk9Gy03Ibp/gJndEi37mpmNi+4/2czuNbOHgGnAZGDfaLR7TjTCftbMXo1ue3XI85SZTTWzd8zszqgMzyK8ks90M5vew+c23Mz+YmYvRbe9o/t3N7OZUd6ZZjbKzKqAi4GJUcaJZhaY2Xkd1jcnyp01s7fNbArwKrCJmZ0fbeMNM7soWn6wmdWa2evRYyd2l1fv+pDuBZlDgT8Da/mOUij1VKdpwDISeD5bU3tM3eTxf83D+nYCtgGWEE7XerNzbncz+wlwJuFIFsJR8f7R9qeb2ZaEk27hnNvBzEYD08xs62j5PYEdnXNLzOwA4DznXHvBDwK+7ZxrMLOtgLv4avKunYHtCOdHmQHs7Zy71sx+CoxzznV1YtCdZlYfff4twuMrVzvnnjOzTYHHo+f5DrCfc67FzA4CLnXOfdfMfgGMdc6dEWUMuvmejQK+75w73cwOBrYivOCFAQ+a2X6EVyP6xDk3Plpfppv1aUQt3QgypxFOfp/akgZY6QakZUTdbhjwaLam9t/zsK6XnHOfOucagfcIR8AAbxKWc7t7nHNtzrl3CQt9NLAP8EcA59w7hFO8thf1E865JV1ssxK4yczeBO4lvOxau1nOuY+cc22Ex0qyX394pzru+lgMHARcZ2azCU/XX8vMhhK+Z/1eM5sDXE34R6G3/u6ceyH6/ODo9hrhCHs0YXG/CRxkZpeb2b7OuVx3K9SIWr4uyBjwa+C8NS2aBvVUpXHAUgFcn62pHQWc149rOnY8E7Ktw7/bWLU/Vn+LoyMcQXZlRTdfOwdYSDiaLwMausjTSt87rAzY0zlX3/FOM/stMN05d6SZZYGnunh8C6sOdDtONtbxuRlwmXPuhtVXYGa7Eu73vszMpjnnLu4urMhXgswA4B5KpKQB6qku952hgM4BHsjW1A4p8HaOMbOyaL/1FsA8wrdwTgKIdnlsGt2/umWsejZrBvg0GjV/D+jJ/8/q61iTaXQ45mJmYzps++Po85O7WX8d4UWVMbNdgM272M7jwL+Z2ZBo2W+Y2fpmNgJY6Zy7A/h/7evqiopavhJkMsCTQK/fvpRk9S7VRQ0wAXguW1NbyMubzQOeBh4FTnXONRCesVoe7cK4Gzg52oWyujeAlujA2jnR404ysxcId5V0N/pudyPwaE8PJgJnAWOjA3xzCScQg/CV5GVmNoNV/0BMB7ZtP5hIeJWedaJdJ6cB8zvbiHNuGvAn4Pno+zCVsPB3AGZFj78QuKS7sJqUSUJhSU8jPOhRUk5pOnf2k227jvGdowjmAgfWTR6/MJ8rNbNbgYf7+LY36QGNqAWCzFqEL9FKrqQBGqgulWM12wLTszW1sbm6jvSMirrUhSU9DYjDFU68qHdVpVLUEL4FbXo+d4M4507WaLqwVNSl7KuRdMmWNEADVZW+MxTZaODxpJ3FWMpU1KUqvBL4A8A3fUfxrYGSGlG3GwPUZmtqEz+pVilQUZei8H3StwIH+A0SD42uMpHT5+XBXsB92ZraUvxDlSgq6tL0X8BxvkPERQnu+ujoYOAK3yGkeyrqUhNkTgR+5jtGnDRSsiPqdmdna2pP8B1CuqaiLiVBZm/gJt8x4qaJykRPSJ0nN2Zranf2HUI6p6IuFUFmOOEEN6U+evyaJir0PYGBwP3Zmtr1fAeRr1NRl4Lw4OFtgC7ZtBrnaAJL2+x5fbUZcHe2pjbtp9Qnjoq6NJxLwi9AW0CdzT1Ryg5EBxdjR0WddkFmD+BS3zHiymFNvjPE0DnZmtoJvkPIV1TUaRZOtHQX4UTs0gkVdZd+n62pTfUFI5JERZ1u19D1PLkCtGHNvjPE1MZoF0hsqKjTKsiMY9WJz6UTrZSpqLv2w2xN7TjfIURFnU5Bphq43neMJGhTUXfHgJuzNbWDfAcpdSrqdLqQry4iKt1ooazFd4aY24I1XH1ECk9FnTZBZhvgP3zHSIpWylXUa/aTbE1tyc+y6JOKOn1+j84+7LFmylt9Z0iAMmBKtqZWJwZ5oqJOkyAzAdjfd4wkaaZCRd0zO1NiFz2OExV1WgSZMnRiS681OxV1L1ys08v9UFGnx/GEl6CXXmiios13hgQZDZzoO0QpMuec7wzSX0GmEpiHTm7ptTlt2WcnNF26bzG21fLFZyyqvYrW5Z9jVsaQMYew1th/Yelzd7L89ccpG5QBYO39TmTgyN1o+GguS6ZNwcorWe/w86lcewRtDcv57IHLWf/YizE/c0n9Hdi6bvJ4ndFZRLoETzr8OyrpPmmksngjlbJy1h53CtUbbklb40o+ve1sBmTDKaCHjj2CzB5HrbL4Fy/dz/AjfkZL7p8se+0R1jnwByyd+Wcyex7rq6QhnGHvR8B1vgKUIu36SLrwIrX/6TtGUjW64hV1xZB1qN5wSwDKqgdRue4mtC5b3OXyVlaBa2nCtTRiZRU0f/4prcsWM2BT73u4LtRJMMWlok6+E9A8033WQJWXfX8tuYU0LVxA9YhRACx79WE++cMZLHrkGloblgOQ+eYxLH7sOr54+QGG7jKBpc/czrB9Y3HFrA2B032HKCXaR51k4QUB5gDb+o6SVI+07v7U6c1nH1DMbbY11bPwTzVk9pzIoFF70bric8oGrgVmLH32DlqXL2G9w85e5TENH85h5fznGbrzYSx99g6srJy1DzyF8sFrFzN6R+8DI+smj1eBFIFG1Ml2MCrpfmko8rlBrrWFz+6/lMHbHsCgUXsBUD54baysHLMyhu50CE2fzl/1Mc6Rm3k3mb2PY+mMPzFsn+MZvN04vnjloaJmX83mwCE+A5QSFXWynek7QNI1uKqiHZVzzrH40d9Que4mrLX7kV/e37J8yZefr5z/PJXrbbbK41bMeZKBI8dSPmAIrrkRrAzMws/9OtV3gFKhd30kVZDZAl1eq9/qKd4FyBs/nsuKt6ZTOTzLJ7eEf2PX3u9EVrz9DE0LF4AZFZn1WeeQM758TFtzA8vnPMkGx/4KgLV2O4LP7r8UK69gvcMvKFr2LkzI1tRuXDd5/Ee+g6Sdijq5TkGviPqtnqqifQ8HbLwdm/3Hw1+7f+DI3bp8TFnlADY87rKv1rHJ9ow45XcFydcH5cAPgV/6DpJ2+kVPruN9B0iDeletiYb65wfZmloN+ApMRZ1EQWYvIOs7RhrUU63fgf4ZAfwf3yHSTj+kyaTRdJ7UU6XRYP+d7DtA2qmokybIVADH+o6RFivdAP0O9N9B2ZraAb5DpJl+SJPn28Bw3yHSop4qTdvZf4OAA32HSDMVdfIcteZFpKfqqdauj/yY4DtAmqmok+dg3wHSpMFpH3WeqKgLSEWdJEFmFLCp7xhp0qCDifmySbamdiffIdJKRZ0sGk3nWQNVlb4zpIjeplcgKupkUVHnmYo6r7T7o0BU1EkRXm7rAN8x0qbRVRZ3+rx02y1bU5vxHSKNVNTJsRswxHeItGlARZ1HZcDOvkOkkYo6OXb1HSCNmlTU+baL7wBppKJODv0CFEAjlcWb57Q06Oe0AFTUyaFfgDxzjhZHmX4H8kuv/ApAP6RJEGSq0SW3CsH7JVJSaOtsTe1g3yHSRkWdDDugizzknYMm3xlSqAwY4ztE2qiok2GM7wBp5DAVdWFoN12eqaiTYWvfAdKojbJm3xlSSqeS55mKOhk29x0gjdowFXVhfMN3gLRRUSeDiroANKIumI18B0gbFXUyaMa8AmilrMV3hpRSUeeZijrugkwVsJ7vGGnUQrmKujCG68rk+aWijr8RgPkOkUYq6oIxYAPfIdJERR1/uj5igTRT0eo7Q4pp90ceqajjT2d5FUizU1EXkIo6j1TU8aeiLpAmKtp8Z0gxFXUeqajjT0VdICrqgtLPbR6pqONPP/AF0kilirpw9K6PPFJRx5+KukAaqXK+M6SYrkWZRyrq+BvoO0BaNbpKFXXhaESdR/pmxp/e61sg+5W9sceC6klLfOdIowaqWuCfvmOkhoo6/hp8B0grMwYaTq9YCmAQjXq1nkf6ZsafilqSSO9RzyMVdfypqCWJdJmzPFJRx5+KWpJose8AaaKijr963wFE+kBFnUcq6vj73HcAkT5QUeeRijr+PvYdQKQPVNR5pKKOv38AOtVZkkZFnUcq6rgLci3AQt8xRHqhDe2yyysVdTJo94ckyQcEOb0KzCMVdTKoqCVJ5vkOkDYq6mT4wHcAkV6Y7ztA2qiok2Gu7wAivaCizjMVdTK85TuASC9o10eeqaiTQUUtSaIRdZ6pqJMgyC1B+6klGZain9W8U1Enx6u+A4j0wAsEOV05J89U1Mnxiu8AIj0w03eANFJRJ8ezvgOI9MDzvgOkkYo6OZ5HU55KvLUCL/oOkUYq6qQIck3ADN8xRLoxhyC3zHeINFJRJ8uTvgOIdEO75wpERZ0sf/MdQKQbj/gOkFYq6mR5Bcj5DiHSiXpguu8QaaWiTpIg1wo86juGSCeeJMjpQswFoqJOnrt9BxDpxP2+A6SZijp5HgW+8B1CpINW4EHfIdJMRZ00Qa4ReMB3DJEOnibILfIdIs1U1Mmk3R8SJ7f4DpB2KupkmkY4S5mIb0uBv/gOkXYq6iQKcs3Anb5jiAB/IshpaoMCq/AdQPrsd8CPfYeYt6iViVO/+j1d8HkbF4+rZmmD46ZXmxk+yAC49FvVHLZVJTM+aOG02gaqK+Cu7w5iy3XKWNrgmDh1JY9NGoSZ+Xoq0jf/7TtAKTDnNHVsYgWZJ4CDfMdo19rm+MZVy3nxB4O5ZXYTQ6qM8/aqXmWZo+5eyeUHVVO31PHY/7Zw5SEDOPfxBg4fVcH+WY0bEuY1gtwuvkOUAu36SLbrfAfo6Mn3Wxm5ThmbDev6x6qyHOpbYGWzo7Ic3lvSxsfL2lTSyXST7wClQr8dyfYQUAdk/cYI/XlOM8dtX/nlv6+b1cTtrzczdkQ5Vx48gLUHGj/bp5ofPdTAwEr445EDOW9aA78aV93NWiWm/gnc6jtEqdCIOsmCXBswxXcMgKZWx4PzWjhm2/Bv/2ljq3jvrCHMPnUwGw0xzp0Wnl08ZsNyXvjBYKafNJgFn7cxYmgZDpg4dSUn3FfPwuVtHp+F9MI1OohYPCrq5LuRGLxV79F3W9hlozI2GBL+SG0wpIzyMqPMjB/uWsWsj1tXWd45xyXPNPJ/96vmoqcbueiAak7YsZJrX2zyEV96J0d4MFuKREWddEEuB/zGd4y7Vtvt8emyr0bG97/dzPbrr/qjdtvrzYzfqoK1Bxorm6HMwtvK5qJFlr77HUFO0xgUkfZRp8PVwE+AYT42vrLZ8cSCVm6YMPDL+y74ayOz/9GKAdlhZdwwYcAqy9/2ejPTThgEwE+/WcV376mnqhzu+u7A1Vcv8bISuMZ3iFKjt+elRZC5ELjEdwxJvSsJcuf5DlFqtOsjPa4B/uE7hKTaIjQY8EJFnRZBbgXwK98xJNV+SZBb6jtEKVJRp8sNwGzfISSV3iL8+RIPVNRpEl6q6zRABx4k386Jfr7EAxV12gS5F4CbfceQVKklyD3hO0QpU1GnUw3wme8QkgorgDN9hyh1Kuo0CnJLgAt8x5BU+E+C3Pu+Q5Q6FXV63Qbo5ar0x3PAb32HEJ3wkm5BZkPgDWC47yiSOMuBnQhyC3wHEY2o0y3I/QP4N98xJJF+qpKODxV12gW5h9FMZ9I79xPkdFGAGFFRl4bzgDm+Q0gizANO8h1CVqWiLgVBrgH4V8L9jiJdWQ4cSZBb5juIrEpFXSqC3FvA8YAuoSJd+T5B7m3fIeTrVNSlJMg9RHgyjMjqriDITfUdQjqnoi41Qe4K4A++Y0isPAr8zHcI6ZqKujSdBjzjO4TEwgvA0ZpwKd50wkupCjLrEp55Ntp3FPFmLrBvNOWAxJhG1KUqyC0Gvg3UeU4ifnwAHKKSTgYVdSkLch8B3wI+8R1FimoRcHD0/y8JoKIudeFpwuNQWZeKzwhLep7vINJzKmqBIDefsKw/9R1FCuojYD+C3Gu+g0jvrLGozcyZ2ZUd/n2emQVreMwRZrZtF18LzKyol5s3s5PNbEQ/Hj/GzA7r4msHmFnOzGZHt7/2cRtnm9mgvmbst7Cs9wHe9ZZBCuk9wgOH7/gOIr3XkxF1I3CUma3Xi/UeAXRa1MVmZuXAyUCfixoYA3Ra1JFnnXNjottBfdzG2UCvitrMKvq4rc6Fu0H2InzLlqTHW4QlXec7iPRNT4q6BbgROGf1L5jZZmb2pJm9EX3c1Mz2Ag4HrohGmCO7WrGZPWVmV5vZM2b2tpntZmb3mdm7ZnZJtEzWzN4xs9ui7UxtH3ma2bfM7DUze9PM/mBm1dH9dWb2CzN7DjgOGAvcGeUZGH3tJTObY2Y3mpl1yHO5mc0ys/lmtq+ZVQEXAxOjx0/syTfWzE6I1jPbzG6I/mBgZr83s5fN7C0zuyi67yzCPyTTzWx6dN/yDus62sxujT6/1cyuipa73MxGmtljZvaKmT1rZqOj5Y6Jnt/rZtbz90wHuUXAgcCDPX6MxNkLwP4EOe3WSrCe7qP+HTDJzDKr3X8dcLtzbkfgTuBa59xMwl/y86MR5ntrWHeTc24/4HrgAeDHwPbAyWa2brTMKODGaDtfAKeb2QDgVmCic24HoILwRI52Dc65fZxzdwAvA5OiPPXAdc653Zxz2wMDgQkdHlfhnNudcIT7S+dcE/AL4O7o8Xd38hz27bDr40Iz2waYCOztnBsDtAKTomUvdM6NBXYE9jezHZ1z1xIezBvnnBu3hu8XwNbAQc65cwn/iJ7pnNuVcJa8KdEyvwAOcc7tRPiHs+eCXD1wFOH/iSTXncAB0VsxJcF69NLZOfeFmd0OnAXUd/jSnoS/0AB/BH7dhwztI7c3gbecc58CmNkCYBNgKfChc25GtNwdUY4ngPedc/Oj+28jLPlron93VqjtxpnZBYS7GtYhfGn4UPS1+6KPrwDZHj6HZ51zX5a9mZ0B7Aq8FA3WBwL/jL58rJn9iPB7vxHhLqI3eriddvc651rNbAjhrop7o+0AVEcfZwC3mtk9HZ5Tz4Vnqp1GkJlP+P+a390sUkhtwM8Jcpf5DiL50ZtfvmuAV4FbulmmL6c5NkYf2zp83v7v9nyrr9cBRvdWdHZnNBKfAox1zn0YHRgd0EmeVvpeTgbc5pxbZf4EM9uccNS7m3Pu82h3xoBOHg+rPufVl2l/bmXA0mjUvuqDnTvVzPYAxgOzzWyMc673I6sgdzVB5kXCP3wb9/rxUmyfA8cT5B7zHUTyp8dvz3POLQHuAU7pcPdMwnmOIXxp/1z0+TJgaD4CRjY1sz2jz4+LtvMOkDWzLaP7vwc83cXjO+ZpL71F0Yj06B5sv7fP50ngaDNbH8DM1jGzzYC1CEs2Z2YbAId2s42FZraNmZUBR3a2EefcF8D7ZnZMtB0zs52iz0c65150zv2C8ASHTXqRf1VBbiawMzCtz+uQYngNGKuSTp/evo/6SqDjuz/OAr5vZm8QFuVPovv/DJwfHejr8mBiL7wNnBRtZx3g9865BuD7hC/73yQcgXe1T/VW4Hozm004Yr6JcFfL/wAv9WD704Fte3ow0Tk3F/g5MC3K/ASwkXPudcJfprcIZ7Cb0eFhNwKPth9MJJyO9GHgb3T//uZJwClm9nq03n+J7r8iOsg6h3ACptd78Dy7Fh5kPBT4JZrTOm5agcuAPXSdw3SK/aRMZpYFHo4O/EkcBJn9gZuBLde0qBTcAuBEgtyMNS4piaUzE6X3gtzThO9auZJwNCd+/Dewk0o6/WI/opaYCzK7ExaGXvEUz3vAWQS5R3wHkeLQiFr6J8jNInwrYgA0+A2TeisJj31sp5IuLRpRS/4EmSwwmfBkH8mve4FzCXIf+g4ixaeilvwLMnsSniSzj+8oKfAKcAFB7m++g4g/KmopnCAzAbgE2Ml3lAR6CbiIIFfrO4j4p6KWwgsy3wHOJ5zsSbr3ImFBP+o7iMSHilqKJ8jsSljYRwPlntPEiSM86/MqgpzO/pSvUVFL8YUHHc8ATgA28BvGqyXA7cAUgpwu2CBdUlGLP0GmAjgEOIlwKtbq7h+QCm3AU4Rndt5HkGvsfnERFbXERZAZRvi2vkmEU7emaddIC+GcLfcB/0OQW+g5jySMilriJ8isDRxMePmz7wDr+w3UJ8v5qpwfJMh97jmPJJiKWuItyBjhmY/fAfYGvgkM8xmpC0sIp999Jrq9Gl18QaTfVNSSLGFxjya8DuYuhCU+GhhexBQfAnMJp5WdC8wC5hDk9MskBaGilnQIMkOALaLbyOjjxkCmk9vq+78d4SyALYQXdlgU3T4jvJblJ8BHhBermEuQW1bgZyOyChW1lJ4gMyj6rAVoIcjpQggSaypqEZGY0zSnIiIxp6IWEYk5FbWISMypqEVEYk5FLSIScypqEZGYU1GLiMScilpEJOZU1CIiMaeiFhGJORW1iEjMqahFRGJORS0iEnMqahGRmFNRi4jEnIpaRCTmVNQiIjGnohYRiTkVtYhIzKmoRURiTkUtIhJzKmoRkZhTUYuIxJyKWkQk5lTUIiIxp6IWEYk5FbWISMz9f6bjxUcYMcfAAAAAAElFTkSuQmCC\n",
2676 | "text/plain": [
2677 | ""
2678 | ]
2679 | },
2680 | "metadata": {},
2681 | "output_type": "display_data"
2682 | }
2683 | ],
2684 | "source": [
2685 | "dataset.columns.size\n",
2686 | "imp_features_visual=['Important Features','Not Important Features']\n",
2687 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n",
2688 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')"
2689 | ]
2690 | },
2691 | {
2692 | "cell_type": "code",
2693 | "execution_count": 25,
2694 | "id": "65c60c3c",
2695 | "metadata": {},
2696 | "outputs": [],
2697 | "source": [
2698 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)"
2699 | ]
2700 | },
2701 | {
2702 | "cell_type": "code",
2703 | "execution_count": 26,
2704 | "id": "bc4902c0",
2705 | "metadata": {},
2706 | "outputs": [],
2707 | "source": [
2708 | "features=[]\n",
2709 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]"
2710 | ]
2711 | },
2712 | {
2713 | "cell_type": "code",
2714 | "execution_count": 27,
2715 | "id": "d12c9fbb",
2716 | "metadata": {},
2717 | "outputs": [
2718 | {
2719 | "name": "stdout",
2720 | "output_type": "stream",
2721 | "text": [
2722 | "1. feature LoaderFlags (0.000003)\n",
2723 | "2. feature NumberOfRvaAndSizes (0.000049)\n",
2724 | "3. feature SizeOfHeapCommit (0.000331)\n",
2725 | "4. feature BaseOfCode (0.000807)\n",
2726 | "5. feature SizeOfUninitializedData (0.000878)\n",
2727 | "6. feature ResourcesMeanSize (0.001154)\n",
2728 | "7. feature BaseOfData (0.001165)\n",
2729 | "8. feature ResourcesMaxSize (0.001197)\n",
2730 | "9. feature SectionsMeanVirtualsize (0.001212)\n",
2731 | "10. feature SizeOfImage (0.001226)\n",
2732 | "11. feature SectionMaxRawsize (0.001275)\n",
2733 | "12. feature SizeOfInitializedData (0.001280)\n",
2734 | "13. feature SectionMaxVirtualsize (0.001295)\n",
2735 | "14. feature SectionsMeanRawsize (0.001400)\n"
2736 | ]
2737 | }
2738 | ],
2739 | "source": [
2740 | "for f in range(nbfeatures):\n",
2741 | " print(\"%d. feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n",
2742 | " features.append(dataset.columns[2+f])"
2743 | ]
2744 | },
2745 | {
2746 | "cell_type": "code",
2747 | "execution_count": 28,
2748 | "id": "ffe54b76",
2749 | "metadata": {},
2750 | "outputs": [],
2751 | "source": [
2752 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n",
2753 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n",
2754 | " \"LogisticRegression\":LogisticRegression()\n",
2755 | " }"
2756 | ]
2757 | },
2758 | {
2759 | "cell_type": "code",
2760 | "execution_count": 29,
2761 | "id": "0ab0113e",
2762 | "metadata": {},
2763 | "outputs": [
2764 | {
2765 | "name": "stdout",
2766 | "output_type": "stream",
2767 | "text": [
2768 | "RandomForest : 0.9940963419051069\n",
2769 | "DecisionTree : 0.9900760593987685\n",
2770 | "LogisticRegression : 0.6964505613908004\n"
2771 | ]
2772 | },
2773 | {
2774 | "name": "stderr",
2775 | "output_type": "stream",
2776 | "text": [
2777 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n",
2778 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n",
2779 | "\n",
2780 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
2781 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
2782 | "Please also refer to the documentation for alternative solver options:\n",
2783 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
2784 | " n_iter_i = _check_optimize_result(\n"
2785 | ]
2786 | }
2787 | ],
2788 | "source": [
2789 | "results={}\n",
2790 | "for algo in model:\n",
2791 | " clf=model[algo]\n",
2792 | " clf.fit(x_train,y_train)\n",
2793 | " score=clf.score(x_test,y_test)\n",
2794 | " print(\"%s : %s\"%(algo,score))\n",
2795 | " results[algo]=score"
2796 | ]
2797 | },
2798 | {
2799 | "cell_type": "code",
2800 | "execution_count": 30,
2801 | "id": "4189adb6",
2802 | "metadata": {},
2803 | "outputs": [
2804 | {
2805 | "data": {
2806 | "text/plain": [
2807 | "'RandomForest'"
2808 | ]
2809 | },
2810 | "execution_count": 30,
2811 | "metadata": {},
2812 | "output_type": "execute_result"
2813 | }
2814 | ],
2815 | "source": [
2816 | "winner=max(results,key=results.get)\n",
2817 | "winner"
2818 | ]
2819 | },
2820 | {
2821 | "cell_type": "code",
2822 | "execution_count": 31,
2823 | "id": "4fb1c869",
2824 | "metadata": {},
2825 | "outputs": [
2826 | {
2827 | "name": "stdout",
2828 | "output_type": "stream",
2829 | "text": [
2830 | "False positive rate : 0.114760 %\n",
2831 | "False negative rate : 0.162137 %\n"
2832 | ]
2833 | }
2834 | ],
2835 | "source": [
2836 | "clf=model[winner]\n",
2837 | "res=clf.predict(x_new)\n",
2838 | "mt=confusion_matrix(y,res)\n",
2839 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
2840 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))"
2841 | ]
2842 | },
2843 | {
2844 | "cell_type": "code",
2845 | "execution_count": 32,
2846 | "id": "93c1012f",
2847 | "metadata": {},
2848 | "outputs": [],
2849 | "source": [
2850 | "# Check for Multicollinearity"
2851 | ]
2852 | },
2853 | {
2854 | "cell_type": "code",
2855 | "execution_count": 33,
2856 | "id": "978fd992",
2857 | "metadata": {},
2858 | "outputs": [
2859 | {
2860 | "data": {
2861 | "text/html": [
2862 | "\n",
2863 | "\n",
2876 | "
\n",
2877 | " \n",
2878 | " \n",
2879 | " | \n",
2880 | " Machine | \n",
2881 | " SizeOfOptionalHeader | \n",
2882 | " Characteristics | \n",
2883 | " MajorLinkerVersion | \n",
2884 | " MinorLinkerVersion | \n",
2885 | " SizeOfCode | \n",
2886 | " SizeOfInitializedData | \n",
2887 | " SizeOfUninitializedData | \n",
2888 | " AddressOfEntryPoint | \n",
2889 | " BaseOfCode | \n",
2890 | " ... | \n",
2891 | " ExportNb | \n",
2892 | " ResourcesNb | \n",
2893 | " ResourcesMeanEntropy | \n",
2894 | " ResourcesMinEntropy | \n",
2895 | " ResourcesMaxEntropy | \n",
2896 | " ResourcesMeanSize | \n",
2897 | " ResourcesMinSize | \n",
2898 | " ResourcesMaxSize | \n",
2899 | " LoadConfigurationSize | \n",
2900 | " VersionInformationSize | \n",
2901 | "
\n",
2902 | " \n",
2903 | " \n",
2904 | " \n",
2905 | " 0 | \n",
2906 | " 332 | \n",
2907 | " 224 | \n",
2908 | " 258 | \n",
2909 | " 9 | \n",
2910 | " 0 | \n",
2911 | " 361984 | \n",
2912 | " 115712 | \n",
2913 | " 0 | \n",
2914 | " 6135 | \n",
2915 | " 4096 | \n",
2916 | " ... | \n",
2917 | " 0 | \n",
2918 | " 4 | \n",
2919 | " 3.262823 | \n",
2920 | " 2.568844 | \n",
2921 | " 3.537939 | \n",
2922 | " 8797.000000 | \n",
2923 | " 216 | \n",
2924 | " 18032 | \n",
2925 | " 0 | \n",
2926 | " 16 | \n",
2927 | "
\n",
2928 | " \n",
2929 | " 1 | \n",
2930 | " 332 | \n",
2931 | " 224 | \n",
2932 | " 3330 | \n",
2933 | " 9 | \n",
2934 | " 0 | \n",
2935 | " 130560 | \n",
2936 | " 19968 | \n",
2937 | " 0 | \n",
2938 | " 81778 | \n",
2939 | " 4096 | \n",
2940 | " ... | \n",
2941 | " 0 | \n",
2942 | " 2 | \n",
2943 | " 4.250461 | \n",
2944 | " 3.420744 | \n",
2945 | " 5.080177 | \n",
2946 | " 837.000000 | \n",
2947 | " 518 | \n",
2948 | " 1156 | \n",
2949 | " 72 | \n",
2950 | " 18 | \n",
2951 | "
\n",
2952 | " \n",
2953 | " 2 | \n",
2954 | " 332 | \n",
2955 | " 224 | \n",
2956 | " 3330 | \n",
2957 | " 9 | \n",
2958 | " 0 | \n",
2959 | " 517120 | \n",
2960 | " 621568 | \n",
2961 | " 0 | \n",
2962 | " 350896 | \n",
2963 | " 4096 | \n",
2964 | " ... | \n",
2965 | " 1 | \n",
2966 | " 11 | \n",
2967 | " 4.426324 | \n",
2968 | " 2.846449 | \n",
2969 | " 5.271813 | \n",
2970 | " 31102.272727 | \n",
2971 | " 104 | \n",
2972 | " 270376 | \n",
2973 | " 72 | \n",
2974 | " 18 | \n",
2975 | "
\n",
2976 | " \n",
2977 | " 3 | \n",
2978 | " 332 | \n",
2979 | " 224 | \n",
2980 | " 258 | \n",
2981 | " 9 | \n",
2982 | " 0 | \n",
2983 | " 585728 | \n",
2984 | " 369152 | \n",
2985 | " 0 | \n",
2986 | " 451258 | \n",
2987 | " 4096 | \n",
2988 | " ... | \n",
2989 | " 1 | \n",
2990 | " 10 | \n",
2991 | " 4.364291 | \n",
2992 | " 2.669314 | \n",
2993 | " 6.400720 | \n",
2994 | " 1457.000000 | \n",
2995 | " 90 | \n",
2996 | " 4264 | \n",
2997 | " 72 | \n",
2998 | " 18 | \n",
2999 | "
\n",
3000 | " \n",
3001 | " 4 | \n",
3002 | " 332 | \n",
3003 | " 224 | \n",
3004 | " 258 | \n",
3005 | " 9 | \n",
3006 | " 0 | \n",
3007 | " 294912 | \n",
3008 | " 247296 | \n",
3009 | " 0 | \n",
3010 | " 217381 | \n",
3011 | " 4096 | \n",
3012 | " ... | \n",
3013 | " 1 | \n",
3014 | " 2 | \n",
3015 | " 4.306100 | \n",
3016 | " 3.421598 | \n",
3017 | " 5.190603 | \n",
3018 | " 1074.500000 | \n",
3019 | " 849 | \n",
3020 | " 1300 | \n",
3021 | " 72 | \n",
3022 | " 18 | \n",
3023 | "
\n",
3024 | " \n",
3025 | " ... | \n",
3026 | " ... | \n",
3027 | " ... | \n",
3028 | " ... | \n",
3029 | " ... | \n",
3030 | " ... | \n",
3031 | " ... | \n",
3032 | " ... | \n",
3033 | " ... | \n",
3034 | " ... | \n",
3035 | " ... | \n",
3036 | " ... | \n",
3037 | " ... | \n",
3038 | " ... | \n",
3039 | " ... | \n",
3040 | " ... | \n",
3041 | " ... | \n",
3042 | " ... | \n",
3043 | " ... | \n",
3044 | " ... | \n",
3045 | " ... | \n",
3046 | " ... | \n",
3047 | "
\n",
3048 | " \n",
3049 | " 138042 | \n",
3050 | " 332 | \n",
3051 | " 224 | \n",
3052 | " 258 | \n",
3053 | " 11 | \n",
3054 | " 0 | \n",
3055 | " 205824 | \n",
3056 | " 223744 | \n",
3057 | " 0 | \n",
3058 | " 123291 | \n",
3059 | " 4096 | \n",
3060 | " ... | \n",
3061 | " 0 | \n",
3062 | " 7 | \n",
3063 | " 4.122736 | \n",
3064 | " 1.370260 | \n",
3065 | " 7.677091 | \n",
3066 | " 14900.714286 | \n",
3067 | " 16 | \n",
3068 | " 81654 | \n",
3069 | " 72 | \n",
3070 | " 0 | \n",
3071 | "
\n",
3072 | " \n",
3073 | " 138043 | \n",
3074 | " 332 | \n",
3075 | " 224 | \n",
3076 | " 33167 | \n",
3077 | " 2 | \n",
3078 | " 25 | \n",
3079 | " 37888 | \n",
3080 | " 185344 | \n",
3081 | " 0 | \n",
3082 | " 40000 | \n",
3083 | " 4096 | \n",
3084 | " ... | \n",
3085 | " 0 | \n",
3086 | " 26 | \n",
3087 | " 3.377663 | \n",
3088 | " 2.031619 | \n",
3089 | " 5.050074 | \n",
3090 | " 6905.846154 | \n",
3091 | " 44 | \n",
3092 | " 67624 | \n",
3093 | " 0 | \n",
3094 | " 15 | \n",
3095 | "
\n",
3096 | " \n",
3097 | " 138044 | \n",
3098 | " 332 | \n",
3099 | " 224 | \n",
3100 | " 258 | \n",
3101 | " 10 | \n",
3102 | " 0 | \n",
3103 | " 118272 | \n",
3104 | " 380416 | \n",
3105 | " 0 | \n",
3106 | " 59610 | \n",
3107 | " 4096 | \n",
3108 | " ... | \n",
3109 | " 0 | \n",
3110 | " 22 | \n",
3111 | " 6.825406 | \n",
3112 | " 2.617026 | \n",
3113 | " 7.990487 | \n",
3114 | " 14981.909091 | \n",
3115 | " 48 | \n",
3116 | " 22648 | \n",
3117 | " 72 | \n",
3118 | " 14 | \n",
3119 | "
\n",
3120 | " \n",
3121 | " 138045 | \n",
3122 | " 332 | \n",
3123 | " 224 | \n",
3124 | " 33166 | \n",
3125 | " 2 | \n",
3126 | " 25 | \n",
3127 | " 49152 | \n",
3128 | " 16896 | \n",
3129 | " 0 | \n",
3130 | " 51216 | \n",
3131 | " 4096 | \n",
3132 | " ... | \n",
3133 | " 0 | \n",
3134 | " 10 | \n",
3135 | " 3.421627 | \n",
3136 | " 2.060964 | \n",
3137 | " 4.739744 | \n",
3138 | " 601.600000 | \n",
3139 | " 16 | \n",
3140 | " 2216 | \n",
3141 | " 0 | \n",
3142 | " 0 | \n",
3143 | "
\n",
3144 | " \n",
3145 | " 138046 | \n",
3146 | " 332 | \n",
3147 | " 224 | \n",
3148 | " 258 | \n",
3149 | " 11 | \n",
3150 | " 0 | \n",
3151 | " 111616 | \n",
3152 | " 468480 | \n",
3153 | " 0 | \n",
3154 | " 22731 | \n",
3155 | " 4096 | \n",
3156 | " ... | \n",
3157 | " 0 | \n",
3158 | " 4 | \n",
3159 | " 4.407252 | \n",
3160 | " 1.980482 | \n",
3161 | " 6.115374 | \n",
3162 | " 96625.000000 | \n",
3163 | " 20 | \n",
3164 | " 318464 | \n",
3165 | " 72 | \n",
3166 | " 0 | \n",
3167 | "
\n",
3168 | " \n",
3169 | "
\n",
3170 | "
138047 rows × 54 columns
\n",
3171 | "
"
3172 | ],
3173 | "text/plain": [
3174 | " Machine SizeOfOptionalHeader Characteristics MajorLinkerVersion \\\n",
3175 | "0 332 224 258 9 \n",
3176 | "1 332 224 3330 9 \n",
3177 | "2 332 224 3330 9 \n",
3178 | "3 332 224 258 9 \n",
3179 | "4 332 224 258 9 \n",
3180 | "... ... ... ... ... \n",
3181 | "138042 332 224 258 11 \n",
3182 | "138043 332 224 33167 2 \n",
3183 | "138044 332 224 258 10 \n",
3184 | "138045 332 224 33166 2 \n",
3185 | "138046 332 224 258 11 \n",
3186 | "\n",
3187 | " MinorLinkerVersion SizeOfCode SizeOfInitializedData \\\n",
3188 | "0 0 361984 115712 \n",
3189 | "1 0 130560 19968 \n",
3190 | "2 0 517120 621568 \n",
3191 | "3 0 585728 369152 \n",
3192 | "4 0 294912 247296 \n",
3193 | "... ... ... ... \n",
3194 | "138042 0 205824 223744 \n",
3195 | "138043 25 37888 185344 \n",
3196 | "138044 0 118272 380416 \n",
3197 | "138045 25 49152 16896 \n",
3198 | "138046 0 111616 468480 \n",
3199 | "\n",
3200 | " SizeOfUninitializedData AddressOfEntryPoint BaseOfCode ... \\\n",
3201 | "0 0 6135 4096 ... \n",
3202 | "1 0 81778 4096 ... \n",
3203 | "2 0 350896 4096 ... \n",
3204 | "3 0 451258 4096 ... \n",
3205 | "4 0 217381 4096 ... \n",
3206 | "... ... ... ... ... \n",
3207 | "138042 0 123291 4096 ... \n",
3208 | "138043 0 40000 4096 ... \n",
3209 | "138044 0 59610 4096 ... \n",
3210 | "138045 0 51216 4096 ... \n",
3211 | "138046 0 22731 4096 ... \n",
3212 | "\n",
3213 | " ExportNb ResourcesNb ResourcesMeanEntropy ResourcesMinEntropy \\\n",
3214 | "0 0 4 3.262823 2.568844 \n",
3215 | "1 0 2 4.250461 3.420744 \n",
3216 | "2 1 11 4.426324 2.846449 \n",
3217 | "3 1 10 4.364291 2.669314 \n",
3218 | "4 1 2 4.306100 3.421598 \n",
3219 | "... ... ... ... ... \n",
3220 | "138042 0 7 4.122736 1.370260 \n",
3221 | "138043 0 26 3.377663 2.031619 \n",
3222 | "138044 0 22 6.825406 2.617026 \n",
3223 | "138045 0 10 3.421627 2.060964 \n",
3224 | "138046 0 4 4.407252 1.980482 \n",
3225 | "\n",
3226 | " ResourcesMaxEntropy ResourcesMeanSize ResourcesMinSize \\\n",
3227 | "0 3.537939 8797.000000 216 \n",
3228 | "1 5.080177 837.000000 518 \n",
3229 | "2 5.271813 31102.272727 104 \n",
3230 | "3 6.400720 1457.000000 90 \n",
3231 | "4 5.190603 1074.500000 849 \n",
3232 | "... ... ... ... \n",
3233 | "138042 7.677091 14900.714286 16 \n",
3234 | "138043 5.050074 6905.846154 44 \n",
3235 | "138044 7.990487 14981.909091 48 \n",
3236 | "138045 4.739744 601.600000 16 \n",
3237 | "138046 6.115374 96625.000000 20 \n",
3238 | "\n",
3239 | " ResourcesMaxSize LoadConfigurationSize VersionInformationSize \n",
3240 | "0 18032 0 16 \n",
3241 | "1 1156 72 18 \n",
3242 | "2 270376 72 18 \n",
3243 | "3 4264 72 18 \n",
3244 | "4 1300 72 18 \n",
3245 | "... ... ... ... \n",
3246 | "138042 81654 72 0 \n",
3247 | "138043 67624 0 15 \n",
3248 | "138044 22648 72 14 \n",
3249 | "138045 2216 0 0 \n",
3250 | "138046 318464 72 0 \n",
3251 | "\n",
3252 | "[138047 rows x 54 columns]"
3253 | ]
3254 | },
3255 | "execution_count": 33,
3256 | "metadata": {},
3257 | "output_type": "execute_result"
3258 | }
3259 | ],
3260 | "source": [
3261 | "mc=dataset.drop([\"Name\",'md5','legitimate'],axis=1) #independent features\n",
3262 | "mc"
3263 | ]
3264 | },
3265 | {
3266 | "cell_type": "code",
3267 | "execution_count": 34,
3268 | "id": "6adb0606",
3269 | "metadata": {},
3270 | "outputs": [
3271 | {
3272 | "name": "stdout",
3273 | "output_type": "stream",
3274 | "text": [
3275 | "Variance Inflation Factor for Machine: 1.19\n",
3276 | "Variance Inflation Factor for SizeOfOptionalHeader: 0.02\n",
3277 | "Variance Inflation Factor for Characteristics: 1.43\n",
3278 | "Variance Inflation Factor for MajorLinkerVersion: 1.19\n",
3279 | "Variance Inflation Factor for MinorLinkerVersion: 1.5\n",
3280 | "Variance Inflation Factor for SizeOfCode: 5.13\n",
3281 | "Variance Inflation Factor for SizeOfInitializedData: 1.57\n",
3282 | "Variance Inflation Factor for SizeOfUninitializedData: 1.0\n",
3283 | "Variance Inflation Factor for AddressOfEntryPoint: 1.07\n",
3284 | "Variance Inflation Factor for BaseOfCode: 4.27\n",
3285 | "Variance Inflation Factor for BaseOfData: 1.92\n",
3286 | "Variance Inflation Factor for ImageBase: 1.0\n",
3287 | "Variance Inflation Factor for SectionAlignment: 2.06\n",
3288 | "Variance Inflation Factor for FileAlignment: 1.09\n",
3289 | "Variance Inflation Factor for MajorOperatingSystemVersion: 1.0\n",
3290 | "Variance Inflation Factor for MinorOperatingSystemVersion: 4.16\n",
3291 | "Variance Inflation Factor for MajorImageVersion: 203.26\n",
3292 | "Variance Inflation Factor for MinorImageVersion: 186.8\n",
3293 | "Variance Inflation Factor for MajorSubsystemVersion: 0.6\n",
3294 | "Variance Inflation Factor for MinorSubsystemVersion: 17345.88\n",
3295 | "Variance Inflation Factor for SizeOfImage: 2.86\n",
3296 | "Variance Inflation Factor for SizeOfHeaders: 1.05\n",
3297 | "Variance Inflation Factor for CheckSum: 1.04\n",
3298 | "Variance Inflation Factor for Subsystem: 0.65\n",
3299 | "Variance Inflation Factor for DllCharacteristics: 1.63\n",
3300 | "Variance Inflation Factor for SizeOfStackReserve: 1.31\n",
3301 | "Variance Inflation Factor for SizeOfStackCommit: 1.03\n",
3302 | "Variance Inflation Factor for SizeOfHeapReserve: 0.57\n",
3303 | "Variance Inflation Factor for SizeOfHeapCommit: 140.51\n",
3304 | "Variance Inflation Factor for LoaderFlags: 143.64\n",
3305 | "Variance Inflation Factor for NumberOfRvaAndSizes: 4.65\n",
3306 | "Variance Inflation Factor for SectionsNb: 1.15\n",
3307 | "Variance Inflation Factor for SectionsMeanEntropy: 1.03\n",
3308 | "Variance Inflation Factor for SectionsMinEntropy: 1.18\n",
3309 | "Variance Inflation Factor for SectionsMaxEntropy: 0.7\n",
3310 | "Variance Inflation Factor for SectionsMeanRawsize: 30.3\n",
3311 | "Variance Inflation Factor for SectionsMinRawsize: 619.0\n",
3312 | "Variance Inflation Factor for SectionMaxRawsize: 26.68\n",
3313 | "Variance Inflation Factor for SectionsMeanVirtualsize: 138.58\n",
3314 | "Variance Inflation Factor for SectionsMinVirtualsize: 622.11\n",
3315 | "Variance Inflation Factor for SectionMaxVirtualsize: 146.14\n",
3316 | "Variance Inflation Factor for ImportsNbDLL: 1.42\n",
3317 | "Variance Inflation Factor for ImportsNb: 1.2\n",
3318 | "Variance Inflation Factor for ImportsNbOrdinal: 1.28\n",
3319 | "Variance Inflation Factor for ExportNb: 1.06\n",
3320 | "Variance Inflation Factor for ResourcesNb: 1.24\n",
3321 | "Variance Inflation Factor for ResourcesMeanEntropy: 0.89\n",
3322 | "Variance Inflation Factor for ResourcesMinEntropy: 0.88\n",
3323 | "Variance Inflation Factor for ResourcesMaxEntropy: 1.16\n",
3324 | "Variance Inflation Factor for ResourcesMeanSize: 13.04\n",
3325 | "Variance Inflation Factor for ResourcesMinSize: 7.14\n",
3326 | "Variance Inflation Factor for ResourcesMaxSize: 4.39\n",
3327 | "Variance Inflation Factor for LoadConfigurationSize: 1.0\n"
3328 | ]
3329 | }
3330 | ],
3331 | "source": [
3332 | "for i in range(len(mc.columns[:-1])):\n",
3333 | " v=vif(np.matrix(mc[:-1]),i)\n",
3334 | " print(\"Variance Inflation Factor for {}: {}\".format(mc.columns[i],round(v,2)))"
3335 | ]
3336 | },
3337 | {
3338 | "cell_type": "code",
3339 | "execution_count": 35,
3340 | "id": "5cc83368",
3341 | "metadata": {},
3342 | "outputs": [
3343 | {
3344 | "name": "stdout",
3345 | "output_type": "stream",
3346 | "text": [
3347 | "Variance Inflation Factor for MajorImageVersion : 203.26\n",
3348 | "Variance Inflation Factor for MinorImageVersion : 186.8\n",
3349 | "Variance Inflation Factor for MinorSubsystemVersion : 17345.88\n",
3350 | "Variance Inflation Factor for SizeOfHeapCommit : 140.51\n",
3351 | "Variance Inflation Factor for LoaderFlags : 143.64\n",
3352 | "Variance Inflation Factor for SectionsMeanRawsize : 30.3\n",
3353 | "Variance Inflation Factor for SectionsMinRawsize : 619.0\n",
3354 | "Variance Inflation Factor for SectionMaxRawsize : 26.68\n",
3355 | "Variance Inflation Factor for SectionsMeanVirtualsize : 138.58\n",
3356 | "Variance Inflation Factor for SectionsMinVirtualsize : 622.11\n",
3357 | "Variance Inflation Factor for SectionMaxVirtualsize : 146.14\n",
3358 | "Variance Inflation Factor for ResourcesMeanSize : 13.04\n",
3359 | "12\n"
3360 | ]
3361 | }
3362 | ],
3363 | "source": [
3364 | "count=0\n",
3365 | "for i in range(len(mc.columns[:-1])):\n",
3366 | " v=vif(np.matrix(mc[:-1]),i)\n",
3367 | " if v>10:\n",
3368 | " print(\"Variance Inflation Factor for {} : {}\".format(mc.columns[i],round(v,2)))\n",
3369 | " count=count+1\n",
3370 | "print(count) "
3371 | ]
3372 | },
3373 | {
3374 | "cell_type": "code",
3375 | "execution_count": 38,
3376 | "id": "28be04aa",
3377 | "metadata": {},
3378 | "outputs": [],
3379 | "source": [
3380 | "# Remove Multicollinearity"
3381 | ]
3382 | },
3383 | {
3384 | "cell_type": "code",
3385 | "execution_count": 39,
3386 | "id": "57f8e502",
3387 | "metadata": {},
3388 | "outputs": [],
3389 | "source": [
3390 | "x=dataset.drop(['Name','md5','legitimate','MajorImageVersion','MinorImageVersion','MinorSubsystemVersion','SizeOfHeapCommit','LoaderFlags','SectionsMeanRawsize','SectionsMeanVirtualsize','ResourcesMeanSize'],axis=1).values\n",
3391 | "y=dataset['legitimate'].values #dependent variable"
3392 | ]
3393 | },
3394 | {
3395 | "cell_type": "code",
3396 | "execution_count": 40,
3397 | "id": "74d1f980",
3398 | "metadata": {},
3399 | "outputs": [],
3400 | "source": [
3401 | "extratrees=ek.ExtraTreesClassifier().fit(x,y)\n",
3402 | "model=SelectFromModel(extratrees,prefit=True)\n",
3403 | "x_new=model.transform(x)\n",
3404 | "nbfeatures=x_new.shape[1]"
3405 | ]
3406 | },
3407 | {
3408 | "cell_type": "code",
3409 | "execution_count": 41,
3410 | "id": "9a959102",
3411 | "metadata": {},
3412 | "outputs": [
3413 | {
3414 | "data": {
3415 | "text/plain": [
3416 | "12"
3417 | ]
3418 | },
3419 | "execution_count": 41,
3420 | "metadata": {},
3421 | "output_type": "execute_result"
3422 | }
3423 | ],
3424 | "source": [
3425 | "nbfeatures"
3426 | ]
3427 | },
3428 | {
3429 | "cell_type": "code",
3430 | "execution_count": 42,
3431 | "id": "3cbf22c4",
3432 | "metadata": {},
3433 | "outputs": [
3434 | {
3435 | "data": {
3436 | "text/plain": [
3437 | "([,\n",
3438 | " ],\n",
3439 | " [Text(0.8680545570066952, 0.675633988236168, 'Important Features'),\n",
3440 | " Text(-0.8680544937492721, -0.675634069509298, 'Not Important Features')],\n",
3441 | " [Text(0.4734843038218337, 0.3685276299470007, '21%'),\n",
3442 | " Text(-0.47348426931778476, -0.36852767427779887, '79%')])"
3443 | ]
3444 | },
3445 | "execution_count": 42,
3446 | "metadata": {},
3447 | "output_type": "execute_result"
3448 | },
3449 | {
3450 | "data": {
3451 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAADnCAYAAAAU2k2EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAeBElEQVR4nO3deZhcVb3u8e+vuzMPBSFAAgIlCfMUITJkkghHzrERmQMGAceDM6Cc24B6N6AYVDyKyHSvCiigGEZpRjFIIIKBEAhDEjB2mAMkUJl7qFrnj71z6IROp7pTVav2qvfzPPV0d3XV3m9leGvV2pM55xARkfDU+Q4gIiLloYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFAqeBGRQKngRUQCpYIXEQmUCl5EJFANvgOIVESUGQCMAkYDOwMjgK2S2xbAkOQ2ACgAHV3cVgFLgDc3uL0B/JMol6vY6xEpgjnnfGcQKZ0o0wcYAxwM7AvsQlzq2wFW5rW/Cjyb3OYlX58nyq0t83pFuqSCl3SLMiOBccSFfgiwP/EovFp0AHOAma+64Q9OaL1sVsu0Ro30pSJU8JI+UeYA4FPAUcBHPKcp2rxCduan2i4eBzwBPADcDTzWMq1R/wmlLFTwUv2iTF/gcOJSPxL4kN9AvTOt/aRZV+WPGrfB3S8DfwT+0DKtcY6HWBIwFbxUr3ikfjpwMvHG0FQbt/ayN15n+MhuHrKQuOxvapnW+EKFYknAVPBSXaLMEGAq8J/EG0uD0OYaWnZtvT7bg6c8DvwcmN4yrbGjLKEkeCp4qQ5RZifgHOA0YLDnNCU3tzBq5tFtF03sxVNfBS4HrmmZ1vhuiWNJ4FTw4leU2QU4FzgF6OM5Tdlc1H7KrF/nP7nh/HtPrAauB37eMq1xQYliSeBU8OJHlNkbOB84Aaj3nKbsDlp7+VtLGLZNCRZVAH4HfK9lWuMrJVieBEwFL5UVZbLAj4HjKf+BR1Wh1TUs2q31+p1LvNi1xFM3F2vqRjZGBS+VEWUGAk3E8+z9PaepqCcLuzx8XNsFk8q0+HeBHwG/bJnWqCNmZT062ZiUX5Q5CZgPfI8aK3eAO/OHlHPbwpbEn4gWZpuajy3jeiSFNIKX8onn2a8AerP3SBCcw41tvXLpUjLDK7TK6cDXWqY1vlWh9UkV0wheSi/K1BFlziE+JL9myx2glT4vVbDcId628Xy2qfmUCq5TqpQKXkor3og6g3jaoJ/fMP4943Z+w8NqtwJ+l21qbs42NafytA5SGip4KZ0o83ngGaBcGxRT5878uL4eV/9J4LlsU/MUjxnEI83By+aLMlsAvwWO9hukujiH27/1qnffZegw31mIT3twjk57UFs0gpfNE2X2IZ5rP9pzkqqzhr4Lq6TcAc4EZmSbmrs72ZkERgUvvRdlTgT+TnwpPNnA3MLoN31n2MAEYE62qVlTaDVCBS89F2WMKHMB8altB/mOU61uL4yvpitLrTMCeDDb1Hym7yBSfpqDl56JMv2JT3p1gu8o1cw5Cvu1XrNiOYMzvrN041LieXmVQKA0gpfiRZnBxJeZU7lvwmr6Lajycgf4NvCbbFNzg+8gUh4qeClOvKfMA8Bkz0lSYU5h17QcSXo6cEu2qbnmTiFRC1TwsmlRZmvig5cO9h0lLW7Pjx/oO0MPHAXcl21q3qxPHGa2skR5il1f1sw+s5nLONPMuvy7MrOHzGyBmc1Nbsf3YvljzOyTm5Nxc6jgpXtRZnvgYQK6fF65OUf+vsLY3Xzn6KFJwEPZpuZq2a2zW2bWAGSBzSp44t1Hu3sznuqcG5Pcpvdi+WOIDzgrmsVK0s0qeNm4KLMdMBPY3XeUNFnJgPkrGTjUd45eGAPcnW1q3qw9o8zsUDP7m5ndbGYLzWyamU01s3+Y2TwzG5U87lozu8rMZiaPOzK5v7+Z/TZ57FNmNjm5/3Qz+5OZ/Rm4H5gGTExG12clI/qZZjYnuY3rlOchM5tuZvPN7IakRL8JbAfMMLMZRb62rc3sFjObndzGJ/cfaGazkryzzGw3M+sLXAhMSTJOMbPIzL7TaXnPJrmzZvaCmV0BzAF2MLNzknU8Y2YXJI8fZGbNZvZ08txuj1LWxhXpWpTJAPcCH/YdJW2eKOz6ju8Mm+Eg4NZsU/OnWqY1tm3GcvYD9gCWAYuA/++cO9DMvgV8g3jkDPEo/GPEx1LMMLPRwNcAnHP7mNnuwP1mtmvy+EOAfZ1zy8zsUOA7zrl1bwwDgX9zzq01s12Am4CxyfM+AuwFvA48Cox3zl1mZmcDk51zG/s7u8HM1iTfHwb8Avhv59wjZrYjcF/yOucDk5xzHWZ2OHCxc+44M/s+MNY59/UkY9TNn9luwOecc181s08AuwAHEl8Y504zmwRsDbzunGtMltfttJoKXj4oyvQD7gD28R0ljW7PT0j7sQGfID5Z2ckt0xoLvVzGbOfcGwBm9k/iETfAPNbfUH+zc64AvGhmi4g/LU4AfgngnJtvZouBdQX/gHNu2UbW2Qe43MzGAPlOzwH4h3Pu1STPXOI3lkeKeB1TnXNPrPshKe89zf73YmRDzWwIkAGuS95YHL27vvBi59xjyfefSG5PJT8PJi78mcBPzewS4C7n3MzuFqiCl/VFmTrgBuJRlfSQc3T8pbB/CFNaJxKPvr/Sy+e3dvq+0OnnAuv3zob74Du6v5Tjqm5+dxawhPjTQx3xZQ27ypOn991XBxzinFvT+U4z+yUwwzl3jJllgYc28vwO1p8a77z3UufXZsCPnHNXb7gAMzuAeF7/R2Z2v3Puwu7CinR2OXCc7xBptYKBL6xiwGDfOUrkjGxT80VlXscJZlaXzMvvDCwg3qg/FSCZmtkxuX9DK4AhnX7OAG8knwg+S3EXc99wGZtyP/D1dT8knxbWrfu15PvTu1l+C7B/8tz92fgU6H3A581scPLY7c1sGzPbDljtnPs98NN1y9oYFby8L8r8F70fsQnweGH3pb4zlNh3y3zxkAXA34B7gDOcc2uJrwJWb2bziE+HcbpzrrWL5z4DdCQbHM9KnneamT1GPD3T3Wh/nWuAe4rdyAp8ExibbPh8Hjgjuf/HxCPqR1n/jWUG8ZTO3GSD6C3AsGSa6CvAwq5W4py7H7gR+Hvy5zCd+I1iH+AfyfPPB37QXVidqkBiUWYy8YFMxYx6ZCO+2vatOXcXDup2VJVCa4DxLdMan9rkI3vAzK4lnkfuze6HUgSN4GXd7pA3oXLfLM7RNqMwJoT59w0NAG7LNjVv5TuI9IwKvtZFmQbij8Hb+o6SdjkGzV9DvzQdwdoTOxHvWdPdBtAecc6drtF7eang5RLi3dJkMz1W2HNju++F4j+Ac32HkOKp4GtZlDkWONt3jFDcmp9Q7WePLIULs03NE32HkOJoI2utijLbAs8BmlctAedo3b31WtdK31o4K+NLwL4t0xrXbPKR4pVG8LXrKlTuJfMuQ16okXIHGE18jhWpcir4WhRlPoMukl1Sswp75XxnqLCzsk3NH/UdQrqngq81UWYr4Oe+Y4Tm1vyELXxnqLB64qtB9fUdRDZOBV97fkZ8RjopEedYM7Ow7x6+c3iwN3Ce7xCycSr4WhJlPgac6jtGaJYy9IV2Gmp1JHtetql5L98hpGsq+FoRZYz45ERSYjML+yz3ncGjPsTHUkgVUsHXjpN4/+IHUkK35ifW+t5IjdmmZp1eugqp4GtBlOkL/NB3jBA5x6q/F/YM8fwzPaVRfBVSwdeGr6FL75XF22Tmd9DQm6v3hOagbFOzriNQZVTwoYsyWwDf9R0jVA8X9lvhO0MVuTjb1KyrxFURFXz4vg0M8x0iVLfmJ2iX0/ftCnzRdwh5n85FE7IoMxB4BRV8WTjHitGtvxuQp16j1ve1AKNbpjXmfQcRjeBD9zlU7mWzhC3nq9w/IItOg1E19I8zVFGmjvgq81ImD+XHrC7HcjuWv807zT8jv/JdzOoYPOYIho79NKvmP0LukRtpX/oKI079Gf1G7gLA2lefZ9n9V2D1fRh+1Dn02XI7CmtX8vYdl7DNiRdiVrJrdBTrTOJrj4pnGsGH62hglO8QISvb/HtdPVtO/gLbf+kqRnz2p6yY00zbOy/Td/hObH3MefTbYf0DR5fPvo2tjz6XLSadyoqn7gbgvVl/IHPIiT7KHWBCtqn5AB8rlvWp4MP1bd8BQuYcuSfcbruVY9kNg4fRb8RoAOr6DaTPVjuQX7GUPsN3oM9WH/rA462uAdfRhutoxeoaaH/3DfIrltJ/x33KEa9Y+vRYBVTwIYoyY4FxvmOE7HW2WlCgruwXKe/ILaFtySL6bbfx95LMwSew9N7LWf7EHQzZ/0jee/h6tph4SrmjbcqJ2abmkb5D1DrNwYfpNN8BQvfX/EfKfjWjQtsa3r7tYoYd9iXqurmWd99td2bkqZcCsPaVZ6kfHG9Xf/uOS7C6erb8+BeoH7RlueNuqA/wn0BU6RXL+zSCD02UaSA+74yU0a35iSPKuXyX7+Dt2y5m0J6HMnC34j6MOefIzfojmfEn896jN7LFhM8waK/JLH/yz+WM2p3P+FqxxFTw4fl3YLjvECErON6b60btWq7lO+dYes8v6LPVDgw98Jiin7fq2QcZMGos9f0H49pbwerALP7ej120sdUvTdGE57O+A4TuNTd8gaPuoHItv/W151n13Az6bJ3l9d9+A4AtJ52Ky7ez7IGrya/J8db0C+i7zYfZdspFABTa17Ly2QfZ9sT456EfPZq3b7sYq29g+FH/Va6oxTgJeNJngFqmI1lDEmWGAm8CA3xHCdlvO4742wUdp+n0uMV5Gci2TGtU0XigKZqwHIvKvexuy0/YzneGFNkROMR3iFqlgg/L0b4DhK7gbOkzbufRvnOkjDb6e6KCD0V8UY/DfMcI3ctum4Xg5/DQFCt+S7GUlAo+HBOBwb5DhO6BwgHtvjOk0IeyTc27+A5Ri1Tw4TjCd4BacFt+wva+M6TUZN8BapEKPhyanimzvLO3n3dZncCtd1TwHqjgQxBlhgFjfMcI3WI34kXfGVLsUN8BapEKPgzj0d9l2d1XGNvhO0OKjcg2Ne/uO0StUSmEQYeDV8Dt+fE7+M6QcpqmqTAVfBg+4jtA6PLOlixwO37Yd46U0ymsK0wFH4b9fQcI3SI38iXfGQKw16YfIqWkgk+7KLM18MHL/EhJ3Vs4sOA7QwB2zzY1q3MqSH/Y6afpmQq4LT9hR98ZAjAA2Nl3iFqigk8/FXyZdbi61xe57XbynSMQmqapIBV8+ukQ8DJ7yW3/L98ZAqKCryAVfPpp6qDM7skfqHOZl44KvoJU8OmnfbPL7PbC+KzvDAHRJ84KUsGnnwq+jNpd/auL3QjtpVQ62/gOUEtU8GkWn4NmkO8YIVvoPtTiO0NgtvYdoJao4NNNo/cya84frIt7lNbAbFOzBiUVooJPN52bvMzuLIzT6QlKT6P4ClHBp9sQ3wFC1u7qF7/qttYFtktPBV8hKvh0G+A7QMhecDu+7DtDoLShtUJU8Ommgi+ju/KH6P9HeQzzHaBW6B9wuqngy+jO/LjRvjMEqq/vALVCBZ9uA30HCFWra/jXmwzb1neOQDX4DlArVPDpphF8mTzvsq/6zhCwet8BaoXeSdOtj+8AoRpjLx2wqN/UZb5zhGgNfTvgLd8xaoIKPt3afAcIlRkDDacpsDIYRKsOHqsQTdGk21rfAUR6Ie87QK1Qwadbq+8AIr3Q7jtArVDBp9tK3wFEekHbNipEBZ9uy30HEOmFd3wHqBUq+HRTwUsaqeArRAWfbtrXTNJIBV8hKvh008mwJG06gPd8h6gVKvh0exPtSSPpsowop4uYV4gKPs3i/yiv+I4h0gOv+Q5QS1Tw6adpGkmTBb4D1BIVfPot9h1ApAfm+w5QS1Tw6dfiO4BID6jgK0gFn37zfAcQ6QEVfAWp4NPvSd8BRIpUABb6DlFLVPBpF+VeRgeOSDosJsqt8R2ilqjgwzDHdwCRIsz2HaDWqODDoGkaSYOZvgPUGhV8GFTwkgaP+A5Qa1TwYXjMdwCRTcgBz/gOUWtU8CGIcq8Bz/uOIdKNWUS5gu8QtUYFH457fQcQ6YamZzxQwYdDBS/V7CHfAWqRCj4cDwOrfYcQ6cLbaDuRFyr4UES5VjRKkurUrPl3P1TwYbnHdwCRLtzhO0CtUsGH5Tbi832IVIsVaPuQN+acrp4VlCgzAzjUx6oXvJNnyvT3TzWy6N0CF07ux+RsA2c0r2VlmyO7RR03HDuAof2MR1/u4CvNa+nXADcdN5DRw+p4b61jyvTV3Dt1IGbm42VIad1IlJvqO0St0gg+PDf6WvFuw+uZe8Zg5p4xmCe/PIiBfYxjdu/DF/+8hmmH9WPeVwZzzO4N/OTR+DKyl/69jVtOHMDFH+/PlbPbALjob62cN6Gfyj0cN/sOUMtU8OG5GfB+xr4H/5Vn1LA6dtqijgXvFJi0Uz0A/7ZzA7e80AFAn3pY0wGr2x196uGfywq8tqLAx7INPqNL6bwJ3O07RC1TwYcmyuWI5+K9+sOz7Zy8dx8A9t6mnjsXxKX+p+fbeWV5vJng3An9+PKf1/Lzx9v4+oF9Of+va7locj9vmaXkfk2Ua/cdopap4MP0W58rb8s77lzQwQl7xiPx33y6P7+a3cYB16xkRSv0rY+nX8aMqOexLw5ixmmDWPRuge2G1OGAKdNXc8qta1iyUtuLU6wAXOM7RK3TZ+EwPUh85Zxdfaz8nhc72H9kHdsOjscPuw+v5/7PDgJg4dI8zS+uP6hzzvGDh1v54/ED+fo9a7jg0H60vOe47PE2fnhY/4rnl5K4O7kYjXikEXyIopwDLvW1+ps6Tc8AvLUqHokXnOMHD7dxxti+6z3+uqfbadylgS0HGKvboc7i22p9uE+zq3wHEI3gQ3Y9cBGwTSVXurrd8cCiPFcfOeB/77tpXju/mh239bF7NPC5MX3We/x1T7dz/ykDATj74L4cd/Ma+tbDTccNQFKpBR10VxW0H3zIosx3iUtepJLOJMr9wncI0RRN6K4AVvkOITXlDeBq3yEkpoIPWZRbBvzGdwypKdOIcmt9h5CYCj58PwVafYeQmvA62jWyqqjgQxfvqvZL3zGkJmj0XmVU8LXhh8BS3yEkaBq9VyEVfC2Icu+hvWmkvL6XXHRGqogKvnZcAbzkO4QEaRaeT48hXVPB14r4pE9NvmNIcPLA15Kjp6XKqOBrSZS7BfiL7xgSlCuJcnN9h5CuqeBrzxeBlb5DSBCWAN/1HUI2TgVfa6LcYuD/+I4hQTgnuf6AVCkVfG26EnjIdwhJtbuIcr/zHUK6p4KvRfEGsS8Cq31HkVR6h/jfj1Q5FXytinL/BM71HUNS6UtEuSW+Q8imqeBr2y+Bu3yHkFS5kih3u+8QUhwVfC2Lp2pOAxb7jiKp8Axwtu8QUjwVfK2LTyl8IjrjpHQvB0zRycTSRQUvEOX+AXzNdwypWnnicp/vO4j0zCYL3sycmV3a6efvmFm0ieccbWZ7buR3kZl9p8dJN4OZnW5m223G88eY2Sc38rtDzSxnZnOTW6+OFDWzM81sYG8zbrYo92t0oWTp2llEuft8h5CeK2YE3woca2bDe7Dco4EuC77SzKweOB3odcEDY4AuCz4x0zk3Jrkd3st1nAn0qODNrNQXTf8m8ECJlynpdiVRTtcTSKliCr6D+DzPZ234CzPbycweNLNnkq87mtk44CjgJ8mIdtTGFmxmD5nZf5vZw2b2gpl91MxuNbMXzewHyWOyZjbfzK5L1jN93UjXzA4zs6fMbJ6Z/cbM+iX3t5jZ983sEeBkYCxwQ5JnQPK72Wb2rJldY2bWKc8lZvYPM1toZhPNrC9wITAlef6UYv5gzeyUZDlzzezq5I0GM7vSzJ4ws+fM7ILkvm8SvwHNMLMZyX0rOy3reDO7Nvn+WjP7WfK4S8xslJnda2ZPmtlMM9s9edwJyet72sweLiZzckKyY4HZRT1eQvcX4jd9Sali5+B/BUw1s8wG918OXO+c2xe4AbjMOTcLuBM4JxnR/nMTy25zzk0inh64g3gueG/gdDPbKnnMbsA1yXqWA181s/7AtcAU59w+QAPwlU7LXeucm+Cc+z3wBDA1ybMGuNw591Hn3N7AAODITs9rcM4dSDyi/r/OuTbg+8Afk+f/sYvXMLHTFM35ZrYHMAUY75wbQzyHOTV57PnOubHAvsDHzGxf59xlxBdMmOycm7yJPy+AXYHDnXPfJn7z/YZz7gDgO8SnBSbJfIRzbj/iN9ziRLmVxJ9WFhT9HAnRc8AJRLkO30Gk94oqeOfccuB6PvhufghwY/L974AJvchwZ/J1HvCcc+4N51wrsAjYIfndK865R5Pvf5+sZzfgX865hcn91wGTOi23qyJeZ7KZPW5m84CPA3t1+t2tydcngWyRr6HzFM0PgcOAA4DZZjY3+Xnn5LEnmtkc4Klkvb2ZyvqTcy5vZoOBccCfkvVcDYxMHvMocK2ZfQmo79HSo9w7wBHAa73IJuk3HzgsuVCMpFhP5nB/Dsyh+xP79+ac0Ot2zyuw/q56Bd7Pt+FyHWCbWO6qru5MRv5XAGOdc68kG4z7d5EnT8/+fNZbDXCdc269I0XN7MPEo+yPOufeTaZd+nfxfFj/NW/4mHWvrQ54L/mUsP6TnTvDzA4CGoG5ZjbGOVf8Zfui3GKizL8DDwNbFv08SbsXgY/rSNUwFL2bpHNuGXAz8IVOd88CTkq+nwo8kny/AhhSioCJHc3skOT7k5P1zAeyZjY6uf+zwN828vzOedaV5TvJCPj4Itbf09fzIHC8mW0DYGbDzGwnYChxOefMbFvgP7pZxxIz28PM6oBjulpJ8snqX2Z2QrIeM7P9ku9HOeced859n/jcITt0tYxuRblniUfyy3r8XEmjRcTl/obvIFIaPd0P/lKg89403wQ+Z2bPEBfst5L7/wCck2wA3ehG1h54ATgtWc8w4Ern3Frgc8TTE/OIR/wb283vWuCqZBqjFfh/xFNCt1PcBsUZwJ7FbmR1zj1PfJ7s+5PMDwAjnXNPE0/NPAf8hngaZZ1rgHvWbWQlvvrSXcBfge7+w00FvmBmTyfL/XRy/0+Sjc/PEo/Cny7idX5QlJtNPPWl//RhawEmE+Ve9R1ESsecq+4rbZlZFrgr2SAqvkSZUcR7VWQ9J5HSWwgcQZRr8R1ESktHskpx4rNPTiD+NCXheBQ4ROUepqofwUuViTLDgXuIjy2QdPsTcKrOLxMujeClZ+JdKCcRH/cg6fVTdPKw4GkEL70XZc4GfkxP97MXnzqAM4lyv/IdRMpPBS+bJ8ocRnxQ2Vabeqh49wZwElGuuFNXSOqp4GXzRZks8S6n+/kNIt2YAZysA5hqi+bgZfPFe2AcTHwJQI0YqksH8D3gcJV77dEIXkoryhxBfDqLkZt6qJRdCzCVKDfLdxDxQyN4Ka34whB7EZ8UTvzIEx91vrfKvbZpBC/lE2WOJD59xPa+o9SQJ4EvE+Xm+A4i/mkEL+UT5e4iPq3zRcAaz2lCtwo4GzhI5S7raAQvlRFldiLeZ/5E31EC44h3U20iyi32HUaqiwpeKivKTCS+tsD+npOE4G7gfKLcXN9BpDqp4KXyokwdcAJwLtp3vjceAc4jys30HUSqmwpe/IoyjcRFP953lBSYBVxMlGv2HUTSQQUv1SHKTALOI76ClLyvjXiO/RdEuSd9h5F0UcFLdYkyewCfB04FtvGcxqclxLuYXkWUe9N3GEknFbxUpyjTh/iC4V8gvnZtLZyxsh24l/ggsduJcm2e80jKqeCl+kWZkcTXnj2W+Jw35jdQSeWBh4gvvjGdKLfUbxwJiQpe0iXKjAA+BXwSOAwY4jdQrywhPrvjA8CdyUVUREpOBS/pFU/jTAAmAgclt2o8L/17xKP0vwJ/Jco95zWN1AwVvIQlyozi/bIfC+wCbF3BBC3AvOT2TPJ1PlGuUMEMIoAKXmpBlBkC7AyM6vR1eyCT3IZ2+trQxRIcsBZYDbxFfGWk15Ov675/GXiOKLe8nC9FpCdU8CKdRZmBxCVfIC72DqJcq99QIr2jghcRCZROFywiEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKBUsGLiARKBS8iEigVvIhIoFTwIiKB+h/BIyvZDHErBAAAAABJRU5ErkJggg==\n",
3452 | "text/plain": [
3453 | ""
3454 | ]
3455 | },
3456 | "metadata": {},
3457 | "output_type": "display_data"
3458 | }
3459 | ],
3460 | "source": [
3461 | "dataset.columns.size\n",
3462 | "imp_features_visual=['Important Features','Not Important Features']\n",
3463 | "imp_features_visual_val=[nbfeatures,57-nbfeatures]\n",
3464 | "plt.pie(imp_features_visual_val, labels=imp_features_visual, autopct='%0.f%%')"
3465 | ]
3466 | },
3467 | {
3468 | "cell_type": "code",
3469 | "execution_count": 43,
3470 | "id": "b018fe70",
3471 | "metadata": {},
3472 | "outputs": [],
3473 | "source": [
3474 | "x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.2)"
3475 | ]
3476 | },
3477 | {
3478 | "cell_type": "code",
3479 | "execution_count": 44,
3480 | "id": "3d0336b6",
3481 | "metadata": {},
3482 | "outputs": [],
3483 | "source": [
3484 | "features=[]\n",
3485 | "index=np.argsort(extratrees.feature_importances_)[::1][:nbfeatures]"
3486 | ]
3487 | },
3488 | {
3489 | "cell_type": "code",
3490 | "execution_count": 45,
3491 | "id": "bc732e5b",
3492 | "metadata": {},
3493 | "outputs": [
3494 | {
3495 | "name": "stdout",
3496 | "output_type": "stream",
3497 | "text": [
3498 | "1. feature SizeOfStackReserve (0.000042)\n",
3499 | "2. feature SizeOfUninitializedData (0.000998)\n",
3500 | "3. feature BaseOfCode (0.001076)\n",
3501 | "4. feature SizeOfInitializedData (0.001339)\n",
3502 | "5. feature MinorImageVersion (0.001343)\n",
3503 | "6. feature SectionsNb (0.001391)\n",
3504 | "7. feature SectionsMinEntropy (0.001397)\n",
3505 | "8. feature BaseOfData (0.001520)\n",
3506 | "9. feature ImportsNbOrdinal (0.001540)\n",
3507 | "10. feature DllCharacteristics (0.001577)\n",
3508 | "11. feature SizeOfCode (0.001702)\n",
3509 | "12. feature AddressOfEntryPoint (0.002303)\n"
3510 | ]
3511 | }
3512 | ],
3513 | "source": [
3514 | "for f in range(nbfeatures):\n",
3515 | " print(\"%d. feature %s (%f)\"%(f+1,dataset.columns[2+index[f]],extratrees.feature_importances_[index[f]]))\n",
3516 | " features.append(dataset.columns[2+f])"
3517 | ]
3518 | },
3519 | {
3520 | "cell_type": "code",
3521 | "execution_count": 46,
3522 | "id": "8019777c",
3523 | "metadata": {},
3524 | "outputs": [],
3525 | "source": [
3526 | "model ={ \"RandomForest\":ek.RandomForestClassifier(n_estimators=50),\n",
3527 | " \"DecisionTree\":tree.DecisionTreeClassifier(max_depth=10),\n",
3528 | " \"LogisticRegression\":LogisticRegression()\n",
3529 | " }"
3530 | ]
3531 | },
3532 | {
3533 | "cell_type": "code",
3534 | "execution_count": 47,
3535 | "id": "26424604",
3536 | "metadata": {},
3537 | "outputs": [
3538 | {
3539 | "name": "stdout",
3540 | "output_type": "stream",
3541 | "text": [
3542 | "RandomForest : 0.9942774357116987\n",
3543 | "DecisionTree : 0.9903295907279971\n",
3544 | "LogisticRegression : 0.6968489677653025\n"
3545 | ]
3546 | },
3547 | {
3548 | "name": "stderr",
3549 | "output_type": "stream",
3550 | "text": [
3551 | "C:\\Users\\vajha\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=2):\n",
3552 | "ABNORMAL_TERMINATION_IN_LNSRCH.\n",
3553 | "\n",
3554 | "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
3555 | " https://scikit-learn.org/stable/modules/preprocessing.html\n",
3556 | "Please also refer to the documentation for alternative solver options:\n",
3557 | " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
3558 | " n_iter_i = _check_optimize_result(\n"
3559 | ]
3560 | }
3561 | ],
3562 | "source": [
3563 | "results={}\n",
3564 | "for algo in model:\n",
3565 | " clf=model[algo]\n",
3566 | " clf.fit(x_train,y_train)\n",
3567 | " score=clf.score(x_test,y_test)\n",
3568 | " print(\"%s : %s\"%(algo,score))\n",
3569 | " results[algo]=score"
3570 | ]
3571 | },
3572 | {
3573 | "cell_type": "code",
3574 | "execution_count": 48,
3575 | "id": "295df33e",
3576 | "metadata": {},
3577 | "outputs": [
3578 | {
3579 | "data": {
3580 | "text/plain": [
3581 | "'RandomForest'"
3582 | ]
3583 | },
3584 | "execution_count": 48,
3585 | "metadata": {},
3586 | "output_type": "execute_result"
3587 | }
3588 | ],
3589 | "source": [
3590 | "winner=max(results,key=results.get)\n",
3591 | "winner"
3592 | ]
3593 | },
3594 | {
3595 | "cell_type": "code",
3596 | "execution_count": 49,
3597 | "id": "b4203503",
3598 | "metadata": {},
3599 | "outputs": [
3600 | {
3601 | "name": "stdout",
3602 | "output_type": "stream",
3603 | "text": [
3604 | "False positive rate : 0.102353 %\n",
3605 | "False negative rate : 0.174237 %\n"
3606 | ]
3607 | }
3608 | ],
3609 | "source": [
3610 | "clf=model[winner]\n",
3611 | "res=clf.predict(x_new)\n",
3612 | "mt=confusion_matrix(y,res)\n",
3613 | "print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
3614 | "print(\"False negative rate : %f %%\" % ((mt[1][0] / float(sum(mt[1])))*100))"
3615 | ]
3616 | },
3617 | {
3618 | "cell_type": "code",
3619 | "execution_count": 50,
3620 | "id": "395ebe6a",
3621 | "metadata": {},
3622 | "outputs": [],
3623 | "source": [
3624 | "# Confusion Matrix"
3625 | ]
3626 | },
3627 | {
3628 | "cell_type": "code",
3629 | "execution_count": 51,
3630 | "id": "619f2146",
3631 | "metadata": {},
3632 | "outputs": [
3633 | {
3634 | "data": {
3635 | "text/plain": [
3636 | "array([[96625, 99],\n",
3637 | " [ 72, 41251]], dtype=int64)"
3638 | ]
3639 | },
3640 | "execution_count": 51,
3641 | "metadata": {},
3642 | "output_type": "execute_result"
3643 | }
3644 | ],
3645 | "source": [
3646 | "cf=confusion_matrix(y,res)\n",
3647 | "cf"
3648 | ]
3649 | },
3650 | {
3651 | "cell_type": "code",
3652 | "execution_count": 52,
3653 | "id": "87bbf42c",
3654 | "metadata": {},
3655 | "outputs": [
3656 | {
3657 | "data": {
3658 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVMAAAEWCAYAAADb3nSrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAfFklEQVR4nO3dd5gV5fnG8e/DLggI0hEFARUFUWNsaDACtgR7iaJYoogtllh/atRYorFgiQ1jBRQiKokaC4IFaYoiGGn2RlWkqICgsMvz++OdxcOyu2d3edk57t6f69qLM3OmPDNn5p533jm7mLsjIiLrp1baBYiIVAcKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJINUwNbN6Zva8mX1vZsPWYzknmNnLMWtLg5m9ZGYnp11HGsxsmZltlXYdkL0WM5thZj2qrqLcYWZXmNnDKa27vZm5meWnsf6s3D3rD3A8MAlYBnwFvAT8tjzzZlnuScBEIH99l7UhfoAegANPFxu/UzJ+dDmXcy0wJMXtaJ/Um58MG3AP8CHQegOu90Lga+B7YACwUeTlnwKM38D7bhBwQxV9Tj2AOeu5jKbA88k+nwdcWo55rgemAQXAtRto274EVgLNi41/Lzk225djGWsdx7n2k7VlamYXAXcCNwKbAm2B+4DDs81bDu2Aj929IMKyNpQFQFcza5Yx7mTg41grsKBK7hLMzIAHCCdud3efu4HW83vgcmA/wkmwFXDdhliXrOX/gLrAZsD2wBvlmOdT4FLgxQ1YF8AXQO+iATPbEai3gddZIevV6s1yJWhEaI0eU8Y0GxHCdl7ycydJC4TkSgtcDHxDaNX2Sd67jnClWpWsoy/FWnCs26I6BfgcWEr4YE7IGD8+Y76uwDuEq/M7QNeM90YTrsRvJMt5mWJXy+ItBeB+4JxkXF4y7moyWqbAXcBsYAkwGdg7Gd+z2HZOyajj70kdK4AOybjTkvf/Cfw7Y/m3AK8BVolWQdF+3Ah4FHgXaJbxfifgFWAx8BHQKxm/OzCfjJYA8AfgvXKs83Hgxozh/YCvy5jegQ7J64OA95PPZy5wSSnzrPW5F3uvxG1K3mtGaL0tSY6PG4odP558Hmckn9vK5LN7Pnn/S2D/5PW1wDBgSFLvNGBb4C+EY3428LuMZfcBPkim/Rw4Mxm/cXIcrE7WtQzYnNAVdznwGbAIeApoWsZ+vB74V2VaVsk2XJtlmmtJzlFCaA9J6vou2ZebljLfl8BVwDsZ424DriSjZQocDPwv+WxmZ9ZDRh4A+wDTMt57FZiYMTweOCJ5XbT/libH1ZHFjqE3gH8kx8oNhPPkNmAW4fi/H6iXdf9l2XE9CU3/UpvVwN+At4CWQAvgTeD6jDAqSKapTThJlgNNin8wpQxn7ryNkx3cMXlvM2D74icV4TbnW0IXQj7hSvgtSXgQAuszwgFfLxm+uZRt60EIzq7A2xkn+kjgNNYO0xMJJ2k+4eLxNVC3pO3KqGMWofWQn+yf0fwcpvUJrd9TgL2BhUCbSp4kRfvx38DbQOOM9zYmHLR9kjp2SdZVtG/fBw7MmP6ZZPvaEk6gtqWscwpwbMZw86SGZqVMnxmmX/HzxagJsEsp86z53IuNz7ZNTyQ/9YHOybTrhGnyehDFbvNZN0x/BH6frOsxwoX+yuQzPR34ImPeg4GtCV0t3Qnnwy6Zx1uxdV1AOL/aEE7yB4ChZXzWhxIC+dRKHCcVDdMzCRel+oRGxq7AJqXM9yWwP+HCtl0y/WzC3WlmmPYAdiRcRH5FCLOiUGzPz3lQl3DxaZ4Mf01ozDUknNcr+PmcP4afL0zHAj8Am2UcQwXAecly6hEahM8RsqRhso03Zdt/2W4tmwELvezb8BOAv7n7N+6+gNDiPCnj/VXJ+6vcfTjhitsxy3pLsxrYwczquftX7j6jhGkOBj5x98HuXuDuQwl9g4dmTDPQ3T929xWEK/2vy1qpu78JNDWzjsAfCSdM8WmGuPuiZJ23Ew78bNs5yN1nJPOsKra85YSAvoNwkJ/n7nOyLC+b3wFPuft3GeMOAb5094FJHe8C/wGOTt5/NKkDM2tKCI3H3X2Wuzd291mlrKsB4c6gSNHrhuWocxXQ2cw2cfdvk5oqotRtMrM8Quv6Gndf7u7vJ9u4Psa5+8jkPBlGaFTcnHymTwDtzawxgLu/6O6feTCGcGe0dxnLPhO40t3nuPtPhDA7uqTbUTPrADxICKTLzaxPMn4jM1tpZo3WczuLW0XIiA7uXujuk919SZZ5BhPOoQMI5+Va3UzuPtrdp7n7anefCgwlXHQoNt2PhOc43YDdgKmE1uhewJ6EDFiUTDvM3ecly3wS+ATokrG4ee5+T/L5/Ui4AF7o7ovdfSmhi/O4bDsjW5guAppn6UfYHJiZMTwzGbdmGcXCeDnhRKsQd/+BcFU5C/jKzF40s07lqKeoptYZw19Xop7BwLmE24tnir9pZheb2QfJNxO+I3SRNM+yzNllvenuEwm3gkYI/RIlT5eXJT9lnZiHANeY2akZ49oBe5jZd0U/hAtkq+T9IcChZtYA6EUIjq+ybBeEi+YmGcNFr5eWY94/EO4AZprZGDP7TTnmyVTWNrUgtEAy932Zn0M5zM94vYLQACnMGIbkGDOzA83sLTNbnNR1EGUfJ+2AZzK24wOgkPD8ori+wCvuPpZw0bs+CdQ9gf+5+/clzLM+BhPu0p4ws3lm1s/MapdjnuMJLcJ1GiVmtoeZvW5mC8zse8L5Xtr+GUO4cHRLXo8mBG/3ZLhomX80s/cy9uEOxZaZ+fm3ILS0J2dMPyIZX6ZsYTqBkNRHlDHNPMIHXqRtMq4yfiBsSJFWmW8mV/8DCLf4HwIPlaOeoprW90HLYOBsYHjSalwjCbDLCGHTxN0bE1piVlR6KcssbXzRcs8htHDnER4QlLwQ9+3dvUHyM66MRb5JaKHfZWbHJ+NmA2OSVmbRTwN3/1Oy7LmE4+BIwh3H4LJqzjCD8K2HIjsB84taC2Vx93fc/XBC19GzlHEhKUVZ27SAcFvXJmP6Lcoqp4LrLpWZbURoId9G6FtsDAyn7ONkNqGbJXNb6nrJDw7zCduGu39B6KbrBzxM6GqLKrnbvM7dOxO6wg4htDrLmmcmoRvkIODpEiZ5nHCLvYW7NyL0V1oJ08G6YTqGYmFqZu0IOXEu4ba/MTC92DIz9/tCwgVw+4z93cjdsza4ygzT5Ep2NdDfzI4ws/pmVju5uvZLJhsKXGVmLcyseTL9kGwrLsV7QDcza5vckvyl6A0z29TMDjOzjYGfCC2fwhKWMRzY1syON7N8MzuW0C/2QiVrAtYcnN0JfWHFNSQcxAuAfDO7mrVbZfMJt3rlfmJvZtsSOsNPJITYpWb268pV/7Pk1vIo4EEzO5qwX7Y1s5OSz7a2me1uZttlzPYYIcx3pIRWeSkeA/qaWWcza0J4+DAo20xmVif53nCj5DZ5CSV/zhmzWN3Mn7K2KWkxPg1cmxzPnSg7AOYTvokQQx3CxXEBUGBmBxK6XjLX1azY7fj9wN+TUCA5zw4vZflPA8cm52oeYd9NIfTRlnpRSPZPXUIe5Cf7MS/bxpjZPma2Y8a6VlH2Z1WkL7BvcrdZXENgsbv/aGZdCK3Y0rxJ6ErrQnj4NIPkrgQYm0yzMWHbFyQ19yG0TEvk7qsJ4fsPM2uZzNM6+XZKmbKe3O5+B3AR4WRYQLhSnktoMUA44ScR+iymEZ4U35BtuaWs6xXgyWRZk1k7AGsRHnzMIzx1605oKRZfxiLCFfJiQjfFpcAh7r6wMjUVW/Z4dy+p1T2S8N3bjwldCj+y9q1D0S8kLDKzrP1/SbfKEOAWd5/i7p8AVwCDk9bNekn287GEcOtBOKGPI+zbrwnfHMhczzMkt5tFJ0BywVtmZm1LWccIQqvodcI+mQlcU84STwK+NLMlhNu8E8uYtiuhJVH8p6xtOpfQDfM1oaU9lHCBLskjhP7b78zs2XLWX6Kk/+3PhJb2t4SgeC7j/Q+TWj5P1rc54VsizwEvm9lSwsOoPUpZ/oRkmdckyx9JaFz8ARhqZjuXUtpDhH3Wm9BYWMHazz1K04rwUHMJofthDOVoSCV9xpNKefts4G/Jtl5NGXclybH4LjDD3VcmoycAM939m2Sa94Hbk/HzCQ2CbF8Xu4zwdbG3kmPwVcrxnMfco93FSDVmZp8Rvsbzatq1xGZmtwCt3P3ktGuRXy79br5kZWZ/INwqjUq7lhjMrJOZ/cqCLoTbzvJ2X4iUKDd/x1VyhpmNJvQ5n5T0J1UHDQm305sTvlh/O/DfVCuSXzzd5ouIRKDbfBGRCGrkbb7l13OrU55fxJFcsfN2JX5pQHLUzJlfsnDhwtK+H1ot1cwwrdOQjTr2SrsMqYA33r437RKkAvbaY7e0S6hyus0XEYlAYSoiEoHCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISgcJURCQChamISAQKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYlAYSoiEoHCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISgcJURCQChamISAQKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYkgP+0CZF3n9O5Bn6O6YmYMfPoN7n18NAB/Oq47Zx3bjYLC1YwYN50r7/ovADtsszn3XtWbhhvXZfVq57cn9qNWLeNf/fqyVZvmFK52ho+dxl/vfg6AEw/dgxsvPIJ533wPwP1PjmHQMxNS2daa5N6772LggIdwd/qcejrnnX8BU6dM4bxzzuKHZcto1749Ax/7F5tssknapUolVIswNbOewF1AHvCwu9+cckmV1nnrzehzVFf2PulWVq4q5Ln+Z/PS+Bm0btmYQ3rsyO69bmLlqgJaNGkAQF5eLQbccDJ9//oY0z6eS9NGG7OqoJCN6uRz52OvMXbSJ9TOz+OlB87jd3t15uU33gfgPyPf5cJbhqW5qTXKjOnTGTjgIca9OZE6depw2ME9OfCgg/nTmadxc7/b2Ltbdx4dOIB/3H4r11x3fdrlSiX84m/zzSwP6A8cCHQGeptZ53SrqrxOW7Zi4rQvWfHjKgoLVzNu8qccvs9OnHHM3tw28BVWrioAYMG3ywDY/zedmP7JXKZ9PBeAxd//wOrVzoofVzF20icArCoo5L0PZ9O6ZeNUtkngww8/oEuXPalfvz75+fns3a07//3vM3zy8Uf8du9uAOy7/wE8+8x/Uq5UKusXH6ZAF+BTd//c3VcCTwCHp1xTpc34bB6/3aUDTRttTL26ten52+1p06oJHdq1ZK+dt2bsY5fw8sPns2vntgBs07Yl7vBc/3N48/HLuOjk/ddZZqMG9Tio2468PvGjNeMO3+/XTHzyLzx+a1/abNq4qjavxtp++x0YP34sixYtYvny5Yx4aThzZs+m8/Y78MLzofvl6X8PY87s2SlXKpVVHcK0NZB5BM5Jxq3FzM4ws0lmNskLVlRZcRX10RfzuX3QK7zwz3N5rv85TP14LgUFheTn1aLJJvXp9sfbuOIfzzKk36kA5Ofl0XXnrehz5SD2O/UODtt3J3p02XbN8vLyavHozadw39DRfDl3EQDDx06n08HX0OXYmxj19kc89LeTUtnWmqTTdttx8SWXcUjPAzjs4J786lc7kZ+fzwMPDeCBf/ana5ddWbZsKXXq1Em7VKmk6hCmVsI4X2eE+4Puvpu772b59aqgrMp79NkJdD3+Fg7oeyfffv8Dn85awNz53/Hsa1MAmDRjJqtXO82bNGDuN98xbvKnLPruB1b8uIoR42ewc6ct1iyr/1W9+WzWgjUPsSB0BRR1Fwx4+g123q5tlW5fTXXKqX2Z8M67vPr6WJo0bUqHDtvQsVMnXnjpZd6cOJlex/Zmy622TrtMqaTqEKZzgC0yhtsA81KqJYqih0tbtGrC4fvuxFMjJvH86KlrWpwd2rakTu18Fn67jFfefJ8dtmlNvbq1ycurxd67duCDz78G4JqzD6FRw3pccuva/XCtmv/8tPiQ7jvy0RdfV9GW1WzffPMNALNmzeK/zz5Nr+N6rxm3evVqbr7xBk4/46w0S5T1UB2e5r8DbGNmWwJzgeOA49Mtaf0Mve00mjYOT+UvuPkpvlu6gkefncAD157ApGFXsHJVIaddPRiA75au4O4hoxg/5FLcnZHjZzAiefp/+ek9+fDzr5kw9DLg569And27Bwd335GCwkK+/X45p18zJM3NrTF69/oDixcvonZ+be68uz9NmjTh3rvv4oH7+wNw+BFH8cdT+qRcpVSWua9zR/yLY2YHAXcSvho1wN3/Xtb0teq39I069qqK0iSSb9+5N+0SpAL22mM3Jk+eVFIXXLVVHVqmuPtwYHjadYhIzVUd+kxFRFKnMBURiUBhKiISgcJURCQChamISAQKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYlAYSoiEoHCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISQX7aBQCYWdOy3nf3xVVVi4hIZeREmAKTAQcMaAt8m7xuDMwCtkytMhGRcsiJ23x339LdtwJGAoe6e3N3bwYcAjydbnUiItnlRJhm2N3dhxcNuPtLQPcU6xERKZdcuc0vstDMrgKGEG77TwQWpVuSiEh2udYy7Q20AJ5Jflok40REclpOtUyTp/bnm1kDd1+Wdj0iIuWVUy1TM+tqZu8D7yfDO5nZfSmXJSKSVU6FKfAP4Pck/aTuPgXolmpFIiLlkGthirvPLjaqMJVCREQqIKf6TIHZZtYVcDOrA/wZ+CDlmkREssq1lulZwDlAa2AO8Gvg7DQLEhEpj1xrmXZ09xMyR5jZXsAbKdUjIlIuudYyvaec40REckpOtEzN7DdAV6CFmV2U8dYmQF46VYmIlF9OhClQB2hAqKdhxvglwNGpVCQiUgE5EabuPgYYY2aD3H1m2vWIiFRUrvWZPmxmjYsGzKyJmY1MsR4RkXLJtTBt7u7fFQ24+7dAy/TKEREpn1wL09Vm1rZowMzaEf4Un4hITsuJPtMMVwLjzWxMMtwNOCPFekREyiWnwtTdR5jZLsCehP8D6kJ3Xxh7PTtv15Y33r439mJlA7pr3GdplyAVMH/ZT2mXUOVy4jbfzDol/+5C+A/15gFzgbbJOBGRnJYrLdOLgdOB20t4z4F9q7YcEZGKyYkwdffTk3/3SbsWEZHKyIkwNbOjynrf3fXfPYtITsuJMAUOTf5tSfgd/VHJ8D7AaEBhKiI5LSfC1N37AJjZC0Bnd/8qGd4M6J9mbSIi5ZETT/MztC8K0sR8YNu0ihERKa+caJlmGJ38Lv5QwlP844DX0y1JRCS7nApTdz/XzI7k5/+R9EF3fybNmkREyiOnwjTxLrDU3V81s/pm1tDdl6ZdlIhIWXKqz9TMTgf+DTyQjGoNPJtaQSIi5ZRTYUr4n0n3IvyFfdz9E/Qn+ETkFyDXwvQnd19ZNGBm+ehP8InIL0CuhekYM7sCqGdmBwDDgOdTrklEJKtcC9PLgAXANOBMYDhwVaoViYiUQ848zTezWsBUd98BeCjtekREKiJnWqbuvhqYkvnfloiI/FLkTMs0sRkww8wmAj8UjXT3w9IrSUQku1wL0+vSLkBEpDJyIkzNrC5wFtCB8PDpEXcvSLcqEZHyy5U+00eB3QhBeiAl//clIiI5KydapoS/YbojgJk9AkxMuR4RkQrJlZbpqqIXur0XkV+iXGmZ7mRmS5LXRvgNqCXJa3f3TdIrTUQku5wIU3fPS7sGEZH1kSu3+SIiv2gKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYlAYSoiEoHCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISgcJURCSC/LQLkMr5+KOPOOn4Y9cMf/HF5/z1mr8xb95chr/4PHVq12HLrbfmwYcH0rhx4/QKrSFWFxZyz5+OoFHzVpxy40NMHT2cVx+9mwWzPuOc+56mTccdAfhk0nhGPHQrBQWryM+vzYFnXk6HXX4DwAMXHs/SRQuovVFdAPr2G0SDJs34fMpEXuh/A19//hG9/3onO3Y/MLXtlNJVi5apmQ0ws2/MbHratVSVbTt25O3J7/H25Pd4c+Jk6tevz2FHHMl++x/A5Pem887/prLNNtty6y03pV1qjfDG04No2bbDmuFWW27LSdfdR/tf7b7WdPUbNeHkvz/IhY8M55jLb+Wpmy5Z6/3jrryD8x96nvMfep4GTZoB0HjTzTnmsn7stN+hG35DpNKqRZgCg4CeaReRltdHvcaWW21Nu3bt2P+A35GfH244uuyxJ3PnzEm5uurv+wVf8eFbo9n9oF5rxrVs14EWbbdaZ9rW22zPJs03BWDT9ttQsOonClb+VObym7Zqw2Zbd8JqVZfTtXqqFrf57j7WzNqnXUdahj35BL2O7b3O+McGDeDoY44tYQ6J6fn+N3DgmZfx0/JlFZpv+tgRbN6hM/l1Nlozbli/y6hVK48duv2efU88BzOLXa5sIDXmUmdmZ5jZJDObtGDhgrTLiWblypW8+MJzHHX0MWuNv+Wmv5OXn89xx5+QUmU1wwcTRtGgcTPabLtDheab/8XHvPRgP4688Po144674g4ufGQ4Z901lC+nvsO7rzwbuVrZkGpMmLr7g+6+m7vv1qJ5i7TLiWbkiJf49c67sOmmm64ZN+SxRxn+4gsMeuxfatlsYDOnT+b9N1/j5t7dGXr9BXz2vwk8ceNFZc7z/YKvGHzN2fT6y200a91uzfhGLVoBsFH9Buy032HM+WDKBq1d4qoWt/k12VNPDl3rFv/lkSO4/bZbePm1MdSvXz/FymqGnqf/Hz1P/z8APnvvLcY99QjHXXFHqdOvWLaEgX85nd+fdgntd9h1zfjCwgJ+XLaEjRs1pbBgFR++NYoOu+y1weuXeBSmv2DLly9n1KuvcO99D6wZd+H55/LTTz9xSM8DgPAQ6p777k+rxBpr+riXee6e6/jh+8UMuuI0Ntt6O/r2G8Sbzwxm0byZjBrcn1GD+wPhK1B16tZjwKV9KCwsYHVhIR123YsuB4f+7tkfTmXw1X9ixbIlfDhhFK8MuouLBo5Ic/OkBObuadew3sxsKNADaA7MB65x90dKm37XXXfzN96eVEXVSQx3jfss7RKkAu456wjmfDStRvUxVYuWqbuv+yhbRKQK1ZgHUCIiG5LCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISgcJURCQChamISAQKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYlAYSoiEoHCVEQkAoWpiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISgcJURCQChamISAQKUxGRCBSmIiIRKExFRCJQmIqIRKAwFRGJQGEqIhKBwlREJAKFqYhIBApTEZEIFKYiIhEoTEVEIlCYiohEoDAVEYlAYSoiEoG5e9o1VDkzWwDMTLuODaA5sDDtIqRCqutn1s7dW6RdRFWqkWFaXZnZJHffLe06pPz0mVUfus0XEYlAYSoiEoHCtHp5MO0CpML0mVUT6jMVEYlALVMRkQgUpiIiEShMqwkz62lmH5nZp2Z2edr1SNnMbICZfWNm09OuReJQmFYDZpYH9AcOBDoDvc2sc7pVSRaDgJ5pFyHxKEyrhy7Ap+7+ubuvBJ4ADk+5JimDu48FFqddh8SjMK0eWgOzM4bnJONEpIooTKsHK2GcvvMmUoUUptXDHGCLjOE2wLyUahGpkRSm1cM7wDZmtqWZ1QGOA55LuSaRGkVhWg24ewFwLjAS+AB4yt1npFuVlMXMhgITgI5mNsfM+qZdk6wf/TqpiEgEapmKiESgMBURiUBhKiISgcJURCQChamISAQKU6lSZnakmbmZdcoy3QVmVn891nOKmd1b2flFKkphKlWtNzCe8IsFZbkAqHSYilQ1halUGTNrAOwF9CUJUzPLM7PbzGyamU01s/PM7M/A5sDrZvZ6Mt2yjOUcbWaDkteHmtnbZvY/M3vVzDat6u0SAchPuwCpUY4ARrj7x2a22Mx2AfYAtgR2dvcCM2vq7ovN7CJgH3dfmGWZ44E93d3N7DTgUuDiDbkRIiVRmEpV6g3cmbx+IhneCrg/+ZVY3L2if+OzDfCkmW0G1AG+iFOqSMUoTKVKmFkzYF9gBzNzII/wZwInU74/F5g5Td2M1/cAd7j7c2bWA7g2Rr0iFaU+U6kqRwOPuXs7d2/v7lsQWpHvAmeZWT6AmTVNpl8KNMyYf76ZbWdmtYAjM8Y3AuYmr0/eoFsgUgaFqVSV3sAzxcb9h/CgaRYw1cymAMcn7z0IvFT0AAq4HHgBGAV8lbGMa4FhZjYOyNa/KrLB6K9GiYhEoJapiEgEClMRkQgUpiIiEShMRUQiUJiKiESgMBURiUBhKiISwf8DZW2v4ZXJlsUAAAAASUVORK5CYII=\n",
3659 | "text/plain": [
3660 | ""
3661 | ]
3662 | },
3663 | "metadata": {
3664 | "needs_background": "light"
3665 | },
3666 | "output_type": "display_data"
3667 | }
3668 | ],
3669 | "source": [
3670 | "plot_confusion_matrix(conf_mat=cf)\n",
3671 | "plt.xlabel(\"Actual\")\n",
3672 | "plt.ylabel(\"Predicted\")\n",
3673 | "plt.title(\"Confusion Matrix - Key: 0 is Legitimate & 1 is Malware\")\n",
3674 | "plt.show()"
3675 | ]
3676 | },
3677 | {
3678 | "cell_type": "code",
3679 | "execution_count": null,
3680 | "id": "eadbd20c",
3681 | "metadata": {},
3682 | "outputs": [],
3683 | "source": []
3684 | }
3685 | ],
3686 | "metadata": {
3687 | "kernelspec": {
3688 | "display_name": "Python 3",
3689 | "language": "python",
3690 | "name": "python3"
3691 | },
3692 | "language_info": {
3693 | "codemirror_mode": {
3694 | "name": "ipython",
3695 | "version": 3
3696 | },
3697 | "file_extension": ".py",
3698 | "mimetype": "text/x-python",
3699 | "name": "python",
3700 | "nbconvert_exporter": "python",
3701 | "pygments_lexer": "ipython3",
3702 | "version": "3.8.8"
3703 | }
3704 | },
3705 | "nbformat": 4,
3706 | "nbformat_minor": 5
3707 | }
3708 |
--------------------------------------------------------------------------------