├── README.md
├── .gitignore
├── learning.py
└── checker.py

/README.md:
--------------------------------------------------------------------------------
# machine-learning-approach-for-malware-detection
Virus detection is fundamentally a classification problem: a program can be trained to recognize whether a piece of software is malware or not, so that it can be detected and deleted. We build an antivirus script in Python by training a classifier to label Portable Executable (PE) format files (https://en.wikipedia.org/wiki/Portable_Executable) as either malicious or legitimate. Five different classification algorithms are trained, their results are compared, and the best-scoring one is kept for prediction.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/learning.py:
--------------------------------------------------------------------------------
import os
import pickle

import joblib
import numpy as np
import pandas as pd
import sklearn.ensemble as ske
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important features based on %i total features\n' % X.shape[1])

# Feature selection using a tree ensemble: impurity-based importances are
# computed by ExtraTrees and SelectFromModel keeps only the informative columns.
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]
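
# SelectFromModel with a tree-based estimator defaults to the mean feature
# importance as its threshold, so only above-average columns survive. An
# explicit cut-off is a possible variation, e.g.:
#   model = SelectFromModel(fsel, prefit=True, threshold=0.01)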

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))

# XXX: take care of the feature order -- the names are stored sorted by column
# position (the same order the classifier sees), not by importance.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])

# Algorithm comparison
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))

# Save the winning algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
os.makedirs('classifier', exist_ok=True)
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
with open('classifier/features.pkl', 'wb') as f:
    pickle.dump(features, f)
print('Saved')

# Identify false positive and false negative rates
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0]))) * 100))
print("False negative rate : %f %%" % ((mt[1][0] / float(sum(mt[1]))) * 100))
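
# Optional sanity check: accuracy from a single 80/20 split can vary from run
# to run, so a cross-validated score for the winner gives a steadier estimate.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(algorithms[winner], X_new, y, cv=5)
print('Cross-validated accuracy: %.2f %% (+/- %.2f)' % (cv_scores.mean() * 100, cv_scores.std() * 200))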
--------------------------------------------------------------------------------
/checker.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
import array
import math
import os
import pickle

import joblib
import pefile


def get_entropy(data):
    """Compute the Shannon entropy, in bits per byte, of a byte string."""
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0] * 256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)

    return entropy
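
# Worked example for the measure above (H = -sum(p_x * log2(p_x)) over byte
# frequencies): get_entropy(b'\x00' * 100) is 0.0, since a single repeated
# byte value carries no information, while uniformly random bytes approach
# the maximum of 8 bits per byte. Packed or encrypted sections sit near that
# maximum, which is what makes entropy a useful malware feature.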

def get_resources(pe):
    """Extract resources: returns a list of [entropy, size] pairs."""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)
                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources


def get_version_info(pe):
    """Return version information entries as a flat dict."""
    res = {}
    # Recent pefile releases expose FileInfo as a list of lists and use bytes
    # keys; older releases used a flat list with str keys. Handle both.
    file_info = sum(pe.FileInfo, []) if pe.FileInfo and isinstance(pe.FileInfo[0], list) else pe.FileInfo
    for fileinfo in file_info:
        if fileinfo.Key in ('StringFileInfo', b'StringFileInfo'):
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key in ('VarFileInfo', b'VarFileInfo'):
            for var in fileinfo.Var:
                var_entry = list(var.entry.items())[0]
                res[var_entry[0]] = var_entry[1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        # VS_FIXEDFILEINFO is likewise a list of structs in recent releases.
        ffi = pe.VS_FIXEDFILEINFO[0] if isinstance(pe.VS_FIXEDFILEINFO, list) else pe.VS_FIXEDFILEINFO
        res['flags'] = ffi.FileFlags
        res['os'] = ffi.FileOS
        res['type'] = ffi.FileType
        res['file_version'] = ffi.FileVersionLS
        res['product_version'] = ffi.ProductVersionLS
        res['signature'] = ffi.Signature
        res['struct_version'] = ffi.StrucVersion
    return res


def extract_infos(fpath):
    """Extract the feature vector of a PE file as a dict keyed by feature name."""
    res = {}
    pe = pefile.PE(fpath)
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        # PE32+ (64-bit) files have no BaseOfData field
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # Sections
    res['SectionsNb'] = len(pe.sections)
    entropy = [section.get_entropy() for section in pe.sections]
    res['SectionsMeanEntropy'] = sum(entropy) / len(entropy)
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)
    raw_sizes = [section.SizeOfRawData for section in pe.sections]
    res['SectionsMeanRawsize'] = sum(raw_sizes) / len(raw_sizes)
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = [section.Misc_VirtualSize for section in pe.sections]
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes) / len(virtual_sizes)
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    # The key below matches the dataset column name, which lacks the plural 's'.
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = len([x for x in imports if x.name is None])
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export table
        res['ExportNb'] = 0

    # Resources
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources) > 0:
        entropy = [r[0] for r in resources]
        res['ResourcesMeanEntropy'] = sum(entropy) / len(entropy)
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
        sizes = [r[1] for r in resources]
        res['ResourcesMeanSize'] = sum(sizes) / len(sizes)
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos.keys())
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()
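
    # The model has no notion of feature names, only positions, so the saved
    # feature list must be replayed in exactly the order learning.py stored it
    # when building the vector passed to predict().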
    # Load classifier and feature list
    clf = joblib.load(os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'classifier/classifier.pkl'
    ))
    with open(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'classifier/features.pkl'), 'rb') as f:
        features = pickle.load(f)

    data = extract_infos(args.FILE)

    pe_features = [data[x] for x in features]

    res = clf.predict([pe_features])[0]
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][int(res)])
    )
--------------------------------------------------------------------------------