├── Feature_selection.py
├── README.md
├── Random Forest.py
├── XGboost.py
├── data.csv
├── extract.py
├── licenta.pdf
├── prezentare.pdf
└── simple_KNN.py

/Feature_selection.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 7 19:02:25 2018

@author: alex
"""

import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Tree-based feature selection
from sklearn.feature_selection import SelectFromModel
import sklearn.ensemble as ske
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, dataset.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))
features = []
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(dataset.columns[2 + f])


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#------------------------K-NN--------------------------------------
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------

#-----------------Random-Forest------------------------------------
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy')
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------

#-------------------XGBoost----------------------------------------
from xgboost import XGBClassifier
classifier = XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=50)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Malware-detection-using-Machine-Learning
The scope of this paper is to present a malware detection approach based on machine learning, focusing on Windows executable (PE) files. Because of the rapid growth of malicious software, automated approaches are needed to identify infected files.

In this project we study and implement a script that extracts features from PE files in order to build a dataset of infected and clean files, on which we train three machine learning algorithms: K-NN, XGBoost and Random Forest.

In the last chapter of the paper the algorithms are tested with the full set of dataset features, and every algorithm reaches an accuracy above 90%. After applying a feature selection algorithm to the dataset, the accuracy improves for all of the learning algorithms.

EDIT:
PE files: https://en.wikipedia.org/wiki/Portable_Executable

In the legitimate folder you need to add a large number of legitimate Windows PE files (download PE files from a trustworthy source, e.g. skype.exe, teams.exe, etc.), and "/hdd/Downloads/virusi_00325/" is a folder full of malware samples from https://virusshare.com/ (send them an email asking for an account and explaining why you need one).
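The training scripts below stop at computing a confusion matrix and never print the accuracy figures quoted above. A minimal sketch for reporting them (not part of the original scripts; it assumes the y_test / y_pred variables that each script produces, with labels as written by extract.py: 0 = malware, 1 = legitimate):

# Sketch, not part of the original scripts: report accuracy and the confusion
# matrix for any of the fitted classifiers below.
from sklearn.metrics import accuracy_score, confusion_matrix

def report(y_test, y_pred):
    # Fraction of correctly classified test samples (the ">90%" figure from the README)
    print("Accuracy: %.4f" % accuracy_score(y_test, y_pred))
    # Rows are the true classes, columns are the predicted classes
    print(confusion_matrix(y_test, y_pred))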
--------------------------------------------------------------------------------
/Random Forest.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 01:11:42 2018

@author: alex
"""

# Random Forest

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
--------------------------------------------------------------------------------
/XGboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 29 23:44:35 2018

@author: alex
"""

# XGBoost

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier(max_depth=20, learning_rate=0.3, n_estimators=150)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-fold cross-validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(accuracies.mean())
print(accuracies.std())
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import os
import shutil
import hashlib
import array
import math

import pefile

def get_md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0] * 256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)

    return entropy

def get_resources(pe):
    """Extract resources: [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources

def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                key, value = list(var.entry.items())[0]
                res[key] = value
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

def extract_infos(fpath):
    res = []
    res.append(os.path.basename(fpath))
    res.append(get_md5(fpath))
    pe = pefile.PE(fpath)
    res.append(pe.FILE_HEADER.Machine)
    res.append(pe.FILE_HEADER.SizeOfOptionalHeader)
    res.append(pe.FILE_HEADER.Characteristics)
    res.append(pe.OPTIONAL_HEADER.MajorLinkerVersion)
    res.append(pe.OPTIONAL_HEADER.MinorLinkerVersion)
    res.append(pe.OPTIONAL_HEADER.SizeOfCode)
    res.append(pe.OPTIONAL_HEADER.SizeOfInitializedData)
    res.append(pe.OPTIONAL_HEADER.SizeOfUninitializedData)
    res.append(pe.OPTIONAL_HEADER.AddressOfEntryPoint)
    res.append(pe.OPTIONAL_HEADER.BaseOfCode)
    try:
        res.append(pe.OPTIONAL_HEADER.BaseOfData)
    except AttributeError:
        res.append(0)
    res.append(pe.OPTIONAL_HEADER.ImageBase)
    res.append(pe.OPTIONAL_HEADER.SectionAlignment)
    res.append(pe.OPTIONAL_HEADER.FileAlignment)
    res.append(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)
    res.append(pe.OPTIONAL_HEADER.MinorOperatingSystemVersion)
    res.append(pe.OPTIONAL_HEADER.MajorImageVersion)
    res.append(pe.OPTIONAL_HEADER.MinorImageVersion)
    res.append(pe.OPTIONAL_HEADER.MajorSubsystemVersion)
    res.append(pe.OPTIONAL_HEADER.MinorSubsystemVersion)
    res.append(pe.OPTIONAL_HEADER.SizeOfImage)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeaders)
    res.append(pe.OPTIONAL_HEADER.CheckSum)
    res.append(pe.OPTIONAL_HEADER.Subsystem)
    res.append(pe.OPTIONAL_HEADER.DllCharacteristics)
    res.append(pe.OPTIONAL_HEADER.SizeOfStackReserve)
    res.append(pe.OPTIONAL_HEADER.SizeOfStackCommit)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeapReserve)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeapCommit)
    res.append(pe.OPTIONAL_HEADER.LoaderFlags)
    res.append(pe.OPTIONAL_HEADER.NumberOfRvaAndSizes)
    res.append(len(pe.sections))
    entropy = [x.get_entropy() for x in pe.sections]
    res.append(sum(entropy) / float(len(entropy)))
    res.append(min(entropy))
    res.append(max(entropy))
    raw_sizes = [x.SizeOfRawData for x in pe.sections]
    res.append(sum(raw_sizes) / float(len(raw_sizes)))
    res.append(min(raw_sizes))
    res.append(max(raw_sizes))
    virtual_sizes = [x.Misc_VirtualSize for x in pe.sections]
    res.append(sum(virtual_sizes) / float(len(virtual_sizes)))
    res.append(min(virtual_sizes))
    res.append(max(virtual_sizes))
    # Imports
    try:
        res.append(len(pe.DIRECTORY_ENTRY_IMPORT))
        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        res.append(len(imports))
        res.append(len([x for x in imports if x.name is None]))
    except AttributeError:
        res.append(0)
        res.append(0)
        res.append(0)
    # Exports
    try:
        res.append(len(pe.DIRECTORY_ENTRY_EXPORT.symbols))
    except AttributeError:
        # No export
        res.append(0)
    # Resources
    resources = get_resources(pe)
    res.append(len(resources))
    if len(resources) > 0:
        entropy = [x[0] for x in resources]
        res.append(sum(entropy) / float(len(entropy)))
        res.append(min(entropy))
        res.append(max(entropy))
        sizes = [x[1] for x in resources]
        res.append(sum(sizes) / float(len(sizes)))
        res.append(min(sizes))
        res.append(max(sizes))
    else:
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)

    # Load configuration size
    try:
        res.append(pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size)
    except AttributeError:
        res.append(0)

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res.append(len(version_infos.keys()))
    except AttributeError:
        res.append(0)
    return res

if __name__ == '__main__':
    output = "data.csv"
    csv_delimiter = "|"
    columns = [
        "Name",
        "md5",
        "Machine",
        "SizeOfOptionalHeader",
        "Characteristics",
        "MajorLinkerVersion",
        "MinorLinkerVersion",
        "SizeOfCode",
        "SizeOfInitializedData",
        "SizeOfUninitializedData",
        "AddressOfEntryPoint",
        "BaseOfCode",
        "BaseOfData",
        "ImageBase",
        "SectionAlignment",
        "FileAlignment",
        "MajorOperatingSystemVersion",
        "MinorOperatingSystemVersion",
        "MajorImageVersion",
        "MinorImageVersion",
        "MajorSubsystemVersion",
        "MinorSubsystemVersion",
        "SizeOfImage",
        "SizeOfHeaders",
        "CheckSum",
        "Subsystem",
        "DllCharacteristics",
        "SizeOfStackReserve",
        "SizeOfStackCommit",
        "SizeOfHeapReserve",
        "SizeOfHeapCommit",
        "LoaderFlags",
        "NumberOfRvaAndSizes",
        "SectionsNb",
        "SectionsMeanEntropy",
        "SectionsMinEntropy",
        "SectionsMaxEntropy",
        "SectionsMeanRawsize",
        "SectionsMinRawsize",
        "SectionMaxRawsize",
        "SectionsMeanVirtualsize",
        "SectionsMinVirtualsize",
        "SectionMaxVirtualsize",
        "ImportsNbDLL",
        "ImportsNb",
        "ImportsNbOrdinal",
        "ExportNb",
        "ResourcesNb",
        "ResourcesMeanEntropy",
        "ResourcesMinEntropy",
        "ResourcesMaxEntropy",
        "ResourcesMeanSize",
        "ResourcesMinSize",
        "ResourcesMaxSize",
        "LoadConfigurationSize",
        "VersionInformationSize",
        "legitimate"
    ]

    ff = open(output, "a")
    ff.write(csv_delimiter.join(columns) + "\n")

    # Launch legitimate
    for ffile in os.listdir('legitimate'):
        print(ffile)
        try:
            res = extract_infos(os.path.join('legitimate/', ffile))
            res.append(1)
            ff.write(csv_delimiter.join(map(str, res)) + "\n")
        except pefile.PEFormatError:
            print('\t -> Bad PE format')

    for ffile in os.listdir('/hdd/Downloads/virusi_00325'):
        print(ffile)
        try:
            res = extract_infos(os.path.join('/hdd/Downloads/virusi_00325/', ffile))
            res.append(0)

            ff.write(csv_delimiter.join(map(str, res)) + "\n")
            shutil.copy("/hdd/Downloads/virusi_00325/" + ffile, "2/")
        except pefile.PEFormatError:
            print('\t -> Bad PE format')
        except Exception:
            print('\t -> Weird error')
    ff.close()
--------------------------------------------------------------------------------
/licenta.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuff96/Malware-detection-using-Machine-Learning/8a8ebc9d0ea9dc6ad7890ebaaf2bcd79b0c9c596/licenta.pdf
--------------------------------------------------------------------------------
/prezentare.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuff96/Malware-detection-using-Machine-Learning/8a8ebc9d0ea9dc6ad7890ebaaf2bcd79b0c9c596/prezentare.pdf
--------------------------------------------------------------------------------
/simple_KNN.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 22:29:36 2018

@author: alex
"""

# K-Nearest Neighbors (K-NN)

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from sklearn.model_selection import cross_val_score
# Creating the list of candidate k values for K-NN, keeping only the odd ones
myList = list(range(1, 50))
neighbors = [k for k in myList if k % 2 != 0]

# Empty list that will hold the cross-validation scores
cv_scores = []

# Perform 20-fold cross-validation for each candidate k
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=20, scoring='accuracy')
    cv_scores.append(scores.mean())

# Changing to misclassification error
MSE = [1 - x for x in cv_scores]
MSE_list = np.array(MSE)
neighbors_list = np.array(neighbors)

# Determining the best k
optimal_k = neighbors[int(np.argmin(MSE_list))]
print("The optimal number of neighbors is %d" % optimal_k)

# Plot misclassification error vs k
plt.plot(neighbors_list, MSE_list)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
--------------------------------------------------------------------------------
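simple_KNN.py stops after printing the optimal k and plotting the error curve. A short sketch of how one might continue (an assumed continuation, not part of the original script; it reuses the variables defined in simple_KNN.py) to refit K-NN with the selected k and score it on the held-out 20% test split:

# Sketch, assumed continuation of simple_KNN.py (not an original file):
# refit K-NN with the cross-validated k and evaluate it on the held-out test set.
from sklearn.metrics import accuracy_score

best_knn = KNeighborsClassifier(n_neighbors = optimal_k, metric = 'minkowski', p = 2)
best_knn.fit(X_train, y_train)
print("Test accuracy with k = %d: %.4f" % (optimal_k, accuracy_score(y_test, best_knn.predict(X_test))))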