├── Feature_selection.py
├── README.md
├── Random Forest.py
├── XGboost.py
├── data.csv
├── extract.py
├── licenta.pdf
├── prezentare.pdf
└── simple_KNN.py

/Feature_selection.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 7 19:02:25 2018

@author: alex
"""

import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Tree-based feature selection
from sklearn.feature_selection import SelectFromModel
import sklearn.ensemble as ske
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, dataset.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))
features = []
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(dataset.columns[2 + f])


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#------------------------K-NN--------------------------------------
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 3, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------

#-----------------Random-Forest------------------------------------
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy')
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------

#-------------------XGBoost----------------------------------------
from xgboost import XGBClassifier
classifier = XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=50)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
#------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Malware-detection-using-Machine-Learning
The scope of this paper is to present a malware detection approach based on machine learning, focusing on Windows executable (PE) files. Because of the rapid growth of malicious software, automated approaches are needed to identify infected files.

In this project we study and implement a script that extracts features from PE files in order to build a dataset of infected and clean files, on which we train three machine learning algorithms: K-NN, XGBoost and Random Forest.

In the last chapter of the paper the algorithms are tested with the full set of dataset features, and every algorithm reaches an accuracy above 90%. After applying a feature selection algorithm to the dataset, the accuracy improves for all of the learning algorithms.

EDIT:
PE files: https://en.wikipedia.org/wiki/Portable_Executable

In the legitimate folder you need to add a large number of legitimate Windows PE files (download PE files from a trustworthy source, e.g. skype.exe, teams.exe, etc.), and "/hdd/Downloads/virusi_00325/" is a folder full of malware samples from https://virusshare.com/ (send them an email asking for an account and explaining why you need one).
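The training scripts below stop at computing a confusion matrix and never print the accuracy figures quoted above. A minimal sketch for reporting them (not part of the original scripts; it assumes the y_test / y_pred variables that each script produces, with labels as written by extract.py: 0 = malware, 1 = legitimate):

# Sketch, not part of the original scripts: report accuracy and the confusion
# matrix for any of the fitted classifiers below.
from sklearn.metrics import accuracy_score, confusion_matrix

def report(y_test, y_pred):
    # Fraction of correctly classified test samples (the ">90%" figure from the README)
    print("Accuracy: %.4f" % accuracy_score(y_test, y_pred))
    # Rows are the true classes, columns are the predicted classes
    print(confusion_matrix(y_test, y_pred))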
--------------------------------------------------------------------------------
/Random Forest.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 01:11:42 2018

@author: alex
"""

# Random Forest

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Random Forest to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
--------------------------------------------------------------------------------
/XGboost.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 29 23:44:35 2018

@author: alex
"""

# XGBoost

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier(max_depth=20, learning_rate=0.3, n_estimators=150)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-fold cross-validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print(accuracies.mean())
print(accuracies.std())
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import os
import shutil
import hashlib
import array
import math

import pefile

def get_md5(fname):
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0] * 256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)

    return entropy

def get_resources(pe):
    """Extract resources: [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources

def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                key, value = list(var.entry.items())[0]
                res[key] = value
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

def extract_infos(fpath):
    res = []
    res.append(os.path.basename(fpath))
    res.append(get_md5(fpath))
    pe = pefile.PE(fpath)
    res.append(pe.FILE_HEADER.Machine)
    res.append(pe.FILE_HEADER.SizeOfOptionalHeader)
    res.append(pe.FILE_HEADER.Characteristics)
    res.append(pe.OPTIONAL_HEADER.MajorLinkerVersion)
    res.append(pe.OPTIONAL_HEADER.MinorLinkerVersion)
    res.append(pe.OPTIONAL_HEADER.SizeOfCode)
    res.append(pe.OPTIONAL_HEADER.SizeOfInitializedData)
    res.append(pe.OPTIONAL_HEADER.SizeOfUninitializedData)
    res.append(pe.OPTIONAL_HEADER.AddressOfEntryPoint)
    res.append(pe.OPTIONAL_HEADER.BaseOfCode)
    try:
        res.append(pe.OPTIONAL_HEADER.BaseOfData)
    except AttributeError:
        res.append(0)
    res.append(pe.OPTIONAL_HEADER.ImageBase)
    res.append(pe.OPTIONAL_HEADER.SectionAlignment)
    res.append(pe.OPTIONAL_HEADER.FileAlignment)
    res.append(pe.OPTIONAL_HEADER.MajorOperatingSystemVersion)
    res.append(pe.OPTIONAL_HEADER.MinorOperatingSystemVersion)
    res.append(pe.OPTIONAL_HEADER.MajorImageVersion)
    res.append(pe.OPTIONAL_HEADER.MinorImageVersion)
    res.append(pe.OPTIONAL_HEADER.MajorSubsystemVersion)
    res.append(pe.OPTIONAL_HEADER.MinorSubsystemVersion)
    res.append(pe.OPTIONAL_HEADER.SizeOfImage)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeaders)
    res.append(pe.OPTIONAL_HEADER.CheckSum)
    res.append(pe.OPTIONAL_HEADER.Subsystem)
    res.append(pe.OPTIONAL_HEADER.DllCharacteristics)
    res.append(pe.OPTIONAL_HEADER.SizeOfStackReserve)
    res.append(pe.OPTIONAL_HEADER.SizeOfStackCommit)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeapReserve)
    res.append(pe.OPTIONAL_HEADER.SizeOfHeapCommit)
    res.append(pe.OPTIONAL_HEADER.LoaderFlags)
    res.append(pe.OPTIONAL_HEADER.NumberOfRvaAndSizes)
    res.append(len(pe.sections))
    entropy = [x.get_entropy() for x in pe.sections]
    res.append(sum(entropy) / float(len(entropy)))
    res.append(min(entropy))
    res.append(max(entropy))
    raw_sizes = [x.SizeOfRawData for x in pe.sections]
    res.append(sum(raw_sizes) / float(len(raw_sizes)))
    res.append(min(raw_sizes))
    res.append(max(raw_sizes))
    virtual_sizes = [x.Misc_VirtualSize for x in pe.sections]
    res.append(sum(virtual_sizes) / float(len(virtual_sizes)))
    res.append(min(virtual_sizes))
    res.append(max(virtual_sizes))
    # Imports
    try:
        res.append(len(pe.DIRECTORY_ENTRY_IMPORT))
        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        res.append(len(imports))
        res.append(len([x for x in imports if x.name is None]))
    except AttributeError:
        res.append(0)
        res.append(0)
        res.append(0)
    # Exports
    try:
        res.append(len(pe.DIRECTORY_ENTRY_EXPORT.symbols))
    except AttributeError:
        # No export
        res.append(0)
    # Resources
    resources = get_resources(pe)
    res.append(len(resources))
    if len(resources) > 0:
        entropy = [x[0] for x in resources]
        res.append(sum(entropy) / float(len(entropy)))
        res.append(min(entropy))
        res.append(max(entropy))
        sizes = [x[1] for x in resources]
        res.append(sum(sizes) / float(len(sizes)))
        res.append(min(sizes))
        res.append(max(sizes))
    else:
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)
        res.append(0)

    # Load configuration size
    try:
        res.append(pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size)
    except AttributeError:
        res.append(0)

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res.append(len(version_infos.keys()))
    except AttributeError:
        res.append(0)
    return res

if __name__ == '__main__':
    output = "data.csv"
    csv_delimiter = "|"
    columns = [
        "Name",
        "md5",
        "Machine",
        "SizeOfOptionalHeader",
        "Characteristics",
        "MajorLinkerVersion",
        "MinorLinkerVersion",
        "SizeOfCode",
        "SizeOfInitializedData",
        "SizeOfUninitializedData",
        "AddressOfEntryPoint",
        "BaseOfCode",
        "BaseOfData",
        "ImageBase",
        "SectionAlignment",
        "FileAlignment",
        "MajorOperatingSystemVersion",
        "MinorOperatingSystemVersion",
        "MajorImageVersion",
        "MinorImageVersion",
        "MajorSubsystemVersion",
        "MinorSubsystemVersion",
        "SizeOfImage",
        "SizeOfHeaders",
        "CheckSum",
        "Subsystem",
        "DllCharacteristics",
        "SizeOfStackReserve",
        "SizeOfStackCommit",
        "SizeOfHeapReserve",
        "SizeOfHeapCommit",
        "LoaderFlags",
        "NumberOfRvaAndSizes",
        "SectionsNb",
        "SectionsMeanEntropy",
        "SectionsMinEntropy",
        "SectionsMaxEntropy",
        "SectionsMeanRawsize",
        "SectionsMinRawsize",
        "SectionMaxRawsize",
        "SectionsMeanVirtualsize",
        "SectionsMinVirtualsize",
        "SectionMaxVirtualsize",
        "ImportsNbDLL",
        "ImportsNb",
        "ImportsNbOrdinal",
        "ExportNb",
        "ResourcesNb",
        "ResourcesMeanEntropy",
        "ResourcesMinEntropy",
        "ResourcesMaxEntropy",
        "ResourcesMeanSize",
        "ResourcesMinSize",
        "ResourcesMaxSize",
        "LoadConfigurationSize",
        "VersionInformationSize",
        "legitimate"
    ]

    ff = open(output, "a")
    ff.write(csv_delimiter.join(columns) + "\n")

    # Launch legitimate
    for ffile in os.listdir('legitimate'):
        print(ffile)
        try:
            res = extract_infos(os.path.join('legitimate/', ffile))
            res.append(1)
            ff.write(csv_delimiter.join(map(str, res)) + "\n")
        except pefile.PEFormatError:
            print('\t -> Bad PE format')

    for ffile in os.listdir('/hdd/Downloads/virusi_00325'):
        print(ffile)
        try:
            res = extract_infos(os.path.join('/hdd/Downloads/virusi_00325/', ffile))
            res.append(0)

            ff.write(csv_delimiter.join(map(str, res)) + "\n")
            shutil.copy("/hdd/Downloads/virusi_00325/" + ffile, "2/")
        except pefile.PEFormatError:
            print('\t -> Bad PE format')
        except Exception:
            print('\t -> Weird error')
    ff.close()
--------------------------------------------------------------------------------
/licenta.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuff96/Malware-detection-using-Machine-Learning/8a8ebc9d0ea9dc6ad7890ebaaf2bcd79b0c9c596/licenta.pdf
--------------------------------------------------------------------------------
/prezentare.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuff96/Malware-detection-using-Machine-Learning/8a8ebc9d0ea9dc6ad7890ebaaf2bcd79b0c9c596/prezentare.pdf
--------------------------------------------------------------------------------
/simple_KNN.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 22:29:36 2018

@author: alex
"""

# K-Nearest Neighbors (K-NN)

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('data.csv', sep = '|')
X = dataset.drop(['Name', 'md5', 'legitimate'], axis = 1).values
y = dataset['legitimate'].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

from sklearn.model_selection import cross_val_score
# Creating the list of candidate k values for K-NN, keeping only the odd ones
myList = list(range(1, 50))
neighbors = [k for k in myList if k % 2 != 0]

# Empty list that will hold the cross-validation scores
cv_scores = []

# Perform 20-fold cross-validation for each candidate k
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=20, scoring='accuracy')
    cv_scores.append(scores.mean())

# Changing to misclassification error
MSE = [1 - x for x in cv_scores]
MSE_list = np.array(MSE)
neighbors_list = np.array(neighbors)

# Determining the best k
optimal_k = neighbors[int(np.argmin(MSE_list))]
print("The optimal number of neighbors is %d" % optimal_k)

# Plot misclassification error vs k
plt.plot(neighbors_list, MSE_list)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
--------------------------------------------------------------------------------
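simple_KNN.py stops after printing the optimal k and plotting the error curve. A short sketch of how one might continue (an assumed continuation, not part of the original script; it reuses the variables defined in simple_KNN.py) to refit K-NN with the selected k and score it on the held-out 20% test split:

# Sketch, assumed continuation of simple_KNN.py (not an original file):
# refit K-NN with the cross-validated k and evaluate it on the held-out test set.
from sklearn.metrics import accuracy_score

best_knn = KNeighborsClassifier(n_neighbors = optimal_k, metric = 'minkowski', p = 2)
best_knn.fit(X_train, y_train)
print("Test accuracy with k = %d: %.4f" % (optimal_k, accuracy_score(y_test, best_knn.predict(X_test))))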