├── README.md
├── .gitignore
├── learning.py
└── checker.py

/README.md:
--------------------------------------------------------------------------------
# machine-learning-approach-for-malware-detection
Virus detection is fundamentally a classification problem: a program can be trained to recognize whether a piece of software is malware or not, so that it can be detected and deleted. We build an antivirus script in Python by training a classifier to label Portable Executable (PE) format files (https://en.wikipedia.org/wiki/Portable_Executable) as either malicious or legitimate. Five different classification algorithms are trained, their results are compared, and the best-scoring one is kept for prediction.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/learning.py:
--------------------------------------------------------------------------------
import os
import pickle

import joblib
import numpy as np
import pandas as pd
import sklearn.ensemble as ske
from sklearn import tree
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important features based on %i total features\n' % X.shape[1])

# Feature selection using a tree ensemble: impurity-based importances are
# computed by ExtraTrees and SelectFromModel keeps only the informative columns.
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]
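
# SelectFromModel with a tree-based estimator defaults to the mean feature
# importance as its threshold, so only above-average columns survive. An
# explicit cut-off is a possible variation, e.g.:
#   model = SelectFromModel(fsel, prefit=True, threshold=0.01)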

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2 + indices[f]], fsel.feature_importances_[indices[f]]))

# XXX: take care of the feature order -- the names are stored sorted by column
# position (the same order the classifier sees), not by importance.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])

# Algorithm comparison
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))

# Save the winning algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
os.makedirs('classifier', exist_ok=True)
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
with open('classifier/features.pkl', 'wb') as f:
    pickle.dump(features, f)
print('Saved')

# Identify false positive and false negative rates
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0]))) * 100))
print("False negative rate : %f %%" % ((mt[1][0] / float(sum(mt[1]))) * 100))
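
# Optional sanity check: accuracy from a single 80/20 split can vary from run
# to run, so a cross-validated score for the winner gives a steadier estimate.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(algorithms[winner], X_new, y, cv=5)
print('Cross-validated accuracy: %.2f %% (+/- %.2f)' % (cv_scores.mean() * 100, cv_scores.std() * 200))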
--------------------------------------------------------------------------------
/checker.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
import array
import math
import os
import pickle

import joblib
import pefile


def get_entropy(data):
    """Compute the Shannon entropy, in bits per byte, of a byte string."""
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0] * 256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)

    return entropy
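
# Worked example for the measure above (H = -sum(p_x * log2(p_x)) over byte
# frequencies): get_entropy(b'\x00' * 100) is 0.0, since a single repeated
# byte value carries no information, while uniformly random bytes approach
# the maximum of 8 bits per byte. Packed or encrypted sections sit near that
# maximum, which is what makes entropy a useful malware feature.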

def get_resources(pe):
    """Extract resources: returns a list of [entropy, size] pairs."""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)
                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources


def get_version_info(pe):
    """Return version information entries as a flat dict."""
    res = {}
    # Recent pefile releases expose FileInfo as a list of lists and use bytes
    # keys; older releases used a flat list with str keys. Handle both.
    file_info = sum(pe.FileInfo, []) if pe.FileInfo and isinstance(pe.FileInfo[0], list) else pe.FileInfo
    for fileinfo in file_info:
        if fileinfo.Key in ('StringFileInfo', b'StringFileInfo'):
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key in ('VarFileInfo', b'VarFileInfo'):
            for var in fileinfo.Var:
                var_entry = list(var.entry.items())[0]
                res[var_entry[0]] = var_entry[1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        # VS_FIXEDFILEINFO is likewise a list of structs in recent releases.
        ffi = pe.VS_FIXEDFILEINFO[0] if isinstance(pe.VS_FIXEDFILEINFO, list) else pe.VS_FIXEDFILEINFO
        res['flags'] = ffi.FileFlags
        res['os'] = ffi.FileOS
        res['type'] = ffi.FileType
        res['file_version'] = ffi.FileVersionLS
        res['product_version'] = ffi.ProductVersionLS
        res['signature'] = ffi.Signature
        res['struct_version'] = ffi.StrucVersion
    return res


def extract_infos(fpath):
    """Extract the feature vector of a PE file as a dict keyed by feature name."""
    res = {}
    pe = pefile.PE(fpath)
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        # PE32+ (64-bit) files have no BaseOfData field
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # Sections
    res['SectionsNb'] = len(pe.sections)
    entropy = [section.get_entropy() for section in pe.sections]
    res['SectionsMeanEntropy'] = sum(entropy) / len(entropy)
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)
    raw_sizes = [section.SizeOfRawData for section in pe.sections]
    res['SectionsMeanRawsize'] = sum(raw_sizes) / len(raw_sizes)
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = [section.Misc_VirtualSize for section in pe.sections]
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes) / len(virtual_sizes)
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    # The key below matches the dataset column name, which lacks the plural 's'.
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = len([x for x in imports if x.name is None])
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export table
        res['ExportNb'] = 0

    # Resources
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources) > 0:
        entropy = [r[0] for r in resources]
        res['ResourcesMeanEntropy'] = sum(entropy) / len(entropy)
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
        sizes = [r[1] for r in resources]
        res['ResourcesMeanSize'] = sum(sizes) / len(sizes)
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos.keys())
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()
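
    # The model has no notion of feature names, only positions, so the saved
    # feature list must be replayed in exactly the order learning.py stored it
    # when building the vector passed to predict().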
    # Load classifier and feature list
    clf = joblib.load(os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'classifier/classifier.pkl'
    ))
    with open(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'classifier/features.pkl'), 'rb') as f:
        features = pickle.load(f)

    data = extract_infos(args.FILE)

    pe_features = [data[x] for x in features]

    res = clf.predict([pe_features])[0]
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][int(res)])
    )
--------------------------------------------------------------------------------