├── Mal-detection-learning.py ├── Mal-detection.py ├── README.md ├── antivirusXml.py ├── classifier ├── classifier.pkl └── features.pkl ├── data.csv ├── requirements.txt └── virustotal.py /Mal-detection-learning.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """antivirus-learning-phase.ipynb 3 | """ 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | import sklearn.ensemble as ske 9 | from sklearn.feature_selection import SelectFromModel 10 | import matplotlib.pyplot as plt 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.linear_model import SGDClassifier 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn import preprocessing 19 | from sklearn import utils 20 | import joblib 21 | import sys 22 | import pickle 23 | sys.modules['sklearn.externals.joblib'] = joblib 24 | 25 | 26 | 27 | data = pd.read_csv('data.csv',sep="|") 28 | data.head() 29 | 30 | 31 | data.isnull().sum() 32 | 33 | colomuns = ["LoaderFlags","NumberOfRvaAndSizes","SectionsNb","SectionsMeanEntropy","SectionsMinEntropy","SectionsMaxEntropy","SectionsMeanRawsize","SectionMaxRawsize","SectionsMeanVirtualsize","SectionsMinVirtualsize","SectionMaxVirtualsize","ImportsNbDLL","ImportsNb","ImportsNbOrdinal","ExportNb","ResourcesNb","ResourcesMeanEntropy","ResourcesMinEntropy","ResourcesMaxEntropy","ResourcesMeanSize","ResourcesMinSize","ResourcesMaxSize","LoadConfigurationSize","VersionInformationSize","legitimate"] 34 | for c in colomuns: 35 | m=round(data[c].mean(),2) 36 | data= data.fillna(m) 37 | 38 | X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values 39 | y = data['legitimate'].values 40 | 41 | data.dtypes 42 | 43 | sns.countplot(x='legitimate', data=data); 44 | 45 | ex = ExtraTreesClassifier() 46 | lab = preprocessing.LabelEncoder() 47 | y_transformed = lab.fit_transform(y) 48 | 49 | fsel = ex.fit(X,y_transformed) 50 | model = SelectFromModel(fsel, prefit=True) 51 | X_new = model.transform(X) 52 | nb_features = X_new.shape[1] 53 | 54 | X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2) 55 | 56 | features = [] 57 | 58 | print('%i features identified as important:' % nb_features) 59 | 60 | indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features] 61 | for f in range(nb_features): 62 | print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]])) 63 | 64 | for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]): 65 | features.append(data.columns[2+f]) 66 | 67 | algorithms = { 68 | "DecisionTree": DecisionTreeClassifier(max_depth=10), 69 | "RandomForest": RandomForestClassifier(n_estimators=50), 70 | "AdaBoost": AdaBoostClassifier(n_estimators=100), 71 | "GNB": GaussianNB() 72 | } 73 | 74 | results = {} 75 | accuracy_test = [] 76 | model = [] 77 | print("\nNow testing algorithms") 78 | for algo in algorithms: 79 | clf = algorithms[algo] 80 | lab = preprocessing.LabelEncoder() 81 | y_transformed = lab.fit_transform(y_train) 82 | clf.fit(X_train, y_transformed) 83 | pred = clf.predict(X_test) 84 | score = clf.score(X_test, y_test) 85 | results[algo] = score 86 | print("%s : %f %%" % (algo, score*100)) 87 | acc = accuracy_score(pred, y_test) 88 | accuracy_test.append(acc) 89 | print('Test Accuracy :\033[32m \033[01m {:.5f}% \033[30m \033[0m'.format(acc*100)) 90 | print('\033[01m Classification_report \033[0m') 91 | print(classification_report(y_test, pred)) 92 | print('\033[01m Confusion_matrix \033[0m') 93 | cf_matrix = confusion_matrix(y_test, pred) 94 | plot_ = sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,fmt= '0.2%') 95 | plt.show() 96 | print('\033[31m###################- End -###################\033[0m') 97 | 98 | winner = max(results, key=results.get) 99 | print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) 100 | 101 | # Save the algorithm and the feature list for later predictions 102 | print('Saving algorithm and feature list in classifier directory...') 103 | joblib.dump(algorithms[winner], 'classifier.pkl') 104 | open('features.pkl', 'bw').write(pickle.dumps(features)) 105 | print('Saved') 106 | -------------------------------------------------------------------------------- /Mal-detection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import pefile 3 | import os 4 | import array 5 | import math 6 | import pickle 7 | import joblib 8 | import sys 9 | import argparse 10 | 11 | def get_entropy(data): 12 | if len(data) == 0: 13 | return 0.0 14 | occurences = array.array('L', [0]*256) 15 | for x in data: 16 | occurences[x if isinstance(x, int) else ord(x)] += 1 17 | 18 | entropy = 0 19 | for x in occurences: 20 | if x: 21 | p_x = float(x) / len(data) 22 | entropy -= p_x*math.log(p_x, 2) 23 | 24 | return entropy 25 | 26 | def get_resources(pe): 27 | """Extract resources : 28 | [entropy, size]""" 29 | resources = [] 30 | if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'): 31 | try: 32 | for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries: 33 | if hasattr(resource_type, 'directory'): 34 | for resource_id in resource_type.directory.entries: 35 | if hasattr(resource_id, 'directory'): 36 | for resource_lang in resource_id.directory.entries: 37 | data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size) 38 | size = resource_lang.data.struct.Size 39 | entropy = get_entropy(data) 40 | 41 | resources.append([entropy, size]) 42 | except Exception as e: 43 | return resources 44 | return resources 45 | 46 | def get_version_info(pe): 47 | """Return version infos""" 48 | res = {} 49 | for fileinfo in pe.FileInfo: 50 | if fileinfo.Key == 'StringFileInfo': 51 | for st in fileinfo.StringTable: 52 | for entry in st.entries.items(): 53 | res[entry[0]] = entry[1] 54 | if fileinfo.Key == 'VarFileInfo': 55 | for var in fileinfo.Var: 56 | res[var.entry.items()[0][0]] = var.entry.items()[0][1] 57 | if hasattr(pe, 'VS_FIXEDFILEINFO'): 58 | res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags 59 | res['os'] = pe.VS_FIXEDFILEINFO.FileOS 60 | res['type'] = pe.VS_FIXEDFILEINFO.FileType 61 | res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS 62 | res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS 63 | res['signature'] = pe.VS_FIXEDFILEINFO.Signature 64 | res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion 65 | return res 66 | 67 | def extract_infos(fpath): 68 | res = {} 69 | pe = pefile.PE(fpath) 70 | res['Machine'] = pe.FILE_HEADER.Machine 71 | res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader 72 | res['Characteristics'] = pe.FILE_HEADER.Characteristics 73 | res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion 74 | res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion 75 | res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode 76 | res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData 77 | res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData 78 | res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint 79 | res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode 80 | try: 81 | res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData 82 | except AttributeError: 83 | res['BaseOfData'] = 0 84 | res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase 85 | res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment 86 | res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment 87 | res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion 88 | res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion 89 | res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion 90 | res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion 91 | res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion 92 | res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion 93 | res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage 94 | res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders 95 | res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum 96 | res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem 97 | res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics 98 | res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve 99 | res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit 100 | res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve 101 | res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit 102 | res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags 103 | res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes 104 | 105 | # Sections 106 | res['SectionsNb'] = len(pe.sections) 107 | entropy = list(map(lambda x:x.get_entropy(), pe.sections)) 108 | res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy)) 109 | res['SectionsMinEntropy'] = min(entropy) 110 | res['SectionsMaxEntropy'] = max(entropy) 111 | 112 | 113 | raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections)) 114 | res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes)) 115 | res['SectionsMinRawsize'] = min(raw_sizes) 116 | res['SectionsMaxRawsize'] = max(raw_sizes) 117 | virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections)) 118 | res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes)) 119 | res['SectionsMinVirtualsize'] = min(virtual_sizes) 120 | res['SectionMaxVirtualsize'] = max(virtual_sizes) 121 | 122 | #Imports 123 | try: 124 | res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT) 125 | imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])) 126 | res['ImportsNb'] = len(imports) 127 | res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports))) 128 | except AttributeError: 129 | res['ImportsNbDLL'] = 0 130 | res['ImportsNb'] = 0 131 | res['ImportsNbOrdinal'] = 0 132 | 133 | #Exports 134 | try: 135 | res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols) 136 | except AttributeError: 137 | # No export 138 | res['ExportNb'] = 0 139 | #Resources 140 | resources= get_resources(pe) 141 | res['ResourcesNb'] = len(resources) 142 | if len(resources)> 0: 143 | entropy = list(map(lambda x:x[0], resources)) 144 | res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy)) 145 | res['ResourcesMinEntropy'] = min(entropy) 146 | res['ResourcesMaxEntropy'] = max(entropy) 147 | sizes = list(map(lambda x:x[1], resources)) 148 | res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes)) 149 | res['ResourcesMinSize'] = min(sizes) 150 | res['ResourcesMaxSize'] = max(sizes) 151 | else: 152 | res['ResourcesNb'] = 0 153 | res['ResourcesMeanEntropy'] = 0 154 | res['ResourcesMinEntropy'] = 0 155 | res['ResourcesMaxEntropy'] = 0 156 | res['ResourcesMeanSize'] = 0 157 | res['ResourcesMinSize'] = 0 158 | res['ResourcesMaxSize'] = 0 159 | 160 | # Load configuration size 161 | try: 162 | res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size 163 | except AttributeError: 164 | res['LoadConfigurationSize'] = 0 165 | 166 | 167 | # Version configuration size 168 | try: 169 | version_infos = get_version_info(pe) 170 | res['VersionInformationSize'] = len(version_infos.keys()) 171 | except AttributeError: 172 | res['VersionInformationSize'] = 0 173 | return res 174 | 175 | if __name__ == '__main__': 176 | parser = argparse.ArgumentParser(description='Detect malicious files') 177 | parser.add_argument('FILE', help='File to be tested') 178 | args = parser.parse_args() 179 | # Load classifier 180 | clf = joblib.load(os.path.join( 181 | os.path.dirname(os.path.realpath(__file__)), 182 | 'classifier/classifier.pkl' 183 | )) 184 | 185 | with open('classifier/features.pkl', 'rb') as f: 186 | features = pickle.load(f) 187 | 188 | data = extract_infos(args.FILE) 189 | pe_features = list(map(lambda x:data[x], features)) 190 | 191 | res= clf.predict([pe_features])[0] 192 | print('The file %s is %s' % ( 193 | os.path.basename(sys.argv[1]), 194 | ['malicious', 'legitimate'][res]) 195 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 |

ANTIVIRUSXML

6 | 7 |

8 | File Integrity Monitor with Malware detection using Machine learning 9 |
10 |

11 | 12 | 13 | 14 | 15 | 16 |
17 | 18 |
19 | 20 | 21 | # :notebook_with_decorative_cover: Table of Contents 22 | 23 | - [About the Project](#star2-about-the-project) 24 | - [Getting Started](#toolbox-getting-started) 25 | * [Prerequisites](#bangbang-prerequisites) 26 | * [Installation](#gear-installation) 27 | - [Usage](#eyes-usage) 28 | - [Contact](#handshake-contact) 29 | - [Acknowledgements](#gem-acknowledgements) 30 | 31 | 32 | 33 | ## :star2: About the Project 34 | 35 | Overview 36 | ============ 37 | This Python project is a project that combine between : 38 | 1. basic file integrity monitor: which it takes two arguments: a directory to scan, and an output file for alerts. The script will recursively scan the given directory and its subdirectories, and will create alerts for any added, removed, or changed files. The script uses the os, sys, and pickle libraries to perform file system operations, as well as the datetime, hashlib, logging, and time libraries for other operations. 39 | 2. malware detection using machine learning : it helps train a classifier to be able to detect [PE files](https://en.wikipedia.org/wiki/Portable_Executable) as either malicious or legitimate. It tries out 6 different classification algorithms before deciding which one to use for prediction by comparing their results. 40 | 41 | 42 |
43 | screenshot 44 |
45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | ## :toolbox: Getting Started 55 | 56 | 57 | ### :bangbang: Prerequisites 58 | 59 | This project uses some libraries that you need to install them first : 60 | 61 | ```bash 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | 66 | ### :gear: Installation 67 | 68 | 69 | ```bash 70 | git clone https://github.com/da4nyy/ANTIVIRUSxML/ 71 | cd ANTIVIRUSxML 72 | ``` 73 | 74 | 75 | 76 | ### :running: Run Locally 77 | 78 | Clone the project 79 | 80 | 81 | Go to the project directory 82 | 83 | ```bash 84 | cd ANTIVIRUSxML/ 85 | ``` 86 | 87 | Install dependencies 88 | 89 | ```bash 90 | pip install -r requirements.txt 91 | ``` 92 | 93 | train the model ( you can skip this phase : you already find the files in the classifier directory 94 | 95 | ```bash 96 | python3 Malware-detection-learning.py data.csv 97 | ``` 98 | 99 | Start the file monitor and malware detection handler 100 | 101 | ```bash 102 | python3 antivirusXml.py -i -o 103 | ``` 104 | 105 | 106 | ## :eyes: Usage 107 | 108 | + You may monitor the integrity of the files that may have PII. In this case, you can place the script where your files live, and create a crontab or use task scheduler to run the script. 109 | + You can use the script to monitor the files stored in the web app and scan the added files. 110 | + If you are in the Blue Team at a CCDC competition, you can use this script to monitor your server and easily see which files modified. 111 | 112 | 113 | 114 | 115 | ## :compass: Roadmap 116 | 117 | * [x] scan x32 PE files 118 | * [ ] scan x64 PE files 119 | 120 | 121 | ## :wave: Contributing 122 | 123 | 124 | 125 | 126 | 127 | 128 | Contributions are always welcome! 129 | 130 | 131 | 132 | ## :handshake: Contact 133 | 134 | kacem hakim - [@DARNY](https://twitter.com/darny74258511) - da4nyyy@proton.me 135 | 136 | Project Link: [https://github.com/](https://github.com/da4nyy/ANTIVIRUSxML) 137 | 138 | 139 | ## :gem: Acknowledgements 140 | 141 | - [kaggle mai dali](https://www.kaggle.com/code/maidaly/malware-detection-with-machine-learning) 142 | - [Te-k](https://github.com/Te-k) 143 | - [MaksimEkin- file integrity monitor](https://github.com/MaksimEkin/) 144 | - [Awesome README](https://github.com/matiassingers/awesome-readme) 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /antivirusXml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | 5 | """ 6 | File Integrity Monitor with malware detection 7 | 8 | @author: DA4NY 9 | """ 10 | 11 | # LIBRARIES NEEDED 12 | import subprocess 13 | import os 14 | import sys 15 | import pickle 16 | import datetime 17 | import hashlib 18 | import logging 19 | import time 20 | import signal 21 | from time import sleep 22 | import dictdiffer 23 | from progress.bar import Bar 24 | 25 | #================== 26 | #change here ! 27 | #================== 28 | 29 | 30 | import sys 31 | import getopt 32 | 33 | 34 | import getopt 35 | import sys 36 | 37 | def get_args(argv): 38 | arg_input = "" 39 | arg_output = "" 40 | 41 | arg_help = "{0} -i -o ".format(argv[0]) 42 | 43 | try: 44 | opts, args = getopt.getopt(argv[1:], "hi:o", ["help", "input=", "output="]) 45 | except: 46 | print(arg_help) 47 | sys.exit(2) 48 | 49 | for opt, arg in opts: 50 | if opt in ("-h", "--help"): 51 | print(arg_help) 52 | sys.exit(2) 53 | elif opt in ("-i", "--input"): 54 | arg_input = arg 55 | print('input directory:', arg_input) 56 | 57 | elif opt in ("-o", "--output"): 58 | arg_output = arg 59 | print('output directory:', arg_output) 60 | 61 | 62 | l = [arg_input,arg_output] 63 | return l 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | #=============== 77 | # Colors 78 | #=============== 79 | # Normal 80 | black="\033[0;30m" 81 | red="\033[0;31m" 82 | green="\033[0;32m" 83 | yellow="\033[0;33m" 84 | blue="\033[0;34m" 85 | purple="\033[0;35m" 86 | cyan="\033[0;36m" 87 | white="\033[0;37m" 88 | # Bold 89 | bblack="\033[1;30m" 90 | bred="\033[1;31m" 91 | bgreen="\033[1;32m" 92 | byellow="\033[1;33m" 93 | bblue="\033[1;34m" 94 | bpurple="\033[1;35m" 95 | bcyan="\033[1;36m" 96 | bwhite="\033[1;37m" 97 | 98 | 99 | #====================== 100 | #printing Banner 101 | #====================== 102 | 103 | def banner(): 104 | logo=''' 105 | '''+byellow+''' 106 | '''''' mmmm mm mm mm mm m 107 | '''''' # "m ## m"# #"m # "m m" 108 | '''''' # # # # #" # # #m # "#" 109 | '''''' # # #mm# #mmm#m # # # # 110 | '''''' #mmm" # # # # ## # 111 | '''''' 112 | 113 | '''+byellow+''' ++ File Integrity Monitor ith Malware Detection ++ 114 | 115 | '''+bblue+''' 116 | *) Creates alerts for: 117 | - added files 118 | - removed files 119 | - changed files 120 | 121 | *) detect if the added / changed file is a malware 122 | 123 | *) checks the signature of the file with virustotal api 124 | 125 | '''+red+''' 126 | @author: DA4NY 127 | 128 | '''+bwhite+'''''' 129 | print(logo) 130 | 131 | 132 | 133 | #========================== 134 | #Count all files in the directory and its subdirectories 135 | #we will use it in the progress bar 136 | #========================= 137 | 138 | def count(SCAN_DIR): 139 | var = 0 140 | for dirName, subdirList, fileList in os.walk(SCAN_DIR): 141 | 142 | if (list_to_ignore): 143 | for ignore in list_to_ignore: 144 | 145 | # if ignore in the list 146 | if (ignore in fileList): 147 | fileList.remove(ignore) 148 | var+=1 149 | return var 150 | 151 | 152 | #********************* 153 | #scanning files 154 | #********************* 155 | def scan_files(SCAN_DIR, list_to_ignore, LOG_FILE): 156 | 157 | try: 158 | # hold directories and files 159 | files = dict() 160 | 161 | 162 | 163 | # recursively walk to directory tree and get files 164 | with Bar('Scanning Files ...',max=count(SCAN_DIR)) as bar: #using a progress bar while scanning the files 165 | for dirName, subdirList, fileList in os.walk(SCAN_DIR): 166 | 167 | if (list_to_ignore): 168 | for ignore in list_to_ignore: 169 | 170 | # if ignore in the list 171 | if (ignore in fileList): 172 | fileList.remove(ignore) 173 | 174 | files[str(dirName)] = fileList 175 | sleep(0.02) 176 | bar.next() 177 | 178 | return files 179 | 180 | except Exception as e: 181 | msg="Error in scanning files and dirs !" 182 | logging.exception(msg) 183 | 184 | 185 | 186 | 187 | #storing hashes 188 | 189 | def save_hash(dictionary, file, LOG_FILE): 190 | 191 | try: 192 | # open the file to use to save the dictionary 193 | initial_scan_file = open(file, "wb") 194 | 195 | # use pickle to save the dictionary 196 | pickle.dump(dictionary, initial_scan_file) 197 | 198 | # close the file 199 | initial_scan_file.close 200 | 201 | except Exception as e: 202 | msg="Error while saving the dictionary" 203 | logging.exception(msg) 204 | 205 | 206 | 207 | 208 | # Load dictionary of hashes 209 | 210 | def load_dict(file, LOG_FILE): 211 | 212 | try: 213 | # open the pickle file to load 214 | infile = open(file, 'rb') 215 | 216 | # use pickle to load the dictionary 217 | loaded_dict = pickle.load(infile) 218 | 219 | # close the file 220 | infile.close() 221 | 222 | return loaded_dict 223 | 224 | except Exception as e: 225 | log(LOG_FILE, \ 226 | "Error while loading the dictionary") 227 | 228 | 229 | 230 | 231 | # Log events 232 | 233 | def log(log_dir, message): 234 | 235 | # get time 236 | currentDT = datetime.datetime.now() 237 | 238 | # log event 239 | file = open(log_dir, "a+") 240 | file.write(str(message) + \ 241 | " --- Time: " + \ 242 | str(currentDT.strftime("%Y-%m-%d %H:%M:%S")) + \ 243 | "\n") 244 | file.close 245 | 246 | 247 | def log_change(log_dir, message): 248 | 249 | # get time 250 | currentDT = datetime.datetime.now() 251 | 252 | # log event test 253 | file = open(log_dir, "a+") 254 | file.write(str(message) + \ 255 | " --- Time: " + \ 256 | str(currentDT.strftime("%Y-%m-%d %H:%M:%S")) + \ 257 | "\n") 258 | file.close 259 | print(red,message,white) 260 | 261 | 262 | 263 | 264 | # Take SHA256 of each file 265 | # hash is taken in blocks, this is done to ensure large files doens't fail 266 | 267 | def calculate_hash(directory, LOG_FILE): 268 | 269 | try: 270 | # use hash libraries sha 256 271 | sha256_hash = hashlib.sha256() 272 | 273 | # take hash 274 | with open(directory,"rb") as f: 275 | 276 | # Read and update hash string value in blocks of 4K 277 | for byte_block in iter(lambda: f.read(4096),b""): 278 | sha256_hash.update(byte_block) 279 | 280 | # return the hash 281 | return sha256_hash.hexdigest() 282 | 283 | except Exception as e: 284 | log(LOG_FILE,"Error while taking the hash values") 285 | 286 | 287 | 288 | 289 | # integrity FUNCTION 290 | 291 | def integrity(): 292 | 293 | 294 | #printing the directory to scan 295 | print("DIRECTORY TO MONITOR :{} ".format(SCAN_DIRECTORY)) 296 | 297 | # start the initial scan 298 | log(LOG_FILE, "Starting the initial scan...") 299 | 300 | 301 | INITIAL_FILE_HASHES = scan() 302 | 303 | # save the initial scan dictionary of hashes 304 | save_hash(INITIAL_FILE_HASHES, \ 305 | SCAN_STORAGE,\ 306 | LOG_FILE) 307 | log(LOG_FILE, "Initial scan completed!") 308 | 309 | 310 | # start the integrity check 311 | log(LOG_FILE, "Starting the integrity check...") 312 | 313 | while True: 314 | 315 | # get the file hashes 316 | new_hash = scan() 317 | 318 | # load the old hash 319 | old_hash = load_dict(SCAN_STORAGE,\ 320 | LOG_FILE) 321 | 322 | # compare two dict of hashes 323 | for diff in list(dictdiffer.diff(old_hash, new_hash)): 324 | # ALERT 325 | 326 | log_change(ALERT_FILE, diff) 327 | malware_detection(diff) 328 | # save the new hash 329 | save_hash(new_hash, \ 330 | SCAN_STORAGE, 331 | LOG_FILE) 332 | 333 | # wait 334 | sleep(sleep_time_sc) 335 | 336 | 337 | 338 | 339 | # Scan the directory tree and take hash of the files 340 | # Return a dictionary of hashes and file paths 341 | 342 | def scan(): 343 | 344 | # get dictonary of directories and files they contain 345 | directories = scan_files(SCAN_DIRECTORY, \ 346 | list_to_ignore, \ 347 | LOG_FILE) 348 | 349 | # take hash 350 | file_hashes = dict() 351 | for path, files in directories.items(): 352 | 353 | # look at each file at path 354 | for file in files: 355 | 356 | # get the full path name to the file 357 | file_dir = str(path) + "/" + str(file) 358 | 359 | # store the hash of the file 360 | file_hashes[file_dir] = calculate_hash(file_dir, \ 361 | LOG_FILE) 362 | 363 | 364 | # return dictionary with files path and hashes 365 | return file_hashes 366 | 367 | 368 | 369 | 370 | 371 | def malware_detection(diff): 372 | with open(ALERT_FILE, "r+") as alert_file: 373 | for line in alert_file: 374 | pass 375 | 376 | test = line.split("'") 377 | if test[1]== "change": 378 | file_to_scan = test[3] 379 | if test[1]=="add": 380 | file_to_scan = test[5] 381 | 382 | print(bgreen,"[+] Scanning {} ...".format(file_to_scan),bwhite) 383 | if file_to_scan[0]== ".": 384 | extention = file_to_scan[1:] 385 | try : 386 | if extention.split(".")[1] =="exe": 387 | 388 | try : 389 | subprocess.call(['python3','Mal-detection.py', file_to_scan]) 390 | except : 391 | print(bred ,"[x] Failed to run the Malware detection !!!",bwhite ) 392 | else: 393 | print(bred,"[x] The file isn't a windows executable !!! Currently we can only can windows x32 files !") 394 | print(bgreen,"[+] Trying the virus total api ... ",bwhite) 395 | api_virus_total(file_to_scan) 396 | except: 397 | print(bred,"[x] The file isn't a windows executable !!! Currently we can only can windows x32 files !") 398 | print(bgreen,"[+] Trying the virus total api ... ",bwhite) 399 | api_virus_total(file_to_scan) 400 | 401 | 402 | 403 | 404 | 405 | def api_virus_total(file): 406 | subprocess.call(['python3','virustotal.py','-m',file]) 407 | def hand_sign(signum, frame): 408 | res = input("Ctrl-c was pressed. Do you really want to exit? y/n :") 409 | if res == 'y': 410 | print(red,"[x] Quitting!\n",bgreen,"[+] Saving the results in {} ".format(ALERT_FILE)) 411 | exit(1) 412 | 413 | signal.signal(signal.SIGINT, hand_sign) 414 | 415 | # execute 416 | if __name__ == "__main__": 417 | l=[] 418 | SCAN_DIRECTORY = '.' 419 | ALERT_FILE = 'alert.log' 420 | l= get_args(sys.argv) 421 | if l[0]!="": 422 | SCAN_DIRECTORY = l[0] 423 | if l[1]!="": 424 | ALERT_FILE = l[1] 425 | 426 | 427 | 428 | SCAN_STORAGE = 'hashes.pkl' 429 | LOG_FILE = 'handler.log' 430 | list_to_ignore=[SCAN_STORAGE, LOG_FILE, ALERT_FILE] 431 | sleep_time_sc=4 432 | print(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))+"\n") 433 | #Starting the integrity monitor 434 | banner() 435 | integrity() 436 | 437 | 438 | -------------------------------------------------------------------------------- /classifier/classifier.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da4nyy/ANTIVIRUSxML/ff2d4b1cf1c0f71241b99e24505cba458fd81998/classifier/classifier.pkl -------------------------------------------------------------------------------- /classifier/features.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da4nyy/ANTIVIRUSxML/ff2d4b1cf1c0f71241b99e24505cba458fd81998/classifier/features.pkl -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hashlib 2 | logging 3 | dictdiffer 4 | progress 5 | -------------------------------------------------------------------------------- /virustotal.py: -------------------------------------------------------------------------------- 1 | # upload PE file to VirusTotal 2 | # then get info about the results 3 | # of analysis, print if malicious 4 | import os 5 | import sys 6 | import time 7 | import json 8 | import requests 9 | import argparse 10 | import hashlib 11 | 12 | # for terminal colors 13 | class Colors: 14 | BLUE = '\033[94m' 15 | GREEN = '\033[92m' 16 | YELLOW = '\033[93m' 17 | RED = '\033[91m' 18 | PURPLE = '\033[95m' 19 | ENDC = '\033[0m' 20 | 21 | # VirusTotal API key 22 | VT_API_KEY = "< PUT UR KEY HERE >" 23 | 24 | # VirusTotal API v3 URL 25 | VT_API_URL = "https://www.virustotal.com/api/v3/" 26 | 27 | # upload malicious file to VirusTotal and analyse 28 | class VTScan: 29 | def __init__(self): 30 | self.headers = { 31 | "x-apikey" : VT_API_KEY, 32 | "User-Agent" : "vtscan v.1.0", 33 | "Accept-Encoding" : "gzip, deflate", 34 | } 35 | 36 | def upload(self, malware_path): 37 | print (Colors.BLUE + "upload file: " + malware_path + "..." + Colors.ENDC) 38 | self.malware_path = malware_path 39 | upload_url = VT_API_URL + "files" 40 | files = {"file" : ( 41 | os.path.basename(malware_path), 42 | open(os.path.abspath(malware_path), "rb")) 43 | } 44 | print (Colors.YELLOW + "upload to " + upload_url + Colors.ENDC) 45 | res = requests.post(upload_url, headers = self.headers, files = files) 46 | if res.status_code == 200: 47 | result = res.json() 48 | self.file_id = result.get("data").get("id") 49 | print (Colors.YELLOW + self.file_id + Colors.ENDC) 50 | print (Colors.GREEN + "successfully upload PE file: OK" + Colors.ENDC) 51 | else: 52 | print (Colors.RED + "failed to upload PE file :(" + Colors.ENDC) 53 | print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC) 54 | sys.exit() 55 | 56 | def analyse(self): 57 | print (Colors.BLUE + "get info about the results of analysis..." + Colors.ENDC) 58 | analysis_url = VT_API_URL + "analyses/" + self.file_id 59 | res = requests.get(analysis_url, headers = self.headers) 60 | if res.status_code == 200: 61 | result = res.json() 62 | status = result.get("data").get("attributes").get("status") 63 | if status == "completed": 64 | stats = result.get("data").get("attributes").get("stats") 65 | results = result.get("data").get("attributes").get("results") 66 | print (Colors.RED + "malicious: " + str(stats.get("malicious")) + Colors.ENDC) 67 | print (Colors.YELLOW + "undetected : " + str(stats.get("undetected")) + Colors.ENDC) 68 | print () 69 | for k in results: 70 | if results[k].get("category") == "malicious": 71 | print ("==================================================") 72 | print (Colors.GREEN + results[k].get("engine_name") + Colors.ENDC) 73 | print ("version : " + results[k].get("engine_version")) 74 | print ("category : " + results[k].get("category")) 75 | print ("result : " + Colors.RED + results[k].get("result") + Colors.ENDC) 76 | print ("method : " + results[k].get("method")) 77 | print ("update : " + results[k].get("engine_update")) 78 | print ("==================================================") 79 | print () 80 | print (Colors.GREEN + "successfully analyse: OK" + Colors.ENDC) 81 | sys.exit() 82 | elif status == "queued": 83 | print (Colors.BLUE + "status QUEUED..." + Colors.ENDC) 84 | with open(os.path.abspath(self.malware_path), "rb") as malware_path: 85 | b = malware_path.read() 86 | hashsum = hashlib.sha256(b).hexdigest() 87 | self.info(hashsum) 88 | else: 89 | print (Colors.RED + "failed to get results of analysis :(" + Colors.ENDC) 90 | print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC) 91 | sys.exit() 92 | 93 | def run(self, malware_path): 94 | self.upload(malware_path) 95 | self.analyse() 96 | 97 | def info(self, file_hash): 98 | print (Colors.BLUE + "get file info by ID: " + file_hash + Colors.ENDC) 99 | info_url = VT_API_URL + "files/" + file_hash 100 | res = requests.get(info_url, headers = self.headers) 101 | if res.status_code == 200: 102 | result = res.json() 103 | if result.get("data").get("attributes").get("last_analysis_results"): 104 | stats = result.get("data").get("attributes").get("last_analysis_stats") 105 | results = result.get("data").get("attributes").get("last_analysis_results") 106 | print (Colors.RED + "malicious: " + str(stats.get("malicious")) + Colors.ENDC) 107 | print (Colors.YELLOW + "undetected : " + str(stats.get("undetected")) + Colors.ENDC) 108 | print () 109 | for k in results: 110 | if results[k].get("category") == "malicious": 111 | print ("==================================================") 112 | print (Colors.GREEN + results[k].get("engine_name") + Colors.ENDC) 113 | print ("version : " + results[k].get("engine_version")) 114 | print ("category : " + results[k].get("category")) 115 | print ("result : " + Colors.RED + results[k].get("result") + Colors.ENDC) 116 | print ("method : " + results[k].get("method")) 117 | print ("update : " + results[k].get("engine_update")) 118 | print ("==================================================") 119 | print () 120 | print (Colors.GREEN + "successfully analyse: OK" + Colors.ENDC) 121 | sys.exit() 122 | else: 123 | print (Colors.BLUE + "failed to analyse :(..." + Colors.ENDC) 124 | 125 | else: 126 | print (Colors.RED + "failed to get information :(" + Colors.ENDC) 127 | print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC) 128 | sys.exit() 129 | def hand_sign(signum, frame): 130 | res = input("Ctrl-c was pressed. Do you really want to exit? y/n ") 131 | if res == 'y': 132 | print(red,"QUitting !") 133 | exit(1) 134 | if __name__ == "__main__": 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument('-m','--mal', required = True, help = "PE file path for scanning") 137 | args = vars(parser.parse_args()) 138 | vtscan = VTScan() 139 | vtscan.run(args["mal"]) 140 | --------------------------------------------------------------------------------