├── Mal-detection-learning.py
├── Mal-detection.py
├── README.md
├── antivirusXml.py
├── classifier
    ├── classifier.pkl
    └── features.pkl
├── data.csv
├── requirements.txt
└── virustotal.py


/Mal-detection-learning.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """antivirus-learning-phase.ipynb
  3 | """
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import seaborn as sns
  8 | import sklearn.ensemble as ske
  9 | from sklearn.feature_selection import SelectFromModel
 10 | import matplotlib.pyplot as plt
 11 | from sklearn.model_selection import train_test_split
 12 | from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 13 | from sklearn.tree import DecisionTreeClassifier
 14 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
 15 | from sklearn.neighbors import KNeighborsClassifier
 16 | from sklearn.linear_model import SGDClassifier
 17 | from sklearn.naive_bayes import GaussianNB
 18 | from sklearn import preprocessing
 19 | from sklearn import utils
 20 | import joblib
 21 | import sys
 22 | import pickle
 23 | sys.modules['sklearn.externals.joblib'] = joblib
 24 | 
 25 | 
 26 | 
 27 | data = pd.read_csv('data.csv',sep="|")
 28 | data.head()
 29 | 
 30 | 
 31 | data.isnull().sum()
 32 | 
 33 | colomuns = ["LoaderFlags","NumberOfRvaAndSizes","SectionsNb","SectionsMeanEntropy","SectionsMinEntropy","SectionsMaxEntropy","SectionsMeanRawsize","SectionMaxRawsize","SectionsMeanVirtualsize","SectionsMinVirtualsize","SectionMaxVirtualsize","ImportsNbDLL","ImportsNb","ImportsNbOrdinal","ExportNb","ResourcesNb","ResourcesMeanEntropy","ResourcesMinEntropy","ResourcesMaxEntropy","ResourcesMeanSize","ResourcesMinSize","ResourcesMaxSize","LoadConfigurationSize","VersionInformationSize","legitimate"]
 34 | for c in colomuns:  
 35 |   m=round(data[c].mean(),2)
 36 |   data= data.fillna(m)
 37 | 
# Feature matrix: every column except the two identifier columns and the label.
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
# Target vector: the 'legitimate' column.
y = data['legitimate'].values

data.dtypes

# Class balance plot of the label column.
sns.countplot(x='legitimate', data=data);

# Fit an extra-trees ensemble on label-encoded targets; its feature
# importances drive the selection below.
ex = ExtraTreesClassifier()
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

fsel = ex.fit(X,y_transformed)
# prefit=True: reuse the already fitted estimator instead of refitting it.
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

# 80/20 split; no random_state is given, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

# Indices of the nb_features most important features, most important first.
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    # The +2 offset assumes 'Name' and 'md5' are the first two columns of
    # data.csv (they were dropped from X) -- TODO confirm against the CSV header.
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))

# Store the selected feature names in ascending column-index order.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])
 66 | 
# Candidate classifiers, keyed by the name used in the printed report.
algorithms = {
        "DecisionTree": DecisionTreeClassifier(max_depth=10),
        "RandomForest": RandomForestClassifier(n_estimators=50),
        "AdaBoost": AdaBoostClassifier(n_estimators=100),
        "GNB": GaussianNB()
    }

results = {}
accuracy_test = []
# NOTE: rebinds 'model' (previously the SelectFromModel instance) to an empty list.
model = []
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    # Train on label-encoded targets.
    lab = preprocessing.LabelEncoder()
    y_transformed = lab.fit_transform(y_train)
    clf.fit(X_train, y_transformed)
    pred = clf.predict(X_test)
    # Scored against the raw y_test values; this presumes the raw labels
    # already equal their encoded form (e.g. 0/1) -- TODO confirm.
    score = clf.score(X_test, y_test)
    results[algo] = score
    print("%s : %f %%" % (algo, score*100))
    acc = accuracy_score(pred, y_test)
    accuracy_test.append(acc)
    print('Test Accuracy :\033[32m \033[01m {:.5f}% \033[30m \033[0m'.format(acc*100))
    print('\033[01m              Classification_report \033[0m')
    print(classification_report(y_test, pred))
    print('\033[01m             Confusion_matrix \033[0m')
    cf_matrix = confusion_matrix(y_test, pred)
    # Heatmap of the confusion matrix as fractions of all test samples.
    plot_ = sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,fmt= '0.2%')
    plt.show()
    print('\033[31m###################- End -###################\033[0m')

# The winner is the algorithm with the highest test score.
winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
100 | 
# Save the algorithm and the feature list for later predictions.
# Both files go into the 'classifier' directory, which is where
# Mal-detection.py looks for them (classifier/classifier.pkl and
# classifier/features.pkl).
import os

print('Saving algorithm and feature list in classifier directory...')
os.makedirs('classifier', exist_ok=True)
joblib.dump(algorithms[winner], os.path.join('classifier', 'classifier.pkl'))
# Use a context manager so the pickle is flushed and closed deterministically.
with open(os.path.join('classifier', 'features.pkl'), 'wb') as features_file:
    pickle.dump(features, features_file)
print('Saved')
106 | 


--------------------------------------------------------------------------------
/Mal-detection.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | import pefile
  3 | import os
  4 | import array
  5 | import math
  6 | import pickle
  7 | import joblib
  8 | import sys
  9 | import argparse
 10 | 
 11 | def get_entropy(data):
 12 |     if len(data) == 0:
 13 |         return 0.0
 14 |     occurences = array.array('L', [0]*256)
 15 |     for x in data:
 16 |         occurences[x if isinstance(x, int) else ord(x)] += 1
 17 | 
 18 |     entropy = 0
 19 |     for x in occurences:
 20 |         if x:
 21 |             p_x = float(x) / len(data)
 22 |             entropy -= p_x*math.log(p_x, 2)
 23 | 
 24 |     return entropy
 25 | 
 26 | def get_resources(pe):
 27 |     """Extract resources :
 28 |     [entropy, size]"""
 29 |     resources = []
 30 |     if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
 31 |         try:
 32 |             for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
 33 |                 if hasattr(resource_type, 'directory'):
 34 |                     for resource_id in resource_type.directory.entries:
 35 |                         if hasattr(resource_id, 'directory'):
 36 |                             for resource_lang in resource_id.directory.entries:
 37 |                                 data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
 38 |                                 size = resource_lang.data.struct.Size
 39 |                                 entropy = get_entropy(data)
 40 | 
 41 |                                 resources.append([entropy, size])
 42 |         except Exception as e:
 43 |             return resources
 44 |     return resources
 45 | 
 46 | def get_version_info(pe):
 47 |     """Return version infos"""
 48 |     res = {}
 49 |     for fileinfo in pe.FileInfo:
 50 |         if fileinfo.Key == 'StringFileInfo':
 51 |             for st in fileinfo.StringTable:
 52 |                 for entry in st.entries.items():
 53 |                     res[entry[0]] = entry[1]
 54 |         if fileinfo.Key == 'VarFileInfo':
 55 |             for var in fileinfo.Var:
 56 |                 res[var.entry.items()[0][0]] = var.entry.items()[0][1]
 57 |     if hasattr(pe, 'VS_FIXEDFILEINFO'):
 58 |           res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
 59 |           res['os'] = pe.VS_FIXEDFILEINFO.FileOS
 60 |           res['type'] = pe.VS_FIXEDFILEINFO.FileType
 61 |           res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
 62 |           res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
 63 |           res['signature'] = pe.VS_FIXEDFILEINFO.Signature
 64 |           res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
 65 |     return res
 66 | 
 67 | def extract_infos(fpath):
 68 |     res = {}
 69 |     pe = pefile.PE(fpath)
 70 |     res['Machine'] = pe.FILE_HEADER.Machine
 71 |     res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
 72 |     res['Characteristics'] = pe.FILE_HEADER.Characteristics
 73 |     res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
 74 |     res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
 75 |     res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
 76 |     res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
 77 |     res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
 78 |     res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
 79 |     res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
 80 |     try:
 81 |         res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
 82 |     except AttributeError:
 83 |         res['BaseOfData'] = 0
 84 |     res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
 85 |     res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
 86 |     res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
 87 |     res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
 88 |     res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
 89 |     res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
 90 |     res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
 91 |     res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
 92 |     res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
 93 |     res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
 94 |     res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
 95 |     res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
 96 |     res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
 97 |     res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
 98 |     res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
 99 |     res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
100 |     res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
101 |     res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
102 |     res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
103 |     res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes
104 | 
105 |     # Sections
106 |     res['SectionsNb'] = len(pe.sections)
107 |     entropy = list(map(lambda x:x.get_entropy(), pe.sections))
108 |     res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
109 |     res['SectionsMinEntropy'] = min(entropy)
110 |     res['SectionsMaxEntropy'] = max(entropy)
111 | 
112 | 
113 |     raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
114 |     res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
115 |     res['SectionsMinRawsize'] = min(raw_sizes)
116 |     res['SectionsMaxRawsize'] = max(raw_sizes)
117 |     virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
118 |     res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
119 |     res['SectionsMinVirtualsize'] = min(virtual_sizes)
120 |     res['SectionMaxVirtualsize'] = max(virtual_sizes)
121 | 
122 |     #Imports
123 |     try:
124 |         res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
125 |         imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
126 |         res['ImportsNb'] = len(imports)
127 |         res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports)))
128 |     except AttributeError:
129 |         res['ImportsNbDLL'] = 0
130 |         res['ImportsNb'] = 0
131 |         res['ImportsNbOrdinal'] = 0
132 | 
133 |     #Exports
134 |     try:
135 |         res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
136 |     except AttributeError:
137 |         # No export
138 |         res['ExportNb'] = 0
139 |     #Resources
140 |     resources= get_resources(pe)
141 |     res['ResourcesNb'] = len(resources)
142 |     if len(resources)> 0:
143 |         entropy = list(map(lambda x:x[0], resources))
144 |         res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
145 |         res['ResourcesMinEntropy'] = min(entropy)
146 |         res['ResourcesMaxEntropy'] = max(entropy)
147 |         sizes = list(map(lambda x:x[1], resources))
148 |         res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
149 |         res['ResourcesMinSize'] = min(sizes)
150 |         res['ResourcesMaxSize'] = max(sizes)
151 |     else:
152 |         res['ResourcesNb'] = 0
153 |         res['ResourcesMeanEntropy'] = 0
154 |         res['ResourcesMinEntropy'] = 0
155 |         res['ResourcesMaxEntropy'] = 0
156 |         res['ResourcesMeanSize'] = 0
157 |         res['ResourcesMinSize'] = 0
158 |         res['ResourcesMaxSize'] = 0
159 | 
160 |     # Load configuration size
161 |     try:
162 |         res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
163 |     except AttributeError:
164 |         res['LoadConfigurationSize'] = 0
165 | 
166 | 
167 |     # Version configuration size
168 |     try:
169 |         version_infos = get_version_info(pe)
170 |         res['VersionInformationSize'] = len(version_infos.keys())
171 |     except AttributeError:
172 |         res['VersionInformationSize'] = 0
173 |     return res
174 | 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()

    # Both model artefacts live next to this script, so resolve them from the
    # script's own directory rather than the current working directory.
    classifier_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'classifier'
    )

    # Load classifier
    clf = joblib.load(os.path.join(classifier_dir, 'classifier.pkl'))

    # Load the ordered list of feature names the classifier was trained on.
    # NOTE: pickle/joblib files must come from a trusted source; loading them
    # can execute arbitrary code.
    with open(os.path.join(classifier_dir, 'features.pkl'), 'rb') as f:
        features = pickle.load(f)

    data = extract_infos(args.FILE)
    pe_features = [data[name] for name in features]

    # The prediction is used as an index: 0 -> malicious, 1 -> legitimate.
    res = int(clf.predict([pe_features])[0])
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][res])
    )


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | <div align="center">
  4 | 
  5 |   <h1>ANTIVIRUSXML</h1>
  6 |   
  7 |   <p>
  8 |       File Integrity Monitor with Malware detection using Machine learning 
  9 |       <br>
 10 |   </p>
 11 | 
 12 |   
 13 | 
 14 |    
 15 | 
 16 | </div>
 17 | 
 18 | <br />
 19 | 
 20 | <!-- Table of Contents -->
 21 | # :notebook_with_decorative_cover: Table of Contents
 22 | 
 23 | - [About the Project](#star2-about-the-project)
 24 | - [Getting Started](#toolbox-getting-started)
 25 |   * [Prerequisites](#bangbang-prerequisites)
 26 |   * [Installation](#gear-installation)
 27 | - [Usage](#eyes-usage)
 28 | - [Contact](#handshake-contact)
 29 | - [Acknowledgements](#gem-acknowledgements)
 30 |   
 31 | 
 32 | <!-- About the Project -->
 33 | ## :star2: About the Project
 34 | 
 35 | Overview
 36 | ============
 37 | This Python project combines two components:
 38 | 1. A basic file integrity monitor: it takes two arguments, a directory to scan and an output file for alerts. The script recursively scans the given directory and its subdirectories and creates alerts for any added, removed, or changed files. It uses the os, sys, and pickle libraries for file system operations, as well as the datetime, hashlib, logging, and time libraries for other operations.
 39 | 2. Malware detection using machine learning: it trains a classifier to label [PE files](https://en.wikipedia.org/wiki/Portable_Executable) as either malicious or legitimate. It tries out several classification algorithms (Decision Tree, Random Forest, AdaBoost and Gaussian Naive Bayes) and keeps the one with the best results for prediction. 
 40 | 
 41 | 
 42 | <div align="center"> 
 43 |   <img src="https://user-images.githubusercontent.com/117517618/206859303-8042f133-5fa3-493a-9b3b-b65995f4ecef.png" alt="screenshot" />
 44 | </div>
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | 
 52 | 
 53 | <!-- Getting Started -->
 54 | ## 	:toolbox: Getting Started
 55 | 
 56 | <!-- Prerequisites -->
 57 | ### :bangbang: Prerequisites
 58 | 
 59 | This project depends on several libraries that you need to install first:
 60 | 
 61 | ```bash
 62 |  pip install -r requirements.txt
 63 | ```
 64 | 
 65 | <!-- Installation -->
 66 | ### :gear: Installation
 67 | 
 68 | 
 69 | ```bash
 70 |   git clone https://github.com/da4nyy/ANTIVIRUSxML/
 71 |   cd ANTIVIRUSxML
 72 | ```
 73 | 
 74 | 
 75 | <!-- Run Locally -->
 76 | ### :running: Run Locally
 77 | 
 78 | Clone the project
 79 | 
 80 | 
 81 | Go to the project directory
 82 | 
 83 | ```bash
 84 |   cd ANTIVIRUSxML/
 85 | ```
 86 | 
 87 | Install dependencies
 88 | 
 89 | ```bash
 90 |    pip install -r requirements.txt
 91 | ```
 92 | 
 93 | Train the model (you can skip this step: pre-trained files are already provided in the classifier directory)
 94 | 
 95 | ```bash
 96 |    python3 Mal-detection-learning.py
 97 | ```
 98 | 
 99 | Start the file monitor and malware detection handler
100 | 
101 | ```bash
102 |    python3 antivirusXml.py -i <input directory to monitor> -o <output file>  
103 | ```
104 | 
105 | <!-- Usage -->
106 | ## :eyes: Usage
107 | 
108 | + You may monitor the integrity of the files that may have PII. In this case, you can place the script where your files live, and create a crontab or use task scheduler to run the script.
109 | + You can use the script to monitor the files stored in the web app and scan the added files.
110 | + If you are on the Blue Team at a CCDC competition, you can use this script to monitor your server and easily see which files were modified.
111 | 
112 | 
113 | 
114 | <!-- Roadmap -->
115 | ## :compass: Roadmap
116 | 
117 | * [x] scan x32 PE files
118 | * [ ] scan x64 PE files
119 | 
120 | <!-- Contributing -->
121 | ## :wave: Contributing
122 | 
123 | <a href="https://github.com/da4nyy/ANTIVIRUSxML/graphs/contributors">
124 | 
125 | </a>
126 | 
127 | 
128 | Contributions are always welcome!
129 | 
130 | 
131 | <!-- Contact -->
132 | ## :handshake: Contact
133 | 
134 | kacem hakim - [@DARNY](https://twitter.com/darny74258511) - da4nyyy@proton.me
135 | 
136 | Project Link: [https://github.com/](https://github.com/da4nyy/ANTIVIRUSxML)
137 | 
138 | <!-- Acknowledgments -->
139 | ## :gem: Acknowledgements
140 | 
141 |  - [kaggle mai dali](https://www.kaggle.com/code/maidaly/malware-detection-with-machine-learning)
142 |  - [Te-k](https://github.com/Te-k)
143 |  - [MaksimEkin- file integrity monitor](https://github.com/MaksimEkin/)
144 |  - [Awesome README](https://github.com/matiassingers/awesome-readme)
145 | 
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/antivirusXml.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | 
  4 | 
  5 | """
  6 | File Integrity Monitor with malware detection
  7 | 
  8 | @author: DA4NY
  9 | """
 10 | 
 11 | # LIBRARIES NEEDED
 12 | import subprocess
 13 | import os
 14 | import sys
 15 | import pickle 
 16 | import datetime
 17 | import hashlib	
 18 | import logging
 19 | import time 
 20 | import signal
 21 | from time import sleep
 22 | import dictdiffer 
 23 | from progress.bar import Bar
 24 | 
 25 | #==================
 26 | #change here !
 27 | #==================
 28 | 
 29 | 
 30 | import sys
 31 | import getopt
 32 | 
 33 | 
 34 | import getopt
 35 | import sys
 36 | 
 37 | def get_args(argv):
 38 |     arg_input = ""
 39 |     arg_output = ""
 40 | 
 41 |     arg_help = "{0} -i <input directory> -o <output directory> ".format(argv[0])
 42 | 
 43 |     try:
 44 |         opts, args = getopt.getopt(argv[1:], "hi:o", ["help", "input=", "output="])
 45 |     except:
 46 |         print(arg_help)
 47 |         sys.exit(2)
 48 | 
 49 |     for opt, arg in opts:
 50 |         if opt in ("-h", "--help"):
 51 |             print(arg_help)
 52 |             sys.exit(2)
 53 |         elif opt in ("-i", "--input"):
 54 |             arg_input = arg
 55 |             print('input directory:', arg_input)
 56 | 
 57 |         elif opt in ("-o", "--output"):
 58 |             arg_output = arg
 59 |             print('output directory:', arg_output)
 60 | 
 61 | 
 62 |     l = [arg_input,arg_output]
 63 |     return l
 64 | 
 65 | 
 66 | 
 67 | 
 68 |     
 69 | 
 70 | 
 71 | 
 72 | 
 73 | 
 74 | 
 75 | 
 76 | #===============
 77 | # Colors
 78 | #===============
# ANSI escape sequences used to colour terminal output.
# Normal intensity (attribute 0), foreground codes 30-37.
black="\033[0;30m"
red="\033[0;31m"
green="\033[0;32m"
yellow="\033[0;33m"  
blue="\033[0;34m"
purple="\033[0;35m"
cyan="\033[0;36m"
white="\033[0;37m"
# Bold (attribute 1), same foreground codes.
bblack="\033[1;30m"
bred="\033[1;31m"
bgreen="\033[1;32m"
byellow="\033[1;33m"
bblue="\033[1;34m"
bpurple="\033[1;35m"
bcyan="\033[1;36m"
bwhite="\033[1;37m"
 97 | 
 98 | 
 99 | #======================
100 | #printing Banner
101 | #======================
102 | 
def banner():
    """Print the coloured start-up banner (logo, feature list and author)."""
    # The logo is assembled from adjacent triple-quoted literals; the colour
    # constants are spliced in between them.
    logo='''
'''+byellow+'''                                   
'''''' mmmm     mm      mm  mm   mm     m
'''''' #   "m   ##     m"#  #"m  # "m m" 
'''''' #    #  #  #   #" #  # #m #  "#"  
'''''' #    #  #mm#  #mmm#m #  # #   #   
'''''' #mmm"  #    #     #  #   ##   #   
''''''                                   
                                
'''+byellow+''' ++ File Integrity Monitor with Malware Detection ++ 

'''+bblue+''' 
*) Creates alerts for:
        - added files
        - removed files
        - changed files

*) detect if the added / changed file is a malware       

*) checks the signature of the file with virustotal api 

'''+red+''' 
@author: DA4NY

'''+bwhite+''''''
    print(logo)
130 | 
131 | 
132 | 
133 | #==========================
134 | #Count all files in the directory and its subdirectories
135 | #we will use it in the progress bar
136 | #=========================
137 | 
def count(SCAN_DIR):
    """Return how many directories ``os.walk`` visits under *SCAN_DIR*.

    The root directory itself is included.  The value is the number of
    steps of a directory walk, which is what the scan progress bar advances
    by (one step per directory, not per file).  A path that does not exist
    yields 0.
    """
    # One os.walk() item per visited directory; the file lists play no part
    # in the count, so no ignore-list filtering is needed here.
    return sum(1 for _ in os.walk(SCAN_DIR))
150 | 
151 | 
152 | #*********************
153 | #scanning files 
154 | #*********************
def scan_files(SCAN_DIR, list_to_ignore, LOG_FILE):
    """Walk *SCAN_DIR* and return ``{directory path: [file names]}``.

    File names listed in *list_to_ignore* are left out of every directory's
    list.  A progress bar advances once per visited directory.

    Args:
        SCAN_DIR: root directory to walk recursively.
        list_to_ignore: file names to skip; may be empty or None.
        LOG_FILE: unused here; kept for interface compatibility.

    Returns:
        The mapping collected so far.  If an error interrupts the walk it is
        logged through ``logging.exception`` and the partial mapping is
        returned, so callers always receive a dict.
    """
    files = dict()
    ignored = tuple(list_to_ignore or ())

    try:
        # recursively walk the directory tree, showing a progress bar
        with Bar('Scanning Files ...', max=count(SCAN_DIR)) as bar:
            for dirName, subdirList, fileList in os.walk(SCAN_DIR):
                files[str(dirName)] = [
                    name for name in fileList if name not in ignored
                ]
                # Short pause per directory; presumably only there to keep
                # the progress bar visible.
                sleep(0.02)
                bar.next()
    except Exception:
        msg = "Error in scanning files and dirs !"
        logging.exception(msg)

    return files
183 |         
184 |         
185 | 
186 | 
187 | #storing hashes
188 | 
def save_hash(dictionary, file, LOG_FILE):
    """Pickle *dictionary* into the file at path *file*, replacing its content.

    Args:
        dictionary: object to store (the path -> hash mapping).
        file: destination path.
        LOG_FILE: unused here; kept for interface compatibility.

    Errors are not propagated: they are reported with ``logging.exception``.
    """
    try:
        # The context manager guarantees the file is flushed and closed,
        # even if pickling fails part-way.
        with open(file, "wb") as initial_scan_file:
            pickle.dump(dictionary, initial_scan_file)
    except Exception:
        msg = "Error while saving the dictionary"
        logging.exception(msg)
204 | 
205 | 
206 | 
207 | 
208 | # Load dictionary of hashes
209 | 
def load_dict(file, LOG_FILE):
    """Load and return the pickled object stored at path *file*.

    Args:
        file: path of the pickle file to read.
        LOG_FILE: path of the log file that receives an error line on failure.

    Returns:
        The unpickled object, or ``None`` if the file could not be opened or
        unpickled (the failure is written to *LOG_FILE*).
    """
    try:
        # NOTE: pickle must only be used on files this program wrote itself;
        # unpickling untrusted data can execute arbitrary code.
        with open(file, 'rb') as infile:
            return pickle.load(infile)
    except Exception:
        log(LOG_FILE, \
            "Error while loading the dictionary")
        return None
227 |     
228 |     
229 | 
230 | 
231 | # Log events
232 | 
def log(log_dir, message):
    """Append *message* and the current local time as one line to a log file.

    Args:
        log_dir: path of the log file (created if missing).
        message: any object; it is converted with ``str``.
    """
    # get time
    currentDT = datetime.datetime.now()

    # log event; the context manager closes (and flushes) the file
    with open(log_dir, "a+") as file:
        file.write(str(message) + \
                   " --- Time: " + \
                   str(currentDT.strftime("%Y-%m-%d %H:%M:%S")) + \
                   "\n")
245 |     
246 | 
def log_change(log_dir, message):
    """Append *message* with a timestamp to the alert file and echo it in red.

    Args:
        log_dir: path of the alert file (created if missing).
        message: any object; it is converted with ``str`` for the file.
    """
    # get time
    currentDT = datetime.datetime.now()

    # log event; the context manager closes (and flushes) the file
    with open(log_dir, "a+") as file:
        file.write(str(message) + \
                   " --- Time: " + \
                   str(currentDT.strftime("%Y-%m-%d %H:%M:%S")) + \
                   "\n")
    print(red,message,white)
260 | 
261 | 
262 | 
263 | 
264 | # Take SHA256 of each file
265 | # hash is taken in blocks, this is done to ensure large files doens't fail
266 | 
def calculate_hash(directory, LOG_FILE):
    """Return the SHA-256 hex digest of the file at path *directory*.

    The file is read in 4096-byte blocks so large files are never loaded
    into memory at once.  On any error a line is written to *LOG_FILE* and
    ``None`` is returned.
    """
    try:
        digest = hashlib.sha256()
        with open(directory, "rb") as stream:
            while True:
                block = stream.read(4096)
                if block == b"":
                    break
                digest.update(block)
            return digest.hexdigest()
    except Exception:
        log(LOG_FILE,"Error while taking the hash values")
285 | 
286 | 
287 | 
288 | 
289 | # integrity FUNCTION
290 | 
def integrity():
    """Run the monitor: take a baseline scan, then poll for changes forever.

    Reads the module-level settings SCAN_DIRECTORY, LOG_FILE, SCAN_STORAGE,
    ALERT_FILE and sleep_time_sc.  Every difference between the stored and
    the fresh scan is written to the alert file and handed to
    malware_detection().  This function does not return.
    """
    # printing the directory to scan
    print("DIRECTORY TO MONITOR :{} ".format(SCAN_DIRECTORY))

    # baseline scan, persisted so the loop below has something to compare to
    log(LOG_FILE, "Starting the initial scan...")
    baseline = scan()
    save_hash(baseline, SCAN_STORAGE, LOG_FILE)
    log(LOG_FILE, "Initial scan completed!")

    log(LOG_FILE, "Starting the integrity check...")
    while True:
        current = scan()
        previous = load_dict(SCAN_STORAGE, LOG_FILE)

        # materialise all differences first, then alert on each one
        changes = list(dictdiffer.diff(previous, current))
        for change in changes:
            log_change(ALERT_FILE, change)
            malware_detection(change)

        # the fresh scan becomes the reference for the next round
        save_hash(current, SCAN_STORAGE, LOG_FILE)

        sleep(sleep_time_sc)
335 |         
336 | 
337 | 
338 | 
339 | # Scan the directory tree and take hash of the files 
340 | # Return a dictionary of hashes and file paths
341 | 
def scan():
    """Hash every file under SCAN_DIRECTORY and return ``{file path: hash}``.

    Paths are built as "<directory>/<file name>".  Uses the module-level
    SCAN_DIRECTORY, list_to_ignore and LOG_FILE settings.
    """
    # directory -> list of file names it contains
    tree = scan_files(SCAN_DIRECTORY, list_to_ignore, LOG_FILE)

    hashes = dict()
    for folder, names in tree.items():
        for name in names:
            full_path = "/".join((str(folder), str(name)))
            hashes[full_path] = calculate_hash(full_path, LOG_FILE)

    return hashes
366 | 
367 | 
368 | 
369 | 
370 | 
def _paths_to_scan(diff):
    """Return the file paths named by one dictdiffer entry that need a scan."""
    action, key, value = diff
    if action == "change":
        # NOTE(review): dictdiffer appears to report a key that contains a
        # dot as a list of path parts and other keys as a plain string --
        # confirm against the dictdiffer documentation.
        return [key[-1] if isinstance(key, (list, tuple)) else key]
    if action == "add":
        # value is a list of (path, hash) pairs, one per added file
        return [name for name, _file_hash in value]
    # "remove": the file is gone, there is nothing to scan
    return []


def malware_detection(diff):
    """Scan the file(s) referenced by one integrity alert.

    Args:
        diff: one entry produced by ``dictdiffer.diff`` on two
            ``{path: hash}`` mappings, i.e. ``(action, key, value)``.

    For every added or changed file: a name ending in ``.exe`` (any case) is
    passed to ``Mal-detection.py`` in a child process; any other file is
    submitted through ``api_virus_total``.  Removed files are ignored.

    The path is taken from *diff* itself instead of re-parsing the last line
    of the alert file, so removed files no longer raise, every file of a
    multi-file "add" entry is scanned, and absolute paths or names containing
    quotes or several dots are handled.
    """
    for file_to_scan in _paths_to_scan(diff):
        file_to_scan = str(file_to_scan)
        print(bgreen,"[+] Scanning {} ...".format(file_to_scan),bwhite)

        if os.path.splitext(file_to_scan)[1].lower() == ".exe":
            try:
                subprocess.call(['python3','Mal-detection.py', file_to_scan])
            except Exception:
                print(bred ,"[x] Failed to run the Malware detection !!!",bwhite )
        else:
            print(bred,"[x] The file isn't a windows executable !!! Currently we can only can windows x32 files !")
            print(bgreen,"[+] Trying the virus total api ... ",bwhite)
            api_virus_total(file_to_scan)
400 |       
401 | 
402 | 
403 |  
404 | 
def api_virus_total(file):
    """Run ``virustotal.py -m <file>`` with python3 in a child process."""
    command = ['python3', 'virustotal.py', '-m', file]
    subprocess.call(command)
def hand_sign(signum, frame):
    """SIGINT handler: ask for confirmation and exit with status 1 on 'y'.

    Any other answer returns from the handler so the program carries on.

    Args:
        signum: number of the received signal (unused).
        frame: interrupted stack frame (unused).
    """
    res = input("Ctrl-c was pressed. Do you really want to exit? y/n :")
    if res == 'y':
        print(red,"[x] Quitting!\n",bgreen,"[+] Saving the results in {} ".format(ALERT_FILE))
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module.
        sys.exit(1)


# Install the handler for Ctrl-C.
signal.signal(signal.SIGINT, hand_sign)
414 | 
415 | # execute
if __name__ == "__main__":
    # Defaults, replaced by the -i / -o command-line values when given.
    SCAN_DIRECTORY = '.'
    ALERT_FILE = 'alert.log'

    requested_dir, requested_alert = get_args(sys.argv)
    if requested_dir != "":
        SCAN_DIRECTORY = requested_dir
    if requested_alert != "":
        ALERT_FILE = requested_alert

    # Internal state files; they are excluded from the scan together with
    # the alert file.
    SCAN_STORAGE = 'hashes.pkl'
    LOG_FILE = 'handler.log'
    list_to_ignore = [SCAN_STORAGE, LOG_FILE, ALERT_FILE]

    # Seconds to wait between two integrity checks.
    sleep_time_sc = 4

    started_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(str(started_at)+"\n")

    # Starting the integrity monitor
    banner()
    integrity()
436 | 
437 |     
438 | 


--------------------------------------------------------------------------------
/classifier/classifier.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da4nyy/ANTIVIRUSxML/ff2d4b1cf1c0f71241b99e24505cba458fd81998/classifier/classifier.pkl


--------------------------------------------------------------------------------
/classifier/features.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da4nyy/ANTIVIRUSxML/ff2d4b1cf1c0f71241b99e24505cba458fd81998/classifier/features.pkl


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | hashlib
2 | logging
3 | dictdiffer
4 | progress
5 | 


--------------------------------------------------------------------------------
/virustotal.py:
--------------------------------------------------------------------------------
  1 | # upload PE file to VirusTotal
  2 | # then get info about the results
  3 | # of analysis, print if malicious
  4 | import os
  5 | import sys
  6 | import time
  7 | import json
  8 | import requests
  9 | import argparse
 10 | import hashlib
 11 | 
 12 | # for terminal colors
 13 | class Colors:
 14 |     BLUE = '\033[94m'
 15 |     GREEN = '\033[92m'
 16 |     YELLOW = '\033[93m'
 17 |     RED = '\033[91m'
 18 |     PURPLE = '\033[95m'
 19 |     ENDC = '\033[0m'
 20 | 
 21 | # VirusTotal API key
 22 | VT_API_KEY = "< PUT UR KEY HERE >"
 23 | 
 24 | # VirusTotal API v3 URL
 25 | VT_API_URL = "https://www.virustotal.com/api/v3/"
 26 | 
 27 | # upload malicious file to VirusTotal and analyse
 28 | class VTScan:
 29 |     def __init__(self):
 30 |         self.headers = {
 31 |             "x-apikey" : VT_API_KEY,
 32 |             "User-Agent" : "vtscan v.1.0",
 33 |             "Accept-Encoding" : "gzip, deflate",
 34 |         }
 35 | 
 36 |     def upload(self, malware_path):
 37 |         print (Colors.BLUE + "upload file: " + malware_path + "..." + Colors.ENDC)
 38 |         self.malware_path = malware_path
 39 |         upload_url = VT_API_URL + "files"
 40 |         files = {"file" : (
 41 |             os.path.basename(malware_path),
 42 |             open(os.path.abspath(malware_path), "rb"))
 43 |         }
 44 |         print (Colors.YELLOW + "upload to " + upload_url + Colors.ENDC)
 45 |         res = requests.post(upload_url, headers = self.headers, files = files)
 46 |         if res.status_code == 200:
 47 |             result = res.json()
 48 |             self.file_id = result.get("data").get("id")
 49 |             print (Colors.YELLOW + self.file_id + Colors.ENDC)
 50 |             print (Colors.GREEN + "successfully upload PE file: OK" + Colors.ENDC)
 51 |         else:
 52 |             print (Colors.RED + "failed to upload PE file :(" + Colors.ENDC)
 53 |             print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC)
 54 |             sys.exit()
 55 | 
 56 |     def analyse(self):
 57 |         print (Colors.BLUE + "get info about the results of analysis..." + Colors.ENDC)
 58 |         analysis_url = VT_API_URL + "analyses/" + self.file_id
 59 |         res = requests.get(analysis_url, headers = self.headers)
 60 |         if res.status_code == 200:
 61 |             result = res.json()
 62 |             status = result.get("data").get("attributes").get("status")
 63 |             if status == "completed":
 64 |                 stats = result.get("data").get("attributes").get("stats")
 65 |                 results = result.get("data").get("attributes").get("results")
 66 |                 print (Colors.RED + "malicious: " + str(stats.get("malicious")) + Colors.ENDC)
 67 |                 print (Colors.YELLOW + "undetected : " + str(stats.get("undetected")) + Colors.ENDC)
 68 |                 print ()
 69 |                 for k in results:
 70 |                     if results[k].get("category") == "malicious":
 71 |                         print ("==================================================")
 72 |                         print (Colors.GREEN + results[k].get("engine_name") + Colors.ENDC)
 73 |                         print ("version : " + results[k].get("engine_version"))
 74 |                         print ("category : " + results[k].get("category"))
 75 |                         print ("result : " + Colors.RED + results[k].get("result") + Colors.ENDC)
 76 |                         print ("method : " + results[k].get("method"))
 77 |                         print ("update : " + results[k].get("engine_update"))
 78 |                         print ("==================================================")
 79 |                         print ()
 80 |                 print (Colors.GREEN + "successfully analyse: OK" + Colors.ENDC)
 81 |                 sys.exit()
 82 |             elif status == "queued":
 83 |                 print (Colors.BLUE + "status QUEUED..." + Colors.ENDC)
 84 |                 with open(os.path.abspath(self.malware_path), "rb") as malware_path:
 85 |                     b = malware_path.read()
 86 |                     hashsum = hashlib.sha256(b).hexdigest()
 87 |                     self.info(hashsum)
 88 |         else:
 89 |             print (Colors.RED + "failed to get results of analysis :(" + Colors.ENDC)
 90 |             print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC)
 91 |             sys.exit()
 92 | 
 93 |     def run(self, malware_path):
 94 |         self.upload(malware_path)
 95 |         self.analyse()
 96 | 
 97 |     def info(self, file_hash):
 98 |         print (Colors.BLUE + "get file info by ID: " + file_hash + Colors.ENDC)
 99 |         info_url = VT_API_URL + "files/" + file_hash
100 |         res = requests.get(info_url, headers = self.headers)
101 |         if res.status_code == 200:
102 |             result = res.json()
103 |             if result.get("data").get("attributes").get("last_analysis_results"):
104 |                 stats = result.get("data").get("attributes").get("last_analysis_stats")
105 |                 results = result.get("data").get("attributes").get("last_analysis_results")
106 |                 print (Colors.RED + "malicious: " + str(stats.get("malicious")) + Colors.ENDC)
107 |                 print (Colors.YELLOW + "undetected : " + str(stats.get("undetected")) + Colors.ENDC)
108 |                 print ()
109 |                 for k in results:
110 |                     if results[k].get("category") == "malicious":
111 |                         print ("==================================================")
112 |                         print (Colors.GREEN + results[k].get("engine_name") + Colors.ENDC)
113 |                         print ("version : " + results[k].get("engine_version"))
114 |                         print ("category : " + results[k].get("category"))
115 |                         print ("result : " + Colors.RED + results[k].get("result") + Colors.ENDC)
116 |                         print ("method : " + results[k].get("method"))
117 |                         print ("update : " + results[k].get("engine_update"))
118 |                         print ("==================================================")
119 |                         print ()
120 |                 print (Colors.GREEN + "successfully analyse: OK" + Colors.ENDC)
121 |                 sys.exit()
122 |             else:
123 |                 print (Colors.BLUE + "failed to analyse :(..." + Colors.ENDC)
124 | 
125 |         else:
126 |             print (Colors.RED + "failed to get information :(" + Colors.ENDC)
127 |             print (Colors.RED + "status code: " + str(res.status_code) + Colors.ENDC)
128 |             sys.exit()
def hand_sign(signum, frame):
    """Signal-handler-shaped callback: confirm, then exit with status 1.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Any answer other than ``y`` (case and surrounding whitespace ignored)
    returns normally.

    NOTE(review): no ``signal.signal()`` call installing this handler is
    visible in this module -- confirm whether it is meant to be registered.
    """
    res = input("Ctrl-c was pressed. Do you really want to exit? y/n ")
    if res.strip().lower() == 'y':
        # ``red`` was never defined in this module (NameError on this path);
        # use the Colors palette and reset the colour afterwards.
        print(Colors.RED, "QUitting !", Colors.ENDC)
        # sys.exit rather than the ``exit`` helper injected by ``site``.
        sys.exit(1)
if __name__ == "__main__":
    # Command line: the required -m/--mal option names the file to submit.
    cli = argparse.ArgumentParser()
    cli.add_argument('-m', '--mal', required=True, help="PE file path for scanning")
    options = cli.parse_args()

    # Upload the file and print its analysis.
    VTScan().run(options.mal)
140 | 


--------------------------------------------------------------------------------