├── .gitignore ├── crawler.py ├── README.md ├── iconM.py ├── iconN.py ├── parserM.py └── parserN.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib2 3 | import webbrowser 4 | import time 5 | from bs4 import BeautifulSoup 6 | from appscript import * 7 | 8 | downloadLinks = 0 9 | 10 | 11 | for num in range(1500,2000): 12 | u = "http://download.cnet.com/windows/3150-20_4-0-" + str(num) + ".html?sort=reviewDate%20asc&tag=page" # from download.com 13 | page = urllib2.urlopen(u) 14 | soup = BeautifulSoup(page) 15 | 16 | for link in soup.findAll(href=re.compile("26dlm")): 17 | downloadLinks = downloadLinks + 1 18 | print "Number of download links: ", downloadLinks 19 | url = link.get('href') 20 | print(url) 21 | webbrowser.open(url) 22 | time.sleep(10) 23 | safari = app("Safari") 24 | safari.windows.first.current_tab.close() 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PE Header Parser: A PE Header-Based Antivirus Tool 2 | ================================================== 3 | 4 | 5 | 6 | In this project, I present a simple and faster apporach to distinguish between malware and legitimate .exe files by simply looking at the 
properties of the Windows Portable Executable (PE) headers, and develop a tool to detect malware among a large number of .exe files. We extract distinguishing features from the PE headers using the structural information standardized by the Microsoft Windows operating system for executables. 7 | 8 | This project has three major parts: 9 | 10 | (1) collect a large dataset of malware .exe files (given by the project advisor) and legitimate .exe files from two websites, www.download.com and www.softpedia.com, by writing a Python script called "crawler.py" that acts as a web spider and automatically downloads files from a website; 11 | 12 | (2) write Python scripts called "parserM.py" and "parserN.py" to extract the features of each header field, then compare them to find the most significant differences between malware and legitimate .exe files; 13 | 14 | (3) write Python scripts called "iconM.py" and "iconN.py" to extract the icons from the PE files, and then find the most prevalent icons in the malware .exe files by comparing them with those of the legitimate .exe files. 15 | 16 | 17 | Conclusion: 18 | 19 | I have evaluated this approach on a large dataset that contains 5598 malware samples and 1237 legitimate samples. The results of our experiments show that the PE-header-based approach achieves a detection rate of more than 99% with a false-positive rate below 0.2% when distinguishing between benign and malicious executables, in less than 20 minutes. We have also found the 3 most prevalent icons in malware that are seldom seen in legitimate PE files, and 8 types of misleading icons used by malware. My results show that it is possible to identify malware by simply looking at some key features of the PE headers. 
20 | -------------------------------------------------------------------------------- /iconM.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import win32ui 4 | import win32gui 5 | from PyQt4 import QtGui 6 | 7 | class extractIcon(): 8 | def __init__(self): 9 | iconEmbededM = 0 10 | numberOfFilesM = 0 11 | 12 | fileListM = self.dir_list('YourDirectory\\malware\\') 13 | 14 | for f in fileListM: 15 | numberOfFilesM = numberOfFilesM + 1 16 | print "File Number(Malware): ", numberOfFilesM 17 | print f 18 | if win32gui.ExtractIconEx(f, -1): 19 | print "Icon Embedded(Malware)." 20 | print "" 21 | iconEmbededM = iconEmbededM + 1 22 | large, small = win32gui.ExtractIconEx(f, 0) 23 | print win32gui.ExtractIconEx(f, 0) 24 | 25 | win32gui.DestroyIcon(small[0]) 26 | self.pixmap = QtGui.QPixmap.fromWinHBITMAP(self.bitmapFromHIcon(large[0]), 2) 27 | dest = "YourDirectory\\iconM\\" + f[52:len(f)-4] + ".ico" 28 | self.pixmap.save(dest) 29 | else: 30 | print "No Icon Embedded(Malware)." 
31 | print "" 32 | 33 | print "" 34 | print "Total Number Of Files Embedded Icon(Malware): ", iconEmbededM 35 | 36 | def bitmapFromHIcon(self, hIcon): 37 | hdc = win32ui.CreateDCFromHandle(win32gui.GetDC(0)) 38 | hbmp = win32ui.CreateBitmap() 39 | hbmp.CreateCompatibleBitmap(hdc, 32, 32) 40 | hdc = hdc.CreateCompatibleDC() 41 | hdc.SelectObject(hbmp) 42 | hdc.DrawIcon((0, 0), hIcon) 43 | hdc.DeleteDC() 44 | return hbmp.GetHandle() 45 | 46 | def dir_list(self, dir_name, *args): 47 | fileList = [] 48 | for file in os.listdir(dir_name): 49 | dirfile = os.path.join(dir_name, file) 50 | if os.path.isfile(dirfile): 51 | if len(args) == 0: 52 | fileList.append(dirfile) 53 | else: 54 | if os.path.splitext(dirfile)[1][1:] in args: 55 | fileList.append(dirfile) 56 | return fileList 57 | 58 | if __name__ == "__main__": 59 | app = QtGui.QApplication(sys.argv) 60 | icon = extractIcon() 61 | -------------------------------------------------------------------------------- /iconN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import win32ui 4 | import win32gui 5 | from PyQt4 import QtGui 6 | 7 | class extractIcon(): 8 | def __init__(self): 9 | iconEmbededN = 0 10 | numberOfFilesN = 0 11 | 12 | fileListN = self.dir_list('YourDirectory\\normal\\') 13 | 14 | for f in fileListN: 15 | numberOfFilesN = numberOfFilesN + 1 16 | print "File Number(NormalFile): ", numberOfFilesN 17 | print f 18 | if win32gui.ExtractIconEx(f, -1): 19 | print "Icon Embedded(NormalFile)." 20 | iconEmbededN = iconEmbededN + 1 21 | large, small = win32gui.ExtractIconEx(f, 0) 22 | print large[0], small[0] 23 | print win32gui.ExtractIconEx(f, 0) 24 | win32gui.DestroyIcon(small[0]) 25 | self.pixmap = QtGui.QPixmap.fromWinHBITMAP(self.bitmapFromHIcon(large[0]), 2) 26 | dest = "YourDirectory\\iconN\\" + f[42:len(f)-4] + ".ico" 27 | self.pixmap.save(dest) 28 | print "" 29 | else: 30 | print "No Icon Embedded(NormalFile)." 
31 | print "" 32 | 33 | print "" 34 | print "Total Number Of Files Embedded Icon(NormalFile): ", iconEmbededN 35 | 36 | def bitmapFromHIcon(self, hIcon): 37 | hdc = win32ui.CreateDCFromHandle(win32gui.GetDC(0)) 38 | hbmp = win32ui.CreateBitmap() 39 | hbmp.CreateCompatibleBitmap(hdc, 32, 32) 40 | hdc = hdc.CreateCompatibleDC() 41 | hdc.SelectObject(hbmp) 42 | hdc.DrawIcon((0, 0), hIcon) 43 | hdc.DeleteDC() 44 | return hbmp.GetHandle() 45 | 46 | def dir_list(self, dir_name, *args): 47 | fileList = [] 48 | for file in os.listdir(dir_name): 49 | dirfile = os.path.join(dir_name, file) 50 | if os.path.isfile(dirfile): 51 | if len(args) == 0: 52 | fileList.append(dirfile) 53 | else: 54 | if os.path.splitext(dirfile)[1][1:] in args: 55 | fileList.append(dirfile) 56 | return fileList 57 | 58 | if __name__ == "__main__": 59 | app = QtGui.QApplication(sys.argv) 60 | icon = extractIcon() 61 | -------------------------------------------------------------------------------- /parserM.py: -------------------------------------------------------------------------------- 1 | import pefile 2 | import os 3 | import time 4 | 5 | def dir_list2(dir_name, *args): 6 | fileList = [] 7 | for file in os.listdir(dir_name): 8 | dirfile = os.path.join(dir_name, file) 9 | if os.path.isfile(dirfile): 10 | if len(args) == 0: 11 | fileList.append(dirfile) 12 | else: 13 | if os.path.splitext(dirfile)[1][1:] in args: 14 | fileList.append(dirfile) 15 | return fileList 16 | 17 | def sectionName (f): 18 | unknown = 0 19 | unknownName = 0 20 | normalName = 0 21 | pe = pefile.PE(f, fast_load=True) 22 | for section in pe.sections: 23 | name = section.Name 24 | if "text" in name: 25 | normalName = normalName + 1 26 | elif "bss" in name: 27 | normalName = normalName + 1 28 | elif "data" in name: 29 | normalName = normalName + 1 30 | elif "rsrc" in name: 31 | normalName = normalName + 1 32 | elif "debug" in name: 33 | normalName = normalName + 1 34 | elif "reloc" in name: 35 | normalName = normalName + 1 36 | 
elif "winzip" in name: 37 | normalName = normalName + 1 38 | elif "tls" in name: 39 | normalName = normalName + 1 40 | elif "UPX" in name: 41 | normalName = normalName + 1 42 | elif "boom" in name: 43 | normalName = normalName + 1 44 | elif "seau" in name: 45 | normalName = normalName + 1 46 | elif "code" in name: 47 | normalName = normalName + 1 48 | elif "Shared" in name: 49 | normalName = normalName + 1 50 | elif "gentee" in name: 51 | normalName = normalName + 1 52 | elif "CODE" in name: 53 | normalName = normalName + 1 54 | elif "DATA" in name: 55 | normalName = normalName + 1 56 | elif "BSS" in name: 57 | normalName = normalName + 1 58 | elif "CRT" in name: 59 | normalName = normalName + 1 60 | elif "PAGE" in name: 61 | normalName = normalName + 1 62 | elif "INIT" in name: 63 | normalName = normalName + 1 64 | elif "res" in name: 65 | normalName = normalName + 1 66 | elif "asp" in name: 67 | normalName = normalName + 1 68 | elif "tsu" in name: 69 | normalName = normalName + 1 70 | elif "TEXT" in name: 71 | normalName = normalName + 1 72 | else: 73 | unknownName = unknownName + 1 74 | if unknownName > 0: 75 | unknown = 1 76 | return unknown 77 | 78 | if __name__ == '__main__': 79 | 80 | numberOfFiles = 0 81 | sizeOfInitializedData = 0 82 | abnormalName = 0 83 | dllCharacteristics = 0 84 | majorImageVersion = 0 85 | checkSumCount = 0 86 | malware = 0 87 | 88 | start = time.time() 89 | fileList = dir_list2('YourDirectory/malware/') 90 | for f in fileList: 91 | pe = pefile.PE(f, fast_load=True) 92 | numberOfFiles = numberOfFiles + 1 93 | print "Number Of Files: ", numberOfFiles 94 | 95 | if pe.OPTIONAL_HEADER.SizeOfInitializedData == 0: 96 | malware = malware + 1 97 | elif sectionName (f) == 1: 98 | malware = malware + 1 99 | 100 | end = time.time() 101 | elapsed = end - start 102 | print "Malware Find", malware 103 | print "The time for running this program: ", elapsed 104 | 105 | start = time.time() 106 | fileList = dir_list2('YourDirectory/normal/') 107 | for 
f in fileList: 108 | pe = pefile.PE(f, fast_load=True) 109 | numberOfFiles = numberOfFiles + 1 110 | print "Number Of Files: ", numberOfFiles 111 | 112 | if pe.OPTIONAL_HEADER.SizeOfInitializedData == 0: 113 | sizeOfInitializedData = sizeOfInitializedData + 1 114 | 115 | if sectionName (f) == 1: 116 | abnormalName = abnormalName + 1 117 | 118 | if pe.OPTIONAL_HEADER.DllCharacteristics == 0: 119 | dllCharacteristics = dllCharacteristics + 1 120 | 121 | if pe.OPTIONAL_HEADER.MajorImageVersion == 0: 122 | majorImageVersion = majorImageVersion + 1 123 | 124 | if pe.OPTIONAL_HEADER.CheckSum == 0: 125 | checkSumCount = checkSumCount + 1 126 | 127 | end = time.time() 128 | elapsed = end - start 129 | print "SizeOfInitializedData == 0: ", sizeOfInitializedData 130 | print "UnkownName: ", abnormalName 131 | print "DllCharacteristics == 0: ", dllCharacteristics 132 | print "MajorImageVersion == 0: ", majorImageVersion 133 | print "CheckSum == 0: ", checkSumCount 134 | print "The time for running this program: ", elapsed 135 | 136 | -------------------------------------------------------------------------------- /parserN.py: -------------------------------------------------------------------------------- 1 | import pefile 2 | import os 3 | import time 4 | 5 | def dir_list2(dir_name, *args): 6 | fileList = [] 7 | for file in os.listdir(dir_name): 8 | dirfile = os.path.join(dir_name, file) 9 | if os.path.isfile(dirfile): 10 | if len(args) == 0: 11 | fileList.append(dirfile) 12 | else: 13 | if os.path.splitext(dirfile)[1][1:] in args: 14 | fileList.append(dirfile) 15 | return fileList 16 | 17 | def sectionName (f): 18 | unknown = 0 19 | unknownName = 0 20 | normalName = 0 21 | pe = pefile.PE(f, fast_load=True) 22 | for section in pe.sections: 23 | name = section.Name 24 | if "text" in name: 25 | normalName = normalName + 1 26 | elif "bss" in name: 27 | normalName = normalName + 1 28 | elif "data" in name: 29 | normalName = normalName + 1 30 | elif "rsrc" in name: 31 | normalName 
= normalName + 1 32 | elif "debug" in name: 33 | normalName = normalName + 1 34 | elif "reloc" in name: 35 | normalName = normalName + 1 36 | elif "winzip" in name: 37 | normalName = normalName + 1 38 | elif "tls" in name: 39 | normalName = normalName + 1 40 | elif "UPX" in name: 41 | normalName = normalName + 1 42 | elif "boom" in name: 43 | normalName = normalName + 1 44 | elif "seau" in name: 45 | normalName = normalName + 1 46 | elif "code" in name: 47 | normalName = normalName + 1 48 | elif "Shared" in name: 49 | normalName = normalName + 1 50 | elif "gentee" in name: 51 | normalName = normalName + 1 52 | elif "CODE" in name: 53 | normalName = normalName + 1 54 | elif "DATA" in name: 55 | normalName = normalName + 1 56 | elif "BSS" in name: 57 | normalName = normalName + 1 58 | elif "CRT" in name: 59 | normalName = normalName + 1 60 | elif "PAGE" in name: 61 | normalName = normalName + 1 62 | elif "INIT" in name: 63 | normalName = normalName + 1 64 | elif "res" in name: 65 | normalName = normalName + 1 66 | elif "asp" in name: 67 | normalName = normalName + 1 68 | elif "tsu" in name: 69 | normalName = normalName + 1 70 | elif "TEXT" in name: 71 | normalName = normalName + 1 72 | else: 73 | unknownName = unknownName + 1 74 | if unknownName > 0: 75 | unknown = 1 76 | return unknown 77 | 78 | if __name__ == '__main__': 79 | 80 | numberOfFiles = 0 81 | sizeOfInitializedData = 0 82 | abnormalName = 0 83 | dllCharacteristics = 0 84 | majorImageVersion = 0 85 | checkSumCount = 0 86 | malware = 0 87 | 88 | start = time.time() 89 | fileList = dir_list2('YourDirectory/normal/') 90 | for f in fileList: 91 | pe = pefile.PE(f, fast_load=True) 92 | numberOfFiles = numberOfFiles + 1 93 | print "Number Of Files: ", numberOfFiles 94 | 95 | if pe.OPTIONAL_HEADER.SizeOfInitializedData == 0: 96 | malware = malware + 1 97 | else: 98 | if sectionName (f) == 1 and pe.OPTIONAL_HEADER.CheckSum == 0 and pe.OPTIONAL_HEADER.MajorImageVersion == 0 and pe.OPTIONAL_HEADER.DllCharacteristics 
== 0: 99 | malware = malware + 1 100 | 101 | end = time.time() 102 | elapsed = end - start 103 | print "Malware Find", malware 104 | print "The time for running this program: ", elapsed 105 | 106 | start = time.time() 107 | fileList = dir_list2('YourDirectory/normal/') 108 | for f in fileList: 109 | pe = pefile.PE(f, fast_load=True) 110 | numberOfFiles = numberOfFiles + 1 111 | print "Number Of Files: ", numberOfFiles 112 | 113 | if pe.OPTIONAL_HEADER.SizeOfInitializedData == 0: 114 | sizeOfInitializedData = sizeOfInitializedData + 1 115 | 116 | if sectionName (f) == 1: 117 | abnormalName = abnormalName + 1 118 | 119 | if pe.OPTIONAL_HEADER.DllCharacteristics == 0: 120 | dllCharacteristics = dllCharacteristics + 1 121 | 122 | if pe.OPTIONAL_HEADER.MajorImageVersion == 0: 123 | majorImageVersion = majorImageVersion + 1 124 | 125 | if pe.OPTIONAL_HEADER.CheckSum == 0: 126 | checkSumCount = checkSumCount + 1 127 | 128 | end = time.time() 129 | elapsed = end - start 130 | print "SizeOfInitializedData == 0: ", sizeOfInitializedData 131 | print "UnkownName: ", abnormalName 132 | print "DllCharacteristics == 0: ", dllCharacteristics 133 | print "MajorImageVersion == 0: ", majorImageVersion 134 | print "CheckSum == 0: ", checkSumCount 135 | print "The time for running this program: ", elapsed 136 | 137 | --------------------------------------------------------------------------------