├── bin-utf8-vec
│   ├── maxSequenceLen.txt
│   ├── finalVocabSize.txt
│   ├── maxHashWordID.txt
│   ├── dispCorpusSize.sh
│   ├── X_test.npy
│   ├── valInds.npy
│   ├── y_test.npy
│   ├── testInds.npy
│   ├── trainInds.npy
│   ├── joblibParForTest.py
│   ├── getMaxHashID.py
│   ├── truncateCorpus.py
│   ├── vocabSize.py
│   ├── hashAndPadSequences.py
│   ├── wordSequences.py
│   └── bin-utf8-vec.py
├── bin-opcodes-vec
│   ├── X_test.npy
│   ├── y_test.npy
│   ├── testInds.npy
│   ├── valInds.npy
│   ├── top50opcodes.csv
│   ├── top50opcodes.py
│   ├── opcode-model.py
│   └── bin-opcodes-vec.py
├── Project_Report_-_ELL16600748.pdf
├── weights-improvement-04-0.72.hdf5
├── weights-improvement-574-0.85.hdf5
├── LICENSE
├── README.md
├── benignFreewareDownloader.py
└── ensemblePredict.py
/bin-utf8-vec/maxSequenceLen.txt: -------------------------------------------------------------------------------- 1 | 2121399 -------------------------------------------------------------------------------- /bin-utf8-vec/finalVocabSize.txt: -------------------------------------------------------------------------------- 1 | 11695771 -------------------------------------------------------------------------------- /bin-utf8-vec/maxHashWordID.txt: -------------------------------------------------------------------------------- 1 | 9223372036854759194 -------------------------------------------------------------------------------- /bin-utf8-vec/dispCorpusSize.sh: -------------------------------------------------------------------------------- 1 | while sleep 2; do clear; ls -1 corpusTrunc | wc -l; done 2 | -------------------------------------------------------------------------------- /bin-utf8-vec/X_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/X_test.npy -------------------------------------------------------------------------------- /bin-utf8-vec/valInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/valInds.npy -------------------------------------------------------------------------------- /bin-utf8-vec/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/y_test.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/X_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/X_test.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/y_test.npy -------------------------------------------------------------------------------- /bin-utf8-vec/testInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/testInds.npy -------------------------------------------------------------------------------- /bin-utf8-vec/trainInds.npy: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/trainInds.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/testInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/testInds.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/valInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/valInds.npy -------------------------------------------------------------------------------- /Project_Report_-_ELL16600748.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/Project_Report_-_ELL16600748.pdf -------------------------------------------------------------------------------- /weights-improvement-04-0.72.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/weights-improvement-04-0.72.hdf5 -------------------------------------------------------------------------------- /weights-improvement-574-0.85.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/weights-improvement-574-0.85.hdf5 -------------------------------------------------------------------------------- /bin-utf8-vec/joblibParForTest.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott" 2 | 3 | from joblib import Parallel, delayed 4 | 5 | def returnArg(arg): 6 | print(arg) 7 | return arg 8 | 9 | argList = [1,2,3,4,5,6,7,8,9,10] 10 | 11 | returnArgResults = Parallel(n_jobs=4, verbose=1, backend="threading")(map(delayed(returnArg), argList)) 12 | print(returnArgResults) -------------------------------------------------------------------------------- /bin-utf8-vec/getMaxHashID.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | 6 | 7 | x = np.core.defchararray.add(np.array(["corpus/"]), np.array(os.listdir("corpus"))) 8 | maxHashWordIDs = [] 9 | 10 | count = 0 11 | for file in x: 12 | sampleArr = np.load(file) 13 | maxHashWordIDs.append(max(sampleArr)) 14 | os.system("clear") 15 | print(count) 16 | count += 1 17 | 18 | maxHashWordID = max(maxHashWordIDs) 19 | print("maxHashWordID:", maxHashWordID) 20 | with open("maxHashWordID.txt", "w") as f: 21 | f.write(str(maxHashWordID)) 22 | -------------------------------------------------------------------------------- /bin-opcodes-vec/top50opcodes.csv: -------------------------------------------------------------------------------- 1 | opcode, frequency 2 | add, 72144237 3 | mov, 28942614 4 | push, 13091749 5 | call, 7452773 6 | cmp, 5527990 7 | int3, 4436803 8 | lea, 4332597 9 | jmp, 4167194 10 | pop, 4074808 11 | je, 4038195 12 | test, 3204034 13 | jne, 3123470 14 | xor, 2607139 15 | nop, 2441467 16 | sub, 2339824 17 | inc, 1875395 18 | and, 1736751 19 | ret, 1699971 20 | or, 1069451 21 | movzx, 1065368 22 | dec, 
1001755 23 | shl, 496341 24 | shr, 482247 25 | jb, 471314 26 | xchg, 443857 27 | jae, 409428 28 | imul, 402999 29 | jg, 341131 30 | sar, 318254 31 | jle, 288714 32 | adc, 286692 33 | jbe, 282154 34 | ja, 264139 35 | leave, 263945 36 | jl, 261413 37 | sbb, 255962 38 | movsx, 222484 39 | neg, 199949 40 | fstp, 188997 41 | jge, 187117 42 | fld, 177448 43 | movsd, 148505 44 | not, 145062 45 | js, 139043 46 | insb, 137781 47 | popal, 133120 48 | setne, 131947 49 | outsd, 129368 50 | outsb, 125488 51 | jns, 114090 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Laurence Elliott 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bin-utf8-vec/truncateCorpus.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math 4 | import numpy as np 5 | 6 | # sampleLens = [] 7 | # count = 0 8 | # for file in os.listdir("corpus"): 9 | # sample = np.load("corpus/" + file) 10 | # zeroArr = [0] 11 | # try: 12 | # zerosInSample = np.isin(sample, zeroArr) 13 | # zerosIndexes = np.where(zerosInSample) 14 | # zerosStart = zerosIndexes[0][0] 15 | # sample = sample[:zerosStart] 16 | # sampleLen = len(sample) 17 | # print(count, sampleLen) 18 | # sampleLens.append(len(sample)) 19 | # except: 20 | # sampleLen = len(sample) 21 | # print(count, sampleLen) 22 | # sampleLens.append(len(sample)) 23 | # count += 1 24 | # # sample = np.concatenate((sample[0:200], sample[::-1][0:200])) 25 | # 26 | # minSampleLen = np.min(sampleLens) 27 | # print(minSampleLen) 28 | 29 | # Min sample length is 18 bytes D: 30 | maxSequenceLen = 10000 31 | lenSqrt = int(math.sqrt(maxSequenceLen)) 32 | print(lenSqrt) 33 | 34 | count = 0 35 | for file in os.listdir("corpus"): 36 | sample = np.load("corpus/" + file)[:maxSequenceLen] 37 | sample = np.rint(((sample - np.min(sample)) / 38 | (np.max(sample) - np.min(sample))) * 255)\ 39 | .astype('int').reshape(lenSqrt, lenSqrt, 1) 40 | np.save("corpusTrunc/" + file, sample) 41 | print(count) 42 | count += 1 -------------------------------------------------------------------------------- /bin-utf8-vec/vocabSize.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | 6 | benignSequenceFiles = np.core.defchararray.add(np.array(["benignSequences/"]), 7 | np.array(os.listdir("benignSequences"))) 8 | 9 | malwareSequenceFiles = np.core.defchararray.add(np.array(["malwareSequences/"]), 10 | np.array(os.listdir("malwareSequences"))) 11 | 12 | ransomSequenceFiles = np.core.defchararray.add(np.array(["ransomSequences/"]), 13 | np.array(os.listdir("ransomSequences"))) 14 | 15 | 16 | vocab = set() 17 | 18 | 19 | for sampleN in range(0, 10000): 20 | with open(benignSequenceFiles[sampleN]) as f: 21 | vocab = vocab.union(set(f.readlines())) 22 | if sampleN % 100 == 0: 23 | print(sampleN) 24 | 25 | for sampleN in range(0, 10000): 26 | with open(malwareSequenceFiles[sampleN]) as f: 27 | vocab = vocab.union(set(f.readlines())) 28 | if sampleN % 100 == 0: 29 | print(sampleN + 10000) 30 | 31 | for sampleN in range(0, 10000): 32 | with open(ransomSequenceFiles[sampleN]) as f: 33 | vocab = vocab.union(set(f.readlines())) 34 | if sampleN % 100 == 0: 35 | print(sampleN + 20000) 36 | 37 | with open("finalVocabSize.txt", "w") as f: 38 | f.write(str(len(vocab))) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ensemble Deep Learning Ransomware Detector 2 | A Deep Learning ensemble that classifies Windows executable files as either benign, ransomware, or other malware. 3 | This program was developed as part of my dissertation for my BSc (Hons) Computer Science course at the University of Lincoln: ['Ransomware Detection Using Deep Learning Ensemble'](Project_Report_-_ELL16600748.pdf) in which it is demonstrated to achieve 96% accuracy in classifying a test set of 3000 '.exe' files not seen in the model's training. 
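For scripted use, the ensemble can also be driven without the file dialog via `predictPEs` from `ensemblePredict.py`. A minimal sketch (the sample paths below are placeholders; the two `.hdf5` weight files are loaded when the module is imported, and `top50opcodes.csv` and `finalVocabSize.txt` are read from the working directory at prediction time):

```python
from ensemblePredict import predictPEs

# Keys are the paths passed in; values are "benign", "malware", or "ransomware".
predictions = predictPEs(["samples/example1.exe", "samples/example2.exe"])
for path, label in predictions.items():
    print(path, "->", label)
```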
4 | 5 | # Setup 6 | This project uses Python 3. For the GUI detector program `ensemblePredict.py`, the following Python packages must be installed: tensorflow, keras, h5py, capstone, pefile, numpy, and scikit-learn. These can be installed via the terminal or command prompt command `pip install tensorflow keras h5py capstone pefile numpy scikit-learn`. Then simply run the script with `python ensemblePredict.py`. You should be greeted by a file selection dialog with which you can select one or more '.exe' files, then click 'Open' and the deep learning ensemble will predict if they are benign, ransomware, or other malware. 7 | 8 | Source code for training and pre-processing for the ensemble's two models, in the folders `bin-opcodes-vec` and `bin-utf8-vec`, should run with the same pre-requisites, though `tensorflow-gpu` is recommended to achieve reasonable training times. I am not licensed to distribute the benign samples used to train the models, but these can be downloaded by installing BeautifulSoup with `pip install beautifulsoup4` and running `python benignFreewareDownloader.py`. Malware and ransomware samples were obtained from torrents available from [VirusShare.com](https://virusshare.com), after being vetted by the site's admin. Details about the particular torrents used, among other details of the model and its development, can be found in this project's [report](Project_Report_-_ELL16600748.pdf). 9 | -------------------------------------------------------------------------------- /bin-opcodes-vec/top50opcodes.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | from capstone import * 4 | import pefile, os 5 | 6 | # samplePaths = ["testSamples/" + sample for sample in os.listdir("testSamples")] 7 | samplePaths = ["../bin-utf8-vec/benignSamples/" + sample for sample in os.listdir("../bin-utf8-vec/benignSamples")] + \ 8 | ["../bin-utf8-vec/malwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/malwareSamples")] + \ 9 | ["../bin-utf8-vec/ransomwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/ransomwareSamples")] 10 | 11 | 12 | 13 | opcodeSet = set() 14 | opCodeDicts = [] 15 | opCodeFreqs = {} 16 | nSamples = len(samplePaths) 17 | 18 | count = 1 19 | for sample in samplePaths: 20 | try: 21 | pe = pefile.PE(sample, fast_load=True) 22 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 23 | data = pe.get_memory_mapped_image()[entryPoint:] 24 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 25 | 26 | opcodes = [] 27 | for i in cs.disasm(data, 0x1000): 28 | opcodes.append(i.mnemonic) 29 | 30 | opcodeDict = {} 31 | total = len(opcodes) 32 | 33 | opcodeSet = set(list(opcodeSet) + opcodes) 34 | for opcode in opcodeSet: 35 | freq = 1 36 | for op in opcodes: 37 | if opcode == op: 38 | freq += 1 39 | try: 40 | opCodeFreqs[opcode] += freq 41 | except: 42 | opCodeFreqs[opcode] = freq 43 | 44 | opcodeDict[opcode] = round((freq / total) * 100, 2) 45 | 46 | 47 | opCodeDicts.append(opcodeDict) 48 | os.system("clear") 49 | print(str((count / nSamples) * 100) + "%") 50 | count += 1 51 | except Exception as e: 52 | print(e) 53 | 54 | # for opcode in opcodeSet: 55 | # print(opcode, str(opcodeDict[opcode]) + "%") 56 | 57 | # for opcodeDict in opCodeDicts: 58 | # freqSorted = sorted(opcodeDict, key=opcodeDict.get)[-1:0:-1] 59 | # print(opcodeDict[freqSorted[0]], opcodeDict[freqSorted[1]], opcodeDict[freqSorted[2]], freqSorted) 60 | 61 | opCodeFreqsSorted = sorted(opCodeFreqs, key=opCodeFreqs.get)[-1:0:-1]
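# Editor's note: the slice [-1:0:-1] above reverses the ascending sort but stops before
# index 0, so the single least-frequent opcode is silently dropped from the ranking. That is
# harmless here, since only the first 50 entries are written out below, but an explicit
# descending sort states the intent directly, e.g.:
#
#     opCodeFreqsSorted = sorted(opCodeFreqs, key=opCodeFreqs.get, reverse=True)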
62 | 63 | with open("top50opcodes.csv", "w") as f: 64 | f.write("opcode, frequency\n") 65 | for opcode in opCodeFreqsSorted[:50]: 66 | f.write(str(opcode) + ", " + str(opCodeFreqs[opcode]) + "\n") 67 | print(opcode, opCodeFreqs[opcode]) 68 | 69 | -------------------------------------------------------------------------------- /bin-utf8-vec/hashAndPadSequences.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | from keras import preprocessing 6 | 7 | docTotal = 30000 8 | maxSequenceLen = 10000 9 | 10 | with open("finalVocabSize.txt", "r") as f: 11 | maxVocabSize = int(f.readline()) 12 | 13 | # hashing_trick maps each word to an integer in [1, n] via md5; n is ~1.5x the observed vocabulary to reduce collisions 14 | def hashWordSequences(sequencePath, outputPath, maxSeqLen, vocabSize, docT, nSamples): 15 | docCount = 0 16 | if sequencePath[-1] != "/": sequencePath += "/" 17 | if outputPath[-1] != "/": outputPath += "/" 18 | 19 | seqFiles = [sequencePath + os.listdir(sequencePath)[i] for i in range(0, nSamples)] 20 | for seqFile in seqFiles: 21 | with open(seqFile, "r") as f: 22 | try: 23 | sequence = np.char.replace(np.array(f.readlines()), "\n", "") 24 | text = " ".join(sequence) 25 | hashWordIDs = preprocessing.text.hashing_trick(text, round(vocabSize * 1.5), hash_function='md5') 26 | docLen = len(hashWordIDs) 27 | if docLen < maxSeqLen: 28 | hashWordIDs += [0 for i in range(0, maxSeqLen-docLen)] 29 | hashWordIDs = np.array(hashWordIDs).reshape(100, 100, 1) 30 | np.save(outputPath + str(docCount) + ".npy", hashWordIDs) 31 | if docCount % 100 == 0: 32 | print(str(int((docCount / nSamples) * 100)) + "%") 33 | docCount += 1 34 | except Exception as e: 35 | print(e) 36 | 37 | 38 | # print("Max vocab size (for hashing trick):", maxVocabSize, "\nMax sequence length (for zero padding):", maxSequenceLen) 39 | # 40 | # print("Hashing benign word sequences...") 41 | # hashWordSequences(sequencePath="benignSequences", 42 | # outputPath="finalBenignCorpus", 43 | # maxSeqLen=maxSequenceLen, 44 | # vocabSize=maxVocabSize, 45 | # docT=docTotal, 46 | # nSamples=10000) 47 | 48 | print("Hashing malware word sequences...") 49 | hashWordSequences(sequencePath="malwareSequences", 50 | outputPath="finalMalwareCorpus", 51 | maxSeqLen=maxSequenceLen, 52 | vocabSize=maxVocabSize, 53 | docT=docTotal, 54 | nSamples=10000) 55 | 56 | print("Hashing ransomware word sequences...") 57 | hashWordSequences(sequencePath="ransomSequences", 58 | outputPath="finalRansomCorpus", 59 | maxSeqLen=maxSequenceLen, 60 | vocabSize=maxVocabSize, 61 | docT=docTotal, 62 | nSamples=10000) 63 | -------------------------------------------------------------------------------- /bin-utf8-vec/wordSequences.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import string, os, math 4 | import numpy as np 5 | from tensorflow.python.keras import preprocessing 6 | from joblib import Parallel, delayed 7 | 8 | def strings(filename, min=4): 9 | with open(filename, errors="ignore", encoding="utf-8") as f: 10 | result = "" 11 | for c in f.read(): 12 | if c in string.printable: 13 | result += c 14 | continue 15 | if len(result) >= min: 16 | yield result 17 | result = "" 18 | if len(result) >= min: # catch result at EOF 19 | yield result 20 | 21 | 22 | def wordSequencesBenign(sampleN): 23 | print(benignSampleFiles[sampleN]) 24 | try: 25 | text = "" 26 | for s in strings("benignSamples/" + benignSampleFiles[sampleN]): 27 | text += s + "\n" 28 | sequence
= preprocessing.text.text_to_word_sequence(text)[:10000] 29 | np.savetxt("benignSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 30 | del text, sequence 31 | except Exception as e: 32 | print(e) 33 | 34 | 35 | def wordSequencesMalware(sampleN): 36 | print(malwareSampleFiles[sampleN]) 37 | try: 38 | text = "" 39 | for s in strings("malwareSamples/" + malwareSampleFiles[sampleN]): 40 | text += s + "\n" 41 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 42 | np.savetxt("malwareSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 43 | del text, sequence 44 | except Exception as e: 45 | print(e) 46 | 47 | 48 | def wordSequencesRansom(sampleN): 49 | print(ransomSampleFiles[sampleN]) 50 | try: 51 | text = "" 52 | for s in strings("ransomwareSamples/" + ransomSampleFiles[sampleN]): 53 | text += s + "\n" 54 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 55 | np.savetxt("ransomSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 56 | del text, sequence 57 | except Exception as e: 58 | print(e) 59 | 60 | 61 | # 10,000 samples will be used from each class 62 | # Only 9,255 benign samples were gathered, so the first 745 samples are used again 63 | benignSampleFiles = os.listdir("benignSamples") + os.listdir("benignSamples")[:745] 64 | malwareSampleFiles = os.listdir("malwareSamples")[:10000] 65 | ransomSampleFiles = os.listdir("ransomwareSamples")[:10000] 66 | 67 | nBenignSamples = len(benignSampleFiles) 68 | nMalwareSamples = len(malwareSampleFiles) 69 | nRansomSamples = len(ransomSampleFiles) 70 | 71 | benignSequences = [] 72 | malwareSequences = [] 73 | ransomSequences = [] 74 | 75 | print(nBenignSamples, nMalwareSamples, nRansomSamples) 76 | 77 | print("Generating word sequences for benign samples...") 78 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesBenign), range(0, nBenignSamples))) 79 | 80 | print("Generating word sequences for malware samples...") 81 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesMalware), range(0, nMalwareSamples))) 82 | 83 | print("Generating word sequences for ransomware samples...") 84 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesRansom), range(0, nRansomSamples))) 85 | 86 | -------------------------------------------------------------------------------- /bin-opcodes-vec/opcode-model.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math 4 | import numpy as np 5 | import tensorflow as tf 6 | from time import time 7 | from tensorflow.python.keras.models import Sequential 8 | from tensorflow.python.keras import preprocessing, layers, optimizers 9 | from tensorflow.python.keras.utils import Sequence, to_categorical 10 | from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint 11 | from sklearn.utils import shuffle 12 | from sklearn.model_selection import train_test_split 13 | 14 | 15 | benignHists = ["benignHistVecs/" + fileName \ 16 | for fileName in os.listdir("benignHistVecs")] 17 | benignHists += benignHists[:745] 18 | 19 | malwareHists = ["malwareHistVecs/" + fileName \ 20 | for fileName in os.listdir("malwareHistVecs")] 21 | malwareHists += malwareHists[:16] 22 | 23 | ransomHists = ["ransomHistVecs/" + fileName \ 24 | for fileName in os.listdir("ransomHistVecs")] 25 | ransomHists = ransomHists[:10000] 26 | 27 | nBenignSamples = len(benignHists) 28 | nMalwareSamples = len(malwareHists) 29 | 
nRansomSamples = len(ransomHists) 30 | 31 | 32 | 33 | # x is a list of paths to training samples 34 | x = np.array(benignHists + malwareHists + ransomHists) 35 | # y is a list of samples' associated class labels with one-hot encoding 36 | y = np.ones(nBenignSamples+nMalwareSamples+nRansomSamples) 37 | y[0:nBenignSamples] = 0 38 | y[nBenignSamples:nBenignSamples+nMalwareSamples] = 1 39 | y[nBenignSamples+nMalwareSamples:nBenignSamples+nMalwareSamples+nRansomSamples] = 2 40 | # represent labels with one-hot encoding 41 | y = to_categorical(y, num_classes=3) 42 | 43 | # Dataset is indexed by same shuffled and split indexing as the other model 44 | trainInds = np.load("trainInds.npy") 45 | valInds = np.load("valInds.npy") 46 | testInds = np.load("testInds.npy") 47 | X_train = x[trainInds] 48 | X_val = x[valInds] 49 | X_test = x[testInds] 50 | 51 | y_train = y[trainInds] 52 | y_val = y[valInds] 53 | y_test = y[testInds] 54 | 55 | 56 | class histSequence(Sequence): 57 | 58 | def __init__(self, x, y, batch_size): 59 | self.x, self.y = shuffle(x, y) 60 | self.batch_size = batch_size 61 | 62 | def __len__(self): 63 | return math.ceil(len(self.x) / self.batch_size) 64 | 65 | def __getitem__(self, idx): 66 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 67 | self.batch_size] 68 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 69 | self.batch_size] 70 | 71 | return np.array([ 72 | np.load(file_name) 73 | for file_name in batch_x]), np.array(batch_y) 74 | 75 | def on_epoch_end(self): 76 | pass 77 | 78 | 79 | class histSequenceVal(histSequence): 80 | 81 | def __init__(self, x, y, batch_size): 82 | self.x, self.y = x, y 83 | self.batch_size = batch_size 84 | 85 | 86 | batch_size = 1000 87 | sequenceGenerator = histSequence(X_train, y_train, batch_size) 88 | validationSeqGen = histSequenceVal(X_val, y_val, batch_size) 89 | print(validationSeqGen.__getitem__(0)) 90 | 91 | # Defining the ML model 92 | model = Sequential() 93 | 94 | model.add(layers.InputLayer(input_shape=(50,))) 95 | model.add(layers.Dense(256, activation='relu')) 96 | model.add(layers.BatchNormalization()) 97 | model.add(layers.Dropout(0.2)) 98 | model.add(layers.Dense(128, activation='relu')) 99 | model.add(layers.BatchNormalization()) 100 | model.add(layers.Dense(64, activation='relu')) 101 | model.add(layers.BatchNormalization()) 102 | model.add(layers.Dense(32, activation='relu')) 103 | model.add(layers.BatchNormalization()) 104 | model.add(layers.Dense(16, activation='relu')) 105 | model.add(layers.BatchNormalization()) 106 | model.add(layers.Dense(3, activation='softmax')) 107 | 108 | tensorboard = TensorBoard(log_dir="logs/{}".format(time())) 109 | 110 | model.compile(optimizer="rmsprop", 111 | loss='categorical_crossentropy', 112 | metrics=['accuracy']) 113 | 114 | filePath="weights/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 115 | checkpoint = ModelCheckpoint(filePath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 116 | callbackList = [tensorboard, checkpoint] 117 | 118 | # Training the model 119 | model.fit_generator(generator=sequenceGenerator, 120 | epochs=1000, 121 | steps_per_epoch=len(sequenceGenerator), 122 | verbose=1, 123 | validation_data=validationSeqGen, 124 | validation_steps=len(validationSeqGen), 125 | workers=8, 126 | use_multiprocessing=True, 127 | callbacks=callbackList) 128 | -------------------------------------------------------------------------------- /bin-utf8-vec/bin-utf8-vec.py: -------------------------------------------------------------------------------- 1 | 
__author__ = "Laurence Elliott - 16600748" 2 | 3 | import string, os, json, math, random 4 | from time import time 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import tensorflow as tf 8 | from tensorflow.python.keras.models import Sequential 9 | from tensorflow.python.keras import preprocessing, layers, optimizers 10 | from tensorflow.python.keras.utils import Sequence, to_categorical 11 | from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint 12 | from sklearn.utils import shuffle 13 | from sklearn.model_selection import train_test_split 14 | from joblib import Parallel, delayed 15 | 16 | # https://stackoverflow.com/questions/17195924/python-equivalent-of-unix-strings-utility 17 | # Solution to Python based 'strings' alternative from SO 18 | 19 | def strings(filename, min=4): 20 | with open(filename, errors="ignore", encoding="utf-8") as f: 21 | result = "" 22 | for c in f.read(): 23 | if c in string.printable: 24 | result += c 25 | continue 26 | if len(result) >= min: 27 | yield result 28 | result = "" 29 | if len(result) >= min: # catch result at EOF 30 | yield result 31 | 32 | 33 | vocabSizes = [] 34 | wordSequenceLens = [] 35 | benignSequences = [] 36 | malwareSequences = [] 37 | nBenignSamples = 10000 38 | nMalwareSamples = 10000 39 | nRansomSamples = 10000 40 | 41 | 42 | with open("maxVocabSize.txt","r") as f: 43 | maxVocabSize = int(f.read()) 44 | 45 | maxSequenceLen = 10000 46 | 47 | with open("maxHashWordID.txt","r") as f: 48 | maxHashWordID = int(f.read()) 49 | 50 | 51 | benignCorpus = ["finalBenignCorpus/" + fileName \ 52 | for fileName in os.listdir("finalBenignCorpus")] 53 | 54 | malwareCorpus = ["finalMalwareCorpus/" + fileName \ 55 | for fileName in os.listdir("finalMalwareCorpus")] 56 | malwareCorpus += malwareCorpus[:5] 57 | 58 | ransomCorpus = ["finalRansomCorpus/" + fileName \ 59 | for fileName in os.listdir("finalRansomCorpus")] 60 | ransomCorpus += ransomCorpus[:2] 61 | 62 | # x is a list of paths to training samples 63 | x = np.array(benignCorpus + malwareCorpus + ransomCorpus) 64 | # y is a list of samples' associated class labels with one-hot encoding 65 | y = np.ones(nBenignSamples+nMalwareSamples+nRansomSamples) 66 | y[0:nBenignSamples] = 0 67 | y[nBenignSamples:nBenignSamples+nMalwareSamples] = 1 68 | y[nBenignSamples+nMalwareSamples:nBenignSamples+nMalwareSamples+nRansomSamples] = 2 69 | # represent labels with one-hot encoding 70 | y = to_categorical(y,num_classes=3) 71 | 72 | 73 | # Dataset is indexed by same shuffled and split indexing as the other model 74 | trainInds = np.load("trainInds.npy") 75 | valInds = np.load("valInds.npy") 76 | testInds = np.load("testInds.npy") 77 | X_train = x[trainInds] 78 | X_val = x[valInds] 79 | X_test = x[testInds] 80 | 81 | y_train = y[trainInds] 82 | y_val = y[valInds] 83 | y_test = y[testInds] 84 | 85 | 86 | class hashCorpusSequence(Sequence): 87 | 88 | def __init__(self, x, y, batch_size): 89 | self.x, self.y = shuffle(x, y) 90 | self.batch_size = batch_size 91 | 92 | def __len__(self): 93 | return math.ceil(len(self.x) / self.batch_size) 94 | 95 | def __getitem__(self, idx): 96 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 97 | self.batch_size] 98 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 99 | self.batch_size] 100 | 101 | return np.array([ 102 | # np.load(file_name) 103 | np.rint(((np.load(file_name) - np.min(np.load(file_name))) / 104 | (np.max(np.load(file_name)) - np.min(np.load(file_name)))) * 255).astype(int) 105 | for file_name in batch_x]), np.array(batch_y) 
106 | 107 | def on_epoch_end(self): 108 | pass 109 | 110 | class hashCorpusSequenceVal(hashCorpusSequence): 111 | 112 | def __init__(self, x, y, batch_size): 113 | self.x, self.y = x, y 114 | self.batch_size = batch_size 115 | 116 | 117 | batch_size = 150 118 | sequenceGenerator = hashCorpusSequence(X_train, y_train, batch_size) 119 | validationSeqGen = hashCorpusSequenceVal(X_val, y_val, batch_size) 120 | 121 | 122 | # res = int(math.sqrt(maxSequenceLen)) 123 | # classTitles = ["benign", "malware", "ransomware"] 124 | # 125 | # sampleCount = random.randrange(0, 2988) 126 | # # plt.title("Sample " + str(sampleCount) + " Converted to Grayscale Image\n(" + 127 | # # classTitles[sequenceGenerator.__getitem__(sampleCount)[1].tolist()[0].index(1)] + ")") 128 | # plt.imshow(sequenceGenerator.__getitem__(sampleCount)[0][0].reshape(res, res), cmap='gray') 129 | 130 | 131 | # Defining the ML model 132 | model = Sequential() 133 | 134 | model.add(layers.InputLayer(input_shape=(100, 100, 1))) 135 | model.add(layers.SpatialDropout2D(rate=0.2)) 136 | model.add(layers.Conv2D(32, kernel_size=3, activation='relu')) 137 | model.add(layers.BatchNormalization()) 138 | model.add(layers.SpatialDropout2D(rate=0.1)) 139 | model.add(layers.Conv2D(16, kernel_size=3, activation='relu')) 140 | model.add(layers.BatchNormalization()) 141 | model.add(layers.SpatialDropout2D(rate=0.1)) 142 | model.add(layers.Flatten()) 143 | model.add(layers.Dense(3, activation='softmax')) 144 | 145 | 146 | tensorboard = TensorBoard(log_dir="logs/{}".format(time())) 147 | 148 | model.compile(optimizer="adamax", 149 | loss='categorical_crossentropy', 150 | metrics=['accuracy']) 151 | 152 | 153 | filePath="weights/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 154 | checkpoint = ModelCheckpoint(filePath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 155 | callbackList = [tensorboard, checkpoint] 156 | 157 | 158 | # Training the model 159 | model.fit_generator(generator=sequenceGenerator, 160 | epochs=1000, 161 | steps_per_epoch=len(sequenceGenerator), 162 | verbose=1, 163 | validation_data=validationSeqGen, 164 | validation_steps=len(validationSeqGen), 165 | workers=8, 166 | use_multiprocessing=True, 167 | callbacks=callbackList) 168 | -------------------------------------------------------------------------------- /bin-opcodes-vec/bin-opcodes-vec.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | from capstone import * 4 | import pefile, os 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | 8 | benignPaths = ["../bin-utf8-vec/benignSamples/" + sample for sample in os.listdir("../bin-utf8-vec/benignSamples")] 9 | malwarePaths = ["../bin-utf8-vec/malwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/malwareSamples")] 10 | ransomPaths = ["../bin-utf8-vec/ransomwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/ransomwareSamples")] 11 | 12 | nSamples = len(benignPaths) + len(malwarePaths) + len(ransomPaths) 13 | benignOpCodeSet = set() 14 | benignOpCodeDicts = [] 15 | benignOpCodeFreqs = {} 16 | 17 | count = 1 18 | for sample in benignPaths: 19 | try: 20 | pe = pefile.PE(sample, fast_load=True) 21 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 22 | data = pe.get_memory_mapped_image()[entryPoint:] 23 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 24 | 25 | opcodes = [] 26 | for i in cs.disasm(data, 0x1000): 27 | opcodes.append(i.mnemonic) 28 | 29 | opcodeDict = {} 30 | total = len(opcodes) 31 | 
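# Editor's note (hedged sketch): the counting block below tallies each opcode with a nested
# loop, which is O(len(set) * len(opcodes)) per sample and is repeated almost verbatim for the
# malware and ransomware loops further down. collections.Counter does the tally in one linear
# pass, and counts exact occurrences (the loops below start each freq at 1, so every tallied
# frequency is inflated by one):
#
#     from collections import Counter
#
#     counts = Counter(opcodes)
#     for opcode, freq in counts.items():
#         benignOpCodeFreqs[opcode] = benignOpCodeFreqs.get(opcode, 0) + freq
#         opcodeDict[opcode] = round((freq / total) * 100, 2)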
32 | benignOpCodeSet = set(list(benignOpCodeSet) + opcodes) 33 | for opcode in benignOpCodeSet: 34 | freq = 1 35 | for op in opcodes: 36 | if opcode == op: 37 | freq += 1 38 | try: 39 | benignOpCodeFreqs[opcode] += freq 40 | except: 41 | benignOpCodeFreqs[opcode] = freq 42 | 43 | opcodeDict[opcode] = round((freq / total) * 100, 2) 44 | 45 | benignOpCodeDicts.append(opcodeDict) 46 | 47 | os.system("clear") 48 | print(str((count / nSamples) * 100) + "%") 49 | count += 1 50 | 51 | except Exception as e: 52 | print(e) 53 | 54 | 55 | malwareOpCodeSet = set() 56 | malwareOpCodeDicts = [] 57 | malwareOpCodeFreqs = {} 58 | 59 | count = len(benignPaths) 60 | for sample in malwarePaths: 61 | try: 62 | pe = pefile.PE(sample, fast_load=True) 63 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 64 | data = pe.get_memory_mapped_image()[entryPoint:] 65 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 66 | 67 | opcodes = [] 68 | for i in cs.disasm(data, 0x1000): 69 | opcodes.append(i.mnemonic) 70 | 71 | opcodeDict = {} 72 | total = len(opcodes) 73 | 74 | malwareOpCodeSet = set(list(malwareOpCodeSet) + opcodes) 75 | for opcode in malwareOpCodeSet: 76 | freq = 1 77 | for op in opcodes: 78 | if opcode == op: 79 | freq += 1 80 | try: 81 | malwareOpCodeFreqs[opcode] += freq 82 | except: 83 | malwareOpCodeFreqs[opcode] = freq 84 | 85 | opcodeDict[opcode] = round((freq / total) * 100, 2) 86 | 87 | malwareOpCodeDicts.append(opcodeDict) 88 | 89 | os.system("clear") 90 | print(str((count / nSamples) * 100) + "%") 91 | count += 1 92 | 93 | except Exception as e: 94 | print(e) 95 | 96 | ransomOpCodeSet = set() 97 | ransomOpCodeDicts = [] 98 | ransomOpCodeFreqs = {} 99 | 100 | count = len(benignPaths) + len(malwarePaths) 101 | for sample in ransomPaths: 102 | try: 103 | pe = pefile.PE(sample, fast_load=True) 104 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 105 | data = pe.get_memory_mapped_image()[entryPoint:] 106 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 107 | 108 | opcodes = [] 109 | for i in cs.disasm(data, 0x1000): 110 | opcodes.append(i.mnemonic) 111 | 112 | opcodeDict = {} 113 | total = len(opcodes) 114 | 115 | ransomOpCodeSet = set(list(ransomOpCodeSet) + opcodes) 116 | for opcode in ransomOpCodeSet: 117 | freq = 1 118 | for op in opcodes: 119 | if opcode == op: 120 | freq += 1 121 | try: 122 | ransomOpCodeFreqs[opcode] += freq 123 | except: 124 | ransomOpCodeFreqs[opcode] = freq 125 | 126 | opcodeDict[opcode] = round((freq / total) * 100, 2) 127 | 128 | ransomOpCodeDicts.append(opcodeDict) 129 | 130 | os.system("clear") 131 | print(str((count / nSamples) * 100) + "%") 132 | count += 1 133 | 134 | except Exception as e: 135 | print(e) 136 | 137 | 138 | opCodeFreqsSorted = np.genfromtxt("top50opcodes.csv", delimiter=",", dtype="str")[1:, 0] 139 | 140 | count = 0 141 | for opDict in benignOpCodeDicts: 142 | opFreqVec = [] 143 | for opcode in opCodeFreqsSorted[:50]: 144 | try: 145 | opFreqVec.append(opDict[opcode]) 146 | except Exception as e: 147 | if str(type(e)) == "<class 'KeyError'>": 148 | opFreqVec.append(0.0) 149 | 150 | np.save("benignHistVecs/" + str(count)+".npy", opFreqVec) 151 | os.system("clear") 152 | print(str((count / nSamples) * 100) + "%") 153 | count += 1 154 | 155 | 156 | count = len(benignPaths) 157 | for opDict in malwareOpCodeDicts: 158 | opFreqVec = [] 159 | for opcode in opCodeFreqsSorted[:50]: 160 | try: 161 | opFreqVec.append(opDict[opcode]) 162 | except Exception as e: 163 | if str(type(e)) == "<class 'KeyError'>": 164 | opFreqVec.append(0.0) 165 | 166 | np.save("malwareHistVecs/" + str(count)+".npy", opFreqVec) 167 |
os.system("clear") 168 | print(str((count / nSamples) * 100) + "%") 169 | count += 1 170 | 171 | 172 | count = len(benignPaths) + len(malwarePaths) 173 | for opDict in ransomOpCodeDicts: 174 | opFreqVec = [] 175 | for opcode in opCodeFreqsSorted[:50]: 176 | try: 177 | opFreqVec.append(opDict[opcode]) 178 | except Exception as e: 179 | if str(type(e)) == "": 180 | opFreqVec.append(0.0) 181 | 182 | np.save("ransomHistVecs/" + str(count)+".npy", opFreqVec) 183 | os.system("clear") 184 | print(str((count / nSamples) * 100) + "%") 185 | count += 1 186 | 187 | 188 | # benignVecPaths = ["benignHistVecs/" + vecPath for vecPath in os.listdir("benignHistVecs")] 189 | 190 | # for vecPath in benignVecPaths: 191 | # opFreqVec = np.load(vecPath) 192 | # print(opFreqVec) 193 | # plt.figure(count) 194 | # plt.bar(np.arange(len(opFreqVec)), opFreqVec) 195 | # plt.show() 196 | -------------------------------------------------------------------------------- /benignFreewareDownloader.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott" 2 | 3 | from urllib.request import urlopen as uOpen 4 | from bs4 import BeautifulSoup as soup 5 | import re 6 | import os 7 | 8 | 9 | myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact" 10 | 11 | # connecting to and downloading page 12 | uClient = uOpen(myUrl) 13 | page_html = uClient.read() 14 | uClient.close() 15 | 16 | # instatiating BeautifulSoup parsing of first page 17 | page_soup = soup(page_html, "html.parser") 18 | 19 | # gets page numbers from list above program listings 20 | numPagesA = page_soup.findAll("li",{"class":"page-item"}) 21 | numPagesArr = [] 22 | for numPageA in numPagesA: 23 | numPage = numPageA.findAll("a",{"class":"page-link"})[0] 24 | try: 25 | numPage = re.search('(?<=>)[0-9]+(?=<\/a>)',str(numPage)).group(0) 26 | numPagesArr.append(numPage) 27 | except: 28 | pass 29 | 30 | # the last of the list of page numbers is stored for reference as the last 31 | # page of the search 32 | maxPage = numPagesArr[-1] 33 | print("Total pages: " + str(maxPage) + "\n") 34 | 35 | # get next page link 36 | nextA = page_soup.findAll("a",{"aria-label":"Next"})[0] 37 | print(nextA["href"]) 38 | 39 | # get links to individual program download pages 40 | downloadPageHeaders = page_soup.findAll("h3",{"class":"search-head"}) 41 | downloadPageLinks = [] 42 | for pageHeader in downloadPageHeaders: 43 | pageHeader = pageHeader.findAll("a")[0] 44 | downloadPageLink = pageHeader["href"] 45 | print(downloadPageLink) 46 | downloadPageLinks.append(downloadPageLink) 47 | 48 | # main 49 | if __name__ == "__main__": 50 | myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact" 51 | count = 1 52 | 53 | # connecting to and downloading first page 54 | uClient = uOpen(myUrl) 55 | page_html = uClient.read() 56 | uClient.close() 57 | 58 | # instatiating BeautifulSoup parsing of first page 59 | page_soup = soup(page_html, "html.parser") 60 | 61 | # gets page numbers from list above program listings 62 | numPagesA = page_soup.findAll("li", {"class": "page-item"}) 63 | numPagesArr = [] 64 | for numPageA in numPagesA: 65 | numPage = numPageA.findAll("a", {"class": "page-link"})[0] 66 | try: 67 | numPage = re.search('(?<=>)[0-9]+(?=<\/a>)', str(numPage)).group(0) 68 | numPagesArr.append(numPage) 69 | except: 70 | pass 71 | 72 | # the last of the list of page numbers is stored for reference as the last 73 | # page of the search 74 | maxPage = int(numPagesArr[-1]) 75 | 
print("Total pages: " + str(maxPage) + "\n") 76 | 77 | # get next page link 78 | nextPage = page_soup.findAll("a", {"aria-label": "Next"})[0]["href"] 79 | 80 | # get links to individual program download pages 81 | downloadPageHeaders = page_soup.findAll("h3", {"class": "search-head"}) 82 | downloadPageLinks = [] 83 | for pageHeader in downloadPageHeaders: 84 | pageHeader = pageHeader.findAll("a")[0] 85 | downloadPageLink = pageHeader["href"] 86 | print(downloadPageLink) 87 | downloadPageLinks.append(downloadPageLink) 88 | 89 | # load the page's linked download pages and download exe files 90 | for dlPage in downloadPageLinks: 91 | myUrl = dlPage 92 | myUrl = myUrl.replace("_program_", "-Download-Page-") 93 | 94 | # connecting to and downloading page 95 | uClient = uOpen(myUrl) 96 | page_html = uClient.read() 97 | uClient.close() 98 | 99 | # instatiating BeautifulSoup parsing of page 100 | dlPageSoup = soup(page_html, "html.parser") 101 | 102 | downLinks = dlPageSoup.findAll("a", {"class": "dwnlocations"}) 103 | for link in downLinks: 104 | link = link["href"] 105 | try: 106 | file = uOpen(link) 107 | 108 | if int(file.info()['Content-Length']) <= 27000000: 109 | print(str(count) + ": " + link) 110 | os.system("sudo wget -O /media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str(count) + " " + link + " --read-timeout=1") 111 | count += 1 112 | except: 113 | pass 114 | 115 | 116 | for pageNum in range(2,maxPage+1): 117 | print("Page " + str(pageNum) + ": ") 118 | myUrl = nextPage 119 | 120 | # connecting to and downloading page 121 | # last 8 characters of url is removed as it 122 | # didn't seem to effect loading of page, and 123 | # could not be parsed by 'urlopen' due to utf-8 encoding 124 | myUrl = myUrl[:-8] 125 | print("\n" + myUrl + "\n") 126 | uClient = uOpen(myUrl) 127 | page_html = uClient.read() 128 | uClient.close() 129 | 130 | # instatiating BeautifulSoup parsing of first page 131 | page_soup = soup(page_html, "html.parser") 132 | 133 | # get links to individual program download pages 134 | downloadPageHeaders = page_soup.findAll("h3", {"class": "search-head"}) 135 | downloadPageLinks = [] 136 | for pageHeader in downloadPageHeaders: 137 | pageHeader = pageHeader.findAll("a")[0] 138 | downloadPageLink = pageHeader["href"] 139 | print(downloadPageLink) 140 | downloadPageLinks.append(downloadPageLink) 141 | 142 | # load the page's linked download pages and download exe files 143 | for dlPage in downloadPageLinks: 144 | myUrl = dlPage 145 | myUrl = myUrl.replace("_program_","-Download-Page-") 146 | 147 | # connecting to and downloading page 148 | uClient = uOpen(myUrl) 149 | page_html = uClient.read() 150 | uClient.close() 151 | 152 | # instatiating BeautifulSoup parsing of page 153 | dlPageSoup = soup(page_html, "html.parser") 154 | 155 | downLinks = dlPageSoup.findAll("a", {"class": "dwnlocations"}) 156 | for link in downLinks: 157 | link = link["href"] 158 | try: 159 | file = uOpen(link) 160 | 161 | if int(file.info()['Content-Length']) <= 27000000: 162 | print(str(count) + ": " + link) 163 | os.system( 164 | "sudo wget -O /media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str( 165 | count) + " " + link + " --read-timeout=1") 166 | count += 1 167 | except: 168 | pass 169 | 170 | # downLinks = dlPageSoup.findAll("a",{"class":"dwnlocations"}) 171 | # for link in downLinks: 172 | # link = link["href"] 173 | # try: 174 | # file = uOpen(link) 175 | # print(file.info()['Content-Length']) 176 | # if 
int(file.info()['Content-Length']) <= 27000000: 177 | # print(str(count) + ": " + link) 178 | # os.system("sudo wget -O ~/media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str(count) + " " + link) 179 | # count += 1 180 | # except: 181 | # pass 182 | 183 | 184 | # get next page link 185 | nextPage = page_soup.findAll("a", {"aria-label": "Next"})[0]["href"] 186 | 187 | -------------------------------------------------------------------------------- /ensemblePredict.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math, string, pefile, time, threading 4 | import tkinter as tk 5 | import numpy as np 6 | from capstone import * 7 | from keras.models import Sequential, Model, Input 8 | from keras import layers, preprocessing 9 | from keras.utils import Sequence 10 | from sklearn.utils import shuffle 11 | from tkinter import messagebox 12 | from tkinter.filedialog import askopenfilenames 13 | from tkinter.ttk import Progressbar 14 | 15 | ## Defining models (opcode, strings, and ensemble) 16 | 17 | # Defining the opcode model 18 | opModel = Sequential() 19 | 20 | opModel.add(layers.InputLayer(input_shape=(50,))) 21 | opModel.add(layers.Dense(256, activation='relu')) 22 | opModel.add(layers.BatchNormalization()) 23 | opModel.add(layers.Dense(128, activation='relu')) 24 | opModel.add(layers.BatchNormalization()) 25 | opModel.add(layers.Dense(64, activation='relu')) 26 | opModel.add(layers.BatchNormalization()) 27 | opModel.add(layers.Dense(32, activation='relu')) 28 | opModel.add(layers.BatchNormalization()) 29 | opModel.add(layers.Dense(16, activation='relu')) 30 | opModel.add(layers.BatchNormalization()) 31 | opModel.add(layers.Dense(3, activation='softmax')) 32 | 33 | opModel.load_weights("weights-improvement-574-0.85.hdf5") 34 | 35 | opModel.compile(optimizer="rmsprop", 36 | loss='categorical_crossentropy', 37 | metrics=['accuracy']) 38 | 39 | 40 | class histSequence(Sequence): 41 | 42 | def __init__(self, x, y, batch_size): 43 | self.x, self.y = shuffle(x, y) 44 | self.batch_size = batch_size 45 | 46 | def __len__(self): 47 | return math.ceil(len(self.x) / self.batch_size) 48 | 49 | def __getitem__(self, idx): 50 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 51 | self.batch_size] 52 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 53 | self.batch_size] 54 | 55 | return np.array([ 56 | np.load(file_name) 57 | for file_name in batch_x]), np.array(batch_y) 58 | 59 | def on_epoch_end(self): 60 | pass 61 | 62 | 63 | class histSequenceVal(histSequence): 64 | 65 | def __init__(self, x, y, batch_size): 66 | self.x, self.y = x, y 67 | self.batch_size = batch_size 68 | 69 | 70 | # Defining the strings as greyscale images model 71 | model = Sequential() 72 | 73 | model.add(layers.InputLayer(input_shape=(100, 100, 1))) 74 | model.add(layers.SpatialDropout2D(rate=0.2)) 75 | model.add(layers.Conv2D(32, kernel_size=3, activation='relu')) 76 | model.add(layers.BatchNormalization()) 77 | model.add(layers.SpatialDropout2D(rate=0.1)) 78 | model.add(layers.Conv2D(16, kernel_size=3, activation='relu')) 79 | model.add(layers.BatchNormalization()) 80 | model.add(layers.SpatialDropout2D(rate=0.1)) 81 | model.add(layers.Flatten()) 82 | model.add(layers.Dense(3, activation='softmax')) 83 | 84 | 85 | class hashCorpusSequence(Sequence): 86 | 87 | def __init__(self, x, y, batch_size): 88 | self.x, self.y = shuffle(x, y) 89 | self.batch_size = batch_size 90 | 91 | def __len__(self): 
92 | return math.ceil(len(self.x) / self.batch_size) 93 | 94 | def __getitem__(self, idx): 95 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 96 | self.batch_size] 97 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 98 | self.batch_size] 99 | 100 | return np.array([ 101 | np.rint(((np.load(file_name) - np.min(np.load(file_name))) / 102 | (np.max(np.load(file_name)) - np.min(np.load(file_name)))) * 255).astype(int) 103 | for file_name in batch_x]), np.array(batch_y) 104 | 105 | def on_epoch_end(self): 106 | pass 107 | 108 | 109 | class hashCorpusSequenceVal(hashCorpusSequence): 110 | 111 | def __init__(self, x, y, batch_size): 112 | self.x, self.y = x, y 113 | self.batch_size = batch_size 114 | 115 | 116 | model.load_weights("weights-improvement-04-0.72.hdf5") 117 | 118 | model.compile(optimizer="adamax", 119 | loss='categorical_crossentropy', 120 | metrics=['accuracy']) 121 | 122 | 123 | opModel.name = "opcodeModel" 124 | model.name = "stringsAsGreyscaleModel" 125 | 126 | def ensemble(models, model_inputs): 127 | outputs = [models[0](model_inputs[0]), models[1](model_inputs[1])] 128 | y = layers.average(outputs) 129 | 130 | modelEns = Model(model_inputs, y, name='ensemble') 131 | 132 | return modelEns 133 | 134 | 135 | models = [opModel, model] 136 | model_inputs = [Input(shape=(50,)), Input(shape=(100, 100, 1))] 137 | modelEns = ensemble(models, model_inputs) 138 | modelEns.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) 139 | 140 | 141 | ## Pre-processing of PE (EXE, DLL, etc.) file(s) 142 | 143 | # https://stackoverflow.com/questions/17195924/python-equivalent-of-unix-strings-utility 144 | # Solution to Python based 'strings' alternative from SO. Decodes bytes of binary file as 145 | # utf-8 strings 146 | def strings(filename, min=4): 147 | with open(filename, errors="ignore", encoding="utf-8") as f: 148 | result = "" 149 | for c in f.read(): 150 | if c in string.printable: 151 | result += c 152 | continue 153 | if len(result) >= min: 154 | yield result 155 | result = "" 156 | if len(result) >= min: # catch result at EOF 157 | yield result 158 | 159 | 160 | # Converting utf-8 string to sequence of words 161 | def wordSequence(pePath): 162 | try: 163 | text = "" 164 | for s in strings(pePath): 165 | text += s + "\n" 166 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 167 | return sequence 168 | except Exception as e: 169 | print(e) 170 | 171 | # Hashing words of word sequences into sequences of word-specific integers 172 | def hashWordSequences(sequences, maxSeqLen, vocabSize): 173 | 174 | hashedSeqs = [] 175 | docCount = 0 176 | for sequence in sequences: 177 | try: 178 | text = " ".join(sequence) 179 | hashWordIDs = preprocessing.text.hashing_trick(text, round(vocabSize * 1.5), hash_function='md5') 180 | docLen = len(hashWordIDs) 181 | if docLen < maxSeqLen: 182 | hashWordIDs += [0 for i in range(0, maxSeqLen-docLen)] 183 | hashWordIDs = np.array(hashWordIDs).reshape(100, 100, 1) 184 | hashedSeqs.append(hashWordIDs) 185 | docCount += 1 186 | except Exception as e: 187 | print(e) 188 | return hashedSeqs 189 | 190 | 191 | # Function takes list of paths to PE files and returns a list 192 | # of lists, with the first index as input for the opcode model, 193 | # and the second index as input for the strings model 194 | def preprocessPEs(pePaths): 195 | mlInputs = [] 196 | 197 | # Get percentage opcode composition of file assembly code for the top 50 most common opcodes 198 | # in each file 199 | opCodeSet = set()
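# Editor's note (hedged): the disassembly below fixes Capstone to 32-bit x86 (CS_ARCH_X86,
# CS_MODE_32), matching the training corpus. For 64-bit PE files the mode could instead be
# derived from the PE header, e.g.:
#
#     mode = CS_MODE_64 if pe.FILE_HEADER.Machine == 0x8664 else CS_MODE_32
#     cs = Cs(CS_ARCH_X86, mode)
#
# (0x8664 is IMAGE_FILE_MACHINE_AMD64 in the PE specification.)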
200 | opCodeDicts = [] 201 | opCodeFreqs = {} 202 | 203 | count = 1 204 | for sample in pePaths: 205 | try: 206 | pe = pefile.PE(sample, fast_load=True) 207 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 208 | data = pe.get_memory_mapped_image()[entryPoint:] 209 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 210 | 211 | opcodes = [] 212 | for i in cs.disasm(data, 0x1000): 213 | opcodes.append(i.mnemonic) 214 | 215 | opcodeDict = {} 216 | total = len(opcodes) 217 | 218 | opCodeSet = set(list(opCodeSet) + opcodes) 219 | for opcode in opCodeSet: 220 | freq = 1 221 | for op in opcodes: 222 | if opcode == op: 223 | freq += 1 224 | try: 225 | opCodeFreqs[opcode] += freq 226 | except: 227 | opCodeFreqs[opcode] = freq 228 | 229 | opcodeDict[opcode] = round((freq / total) * 100, 2) 230 | 231 | opCodeDicts.append(opcodeDict) 232 | count += 1 233 | 234 | except Exception as e: 235 | print(e) 236 | 237 | opCodeFreqsSorted = np.genfromtxt("top50opcodes.csv", delimiter=",", dtype="str")[1:, 0] 238 | 239 | count = 0 240 | for opDict in opCodeDicts: 241 | opFreqVec = [] 242 | for opcode in opCodeFreqsSorted[:50]: 243 | try: 244 | opFreqVec.append(opDict[opcode]) 245 | except Exception as e: 246 | if str(type(e)) == "<class 'KeyError'>": 247 | opFreqVec.append(0.0) 248 | 249 | mlInputs.append([np.array(opFreqVec)]) 250 | count += 1 251 | 252 | # Get words from utf-8 strings decoded from raw bytes of files, 253 | # and hash to vectors of integers 254 | sequences = [] 255 | count = 0 256 | for sample in pePaths: 257 | sequences.append(wordSequence(sample)) 258 | count += 1 259 | 260 | with open("finalVocabSize.txt", "r") as f: 261 | maxVocabSize = int(f.readline()) 262 | 263 | hashSeqs = hashWordSequences(sequences, 10000, maxVocabSize) 264 | 265 | count = 0 266 | for hashSeq in hashSeqs: 267 | mlInputs[count].append(np.array(hashSeq)) 268 | count += 1 269 | 270 | mlInputs = np.array(mlInputs) 271 | 272 | return mlInputs 273 | 274 | 275 | ## Function taking paths to PE files as input, and returning ensemble model predictions 276 | # as output 277 | def predictPEs(pePaths): 278 | classNames = ["benign", "malware", "ransomware"] 279 | pePredictions = {} 280 | mlInputs = preprocessPEs(pePaths) # pre-process every file once, rather than re-running preprocessPEs per file 281 | count = 0 282 | for pePath in pePaths: 283 | x1 = mlInputs[count][0].reshape(1, 50) 284 | x2 = mlInputs[count][1].reshape(1, 100, 100, 1) 285 | count += 1 286 | pePredictions[pePath] = classNames[np.argmax(modelEns.predict(x=[x1, x2]))] 287 | 288 | return pePredictions 289 | 290 | 291 | if __name__ == "__main__": 292 | tkRoot = tk.Tk() 293 | tkRoot.title("Processing files...") 294 | tkRoot.withdraw() 295 | tkRoot.protocol("WM_DELETE_WINDOW", quit) 296 | w = tkRoot.winfo_screenwidth() 297 | h = tkRoot.winfo_screenheight() 298 | size = tuple(int(pos) for pos in tkRoot.geometry().split('+')[0].split('x')) 299 | x = w / 2 - size[0] / 2 300 | y = h / 2 - size[1] / 2 301 | tkRoot.geometry("300x1+{}+{}".format(round(x) - 150, round(y))) 302 | 303 | while True: 304 | try: 305 | pePaths = list(askopenfilenames(filetypes=[("Windows executable files", "*.exe")])) 306 | tkRoot.update() 307 | tkRoot.deiconify() 308 | preds = predictPEs(pePaths) 309 | if len(preds) > 0: 310 | classificationsStr = "" 311 | for key in preds.keys(): 312 | # print("'" + key + "'" + " detected as " + preds[key]) 313 | classificationsStr += "'" + key + "'" + " detected as " + preds[key] + "\n\n" 314 | tkRoot.withdraw() 315 | messagebox.showinfo("Detections", classificationsStr) 316 | else: 317 | quit() 318 | except Exception as e: 319 | messagebox.showerror("Error", "Error: " + str(e) +
"\nPlease try again...") 320 | --------------------------------------------------------------------------------