├── bin-utf8-vec
│   ├── maxSequenceLen.txt
│   ├── finalVocabSize.txt
│   ├── maxHashWordID.txt
│   ├── dispCorpusSize.sh
│   ├── X_test.npy
│   ├── valInds.npy
│   ├── y_test.npy
│   ├── testInds.npy
│   ├── trainInds.npy
│   ├── joblibParForTest.py
│   ├── getMaxHashID.py
│   ├── truncateCorpus.py
│   ├── vocabSize.py
│   ├── hashAndPadSequences.py
│   ├── wordSequences.py
│   └── bin-utf8-vec.py
├── bin-opcodes-vec
│   ├── X_test.npy
│   ├── y_test.npy
│   ├── testInds.npy
│   ├── valInds.npy
│   ├── top50opcodes.csv
│   ├── top50opcodes.py
│   ├── opcode-model.py
│   └── bin-opcodes-vec.py
├── Project_Report_-_ELL16600748.pdf
├── weights-improvement-04-0.72.hdf5
├── weights-improvement-574-0.85.hdf5
├── LICENSE
├── README.md
├── benignFreewareDownloader.py
└── ensemblePredict.py
/bin-utf8-vec/maxSequenceLen.txt: -------------------------------------------------------------------------------- 1 | 2121399 -------------------------------------------------------------------------------- /bin-utf8-vec/finalVocabSize.txt: -------------------------------------------------------------------------------- 1 | 11695771 -------------------------------------------------------------------------------- /bin-utf8-vec/maxHashWordID.txt: -------------------------------------------------------------------------------- 1 | 9223372036854759194 -------------------------------------------------------------------------------- /bin-utf8-vec/dispCorpusSize.sh: -------------------------------------------------------------------------------- 1 | while sleep 2; do clear; ls -1 corpusTrunc | wc -l; done 2 | -------------------------------------------------------------------------------- /bin-utf8-vec/X_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/X_test.npy -------------------------------------------------------------------------------- /bin-utf8-vec/valInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/valInds.npy -------------------------------------------------------------------------------- /bin-utf8-vec/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/y_test.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/X_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/X_test.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/y_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/y_test.npy -------------------------------------------------------------------------------- /bin-utf8-vec/testInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/testInds.npy -------------------------------------------------------------------------------- /bin-utf8-vec/trainInds.npy: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-utf8-vec/trainInds.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/testInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/testInds.npy -------------------------------------------------------------------------------- /bin-opcodes-vec/valInds.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/bin-opcodes-vec/valInds.npy -------------------------------------------------------------------------------- /Project_Report_-_ELL16600748.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/Project_Report_-_ELL16600748.pdf -------------------------------------------------------------------------------- /weights-improvement-04-0.72.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/weights-improvement-04-0.72.hdf5 -------------------------------------------------------------------------------- /weights-improvement-574-0.85.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laurencejbelliott/Ensemble_DL_Ransomware_Detector/HEAD/weights-improvement-574-0.85.hdf5 -------------------------------------------------------------------------------- /bin-utf8-vec/joblibParForTest.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott" 2 | 3 | from joblib import Parallel, delayed 4 | 5 | def returnArg(arg): 6 | print(arg) 7 | return arg 8 | 9 | argList = [1,2,3,4,5,6,7,8,9,10] 10 | 11 | returnArgResults = Parallel(n_jobs=4, verbose=1, backend="threading")(map(delayed(returnArg), argList)) 12 | print(returnArgResults) -------------------------------------------------------------------------------- /bin-utf8-vec/getMaxHashID.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | 6 | 7 | x = np.core.defchararray.add(np.array(["corpus/"]), np.array(os.listdir("corpus"))) 8 | maxHashWordIDs = [] 9 | 10 | count = 0 11 | for file in x: 12 | sampleArr = np.load(file) 13 | maxHashWordIDs.append(max(sampleArr)) 14 | os.system("clear") 15 | print(count) 16 | count += 1 17 | 18 | maxHashWordID = max(maxHashWordIDs) 19 | print("maxHashWordID:", maxHashWordID) 20 | with open("maxHashWordID.txt", "w") as f: 21 | f.write(str(maxHashWordID)) 22 | -------------------------------------------------------------------------------- /bin-opcodes-vec/top50opcodes.csv: -------------------------------------------------------------------------------- 1 | opcode, frequency 2 | add, 72144237 3 | mov, 28942614 4 | push, 13091749 5 | call, 7452773 6 | cmp, 5527990 7 | int3, 4436803 8 | lea, 4332597 9 | jmp, 4167194 10 | pop, 4074808 11 | je, 4038195 12 | test, 3204034 13 | jne, 3123470 14 | xor, 2607139 15 | nop, 2441467 16 | sub, 2339824 17 | inc, 1875395 18 | and, 1736751 19 | ret, 1699971 20 | or, 1069451 21 | movzx, 1065368 22 | dec, 
1001755 23 | shl, 496341 24 | shr, 482247 25 | jb, 471314 26 | xchg, 443857 27 | jae, 409428 28 | imul, 402999 29 | jg, 341131 30 | sar, 318254 31 | jle, 288714 32 | adc, 286692 33 | jbe, 282154 34 | ja, 264139 35 | leave, 263945 36 | jl, 261413 37 | sbb, 255962 38 | movsx, 222484 39 | neg, 199949 40 | fstp, 188997 41 | jge, 187117 42 | fld, 177448 43 | movsd, 148505 44 | not, 145062 45 | js, 139043 46 | insb, 137781 47 | popal, 133120 48 | setne, 131947 49 | outsd, 129368 50 | outsb, 125488 51 | jns, 114090 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Laurence Elliott 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bin-utf8-vec/truncateCorpus.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math 4 | import numpy as np 5 | 6 | # sampleLens = [] 7 | # count = 0 8 | # for file in os.listdir("corpus"): 9 | # sample = np.load("corpus/" + file) 10 | # zeroArr = [0] 11 | # try: 12 | # zerosInSample = np.isin(sample, zeroArr) 13 | # zerosIndexes = np.where(zerosInSample) 14 | # zerosStart = zerosIndexes[0][0] 15 | # sample = sample[:zerosStart] 16 | # sampleLen = len(sample) 17 | # print(count, sampleLen) 18 | # sampleLens.append(len(sample)) 19 | # except: 20 | # sampleLen = len(sample) 21 | # print(count, sampleLen) 22 | # sampleLens.append(len(sample)) 23 | # count += 1 24 | # # sample = np.concatenate((sample[0:200], sample[::-1][0:200])) 25 | # 26 | # minSampleLen = np.min(sampleLens) 27 | # print(minSampleLen) 28 | 29 | # Min sample length is 18 bytes D: 30 | maxSequenceLen = 10000 31 | lenSqrt = int(math.sqrt(maxSequenceLen)) 32 | print(lenSqrt) 33 | 34 | count = 0 35 | for file in os.listdir("corpus"): 36 | sample = np.load("corpus/" + file)[:maxSequenceLen] 37 | sample = np.rint(((sample - np.min(sample)) / 38 | (np.max(sample) - np.min(sample))) * 255)\ 39 | .astype('int').reshape(lenSqrt, lenSqrt, 1) 40 | np.save("corpusTrunc/" + file, sample) 41 | print(count) 42 | count += 1 -------------------------------------------------------------------------------- /bin-utf8-vec/vocabSize.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | 6 | benignSequenceFiles = np.core.defchararray.add(np.array(["benignSequences/"]), 7 | np.array(os.listdir("benignSequences"))) 8 | 9 | malwareSequenceFiles = np.core.defchararray.add(np.array(["malwareSequences/"]), 10 | np.array(os.listdir("malwareSequences"))) 11 | 12 | ransomSequenceFiles = np.core.defchararray.add(np.array(["ransomSequences/"]), 13 | np.array(os.listdir("ransomSequences"))) 14 | 15 | 16 | vocab = set() 17 | 18 | 19 | for sampleN in range(0, 10000): 20 | with open(benignSequenceFiles[sampleN]) as f: 21 | vocab = vocab.union(set(f.readlines())) 22 | if sampleN % 100 == 0: 23 | print(sampleN) 24 | 25 | for sampleN in range(0, 10000): 26 | with open(malwareSequenceFiles[sampleN]) as f: 27 | vocab = vocab.union(set(f.readlines())) 28 | if sampleN % 100 == 0: 29 | print(sampleN + 10000) 30 | 31 | for sampleN in range(0, 10000): 32 | with open(ransomSequenceFiles[sampleN]) as f: 33 | vocab = vocab.union(set(f.readlines())) 34 | if sampleN % 100 == 0: 35 | print(sampleN + 20000) 36 | 37 | with open("finalVocabSize.txt", "w") as f: 38 | f.write(str(len(vocab))) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ensemble Deep Learning Ransomware Detector 2 | A Deep Learning ensemble that classifies Windows executable files as either benign, ransomware, or other malware. 3 | This program was developed as part of my dissertation for my BSc (Hons) Computer Science course at the University of Lincoln: ['Ransomware Detection Using Deep Learning Ensemble'](Project_Report_-_ELL16600748.pdf) in which it is demonstrated to achieve 96% accuracy in classifying a test set of 3000 '.exe' files not seen in the model's training. 
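For scripted use, the ensemble can also be driven without the file dialog via `predictPEs` from `ensemblePredict.py`. A minimal sketch (the sample paths below are placeholders; the two `.hdf5` weight files are loaded when the module is imported, and `top50opcodes.csv` and `finalVocabSize.txt` are read from the working directory at prediction time):

```python
from ensemblePredict import predictPEs

# Keys are the paths passed in; values are "benign", "malware", or "ransomware".
predictions = predictPEs(["samples/example1.exe", "samples/example2.exe"])
for path, label in predictions.items():
    print(path, "->", label)
```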
4 | 5 | # Setup 6 | This project uses Python 3. For the GUI detector program `ensemblePredict.py`, the following Python packages must be installed: tensorflow, keras, h5py, capstone, pefile, numpy, and scikit-learn. These can be installed via the terminal or command prompt command `pip install tensorflow keras h5py capstone pefile numpy scikit-learn`. Then simply run the script with `python ensemblePredict.py`. You should be greeted by a file selection dialog with which you can select one or more '.exe' files, then click 'Open' and the deep learning ensemble will predict if they are benign, ransomware, or other malware. 7 | 8 | Source code for training and pre-processing for the ensemble's two models, in the folders `bin-opcodes-vec` and `bin-utf8-vec`, should run with the same pre-requisites, though `tensorflow-gpu` is recommended to achieve reasonable training times. I am not licensed to distribute the benign samples used to train the models, but these can be downloaded by installing BeautifulSoup with `pip install beautifulsoup4` and running `python benignFreewareDownloader.py`. Malware and ransomware samples were obtained from torrents available from [VirusShare.com](https://virusshare.com), after being vetted by the site's admin. Details about the particular torrents used, among other details of the model and its development, can be found in this project's [report](Project_Report_-_ELL16600748.pdf). 9 | -------------------------------------------------------------------------------- /bin-opcodes-vec/top50opcodes.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | from capstone import * 4 | import pefile, os 5 | 6 | # samplePaths = ["testSamples/" + sample for sample in os.listdir("testSamples")] 7 | samplePaths = ["../bin-utf8-vec/benignSamples/" + sample for sample in os.listdir("../bin-utf8-vec/benignSamples")] + \ 8 | ["../bin-utf8-vec/malwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/malwareSamples")] + \ 9 | ["../bin-utf8-vec/ransomwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/ransomwareSamples")] 10 | 11 | 12 | 13 | opcodeSet = set() 14 | opCodeDicts = [] 15 | opCodeFreqs = {} 16 | nSamples = len(samplePaths) 17 | 18 | count = 1 19 | for sample in samplePaths: 20 | try: 21 | pe = pefile.PE(sample, fast_load=True) 22 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 23 | data = pe.get_memory_mapped_image()[entryPoint:] 24 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 25 | 26 | opcodes = [] 27 | for i in cs.disasm(data, 0x1000): 28 | opcodes.append(i.mnemonic) 29 | 30 | opcodeDict = {} 31 | total = len(opcodes) 32 | 33 | opcodeSet = set(list(opcodeSet) + opcodes) 34 | for opcode in opcodeSet: 35 | freq = 1 36 | for op in opcodes: 37 | if opcode == op: 38 | freq += 1 39 | try: 40 | opCodeFreqs[opcode] += freq 41 | except: 42 | opCodeFreqs[opcode] = freq 43 | 44 | opcodeDict[opcode] = round((freq / total) * 100, 2) 45 | 46 | 47 | opCodeDicts.append(opcodeDict) 48 | os.system("clear") 49 | print(str((count / nSamples) * 100) + "%") 50 | count += 1 51 | except Exception as e: 52 | print(e) 53 | 54 | # for opcode in opcodeSet: 55 | # print(opcode, str(opcodeDict[opcode]) + "%") 56 | 57 | # for opcodeDict in opCodeDicts: 58 | # freqSorted = sorted(opcodeDict, key=opcodeDict.get)[-1:0:-1] 59 | # print(opcodeDict[freqSorted[0]], opcodeDict[freqSorted[1]], opcodeDict[freqSorted[2]], freqSorted) 60 | 61 | opCodeFreqsSorted = sorted(opCodeFreqs, key=opCodeFreqs.get)[-1:0:-1]
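# Editor's note: the slice [-1:0:-1] above reverses the ascending sort but stops before
# index 0, so the single least-frequent opcode is silently dropped from the ranking. That is
# harmless here, since only the first 50 entries are written out below, but an explicit
# descending sort states the intent directly, e.g.:
#
#     opCodeFreqsSorted = sorted(opCodeFreqs, key=opCodeFreqs.get, reverse=True)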
62 | 63 | with open("top50opcodes.csv", "w") as f: 64 | f.write("opcode, frequency\n") 65 | for opcode in opCodeFreqsSorted[:50]: 66 | f.write(str(opcode) + ", " + str(opCodeFreqs[opcode]) + "\n") 67 | print(opcode, opCodeFreqs[opcode]) 68 | 69 | -------------------------------------------------------------------------------- /bin-utf8-vec/hashAndPadSequences.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os 4 | import numpy as np 5 | from keras import preprocessing 6 | 7 | docTotal = 30000 8 | maxSequenceLen = 10000 9 | 10 | with open("finalVocabSize.txt", "r") as f: 11 | maxVocabSize = int(f.readline()) 12 | 13 | # hashing_trick maps each word to an integer in [1, n] via md5; n is ~1.5x the observed vocabulary to reduce collisions 14 | def hashWordSequences(sequencePath, outputPath, maxSeqLen, vocabSize, docT, nSamples): 15 | docCount = 0 16 | if sequencePath[-1] != "/": sequencePath += "/" 17 | if outputPath[-1] != "/": outputPath += "/" 18 | 19 | seqFiles = [sequencePath + os.listdir(sequencePath)[i] for i in range(0, nSamples)] 20 | for seqFile in seqFiles: 21 | with open(seqFile, "r") as f: 22 | try: 23 | sequence = np.char.replace(np.array(f.readlines()), "\n", "") 24 | text = " ".join(sequence) 25 | hashWordIDs = preprocessing.text.hashing_trick(text, round(vocabSize * 1.5), hash_function='md5') 26 | docLen = len(hashWordIDs) 27 | if docLen < maxSeqLen: 28 | hashWordIDs += [0 for i in range(0, maxSeqLen-docLen)] 29 | hashWordIDs = np.array(hashWordIDs).reshape(100, 100, 1) 30 | np.save(outputPath + str(docCount) + ".npy", hashWordIDs) 31 | if docCount % 100 == 0: 32 | print(str(int((docCount / nSamples) * 100)) + "%") 33 | docCount += 1 34 | except Exception as e: 35 | print(e) 36 | 37 | 38 | # print("Max vocab size (for hashing trick):", maxVocabSize, "\nMax sequence length (for zero padding):", maxSequenceLen) 39 | # 40 | # print("Hashing benign word sequences...") 41 | # hashWordSequences(sequencePath="benignSequences", 42 | # outputPath="finalBenignCorpus", 43 | # maxSeqLen=maxSequenceLen, 44 | # vocabSize=maxVocabSize, 45 | # docT=docTotal, 46 | # nSamples=10000) 47 | 48 | print("Hashing malware word sequences...") 49 | hashWordSequences(sequencePath="malwareSequences", 50 | outputPath="finalMalwareCorpus", 51 | maxSeqLen=maxSequenceLen, 52 | vocabSize=maxVocabSize, 53 | docT=docTotal, 54 | nSamples=10000) 55 | 56 | print("Hashing ransomware word sequences...") 57 | hashWordSequences(sequencePath="ransomSequences", 58 | outputPath="finalRansomCorpus", 59 | maxSeqLen=maxSequenceLen, 60 | vocabSize=maxVocabSize, 61 | docT=docTotal, 62 | nSamples=10000) 63 | -------------------------------------------------------------------------------- /bin-utf8-vec/wordSequences.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import string, os, math 4 | import numpy as np 5 | from tensorflow.python.keras import preprocessing 6 | from joblib import Parallel, delayed 7 | 8 | def strings(filename, min=4): 9 | with open(filename, errors="ignore", encoding="utf-8") as f: 10 | result = "" 11 | for c in f.read(): 12 | if c in string.printable: 13 | result += c 14 | continue 15 | if len(result) >= min: 16 | yield result 17 | result = "" 18 | if len(result) >= min: # catch result at EOF 19 | yield result 20 | 21 | 22 | def wordSequencesBenign(sampleN): 23 | print(benignSampleFiles[sampleN]) 24 | try: 25 | text = "" 26 | for s in strings("benignSamples/" + benignSampleFiles[sampleN]): 27 | text += s + "\n" 28 | sequence
= preprocessing.text.text_to_word_sequence(text)[:10000] 29 | np.savetxt("benignSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 30 | del text, sequence 31 | except Exception as e: 32 | print(e) 33 | 34 | 35 | def wordSequencesMalware(sampleN): 36 | print(malwareSampleFiles[sampleN]) 37 | try: 38 | text = "" 39 | for s in strings("malwareSamples/" + malwareSampleFiles[sampleN]): 40 | text += s + "\n" 41 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 42 | np.savetxt("malwareSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 43 | del text, sequence 44 | except Exception as e: 45 | print(e) 46 | 47 | 48 | def wordSequencesRansom(sampleN): 49 | print(ransomSampleFiles[sampleN]) 50 | try: 51 | text = "" 52 | for s in strings("ransomwareSamples/" + ransomSampleFiles[sampleN]): 53 | text += s + "\n" 54 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 55 | np.savetxt("ransomSequences/" + str(sampleN) + ".txt", sequence, fmt="%s") 56 | del text, sequence 57 | except Exception as e: 58 | print(e) 59 | 60 | 61 | # 10,000 samples will be used from each class 62 | # Only 9,255 benign samples were gathered, so the first 745 samples are used again 63 | benignSampleFiles = os.listdir("benignSamples") + os.listdir("benignSamples")[:745] 64 | malwareSampleFiles = os.listdir("malwareSamples")[:10000] 65 | ransomSampleFiles = os.listdir("ransomwareSamples")[:10000] 66 | 67 | nBenignSamples = len(benignSampleFiles) 68 | nMalwareSamples = len(malwareSampleFiles) 69 | nRansomSamples = len(ransomSampleFiles) 70 | 71 | benignSequences = [] 72 | malwareSequences = [] 73 | ransomSequences = [] 74 | 75 | print(nBenignSamples, nMalwareSamples, nRansomSamples) 76 | 77 | print("Generating word sequences for benign samples...") 78 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesBenign), range(0, nBenignSamples))) 79 | 80 | print("Generating word sequences for malware samples...") 81 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesMalware), range(0, nMalwareSamples))) 82 | 83 | print("Generating word sequences for ransomware samples...") 84 | Parallel(n_jobs=-1, verbose=11, backend="threading")(map(delayed(wordSequencesRansom), range(0, nRansomSamples))) 85 | 86 | -------------------------------------------------------------------------------- /bin-opcodes-vec/opcode-model.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math 4 | import numpy as np 5 | import tensorflow as tf 6 | from time import time 7 | from tensorflow.python.keras.models import Sequential 8 | from tensorflow.python.keras import preprocessing, layers, optimizers 9 | from tensorflow.python.keras.utils import Sequence, to_categorical 10 | from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint 11 | from sklearn.utils import shuffle 12 | from sklearn.model_selection import train_test_split 13 | 14 | 15 | benignHists = ["benignHistVecs/" + fileName \ 16 | for fileName in os.listdir("benignHistVecs")] 17 | benignHists += benignHists[:745] 18 | 19 | malwareHists = ["malwareHistVecs/" + fileName \ 20 | for fileName in os.listdir("malwareHistVecs")] 21 | malwareHists += malwareHists[:16] 22 | 23 | ransomHists = ["ransomHistVecs/" + fileName \ 24 | for fileName in os.listdir("ransomHistVecs")] 25 | ransomHists = ransomHists[:10000] 26 | 27 | nBenignSamples = len(benignHists) 28 | nMalwareSamples = len(malwareHists) 29 | 
nRansomSamples = len(ransomHists) 30 | 31 | 32 | 33 | # x is a list of paths to training samples 34 | x = np.array(benignHists + malwareHists + ransomHists) 35 | # y is a list of samples' associated class labels with one-hot encoding 36 | y = np.ones(nBenignSamples+nMalwareSamples+nRansomSamples) 37 | y[0:nBenignSamples] = 0 38 | y[nBenignSamples:nBenignSamples+nMalwareSamples] = 1 39 | y[nBenignSamples+nMalwareSamples:nBenignSamples+nMalwareSamples+nRansomSamples] = 2 40 | # represent labels with one-hot encoding 41 | y = to_categorical(y, num_classes=3) 42 | 43 | # Dataset is indexed by same shuffled and split indexing as the other model 44 | trainInds = np.load("trainInds.npy") 45 | valInds = np.load("valInds.npy") 46 | testInds = np.load("testInds.npy") 47 | X_train = x[trainInds] 48 | X_val = x[valInds] 49 | X_test = x[testInds] 50 | 51 | y_train = y[trainInds] 52 | y_val = y[valInds] 53 | y_test = y[testInds] 54 | 55 | 56 | class histSequence(Sequence): 57 | 58 | def __init__(self, x, y, batch_size): 59 | self.x, self.y = shuffle(x, y) 60 | self.batch_size = batch_size 61 | 62 | def __len__(self): 63 | return math.ceil(len(self.x) / self.batch_size) 64 | 65 | def __getitem__(self, idx): 66 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 67 | self.batch_size] 68 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 69 | self.batch_size] 70 | 71 | return np.array([ 72 | np.load(file_name) 73 | for file_name in batch_x]), np.array(batch_y) 74 | 75 | def on_epoch_end(self): 76 | pass 77 | 78 | 79 | class histSequenceVal(histSequence): 80 | 81 | def __init__(self, x, y, batch_size): 82 | self.x, self.y = x, y 83 | self.batch_size = batch_size 84 | 85 | 86 | batch_size = 1000 87 | sequenceGenerator = histSequence(X_train, y_train, batch_size) 88 | validationSeqGen = histSequenceVal(X_val, y_val, batch_size) 89 | print(validationSeqGen.__getitem__(0)) 90 | 91 | # Defining the ML model 92 | model = Sequential() 93 | 94 | model.add(layers.InputLayer(input_shape=(50,))) 95 | model.add(layers.Dense(256, activation='relu')) 96 | model.add(layers.BatchNormalization()) 97 | model.add(layers.Dropout(0.2)) 98 | model.add(layers.Dense(128, activation='relu')) 99 | model.add(layers.BatchNormalization()) 100 | model.add(layers.Dense(64, activation='relu')) 101 | model.add(layers.BatchNormalization()) 102 | model.add(layers.Dense(32, activation='relu')) 103 | model.add(layers.BatchNormalization()) 104 | model.add(layers.Dense(16, activation='relu')) 105 | model.add(layers.BatchNormalization()) 106 | model.add(layers.Dense(3, activation='softmax')) 107 | 108 | tensorboard = TensorBoard(log_dir="logs/{}".format(time())) 109 | 110 | model.compile(optimizer="rmsprop", 111 | loss='categorical_crossentropy', 112 | metrics=['accuracy']) 113 | 114 | filePath="weights/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 115 | checkpoint = ModelCheckpoint(filePath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 116 | callbackList = [tensorboard, checkpoint] 117 | 118 | # Training the model 119 | model.fit_generator(generator=sequenceGenerator, 120 | epochs=1000, 121 | steps_per_epoch=len(sequenceGenerator), 122 | verbose=1, 123 | validation_data=validationSeqGen, 124 | validation_steps=len(validationSeqGen), 125 | workers=8, 126 | use_multiprocessing=True, 127 | callbacks=callbackList) 128 | -------------------------------------------------------------------------------- /bin-utf8-vec/bin-utf8-vec.py: -------------------------------------------------------------------------------- 1 | 
__author__ = "Laurence Elliott - 16600748" 2 | 3 | import string, os, json, math, random 4 | from time import time 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import tensorflow as tf 8 | from tensorflow.python.keras.models import Sequential 9 | from tensorflow.python.keras import preprocessing, layers, optimizers 10 | from tensorflow.python.keras.utils import Sequence, to_categorical 11 | from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint 12 | from sklearn.utils import shuffle 13 | from sklearn.model_selection import train_test_split 14 | from joblib import Parallel, delayed 15 | 16 | # https://stackoverflow.com/questions/17195924/python-equivalent-of-unix-strings-utility 17 | # Solution to Python based 'strings' alternative from SO 18 | 19 | def strings(filename, min=4): 20 | with open(filename, errors="ignore", encoding="utf-8") as f: 21 | result = "" 22 | for c in f.read(): 23 | if c in string.printable: 24 | result += c 25 | continue 26 | if len(result) >= min: 27 | yield result 28 | result = "" 29 | if len(result) >= min: # catch result at EOF 30 | yield result 31 | 32 | 33 | vocabSizes = [] 34 | wordSequenceLens = [] 35 | benignSequences = [] 36 | malwareSequences = [] 37 | nBenignSamples = 10000 38 | nMalwareSamples = 10000 39 | nRansomSamples = 10000 40 | 41 | 42 | with open("maxVocabSize.txt","r") as f: 43 | maxVocabSize = int(f.read()) 44 | 45 | maxSequenceLen = 10000 46 | 47 | with open("maxHashWordID.txt","r") as f: 48 | maxHashWordID = int(f.read()) 49 | 50 | 51 | benignCorpus = ["finalBenignCorpus/" + fileName \ 52 | for fileName in os.listdir("finalBenignCorpus")] 53 | 54 | malwareCorpus = ["finalMalwareCorpus/" + fileName \ 55 | for fileName in os.listdir("finalMalwareCorpus")] 56 | malwareCorpus += malwareCorpus[:5] 57 | 58 | ransomCorpus = ["finalRansomCorpus/" + fileName \ 59 | for fileName in os.listdir("finalRansomCorpus")] 60 | ransomCorpus += ransomCorpus[:2] 61 | 62 | # x is a list of paths to training samples 63 | x = np.array(benignCorpus + malwareCorpus + ransomCorpus) 64 | # y is a list of samples' associated class labels with one-hot encoding 65 | y = np.ones(nBenignSamples+nMalwareSamples+nRansomSamples) 66 | y[0:nBenignSamples] = 0 67 | y[nBenignSamples:nBenignSamples+nMalwareSamples] = 1 68 | y[nBenignSamples+nMalwareSamples:nBenignSamples+nMalwareSamples+nRansomSamples] = 2 69 | # represent labels with one-hot encoding 70 | y = to_categorical(y,num_classes=3) 71 | 72 | 73 | # Dataset is indexed by same shuffled and split indexing as the other model 74 | trainInds = np.load("trainInds.npy") 75 | valInds = np.load("valInds.npy") 76 | testInds = np.load("testInds.npy") 77 | X_train = x[trainInds] 78 | X_val = x[valInds] 79 | X_test = x[testInds] 80 | 81 | y_train = y[trainInds] 82 | y_val = y[valInds] 83 | y_test = y[testInds] 84 | 85 | 86 | class hashCorpusSequence(Sequence): 87 | 88 | def __init__(self, x, y, batch_size): 89 | self.x, self.y = shuffle(x, y) 90 | self.batch_size = batch_size 91 | 92 | def __len__(self): 93 | return math.ceil(len(self.x) / self.batch_size) 94 | 95 | def __getitem__(self, idx): 96 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 97 | self.batch_size] 98 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 99 | self.batch_size] 100 | 101 | return np.array([ 102 | # np.load(file_name) 103 | np.rint(((np.load(file_name) - np.min(np.load(file_name))) / 104 | (np.max(np.load(file_name)) - np.min(np.load(file_name)))) * 255).astype(int) 105 | for file_name in batch_x]), np.array(batch_y) 
106 | 107 | def on_epoch_end(self): 108 | pass 109 | 110 | class hashCorpusSequenceVal(hashCorpusSequence): 111 | 112 | def __init__(self, x, y, batch_size): 113 | self.x, self.y = x, y 114 | self.batch_size = batch_size 115 | 116 | 117 | batch_size = 150 118 | sequenceGenerator = hashCorpusSequence(X_train, y_train, batch_size) 119 | validationSeqGen = hashCorpusSequenceVal(X_val, y_val, batch_size) 120 | 121 | 122 | # res = int(math.sqrt(maxSequenceLen)) 123 | # classTitles = ["benign", "malware", "ransomware"] 124 | # 125 | # sampleCount = random.randrange(0, 2988) 126 | # # plt.title("Sample " + str(sampleCount) + " Converted to Grayscale Image\n(" + 127 | # # classTitles[sequenceGenerator.__getitem__(sampleCount)[1].tolist()[0].index(1)] + ")") 128 | # plt.imshow(sequenceGenerator.__getitem__(sampleCount)[0][0].reshape(res, res), cmap='gray') 129 | 130 | 131 | # Defining the ML model 132 | model = Sequential() 133 | 134 | model.add(layers.InputLayer(input_shape=(100, 100, 1))) 135 | model.add(layers.SpatialDropout2D(rate=0.2)) 136 | model.add(layers.Conv2D(32, kernel_size=3, activation='relu')) 137 | model.add(layers.BatchNormalization()) 138 | model.add(layers.SpatialDropout2D(rate=0.1)) 139 | model.add(layers.Conv2D(16, kernel_size=3, activation='relu')) 140 | model.add(layers.BatchNormalization()) 141 | model.add(layers.SpatialDropout2D(rate=0.1)) 142 | model.add(layers.Flatten()) 143 | model.add(layers.Dense(3, activation='softmax')) 144 | 145 | 146 | tensorboard = TensorBoard(log_dir="logs/{}".format(time())) 147 | 148 | model.compile(optimizer="adamax", 149 | loss='categorical_crossentropy', 150 | metrics=['accuracy']) 151 | 152 | 153 | filePath="weights/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 154 | checkpoint = ModelCheckpoint(filePath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 155 | callbackList = [tensorboard, checkpoint] 156 | 157 | 158 | # Training the model 159 | model.fit_generator(generator=sequenceGenerator, 160 | epochs=1000, 161 | steps_per_epoch=len(sequenceGenerator), 162 | verbose=1, 163 | validation_data=validationSeqGen, 164 | validation_steps=len(validationSeqGen), 165 | workers=8, 166 | use_multiprocessing=True, 167 | callbacks=callbackList) 168 | -------------------------------------------------------------------------------- /bin-opcodes-vec/bin-opcodes-vec.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | from capstone import * 4 | import pefile, os 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | 8 | benignPaths = ["../bin-utf8-vec/benignSamples/" + sample for sample in os.listdir("../bin-utf8-vec/benignSamples")] 9 | malwarePaths = ["../bin-utf8-vec/malwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/malwareSamples")] 10 | ransomPaths = ["../bin-utf8-vec/ransomwareSamples/" + sample for sample in os.listdir("../bin-utf8-vec/ransomwareSamples")] 11 | 12 | nSamples = len(benignPaths) + len(malwarePaths) + len(ransomPaths) 13 | benignOpCodeSet = set() 14 | benignOpCodeDicts = [] 15 | benignOpCodeFreqs = {} 16 | 17 | count = 1 18 | for sample in benignPaths: 19 | try: 20 | pe = pefile.PE(sample, fast_load=True) 21 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 22 | data = pe.get_memory_mapped_image()[entryPoint:] 23 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 24 | 25 | opcodes = [] 26 | for i in cs.disasm(data, 0x1000): 27 | opcodes.append(i.mnemonic) 28 | 29 | opcodeDict = {} 30 | total = len(opcodes) 31 | 
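# Editor's note (hedged sketch): the counting block below tallies each opcode with a nested
# loop, which is O(len(set) * len(opcodes)) per sample and is repeated almost verbatim for the
# malware and ransomware loops further down. collections.Counter does the tally in one linear
# pass, and counts exact occurrences (the loops below start each freq at 1, so every tallied
# frequency is inflated by one):
#
#     from collections import Counter
#
#     counts = Counter(opcodes)
#     for opcode, freq in counts.items():
#         benignOpCodeFreqs[opcode] = benignOpCodeFreqs.get(opcode, 0) + freq
#         opcodeDict[opcode] = round((freq / total) * 100, 2)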
32 | benignOpCodeSet = set(list(benignOpCodeSet) + opcodes) 33 | for opcode in benignOpCodeSet: 34 | freq = 1 35 | for op in opcodes: 36 | if opcode == op: 37 | freq += 1 38 | try: 39 | benignOpCodeFreqs[opcode] += freq 40 | except: 41 | benignOpCodeFreqs[opcode] = freq 42 | 43 | opcodeDict[opcode] = round((freq / total) * 100, 2) 44 | 45 | benignOpCodeDicts.append(opcodeDict) 46 | 47 | os.system("clear") 48 | print(str((count / nSamples) * 100) + "%") 49 | count += 1 50 | 51 | except Exception as e: 52 | print(e) 53 | 54 | 55 | malwareOpCodeSet = set() 56 | malwareOpCodeDicts = [] 57 | malwareOpCodeFreqs = {} 58 | 59 | count = len(benignPaths) 60 | for sample in malwarePaths: 61 | try: 62 | pe = pefile.PE(sample, fast_load=True) 63 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 64 | data = pe.get_memory_mapped_image()[entryPoint:] 65 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 66 | 67 | opcodes = [] 68 | for i in cs.disasm(data, 0x1000): 69 | opcodes.append(i.mnemonic) 70 | 71 | opcodeDict = {} 72 | total = len(opcodes) 73 | 74 | malwareOpCodeSet = set(list(malwareOpCodeSet) + opcodes) 75 | for opcode in malwareOpCodeSet: 76 | freq = 1 77 | for op in opcodes: 78 | if opcode == op: 79 | freq += 1 80 | try: 81 | malwareOpCodeFreqs[opcode] += freq 82 | except: 83 | malwareOpCodeFreqs[opcode] = freq 84 | 85 | opcodeDict[opcode] = round((freq / total) * 100, 2) 86 | 87 | malwareOpCodeDicts.append(opcodeDict) 88 | 89 | os.system("clear") 90 | print(str((count / nSamples) * 100) + "%") 91 | count += 1 92 | 93 | except Exception as e: 94 | print(e) 95 | 96 | ransomOpCodeSet = set() 97 | ransomOpCodeDicts = [] 98 | ransomOpCodeFreqs = {} 99 | 100 | count = len(benignPaths) + len(malwarePaths) 101 | for sample in ransomPaths: 102 | try: 103 | pe = pefile.PE(sample, fast_load=True) 104 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 105 | data = pe.get_memory_mapped_image()[entryPoint:] 106 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 107 | 108 | opcodes = [] 109 | for i in cs.disasm(data, 0x1000): 110 | opcodes.append(i.mnemonic) 111 | 112 | opcodeDict = {} 113 | total = len(opcodes) 114 | 115 | ransomOpCodeSet = set(list(ransomOpCodeSet) + opcodes) 116 | for opcode in ransomOpCodeSet: 117 | freq = 1 118 | for op in opcodes: 119 | if opcode == op: 120 | freq += 1 121 | try: 122 | ransomOpCodeFreqs[opcode] += freq 123 | except: 124 | ransomOpCodeFreqs[opcode] = freq 125 | 126 | opcodeDict[opcode] = round((freq / total) * 100, 2) 127 | 128 | ransomOpCodeDicts.append(opcodeDict) 129 | 130 | os.system("clear") 131 | print(str((count / nSamples) * 100) + "%") 132 | count += 1 133 | 134 | except Exception as e: 135 | print(e) 136 | 137 | 138 | opCodeFreqsSorted = np.genfromtxt("top50opcodes.csv", delimiter=",", dtype="str")[1:, 0] 139 | 140 | count = 0 141 | for opDict in benignOpCodeDicts: 142 | opFreqVec = [] 143 | for opcode in opCodeFreqsSorted[:50]: 144 | try: 145 | opFreqVec.append(opDict[opcode]) 146 | except Exception as e: 147 | if str(type(e)) == "<class 'KeyError'>": 148 | opFreqVec.append(0.0) 149 | 150 | np.save("benignHistVecs/" + str(count)+".npy", opFreqVec) 151 | os.system("clear") 152 | print(str((count / nSamples) * 100) + "%") 153 | count += 1 154 | 155 | 156 | count = len(benignPaths) 157 | for opDict in malwareOpCodeDicts: 158 | opFreqVec = [] 159 | for opcode in opCodeFreqsSorted[:50]: 160 | try: 161 | opFreqVec.append(opDict[opcode]) 162 | except Exception as e: 163 | if str(type(e)) == "<class 'KeyError'>": 164 | opFreqVec.append(0.0) 165 | 166 | np.save("malwareHistVecs/" + str(count)+".npy", opFreqVec) 167 |
os.system("clear") 168 | print(str((count / nSamples) * 100) + "%") 169 | count += 1 170 | 171 | 172 | count = len(benignPaths) + len(malwarePaths) 173 | for opDict in ransomOpCodeDicts: 174 | opFreqVec = [] 175 | for opcode in opCodeFreqsSorted[:50]: 176 | try: 177 | opFreqVec.append(opDict[opcode]) 178 | except Exception as e: 179 | if str(type(e)) == "": 180 | opFreqVec.append(0.0) 181 | 182 | np.save("ransomHistVecs/" + str(count)+".npy", opFreqVec) 183 | os.system("clear") 184 | print(str((count / nSamples) * 100) + "%") 185 | count += 1 186 | 187 | 188 | # benignVecPaths = ["benignHistVecs/" + vecPath for vecPath in os.listdir("benignHistVecs")] 189 | 190 | # for vecPath in benignVecPaths: 191 | # opFreqVec = np.load(vecPath) 192 | # print(opFreqVec) 193 | # plt.figure(count) 194 | # plt.bar(np.arange(len(opFreqVec)), opFreqVec) 195 | # plt.show() 196 | -------------------------------------------------------------------------------- /benignFreewareDownloader.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott" 2 | 3 | from urllib.request import urlopen as uOpen 4 | from bs4 import BeautifulSoup as soup 5 | import re 6 | import os 7 | 8 | 9 | myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact" 10 | 11 | # connecting to and downloading page 12 | uClient = uOpen(myUrl) 13 | page_html = uClient.read() 14 | uClient.close() 15 | 16 | # instatiating BeautifulSoup parsing of first page 17 | page_soup = soup(page_html, "html.parser") 18 | 19 | # gets page numbers from list above program listings 20 | numPagesA = page_soup.findAll("li",{"class":"page-item"}) 21 | numPagesArr = [] 22 | for numPageA in numPagesA: 23 | numPage = numPageA.findAll("a",{"class":"page-link"})[0] 24 | try: 25 | numPage = re.search('(?<=>)[0-9]+(?=<\/a>)',str(numPage)).group(0) 26 | numPagesArr.append(numPage) 27 | except: 28 | pass 29 | 30 | # the last of the list of page numbers is stored for reference as the last 31 | # page of the search 32 | maxPage = numPagesArr[-1] 33 | print("Total pages: " + str(maxPage) + "\n") 34 | 35 | # get next page link 36 | nextA = page_soup.findAll("a",{"aria-label":"Next"})[0] 37 | print(nextA["href"]) 38 | 39 | # get links to individual program download pages 40 | downloadPageHeaders = page_soup.findAll("h3",{"class":"search-head"}) 41 | downloadPageLinks = [] 42 | for pageHeader in downloadPageHeaders: 43 | pageHeader = pageHeader.findAll("a")[0] 44 | downloadPageLink = pageHeader["href"] 45 | print(downloadPageLink) 46 | downloadPageLinks.append(downloadPageLink) 47 | 48 | # main 49 | if __name__ == "__main__": 50 | myUrl = "https://www.freewarefiles.com/search.php?categoryid=1&query=&boolean=exact" 51 | count = 1 52 | 53 | # connecting to and downloading first page 54 | uClient = uOpen(myUrl) 55 | page_html = uClient.read() 56 | uClient.close() 57 | 58 | # instatiating BeautifulSoup parsing of first page 59 | page_soup = soup(page_html, "html.parser") 60 | 61 | # gets page numbers from list above program listings 62 | numPagesA = page_soup.findAll("li", {"class": "page-item"}) 63 | numPagesArr = [] 64 | for numPageA in numPagesA: 65 | numPage = numPageA.findAll("a", {"class": "page-link"})[0] 66 | try: 67 | numPage = re.search('(?<=>)[0-9]+(?=<\/a>)', str(numPage)).group(0) 68 | numPagesArr.append(numPage) 69 | except: 70 | pass 71 | 72 | # the last of the list of page numbers is stored for reference as the last 73 | # page of the search 74 | maxPage = int(numPagesArr[-1]) 75 | 
print("Total pages: " + str(maxPage) + "\n") 76 | 77 | # get next page link 78 | nextPage = page_soup.findAll("a", {"aria-label": "Next"})[0]["href"] 79 | 80 | # get links to individual program download pages 81 | downloadPageHeaders = page_soup.findAll("h3", {"class": "search-head"}) 82 | downloadPageLinks = [] 83 | for pageHeader in downloadPageHeaders: 84 | pageHeader = pageHeader.findAll("a")[0] 85 | downloadPageLink = pageHeader["href"] 86 | print(downloadPageLink) 87 | downloadPageLinks.append(downloadPageLink) 88 | 89 | # load the page's linked download pages and download exe files 90 | for dlPage in downloadPageLinks: 91 | myUrl = dlPage 92 | myUrl = myUrl.replace("_program_", "-Download-Page-") 93 | 94 | # connecting to and downloading page 95 | uClient = uOpen(myUrl) 96 | page_html = uClient.read() 97 | uClient.close() 98 | 99 | # instatiating BeautifulSoup parsing of page 100 | dlPageSoup = soup(page_html, "html.parser") 101 | 102 | downLinks = dlPageSoup.findAll("a", {"class": "dwnlocations"}) 103 | for link in downLinks: 104 | link = link["href"] 105 | try: 106 | file = uOpen(link) 107 | 108 | if int(file.info()['Content-Length']) <= 27000000: 109 | print(str(count) + ": " + link) 110 | os.system("sudo wget -O /media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str(count) + " " + link + " --read-timeout=1") 111 | count += 1 112 | except: 113 | pass 114 | 115 | 116 | for pageNum in range(2,maxPage+1): 117 | print("Page " + str(pageNum) + ": ") 118 | myUrl = nextPage 119 | 120 | # connecting to and downloading page 121 | # last 8 characters of url is removed as it 122 | # didn't seem to effect loading of page, and 123 | # could not be parsed by 'urlopen' due to utf-8 encoding 124 | myUrl = myUrl[:-8] 125 | print("\n" + myUrl + "\n") 126 | uClient = uOpen(myUrl) 127 | page_html = uClient.read() 128 | uClient.close() 129 | 130 | # instatiating BeautifulSoup parsing of first page 131 | page_soup = soup(page_html, "html.parser") 132 | 133 | # get links to individual program download pages 134 | downloadPageHeaders = page_soup.findAll("h3", {"class": "search-head"}) 135 | downloadPageLinks = [] 136 | for pageHeader in downloadPageHeaders: 137 | pageHeader = pageHeader.findAll("a")[0] 138 | downloadPageLink = pageHeader["href"] 139 | print(downloadPageLink) 140 | downloadPageLinks.append(downloadPageLink) 141 | 142 | # load the page's linked download pages and download exe files 143 | for dlPage in downloadPageLinks: 144 | myUrl = dlPage 145 | myUrl = myUrl.replace("_program_","-Download-Page-") 146 | 147 | # connecting to and downloading page 148 | uClient = uOpen(myUrl) 149 | page_html = uClient.read() 150 | uClient.close() 151 | 152 | # instatiating BeautifulSoup parsing of page 153 | dlPageSoup = soup(page_html, "html.parser") 154 | 155 | downLinks = dlPageSoup.findAll("a", {"class": "dwnlocations"}) 156 | for link in downLinks: 157 | link = link["href"] 158 | try: 159 | file = uOpen(link) 160 | 161 | if int(file.info()['Content-Length']) <= 27000000: 162 | print(str(count) + ": " + link) 163 | os.system( 164 | "sudo wget -O /media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str( 165 | count) + " " + link + " --read-timeout=1") 166 | count += 1 167 | except: 168 | pass 169 | 170 | # downLinks = dlPageSoup.findAll("a",{"class":"dwnlocations"}) 171 | # for link in downLinks: 172 | # link = link["href"] 173 | # try: 174 | # file = uOpen(link) 175 | # print(file.info()['Content-Length']) 176 | # if 
int(file.info()['Content-Length']) <= 27000000: 177 | # print(str(count) + ": " + link) 178 | # os.system("sudo wget -O ~/media/lozmund/de9f2ab8-20e0-4d32-82a0-9564591262d0/home/freewareBenignFiles/" + str(count) + " " + link) 179 | # count += 1 180 | # except: 181 | # pass 182 | 183 | 184 | # get next page link 185 | nextPage = page_soup.findAll("a", {"aria-label": "Next"})[0]["href"] 186 | 187 | -------------------------------------------------------------------------------- /ensemblePredict.py: -------------------------------------------------------------------------------- 1 | __author__ = "Laurence Elliott - 16600748" 2 | 3 | import os, math, string, pefile, time, threading 4 | import tkinter as tk 5 | import numpy as np 6 | from capstone import * 7 | from keras.models import Sequential, Model, Input 8 | from keras import layers, preprocessing 9 | from keras.utils import Sequence 10 | from sklearn.utils import shuffle 11 | from tkinter import messagebox 12 | from tkinter.filedialog import askopenfilenames 13 | from tkinter.ttk import Progressbar 14 | 15 | ## Defining models (opcode, strings, and ensemble) 16 | 17 | # Defining the opcode model 18 | opModel = Sequential() 19 | 20 | opModel.add(layers.InputLayer(input_shape=(50,))) 21 | opModel.add(layers.Dense(256, activation='relu')) 22 | opModel.add(layers.BatchNormalization()) 23 | opModel.add(layers.Dense(128, activation='relu')) 24 | opModel.add(layers.BatchNormalization()) 25 | opModel.add(layers.Dense(64, activation='relu')) 26 | opModel.add(layers.BatchNormalization()) 27 | opModel.add(layers.Dense(32, activation='relu')) 28 | opModel.add(layers.BatchNormalization()) 29 | opModel.add(layers.Dense(16, activation='relu')) 30 | opModel.add(layers.BatchNormalization()) 31 | opModel.add(layers.Dense(3, activation='softmax')) 32 | 33 | opModel.load_weights("weights-improvement-574-0.85.hdf5") 34 | 35 | opModel.compile(optimizer="rmsprop", 36 | loss='categorical_crossentropy', 37 | metrics=['accuracy']) 38 | 39 | 40 | class histSequence(Sequence): 41 | 42 | def __init__(self, x, y, batch_size): 43 | self.x, self.y = shuffle(x, y) 44 | self.batch_size = batch_size 45 | 46 | def __len__(self): 47 | return math.ceil(len(self.x) / self.batch_size) 48 | 49 | def __getitem__(self, idx): 50 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 51 | self.batch_size] 52 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 53 | self.batch_size] 54 | 55 | return np.array([ 56 | np.load(file_name) 57 | for file_name in batch_x]), np.array(batch_y) 58 | 59 | def on_epoch_end(self): 60 | pass 61 | 62 | 63 | class histSequenceVal(histSequence): 64 | 65 | def __init__(self, x, y, batch_size): 66 | self.x, self.y = x, y 67 | self.batch_size = batch_size 68 | 69 | 70 | # Defining the strings as greyscale images model 71 | model = Sequential() 72 | 73 | model.add(layers.InputLayer(input_shape=(100, 100, 1))) 74 | model.add(layers.SpatialDropout2D(rate=0.2)) 75 | model.add(layers.Conv2D(32, kernel_size=3, activation='relu')) 76 | model.add(layers.BatchNormalization()) 77 | model.add(layers.SpatialDropout2D(rate=0.1)) 78 | model.add(layers.Conv2D(16, kernel_size=3, activation='relu')) 79 | model.add(layers.BatchNormalization()) 80 | model.add(layers.SpatialDropout2D(rate=0.1)) 81 | model.add(layers.Flatten()) 82 | model.add(layers.Dense(3, activation='softmax')) 83 | 84 | 85 | class hashCorpusSequence(Sequence): 86 | 87 | def __init__(self, x, y, batch_size): 88 | self.x, self.y = shuffle(x, y) 89 | self.batch_size = batch_size 90 | 91 | def __len__(self): 
92 | return math.ceil(len(self.x) / self.batch_size) 93 | 94 | def __getitem__(self, idx): 95 | batch_x = self.x[idx * self.batch_size:(idx + 1) * 96 | self.batch_size] 97 | batch_y = self.y[idx * self.batch_size:(idx + 1) * 98 | self.batch_size] 99 | 100 | return np.array([ 101 | np.rint(((np.load(file_name) - np.min(np.load(file_name))) / 102 | (np.max(np.load(file_name)) - np.min(np.load(file_name)))) * 255).astype(int) 103 | for file_name in batch_x]), np.array(batch_y) 104 | 105 | def on_epoch_end(self): 106 | pass 107 | 108 | 109 | class hashCorpusSequenceVal(hashCorpusSequence): 110 | 111 | def __init__(self, x, y, batch_size): 112 | self.x, self.y = x, y 113 | self.batch_size = batch_size 114 | 115 | 116 | model.load_weights("weights-improvement-04-0.72.hdf5") 117 | 118 | model.compile(optimizer="adamax", 119 | loss='categorical_crossentropy', 120 | metrics=['accuracy']) 121 | 122 | 123 | opModel.name = "opcodeModel" 124 | model.name = "stringsAsGreyscaleModel" 125 | 126 | def ensemble(models, model_inputs): 127 | outputs = [models[0](model_inputs[0]), models[1](model_inputs[1])] 128 | y = layers.average(outputs) 129 | 130 | modelEns = Model(model_inputs, y, name='ensemble') 131 | 132 | return modelEns 133 | 134 | 135 | models = [opModel, model] 136 | model_inputs = [Input(shape=(50,)), Input(shape=(100, 100, 1))] 137 | modelEns = ensemble(models, model_inputs) 138 | modelEns.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) 139 | 140 | 141 | ## Pre-processing of PE (EXE, DLL, etc.) file(s) 142 | 143 | # https://stackoverflow.com/questions/17195924/python-equivalent-of-unix-strings-utility 144 | # Solution to Python based 'strings' alternative from SO. Decodes bytes of binary file as 145 | # utf-8 strings 146 | def strings(filename, min=4): 147 | with open(filename, errors="ignore", encoding="utf-8") as f: 148 | result = "" 149 | for c in f.read(): 150 | if c in string.printable: 151 | result += c 152 | continue 153 | if len(result) >= min: 154 | yield result 155 | result = "" 156 | if len(result) >= min: # catch result at EOF 157 | yield result 158 | 159 | 160 | # Converting utf-8 string to sequence of words 161 | def wordSequence(pePath): 162 | try: 163 | text = "" 164 | for s in strings(pePath): 165 | text += s + "\n" 166 | sequence = preprocessing.text.text_to_word_sequence(text)[:10000] 167 | return sequence 168 | except Exception as e: 169 | print(e) 170 | 171 | # Hashing words of word sequences into sequences of word-specific integers 172 | def hashWordSequences(sequences, maxSeqLen, vocabSize): 173 | 174 | hashedSeqs = [] 175 | docCount = 0 176 | for sequence in sequences: 177 | try: 178 | text = " ".join(sequence) 179 | hashWordIDs = preprocessing.text.hashing_trick(text, round(vocabSize * 1.5), hash_function='md5') 180 | docLen = len(hashWordIDs) 181 | if docLen < maxSeqLen: 182 | hashWordIDs += [0 for i in range(0, maxSeqLen-docLen)] 183 | hashWordIDs = np.array(hashWordIDs).reshape(100, 100, 1) 184 | hashedSeqs.append(hashWordIDs) 185 | docCount += 1 186 | except Exception as e: 187 | print(e) 188 | return hashedSeqs 189 | 190 | 191 | # Function takes list of paths to PE files and returns a list 192 | # of lists, with the first index as input for the opcode model, 193 | # and the second index as input for the strings model 194 | def preprocessPEs(pePaths): 195 | mlInputs = [] 196 | 197 | # Get percentage opcode composition of file assembly code for the top 50 most common opcodes 198 | # in each file 199 | opCodeSet = set()
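# Editor's note (hedged): the disassembly below fixes Capstone to 32-bit x86 (CS_ARCH_X86,
# CS_MODE_32), matching the training corpus. For 64-bit PE files the mode could instead be
# derived from the PE header, e.g.:
#
#     mode = CS_MODE_64 if pe.FILE_HEADER.Machine == 0x8664 else CS_MODE_32
#     cs = Cs(CS_ARCH_X86, mode)
#
# (0x8664 is IMAGE_FILE_MACHINE_AMD64 in the PE specification.)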
200 | opCodeDicts = [] 201 | opCodeFreqs = {} 202 | 203 | count = 1 204 | for sample in pePaths: 205 | try: 206 | pe = pefile.PE(sample, fast_load=True) 207 | entryPoint = pe.OPTIONAL_HEADER.AddressOfEntryPoint 208 | data = pe.get_memory_mapped_image()[entryPoint:] 209 | cs = Cs(CS_ARCH_X86, CS_MODE_32) 210 | 211 | opcodes = [] 212 | for i in cs.disasm(data, 0x1000): 213 | opcodes.append(i.mnemonic) 214 | 215 | opcodeDict = {} 216 | total = len(opcodes) 217 | 218 | opCodeSet = set(list(opCodeSet) + opcodes) 219 | for opcode in opCodeSet: 220 | freq = 1 221 | for op in opcodes: 222 | if opcode == op: 223 | freq += 1 224 | try: 225 | opCodeFreqs[opcode] += freq 226 | except: 227 | opCodeFreqs[opcode] = freq 228 | 229 | opcodeDict[opcode] = round((freq / total) * 100, 2) 230 | 231 | opCodeDicts.append(opcodeDict) 232 | count += 1 233 | 234 | except Exception as e: 235 | print(e) 236 | 237 | opCodeFreqsSorted = np.genfromtxt("top50opcodes.csv", delimiter=",", dtype="str")[1:, 0] 238 | 239 | count = 0 240 | for opDict in opCodeDicts: 241 | opFreqVec = [] 242 | for opcode in opCodeFreqsSorted[:50]: 243 | try: 244 | opFreqVec.append(opDict[opcode]) 245 | except Exception as e: 246 | if str(type(e)) == "<class 'KeyError'>": 247 | opFreqVec.append(0.0) 248 | 249 | mlInputs.append([np.array(opFreqVec)]) 250 | count += 1 251 | 252 | # Get words from utf-8 strings decoded from raw bytes of files, 253 | # and hash to vectors of integers 254 | sequences = [] 255 | count = 0 256 | for sample in pePaths: 257 | sequences.append(wordSequence(sample)) 258 | count += 1 259 | 260 | with open("finalVocabSize.txt", "r") as f: 261 | maxVocabSize = int(f.readline()) 262 | 263 | hashSeqs = hashWordSequences(sequences, 10000, maxVocabSize) 264 | 265 | count = 0 266 | for hashSeq in hashSeqs: 267 | mlInputs[count].append(np.array(hashSeq)) 268 | count += 1 269 | 270 | mlInputs = np.array(mlInputs) 271 | 272 | return mlInputs 273 | 274 | 275 | ## Function taking paths to PE files as input, and returning ensemble model predictions 276 | # as output 277 | def predictPEs(pePaths): 278 | classNames = ["benign", "malware", "ransomware"] 279 | pePredictions = {} 280 | mlInputs = preprocessPEs(pePaths) # pre-process every file once, rather than re-running preprocessPEs per file 281 | count = 0 282 | for pePath in pePaths: 283 | x1 = mlInputs[count][0].reshape(1, 50) 284 | x2 = mlInputs[count][1].reshape(1, 100, 100, 1) 285 | count += 1 286 | pePredictions[pePath] = classNames[np.argmax(modelEns.predict(x=[x1, x2]))] 287 | 288 | return pePredictions 289 | 290 | 291 | if __name__ == "__main__": 292 | tkRoot = tk.Tk() 293 | tkRoot.title("Processing files...") 294 | tkRoot.withdraw() 295 | tkRoot.protocol("WM_DELETE_WINDOW", quit) 296 | w = tkRoot.winfo_screenwidth() 297 | h = tkRoot.winfo_screenheight() 298 | size = tuple(int(pos) for pos in tkRoot.geometry().split('+')[0].split('x')) 299 | x = w / 2 - size[0] / 2 300 | y = h / 2 - size[1] / 2 301 | tkRoot.geometry("300x1+{}+{}".format(round(x) - 150, round(y))) 302 | 303 | while True: 304 | try: 305 | pePaths = list(askopenfilenames(filetypes=[("Windows executable files", "*.exe")])) 306 | tkRoot.update() 307 | tkRoot.deiconify() 308 | preds = predictPEs(pePaths) 309 | if len(preds) > 0: 310 | classificationsStr = "" 311 | for key in preds.keys(): 312 | # print("'" + key + "'" + " detected as " + preds[key]) 313 | classificationsStr += "'" + key + "'" + " detected as " + preds[key] + "\n\n" 314 | tkRoot.withdraw() 315 | messagebox.showinfo("Detections", classificationsStr) 316 | else: 317 | quit() 318 | except Exception as e: 319 | messagebox.showerror("Error", "Error: " + str(e) +
"\nPlease try again...") 320 | --------------------------------------------------------------------------------