├── .gitattributes ├── README.md ├── decrypt_l0rdix_c2.py ├── deobfuscate_ostap.py ├── graph_similar_document_images ├── README.md ├── graph_similar_document_images.py ├── image_hash_signatures.txt ├── images │ ├── graph_similar_document_images_screenshot_1.png │ └── graph_similar_document_images_screenshot_2.png └── requirements.txt └── graph_similar_strings ├── README.md ├── graph_similar_strings.py ├── images └── graph_similar_strings_screenshot_1.png └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Malware Analysis Scripts 2 | Handy scripts I use to speed up malware analysis. 3 | -------------------------------------------------------------------------------- /decrypt_l0rdix_c2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that identifies, decrypts and extracts L0rdix RAT command and control (C2) 4 | # traffic from a supplied PCAP file. 
5 | # 6 | # To speed up parsing, trim your PCAP to only HTTP ports using tcpdump, 7 | # for example: 8 | # 9 | # $ tcpdump -r l0rdix_c2.pcap -w l0rdix_c2_http.pcap 'tcp port 80 or 8080 or 3128' 10 | # 11 | # Requirements: 12 | # pyshark-legacy 13 | # pycryptodome 14 | # 15 | # Author.....: Alex Holland (@cryptogramfan) 16 | # Date.......: 2019-07-27 17 | # Version....: 0.1.6 18 | # License....: CC BY 4.0 19 | # Reference_1: https://www.bromium.com/an-analysis-of-l0rdix-rat-panel-and-builder/ 20 | # Reference_2: https://www.bromium.com/decrypting-l0rdix-rats-c2/ 21 | 22 | import sys 23 | import argparse 24 | import pyshark 25 | import urllib 26 | import re 27 | import hashlib 28 | import binascii 29 | import uuid 30 | from Crypto.Cipher import AES 31 | from base64 import b64decode 32 | 33 | parser = argparse.ArgumentParser(description="\nUsage: python decrypt_l0rdix_c2.py -p -k ") 34 | parser.add_argument("-p", dest="pcap_file", help="PCAP containing encrypted L0rdix C2 traffic.", required=True) 35 | parser.add_argument("-k", dest="operator_key", help="UTF-8 operator key extracted from a L0rdix bot or panel. If no key is supplied, the default key \"3sc3RLrpd17\" will be used.", default="3sc3RLrpd17") 36 | parsed_args = parser.parse_args() 37 | operator_key = parsed_args.operator_key 38 | aes_key = hashlib.sha256(operator_key).digest() 39 | iv = b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 40 | parameters = [] 41 | hostnames = [] 42 | imgs = [] 43 | 44 | try: 45 | pcap = pyshark.FileCapture(parsed_args.pcap_file, keep_packets=False, display_filter='http.request.method == POST && http.request.uri.path == "/connect.php" && count(http.request.uri.query.parameter) >= 10') 46 | print "[+] Parsing PCAP..." 47 | 48 | except: 49 | print(parser.description) 50 | exit(0) 51 | 52 | try: 53 | print "[+] Searching for L0rdix C2 traffic..." 
54 | for packet in pcap: 55 | # Enumerate hosts 56 | query = packet['HTTP'] 57 | hostnames.append(query.host) 58 | 59 | # Enumerate parameters 60 | query = query.request_uri_query 61 | query = urllib.unquote(query) 62 | query = re.sub("~", "+", query) 63 | query = re.sub("^h=", "", query) 64 | found_parameters = re.split("&[a-z]{1,2}=", query) 65 | parameters.extend(found_parameters) 66 | 67 | except: 68 | print "[!] Error, exiting." 69 | exit(0) 70 | 71 | if not hostnames: 72 | print "[+] No L0rdix C2 traffic found." 73 | exit(0) 74 | 75 | else: 76 | print "[+] Found references to L0rdix C2 servers (%d):\n" % (len(hostnames)) 77 | for hostname in hostnames: 78 | print hostname 79 | 80 | if not parameters: 81 | print "[+] No L0rdix URI parameters found." 82 | exit(0) 83 | 84 | else: 85 | print "\n[+] Found L0rdix C2 traffic (%d strings):\n" % (len(parameters)) 86 | for parameter in parameters: 87 | print parameter 88 | 89 | try: 90 | print "[+] Searching for screenshots..." 91 | for packet in pcap: 92 | # Enumerate screenshots 93 | img = packet['URLENCODED-FORM'] 94 | img = urllib.unquote(img.value) 95 | img = b64decode(img) 96 | img = bytearray(img) 97 | img_name = str(uuid.uuid4()) + '.jpg' 98 | imgs.append(img_name) 99 | 100 | # Dump screenshots 101 | f = open(img_name, 'w+b') 102 | f.write(img) 103 | f.close() 104 | 105 | except: 106 | print "[!] Error, exiting." 107 | exit(0) 108 | 109 | if not imgs: 110 | print "[+] No L0rdix screenshots found." 
111 | exit(0) 112 | 113 | else: 114 | print "[+] Dumped L0rdix screenshots in current directory (%d):\n" % (len(imgs)) 115 | for img_name in imgs: 116 | print img_name 117 | 118 | print "\n[+] Decrypting strings using operator key (UTF-8): " + operator_key 119 | print "[+] AES key (hex): " + binascii.hexlify(bytearray(aes_key)) 120 | print "[+] IV (hex): " + binascii.hexlify(bytearray(iv)) 121 | print "[+] Decrypted L0rdix C2 traffic (%d strings):\n" % (len(parameters)) 122 | 123 | for parameter in parameters: 124 | cipher = AES.new(aes_key, AES.MODE_CBC, iv) 125 | ciphertext = b64decode(parameter) 126 | decrypted = cipher.decrypt(ciphertext) 127 | decrypted = decrypted.rstrip() 128 | print decrypted 129 | 130 | print "[+] Finished, exiting." 131 | -------------------------------------------------------------------------------- /deobfuscate_ostap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that deobfuscates Ostap JSE (JScript Encoded) downloaders. The script is based 4 | # on Ostap samples analysed in August 2019, such as those delivering TrickBot. It will try 5 | # to identify the indexes containing Unicode character codes and then deobfuscate the sample 6 | # using subtraction and addition. 
7 | # 8 | # To use the script, supply a file as an argument or pipe it to stdin: 9 | # 10 | # $ python deobfuscate_ostap.py ostap.jse 11 | # $ cat ostap.jse | deobfuscate_ostap.py 12 | # 13 | # Author.....: Alex Holland (@cryptogramfan) 14 | # Date.......: 2019-08-29 15 | # Version....: 0.0.5 16 | # License....: CC BY 4.0 17 | # Reference_1: https://www.bromium.com/deobfuscating-ostap-trickbots-javascript-downloader/ 18 | 19 | import os 20 | import sys 21 | import re 22 | 23 | index_0 = "" 24 | index_1 = "" 25 | indexes_raw = [] 26 | indexes = [] 27 | values_0 = [] 28 | values_1 = [] 29 | 30 | # Subtract index 0 values from index 1 31 | def subtract_values_1(): 32 | characters_sub = [] 33 | servers = [] 34 | urls = [] 35 | 36 | try: 37 | print("[+] Trying deobfuscation by subtracting index %s elements from index %s elements..." % (indexes[0], indexes[1])) 38 | charcodes_sub = [i - j for i, j in zip(values_0, values_1)] 39 | 40 | except: 41 | print("[!] Error subtracting index %s elements from index %s elements." % (indexes[0], indexes[1])) 42 | subtract_values_2() # Try another subtraction instead 43 | 44 | try: 45 | for charcode_sub in charcodes_sub: 46 | character_sub = chr(charcode_sub) 47 | characters_sub.append(character_sub) 48 | 49 | characters_sub = ''.join(characters_sub) 50 | 51 | except: 52 | print("[!] 
Error converting character codes to characters.") 53 | subtract_values_2() 54 | 55 | match = re.search("Script", characters_sub, re.IGNORECASE) 56 | 57 | if match: 58 | print("[+] Deobfuscation using subtraction 1 was successful:\n") 59 | print(characters_sub) 60 | 61 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_sub, re.IGNORECASE) 62 | 63 | if match_url: 64 | servers.append(match_url.group()) 65 | 66 | for server in servers: 67 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 68 | server = re.sub("POST$", "", server, re.IGNORECASE) 69 | urls.append(server) 70 | 71 | if urls: 72 | print("\n[+] Found URL(s):\n") 73 | print(", ".join(urls)) 74 | 75 | exit(0) 76 | 77 | else: 78 | print("[!] Deobfuscation using subtraction 1 was unsuccessful.") 79 | subtract_values_2() 80 | 81 | return; 82 | 83 | # Subtract index 1 values from index 0 values 84 | def subtract_values_2(): 85 | characters_sub = [] 86 | servers = [] 87 | urls = [] 88 | 89 | try: 90 | print("[+] Trying deobfuscation by subtracting index %s elements from index %s elements..." % (indexes[1], indexes[0])) 91 | charcodes_sub = [i - j for i, j in zip(values_1, values_0)] 92 | 93 | except: 94 | print("[!] Error subtracting index %s elements from index %s elements." % (indexes[1], indexes[0])) 95 | add_values() # Try addition instead 96 | 97 | try: 98 | for charcode_sub in charcodes_sub: 99 | character_sub = chr(charcode_sub) 100 | characters_sub.append(character_sub) 101 | 102 | characters_sub = ''.join(characters_sub) 103 | 104 | except: 105 | print("[!] 
Error converting character codes to characters.") 106 | add_values() 107 | 108 | match = re.search("Script", characters_sub, re.IGNORECASE) 109 | 110 | if match: 111 | print("[+] Deobfuscation using subtraction 2 was successful:\n") 112 | print(characters_sub) 113 | 114 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_sub, re.IGNORECASE) 115 | 116 | if match_url: 117 | servers.append(match_url.group()) 118 | 119 | for server in servers: 120 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 121 | server = re.sub("POST$", "", server, re.IGNORECASE) 122 | urls.append(server) 123 | 124 | if urls: 125 | print("\n[+] Found URL(s):\n") 126 | print(", ".join(urls)) 127 | 128 | exit(0) 129 | 130 | else: 131 | print("[!] Deobfuscation using subtraction 2 was unsuccessful.") 132 | add_values() 133 | 134 | return; 135 | 136 | # Add index 0 values to index 1 values 137 | def add_values(): 138 | characters_add = [] 139 | servers = [] 140 | urls = [] 141 | 142 | try: 143 | print("[+] Trying deobfuscation by adding index %s elements to index %s elements..." % (indexes[1], indexes[0])) 144 | charcodes_add = [i + j for i, j in zip(values_1, values_0)] 145 | 146 | except: 147 | print("[!] Error adding index %s elements to index %s elements. Exiting." % (indexes[1], indexes[0])) 148 | exit(0) 149 | 150 | try: 151 | for charcode_add in charcodes_add: 152 | character_add = chr(charcode_add) 153 | characters_add.append(character_add) 154 | 155 | characters_add = ''.join(characters_add) 156 | 157 | except: 158 | print("[!] Error converting character codes to characters. 
Exiting.") 159 | exit(0) 160 | 161 | match = re.search("Script", characters_add, re.IGNORECASE) 162 | 163 | if match: 164 | print("[+] Deobfuscation using addition was successful:\n") 165 | print(characters_add) 166 | 167 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_add, re.IGNORECASE) 168 | 169 | if match_url: 170 | servers.append(match_url.group()) 171 | 172 | for server in servers: 173 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 174 | server = re.sub("POST$", "", server, re.IGNORECASE) 175 | urls.append(server) 176 | 177 | if urls: 178 | print("\n[+] Found URL(s):\n") 179 | print(", ".join(urls)) 180 | 181 | exit(0) 182 | 183 | else: 184 | print("[!] Deobfuscation using addition was unsuccessful. Exiting.") 185 | exit(0) 186 | 187 | return; 188 | 189 | if len(sys.argv) > 1: 190 | file = open(sys.argv[1], 'r') 191 | else: 192 | file = sys.stdin 193 | 194 | while 1: 195 | input = file.read() 196 | 197 | # Find array indexes 198 | try: 199 | print("\n[+] Analysing %s" % os.path.basename(file.name)) 200 | input = input.decode('utf-8') 201 | 202 | except UnicodeError: 203 | print("[!] File not UTF-8. Treating as UTF-16.") 204 | input = input.decode('utf-16') 205 | 206 | try: 207 | indexes_raw = re.findall("\[\d+\]=\d+;", input) 208 | 209 | except: 210 | print("[!] Error finding array indexes. Exiting.") 211 | exit(0) 212 | 213 | if not indexes_raw: 214 | print("[!] Array indexes not found. Exiting.") 215 | exit(0) 216 | 217 | # Put the index string into a list 218 | try: 219 | for index in indexes_raw: 220 | index = re.sub("\[", "", index) 221 | index = re.sub("\]=\d+;", "", index) 222 | indexes.append(index) 223 | 224 | # Remove duplicates 225 | indexes = list(set(indexes)) 226 | print("[+] Found array indexes %s and %s." % (indexes[0], indexes[1])) 227 | 228 | except: 229 | print("[!] Error processing array indexes. 
Exiting.") 230 | exit(0) 231 | 232 | try: 233 | element_regex_0 = r"\[" + indexes[0] + r"\]=\d+;" 234 | element_regex_1 = r"\[" + indexes[1] + r"\]=\d+;" 235 | 236 | except: 237 | print("[!] Error creating regular expressions. Exiting.") 238 | exit(0) 239 | 240 | # Find the values of index 0 elements 241 | try: 242 | print("[+] Searching for index %s elements..." % indexes[0]) 243 | array_0 = re.findall(element_regex_0, input) 244 | 245 | for element in array_0: 246 | element = re.sub("\[\d+\]=", "", element) 247 | element = re.sub(";", "", element) 248 | values_0.append(element) 249 | 250 | except: 251 | print("[!] Error finding index %s elements. Exiting." % indexes[0]) 252 | exit(0) 253 | 254 | if not values_0: 255 | print("[!] No index %s elements found. Exiting." % indexes[0]) 256 | exit(0) 257 | 258 | # Convert index 0 elements to integer values 259 | try: 260 | values_0 = map(int, values_0) 261 | 262 | except: 263 | print("[!] Error converting index %s elements to integers. Exiting." % indexes[0]) 264 | exit(0) 265 | 266 | # Find the values of index 1 elements 267 | try: 268 | print("[+] Searching for index %s elements..." % indexes[1]) 269 | array_1 = re.findall(element_regex_1, input) 270 | 271 | for element in array_1: 272 | element = re.sub("\[\d+\]=", "", element) 273 | element = re.sub(";", "", element) 274 | values_1.append(element) 275 | 276 | except: 277 | print("[!] Error finding index %s elements. Exiting." % indexes[1]) 278 | exit(0) 279 | 280 | if not values_1: 281 | print("[!] No index %s elements found. Exiting." % indexes[1]) 282 | exit(0) 283 | 284 | # Convert index 1 elements to integer values 285 | try: 286 | values_1 = map(int, values_1) 287 | 288 | except: 289 | print("[!] Error converting index %s elements to integers. Exiting." 
% indexes[1]) 290 | exit(0) 291 | 292 | subtract_values_1() 293 | subtract_values_2() 294 | add_values() 295 | 296 | exit(0) 297 | -------------------------------------------------------------------------------- /graph_similar_document_images/README.md: -------------------------------------------------------------------------------- 1 | # graph_similar_document_images.py 2 | A script that extracts embedded images from [Office Open XML (OOXML)](https://en.wikipedia.org/wiki/Office_Open_XML) documents and generates image hash similarity graphs that cluster visually similar images together. The script computes the [Average Hash](http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) of each extracted image, then graphs the images if they meet the similarity threshold. The script can be used as a technique for visually identifying malware campaigns involving documents. To use the script, supply a directory containing OOXML files. If LibreOffice is in your PATH you can optionally convert non-OOXML Word, Excel, PowerPoint and Rich Text File documents to OOXML. The script outputs DOT files that can be exported as images using Graphviz. If Graphviz is in your PATH you can also export to an SVG (preferred) or PNG image. 3 | 4 | ## Output 5 | Example image hash similarity graph (cropped). 
Here each node is a unique image that is connected by edges to other images that met the similarity threshold: 6 | 7 | 8 | 9 | Example CSV output of the script in detect mode, which lists images that match the similarity threshold with the signatures in the blacklist file, [image_hash_signatures.txt](https://github.com/cryptogramfan/Malware-Analysis-Scripts/blob/master/graph_similar_document_images/image_hash_signatures.txt): 10 | 11 | 12 | 13 | ## Example usage 14 | Convert documents to OOXML, extract images from the documents, identify images that are similar to the blacklist and then graph images that meet the similarity threshold: 15 | ``` 16 | $ graph_similar_document_images.py -f ~/Samples -d image_hash_signatures.txt -c -g -t 80 -o svg 17 | ``` 18 | ## Help 19 | ``` 20 | usage: graph_similar_document_images.py [-h] -f INPUT_DIR 21 | [-t MIN_SIMILARITY_THRESHOLD] 22 | [-d SIG_FILE] [-g] [-c] [-o {svg,png}] 23 | 24 | Usage: graph_similar_document_images.py -f -c -d 25 | -g -t -o 26 | 27 | optional arguments: 28 | -h, --help show this help message and exit 29 | -f INPUT_DIR, --files INPUT_DIR 30 | Directory to process 31 | -t MIN_SIMILARITY_THRESHOLD, --threshold MIN_SIMILARITY_THRESHOLD 32 | Minimum percentage similarity between images to graph 33 | (0 to 100) 34 | -d SIG_FILE, --detect SIG_FILE 35 | Detect mode identifies images that are similar to a 36 | blacklist of known-bad images 37 | -g, --graph Graph mode creates a graph of images that meet the 38 | similarity threshold 39 | -c, --convert Try converting documents to OOXML using LibreOffice 40 | -o {svg,png}, --output {svg,png} 41 | Output image format 42 | ``` 43 | ## Supported platforms 44 | Tested on Ubuntu 18.04 with Python 3. 
45 | 46 | ## Installation 47 | First install Graphviz and LibreOffice: 48 | ``` 49 | $ sudo add-apt-repository ppa:libreoffice/ppa 50 | $ sudo apt update 51 | $ sudo apt install graphviz libreoffice 52 | ``` 53 | Afterwards, install the required Python libraries: 54 | ``` 55 | $ python3 -m pip install -r requirements.txt 56 | ``` 57 | To view SVG files produced by the script you can use a viewer such as [Inkscape](https://inkscape.org/). Outputting to PNG isn't recommended because the resulting files can be large. 58 | 59 | ## License 60 | Released under the Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)) license. -------------------------------------------------------------------------------- /graph_similar_document_images/graph_similar_document_images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # A script that extracts embedded images from Open Office XML (OOXML) documents and generates network 4 | # graphs that cluster similar images together. The script computes average hashes of the extracted 5 | # images, then graphs the images if they meet the similarity threshold. The script can be used as a 6 | # technique for visually identifying malware campaigns involving documents. To use the script, supply 7 | # a directory containing OOXML files. If LibreOffice is in your PATH you can optionally convert 8 | # non-OOXML Word, Excel, PowerPoint and Rich Text File documents to OOXML. The script outputs DOT files 9 | # that can be exported as images using Graphviz. If Graphviz is in your PATH you can also export to an 10 | # SVG (preferred) or PNG image. 
11 | # 12 | # $ graph_similar_document_images.py -f -d -g -t -o 13 | # 14 | # Author.....: Alex Holland (@cryptogramfan) 15 | # Date.......: 2020-02-02 16 | # Version....: 0.0.3 17 | # License....: CC BY 4.0 18 | 19 | import os 20 | import csv 21 | import hashlib 22 | import argparse 23 | import imagehash 24 | import subprocess 25 | import magic 26 | import distance 27 | from PIL import Image 28 | from zipfile import ZipFile 29 | from shutil import copyfileobj, copy, rmtree 30 | from time import strftime 31 | from networkx.algorithms import bipartite 32 | from networkx.drawing.nx_agraph import write_dot 33 | from networkx import graph as nx 34 | 35 | parser = argparse.ArgumentParser(description='\nUsage: graph_similar_document_images.py -f -d -g -c -t -o ') 36 | parser.add_argument('-f', '--files', dest='input_dir', help='Directory to process', required=True) 37 | parser.add_argument('-t', '--threshold', dest='min_similarity_threshold', type=float, help='Minimum percentage similarity between images to graph (0 to 100)', default=87.5) 38 | parser.add_argument('-d', '--detect', dest='sig_file', help='Detect mode identifies images that are similar to a blacklist of known-bad images and saves the results to a CSV (requires image_hash_signatures.txt)') 39 | parser.add_argument('-g', '--graph', help='Graph mode creates a graph of images that meet the similarity threshold in DOT format', action='store_true') 40 | parser.add_argument('-c', '--convert', help='Try converting documents to OOXML (requires LibreOffice)', action='store_true') 41 | parser.add_argument('-o', '--output', choices=['svg', 'png'], help='Output image format (requires Graphviz)', default='svg') 42 | parsed_args = parser.parse_args() 43 | network = nx.Graph() 44 | timestr = strftime('%Y%m%d-%H%M%S') 45 | graph_file = 'graph_similar_document_images_' + timestr 46 | input_dir = parsed_args.input_dir 47 | sig_file = parsed_args.sig_file 48 | csv_file = os.path.join(os.getcwd(), 'image_hash_matches_' + 
timestr + '.csv') 49 | dir_image = os.path.join(os.getcwd(), 'extracted_document_images_' + timestr) 50 | dir_convert_docx = os.path.join(os.getcwd(), 'convert_docx') 51 | dir_convert_pptx = os.path.join(os.getcwd(), 'convert_pptx') 52 | dir_convert_xlsx = os.path.join(os.getcwd(), 'convert_xlsx') 53 | dir_converted_docx = os.path.join(os.getcwd(), 'converted_docx') 54 | dir_converted_pptx = os.path.join(os.getcwd(), 'converted_pptx') 55 | dir_converted_xlsx = os.path.join(os.getcwd(), 'converted_xlsx') 56 | min_similarity_threshold = parsed_args.min_similarity_threshold 57 | 58 | def load_signatures(): 59 | signatures = {} 60 | with open(sig_file) as f: 61 | for line in f: 62 | line = line.rstrip() 63 | if line: 64 | if not line.startswith('#'): 65 | (sig_hash, sig_name) = line.split(',') 66 | signatures[sig_hash] = sig_name 67 | 68 | return signatures 69 | 70 | def create_dirs(): 71 | try: 72 | os.makedirs(dir_convert_docx) 73 | os.makedirs(dir_convert_xlsx) 74 | os.makedirs(dir_convert_pptx) 75 | os.makedirs(dir_converted_docx) 76 | os.makedirs(dir_converted_xlsx) 77 | os.makedirs(dir_converted_pptx) 78 | 79 | except OSError: 80 | if not os.path.isdir(dir_convert_docx): 81 | raise 82 | 83 | if not os.path.isdir(dir_convert_xlsx): 84 | raise 85 | 86 | if not os.path.isdir(dir_convert_pptx): 87 | raise 88 | 89 | if not os.path.isdir(dir_converted_docx): 90 | raise 91 | 92 | if not os.path.isdir(dir_converted_xlsx): 93 | raise 94 | 95 | if not os.path.isdir(dir_converted_pptx): 96 | raise 97 | 98 | def identify_files(): 99 | for infile in os.listdir(input_dir): 100 | if os.path.isfile(os.path.join(input_dir, infile)): 101 | infile_path = os.path.join(input_dir, infile) 102 | 103 | if ('Microsoft OOXML' in magic.from_file(infile_path)) or ('Microsoft Word 2007+' in magic.from_file(infile_path)): 104 | extract_ooxml(infile_path) 105 | 106 | if parsed_args.convert: 107 | if ('Microsoft Office Word' in magic.from_file(infile_path)) or ('CDFV2 Encrypted' in 
magic.from_file(infile_path)) or ('Rich Text Format' in magic.from_file(infile_path)): 108 | copy(infile_path, dir_convert_docx) 109 | 110 | if ('Microsoft Excel' in magic.from_file(infile_path)): 111 | copy(infile_path, dir_convert_xlsx) 112 | 113 | if ('Microsoft Office PowerPoint' in magic.from_file(infile_path)): 114 | copy(infile_path, dir_convert_pptx) 115 | return True 116 | 117 | def convert_docx(): 118 | print('\n[+] Converting to Word OOXML...') 119 | path = dir_convert_docx + '/*' 120 | 121 | try: 122 | os.system('soffice --headless --convert-to docx --outdir ' + dir_converted_docx + ' ' + path) 123 | 124 | except: 125 | print('[!] Error converting document. Check that LibreOffice is added to your PATH.') 126 | 127 | print('\n[+] Extracting from converted Word OOXML documents...') 128 | for f in os.listdir(dir_converted_docx): 129 | if os.path.isfile(os.path.join(dir_converted_docx, f)): 130 | f = os.path.join(dir_converted_docx, f) 131 | extract_ooxml(f) 132 | 133 | rmtree(dir_convert_docx) 134 | rmtree(dir_converted_docx) 135 | return True 136 | 137 | def convert_xlsx(): 138 | print('\n[+] Converting to Excel OOXML...') 139 | path = dir_convert_xlsx + '/*' 140 | 141 | try: 142 | os.system('soffice --headless --convert-to xlsx --outdir ' + dir_converted_xlsx + ' ' + path) 143 | 144 | except: 145 | print('[!] Error converting document. 
Check that LibreOffice is added to your PATH.') 146 | 147 | print('\n[+] Extracting from converted Excel OOXML documents...') 148 | for f in os.listdir(dir_converted_xlsx): 149 | if os.path.isfile(os.path.join(dir_converted_xlsx, f)): 150 | f = os.path.join(dir_converted_xlsx, f) 151 | extract_ooxml(f) 152 | 153 | rmtree(dir_convert_xlsx) 154 | rmtree(dir_converted_xlsx) 155 | return True 156 | 157 | def convert_pptx(): 158 | print('\n[+] Converting to PowerPoint OOXML...') 159 | path = dir_convert_pptx + '/*' 160 | 161 | try: 162 | os.system('soffice --headless --convert-to pptx --outdir ' + dir_converted_pptx + ' ' + path) 163 | 164 | except: 165 | print('[!] Error converting document. Check that LibreOffice is added to your PATH.') 166 | 167 | print('\n[+] Extracting from converted PowerPoint OOXML documents...') 168 | for f in os.listdir(dir_converted_pptx): 169 | if os.path.isfile(os.path.join(dir_converted_pptx, f)): 170 | f = os.path.join(dir_converted_pptx, f) 171 | extract_ooxml(f) 172 | 173 | rmtree(dir_convert_pptx) 174 | rmtree(dir_converted_pptx) 175 | return True 176 | 177 | def extract_ooxml(f): 178 | with open(f, 'rb') as infile: 179 | bytes = infile.read() 180 | hash_document = hashlib.sha256(bytes).hexdigest() 181 | 182 | try: 183 | with ZipFile(f) as z: 184 | for i in z.infolist(): 185 | name = i.filename 186 | 187 | if name.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')): 188 | try: 189 | image_path = os.path.join(dir_image, hash_document) 190 | with z.open(name) as in_image, open(image_path, 'wb') as out_image: 191 | copyfileobj(in_image, out_image) 192 | print('[+] Extracted image from ' + f + '.') 193 | 194 | except Exception as e: 195 | print('[!] Error extracting image from ' + hash_document, e) 196 | 197 | except Exception as e: 198 | print('[!] 
Error unzipping ' + f, e) 199 | 200 | def calculate_image_distance(): 201 | print('\n[+] Computing image hash distances...') 202 | images = [] 203 | for i in os.listdir(dir_image): 204 | images.append(i) 205 | 206 | images_a = images 207 | images_b = images 208 | 209 | for a in images_a: 210 | image_a_path = os.path.join(dir_image, a) 211 | hash_a = imagehash.average_hash(Image.open(image_a_path)) 212 | 213 | for b in images_b: 214 | image_b_path = os.path.join(dir_image, b) 215 | hash_b = imagehash.average_hash(Image.open(image_b_path)) 216 | 217 | image_distance = ((hash_a-hash_b)/len(hash_a.hash)**2)*100 # Each image hash is 64 bits long 218 | image_similarity = 100-image_distance 219 | 220 | if image_similarity >= min_similarity_threshold: 221 | print('[+] ' + a + ' is ' + str('%.0f' % image_similarity) + '% similar to ' + b + '.') 222 | network.add_node(a, 223 | label='Image_Hash: ' + str(hash_a) + '\n' + 'SHA256_Doc: ' + a, 224 | image=image_a_path, 225 | type='image', 226 | style='filled', 227 | fillcolor='white', 228 | color='white', 229 | fontcolor='black', 230 | fontname='Arial', 231 | fontsize='20', 232 | bipartite=0) 233 | 234 | network.add_node(b, 235 | label='Image_Hash: ' + str(hash_b) + '\n' + 'SHA256_Doc: ' + b, 236 | image=image_b_path, 237 | type='image', 238 | style='filled', 239 | fillcolor='white', 240 | color='white', 241 | fontcolor='black', 242 | fontname='Arial', 243 | fontsize='20', 244 | bipartite=1) 245 | 246 | network.add_edge(b, 247 | a, 248 | penwidth=3, 249 | color='#0096D6', 250 | dir='none') 251 | 252 | write_dot(network, graph_file + '.dot') 253 | print('[+] Created ' + graph_file + '.dot.') 254 | 255 | return True 256 | 257 | def export_graph(): 258 | try: 259 | if parsed_args.output == 'png': 260 | subprocess.Popen(['sfdp', 261 | graph_file + '.dot', 262 | '-Tpng', 263 | '-o', 264 | graph_file + '.png', 265 | '-Gfontname="Arial"' 266 | ]) 267 | 268 | print('[+] Created ' + graph_file + '.png.') 269 | 270 | if parsed_args.output 
== 'svg': 271 | subprocess.Popen(['sfdp', 272 | graph_file + '.dot', 273 | '-Tsvg', 274 | '-o', 275 | graph_file + '.svg', 276 | '-Gfontname="Arial"' 277 | ]) 278 | 279 | print('[+] Created ' + graph_file + '.svg.') 280 | 281 | except: 282 | print('[!] Error exporting graph image. Check that Graphviz is added to your PATH.') 283 | 284 | return True 285 | 286 | def detect_images(signatures): 287 | print('\n[+] Detecting images that meet similarity threshold of signatures (' + str(parsed_args.min_similarity_threshold) + '%)...') 288 | images = [] 289 | for i in os.listdir(dir_image): 290 | images.append(i) 291 | 292 | with open(csv_file, mode='w') as csv_out: 293 | csv_writer = csv.writer(csv_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 294 | csv_writer.writerow(['Document_SHA256', 'Average_Hash', 'Signature_Name', 'Similarity']) 295 | 296 | for i in images: 297 | image_path = os.path.join(dir_image, i) 298 | image_hash = str(imagehash.average_hash(Image.open(image_path))) 299 | 300 | for sig_hash, sig_name in signatures.items(): 301 | hamming_distance = distance.hamming(image_hash, sig_hash) 302 | image_similarity = 100-((hamming_distance/16)*100) 303 | 304 | if image_similarity >= min_similarity_threshold: 305 | csv_writer.writerow([i, image_hash, sig_name, image_similarity]) 306 | print('[+] Document ' + i + ' matched ' + sig_name + ' (' + str('%.0f' % image_similarity) + '% similarity).') 307 | 308 | print('[+] Saved results to ' + csv_file + '.') 309 | return True 310 | 311 | def main(): 312 | try: 313 | os.makedirs(dir_image) 314 | 315 | except OSError: 316 | if not os.path.isdir(dir_image): 317 | raise 318 | 319 | if parsed_args.convert == False: 320 | identify_files() 321 | 322 | if parsed_args.convert: 323 | create_dirs() 324 | identify_files() 325 | convert_docx() 326 | convert_xlsx() 327 | convert_pptx() 328 | 329 | if parsed_args.graph: 330 | if calculate_image_distance(): 331 | export_graph() 332 | 333 | if parsed_args.sig_file: 334 | 
signatures = load_signatures() 335 | detect_images(signatures) 336 | 337 | if __name__== "__main__": 338 | main() 339 | -------------------------------------------------------------------------------- /graph_similar_document_images/image_hash_signatures.txt: -------------------------------------------------------------------------------- 1 | # A collection of Average Hash values of images embedded in malicious 2 | # documents computed using the average hash algorithm. These can 3 | # be used as an analytic technique to detect malicious documents 4 | # by comparing the distance (similarity) between a given document 5 | # and these signatures. The signatures can also be used to 6 | # identify and track specific campaigns based on image re-use. 7 | # Graph_similar_document_images.py uses these hashes as signatures 8 | # in detect mode. 9 | # 10 | # Signatures are named using the following convention: 11 | # Type.Purpose.Product.Descriptor_1.Descriptor_2.Descriptor_3 12 | # 13 | # Name.......: image_hash_signatures.txt 14 | # Author.....: Alex Holland (@cryptogramfan) 15 | # Date.......: 2020-02-03 16 | # 17 | # Average_Hash,Signature_Name 18 | 0000c0d3dedc4000,Image.Social-Engineering.MS-Word.Previous-Version.Blue.Circle 19 | 00c0f8e0e6e4c000,Image.Social-Engineering.MS-Word.Earlier-Version.Navy 20 | 0038386eff7e7c00,Image.Social-Engineering.Office-365.Online-Version.Blue 21 | cfc7c7cfffff00e5,Image.Social-Engineering.MS-Office.Old-Version.White.Attemting 22 | 1f7f7ffe0202ff00,Image.Social-Engineering.MS-Office.Activation-Wizard 23 | ffff0113819fffff,Image.Social-Engineering.MS-Office.Protected.White 24 | 3f838383ffffffff,Image.Social-Engineering.Generic.Protected.Invoice.First-Fresh-NZ-Limited 25 | 0000187f76000000,Image.Social-Engineering.Generic.Protected.Full-Page.Blue 26 | efe7ffc38100ffff,Image.Social-Engineering.MS-Office.OpenOffice.Gray 27 | ffffc3ffff8101ff,Image.Social-Engineering.MS-Word.License 28 | 
be9c7c20fc007c00,Image.Social-Engineering.MS-Word.Older-Version.Blue 29 | cfc3c703bfbfffff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.White.1 30 | ffff0113819fffff,Image.Social-Engineering.MS-Word.Protected.Defender.Blue 31 | 1f7fff01ffff1e00,Image.Social-Engineering.MS-Word.Not-Activated.White 32 | 00ffffc1c18181ff,Image.Social-Engineering.MS-Word.Protected.UPS 33 | ff818183bfbfff3f,Image.Social-Engineering.Generic.Protected.Invoice.Western 34 | 2c054720fffff8ff,Image.Social-Engineering.McAfee-Secure.Protected.Chinese.Sleek-Bill 35 | ffff81818181ffff,Image.Social-Engineering.Generic.Protected.Full-Page.Invoice 36 | fefec20600000000,Image.Social-Engineering.MS-Word.Earlier-Version.Full-Page 37 | fff117911331ffff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.White.2 38 | c7c7818f83cbffff,Image.Social-Engineering.Office-365.Desktop-Laptop.White 39 | ff0000fffe2fffff,Image.Social-Engineering.MS-Word.Newer-Version.Infographic 40 | ffffff3f38ffffff,Image.Social-Engineering.Amazon.Assistance-Center 41 | 00fef87e40000000,Image.Social-Engineering.MS-Word.Older-Version.Navy 42 | 1018ff07fd9d9dff,Image.Social-Engineering.MS-Office.Oops.Protected 43 | ffff0301819f83ff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.3 44 | 00ff3f1f1fffff00,Image.Social-Engineering.MS-Office.Banner.Select-Document.Turkish 45 | e0fede1e18000000,Image.Social-Engineering.MS-Word.Protected.Padlock.Navy 46 | 00c0c4fcc0d6c000,Image.Social-Engineering.MS-Word.Earlier-Version.Italian.Navy 47 | ff1f0c0001011fff,Image.Social-Engineering.Generic.Logo.RSA.SecurID 48 | 9f00a38f8fa383ff,Image.Social-Engineering.Generic.Problem.Full-Page.Barclays 49 | 0040c06060606000,Image.Social-Engineering.Generic.Logo.BZSt.German -------------------------------------------------------------------------------- /graph_similar_document_images/images/graph_similar_document_images_screenshot_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_document_images/images/graph_similar_document_images_screenshot_1.png -------------------------------------------------------------------------------- /graph_similar_document_images/images/graph_similar_document_images_screenshot_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_document_images/images/graph_similar_document_images_screenshot_2.png -------------------------------------------------------------------------------- /graph_similar_document_images/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx>=2.2 2 | pygraphviz>=1.3.1 3 | python-magic>=0.4.15 4 | Pillow==7.0.0 5 | ImageHash==4.0 6 | Distance>=0.1.3 7 | -------------------------------------------------------------------------------- /graph_similar_strings/README.md: -------------------------------------------------------------------------------- 1 | # graph_similar_strings.py 2 | A script that reads a list of strings and generates link charts that clusters similar strings together. Three string similarity metrics are supported: 3 | * [Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index) 4 | * [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) 5 | * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) 6 | 7 | The script can be used as a text analysis technique for identifying phishing campaigns, for example by clustering similar filenames and email subject lines. To use the script, supply a text file containing a list of strings. By default, the script outputs DOT files that can be exported as images using [Graphviz](https://www.graphviz.org/). 
If Graphviz is in your PATH the script can export to SVG (recommended) or PNG files for you. You can modify the thresholds that determine if two strings are similar enough to connect according to your needs. 8 | 9 | ## Installation 10 | First install Graphviz. On Windows you can download the installer [here](https://graphviz.gitlab.io/_pages/Download/Download_windows.html). On Linux you can install Graphviz by running: 11 | ``` 12 | $ sudo apt install graphviz 13 | ``` 14 | Afterwards, install the required Python libraries: 15 | ``` 16 | $ pip install -r requirements.txt 17 | ``` 18 | To view SVG files produced by the script you can use a viewer such as [Inkscape](https://inkscape.org/). Outputting to PNG isn't recommended because the resulting files can be large. 19 | 20 | ## Example usage 21 | Apply all three metrics against strings in input.txt that match a filename pattern and save the resulting link charts as SVG files: 22 | ``` 23 | $ graph_similar_strings.py -f input.txt -m A -r filenames -o svg 24 | ``` 25 | ## Help 26 | ``` 27 | usage: graph_similar_strings.py [-h] -f INPUT_FILE -m 28 | {jaccard,J,hamming,H,levenshtein,L,all,A} 29 | [-r {filenames,f}] [-o {png,svg}] 30 | 31 | Usage: graph_similar_strings.py -f -a [JHLA] 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -f INPUT_FILE, --file INPUT_FILE 36 | Text file containing strings 37 | -m {jaccard,J,hamming,H,levenshtein,L,all,A}, --metric {jaccard,J,hamming,H,levenshtein,L,all,A} 38 | Metric to calculate similarity 39 | -r {filenames,f}, --regex {filenames,f} 40 | Filter strings first using regular expression 41 | -o {png,svg}, --output {png,svg} 42 | Output image format (requires Graphviz to be in your 43 | PATH) 44 | ``` 45 | 46 | ## Output 47 | Example filename similarity link chart using Levenshtein distance (cropped). Here each node represents a unique filename that is connected by edges to other filenames that met the similarity threshold. 
48 | 49 | 50 | 51 | ## License 52 | Released under the Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)) license. -------------------------------------------------------------------------------- /graph_similar_strings/graph_similar_strings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that reads a list of strings and generates link charts that clusters similar strings 4 | # together. Three distance metrics are supported to calculate string similarity: Jaccard, Hamming 5 | # and Levenshtein. The script can be used as a technique for visualising phishing campaigns, for 6 | # example by identifying similiar filenames and email subject lines. To use the script, supply a 7 | # text file containing a list of strings. By default, the script outputs DOT files that can be 8 | # exported as images using Graphviz. If Graphviz is in your PATH you can also export to PNG or SVG. 9 | # 10 | # $ graph_similar_strings.py -f input.txt -m A -r filenames -o svg 11 | # 12 | # Author.....: Alex Holland (@cryptogramfan) 13 | # Date.......: 2019-12-06 14 | # Version....: 0.0.3 15 | # License....: CC BY 4.0 16 | 17 | import sys 18 | import argparse 19 | import time 20 | import re 21 | import subprocess 22 | import distance 23 | import networkx as nx 24 | import pandas as pd 25 | import numpy as np 26 | from networkx.algorithms import bipartite 27 | from networkx.drawing.nx_agraph import write_dot 28 | 29 | jaccard_threshold = 0.2 # Similarity threshold where 0 means identical and 1 means totally different (default = 0.2). 30 | hamming_threshold = 5 # Similarity threshold where 1 means 1 character difference (default = 5). 31 | levenshtein_threshold = 7 # Similarity threshold where 1 means 1 character operation difference (default = 7). 
32 | levenshtein_length_threshold = 5 # The Levenshtein distance can be calculated between strings of different lengths. Only strings whose length difference is less than the threshold will be computed (default = 5). Reduce the threshold if the script is taking a long time to finish. 33 | 34 | parser = argparse.ArgumentParser(description='\nUsage: graph_similar_strings.py -f -m [JHLA]') 35 | parser.add_argument('-f', '--file', dest='input_file', help='Text file containing strings', required=True) 36 | parser.add_argument('-m', '--metric', choices=['jaccard', 'J', 'hamming', 'H', 'levenshtein', 'L', 'all', 'A'], help='Metric to calculate similarity', required=True) 37 | parser.add_argument('-r', '--regex', choices=['filenames', 'f'], help='Filter strings first using regular expression') 38 | parser.add_argument('-o', '--output', choices=['png', 'svg'], help='Output image format (requires Graphviz to be in your PATH)') 39 | parsed_args = parser.parse_args() 40 | network = nx.Graph() 41 | timestr = time.strftime('%Y%m%d-%H%M%S') 42 | jaccard_file = 'string_similarity_jaccard_' + timestr 43 | hamming_file = 'string_similarity_hamming_' + timestr 44 | levenshtein_file = 'string_similarity_levenshtein_' + timestr 45 | regex_filenames = r'[^\\\/:*?"<>|\r\n]+$' # Regular expression to filter filenames for comparison 46 | 47 | def parse_text(): 48 | with open(parsed_args.input_file, 'r') as text_file: 49 | 50 | strings = [line.strip() for line in text_file] 51 | strings = filter(None, strings) 52 | print('[+] Processing ' + str(len(strings)) + ' strings...') 53 | 54 | if (parsed_args.regex == 'filenames') or (parsed_args.regex == 'f'): 55 | matched_strings = [] 56 | 57 | for string in strings: 58 | matched_string = re.search(regex_filenames, string) 59 | 60 | if matched_string is None: 61 | pass 62 | 63 | else: 64 | matched_string = matched_string.group() 65 | matched_strings.append(matched_string) 66 | 67 | print('[+] Matched ' + str(len(matched_strings)) + ' filenames.') 
68 | strings_a = matched_strings 69 | strings_b = matched_strings 70 | 71 | else: 72 | strings = filter(None, strings) 73 | strings_a = strings 74 | strings_b = strings 75 | 76 | return strings_a,strings_b 77 | 78 | def calculate_jaccard(strings_a,strings_b): 79 | print('[+] Calculating Jaccard distance...') 80 | 81 | for string_a in strings_a: 82 | string_a = str(string_a) 83 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 84 | 85 | for string_b in strings_b: 86 | string_b = str(string_b) 87 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 88 | 89 | jaccard_distance = distance.jaccard(string_a, string_b) 90 | 91 | if 0 < jaccard_distance < jaccard_threshold: 92 | network.add_node(string_a, 93 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 94 | color='orange', 95 | style='filled', 96 | fillcolor='white', 97 | font_color='black', 98 | fontname='Arial', 99 | penwidth=2, 100 | bipartite=0) 101 | 102 | network.add_node(string_b, 103 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 104 | style='filled', 105 | fillcolor='white', 106 | color='orange', 107 | font_color='black', 108 | fontname='Arial', 109 | penwidth=2, 110 | bipartite=1) 111 | 112 | network.add_edge(string_b, 113 | string_a, 114 | penwidth=2, 115 | color='#0096D6', 116 | dir='none') 117 | 118 | try: 119 | write_dot(network, jaccard_file + '.dot') 120 | print('[+] Created ' + jaccard_file + '.dot.') 121 | 122 | except: 123 | print('[!] Error creating DOT file.') 124 | 125 | if parsed_args.output == 'png': 126 | try: 127 | subprocess.Popen(['sfdp', 128 | jaccard_file + '.dot', 129 | '-Tpng', 130 | '-o', 131 | jaccard_file + '.png', 132 | '-Goverlap=False', 133 | '-Goutputorder="edgesfirst"']) 134 | 135 | print('[+] Exported link chart as ' + jaccard_file + '.png.') 136 | 137 | except: 138 | print('[!] Error saving link chart. 
Check that Graphviz is added to PATH.') 139 | exit(0) 140 | 141 | if parsed_args.output == 'svg': 142 | try: 143 | 144 | subprocess.Popen(['sfdp', 145 | jaccard_file + '.dot', 146 | '-Goverlap=False', 147 | '-Tsvg', 148 | '-o', 149 | jaccard_file + '.svg', 150 | '-Goverlap=False', 151 | '-Goutputorder="edgesfirst"']) 152 | 153 | print('[+] Exported link chart as ' + jaccard_file + '.svg.') 154 | 155 | except: 156 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 157 | 158 | return 159 | 160 | def calculate_hamming(strings_a,strings_b): 161 | print('[+] Calculating Hamming distance...') 162 | 163 | for string_a in strings_a: 164 | string_a = str(string_a) 165 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 166 | 167 | for string_b in strings_b: 168 | string_b = str(string_b) 169 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 170 | 171 | if len(string_a) == len(string_b): # Hamming distance can only be calculated for equal length strings 172 | hamming_distance = distance.hamming(string_a, string_b) 173 | 174 | if 0 < hamming_distance < hamming_threshold: 175 | network.add_node(string_a, 176 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 177 | color='orange', 178 | style='filled', 179 | fillcolor='white', 180 | font_color='black', 181 | fontname='Arial', 182 | penwidth=2, 183 | bipartite=0) 184 | 185 | network.add_node(string_b, 186 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 187 | style='filled', 188 | fillcolor='white', 189 | color='orange', 190 | font_color='black', 191 | fontname='Arial', 192 | penwidth=2, 193 | bipartite=1) 194 | 195 | network.add_edge(string_b, 196 | string_a, 197 | penwidth=2, 198 | color='#0096D6', 199 | dir='none') 200 | 201 | try: 202 | write_dot(network, hamming_file + '.dot') 203 | print('[+] Created ' + hamming_file + '.dot.') 204 | 205 | except: 206 | print('[!] 
Error creating DOT file.') 207 | 208 | if parsed_args.output == 'png': 209 | try: 210 | subprocess.Popen(['sfdp', 211 | hamming_file + '.dot', 212 | '-Tpng', 213 | '-o', 214 | hamming_file + '.png', 215 | '-Goverlap=False', 216 | '-Goutputorder="edgesfirst"']) 217 | 218 | print('[+] Exported link chart as ' + hamming_file + '.png.') 219 | 220 | except: 221 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 222 | 223 | if parsed_args.output == 'svg': 224 | try: 225 | subprocess.Popen(['sfdp', 226 | hamming_file + '.dot', 227 | '-Tsvg', 228 | '-o', 229 | hamming_file + '.svg', 230 | '-Goverlap=False', 231 | '-Goutputorder="edgesfirst"']) 232 | 233 | print('[+] Exported link chart as ' + hamming_file + '.svg.') 234 | 235 | except: 236 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 237 | 238 | return 239 | 240 | def calculate_levenshtein(strings_a,strings_b): 241 | print('[+] Calculating Levenshtein distance...') 242 | 243 | for string_a in strings_a: 244 | string_a = str(string_a) 245 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 246 | 247 | for string_b in strings_b: 248 | string_b = str(string_b) 249 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 250 | 251 | if -levenshtein_length_threshold < (len(string_a) - len(string_b)) < levenshtein_length_threshold: 252 | levenshtein_distance = distance.levenshtein(string_a, string_b) 253 | 254 | if 0 < levenshtein_distance < levenshtein_threshold: 255 | network.add_node(string_a, 256 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 257 | color='orange', 258 | style='filled', 259 | fillcolor='white', 260 | font_color='black', 261 | fontname='Arial', 262 | penwidth=2, 263 | bipartite=0) 264 | 265 | network.add_node(string_b, 266 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 267 | style='filled', 268 | fillcolor='white', 269 | color='orange', 270 | 
font_color='black', 271 | fontname='Arial', 272 | penwidth=2, 273 | bipartite=1) 274 | 275 | network.add_edge(string_b, 276 | string_a, 277 | penwidth=2, 278 | color='#0096D6', 279 | dir='none') 280 | try: 281 | write_dot(network, levenshtein_file + '.dot') 282 | print('[+] Created ' + levenshtein_file + '.dot.') 283 | 284 | except: 285 | print('[!] Error creating DOT file.') 286 | 287 | if parsed_args.output == 'png': 288 | try: 289 | subprocess.Popen(['sfdp', 290 | levenshtein_file + '.dot', 291 | '-Tpng', 292 | '-o', 293 | levenshtein_file + '.png', 294 | '-Goverlap=False', 295 | '-Goutputorder="edgesfirst"']) 296 | 297 | print('[+] Exported link chart as ' + levenshtein_file + '.png.') 298 | 299 | except: 300 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 301 | 302 | if parsed_args.output == 'svg': 303 | try: 304 | subprocess.Popen(['sfdp', 305 | levenshtein_file + '.dot', 306 | '-Tsvg', 307 | '-o', 308 | levenshtein_file + '.svg', 309 | '-Goverlap=False', 310 | '-Goutputorder="edgesfirst"']) 311 | 312 | print('[+] Exported link chart as ' + levenshtein_file + '.svg.') 313 | 314 | except: 315 | print('[!] Error saving link chart. 
Check that Graphviz is added to PATH.') 316 | 317 | return 318 | 319 | strings_a,strings_b = parse_text() 320 | 321 | if (parsed_args.metric == 'jaccard') or (parsed_args.metric == 'J'): 322 | calculate_jaccard(strings_a,strings_b) 323 | 324 | if (parsed_args.metric == 'hamming') or (parsed_args.metric == 'H'): 325 | calculate_hamming(strings_a,strings_b) 326 | 327 | if (parsed_args.metric == 'levenshtein') or (parsed_args.metric == 'L'): 328 | calculate_levenshtein(strings_a,strings_b) 329 | 330 | if (parsed_args.metric == 'all') or (parsed_args.metric == 'A'): 331 | calculate_jaccard(strings_a,strings_b) 332 | calculate_hamming(strings_a,strings_b) 333 | calculate_levenshtein(strings_a,strings_b) 334 | 335 | exit(0) -------------------------------------------------------------------------------- /graph_similar_strings/images/graph_similar_strings_screenshot_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_strings/images/graph_similar_strings_screenshot_1.png -------------------------------------------------------------------------------- /graph_similar_strings/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_strings/requirements.txt --------------------------------------------------------------------------------