├── .gitattributes ├── README.md ├── decrypt_l0rdix_c2.py ├── deobfuscate_ostap.py ├── graph_similar_document_images ├── README.md ├── graph_similar_document_images.py ├── image_hash_signatures.txt ├── images │ ├── graph_similar_document_images_screenshot_1.png │ └── graph_similar_document_images_screenshot_2.png └── requirements.txt └── graph_similar_strings ├── README.md ├── graph_similar_strings.py ├── images └── graph_similar_strings_screenshot_1.png └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Malware Analysis Scripts 2 | Handy scripts I use to speed up malware analysis. 3 | -------------------------------------------------------------------------------- /decrypt_l0rdix_c2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that identifies, decrypts and extracts L0rdix RAT command and control (C2) 4 | # traffic from a supplied PCAP file. 
5 | # 6 | # To speed up parsing, trim your PCAP to only HTTP ports using tcpdump, 7 | # for example: 8 | # 9 | # $ tcpdump -r l0rdix_c2.pcap -w l0rdix_c2_http.pcap 'tcp port 80 or 8080 or 3128' 10 | # 11 | # Requirements: 12 | # pyshark-legacy 13 | # pycryptodome 14 | # 15 | # Author.....: Alex Holland (@cryptogramfan) 16 | # Date.......: 2019-07-27 17 | # Version....: 0.1.6 18 | # License....: CC BY 4.0 19 | # Reference_1: https://www.bromium.com/an-analysis-of-l0rdix-rat-panel-and-builder/ 20 | # Reference_2: https://www.bromium.com/decrypting-l0rdix-rats-c2/ 21 | 22 | import sys 23 | import argparse 24 | import pyshark 25 | import urllib 26 | import re 27 | import hashlib 28 | import binascii 29 | import uuid 30 | from Crypto.Cipher import AES 31 | from base64 import b64decode 32 | 33 | parser = argparse.ArgumentParser(description="\nUsage: python decrypt_l0rdix_c2.py -p -k ") 34 | parser.add_argument("-p", dest="pcap_file", help="PCAP containing encrypted L0rdix C2 traffic.", required=True) 35 | parser.add_argument("-k", dest="operator_key", help="UTF-8 operator key extracted from a L0rdix bot or panel. If no key is supplied, the default key \"3sc3RLrpd17\" will be used.", default="3sc3RLrpd17") 36 | parsed_args = parser.parse_args() 37 | operator_key = parsed_args.operator_key 38 | aes_key = hashlib.sha256(operator_key).digest() 39 | iv = b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' 40 | parameters = [] 41 | hostnames = [] 42 | imgs = [] 43 | 44 | try: 45 | pcap = pyshark.FileCapture(parsed_args.pcap_file, keep_packets=False, display_filter='http.request.method == POST && http.request.uri.path == "/connect.php" && count(http.request.uri.query.parameter) >= 10') 46 | print "[+] Parsing PCAP..." 47 | 48 | except: 49 | print(parser.description) 50 | exit(0) 51 | 52 | try: 53 | print "[+] Searching for L0rdix C2 traffic..." 
54 | for packet in pcap: 55 | # Enumerate hosts 56 | query = packet['HTTP'] 57 | hostnames.append(query.host) 58 | 59 | # Enumerate parameters 60 | query = query.request_uri_query 61 | query = urllib.unquote(query) 62 | query = re.sub("~", "+", query) 63 | query = re.sub("^h=", "", query) 64 | found_parameters = re.split("&[a-z]{1,2}=", query) 65 | parameters.extend(found_parameters) 66 | 67 | except: 68 | print "[!] Error, exiting." 69 | exit(0) 70 | 71 | if not hostnames: 72 | print "[+] No L0rdix C2 traffic found." 73 | exit(0) 74 | 75 | else: 76 | print "[+] Found references to L0rdix C2 servers (%d):\n" % (len(hostnames)) 77 | for hostname in hostnames: 78 | print hostname 79 | 80 | if not parameters: 81 | print "[+] No L0rdix URI parameters found." 82 | exit(0) 83 | 84 | else: 85 | print "\n[+] Found L0rdix C2 traffic (%d strings):\n" % (len(parameters)) 86 | for parameter in parameters: 87 | print parameter 88 | 89 | try: 90 | print "[+] Searching for screenshots..." 91 | for packet in pcap: 92 | # Enumerate screenshots 93 | img = packet['URLENCODED-FORM'] 94 | img = urllib.unquote(img.value) 95 | img = b64decode(img) 96 | img = bytearray(img) 97 | img_name = str(uuid.uuid4()) + '.jpg' 98 | imgs.append(img_name) 99 | 100 | # Dump screenshots 101 | f = open(img_name, 'w+b') 102 | f.write(img) 103 | f.close() 104 | 105 | except: 106 | print "[!] Error, exiting." 107 | exit(0) 108 | 109 | if not imgs: 110 | print "[+] No L0rdix screenshots found." 
111 | exit(0) 112 | 113 | else: 114 | print "[+] Dumped L0rdix screenshots in current directory (%d):\n" % (len(imgs)) 115 | for img_name in imgs: 116 | print img_name 117 | 118 | print "\n[+] Decrypting strings using operator key (UTF-8): " + operator_key 119 | print "[+] AES key (hex): " + binascii.hexlify(bytearray(aes_key)) 120 | print "[+] IV (hex): " + binascii.hexlify(bytearray(iv)) 121 | print "[+] Decrypted L0rdix C2 traffic (%d strings):\n" % (len(parameters)) 122 | 123 | for parameter in parameters: 124 | cipher = AES.new(aes_key, AES.MODE_CBC, iv) 125 | ciphertext = b64decode(parameter) 126 | decrypted = cipher.decrypt(ciphertext) 127 | decrypted = decrypted.rstrip() 128 | print decrypted 129 | 130 | print "[+] Finished, exiting." 131 | -------------------------------------------------------------------------------- /deobfuscate_ostap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that deobfuscates Ostap JSE (JScript Encoded) downloaders. The script is based 4 | # on Ostap samples analysed in August 2019, such as those delivering TrickBot. It will try 5 | # to identify the indexes containing Unicode character codes and then deobfuscate the sample 6 | # using subtraction and addition. 
7 | # 8 | # To use the script, supply a file as an argument or pipe it to stdin: 9 | # 10 | # $ python deobfuscate_ostap.py ostap.jse 11 | # $ cat ostap.jse | deobfuscate_ostap.py 12 | # 13 | # Author.....: Alex Holland (@cryptogramfan) 14 | # Date.......: 2019-08-29 15 | # Version....: 0.0.5 16 | # License....: CC BY 4.0 17 | # Reference_1: https://www.bromium.com/deobfuscating-ostap-trickbots-javascript-downloader/ 18 | 19 | import os 20 | import sys 21 | import re 22 | 23 | index_0 = "" 24 | index_1 = "" 25 | indexes_raw = [] 26 | indexes = [] 27 | values_0 = [] 28 | values_1 = [] 29 | 30 | # Subtract index 0 values from index 1 31 | def subtract_values_1(): 32 | characters_sub = [] 33 | servers = [] 34 | urls = [] 35 | 36 | try: 37 | print("[+] Trying deobfuscation by subtracting index %s elements from index %s elements..." % (indexes[0], indexes[1])) 38 | charcodes_sub = [i - j for i, j in zip(values_0, values_1)] 39 | 40 | except: 41 | print("[!] Error subtracting index %s elements from index %s elements." % (indexes[0], indexes[1])) 42 | subtract_values_2() # Try another subtraction instead 43 | 44 | try: 45 | for charcode_sub in charcodes_sub: 46 | character_sub = chr(charcode_sub) 47 | characters_sub.append(character_sub) 48 | 49 | characters_sub = ''.join(characters_sub) 50 | 51 | except: 52 | print("[!] 
Error converting character codes to characters.") 53 | subtract_values_2() 54 | 55 | match = re.search("Script", characters_sub, re.IGNORECASE) 56 | 57 | if match: 58 | print("[+] Deobfuscation using subtraction 1 was successful:\n") 59 | print(characters_sub) 60 | 61 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_sub, re.IGNORECASE) 62 | 63 | if match_url: 64 | servers.append(match_url.group()) 65 | 66 | for server in servers: 67 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 68 | server = re.sub("POST$", "", server, re.IGNORECASE) 69 | urls.append(server) 70 | 71 | if urls: 72 | print("\n[+] Found URL(s):\n") 73 | print(", ".join(urls)) 74 | 75 | exit(0) 76 | 77 | else: 78 | print("[!] Deobfuscation using subtraction 1 was unsuccessful.") 79 | subtract_values_2() 80 | 81 | return; 82 | 83 | # Subtract index 1 values from index 0 values 84 | def subtract_values_2(): 85 | characters_sub = [] 86 | servers = [] 87 | urls = [] 88 | 89 | try: 90 | print("[+] Trying deobfuscation by subtracting index %s elements from index %s elements..." % (indexes[1], indexes[0])) 91 | charcodes_sub = [i - j for i, j in zip(values_1, values_0)] 92 | 93 | except: 94 | print("[!] Error subtracting index %s elements from index %s elements." % (indexes[1], indexes[0])) 95 | add_values() # Try addition instead 96 | 97 | try: 98 | for charcode_sub in charcodes_sub: 99 | character_sub = chr(charcode_sub) 100 | characters_sub.append(character_sub) 101 | 102 | characters_sub = ''.join(characters_sub) 103 | 104 | except: 105 | print("[!] 
Error converting character codes to characters.") 106 | add_values() 107 | 108 | match = re.search("Script", characters_sub, re.IGNORECASE) 109 | 110 | if match: 111 | print("[+] Deobfuscation using subtraction 2 was successful:\n") 112 | print(characters_sub) 113 | 114 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_sub, re.IGNORECASE) 115 | 116 | if match_url: 117 | servers.append(match_url.group()) 118 | 119 | for server in servers: 120 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 121 | server = re.sub("POST$", "", server, re.IGNORECASE) 122 | urls.append(server) 123 | 124 | if urls: 125 | print("\n[+] Found URL(s):\n") 126 | print(", ".join(urls)) 127 | 128 | exit(0) 129 | 130 | else: 131 | print("[!] Deobfuscation using subtraction 2 was unsuccessful.") 132 | add_values() 133 | 134 | return; 135 | 136 | # Add index 0 values to index 1 values 137 | def add_values(): 138 | characters_add = [] 139 | servers = [] 140 | urls = [] 141 | 142 | try: 143 | print("[+] Trying deobfuscation by adding index %s elements to index %s elements..." % (indexes[1], indexes[0])) 144 | charcodes_add = [i + j for i, j in zip(values_1, values_0)] 145 | 146 | except: 147 | print("[!] Error adding index %s elements to index %s elements. Exiting." % (indexes[1], indexes[0])) 148 | exit(0) 149 | 150 | try: 151 | for charcode_add in charcodes_add: 152 | character_add = chr(charcode_add) 153 | characters_add.append(character_add) 154 | 155 | characters_add = ''.join(characters_add) 156 | 157 | except: 158 | print("[!] Error converting character codes to characters. 
Exiting.") 159 | exit(0) 160 | 161 | match = re.search("Script", characters_add, re.IGNORECASE) 162 | 163 | if match: 164 | print("[+] Deobfuscation using addition was successful:\n") 165 | print(characters_add) 166 | 167 | match_url = re.search("http(s):\/\/.+(Drives|POST)", characters_add, re.IGNORECASE) 168 | 169 | if match_url: 170 | servers.append(match_url.group()) 171 | 172 | for server in servers: 173 | server = re.sub("Drives.*$", "", server, re.IGNORECASE) 174 | server = re.sub("POST$", "", server, re.IGNORECASE) 175 | urls.append(server) 176 | 177 | if urls: 178 | print("\n[+] Found URL(s):\n") 179 | print(", ".join(urls)) 180 | 181 | exit(0) 182 | 183 | else: 184 | print("[!] Deobfuscation using addition was unsuccessful. Exiting.") 185 | exit(0) 186 | 187 | return; 188 | 189 | if len(sys.argv) > 1: 190 | file = open(sys.argv[1], 'r') 191 | else: 192 | file = sys.stdin 193 | 194 | while 1: 195 | input = file.read() 196 | 197 | # Find array indexes 198 | try: 199 | print("\n[+] Analysing %s" % os.path.basename(file.name)) 200 | input = input.decode('utf-8') 201 | 202 | except UnicodeError: 203 | print("[!] File not UTF-8. Treating as UTF-16.") 204 | input = input.decode('utf-16') 205 | 206 | try: 207 | indexes_raw = re.findall("\[\d+\]=\d+;", input) 208 | 209 | except: 210 | print("[!] Error finding array indexes. Exiting.") 211 | exit(0) 212 | 213 | if not indexes_raw: 214 | print("[!] Array indexes not found. Exiting.") 215 | exit(0) 216 | 217 | # Put the index string into a list 218 | try: 219 | for index in indexes_raw: 220 | index = re.sub("\[", "", index) 221 | index = re.sub("\]=\d+;", "", index) 222 | indexes.append(index) 223 | 224 | # Remove duplicates 225 | indexes = list(set(indexes)) 226 | print("[+] Found array indexes %s and %s." % (indexes[0], indexes[1])) 227 | 228 | except: 229 | print("[!] Error processing array indexes. 
Exiting.") 230 | exit(0) 231 | 232 | try: 233 | element_regex_0 = r"\[" + indexes[0] + r"\]=\d+;" 234 | element_regex_1 = r"\[" + indexes[1] + r"\]=\d+;" 235 | 236 | except: 237 | print("[!] Error creating regular expressions. Exiting.") 238 | exit(0) 239 | 240 | # Find the values of index 0 elements 241 | try: 242 | print("[+] Searching for index %s elements..." % indexes[0]) 243 | array_0 = re.findall(element_regex_0, input) 244 | 245 | for element in array_0: 246 | element = re.sub("\[\d+\]=", "", element) 247 | element = re.sub(";", "", element) 248 | values_0.append(element) 249 | 250 | except: 251 | print("[!] Error finding index %s elements. Exiting." % indexes[0]) 252 | exit(0) 253 | 254 | if not values_0: 255 | print("[!] No index %s elements found. Exiting." % indexes[0]) 256 | exit(0) 257 | 258 | # Convert index 0 elements to integer values 259 | try: 260 | values_0 = map(int, values_0) 261 | 262 | except: 263 | print("[!] Error converting index %s elements to integers. Exiting." % indexes[0]) 264 | exit(0) 265 | 266 | # Find the values of index 1 elements 267 | try: 268 | print("[+] Searching for index %s elements..." % indexes[1]) 269 | array_1 = re.findall(element_regex_1, input) 270 | 271 | for element in array_1: 272 | element = re.sub("\[\d+\]=", "", element) 273 | element = re.sub(";", "", element) 274 | values_1.append(element) 275 | 276 | except: 277 | print("[!] Error finding index %s elements. Exiting." % indexes[1]) 278 | exit(0) 279 | 280 | if not values_1: 281 | print("[!] No index %s elements found. Exiting." % indexes[1]) 282 | exit(0) 283 | 284 | # Convert index 1 elements to integer values 285 | try: 286 | values_1 = map(int, values_1) 287 | 288 | except: 289 | print("[!] Error converting index %s elements to integers. Exiting." 
% indexes[1]) 290 | exit(0) 291 | 292 | subtract_values_1() 293 | subtract_values_2() 294 | add_values() 295 | 296 | exit(0) 297 | -------------------------------------------------------------------------------- /graph_similar_document_images/README.md: -------------------------------------------------------------------------------- 1 | # graph_similar_document_images.py 2 | A script that extracts embedded images from [Office Open XML (OOXML)](https://en.wikipedia.org/wiki/Office_Open_XML) documents and generates image hash similarity graphs that cluster visually similar images together. The script computes the [Average Hash](http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html) of each extracted image, then graphs the images if they meet the similarity threshold. The script can be used as a technique for visually identifying malware campaigns involving documents. To use the script, supply a directory containing OOXML files. If LibreOffice is in your PATH you can optionally convert non-OOXML Word, Excel, PowerPoint and Rich Text File documents to OOXML. The script outputs DOT files that can be exported as images using Graphviz. If Graphviz is in your PATH you can also export to an SVG (preferred) or PNG image. 3 | 4 | ## Output 5 | Example image hash similarity graph (cropped). 
Here each node is a unique image that is connected by edges to other images that met the similarity threshold: 6 | 7 | 8 | 9 | Example CSV output of the script in detect mode, which lists images that match the similarity threshold with the signatures in the blacklist file, [image_hash_signatures.txt](https://github.com/cryptogramfan/Malware-Analysis-Scripts/blob/master/graph_similar_document_images/image_hash_signatures.txt): 10 | 11 | 12 | 13 | ## Example usage 14 | Convert documents to OOXML, extract images from the documents, identify images that are similar to the blacklist and then graph images that meet the similarity threshold: 15 | ``` 16 | $ graph_similar_document_images.py -f ~/Samples -d image_hash_signatures.txt -c -g -t 80 -o svg 17 | ``` 18 | ## Help 19 | ``` 20 | usage: graph_similar_document_images.py [-h] -f INPUT_DIR 21 | [-t MIN_SIMILARITY_THRESHOLD] 22 | [-d SIG_FILE] [-g] [-c] [-o {svg,png}] 23 | 24 | Usage: graph_similar_document_images.py -f -c -d 25 | -g -t -o 26 | 27 | optional arguments: 28 | -h, --help show this help message and exit 29 | -f INPUT_DIR, --files INPUT_DIR 30 | Directory to process 31 | -t MIN_SIMILARITY_THRESHOLD, --threshold MIN_SIMILARITY_THRESHOLD 32 | Minimum percentage similarity between images to graph 33 | (0 to 100) 34 | -d SIG_FILE, --detect SIG_FILE 35 | Detect mode identifies images that are similar to a 36 | blacklist of known-bad images 37 | -g, --graph Graph mode creates a graph of images that meet the 38 | similarity threshold 39 | -c, --convert Try converting documents to OOXML using LibreOffice 40 | -o {svg,png}, --output {svg,png} 41 | Output image format 42 | ``` 43 | ## Supported platforms 44 | Tested on Ubuntu 18.04 with Python 3. 
45 | 46 | ## Installation 47 | First install Graphviz and LibreOffice: 48 | ``` 49 | $ sudo add-apt-repository ppa:libreoffice/ppa 50 | $ sudo apt update 51 | $ sudo apt install graphviz libreoffice 52 | ``` 53 | Afterwards, install the required Python libraries: 54 | ``` 55 | $ python3 -m pip install -r requirements.txt 56 | ``` 57 | To view SVG files produced by the script you can use a viewer such as [Inkscape](https://inkscape.org/). Outputting to PNG isn't recommended because the resulting files can be large. 58 | 59 | ## License 60 | Released under the Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)) license. -------------------------------------------------------------------------------- /graph_similar_document_images/graph_similar_document_images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # A script that extracts embedded images from Open Office XML (OOXML) documents and generates network 4 | # graphs that cluster similar images together. The script computes average hashes of the extracted 5 | # images, then graphs the images if they meet the similarity threshold. The script can be used as a 6 | # technique for visually identifying malware campaigns involving documents. To use the script, supply 7 | # a directory containing OOXML files. If LibreOffice is in your PATH you can optionally convert 8 | # non-OOXML Word, Excel, PowerPoint and Rich Text File documents to OOXML. The script outputs DOT files 9 | # that can be exported as images using Graphviz. If Graphviz is in your PATH you can also export to an 10 | # SVG (preferred) or PNG image. 
11 | # 12 | # $ graph_similar_document_images.py -f -d -g -t -o 13 | # 14 | # Author.....: Alex Holland (@cryptogramfan) 15 | # Date.......: 2020-02-02 16 | # Version....: 0.0.3 17 | # License....: CC BY 4.0 18 | 19 | import os 20 | import csv 21 | import hashlib 22 | import argparse 23 | import imagehash 24 | import subprocess 25 | import magic 26 | import distance 27 | from PIL import Image 28 | from zipfile import ZipFile 29 | from shutil import copyfileobj, copy, rmtree 30 | from time import strftime 31 | from networkx.algorithms import bipartite 32 | from networkx.drawing.nx_agraph import write_dot 33 | from networkx import graph as nx 34 | 35 | parser = argparse.ArgumentParser(description='\nUsage: graph_similar_document_images.py -f -d -g -c -t -o ') 36 | parser.add_argument('-f', '--files', dest='input_dir', help='Directory to process', required=True) 37 | parser.add_argument('-t', '--threshold', dest='min_similarity_threshold', type=float, help='Minimum percentage similarity between images to graph (0 to 100)', default=87.5) 38 | parser.add_argument('-d', '--detect', dest='sig_file', help='Detect mode identifies images that are similar to a blacklist of known-bad images and saves the results to a CSV (requires image_hash_signatures.txt)') 39 | parser.add_argument('-g', '--graph', help='Graph mode creates a graph of images that meet the similarity threshold in DOT format', action='store_true') 40 | parser.add_argument('-c', '--convert', help='Try converting documents to OOXML (requires LibreOffice)', action='store_true') 41 | parser.add_argument('-o', '--output', choices=['svg', 'png'], help='Output image format (requires Graphviz)', default='svg') 42 | parsed_args = parser.parse_args() 43 | network = nx.Graph() 44 | timestr = strftime('%Y%m%d-%H%M%S') 45 | graph_file = 'graph_similar_document_images_' + timestr 46 | input_dir = parsed_args.input_dir 47 | sig_file = parsed_args.sig_file 48 | csv_file = os.path.join(os.getcwd(), 'image_hash_matches_' + 
timestr + '.csv') 49 | dir_image = os.path.join(os.getcwd(), 'extracted_document_images_' + timestr) 50 | dir_convert_docx = os.path.join(os.getcwd(), 'convert_docx') 51 | dir_convert_pptx = os.path.join(os.getcwd(), 'convert_pptx') 52 | dir_convert_xlsx = os.path.join(os.getcwd(), 'convert_xlsx') 53 | dir_converted_docx = os.path.join(os.getcwd(), 'converted_docx') 54 | dir_converted_pptx = os.path.join(os.getcwd(), 'converted_pptx') 55 | dir_converted_xlsx = os.path.join(os.getcwd(), 'converted_xlsx') 56 | min_similarity_threshold = parsed_args.min_similarity_threshold 57 | 58 | def load_signatures(): 59 | signatures = {} 60 | with open(sig_file) as f: 61 | for line in f: 62 | line = line.rstrip() 63 | if line: 64 | if not line.startswith('#'): 65 | (sig_hash, sig_name) = line.split(',') 66 | signatures[sig_hash] = sig_name 67 | 68 | return signatures 69 | 70 | def create_dirs(): 71 | try: 72 | os.makedirs(dir_convert_docx) 73 | os.makedirs(dir_convert_xlsx) 74 | os.makedirs(dir_convert_pptx) 75 | os.makedirs(dir_converted_docx) 76 | os.makedirs(dir_converted_xlsx) 77 | os.makedirs(dir_converted_pptx) 78 | 79 | except OSError: 80 | if not os.path.isdir(dir_convert_docx): 81 | raise 82 | 83 | if not os.path.isdir(dir_convert_xlsx): 84 | raise 85 | 86 | if not os.path.isdir(dir_convert_pptx): 87 | raise 88 | 89 | if not os.path.isdir(dir_converted_docx): 90 | raise 91 | 92 | if not os.path.isdir(dir_converted_xlsx): 93 | raise 94 | 95 | if not os.path.isdir(dir_converted_pptx): 96 | raise 97 | 98 | def identify_files(): 99 | for infile in os.listdir(input_dir): 100 | if os.path.isfile(os.path.join(input_dir, infile)): 101 | infile_path = os.path.join(input_dir, infile) 102 | 103 | if ('Microsoft OOXML' in magic.from_file(infile_path)) or ('Microsoft Word 2007+' in magic.from_file(infile_path)): 104 | extract_ooxml(infile_path) 105 | 106 | if parsed_args.convert: 107 | if ('Microsoft Office Word' in magic.from_file(infile_path)) or ('CDFV2 Encrypted' in 
magic.from_file(infile_path)) or ('Rich Text Format' in magic.from_file(infile_path)): 108 | copy(infile_path, dir_convert_docx) 109 | 110 | if ('Microsoft Excel' in magic.from_file(infile_path)): 111 | copy(infile_path, dir_convert_xlsx) 112 | 113 | if ('Microsoft Office PowerPoint' in magic.from_file(infile_path)): 114 | copy(infile_path, dir_convert_pptx) 115 | return True 116 | 117 | def convert_docx(): 118 | print('\n[+] Converting to Word OOXML...') 119 | path = dir_convert_docx + '/*' 120 | 121 | try: 122 | os.system('soffice --headless --convert-to docx --outdir ' + dir_converted_docx + ' ' + path) 123 | 124 | except: 125 | print('[!] Error converting document. Check that LibreOffice is added to your PATH.') 126 | 127 | print('\n[+] Extracting from converted Word OOXML documents...') 128 | for f in os.listdir(dir_converted_docx): 129 | if os.path.isfile(os.path.join(dir_converted_docx, f)): 130 | f = os.path.join(dir_converted_docx, f) 131 | extract_ooxml(f) 132 | 133 | rmtree(dir_convert_docx) 134 | rmtree(dir_converted_docx) 135 | return True 136 | 137 | def convert_xlsx(): 138 | print('\n[+] Converting to Excel OOXML...') 139 | path = dir_convert_xlsx + '/*' 140 | 141 | try: 142 | os.system('soffice --headless --convert-to xlsx --outdir ' + dir_converted_xlsx + ' ' + path) 143 | 144 | except: 145 | print('[!] Error converting document. 
Check that LibreOffice is added to your PATH.') 146 | 147 | print('\n[+] Extracting from converted Excel OOXML documents...') 148 | for f in os.listdir(dir_converted_xlsx): 149 | if os.path.isfile(os.path.join(dir_converted_xlsx, f)): 150 | f = os.path.join(dir_converted_xlsx, f) 151 | extract_ooxml(f) 152 | 153 | rmtree(dir_convert_xlsx) 154 | rmtree(dir_converted_xlsx) 155 | return True 156 | 157 | def convert_pptx(): 158 | print('\n[+] Converting to PowerPoint OOXML...') 159 | path = dir_convert_pptx + '/*' 160 | 161 | try: 162 | os.system('soffice --headless --convert-to pptx --outdir ' + dir_converted_pptx + ' ' + path) 163 | 164 | except: 165 | print('[!] Error converting document. Check that LibreOffice is added to your PATH.') 166 | 167 | print('\n[+] Extracting from converted PowerPoint OOXML documents...') 168 | for f in os.listdir(dir_converted_pptx): 169 | if os.path.isfile(os.path.join(dir_converted_pptx, f)): 170 | f = os.path.join(dir_converted_pptx, f) 171 | extract_ooxml(f) 172 | 173 | rmtree(dir_convert_pptx) 174 | rmtree(dir_converted_pptx) 175 | return True 176 | 177 | def extract_ooxml(f): 178 | with open(f, 'rb') as infile: 179 | bytes = infile.read() 180 | hash_document = hashlib.sha256(bytes).hexdigest() 181 | 182 | try: 183 | with ZipFile(f) as z: 184 | for i in z.infolist(): 185 | name = i.filename 186 | 187 | if name.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')): 188 | try: 189 | image_path = os.path.join(dir_image, hash_document) 190 | with z.open(name) as in_image, open(image_path, 'wb') as out_image: 191 | copyfileobj(in_image, out_image) 192 | print('[+] Extracted image from ' + f + '.') 193 | 194 | except Exception as e: 195 | print('[!] Error extracting image from ' + hash_document, e) 196 | 197 | except Exception as e: 198 | print('[!] 
Error unzipping ' + f, e) 199 | 200 | def calculate_image_distance(): 201 | print('\n[+] Computing image hash distances...') 202 | images = [] 203 | for i in os.listdir(dir_image): 204 | images.append(i) 205 | 206 | images_a = images 207 | images_b = images 208 | 209 | for a in images_a: 210 | image_a_path = os.path.join(dir_image, a) 211 | hash_a = imagehash.average_hash(Image.open(image_a_path)) 212 | 213 | for b in images_b: 214 | image_b_path = os.path.join(dir_image, b) 215 | hash_b = imagehash.average_hash(Image.open(image_b_path)) 216 | 217 | image_distance = ((hash_a-hash_b)/len(hash_a.hash)**2)*100 # Each image hash is 64 bits long 218 | image_similarity = 100-image_distance 219 | 220 | if image_similarity >= min_similarity_threshold: 221 | print('[+] ' + a + ' is ' + str('%.0f' % image_similarity) + '% similar to ' + b + '.') 222 | network.add_node(a, 223 | label='Image_Hash: ' + str(hash_a) + '\n' + 'SHA256_Doc: ' + a, 224 | image=image_a_path, 225 | type='image', 226 | style='filled', 227 | fillcolor='white', 228 | color='white', 229 | fontcolor='black', 230 | fontname='Arial', 231 | fontsize='20', 232 | bipartite=0) 233 | 234 | network.add_node(b, 235 | label='Image_Hash: ' + str(hash_b) + '\n' + 'SHA256_Doc: ' + b, 236 | image=image_b_path, 237 | type='image', 238 | style='filled', 239 | fillcolor='white', 240 | color='white', 241 | fontcolor='black', 242 | fontname='Arial', 243 | fontsize='20', 244 | bipartite=1) 245 | 246 | network.add_edge(b, 247 | a, 248 | penwidth=3, 249 | color='#0096D6', 250 | dir='none') 251 | 252 | write_dot(network, graph_file + '.dot') 253 | print('[+] Created ' + graph_file + '.dot.') 254 | 255 | return True 256 | 257 | def export_graph(): 258 | try: 259 | if parsed_args.output == 'png': 260 | subprocess.Popen(['sfdp', 261 | graph_file + '.dot', 262 | '-Tpng', 263 | '-o', 264 | graph_file + '.png', 265 | '-Gfontname="Arial"' 266 | ]) 267 | 268 | print('[+] Created ' + graph_file + '.png.') 269 | 270 | if parsed_args.output 
== 'svg': 271 | subprocess.Popen(['sfdp', 272 | graph_file + '.dot', 273 | '-Tsvg', 274 | '-o', 275 | graph_file + '.svg', 276 | '-Gfontname="Arial"' 277 | ]) 278 | 279 | print('[+] Created ' + graph_file + '.svg.') 280 | 281 | except: 282 | print('[!] Error exporting graph image. Check that Graphviz is added to your PATH.') 283 | 284 | return True 285 | 286 | def detect_images(signatures): 287 | print('\n[+] Detecting images that meet similarity threshold of signatures (' + str(parsed_args.min_similarity_threshold) + '%)...') 288 | images = [] 289 | for i in os.listdir(dir_image): 290 | images.append(i) 291 | 292 | with open(csv_file, mode='w') as csv_out: 293 | csv_writer = csv.writer(csv_out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) 294 | csv_writer.writerow(['Document_SHA256', 'Average_Hash', 'Signature_Name', 'Similarity']) 295 | 296 | for i in images: 297 | image_path = os.path.join(dir_image, i) 298 | image_hash = str(imagehash.average_hash(Image.open(image_path))) 299 | 300 | for sig_hash, sig_name in signatures.items(): 301 | hamming_distance = distance.hamming(image_hash, sig_hash) 302 | image_similarity = 100-((hamming_distance/16)*100) 303 | 304 | if image_similarity >= min_similarity_threshold: 305 | csv_writer.writerow([i, image_hash, sig_name, image_similarity]) 306 | print('[+] Document ' + i + ' matched ' + sig_name + ' (' + str('%.0f' % image_similarity) + '% similarity).') 307 | 308 | print('[+] Saved results to ' + csv_file + '.') 309 | return True 310 | 311 | def main(): 312 | try: 313 | os.makedirs(dir_image) 314 | 315 | except OSError: 316 | if not os.path.isdir(dir_image): 317 | raise 318 | 319 | if parsed_args.convert == False: 320 | identify_files() 321 | 322 | if parsed_args.convert: 323 | create_dirs() 324 | identify_files() 325 | convert_docx() 326 | convert_xlsx() 327 | convert_pptx() 328 | 329 | if parsed_args.graph: 330 | if calculate_image_distance(): 331 | export_graph() 332 | 333 | if parsed_args.sig_file: 334 | 
signatures = load_signatures() 335 | detect_images(signatures) 336 | 337 | if __name__== "__main__": 338 | main() 339 | -------------------------------------------------------------------------------- /graph_similar_document_images/image_hash_signatures.txt: -------------------------------------------------------------------------------- 1 | # A collection of Average Hash values of images embedded in malicious 2 | # documents computed using the average hash algorithm. These can 3 | # be used as an analytic technique to detect malicious documents 4 | # by comparing the distance (similarity) between a given document 5 | # and these signatures. The signatures can also be used to 6 | # identify and track specific campaigns based on image re-use. 7 | # Graph_similar_document_images.py uses these hashes as signatures 8 | # in detect mode. 9 | # 10 | # Signatures are named using the following convention: 11 | # Type.Purpose.Product.Descriptor_1.Descriptor_2.Descriptor_3 12 | # 13 | # Name.......: image_hash_signatures.txt 14 | # Author.....: Alex Holland (@cryptogramfan) 15 | # Date.......: 2020-02-03 16 | # 17 | # Average_Hash,Signature_Name 18 | 0000c0d3dedc4000,Image.Social-Engineering.MS-Word.Previous-Version.Blue.Circle 19 | 00c0f8e0e6e4c000,Image.Social-Engineering.MS-Word.Earlier-Version.Navy 20 | 0038386eff7e7c00,Image.Social-Engineering.Office-365.Online-Version.Blue 21 | cfc7c7cfffff00e5,Image.Social-Engineering.MS-Office.Old-Version.White.Attemting 22 | 1f7f7ffe0202ff00,Image.Social-Engineering.MS-Office.Activation-Wizard 23 | ffff0113819fffff,Image.Social-Engineering.MS-Office.Protected.White 24 | 3f838383ffffffff,Image.Social-Engineering.Generic.Protected.Invoice.First-Fresh-NZ-Limited 25 | 0000187f76000000,Image.Social-Engineering.Generic.Protected.Full-Page.Blue 26 | efe7ffc38100ffff,Image.Social-Engineering.MS-Office.OpenOffice.Gray 27 | ffffc3ffff8101ff,Image.Social-Engineering.MS-Word.License 28 | 
be9c7c20fc007c00,Image.Social-Engineering.MS-Word.Older-Version.Blue 29 | cfc3c703bfbfffff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.White.1 30 | ffff0113819fffff,Image.Social-Engineering.MS-Word.Protected.Defender.Blue 31 | 1f7fff01ffff1e00,Image.Social-Engineering.MS-Word.Not-Activated.White 32 | 00ffffc1c18181ff,Image.Social-Engineering.MS-Word.Protected.UPS 33 | ff818183bfbfff3f,Image.Social-Engineering.Generic.Protected.Invoice.Western 34 | 2c054720fffff8ff,Image.Social-Engineering.McAfee-Secure.Protected.Chinese.Sleek-Bill 35 | ffff81818181ffff,Image.Social-Engineering.Generic.Protected.Full-Page.Invoice 36 | fefec20600000000,Image.Social-Engineering.MS-Word.Earlier-Version.Full-Page 37 | fff117911331ffff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.White.2 38 | c7c7818f83cbffff,Image.Social-Engineering.Office-365.Desktop-Laptop.White 39 | ff0000fffe2fffff,Image.Social-Engineering.MS-Word.Newer-Version.Infographic 40 | ffffff3f38ffffff,Image.Social-Engineering.Amazon.Assistance-Center 41 | 00fef87e40000000,Image.Social-Engineering.MS-Word.Older-Version.Navy 42 | 1018ff07fd9d9dff,Image.Social-Engineering.MS-Office.Oops.Protected 43 | ffff0301819f83ff,Image.Social-Engineering.MS-Office.Protected.Old-Logo.3 44 | 00ff3f1f1fffff00,Image.Social-Engineering.MS-Office.Banner.Select-Document.Turkish 45 | e0fede1e18000000,Image.Social-Engineering.MS-Word.Protected.Padlock.Navy 46 | 00c0c4fcc0d6c000,Image.Social-Engineering.MS-Word.Earlier-Version.Italian.Navy 47 | ff1f0c0001011fff,Image.Social-Engineering.Generic.Logo.RSA.SecurID 48 | 9f00a38f8fa383ff,Image.Social-Engineering.Generic.Problem.Full-Page.Barclays 49 | 0040c06060606000,Image.Social-Engineering.Generic.Logo.BZSt.German -------------------------------------------------------------------------------- /graph_similar_document_images/images/graph_similar_document_images_screenshot_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_document_images/images/graph_similar_document_images_screenshot_1.png -------------------------------------------------------------------------------- /graph_similar_document_images/images/graph_similar_document_images_screenshot_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_document_images/images/graph_similar_document_images_screenshot_2.png -------------------------------------------------------------------------------- /graph_similar_document_images/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx>=2.2 2 | pygraphviz>=1.3.1 3 | python-magic>=0.4.15 4 | Pillow==7.0.0 5 | ImageHash==4.0 6 | Distance>=0.1.3 7 | -------------------------------------------------------------------------------- /graph_similar_strings/README.md: -------------------------------------------------------------------------------- 1 | # graph_similar_strings.py 2 | A script that reads a list of strings and generates link charts that clusters similar strings together. Three string similarity metrics are supported: 3 | * [Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index) 4 | * [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) 5 | * [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) 6 | 7 | The script can be used as a text analysis technique for identifying phishing campaigns, for example by clustering similar filenames and email subject lines. To use the script, supply a text file containing a list of strings. By default, the script outputs DOT files that can be exported as images using [Graphviz](https://www.graphviz.org/). 
If Graphviz is in your PATH the script can export to SVG (recommended) or PNG files for you. You can modify the thresholds that determine if two strings are similar enough to connect according to your needs. 8 | 9 | ## Installation 10 | First install Graphviz. On Windows you can download the installer [here](https://graphviz.gitlab.io/_pages/Download/Download_windows.html). On Linux you can install Graphviz by running: 11 | ``` 12 | $ sudo apt install graphviz 13 | ``` 14 | Afterwards, install the required Python libraries: 15 | ``` 16 | $ pip install -r requirements.txt 17 | ``` 18 | To view SVG files produced by the script you can use a viewer such as [Inkscape](https://inkscape.org/). Outputting to PNG isn't recommended because the resulting files can be large. 19 | 20 | ## Example usage 21 | Apply all three metrics against strings in input.txt that match a filename pattern and save the resulting link charts as SVG files: 22 | ``` 23 | $ graph_similar_strings.py -f input.txt -m A -r filenames -o svg 24 | ``` 25 | ## Help 26 | ``` 27 | usage: graph_similar_strings.py [-h] -f INPUT_FILE -m 28 | {jaccard,J,hamming,H,levenshtein,L,all,A} 29 | [-r {filenames,f}] [-o {png,svg}] 30 | 31 | Usage: graph_similar_strings.py -f -a [JHLA] 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -f INPUT_FILE, --file INPUT_FILE 36 | Text file containing strings 37 | -m {jaccard,J,hamming,H,levenshtein,L,all,A}, --metric {jaccard,J,hamming,H,levenshtein,L,all,A} 38 | Metric to calculate similarity 39 | -r {filenames,f}, --regex {filenames,f} 40 | Filter strings first using regular expression 41 | -o {png,svg}, --output {png,svg} 42 | Output image format (requires Graphviz to be in your 43 | PATH) 44 | ``` 45 | 46 | ## Output 47 | Example filename similarity link chart using Levenshtein distance (cropped). Here each node represents a unique filename that is connected by edges to other filenames that met the similarity threshold. 
48 | 49 | 50 | 51 | ## License 52 | Released under the Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)) license. -------------------------------------------------------------------------------- /graph_similar_strings/graph_similar_strings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # A script that reads a list of strings and generates link charts that clusters similar strings 4 | # together. Three distance metrics are supported to calculate string similarity: Jaccard, Hamming 5 | # and Levenshtein. The script can be used as a technique for visualising phishing campaigns, for 6 | # example by identifying similiar filenames and email subject lines. To use the script, supply a 7 | # text file containing a list of strings. By default, the script outputs DOT files that can be 8 | # exported as images using Graphviz. If Graphviz is in your PATH you can also export to PNG or SVG. 9 | # 10 | # $ graph_similar_strings.py -f input.txt -m A -r filenames -o svg 11 | # 12 | # Author.....: Alex Holland (@cryptogramfan) 13 | # Date.......: 2019-12-06 14 | # Version....: 0.0.3 15 | # License....: CC BY 4.0 16 | 17 | import sys 18 | import argparse 19 | import time 20 | import re 21 | import subprocess 22 | import distance 23 | import networkx as nx 24 | import pandas as pd 25 | import numpy as np 26 | from networkx.algorithms import bipartite 27 | from networkx.drawing.nx_agraph import write_dot 28 | 29 | jaccard_threshold = 0.2 # Similarity threshold where 0 means identical and 1 means totally different (default = 0.2). 30 | hamming_threshold = 5 # Similarity threshold where 1 means 1 character difference (default = 5). 31 | levenshtein_threshold = 7 # Similarity threshold where 1 means 1 character operation difference (default = 7). 
32 | levenshtein_length_threshold = 5 # The Levenshtein distance can be calculated between strings of different lengths. Only strings whose length difference is less than the threshold will be computed (default = 5). Reduce the threshold if the script is taking a long time to finish. 33 | 34 | parser = argparse.ArgumentParser(description='\nUsage: graph_similar_strings.py -f -m [JHLA]') 35 | parser.add_argument('-f', '--file', dest='input_file', help='Text file containing strings', required=True) 36 | parser.add_argument('-m', '--metric', choices=['jaccard', 'J', 'hamming', 'H', 'levenshtein', 'L', 'all', 'A'], help='Metric to calculate similarity', required=True) 37 | parser.add_argument('-r', '--regex', choices=['filenames', 'f'], help='Filter strings first using regular expression') 38 | parser.add_argument('-o', '--output', choices=['png', 'svg'], help='Output image format (requires Graphviz to be in your PATH)') 39 | parsed_args = parser.parse_args() 40 | network = nx.Graph() 41 | timestr = time.strftime('%Y%m%d-%H%M%S') 42 | jaccard_file = 'string_similarity_jaccard_' + timestr 43 | hamming_file = 'string_similarity_hamming_' + timestr 44 | levenshtein_file = 'string_similarity_levenshtein_' + timestr 45 | regex_filenames = r'[^\\\/:*?"<>|\r\n]+$' # Regular expression to filter filenames for comparison 46 | 47 | def parse_text(): 48 | with open(parsed_args.input_file, 'r') as text_file: 49 | 50 | strings = [line.strip() for line in text_file] 51 | strings = filter(None, strings) 52 | print('[+] Processing ' + str(len(strings)) + ' strings...') 53 | 54 | if (parsed_args.regex == 'filenames') or (parsed_args.regex == 'f'): 55 | matched_strings = [] 56 | 57 | for string in strings: 58 | matched_string = re.search(regex_filenames, string) 59 | 60 | if matched_string is None: 61 | pass 62 | 63 | else: 64 | matched_string = matched_string.group() 65 | matched_strings.append(matched_string) 66 | 67 | print('[+] Matched ' + str(len(matched_strings)) + ' filenames.') 
68 | strings_a = matched_strings 69 | strings_b = matched_strings 70 | 71 | else: 72 | strings = filter(None, strings) 73 | strings_a = strings 74 | strings_b = strings 75 | 76 | return strings_a,strings_b 77 | 78 | def calculate_jaccard(strings_a,strings_b): 79 | print('[+] Calculating Jaccard distance...') 80 | 81 | for string_a in strings_a: 82 | string_a = str(string_a) 83 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 84 | 85 | for string_b in strings_b: 86 | string_b = str(string_b) 87 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 88 | 89 | jaccard_distance = distance.jaccard(string_a, string_b) 90 | 91 | if 0 < jaccard_distance < jaccard_threshold: 92 | network.add_node(string_a, 93 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 94 | color='orange', 95 | style='filled', 96 | fillcolor='white', 97 | font_color='black', 98 | fontname='Arial', 99 | penwidth=2, 100 | bipartite=0) 101 | 102 | network.add_node(string_b, 103 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 104 | style='filled', 105 | fillcolor='white', 106 | color='orange', 107 | font_color='black', 108 | fontname='Arial', 109 | penwidth=2, 110 | bipartite=1) 111 | 112 | network.add_edge(string_b, 113 | string_a, 114 | penwidth=2, 115 | color='#0096D6', 116 | dir='none') 117 | 118 | try: 119 | write_dot(network, jaccard_file + '.dot') 120 | print('[+] Created ' + jaccard_file + '.dot.') 121 | 122 | except: 123 | print('[!] Error creating DOT file.') 124 | 125 | if parsed_args.output == 'png': 126 | try: 127 | subprocess.Popen(['sfdp', 128 | jaccard_file + '.dot', 129 | '-Tpng', 130 | '-o', 131 | jaccard_file + '.png', 132 | '-Goverlap=False', 133 | '-Goutputorder="edgesfirst"']) 134 | 135 | print('[+] Exported link chart as ' + jaccard_file + '.png.') 136 | 137 | except: 138 | print('[!] Error saving link chart. 
Check that Graphviz is added to PATH.') 139 | exit(0) 140 | 141 | if parsed_args.output == 'svg': 142 | try: 143 | 144 | subprocess.Popen(['sfdp', 145 | jaccard_file + '.dot', 146 | '-Goverlap=False', 147 | '-Tsvg', 148 | '-o', 149 | jaccard_file + '.svg', 150 | '-Goverlap=False', 151 | '-Goutputorder="edgesfirst"']) 152 | 153 | print('[+] Exported link chart as ' + jaccard_file + '.svg.') 154 | 155 | except: 156 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 157 | 158 | return 159 | 160 | def calculate_hamming(strings_a,strings_b): 161 | print('[+] Calculating Hamming distance...') 162 | 163 | for string_a in strings_a: 164 | string_a = str(string_a) 165 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 166 | 167 | for string_b in strings_b: 168 | string_b = str(string_b) 169 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 170 | 171 | if len(string_a) == len(string_b): # Hamming distance can only be calculated for equal length strings 172 | hamming_distance = distance.hamming(string_a, string_b) 173 | 174 | if 0 < hamming_distance < hamming_threshold: 175 | network.add_node(string_a, 176 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 177 | color='orange', 178 | style='filled', 179 | fillcolor='white', 180 | font_color='black', 181 | fontname='Arial', 182 | penwidth=2, 183 | bipartite=0) 184 | 185 | network.add_node(string_b, 186 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 187 | style='filled', 188 | fillcolor='white', 189 | color='orange', 190 | font_color='black', 191 | fontname='Arial', 192 | penwidth=2, 193 | bipartite=1) 194 | 195 | network.add_edge(string_b, 196 | string_a, 197 | penwidth=2, 198 | color='#0096D6', 199 | dir='none') 200 | 201 | try: 202 | write_dot(network, hamming_file + '.dot') 203 | print('[+] Created ' + hamming_file + '.dot.') 204 | 205 | except: 206 | print('[!] 
Error creating DOT file.') 207 | 208 | if parsed_args.output == 'png': 209 | try: 210 | subprocess.Popen(['sfdp', 211 | hamming_file + '.dot', 212 | '-Tpng', 213 | '-o', 214 | hamming_file + '.png', 215 | '-Goverlap=False', 216 | '-Goutputorder="edgesfirst"']) 217 | 218 | print('[+] Exported link chart as ' + hamming_file + '.png.') 219 | 220 | except: 221 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 222 | 223 | if parsed_args.output == 'svg': 224 | try: 225 | subprocess.Popen(['sfdp', 226 | hamming_file + '.dot', 227 | '-Tsvg', 228 | '-o', 229 | hamming_file + '.svg', 230 | '-Goverlap=False', 231 | '-Goutputorder="edgesfirst"']) 232 | 233 | print('[+] Exported link chart as ' + hamming_file + '.svg.') 234 | 235 | except: 236 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 237 | 238 | return 239 | 240 | def calculate_levenshtein(strings_a,strings_b): 241 | print('[+] Calculating Levenshtein distance...') 242 | 243 | for string_a in strings_a: 244 | string_a = str(string_a) 245 | string_a = string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 246 | 247 | for string_b in strings_b: 248 | string_b = str(string_b) 249 | string_b = string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace') 250 | 251 | if -levenshtein_length_threshold < (len(string_a) - len(string_b)) < levenshtein_length_threshold: 252 | levenshtein_distance = distance.levenshtein(string_a, string_b) 253 | 254 | if 0 < levenshtein_distance < levenshtein_threshold: 255 | network.add_node(string_a, 256 | label=string_a.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 257 | color='orange', 258 | style='filled', 259 | fillcolor='white', 260 | font_color='black', 261 | fontname='Arial', 262 | penwidth=2, 263 | bipartite=0) 264 | 265 | network.add_node(string_b, 266 | label=string_b.decode('utf-8').encode('ascii', 'xmlcharrefreplace')[:32], 267 | style='filled', 268 | fillcolor='white', 269 | color='orange', 270 | 
font_color='black', 271 | fontname='Arial', 272 | penwidth=2, 273 | bipartite=1) 274 | 275 | network.add_edge(string_b, 276 | string_a, 277 | penwidth=2, 278 | color='#0096D6', 279 | dir='none') 280 | try: 281 | write_dot(network, levenshtein_file + '.dot') 282 | print('[+] Created ' + levenshtein_file + '.dot.') 283 | 284 | except: 285 | print('[!] Error creating DOT file.') 286 | 287 | if parsed_args.output == 'png': 288 | try: 289 | subprocess.Popen(['sfdp', 290 | levenshtein_file + '.dot', 291 | '-Tpng', 292 | '-o', 293 | levenshtein_file + '.png', 294 | '-Goverlap=False', 295 | '-Goutputorder="edgesfirst"']) 296 | 297 | print('[+] Exported link chart as ' + levenshtein_file + '.png.') 298 | 299 | except: 300 | print('[!] Error saving link chart. Check that Graphviz is added to PATH.') 301 | 302 | if parsed_args.output == 'svg': 303 | try: 304 | subprocess.Popen(['sfdp', 305 | levenshtein_file + '.dot', 306 | '-Tsvg', 307 | '-o', 308 | levenshtein_file + '.svg', 309 | '-Goverlap=False', 310 | '-Goutputorder="edgesfirst"']) 311 | 312 | print('[+] Exported link chart as ' + levenshtein_file + '.svg.') 313 | 314 | except: 315 | print('[!] Error saving link chart. 
Check that Graphviz is added to PATH.') 316 | 317 | return 318 | 319 | strings_a,strings_b = parse_text() 320 | 321 | if (parsed_args.metric == 'jaccard') or (parsed_args.metric == 'J'): 322 | calculate_jaccard(strings_a,strings_b) 323 | 324 | if (parsed_args.metric == 'hamming') or (parsed_args.metric == 'H'): 325 | calculate_hamming(strings_a,strings_b) 326 | 327 | if (parsed_args.metric == 'levenshtein') or (parsed_args.metric == 'L'): 328 | calculate_levenshtein(strings_a,strings_b) 329 | 330 | if (parsed_args.metric == 'all') or (parsed_args.metric == 'A'): 331 | calculate_jaccard(strings_a,strings_b) 332 | calculate_hamming(strings_a,strings_b) 333 | calculate_levenshtein(strings_a,strings_b) 334 | 335 | exit(0) -------------------------------------------------------------------------------- /graph_similar_strings/images/graph_similar_strings_screenshot_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_strings/images/graph_similar_strings_screenshot_1.png -------------------------------------------------------------------------------- /graph_similar_strings/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cryptogramfan/Malware-Analysis-Scripts/7fc87591a0500ab74ba0dc9896d5e61efae93107/graph_similar_strings/requirements.txt --------------------------------------------------------------------------------