├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── iocp.py ├── output.py ├── patterns.ini ├── requirements.txt ├── whitelist.py └── whitelists ├── whitelist_CVE.ini ├── whitelist_Email.ini ├── whitelist_Filename.ini ├── whitelist_Filepath.ini ├── whitelist_Host.ini ├── whitelist_IP.ini ├── whitelist_MD5.ini ├── whitelist_Registry.ini ├── whitelist_SHA1.ini ├── whitelist_SHA256.ini └── whitelist_URL.ini /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 armbues 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ioc-parser 2 | IOC Parser is a tool to extract indicators of compromise from security reports in PDF, plain-text, or HTML format. A good collection of APT-related reports with many IOCs can be found here: [APTNotes](https://github.com/kbandla/APTnotes). 3 | 4 | ## Usage 5 | **iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] PATH** 6 | * *PATH* File/directory/URL to report(s) 7 | * *-p INI* Pattern file 8 | * *-i FORMAT* Input format (pdf/txt/html) 9 | * *-o FORMAT* Output format (csv/json/yara/netflow) 10 | * *-d* Deduplicate matches 11 | * *-l LIB* Parsing library 12 | 13 | ## Requirements 14 | One of the following PDF parsing libraries: 15 | * [PyPDF2](https://github.com/mstamy2/PyPDF2) - *pip install pypdf2* 16 | * [pdfminer](https://github.com/euske/pdfminer) - *pip install pdfminer* 17 | 18 | For HTML parsing support: 19 | * [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4* 20 | 21 | For HTTP(S) support: 22 | * [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/__init__.py -------------------------------------------------------------------------------- /iocp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################################### 4 | # 5 | # Copyright (c) 2015, Armin Buescher (armin.buescher@googlemail.com) 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of
this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | # 25 | ################################################################################################### 26 | # 27 | # File: iocp.py 28 | # Description: IOC Parser is a tool to extract indicators of compromise from security reports 29 | # in PDF format. 
30 | # Usage: iocp.py [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] PATH 31 | # Author: Armin Buescher (@armbues) 32 | # Contributors: Angelo Dell'Aera (@angelodellaera) 33 | # Thanks to: Jose Ramon Palanco 34 | # Koen Van Impe (@cudeso) 35 | # 36 | ################################################################################################### 37 | 38 | import os 39 | import sys 40 | import fnmatch 41 | import argparse 42 | import re 43 | from StringIO import StringIO 44 | try: 45 | import configparser as ConfigParser 46 | except ImportError: 47 | import ConfigParser 48 | 49 | # Import optional third-party libraries 50 | IMPORTS = [] 51 | try: 52 | from PyPDF2 import PdfFileReader 53 | IMPORTS.append('pypdf2') 54 | except ImportError: 55 | pass 56 | try: 57 | from pdfminer.pdfpage import PDFPage 58 | from pdfminer.pdfinterp import PDFResourceManager 59 | from pdfminer.converter import TextConverter 60 | from pdfminer.pdfinterp import PDFPageInterpreter 61 | from pdfminer.layout import LAParams 62 | IMPORTS.append('pdfminer') 63 | except ImportError: 64 | pass 65 | try: 66 | from bs4 import BeautifulSoup 67 | IMPORTS.append('beautifulsoup') 68 | except ImportError: 69 | pass 70 | try: 71 | import requests 72 | IMPORTS.append('requests') 73 | except ImportError: 74 | pass 75 | 76 | # Import additional project source files 77 | import output 78 | from whitelist import WhiteList 79 | 80 | class IOC_Parser(object): 81 | patterns = {} 82 | defang = {} 83 | 84 | 85 | def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='JSON', output_handler=None): 86 | basedir = os.path.dirname(os.path.abspath(__file__)) 87 | if patterns_ini is None: 88 | patterns_ini = os.path.join(basedir, 'patterns.ini') 89 | 90 | self.load_patterns(patterns_ini) 91 | self.whitelist = WhiteList(basedir) 92 | self.dedup = dedup 93 | if output_handler: 94 | self.handler = output_handler 95 | else: 96 | self.handler = output.getHandler(output_format) 97 | 98 |
self.ext_filter = "*." + input_format 99 | parser_format = "parse_" + input_format 100 | try: 101 | self.parser_func = getattr(self, parser_format) 102 | except AttributeError: 103 | e = 'Selected parser format is not supported: %s' % (input_format) 104 | raise NotImplementedError(e) 105 | 106 | self.library = library 107 | if input_format == 'pdf': 108 | if library not in IMPORTS: 109 | e = 'Selected PDF parser library not found: %s' % (library) 110 | raise ImportError(e) 111 | elif input_format == 'html': 112 | if 'beautifulsoup' not in IMPORTS: 113 | e = 'HTML parser library not found: BeautifulSoup' 114 | raise ImportError(e) 115 | 116 | 117 | def load_patterns(self, fpath): 118 | config = ConfigParser.ConfigParser() 119 | with open(fpath) as f: 120 | config.readfp(f) 121 | 122 | for ind_type in config.sections(): 123 | try: 124 | ind_pattern = config.get(ind_type, 'pattern') 125 | except ConfigParser.NoOptionError: 126 | continue 127 | 128 | if ind_pattern: 129 | ind_regex = re.compile(ind_pattern) 130 | self.patterns[ind_type] = ind_regex 131 | 132 | try: 133 | ind_defang = config.get(ind_type, 'defang') 134 | except ConfigParser.NoOptionError: 135 | continue 136 | 137 | if ind_defang: 138 | self.defang[ind_type] = True 139 | 140 | def is_whitelisted(self, ind_match, ind_type): 141 | try: 142 | for w in self.whitelist[ind_type]: 143 | if w.findall(ind_match): 144 | return True 145 | except KeyError: 146 | pass 147 | return False 148 | 149 | ################################################################################ 150 | ################################################################################ 151 | def parse_page(self, fpath, data, page_num): 152 | ''' Parse page for IOCs. ''' 153 | iocs = [] 154 | for ind_type, ind_regex in self.patterns.items(): 155 | matches = ind_regex.findall(data) 156 | 157 | # For each matched indicator...
158 | for ind_match in matches: 159 | if isinstance(ind_match, tuple): 160 | ind_match = ind_match[0] 161 | 162 | if self.is_whitelisted(ind_match, ind_type): 163 | continue 164 | 165 | if ind_type in self.defang: 166 | ind_match = re.sub(r'\[\.\]', '.', ind_match) 167 | 168 | if self.dedup: 169 | if (ind_type, ind_match) in self.dedup_store: 170 | continue 171 | 172 | self.dedup_store.add((ind_type, ind_match)) 173 | 174 | iocs.append(self.handler.return_match(fpath, page_num, ind_type, ind_match)) # handler must provide return_match 175 | 176 | return iocs 177 | 178 | ################################################################################ 179 | ################################################################################ 180 | 181 | def parse_pdf_pypdf2(self, f, fpath): 182 | text = "" 183 | iocs = [] # must be a list; iocs.extend() is called below 184 | try: 185 | pdf = PdfFileReader(f, strict = False) 186 | 187 | if pdf.isEncrypted: 188 | pdf.decrypt('') 189 | 190 | if self.dedup: 191 | self.dedup_store = set() 192 | 193 | self.handler.print_header(fpath) 194 | page_num = 0 195 | for page in pdf.pages: 196 | page_num += 1 197 | 198 | data = page.extractText() 199 | 200 | # Parse IOCs 201 | temp_iocs = self.parse_page(fpath, data, page_num) 202 | 203 | # Add IOCs to collection 204 | iocs.extend(temp_iocs) 205 | 206 | # Add new page 207 | text += data 208 | 209 | self.handler.print_footer(fpath) 210 | except (KeyboardInterrupt, SystemExit): 211 | raise 212 | except Exception as e: 213 | self.handler.print_error(fpath, e) 214 | return text, iocs 215 | 216 | def parse_pdf_pdfminer(self, f, fpath): 217 | text = "" 218 | iocs = [] 219 | try: 220 | laparams = LAParams() 221 | laparams.all_texts = True 222 | rsrcmgr = PDFResourceManager() 223 | pagenos = set() 224 | 225 | if self.dedup: 226 | self.dedup_store = set() 227 | 228 | self.handler.print_header(fpath) 229 | page_num = 0 230 | for page in PDFPage.get_pages(f, pagenos, check_extractable=True): 231 | page_num += 1 232 | 233 | retstr = StringIO() 234
| device = TextConverter(rsrcmgr, retstr, laparams=laparams) 235 | interpreter = PDFPageInterpreter(rsrcmgr, device) 236 | interpreter.process_page(page) 237 | data = retstr.getvalue() 238 | retstr.close() 239 | 240 | # Parse IOCs 241 | temp_iocs = self.parse_page(fpath, data, page_num) 242 | 243 | # Add IOCs to collection 244 | iocs.extend(temp_iocs) 245 | 246 | # Add new page 247 | text += data 248 | 249 | self.handler.print_footer(fpath) 250 | except (KeyboardInterrupt, SystemExit): 251 | raise 252 | except Exception as e: 253 | self.handler.print_error(fpath, e) 254 | return text, iocs 255 | 256 | def parse_pdf(self, f, fpath): 257 | ''' Parse PDF. ''' 258 | print("[INFO][IOCParser][f:parse_pdf] Parsing PDF...") 259 | text = "" 260 | iocs = [] 261 | 262 | parser_format = "parse_pdf_" + self.library 263 | try: 264 | self.parser_func = getattr(self, parser_format) 265 | except AttributeError: 266 | e = 'Selected PDF parser library is not supported: %s' % (self.library) 267 | raise NotImplementedError(e) 268 | 269 | text, iocs = self.parser_func(f, fpath) 270 | return text, iocs 271 | ################################################################################ 272 | ################################################################################ 273 | 274 | def parse_txt(self, f, fpath): 275 | try: 276 | if self.dedup: 277 | self.dedup_store = set() 278 | 279 | data = f.read() 280 | self.handler.print_header(fpath) 281 | self.parse_page(fpath, data, 1) 282 | self.handler.print_footer(fpath) 283 | except (KeyboardInterrupt, SystemExit): 284 | raise 285 | except Exception as e: 286 | self.handler.print_error(fpath, e) 287 | 288 | def parse_html(self, f, fpath): 289 | try: 290 | if self.dedup: 291 | self.dedup_store = set() 292 | 293 | data = f.read() 294 | soup = BeautifulSoup(data, 'html.parser') 295 | html = soup.findAll(text=True) 296 | 297 | text = u'' 298 | for elem in html: 299 | if elem.parent.name in ['style', 'script', '[document]', 'head', 'title']: 300 |
continue 301 | elif re.match('<!--.*-->', unicode(elem)): # skip HTML comments 302 | continue 303 | else: 304 | text += unicode(elem) 305 | 306 | self.handler.print_header(fpath) 307 | self.parse_page(fpath, text, 1) 308 | self.handler.print_footer(fpath) 309 | except (KeyboardInterrupt, SystemExit): 310 | raise 311 | except Exception as e: 312 | self.handler.print_error(fpath, e) 313 | 314 | ################################################################################ 315 | def parse(self, path): 316 | ''' Main parse function. Selects parser by input type. ''' 317 | data = "" 318 | iocs = [] 319 | try: 320 | if path.startswith('http://') or path.startswith('https://'): 321 | if 'requests' not in IMPORTS: 322 | e = 'HTTP library not found: requests' 323 | raise ImportError(e) 324 | headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' } 325 | r = requests.get(path, headers=headers) 326 | r.raise_for_status() 327 | f = StringIO(r.content) 328 | self.parser_func(f, path) 329 | return 330 | ######################################################### 331 | elif os.path.isfile(path): # Parse file 332 | print("[INFO][IOCParser][f:parse] Parsing file...")
333 | with open(path, 'rb') as f: 334 | text, iocs = self.parse_pdf(f, path) # Assumes PDF input (local files bypass the -i parser selection) 335 | return text, iocs, path, os.path.basename(path) 336 | ######################################################### 337 | elif os.path.isdir(path): 338 | for walk_root, walk_dirs, walk_files in os.walk(path): 339 | for walk_file in fnmatch.filter(walk_files, self.ext_filter): 340 | fpath = os.path.join(walk_root, walk_file) 341 | with open(fpath, 'rb') as f: 342 | self.parser_func(f, fpath) 343 | return 344 | 345 | e = 'File path is not a file, directory or URL: %s' % (path) 346 | raise IOError(e) 347 | except (KeyboardInterrupt, SystemExit): 348 | raise 349 | except Exception as e: 350 | self.handler.print_error(path, e) 351 | ################################################################################ 352 | 353 | if __name__ == "__main__": 354 | argparser = argparse.ArgumentParser() 355 | argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)') 356 | argparser.add_argument('-p', dest='INI', default=None, help='Pattern file') 357 | argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)') 358 | argparser.add__argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/json/yara/netflow)') 359 | argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') 360 | argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') 361 | args = argparser.parse_args() 362 | 363 | parser = IOC_Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT) 364 | parser.parse(args.PATH) 365 | -------------------------------------------------------------------------------- /output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import json 5 | 6 | OUTPUT_FORMATS = ('csv', 'json', 'yara', 'netflow', ) 7 | 8 | def
getHandler(output_format): 9 | output_format = output_format.lower() 10 | if output_format not in OUTPUT_FORMATS: 11 | print("[WARNING] Invalid output format specified; using CSV") 12 | output_format = 'csv' 13 | 14 | handler_format = "OutputHandler_" + output_format 15 | handler_class = getattr(sys.modules[__name__], handler_format) 16 | 17 | return handler_class() 18 | 19 | class OutputHandler(object): 20 | def print_match(self, fpath, page, name, match): 21 | pass 22 | 23 | def print_header(self, fpath): 24 | pass 25 | 26 | def print_footer(self, fpath): 27 | pass 28 | 29 | def print_error(self, fpath, exception): 30 | print("[ERROR] %s" % (exception)) 31 | 32 | class OutputHandler_csv(OutputHandler): 33 | def __init__(self): 34 | self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') 35 | 36 | def print_match(self, fpath, page, name, match): 37 | self.csv_writer.writerow((fpath, page, name, match)) 38 | 39 | def print_error(self, fpath, exception): 40 | self.csv_writer.writerow((fpath, '0', 'error', exception)) 41 | 42 | class OutputHandler_json(OutputHandler): 43 | def print_match(self, fpath, page, name, match): 44 | data = { 45 | 'path' : fpath, 46 | 'file' : os.path.basename(fpath), 47 | 'page' : page, 48 | 'type' : name, 49 | 'match': match 50 | } 51 | 52 | print(json.dumps(data)) 53 | 54 | def print_error(self, fpath, exception): 55 | data = { 56 | 'path' : fpath, 57 | 'file' : os.path.basename(fpath), 58 | 'type' : 'error', 59 | 'exception' : str(exception) 60 | } 61 | 62 | print(json.dumps(data)) 63 | 64 | class OutputHandler_yara(OutputHandler): 65 | def __init__(self): 66 | self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256)) 67 | 68 | def print_match(self, fpath, page, name, match): 69 | if name in self.cnt: 70 | self.cnt[name] += 1 71 | else: 72 | self.cnt[name] = 1 73 | 74 | string_id = "$%s%d" % (name, self.cnt[name]) 75 | self.sids.append(string_id) 76 | string_value =
match.replace('\\', '\\\\') 77 | print("\t\t%s = \"%s\"" % (string_id, string_value)) 78 | 79 | def print_header(self, fpath): 80 | rule_name = os.path.splitext(os.path.basename(fpath))[0].translate(self.rule_enc) 81 | 82 | print("rule %s" % (rule_name)) 83 | print("{") 84 | print("\tstrings:") 85 | 86 | self.cnt = {} 87 | self.sids = [] 88 | 89 | def print_footer(self, fpath): 90 | cond = ' or '.join(self.sids) 91 | 92 | print("\tcondition:") 93 | print("\t\t" + cond) 94 | print("}") 95 | 96 | class OutputHandler_netflow(OutputHandler): 97 | def __init__(self): 98 | print("host 255.255.255.255") 99 | 100 | def print_match(self, fpath, page, name, match): 101 | data = { 102 | 'type' : name, 103 | 'match': match 104 | } 105 | if data["type"] == "IP": 106 | print(" or host %s " % data["match"]) 107 | -------------------------------------------------------------------------------- /patterns.ini: -------------------------------------------------------------------------------- 1 | [URL] 2 | pattern: \b([a-z]{3,}\:\/\/[\S]{16,})\b 3 | defang: True 4 | 5 | [Host] 6 | pattern:
\b(([a-z0-9\-]{2,}\[?\.\]?)+(abogado|ac|academy|accountants|active|actor|ad|adult|ae|aero|af|ag|agency|ai|airforce|al|allfinanz|alsace|am|amsterdam|an|android|ao|aq|aquarelle|ar|archi|army|arpa|as|asia|associates|at|attorney|au|auction|audio|autos|aw|ax|axa|az|ba|band|bank|bar|barclaycard|barclays|bargains|bayern|bb|bd|be|beer|berlin|best|bf|bg|bh|bi|bid|bike|bingo|bio|biz|bj|black|blackfriday|bloomberg|blue|bm|bmw|bn|bnpparibas|bo|boo|boutique|br|brussels|bs|bt|budapest|build|builders|business|buzz|bv|bw|by|bz|bzh|ca|cal|camera|camp|cancerresearch|canon|capetown|capital|caravan|cards|care|career|careers|cartier|casa|cash|cat|catering|cc|cd|center|ceo|cern|cf|cg|ch|channel|chat|cheap|christmas|chrome|church|ci|citic|city|ck|cl|claims|cleaning|click|clinic|clothing|club|cm|cn|co|coach|codes|coffee|college|cologne|com|community|company|computer|condos|construction|consulting|contractors|cooking|cool|coop|country|cr|credit|creditcard|cricket|crs|cruises|cu|cuisinella|cv|cw|cx|cy|cymru|cz|dabur|dad|dance|dating|day|dclk|de|deals|degree|delivery|democrat|dental|dentist|desi|design|dev|diamonds|diet|digital|direct|directory|discount|dj|dk|dm|dnp|do|docs|domains|doosan|durban|dvag|dz|eat|ec|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|equipment|er|es|esq|estate|et|eu|eurovision|eus|events|everbank|exchange|expert|exposed|fail|farm|fashion|feedback|fi|finance|financial|firmdale|fish|fishing|fit|fitness|fj|fk|flights|florist|flowers|flsmidth|fly|fm|fo|foo|forsale|foundation|fr|frl|frogans|fund|furniture|futbol|ga|gal|gallery|garden|gb|gbiz|gd|ge|gent|gf|gg|ggee|gh|gi|gift|gifts|gives|gl|glass|gle|global|globo|gm|gmail|gmo|gmx|gn|goog|google|gop|gov|gp|gq|gr|graphics|gratis|green|gripe|gs|gt|gu|guide|guitars|guru|gw|gy|hamburg|hangout|haus|healthcare|help|here|hermes|hiphop|hiv|hk|hm|hn|holdings|holiday|homes|horse|host|hosting|house|how|hr|ht|hu|ibm|id|ie|ifm|il|im|immo|immobilien|in|industries|info|ing|ink|institute|insure|int|international|inves
tments|io|iq|ir|irish|is|it|iwc|jcb|je|jetzt|jm|jo|jobs|joburg|jp|juegos|kaufen|kddi|ke|kg|kh|ki|kim|kitchen|kiwi|km|kn|koeln|kp|kr|krd|kred|kw|ky|kyoto|kz|la|lacaixa|land|lat|latrobe|lawyer|lb|lc|lds|lease|legal|lgbt|li|lidl|life|lighting|limited|limo|link|lk|loans|london|lotte|lotto|lr|ls|lt|ltda|lu|luxe|luxury|lv|ly|ma|madrid|maison|management|mango|market|marketing|marriott|mc|md|me|media|meet|melbourne|meme|memorial|menu|mg|mh|miami|mil|mini|mk|ml|mm|mn|mo|mobi|moda|moe|monash|money|mormon|mortgage|moscow|motorcycles|mov|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|nagoya|name|navy|nc|ne|net|network|neustar|new|nexus|nf|ng|ngo|nhk|ni|ninja|nl|no|np|nr|nra|nrw|ntt|nu|nyc|nz|okinawa|om|one|ong|onl|ooo|org|organic|osaka|otsuka|ovh|pa|paris|partners|parts|party|pe|pf|pg|ph|pharmacy|photo|photography|photos|physio|pics|pictures|pink|pizza|pk|pl|place|plumbing|pm|pn|pohl|poker|porn|post|pr|praxi|press|pro|prod|productions|prof|properties|property|ps|pt|pub|pw|qa|qpon|quebec|re|realtor|recipes|red|rehab|reise|reisen|reit|ren|rentals|repair|report|republican|rest|restaurant|reviews|rich|rio|rip|ro|rocks|rodeo|rs|rsvp|ru|ruhr|rw|ryukyu|sa|saarland|sale|samsung|sarl|sb|sc|sca|scb|schmidt|schule|schwarz|science|scot|sd|se|services|sew|sexy|sg|sh|shiksha|shoes|shriram|si|singles|sj|sk|sky|sl|sm|sn|so|social|software|sohu|solar|solutions|soy|space|spiegel|sr|st|style|su|supplies|supply|support|surf|surgery|suzuki|sv|sx|sy|sydney|systems|sz|taipei|tatar|tattoo|tax|tc|td|technology|tel|temasek|tennis|tf|tg|th|tienda|tips|tires|tirol|tj|tk|tl|tm|tn|to|today|tokyo|tools|top|toshiba|town|toys|tp|tr|trade|training|travel|trust|tt|tui|tv|tw|tz|ua|ug|uk|university|uno|uol|us|uy|uz|va|vacations|vc|ve|vegas|ventures|versicherung|vet|vg|vi|viajes|video|villas|vision|vlaanderen|vn|vodka|vote|voting|voto|voyage|vu|wales|wang|watch|webcam|website|wed|wedding|wf|whoswho|wien|wiki|williamhill|wme|work|works|world|ws|wtc|wtf|xyz|yachts|yandex|ye|yoga|yokohama|youtube|yt|za|zm|zone|zuerich|zw)
)\b 7 | defang: True 8 | 9 | [IP] 10 | pattern: \b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b 11 | 12 | [Email] 13 | pattern: \b([a-z][_a-z0-9-.]+@[a-z0-9-]+\.[a-z]+)\b 14 | 15 | [MD5] 16 | pattern: \b([a-f0-9]{32}|[A-F0-9]{32})\b 17 | 18 | [SHA1] 19 | pattern: \b([a-f0-9]{40}|[A-F0-9]{40})\b 20 | 21 | [SHA256] 22 | pattern: \b([a-f0-9]{64}|[A-F0-9]{64})\b 23 | 24 | [CVE] 25 | pattern: \b(CVE\-[0-9]{4}\-[0-9]{4,6})\b 26 | 27 | [Registry] 28 | pattern: \b((HKLM|HKCU)\\[\\A-Za-z0-9-_]+)\b 29 | 30 | [Filename] 31 | pattern: \b([A-Za-z0-9-_\.]+\.(exe|dll|bat|sys|htm|html|js|jar|jpg|png|vb|scr|pif|chm|zip|rar|cab|pdf|doc|docx|ppt|pptx|xls|xlsx|swf|gif))\b 32 | 33 | [Filepath] 34 | pattern: \b[A-Z]:\\[A-Za-z0-9-_\.\\]+\b 35 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.4.0 2 | pdfminer>=20140328 3 | PyPDF2>=1.25.1 4 | requests>=2.7.0 5 | -------------------------------------------------------------------------------- /whitelist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import re 4 | 5 | class WhiteList(dict): 6 | def __init__(self, basedir): 7 | searchdir = os.path.join(basedir, "whitelists/whitelist_*.ini") 8 | fpaths = glob.glob(searchdir) 9 | for fpath in fpaths: 10 | t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1] 11 | patterns = [line.strip() for line in open(fpath) if line.strip()] # skip blank lines: re.compile('') matches everything 12 | self[t] = [re.compile(p) for p in patterns] -------------------------------------------------------------------------------- /whitelists/whitelist_CVE.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_CVE.ini --------------------------------------------------------------------------------
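The `WhiteList` class above maps an indicator type to a list of compiled regexes loaded from `whitelists/whitelist_<type>.ini`, and `IOC_Parser.is_whitelisted` suppresses any extracted match that one of those regexes hits. A minimal Python 3 sketch of the same filtering logic (the `whitelist` table below is illustrative, not the shipped lists):

```python
import re

# Illustrative stand-ins for the compiled entries WhiteList builds
# from whitelists/whitelist_<type>.ini files.
whitelist = {
    'Host': [re.compile(r'google\.com$'), re.compile(r'microsoft\.com$')],
    'IP': [re.compile(r'^127\.0\.0\.1')],
}

def is_whitelisted(match, ind_type):
    # Mirrors IOC_Parser.is_whitelisted: any whitelist regex hit suppresses the match.
    return any(w.findall(match) for w in whitelist.get(ind_type, []))

print(is_whitelisted('www.google.com', 'Host'))   # True
print(is_whitelisted('10.0.0.1', 'IP'))           # False (only 127.0.0.1 is listed)
# Host patterns like google\.com$ are unanchored at the front, so
# lookalike domains ending in "google.com" are also suppressed:
print(is_whitelisted('evil-google.com', 'Host'))  # True
```

Because entries are compiled verbatim, a blank line in a whitelist file would compile to a match-everything pattern, which is why empty lines must be skipped when loading.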
/whitelists/whitelist_Email.ini: -------------------------------------------------------------------------------- 1 | @fireeye.com 2 | @crowdstrike.com 3 | @f-secure.com 4 | @kaspersky.com 5 | @gdata.de 6 | @cylance.com -------------------------------------------------------------------------------- /whitelists/whitelist_Filename.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_Filename.ini -------------------------------------------------------------------------------- /whitelists/whitelist_Filepath.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_Filepath.ini -------------------------------------------------------------------------------- /whitelists/whitelist_Host.ini: -------------------------------------------------------------------------------- 1 | acm\.org$ 2 | adobe\.com$ 3 | ahnlab\.com$ 4 | alienvault\.com$ 5 | amazon\.com$ 6 | android\.com$ 7 | aol\.com$ 8 | arbornetworks\.com$ 9 | arstechnica\.com$ 10 | avg\.com$ 11 | bbc\.co\.uk$ 12 | bing\.com$ 13 | bitdefender\.com$ 14 | bloomberg\.com$ 15 | bluecoat\.com$ 16 | cassidiancybersecurity\.com$ 17 | cbsnews\.com$ 18 | cia\.gov$ 19 | cisco\.com$ 20 | citizenlab\.org$ 21 | clean-mx\.de$ 22 | cnn\.com$ 23 | comodo\.com$ 24 | contagiodump\.blogspot\.com$ 25 | contextis\.com$ 26 | coresecurity\.com$ 27 | crowdstrike\.com$ 28 | crysys\.hu$ 29 | cve\.mitre\.org$ 30 | cylance\.com$ 31 | dailymail\.co\.uk$ 32 | damballa\.com$ 33 | darkreading\.com$ 34 | ddanchev\.blogspot\.com$ 35 | defense\.gov$ 36 | dell\.com$ 37 | domaintools\.com$ 38 | dropbox\.com$ 39 | eff\.org$ 40 | emergingthreats\.net$ 41 | eset\.com$ 42 | eset\.sk$ 43 | events\.ccc\.de$ 44 | exploit-db\.com$ 45 | f-secure\.com$ 46 | facebook\.com$ 
47 | fbi\.gov$ 48 | fidelissecurity\.com$ 49 | fireeye\.com$ 50 | forbes\.com$ 51 | fortinet\.com$ 52 | gdata\.de$ 53 | gdatasoftware\.com$ 54 | github\.com$ 55 | gmail\.com$ 56 | gmx\.com$ 57 | gmx\.de$ 58 | google\.com$ 59 | googlemail\.com$ 60 | googleonlinesecurity\.blogspot\.com$ 61 | hbgary\.com$ 62 | heise\.de$ 63 | hex-rays\.com$ 64 | hotmail\.com$ 65 | huffingtonpost\.com$ 66 | iana\.org$ 67 | ibtimes\.com$ 68 | ietf\.org$ 69 | inbox\.com$ 70 | informationweek\.com$ 71 | invincea\.com$ 72 | isc\.org$ 73 | isightpartners\.com$ 74 | java\.net$ 75 | kaspersky\.com$ 76 | krebsonsecurity\.com$ 77 | lastline\.com$ 78 | lemonde\.fr$ 79 | linkedin\.com$ 80 | live\.com$ 81 | malware\.dontneedcoffee\.com$ 82 | malware\.lu$ 83 | malwaredomainlist\.com$ 84 | mandiant\.com$ 85 | mcafee\.com$ 86 | metasploit\.com$ 87 | microsoft\.com$ 88 | mozilla\.org$ 89 | msn\.com$ 90 | norman\.com$ 91 | norman\.no$ 92 | normanshark\.com$ 93 | nytimes\.com$ 94 | outlook\.com$ 95 | paloaltonetworks\.com$ 96 | paypal\.com$ 97 | pinterest\.com$ 98 | pwc\.com$ 99 | qualys\.com$ 100 | rapid7\.com$ 101 | reuters\.com$ 102 | rsa\.com$ 103 | sans\.org$ 104 | secunia\.com$ 105 | securelist\.com$ 106 | secureworks\.com$ 107 | shadowserver\.org$ 108 | snort\.org$ 109 | sophos\.com$ 110 | spiderlabs\.com$ 111 | spiegel\.de$ 112 | symantec\.com$ 113 | technet\.com$ 114 | telegraph\.co\.uk$ 115 | telussecuritylabs\.com$ 116 | theguardian\.com$ 117 | theregister\.co\.uk$ 118 | thetimes\.co\.uk$ 119 | threatconnect\.com$ 120 | threatexpert\.com$ 121 | threatgeek\.com$ 122 | threatpost\.com$ 123 | time\.com$ 124 | trendmicro\.com$ 125 | twitter\.com$ 126 | us-cert\.gov$ 127 | usenix\.org$ 128 | verisign\.com$ 129 | virusbtn\.com$ 130 | virustotal\.com$ 131 | washingtonpost\.com$ 132 | washingtontimes\.com$ 133 | wikipedia\.org$ 134 | windows\.com$ 135 | windowsupdate\.com$ 136 | wired\.com$ 137 | wsj\.com$ 138 | www\.w3\.org$ 139 | yahoo\.com$ 140 | yandex\.ru$ 141 | youtube\.com$ 142 | zdnet\.com$ 
143 | zscaler\.com$ -------------------------------------------------------------------------------- /whitelists/whitelist_IP.ini: -------------------------------------------------------------------------------- 1 | 0[1-9] 2 | ^0\. 3 | \.0$ 4 | ^127\.0\.0\.1 5 | ^192\.168\. 6 | ^10\. 7 | ^8\.8\.8\.8 -------------------------------------------------------------------------------- /whitelists/whitelist_MD5.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_MD5.ini -------------------------------------------------------------------------------- /whitelists/whitelist_Registry.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_Registry.ini -------------------------------------------------------------------------------- /whitelists/whitelist_SHA1.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_SHA1.ini -------------------------------------------------------------------------------- /whitelists/whitelist_SHA256.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/threatminer/ioc_parser/c0ab39001580b48e87ad6fe4875a9e33f7cbee89/whitelists/whitelist_SHA256.ini -------------------------------------------------------------------------------- /whitelists/whitelist_URL.ini: -------------------------------------------------------------------------------- 1 | ^http:\/\/about-threats.trendmicro.com\/ 2 | ^http:\/\/blog.crowdstrike.com\/ 3 | ^http:\/\/blog.crysys.hu\/ 4 | ^http:\/\/blog.cylance.com\/ 5 | ^http:\/\/blog.eset.com\/ 6 | 
^http:\/\/blog.kaspersky.com\/ 7 | ^http:\/\/blog.stopbadware.org\/ 8 | ^http:\/\/blog.trendmicro.com\/ 9 | ^http:\/\/blog.trendmicro.com\/ 10 | ^http:\/\/blogs.cisco.com\/ 11 | ^http:\/\/blogs.mcafee.com\/ 12 | ^http:\/\/blogs.norman.com\/ 13 | ^http:\/\/blogs.sans.org\/ 14 | ^http:\/\/blogs.technet.com\/ 15 | ^http:\/\/citizenlab.org\/ 16 | ^http:\/\/contagiodump.blogspot.com\/ 17 | ^http:\/\/ddanchev.blogspot.com\/ 18 | ^http:\/\/download01.norman.no\/ 19 | ^http:\/\/en.wikipedia.org\/wiki\/ 20 | ^http:\/\/events.ccc.de\/ 21 | ^http:\/\/isc.sans.edu\/ 22 | ^http:\/\/isc.sans.org\/ 23 | ^http:\/\/krebsonsecurity.com\/ 24 | ^http:\/\/labs.alienvault.com\/ 25 | ^http:\/\/labs.bitdefender.com\/ 26 | ^http:\/\/lists.clean-mx.com\/ 27 | ^http:\/\/msdn.microsoft.com\/ 28 | ^http:\/\/msdn.microsoft.com\/ 29 | ^http:\/\/newsroom.mcafee.com\/ 30 | ^http:\/\/normanshark.com\/ 31 | ^http:\/\/siblog.mcafee.com\/ 32 | ^http:\/\/support.clean-mx.de\/ 33 | ^http:\/\/support.microsoft.com\/kb\/ 34 | ^http:\/\/symantec.com\/ 35 | ^http:\/\/technet.microsoft.com\/ 36 | ^http:\/\/threatexpert.com\/ 37 | ^http:\/\/threatpost.com\/ 38 | ^http:\/\/tools.cisco.com\/security\/ 39 | ^http:\/\/vrt-blog.snort.org\/ 40 | ^http:\/\/whois.domaintools.com\/ 41 | ^http:\/\/www.adobe.com\/support\/security\/ 42 | ^http:\/\/www.blackhat.com\/presentations\/ 43 | ^http:\/\/www.citizenlab.org\/ 44 | ^http:\/\/www.crowdstrike.com\/blog\/ 45 | ^http:\/\/www.cve.mitre.org\/ 46 | ^http:\/\/www.damballa.com\/ 47 | ^http:\/\/www.domaintools.com\/ 48 | ^http:\/\/www.eff.org\/document\/ 49 | ^http:\/\/www.exploit-db.com\/exploits\/ 50 | ^http:\/\/www.f-secure.com\/ 51 | ^http:\/\/www.f-secure.com\/weblog\/ 52 | ^http:\/\/www.fireeye.com\/ 53 | ^http:\/\/www.gdata.de\/ 54 | ^http:\/\/www.ietf.org\/ 55 | ^http:\/\/www.mandiant.com\/ 56 | ^http:\/\/www.mcafee.com\/ 57 | ^http:\/\/www.microsoft.com\/technet\/security\/ 58 | ^http:\/\/www.securelist.com\/ 59 | ^http:\/\/www.secureworks.com\/ 60 | 
^http:\/\/www.secureworks.com\/research\/ 61 | ^http:\/\/www.shadowserver.org\/ 62 | ^http:\/\/www.sophos.com\/ 63 | ^http:\/\/www.symantec.com\/ 64 | ^http:\/\/www.threatconnect.com\/ 65 | ^http:\/\/www.threatexpert.com\/ 66 | ^http:\/\/www.trendmicro.com\/ 67 | ^http:\/\/www.virusbtn.com\/pdf\/ 68 | ^http:\/\/www.w3.org\/ 69 | ^http:\/\/www.welivesecurity.com\/ 70 | ^https?://blog.gdatasoftware.com\/ 71 | ^https?://citizenlab.org\/ 72 | ^https?://cve.mitre.org\/ 73 | ^https?://securelist.com\/ 74 | ^https?://www.eff.org\/ 75 | ^https?://www.virustotal.com\/ 76 | ^https?:\/\/blog.fireeye.com\/ 77 | ^https?:\/\/blogs.rsa.com\/ 78 | ^https?:\/\/nakedsecurity.sophos.com\/ 79 | ^https?:\/\/www.usenix.org\/ --------------------------------------------------------------------------------
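Taken together, patterns.ini supplies the per-type regexes and `parse_page` walks them over each page of extracted text, unwrapping tuple results from grouped patterns and re-fanging defanged indicators. A self-contained Python 3 sketch of that extraction loop, using simplified stand-ins for the patterns.ini entries (the real Host pattern enumerates every TLD; this one accepts only com/net/org):

```python
import re

# Simplified analogues of patterns.ini entries.
patterns = {
    'IP': re.compile(r'\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b'),
    'MD5': re.compile(r'\b([a-f0-9]{32}|[A-F0-9]{32})\b'),
    'Host': re.compile(r'\b(([a-z0-9\-]{2,}\[?\.\]?)+(com|net|org))\b'),
}
defang = {'Host'}  # types whose matches may be written defanged, e.g. evil[.]com

def extract_iocs(text):
    # Mirrors parse_page: findall, unwrap tuples from grouped patterns,
    # then strip the [.] defanging so the indicator is usable again.
    iocs = []
    for ind_type, regex in patterns.items():
        for m in regex.findall(text):
            if isinstance(m, tuple):
                m = m[0]  # grouped patterns return one tuple per match
            if ind_type in defang:
                m = re.sub(r'\[\.\]', '.', m)  # evil[.]com -> evil.com
            iocs.append((ind_type, m))
    return iocs

report = "C2 at evil[.]com and 203.0.113.7, dropper d41d8cd98f00b204e9800998ecf8427e"
print(extract_iocs(report))
```

In the real tool this list is additionally filtered through the whitelists and, with `-d`, deduplicated via `dedup_store` before being handed to the output handler.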