├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── __init__.py ├── bin └── iocp ├── iocp ├── Output.py ├── Parser.py ├── Whitelist.py ├── __init__.py └── data │ ├── patterns.ini │ └── whitelists │ ├── whitelist_CVE.ini │ ├── whitelist_Email.ini │ ├── whitelist_Filename.ini │ ├── whitelist_Filepath.ini │ ├── whitelist_Host.ini │ ├── whitelist_IP.ini │ ├── whitelist_MD5.ini │ ├── whitelist_Registry.ini │ ├── whitelist_SHA1.ini │ ├── whitelist_SHA256.ini │ └── whitelist_URL.ini ├── requirements.txt ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 armbues 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include iocp/data * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ioc-parser 2 | IOC Parser is a tool to extract indicators of compromise from security reports in PDF format. A good collection of APT related reports with many IOCs can be found here: [APTNotes](https://github.com/kbandla/APTnotes). 3 | 4 | ## Usage 5 | **iocp [-h] [-p INI] [-i FORMAT] [-o FORMAT] [-d] [-l LIB] FILE** 6 | * *FILE* File/directory path to report(s) 7 | * *-p INI* Pattern file 8 | * *-i FORMAT* Input format (pdf/txt/html) 9 | * *-o FORMAT* Output format (csv/json/yara) 10 | * *-d* Deduplicate matches 11 | * *-l LIB* Parsing library 12 | 13 | ## Installation 14 | **pip install ioc_parser** 15 | 16 | ## Requirements 17 | One of the following PDF parsing libraries: 18 | * [PyPDF2](https://github.com/mstamy2/PyPDF2) - *pip install pypdf2* 19 | * [pdfminer](https://github.com/euske/pdfminer) - *pip install pdfminer* 20 | 21 | For HTML parsing support: 22 | * [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/) - *pip install beautifulsoup4* 23 | 24 | For HTTP(S) support: 25 | * [requests](http://docs.python-requests.org/en/latest/) - *pip install requests* 26 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/__init__.py -------------------------------------------------------------------------------- /bin/iocp: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################################### 4 | # 5 | # Copyright (c) 2015, Armin Buescher (armin.buescher@googlemail.com) 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | # 25 | ################################################################################################### 26 | # 27 | # File: iocp.py 28 | # Description: IOC Parser is a tool to extract indicators of compromise from security reports 29 | # in PDF format. 30 | # Usage: iocp.py [-h] [-p INI] [-f FORMAT] PDF 31 | # Author: Armin Buescher (@armbues) 32 | # Contributors: Angelo Dell'Aera (@angelodellaera) 33 | # Thanks to: Jose Ramon Palanco 34 | # Koen Van Impe (@cudeso) 35 | # 36 | ################################################################################################### 37 | 38 | import argparse 39 | 40 | from iocp import Parser 41 | 42 | if __name__ == "__main__": 43 | argparser = argparse.ArgumentParser() 44 | argparser.add_argument('PATH', action='store', help='File/directory/URL to report(s)') 45 | argparser.add_argument('-p', dest='INI', default=None, help='Pattern file') 46 | argparser.add_argument('-i', dest='INPUT_FORMAT', default='pdf', help='Input format (pdf/txt/html)') 47 | argparser.add_argument('-o', dest='OUTPUT_FORMAT', default='csv', help='Output format (csv/tsv/json/yara/netflow)') 48 | argparser.add_argument('-d', dest='DEDUP', action='store_true', default=False, help='Deduplicate matches') 49 | argparser.add_argument('-l', dest='LIB', default='pdfminer', help='PDF parsing library (pypdf2/pdfminer)') 50 | args = argparser.parse_args() 51 | 52 | parser = Parser.Parser(args.INI, args.INPUT_FORMAT, args.DEDUP, args.LIB, args.OUTPUT_FORMAT) 53 | parser.parse(args.PATH) -------------------------------------------------------------------------------- /iocp/Output.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import csv 6 | import json 7 | 8 | OUTPUT_FORMATS = ('csv', 'tsv', 'json', 'yara', 'netflow', ) 9 | 10 | def getHandler(output_format): 11 | output_format = output_format.lower() 12 | if output_format not in OUTPUT_FORMATS: 13 | print("[WARNING] Invalid output format specified... using CSV") 14 | output_format = 'csv' 15 | 16 | handler_format = "OutputHandler_" + output_format 17 | handler_class = getattr(sys.modules[__name__], handler_format) 18 | 19 | return handler_class() 20 | 21 | class OutputHandler(object): 22 | def print_match(self, fpath, page, name, match): 23 | pass 24 | 25 | def print_header(self, fpath): 26 | pass 27 | 28 | def print_footer(self, fpath): 29 | pass 30 | 31 | def print_error(self, fpath, exception): 32 | print("[ERROR] %s" % (exception)) 33 | 34 | class OutputHandler_csv(OutputHandler): 35 | def __init__(self): 36 | self.csv_writer = csv.writer(sys.stdout) 37 | 38 | def print_match(self, fpath, page, name, match): 39 | self.csv_writer.writerow((fpath, page, name, match)) 40 | 41 | def print_error(self, fpath, exception): 42 | self.csv_writer.writerow((fpath, '0', 'error', exception)) 43 | 44 | class OutputHandler_tsv(OutputHandler): 45 | def __init__(self): 46 | self.csv_writer = csv.writer(sys.stdout, delimiter = '\t') 47 | 48 | def print_match(self, fpath, page, name, match): 49 | self.csv_writer.writerow((fpath, page, name, match)) 50 | 51 | def print_error(self, fpath, exception): 52 | self.csv_writer.writerow((fpath, '0', 'error', exception)) 53 | 54 | class OutputHandler_json(OutputHandler): 55 | def print_match(self, fpath, page, name, match): 56 | data = { 57 | 'path' : fpath, 58 | 'file' : os.path.basename(fpath), 59 | 'page' : page, 60 | 'type' : name, 61 | 'match': match 62 | } 63 | 64 | print(json.dumps(data)) 65 | 66 | def print_error(self, fpath, exception): 67 | data = { 68 | 'path' : fpath, 69 | 'file' : os.path.basename(fpath), 70 | 'type' : 'error', 71 | 'exception' : exception 72 | } 73 | 74 | print(json.dumps(data)) 75 | 76 | class OutputHandler_yara(OutputHandler): 77 | def __init__(self): 78 | self.rule_enc = ''.join(chr(c) if chr(c).isupper() or chr(c).islower() or chr(c).isdigit() else '_' for c in range(256)) 79 | 80 | def print_match(self, fpath, page, name, match): 81 | if name in self.cnt: 82 | self.cnt[name] += 1 83 | else: 84 | self.cnt[name] = 1 85 | 86 | string_id = "$%s%d" % (name, self.cnt[name]) 87 | self.sids.append(string_id) 88 | string_value = match.replace('\\', '\\\\') 89 | print("\t\t%s = \"%s\"" % (string_id, string_value)) 90 | 91 | def print_header(self, fpath): 92 | rule_name = os.path.splitext(os.path.basename(fpath))[0].translate(self.rule_enc) 93 | 94 | print("rule %s" % (rule_name)) 95 | print("{") 96 | print("\tstrings:") 97 | 98 | self.cnt = {} 99 | self.sids = [] 100 | 101 | def print_footer(self, fpath): 102 | cond = ' or '.join(self.sids) 103 | 104 | print("\tcondition:") 105 | print("\t\t" + cond) 106 | print("}") 107 | 108 | class OutputHandler_netflow(OutputHandler): 109 | def __init__(self): 110 | print "host 255.255.255.255" 111 | 112 | def print_match(self, fpath, page, name, match): 113 | data = { 114 | 'type' : name, 115 | 'match': match 116 | } 117 | if data["type"] == "IP": 118 | print " or host %s " % data["match"] 119 | -------------------------------------------------------------------------------- /iocp/Parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################################### 4 | # 5 | # Copyright (c) 2015, Armin Buescher (armin.buescher@googlemail.com) 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | # 25 | ################################################################################################### 26 | # 27 | # File: iocp.py 28 | # Description: IOC Parser is a tool to extract indicators of compromise from security reports 29 | # in PDF format. 30 | # Usage: iocp.py [-h] [-p INI] [-f FORMAT] PDF 31 | # Author: Armin Buescher (@armbues) 32 | # Contributors: Angelo Dell'Aera (@angelodellaera) 33 | # Thanks to: Jose Ramon Palanco 34 | # Koen Van Impe (@cudeso) 35 | # 36 | ################################################################################################### 37 | 38 | import os 39 | import sys 40 | import fnmatch 41 | import glob 42 | import re 43 | try: 44 | import configparser as ConfigParser 45 | except ImportError: 46 | import ConfigParser 47 | try: 48 | from StringIO import StringIO 49 | except ImportError: 50 | from io import StringIO 51 | 52 | # Import optional third-party libraries 53 | IMPORTS = [] 54 | try: 55 | from PyPDF2 import PdfFileReader 56 | IMPORTS.append('pypdf2') 57 | except ImportError: 58 | pass 59 | try: 60 | from pdfminer.pdfpage import PDFPage 61 | from pdfminer.pdfinterp import PDFResourceManager 62 | from pdfminer.converter import TextConverter 63 | from pdfminer.pdfinterp import PDFPageInterpreter 64 | from pdfminer.layout import LAParams 65 | IMPORTS.append('pdfminer') 66 | except ImportError: 67 | pass 68 | try: 69 | from bs4 import BeautifulSoup 70 | IMPORTS.append('beautifulsoup') 71 | except ImportError: 72 | pass 73 | try: 74 | import requests 75 | IMPORTS.append('requests') 76 | except ImportError: 77 | pass 78 | 79 | # Import project source files 80 | import iocp 81 | from iocp import Output 82 | 83 | class Parser(object): 84 | patterns = {} 85 | defang = {} 86 | 87 | def __init__(self, patterns_ini=None, input_format='pdf', dedup=False, library='pdfminer', output_format='csv', output_handler=None): 88 | basedir = iocp.get_basedir() 89 | 90 | if patterns_ini is None: 91 | patterns_ini = os.path.join(basedir, 'data/patterns.ini') 92 | self.load_patterns(patterns_ini) 93 | 94 | wldir = os.path.join(basedir, 'data/whitelists') 95 | self.whitelist = self.load_whitelists(wldir) 96 | 97 | self.dedup = dedup 98 | if output_handler: 99 | self.handler = output_handler 100 | else: 101 | self.handler = Output.getHandler(output_format) 102 | 103 | self.ext_filter = "*." + input_format 104 | parser_format = "parse_" + input_format 105 | try: 106 | self.parser_func = getattr(self, parser_format) 107 | except AttributeError: 108 | e = 'Selected parser format is not supported: %s' % (input_format) 109 | raise NotImplementedError(e) 110 | 111 | self.library = library 112 | if input_format == 'pdf': 113 | if library not in IMPORTS: 114 | e = 'Selected PDF parser library not found: %s' % (library) 115 | raise ImportError(e) 116 | elif input_format == 'html': 117 | if 'beautifulsoup' not in IMPORTS: 118 | e = 'HTML parser library not found: BeautifulSoup' 119 | raise ImportError(e) 120 | 121 | def load_patterns(self, fpath): 122 | config = ConfigParser.ConfigParser() 123 | with open(fpath) as f: 124 | config.readfp(f) 125 | 126 | for ind_type in config.sections(): 127 | try: 128 | ind_pattern = config.get(ind_type, 'pattern') 129 | except: 130 | continue 131 | 132 | if ind_pattern: 133 | ind_regex = re.compile(ind_pattern) 134 | self.patterns[ind_type] = ind_regex 135 | 136 | try: 137 | ind_defang = config.get(ind_type, 'defang') 138 | except: 139 | continue 140 | 141 | if ind_defang: 142 | self.defang[ind_type] = True 143 | 144 | def load_whitelists(self, fpath): 145 | whitelist = {} 146 | 147 | searchdir = os.path.join(fpath, "whitelist_*.ini") 148 | fpaths = glob.glob(searchdir) 149 | for fpath in fpaths: 150 | t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1] 151 | patterns = [line.strip() for line in open(fpath)] 152 | whitelist[t] = [re.compile(p) for p in patterns] 153 | 154 | return whitelist 155 | 156 | def is_whitelisted(self, ind_match, ind_type): 157 | try: 158 | for w in self.whitelist[ind_type]: 159 | if w.findall(ind_match): 160 | return True 161 | except KeyError as e: 162 | pass 163 | return False 164 | 165 | def parse_page(self, fpath, data, page_num): 166 | for ind_type, ind_regex in self.patterns.items(): 167 | matches = ind_regex.findall(data) 168 | 169 | for ind_match in matches: 170 | if isinstance(ind_match, tuple): 171 | ind_match = ind_match[0] 172 | 173 | if self.is_whitelisted(ind_match, ind_type): 174 | continue 175 | 176 | if ind_type in self.defang: 177 | ind_match = re.sub(r'\[\.\]', '.', ind_match) 178 | 179 | if self.dedup: 180 | if (ind_type, ind_match) in self.dedup_store: 181 | continue 182 | 183 | self.dedup_store.add((ind_type, ind_match)) 184 | 185 | self.handler.print_match(fpath, page_num, ind_type, ind_match) 186 | 187 | def parse_pdf_pypdf2(self, f, fpath): 188 | try: 189 | pdf = PdfFileReader(f, strict = False) 190 | 191 | if self.dedup: 192 | self.dedup_store = set() 193 | 194 | self.handler.print_header(fpath) 195 | page_num = 0 196 | for page in pdf.pages: 197 | page_num += 1 198 | 199 | data = page.extractText() 200 | 201 | self.parse_page(fpath, data, page_num) 202 | self.handler.print_footer(fpath) 203 | except (KeyboardInterrupt, SystemExit): 204 | raise 205 | 206 | def parse_pdf_pdfminer(self, f, fpath): 207 | try: 208 | laparams = LAParams() 209 | laparams.all_texts = True 210 | rsrcmgr = PDFResourceManager() 211 | pagenos = set() 212 | 213 | if self.dedup: 214 | self.dedup_store = set() 215 | 216 | self.handler.print_header(fpath) 217 | page_num = 0 218 | for page in PDFPage.get_pages(f, pagenos, check_extractable=True): 219 | page_num += 1 220 | 221 | retstr = StringIO() 222 | device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams) 223 | interpreter = PDFPageInterpreter(rsrcmgr, device) 224 | interpreter.process_page(page) 225 | data = retstr.getvalue() 226 | retstr.close() 227 | 228 | self.parse_page(fpath, data, page_num) 229 | self.handler.print_footer(fpath) 230 | except (KeyboardInterrupt, SystemExit): 231 | raise 232 | 233 | def parse_pdf(self, f, fpath): 234 | parser_format = "parse_pdf_" + self.library 235 | try: 236 | self.parser_func = getattr(self, parser_format) 237 | except AttributeError: 238 | e = 'Selected PDF parser library is not supported: %s' % (self.library) 239 | raise NotImplementedError(e) 240 | 241 | self.parser_func(f, fpath) 242 | 243 | def parse_txt(self, f, fpath): 244 | try: 245 | if self.dedup: 246 | self.dedup_store = set() 247 | 248 | data = f.read() 249 | self.handler.print_header(fpath) 250 | self.parse_page(fpath, data, 1) 251 | self.handler.print_footer(fpath) 252 | except (KeyboardInterrupt, SystemExit): 253 | raise 254 | 255 | def parse_html(self, f, fpath): 256 | try: 257 | if self.dedup: 258 | self.dedup_store = set() 259 | 260 | data = f.read() 261 | soup = BeautifulSoup(data) 262 | html = soup.findAll(text=True) 263 | 264 | text = u'' 265 | for elem in html: 266 | if elem.parent.name in ['style', 'script', '[document]', 'head', 'title']: 267 | continue 268 | elif re.match('', unicode(elem)): 269 | continue 270 | else: 271 | text += unicode(elem) 272 | 273 | self.handler.print_header(fpath) 274 | self.parse_page(fpath, text, 1) 275 | self.handler.print_footer(fpath) 276 | except (KeyboardInterrupt, SystemExit): 277 | raise 278 | 279 | def parse(self, path): 280 | try: 281 | if path.startswith('http://') or path.startswith('https://'): 282 | if 'requests' not in IMPORTS: 283 | e = 'HTTP library not found: requests' 284 | raise ImportError(e) 285 | headers = { 'User-Agent': 'Mozilla/5.0 Gecko Firefox' } 286 | r = requests.get(path, headers=headers) 287 | r.raise_for_status() 288 | f = StringIO(r.content) 289 | self.parser_func(f, path) 290 | return 291 | elif os.path.isfile(path): 292 | with open(path, 'rb') as f: 293 | self.parser_func(f, path) 294 | return 295 | elif os.path.isdir(path): 296 | for walk_root, walk_dirs, walk_files in os.walk(path): 297 | for walk_file in fnmatch.filter(walk_files, self.ext_filter): 298 | fpath = os.path.join(walk_root, walk_file) 299 | with open(fpath, 'rb') as f: 300 | self.parser_func(f, fpath) 301 | return 302 | 303 | e = 'File path is not a file, directory or URL: %s' % (path) 304 | raise IOError(e) 305 | except (KeyboardInterrupt, SystemExit): 306 | raise 307 | except Exception as e: 308 | self.handler.print_error(path, e) -------------------------------------------------------------------------------- /iocp/Whitelist.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import re 4 | 5 | class WhiteList(dict): 6 | def __init__(self, basedir): 7 | searchdir = os.path.join(basedir, "whitelists/whitelist_*.ini") 8 | fpaths = glob.glob(searchdir) 9 | for fpath in fpaths: 10 | t = os.path.splitext(os.path.split(fpath)[1])[0].split('_',1)[1] 11 | patterns = [line.strip() for line in open(fpath)] 12 | self[t] = [re.compile(p) for p in patterns] -------------------------------------------------------------------------------- /iocp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | _IOCP_ROOT = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | def get_basedir(): 8 | return _IOCP_ROOT -------------------------------------------------------------------------------- /iocp/data/patterns.ini: -------------------------------------------------------------------------------- 1 | [URL] 2 | pattern: \b([a-z]{3,}\:\/\/[\S]{16,})\b 3 | defang: True 4 | 5 | [Host] 6 | pattern: \b(([a-z0-9\-]{2,}\[?\.\]?)+(abogado|ac|academy|accountants|active|actor|ad|adult|ae|aero|af|ag|agency|ai|airforce|al|allfinanz|alsace|am|amsterdam|an|android|ao|aq|aquarelle|ar|archi|army|arpa|as|asia|associates|at|attorney|au|auction|audio|autos|aw|ax|axa|az|ba|band|bank|bar|barclaycard|barclays|bargains|bayern|bb|bd|be|beer|berlin|best|bf|bg|bh|bi|bid|bike|bingo|bio|biz|bj|black|blackfriday|bloomberg|blue|bm|bmw|bn|bnpparibas|bo|boo|boutique|br|brussels|bs|bt|budapest|build|builders|business|buzz|bv|bw|by|bz|bzh|ca|cal|camera|camp|cancerresearch|canon|capetown|capital|caravan|cards|care|career|careers|cartier|casa|cash|cat|catering|cc|cd|center|ceo|cern|cf|cg|ch|channel|chat|cheap|christmas|chrome|church|ci|citic|city|ck|cl|claims|cleaning|click|clinic|clothing|club|cm|cn|co|coach|codes|coffee|college|cologne|com|community|company|computer|condos|construction|consulting|contractors|cooking|cool|coop|country|cr|credit|creditcard|cricket|crs|cruises|cu|cuisinella|cv|cw|cx|cy|cymru|cz|dabur|dad|dance|dating|day|dclk|de|deals|degree|delivery|democrat|dental|dentist|desi|design|dev|diamonds|diet|digital|direct|directory|discount|dj|dk|dm|dnp|do|docs|domains|doosan|durban|dvag|dz|eat|ec|edu|education|ee|eg|email|emerck|energy|engineer|engineering|enterprises|equipment|er|es|esq|estate|et|eu|eurovision|eus|events|everbank|exchange|expert|exposed|fail|farm|fashion|feedback|fi|finance|financial|firmdale|fish|fishing|fit|fitness|fj|fk|flights|florist|flowers|flsmidth|fly|fm|fo|foo|forsale|foundation|fr|frl|frogans|fund|furniture|futbol|ga|gal|gallery|garden|gb|gbiz|gd|ge|gent|gf|gg|ggee|gh|gi|gift|gifts|gives|gl|glass|gle|global|globo|gm|gmail|gmo|gmx|gn|goog|google|gop|gov|gp|gq|gr|graphics|gratis|green|gripe|gs|gt|gu|guide|guitars|guru|gw|gy|hamburg|hangout|haus|healthcare|help|here|hermes|hiphop|hiv|hk|hm|hn|holdings|holiday|homes|horse|host|hosting|house|how|hr|ht|hu|ibm|id|ie|ifm|il|im|immo|immobilien|in|industries|info|ing|ink|institute|insure|int|international|investments|io|iq|ir|irish|is|it|iwc|jcb|je|jetzt|jm|jo|jobs|joburg|jp|juegos|kaufen|kddi|ke|kg|kh|ki|kim|kitchen|kiwi|km|kn|koeln|kp|kr|krd|kred|kw|ky|kyoto|kz|la|lacaixa|land|lat|latrobe|lawyer|lb|lc|lds|lease|legal|lgbt|li|lidl|life|lighting|limited|limo|link|lk|loans|london|lotte|lotto|lr|ls|lt|ltda|lu|luxe|luxury|lv|ly|ma|madrid|maison|management|mango|market|marketing|marriott|mc|md|me|media|meet|melbourne|meme|memorial|menu|mg|mh|miami|mil|mini|mk|ml|mm|mn|mo|mobi|moda|moe|monash|money|mormon|mortgage|moscow|motorcycles|mov|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|nagoya|name|navy|nc|ne|net|network|neustar|new|nexus|nf|ng|ngo|nhk|ni|ninja|nl|no|np|nr|nra|nrw|ntt|nu|nyc|nz|okinawa|om|one|ong|onl|ooo|org|organic|osaka|otsuka|ovh|pa|paris|partners|parts|party|pe|pf|pg|ph|pharmacy|photo|photography|photos|physio|pics|pictures|pink|pizza|pk|pl|place|plumbing|pm|pn|pohl|poker|porn|post|pr|praxi|press|pro|prod|productions|prof|properties|property|ps|pt|pub|pw|qa|qpon|quebec|re|realtor|recipes|red|rehab|reise|reisen|reit|ren|rentals|repair|report|republican|rest|restaurant|reviews|rich|rio|rip|ro|rocks|rodeo|rs|rsvp|ru|ruhr|rw|ryukyu|sa|saarland|sale|samsung|sarl|sb|sc|sca|scb|schmidt|schule|schwarz|science|scot|sd|se|services|sew|sexy|sg|sh|shiksha|shoes|shriram|si|singles|sj|sk|sky|sl|sm|sn|so|social|software|sohu|solar|solutions|soy|space|spiegel|sr|st|style|su|supplies|supply|support|surf|surgery|suzuki|sv|sx|sy|sydney|systems|sz|taipei|tatar|tattoo|tax|tc|td|technology|tel|temasek|tennis|tf|tg|th|tienda|tips|tires|tirol|tj|tk|tl|tm|tn|to|today|tokyo|tools|top|toshiba|town|toys|tp|tr|trade|training|travel|trust|tt|tui|tv|tw|tz|ua|ug|uk|university|uno|uol|us|uy|uz|va|vacations|vc|ve|vegas|ventures|versicherung|vet|vg|vi|viajes|video|villas|vision|vlaanderen|vn|vodka|vote|voting|voto|voyage|vu|wales|wang|watch|webcam|website|wed|wedding|wf|whoswho|wien|wiki|williamhill|wme|work|works|world|ws|wtc|wtf|xyz|yachts|yandex|ye|yoga|yokohama|youtube|yt|za|zm|zone|zuerich|zw))\b 7 | defang: True 8 | 9 | [IP] 10 | pattern: \b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b 11 | 12 | [Email] 13 | pattern: \b([a-z][_a-z0-9-.]+@[a-z0-9-]+\.[a-z]+)\b 14 | 15 | [MD5] 16 | pattern: \b([a-f0-9]{32}|[A-F0-9]{32})\b 17 | 18 | [SHA1] 19 | pattern: \b([a-f0-9]{40}|[A-F0-9]{40})\b 20 | 21 | [SHA256] 22 | pattern: \b([a-f0-9]{64}|[A-F0-9]{64})\b 23 | 24 | [CVE] 25 | pattern: \b(CVE\-[0-9]{4}\-[0-9]{4,6})\b 26 | 27 | [Registry] 28 | pattern: \b((HKLM|HKCU)\\[\\A-Za-z0-9-_]+)\b 29 | 30 | [Filename] 31 | pattern: \b([A-Za-z0-9-_\.]+\.(exe|dll|bat|sys|htm|html|js|jar|jpg|png|vb|scr|pif|chm|zip|rar|cab|pdf|doc|docx|ppt|pptx|xls|xlsx|swf|gif))\b 32 | 33 | [Filepath] 34 | pattern: \b[A-Z]:\\[A-Za-z0-9-_\.\\]+\b 35 | -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_CVE.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_CVE.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_Email.ini: -------------------------------------------------------------------------------- 1 | @fireeye.com 2 | @crowdstrike.com 3 | @f-secure.com 4 | @kaspersky.com 5 | @gdata.de 6 | @cylance.com -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_Filename.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_Filename.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_Filepath.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_Filepath.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_Host.ini: -------------------------------------------------------------------------------- 1 | acm\.org$ 2 | adobe\.com$ 3 | ahnlab\.com$ 4 | alienvault\.com$ 5 | amazon\.com$ 6 | android\.com$ 7 | aol\.com$ 8 | arbornetworks\.com$ 9 | arstechnica\.com$ 10 | avg\.com$ 11 | bbc\.co\.uk$ 12 | bing\.com$ 13 | bitdefender\.com$ 14 | bloomberg\.com$ 15 | bluecoat\.com$ 16 | cassidiancybersecurity\.com$ 17 | cbsnews\.com$ 18 | cia\.gov$ 19 | cisco\.com$ 20 | citizenlab\.org$ 21 | clean-mx\.de$ 22 | cnn\.com$ 23 | comodo\.com$ 24 | contagiodump\.blogspot\.com$ 25 | contextis\.com$ 26 | coresecurity\.com$ 27 | crowdstrike\.com$ 28 | crysys\.hu$ 29 | cve\.mitre\.org$ 30 | cylance\.com$ 31 | dailymail\.co\.uk$ 32 | damballa\.com$ 33 | darkreading\.com$ 34 | ddanchev\.blogspot\.com$ 35 | defense\.gov$ 36 | dell\.com$ 37 | domaintools\.com$ 38 | dropbox\.com$ 39 | eff\.org$ 40 | emergingthreats\.net$ 41 | eset\.com$ 42 | eset\.sk$ 43 | events\.ccc\.de$ 44 | exploit-db\.com$ 45 | f-secure\.com$ 46 | facebook\.com$ 47 | fbi\.gov$ 48 | fidelissecurity\.com$ 49 | fireeye\.com$ 50 | forbes\.com$ 51 | fortinet\.com$ 52 | gdata\.de$ 53 | gdatasoftware\.com$ 54 | github\.com$ 55 | gmail\.com$ 56 | gmx\.com$ 57 | gmx\.de$ 58 | google\.com$ 59 | googlemail\.com$ 60 | googleonlinesecurity\.blogspot\.com$ 61 | hbgary\.com$ 62 | heise\.de$ 63 | hex-rays\.com$ 64 | hotmail\.com$ 65 | huffingtonpost\.com$ 66 | iana\.org$ 67 | ibtimes\.com$ 68 | ietf\.org$ 69 | inbox\.com$ 70 | informationweek\.com$ 71 | invincea\.com$ 72 | isc\.org$ 73 | isightpartners\.com$ 74 | java\.net$ 75 | kaspersky\.com$ 76 | krebsonsecurity\.com$ 77 | lastline\.com$ 78 | lemonde\.fr$ 79 | linkedin\.com$ 80 | live\.com$ 81 | malware\.dontneedcoffee\.com$ 82 | malware\.lu$ 83 | malwaredomainlist\.com$ 84 | mandiant\.com$ 85 | mcafee\.com$ 86 | metasploit\.com$ 87 | microsoft\.com$ 88 | mozilla\.org$ 89 | msn\.com$ 90 | norman\.com$ 91 | norman\.no$ 92 | normanshark\.com$ 93 | nytimes\.com$ 94 | outlook\.com$ 95 | paloaltonetworks\.com$ 96 | paypal\.com$ 97 | pinterest\.com$ 98 | pwc\.com$ 99 | qualys\.com$ 100 | rapid7\.com$ 101 | reuters\.com$ 102 | rsa\.com$ 103 | sans\.org$ 104 | secunia\.com$ 105 | securelist\.com$ 106 | secureworks\.com$ 107 | shadowserver\.org$ 108 | snort\.org$ 109 | sophos\.com$ 110 | spiderlabs\.com$ 111 | spiegel\.de$ 112 | symantec\.com$ 113 | technet\.com$ 114 | telegraph\.co\.uk$ 115 | telussecuritylabs\.com$ 116 | theguardian\.com$ 117 | theregister\.co\.uk$ 118 | thetimes\.co\.uk$ 119 | threatconnect\.com$ 120 | threatexpert\.com$ 121 | threatgeek\.com$ 122 | threatpost\.com$ 123 | time\.com$ 124 | trendmicro\.com$ 125 | twitter\.com$ 126 | us-cert\.gov$ 127 | usenix\.org$ 128 | verisign\.com$ 129 | virusbtn\.com$ 130 | virustotal\.com$ 131 | washingtonpost\.com$ 132 | washingtontimes\.com$ 133 | wikipedia\.org$ 134 | windows\.com$ 135 | windowsupdate\.com$ 136 | wired\.com$ 137 | wsj\.com$ 138 | www\.w3\.org$ 139 | yahoo\.com$ 140 | yandex\.ru$ 141 | youtube\.com$ 142 | zdnet\.com$ 143 | zscaler\.com$ -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_IP.ini: -------------------------------------------------------------------------------- 1 | ^0\. 2 | \.0$ 3 | ^127\.0\.0\.1 4 | ^192\.168\. 5 | ^10\. 6 | ^8\.8\.8\.8 7 | -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_MD5.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_MD5.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_Registry.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_Registry.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_SHA1.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_SHA1.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_SHA256.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armbues/ioc_parser/b87758680aca377a019ee3185566bc05145e0117/iocp/data/whitelists/whitelist_SHA256.ini -------------------------------------------------------------------------------- /iocp/data/whitelists/whitelist_URL.ini: -------------------------------------------------------------------------------- 1 | ^http:\/\/about-threats.trendmicro.com\/ 2 | ^http:\/\/blog.crowdstrike.com\/ 3 | ^http:\/\/blog.crysys.hu\/ 4 | ^http:\/\/blog.cylance.com\/ 5 | ^http:\/\/blog.eset.com\/ 6 | ^http:\/\/blog.kaspersky.com\/ 7 | ^http:\/\/blog.stopbadware.org\/ 8 | ^http:\/\/blog.trendmicro.com\/ 9 | ^http:\/\/blog.trendmicro.com\/ 10 | ^http:\/\/blogs.cisco.com\/ 11 | ^http:\/\/blogs.mcafee.com\/ 12 | ^http:\/\/blogs.norman.com\/ 13 | ^http:\/\/blogs.sans.org\/ 14 | ^http:\/\/blogs.technet.com\/ 15 | ^http:\/\/citizenlab.org\/ 16 | ^http:\/\/contagiodump.blogspot.com\/ 17 | ^http:\/\/ddanchev.blogspot.com\/ 18 | ^http:\/\/download01.norman.no\/ 19 | ^http:\/\/en.wikipedia.org\/wiki\/ 20 | ^http:\/\/events.ccc.de\/ 21 | ^http:\/\/isc.sans.edu\/ 22 | ^http:\/\/isc.sans.org\/ 23 | ^http:\/\/krebsonsecurity.com\/ 24 | ^http:\/\/labs.alienvault.com\/ 25 | ^http:\/\/labs.bitdefender.com\/ 26 | ^http:\/\/lists.clean-mx.com\/ 27 | ^http:\/\/msdn.microsoft.com\/ 28 | ^http:\/\/msdn.microsoft.com\/ 29 | ^http:\/\/newsroom.mcafee.com\/ 30 | ^http:\/\/normanshark.com\/ 31 | ^http:\/\/siblog.mcafee.com\/ 32 | ^http:\/\/support.clean-mx.de\/ 33 | ^http:\/\/support.microsoft.com\/kb\/ 34 | ^http:\/\/symantec.com\/ 35 | ^http:\/\/technet.microsoft.com\/ 36 | ^http:\/\/threatexpert.com\/ 37 | ^http:\/\/threatpost.com\/ 38 | ^http:\/\/tools.cisco.com\/security\/ 39 | ^http:\/\/vrt-blog.snort.org\/ 40 | ^http:\/\/whois.domaintools.com\/ 41 | ^http:\/\/www.adobe.com\/support\/security\/ 42 | ^http:\/\/www.blackhat.com\/presentations\/ 43 | ^http:\/\/www.citizenlab.org\/ 44 | ^http:\/\/www.crowdstrike.com\/blog\/ 45 | ^http:\/\/www.cve.mitre.org\/ 46 | ^http:\/\/www.damballa.com\/ 47 | ^http:\/\/www.domaintools.com\/ 48 | ^http:\/\/www.eff.org\/document\/ 49 | ^http:\/\/www.exploit-db.com\/exploits\/ 50 | ^http:\/\/www.f-secure.com\/ 51 | ^http:\/\/www.f-secure.com\/weblog\/ 52 | ^http:\/\/www.fireeye.com\/ 53 | ^http:\/\/www.gdata.de\/ 54 | ^http:\/\/www.ietf.org\/ 55 | ^http:\/\/www.mandiant.com\/ 56 | ^http:\/\/www.mcafee.com\/ 57 | ^http:\/\/www.microsoft.com\/technet\/security\/ 58 | ^http:\/\/www.securelist.com\/ 59 | ^http:\/\/www.secureworks.com\/ 60 | ^http:\/\/www.secureworks.com\/research\/ 61 | ^http:\/\/www.shadowserver.org\/ 62 | ^http:\/\/www.sophos.com\/ 63 | ^http:\/\/www.symantec.com\/ 64 | ^http:\/\/www.threatconnect.com\/ 65 | ^http:\/\/www.threatexpert.com\/ 66 | ^http:\/\/www.trendmicro.com\/ 67 | ^http:\/\/www.virusbtn.com\/pdf\/ 68 | ^http:\/\/www.w3.org\/ 69 | ^http:\/\/www.welivesecurity.com\/ 70 | ^https?://blog.gdatasoftware.com\/ 71 | ^https?://citizenlab.org\/ 72 | ^https?://cve.mitre.org\/ 73 | ^https?://securelist.com\/ 74 | ^https?://www.eff.org\/ 75 | ^https?://www.virustotal.com\/ 76 | ^https?:\/\/blog.fireeye.com\/ 77 | ^https?:\/\/blogs.rsa.com\/ 78 | ^https?:\/\/nakedsecurity.sophos.com\/ 79 | ^https?:\/\/www.usenix.org\/ -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.4.1 2 | pdfminer>=20140328 3 | PyPDF2>=1.26.0 4 | requests>=2.10.0 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | from setuptools import setup 5 | 6 | setup( 7 | name = "ioc_parser", 8 | version = "0.9.1", 9 | author = "Armin Buescher", 10 | author_email = "armin.buescher@googlemail.com", 11 | scripts=['bin/iocp'], 12 | description = ("Tool to extract indicators of compromise from security reports"), 13 | license = "MIT", 14 | url = "https://github.com/armbues/ioc_parser", 15 | packages=['iocp'], 16 | include_package_data=True, 17 | classifiers=[ 18 | "Development Status :: 4 - Beta", 19 | "Topic :: Security", 20 | "License :: OSI Approved :: MIT License", 21 | ], 22 | install_requires=[ 23 | "pdfminer", 24 | "PyPDF2", 25 | "requests", 26 | "beautifulsoup4" 27 | ], 28 | ) --------------------------------------------------------------------------------