├── README.md └── scrape_pdf.py /README.md: -------------------------------------------------------------------------------- 1 | A basic script based that uses PDFMiner to decompress streams, and then looks inside the streams 2 | 3 | Currently it attempts to pull out IPs, hashes, URLs, and hostnames. 4 | 5 | Requires: 6 | 7 | * pip install dnspython 8 | * grab uniaccept from here 9 | * pip install pdfminer 10 | 11 | 12 | Then after you've done that, you'll likely want to get the newest TLD list.
13 | Open a Python interpreter then:
14 | 15 | ``` 16 | import uniaccept 17 | uniaccept.refreshtlddb("/tmp/tld-list.txt") 18 | ``` 19 | Feel free to change the location of the tld-list.txt file, the scrape-pdf.py script expects it in the CWD. 20 | 21 | 22 | -------------------------------------------------------------------------------- /scrape_pdf.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import mmap 4 | import string 5 | import uniaccept 6 | from pdfminer.pdfpage import PDFPage 7 | from pdfminer.layout import LAParams 8 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 9 | from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter 10 | 11 | def main(argv): 12 | #codec = 'utf-8' 13 | codec = 'ascii' 14 | laparams = LAParams() 15 | pagenos = set() 16 | maxpages = 0 17 | password = '' 18 | caching = True 19 | rotation = 0 20 | rsrcmgr = PDFResourceManager(caching=caching) 21 | 22 | # Do a double read thanks to: 23 | # https://mail.python.org/pipermail/python-list/2009-April/531944.html 24 | mm = mmap.mmap(-1, 1024*1024*1024) 25 | 26 | device = TextConverter(rsrcmgr, mm, codec=codec, laparams=laparams, imagewriter=None) 27 | 28 | fname = argv[1] 29 | fp = file(fname, 'rb') 30 | interpreter = PDFPageInterpreter(rsrcmgr, device) 31 | for page in PDFPage.get_pages(fp, pagenos, 32 | maxpages=maxpages, password=password, 33 | caching=caching, check_extractable=True): 34 | page.rotate = (page.rotate+rotation) % 360 35 | interpreter.process_page(page) 36 | fp.close() 37 | 38 | eof = mm.tell() 39 | device.close() 40 | mm.close() 41 | 42 | # Recreate the mmap area w/the correct size 43 | mm = mmap.mmap(-1, eof) 44 | 45 | device = TextConverter(rsrcmgr, mm, codec=codec, laparams=laparams, imagewriter=None) 46 | 47 | fname = argv[1] 48 | fp = file(fname, 'rb') 49 | interpreter = PDFPageInterpreter(rsrcmgr, device) 50 | for page in PDFPage.get_pages(fp, pagenos, 51 | maxpages=maxpages, password=password, 52 | caching=caching, check_extractable=True): 53 | page.rotate = (page.rotate+rotation) % 360 54 | interpreter.process_page(page) 55 | fp.close() 56 | 57 | mm.seek(0) 58 | 59 | ip_regex = re.compile(r'((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))') 60 | hash_regex = re.compile(r'(?:[A-Fa-f0-9]{32}|[A-Fa-f0-9]{40}|[A-Fa-f0-9]{64})') # md5, sha1, sha256 61 | url_regex = re.compile(r'\b((?:[\w-]+://?|www[.])[A-Za-z0-9-_\/.%?=&\[\]()@!$#,;]+)', re.MULTILINE) 62 | hostname_regex = re.compile(r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)', re.MULTILINE) 63 | single_line_hostname_regex = re.compile(r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)') 64 | doc = '' 65 | while True: 66 | if mm.tell() >= eof: 67 | break 68 | doc += mm.readline().rstrip() 69 | 70 | m = re.findall(ip_regex, doc) 71 | if m != None and len(m) > 0: print set(m) 72 | m = re.findall(url_regex, doc) 73 | if m != None and len(m) > 0: print set(m) 74 | m = re.findall(hash_regex, doc) 75 | if m != None and len(m) > 0: print set(m) 76 | m = re.findall(hostname_regex, doc) 77 | hostname_candidates = [] 78 | if m != None and len(m) > 0: hostname_candidates = list(set(m)) 79 | m = re.findall(single_line_hostname_regex, doc) 80 | if m != None and len(m) > 0: hostname_candidates = list(set(m + hostname_candidates)) 81 | 82 | if len(hostname_candidates) > 0: 83 | for h in hostname_candidates: 84 | domain = string.replace(h, ' ', '') 85 | #print h 86 | if uniaccept.verifytldoffline(domain, "./tld-list.txt") and domain[-1] != '.': 87 | print h 88 | 89 | #print doc 90 | device.close() 91 | mm.close() 92 | 93 | if __name__ == '__main__': sys.exit(main(sys.argv)) 94 | --------------------------------------------------------------------------------