├── README.md
└── scrape_pdf.py


/README.md:
--------------------------------------------------------------------------------
 1 | A basic script that uses PDFMiner to decompress PDF streams and then looks inside those streams.
 2 | 
 3 | Currently it attempts to pull out IPs, hashes, URLs, and hostnames.
 4 | 
 5 | Requires:
 6 | 
 7 | * pip install dnspython
 8 | * grab uniaccept from <a href="https://github.com/icann/uniaccept-python">here</a>
 9 | * pip install pdfminer
10 | 
11 | 
12 | Then after you've done that, you'll likely want to get the newest TLD list.<br/>
13 | Open a Python interpreter then:<br/>
14 | 
15 | ```
16 | import uniaccept
17 | uniaccept.refreshtlddb("/tmp/tld-list.txt")
18 | ```
19 | Feel free to change the location of the tld-list.txt file, but note that the scrape_pdf.py script expects to find it in the current working directory (CWD).
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/scrape_pdf.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | import mmap
 4 | import string
 5 | import uniaccept
 6 | from pdfminer.pdfpage import PDFPage
 7 | from pdfminer.layout import LAParams
 8 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 9 | from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
10 | 
def _pdf_to_text(fname, codec='ascii'):
    """Return the text PDFMiner extracts from *fname* as one joined string.

    Trailing whitespace is stripped from every output line and the lines are
    concatenated without a separator, so a token that was wrapped across two
    lines is scanned as a single token.
    """
    # Function-scope import: keeps this block independent of the file-level
    # import list.
    import io

    rotation = 0
    rsrcmgr = PDFResourceManager(caching=True)
    # A growable in-memory buffer replaces the earlier "render into a 1 GiB
    # mmap to learn the size, then render again" approach: the PDF is parsed
    # once, empty output no longer needs a zero-length mmap, and there is no
    # fixed upper bound on the amount of extracted text.
    outfp = io.BytesIO()
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0, password='',
                                          caching=True,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        # Iterating the buffer splits on '\n' only, like the readline() loop
        # this replaces.
        outfp.seek(0)
        raw = b''.join(line.rstrip() for line in outfp)
    finally:
        # Release the converter even when parsing raises.
        device.close()

    # On Python 2 the joined value is already a `str`; on Python 3 it is
    # `bytes` and has to be decoded before the text regexes can scan it.
    if not isinstance(raw, str):
        raw = raw.decode(codec, 'ignore')
    return raw


def main(argv):
    """Print the IPs, URLs, hashes and hostnames found in a PDF's text.

    argv -- argument vector; argv[1] is the path of the PDF to scan.

    IPs, URLs and hashes are each printed as one set (only when at least one
    was found).  Hostname candidates are printed one per line, and only when
    uniaccept accepts their TLD against "./tld-list.txt".

    Returns 0 on success, or 2 when no PDF path was supplied.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'scrape_pdf.py'
        sys.stderr.write('usage: %s <file.pdf>\n' % prog)
        return 2

    doc = _pdf_to_text(argv[1], codec='ascii')

    ip_regex = re.compile(r'((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))')
    # sha256, sha1, md5 -- longest alternative first.  `re` takes the first
    # alternative that matches, so with the 32-char branch in front a SHA-1
    # was cut down to its first 32 digits and a SHA-256 was reported as two
    # MD5s.
    hash_regex = re.compile(r'(?:[A-Fa-f0-9]{64}|[A-Fa-f0-9]{40}|[A-Fa-f0-9]{32})')
    # NOTE(review): `9-_` inside the character class is a range (it spans
    # ':' through '_'), not three literals -- left unchanged, confirm intent.
    url_regex = re.compile(r'\b((?:[\w-]+://?|www[.])[A-Za-z0-9-_\/.%?=&\[\]()@!$#,;]+)', re.MULTILINE)
    # One pattern is enough: the former MULTILINE and single-line variants
    # were the same expression, and MULTILINE only changes '^' and '$', which
    # the pattern does not use.
    hostname_regex = re.compile(r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)')

    for regex in (ip_regex, url_regex, hash_regex):
        found = set(regex.findall(doc))
        if found:
            print(found)

    # Sorted so the hostname output order is deterministic.
    for candidate in sorted(set(hostname_regex.findall(doc))):
        # Candidates may be written as "host . tld"; drop the spaces before
        # validating, but print the text as it appeared in the document.
        domain = candidate.replace(' ', '')
        if domain.endswith('.'):
            continue
        if uniaccept.verifytldoffline(domain, "./tld-list.txt"):
            print(candidate)

    return 0
92 | 
# Script entry point: pass the raw argument vector to main() and exit with
# whatever it returns.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
94 | 


--------------------------------------------------------------------------------