├── README.md
└── scrape_pdf.py
/README.md:
--------------------------------------------------------------------------------
1 | A basic script that uses PDFMiner to decompress a PDF's streams and then looks inside the decompressed streams.
2 |
3 | Currently it attempts to pull out IPs, hashes, URLs, and hostnames.
4 |
5 | Requires:
6 |
7 | * pip install dnspython
8 | * grab uniaccept from here
9 | * pip install pdfminer
10 |
11 |
12 | Then after you've done that, you'll likely want to get the newest TLD list.
13 | Open a Python interpreter then:
14 |
15 | ```
16 | import uniaccept
17 | uniaccept.refreshtlddb("/tmp/tld-list.txt")
18 | ```
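19 | Feel free to change the location of the tld-list.txt file, but note that the scrape_pdf.py script expects to find it in the current working directory (as `./tld-list.txt`), so copy or move it there before running the script.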
20 |
21 |
22 |
--------------------------------------------------------------------------------
/scrape_pdf.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | import mmap
4 | import string
5 | import uniaccept
6 | from pdfminer.pdfpage import PDFPage
7 | from pdfminer.layout import LAParams
8 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
9 | from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
10 |
# Indicator patterns, compiled once at import time.
_IP_REGEX = re.compile(r'((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))')
# `|` tries alternatives left to right and keeps the first one that matches,
# so the longest digest must be listed first.  With the 32-character
# alternative first, a SHA-1 was truncated to 32 characters and a SHA-256 was
# reported as two bogus MD5s.
_HASH_REGEX = re.compile(r'(?:[A-Fa-f0-9]{64}|[A-Fa-f0-9]{40}|[A-Fa-f0-9]{32})')  # sha256, sha1, md5
_URL_REGEX = re.compile(r'\b((?:[\w-]+://?|www[.])[A-Za-z0-9-_\/.%?=&\[\]()@!$#,;]+)', re.MULTILINE)
# The pattern has no `^`/`$` anchors, so re.MULTILINE would not change what it
# matches; a single scan with this pattern finds every hostname candidate.
_HOSTNAME_REGEX = re.compile(r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)')


def _extract_text(fname, codec='ascii'):
    """Return the text of the PDF at *fname* with its lines joined together.

    Every page is rendered through PDFMiner's TextConverter into an in-memory
    buffer.  Each output line then has its trailing whitespace stripped and
    the lines are concatenated with no separator, so an indicator that was
    wrapped across two lines is stitched back together.
    """
    import io  # local import: the file's import block does not provide io

    pagenos = set()
    maxpages = 0
    password = ''
    caching = True
    rotation = 0

    rsrcmgr = PDFResourceManager(caching=caching)
    # A growable buffer needs no size guess, handles a PDF that yields no
    # text at all, and lets the document be parsed only once.
    buf = io.BytesIO()
    device = TextConverter(rsrcmgr, buf, codec=codec, laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        raw = buf.getvalue()
    finally:
        # Release the converter even when parsing fails part-way through.
        device.close()

    # On Python 2 the buffer content is already a `str`; where getvalue()
    # returns a separate bytes type it must be decoded before the str
    # patterns can be applied to it.
    if not isinstance(raw, str):
        raw = raw.decode(codec, 'ignore')

    return ''.join(line.rstrip() for line in raw.split('\n'))


def main(argv):
    """Print the IPs, URLs, hashes and hostnames found in a PDF.

    Args:
        argv: command-line argument vector; ``argv[1]`` is the path of the
            PDF file to scan.

    Returns:
        0 after a completed scan, 2 when no PDF path was supplied.

    IPs, URLs and hashes are each printed as one set (nothing is printed for
    an empty category).  Hostname candidates are printed one per line, and
    only when they do not end in a dot and uniaccept accepts their TLD
    against ``./tld-list.txt``.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'scrape_pdf.py'
        sys.stderr.write("usage: %s <file.pdf>\n" % prog)
        return 2

    doc = _extract_text(argv[1])

    for regex in (_IP_REGEX, _URL_REGEX, _HASH_REGEX):
        found = set(regex.findall(doc))
        if found:
            print(found)

    # Sorted so the output order is stable from run to run.
    for h in sorted(set(_HOSTNAME_REGEX.findall(doc))):
        # PDF text sometimes renders "a . b"; drop the spaces before checking.
        domain = h.replace(' ', '')
        if domain.endswith('.'):
            continue
        if uniaccept.verifytldoffline(domain, "./tld-list.txt"):
            print(h)

    return 0
92 |
if __name__ == '__main__':
    # Run as a script: pass the raw argument vector to main() and use its
    # return value as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
94 |
--------------------------------------------------------------------------------