├── .gitignore
├── JSAnalysis.py
├── LICENSE.md
├── README.md
├── __init__.py
├── build_pdf_objects.py
├── cfg.py
├── db
    └── __init__.py
├── db_mgmt.py
├── huntterp.py
├── jobs
    └── __init__.py
├── pdfminer
    ├── LICENSE
    ├── __init__.py
    ├── arcfour.py
    ├── ascii85.py
    ├── ccitt.py
    ├── lzw.py
    ├── pdfdocument.py
    ├── pdfparser.py
    ├── pdftypes.py
    ├── psparser.py
    ├── runlength.py
    └── utils.py
├── pdfrankenstein.py
├── peepdf
    ├── AUTHORS
    ├── CHANGELOG
    ├── COPYING
    ├── JSAnalysis.py
    ├── PDFConsole.py
    ├── PDFCore.py
    ├── PDFCrypto.py
    ├── PDFFilters.py
    ├── PDFUtils.py
    ├── README
    ├── TODO
    ├── __init__.py
    ├── aes.py
    ├── aespython
    │   ├── __init__.py
    │   ├── aes_cipher.py
    │   ├── aes_tables.py
    │   ├── cbc_mode.py
    │   ├── cfb_mode.py
    │   ├── key_expander.py
    │   ├── ofb_mode.py
    │   └── test_keys.py
    ├── ccitt.py
    ├── colorama
    │   ├── PKG-INFO
    │   ├── __init__.py
    │   ├── ansi.py
    │   ├── ansitowin32.py
    │   ├── initialise.py
    │   ├── win32.py
    │   └── winterm.py
    ├── jjdecode.py
    ├── jsbeautifier
    │   ├── __init__.py
    │   └── unpackers
    │   │   ├── README.specs.mkd
    │   │   ├── __init__.py
    │   │   ├── evalbased.py
    │   │   ├── javascriptobfuscator.py
    │   │   ├── myobfuscate.py
    │   │   ├── packer.py
    │   │   └── urlencode.py
    ├── lzw.py
    ├── peepdf.dtd
    └── peepdf.py
├── scripts
    ├── __init__.py
    ├── clarify.py
    ├── ffdec.jar
    ├── mapper.py
    └── run-jpexs.py
├── sdhasher.py
├── storage.py
├── util
    ├── __init__.py
    ├── mapper.py
    └── str_utils.py
└── xml_creator.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion
  2 | 
  3 | *.iml
  4 | 
  5 | ## Directory-based project format:
  6 | .idea/
  7 | # if you remove the above rule, at least ignore the following:
  8 | 
  9 | # User-specific stuff:
 10 | # .idea/workspace.xml
 11 | # .idea/tasks.xml
 12 | # .idea/dictionaries
 13 | 
 14 | # Sensitive or high-churn files:
 15 | # .idea/dataSources.ids
 16 | # .idea/dataSources.xml
 17 | # .idea/sqlDataSources.xml
 18 | # .idea/dynamic.xml
 19 | # .idea/uiDesigner.xml
 20 | 
 21 | # Gradle:
 22 | # .idea/gradle.xml
 23 | # .idea/libraries
 24 | 
 25 | # Mongo Explorer plugin:
 26 | # .idea/mongoSettings.xml
 27 | 
 28 | ## File-based project format:
 29 | *.ipr
 30 | *.iws
 31 | 
 32 | ## Plugin-specific files:
 33 | 
 34 | # IntelliJ
 35 | /out/
 36 | 
 37 | # mpeltonen/sbt-idea plugin
 38 | .idea_modules/
 39 | 
 40 | # JIRA plugin
 41 | atlassian-ide-plugin.xml
 42 | 
 43 | # Crashlytics plugin (for Android Studio and IntelliJ)
 44 | com_crashlytics_export_strings.xml
 45 | crashlytics.properties
 46 | crashlytics-build.properties
 47 | 
 48 | # OSX
 49 | .DS_Store
 50 | .AppleDouble
 51 | .LSOverride
 52 | 
 53 | # Icon must end with two \r
 54 | Icon
 55 | 
 56 | 
 57 | # Thumbnails
 58 | ._*
 59 | 
 60 | # Files that might appear in the root of a volume
 61 | .DocumentRevisions-V100
 62 | .fseventsd
 63 | .Spotlight-V100
 64 | .TemporaryItems
 65 | .Trashes
 66 | .VolumeIcon.icns
 67 | 
 68 | # Directories potentially created on remote AFP share
 69 | .AppleDB
 70 | .AppleDesktop
 71 | Network Trash Folder
 72 | Temporary Items
 73 | .apdisk
 74 | 
 75 | #Python
 76 | # Byte-compiled / optimized / DLL files
 77 | __pycache__/
 78 | *.py[cod]
 79 | *$py.class
 80 | 
 81 | # C extensions
 82 | *.so
 83 | 
 84 | # Distribution / packaging
 85 | .Python
 86 | env/
 87 | build/
 88 | develop-eggs/
 89 | dist/
 90 | downloads/
 91 | eggs/
 92 | .eggs/
 93 | lib/
 94 | lib64/
 95 | parts/
 96 | sdist/
 97 | var/
 98 | *.egg-info/
 99 | .installed.cfg
100 | *.egg
101 | 
102 | # PyInstaller
103 | #  Usually these files are written by a python script from a template
104 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
105 | *.manifest
106 | *.spec
107 | 
108 | # Installer logs
109 | pip-log.txt
110 | pip-delete-this-directory.txt
111 | 
112 | # Unit test / coverage reports
113 | htmlcov/
114 | .tox/
115 | .coverage
116 | .coverage.*
117 | .cache
118 | nosetests.xml
119 | coverage.xml
120 | *,cover
121 | 
122 | # Translations
123 | *.mo
124 | *.pot
125 | 
126 | # Django stuff:
127 | *.log
128 | 
129 | # Sphinx documentation
130 | docs/_build/
131 | 
132 | # PyBuilder
133 | target/
134 | 
135 | 
136 | #Vi
137 | [._]*.s[a-w][a-z]
138 | [._]s[a-w][a-z]
139 | *.un~
140 | Session.vim
141 | .netrwhist
142 | *~
143 | 
144 | frankenstein.cfg
145 | *.txt
146 | *.csv
147 | *.sqlite*
148 | 


--------------------------------------------------------------------------------
/JSAnalysis.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | try:
 15 |     import PyV8
 16 | except ImportError as e:
 17 |     print str(e)
 18 |     PyV8 = None
 19 | 
 20 | import re
 21 | 
 22 | import build_pdf_objects
 23 | from util.str_utils import unescapeHTMLEntities
 24 | 
 25 | reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
 26 | 
 27 | def create_objs(context, tree):
 28 |     """
 29 | 
 30 |     Mimic native Adobe objects and add them to the context
 31 |     :param context: JavaScript context, like a namespace at runtime
 32 |     :param tree: XML tree of the pdf to reference objects
 33 |     :return:
 34 |     """
 35 |     try:
 36 |         app = build_pdf_objects.create_app_obj(tree)
 37 |         context.eval("app = " + str(app) + ";")
 38 |         context.eval("app.doc.syncAnnotScan = function () {}")
 39 |         context.eval("app.doc.getAnnots = function () { return app.doc.annots;}")
 40 |         context.eval("app.eval = function (string) { eval(string);}")
 41 |         context.eval("app.newDoc = function () { return '';}")
 42 |         context.eval("app.getString = function () { ret = \"\"; for(var prop in app){ ret += app[prop]; } return ret;}")
 43 |     except Exception as e:
 44 |         # print "App: " + e.message
 45 |         pass
 46 |     try:
 47 |         info = build_pdf_objects.create_info_obj(tree)
 48 |         context.eval("this.info = " + str(info) + ";")
 49 |         for key in info:
 50 |             context.eval("this." + key + "= '" + re.escape(info[key]) + "';")
 51 |         context.eval("this.eval = eval")
 52 |         # print info
 53 |     except Exception as e:
 54 |         print "Info: " + e.message
 55 |         pass
 56 |     try:
 57 |         event = build_pdf_objects.create_event_obj(tree)
 58 |         context.eval("event = " + str(event) + ";")
 59 |         context.eval("event.target.info = this.info")
 60 |     except Exception as e:
 61 |         # print "Event: " + e.message
 62 |         pass
 63 | 
 64 | 
 65 | def eval_loop(code, context, old_msg="", limit=10):
 66 |     """
 67 | 
 68 |     Eval the code and handle any exceptions it throws
 69 |     :param code: String of code to evaluate
 70 |     :param context: JavaScript context object
 71 |     :param old_msg:
 72 |     :param limit: Recursive limit
 73 |     :return:
 74 |     """
 75 |     try:
 76 |         context.eval(code)
 77 |         return context.eval("evalCode")
 78 |     # catch exceptions and attempt to fix them
 79 |     except ReferenceError as e:
 80 |         # print e.message
 81 |         if e.message == old_msg:
 82 |             return context.eval("evalCode")
 83 |         elif e.message.find('$') > -1:
 84 |             context.eval("$ = this;")
 85 |         else:
 86 |             # try commenting out line
 87 |             line_num = re.findall("@\s(\d*?)\s", e.message)
 88 |             line_num = int(line_num[0])
 89 |             i = 0
 90 |             for item in code.split("\n"):
 91 |                 i += 1
 92 |                 if i == line_num:
 93 |                     code = re.sub(item, "//" + item, code)
 94 |                     break
 95 |         return eval_loop(code, context, e.message)
 96 |     except TypeError as te:
 97 |         # print te.message
 98 |         if te.message == old_msg:
 99 |             return context.eval("evalCode")
100 |         elif te.message.find("called on null or undefined") > -1:
101 |             # in Adobe undefined objects become app object
102 |             line = re.findall("->\s(.*)", te.message)
103 |             sub, count = re.subn("=\s?.\(.*?\)", "=app", line[0])
104 |             if count < 1:
105 |                 sub = re.sub("=.*", "=app", line[0])
106 |             line = re.escape(line[0])
107 |             code = re.sub(line, sub, code)
108 |         elif te.message.find("undefined is not a function") > -1:
109 |             # sub in eval as a guess
110 |             line = re.findall("->\s(.*)", te.message)
111 |             match = re.findall("[\s=]?(.*?)\(", line[0])
112 |             if len(match) > 0:
113 |                 sub = re.sub(match[0], "eval", line[0])
114 |                 line = re.escape(line[0])
115 |                 code = re.sub(line, sub, code)
116 |             else:
117 |                 return context.eval("evalCode")
118 |         elif te.message.find("Cannot read property") > -1:
119 |             # undefined becomes app
120 |             line = re.findall("->\s(.*)", te.message)
121 |             match = re.findall("[=\s](.*?)\[", line[0])
122 |             if len(match) > 0:
123 |                 sub = re.sub(match[0], "app", line[0])
124 |                 line = re.escape(line[0])
125 |                 code = re.sub(line, sub, code)
126 |             else:
127 |                 return context.eval("evalCode")
128 |         else:
129 |             return context.eval("evalCode")
130 |         return eval_loop(code, context, te.message)
131 |     except SyntaxError as se:
132 |         # print se.message
133 |         if se.message == old_msg:
134 |             return context.eval("evalCode")
135 |         line_num = re.findall("@\s(\d*?)\s", se.message)
136 |         if len(line_num) > 0:
137 |             line_num = int(line_num[0])
138 |             i = 0
139 |             # try commenting out the line number with the error
140 |             for item in code.split("\n"):
141 |                 i += 1
142 |                 if i == line_num:
143 |                     esc_item = re.escape(item)
144 |                     code, n = re.subn(esc_item, "//" + item, code)
145 |                     break
146 |         else:
147 |             return context.eval('evalCode')
148 |         return eval_loop(code, context, se.message)
149 |     except Exception as e1:
150 |         # print e1.message
151 |         return context.eval("evalCode")
152 | 
153 | 
def analyse(js, tree):
    """
    Main function called from pdfrankenstein. Analyzes javascript in order to deobfuscate the code.

    ``eval`` is replaced inside the JavaScript context by a function that only
    appends its argument to the ``evalCode`` variable; the content of that
    variable is what gets returned. Without PyV8 an empty string is returned.

    :param js: String of code to analyze
    :param tree: Tree xml object to use as reference for objects called from the code.
    :return: String of deobfuscated code ('' on failure or when nothing was collected)
    """
    if not PyV8:
        return ''
    with PyV8.JSIsolate():
        context = PyV8.JSContext()
        context.enter()
        # install the eval override that records code instead of running it
        bootstrap = (
            'evalCode = \'\';',
            'evalOverride = function (expression) { evalCode += expression; return;}',
            'eval=evalOverride',
        )
        for statement in bootstrap:
            context.eval(statement)
        try:
            if tree is not None:
                create_objs(context, tree)
            collected = eval_loop(js, context)
            context.leave()
            return '' if collected == None else collected
        except Exception:
            context.leave()
            return ''
183 | 
184 | 
def isJavascript(content):
    """
    Heuristically decide whether a string contains Javascript code.

    The content is accepted at once when it holds a ``<script>`` element
    matching ``reJSscript``. Otherwise it is rejected when it contains a
    character outside the printable ASCII range (a few whitespace/control
    characters excepted) or lacks any of ';', '(' and ')'. Finally the typical
    Javascript strings are counted: more than 15 hits spread over at least 5
    distinct strings are needed.

    :param content: A string
    :return: A boolean, True if it seems to contain Javascript code or False in the other case
    """
    js_tokens = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval', 'unescape', '.replace']
    required_tokens = [';', '(', ')']
    allowed_controls = ['\n', '\r', '\t', '\f', '\x00']
    hit_threshold = 15
    distinct_threshold = 5

    content = unescapeHTMLEntities(content)
    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True

    for ch in content:
        code_point = ord(ch)
        if code_point >= 127 or (code_point < 32 and ch not in allowed_controls):
            return False

    hits = dict((token, content.count(token)) for token in js_tokens)
    for token in required_tokens:
        if hits[token] == 0:
            return False

    total_hits = sum(hits.values())
    distinct_hits = len([token for token in js_tokens if hits[token] > 0])
    return total_hits > hit_threshold and distinct_hits >= distinct_threshold
218 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Use of PDFrankenstein and related source code is subject to the terms
 2 | of the following licenses:
 3 | 
 4 | GNU General Public License (GPL) Rights pursuant to Version 2, June 1991
 5 | Government Purpose License Rights (GPLR) pursuant to DFARS 252.227.7013
 6 | 
 7 | NO WARRANTY
 8 | 
 9 | ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER
10 | PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY
11 | PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN
12 | "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY
13 | KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT
14 | LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE,
15 | MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE
16 | OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT,
17 | SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY
18 | TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF
19 | WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES.
20 | LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF
21 | CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON
22 | CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE
23 | DELIVERABLES UNDER THIS LICENSE.
24 | 
25 | Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie
26 | Mellon University, its trustees, officers, employees, and agents from
27 | all claims or demands made against them (and any related losses,
28 | expenses, or attorney's fees) arising out of, or relating to Licensee's
29 | and/or its sub licensees' negligent use or willful misuse of or
30 | negligent conduct or willful misconduct regarding the Software,
31 | facilities, or other rights or assistance granted by Carnegie Mellon
32 | University under this License, including, but not limited to, any
33 | claims of product liability, personal injury, death, damage to
34 | property, or violation of any laws or regulations.
35 | 
36 | Carnegie Mellon University Software Engineering Institute authored
37 | documents are sponsored by the U.S. Department of Defense under
38 | Contract FA8721-05-C-0003. Carnegie Mellon University retains
39 | copyrights in all material produced under this contract. The U.S.
40 | Government retains a non-exclusive, royalty-free license to publish or
41 | reproduce these documents, or allow others to do so, for U.S.
42 | Government purposes only pursuant to the copyright license under the
43 | contract clause at 252.227.7013.
44 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | PDFrankenstein
 2 | ================
 3 | Python tool for bulk malicious PDF feature extraction.
 4 | 
 5 | Dependencies
 6 | ------------
  7 | * PyV8 (and V8) (optional: only needed if you intend to use JS deobfuscation). Note: JS deobfuscation needs to be run in a safe environment, as you would treat any malware.
 8 | * lxml
 9 | * [scandir](https://github.com/benhoyt/scandir) (optional: module included in lib folder)
10 | * postgresql and psycopg2 (optional: if you intend to use postgresql backing storage)
11 | 
12 | 
13 | Usage
14 | -----
15 | 
16 | ```
17 | $ pdfrankenstein.py --help
18 | ```
19 | 
20 | Output to a file in delimited plain text, parses ALL files in pdf-dir/
21 | ```
22 | $ pdfrankenstein.py -o file -n fileoutput.txt ~/pdf-dir
23 | ```
24 | 
25 | Output to an sqlite database 
26 | ```
27 | $ pdfrankenstein.py -o sqlite3 -n pdf-db ~/pdf-dir
28 | ```
29 | 
30 | Output to stdout after parsing all files listed inside file-with-pdfs
31 | ```
 32 | $ pdfrankenstein.py -o stdout ~/file-with-pdfs
33 | ```
34 | 
35 | 
36 | <table>
37 | <tr>
38 |   <td>pdf_in </td>
39 |   <td>PDF input for analysis. Can be a single PDF file or a directory of files.</td>
40 | </tr>
41 | <tr>
42 |   <td>-d, --debug</td>
43 |   <td>Print debugging messages.</td>
44 | </tr>
45 | <tr>
46 |   <td>-o, --out</td>
 47 |   <td>Analysis output filename or type. Defaults to 'unnamed-out.*' file in CWD. Options: 'sqlite3'||'postgres'||'stdout'||[filename]</td>
48 | </tr>
49 | <tr>
50 |   <td>-n, --name</td><td>Name for output database.</td>
51 | </tr>
52 | <tr>
53 |   <td>--hasher</td><td>Specify which type of hasher to use. PeePDF | PDFMiner (default). PDFMiner option provides better parsing capabilities.</td>
54 | </tr>
55 | <tr>
56 |   <td>-v, --verbose</td><td>Spam the terminal, TODO.</td>
57 | </tr>
58 | </table>
59 | 
60 | References
61 | -------------
62 | ### Open Source PDF Tools
63 | * [PeePDF](http://eternal-todo.com/tools/peepdf-pdf-analysis-tool)
64 | * [PDFMiner](http://www.unixuser.org/~euske/python/pdfminer/index.html)
65 | * [swf mastah](https://github.com/9b/pdfxray_public/blob/master/builder/swf_mastah.py)
66 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/__init__.py


--------------------------------------------------------------------------------
/build_pdf_objects.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import re
 15 | 
 16 | from util.str_utils import unescapeHTMLEntities
 17 | 
 18 | # Determine the type of tag used and return its value accordingly
 19 | def get_value(elem, root):
 20 |     if elem.tag == "literal" or elem.tag == "number" or elem.tag == "keyword":
 21 |         return unescapeHTMLEntities(elem.text)
 22 |     elif elem.tag == "string":
 23 |         return unescapeHTMLEntities(elem.text.decode('base64'))
 24 |     elif elem.tag == "ref":
 25 |         # find the referenced object and return its value
 26 |         obj = get_ref_object(elem.get('id'), root)
 27 |         return get_value(obj[0], root)
 28 |     elif elem.tag == "stream":
 29 |         return unescapeHTMLEntities(elem[1].text.decode('base64'))
 30 |     elif elem.tag == "dict":
 31 |         # build the dictionary
 32 |         ret = {}
 33 |         size = elem.get("size")
 34 |         size = re.sub("%", "", size)
 35 |         dict_elems = elem.getchildren()
 36 |         for i in range(int(size)):
 37 |             val = get_value(dict_elems[i][0], root)
 38 |             if val is not None:
 39 |                 ret[dict_elems[i].tag] = val
 40 |     elif elem.tag == "list":
 41 |         # build the list
 42 |         ret = []
 43 |         size = elem.get("size")
 44 |         size = re.sub("%", "", size)
 45 |         list_elems = elem.getchildren()
 46 |         for i in range(int(size)):
 47 |             val = get_value(list_elems[i], root)
 48 |             if val is not None:
 49 |                 ret.append(val)
 50 |     else:
 51 |         # some tags not accounted for: Rect, field, xfa, Media, etc
 52 |         ret = None
 53 |     return ret
 54 | 
 55 | 
 56 | # find the object referenced in another object
 57 | def get_ref_object(id, root):
 58 |     for obj in root.iterfind(".//object"):
 59 |         if obj.get("id") == id:
 60 |             return obj
 61 |     else:
 62 |         return None
 63 | 
 64 | 
 65 | # Get any annotation objects in the PDF and store in the app object
 66 | def get_annots(app, root):
 67 |     for annot in root.iterfind(".//Annots"):
 68 |         annot_list = annot[0]
 69 |         for ref in annot_list:
 70 |             id = ref.get("id")
 71 |             obj = get_ref_object(id, root)
 72 |             new = get_value(obj[0], root)
 73 |             if new is not None:
 74 |                 new["subject"] = new.pop("Subj")
 75 |                 app['doc']['annots'].append(new)
 76 | 
 77 | 
 78 | # Mimic the Adobe event object by parsing the PDF for commonly found attributes
 79 | def create_event_obj(tree):
 80 |     event_attrs = ["author", "calculate", "creator", "creationDate", "delay", "dirty", "external", "filesize",
 81 |                    "keywords", "modDate", "numFields", "numPages", "numTemplates", "path", "pageNum", "producer",
 82 |                    "subject", "title", "zoom", "zoomType"]
 83 |     event = {}
 84 |     event["target"] = {}
 85 |     for item in event_attrs:
 86 |         for elem in tree.iterfind('.//' + item[0].upper() + item[1:]):
 87 |             val = get_value(elem[0], tree)
 88 |             if val is not None:
 89 |                 event["target"][item] = val
 90 |     # print event
 91 |     return event
 92 | 
 93 | 
 94 | # Mimic the Adobe app object by parsing the PDF for commonly found attributes
 95 | def create_app_obj(tree):
 96 |     app = {}
 97 |     app_attrs = ["calculate", "formsVersion", "fullscreen", "language", "numPlugins", "openInPlace", "platform",
 98 |                  "toolbar", "toolbarHorizontal", "toolbarVertical"]
 99 |     doc = {}
100 |     for item in app_attrs:
101 |         for elem in tree.iterfind('.//' + item[0].upper() + item[1:]):
102 |             val = get_value(elem[0], tree)
103 |             if val is not None:
104 |                 doc[item] = val
105 |     app['doc'] = doc;
106 | 
107 |     # Many app values are dependent on the reader
108 |     # set some common defaults here
109 |     app['doc']['viewerType'] = 'Reader'
110 |     app['viewerType'] = 'Reader'
111 |     app['viewerVersion'] = 5.0
112 |     app['plugIns'] = [{'version': 6.0}, {'version': 7.5}, {'version': 8.7}, {'version': 9.1}, {'version': 10}]
113 |     if not 'language' in app.keys():
114 |         app['language'] = "ENU"
115 |     if not 'platform' in app.keys():
116 |         app['platform'] = "WIN"
117 | 
118 |     # store the annotation objects so they can be retrieved later
119 |     app['doc']['annots'] = []
120 |     get_annots(app, tree)
121 |     # print app
122 |     return app
123 | 
124 | 
125 | # Mimic the Adobe info object by parsing the PDF for commonly found attributes
def create_info_obj(tree):
    """
    Mimic the Adobe info object from attributes found in the PDF.

    Each attribute name is looked up in the tree with its first letter
    upper-cased; the value of the first child of every match is stored under
    the attribute name (a later match overwrites an earlier one).

    :param tree: XML tree of the pdf
    :return: Dictionary mapping the attribute names that were found to their values
    """
    attr_names = ["author", "creator", "creationDate", "Date", "keywords", "modDate", "producer", "subject", "title",
                  "trapped"]
    collected = {}
    for attr in attr_names:
        xml_tag = attr[0].upper() + attr[1:]
        for node in tree.iterfind('.//' + xml_tag):
            value = get_value(node[0], tree)
            if value is not None:
                collected[attr] = value
    return collected
137 | 


--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | # 
 3 | #  NO WARRANTY
 4 | # 
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | import os
15 | import sys
16 | from ConfigParser import SafeConfigParser
17 | 
18 | DEFAULT_CFG = 'frankenstein.cfg'
19 | 
20 | 
class Config(object):
    """
    Thin wrapper around a SafeConfigParser holding the tool's settings.

    When the configuration file cannot be read, a new one with default values
    is written to the current working directory and the process exits so the
    defaults can be reviewed.
    """

    def __init__(self, path='', name=''):
        """
        Read the configuration file.

        :param path: Directory holding the configuration file
        :param name: File name; DEFAULT_CFG is used when empty
        """
        cfg_file = os.path.join(path, name if name else DEFAULT_CFG)
        self.parser = SafeConfigParser()
        loaded = self.parser.read(cfg_file)
        if not loaded:
            print('No configuration file found: %s' % cfg_file)
            self.new_cfg()

    def new_cfg(self):
        """Write a default configuration to DEFAULT_CFG in the CWD, then exit with status 0."""
        self.section_gen()
        self.section_db()
        with open(DEFAULT_CFG, 'w') as handle:
            print('Creating new config file in CWD: %s' % DEFAULT_CFG)
            print('Please double check the default values before running again:')
            print(self)
            self.parser.write(handle)
        sys.exit(0)

    def section_gen(self):
        """Add the 'general' section with its default options."""
        section = 'general'
        self.parser.add_section(section)
        # the '#output' option is written out as "#output = sqlite3"
        for option, value in (('#output', 'sqlite3'), ('output', 'stdout')):
            self.parser.set(section, option, value)

    def section_db(self):
        """Add the 'database' section with its default options."""
        section = 'database'
        self.parser.add_section(section)
        defaults = (
            ('path', os.getcwd()),
            ('user', 'frankenstein'),
            ('pw', 'PuttinOnTheRitz'),
            ('db', 'frankenstein.sqlite'),
        )
        for option, value in defaults:
            self.parser.set(section, option, value)

    def setting(self, section='', option=''):
        """
        Look up an option.

        :param section: Section to read from; when empty, the sections are
                        searched in order and the first one holding the option wins
        :param option: Name of the option
        :return: The option's value, or None when it is not found
        """
        if section:
            if self.parser.has_option(section, option):
                return self.parser.get(section, option)
            return None
        for candidate in self.parser.sections():
            if self.parser.has_option(candidate, option):
                return self.parser.get(candidate, option)
        return None

    def __str__(self):
        """Return a listing of every section followed by its tab-separated options."""
        chunks = []
        for sect in self.parser.sections():
            chunks.append('Section: %s\n' % sect)
            for opt in self.parser.options(sect):
                chunks.append('\t%s\t=\t%s\n' % (opt, self.parser.get(sect, opt)))
        return ''.join(chunks)
73 | 
74 | 
if __name__ == '__main__':
    # Run as a script: load (or create) the configuration and print it.
    cfg = Config()
    print(cfg)
78 | 


--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/db/__init__.py


--------------------------------------------------------------------------------
/db_mgmt.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import os
 15 | import sys
 16 | import sqlite3
 17 | 
 18 | import cfg
 19 | 
 20 | 
 21 | class DBGateway(object):
 22 |     def __init__(self, db='', path=''):
 23 |         self.error = ''
 24 |         self.cfg = cfg.Config()
 25 |         
 26 |         if not db:
 27 |             self.db_dir = self.cfg.setting('database', 'path')
 28 |             self.db_name = self.cfg.setting('database', 'db')
 29 |         elif db is 'test':
 30 |             self.db_dir = os.getcwd()
 31 |             self.db_name = 'testdb.sqlite'
 32 |         else:
 33 |             if not path:
 34 |                 self.db_dir = self.cfg.setting('database', 'path')
 35 |             else:
 36 |                 self.db_dir = path
 37 |             self.db_name = db
 38 | 
 39 |         if not self.db_dir or not (os.path.isdir(self.db_dir)) or not self.db_name:
 40 |             sys.stderr.write("GError in database path or name. Check frankenstein.cfg file\n")
 41 |             sys.exit(1)
 42 | 
 43 |         self.db_path = os.path.join(self.db_dir, self.db_name)
 44 |         print('DBGateway connecting: %s' % self.db_path)
 45 |         self.connect(self.db_path)
 46 | 
 47 |     def query(self, cmd, params=''):
 48 |         try:
 49 |             if params:
 50 |                 self.db_curr.execute(cmd, params)
 51 |             else:
 52 |                 self.db_curr.execute(cmd)
 53 |             self.commit()
 54 |             return True
 55 |         except Exception as e:
 56 |             self.error = str(e)
 57 |             return False
 58 | 
 59 |     def queryblock(self, cmd, params='', n=30):
 60 |         done = False
 61 |         tries = 0
 62 |         while not done and tries < n:
 63 |             tries += 1
 64 |             try:
 65 |                 if params:
 66 |                     self.db_curr.execute(cmd, params)
 67 |                 else:
 68 |                     self.db_curr.execute(cmd)
 69 |             except Exception as e:
 70 |                 self.error = str(e)
 71 |             else:
 72 |                 done = True
 73 |         return done
 74 | 
 75 |     def get_error(self):
 76 |         err = self.error
 77 |         self.error = ''
 78 |         return err
 79 | 
 80 |     def attach(self, db_name):
 81 |         db = "'" + os.path.join(config.SETTINGS.get('DB_DIR'), db_name) + "'"
 82 |         self.db_curr.execute('ATTACH DATABASE ' + db + ' AS ' + db_name)
 83 |         self.db_conn.commit()
 84 | 
 85 |     def has_table(self, table):
 86 |         cmd = "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='%s'" % table
 87 |         if self.query(cmd):
 88 |             return self.db_curr.fetchone()[0]
 89 | 
 90 |     def create_table(self, table, **kwargs):
 91 |         try:
 92 |             kwargs = self.format_args(**kwargs)
 93 |             cmd = 'CREATE TABLE IF NOT EXISTS ' + table
 94 |             if kwargs.get('select'):
 95 |                 cmd += ' AS SELECT ' + kwargs.get('select') + ' FROM ' + kwargs.get('from') + ' WHERE ' + kwargs.get(
 96 |                     'where') + '=' + kwargs.get('is')
 97 |             else:
 98 |                 cmd += ' (' + kwargs.get('cols') + ', PRIMARY KEY(' + kwargs.get('primary') + '))'
 99 |         except TypeError as e:
100 |             print 'Invalid arguments passed to database gateway:', kwargs
101 |             raise e
102 |         else:
103 |             try:
104 |                 self.db_curr.execute(cmd)
105 |             except sqlite3.OperationalError as error:
106 |                 print 'Invalid operation in database gateway:', error
107 |                 print 'Occurred during cmd:', cmd
108 |                 raise error
109 |             else:
110 |                 self.db_conn.commit()
111 |                 # self.dump()
112 | 
113 |     def connect(self, path):
114 |         try:
115 |             self.db_conn = sqlite3.connect(path, 30)
116 |         except Exception as e:
117 |             sys.stderr.write("DBGateway connect: %s\n" % e)
118 |             return None
119 |         self.db_conn.text_factory = str
120 |         self.db_conn.row_factory = sqlite3.Row
121 |         self.db_curr = self.db_conn.cursor()
122 | 
123 |     def commit(self):
124 |         self.db_conn.commit()
125 | 
126 |     def disconnect(self):
127 |         self.commit()
128 |         self.db_conn.close()
129 | 
130 |     def drop_tables(self):
131 |         self.db_curr.execute("SELECT name FROM sqlite_master WHERE type='table'")
132 |         for row in self.db_curr.fetchall():
133 |             self.drop(row[0])
134 | 
135 |     def drop(self, name):
136 |         self.db_curr.execute("DROP TABLE IF EXISTS " + name)
137 |         self.db_conn.commit()
138 | 
139 |     def format_args(self, **kwargs):
140 |         if isinstance(kwargs.get('primary'), (tuple, list)):
141 |             kwargs['primary'] = ', '.join(kwargs['primary'])
142 |         if isinstance(kwargs.get('cols'), (tuple, list)):
143 |             kwargs['subs'] = ', '.join(['?' for arg in kwargs['cols']])
144 |             kwargs['cols'] = ', '.join(kwargs['cols'])
145 |         else:
146 |             kwargs['subs'] = '?'
147 |         return kwargs
148 | 
149 |     def insert(self, table, **kwargs):
150 |         kwargs = self.format_args(**kwargs)
151 |         cmd = 'INSERT OR REPLACE INTO ' + table + '(' + kwargs.get('cols') + ') VALUES (' + kwargs.get('subs') + ')'
152 |         try:
153 |             self.db_curr.execute(cmd, kwargs.get('vals'))
154 |             self.db_conn.commit()
155 |         except Exception as e:
156 |             self.error = repr(e)
157 |             return False
158 |         else:
159 |             return True
160 | 
161 |     def select(self, cmd_str):
162 |         cmd = 'SELECT %s' % cmd_str
163 |         self.db_curr.execute(cmd)
164 |         return self.db_curr
165 | 
166 |     def count(self, table, key, val):
167 |         cmd = "SELECT COUNT (*) FROM %s WHERE %s is '%s'" % (table, key, val)
168 |         self.db_curr.execute(cmd)
169 |         return self.db_curr.fetchone()[0]
170 | 
171 |     def update(self, dic):
172 |         cmd = "UPDATE {tbl} SET {col} ='{val}' WHERE {key} ='{kval}'".format(**dic)
173 |         print cmd
174 |         try:
175 |             # self.db_curr.execute(cmd, dic)
176 |             self.db_curr.execute(cmd)
177 |             self.db_conn.commit()
178 |         except Exception as e:
179 |             self.error = str(e)
180 |             return False
181 |         else:
182 |             return True
183 | 
184 |     def delete(self, *ids):
185 |         pass
186 | 
187 |     def dump(self, n=0):
188 |         print ':MEMORY DB DUMP:'
189 |         cnt = 0
190 |         for val in self.db_conn.iterdump():
191 |             cnt += 1
192 |             if 0 < n <= cnt:
193 |                 break
194 |             print val
195 |         print ':MEMORY DB DUMP END:'
196 | 
197 | 


--------------------------------------------------------------------------------
/huntterp.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import sys
 15 | import re
 16 | 
 17 | '''
 18 | For testing run the module without arguments. (Can also be run on arbitrary files.)
 19 | 
 20 | '''
 21 | 
 22 | 
 23 | class Test(object):
 24 |     tests = ['ftp', 'http']
 25 |     ftp = "6674703a2f2f676f6f676c652e636f6d"
 26 |     http = "6674703a2f2f676f6f676c652e636f6d687474703a2f2f676f6f676c652e636f6df1"
 27 | 
 28 | 
 29 | '''
 30 | This function makes no assumptions on the validity of the string values
 31 | '''
 32 | 
 33 | 
 34 | def ascii2hex(string):
 35 |     if isinstance(string, str):
 36 |         return ''.join([hex(ord(c))[2:] for c in string])
 37 |     else:
 38 |         return ''
 39 | 
 40 | 
 41 | '''
 42 | Convert a string from hex to ascii. Starting from the first position, and
 43 | stopping on the first invalid (not-printable) character or invalid input,
 44 | whichever comes first.
 45 | '''
 46 | 
 47 | 
 48 | def hex2ascii(string):
 49 |     letters = ''
 50 |     for idx in range(0, len(string), 2):
 51 |         try:
 52 |             c1 = string[idx]
 53 |             c2 = string[idx + 1]
 54 |             i = int(c1 + c2, 16)
 55 |             if i < 32 or i > 127:
 56 |                 break
 57 |             ch = chr(i)
 58 |         except (ValueError, TypeError, IndexError):
 59 |             break
 60 |         else:
 61 |             letters += ch
 62 |     return letters
 63 | 
 64 | 
 65 | def get_unicode(h2):
 66 |     res = []
 67 |     res = re.findall('[\'\"]((%u[0-9a-f]{4})*)[\'\"]', h2)
 68 |     return res
 69 | 
 70 | 
 71 | '''
 72 | Return a list of strings found in the hexstring. Should not return overlapping
 73 | results. Needle is converted from ASCII to HEX on the first line.
 74 | '''
 75 | 
 76 | 
 77 | def find_in_hex(needle, hexstack):
 78 |     needle = ascii2hex(needle)
 79 |     results = []
 80 |     total = 0
 81 |     while True:
 82 |         idx = hexstack.find(needle)
 83 |         if idx < 0:
 84 |             break
 85 |         total += idx
 86 |         results.append((total, hex2ascii(hexstack[idx:])))
 87 |         hexstack = hexstack[idx + 1:]
 88 |         total += 1
 89 |     return results
 90 | 
 91 | 
 92 | def verify(vals, string):
 93 |     for val in vals:
 94 |         sys.stdout.write('Verifying [%s] @ [%d]...' % (val[1], val[0]))
 95 |         if string[val[0]:len(val[1])].startswith(hex2ascii(val[1])):
 96 |             sys.stdout.write('pass\n')
 97 |         else:
 98 |             sys.stdout.write('fail. string[%d]==[%s]...\n' % (val[0], val[1][val[0]:val[0] + 32]))
 99 | 
100 | 
101 | '''
102 | Return a list of urls found in the unicode string. Should not return overlapping
103 | results. Needle is converted from ASCII to UNICODE on the first line.
104 | '''
105 | 
106 | 
def find_unicode(needle, haystack):
    """Locate every ``%uXXXX``-encoded occurrence of ``needle`` in ``haystack``.

    The needle is converted with ``ascii2uni``.  Each hit is taken to run
    up to the nearest following quote character (``"`` or ``'``), or to the
    end of ``haystack`` when no quote follows, and is decoded back to ASCII
    with ``uni2ascii``.

    :return: list of ``(offset, decoded_text)`` tuples in ascending offset
        order; empty when the needle converts to nothing (fewer than two
        characters, or not a ``str``), which previously looped forever.
    """
    needle = ascii2uni(needle)
    if not needle:
        return []
    results = []
    idx = haystack.find(needle)
    while idx >= 0:
        # Nearest closing quote of either kind; a missing quote (-1) must not
        # win the comparison, which it did when only a single quote followed.
        quotes = [pos for pos in (haystack.find('"', idx), haystack.find('\'', idx)) if pos >= 0]
        end = min(quotes) if quotes else len(haystack)
        results.append((idx, uni2ascii(haystack[idx:end])))
        idx = haystack.find(needle, idx + 1)
    return results
129 | 
130 | 
131 | '''
132 |     Convert a string from ascii to unicode
133 | '''
134 | 
135 | 
def ascii2uni(string):
    """Encode ``string`` as a run of ``%uXXXX`` escapes.

    The text is hex-encoded with ``ascii2hex`` and consumed four hex digits
    at a time; each group of two bytes is emitted with the bytes swapped.
    Hex digits left over after the last complete group are dropped.
    """
    hexed = ascii2hex(string)
    byte_pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', hexed)
    return ''.join('%u' + second + first for first, second in byte_pairs)
143 | 
144 | 
145 | '''
146 |     Convert a string from unicode to ascii
147 | '''
148 | 
149 | 
def uni2ascii(string):
    """Decode a run of ``%uXXXX`` escapes back to printable ASCII.

    The ``%u`` markers are removed, each remaining group of four lowercase
    hex digits has its two bytes swapped back, and the resulting hex string
    is decoded with ``hex2ascii``.
    """
    digits = string.replace('%u', '')
    byte_pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', digits)
    swapped = ''.join(second + first for first, second in byte_pairs)
    return hex2ascii(swapped)
157 | 
158 | 
159 | '''
160 | Find h1 in h2 | h1 == ASCII && h2 == HEX
161 | '''
162 | 
163 | 
def main(h1, h2):
    """Search ``h2`` for the ASCII needle ``h1`` and print what was found.

    Both the plain-hex and the ``%u`` encodings of the needle are searched
    for.  If ``h2`` is not a ``str`` a diagnostic is printed and nothing is
    searched.  Output uses single-argument ``print(...)`` calls so the text
    is the same under Python 2 and Python 3.
    """
    if not isinstance(h2, str):
        print('Invalid input: %s' % (type(h2),))
        print(str(h2))
        return

    print('Searching for "%s" in "%s"...' % (h1, h2[:32]))

    urls = find_in_hex(h1, h2)
    urls += find_unicode(h1, h2)
    print(urls)
    print('Found: %d occurrences' % len(urls))
    if urls:
        verify(urls, h2)
178 | 
179 | 
if __name__ == "__main__":
    # Usage: huntterp.py needle haystack.txt
    # Without both arguments the built-in Test fixtures are searched instead.
    try:
        needle = sys.argv[1]
        fin = open(sys.argv[2], 'r')
    except IndexError:
        print('Invalid or no arguments. Usage: huntterp.py needle haystack.txt')
        print('Beginning tests')
        t = Test()
        for needle in t.tests:
            haystack = getattr(t, needle)
            main(needle, haystack)
    except IOError as e:
        print(e)
    else:
        # Close the haystack file once it has been read.
        with fin:
            main(needle, fin.read())
195 | 


--------------------------------------------------------------------------------
/jobs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/jobs/__init__.py


--------------------------------------------------------------------------------
/pdfminer/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2004-2016  Yusuke Shinyama <yusuke at shinyama dot jp>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person
 4 | obtaining a copy of this software and associated documentation
 5 | files (the "Software"), to deal in the Software without
 6 | restriction, including without limitation the rights to use,
 7 | copy, modify, merge, publish, distribute, sublicense, and/or
 8 | sell copies of the Software, and to permit persons to whom the
 9 | Software is furnished to do so, subject to the following
10 | conditions:
11 | 
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
16 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
17 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
18 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/pdfminer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/pdfminer/__init__.py


--------------------------------------------------------------------------------
/pdfminer/arcfour.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ Python implementation of Arcfour encryption algorithm.
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | 
10 | ##  Arcfour
11 | ##
class Arcfour(object):
    """Arcfour (RC4) stream cipher over byte strings.

    The cipher is symmetric: ``encrypt`` and ``decrypt`` are the same
    operation, and the key-stream position is kept between calls so a
    message may be processed in several chunks.

    >>> import binascii
    >>> binascii.hexlify(Arcfour(b'Key').process(b'Plaintext')) == b'bbf316e8d940af0ad3'
    True
    >>> binascii.hexlify(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
    True
    >>> binascii.hexlify(Arcfour(b'Secret').process(b'Attack at dawn')) == b'45a01f645fc35b383552544b9bf5'
    True
    """

    def __init__(self, key):
        """Run the key schedule for ``key`` (a non-empty byte string)."""
        # bytearray yields integers on both Python 2 and Python 3.
        key = bytearray(key)
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)

    def process(self, data):
        """XOR ``data`` with the next key-stream bytes and return the result.

        :param data: byte string to encrypt or decrypt.
        :return: byte string of the same length.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        out = bytearray()
        for c in bytearray(data):
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i] + s[j]) % 256]
            out.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(out)

    encrypt = decrypt = process


# Module-level alias so the module can be used as ``arcfour.new(key)``.
new = Arcfour

# test
if __name__ == '__main__':
    import doctest

    doctest.testmod()
56 | 


--------------------------------------------------------------------------------
/pdfminer/ascii85.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | import re
10 | import struct
11 | 
12 | 
13 | # ascii85decode(data)
def ascii85decode(data):
    """Decode an ASCII85 byte string (Adobe variant).

    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    Characters outside ``!``..``u``, ``z`` and ``~`` are skipped; decoding
    stops at the first ``~``.  A partial group that is not followed by
    ``~`` is dropped.

    :param data: the encoded byte string.
    :return: the decoded byte string.

    The sample string is taken from:
      http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') == b'Man is distinguished'
    True
    >>> ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
    True
    """
    n = b = 0
    chunks = []
    # bytearray yields integers on both Python 2 and Python 3.
    for c in bytearray(data):
        if 33 <= c <= 117:  # '!' .. 'u': one base-85 digit
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 122:  # 'z': shorthand for four zero bytes
            assert n == 0
            chunks.append(b'\0\0\0\0')
        elif c == 126:  # '~': end of data
            if n:
                # Pad the partial group with the highest digit, keep n-1 bytes.
                for _ in range(5 - n):
                    b = b * 85 + 84
                chunks.append(struct.pack('>L', b)[:n - 1])
            break
    return b''.join(chunks)
51 | 
52 | # asciihexdecode(data)
# Byte patterns: a pair of hex digits, and a lone trailing hex digit that
# is followed only by white space and/or the '>' end-of-data marker.
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
    """Decode an ASCIIHex byte string.

    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. If the filter encounters the EOD marker after reading an odd
    number of hexadecimal digits, it will behave as if a 0 followed the
    last digit.

    NOTE(review): the specification treats any other character as an
    error; this implementation simply skips over it.

    :param data: the encoded byte string.
    :return: the decoded byte string.

    >>> asciihexdecode(b'61 62 2e6364   65') == b'ab.cde'
    True
    >>> asciihexdecode(b'61 62 2e6364   657>') == b'ab.cdep'
    True
    >>> asciihexdecode(b'7>') == b'p'
    True
    """
    out = bytearray(int(hx, 16) for hx in hex_re.findall(data))
    m = trail_re.search(data)
    if m:
        # Odd number of digits: pad the last one with a trailing zero.
        out.append(int(m.group(1) + b'0', 16))
    return bytes(out)


if __name__ == '__main__':
    import doctest

    doctest.testmod()
86 | 


--------------------------------------------------------------------------------
/pdfminer/lzw.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | 
  4 | try:
  5 |     from cStringIO import StringIO
  6 | except ImportError:
  7 |     from StringIO import StringIO
  8 | 
  9 | 
 10 | class CorruptDataError(Exception):
 11 |     pass
 12 | 
 13 | 
 14 | ##  LZWDecoder
 15 | ##
 16 | class LZWDecoder(object):
 17 |     debug = 0
 18 | 
 19 |     def __init__(self, fp):
 20 |         self.fp = fp
 21 |         self.buff = 0
 22 |         self.bpos = 8
 23 |         self.nbits = 9
 24 |         self.table = None
 25 |         self.prevbuf = None
 26 |         return
 27 | 
 28 |     def readbits(self, bits):
 29 |         v = 0
 30 |         while 1:
 31 |             # the number of remaining bits we can get from the current buffer.
 32 |             r = 8 - self.bpos
 33 |             if bits <= r:
 34 |                 # |-----8-bits-----|
 35 |                 # |-bpos-|-bits-|  |
 36 |                 # |      |----r----|
 37 |                 v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
 38 |                 self.bpos += bits
 39 |                 break
 40 |             else:
 41 |                 # |-----8-bits-----|
 42 |                 # |-bpos-|---bits----...
 43 |                 # |      |----r----|
 44 |                 v = (v << r) | (self.buff & ((1 << r) - 1))
 45 |                 bits -= r
 46 |                 x = self.fp.read(1)
 47 |                 if not x:
 48 |                     raise EOFError
 49 |                 self.buff = ord(x)
 50 |                 self.bpos = 0
 51 |         return v
 52 | 
 53 |     def feed(self, code):
 54 |         x = ''
 55 |         if code == 256:
 56 |             self.table = [chr(c) for c in xrange(256)]  # 0-255
 57 |             self.table.append(None)  # 256
 58 |             self.table.append(None)  # 257
 59 |             self.prevbuf = ''
 60 |             self.nbits = 9
 61 |         elif code == 257:
 62 |             pass
 63 |         elif not self.prevbuf:
 64 |             x = self.prevbuf = self.table[code]
 65 |         else:
 66 |             if code < len(self.table):
 67 |                 x = self.table[code]
 68 |                 self.table.append(self.prevbuf + x[:1])
 69 |             elif code == len(self.table):
 70 |                 self.table.append(self.prevbuf + self.prevbuf[:1])
 71 |                 x = self.table[code]
 72 |             else:
 73 |                 raise CorruptDataError
 74 |             l = len(self.table)
 75 |             if l == 511:
 76 |                 self.nbits = 10
 77 |             elif l == 1023:
 78 |                 self.nbits = 11
 79 |             elif l == 2047:
 80 |                 self.nbits = 12
 81 |             self.prevbuf = x
 82 |         return x
 83 | 
 84 |     def run(self):
 85 |         while 1:
 86 |             try:
 87 |                 code = self.readbits(self.nbits)
 88 |             except EOFError:
 89 |                 break
 90 |             try:
 91 |                 x = self.feed(code)
 92 |             except CorruptDataError:
 93 |                 # just ignore corrupt data and stop yielding there
 94 |                 break
 95 |             yield x
 96 |             if self.debug:
 97 |                 print >> sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
 98 |                                       (self.nbits, code, x, self.table[258:]))
 99 |         return
100 | 
101 | 
102 | # lzwdecode
def lzwdecode(data):
    """Decode an LZW-compressed string and return the decompressed data.

    Decoding stops silently at the end of input or at the first corrupt
    code, so a damaged stream yields whatever was decoded up to that point.

    >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
    '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    """
    decoder = LZWDecoder(StringIO(data))
    chunks = decoder.run()
    return ''.join(chunks)


if __name__ == '__main__':
    import doctest

    doctest.testmod()
116 | 


--------------------------------------------------------------------------------
/pdfminer/pdfparser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | 
  4 | try:
  5 |     from cStringIO import StringIO
  6 | except ImportError:
  7 |     from StringIO import StringIO
  8 | from psparser import PSStackParser
  9 | from psparser import PSSyntaxError, PSEOF
 10 | from psparser import KWD, STRICT
 11 | from pdftypes import PDFException
 12 | from pdftypes import PDFStream, PDFObjRef
 13 | from pdftypes import int_value
 14 | from pdftypes import dict_value
 15 | 
 16 | 
 17 | ##  Exceptions
 18 | ##
 19 | class PDFSyntaxError(PDFException):
 20 |     pass
 21 | 
 22 | 
 23 | ##  PDFParser
 24 | ##
class PDFParser(PSStackParser):
    """
    PDFParser fetches PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    NOTE(review): read_xref is not defined in this class as shown here;
    presumably it lives elsewhere -- verify before following the usage
    above.
    """

    def __init__(self, fp, dbg=False):
        """Create a parser over the file object ``fp``.

        ``dbg`` is passed straight through to PSStackParser.
        """
        PSStackParser.__init__(self, fp, dbg)
        # The document used for indirect references and stream deciphering;
        # stays None until set_document() is called.
        self.doc = None
        # When true, a stream's /Length entry is ignored and the stream is
        # delimited by scanning for 'endstream' only.
        self.fallback = False
        return

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc
        return

    # Keyword objects compared by identity in do_keyword().
    KEYWORD_R = KWD('R')
    KEYWORD_NULL = KWD('null')
    KEYWORD_ENDOBJ = KWD('endobj')
    KEYWORD_STREAM = KWD('stream')
    KEYWORD_XREF = KWD('xref')
    KEYWORD_STARTXREF = KWD('startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        ``pos`` is the position of the keyword in the input and ``token``
        the keyword object itself.  Unrecognised keywords are pushed onto
        the stack unchanged.
        """

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # Hand the single preceding stack item over as a result.
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # Hand the four preceding stack items over as a result
            # (presumably objid, genno, 'obj' and the object -- TODO confirm).
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # A malformed reference is dropped silently.
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            # Declared stream length; stays 0 in fallback mode or when
            # /Length is missing and STRICT is off.
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            # Re-read from the keyword itself so the rest of the 'stream'
            # line can be skipped.
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # The stream data starts right after the 'stream' line.
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos + objlen)
            # Scan forward line by line for 'endstream'; anything found before
            # it is appended to the data (this covers a wrong or absent
            # /Length).
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            if 2 <= self.debug:
                print >> sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                                     (pos, objlen, dic, data[:10])
            # Reads self.doc.decipher, so set_document() must have been called
            # before a stream is parsed.
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return
132 | 
133 | 
134 | ##  PDFStreamParser
135 | ##
class PDFStreamParser(PDFParser):
    """
    Parser for PDF content streams, i.e. the rendering instructions
    contained in each page. A reference to a PDF document is
    needed because a content stream can also hold indirect
    references to other objects in the same document.
    """

    def __init__(self, data):
        """Parse the in-memory string ``data`` instead of a file."""
        PDFParser.__init__(self, StringIO(data))

    def flush(self):
        """Move everything still on the stack into the results."""
        self.add_results(*self.popall())

    def do_keyword(self, pos, token):
        """Resolve ``R`` into an object reference; push any other keyword as-is."""
        if token is not self.KEYWORD_R:
            # others
            self.push((pos, token))
            return
        # reference to indirect object
        try:
            (_, objid), (_, genno) = self.pop(2)
            ref = PDFObjRef(self.doc, int(objid), int(genno))
            self.push((pos, ref))
        except PSSyntaxError:
            # A malformed reference is dropped silently.
            pass
167 | 


--------------------------------------------------------------------------------
/pdfminer/pdftypes.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import zlib
  3 | 
  4 | from lzw import lzwdecode
  5 | from ascii85 import ascii85decode, asciihexdecode
  6 | from runlength import rldecode
  7 | from ccitt import ccittfaxdecode
  8 | from psparser import PSException, PSObject
  9 | from psparser import LIT, STRICT
 10 | from utils import apply_png_predictor, isnumber
 11 | 
# Literal for the name 'Crypt' (presumably the Crypt filter; its use is
# outside this part of the file).
LITERAL_CRYPT = LIT('Crypt')

# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
# Each tuple pairs a filter's full name with its abbreviated form.
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
 22 | 
 23 | 
 24 | ##  PDF Objects
 25 | ##
 26 | class PDFObject(PSObject):
 27 |     pass
 28 | 
 29 | 
 30 | class PDFException(PSException):
 31 |     pass
 32 | 
 33 | 
 34 | class PDFTypeError(PDFException):
 35 |     pass
 36 | 
 37 | 
 38 | class PDFValueError(PDFException):
 39 |     pass
 40 | 
 41 | 
 42 | class PDFObjectNotFound(PDFException):
 43 |     pass
 44 | 
 45 | 
 46 | class PDFNotImplementedError(PDFException):
 47 |     pass
 48 | 
 49 | 
 50 | ##  PDFObjRef
 51 | ##
 52 | class PDFObjRef(PDFObject):
 53 |     def __init__(self, doc, objid, _):
 54 |         if objid == 0:
 55 |             if STRICT:
 56 |                 raise PDFValueError('PDF object id cannot be 0.')
 57 |         self.doc = doc
 58 |         self.objid = objid
 59 |         # self.genno = genno  # Never used.
 60 |         return
 61 | 
 62 |     def __repr__(self):
 63 |         return '<PDFObjRef:%d>' % (self.objid)
 64 | 
 65 |     def resolve(self, default=None):
 66 |         try:
 67 |             return self.doc.getobj(self.objid)
 68 |         except PDFObjectNotFound:
 69 |             return default
 70 | 
 71 | 
 72 | # resolve
 73 | def resolve1(x, default=None):
 74 |     """Resolves an object.
 75 | 
 76 |     If this is an array or dictionary, it may still contains
 77 |     some indirect objects inside.
 78 |     """
 79 |     while isinstance(x, PDFObjRef):
 80 |         x = x.resolve(default=default)
 81 |     return x
 82 | 
 83 | 
 84 | def resolve_all(x, default=None):
 85 |     """Recursively resolves the given object and all the internals.
 86 | 
 87 |     Make sure there is no indirect reference within the nested object.
 88 |     This procedure might be slow.
 89 |     """
 90 |     while isinstance(x, PDFObjRef):
 91 |         x = x.resolve(default=default)
 92 |     if isinstance(x, list):
 93 |         x = [resolve_all(v, default=default) for v in x]
 94 |     elif isinstance(x, dict):
 95 |         for (k, v) in x.iteritems():
 96 |             x[k] = resolve_all(v, default=default)
 97 |     return x
 98 | 
 99 | 
def decipher_all(decipher, objid, genno, x):
    """Apply ``decipher(objid, genno, s)`` to every string nested in ``x``.

    Strings are replaced by the deciphered result, lists are rebuilt as
    new lists, dictionaries are updated in place, and any other value is
    returned unchanged.
    """
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        return [decipher_all(decipher, objid, genno, item) for item in x]
    if isinstance(x, dict):
        # Snapshot the keys so values can be replaced while walking.
        for key in list(x):
            x[key] = decipher_all(decipher, objid, genno, x[key])
    return x
111 | 
112 | 
113 | # Type checking
def int_value(x):
    """Resolve ``x`` and return it if it is an int.

    Otherwise raise PDFTypeError in STRICT mode, or return 0.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
121 | 
122 | 
def float_value(x):
    """Resolve ``x`` and return it if it is a float.

    Otherwise raise PDFTypeError in STRICT mode, or return 0.0.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
130 | 
131 | 
def num_value(x):
    """Resolve *x* and return it as a number.

    x: a value accepted by isnumber(), or a PDFObjRef chain leading to one
       (dereferenced with resolve1()).

    Returns the resolved number.  For any other type, raises PDFTypeError
    when STRICT is set and returns 0 otherwise.
    """
    x = resolve1(x)
    if isnumber(x):
        return x
    if STRICT:
        # Wrap in a 1-tuple: a bare tuple operand would be unpacked by '%'
        # and raise TypeError instead of the intended PDFTypeError.
        raise PDFTypeError('Int or Float required: %r' % (x,))
    return 0
139 | 
140 | 
def str_value(x):
    """Resolve *x* and return it as a string.

    x: a str, or a PDFObjRef chain leading to one (dereferenced with
       resolve1()).

    Returns the resolved string.  For any other type, raises PDFTypeError
    when STRICT is set and returns '' otherwise.
    """
    x = resolve1(x)
    if isinstance(x, str):
        return x
    if STRICT:
        # Wrap in a 1-tuple: a bare tuple operand would be unpacked by '%'
        # and raise TypeError instead of the intended PDFTypeError.
        raise PDFTypeError('String required: %r' % (x,))
    return ''
148 | 
149 | 
def list_value(x):
    """Resolve *x* and return it as a list (or tuple).

    x: a list or tuple, or a PDFObjRef chain leading to one (dereferenced
       with resolve1()).

    Returns the resolved sequence.  For any other type, raises PDFTypeError
    when STRICT is set and returns a new empty list otherwise.
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
157 | 
158 | 
def dict_value(x):
    """Resolve *x* and return it as a dictionary.

    x: a dict, or a PDFObjRef chain leading to one (dereferenced with
       resolve1()).

    Returns the resolved dict.  For any other type, raises PDFTypeError when
    STRICT is set and returns a new empty dict otherwise.
    """
    x = resolve1(x)
    if isinstance(x, dict):
        return x
    if STRICT:
        # Wrap in a 1-tuple: a bare tuple operand would be unpacked by '%'
        # and raise TypeError instead of the intended PDFTypeError.
        raise PDFTypeError('Dict required: %r' % (x,))
    return {}
166 | 
167 | 
def stream_value(x):
    """Resolve *x* and return it as a PDFStream.

    x: a PDFStream, or a PDFObjRef chain leading to one (dereferenced with
       resolve1()).

    Returns the resolved stream.  For any other type, raises PDFTypeError
    when STRICT is set and returns a new empty PDFStream otherwise.
    """
    x = resolve1(x)
    if isinstance(x, PDFStream):
        return x
    if STRICT:
        # Wrap in a 1-tuple: a bare tuple operand would be unpacked by '%'
        # and raise TypeError instead of the intended PDFTypeError.
        raise PDFTypeError('PDFStream required: %r' % (x,))
    return PDFStream({}, '')
175 | 
176 | 
177 | ##  PDFStream type
178 | ##
class PDFStream(PDFObject):
    """A PDF stream: an attribute dictionary plus a data payload.

    The payload is held undecoded in ``rawdata`` until decode() runs; after
    that the decoded result lives in ``data`` and ``rawdata`` is set to None.

    attrs: the stream dictionary (must be a dict).
    rawdata: the undecoded stream contents.
    decipher: optional callable invoked as decipher(objid, genno, data) on
        the raw data before any filter is applied.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        assert isinstance(attrs, dict)
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        """Record the object id and generation number (passed to decipher)."""
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return attribute *name* from the stream dictionary, or *default*."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first of *names* present, or *default*."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the 'F'/'Filter' entry as a list (empty when absent).

        Entries are returned as stored; indirect references are not resolved
        here.
        """
        filters = self.get_any(('F', 'Filter'))
        if not filters:
            return []
        if isinstance(filters, list):
            return filters
        return [filters]

    def _resolved_filters(self):
        """Return the filter chain as a new flat list, references resolved.

        Resolved filters are spliced in at the position of their reference so
        that the decoding order is preserved, and the list stored in
        ``attrs`` is never modified.
        """
        resolved = []
        for f in self.get_filters():
            f = resolve1(f)
            if isinstance(f, list):
                candidates = [resolve1(item) for item in f]
            else:
                candidates = [f]
            resolved.extend(c for c in candidates if c is not None)
        return resolved

    def _decode_params(self, count):
        """Return one decode-parameter dict for each of *count* filters.

        A single dictionary is shared by every filter; an array supplies one
        entry per filter.  Missing or non-dictionary entries become {}.
        """
        params = resolve1(self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}))
        if isinstance(params, list):
            per_filter = [resolve1(p) for p in params]
        else:
            per_filter = [params] * count
        per_filter += [None] * (count - len(per_filter))
        return [p if isinstance(p, dict) else {} for p in per_filter[:count]]

    def decode(self):
        """Decipher and decode ``rawdata`` into ``data``.

        Applies the decipher callable (if any), then every filter in order,
        followed by the PNG predictor when the filter's parameters request
        one.  On return ``data`` holds the result and ``rawdata`` is None.

        Raises PDFNotImplementedError for an unsupported filter or predictor
        and, when STRICT is set, PDFException for invalid zlib data (which is
        otherwise replaced by an empty string).
        """
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        filters = self._resolved_filters()
        for (f, params) in zip(filters, self._decode_params(len(filters))):
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if STRICT:
                        raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                    data = ''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % (f,))
            # apply predictors
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    bitspercomponent = int_value(params.get('BitsPerComponent', 8))
                    data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
                else:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        """Return the decoded data, decoding on first use."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the undecoded data (None once decode() has run)."""
        return self.rawdata
290 | 


--------------------------------------------------------------------------------
/pdfminer/runlength.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # RunLength decoder (Adobe version) implementation based on PDF Reference
 4 | # version 1.4 section 3.3.4.
 5 | #
 6 | #  * public domain *
 7 | #
 8 | 
 9 | def rldecode(data):
10 |     """
11 |     RunLength decoder (Adobe version) implementation based on PDF Reference
12 |     version 1.4 section 3.3.4:
13 |         The RunLengthDecode filter decodes data that has been encoded in a
14 |         simple byte-oriented format based on run length. The encoded data
15 |         is a sequence of runs, where each run consists of a length byte
16 |         followed by 1 to 128 bytes of data. If the length byte is in the
17 |         range 0 to 127, the following length + 1 (1 to 128) bytes are
18 |         copied literally during decompression. If length is in the range
19 |         129 to 255, the following single byte is to be copied 257 - length
20 |         (2 to 128) times during decompression. A length value of 128
21 |         denotes EOD.
22 |     >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
23 |     >>> rldecode(s)
24 |     '1234567777777abcde'
25 |     """
26 |     decoded = []
27 |     i = 0
28 |     while i < len(data):
29 |         # print 'data[%d]=:%d:' % (i,ord(data[i]))
30 |         length = ord(data[i])
31 |         if length == 128:
32 |             break
33 |         if length >= 0 and length < 128:
34 |             run = data[i + 1:(i + 1) + (length + 1)]
35 |             # print 'length=%d, run=%s' % (length+1,run)
36 |             decoded.append(run)
37 |             i = (i + 1) + (length + 1)
38 |         if length > 128:
39 |             run = data[i + 1] * (257 - length)
40 |             # print 'length=%d, run=%s' % (257-length,run)
41 |             decoded.append(run)
42 |             i = (i + 1) + 1
43 |     return b''.join(decoded)
44 | 
45 | 
# When executed as a script, run the doctest embedded in rldecode().
if __name__ == '__main__':
    import doctest

    doctest.testmod()
50 | 


--------------------------------------------------------------------------------
/pdfminer/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Miscellaneous Routines.
  4 | """
  5 | import struct
  6 | from sys import maxint as INF
  7 | 
  8 | 
  9 | ##  PNG Predictor
 10 | ##
 11 | def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
 12 |     if bitspercomponent != 8:
 13 |         # unsupported
 14 |         raise ValueError(bitspercomponent)
 15 |     nbytes = colors * columns * bitspercomponent // 8
 16 |     i = 0
 17 |     buf = ''
 18 |     line0 = '\x00' * columns
 19 |     for i in xrange(0, len(data), nbytes + 1):
 20 |         ft = data[i]
 21 |         i += 1
 22 |         line1 = data[i:i + nbytes]
 23 |         line2 = ''
 24 |         if ft == '\x00':
 25 |             # PNG none
 26 |             line2 += line1
 27 |         elif ft == '\x01':
 28 |             # PNG sub (UNTESTED)
 29 |             c = 0
 30 |             for b in line1:
 31 |                 c = (c + ord(b)) & 255
 32 |                 line2 += chr(c)
 33 |         elif ft == '\x02':
 34 |             # PNG up
 35 |             for (a, b) in zip(line0, line1):
 36 |                 c = (ord(a) + ord(b)) & 255
 37 |                 line2 += chr(c)
 38 |         elif ft == '\x03':
 39 |             # PNG average (UNTESTED)
 40 |             c = 0
 41 |             for (a, b) in zip(line0, line1):
 42 |                 c = ((c + ord(a) + ord(b)) // 2) & 255
 43 |                 line2 += chr(c)
 44 |         else:
 45 |             # unsupported
 46 |             raise ValueError(ft)
 47 |         buf += line2
 48 |         line0 = line2
 49 |     return buf
 50 | 
 51 | ##  Matrix operations
 52 | ##
# Identity transform in the six-element (a, b, c, d, e, f) form that the
# matrix helpers below unpack.
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
 54 | 
 55 | 
 56 | def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
 57 |     """Returns the multiplication of two matrices."""
 58 |     return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
 59 |             a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
 60 |             a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
 61 | 
 62 | 
 63 | def translate_matrix((a, b, c, d, e, f), (x, y)):
 64 |     """Translates a matrix by (x, y)."""
 65 |     return (a, b, c, d, x * a + y * c + e, x * b + y * d + f)
 66 | 
 67 | 
 68 | def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
 69 |     """Applies a matrix to a point."""
 70 |     return (a * x + c * y + e, b * x + d * y + f)
 71 | 
 72 | 
 73 | def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
 74 |     """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
 75 |     return (a * p + c * q, b * p + d * q)
 76 | 
 77 | 
 78 | ##  Utility functions
 79 | ##
 80 | 
 81 | # isnumber
 82 | def isnumber(x):
 83 |     return isinstance(x, (int, long, float))
 84 | 
 85 | 
 86 | # uniq
 87 | def uniq(objs):
 88 |     """Eliminates duplicated elements."""
 89 |     done = set()
 90 |     for obj in objs:
 91 |         if obj in done:
 92 |             continue
 93 |         done.add(obj)
 94 |         yield obj
 95 |     return
 96 | 
 97 | 
 98 | # csort
 99 | def csort(objs, key=lambda x: x):
100 |     """Order-preserving sorting function."""
101 |     idxs = dict((obj, i) for (i, obj) in enumerate(objs))
102 |     return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
103 | 
104 | 
105 | # fsplit
def fsplit(pred, objs):
    """Partition *objs* by a predicate.

    Returns a tuple (accepted, rejected) of two lists: the elements for
    which pred(obj) is true and those for which it is false, each in input
    order.
    """
    accepted, rejected = [], []
    for item in objs:
        bucket = accepted if pred(item) else rejected
        bucket.append(item)
    return (accepted, rejected)
116 | 
117 | 
118 | # drange
def drange(v0, v1, d):
    """Returns a discrete range.

    Yields the indices of the cells of width *d* that cover [v0, v1].

    v0, v1: lower and upper bound, with v0 <= v1.
    d: cell width.
    """
    # Equality is allowed: Plane._getrange() lets zero-width objects through
    # its bounds check, and they must map to the single cell containing them
    # instead of tripping the assertion.
    assert v0 <= v1
    return xrange(int(v0) // d, int(v1 + d) // d)
123 | 
124 | 
125 | # get_bound
def get_bound(pts):
    """Return the smallest rectangle (x0, y0, x1, y1) covering all points.

    pts: iterable of (x, y) pairs.  With no points at all the result is
    (INF, INF, -INF, -INF).
    """
    left = bottom = INF
    right = top = -INF
    for (x, y) in pts:
        if x < left:
            left = x
        if y < bottom:
            bottom = y
        if x > right:
            right = x
        if y > top:
            top = y
    return (left, bottom, right, top)
135 | 
136 | 
137 | # pick
def pick(seq, func, maxobj=None):
    """Return the element of *seq* with the highest func(element).

    On a tie the earliest element wins.  *maxobj* is returned unchanged when
    *seq* is empty.
    """
    best_score = None
    for candidate in seq:
        candidate_score = func(candidate)
        if best_score is not None and not (best_score < candidate_score):
            continue
        best_score, maxobj = candidate_score, candidate
    return maxobj
146 | 
147 | 
148 | # choplist
def choplist(n, seq):
    """Yield consecutive n-element tuples taken from *seq*.

    Trailing elements that do not fill a complete group are dropped.
    """
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
158 | 
159 | 
160 | # nunpack
def nunpack(s, default=0):
    """Unpacks 1 to 4 byte integers (big endian).

    s: byte string of length 0 to 4.
    default: value returned when *s* is empty.

    Raises TypeError when *s* is longer than four bytes.
    """
    length = len(s)
    if not length:
        return default
    if length == 1:
        return ord(s)
    if length <= 4:
        # Left-pad to four bytes with a *bytes* literal so that the
        # concatenation also works on Python 3 byte strings.
        return struct.unpack('>L', b'\x00' * (4 - length) + s)[0]
    raise TypeError('invalid length: %d' % length)
176 | 
177 | # decode_text
# 256-character lookup string: PDFDocEncoding[byte] is the Unicode character
# that decode_text() emits for that byte value.
# Positions 0x7f, 0x9f and 0xad hold U+0000.
# NOTE(review): position 0x16 holds 0x0017, the same value as position 0x17 --
# confirm against the PDFDocEncoding table in the PDF reference.
PDFDocEncoding = ''.join(unichr(x) for x in (
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
212 | 
213 | 
def decode_text(s):
    """Decode a PDF text string to Unicode.

    A string starting with the byte-order mark FE FF is decoded as UTF-16BE
    (undecodable sequences are ignored); any other string is mapped byte by
    byte through the PDFDocEncoding table.
    """
    if not s.startswith('\xfe\xff'):
        return ''.join([PDFDocEncoding[ord(ch)] for ch in s])
    return unicode(s[2:], 'utf-16be', 'ignore')
220 | 
221 | 
222 | # enc
def enc(x, codec='ascii'):
    """Escape *x* for SGML/XML/HTML and encode it with *codec*.

    The characters &, >, < and " are replaced by entities; characters the
    codec cannot represent become numeric character references.
    """
    # '&' goes first so that the entities inserted afterwards are not
    # escaped a second time.
    replacements = (('&', '&amp;'), ('>', '&gt;'), ('<', '&lt;'), ('"', '&quot;'))
    for (char, entity) in replacements:
        x = x.replace(char, entity)
    return x.encode(codec, 'xmlcharrefreplace')
227 | 
228 | 
def bbox2str(bbox):
    """Format a bounding box (x0, y0, x1, y1) as 'x0,y0,x1,y1'.

    Each coordinate is written with three decimal places.
    """
    # Unpacked explicitly: tuple parameters in the signature are
    # Python-2-only syntax (removed by PEP 3113).
    (x0, y0, x1, y1) = bbox
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
231 | 
232 | 
def matrix2str(m):
    """Format a matrix (a, b, c, d, e, f) as '[a,b,c,d, (e,f)]'.

    Each element is written with two decimal places.
    """
    # Unpacked explicitly: tuple parameters in the signature are
    # Python-2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = m
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
235 | 
236 | 
237 | ##  Plane
238 | ##
239 | ##  A set-like data structure for objects placed on a plane.
240 | ##  Can efficiently find objects in a certain rectangular area.
##  It keeps every object in a set and also indexes it in a grid of
##  gridsize-wide cells, keyed by (x, y) cell index.
243 | ##
class Plane(object):
    """A set-like container for objects placed on a plane.

    Every object must expose x0, y0, x1 and y1 attributes and be hashable.
    Objects are kept in a set and, in addition, registered in each grid cell
    their bounding box touches, so that find() only has to look at the cells
    overlapping the query rectangle.

    bbox: (x0, y0, x1, y1) extent of the plane; anything outside it is not
        indexed in the grid.
    gridsize: width and height of a grid cell.
    """

    def __init__(self, bbox, gridsize=50):
        self._objs = set()
        self._grid = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox
        return

    def __repr__(self):
        return ('<Plane objs=%r>' % list(self))

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the (x, y) grid-cell keys covered by *bbox* within the plane.

        Yields nothing when *bbox* lies entirely outside the plane.
        """
        # Unpacked explicitly: tuple parameters in the signature are
        # Python-2-only syntax (removed by PEP 3113).
        (x0, y0, x1, y1) = bbox
        if (x1 <= self.x0 or self.x1 <= x0 or
                y1 <= self.y0 or self.y1 <= y0):
            return
        # Clip the rectangle to the plane before enumerating cells.
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for y in drange(y0, y1, self.gridsize):
            for x in drange(x0, x1, self.gridsize):
                yield (x, y)
        return

    # extend(objs)
    def extend(self, objs):
        """Add every object in *objs*."""
        for obj in objs:
            self.add(obj)
        return

    # add(obj): place an object.
    def add(self, obj):
        """Register *obj* in the set and in every grid cell it touches."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(k, []).append(obj)
        self._objs.add(obj)
        return

    # remove(obj): displace an object.
    def remove(self, obj):
        """Remove *obj*; raises KeyError when it is not in the plane."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                # The cell is missing or does not list the object: nothing
                # to clean up there.
                pass
        self._objs.remove(obj)
        return

    # find(): finds objects that are in a certain area.
    def find(self, bbox):
        """Yield each object whose bounding box overlaps *bbox*, once."""
        (x0, y0, x1, y1) = bbox
        done = set()
        for k in self._getrange((x0, y0, x1, y1)):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    continue
                done.add(obj)
                # Sharing a cell is not enough: check the actual overlap.
                if (obj.x1 <= x0 or x1 <= obj.x0 or
                        obj.y1 <= y0 or y1 <= obj.y0):
                    continue
                yield obj
        return
319 | 


--------------------------------------------------------------------------------
/peepdf/AUTHORS:
--------------------------------------------------------------------------------
1 | Jose Miguel Esparza <jesparza AT eternal-todo.com>
2 | http://eternal-todo.com
3 | http://twitter.com/EternalTodo


--------------------------------------------------------------------------------
/peepdf/CHANGELOG:
--------------------------------------------------------------------------------
 1 | -----------------------------------------------
 2 | peepdf Black Hat Vegas (0.2 r156), 2012-07-25
 3 | -----------------------------------------------
 4 | 
 5 |     * New features:
 6 | 
 7 |         - Added "grinch mode" execution to avoid colorized output
 8 |         - Added more colors in the interactive console output: warning, errors, important information...
 9 |         - Changed sctest command, now it's implemented with pylibemu
10 |         - Added decrypt command to parse password protected documents
11 |         - Modified analyseJS() to extract JS code from XDP packets and unescape HTML entities
12 |         - Added function unescapeHTMLEntities() to unescape HTML entities
13 |         - Added AES decryption support (128 and 256 bits).
14 |         - Added hashes in objects information (info $object_id)
15 |         - Added support for decoding CCITTFaxDecode filters (Thanks to @binjo)
16 | 
17 |     * Fixes:
18 | 
19 |         - Fix to show decrypt errors
20 |         - Fixed silly bug with /EncryptMetadata element
21 |         - Added missing binary file operations
22 |         - Fixed Issue 5: Resolved false positives when monitoring some elements like actions, events, etc. (Thanks to @hiddenillusion)
23 |         - Bug in PDFStream.decode and PDFStream.encode, dealing with an array of filter parameters (Thanks to @binjo)
24 | 
25 | 
26 | -----------------------------------------------
27 | peepdf Black Hat Arsenal (0.1 r92), 2012-03-16
28 | -----------------------------------------------
29 | 
30 |     * New features:
31 | 
32 |         - Added support for more parameters in Flate/LZW decode (stream filters)
33 |         - Encryption algorithm now showing in document information
34 |         - Added XML output and SHA hash to file information    
35 |         - Improved unescape function to support mixed escaped formats (eg. "%u6734%34%u8790")
36 |         - Added xor and xor_search commands
        - Added an easy way to redirect console output (>, >>, $>, $>>)
38 |         - Added xor function by Evan Fosmark
39 |         - Added detection of CVE-2011-4369 (/PRC)
40 |         - Added hash command (Thanks to @binjo for code and comments)
41 |         - Added js_beautify command
42 |         - Update function added
43 |         - Added new vulns and showing information related to non JS vulns
44 |         - Added escape sequence in the limited output
45 |         - Added ascii85 decode from pdfminer to improve code and avoid bugs (Thanks to Brandon Dixon!)
46 |         - Added lzwdecode from pdfminer to improve code and avoid bugs
47 | 
48 |     * Fixes:
49 | 
50 |         - Update process rewritten, now based on hashing of files
51 |         - Silly bug in computeUserPass function (Thanks to Christian Martorella!)
52 |         - Added binary mode in files operations
53 |         - Recursion bug in update function
54 |         - Minor bug in do_embed function
55 |         - Bug to support encoding following PDF specifications (Issue 3 by czchen)
56 |         - Bug to handle negative numbers in P element
57 |         - Bug in the xref table when creating a new PDF (Issue 2)
58 |         - Silly bug when parsing filter parameters
59 |         - Bug related to updating objects and statistics of PDF files
60 |         - Some bugs related to offsets calculation
61 |         - Fixed "replace" function in PDFObjectStream
62 |         - Fix in asciiHexDecode filter function
63 | 
64 | 
65 | -----------------------------------------------
66 | peepdf 0.1 r15, 2011-05-05
67 | -----------------------------------------------
68 | 
69 | - Initial Release
70 | 
71 | 


--------------------------------------------------------------------------------
/peepdf/JSAnalysis.py:
--------------------------------------------------------------------------------
  1 | #
  2 | #    peepdf is a tool to analyse and modify PDF files
  3 | #    http://peepdf.eternal-todo.com
  4 | #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
  5 | #
  6 | #    Copyright (C) 2011-2014 Jose Miguel Esparza
  7 | #
  8 | #    This file is part of peepdf.
  9 | #
 10 | #        peepdf is free software: you can redistribute it and/or modify
 11 | #        it under the terms of the GNU General Public License as published by
 12 | #        the Free Software Foundation, either version 3 of the License, or
 13 | #        (at your option) any later version.
 14 | #
 15 | #        peepdf is distributed in the hope that it will be useful,
 16 | #        but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | #        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
 18 | #        GNU General Public License for more details.
 19 | #
 20 | #        You should have received a copy of the GNU General Public License
 21 | #        along with peepdf.    If not, see <http://www.gnu.org/licenses/>.
 22 | #
 23 | 
 24 | '''
 25 |     This module contains some functions to analyse Javascript code inside the PDF file
 26 | '''
 27 | 
 28 | import sys
 29 | import re
 30 | import os
 31 | import traceback
 32 | 
 33 | import jsbeautifier
 34 | from PDFUtils import unescapeHTMLEntities, escapeString
 35 | 
# PyV8 is an optional dependency.  JS_MODULE records whether it could be
# loaded; analyseJS() only executes Javascript when it is True.
try:
    import PyV8

    JS_MODULE = True

    class Global(PyV8.JSClass):
        # Global object handed to the PyV8 context created by analyseJS().
        # evalCode accumulates every expression passed to evalOverride().
        evalCode = ''

        def evalOverride(self, expression):
            # analyseJS() installs this as the Javascript `eval`
            # ('eval=evalOverride'): the code is recorded, not executed.
            self.evalCode += '\n\n// New evaluated code\n' + expression
            return

except:
    # Any failure while importing PyV8 or defining Global disables
    # Javascript execution.
    JS_MODULE = False
 50 | 
# File that analyseJS() appends tracebacks of unexpected errors to.
errorsFile = 'errors.txt'
# Line separator appended to each message written to jserror.log.
newLine = os.linesep
# Matches <script ... contentType="application/x-javascript" ...> elements
# and captures their body.
reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
# Javascript prelude; only referenced from a commented-out call in analyseJS().
preDefinedCode = 'var app = this;'
 55 | 
 56 | 
 57 | def analyseJS(code, context=None, manualAnalysis=False):
 58 |     '''
 59 |         Hooks the eval function and search for obfuscated elements in the Javascript code
 60 |         
 61 |         @param code: The Javascript code (string)
 62 |         @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where 
 63 |                 JSCode is a list with the several stages Javascript code,
 64 |                 unescapedBytes is a list with the parameters of unescape functions, 
 65 |                 urlsFound is a list with the URLs found in the unescaped bytes,
 66 |                 errors is a list of errors,
 67 |                 context is the context of execution of the Javascript code.
 68 |     '''
 69 |     errors = []
 70 |     JSCode = []
 71 |     unescapedBytes = []
 72 |     urlsFound = []
 73 | 
 74 |     try:
 75 |         code = unescapeHTMLEntities(code)
 76 |         scriptElements = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE)
 77 |         if scriptElements != []:
 78 |             code = ''
 79 |             for scriptElement in scriptElements:
 80 |                 code += scriptElement + '\n\n'
 81 |         code = jsbeautifier.beautify(code)
 82 |         JSCode.append(code)
 83 | 
 84 |         if code != None and JS_MODULE and not manualAnalysis:
 85 |             if context == None:
 86 |                 context = PyV8.JSContext(Global())
 87 |             context.enter()
 88 |             # Hooking the eval function
 89 |             context.eval('eval=evalOverride')
 90 |             # context.eval(preDefinedCode)
 91 |             while True:
 92 |                 originalCode = code
 93 |                 try:
 94 |                     context.eval(code)
 95 |                     evalCode = context.eval('evalCode')
 96 |                     evalCode = jsbeautifier.beautify(evalCode)
 97 |                     if evalCode != '' and evalCode != code:
 98 |                         code = evalCode
 99 |                         JSCode.append(code)
100 |                     else:
101 |                         break
102 |                 except:
103 |                     error = str(sys.exc_info()[1])
104 |                     open('jserror.log', 'ab').write(error + newLine)
105 |                     errors.append(error)
106 |                     break
107 | 
108 |             if False:
109 |                 escapedVars = re.findall('(\w*?)\s*?=\s*?(unescape\((.*?)\))', code, re.DOTALL)
110 |                 for var in escapedVars:
111 |                     bytes = var[2]
112 |                     if bytes.find('+') != -1 or bytes.find('%') == -1:
113 |                         varContent = getVarContent(code, bytes)
114 |                         if len(varContent) > 150:
115 |                             ret = unescape(varContent)
116 |                             if ret[0] != -1:
117 |                                 bytes = ret[1]
118 |                                 urls = re.findall('https?://.*$', bytes, re.DOTALL)
119 |                                 if bytes not in unescapedBytes:
120 |                                     unescapedBytes.append(bytes)
121 |                                 for url in urls:
122 |                                     if url not in urlsFound:
123 |                                         urlsFound.append(url)
124 |                     else:
125 |                         bytes = bytes[1:-1]
126 |                         if len(bytes) > 150:
127 |                             ret = unescape(bytes)
128 |                             if ret[0] != -1:
129 |                                 bytes = ret[1]
130 |                                 urls = re.findall('https?://.*$', bytes, re.DOTALL)
131 |                                 if bytes not in unescapedBytes:
132 |                                     unescapedBytes.append(bytes)
133 |                                 for url in urls:
134 |                                     if url not in urlsFound:
135 |                                         urlsFound.append(url)
136 |     except:
137 |         traceback.print_exc(file=open(errorsFile, 'a'))
138 |         errors.append('Unexpected error in the JSAnalysis module!!')
139 |     finally:
140 |         for js in JSCode:
141 |             if js == None or js == '':
142 |                 JSCode.remove(js)
143 |     return [JSCode, unescapedBytes, urlsFound, errors, context]
144 | 
145 | 
def getVarContent(jsCode, varContent):
    '''
        Rebuilds the value of a Javascript string expression made of quoted literals and variable names joined with "+"

        Whitespace is stripped from the expression first. Quoted parts contribute their content without the quotes;
        any other part is taken as a variable name, whose first assignment found in the code is resolved recursively.
        Parts with no assignment in the code contribute nothing.

        @param jsCode: The Javascript code (string)
        @param varContent: The content of the variable (string)
        @return: A string with real value of the variable
    '''
    for whitespace in ('\n', '\r', '\t', ' '):
        varContent = varContent.replace(whitespace, '')
    pieces = []
    for part in varContent.split('+'):
        if re.match('["\'].*?["\']', part, re.DOTALL):
            pieces.append(part[1:-1])
            continue
        assignments = re.findall(escapeString(part) + '\s*?=\s*?(.*?)[,;]', jsCode, re.DOTALL)
        if assignments != []:
            pieces.append(getVarContent(jsCode, assignments[0]))
    return ''.join(pieces)
169 | 
170 | 
def isJavascript(content):
    '''
        Given a string this method looks for typical Javascript strings and tries to identify if the string contains Javascript code or not.

        The content is accepted straight away when it contains an application/x-javascript <script> element.
        Otherwise it is rejected when it has characters outside printable ASCII (apart from a few control characters)
        or lacks any of ";", "(" and ")", and accepted only when enough typical strings, and enough distinct ones, are found.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval']
    keyStrings = [';', '(', ')']
    allowedControlChars = ['\n', '\r', '\t', '\f', '\x00']
    limit = 15
    minDistinctStringsFound = 5

    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True

    for char in content:
        charCode = ord(char)
        if charCode >= 127 or (charCode < 32 and char not in allowedControlChars):
            return False

    occurrences = [content.count(string) for string in JSStrings]
    for (string, count) in zip(JSStrings, occurrences):
        if count == 0 and string in keyStrings:
            return False

    results = sum(occurrences)
    distinctStringsFound = len([count for count in occurrences if count > 0])
    return results > limit and distinctStringsFound >= minDistinctStringsFound
205 | 
206 | 
def searchObfuscatedFunctions(jsCode, function):
    '''
        Search for obfuscated functions in the Javascript code: direct calls to the given function
        and calls made through aliases of it ("a = eval; a(code);"), followed recursively.

        Aliases that are empty or that are already being expanded in the current alias chain are
        ignored, so self-referential or cyclic assignments ("a = a;", "a = b; b = a;") cannot cause
        endless recursion.

        @param jsCode: The Javascript code (string)
        @param function: The function name to look for (string). It is inserted as-is in a regular expression.
        @return: List with obfuscated functions information [functionName,functionCall,containsReturns]
    '''
    if jsCode is None:
        return []

    def _search(name, chain):
        # chain holds the names currently being expanded, from the requested function down to name
        found = []
        calls = re.findall(r'\W(' + name + r'\s{0,5}?\((.*?)\)\s{0,5}?;)', jsCode, re.DOTALL)
        for call in calls:
            # call is a tuple (whole call statement, arguments text)
            containsReturn = re.search('return', call[1], re.IGNORECASE) is not None
            found.append([name, call, containsReturn])
        aliases = re.findall(r'\s*?((\w*?)\s*?=\s*?' + name + r')\s*?;', jsCode, re.DOTALL)
        for alias in aliases:
            aliasName = alias[1]
            # An empty name is not an identifier (it comes from things like "x==eval;") and a name
            # already in the chain would make the search loop forever
            if aliasName == '' or aliasName in chain:
                continue
            found += _search(aliasName, chain + (aliasName,))
        return found

    return _search(function, (function,))
229 | 
230 | 
def unescape(escapedBytes, unicode=True):
    '''
        This method unescapes the given string. It understands %XX, %uXXXX and \\uXXXX sequences.
        The two bytes of a uXXXX sequence are emitted low byte first. Text that is not part of an
        escape sequence is copied unchanged, including a separator that does not start a valid sequence.

        @param escapedBytes: A string to unescape
        @param unicode: If True a null byte is appended after every single-byte character (decoded %XX bytes and literal text)
        @return: A tuple (status,statusContent), where statusContent is an unescaped string in case status = 0 or an error in case status = -1
    '''
    # TODO: modify to accept a list of escaped strings?
    unicodePadding = '\x00' if unicode else ''
    try:
        hasBackslashU = escapedBytes.lower().find('\\u') != -1
        if not hasBackslashU and escapedBytes.find('%') == -1:
            # Nothing to unescape
            return (0, escapedBytes)
        # Backslash sequences take precedence over percent sequences
        separator = '\\' if hasBackslashU else '%'
        chunks = []
        for i, splitByte in enumerate(escapedBytes.split(separator)):
            if splitByte == '':
                continue
            if i == 0:
                # Text before the first separator is not escaped at all, keep it literally
                literal = splitByte
            elif len(splitByte) > 4 and re.match('u[0-9a-f]{4}', splitByte[:5], re.IGNORECASE):
                chunks.append(chr(int(splitByte[3:5], 16)) + chr(int(splitByte[1:3], 16)))
                literal = splitByte[5:]
            elif len(splitByte) > 1 and re.match('[0-9a-f]{2}', splitByte[:2], re.IGNORECASE):
                chunks.append(chr(int(splitByte[:2], 16)) + unicodePadding)
                literal = splitByte[2:]
            else:
                # Not a valid sequence: give back the separator that the split removed
                chunks.append(separator + unicodePadding)
                literal = splitByte
            chunks.extend(char + unicodePadding for char in literal)
        unescapedBytes = ''.join(chunks)
    except Exception:
        return (-1, 'Error while unescaping the bytes')
    return (0, unescapedBytes)
276 | 


--------------------------------------------------------------------------------
/peepdf/PDFCrypto.py:
--------------------------------------------------------------------------------
  1 | #
  2 | #    peepdf is a tool to analyse and modify PDF files
  3 | #    http://peepdf.eternal-todo.com
  4 | #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
  5 | #
  6 | #    Copyright (C) 2011-2014 Jose Miguel Esparza
  7 | #
  8 | #    This file is part of peepdf.
  9 | #
 10 | #        peepdf is free software: you can redistribute it and/or modify
 11 | #        it under the terms of the GNU General Public License as published by
 12 | #        the Free Software Foundation, either version 3 of the License, or
 13 | #        (at your option) any later version.
 14 | #
 15 | #        peepdf is distributed in the hope that it will be useful,
 16 | #        but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | #        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
 18 | #        GNU General Public License for more details.
 19 | #
 20 | #        You should have received a copy of the GNU General Public License
 21 | #        along with peepdf.    If not, see <http://www.gnu.org/licenses/>.
 22 | #
 23 | 
 24 | '''    
 25 |     Module to manage cryptographic operations with PDF files
 26 | '''
 27 | 
 28 | import hashlib
 29 | import struct
 30 | import random
 31 | import warnings
 32 | from itertools import cycle, izip
 33 | 
 34 | import aes
 35 | 
 36 | warnings.filterwarnings("ignore")
 37 | 
# 32-byte filler: the functions of this module append a prefix of it to passwords shorter than
# 32 bytes before hashing them, and use it as input when computing the user password.
paddingString = '\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A'
 39 | 
 40 | 
 41 | def computeEncryptionKey(password, dictOwnerPass, dictUserPass, dictOE, dictUE, fileID, pElement, dictKeyLength=128,
 42 |                          revision=3, encryptMetadata=False, passwordType=None):
 43 |     '''
 44 |         Compute an encryption key to encrypt/decrypt the PDF file
 45 |         
 46 |         @param password: The password entered by the user
 47 |         @param dictOwnerPass: The owner password from the standard security handler dictionary
 48 |         @param dictUserPass: The user password from the standard security handler dictionary
 49 |         @param dictOE: The owner encrypted string from the standard security handler dictionary
 50 |         @param dictUE:The user encrypted string from the standard security handler dictionary
 51 |         @param fileID: The /ID element in the trailer dictionary of the PDF file
 52 |         @param pElement: The /P element of the Encryption dictionary
 53 |         @param dictKeyLength: The length of the key
 54 |         @param revision: The algorithm revision
 55 |         @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not
 56 |         @param passwordType: It specifies the given password type. It can be 'USER', 'OWNER' or None.
 57 |         @return: A tuple (status,statusContent), where statusContent is the encryption key in case status = 0 or an error message in case status = -1
 58 |     '''
 59 |     if revision != 5:
 60 |         keyLength = dictKeyLength / 8
 61 |         lenPass = len(password)
 62 |         if lenPass > 32:
 63 |             password = password[:32]
 64 |         elif lenPass < 32:
 65 |             password += paddingString[:32 - lenPass]
 66 |         md5input = password + dictOwnerPass + struct.pack('<I', abs(int(pElement))) + fileID
 67 |         if revision > 3 and not encryptMetadata:
 68 |             md5input += '\xFF' * 4
 69 |         key = hashlib.md5(md5input).digest()
 70 |         if revision > 2:
 71 |             counter = 0
 72 |             while counter < 50:
 73 |                 key = hashlib.md5(key[:keyLength]).digest()
 74 |                 counter += 1
 75 |             key = key[:keyLength]
 76 |         elif revision == 2:
 77 |             key = key[:5]
 78 |         return (0, key)
 79 |     else:
 80 |         if passwordType == 'USER':
 81 |             password = password.encode('utf-8')[:127]
 82 |             kSalt = dictUserPass[40:48]
 83 |             intermediateKey = hashlib.sha256(password + kSalt).digest()
 84 |             ret = aes.decryptData('\0' * 16 + dictUE, intermediateKey)
 85 |         elif passwordType == 'OWNER':
 86 |             password = password.encode('utf-8')[:127]
 87 |             kSalt = dictOwnerPass[40:48]
 88 |             intermediateKey = hashlib.sha256(password + kSalt + dictUserPass).digest()
 89 |             ret = aes.decryptData('\0' * 16 + dictOE, intermediateKey)
 90 |         return ret
 91 | 
 92 | 
 93 | def computeObjectKey(id, generationNum, encryptionKey, keyLengthBytes, algorithm='RC4'):
 94 |     '''
 95 |         Compute the key necessary to encrypt each object, depending on the id and generation number. Only necessary with /V < 5.
 96 |         
 97 |         @param id: The object id
 98 |         @param generationNum: The generation number of the object
 99 |         @param encryptionKey: The encryption key
100 |         @param keyLengthBytes: The length of the encryption key in bytes
101 |         @param algorithm: The algorithm used in the encryption/decryption process
102 |         @return: The computed key in string format
103 |     '''
104 |     key = encryptionKey + struct.pack('<i', id)[:3] + struct.pack('<i', generationNum)[:2]
105 |     if algorithm == 'AES':
106 |         key += '\x73\x41\x6C\x54'  # sAlT
107 |     key = hashlib.md5(key).digest()
108 |     if keyLengthBytes + 5 < 16:
109 |         key = key[:keyLengthBytes + 5]
110 |     else:
111 |         key = key[:16]
112 |     # AES: block size = 16 bytes, initialization vector (16 bytes), random, first bytes encrypted string
113 |     return key
114 | 
115 | 
def computeOwnerPass(ownerPassString, userPassString, keyLength=128, revision=3):
    '''
        Compute the owner password entry: the padded user password encrypted with RC4, using a key
        derived from the MD5 digest of the padded owner password.

        @param ownerPassString: The owner password entered by the user
        @param userPassString: The user password entered by the user
        @param keyLength: The length of the key
        @param revision: The algorithm revision
        @return: The computed password in string format
    '''
    # TODO: revision 5
    keyLength = keyLength / 8

    # Owner password: exactly 32 bytes, truncated or completed with the padding string
    missing = 32 - len(ownerPassString)
    if missing < 0:
        ownerPassString = ownerPassString[:32]
    elif missing > 0:
        ownerPassString += paddingString[:missing]
    rc4Key = hashlib.md5(ownerPassString).digest()
    if revision > 2:
        for _ in range(50):
            rc4Key = hashlib.md5(rc4Key).digest()
    rc4Key = rc4Key[:keyLength]

    # User password: same 32-byte normalisation
    missing = 32 - len(userPassString)
    if missing < 0:
        userPassString = userPassString[:32]
    elif missing > 0:
        userPassString += paddingString[:missing]

    ownerPass = RC4(userPassString, rc4Key)
    if revision > 2:
        # 19 extra RC4 passes, each with the key XORed with the pass number
        for counter in range(1, 20):
            newKey = ''.join(chr(ord(keyChar) ^ counter) for keyChar in rc4Key)
            ownerPass = RC4(ownerPass, newKey)
    return ownerPass
155 | 
156 | 
def computeUserPass(userPassString, dictO, fileID, pElement, keyLength=128, revision=3, encryptMetadata=False):
    '''
        Compute the user password entry of the PDF file from the password typed by the user

        @param userPassString: The user password entered by the user
        @param dictO: The owner password entry, handed to computeEncryptionKey as its owner password argument
        @param fileID: The /ID element in the trailer dictionary of the PDF file
        @param pElement: The /P element of the /Encryption dictionary
        @param keyLength: The length of the key
        @param revision: The algorithm revision
        @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not
        @return: A tuple (status,statusContent), where statusContent is the computed password in case status = 0 or an error message in case status = -1
    '''
    # TODO: revision 5
    ret = computeEncryptionKey(userPassString, dictO, '', '', '', fileID, pElement, keyLength, revision,
                               encryptMetadata)
    if ret[0] == -1:
        return ret
    rc4Key = ret[1]

    userPass = ''
    if revision == 2:
        userPass = RC4(paddingString, rc4Key)
    elif revision > 2:
        userPass = RC4(hashlib.md5(paddingString + fileID).digest(), rc4Key)
        # 19 extra RC4 passes, each with the key XORed with the pass number
        for counter in range(1, 20):
            newKey = ''.join(chr(ord(keyChar) ^ counter) for keyChar in rc4Key)
            userPass = RC4(userPass, newKey)
        # The last 16 bytes are random filler
        userPass += ''.join(chr(random.randint(32, 255)) for _ in range(16))
    return (0, userPass)
199 | 
200 | 
def isUserPass(password, computedUserPass, dictU, revision):
    '''
        Tell whether the given password is the User password of the file

        @param password: The given password or the empty password
        @param computedUserPass: The computed user password of the file
        @param dictU: The /U element of the /Encrypt dictionary
        @param revision: The number of revision of the standard security handler
        @return The boolean telling if the given password is the user password or not (None for revisions above 5)
    '''
    if revision == 5:
        # /U = 32-byte hash followed by the 8-byte validation salt
        vSalt = dictU[32:40]
        return hashlib.sha256(password + vSalt).digest() == dictU[:32]
    if revision in (3, 4):
        # Only the first 16 bytes are compared
        return computedUserPass[:16] == dictU[:16]
    if revision < 3:
        return computedUserPass == dictU
228 | 
229 | 
def isOwnerPass(password, dictO, dictU, computedUserPass, keyLength, revision):
    '''
        Tell whether the given password is the owner password of the file

        @param password: The given password or the empty password
        @param dictO: The /O element of the /Encrypt dictionary
        @param dictU: The /U element of the /Encrypt dictionary
        @param computedUserPass: The computed user password of the file
        @param keyLength: The length of the key
        @param revision: The algorithm revision
        @return The boolean telling if the given password is the owner password or not
    '''
    if revision == 5:
        # /O = 32-byte hash followed by the 8-byte validation salt
        vSalt = dictO[32:40]
        return hashlib.sha256(password + vSalt + dictU).digest() == dictO[:32]

    keyLength = keyLength / 8
    missing = 32 - len(password)
    if missing < 0:
        password = password[:32]
    elif missing > 0:
        password += paddingString[:missing]
    rc4Key = hashlib.md5(password).digest()
    if revision > 2:
        for _ in range(50):
            rc4Key = hashlib.md5(rc4Key).digest()
    rc4Key = rc4Key[:keyLength]

    if revision == 2:
        userPass = RC4(dictO, rc4Key)
    elif revision > 2:
        # Undo the 20 RC4 passes, from pass number 19 down to 0
        for counter in range(19, -1, -1):
            newKey = ''.join(chr(ord(keyChar) ^ counter) for keyChar in rc4Key)
            dictO = RC4(dictO, newKey)
        userPass = dictO
    else:
        # Is it possible??
        userPass = ''
    # The recovered value is then checked as a user password
    return isUserPass(userPass, computedUserPass, dictU, revision)
278 | 
279 | 
280 | def RC4(data, key):
281 |     '''
282 |         RC4 implementation
283 |         
284 |         @param data: Bytes to be encrypyed/decrypted
285 |         @param key: Key used for the algorithm
286 |         @return: The encrypted/decrypted bytes
287 |     '''
288 |     y = 0
289 |     hash = {}
290 |     box = {}
291 |     ret = ''
292 |     keyLength = len(key)
293 |     dataLength = len(data)
294 | 
295 |     # Initialization
296 |     for x in range(256):
297 |         hash[x] = ord(key[x % keyLength])
298 |         box[x] = x
299 |     for x in range(256):
300 |         y = (y + int(box[x]) + int(hash[x])) % 256
301 |         tmp = box[x]
302 |         box[x] = box[y]
303 |         box[y] = tmp
304 | 
305 |     z = y = 0
306 |     for x in range(0, dataLength):
307 |         z = (z + 1) % 256
308 |         y = (y + box[z]) % 256
309 |         tmp = box[z]
310 |         box[z] = box[y]
311 |         box[y] = tmp
312 |         k = box[((box[z] + box[y]) % 256)]
313 |         ret += chr(ord(data[x]) ^ k)
314 |     return ret
315 | 
316 | 
317 | '''
318 |     Author: Evan Fosmark (http://www.evanfosmark.com/2008/06/xor-encryption-with-python/)
319 | '''
320 | 
321 | 
def xor(bytes, key):
    '''
        XOR every character of the input with the key, repeating the key as many times as needed

        @param bytes: Bytes to be xored
        @param key: Key used for the operation, it's cycled.
        @return: The xored bytes
    '''
    repeatedKey = cycle(key)
    xored = []
    for dataChar, keyChar in izip(bytes, repeatedKey):
        xored.append(chr(ord(dataChar) ^ ord(keyChar)))
    return ''.join(xored)
332 | 


--------------------------------------------------------------------------------
/peepdf/README:
--------------------------------------------------------------------------------
  1 | ** Home page **
  2 | 
  3 | http://peepdf.eternal-todo.com
  4 | http://twitter.com/peepdf
  5 | 
  6 | 
  7 | ** Dependencies **
  8 | 
  9 | - In order to analyse Javascript code "PyV8" is needed:
 10 | 
 11 |     http://code.google.com/p/pyv8/
 12 | 
 13 | 
 14 | - The "sctest" command is a wrapper of "sctest" (libemu). Besides libemu pylibemu is used and must be installed:
 15 | 
 16 |     http://libemu.carnivore.it (latest version from git repository, Sourceforge package is outdated)
 17 |     https://github.com/buffer/pylibemu
 18 | 
 19 | 
 20 | - To support XML output "lxml" is needed:
 21 | 
 22 |     http://lxml.de/installation.html
 23 |     
 24 | 
 25 | - Included modules: lzw, colorama, jsbeautifier, ccitt, pythonaes (Thanks to all the developers!!)
 26 | 
 27 | 
 28 | 
 29 | ** Installation **
 30 | 
No installation is needed apart from the dependencies listed above; just execute it!
 32 | 
 33 | 
 34 | 
 35 | ** Execution **
 36 | 
 37 | There are two important options when peepdf is executed:
 38 | 
-f: Ignores the parsing errors. Analysing malicious files probably leads to parsing errors, so this parameter should be set.
 40 | -l: Sets the loose mode, so does not search for the endobj tag because it's not obligatory. Helpful with malformed files.
 41 | 
 42 | 
 43 | * Simple execution
 44 | 
 45 | Shows the statistics of the file after being decoded/decrypted and analysed:
 46 | 
 47 |     python peepdf.py [options] pdf_file
 48 | 
 49 | 
 50 | * Interactive console
 51 | 
Executes the interactive console so you can explore the PDF file:
 53 | 
 54 |     python peepdf.py -i [options] pdf_file
 55 | 
 56 | If no PDF file is specified it's possible to use the decode/encode/js*/sctest commands and create a new PDF file:
 57 | 
 58 |     python peepdf.py -i
 59 | 
 60 | 
 61 | * Batch execution
 62 | 
It's possible to use a commands file to specify the commands to be executed in batch mode. This type of execution is useful to automate the analysis of several files:
 64 | 
 65 |     python peepdf.py [options] -s commands_file pdf_file
 66 | 
 67 | 
 68 | 
 69 | ** Updating **
 70 | 
 71 | Just type this and you will be updated to the latest version from the repository:
 72 | 
 73 |     python peepdf.py -u
 74 | 
 75 | 
 76 | 
 77 | ** Some hints **
 78 | 
 79 | If the information shown when a PDF file is parsed is not enough to know if it's harmful or not, the following commands can help to do it:
 80 | 
 81 | * tree
 82 | 
 83 | Shows the tree graph of the file or specified version. Here we can see suspicious elements.
 84 | 
 85 | 
 86 | * offsets 
 87 | 
Shows the physical map of the file or of the specified version of the document. This is helpful to spot unusually big objects or big gaps between objects.
 89 | 
 90 | 
 91 | * search
 92 | 
Searches for the specified string or hexadecimal string in the objects (decoded and encrypted streams included).
 94 | 
 95 | 
 96 | * object/rawobject
 97 | 
 98 | Shows the (raw) content of the object.
 99 | 
100 | 
101 | * stream/rawstream
102 | 
103 | Shows the (raw) content of the stream.
104 | 
105 | 
106 | * The rest of commands, of course
107 | 
108 | > help
109 | 
110 | 
111 | 
112 | ** Bugs **
113 | 
114 | Send me bugs and comments, please!! ;) You can do it via mail (jesparza AT eternal-todo.com) or through Google Code (http://peepdf.googlecode.com).
115 | 
116 | Thanks!!
117 | 


--------------------------------------------------------------------------------
/peepdf/TODO:
--------------------------------------------------------------------------------
 1 | Pending tasks:
 2 | 
 3 | - User manual
 4 | - Documentation of methods in PDFCore.py
- Add the rest of the supported stream filters (and test the existing ones better)
 6 | - Automatic analysis of embedded PDF files
 7 | - Add AES to the encryption implementation
 8 | - Improve the automatic Javascript analysis, getting code from other parts of the documents (getAnnots, etc)
 9 | - GUI
10 | - ActionScript analysis?


--------------------------------------------------------------------------------
/peepdf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/peepdf/__init__.py


--------------------------------------------------------------------------------
/peepdf/aes.py:
--------------------------------------------------------------------------------
 1 | #
 2 | #    peepdf is a tool to analyse and modify PDF files
 3 | #    http://peepdf.eternal-todo.com
 4 | #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
 5 | #
 6 | #    Copyright (C) 2012-2014 Jose Miguel Esparza
 7 | #
 8 | #    This file is part of peepdf.
 9 | #
10 | #        peepdf is free software: you can redistribute it and/or modify
11 | #        it under the terms of the GNU General Public License as published by
12 | #        the Free Software Foundation, either version 3 of the License, or
13 | #        (at your option) any later version.
14 | #
15 | #        peepdf is distributed in the hope that it will be useful,
16 | #        but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | #        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
18 | #        GNU General Public License for more details.
19 | #
20 | #        You should have received a copy of the GNU General Public License
21 | #        along with peepdf.    If not, see <http://www.gnu.org/licenses/>.
22 | #
23 | 
24 | """
25 | Created from the demonstration of the pythonaes package.
26 | 
27 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
28 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
29 | """
30 | 
31 | from aespython import key_expander, aes_cipher, cbc_mode
32 | 
33 | 
def decryptData(data, password=None, keyLength=None, mode='CBC'):
    '''
        Method added for peepdf. Decrypts AES data whose first 16 bytes are the initialization
        vector. Trailing bytes that do not fill a whole 16-byte block are dropped before decrypting.

        @param data: The initialization vector followed by the encrypted bytes (string)
        @param password: The AES key (string)
        @param keyLength: The length of the key in bits (128, 192 or 256). By default it is taken from the key.
        @param mode: The block cipher mode. Only 'CBC' is supported.
        @return: A tuple (status,statusContent), where statusContent is the decrypted data in case status = 0 or an error message in case status = -1
    '''
    if password is None:
        return (-1, 'Missing key in AES decryption process')
    if keyLength is None:
        keyLength = len(password) * 8
    if keyLength not in [128, 192, 256]:
        return (-1, 'Bad length key in AES decryption process')
    if mode != 'CBC':
        # No other mode object is ever created, so fail cleanly instead of crashing later
        return (-1, 'Unsupported mode in AES decryption process')

    # The cipher classes work with lists of integers
    iv = [ord(char) for char in data[:16]]
    key = [ord(char) for char in password]
    data = data[16:]
    if len(data) % 16 != 0:
        data = data[:-(len(data) % 16)]
    keyExpander = key_expander.KeyExpander(keyLength)
    expandedKey = keyExpander.expand(key)
    aesCipher = aes_cipher.AESCipher(expandedKey)
    aesMode = cbc_mode.CBCMode(aesCipher, 16)
    aesMode.set_iv(iv)
    decryptedChars = []
    for i in range(0, len(data), 16):
        ciphertext = [ord(char) for char in data[i:i + 16]]
        decryptedChars.extend(chr(byte) for byte in aesMode.decrypt_block(ciphertext))
    return (0, ''.join(decryptedChars))
61 | 


--------------------------------------------------------------------------------
/peepdf/aespython/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/peepdf/aespython/__init__.py


--------------------------------------------------------------------------------
/peepdf/aespython/aes_cipher.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | AES Block Cipher.
 4 | 
 5 | Performs single block cipher decipher operations on a 16 element list of integers.
 6 | These integers represent 8 bit bytes in a 128 bit block.
 7 | The result of cipher or decipher operations is the transformed 16 element list of integers.
 8 | 
 9 | Running this file as __main__ will result in a self-test of the algorithm.
10 | 
11 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
12 | 
13 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
14 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
15 | """
16 | __author__ = "Adam Newman"
17 | 
18 | # Normally use relative import. In test mode use local import.
19 | try:
20 |     from .aes_tables import sbox, i_sbox, galI, galNI
21 | except ValueError:
22 |     from aes_tables import sbox, i_sbox, galI, galNI
# The statements below build, as plain text, the pieces of Python source that AESCipher later
# assembles and exec's into its block methods.

# Names of the 16 state variables ("s0,...,sf") and of the 16 round-key variables ("r0,...,rf")
ups = ",".join("s%x" % x for x in range(16))
upr = ups.replace("s", "r")
# 16 comma-separated expressions, each XORing four terms over state variables with one round-key
# variable. NOTE(review): presumably MixColumns fused with AddRoundKey - confirm against FIPS-197.
mix = ",".join(",".join(
    ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]^r%x" % (i + (i[0] + (0, 3, 2, 1)[j],))).format(j & 3, j + 1 & 3,
                                                                                             j + 2 & 3, j + 3 & 3) for j
    in (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15))).replace("g2",
                                                                                                      "g").replace("g3",
                                                                                                                   "g")
# After the replacements above the g2/g3 terms read "g[sX]"; turn each of them into the bare
# variable "sX" (no table lookup)
i = mix.find("g[")
while i != -1:
    mix = mix[:i] + mix[i + 2:i + 4] + mix[i + 5:]
    i = mix.find("g[", i)
# Same shape as mix but keeping the four g0..g3 lookups and without the round-key term
imix = ",".join(",".join(
    ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]" % i).format(j & 3, j + 1 & 3, j + 2 & 3, j + 3 & 3) for j in
    (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15)))
# Two reorderings of the state variable names.
# NOTE(review): these look like the ShiftRows / inverse ShiftRows permutations - confirm.
csl = ["s%x" % (x * 5 & 15) for x in range(16)]
csr = ["s%x" % (x * -3 & 15) for x in range(16)]
# Source text for table lookups "s[...]" over the reordered variables, plain (box) or XORed with
# the matching round-key variable (ibox, xor)
box = ",".join("s[%s]" % i for i in csl)
ibox = ",".join("s[%s]^r%x" % i for i in zip(csr, range(16)))
xor = ",".join("s[%s]^r%x" % i for i in zip(csl, range(16)))
# Statements XORing every state variable with its round-key variable in place
xori = ";".join("s%x^=r%x" % (i, i) for i in range(16))
# Method template. The placeholders S, R and X are expanded here; "!", "B", "M" and the expression
# after "return " are filled in by AESCipher, once per generated method.
ciph = """def decipher_block(f,s):
 g0,g1,g2,g3=galNI;ek=f._expanded_key;S=s+[0]*(16-len(s));s=sbox;R=ek[:16];X
 for f in range(!16):R=ek[f:f+16];S=B;S=M
 R=ek[f+16:]
 return """.replace("S", ups).replace("R", upr).replace("X", xori)
49 | 
50 | 
class AESCipher:
    """
    Single-block AES cipher working on an already expanded key.

    The methods cipher_block(block) and decipher_block(block) are not written out in this class:
    their source is generated from the module-level "ciph" template and defined with exec.
    """

    def __init__(self, expanded_key):
        self._expanded_key = expanded_key
        # NOTE(review): presumably the offset of the last 16-byte round key - confirm
        self._Nr = len(expanded_key) - 16

    # cipher_block: the "dec" -> "c" replacement renames the template's decipher_block, "!" becomes
    # the range arguments 16,f._Nr,16 and the text in "xor" is appended as the returned expression
    exec (
    ciph.replace("g2,g3", "").replace("dec", "c").replace("!", "16,f._Nr,").replace("B", box).replace("M", mix) + xor)
    # decipher_block: uses the galI and i_sbox tables, iterates the expanded key backwards
    # (range f._Nr-16,0,-16) and returns the expression held in "ibox"
    exec (ciph.replace("NI", "I").replace(":16", "f._Nr:").replace("f+16:", ":16").replace("!", "f._Nr-16,0,-").replace(
        "sbox", "i_sbox").replace("B", ibox).replace("M", imix) + ibox)
60 | 
61 | 
62 | import unittest
63 | 
64 | 
class TestCipher(unittest.TestCase):
    """Self-test of AESCipher against the validated vectors stored in test_keys."""

    def test_cipher(self):
        """Test AES cipher with all key lengths"""
        import test_keys
        import key_expander

        test_data = test_keys.TestKeys()
        for key_size in 128, 192, 256:
            test_key_expander = key_expander.KeyExpander(key_size)
            test_expanded_key = test_key_expander.expand(test_data.test_key[key_size])
            test_cipher = AESCipher(test_expanded_key)
            # Each check counts the positions where result and reference agree: all 16 must match.
            # assertEqual is used because assertEquals is a deprecated alias.
            test_result_ciphertext = test_cipher.cipher_block(test_data.test_block_plaintext)
            self.assertEqual(len(
                [i for i, j in zip(test_result_ciphertext, test_data.test_block_ciphertext_validated[key_size]) if
                 i == j]),
                             16, msg='Test %d bit cipher' % key_size)
            test_result_plaintext = test_cipher.decipher_block(test_data.test_block_ciphertext_validated[key_size])
            self.assertEqual(len([i for i, j in zip(test_result_plaintext, test_data.test_block_plaintext) if i == j]),
                             16, msg='Test %d bit decipher' % key_size)
84 | 
85 | 
# Run the self-test above when this file is executed directly
if __name__ == "__main__":
    unittest.main()
88 | 


--------------------------------------------------------------------------------
/peepdf/aespython/cbc_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | CBC Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
14 | 
class CBCMode:
    """Perform CBC operation on a block and retain IV information for next operation

    The chaining value is kept as a private copy, so lists handed in by or
    returned to the caller can be reused or modified without disturbing the
    state used for the next block.
    """

    def __init__(self, block_cipher, block_size):
        """Store the cipher object and block size; the IV starts as all zeros.

        block_cipher must provide cipher_block() and decipher_block().
        """
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Replace the IV; a value whose length is not block_size is ignored."""
        if len(iv) == self._block_size:
            # Copy so later changes to the caller's sequence do not leak in.
            self._iv = list(iv)

    def encrypt_block(self, plaintext):
        """XOR plaintext with the current IV, encrypt it, and return the ciphertext.

        The ciphertext becomes the IV for the next call.
        """
        mixed = [p ^ v for p, v in zip(plaintext, self._iv)]
        ciphertext = self._block_cipher.cipher_block(mixed)
        # Keep our own copy: the returned block belongs to the caller.
        self._iv = list(ciphertext)
        return ciphertext

    def decrypt_block(self, ciphertext):
        """Decrypt ciphertext, XOR it with the current IV, and return the plaintext list.

        The ciphertext becomes the IV for the next call.
        """
        plaintext = list(self._block_cipher.decipher_block(ciphertext))
        for i, v in enumerate(self._iv):
            plaintext[i] ^= v
        # Copy rather than alias the caller's buffer.
        self._iv = list(ciphertext)
        return plaintext
36 | 
37 | 
38 | import unittest
39 | 
40 | 
class TestEncryptionMode(unittest.TestCase):
    """Self-test of CBCMode using the key, IV and four-block vectors held in test_keys."""

    def test_mode(self):
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        # The key is a class-level list shared by every mode test; expand a
        # copy so that an expander which grows its argument in place cannot
        # leave an oversized key behind for the next test in the same process.
        test_expanded_key = test_expander.expand(list(test_data.test_mode_key))

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cbc = CBCMode(test_cipher, 16)

        # Encrypt the four plaintext blocks in sequence from the test IV.
        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_cbc.encrypt_block(test_data.test_mode_plaintext[k])
            matching = [i for i, j in zip(test_data.test_cbc_ciphertext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='CBC encrypt test block %d' % k)

        # Reset the IV and run the same blocks back through decryption.
        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_cbc.decrypt_block(test_data.test_cbc_ciphertext[k])
            matching = [i for i, j in zip(test_data.test_mode_plaintext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='CBC decrypt test block %d' % k)
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     unittest.main()
76 | 


--------------------------------------------------------------------------------
/peepdf/aespython/cfb_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | CFB Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
14 | 
class CFBMode:
    """Perform CFB operation on a block and retain IV information for next operation

    The feedback value is kept as a private copy, so lists handed in by or
    returned to the caller can be reused or modified without disturbing the
    state used for the next block.
    """

    def __init__(self, block_cipher, block_size):
        """Store the cipher object and block size; the IV starts as all zeros.

        Only block_cipher.cipher_block() is used, for both directions.
        """
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Replace the IV; a value whose length is not block_size is ignored."""
        if len(iv) == self._block_size:
            # Copy so later changes to the caller's sequence do not leak in.
            self._iv = list(iv)

    def encrypt_block(self, plaintext):
        """Encrypt the current IV, XOR it with plaintext, and return the ciphertext list.

        The ciphertext becomes the IV for the next call.
        """
        cipher_iv = self._block_cipher.cipher_block(self._iv)
        ciphertext = [p ^ c for p, c in zip(plaintext, cipher_iv)]
        # Keep our own copy: the returned block belongs to the caller.
        self._iv = list(ciphertext)
        return ciphertext

    def decrypt_block(self, ciphertext):
        """Encrypt the current IV, XOR it with ciphertext, and return the plaintext list.

        The ciphertext becomes the IV for the next call.
        """
        cipher_iv = self._block_cipher.cipher_block(self._iv)
        # Copy rather than alias the caller's buffer.
        self._iv = list(ciphertext)
        return [c ^ t for c, t in zip(cipher_iv, ciphertext)]
36 | 
37 | 
38 | import unittest
39 | 
40 | 
class TestEncryptionMode(unittest.TestCase):
    """Self-test of CFBMode using the key, IV and four-block vectors held in test_keys."""

    def test_mode(self):
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        # The key is a class-level list shared by every mode test; expand a
        # copy so that an expander which grows its argument in place cannot
        # leave an oversized key behind for the next test in the same process.
        test_expanded_key = test_expander.expand(list(test_data.test_mode_key))

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cfb = CFBMode(test_cipher, 16)

        # Encrypt the four plaintext blocks in sequence from the test IV.
        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_cfb.encrypt_block(test_data.test_mode_plaintext[k])
            matching = [i for i, j in zip(test_data.test_cfb_ciphertext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='CFB encrypt test block' + str(k))

        # Reset the IV and run the same blocks back through decryption.
        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_cfb.decrypt_block(test_data.test_cfb_ciphertext[k])
            matching = [i for i, j in zip(test_data.test_mode_plaintext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='CFB decrypt test block' + str(k))
72 | 
73 | 
74 | if __name__ == "__main__":
75 |     unittest.main()
76 | 


--------------------------------------------------------------------------------
/peepdf/aespython/key_expander.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | AES Key Expansion.
  5 | 
  6 | Expands 128, 192, or 256 bit key for use with AES
  7 | 
  8 | Running this file as __main__ will result in a self-test of the algorithm.
  9 | 
 10 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
 11 | 
 12 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
 13 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
 14 | """
 15 | __author__ = "Adam Newman"
 16 | 
 17 | # Normally use relative import. In test mode use local import.
 18 | try:
 19 |     from .aes_tables import sbox, rcon
 20 | except ValueError:
 21 |     from aes_tables import sbox, rcon
 22 | from operator import xor
 23 | 
 24 | 
 25 | class KeyExpander:
 26 |     """Perform AES Key Expansion"""
 27 | 
 28 |     _expanded_key_length = {128: 176, 192: 208, 256: 240}
 29 | 
 30 |     def __init__(self, key_length):
 31 |         self._key_length = key_length
 32 |         self._n = key_length >> 3
 33 | 
 34 |         if key_length in self._expanded_key_length:
 35 |             self._b = self._expanded_key_length[key_length]
 36 |         else:
 37 |             raise LookupError('Invalid Key Size')
 38 | 
 39 |     def expand(self, new_key):
 40 |         """
 41 |             Expand the encryption key per AES key schedule specifications
 42 | 
 43 |             http://en.wikipedia.org/wiki/Rijndael_key_schedule#Key_schedule_description
 44 |         """
 45 |         # First n bytes are copied from key
 46 |         len_new_key = len(new_key)
 47 |         if len_new_key != self._n:
 48 |             raise RuntimeError('expand(): key size is invalid')
 49 |         rcon_iter = 1
 50 |         nex = new_key.extend
 51 | 
 52 |         # Grow the key until it is the correct length
 53 |         while 1:
 54 |             # Copy last 4 bytes of extended key, apply core, increment i(rcon_iter),
 55 |             # core Append the list of elements 1-3 and list comprised of element 0 (circular rotate left)
 56 |             # core For each element of this new list, put the result of sbox into output array.
 57 |             # xor with 4 bytes n bytes from end of extended key
 58 |             keyarr = [sbox[i] for i in new_key[-3:] + new_key[-4:-3]]
 59 |             # First byte of output array is XORed with rcon(iter)
 60 |             keyarr[0] ^= rcon[rcon_iter]
 61 |             nex(map(xor, keyarr, new_key[-self._n:4 - self._n]))
 62 |             rcon_iter += 1
 63 |             len_new_key += 4
 64 | 
 65 |             # Run three passes of 4 byte expansion using copy of 4 byte tail of extended key
 66 |             # which is then xor'd with 4 bytes n bytes from end of extended key
 67 |             for j in 0, 1, 2:
 68 |                 nex(map(xor, new_key[-4:], new_key[-self._n:4 - self._n]))
 69 |                 len_new_key += 4
 70 |             if len_new_key >= self._b:
 71 |                 return new_key
 72 |             else:
 73 |                 # If key length is 256 and key is not complete, add 4 bytes tail of extended key
 74 |                 # run through sbox before xor with 4 bytes n bytes from end of extended key
 75 |                 if self._key_length == 256:
 76 |                     nex(map(xor, [sbox[x] for x in new_key[-4:]], new_key[-self._n:4 - self._n]))
 77 |                     len_new_key += 4
 78 |                     if len_new_key >= self._b: return new_key
 79 | 
 80 |                 # If key length is 192 or 256 and key is not complete, run 2 or 3 passes respectively
 81 |                 # of 4 byte tail of extended key xor with 4 bytes n bytes from end of extended key
 82 |                 if self._key_length != 128:
 83 |                     for j in ((0, 1) if self._key_length == 192 else (0, 1, 2)):
 84 |                         nex(map(xor, new_key[-4:], new_key[-self._n:4 - self._n]))
 85 |                         len_new_key += 4
 86 |                     if len_new_key >= self._b: return new_key
 87 | 
 88 | 
 89 | import unittest
 90 | 
 91 | 
 92 | class TestKeyExpander(unittest.TestCase):
 93 |     def test_keys(self):
 94 |         """Test All Key Expansions"""
 95 |         import test_keys
 96 | 
 97 |         test_data = test_keys.TestKeys()
 98 |         for key_size in 128, 192, 256:
 99 |             test_expander = KeyExpander(key_size)
100 |             test_expanded_key = test_expander.expand(test_data.test_key[key_size])
101 |             self.assertEqual(
102 |                 len([i for i, j in zip(test_expanded_key, test_data.test_expanded_key_validated[key_size]) if i == j]),
103 |                 len(test_data.test_expanded_key_validated[key_size]),
104 |                 msg='Key expansion ' + str(key_size) + ' bit')
105 | 
106 | 
107 | if __name__ == "__main__":
108 |     unittest.main()
109 | 


--------------------------------------------------------------------------------
/peepdf/aespython/ofb_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | OFB Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
14 | 
class OFBMode:
    """Perform OFB operation on a block and retain IV information for next operation

    The IV supplied through set_iv() is copied, so the caller may reuse or
    modify its own sequence afterwards without disturbing the keystream.
    """

    def __init__(self, block_cipher, block_size):
        """Store the cipher object and block size; the IV starts as all zeros.

        Only block_cipher.cipher_block() is used, for both directions.
        """
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Replace the IV; a value whose length is not block_size is ignored."""
        if len(iv) == self._block_size:
            # Copy so later changes to the caller's sequence do not leak in.
            self._iv = list(iv)

    def encrypt_block(self, plaintext):
        """Encrypt the current IV, keep the result as the next IV, and XOR it with plaintext."""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = keystream
        return [p ^ k for p, k in zip(plaintext, keystream)]

    def decrypt_block(self, ciphertext):
        """Encrypt the current IV, keep the result as the next IV, and XOR it with ciphertext."""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = keystream
        return [k ^ c for k, c in zip(keystream, ciphertext)]
34 | 
35 | 
36 | import unittest
37 | 
38 | 
class TestEncryptionMode(unittest.TestCase):
    """Self-test of OFBMode using the key, IV and four-block vectors held in test_keys."""

    def test_mode(self):
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        # The key is a class-level list shared by every mode test; expand a
        # copy so that an expander which grows its argument in place cannot
        # leave an oversized key behind for the next test in the same process.
        test_expanded_key = test_expander.expand(list(test_data.test_mode_key))

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_ofb = OFBMode(test_cipher, 16)

        # Encrypt the four plaintext blocks in sequence from the test IV.
        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_ofb.encrypt_block(test_data.test_mode_plaintext[k])
            matching = [i for i, j in zip(test_data.test_ofb_ciphertext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='OFB encrypt test block' + str(k))

        # Reset the IV and run the same blocks back through decryption.
        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            produced = test_ofb.decrypt_block(test_data.test_ofb_ciphertext[k])
            matching = [i for i, j in zip(test_data.test_mode_plaintext[k], produced) if i == j]
            self.assertEqual(len(matching), 16, msg='OFB decrypt test block' + str(k))
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     unittest.main()
74 | 


--------------------------------------------------------------------------------
/peepdf/aespython/test_keys.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test keys and data for self-test operations.
  3 | 
  4 | Test data from:
  5 | NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
  6 | NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
  7 | 
  8 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
  9 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
 10 | """
 11 | __author__ = "Adam Newman"
 12 | 
 13 | 
 14 | class TestKeys:
 15 |     """Test data, keys, IVs, and output to use in self-tests"""
 16 |     test_key = {
 17 |         128: [
 18 |             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
 19 |         , 192: [
 20 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 21 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17]
 22 |         , 256: [
 23 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 24 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f]
 25 |     }
 26 | 
 27 |     test_expanded_key_validated = {
 28 |         128: [
 29 |             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 30 |             0xd6, 0xaa, 0x74, 0xfd, 0xd2, 0xaf, 0x72, 0xfa, 0xda, 0xa6, 0x78, 0xf1, 0xd6, 0xab, 0x76, 0xfe,
 31 |             0xb6, 0x92, 0xcf, 0x0b, 0x64, 0x3d, 0xbd, 0xf1, 0xbe, 0x9b, 0xc5, 0x00, 0x68, 0x30, 0xb3, 0xfe,
 32 |             0xb6, 0xff, 0x74, 0x4e, 0xd2, 0xc2, 0xc9, 0xbf, 0x6c, 0x59, 0x0c, 0xbf, 0x04, 0x69, 0xbf, 0x41,
 33 |             0x47, 0xf7, 0xf7, 0xbc, 0x95, 0x35, 0x3e, 0x03, 0xf9, 0x6c, 0x32, 0xbc, 0xfd, 0x05, 0x8d, 0xfd,
 34 |             0x3c, 0xaa, 0xa3, 0xe8, 0xa9, 0x9f, 0x9d, 0xeb, 0x50, 0xf3, 0xaf, 0x57, 0xad, 0xf6, 0x22, 0xaa,
 35 |             0x5e, 0x39, 0x0f, 0x7d, 0xf7, 0xa6, 0x92, 0x96, 0xa7, 0x55, 0x3d, 0xc1, 0x0a, 0xa3, 0x1f, 0x6b,
 36 |             0x14, 0xf9, 0x70, 0x1a, 0xe3, 0x5f, 0xe2, 0x8c, 0x44, 0x0a, 0xdf, 0x4d, 0x4e, 0xa9, 0xc0, 0x26,
 37 |             0x47, 0x43, 0x87, 0x35, 0xa4, 0x1c, 0x65, 0xb9, 0xe0, 0x16, 0xba, 0xf4, 0xae, 0xbf, 0x7a, 0xd2,
 38 |             0x54, 0x99, 0x32, 0xd1, 0xf0, 0x85, 0x57, 0x68, 0x10, 0x93, 0xed, 0x9c, 0xbe, 0x2c, 0x97, 0x4e,
 39 |             0x13, 0x11, 0x1d, 0x7f, 0xe3, 0x94, 0x4a, 0x17, 0xf3, 0x07, 0xa7, 0x8b, 0x4d, 0x2b, 0x30, 0xc5]
 40 |         , 192: [
 41 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 42 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x58, 0x46, 0xf2, 0xf9, 0x5c, 0x43, 0xf4, 0xfe,
 43 |         0x54, 0x4a, 0xfe, 0xf5, 0x58, 0x47, 0xf0, 0xfa, 0x48, 0x56, 0xe2, 0xe9, 0x5c, 0x43, 0xf4, 0xfe,
 44 |         0x40, 0xf9, 0x49, 0xb3, 0x1c, 0xba, 0xbd, 0x4d, 0x48, 0xf0, 0x43, 0xb8, 0x10, 0xb7, 0xb3, 0x42,
 45 |         0x58, 0xe1, 0x51, 0xab, 0x04, 0xa2, 0xa5, 0x55, 0x7e, 0xff, 0xb5, 0x41, 0x62, 0x45, 0x08, 0x0c,
 46 |         0x2a, 0xb5, 0x4b, 0xb4, 0x3a, 0x02, 0xf8, 0xf6, 0x62, 0xe3, 0xa9, 0x5d, 0x66, 0x41, 0x0c, 0x08,
 47 |         0xf5, 0x01, 0x85, 0x72, 0x97, 0x44, 0x8d, 0x7e, 0xbd, 0xf1, 0xc6, 0xca, 0x87, 0xf3, 0x3e, 0x3c,
 48 |         0xe5, 0x10, 0x97, 0x61, 0x83, 0x51, 0x9b, 0x69, 0x34, 0x15, 0x7c, 0x9e, 0xa3, 0x51, 0xf1, 0xe0,
 49 |         0x1e, 0xa0, 0x37, 0x2a, 0x99, 0x53, 0x09, 0x16, 0x7c, 0x43, 0x9e, 0x77, 0xff, 0x12, 0x05, 0x1e,
 50 |         0xdd, 0x7e, 0x0e, 0x88, 0x7e, 0x2f, 0xff, 0x68, 0x60, 0x8f, 0xc8, 0x42, 0xf9, 0xdc, 0xc1, 0x54,
 51 |         0x85, 0x9f, 0x5f, 0x23, 0x7a, 0x8d, 0x5a, 0x3d, 0xc0, 0xc0, 0x29, 0x52, 0xbe, 0xef, 0xd6, 0x3a,
 52 |         0xde, 0x60, 0x1e, 0x78, 0x27, 0xbc, 0xdf, 0x2c, 0xa2, 0x23, 0x80, 0x0f, 0xd8, 0xae, 0xda, 0x32,
 53 |         0xa4, 0x97, 0x0a, 0x33, 0x1a, 0x78, 0xdc, 0x09, 0xc4, 0x18, 0xc2, 0x71, 0xe3, 0xa4, 0x1d, 0x5d]
 54 |         , 256: [
 55 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 56 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 57 |         0xa5, 0x73, 0xc2, 0x9f, 0xa1, 0x76, 0xc4, 0x98, 0xa9, 0x7f, 0xce, 0x93, 0xa5, 0x72, 0xc0, 0x9c,
 58 |         0x16, 0x51, 0xa8, 0xcd, 0x02, 0x44, 0xbe, 0xda, 0x1a, 0x5d, 0xa4, 0xc1, 0x06, 0x40, 0xba, 0xde,
 59 |         0xae, 0x87, 0xdf, 0xf0, 0x0f, 0xf1, 0x1b, 0x68, 0xa6, 0x8e, 0xd5, 0xfb, 0x03, 0xfc, 0x15, 0x67,
 60 |         0x6d, 0xe1, 0xf1, 0x48, 0x6f, 0xa5, 0x4f, 0x92, 0x75, 0xf8, 0xeb, 0x53, 0x73, 0xb8, 0x51, 0x8d,
 61 |         0xc6, 0x56, 0x82, 0x7f, 0xc9, 0xa7, 0x99, 0x17, 0x6f, 0x29, 0x4c, 0xec, 0x6c, 0xd5, 0x59, 0x8b,
 62 |         0x3d, 0xe2, 0x3a, 0x75, 0x52, 0x47, 0x75, 0xe7, 0x27, 0xbf, 0x9e, 0xb4, 0x54, 0x07, 0xcf, 0x39,
 63 |         0x0b, 0xdc, 0x90, 0x5f, 0xc2, 0x7b, 0x09, 0x48, 0xad, 0x52, 0x45, 0xa4, 0xc1, 0x87, 0x1c, 0x2f,
 64 |         0x45, 0xf5, 0xa6, 0x60, 0x17, 0xb2, 0xd3, 0x87, 0x30, 0x0d, 0x4d, 0x33, 0x64, 0x0a, 0x82, 0x0a,
 65 |         0x7c, 0xcf, 0xf7, 0x1c, 0xbe, 0xb4, 0xfe, 0x54, 0x13, 0xe6, 0xbb, 0xf0, 0xd2, 0x61, 0xa7, 0xdf,
 66 |         0xf0, 0x1a, 0xfa, 0xfe, 0xe7, 0xa8, 0x29, 0x79, 0xd7, 0xa5, 0x64, 0x4a, 0xb3, 0xaf, 0xe6, 0x40,
 67 |         0x25, 0x41, 0xfe, 0x71, 0x9b, 0xf5, 0x00, 0x25, 0x88, 0x13, 0xbb, 0xd5, 0x5a, 0x72, 0x1c, 0x0a,
 68 |         0x4e, 0x5a, 0x66, 0x99, 0xa9, 0xf2, 0x4f, 0xe0, 0x7e, 0x57, 0x2b, 0xaa, 0xcd, 0xf8, 0xcd, 0xea,
 69 |         0x24, 0xfc, 0x79, 0xcc, 0xbf, 0x09, 0x79, 0xe9, 0x37, 0x1a, 0xc2, 0x3c, 0x6d, 0x68, 0xde, 0x36]
 70 |     }
 71 | 
 72 |     test_block_ciphertext_validated = {
 73 |         128: [
 74 |             0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a]
 75 |         , 192: [
 76 |         0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91]
 77 |         , 256: [
 78 |         0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89]
 79 |     }
 80 | 
 81 |     test_block_plaintext = [
 82 |         0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff]
 83 | 
 84 |     # After initial validation, these deviated from test in SP 800-38A to use same key, iv, and plaintext on tests.
 85 |     # Still valid, just easier to test with.
 86 |     test_mode_key = [
 87 |         0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
 88 |         0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4]
 89 |     test_mode_iv = [
 90 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
 91 |     test_mode_plaintext = [
 92 |         [0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a],
 93 |         [0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51],
 94 |         [0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef],
 95 |         [0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10]]
 96 |     test_cbc_ciphertext = [
 97 |         [0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6],
 98 |         [0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d],
 99 |         [0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61],
100 |         [0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b]]
101 |     test_cfb_ciphertext = [
102 |         [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
103 |         [0x39, 0xff, 0xed, 0x14, 0x3b, 0x28, 0xb1, 0xc8, 0x32, 0x11, 0x3c, 0x63, 0x31, 0xe5, 0x40, 0x7b],
104 |         [0xdf, 0x10, 0x13, 0x24, 0x15, 0xe5, 0x4b, 0x92, 0xa1, 0x3e, 0xd0, 0xa8, 0x26, 0x7a, 0xe2, 0xf9],
105 |         [0x75, 0xa3, 0x85, 0x74, 0x1a, 0xb9, 0xce, 0xf8, 0x20, 0x31, 0x62, 0x3d, 0x55, 0xb1, 0xe4, 0x71]]
106 |     test_ofb_ciphertext = [
107 |         [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
108 |         [0x4f, 0xeb, 0xdc, 0x67, 0x40, 0xd2, 0x0b, 0x3a, 0xc8, 0x8f, 0x6a, 0xd8, 0x2a, 0x4f, 0xb0, 0x8d],
109 |         [0x71, 0xab, 0x47, 0xa0, 0x86, 0xe8, 0x6e, 0xed, 0xf3, 0x9d, 0x1c, 0x5b, 0xba, 0x97, 0xc4, 0x08],
110 |         [0x01, 0x26, 0x14, 0x1d, 0x67, 0xf3, 0x7b, 0xe8, 0x53, 0x8f, 0x5a, 0x8b, 0xe7, 0x40, 0xe4, 0x84]]
111 | 
112 |     def hex_output(self, list):
113 |         # Debugging output helper
114 |         result = '['
115 |         for i in list[:-1]:
116 |             result += hex(i) + ','
117 |         return result + hex(list[-1]) + ']'
118 | 


--------------------------------------------------------------------------------
/peepdf/colorama/__init__.py:
--------------------------------------------------------------------------------
1 | from .initialise import init
2 | from .ansi import Fore, Back, Style
3 | from .ansitowin32 import AnsiToWin32
4 | 
5 | VERSION = '0.1.18'
6 | 


--------------------------------------------------------------------------------
/peepdf/colorama/ansi.py:
--------------------------------------------------------------------------------
 1 | '''
This module generates ANSI character codes for printing colors to terminals.
 3 | See: http://en.wikipedia.org/wiki/ANSI_escape_code
 4 | '''
 5 | 
 6 | CSI = '\033['
 7 | 
 8 | 
 9 | def code_to_chars(code):
10 |     return CSI + str(code) + 'm'
11 | 
12 | 
13 | class AnsiCodes(object):
14 |     def __init__(self, codes):
15 |         for name in dir(codes):
16 |             if not name.startswith('_'):
17 |                 value = getattr(codes, name)
18 |                 setattr(self, name, code_to_chars(value))
19 | 
20 | 
class AnsiFore:
    # Numeric codes for the eight foreground colors (30 through 37).
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(30, 38)
    # Code used to reset the foreground color.
    RESET = 39
31 | 
32 | 
class AnsiBack:
    # Numeric codes for the eight background colors (40 through 47).
    BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(40, 48)
    # Code used to reset the background color.
    RESET = 49
43 | 
44 | 
class AnsiStyle:
    # Numeric codes for text style, listed in ascending order.
    RESET_ALL = 0
    BRIGHT = 1
    DIM = 2
    NORMAL = 22
50 | 
51 | 
# Module-level instances whose attributes hold the escape sequence strings
# for the numeric codes declared on the corresponding Ansi* class.
Fore = AnsiCodes(AnsiFore)
Back = AnsiCodes(AnsiBack)
Style = AnsiCodes(AnsiStyle)
55 | 


--------------------------------------------------------------------------------
/peepdf/colorama/ansitowin32.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import sys
  3 | 
  4 | from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style
  5 | from .winterm import WinTerm, WinColor, WinStyle
  6 | from .win32 import windll
  7 | 
  8 | if windll is not None:
  9 |     winterm = WinTerm()
 10 | 
 11 | 
 12 | def is_a_tty(stream):
 13 |     return hasattr(stream, 'isatty') and stream.isatty()
 14 | 
 15 | 
 16 | class StreamWrapper(object):
 17 |     '''
 18 |     Wraps a stream (such as stdout), acting as a transparent proxy for all
 19 |     attribute access apart from method 'write()', which is delegated to our
 20 |     Converter instance.
 21 |     '''
 22 | 
 23 |     def __init__(self, wrapped, converter):
 24 |         # double-underscore everything to prevent clashes with names of
 25 |         # attributes on the wrapped stream object.
 26 |         self.__wrapped = wrapped
 27 |         self.__convertor = converter
 28 | 
 29 |     def __getattr__(self, name):
 30 |         return getattr(self.__wrapped, name)
 31 | 
 32 |     def write(self, text):
 33 |         self.__convertor.write(text)
 34 | 
 35 | 
 36 | class AnsiToWin32(object):
 37 |     '''
 38 |     Implements a 'write()' method which, on Windows, will strip ANSI character
 39 |     sequences from the text, and if outputting to a tty, will convert them into
 40 |     win32 function calls.
 41 |     '''
 42 |     ANSI_RE = re.compile('\033\[((?:\d|;)*)([a-zA-Z])')
 43 | 
 44 |     def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
 45 |         # The wrapped stream (normally sys.stdout or sys.stderr)
 46 |         self.wrapped = wrapped
 47 | 
 48 |         # should we reset colors to defaults after every .write()
 49 |         self.autoreset = autoreset
 50 | 
 51 |         # create the proxy wrapping our output stream
 52 |         self.stream = StreamWrapper(wrapped, self)
 53 | 
 54 |         on_windows = sys.platform.startswith('win')
 55 | 
 56 |         # should we strip ANSI sequences from our output?
 57 |         if strip is None:
 58 |             strip = on_windows
 59 |         self.strip = strip
 60 | 
 61 |         # should we should convert ANSI sequences into win32 calls?
 62 |         if convert is None:
 63 |             convert = on_windows and is_a_tty(wrapped)
 64 |         self.convert = convert
 65 | 
 66 |         # dict of ansi codes to win32 functions and parameters
 67 |         self.win32_calls = self.get_win32_calls()
 68 | 
 69 |         # are we wrapping stderr?
 70 |         self.on_stderr = self.wrapped is sys.stderr
 71 | 
 72 |     def should_wrap(self):
 73 |         '''
 74 |         True if this class is actually needed. If false, then the output
 75 |         stream will not be affected, nor will win32 calls be issued, so
 76 |         wrapping stdout is not actually required. This will generally be
 77 |         False on non-Windows platforms, unless optional functionality like
 78 |         autoreset has been requested using kwargs to init()
 79 |         '''
 80 |         return self.convert or self.strip or self.autoreset
 81 | 
 82 |     def get_win32_calls(self):
 83 |         if self.convert and winterm:
 84 |             return {
 85 |                 AnsiStyle.RESET_ALL: (winterm.reset_all,),
 86 |                 AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
 87 |                 AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
 88 |                 AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
 89 |                 AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
 90 |                 AnsiFore.RED: (winterm.fore, WinColor.RED),
 91 |                 AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
 92 |                 AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
 93 |                 AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
 94 |                 AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
 95 |                 AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
 96 |                 AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
 97 |                 AnsiFore.RESET: (winterm.fore,),
 98 |                 AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
 99 |                 AnsiBack.RED: (winterm.back, WinColor.RED),
100 |                 AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
101 |                 AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
102 |                 AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
103 |                 AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
104 |                 AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
105 |                 AnsiBack.WHITE: (winterm.back, WinColor.GREY),
106 |                 AnsiBack.RESET: (winterm.back,),
107 |             }
108 | 
109 |     def write(self, text):
110 |         if self.strip or self.convert:
111 |             self.write_and_convert(text)
112 |         else:
113 |             self.wrapped.write(text)
114 |             self.wrapped.flush()
115 |         if self.autoreset:
116 |             self.reset_all()
117 | 
118 |     def reset_all(self):
119 |         if self.convert:
120 |             self.call_win32('m', (0,))
121 |         else:
122 |             self.wrapped.write(Style.RESET_ALL)
123 | 
124 |     def write_and_convert(self, text):
125 |         '''
126 |         Write the given text to our wrapped stream, stripping any ANSI
127 |         sequences from the text, and optionally converting them into win32
128 |         calls.
129 |         '''
130 |         cursor = 0
131 |         for match in self.ANSI_RE.finditer(text):
132 |             start, end = match.span()
133 |             self.write_plain_text(text, cursor, start)
134 |             self.convert_ansi(*match.groups())
135 |             cursor = end
136 |         self.write_plain_text(text, cursor, len(text))
137 | 
138 |     def write_plain_text(self, text, start, end):
139 |         if start < end:
140 |             self.wrapped.write(text[start:end])
141 |             self.wrapped.flush()
142 | 
143 |     def convert_ansi(self, paramstring, command):
144 |         if self.convert:
145 |             params = self.extract_params(paramstring)
146 |             self.call_win32(command, params)
147 | 
148 |     def extract_params(self, paramstring):
149 |         def split(paramstring):
150 |             for p in paramstring.split(';'):
151 |                 if p != '':
152 |                     yield int(p)
153 | 
154 |         return tuple(split(paramstring))
155 | 
156 |     def call_win32(self, command, params):
157 |         if params == []:
158 |             params = [0]
159 |         if command == 'm':
160 |             for param in params:
161 |                 if param in self.win32_calls:
162 |                     func_args = self.win32_calls[param]
163 |                     func = func_args[0]
164 |                     args = func_args[1:]
165 |                     kwargs = dict(on_stderr=self.on_stderr)
166 |                     func(*args, **kwargs)
167 | 


--------------------------------------------------------------------------------
/peepdf/colorama/initialise.py:
--------------------------------------------------------------------------------
 1 | import atexit
 2 | import sys
 3 | 
 4 | from .ansitowin32 import AnsiToWin32
 5 | 
# The streams as they were when this module was imported; init() wraps these
# (not whatever sys.stdout/sys.stderr currently point to), and reset_all()
# targets orig_stdout.
orig_stdout = sys.stdout
orig_stderr = sys.stderr

# Becomes True once init() has registered reset_all with atexit, so the hook
# is registered only once however many times init() is called.
atexit_done = False
10 | 
11 | 
def reset_all():
    """Wrap the original stdout in an AnsiToWin32 and call its reset_all()."""
    stdout_wrapper = AnsiToWin32(orig_stdout)
    stdout_wrapper.reset_all()
14 | 
15 | 
def init(autoreset=False, convert=None, strip=None, wrap=True):
    """Replace sys.stdout and sys.stderr with wrapped versions of the
    original streams and register reset_all to run at interpreter exit.

    Raises ValueError when wrap is False while autoreset, convert or strip
    is True, since those options only make sense on a wrapped stream.
    """
    global atexit_done

    if wrap == False:
        # Tuple membership compares with ==, matching ``arg == True``.
        if True in (autoreset, convert, strip):
            raise ValueError('wrap=False conflicts with any other arg=True')

    sys.stdout = wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
    sys.stderr = wrap_stream(orig_stderr, convert, strip, autoreset, wrap)

    if atexit_done:
        return
    atexit.register(reset_all)
    atexit_done = True
27 | 
28 | 
def wrap_stream(stream, convert, strip, autoreset, wrap):
    """Return the AnsiToWin32 proxy stream for ``stream`` when wrapping is
    requested and the wrapper reports it should wrap; otherwise return
    ``stream`` unchanged."""
    if not wrap:
        return stream
    candidate = AnsiToWin32(stream,
                            convert=convert, strip=strip, autoreset=autoreset)
    if candidate.should_wrap():
        return candidate.stream
    return stream
36 | 


--------------------------------------------------------------------------------
/peepdf/colorama/win32.py:
--------------------------------------------------------------------------------
 1 | # from winbase.h
 2 | STDOUT = -11
 3 | STDERR = -12
 4 | 
 5 | try:
 6 |     from ctypes import windll
 7 | except ImportError:
 8 |     windll = None
 9 |     SetConsoleTextAttribute = lambda *_: None
10 | else:
11 |     from ctypes import (
12 |         byref, Structure, c_char, c_short, c_uint32, c_ushort
13 |     )
14 | 
15 |     handles = {
16 |         STDOUT: windll.kernel32.GetStdHandle(STDOUT),
17 |         STDERR: windll.kernel32.GetStdHandle(STDERR),
18 |     }
19 | 
20 |     SHORT = c_short
21 |     WORD = c_ushort
22 |     DWORD = c_uint32
23 |     TCHAR = c_char
24 | 
    class COORD(Structure):
        """struct in wincon.h"""
        # Two SHORT members, X then Y; the order defines the C layout.
        _fields_ = [
            ('X', SHORT),
            ('Y', SHORT),
        ]
31 | 
    class SMALL_RECT(Structure):
        """struct in wincon.h."""
        # Four SHORT members in declaration order; the order defines the
        # C layout.
        _fields_ = [
            ("Left", SHORT),
            ("Top", SHORT),
            ("Right", SHORT),
            ("Bottom", SHORT),
        ]
40 | 
    class CONSOLE_SCREEN_BUFFER_INFO(Structure):
        """struct in wincon.h."""
        # Filled in by GetConsoleScreenBufferInfo below; wAttributes is the
        # member WinTerm reads to learn the console's default attributes.
        _fields_ = [
            ("dwSize", COORD),
            ("dwCursorPosition", COORD),
            ("wAttributes", WORD),
            ("srWindow", SMALL_RECT),
            ("dwMaximumWindowSize", COORD),
        ]
50 | 
51 |     def GetConsoleScreenBufferInfo(stream_id):
52 |         handle = handles[stream_id]
53 |         csbi = CONSOLE_SCREEN_BUFFER_INFO()
54 |         success = windll.kernel32.GetConsoleScreenBufferInfo(
55 |             handle, byref(csbi))
56 |         # This fails when imported via setup.py when installing using 'pip'
57 |         # presumably the fix is that running setup.py should not trigger all
58 |         # this activity.
59 |         # assert success
60 |         return csbi
61 | 
62 |     def SetConsoleTextAttribute(stream_id, attrs):
63 |         handle = handles[stream_id]
64 |         success = windll.kernel32.SetConsoleTextAttribute(handle, attrs)
65 |         assert success
66 | 
67 |     def SetConsoleCursorPosition(stream_id, position):
68 |         handle = handles[stream_id]
69 |         position = COORD(*position)
70 |         success = windll.kernel32.SetConsoleCursorPosition(handle, position)
71 |         assert success
72 | 
73 |     def FillConsoleOutputCharacter(stream_id, char, length, start):
74 |         handle = handles[stream_id]
75 |         char = TCHAR(char)
76 |         length = DWORD(length)
77 |         start = COORD(*start)
78 |         num_written = DWORD(0)
79 |         # AttributeError: function 'FillConsoleOutputCharacter' not found
80 |         # could it just be that my types are wrong?
81 |         success = windll.kernel32.FillConsoleOutputCharacter(
82 |             handle, char, length, start, byref(num_written))
83 |         assert success
84 |         return num_written.value
85 | 
if __name__ == '__main__':
    # Manual smoke test: dump each member of the stdout console's buffer info.
    x = GetConsoleScreenBufferInfo(STDOUT)
    for member in ('dwSize', 'dwCursorPosition', 'wAttributes',
                   'srWindow', 'dwMaximumWindowSize'):
        print(getattr(x, member))
93 | 


--------------------------------------------------------------------------------
/peepdf/colorama/winterm.py:
--------------------------------------------------------------------------------
 1 | from . import win32
 2 | 
 3 | 
 4 | # from wincon.h
 5 | class WinColor(object):
 6 |     BLACK = 0
 7 |     BLUE = 1
 8 |     GREEN = 2
 9 |     CYAN = 3
10 |     RED = 4
11 |     MAGENTA = 5
12 |     YELLOW = 6
13 |     GREY = 7
14 | 
15 | 
16 | # from wincon.h
class WinStyle(object):
    # Intensity bit values; WinTerm.set_attrs masks the attribute word with
    # BRIGHT to extract the current style.
    NORMAL = 0x00  # dim text, dim background
    BRIGHT = 0x08  # bright text, dim background
20 | 
21 | 
class WinTerm(object):
    """Tracks the console's foreground, background and style and pushes
    changes to the console through the win32 module.

    The attribute word is decomposed as: low three bits = foreground,
    bits 4-6 = background, WinStyle.BRIGHT bit = style.
    """

    def __init__(self):
        # Remember the attributes in effect at construction time so that
        # they can be restored later.
        self._default = \
            win32.GetConsoleScreenBufferInfo(win32.STDOUT).wAttributes
        self.set_attrs(self._default)
        self._default_fore = self._fore
        self._default_back = self._back
        self._default_style = self._style

    def get_attrs(self):
        """Recombine fore, back and style into a single attribute word."""
        return self._fore + self._back * 16 + self._style

    def set_attrs(self, value):
        """Split an attribute word into the tracked fore, back and style."""
        self._fore = value & 7
        self._back = (value >> 4) & 7
        self._style = value & WinStyle.BRIGHT

    def reset_all(self, on_stderr=None):
        """Restore the attributes captured at construction time.

        ``on_stderr`` selects the console handle, exactly as for fore(),
        back() and style(); a falsy value (the default) targets stdout.
        """
        self.set_attrs(self._default)
        # Forward on_stderr: previously it was accepted but dropped, so a
        # reset requested for stderr was always applied to stdout.
        self.set_console(attrs=self._default, on_stderr=on_stderr)

    def fore(self, fore=None, on_stderr=False):
        """Set the foreground colour; None restores the default."""
        if fore is None:
            fore = self._default_fore
        self._fore = fore
        self.set_console(on_stderr=on_stderr)

    def back(self, back=None, on_stderr=False):
        """Set the background colour; None restores the default."""
        if back is None:
            back = self._default_back
        self._back = back
        self.set_console(on_stderr=on_stderr)

    def style(self, style=None, on_stderr=False):
        """Set the style bits; None restores the default."""
        if style is None:
            style = self._default_style
        self._style = style
        self.set_console(on_stderr=on_stderr)

    def set_console(self, attrs=None, on_stderr=False):
        """Apply ``attrs`` (or the currently tracked attributes when None)
        to the stdout console handle, or to stderr when ``on_stderr`` is
        truthy."""
        if attrs is None:
            attrs = self.get_attrs()
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        win32.SetConsoleTextAttribute(handle, attrs)
68 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/README.specs.mkd:
--------------------------------------------------------------------------------
 1 | # UNPACKERS SPECIFICATIONS
 2 | 
 3 | Nothing very difficult: an unpacker is a submodule placed in the directory
 4 | where this file was found. Each unpacker must define three symbols:
 5 | 
 6 |  * `PRIORITY`       : integer number expressing the priority in applying this
 7 |                       unpacker. Lower number means higher priority.
 8 |                       Makes sense only if a source file has been packed with
 9 |                       more than one packer.
10 |  * `detect(source)` : returns `True` if source is packed, otherwise, `False`.
11 |  * `unpack(source)` : takes a `source` string and unpacks it. Must always return
12 |                       valid JavaScript. That is to say, your code should look
13 |                       like:
14 | 
15 | ```
16 | if detect(source):
17 |     return do_your_fancy_things_with(source)
18 | else:
19 |     return source
20 | ```
21 | 
22 | *You can safely define any other symbol in your module, as it will be ignored.*
23 | 
24 | `__init__` code will automatically load new unpackers, without any further step
25 | to be accomplished. Simply drop it in this directory.
26 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # General code for JSBeautifier unpackers infrastructure. See README.specs
 3 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 4 | #
 5 | 
 6 | """General code for JSBeautifier unpackers infrastructure."""
 7 | 
 8 | import pkgutil
 9 | import re
10 | # from jsbeautifier.unpackers import evalbased
11 | import evalbased
12 | 
13 | # NOTE: AT THE MOMENT, IT IS DEACTIVATED FOR YOUR SECURITY: it runs js!
14 | BLACKLIST = ['jsbeautifier.unpackers.evalbased']
15 | 
16 | 
class UnpackingError(Exception):
    """Raised for badly packed source or any other unpacking failure.

    The single argument is a human-readable description of the problem.
    """
21 | 
22 | 
def getunpackers():
    """Import every unpacker module found in this package's directory and
    return them ordered by ascending PRIORITY.

    Modules whose dotted name contains 'tests' or appears in BLACKLIST are
    skipped. Raises UnpackingError when a candidate cannot be imported.
    """
    required = ['unpack', 'detect', 'PRIORITY']
    loaded = []
    for _finder, dotted, _is_pkg in pkgutil.iter_modules(__path__,
                                                         __name__ + '.'):
        if 'tests' in dotted or dotted in BLACKLIST:
            continue
        try:
            module = __import__(dotted, fromlist=required)
        except ImportError:
            raise UnpackingError('Bad unpacker: %s' % dotted)
        loaded.append(module)

    loaded.sort(key=lambda mod: mod.PRIORITY)
    return loaded
42 | 
43 | # UNPACKERS = getunpackers()
44 | UNPACKERS = []
45 | 
46 | 
def run(source, evalcode=False):
    """Apply every unpacker in UNPACKERS that detects ``source`` and return
    the resulting string.

    Detection is evaluated for all unpackers against the incoming source
    before any of them is applied. When ``evalcode`` is true the eval-based
    unpacker is tried last, on the already-unpacked text.
    """
    applicable = []
    for candidate in UNPACKERS:
        if candidate.detect(source):
            applicable.append(candidate)

    for unpacker in applicable:
        source = unpacker.unpack(source)

    if evalcode:
        if evalbased.detect(source):
            source = evalbased.unpack(source)
    return source
54 | 
55 | 
def filtercomments(source):
    """NOT USED: collect the comments that open ``source`` and return them,
    one per line, followed by the remaining code.

    Both ``/* ... */`` and ``// ...`` comments are recognised. Source that
    does not start with a comment is returned unchanged. An unterminated
    ``/*`` stops the scan and is left in place.
    """
    comments = []

    while True:
        block_start = re.match(r'\s*/\*', source)
        if block_start:
            try:
                # Look for the terminator after the opening "/*".
                end = source.index('*/', block_start.end()) + 2
            except ValueError:
                break  # unterminated block comment: leave it alone
        else:
            line_comment = re.match(r'\s*//.*', source)
            if not line_comment:
                break
            end = line_comment.end()  # '.' stops at the end of the line

        comments.append(source[:end].lstrip())
        source = source[end:].lstrip()

    # A newline after each comment keeps a "//" comment from swallowing the
    # code that follows it.
    return ''.join(comment + '\n' for comment in comments) + source
74 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/evalbased.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Unpacker for eval() based packers, a part of javascript beautifier
 3 | # by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #
 5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 6 | #
 7 | # usage:
 8 | #
 9 | # if detect(some_string):
10 | #     unpacked = unpack(some_string)
11 | #
12 | 
13 | """Unpacker for eval() based packers: runs JS code and returns result.
14 | Works only if a JS interpreter (e.g. Mozilla's Rhino) is installed and
15 | properly set up on host."""
16 | 
17 | from subprocess import PIPE, Popen
18 | 
19 | PRIORITY = 3
20 | 
21 | 
def detect(source):
    """Tell whether source looks eval() packed: ignoring surrounding
    whitespace and letter case, it begins with 'eval(function('."""
    normalised = source.strip().lower()
    return normalised.startswith('eval(function(')
25 | 
26 | 
def unpack(source):
    """Runs source and return resulting code.

    Source that detect() does not recognise is returned unchanged.
    Otherwise the leading ``eval`` is replaced by ``print`` and the script
    is handed to jseval().
    """
    if not detect(source):
        return source
    # detect() ignores surrounding whitespace, so strip before removing the
    # four characters of "eval"; slicing the raw text would cut in the
    # wrong place when the source is indented.
    return jseval('print %s;' % source.strip()[4:])
30 | 
31 | 
32 | # In case of failure, we'll just return the original, without crashing on user.
def jseval(script):
    """Run code in the JS interpreter and return output.

    Spawns the ``js`` executable and feeds ``script`` on its stdin. Returns
    the interpreter's stdout on success. Returns ``script`` itself when the
    interpreter cannot be started, exits with a non-zero status, or writes
    anything to stderr.
    """
    try:
        # stderr must be piped too; otherwise communicate() always reports
        # None for it and the error check below can never fire.
        interpreter = Popen(['js'], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    except OSError:
        return script
    result, errors = interpreter.communicate(script)
    if interpreter.poll() or errors:
        return script
    return result
43 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/javascriptobfuscator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # simple unpacker/deobfuscator for scripts messed up with
 3 | # javascriptobfuscator.com
 4 | #
 5 | #     written by Einar Lielmanis <einar@jsbeautifier.org>
 6 | #     rewritten in Python by Stefano Sanfilippo <a.little.coder@gmail.com>
 7 | #
 8 | # Will always return valid javascript: if `detect()` is false, `code` is
 9 | # returned, unmodified.
10 | #
11 | # usage:
12 | #
13 | # if javascriptobfuscator.detect(some_string):
14 | #     some_string = javascriptobfuscator.unpack(some_string)
15 | #
16 | 
17 | """deobfuscator for scripts messed up with JavascriptObfuscator.com"""
18 | 
19 | import re
20 | 
21 | PRIORITY = 1
22 | 
23 | 
def smartsplit(code):
    """Split `code` at " symbol, only if it is not escaped.

    Returns the list of double-quoted string literals found in ``code``,
    each re-wrapped in double quotes, with backslash escapes kept verbatim.
    A literal left unterminated at the end of the input is returned with
    whatever characters were collected.
    """
    strings = []
    pos = 0
    length = len(code)
    while pos < length:
        if code[pos] == '"':
            word = ''  # new word
            pos += 1
            while pos < length and code[pos] != '"':
                if code[pos] == '\\':
                    word += '\\'
                    pos += 1
                    if pos >= length:
                        # Input ends right after the backslash: there is no
                        # escaped character to copy.
                        break
                word += code[pos]
                pos += 1
            strings.append('"%s"' % word)
        pos += 1
    return strings
43 | 
44 | 
def detect(code):
    """Tell whether `code` starts with the array declaration emitted by
    JavascriptObfuscator.com (``var _0x<hex> = [``)."""
    found = re.search(r'^var _0x[a-f0-9]+ ?\= ?\[', code)
    # Compare against None so callers get a real boolean.
    return found is not None
49 | 
50 | 
def unpack(code):
    """Inline the string table of JavascriptObfuscator.com packed code.

    Every ``<var>[<index>]`` reference is replaced by the corresponding
    quoted string, and the leading characters covered by the table
    declaration are dropped. Code that is not detected as packed, or whose
    table cannot be located, is returned unchanged.
    """
    if not detect(code):
        return code

    found = re.search(r'var (_0x[a-f\d]+) ?\= ?\[(.*?)\];', code)
    if not found:
        return code

    variable = found.group(1)
    table = smartsplit(found.group(2))
    code = code[len(found.group(0)):]
    for key, value in enumerate(table):
        reference = r'%s[%s]' % (variable, key)
        code = code.replace(reference, value)
    return code
62 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/myobfuscate.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # deobfuscator for scripts messed up with myobfuscate.com
 3 | # by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #
 5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 6 | #
 7 | # usage:
 8 | #
 9 | # if detect(some_string):
10 | #     unpacked = unpack(some_string)
11 | #
12 | 
13 | # CAVEAT by Einar Lielmanis
14 | 
15 | #
16 | # You really don't want to obfuscate your scripts there: they're tracking
17 | # your unpackings, your script gets turned into something like this,
18 | # as of 2011-08-26:
19 | #
20 | #   var _escape = 'your_script_escaped';
21 | #   var _111 = document.createElement('script');
22 | #   _111.src = 'http://api.www.myobfuscate.com/?getsrc=ok' +
23 | #              '&ref=' + encodeURIComponent(document.referrer) +
24 | #              '&url=' + encodeURIComponent(document.URL);
25 | #   var 000 = document.getElementsByTagName('head')[0];
26 | #   000.appendChild(_111);
27 | #   document.write(unescape(_escape));
28 | #
29 | 
30 | """Deobfuscator for scripts messed up with MyObfuscate.com"""
31 | 
32 | import re
33 | import base64
34 | 
35 | # Python 2 retrocompatibility
36 | # pylint: disable=F0401
37 | # pylint: disable=E0611
38 | try:
39 |     from urllib import unquote
40 | except ImportError:
41 |     from urllib.parse import unquote
42 | 
43 | from jsbeautifier.unpackers import UnpackingError
44 | 
45 | PRIORITY = 1
46 | 
47 | CAVEAT = """//
48 | // Unpacker warning: be careful when using myobfuscate.com for your projects:
49 | // scripts obfuscated by the free online version call back home.
50 | //
51 | 
52 | """
53 | 
54 | SIGNATURE = (r'["\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F'
55 |              r'\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65'
56 |              r'\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75'
57 |              r'\x76\x77\x78\x79\x7A\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x2B'
58 |              r'\x2F\x3D","","\x63\x68\x61\x72\x41\x74","\x69\x6E\x64\x65\x78'
59 |              r'\x4F\x66","\x66\x72\x6F\x6D\x43\x68\x61\x72\x43\x6F\x64\x65","'
60 |              r'\x6C\x65\x6E\x67\x74\x68"]')
61 | 
62 | 
def detect(source):
    """Report whether the MyObfuscate.com signature occurs in source."""
    has_signature = SIGNATURE in source
    return has_signature
66 | 
67 | 
def unpack(source):
    """Unpack js code packed with MyObfuscate.com.

    Undetected source is returned unchanged. Otherwise the decoded payload
    is URL-unquoted and the script body is extracted from it; the result is
    prefixed with CAVEAT. When the body cannot be located, the original
    source is returned after the CAVEAT instead.
    """
    if not detect(source):
        return source

    payload = unquote(_filter(source))
    found = re.search(r"^var _escape\='<script>(.*)<\/script>'",
                      payload, re.DOTALL)
    if found:
        polished = found.group(1)
    else:
        polished = source
    return CAVEAT + polished
77 | 
78 | 
79 | def _filter(source):
80 |     """Extracts and decode payload (original file) from `source`"""
81 |     try:
82 |         varname = re.search(r'eval\(\w+\(\w+\((\w+)\)\)\);', source).group(1)
83 |         reverse = re.search(r"var +%s *\= *'(.*)';" % varname, source).group(1)
84 |     except AttributeError:
85 |         raise UnpackingError('Malformed MyObfuscate data.')
86 |     try:
87 |         return base64.b64decode(reverse[::-1].encode('utf8')).decode('utf8')
88 |     except TypeError:
89 |         raise UnpackingError('MyObfuscate payload is not base64-encoded.')
90 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/packer.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Unpacker for Dean Edward's p.a.c.k.e.r, a part of javascript beautifier
  3 | # by Einar Lielmanis <einar@jsbeautifier.org>
  4 | #
  5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
  6 | #
  7 | # usage:
  8 | #
  9 | # if detect(some_string):
 10 | #     unpacked = unpack(some_string)
 11 | #
 12 | 
 13 | """Unpacker for Dean Edward's p.a.c.k.e.r"""
 14 | 
 15 | import re
 16 | 
 17 | from jsbeautifier.unpackers import UnpackingError
 18 | 
 19 | PRIORITY = 1
 20 | 
 21 | 
 22 | def detect(source):
 23 |     """Detects whether `source` is P.A.C.K.E.R. coded."""
 24 |     return source.replace(' ', '').startswith('eval(function(p,a,c,k,e,r')
 25 | 
 26 | 
 27 | def unpack(source):
 28 |     """Unpacks P.A.C.K.E.R. packed js code."""
 29 |     payload, symtab, radix, count = _filterargs(source)
 30 | 
 31 |     if count != len(symtab):
 32 |         raise UnpackingError('Malformed p.a.c.k.e.r. symtab.')
 33 | 
 34 |     try:
 35 |         unbase = Unbaser(radix)
 36 |     except TypeError:
 37 |         raise UnpackingError('Unknown p.a.c.k.e.r. encoding.')
 38 | 
 39 |     def lookup(match):
 40 |         """Look up symbols in the synthetic symtab."""
 41 |         word = match.group(0)
 42 |         return symtab[unbase(word)] or word
 43 | 
 44 |     source = re.sub(r'\b\w+\b', lookup, payload)
 45 |     return _replacestrings(source)
 46 | 
 47 | 
 48 | def _filterargs(source):
 49 |     """Juice from a source file the four args needed by decoder."""
 50 |     argsregex = (r"}\('(.*)', *(\d+), *(\d+), *'(.*)'\."
 51 |                  r"split\('\|'\), *(\d+), *(.*)\)\)")
 52 |     args = re.search(argsregex, source, re.DOTALL).groups()
 53 | 
 54 |     try:
 55 |         return args[0], args[3].split('|'), int(args[1]), int(args[2])
 56 |     except ValueError:
 57 |         raise UnpackingError('Corrupted p.a.c.k.e.r. data.')
 58 | 
 59 | 
 60 | def _replacestrings(source):
 61 |     """Strip string lookup table (list) and replace values in source."""
 62 |     match = re.search(r'var *(_\w+)\=\["(.*?)"\];', source, re.DOTALL)
 63 | 
 64 |     if match:
 65 |         varname, strings = match.groups()
 66 |         startpoint = len(match.group(0))
 67 |         lookup = strings.split('","')
 68 |         variable = '%s[%%d]' % varname
 69 |         for index, value in enumerate(lookup):
 70 |             source = source.replace(variable % index, '"%s"' % value)
 71 |         return source[startpoint:]
 72 |     return source
 73 | 
 74 | 
 75 | class Unbaser(object):
 76 |     """Functor for a given base. Will efficiently convert
 77 |     strings to natural numbers."""
 78 |     ALPHABET = {
 79 |         62: '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
 80 |         95: (' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 81 |              '[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
 82 |     }
 83 | 
 84 |     def __init__(self, base):
 85 |         self.base = base
 86 | 
 87 |         # If base can be handled by int() builtin, let it do it for us
 88 |         if 2 <= base <= 36:
 89 |             self.unbase = lambda string: int(string, base)
 90 |         else:
 91 |             # Build conversion dictionary cache
 92 |             try:
 93 |                 self.dictionary = dict((cipher, index) for
 94 |                                        index, cipher in enumerate(self.ALPHABET[base]))
 95 |             except KeyError:
 96 |                 raise TypeError('Unsupported base encoding.')
 97 | 
 98 |             self.unbase = self._dictunbaser
 99 | 
100 |     def __call__(self, string):
101 |         return self.unbase(string)
102 | 
103 |     def _dictunbaser(self, string):
104 |         """Decodes a  value to an integer."""
105 |         ret = 0
106 |         for index, cipher in enumerate(string[::-1]):
107 |             ret += (self.base ** index) * self.dictionary[cipher]
108 |         return ret
109 | 


--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/urlencode.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Trivial bookmarklet/escaped script detector for the javascript beautifier
 3 | #     written by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #     rewritten in Python by Stefano Sanfilippo <a.little.coder@gmail.com>
 5 | #
 6 | # Will always return valid javascript: if `detect()` is false, `code` is
 7 | # returned, unmodified.
 8 | #
 9 | # usage:
10 | #
11 | # some_string = urlencode.unpack(some_string)
12 | #
13 | 
14 | """Bookmarklet/escaped script unpacker."""
15 | 
16 | # Python 2 retrocompatibility
17 | # pylint: disable=F0401
18 | # pylint: disable=E0611
19 | try:
20 |     from urllib import unquote_plus
21 | except ImportError:
22 |     from urllib.parse import unquote_plus
23 | 
24 | PRIORITY = 0
25 | 
26 | 
def detect(code):
    """Guess whether a scriptlet is urlencoded: it contains no literal
    space and has either a '%20' or more than three '%' characters."""
    if ' ' in code:
        return False
    return '%20' in code or code.count('%') > 3
32 | 
33 | 
def unpack(code):
    """Return `code` URL-decoded (with '+' treated as space) when detect()
    reports it as urlencoded; otherwise return it unchanged."""
    if detect(code):
        return unquote_plus(code)
    return code
37 | 


--------------------------------------------------------------------------------
/peepdf/peepdf.dtd:
--------------------------------------------------------------------------------
  1 | <!ELEMENT peepdf_analysis ( date, basic, advanced ) >
  2 |         <!ATTLIST peepdf_analysis author CDATA #REQUIRED >
  3 |         <!ATTLIST peepdf_analysis url CDATA #REQUIRED >
  4 |         <!ATTLIST peepdf_analysis version CDATA #REQUIRED >
  5 | 
  6 | 
  7 |         <!ELEMENT date ( #PCDATA ) >
  8 | 
  9 | 
 10 |         <!ELEMENT basic ( filename, md5, sha1, sha256, size, detection, pdf_version, binary, linearized, encrypted, updates, num_objects, num_streams, comments, errors ) >
 11 | 
 12 |         <!ELEMENT filename ( #PCDATA ) >
 13 | 
 14 |         <!ELEMENT md5 ( #PCDATA ) >
 15 | 
 16 |         <!ELEMENT sha1 ( #PCDATA ) >
 17 | 
 18 |         <!ELEMENT sha256 ( #PCDATA ) >
 19 | 
 20 |         <!ELEMENT size ( #PCDATA ) >
 21 | 
 22 |         <!ELEMENT detection ( rate?, report_link? ) >
 23 | 
 24 |         <!ELEMENT rate ( #PCDATA ) >
 25 | 
 26 |         <!ELEMENT report_link ( #PCDATA ) >
 27 | 
 28 |         <!ELEMENT pdf_version ( #PCDATA ) >
 29 | 
 30 |         <!ELEMENT binary EMPTY >
 31 |         <!ATTLIST binary status ( false | true ) #REQUIRED >
 32 | 
 33 |         <!ELEMENT linearized EMPTY >
 34 |         <!ATTLIST linearized status ( false | true ) #REQUIRED >
 35 | 
 36 |         <!ELEMENT encrypted ( algorithms? ) >
 37 |         <!ATTLIST encrypted status ( false | true ) #REQUIRED >
 38 | 
 39 |         <!ELEMENT algorithms ( algorithm+ ) >
 40 | 
 41 |         <!ELEMENT algorithm ( #PCDATA ) >
 42 |         <!ATTLIST algorithm bits NMTOKEN #REQUIRED >
 43 | 
 44 |         <!ELEMENT updates ( #PCDATA ) >
 45 | 
 46 |         <!ELEMENT num_objects ( #PCDATA ) >
 47 | 
 48 |         <!ELEMENT num_streams ( #PCDATA ) >
 49 | 
 50 |         <!ELEMENT comments ( #PCDATA ) >
 51 | 
 52 |         <!ELEMENT errors ( error_message* ) >
 53 |         <!ATTLIST errors num NMTOKEN #REQUIRED >
 54 | 
 55 |         <!ELEMENT error_message ( #PCDATA ) >
 56 | 
 57 | 
 58 |         <!ELEMENT advanced ( version* ) >
 59 | 
 60 |         <!ELEMENT version ( catalog, info, objects, streams ,js_objects, suspicious_elements, suspicious_urls ) >
 61 |         <!ATTLIST version num NMTOKEN #REQUIRED >
 62 |         <!ATTLIST version type ( original | update ) #REQUIRED >
 63 | 
 64 |         <!ELEMENT catalog EMPTY >
 65 |         <!ATTLIST catalog object_id NMTOKEN #IMPLIED >
 66 | 
 67 |         <!ELEMENT info EMPTY >
 68 |         <!ATTLIST info object_id NMTOKEN #IMPLIED >
 69 | 
 70 |         <!ELEMENT objects ( object* ) >
 71 |         <!ATTLIST objects num NMTOKEN #REQUIRED >
 72 | 
 73 |         <!ELEMENT object EMPTY >
 74 |         <!ATTLIST object errors ( false | true ) #IMPLIED >
 75 |         <!ATTLIST object compressed ( false | true ) #IMPLIED >
 76 |         <!ATTLIST object id NMTOKEN #REQUIRED >
 77 | 
 78 |         <!ELEMENT streams ( stream* ) >
 79 |         <!ATTLIST streams num NMTOKEN #REQUIRED >
 80 | 
 81 |         <!ELEMENT stream EMPTY >
 82 |         <!ATTLIST stream encoded ( false | true ) #IMPLIED >
 83 |         <!ATTLIST stream id NMTOKEN #REQUIRED >
 84 |         <!ATTLIST stream object_stream ( false | true ) #IMPLIED >
 85 |         <!ATTLIST stream xref_stream ( false | true ) #IMPLIED >
 86 |         <!ATTLIST stream decoding_errors ( false | true ) #IMPLIED >
 87 | 
 88 |         <!ELEMENT js_objects ( container_object* ) >
 89 | 
 90 |         <!ELEMENT container_object EMPTY >
 91 |         <!ATTLIST container_object id NMTOKEN #REQUIRED >
 92 | 
 93 |         <!ELEMENT suspicious_elements ( triggers?, actions?, elements?, js_vulns? ) >
 94 | 
 95 |         <!ELEMENT triggers ( trigger* ) >
 96 | 
 97 |         <!ELEMENT trigger ( container_object+ ) >
 98 |         <!ATTLIST trigger name CDATA #REQUIRED >
 99 | 
100 |         <!ELEMENT actions ( action* ) >
101 | 
102 |         <!ELEMENT action ( container_object+ ) >
103 |         <!ATTLIST action name CDATA #REQUIRED >
104 | 
105 |         <!ELEMENT elements ( element* ) >
106 | 
107 |         <!ELEMENT element ( cve*, container_object+ ) >
108 |         <!ATTLIST element name CDATA #REQUIRED >
109 | 
110 |         <!ELEMENT cve ( #PCDATA ) >
111 | 
112 |         <!ELEMENT js_vulns ( vulnerable_function* ) >
113 | 
114 |         <!ELEMENT vulnerable_function ( cve*, container_object+ ) >
115 |         <!ATTLIST vulnerable_function name CDATA #REQUIRED >
116 | 
117 |         <!ELEMENT suspicious_urls ( url* ) >
118 | 
119 |         <!ELEMENT url ( #PCDATA ) >
120 | 


--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | # 
 3 | #  NO WARRANTY
 4 | # 
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
__author__ = "sei-mappel"
# Stored as a string: written unquoted, 6 / 17 / 15 is evaluated as a
# division expression and the date is lost.
__date__ = "6/17/15"


--------------------------------------------------------------------------------
/scripts/clarify.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import os
 15 | import sys
 16 | import time
 17 | import Queue
 18 | import multiprocessing
 19 | from subprocess import Popen, PIPE
 20 | 
 21 | import lxml.etree as ET
 22 | from jsbeautifier import beautify, default_options
 23 | from sdhasher import make_sdhash
 24 | from db_mgmt import DBGateway
 25 | from JSAnalysis import analyse
 26 | 
# Process id of this run; it is embedded in the log file name and in the
# output database name built below.
PID = os.getpid()

# Required positional arguments: mode, input database name, and the lower
# and upper row-id bounds. The script exits with status 1, without any
# message, when one of them is missing.
try:
    JSFLASH = sys.argv[1].lower()
    DBin_NAME = sys.argv[2]
    MIN = sys.argv[3]
    MAX = sys.argv[4]
except IndexError:
    sys.exit(1)

# Row id of the last row handed out by get_row(); starts at the lower bound.
# NOTE(review): MIN and MAX are the raw argv strings and are interpolated
# into the SQL text as-is.
LASTROW = MIN

# Optional fifth argument: any non-empty value makes logmsg() echo to stdout.
try:
    verbose = sys.argv[5].lower()
    print 'Spamming terminal'
except IndexError:
    verbose = ''

# Logging is best effort: when the log file cannot be opened, log is None
# and logmsg() skips the file.
try:
    log = open('/media/sf_voodo_db/%s-%s.log' % (JSFLASH, PID), 'a')
except IOError:
    print 'Error opening log file. No logging'
    log = None
 50 | 
 51 | 
 52 | def logmsg(log, msg):
 53 |     if log:
 54 |         log.write(msg)
 55 |         log.flush()
 56 |     if verbose:
 57 |         sys.stdout.write(msg)
 58 |         sys.stdout.flush()
 59 | 
 60 | 
 61 | db_name = 'clarified-%s-%d.sqlite' % (JSFLASH, PID)
 62 | logmsg(log, 'Creating: %s\n\n' % db_name)
 63 | 
 64 | if JSFLASH == 'js':
 65 |     cmd = "select rowid, pdf_md5, tree, obf_js from parsed_pdfs where obf_js is not '' and de_js is ''  and (rowid > %s and rowid <= %s) order by rowid limit 1" % (
 66 |     '%s', MAX)
 67 |     update = "update parsed_pdfs set de_js='%s' where rowid is %s" % (db_name, '%s')
 68 | elif JSFLASH == 'flash':
 69 |     cmd = "select rowid, pdf_md5, tree, swf from parsed_pdfs where swf is not '' and (rowid > %s and rowid <= %s) order by rowid limit 1" % (
 70 |     '%s', MAX)
 71 |     update = "update parsed_pdfs set actionscript='%s' where rowid is %s" % (db_name, '%s')
 72 | logmsg(log, "%s\n%s\n" % (cmd, update))
 73 | 
 74 | jsopts = default_options()
 75 | jsopts.preserve_new_lines = False
 76 | jsopts.break_chained_methods = True
 77 | 
 78 | DB = DBGateway(DBin_NAME, '/media/sf_voodo_db/')
 79 | DBout = DBGateway(db_name, '/media/sf_voodo_db/')
 80 | if not DBout.query(
 81 |         'create table if not exists clarified (pdf_md5 TEXT, js TEXT, de_js TEXT, de_js_sdhash TEXT, swf TEXT, abc TEXT, actionscript TEXT, actionscript_sdhash TEXT, primary key(pdf_md5))'):
 82 |     err = DBout.get_error()
 83 |     logmsg(log, "%s\n" % err)
 84 |     sys.exit(1)
 85 | 
 86 | '''
 87 | Create an lxml tree from the xml string
 88 | '''
 89 | 
 90 | 
 91 | def tree_from_xml(xml):
 92 |     try:
 93 |         return ET.fromstring(xml)
 94 |     except Exception:
 95 |         return None
 96 | 
 97 | 
 98 | '''
 99 | Get JS/SWF from DB
100 | '''
101 | 
102 | 
def get_row(log):
    """Fetch the next unprocessed row from the input database.

    Runs the module-level `cmd` query with the current LASTROW as the lower
    rowid bound and advances LASTROW to the rowid that was returned.

    log -- log handle passed through to logmsg().

    Returns a tuple (rowid, pdf_md5, tree, code), where `code` is the
    'obf_js' column when the row has one and the 'swf' column otherwise.
    Returns ('', '', '', '') when the query fails or no row is left.
    """
    global LASTROW
    nothing = ('', '', '', '')

    if not DB.query(cmd % LASTROW):
        logmsg(log, 'Get error: %s\n' % DB.get_error())
        return nothing

    result = DB.db_curr.fetchone()
    if result is None:
        # No more rows. The previous code re-read result['rowid'] inside the
        # TypeError handler, which raised again and crashed the script.
        return nothing

    try:
        code = result['obf_js']
    except (IndexError, KeyError):
        # The flash query selects 'swf' instead of 'obf_js'.
        code = result['swf']

    LASTROW = result['rowid']
    return (result['rowid'], result['pdf_md5'], result['tree'], code)
131 | 
132 | 
133 | '''
134 | Store clarified js
135 | '''
136 | 
137 | 
def store(log, table, columns, values):
    """Insert one row into the output database, logging any failure.

    log     -- log handle passed through to logmsg().
    table   -- name of the destination table.
    columns -- sequence of column names.
    values  -- sequence of values, aligned with *columns*.
    """
    if not DBout.insert(table, cols=columns, vals=values):
        # The insert ran on DBout, so the error has to be read from DBout;
        # asking the input gateway (DB) reported an unrelated or empty error.
        err = DBout.get_error()
        logmsg(log, 'Store error: %s\n' % err)
142 | 
143 | 
144 | '''
145 | Store a flash string in a file, call the tool on the file, and then read
146 | from the stdout what was extracted. Store each.
147 | '''
148 | 
149 | 
def decompile_flash(swf):
    """Decompile a SWF blob to ActionScript with ffdec.jar.

    The blob is written to /tmp/tmp.swf, the decompiler is run on it, and
    every script the tool reports as exported is read back from
    /tmp/actionscript and concatenated.

    swf -- raw SWF content.

    Returns the concatenated source (each script preceded by a newline),
    the string 'None' when *swf* is empty, or 'Error: ...' when the
    temporary file cannot be created.
    """
    javacmd = ['java', '-jar', 'ffdec.jar', '-export', 'script', '/tmp/actionscript', '/tmp/tmp.swf']
    if not swf:
        return 'None'
    try:
        fout = open('/tmp/tmp.swf', 'wb')
    except IOError as e:
        if verbose:
            print('Error writing %s' % str(e))
        return 'Error: %s' % str(e)

    # Close the temp file even when the write itself fails.
    try:
        fout.write(swf)
    finally:
        fout.close()

    proc = Popen(javacmd, stdout=PIPE, stderr=PIPE)
    # communicate() already waits for the child to exit.
    out, _ = proc.communicate()

    extracted = []
    for line in out.split('\n'):
        if not line.startswith('Exported'):
            continue
        fields = line.split(' ')
        if len(fields) < 4:
            # Unexpected "Exported" line; skip it rather than raise IndexError.
            continue
        # The fourth field is the dotted script name; map it to a relative path.
        extracted.append(fields[3].rstrip(',').replace('.', '/'))

    script = ''
    for ext in extracted:
        try:
            fin = open('/tmp/actionscript/%s.as' % ext, 'r')
        except IOError as e:
            logmsg(log, 'Error reading: %s\n' % str(e))
        else:
            try:
                script = '\n'.join([script, fin.read()])
            finally:
                fin.close()
    return script
181 | 
182 | 
# Shared queue through which the analysis child process (run_analysis) hands
# its result back to clarify_js in the parent.
qu = multiprocessing.Queue()
184 | 
185 | 
def run_analysis(code, etree, results):
    """Child-process entry point: analyse *code* and publish the outcome.

    code    -- the script text to analyse.
    etree   -- parsed document tree handed to analyse().
    results -- queue that receives the value returned by analyse(); it is
               closed once the value has been put.
    """
    results.put(analyse(code, etree))
    results.close()
190 | 
191 | 
def clarify_js(code, etree):
    """Run the JS analysis in a child process with a 10 second budget.

    code  -- obfuscated JavaScript text.
    etree -- parsed document tree handed to the analysis.

    Returns whatever the child put on the shared queue. When the child is
    still running after 10 seconds it is terminated and the result is either
    the (large) response it was in the middle of delivering or the string
    'Timeout'. Returns 'None' when no result is available at all.
    """
    global qu
    rv = ''
    proc = multiprocessing.Process(target=run_analysis, args=(code, etree, qu))
    try:
        proc.start()
        proc.join(10)
        if proc.is_alive():
            logmsg(log, 'Timeout...')
            if not qu.empty():
                # The child is blocked feeding a large result through the
                # pipe; draining it needs a *blocking* get. The previous
                # get(False, 30) ignored its timeout entirely.
                logmsg(log, 'getting large response...')
                rv = qu.get(True, 30)
            else:
                logmsg(log, 'inserting dummy response...')
                qu.put('Timeout')
            attempts = 0
            while proc.is_alive() and attempts < 10000:
                proc.terminate()
                time.sleep(.1)
                attempts += 1
            logmsg(log, 'Killed\n')
    except Exception as e:
        logmsg(log, str(e))
        qu.put(str(e))

    # Not in a `finally: return` any more, so KeyboardInterrupt and friends
    # are no longer swallowed.
    if not rv:
        logmsg(log, 'getting response...')
        try:
            # The child has exited by now, so anything it produced is already
            # in the pipe. A short blocking wait only covers the queue's
            # feeder-thread latency (e.g. for the dummy put above).
            rv = qu.get(True, 1)
        except Queue.Empty:
            rv = 'None'
            logmsg(log, rv)
        logmsg(log, '\n')
    return rv
226 | 
227 | 
228 | '''
229 | While DB has rows with JS and clarified JS is not empty string
230 | '''
# Number of rows processed so far (used only for log output).
cnt = 0
logmsg(log, 'Begin loop\n')
while True:

    rid, pdf, xml, code = get_row(log)
    if not pdf:
        # get_row() returns empty fields on query failure or when done.
        logmsg(log, '%s: No pdf returned\n' % JSFLASH)
        break
    if not code:
        logmsg(log, '%s: No code returned %s\n' % (JSFLASH, pdf))
        continue
    cnt += 1

    if JSFLASH == 'js':
        msg = 'JS CNT: %6d\tRID: %6d\tFile: %s\n' % (cnt, rid, pdf)
        logmsg(log, msg)

        # Reset per row: a failure must not store the previous row's hash.
        de_js = ''
        de_js_sdhash = ''
        try:
            etree = tree_from_xml(xml)
            de_js = clarify_js(code, etree)
            de_js = beautify(de_js, jsopts)
            de_js_sdhash = make_sdhash(de_js, log)
        except Exception as e:
            de_js = 'error: %s' % e
            logmsg(log, 'Clarification error [%s]: %s\n' % (pdf, str(e)))

        col = ('pdf_md5', 'js', 'de_js', 'de_js_sdhash')
        val = (pdf, code, de_js, de_js_sdhash)
        store(log, 'clarified', col, val)

    elif JSFLASH == 'flash':
        msg = 'FL CNT: %6d\tRID: %6d\tFile: %s\n' % (cnt, rid, pdf)
        logmsg(log, msg)

        # Reset per row for the same reason as in the JS branch.
        ascript = ''
        as_sdhash = ''
        try:
            ascript = decompile_flash(code)
            as_sdhash = make_sdhash(ascript, log)
        except Exception as e:
            ascript = 'error: %s' % e
            logmsg(log, 'Decompilation error [%s]: %s\n' % (pdf, str(e)))

        col = ('pdf_md5', 'swf', 'actionscript', 'actionscript_sdhash')
        val = (pdf, code, ascript, as_sdhash)
        store(log, 'clarified', col, val)

# `log` is None when the log file could not be opened.
if log:
    log.close()

# Finish: flush and release both database connections.
DB.commit()
DBout.commit()
DB.disconnect()
DBout.disconnect()
294 | 


--------------------------------------------------------------------------------
/scripts/ffdec.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/scripts/ffdec.jar


--------------------------------------------------------------------------------
/scripts/mapper.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import os
 15 | import sys
 16 | 
 17 | from db_mgmt import DBGateway
 18 | from sdhasher import make_sdhash
 19 | 
 20 | 
 21 | class JSHasher(object):
 22 |     name = 'JSHasher'
 23 |     # query = "select rowid, js, de_js, swf, actionscript from clarified where (rowid > %s and rowid <= %s) limit 1;"
 24 |     # update = "update clarified set js_sdhash=?, de_js_sdhash=?, swf_sdhash=?, actionscript_sdhash=? where rowid is ?"
 25 |     query = "select rowid, js, swf from clarified where (rowid > %s and rowid <= %s) limit 1;"
 26 |     update = "update clarified set js_sdhash=?, swf_sdhash=? where rowid is ?"
 27 |     rowid = -1
 28 |     sdhash = ''
 29 |     setup = ('alter table clarified add column js_sdhash text;', 'alter table clarified add column swf_sdhash text;')
 30 |     proceed = True
 31 |     subs = None
 32 | 
 33 |     def __init__(self, MIN=-1, MAX=1000000):
 34 |         self.lastrow = MIN
 35 |         self.MAX = MAX
 36 | 
 37 |     def run(self, row):
 38 |         if not row:
 39 |             print 'Complete'
 40 |             proceed = False
 41 |         self.lastrow = row['rowid']
 42 |         jssdhash = make_sdhash(row['js'])
 43 |         # de_jssdhash = make_sdhash(row['de_js'])
 44 |         swfsdhash = make_sdhash(row['swf'])
 45 |         # actionscriptsdhash = make_sdhash(row['actionscript'])
 46 |         # self.subs = (jssdhash, de_jssdhash, swfsdhash, actionscriptsdhash, row['rowid'])
 47 |         self.subs = (jssdhash, swfsdhash, row['rowid'])
 48 |         self.verbose(row, 'js', jssdhash)
 49 |         # self.verbose(row, 'de_Js', de_jssdhash)
 50 |         self.verbose(row, 'swf', swfsdhash)
 51 |         # self.verbose(row, 'actionscript', actionscriptsdhash)
 52 | 
 53 |     def verbose(self, row, key, sdh):
 54 |         try:
 55 |             print 'SdHashed %s R:%s\t(%s)\t[%s]' % (key, self.lastrow, row[key][:16], sdh[:16])
 56 |         except TypeError:
 57 |             pass
 58 | 
 59 |     def query_cmd(self):
 60 |         return self.query % (self.lastrow, self.MAX)
 61 | 
 62 | 
 63 | class Mapper(object):
 64 |     def __init__(self, dbgateway, func_list):
 65 |         self.db = dbgateway
 66 |         self.funcs = func_list
 67 | 
 68 |     def setup(self, cmds):
 69 |         for cmd in cmds:
 70 |             if not self.db.query(cmd):
 71 |                 err = self.db.get_error()
 72 |                 sys.stderr.write("\tsetup: %s\n" % err)
 73 |                 if 'duplicate' not in err:
 74 |                     return False
 75 |         return True
 76 | 
 77 |     def start(self):
 78 |         for func in self.funcs:
 79 |             print 'Mapping: %s' % func.name
 80 |             if func.setup:
 81 |                 print '\tsetup:\t%s\n\t\t%s' % func.setup
 82 |                 if not self.setup(func.setup):
 83 |                     continue
 84 |             while func.proceed:
 85 |                 if not self.db.query(func.query_cmd()):
 86 |                     sys.stderr.write("query: %s\n" % self.db.get_error())
 87 |                 else:
 88 |                     func.run(self.db.db_curr.fetchone())
 89 |                     if not self.db.query(func.update, func.subs):
 90 |                         sys.stderr.write("update: %s\n" % self.db.get_error())
 91 | 
 92 | 
 93 | if __name__ == '__main__':
 94 |     try:
 95 |         dbpath = sys.argv[1]
 96 |         if not os.path.exists(dbpath):
 97 |             raise IndexError
 98 |         MIN = sys.argv[2]
 99 |         MAX = sys.argv[3]
100 |     except IndexError as e:
101 |         print 'Invalid args: %s' % e
102 |         sys.exit(0)
103 |     else:
104 |         db = DBGateway(os.path.basename(dbpath), os.path.dirname(dbpath))
105 |         functions = [JSHasher(MIN, MAX), ]
106 |         mapper = Mapper(db, functions)
107 |         mapper.start()
108 |         db.disconnect()
109 | 


--------------------------------------------------------------------------------
/scripts/run-jpexs.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import sys
 15 | import os
 16 | import re
 17 | import glob
 18 | import shutil
 19 | import select
 20 | import fileinput
 21 | from subprocess import Popen, PIPE
 22 | 
 23 | ERRMSG = ''
 24 | 
 25 | 
 26 | def errmsg():
 27 |     global ERRMSG
 28 |     tmp = ERRMSG
 29 |     ERRMSG = ''
 30 |     return tmp
 31 | 
 32 | 
 33 | def simple_name(filepath):
 34 |     return os.path.splitext(os.path.basename(filepath))[0]
 35 | 
 36 | 
 37 | def frame_id(string):
 38 |     mobj = re.match(r'.*([\d]+)\).*"(\w*)"', string, re.U)
 39 |     if mobj:
 40 |         return (mobj.groups())
 41 |     else:
 42 |         return None
 43 | 
 44 | 
 45 | def get_frame_ids(fin):
 46 |     frame_nums = []
 47 |     listcmd = 'furnace-swf -i %s abclist' % fin
 48 |     proc = Popen(listcmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
 49 |     for line in iter(proc.stdout.readline, b''):
 50 |         line = str(line, encoding='utf-8')
 51 |         num_name = frame_id(line)
 52 |         if num_name:
 53 |             frame_nums.append(num_name)
 54 |     proc.communicate()
 55 |     return frame_nums
 56 | 
 57 | 
 58 | def extract_frame(fin, dout, num_name):
 59 |     if num_name[1]:
 60 |         abcfile = "%s-%s.abc" % (simple_name(fin), num_name[1])
 61 |     else:
 62 |         abcfile = "%s-%s.abc" % (simple_name(fin), num_name[0])
 63 |     extractcmd = 'furnace-swf -i %s abcextract -o %s -n %s' % (fin, os.path.join(dout, abcfile), num_name[0])
 64 |     proc = Popen(extractcmd, shell=True, close_fds=True)
 65 |     proc.wait()
 66 |     return abcfile
 67 | 
 68 | 
 69 | def furnace_bytecode(fin, dout):
 70 |     abcfiles = []
 71 |     name = simple_name(fin)
 72 |     frames = get_frame_ids(fin)
 73 |     for num_name in frames:
 74 |         abcfiles.append(extract_frame(fin, dout, num_name))
 75 |     return abcfiles
 76 | 
 77 | 
 78 | def furnace_actionscript(abcfiles, dout):
 79 |     asfiles = []
 80 |     for bytecode in abcfiles:
 81 |         name = "%s.as" % simple_name(bytecode)
 82 |         decompilecmd = 'furnace-avm2-decompiler -d -i %s > %s' % (
 83 |         os.path.join(dout, bytecode), os.path.join(dout, name))
 84 |         proc = Popen(decompilecmd, shell=True, close_fds=True)
 85 |         proc.wait()
 86 |         asfiles.append(os.path.join(dout, name))
 87 |     return asfiles
 88 | 
 89 | 
 90 | def concat_scripts(scripts, fout):
 91 |     with open(fout, 'w') as fout:
 92 |         for line in fileinput.input(scripts, mode='rb'):
 93 |             fout.write(line)
 94 | 
 95 | 
 96 | def furnace_extract(fin, dirname):
 97 |     name = simple_name(fin)
 98 |     dout = os.path.join(dirname, name + '-furnace')
 99 |     try:
100 |         os.mkdir(dout)
101 |     except OSError as e:
102 |         if e.errno == 17:
103 |             pass
104 |         else:
105 |             print(e)
106 |             ERRMSG = str(e)
107 |             return None
108 |     abcfiles = furnace_bytecode(fin, dout)
109 |     asfiles = furnace_actionscript(abcfiles, dout)
110 |     if len(asfiles) > 1:
111 |         concat_scripts(asfiles, os.path.join(dout, "%s-all.as" % name))
112 |     return True
113 | 
114 | 
def jpexs_extract(fin, dirname):
    """Export the scripts of a SWF file with JPEXS (ffdec.jar) and merge them.

    fin     -- path of the SWF file.
    dirname -- parent directory; output goes to "<dirname>/<name>-jpexs".

    Every script the tool reports as exported is read back and written,
    with trailing whitespace stripped per line, to "<name>-all.as" inside
    the output directory.

    Returns True on success. Returns None when the output directory cannot
    be created, the tool cannot be started or reports FAIL/SEVERE, an
    exported file cannot be read, nothing was exported, or the merged file
    cannot be written; ERRMSG holds the reason where one is known.
    """
    global ERRMSG
    import errno

    # The child runs with cwd=dout, so relative paths must be made absolute
    # first or the tool would resolve them against the output directory.
    fname = os.path.abspath(fin)
    base = os.path.splitext(os.path.basename(fname))[0]
    dout = os.path.abspath(os.path.join(dirname, base + '-jpexs'))
    # Argument list (no shell): paths with spaces need no escaping.
    javacmd = ['java', '-Djava.awt.headless=true', '-jar',
               '/Users/honey/src/work/pdf/thisneedsacoolname/ffdec.jar',
               '-export', 'script', dout, fname]

    try:
        os.mkdir(dout)
    except OSError as e:
        # An already existing output directory is fine.
        if e.errno != errno.EEXIST:
            print('jpexs_extract mkdir(%s): %s' % (dout, e))
            ERRMSG = str(e)
            return None

    try:
        proc = Popen(javacmd, stdout=PIPE, stderr=PIPE, close_fds=True, cwd=dout)
    except OSError as e:
        print('jpexs_extract run: %s' % e)
        ERRMSG = str(e)
        return None

    extracted = []
    p = select.poll()
    p.register(proc.stderr.fileno(), select.POLLIN | select.POLLPRI)
    for line in iter(proc.stdout.readline, b''):
        line = str(line, encoding='utf-8', errors='replace')
        if line.startswith('Exported'):
            mobj = re.match(r"^Exported\s[^\d]*[\d]+/[\d]+\s([^,]+)\,\s", line, re.U)
            if mobj:
                # Dotted script name -> relative path. Spaces are kept as-is:
                # the name is only used with open(), never by a shell.
                extracted.append(mobj.group(1).replace('.', '/'))
        if p.poll(1):
            err = str(proc.stderr.readline(), encoding='utf-8', errors='replace')
            if err.startswith(('FAIL', 'SEVERE')):
                proc.kill()
                proc.wait()  # reap the child so it does not linger as a zombie
                sys.stderr.write("jpexs_extract error %s\n" % err)
                shutil.rmtree(dout)
                ERRMSG = err
                return None

    _, err = proc.communicate()
    if err:
        ERRMSG = err.decode('utf-8', 'replace')

    script = ''
    for srcfile in extracted:
        try:
            with open('%s/%s.as' % (dout, srcfile), 'r') as src:
                script += '\n'.join([text.rstrip() for text in src.readlines()])
                script += '\n'
        except IOError as e:
            print('jpexs_extract open(srcfile): %s' % e)
            ERRMSG = str(e)
            return None

    if not script:
        return None

    try:
        with open('%s/%s-all.as' % (dout, base), 'w') as fout:
            fout.write(script)
    except IOError as e:
        print('jpexs_extract open(fout): %s' % e)
        ERRMSG = str(e)
        return None
    return True
186 | 
187 | 
def main(din, dout='', tool='jpexs'):
    """Decompile every *.swf in *din* that is not yet recorded as done.

    din  -- directory scanned for *.swf files.
    dout -- output directory (defaults to *din*); also holds the done.txt
            and err.txt bookkeeping files.
    tool -- 'jpexs' or 'furnace'.

    Names of successfully processed files are appended to done.txt and are
    skipped on later runs; failures are appended to err.txt together with
    the pending error message. A summary is printed at the end.

    Returns None. Returns early, after a message on stderr, when *dout* is
    not a directory or *tool* is unknown. Exits with status 1 when done.txt
    exists but cannot be read.
    """
    import errno

    if not dout:
        dout = din
    if not os.path.isdir(dout):
        sys.stderr.write("Invalid directory: %s\n" % dout)
        return None
    if tool not in ('jpexs', 'furnace'):
        # Previously an unknown tool left `rv` unbound and the first file
        # raised UnboundLocalError.
        sys.stderr.write("Unknown tool: %s\n" % tool)
        return None
    files = glob.glob(os.path.join(din, '*.swf'))

    completed = set()
    try:
        with open("%s/done.txt" % dout, "r") as fdone:
            completed = set(l.rstrip() for l in fdone)
    except IOError as e:
        # A missing done.txt just means nothing has been processed yet.
        if e.errno != errno.ENOENT:
            sys.stderr.write("%s\n" % e)
            sys.exit(1)

    total = 0
    errors = 0
    for f in files:
        md5name = os.path.splitext(os.path.basename(f))[0]
        if md5name in completed:
            sys.stdout.write("Skipping:\t%s\n" % md5name)
            continue

        total += 1
        sys.stdout.write("Processing:\t%s\t" % md5name)
        if tool == 'jpexs':
            rv = jpexs_extract(f, dout)
        else:
            rv = furnace_extract(f, dout)

        if rv:
            sys.stdout.write("complete\n")
            try:
                with open("%s/done.txt" % dout, "a") as fdone:
                    fdone.write("%s\n" % md5name)
            except IOError as e:
                sys.stderr.write("Unable to write to log file, done.txt: %s\n" % e)
        else:
            errors += 1
            sys.stdout.write("error\n")
            try:
                with open("%s/err.txt" % dout, "a") as ferr:
                    ferr.write("%s\n%s\n\n" % (md5name, errmsg()))
            except IOError as e:
                sys.stderr.write("Unable to write to log file, err.txt: %s\n" % e)

    sys.stdout.write("Complete:\t%d\nFailures:\t%d\nTotal jobs:\t%d\n" % (total - errors, errors, total))
244 | 
245 | 
if __name__ == '__main__':
    # Usage: run-jpexs.py [input dir] [output dir] [tool]
    # Every argument is optional; the defaults are './', './' and 'jpexs'.
    args = sys.argv[1:]

    dir_in = args[0] if len(args) > 0 else './'

    if len(args) > 1:
        dir_out = os.path.abspath(os.path.expandvars(os.path.expanduser(args[1])))
    else:
        dir_out = './'

    tool = args[2] if len(args) > 2 else 'jpexs'

    if os.path.isdir(dir_in):
        main(dir_in, dir_out, tool)
    else:
        # Report the problem instead of exiting silently with success.
        sys.stderr.write("Invalid directory: %s\n" % dir_in)
        sys.exit(1)
264 | 


--------------------------------------------------------------------------------
/sdhasher.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | # 
 3 | #  NO WARRANTY
 4 | # 
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | import sys
15 | import tempfile
16 | from subprocess import Popen, PIPE
17 | 
18 | 
def log_error(msg, log):
    """Record *msg* in *log*.

    log -- None (the message goes to stderr), a file-like object with a
           write() method, or a list-like object with append().
    """
    if log is None:
        sys.stderr.write(msg)
    elif hasattr(log, 'write'):
        log.write(msg)
        if hasattr(log, 'flush'):
            log.flush()
    else:
        log.append(msg)


def make_sdhash(data, log=None):
    """Return the output of the external `sdhash` tool for *data*.

    data -- content to hash; text is encoded as UTF-8 before being written
            to a temporary file.
    log  -- optional destination for error messages (see log_error()).

    Returns the tool's standard output, or '' when *data* is empty or
    shorter than 512 items, or when the temporary file or the tool fails.
    Failures are reported through *log* (stderr when no log is given)
    rather than raised, so callers can keep processing other samples.
    """
    if not data or len(data) < 512:
        return ''
    if isinstance(data, type(u'')):
        # The temporary file is binary; encode text explicitly.
        data = data.encode('utf-8')

    try:
        tmpfile = tempfile.NamedTemporaryFile(delete=True)
    except (IOError, OSError) as e:
        # This call used to omit `log`, raising a TypeError that the old
        # `finally: return` then hid.
        log_error("sdhash: %s\n" % e, log)
        return ''

    try:
        tmpfile.write(data)
        tmpfile.flush()
        proc = Popen(['sdhash', tmpfile.name], stdout=PIPE, stderr=PIPE)
        # communicate() waits for the child; no separate wait() is needed.
        stdout, stderr = proc.communicate()
    except Exception as e:
        # Best effort, as before, but the failure is now reported.
        log_error("sdhash: %s\n" % e, log)
        return ''
    finally:
        # Closing the temporary file also deletes it.
        tmpfile.close()

    if stderr:
        sys.stderr.write("%s\n" % stderr)
    return stdout or ''
51 | 


--------------------------------------------------------------------------------
/storage.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import sys
 15 | 
 16 | import cfg
 17 | from db_mgmt import DBGateway
 18 | 
# Name of the table that DbStorage creates and writes to.
TABLE = 'parsed_pdfs'
# Column passed as the primary key when DbStorage creates the table.
PRIMARY = 'pdf_md5'
# Column names, in the order used for inserts. Storage.align_kwargs() uses
# this tuple to turn a data dict into a row, filling missing keys with ''.
COLUMNS = ('category',
           'pdf_md5',
           'tree_md5',
           'tree',
           'graph',
           'obf_js',
           'obf_js_sdhash',
           'de_js',
           'de_js_sdhash',
           'swf',
           'swf_sdhash',
           'abc',
           'abc_sdhash',
           'actionscript',
           'as_sdhash',
           'shellcode',
           'fsize',
           'pdfsize',
           'bin_blob',
           'urls',
           'malformed',
           'errors')
 43 | 
 44 | 
 45 | class StorageFactory(object):
 46 | 
 47 |     def __init__(self):
 48 |         self.cfg = cfg.Config()
 49 | 
 50 |     def new_storage(self, typ, name='', user='', pw=''):
 51 |         if not name:
 52 |             name = self.cfg.setting("database", "db")
 53 |         if not user:
 54 |             user = self.cfg.setting("database", "user")
 55 |         if not pw:
 56 |             ps = self.cfg.setting("database", "pw")
 57 |         sys.stderr.write("Storage type,name,user: %s,%s,%s\n" % (typ, name, user))
 58 |         
 59 |         if typ == 'stdout':
 60 |             return StdoutStorage()
 61 |         if typ == 'sqlite3':
 62 |             return DbStorage(name)
 63 |         if typ == 'postgres':
 64 |             try:
 65 |                 import psycopg2
 66 |             except ImportError as e:
 67 |                 sys.stderr.write("Failed to import PostgreSQL Python library 'psycopg2': %s\n" % e)
 68 |                 return None
 69 |             else:
 70 |                 return PostgresStorage(name, user, pw)
 71 |         if typ == 'neo4j':
 72 |             return NeoStorage()
 73 |         if typ == 'file':
 74 |             return FileStorage(name + '.txt')
 75 | 
 76 | 
 77 | class Storage(object):
 78 |     def __init__(self):
 79 |         pass
 80 | 
 81 |     def open(self):
 82 |         return False
 83 | 
 84 |     def store(self):
 85 |         pass
 86 | 
 87 |     def close(self):
 88 |         pass
 89 | 
 90 |     def align_kwargs(self, data):
 91 |         aligned = []
 92 |         for col in COLUMNS:
 93 |             aligned.append(data.get(col, ''))
 94 |         return tuple(aligned)
 95 | 
 96 | 
 97 | class PostgresStorage(Storage):
 98 |     insert = "INSERT INTO parsed_pdfs (%s) VALUES (%s)"
 99 |     create = "CREATE TABLE IF NOT EXISTS parsed_pdfs (rowid serial, %s TEXT, PRIMARY KEY (rowid, pdf_md5))"
100 | 
101 |     def __init__(self, dbname, user, pw=''):
102 |         self.dbname = dbname
103 |         self.user = user
104 |         self.pw = pw
105 |         ccols = ' TEXT, '.join(COLUMNS)
106 |         icols = ', '.join(COLUMNS)
107 |         markers = ', '.join(['%s' for x in COLUMNS])
108 |         self.create = self.create % (ccols)
109 |         self.insert = self.insert % (icols, markers)
110 | 
111 |     def open(self):
112 |         try:
113 |             self.conn = psycopg2.connect(database=self.dbname, user=self.user, password=self.pw)
114 |             cur = self.conn.cursor()
115 |             cur.execute(self.create)
116 |             self.conn.commit()
117 |         except Exception as e:
118 |             sys.stderr.write("Postgres Connect Error\t%s\n" % (str(e)))
119 |             sys.stderr.flush()
120 |             return False
121 |         else:
122 |             cur.close()
123 |             return True
124 | 
125 |     def store(self, data_dict):
126 |         data_tuple = self.align_kwargs(data_dict)
127 |         try:
128 |             cur = self.conn.cursor()
129 |             cur.execute(self.insert, data_tuple)
130 |         except Exception as e:
131 |             self.conn.rollback()
132 |         else:
133 |             self.conn.commit()
134 |             cur.close()
135 | 
136 |     def close(self):
137 |         self.conn.commit()
138 |         self.conn.close()
139 | 
140 | 
class NeoStorage(Storage):
    """Placeholder back end; inherits the no-op behaviour of Storage."""
143 | 
144 | 
class StdoutStorage(Storage):
    """Storage back end that prints every record to standard output."""

    def open(self):
        """Always ready."""
        return True

    def store(self, data_dict):
        """Write *data_dict* followed by a newline to stdout."""
        line = "%s\n" % data_dict
        sys.stdout.write(line)

    def close(self):
        """Nothing to release."""
        return None
155 | 
class DbStorage(Storage):
    """Storage back end backed by a DBGateway on "<db>.sqlite"."""

    def __init__(self, db=''):
        filename = db + '.sqlite'
        self.db = DBGateway(filename)

    def open(self):
        """Create the table; return False if the gateway raises, else True."""
        try:
            column_defs = [' '.join((column, 'TEXT')) for column in COLUMNS]
            self.db.create_table(TABLE, cols=column_defs, primary=PRIMARY)
        except Exception:
            return False
        return True

    def store(self, data_dict):
        """Insert one record.

        When the insert fails, a second row holding only the pdf_md5 and a
        'DB_ERROR: ...' message in the errors column is inserted instead.
        """
        row = self.align_kwargs(data_dict)
        if self.db.insert(TABLE, cols=COLUMNS, vals=row):
            return
        failure = 'DB_ERROR: %s' % self.db.get_error()
        err_tuple = (data_dict.get('pdf_md5'), failure)
        self.db.insert(TABLE, cols=['pdf_md5', 'errors'], vals=err_tuple)

    def close(self):
        """Disconnect the gateway."""
        self.db.disconnect()

    def contains(self, key, val):
        """Return the gateway's count of rows whose column *key* equals *val*."""
        return self.db.count(TABLE, key, val)
179 | 
180 | 
class FileStorage(Storage):
    """Storage back end that dumps every record as text into one file."""

    def __init__(self, path):
        """Check that *path* can be created; exit with status 1 otherwise."""
        self.path = path
        try:
            self.fd = open(path, 'wb')
        except IOError as e:
            print(e)
            print('Unable to create output. Exiting.')
            sys.exit(1)
        else:
            self.fd.close()

    def open(self):
        """(Re)open the output file for writing; return True on success."""
        try:
            self.fd = open(self.path, 'wb')
        except IOError:
            return False
        else:
            return True

    def store(self, data_dict):
        """Append one record: a header with the pdf_md5, then each field.

        Every field is written as "__<key>", its value and a blank line.
        Values that are not strings are converted with '%s' formatting
        instead of crashing write(). Exits with status 1 on an I/O error.
        """
        header = '%s\n%s\n%s\n' % ('-' * 80, data_dict.get('pdf_md5', 'N/A'), '-' * 80)
        footer = '\n'

        self.fd.write(header)
        for k, v in data_dict.items():
            try:
                self.fd.write("__%s\n" % k)
                self.fd.write(v if isinstance(v, str) else '%s' % (v,))
                self.fd.write("\n\n")
            except IOError as e:
                sys.stderr.write("FileStorage store IO error: %s\n" % e)
                # Non-zero status: this is a failure, not a clean exit.
                sys.exit(1)
        self.fd.write(footer)

    def close(self):
        """Close the output file."""
        self.fd.close()
219 | 
220 | 
if __name__ == '__main__':
    # Smoke test: ask the factory for each storage kind and report what
    # its open() call returns.
    for kind in ('test.test', 'db', 'pg', 'neo4j'):
        backend = StorageFactory().new_storage(kind, "500k-test")
        print("%s.open()\t%s" % (kind, backend.open()))
226 | 


--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/util/__init__.py


--------------------------------------------------------------------------------
/util/mapper.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import os
 15 | import sys
 16 | 
 17 | from db_mgmt import DBGateway
 18 | from sdhasher import make_sdhash
 19 | 
 20 | 
 21 | class JSHasher(object):
 22 |     name = 'JSHasher'
 23 |     # query = "select rowid, js, de_js, swf, actionscript from clarified where (rowid > %s and rowid <= %s) limit 1;"
 24 |     # update = "update clarified set js_sdhash=?, de_js_sdhash=?, swf_sdhash=?, actionscript_sdhash=? where rowid is ?"
 25 |     query = "select rowid, js, swf from clarified where (rowid > %s and rowid <= %s) limit 1;"
 26 |     update = "update clarified set js_sdhash=?, swf_sdhash=? where rowid is ?"
 27 |     rowid = -1
 28 |     sdhash = ''
 29 |     setup = ('alter table clarified add column js_sdhash text;', 'alter table clarified add column swf_sdhash text;')
 30 |     proceed = True
 31 |     subs = None
 32 | 
 33 |     def __init__(self, MIN=-1, MAX=1000000):
 34 |         self.lastrow = MIN
 35 |         self.MAX = MAX
 36 | 
 37 |     def run(self, row):
 38 |         if not row:
 39 |             print 'Complete'
 40 |             proceed = False
 41 |         self.lastrow = row['rowid']
 42 |         jssdhash = make_sdhash(row['js'])
 43 |         # de_jssdhash = make_sdhash(row['de_js'])
 44 |         swfsdhash = make_sdhash(row['swf'])
 45 |         # actionscriptsdhash = make_sdhash(row['actionscript'])
 46 |         # self.subs = (jssdhash, de_jssdhash, swfsdhash, actionscriptsdhash, row['rowid'])
 47 |         self.subs = (jssdhash, swfsdhash, row['rowid'])
 48 |         self.verbose(row, 'js', jssdhash)
 49 |         # self.verbose(row, 'de_Js', de_jssdhash)
 50 |         self.verbose(row, 'swf', swfsdhash)
 51 |         # self.verbose(row, 'actionscript', actionscriptsdhash)
 52 | 
 53 |     def verbose(self, row, key, sdh):
 54 |         try:
 55 |             print 'SdHashed %s R:%s\t(%s)\t[%s]' % (key, self.lastrow, row[key][:16], sdh[:16])
 56 |         except TypeError:
 57 |             pass
 58 | 
 59 |     def query_cmd(self):
 60 |         return self.query % (self.lastrow, self.MAX)
 61 | 
 62 | 
 63 | class Mapper(object):
 64 |     def __init__(self, dbgateway, func_list):
 65 |         self.db = dbgateway
 66 |         self.funcs = func_list
 67 | 
 68 |     def setup(self, cmds):
 69 |         for cmd in cmds:
 70 |             if not self.db.query(cmd):
 71 |                 err = self.db.get_error()
 72 |                 sys.stderr.write("\tsetup: %s\n" % err)
 73 |                 if 'duplicate' not in err:
 74 |                     return False
 75 |         return True
 76 | 
 77 |     def start(self):
 78 |         for func in self.funcs:
 79 |             print 'Mapping: %s' % func.name
 80 |             if func.setup:
 81 |                 print '\tsetup:\t%s\n\t\t%s' % func.setup
 82 |                 if not self.setup(func.setup):
 83 |                     continue
 84 |             while func.proceed:
 85 |                 if not self.db.query(func.query_cmd()):
 86 |                     sys.stderr.write("query: %s\n" % self.db.get_error())
 87 |                 else:
 88 |                     func.run(self.db.db_curr.fetchone())
 89 |                     if not self.db.query(func.update, func.subs):
 90 |                         sys.stderr.write("update: %s\n" % self.db.get_error())
 91 | 
 92 | 
 93 | if __name__ == '__main__':
 94 |     try:
 95 |         dbpath = sys.argv[1]
 96 |         if not os.path.exists(dbpath):
 97 |             raise IndexError
 98 |         MIN = sys.argv[2]
 99 |         MAX = sys.argv[3]
100 |     except IndexError as e:
101 |         print 'Invalid args: %s' % e
102 |         sys.exit(0)
103 |     else:
104 |         db = DBGateway(os.path.basename(dbpath), os.path.dirname(dbpath))
105 |         functions = [JSHasher(MIN, MAX), ]
106 |         mapper = Mapper(db, functions)
107 |         mapper.start()
108 |         db.disconnect()
109 | 


--------------------------------------------------------------------------------
/util/str_utils.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | # 
 3 | #  NO WARRANTY
 4 | # 
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | import re
15 | 
16 | import htmlentitydefs
17 | 
18 | 
def unescapeHTMLEntities(text):
    """
    Replace HTML or XML character references and entities in a text string.

    Numeric references (``&#67;``, ``&#x43;``) and named entities
    (``&amp;``) are replaced by the character they denote. References that
    cannot be resolved (unknown name, malformed or out-of-range number) are
    left in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The plain text. A byte string is returned when the result can
            be represented as one; otherwise (Python 2, non-ASCII result)
            the Unicode string is returned instead of raising.
    Author: Fredrik Lundh
    Source: http://effbot.org/zone/re-sub.htm#unescape-html
    """
    # Resolve the Python 2 names, falling back to their Python 3
    # equivalents so the function works on either interpreter.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint

    def fixup(m):
        entity = m.group(0)
        try:
            if entity.startswith('&#x'):
                # hexadecimal character reference
                return to_char(int(entity[3:-1], 16))
            if entity.startswith('&#'):
                # decimal character reference
                return to_char(int(entity[2:-1]))
            # named entity
            return to_char(name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            return entity  # leave as is

    result = re.sub(r'&#?\w+;', fixup, text)
    try:
        return str(result)
    except UnicodeEncodeError:
        # Python 2: the text now holds non-ASCII characters, which str()
        # cannot encode; hand back the Unicode string rather than crash.
        return result
49 | 
50 | 
def isFlash(content):
    """
    Report whether a string looks like SWF (Flash) data.

    The content is first passed through unescapeHTMLEntities, then its
    first three characters are compared with the 'CWS' and 'FWS' headers.
    :param content: Stream content from an XML/HTML doc
    :return: True if the unescaped content begins with 'CWS' or 'FWS'
    """
    decoded = unescapeHTMLEntities(content)
    return decoded.startswith(("CWS", "FWS"))
61 | 


--------------------------------------------------------------------------------
/xml_creator.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | # 
  3 | #  NO WARRANTY
  4 | # 
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import os
 15 | import re
 16 | import sys
 17 | 
 18 | import lxml.etree as ET
 19 | from pdfminer.pdfdocument import PDFDocument
 20 | from pdfminer.pdfparser import PDFParser
 21 | from pdfminer.pdftypes import PDFStream, PDFObjRef
 22 | from pdfminer.pdftypes import PDFObjectNotFound
 23 | from pdfminer.psparser import PSKeyword, PSLiteral
 24 | from pdfminer.utils import isnumber
 25 | from JSAnalysis import isJavascript
 26 | from util.str_utils import isFlash
 27 | from lib.scandir import scandir
 28 | 
 29 | '''
 30 |     Parse a pdf and build an xml tree based on the object structure
 31 | '''
 32 | 
 33 | 
 34 | class FrankenParser(object):
 35 |     def __init__(self, pdf, debug=False):
 36 |         self.errors = ''
 37 |         self.debug = debug
 38 |         self.pdf = pdf
 39 |         self.xml = ''
 40 |         self.javascript = []
 41 |         self.deobfuscated = []
 42 |         self.swf = []
 43 |         self.found_eof = False
 44 |         self.bin_blob = ''
 45 |         self.malformed = {}
 46 |         self.parse()
 47 |         self.tree = self.tree_from_xml(self.xml)
 48 | 
 49 |     def e(self, s):
 50 |         ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
 51 |         return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
 52 | 
 53 |     '''
 54 |         Convert a pdf object into xml
 55 |     '''
 56 | 
 57 |     def dump(self, obj):
 58 |         res = ""
 59 |         if obj is None:
 60 |             res += '<null />'
 61 |             return res
 62 | 
 63 |         if isinstance(obj, dict):
 64 |             res += '<dict size="%' + str(len(obj)) + '">\n'
 65 |             for (k, v) in obj.iteritems():
 66 |                 k = re.sub(r'\W+', '', k)
 67 |                 if k.isdigit() or not k:
 68 |                     k = 'xml_creator_' + k
 69 |                 res += '<' + k + '>'
 70 |                 res += self.dump(v)
 71 |                 res += '</' + k + '>\n'
 72 |             res += '</dict>'
 73 |             return res
 74 | 
 75 |         if isinstance(obj, list):
 76 |             res += '<list size="' + str(len(obj)) + '">\n'
 77 |             for v in obj:
 78 |                 res += self.dump(v)
 79 |                 res += '\n'
 80 |             res += '</list>'
 81 |             return res
 82 | 
 83 |         if isinstance(obj, str):
 84 |             self.check_js(obj)
 85 |             # encode base64 to avoid illegal xml characters
 86 |             res += '<string>' + self.e(obj).encode('base64') + '</string>'
 87 |             return res
 88 | 
 89 |         if isinstance(obj, PDFStream):
 90 |             res += '<stream>\n'
 91 |             try:
 92 |                 res += '<props>\n'
 93 |                 res += self.dump(obj.attrs)
 94 |                 res += '\n</props>\n'
 95 |                 data = obj.get_data()
 96 |                 self.check_js(str(data))
 97 |                 self.check_swf(str(data))
 98 |                 res += '<data size="' + str(len(data)) + '">' + self.e(data).encode('base64') + '</data>\n'
 99 |             # Throws an exception if the filter is unsupported, etc
100 |             except Exception as e:
101 |                 # print e.message
102 |                 res += '<StreamException>%s</StreamException>' % str(e)
103 |             # make sure the tag is closed appropriately
104 |             res += '</stream>'
105 |             return res
106 | 
107 |         if isinstance(obj, PDFObjRef):
108 |             res += '<ref id="' + str(obj.objid) + '" />'
109 |             return res
110 | 
111 |         if isinstance(obj, PSKeyword):
112 |             self.check_js(obj.name)
113 |             res += '<keyword>' + obj.name + '</keyword>'
114 |             return res
115 | 
116 |         if isinstance(obj, PSLiteral):
117 |             self.check_js(obj.name)
118 |             res += '<literal>' + obj.name + '</literal>'
119 |             return res
120 | 
121 |         if isnumber(obj):
122 |             self.check_js(str(obj))
123 |             res += '<number>' + str(obj) + '</number>'
124 |             return res
125 | 
126 |         raise TypeError(obj)
127 | 
128 |     '''
129 |         Add the PDF trailers to the xml
130 |     '''
131 | 
132 |     def dumptrailers(self, doc):
133 |         res = ""
134 |         for xref in doc.xrefs:
135 |             res += '<trailer>\n'
136 |             res += self.dump(xref.trailer)
137 |             res += '\n</trailer>\n\n'
138 |         return res
139 | 
140 |     '''
141 |     Records information into a dictionary.
142 |     All key values are lists, and the paramter is appended.
143 |     '''
144 | 
145 |     def takenote(self, dic, key, val):
146 |         try:
147 |             dic[key].append(val)
148 |         except KeyError:
149 |             dic[key] = []
150 |             dic[key].append(val)
151 | 
152 |     def getmalformed(self, key=''):
153 |         if not key:
154 |             return self.malformed
155 |         else:
156 |             return self.malformed.get(key)
157 | 
158 |     '''
159 |         Parse the pdf and build the xml
160 |     '''
161 | 
162 |     def parse(self):
163 |         fp = file(self.pdf, 'rb')
164 |         parser = PDFParser(fp, dbg=self.debug)
165 |         doc = PDFDocument(parser, dbg=self.debug)
166 |         # extract blob of data after EOF (if it exists)
167 |         if doc.found_eof and doc.eof_distance > 3:
168 |             self.bin_blob = parser.read_from_end(doc.eof_distance)
169 |         res = '<pdf>'
170 |         visited = set()  # keep track of the objects already visited
171 |         for xref in doc.xrefs:
172 |             for objid in xref.get_objids():
173 |                 if objid in visited:
174 |                     continue
175 |                 visited.add(objid)
176 |                 try:
177 |                     obj = doc.getobj(objid)
178 |                     res += '<object id="' + str(objid) + '">\n'
179 |                     res += self.dump(obj)
180 |                     res += '\n</object>\n\n'
181 |                 except PDFObjectNotFound as e:
182 |                     mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
183 |                     mal_obj = mal_obj.replace('<', '0x3C')
184 |                     res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
185 |                     self.takenote(self.malformed, 'objects', objid)
186 |                 except Exception as e:
187 |                     res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, e.message)
188 |         fp.close()
189 |         res += self.dumptrailers(doc)
190 |         res += '</pdf>'
191 |         self.xml = res
192 |         self.errors = doc.errors
193 |         self.bytes_read = parser.BYTES
194 |         return
195 | 
196 |     '''
197 |         Check string for javascript content
198 |     '''
199 | 
200 |     def check_js(self, content):
201 |         if isJavascript(content):
202 |             # pull out js between script tags
203 |             reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
204 |             res = re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE)
205 |             if res != []:
206 |                 self.javascript.append('\n'.join(res))
207 |             else:
208 |                 self.javascript.append(content)
209 |         return
210 | 
211 |     '''
212 |         Check string for flash content
213 |     '''
214 | 
215 |     def check_swf(self, content):
216 |         if isFlash(content):
217 |             self.swf.append(content)
218 |         return
219 | 
220 |     '''
221 |         Create an lxml tree from the xml string
222 |     '''
223 | 
224 |     def tree_from_xml(self, xml):
225 |         try:
226 |             tree = ET.fromstring(xml)
227 |             return tree
228 |         except Exception as e:
229 |             sys.stderr.write("xml_creator cannot create tree: %s\n" % e)
230 |             return 'TREE_ERROR: %s' % str(e)
231 | 
232 |     '''
233 |         Calls edges to recursively create the graph string
234 |     '''
235 | 
236 |     def make_graph(self, tree):
237 |         res = []
238 |         # Explicit check for None to avoid FutureWarning
239 |         if tree is not None:
240 |             self.edges(tree, res, 0)
241 |         return res
242 | 
243 |     def edges(self, parent, output, id):
244 |         """
245 | 
246 |         creates string showing connections between objects
247 |         """
248 |         for child in list(parent):
249 |             if isinstance(child, str):
250 |                 return
251 |             elif child.get("id") != None:
252 |                 cid = child.get("id")
253 |                 output.append(str(id) + ' ' + cid + '\n')
254 |                 self.edges(child, output, cid)
255 |             else:
256 |                 res = self.edges(child, output, id)
257 |         return
258 | 
259 | 
if __name__ == "__main__":
    # Command line: <directory of PDFs> <output directory>
    # Every file in the input directory is parsed; Flash content found in
    # it is written to <output>/<name>.swf. Processed names are appended to
    # <output>/done.txt (and skipped on later runs), parse failures to
    # <output>/error.txt.
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s <pdf dir> <output dir>\n" % sys.argv[0])
        sys.exit(1)
    dirin = sys.argv[1]
    dirout = sys.argv[2]
    if not os.path.isdir(dirin) or not os.path.isdir(dirout):
        sys.stderr.write("both arguments must be existing directories\n")
        sys.exit(1)

    sys.stdout.write("%s/*.pdf  -->  %s/*.swf\n\n" % (dirin, dirout))

    fdone = None
    ferr = None
    try:
        try:
            fdone = open(os.path.join(dirout, "done.txt"), 'a+')
            ferr = open(os.path.join(dirout, "error.txt"), 'a')
        except IOError as e:
            sys.stderr.write("parser done file error: %s\n" % e)
        else:
            # 'a+' positions at the end; rewind to read earlier results.
            fdone.seek(0)
            completed = set(line.rstrip() for line in fdone)

            for pdf in scandir(dirin):

                if pdf.name in completed:
                    sys.stdout.write("skipping: %s\n" % pdf.name)
                    continue

                sys.stdout.write("%s\n" % pdf.name)

                try:
                    parsed = FrankenParser(pdf.path)
                except Exception as e:
                    try:
                        ferr.write("%s:%s\n" % (pdf.name, str(e)))
                    except Exception:
                        # Formatting the error or writing it failed; report
                        # on stderr with repr(), which cannot fail the same
                        # way, then make a best-effort note in error.txt.
                        sys.stderr.write("ferr write error pdf: %s := %r\n" % (pdf.name, e))
                        try:
                            ferr.write("%s: ferr write() BIG-TIME ERROR\n" % pdf.name)
                        except Exception:
                            pass  # already reported on stderr above
                else:
                    if parsed.swf:
                        try:
                            fout = open(os.path.join(dirout, "%s.swf" % pdf.name), 'wb')
                            try:
                                fout.write(''.join(parsed.swf))
                            finally:
                                fout.close()
                        except IOError as e:
                            sys.stderr.write("parser output file error: %s\n" % e)
                finally:
                    # Mark the file as handled whether or not it parsed.
                    try:
                        fdone.write("%s\n" % pdf.name)
                        # Flush so progress survives an interrupted run.
                        fdone.flush()
                    except Exception as e:
                        sys.stderr.write("fdone write error pdf: %s := %s\n" % (pdf.name, e))
            sys.stdout.write("\n")
    finally:
        # Close whichever bookkeeping files were opened, even on error.
        for handle in (fdone, ferr):
            if handle is not None:
                handle.close()


--------------------------------------------------------------------------------