├── .gitignore
├── JSAnalysis.py
├── LICENSE.md
├── README.md
├── __init__.py
├── build_pdf_objects.py
├── cfg.py
├── db
└── __init__.py
├── db_mgmt.py
├── huntterp.py
├── jobs
└── __init__.py
├── pdfminer
├── LICENSE
├── __init__.py
├── arcfour.py
├── ascii85.py
├── ccitt.py
├── lzw.py
├── pdfdocument.py
├── pdfparser.py
├── pdftypes.py
├── psparser.py
├── runlength.py
└── utils.py
├── pdfrankenstein.py
├── peepdf
├── AUTHORS
├── CHANGELOG
├── COPYING
├── JSAnalysis.py
├── PDFConsole.py
├── PDFCore.py
├── PDFCrypto.py
├── PDFFilters.py
├── PDFUtils.py
├── README
├── TODO
├── __init__.py
├── aes.py
├── aespython
│ ├── __init__.py
│ ├── aes_cipher.py
│ ├── aes_tables.py
│ ├── cbc_mode.py
│ ├── cfb_mode.py
│ ├── key_expander.py
│ ├── ofb_mode.py
│ └── test_keys.py
├── ccitt.py
├── colorama
│ ├── PKG-INFO
│ ├── __init__.py
│ ├── ansi.py
│ ├── ansitowin32.py
│ ├── initialise.py
│ ├── win32.py
│ └── winterm.py
├── jjdecode.py
├── jsbeautifier
│ ├── __init__.py
│ └── unpackers
│ │ ├── README.specs.mkd
│ │ ├── __init__.py
│ │ ├── evalbased.py
│ │ ├── javascriptobfuscator.py
│ │ ├── myobfuscate.py
│ │ ├── packer.py
│ │ └── urlencode.py
├── lzw.py
├── peepdf.dtd
└── peepdf.py
├── scripts
├── __init__.py
├── clarify.py
├── ffdec.jar
├── mapper.py
└── run-jpexs.py
├── sdhasher.py
├── storage.py
├── util
├── __init__.py
├── mapper.py
└── str_utils.py
└── xml_creator.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion
2 |
3 | *.iml
4 |
5 | ## Directory-based project format:
6 | .idea/
7 | # if you remove the above rule, at least ignore the following:
8 |
9 | # User-specific stuff:
10 | # .idea/workspace.xml
11 | # .idea/tasks.xml
12 | # .idea/dictionaries
13 |
14 | # Sensitive or high-churn files:
15 | # .idea/dataSources.ids
16 | # .idea/dataSources.xml
17 | # .idea/sqlDataSources.xml
18 | # .idea/dynamic.xml
19 | # .idea/uiDesigner.xml
20 |
21 | # Gradle:
22 | # .idea/gradle.xml
23 | # .idea/libraries
24 |
25 | # Mongo Explorer plugin:
26 | # .idea/mongoSettings.xml
27 |
28 | ## File-based project format:
29 | *.ipr
30 | *.iws
31 |
32 | ## Plugin-specific files:
33 |
34 | # IntelliJ
35 | /out/
36 |
37 | # mpeltonen/sbt-idea plugin
38 | .idea_modules/
39 |
40 | # JIRA plugin
41 | atlassian-ide-plugin.xml
42 |
43 | # Crashlytics plugin (for Android Studio and IntelliJ)
44 | com_crashlytics_export_strings.xml
45 | crashlytics.properties
46 | crashlytics-build.properties
47 |
48 | # OSX
49 | .DS_Store
50 | .AppleDouble
51 | .LSOverride
52 |
53 | # Icon must end with two \r
54 | Icon
55 |
56 |
57 | # Thumbnails
58 | ._*
59 |
60 | # Files that might appear in the root of a volume
61 | .DocumentRevisions-V100
62 | .fseventsd
63 | .Spotlight-V100
64 | .TemporaryItems
65 | .Trashes
66 | .VolumeIcon.icns
67 |
68 | # Directories potentially created on remote AFP share
69 | .AppleDB
70 | .AppleDesktop
71 | Network Trash Folder
72 | Temporary Items
73 | .apdisk
74 |
75 | #Python
76 | # Byte-compiled / optimized / DLL files
77 | __pycache__/
78 | *.py[cod]
79 | *$py.class
80 |
81 | # C extensions
82 | *.so
83 |
84 | # Distribution / packaging
85 | .Python
86 | env/
87 | build/
88 | develop-eggs/
89 | dist/
90 | downloads/
91 | eggs/
92 | .eggs/
93 | lib/
94 | lib64/
95 | parts/
96 | sdist/
97 | var/
98 | *.egg-info/
99 | .installed.cfg
100 | *.egg
101 |
102 | # PyInstaller
103 | # Usually these files are written by a python script from a template
104 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
105 | *.manifest
106 | *.spec
107 |
108 | # Installer logs
109 | pip-log.txt
110 | pip-delete-this-directory.txt
111 |
112 | # Unit test / coverage reports
113 | htmlcov/
114 | .tox/
115 | .coverage
116 | .coverage.*
117 | .cache
118 | nosetests.xml
119 | coverage.xml
120 | *,cover
121 |
122 | # Translations
123 | *.mo
124 | *.pot
125 |
126 | # Django stuff:
127 | *.log
128 |
129 | # Sphinx documentation
130 | docs/_build/
131 |
132 | # PyBuilder
133 | target/
134 |
135 |
136 | #Vi
137 | [._]*.s[a-w][a-z]
138 | [._]s[a-w][a-z]
139 | *.un~
140 | Session.vim
141 | .netrwhist
142 | *~
143 |
144 | frankenstein.cfg
145 | *.txt
146 | *.csv
147 | *.sqlite*
148 |
--------------------------------------------------------------------------------
/JSAnalysis.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011-2015 by Carnegie Mellon University
2 | #
3 | # NO WARRANTY
4 | #
5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 |
14 | try:
15 | import PyV8
16 | except ImportError as e:
17 | print str(e)
18 | PyV8 = None
19 |
20 | import re
21 |
22 | import build_pdf_objects
23 | from util.str_utils import unescapeHTMLEntities
24 |
# Regular expression that isJavascript() searches for in the unescaped content.
# NOTE(review): an empty pattern matches at every position of any string, so
# re.findall() with it never returns [] -- confirm the intended expression.
reJSscript = ''
26 |
def create_objs(context, tree):
    """
    Install stand-ins for Adobe's native objects into the JavaScript context.

    Three objects are built from the PDF's XML tree: ``app``, the document
    ``info`` (whose entries are also copied onto ``this``) and ``event``.
    Each one is set up in its own best-effort step, so a failure while
    building one of them does not stop the others from being attempted.

    :param context: JavaScript context, like a namespace at runtime
    :param tree: XML tree of the pdf to reference objects
    :return: None
    """
    # Helper functions attached to the app object once it exists.
    app_shims = (
        "app.doc.syncAnnotScan = function () {}",
        "app.doc.getAnnots = function () { return app.doc.annots;}",
        "app.eval = function (string) { eval(string);}",
        "app.newDoc = function () { return '';}",
        "app.getString = function () { ret = \"\"; for(var prop in app){ ret += app[prop]; } return ret;}",
    )

    try:
        app_obj = build_pdf_objects.create_app_obj(tree)
        context.eval("app = " + str(app_obj) + ";")
        for shim in app_shims:
            context.eval(shim)
    except Exception:
        # Deliberately silent: the script is still evaluated without ``app``.
        pass

    try:
        info_obj = build_pdf_objects.create_info_obj(tree)
        context.eval("this.info = " + str(info_obj) + ";")
        for name, value in info_obj.items():
            context.eval("this." + name + "= '" + re.escape(value) + "';")
        context.eval("this.eval = eval")
    except Exception as err:
        print("Info: " + err.message)

    try:
        event_obj = build_pdf_objects.create_event_obj(tree)
        context.eval("event = " + str(event_obj) + ";")
        context.eval("event.target.info = this.info")
    except Exception:
        # Deliberately silent: the script is still evaluated without ``event``.
        pass
63 |
64 |
def _js_error_message(err):
    """Return the message text of an exception raised while evaluating JS."""
    # Python 2 exceptions expose ``message``; fall back to str() where absent.
    message = getattr(err, "message", None)
    return message if message is not None else str(err)


def _error_line_number(message):
    """Return the 1-based line number that follows ``@`` in *message*, or None."""
    found = re.findall(r"@\s(\d+)\s", message)
    return int(found[0]) if found else None


def _offending_source(message):
    """Return the source text that follows ``->`` in *message*, or ''."""
    found = re.findall(r"->\s(.*)", message)
    return found[0] if found else ""


def _comment_out_line(code, line_num):
    """Return *code* with 1-based line *line_num* prefixed by ``//``."""
    lines = code.split("\n")
    if 1 <= line_num <= len(lines):
        lines[line_num - 1] = "//" + lines[line_num - 1]
    return "\n".join(lines)


def eval_loop(code, context, old_msg="", limit=10):
    """
    Eval the code and try to repair it whenever evaluation throws.

    After each failure the code (or the context) is patched according to the
    error message and evaluation is retried.  Retrying stops as soon as the
    code evaluates cleanly, the same error message is seen twice in a row, no
    repair applies, or ``limit`` retries have been used.  Whatever text the
    context has accumulated in ``evalCode`` is returned in every case.

    :param code: String of code to evaluate
    :param context: JavaScript context object
    :param old_msg: Error message of the previous attempt; a repeat means the
                    last repair had no effect
    :param limit: Maximum number of further retries allowed
    :return: Result of evaluating ``evalCode`` in the context
    """
    try:
        context.eval(code)
        return context.eval("evalCode")
    except ReferenceError as e:
        msg = _js_error_message(e)
        if msg == old_msg or limit <= 0:
            return context.eval("evalCode")
        if msg.find('$') > -1:
            context.eval("$ = this;")
        else:
            # try commenting out the line the error points at
            line_num = _error_line_number(msg)
            if line_num is None:
                return context.eval("evalCode")
            code = _comment_out_line(code, line_num)
    except TypeError as te:
        msg = _js_error_message(te)
        if msg == old_msg or limit <= 0:
            return context.eval("evalCode")
        line = _offending_source(msg)
        if not line:
            return context.eval("evalCode")
        if msg.find("called on null or undefined") > -1:
            # in Adobe undefined objects become app object
            sub, count = re.subn(r"=\s?.\(.*?\)", "=app", line)
            if count < 1:
                sub = re.sub("=.*", "=app", line)
        elif msg.find("undefined is not a function") > -1:
            # sub in eval as a guess
            match = re.findall(r"[\s=]?(.*?)\(", line)
            if not match or not match[0]:
                return context.eval("evalCode")
            sub = line.replace(match[0], "eval")
        elif msg.find("Cannot read property") > -1:
            # undefined becomes app
            match = re.findall(r"[=\s](.*?)\[", line)
            if not match or not match[0]:
                return context.eval("evalCode")
            sub = line.replace(match[0], "app")
        else:
            return context.eval("evalCode")
        # Literal replacement: JS source must not be interpreted as a regex
        # pattern or as a replacement template.
        code = code.replace(line, sub)
    except SyntaxError as se:
        msg = _js_error_message(se)
        if msg == old_msg or limit <= 0:
            return context.eval("evalCode")
        # try commenting out the line number with the error
        line_num = _error_line_number(msg)
        if line_num is None:
            return context.eval('evalCode')
        code = _comment_out_line(code, line_num)
    except Exception:
        return context.eval("evalCode")
    return eval_loop(code, context, msg, limit - 1)
152 |
153 |
def analyse(js, tree):
    """
    Main function called from pdfrankenstein. Analyzes javascript in order to deobfuscate the code.

    ``eval`` is replaced inside a fresh JavaScript context by a function that
    appends its argument to ``evalCode`` instead of executing it; the text
    collected that way is the result.  Any failure yields an empty string.

    :param js: String of code to analyze
    :param tree: Tree xml object to use as reference for objects called from the code.
    :return: String of deobfuscated code ('' when PyV8 is unavailable, nothing was collected, or an error occurred)
    """
    if not PyV8:
        return ''
    with PyV8.JSIsolate():
        context = PyV8.JSContext()
        context.enter()
        try:
            context.eval('evalCode = \'\';')
            context.eval('evalOverride = function (expression) { evalCode += expression; return;}')
            context.eval('eval=evalOverride')
            if tree is not None:
                create_objs(context, tree)
            ret = eval_loop(js, context)
        except Exception:
            return ''
        finally:
            # always balance the enter() above, whichever way we exit
            context.leave()
        return '' if ret is None else ret
183 |
184 |
def isJavascript(content):
    """
    Given a string, look for typical Javascript strings and try to identify whether it contains Javascript code.

    The content is accepted at once when the ``reJSscript`` pattern (if one is
    defined) matches.  Otherwise it must contain no control characters other
    than newline, carriage return, tab, form feed and NUL, no characters at or
    above 0x7f, every one of ``;``, ``(`` and ``)``, more than 15 keyword
    occurrences in total and at least 5 distinct keywords.

    :param content: A string
    :return: A boolean, True if it seems to contain Javascript code or False in the other case
    """
    JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval', 'unescape', '.replace']
    keyStrings = [';', '(', ')']
    limit = 15
    minDistinctStringsFound = 5

    content = unescapeHTMLEntities(content)
    # An empty pattern matches every string, which would classify everything
    # as Javascript; only consult the pattern when one is actually defined.
    if reJSscript and re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True
    for char in content:
        if (ord(char) < 32 and char not in ['\n', '\r', '\t', '\f', '\x00']) or ord(char) >= 127:
            return False

    results = 0
    stringsFound = []
    for string in JSStrings:
        cont = content.count(string)
        results += cont
        if cont > 0:
            stringsFound.append(string)
        elif string in keyStrings:
            # a mandatory token is missing altogether
            return False

    return results > limit and len(stringsFound) >= minDistinctStringsFound
218 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Use of PDFrankenstein and related source code is subject to the terms
2 | of the following licenses:
3 |
4 | GNU General Public License (GPL) Rights pursuant to Version 2, June 1991
5 | Government Purpose License Rights (GPLR) pursuant to DFARS 252.227.7013
6 |
7 | NO WARRANTY
8 |
9 | ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER
10 | PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY
11 | PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN
12 | "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY
13 | KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT
14 | LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE,
15 | MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE
16 | OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT,
17 | SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY
18 | TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF
19 | WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES.
20 | LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF
21 | CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON
22 | CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE
23 | DELIVERABLES UNDER THIS LICENSE.
24 |
25 | Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie
26 | Mellon University, its trustees, officers, employees, and agents from
27 | all claims or demands made against them (and any related losses,
28 | expenses, or attorney's fees) arising out of, or relating to Licensee's
29 | and/or its sub licensees' negligent use or willful misuse of or
30 | negligent conduct or willful misconduct regarding the Software,
31 | facilities, or other rights or assistance granted by Carnegie Mellon
32 | University under this License, including, but not limited to, any
33 | claims of product liability, personal injury, death, damage to
34 | property, or violation of any laws or regulations.
35 |
36 | Carnegie Mellon University Software Engineering Institute authored
37 | documents are sponsored by the U.S. Department of Defense under
38 | Contract FA8721-05-C-0003. Carnegie Mellon University retains
39 | copyrights in all material produced under this contract. The U.S.
40 | Government retains a non-exclusive, royalty-free license to publish or
41 | reproduce these documents, or allow others to do so, for U.S.
42 | Government purposes only pursuant to the copyright license under the
43 | contract clause at 252.227.7013.
44 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PDFrankenstein
2 | ================
3 | Python tool for bulk malicious PDF feature extraction.
4 |
5 | Dependencies
6 | ------------
7 | * PyV8 (and V8) (optional: only needed if you intend to use JS deobfuscation). Note: JS deobfuscation needs to be run in a safe environment, as you would treat any malware.
8 | * lxml
9 | * [scandir](https://github.com/benhoyt/scandir) (optional: module included in lib folder)
10 | * postgresql and psycopg2 (optional: if you intend to use postgresql backing storage)
11 |
12 |
13 | Usage
14 | -----
15 |
16 | ```
17 | $ pdfrankenstein.py --help
18 | ```
19 |
20 | Output to a file in delimited plain text, parses ALL files in pdf-dir/
21 | ```
22 | $ pdfrankenstein.py -o file -n fileoutput.txt ~/pdf-dir
23 | ```
24 |
25 | Output to an sqlite database
26 | ```
27 | $ pdfrankenstein.py -o sqlite3 -n pdf-db ~/pdf-dir
28 | ```
29 |
30 | Output to stdout after parsing all files listed inside file-with-pdfs
31 | ```
32 | $ pdfrankenstein.py -o stdout ~/file-with-pdfs
33 | ```
34 |
35 |
36 |
37 |
38 | pdf_in |
39 | PDF input for analysis. Can be a single PDF file or a directory of files. |
40 |
41 |
42 | -d, --debug |
43 | Print debugging messages. |
44 |
45 |
46 | -o, --out |
47 | Analysis output filename or type. Default to 'unnamed-out.*' file in CWD. Options: 'sqlite3'||'postgres'||'stdout'||[filename] |
48 |
49 |
50 | -n, --name | Name for output database. |
51 |
52 |
53 | --hasher | Specify which type of hasher to use. PeePDF | PDFMiner (default). PDFMiner option provides better parsing capabilities. |
54 |
55 |
56 | -v, --verbose | Spam the terminal, TODO. |
57 |
58 |
59 |
60 | References
61 | -------------
62 | ### Open Source PDF Tools
63 | * [PeePDF](http://eternal-todo.com/tools/peepdf-pdf-analysis-tool)
64 | * [PDFMiner](http://www.unixuser.org/~euske/python/pdfminer/index.html)
65 | * [swf mastah](https://github.com/9b/pdfxray_public/blob/master/builder/swf_mastah.py)
66 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/__init__.py
--------------------------------------------------------------------------------
/build_pdf_objects.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011-2015 by Carnegie Mellon University
2 | #
3 | # NO WARRANTY
4 | #
5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 |
14 | import re
15 |
16 | from util.str_utils import unescapeHTMLEntities
17 |
18 | # Determine the type of tag used and return its value accordingly
def get_value(elem, root):
    """
    Convert one XML element of the parsed PDF into a plain Python value.

    ``literal``/``number``/``keyword`` elements yield their unescaped text,
    ``string`` and ``stream`` elements are base64-decoded first, ``ref``
    elements are resolved through ``get_ref_object``, and ``dict``/``list``
    elements are converted recursively (entries whose value is None are
    dropped).  Any other tag yields None.

    :param elem: XML element to convert
    :param root: XML tree used to resolve references
    :return: str, dict, list, or None when the element cannot be converted
    """
    tag = elem.tag
    if tag == "literal" or tag == "number" or tag == "keyword":
        return unescapeHTMLEntities(elem.text)
    if tag == "string":
        return unescapeHTMLEntities(elem.text.decode('base64'))
    if tag == "ref":
        # find the referenced object and return its value
        obj = get_ref_object(elem.get('id'), root)
        if obj is None or len(obj) == 0:
            # dangling reference: treat like any other unsupported value
            return None
        return get_value(obj[0], root)
    if tag == "stream":
        return unescapeHTMLEntities(elem[1].text.decode('base64'))
    if tag != "dict" and tag != "list":
        # some tags not accounted for: Rect, field, xfa, Media, etc
        return None

    children = list(elem)
    size = elem.get("size")
    if size is not None:
        # never read past the children that are actually present
        children = children[:int(re.sub("%", "", size))]

    if tag == "dict":
        # build the dictionary
        ret = {}
        for entry in children:
            if len(entry) == 0:
                continue
            val = get_value(entry[0], root)
            if val is not None:
                ret[entry.tag] = val
        return ret

    # build the list
    ret = []
    for item in children:
        val = get_value(item, root)
        if val is not None:
            ret.append(val)
    return ret
54 |
55 |
56 | # find the object referenced in another object
def get_ref_object(id, root):
    """
    Find the ``object`` element that another object refers to.

    :param id: value of the ``id`` attribute to look for
    :param root: XML tree to search
    :return: the first ``object`` element whose ``id`` matches, or None when
             every object has been checked without a match
    """
    for obj in root.iterfind(".//object"):
        if obj.get("id") == id:
            return obj
    return None
63 |
64 |
65 | # Get any annotation objects in the PDF and store in the app object
def get_annots(app, root):
    """
    Collect the PDF's annotation objects into ``app['doc']['annots']``.

    Every reference listed under an ``Annots`` element is resolved and
    converted with ``get_value``.  A ``Subj`` entry, when present, is renamed
    to ``subject``.  References that cannot be resolved or converted are
    skipped so that one bad annotation does not discard the rest.

    :param app: app object being built; ``app['doc']['annots']`` must be a list
    :param root: XML tree of the pdf
    :return: None
    """
    for annot in root.iterfind(".//Annots"):
        if len(annot) == 0:
            continue
        for ref in annot[0]:
            obj = get_ref_object(ref.get("id"), root)
            if obj is None or len(obj) == 0:
                continue
            new = get_value(obj[0], root)
            if new is None:
                continue
            if isinstance(new, dict) and "Subj" in new:
                new["subject"] = new.pop("Subj")
            app['doc']['annots'].append(new)
76 |
77 |
78 | # Mimic the Adobe event object by parsing the PDF for commonly found attributes
def create_event_obj(tree):
    """
    Build a stand-in for the Adobe ``event`` object.

    For each known attribute the tree is searched for elements named after it
    with the first letter upper-cased; the value of each such element's first
    child, when it converts to something other than None, is stored under
    ``event['target']`` (a later element overwrites an earlier one).

    :param tree: XML tree of the pdf
    :return: dict of the form ``{'target': {attribute: value, ...}}``
    """
    event_attrs = ["author", "calculate", "creator", "creationDate", "delay", "dirty", "external", "filesize",
                   "keywords", "modDate", "numFields", "numPages", "numTemplates", "path", "pageNum", "producer",
                   "subject", "title", "zoom", "zoomType"]
    target = {}
    for attr in event_attrs:
        xpath = './/' + attr[0].upper() + attr[1:]
        for node in tree.iterfind(xpath):
            value = get_value(node[0], tree)
            if value is None:
                continue
            target[attr] = value
    return {"target": target}
92 |
93 |
94 | # Mimic the Adobe app object by parsing the PDF for commonly found attributes
def create_app_obj(tree):
    """
    Build a stand-in for the Adobe ``app`` object.

    Attributes found in the tree are stored on ``app['doc']``; a handful of
    reader-dependent values are then filled in with fixed defaults and the
    PDF's annotations are gathered into ``app['doc']['annots']``.

    :param tree: XML tree of the pdf
    :return: dict describing the app object
    """
    app_attrs = ["calculate", "formsVersion", "fullscreen", "language", "numPlugins", "openInPlace", "platform",
                 "toolbar", "toolbarHorizontal", "toolbarVertical"]
    app = {}
    doc = {}
    for attr in app_attrs:
        xpath = './/' + attr[0].upper() + attr[1:]
        for node in tree.iterfind(xpath):
            value = get_value(node[0], tree)
            if value is None:
                continue
            doc[attr] = value
    app['doc'] = doc

    # Many app values are dependent on the reader
    # set some common defaults here
    doc['viewerType'] = 'Reader'
    app['viewerType'] = 'Reader'
    app['viewerVersion'] = 5.0
    app['plugIns'] = [{'version': 6.0}, {'version': 7.5}, {'version': 8.7}, {'version': 9.1}, {'version': 10}]
    app.setdefault('language', "ENU")
    app.setdefault('platform', "WIN")

    # store the annotation objects so they can be retrieved later
    doc['annots'] = []
    get_annots(app, tree)
    return app
123 |
124 |
125 | # Mimic the Adobe info object by parsing the PDF for commonly found attributes
def create_info_obj(tree):
    """
    Build a stand-in for the Adobe document ``info`` object.

    For each known attribute the tree is searched for elements named after it
    with the first letter upper-cased; the value of each such element's first
    child, when it converts to something other than None, is recorded (a later
    element overwrites an earlier one).

    :param tree: XML tree of the pdf
    :return: dict mapping attribute name to value
    """
    info_attrs = ["author", "creator", "creationDate", "Date", "keywords", "modDate", "producer", "subject", "title",
                  "trapped"]
    collected = {}
    for attr in info_attrs:
        xpath = './/' + attr[0].upper() + attr[1:]
        for node in tree.iterfind(xpath):
            value = get_value(node[0], tree)
            if value is None:
                continue
            collected[attr] = value
    return collected
137 |
--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011-2015 by Carnegie Mellon University
2 | #
3 | # NO WARRANTY
4 | #
5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 |
14 | import os
15 | import sys
16 | from ConfigParser import SafeConfigParser
17 |
18 | DEFAULT_CFG = 'frankenstein.cfg'
19 |
20 |
class Config(object):
    """
    Settings read from an INI-style configuration file.

    When the file cannot be read, a file with default values is written to
    the current working directory and the process exits so the values can be
    reviewed before the next run.
    """

    def __init__(self, path='', name=''):
        """
        Load the configuration file ``name`` (or the default name) from ``path``.

        :param path: directory containing the configuration file
        :param name: file name; the default name is used when empty
        """
        cfg_file = os.path.join(path, name if name else DEFAULT_CFG)
        self.parser = SafeConfigParser()
        loaded = self.parser.read(cfg_file)
        if not loaded:
            print('No configuration file found: %s' % cfg_file)
            self.new_cfg()

    def new_cfg(self):
        """Write a default configuration file to the CWD, print it, and exit."""
        self.section_gen()
        self.section_db()
        with open(DEFAULT_CFG, 'w') as new_cfg:
            print('Creating new config file in CWD: %s' % DEFAULT_CFG)
            print('Please double check the default values before running again:')
            print(self)
            self.parser.write(new_cfg)
        sys.exit(0)

    def section_gen(self):
        """Add the ``general`` section with its default options."""
        sec = 'general'
        self.parser.add_section(sec)
        for option, value in (('#output', 'sqlite3'), ('output', 'stdout')):
            self.parser.set(sec, option, value)

    def section_db(self):
        """Add the ``database`` section with its default options."""
        sec = 'database'
        self.parser.add_section(sec)
        defaults = (
            ('path', os.getcwd()),
            ('user', 'frankenstein'),
            ('pw', 'PuttinOnTheRitz'),
            ('db', 'frankenstein.sqlite'),
        )
        for option, value in defaults:
            self.parser.set(sec, option, value)

    def setting(self, section='', option=''):
        """
        Look up an option value.

        :param section: section to look in; when empty, every section is
                        searched and the first one holding the option wins
        :param option: option name
        :return: the option's value, or None when it is not found
        """
        if section:
            if self.parser.has_option(section, option):
                return self.parser.get(section, option)
            return None
        for candidate in self.parser.sections():
            if self.parser.has_option(candidate, option):
                return self.parser.get(candidate, option)
        return None

    def __str__(self):
        """Return every section and its options, one per line."""
        chunks = []
        for sect in self.parser.sections():
            chunks.append('Section: %s\n' % sect)
            for opt in self.parser.options(sect):
                chunks.append('\t%s\t=\t%s\n' % (opt, self.parser.get(sect, opt)))
        return ''.join(chunks)
73 |
74 |
# When run as a script: load the configuration (Config creates a default file
# and exits if none can be read) and print the settings.
if __name__ == '__main__':
    cfg = Config()
    print cfg
78 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/db/__init__.py
--------------------------------------------------------------------------------
/db_mgmt.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011-2015 by Carnegie Mellon University
2 | #
3 | # NO WARRANTY
4 | #
5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 |
14 | import os
15 | import sys
16 | import sqlite3
17 |
18 | import cfg
19 |
20 |
class DBGateway(object):
    """
    Convenience wrapper around a single sqlite3 connection and cursor.

    Most methods record the text of the last failure in ``self.error`` and
    report success as a boolean; ``get_error`` returns and clears that text.
    Table and column names are placed into the SQL text as given and must
    come from trusted code; values are always passed as bound parameters.
    """

    def __init__(self, db='', path=''):
        """
        Resolve the database location and connect to it.

        :param db: database file name; empty uses the configured database,
                   ``'test'`` uses ``testdb.sqlite`` in the CWD
        :param path: directory of the database; empty uses the configured path
        """
        self.error = ''
        self.cfg = cfg.Config()

        if not db:
            self.db_dir = self.cfg.setting('database', 'path')
            self.db_name = self.cfg.setting('database', 'db')
        elif db == 'test':
            self.db_dir = os.getcwd()
            self.db_name = 'testdb.sqlite'
        else:
            if not path:
                self.db_dir = self.cfg.setting('database', 'path')
            else:
                self.db_dir = path
            self.db_name = db

        if not self.db_dir or not (os.path.isdir(self.db_dir)) or not self.db_name:
            sys.stderr.write("GError in database path or name. Check frankenstein.cfg file\n")
            sys.exit(1)

        self.db_path = os.path.join(self.db_dir, self.db_name)
        print('DBGateway connecting: %s' % self.db_path)
        self.connect(self.db_path)

    def query(self, cmd, params=''):
        """
        Execute a statement and commit.

        :param cmd: SQL text
        :param params: bound parameters, if any
        :return: True on success, False on failure (see ``get_error``)
        """
        try:
            if params:
                self.db_curr.execute(cmd, params)
            else:
                self.db_curr.execute(cmd)
            self.commit()
            return True
        except Exception as e:
            self.error = str(e)
            return False

    def queryblock(self, cmd, params='', n=30):
        """
        Execute a statement, retrying up to ``n`` times; does not commit.

        :param cmd: SQL text
        :param params: bound parameters, if any
        :param n: maximum number of attempts
        :return: True once an attempt succeeds, False if all attempts fail
        """
        done = False
        tries = 0
        while not done and tries < n:
            tries += 1
            try:
                if params:
                    self.db_curr.execute(cmd, params)
                else:
                    self.db_curr.execute(cmd)
            except Exception as e:
                self.error = str(e)
            else:
                done = True
        return done

    def get_error(self):
        """Return the last recorded error text and clear it."""
        err = self.error
        self.error = ''
        return err

    def attach(self, db_name):
        """
        Attach another database file under the schema name ``db_name``.

        :param db_name: file name of the database, also used as schema name
        """
        # NOTE(review): the file is looked up in this gateway's own database
        # directory -- confirm that is where attachable databases live.
        db_file = os.path.join(self.db_dir, db_name)
        self.db_curr.execute('ATTACH DATABASE ? AS ' + db_name, (db_file,))
        self.db_conn.commit()

    def has_table(self, table):
        """
        Count the tables named ``table``.

        :return: the count (0 or 1), or None when the query fails
        """
        cmd = "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?"
        if self.query(cmd, ('%s' % (table,),)):
            return self.db_curr.fetchone()[0]
        return None

    def create_table(self, table, **kwargs):
        """
        Create ``table`` if it does not exist.

        Either pass ``select``/``from``/``where``/``is`` to create the table
        from a query, or ``cols`` and ``primary`` to define its columns.

        :raises TypeError: when required keyword arguments are missing
        :raises sqlite3.OperationalError: when the statement is rejected
        """
        try:
            kwargs = self.format_args(**kwargs)
            cmd = 'CREATE TABLE IF NOT EXISTS ' + table
            if kwargs.get('select'):
                cmd += ' AS SELECT ' + kwargs.get('select') + ' FROM ' + kwargs.get('from') + ' WHERE ' + kwargs.get(
                    'where') + '=' + kwargs.get('is')
            else:
                cmd += ' (' + kwargs.get('cols') + ', PRIMARY KEY(' + kwargs.get('primary') + '))'
        except TypeError:
            print('Invalid arguments passed to database gateway: %s' % (kwargs,))
            raise
        try:
            self.db_curr.execute(cmd)
        except sqlite3.OperationalError as error:
            print('Invalid operation in database gateway: %s' % (error,))
            print('Occurred during cmd: %s' % (cmd,))
            raise
        self.db_conn.commit()

    def connect(self, path):
        """
        Open the database at ``path`` and create the cursor.

        On failure the error is written to stderr and no connection is set.
        """
        try:
            self.db_conn = sqlite3.connect(path, 30)
        except Exception as e:
            sys.stderr.write("DBGateway connect: %s\n" % e)
            return None
        self.db_conn.text_factory = str
        self.db_conn.row_factory = sqlite3.Row
        self.db_curr = self.db_conn.cursor()

    def commit(self):
        """Commit the current transaction."""
        self.db_conn.commit()

    def disconnect(self):
        """Commit and close the connection."""
        self.commit()
        self.db_conn.close()

    def drop_tables(self):
        """Drop every table listed in ``sqlite_master``."""
        self.db_curr.execute("SELECT name FROM sqlite_master WHERE type='table'")
        for row in self.db_curr.fetchall():
            self.drop(row[0])

    def drop(self, name):
        """Drop table ``name`` if it exists."""
        self.db_curr.execute("DROP TABLE IF EXISTS " + name)
        self.db_conn.commit()

    def format_args(self, **kwargs):
        """
        Normalise keyword arguments for statement building.

        Sequences given as ``primary`` or ``cols`` are joined with commas and
        ``subs`` is set to one ``?`` placeholder per column.

        :return: the adjusted keyword dictionary
        """
        if isinstance(kwargs.get('primary'), (tuple, list)):
            kwargs['primary'] = ', '.join(kwargs['primary'])
        if isinstance(kwargs.get('cols'), (tuple, list)):
            kwargs['subs'] = ', '.join(['?'] * len(kwargs['cols']))
            kwargs['cols'] = ', '.join(kwargs['cols'])
        else:
            kwargs['subs'] = '?'
        return kwargs

    def insert(self, table, **kwargs):
        """
        Insert or replace one row.

        :param table: table name
        :param kwargs: ``cols`` (column name or sequence of names) and
                       ``vals`` (sequence of values to bind)
        :return: True on success, False on failure (see ``get_error``)
        """
        kwargs = self.format_args(**kwargs)
        cmd = 'INSERT OR REPLACE INTO ' + table + '(' + kwargs.get('cols') + ') VALUES (' + kwargs.get('subs') + ')'
        try:
            self.db_curr.execute(cmd, kwargs.get('vals'))
            self.db_conn.commit()
        except Exception as e:
            self.error = repr(e)
            return False
        return True

    def select(self, cmd_str):
        """
        Run ``SELECT <cmd_str>`` and return the cursor holding the results.

        ``cmd_str`` is executed as SQL text and must not contain untrusted input.
        """
        cmd = 'SELECT %s' % cmd_str
        self.db_curr.execute(cmd)
        return self.db_curr

    def count(self, table, key, val):
        """
        Count the rows of ``table`` whose column ``key`` holds the text ``val``.

        :return: number of matching rows
        """
        cmd = "SELECT COUNT (*) FROM %s WHERE %s is ?" % (table, key)
        # bind the value as text, matching the quoted literal used previously
        self.db_curr.execute(cmd, ('%s' % (val,),))
        return self.db_curr.fetchone()[0]

    def update(self, dic):
        """
        Set one column on the rows matching a key value.

        :param dic: mapping with ``tbl``, ``col``, ``val``, ``key`` and ``kval``
        :return: True on success, False on failure (see ``get_error``)
        """
        cmd = "UPDATE {tbl} SET {col} = ? WHERE {key} = ?".format(**dic)
        # values are bound as text, matching the quoted literals used previously
        params = ('{0}'.format(dic['val']), '{0}'.format(dic['kval']))
        print(cmd)
        try:
            self.db_curr.execute(cmd, params)
            self.db_conn.commit()
        except Exception as e:
            self.error = str(e)
            return False
        return True

    def delete(self, *ids):
        """Not implemented; does nothing."""
        pass

    def dump(self, n=0):
        """
        Print the SQL dump of the database.

        :param n: when positive, stop once the n-th dump line is reached
                  (that line itself is not printed); 0 prints everything
        """
        print(':MEMORY DB DUMP:')
        cnt = 0
        for val in self.db_conn.iterdump():
            cnt += 1
            if 0 < n <= cnt:
                break
            print(val)
        print(':MEMORY DB DUMP END:')
196 |
197 |
--------------------------------------------------------------------------------
/huntterp.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011-2015 by Carnegie Mellon University
2 | #
3 | # NO WARRANTY
4 | #
5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 |
14 | import sys
15 | import re
16 |
17 | '''
18 | For testing run the module without arguments. (Can also be run on arbitrary files.)
19 |
20 | '''
21 |
22 |
class Test(object):
    # Fixtures for the self-test that runs when the module is started without
    # arguments: every entry of `tests` is used as the search needle and also
    # names the attribute that holds the haystack to search.
    tests = ['ftp', 'http']
    # Hex encoding of "ftp://google.com".
    ftp = "6674703a2f2f676f6f676c652e636f6d"
    # Hex encoding of "ftp://google.com" immediately followed by
    # "http://google.com" and one non-printable byte (0xf1).
    http = "6674703a2f2f676f6f676c652e636f6d687474703a2f2f676f6f676c652e636f6df1"
27 |
28 |
29 | '''
30 | This function makes no assumptions on the validity of the string values
31 | '''
32 |
33 |
def ascii2hex(string):
    """Return the lowercase hex encoding of ``string``, two digits per character.

    Every character is zero-padded to two digits, so code points below 0x10
    (e.g. a newline) no longer collapse to a single digit and shift the
    alignment of everything after them. Anything that is not a ``str``
    yields an empty string.
    """
    if not isinstance(string, str):
        return ''
    return ''.join('%02x' % ord(c) for c in string)
39 |
40 |
41 | '''
42 | Convert a string from hex to ascii. Starting from the first position, and
43 | stopping on the first invalid (not-printable) character or invalid input,
44 | whichever comes first.
45 | '''
46 |
47 |
def hex2ascii(string):
    """Decode a hex string to ASCII, stopping at the first unusable pair.

    Pairs of characters are read from the start. Decoding stops silently at
    the first pair that is incomplete, is not valid hex, or encodes a
    non-printable character. Printable means 0x20..0x7e; DEL (0x7f) used to
    slip through and is now rejected as well.
    """
    letters = []
    for idx in range(0, len(string), 2):
        try:
            code = int(string[idx] + string[idx + 1], 16)
        except (ValueError, TypeError, IndexError):
            break
        if code < 32 or code > 126:
            break
        letters.append(chr(code))
    return ''.join(letters)
63 |
64 |
def get_unicode(h2):
    """Find quoted runs of ``%uXXXX`` escapes (lowercase hex) in ``h2``.

    The pattern has two groups, so each hit is a tuple
    ``(whole run, last escape of the run)``. An empty quoted string matches
    too and yields ``('', '')``.
    """
    quoted_escapes = re.compile('[\'\"]((%u[0-9a-f]{4})*)[\'\"]')
    return quoted_escapes.findall(h2)
69 |
70 |
71 | '''
72 | Return a list of strings found in the hexstring. Should not return overlapping
73 | results. Needle is converted from ASCII to HEX on the first line.
74 | '''
75 |
76 |
def find_in_hex(needle, hexstack):
    """Locate every hex-encoded occurrence of ``needle`` in ``hexstack``.

    ``needle`` is plain ASCII and is hex-encoded before searching. Each hit
    is reported as ``(offset, text)``, where ``offset`` is the character
    position in ``hexstack`` and ``text`` is whatever decodes to printable
    ASCII from that position on. The search resumes one character after each
    hit, so hits are found at any offset.

    A needle that encodes to nothing (empty, or not a ``str``) returns no
    hits; previously it matched at offset 0 forever and never terminated.
    """
    needle = ascii2hex(needle)
    if not needle:
        return []
    results = []
    # Search with a moving start offset instead of re-slicing the haystack.
    start = hexstack.find(needle)
    while start >= 0:
        results.append((start, hex2ascii(hexstack[start:])))
        start = hexstack.find(needle, start + 1)
    return results
90 |
91 |
def verify(vals, string):
    """Re-check each ``(offset, text)`` hit against ``string`` and report it.

    A hit passes when ``string`` at ``offset`` starts with ``text`` encoded
    either as plain hex or as ``%uXXXX`` escapes (compared
    case-insensitively). The old check hex-decoded the already-decoded text,
    which normally yields an empty string, so nearly every hit passed. The
    failure line now shows the characters actually found in ``string``
    rather than a slice of the decoded text.
    """
    for val in vals:
        offset, text = val[0], val[1]
        sys.stdout.write('Verifying [%s] @ [%d]...' % (text, offset))
        # Empty encodings (e.g. empty text) can never confirm a hit.
        candidates = [enc for enc in (ascii2hex(text), ascii2uni(text)) if enc]
        matched = any(string[offset:offset + len(enc)].lower() == enc
                      for enc in candidates)
        if matched:
            sys.stdout.write('pass\n')
        else:
            sys.stdout.write('fail. string[%d]==[%s]...\n' % (offset, string[offset:offset + 32]))
99 |
100 |
101 | '''
102 | Return a list of urls found in the unicode string. Should not return overlapping
103 | results. Needle is converted from ASCII to UNICODE on the first line.
104 | '''
105 |
106 |
def find_unicode(needle, haystack):
    """Locate every ``%uXXXX``-encoded occurrence of ``needle`` in ``haystack``.

    ``needle`` is plain ASCII and is converted to ``%u`` escapes before
    searching. Each hit is reported as ``(offset, text)``; ``text`` is the
    decoded content from the hit up to the nearest following single or
    double quote.

    Fixes over the previous version:
    * a needle that encodes to nothing (fewer than two characters) returns
      no hits instead of looping forever;
    * when only a single quote follows the hit, it is used as the terminator
      (it used to be ignored);
    * when no quote follows, the rest of the haystack is decoded instead of
      an empty or truncated slice.
    """
    needle = ascii2uni(needle)
    if not needle:
        return []
    results = []
    start = haystack.find(needle)
    while start >= 0:
        quotes = [pos for pos in (haystack.find('"', start), haystack.find('\'', start))
                  if pos >= 0]
        end = min(quotes) if quotes else len(haystack)
        results.append((start, uni2ascii(haystack[start:end])))
        start = haystack.find(needle, start + 1)
    return results
129 |
130 |
131 | '''
132 | Convert a string from ascii to unicode
133 | '''
134 |
135 |
def ascii2uni(string):
    """Encode ``string`` as ``%uXXXX`` escapes, two characters per escape.

    The hex form of the input is cut into groups of two bytes and each group
    is emitted with its bytes swapped. Leftover hex digits that do not fill a
    whole group (an odd trailing character) are dropped.
    """
    hex_digits = ascii2hex(string)
    byte_pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', hex_digits)
    return ''.join('%u' + second + first for (first, second) in byte_pairs)
143 |
144 |
145 | '''
146 | Convert a string from unicode to ascii
147 | '''
148 |
149 |
def uni2ascii(string):
    """Decode ``%uXXXX`` escapes back to printable ASCII.

    The ``%u`` markers are removed, the remaining lowercase hex is read in
    groups of two bytes, each group's bytes are swapped, and the result is
    decoded with ``hex2ascii``.
    """
    stripped = re.sub("%u", "", string)
    byte_pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', stripped)
    swapped = ''.join(second + first for (first, second) in byte_pairs)
    return hex2ascii(swapped)
157 |
158 |
159 | '''
160 | Find h1 in h2 | h1 == ASCII && h2 == HEX
161 | '''
162 |
163 |
def main(h1, h2):
    """Search ``h2`` for ``h1`` in hex and ``%u`` form and print the findings.

    ``h1`` is the ASCII needle, ``h2`` the encoded haystack. A haystack that
    is not a ``str`` is reported and skipped. Hits are printed, counted and,
    when there are any, passed to ``verify``.
    """
    if not isinstance(h2, str):
        print('Invalid input: %s' % (type(h2),))
        print(str(h2))
        return

    print('Searching for "%s" in "%s"...' % (h1, h2[:32]))

    urls = find_in_hex(h1, h2)
    urls.extend(find_unicode(h1, h2))
    print(urls)
    print('Found: %d occurrences' % len(urls))
    if urls:
        verify(urls, h2)
178 |
179 |
if __name__ == "__main__":
    # Usage: huntterp.py needle haystack.txt
    # Without both arguments the built-in Test fixtures are searched instead.
    try:
        needle = sys.argv[1]
        # Read inside a with-block so the file handle is always closed
        # (it used to stay open until interpreter exit).
        with open(sys.argv[2], 'r') as fin:
            contents = fin.read()
    except IndexError:
        print('Invalid or no arguments. Usage: huntterp.py needle haystack.txt')
        print('Beginning tests')
        t = Test()
        for needle in t.tests:
            haystack = getattr(t, needle)
            main(needle, haystack)
    except IOError as e:
        print(e)
    else:
        main(needle, contents)
195 |
--------------------------------------------------------------------------------
/jobs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/jobs/__init__.py
--------------------------------------------------------------------------------
/pdfminer/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2004-2016 Yusuke Shinyama
2 |
3 | Permission is hereby granted, free of charge, to any person
4 | obtaining a copy of this software and associated documentation
5 | files (the "Software"), to deal in the Software without
6 | restriction, including without limitation the rights to use,
7 | copy, modify, merge, publish, distribute, sublicense, and/or
8 | sell copies of the Software, and to permit persons to whom the
9 | Software is furnished to do so, subject to the following
10 | conditions:
11 |
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
16 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
17 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
18 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
19 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/pdfminer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/pdfminer/__init__.py
--------------------------------------------------------------------------------
/pdfminer/arcfour.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ Python implementation of Arcfour encryption algorithm.
4 |
5 | This code is in the public domain.
6 |
7 | """
8 |
9 |
10 | ## Arcfour
11 | ##
class Arcfour(object):
    """Arcfour (RC4) stream cipher.

    The key schedule runs once in the constructor. ``process`` then consumes
    the keystream and keeps its position between calls, so data may be fed
    in several chunks. Encryption and decryption are the same operation.

    Key and data are byte strings (``str`` on Python 2, ``bytes`` on
    Python 3); the result is a byte string of the same length.

    >>> from binascii import hexlify
    >>> hexlify(Arcfour(b'Key').process(b'Plaintext')) == b'bbf316e8d940af0ad3'
    True
    >>> hexlify(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
    True
    >>> hexlify(Arcfour(b'Secret').process(b'Attack at dawn')) == b'45a01f645fc35b383552544b9bf5'
    True
    """

    def __init__(self, key):
        # bytearray gives integer byte values on both Python 2 and 3.
        key = bytearray(key)
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """XOR ``data`` with the next keystream bytes and return the result."""
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect into a bytearray: linear time instead of repeated string +=.
        out = bytearray()
        for c in bytearray(data):
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i] + s[j]) % 256]
            out.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(out)

    encrypt = decrypt = process
47 |
48 |
# Alias so the class can also be reached as ``new``.
new = Arcfour

# test
if __name__ == '__main__':
    # Running the module directly executes the doctests above.
    from doctest import testmod

    testmod()
56 |
--------------------------------------------------------------------------------
/pdfminer/ascii85.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
4 |
5 | This code is in the public domain.
6 |
7 | """
8 |
9 | import re
10 | import struct
11 |
12 |
13 | # ascii85decode(data)
def ascii85decode(data):
    """
    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    Characters outside ``!``..``u``, ``z`` and ``~`` are skipped. ``z``
    expands to four zero bytes and ``~`` ends the data. ``data`` is a byte
    string (``str`` on Python 2, ``bytes`` on Python 3) and so is the result.

    The sample string is taken from:
    http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') == b'Man is distinguished'
    True
    >>> ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
    True
    """
    n = b = 0
    # Pieces are joined once at the end instead of repeated string +=.
    chunks = []
    # bytearray yields integer byte values on both Python 2 and 3.
    for c in bytearray(data):
        if 33 <= c <= 117:  # '!' .. 'u'
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                chunks.append(struct.pack('>L', b))
                n = b = 0
        elif c == 122:  # 'z'
            assert n == 0
            chunks.append(b'\0\0\0\0')
        elif c == 126:  # '~'
            if n:
                # Pad the partial group with the highest digit, keep n - 1 bytes.
                for _ in range(5 - n):
                    b = b * 85 + 84
                chunks.append(struct.pack('>L', b)[:n - 1])
            break
    return b''.join(chunks)
51 |
# asciihexdecode(data)
# Byte patterns: they match byte strings on both Python 2 and Python 3.
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    ``data`` is a byte string and so is the result.
    NOTE(review): this implementation does not raise on other characters and
    does not stop at the first ``>``; every adjacent digit pair in the input
    is decoded.

    >>> asciihexdecode(b'61 62 2e6364   65') == b'ab.cde'
    True
    >>> asciihexdecode(b'61 62 2e6364 657>') == b'ab.cdep'
    True
    >>> asciihexdecode(b'7>') == b'p'
    True
    """
    out = bytearray(int(hx, 16) for hx in hex_re.findall(data))
    m = trail_re.search(data)
    if m:
        # A lone trailing digit is treated as if followed by '0'.
        out.append(int(m.group(1) + b'0', 16))
    return bytes(out)
80 |
81 |
if __name__ == '__main__':
    # Running the module directly executes the doctests above.
    from doctest import testmod

    testmod()
86 |
--------------------------------------------------------------------------------
/pdfminer/lzw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | try:
5 | from cStringIO import StringIO
6 | except ImportError:
7 | from StringIO import StringIO
8 |
9 |
class CorruptDataError(Exception):
    """Raised by ``LZWDecoder.feed`` when a code cannot be valid LZW data."""
12 |
13 |
14 | ## LZWDecoder
15 | ##
16 | class LZWDecoder(object):
17 | debug = 0
18 |
19 | def __init__(self, fp):
20 | self.fp = fp
21 | self.buff = 0
22 | self.bpos = 8
23 | self.nbits = 9
24 | self.table = None
25 | self.prevbuf = None
26 | return
27 |
28 | def readbits(self, bits):
29 | v = 0
30 | while 1:
31 | # the number of remaining bits we can get from the current buffer.
32 | r = 8 - self.bpos
33 | if bits <= r:
34 | # |-----8-bits-----|
35 | # |-bpos-|-bits-| |
36 | # | |----r----|
37 | v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
38 | self.bpos += bits
39 | break
40 | else:
41 | # |-----8-bits-----|
42 | # |-bpos-|---bits----...
43 | # | |----r----|
44 | v = (v << r) | (self.buff & ((1 << r) - 1))
45 | bits -= r
46 | x = self.fp.read(1)
47 | if not x:
48 | raise EOFError
49 | self.buff = ord(x)
50 | self.bpos = 0
51 | return v
52 |
53 | def feed(self, code):
54 | x = ''
55 | if code == 256:
56 | self.table = [chr(c) for c in xrange(256)] # 0-255
57 | self.table.append(None) # 256
58 | self.table.append(None) # 257
59 | self.prevbuf = ''
60 | self.nbits = 9
61 | elif code == 257:
62 | pass
63 | elif not self.prevbuf:
64 | x = self.prevbuf = self.table[code]
65 | else:
66 | if code < len(self.table):
67 | x = self.table[code]
68 | self.table.append(self.prevbuf + x[:1])
69 | elif code == len(self.table):
70 | self.table.append(self.prevbuf + self.prevbuf[:1])
71 | x = self.table[code]
72 | else:
73 | raise CorruptDataError
74 | l = len(self.table)
75 | if l == 511:
76 | self.nbits = 10
77 | elif l == 1023:
78 | self.nbits = 11
79 | elif l == 2047:
80 | self.nbits = 12
81 | self.prevbuf = x
82 | return x
83 |
84 | def run(self):
85 | while 1:
86 | try:
87 | code = self.readbits(self.nbits)
88 | except EOFError:
89 | break
90 | try:
91 | x = self.feed(code)
92 | except CorruptDataError:
93 | # just ignore corrupt data and stop yielding there
94 | break
95 | yield x
96 | if self.debug:
97 | print >> sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
98 | (self.nbits, code, x, self.table[258:]))
99 | return
100 |
101 |
102 | # lzwdecode
def lzwdecode(data):
    """Decode an LZW-compressed byte string in one call.

    >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
    '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    """
    decoder = LZWDecoder(StringIO(data))
    return ''.join(decoder.run())
110 |
111 |
if __name__ == '__main__':
    # Running the module directly executes the doctests above.
    from doctest import testmod

    testmod()
116 |
--------------------------------------------------------------------------------
/pdfminer/pdfparser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | try:
5 | from cStringIO import StringIO
6 | except ImportError:
7 | from StringIO import StringIO
8 | from psparser import PSStackParser
9 | from psparser import PSSyntaxError, PSEOF
10 | from psparser import KWD, STRICT
11 | from pdftypes import PDFException
12 | from pdftypes import PDFStream, PDFObjRef
13 | from pdftypes import int_value
14 | from pdftypes import dict_value
15 |
16 |
17 | ## Exceptions
18 | ##
class PDFSyntaxError(PDFException):
    """Raised by the parser in STRICT mode for malformed stream objects."""
21 |
22 |
23 | ## PDFParser
24 | ##
class PDFParser(PSStackParser):
    """
    PDFParser fetch PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    def __init__(self, fp, dbg=False):
        # fp and dbg are passed straight through to PSStackParser.
        PSStackParser.__init__(self, fp, dbg)
        # Document used to build object references; set via set_document().
        self.doc = None
        # When true, a stream's /Length entry is ignored (see do_keyword).
        self.fallback = False
        return

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc
        return

    # Keyword tokens compared by identity in do_keyword.
    KEYWORD_R = KWD('R')
    KEYWORD_NULL = KWD('null')
    KEYWORD_ENDOBJ = KWD('endobj')
    KEYWORD_STREAM = KWD('stream')
    KEYWORD_XREF = KWD('xref')
    KEYWORD_STARTXREF = KWD('startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords."""

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            # The single entry from pop(1) is handed to add_results.
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            # The four entries from pop(4) are handed to add_results.
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                # A syntax error here drops the reference silently.
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            # Declared length; stays 0 in fallback mode or when /Length is
            # missing and STRICT is off.
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # Stream data starts right after the 'stream' line.
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos + objlen)
            # Keep appending lines until 'endstream' shows up, so data beyond
            # the declared length is kept too.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            if 2 <= self.debug:
                print >> sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                                     (pos, objlen, dic, data[:10])
            # NOTE(review): self.doc must be set by now; with doc left as
            # None the attribute access below raises AttributeError.
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return
132 |
133 |
134 | ## PDFStreamParser
135 | ##
class PDFStreamParser(PDFParser):
    """
    Parser for PDF content streams: the per-page data holding the
    rendering instructions. It needs a reference to the PDF document
    because a content stream may contain indirect references to other
    objects of the same document.
    """

    def __init__(self, data):
        stream = StringIO(data)
        PDFParser.__init__(self, stream)
        return

    def flush(self):
        leftovers = self.popall()
        self.add_results(*leftovers)
        return

    def do_keyword(self, pos, token):
        if token is not self.KEYWORD_R:
            # others
            self.push((pos, token))
            return
        # reference to indirect object
        try:
            ((_, objid), (_, genno)) = self.pop(2)
            ref = PDFObjRef(self.doc, int(objid), int(genno))
            self.push((pos, ref))
        except PSSyntaxError:
            pass
        return
167 |
--------------------------------------------------------------------------------
/pdfminer/pdftypes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import zlib
3 |
4 | from lzw import lzwdecode
5 | from ascii85 import ascii85decode, asciihexdecode
6 | from runlength import rldecode
7 | from ccitt import ccittfaxdecode
8 | from psparser import PSException, PSObject
9 | from psparser import LIT, STRICT
10 | from utils import apply_png_predictor, isnumber
11 |
# Literal for the /Crypt filter, which PDFStream.decode rejects as unsupported.
LITERAL_CRYPT = LIT('Crypt')

# Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
# Each tuple pairs a filter's full name with its abbreviated form, so a
# membership test accepts either spelling.
LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
# NOTE(review): defined, but PDFStream.decode in this module has no branch for it.
LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
22 |
23 |
24 | ## PDF Objects
25 | ##
class PDFObject(PSObject):
    """Base class of the PDF object types defined in this module."""


class PDFException(PSException):
    """Base class of the PDF errors defined in this module."""


class PDFTypeError(PDFException):
    """Raised in STRICT mode when a value does not have the required type."""


class PDFValueError(PDFException):
    """Raised in STRICT mode for an invalid value, such as object id 0."""


class PDFObjectNotFound(PDFException):
    """Signals that a referenced object is missing from the document."""


class PDFNotImplementedError(PDFException):
    """Raised for stream filters or predictors that are not supported."""
48 |
49 |
50 | ## PDFObjRef
51 | ##
class PDFObjRef(PDFObject):
    """Indirect reference to an object of a document, resolved on demand."""

    def __init__(self, doc, objid, _):
        """Remember ``doc`` and ``objid``; the third argument is ignored.

        Object id 0 raises PDFValueError in STRICT mode.
        """
        if objid == 0:
            if STRICT:
                raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid
        # self.genno = genno  # Never used.
        return

    def __repr__(self):
        # The format string was empty, so repr() raised
        # "TypeError: not all arguments converted".
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self, default=None):
        """Return the referenced object, or ``default`` when it is not found."""
        try:
            return self.doc.getobj(self.objid)
        except PDFObjectNotFound:
            return default
70 |
71 |
72 | # resolve
def resolve1(x, default=None):
    """Follow indirect references until a non-reference value is reached.

    Only the top level is resolved: a returned array or dictionary may
    still contain indirect objects.
    """
    value = x
    while True:
        if not isinstance(value, PDFObjRef):
            return value
        value = value.resolve(default=default)
82 |
83 |
def resolve_all(x, default=None):
    """Resolve ``x`` and every reference nested inside it.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    This walks the whole structure and may be slow.
    """
    while isinstance(x, PDFObjRef):
        x = x.resolve(default=default)
    if isinstance(x, list):
        return [resolve_all(item, default=default) for item in x]
    if isinstance(x, dict):
        # Snapshot the keys; only existing values are replaced.
        for key in list(x.keys()):
            x[key] = resolve_all(x[key], default=default)
    return x
98 |
99 |
def decipher_all(decipher, objid, genno, x):
    """Recursively deciphers the given object.

    Strings are passed through ``decipher(objid, genno, value)``. Lists are
    rebuilt as new lists, dictionaries are updated in place and returned,
    and any other value is returned unchanged.

    Uses a key snapshot instead of the Python 2-only ``iteritems`` and also
    treats ``bytes`` as a string (the same type as ``str`` on Python 2).
    """
    if isinstance(x, (str, bytes)):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        x = [decipher_all(decipher, objid, genno, v) for v in x]
    elif isinstance(x, dict):
        for k in list(x.keys()):
            x[k] = decipher_all(decipher, objid, genno, x[k])
    return x
111 |
112 |
113 | # Type checking
def int_value(x):
    """Resolve ``x`` and return it if it is an int.

    Otherwise raise PDFTypeError in STRICT mode, else return 0.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
121 |
122 |
def float_value(x):
    """Resolve ``x`` and return it if it is a float.

    Otherwise raise PDFTypeError in STRICT mode, else return 0.0.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
130 |
131 |
def num_value(x):
    """Resolve ``x`` and return it if ``isnumber`` accepts it.

    Otherwise raise PDFTypeError in STRICT mode, else return 0.
    """
    value = resolve1(x)
    if isnumber(value):
        return value
    if STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
139 |
140 |
def str_value(x):
    """Resolve ``x`` and return it if it is a str.

    Otherwise raise PDFTypeError in STRICT mode, else return ''.
    """
    value = resolve1(x)
    if isinstance(value, str):
        return value
    if STRICT:
        raise PDFTypeError('String required: %r' % value)
    return ''
148 |
149 |
def list_value(x):
    """Resolve ``x`` and return it if it is a list or tuple.

    Otherwise raise PDFTypeError in STRICT mode, else return [].
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
157 |
158 |
def dict_value(x):
    """Resolve ``x`` and return it if it is a dict.

    Otherwise raise PDFTypeError in STRICT mode, else return {}.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if STRICT:
        raise PDFTypeError('Dict required: %r' % value)
    return {}
166 |
167 |
def stream_value(x):
    """Resolve ``x`` and return it if it is a PDFStream.

    Otherwise raise PDFTypeError in STRICT mode, else return an empty
    PDFStream.
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, '')
175 |
176 |
177 | ## PDFStream type
178 | ##
class PDFStream(PDFObject):
    """A stream object: an attribute dictionary plus its data.

    The raw (still encoded) data is kept until ``decode`` runs. After that
    the decoded data is available and the raw data is dropped.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        assert isinstance(attrs, dict)
        self.attrs = attrs
        self.rawdata = rawdata
        # Optional callable (objid, genno, data) -> data used by decode().
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # Both format strings were empty, so repr() raised
        # "TypeError: not all arguments converted".
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first of ``names`` present in the attributes."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the stream's filters as a list (empty when there are none)."""
        filters = self.get_any(('F', 'Filter'))
        if not filters:
            return []
        if isinstance(filters, list):
            return filters
        return [filters]

    def decode(self):
        """Decipher the raw data and run it through the filters, in order.

        The result is stored in ``self.data`` and ``self.rawdata`` is
        cleared. Raises PDFNotImplementedError for unsupported filters or
        predictors, and PDFException in STRICT mode for invalid zlib data.
        """
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        filters = self.get_filters()
        if not filters:
            self.data = data
            self.rawdata = None
            return
        for f in filters:
            if isinstance(f, PDFObjRef):
                # Resolved filters are appended and handled later in this loop.
                filters += f.resolve()
                continue
            params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if STRICT:
                        raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                    data = ''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    bitspercomponent = int_value(params.get('BitsPerComponent', 8))
                    data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
                else:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        """Return the decoded data, decoding on first use."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the raw data (None once the stream has been decoded)."""
        return self.rawdata
290 |
--------------------------------------------------------------------------------
/pdfminer/runlength.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # RunLength decoder (Adobe version) implementation based on PDF Reference
4 | # version 1.4 section 3.3.4.
5 | #
6 | # * public domain *
7 | #
8 |
def rldecode(data):
    """
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    ``data`` is a byte string and so is the result. Truncated input is
    decoded as far as possible: a short literal run yields the bytes that
    are there, and a replicate run missing its data byte ends decoding (it
    used to raise IndexError).

    >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
    >>> rldecode(s) == b'1234567777777abcde'
    True
    """
    # bytearray gives integer byte values on both Python 2 and 3.
    buf = bytearray(data)
    size = len(buf)
    decoded = bytearray()
    i = 0
    while i < size:
        length = buf[i]
        if length == 128:
            break
        if length < 128:
            # Literal run: copy the next length + 1 bytes.
            decoded += buf[i + 1:i + 2 + length]
            i += length + 2
        else:
            # Replicate run: repeat the next byte 257 - length times.
            if i + 1 >= size:
                break
            decoded += buf[i + 1:i + 2] * (257 - length)
            i += 2
    return bytes(decoded)
44 |
45 |
if __name__ == '__main__':
    # Running the module directly executes the doctests above.
    from doctest import testmod

    testmod()
50 |
--------------------------------------------------------------------------------
/pdfminer/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Miscellaneous Routines.
4 | """
5 | import struct
6 | from sys import maxint as INF
7 |
8 |
9 | ## PNG Predictor
10 | ##
def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
    """Undo PNG row filtering on ``data`` and return the raw rows.

    Each input row is one filter-type byte followed by
    ``colors * columns`` data bytes. Supported filter types are 0 (None),
    1 (Sub), 2 (Up), 3 (Average) and 4 (Paeth). ``pred`` is accepted for
    interface compatibility; the filter is chosen per row.

    Fixes over the previous version:
    * the row above the first row has the full row width, so rows with more
      than one color component are no longer cut to ``columns`` bytes;
    * Sub uses the byte one whole pixel to the left, not the previous byte;
    * Average adds floor((left + up) / 2) to the filtered byte;
    * Paeth rows are decoded instead of raising ValueError.

    Raises ValueError for ``bitspercomponent != 8`` and for an unknown
    filter type. ``data`` is a byte string and so is the result.
    """
    if bitspercomponent != 8:
        # unsupported
        raise ValueError(bitspercomponent)
    nbytes = colors * columns * bitspercomponent // 8
    # Distance in bytes to the same component of the pixel to the left.
    bpp = max(1, colors * bitspercomponent // 8)
    # bytearray gives integer byte values on both Python 2 and 3.
    data = bytearray(data)
    out = bytearray()
    prior = bytearray(nbytes)  # the row above the first row is all zeros
    for start in range(0, len(data), nbytes + 1):
        ft = data[start]
        if ft > 4:
            # unsupported
            raise ValueError('unsupported PNG filter type: %d' % ft)
        filtered = data[start + 1:start + 1 + nbytes]
        recon = bytearray(len(filtered))
        for x, value in enumerate(filtered):
            left = recon[x - bpp] if x >= bpp else 0
            up = prior[x]
            if ft == 0:
                predicted = 0
            elif ft == 1:
                predicted = left
            elif ft == 2:
                predicted = up
            elif ft == 3:
                predicted = (left + up) // 2
            else:
                # Paeth: pick whichever of left, up, upper-left is closest
                # to left + up - upper_left (ties resolved in that order).
                upper_left = prior[x - bpp] if x >= bpp else 0
                estimate = left + up - upper_left
                d_left = abs(estimate - left)
                d_up = abs(estimate - up)
                d_upper_left = abs(estimate - upper_left)
                if d_left <= d_up and d_left <= d_upper_left:
                    predicted = left
                elif d_up <= d_upper_left:
                    predicted = up
                else:
                    predicted = upper_left
            recon[x] = (value + predicted) & 255
        out += recon
        prior = recon
    return bytes(out)
50 |
## Matrix operations
##
# Matrices are 6-tuples (a, b, c, d, e, f); this one leaves points unchanged.
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)


def mult_matrix(m1, m0):
    """Returns the multiplication of two matrices.

    Both arguments are 6-tuples ``(a, b, c, d, e, f)``. They are unpacked in
    the body because tuple parameters in the signature are Python 2-only
    syntax; positional callers are unaffected.
    """
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
            a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
            a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)
61 |
62 |
def translate_matrix(m, v):
    """Translates a matrix by (x, y).

    ``m`` is a 6-tuple ``(a, b, c, d, e, f)`` and ``v`` a pair ``(x, y)``.
    They are unpacked in the body because tuple parameters in the signature
    are Python 2-only syntax; positional callers are unaffected.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, x * a + y * c + e, x * b + y * d + f)
66 |
67 |
def apply_matrix_pt(m, v):
    """Applies a matrix to a point.

    ``m`` is a 6-tuple ``(a, b, c, d, e, f)`` and ``v`` a point ``(x, y)``.
    They are unpacked in the body because tuple parameters in the signature
    are Python 2-only syntax; positional callers are unaffected.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a * x + c * y + e, b * x + d * y + f)
71 |
72 |
def apply_matrix_norm(m, v):
    """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))

    ``m`` is a 6-tuple ``(a, b, c, d, e, f)`` and ``v`` a pair ``(p, q)``.
    They are unpacked in the body because tuple parameters in the signature
    are Python 2-only syntax; positional callers are unaffected.
    """
    (a, b, c, d, e, f) = m
    (p, q) = v
    return (a * p + c * q, b * p + d * q)
76 |
77 |
78 | ## Utility functions
79 | ##
80 |
81 | # isnumber
def isnumber(x):
    """Return True when ``x`` is an int, long (Python 2) or float.

    ``bool`` counts as a number because it is a subclass of ``int``.
    """
    try:
        number_types = (int, long, float)
    except NameError:
        # Python 3: int is the only integer type.
        number_types = (int, float)
    return isinstance(x, number_types)
84 |
85 |
86 | # uniq
def uniq(objs):
    """Yield each distinct element once, in order of first appearance."""
    seen = set()
    for candidate in objs:
        if candidate not in seen:
            seen.add(candidate)
            yield candidate
96 |
97 |
98 | # csort
def csort(objs, key=lambda x: x):
    """Sort by ``key``, breaking ties by position in ``objs``.

    For equal objects the position of the last occurrence is used.
    """
    position = {}
    for index, obj in enumerate(objs):
        position[obj] = index

    def sort_key(obj):
        return (key(obj), position[obj])

    return sorted(objs, key=sort_key)
103 |
104 |
105 | # fsplit
def fsplit(pred, objs):
    """Partition ``objs`` into (matching, non-matching) lists, keeping order."""
    rejected, accepted = [], []
    buckets = (rejected, accepted)
    for obj in objs:
        buckets[1 if pred(obj) else 0].append(obj)
    return (accepted, rejected)
116 |
117 |
118 | # drange
def drange(v0, v1, d):
    """Returns a discrete range.

    The result runs from ``int(v0) // d`` up to, but not including,
    ``int(v1 + d) // d``. ``v0`` must be smaller than ``v1``. Returns an
    ``xrange`` on Python 2, as before, and a ``range`` where ``xrange``
    does not exist.
    """
    assert v0 < v1
    try:
        make_range = xrange
    except NameError:
        make_range = range
    return make_range(int(v0) // d, int(v1 + d) // d)
123 |
124 |
125 | # get_bound
def get_bound(pts):
    """Return the smallest rectangle (x0, y0, x1, y1) covering all points.

    With no points the initial bounds (INF, INF, -INF, -INF) are returned.
    """
    x0 = y0 = INF
    x1 = y1 = -INF
    for (x, y) in pts:
        if x < x0:
            x0 = x
        if y < y0:
            y0 = y
        if x > x1:
            x1 = x
        if y > y1:
            y1 = y
    return (x0, y0, x1, y1)
135 |
136 |
137 | # pick
def pick(seq, func, maxobj=None):
    """Return the element of ``seq`` with the highest ``func`` score.

    On ties the earliest element wins. ``maxobj`` is returned when ``seq``
    is empty.
    """
    best_score = None
    best = maxobj
    for candidate in seq:
        score = func(candidate)
        if best_score is not None and not (best_score < score):
            continue
        best_score = score
        best = candidate
    return best
146 |
147 |
148 | # choplist
def choplist(n, seq):
    """Yield consecutive ``n``-element tuples from ``seq``.

    A trailing group with fewer than ``n`` elements is dropped.
    """
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
158 |
159 |
160 | # nunpack
def nunpack(s, default=0):
    """Unpacks 1 to 4 byte integers (big endian).

    An empty ``s`` returns ``default``; more than four bytes raises
    TypeError. The three-byte case pads with a byte-string zero, so it
    works for ``bytes`` input as well as Python 2 ``str``.
    """
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return struct.unpack('>H', s)[0]
    elif l == 3:
        return struct.unpack('>L', b'\x00' + s)[0]
    elif l == 4:
        return struct.unpack('>L', s)[0]
    else:
        raise TypeError('invalid length: %d' % l)
176 |
177 | # decode_text
# Lookup table used by decode_text(): a 256-character unicode string where
# PDFDocEncoding[b] is the character for byte value b.
# Indices 0x7f, 0x9f and 0xad map to U+0000 here (presumably because those
# codes are undefined in PDFDocEncoding -- TODO confirm against the PDF spec).
# NOTE(review): index 0x16 maps to 0x0017, the same value as index 0x17;
# confirm against the spec table before "fixing" it.
PDFDocEncoding = ''.join(unichr(x) for x in (
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
212 |
213 |
def decode_text(s):
    """Decode a PDF text string to unicode.

    Strings starting with the byte order mark FE FF are decoded as UTF-16BE
    (undecodable sequences are ignored); every other string is mapped byte by
    byte through the PDFDocEncoding table.
    """
    if not s.startswith('\xfe\xff'):
        return ''.join(PDFDocEncoding[ord(c)] for c in s)
    return unicode(s[2:], 'utf-16be', 'ignore')
220 |
221 |
222 | # enc
def enc(x, codec='ascii'):
    """Encodes a string for SGML/XML/HTML.

    The markup characters & > < " are replaced by their named entities, then
    the text is encoded with codec; characters the codec cannot represent
    become numeric character references.
    """
    # '&' has to be escaped first so the entities added afterwards are not
    # escaped a second time.
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
227 |
228 |
def bbox2str(bbox):
    """Format a bounding box (x0, y0, x1, y1) as 'x0,y0,x1,y1' with 3 decimals."""
    # Unpack inside the body: tuple parameters in the signature are a
    # Python 2-only syntax (removed by PEP 3113).
    (x0, y0, x1, y1) = bbox
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
231 |
232 |
def matrix2str(matrix):
    """Format a 6-element matrix (a, b, c, d, e, f) as '[a,b,c,d, (e,f)]' with 2 decimals."""
    # Unpack inside the body: tuple parameters in the signature are a
    # Python 2-only syntax (removed by PEP 3113).
    (a, b, c, d, e, f) = matrix
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
235 |
236 |
237 | ## Plane
238 | ##
239 | ## A set-like data structure for objects placed on a plane.
240 | ## Can efficiently find objects in a certain rectangular area.
241 | ## It buckets the objects into a grid of gridsize-sized cells,
242 | ## keyed by the (x, y) index of every cell an object overlaps.
243 | ##
class Plane(object):
    """A set-like container for objects placed on a 2D plane.

    Every stored object must be hashable and expose x0, y0, x1 and y1
    attributes (its bounding box).  Objects are additionally indexed in a
    grid of gridsize-sized cells so that find() only has to inspect the
    cells overlapping the queried rectangle.
    """

    def __init__(self, bbox, gridsize=50):
        """bbox is the (x0, y0, x1, y1) extent of the plane itself."""
        self._objs = set()
        # Maps a grid cell (x, y) to the list of objects overlapping it.
        self._grid = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox

    def __repr__(self):
        # The list is wrapped in a 1-tuple so it is always treated as the
        # single %r argument.
        return '<Plane objs=%r>' % (list(self),)

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the grid cells (x, y) covered by bbox, clipped to the plane.

        Nothing is yielded when bbox does not overlap the plane.
        """
        (x0, y0, x1, y1) = bbox
        if (x1 <= self.x0 or self.x1 <= x0 or
                y1 <= self.y0 or self.y1 <= y0):
            return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for y in drange(y0, y1, self.gridsize):
            for x in drange(x0, x1, self.gridsize):
                yield (x, y)

    # extend(objs)
    def extend(self, objs):
        """Add every object of objs."""
        for obj in objs:
            self.add(obj)

    # add(obj): place an object.
    def add(self, obj):
        """Register obj in every grid cell its bounding box overlaps.

        An object lying completely outside the plane is still remembered as
        a member, but it is not indexed and therefore never found by find().
        """
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(k, []).append(obj)
        self._objs.add(obj)

    # remove(obj): displace an object.
    def remove(self, obj):
        """Remove obj from the grid and from the member set.

        Raises KeyError when obj is not a member.
        """
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                # The cell is missing or does not hold obj: nothing to undo.
                pass
        self._objs.remove(obj)

    # find(): finds objects that are in a certain area.
    def find(self, bbox):
        """Yield each indexed object whose bounding box overlaps bbox once."""
        (x0, y0, x1, y1) = bbox
        done = set()
        for k in self._getrange((x0, y0, x1, y1)):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    continue
                done.add(obj)
                # Sharing a cell is not enough: check the real overlap.
                if (obj.x1 <= x0 or x1 <= obj.x0 or
                        obj.y1 <= y0 or y1 <= obj.y0):
                    continue
                yield obj
319 |
--------------------------------------------------------------------------------
/peepdf/AUTHORS:
--------------------------------------------------------------------------------
1 | Jose Miguel Esparza
2 | http://eternal-todo.com
3 | http://twitter.com/EternalTodo
--------------------------------------------------------------------------------
/peepdf/CHANGELOG:
--------------------------------------------------------------------------------
1 | -----------------------------------------------
2 | peepdf Black Hat Vegas (0.2 r156), 2012-07-25
3 | -----------------------------------------------
4 |
5 | * New features:
6 |
7 | - Added "grinch mode" execution to avoid colorized output
8 | - Added more colors in the interactive console output: warning, errors, important information...
9 | - Changed sctest command, now it's implemented with pylibemu
10 | - Added decrypt command to parse password protected documents
11 | - Modified analyseJS() to extract JS code from XDP packets and unescape HTML entities
12 | - Added function unescapeHTMLEntities() to unescape HTML entities
13 | - Added AES decryption support (128 and 256 bits).
14 | - Added hashes in objects information (info $object_id)
15 | - Added support for decoding CCITTFaxDecode filters (Thanks to @binjo)
16 |
17 | * Fixes:
18 |
19 | - Fix to show decrypt errors
20 | - Fixed silly bug with /EncryptMetadata element
21 | - Added missing binary file operations
22 | - Fixed Issue 5: Resolved false positives when monitoring some elements like actions, events, etc. (Thanks to @hiddenillusion)
23 | - Bug in PDFStream.decode and PDFStream.encode, dealing with an array of filter parameters (Thanks to @binjo)
24 |
25 |
26 | -----------------------------------------------
27 | peepdf Black Hat Arsenal (0.1 r92), 2012-03-16
28 | -----------------------------------------------
29 |
30 | * New features:
31 |
32 | - Added support for more parameters in Flate/LZW decode (stream filters)
33 | - Encryption algorithm now showing in document information
34 | - Added XML output and SHA hash to file information
35 | - Improved unescape function to support mixed escaped formats (eg. "%u6734%34%u8790")
36 | - Added xor and xor_search commands
37 | - Added easy way of redirect console output (>, >>, $>, $>>)
38 | - Added xor function by Evan Fosmark
39 | - Added detection of CVE-2011-4369 (/PRC)
40 | - Added hash command (Thanks to @binjo for code and comments)
41 | - Added js_beautify command
42 | - Update function added
43 | - Added new vulns and showing information related to non JS vulns
44 | - Added escape sequence in the limited output
45 | - Added ascii85 decode from pdfminer to improve code and avoid bugs (Thanks to Brandon Dixon!)
46 | - Added lzwdecode from pdfminer to improve code and avoid bugs
47 |
48 | * Fixes:
49 |
50 | - Update process rewritten, now based on hashing of files
51 | - Silly bug in computeUserPass function (Thanks to Christian Martorella!)
52 | - Added binary mode in files operations
53 | - Recursion bug in update function
54 | - Minor bug in do_embed function
55 | - Bug to support encoding following PDF specifications (Issue 3 by czchen)
56 | - Bug to handle negative numbers in P element
57 | - Bug in the xref table when creating a new PDF (Issue 2)
58 | - Silly bug when parsing filter parameters
59 | - Bug related to updating objects and statistics of PDF files
60 | - Some bugs related to offsets calculation
61 | - Fixed "replace" function in PDFObjectStream
62 | - Fix in asciiHexDecode filter function
63 |
64 |
65 | -----------------------------------------------
66 | peepdf 0.1 r15, 2011-05-05
67 | -----------------------------------------------
68 |
69 | - Initial Release
70 |
71 |
--------------------------------------------------------------------------------
/peepdf/JSAnalysis.py:
--------------------------------------------------------------------------------
1 | #
2 | # peepdf is a tool to analyse and modify PDF files
3 | # http://peepdf.eternal-todo.com
4 | # By Jose Miguel Esparza
5 | #
6 | # Copyright (C) 2011-2014 Jose Miguel Esparza
7 | #
8 | # This file is part of peepdf.
9 | #
10 | # peepdf is free software: you can redistribute it and/or modify
11 | # it under the terms of the GNU General Public License as published by
12 | # the Free Software Foundation, either version 3 of the License, or
13 | # (at your option) any later version.
14 | #
15 | # peepdf is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 | #
20 | # You should have received a copy of the GNU General Public License
21 | # along with peepdf. If not, see <http://www.gnu.org/licenses/>.
22 | #
23 |
24 | '''
25 | This module contains some functions to analyse Javascript code inside the PDF file
26 | '''
27 |
28 | import sys
29 | import re
30 | import os
31 | import traceback
32 |
33 | import jsbeautifier
34 | from PDFUtils import unescapeHTMLEntities, escapeString
35 |
# PyV8 is an optional dependency: without it JS_MODULE is False and
# analyseJS() skips the execution stage.
try:
    import PyV8

    JS_MODULE = True

    class Global(PyV8.JSClass):
        '''
            Global object of the Javascript context. analyseJS() rebinds the Javascript "eval" to evalOverride, so the code handed to eval is recorded in evalCode instead of being executed.
        '''
        evalCode = ''

        def evalOverride(self, expression):
            self.evalCode += '\n\n// New evaluated code\n' + expression
            return

except Exception:
    # Any failure to load PyV8 (missing module, broken native library, ...)
    # just disables Javascript execution. KeyboardInterrupt and SystemExit
    # are deliberately not caught.
    JS_MODULE = False
50 |
# File (relative to the working directory) where unexpected tracebacks are appended
errorsFile = 'errors.txt'
newLine = os.linesep
# Captures the body of <script contentType="application/x-javascript"> elements
# (Javascript embedded in XDP/XML). The pattern must contain exactly one
# group: callers treat every re.findall() item as a piece of code.
# NOTE(review): this literal had been reduced to an empty string, which
# matches at every position; the pattern below restores the intended
# behaviour -- confirm it against the upstream peepdf sources.
reJSscript = r'<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
preDefinedCode = 'var app = this;'
55 |
56 |
def analyseJS(code, context=None, manualAnalysis=False):
    '''
        Hooks the eval function and collects the successive stages of the Javascript code

        The code is unescaped (HTML entities), reduced to the content of its script elements when there are any, and beautified. When PyV8 is available and manualAnalysis is False the code is then executed with eval hooked; the code passed to eval becomes the next stage, until no new code shows up or an error happens.

        @param code: The Javascript code (string)
        @param context: An existing PyV8 context to reuse, or None to create a new one
        @param manualAnalysis: If True the code is not executed, only beautified
        @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where
                JSCode is a list with the several stages Javascript code,
                unescapedBytes is a list with the parameters of unescape functions (always empty at the moment: this function does not extract them),
                urlsFound is a list with the URLs found in the unescaped bytes (always empty at the moment, same reason),
                errors is a list of errors,
                context is the context of execution of the Javascript code.
    '''
    errors = []
    JSCode = []
    unescapedBytes = []
    urlsFound = []

    try:
        code = unescapeHTMLEntities(code)
        scriptElements = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE)
        if scriptElements:
            code = ''.join(scriptElement + '\n\n' for scriptElement in scriptElements)
        code = jsbeautifier.beautify(code)
        JSCode.append(code)

        if code is not None and JS_MODULE and not manualAnalysis:
            if context is None:
                context = PyV8.JSContext(Global())
            context.enter()
            # Hooking the eval function
            context.eval('eval=evalOverride')
            while True:
                try:
                    context.eval(code)
                    evalCode = context.eval('evalCode')
                    evalCode = jsbeautifier.beautify(evalCode)
                    if evalCode == '' or evalCode == code:
                        # Nothing new was passed to eval: last stage reached
                        break
                    code = evalCode
                    JSCode.append(code)
                except Exception:
                    error = str(sys.exc_info()[1])
                    with open('jserror.log', 'ab') as jsErrorLog:
                        jsErrorLog.write(error + newLine)
                    errors.append(error)
                    break
    except Exception:
        # Best effort: keep the traceback if the log file can be written, but
        # never fail the analysis because of the logging itself.
        try:
            with open(errorsFile, 'a') as errorsLog:
                traceback.print_exc(file=errorsLog)
        except (IOError, OSError):
            pass
        errors.append('Unexpected error in the JSAnalysis module!!')

    # Drop empty stages. A new list is built because removing items from a
    # list while iterating over it skips elements.
    JSCode = [js for js in JSCode if js is not None and js != '']
    return [JSCode, unescapedBytes, urlsFound, errors, context]
144 |
145 |
def getVarContent(jsCode, varContent):
    '''
        Resolves the content of a variable to the literal text it is built from, following concatenations and variable references (e.g. "a = 'x' + b; b = 'y';")

        @param jsCode: The Javascript code (string)
        @param varContent: The content of the variable (string)
        @return: A string with real value of the variable
    '''
    # Whitespace is meaningless for the resolution
    for blank in ('\n', '\r', '\t', ' '):
        varContent = varContent.replace(blank, '')
    resolved = []
    for operand in varContent.split('+'):
        if re.match('["\'].*?["\']', operand, re.DOTALL):
            # Quoted literal: keep what is between the quotes
            resolved.append(operand[1:-1])
            continue
        # Otherwise treat the operand as a variable name and resolve its first assignment
        assignments = re.findall(escapeString(operand) + r'\s*?=\s*?(.*?)[,;]', jsCode, re.DOTALL)
        if assignments != []:
            resolved.append(getVarContent(jsCode, assignments[0]))
    return ''.join(resolved)
169 |
170 |
def isJavascript(content):
    '''
        Given a string this method looks for typical Javascript strings and tries to identify if the string contains Javascript code or not.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval']
    keyStrings = [';', '(', ')']
    allowedControlChars = ['\n', '\r', '\t', '\f', '\x00']
    limit = 15
    minDistinctStringsFound = 5

    # Content with script elements is considered Javascript right away
    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE) != []:
        return True

    # Any other control character or non-ASCII character rules the content out
    for char in content:
        charCode = ord(char)
        if charCode >= 127 or (charCode < 32 and char not in allowedControlChars):
            return False

    totalHits = 0
    distinctHits = []
    for string in JSStrings:
        hits = content.count(string)
        if hits == 0 and string in keyStrings:
            # Every key string must be present
            return False
        totalHits += hits
        if hits > 0 and string not in distinctHits:
            distinctHits.append(string)

    return totalHits > limit and len(distinctHits) >= minDistinctStringsFound
205 |
206 |
def searchObfuscatedFunctions(jsCode, function, _seen=None):
    '''
        Search for obfuscated functions in the Javascript code

        The calls to the given function are collected and then, recursively, the calls to every variable the function is assigned to ("a = eval; a(code);"). Each name is processed only once, so cyclic aliases ("a = b; b = a;") cannot cause an endless recursion.

        @param jsCode: The Javascript code (string)
        @param function: The function name to look for (string). It is inserted in a regular expression without being escaped
        @param _seen: Internal, the set of names already processed. Callers should not pass it
        @return: List with obfuscated functions information [functionName,functionCall,containsReturns], where functionCall is a tuple (call statement, call arguments)
    '''
    obfuscatedFunctionsInfo = []
    if jsCode is None:
        return obfuscatedFunctionsInfo
    if _seen is None:
        _seen = set()
    _seen.add(function)

    # Direct calls: function(arguments);
    calls = re.findall(r'\W(' + function + r'\s{0,5}?\((.*?)\)\s{0,5}?;)', jsCode, re.DOTALL)
    for call in calls:
        containsReturns = re.search('return', call[1], re.IGNORECASE) is not None
        obfuscatedFunctionsInfo.append([function, call, containsReturns])

    # Aliases: name = function;
    aliases = re.findall(r'\s*?((\w*?)\s*?=\s*?' + function + r')\s*?;', jsCode, re.DOTALL)
    for alias in aliases:
        aliasName = alias[1]
        # An empty name is not an alias (it would match every call) and an
        # already processed name would recurse forever
        if aliasName == '' or aliasName in _seen:
            continue
        obfuscatedFunctionsInfo += searchObfuscatedFunctions(jsCode, aliasName, _seen)
    return obfuscatedFunctionsInfo
229 |
230 |
def unescape(escapedBytes, unicode=True):
    '''
        This method unescapes the given string

        Supported escapes are %uXXXX and \\uXXXX (written out as two bytes, low byte first) and %XX. The string is split on "\\" when it contains "\\u" and on "%" otherwise. The text in front of the first escape marker is kept as it is, and a marker not followed by a valid escape is written as "%" followed by the text.

        @param escapedBytes: A string to unescape
        @param unicode: If True a null byte is added after every %XX byte and every literal character (not after %uXXXX pairs)
        @return: A tuple (status,statusContent), where statusContent is an unescaped string in case status = 0 or an error in case status = -1
    '''
    # TODO: modify to accept a list of escaped strings?
    unicodePadding = '\x00' if unicode else ''
    try:
        hasBackslashU = '\\u' in escapedBytes.lower()
        if not hasBackslashU and '%' not in escapedBytes:
            # Nothing to unescape
            return (0, escapedBytes)
        delimiter = '\\' if hasBackslashU else '%'
        pieces = []
        for i, splitByte in enumerate(escapedBytes.split(delimiter)):
            if splitByte == '':
                continue
            # The first chunk is only non-empty when the string does not start
            # with the marker: it is literal text, never an escape sequence
            afterMarker = i != 0
            if afterMarker and len(splitByte) > 4 and re.match('u[0-9a-f]{4}', splitByte[:5], re.IGNORECASE):
                # uXXXX: low byte first
                pieces.append(chr(int(splitByte[3:5], 16)) + chr(int(splitByte[1:3], 16)))
                literal = splitByte[5:]
            elif afterMarker and len(splitByte) > 1 and re.match('[0-9a-f]{2}', splitByte[:2], re.IGNORECASE):
                pieces.append(chr(int(splitByte[:2], 16)) + unicodePadding)
                literal = splitByte[2:]
            else:
                if afterMarker:
                    pieces.append('%' + unicodePadding)
                literal = splitByte
            pieces.extend(char + unicodePadding for char in literal)
        unescapedBytes = ''.join(pieces)
    except Exception:
        return (-1, 'Error while unescaping the bytes')
    return (0, unescapedBytes)
276 |
--------------------------------------------------------------------------------
/peepdf/PDFCrypto.py:
--------------------------------------------------------------------------------
1 | #
2 | # peepdf is a tool to analyse and modify PDF files
3 | # http://peepdf.eternal-todo.com
4 | # By Jose Miguel Esparza
5 | #
6 | # Copyright (C) 2011-2014 Jose Miguel Esparza
7 | #
8 | # This file is part of peepdf.
9 | #
10 | # peepdf is free software: you can redistribute it and/or modify
11 | # it under the terms of the GNU General Public License as published by
12 | # the Free Software Foundation, either version 3 of the License, or
13 | # (at your option) any later version.
14 | #
15 | # peepdf is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 | #
20 | # You should have received a copy of the GNU General Public License
21 | # along with peepdf. If not, see <http://www.gnu.org/licenses/>.
22 | #
23 |
24 | '''
25 | Module to manage cryptographic operations with PDF files
26 | '''
27 |
28 | import hashlib
29 | import struct
30 | import random
31 | import warnings
32 | from itertools import cycle, izip
33 |
34 | import aes
35 |
# Silences every warning for the whole process as soon as this module is imported
warnings.filterwarnings("ignore")

# 32-byte string: the functions below append a prefix of it to passwords
# shorter than 32 bytes, and computeUserPass() also uses it as RC4/MD5 input
paddingString = '\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A'
39 |
40 |
def computeEncryptionKey(password, dictOwnerPass, dictUserPass, dictOE, dictUE, fileID, pElement, dictKeyLength=128,
                         revision=3, encryptMetadata=False, passwordType=None):
    '''
        Compute an encryption key to encrypt/decrypt the PDF file

        @param password: The password entered by the user
        @param dictOwnerPass: The owner password from the standard security handler dictionary
        @param dictUserPass: The user password from the standard security handler dictionary
        @param dictOE: The owner encrypted string from the standard security handler dictionary
        @param dictUE: The user encrypted string from the standard security handler dictionary
        @param fileID: The /ID element in the trailer dictionary of the PDF file
        @param pElement: The /P element of the Encryption dictionary
        @param dictKeyLength: The length of the key (in bits)
        @param revision: The algorithm revision
        @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not
        @param passwordType: It specifies the given password type. It can be 'USER', 'OWNER' or None. Only used with revision 5, where it must be 'USER' or 'OWNER'
        @return: A tuple (status,statusContent), where statusContent is the encryption key in case status = 0 or an error message in case status = -1
    '''
    if revision != 5:
        # Floor division: the result is used as a slice index
        keyLength = dictKeyLength // 8
        lenPass = len(password)
        if lenPass > 32:
            password = password[:32]
        elif lenPass < 32:
            password += paddingString[:32 - lenPass]
        # NOTE(review): the struct.pack() call was garbled in the sources and
        # has been rebuilt following the PDF specification (/P hashed as a
        # 4-byte little-endian integer, then the file identifier) -- confirm
        # against the upstream peepdf sources.
        md5input = password + dictOwnerPass + struct.pack('<i', int(pElement)) + fileID
        if revision > 3 and not encryptMetadata:
            md5input += b'\xFF' * 4
        key = hashlib.md5(md5input).digest()
        if revision > 2:
            for _ in range(50):
                key = hashlib.md5(key[:keyLength]).digest()
            key = key[:keyLength]
        elif revision == 2:
            key = key[:5]
        return (0, key)

    # Revision 5: the key is the result of decrypting /UE or /OE
    if passwordType == 'USER':
        password = password.encode('utf-8')[:127]
        kSalt = dictUserPass[40:48]
        intermediateKey = hashlib.sha256(password + kSalt).digest()
        ret = aes.decryptData(b'\0' * 16 + dictUE, intermediateKey)
    elif passwordType == 'OWNER':
        password = password.encode('utf-8')[:127]
        kSalt = dictOwnerPass[40:48]
        intermediateKey = hashlib.sha256(password + kSalt + dictUserPass).digest()
        ret = aes.decryptData(b'\0' * 16 + dictOE, intermediateKey)
    else:
        # Without this branch "ret" would not be defined
        return (-1, 'Unknown password type for revision 5: %s' % str(passwordType))
    return ret
91 |
92 |
# NOTE(review): the end of computeObjectKey() and the header (signature and
# docstring) of computeOwnerPass() were lost in the sources. Both have been
# rebuilt from the PDF specification and from the part of the code that
# survived -- confirm names, defaults and the truncation rule against the
# upstream peepdf sources.
def computeObjectKey(id, generationNum, encryptionKey, keyLengthBytes, algorithm='RC4'):
    '''
        Compute the key necessary to encrypt each object, depending on the id and generation number. Only necessary with /V < 5.

        @param id: The object id
        @param generationNum: The generation number of the object
        @param encryptionKey: The encryption key
        @param keyLengthBytes: The length of the encryption key in bytes
        @param algorithm: The algorithm used in the encryption/decryption process
        @return: The computed key in string format
    '''
    # Low-order 3 bytes of the object id and 2 bytes of the generation number
    key = encryptionKey + struct.pack('<i', id)[:3] + struct.pack('<i', generationNum)[:2]
    if algorithm == 'AES':
        key += b'sAlT'
    key = hashlib.md5(key).digest()
    # The object key has keyLengthBytes + 5 bytes, 16 at most
    return key[:min(keyLengthBytes + 5, 16)]


def computeOwnerPass(ownerPassString, userPassString, keyLength=128, revision=3):
    '''
        Compute the owner password (/O) of the PDF file

        @param ownerPassString: The owner password entered by the user
        @param userPassString: The user password entered by the user
        @param keyLength: The length of the key (in bits)
        @param revision: The algorithm revision
        @return: The computed owner password in string format
    '''
    # TODO: revision 5
    keyLength = keyLength // 8
    lenPass = len(ownerPassString)
    if lenPass > 32:
        ownerPassString = ownerPassString[:32]
    elif lenPass < 32:
        ownerPassString += paddingString[:32 - lenPass]
    rc4Key = hashlib.md5(ownerPassString).digest()
    if revision > 2:
        for _ in range(50):
            rc4Key = hashlib.md5(rc4Key).digest()
    rc4Key = rc4Key[:keyLength]
    lenPass = len(userPassString)
    if lenPass > 32:
        userPassString = userPassString[:32]
    elif lenPass < 32:
        userPassString += paddingString[:32 - lenPass]
    ownerPass = RC4(userPassString, rc4Key)
    if revision > 2:
        # 19 more RC4 passes, each key being the original one XORed with the pass number
        for counter in range(1, 20):
            newKey = ''.join(chr(ord(keyChar) ^ counter) for keyChar in rc4Key)
            ownerPass = RC4(ownerPass, newKey)
    return ownerPass
155 |
156 |
def computeUserPass(userPassString, dictO, fileID, pElement, keyLength=128, revision=3, encryptMetadata=False):
    '''
        Compute the user password (/U) of the PDF file

        @param userPassString: The user password entered by the user
        @param dictO: The owner password (/O), used to compute the encryption key
        @param fileID: The /ID element in the trailer dictionary of the PDF file
        @param pElement: The /P element of the /Encryption dictionary
        @param keyLength: The length of the key
        @param revision: The algorithm revision
        @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not
        @return: A tuple (status,statusContent), where statusContent is the computed password in case status = 0 or an error message in case status = -1
    '''
    # TODO: revision 5
    ret = computeEncryptionKey(userPassString, dictO, '', '', '', fileID, pElement, keyLength, revision,
                               encryptMetadata)
    if ret[0] == -1:
        return ret
    rc4Key = ret[1]
    userPass = ''
    if revision == 2:
        userPass = RC4(paddingString, rc4Key)
    elif revision > 2:
        userPass = RC4(hashlib.md5(paddingString + fileID).digest(), rc4Key)
        # 19 more RC4 passes, each key being the original one XORed with the pass number
        for counter in range(1, 20):
            newKey = ''
            for keyChar in rc4Key:
                newKey += chr(ord(keyChar) ^ counter)
            userPass = RC4(userPass, newKey)
        # 16 bytes of random padding complete the value; isUserPass() only
        # compares the first 16 bytes for these revisions
        for _ in range(16):
            userPass += chr(random.randint(32, 255))
    return (0, userPass)
199 |
200 |
def isUserPass(password, computedUserPass, dictU, revision):
    '''
        Checks if the given password is the User password of the file

        With revision 5 the SHA-256 hash of the password plus the validation salt (bytes 32-40 of /U) is compared with the first 32 bytes of /U. With revisions 3 and 4 the first 16 bytes of the computed user password and of /U are compared, and with lower revisions the whole values.

        @param password: The given password or the empty password (only used with revision 5)
        @param computedUserPass: The computed user password of the file
        @param dictU: The /U element of the /Encrypt dictionary
        @param revision: The number of revision of the standard security handler
        @return The boolean telling if the given password is the user password or not (None for any revision not covered above)
    '''
    if revision == 5:
        return hashlib.sha256(password + dictU[32:40]).digest() == dictU[:32]
    if revision in (3, 4):
        return computedUserPass[:16] == dictU[:16]
    if revision < 3:
        return computedUserPass == dictU
    return None
228 |
229 |
def isOwnerPass(password, dictO, dictU, computedUserPass, keyLength, revision):
    '''
        Checks if the given password is the owner password of the file

        @param password: The given password or the empty password
        @param dictO: The /O element of the /Encrypt dictionary
        @param dictU: The /U element of the /Encrypt dictionary
        @param computedUserPass: The computed user password of the file
        @param keyLength: The length of the key (in bits)
        @param revision: The algorithm revision
        @return The boolean telling if the given password is the owner password or not
    '''
    if revision == 5:
        vSalt = dictO[32:40]
        inputHash = hashlib.sha256(password + vSalt + dictU).digest()
        return inputHash == dictO[:32]

    # Floor division: the result is used as a slice index
    keyLength = keyLength // 8
    lenPass = len(password)
    if lenPass > 32:
        password = password[:32]
    elif lenPass < 32:
        password += paddingString[:32 - lenPass]
    rc4Key = hashlib.md5(password).digest()
    if revision > 2:
        for _ in range(50):
            rc4Key = hashlib.md5(rc4Key).digest()
    rc4Key = rc4Key[:keyLength]
    if revision == 2:
        userPass = RC4(dictO, rc4Key)
    elif revision > 2:
        # Undo the 20 RC4 passes applied to /O, from the last key to the first
        for counter in range(19, -1, -1):
            newKey = ''.join(chr(ord(keyChar) ^ counter) for keyChar in rc4Key)
            dictO = RC4(dictO, newKey)
        userPass = dictO
    else:
        # Is it possible??
        userPass = ''
    # NOTE(review): for revisions below 5 isUserPass() ignores its first
    # argument, so the user password recovered above does not influence the
    # result -- confirm whether it should be used to recompute
    # computedUserPass first.
    return isUserPass(userPass, computedUserPass, dictU, revision)
278 |
279 |
def RC4(data, key):
    '''
        RC4 implementation

        @param data: Bytes to be encrypted/decrypted
        @param key: Key used for the algorithm (it cannot be empty)
        @return: The encrypted/decrypted bytes
    '''
    keyLength = len(key)

    # Key scheduling: permute the 256 states according to the key
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + ord(key[i % keyLength])) % 256
        state[i], state[j] = state[j], state[i]

    # Keystream generation, XORed with the data
    i = j = 0
    output = []
    for char in data:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        keystreamByte = state[(state[i] + state[j]) % 256]
        output.append(chr(ord(char) ^ keystreamByte))
    return ''.join(output)
315 |
316 |
317 | '''
318 | Author: Evan Fosmark (http://www.evanfosmark.com/2008/06/xor-encryption-with-python/)
319 | '''
320 |
321 |
def xor(bytes, key):
    '''
        Simple XOR implementation

        @param bytes: Bytes to be xored
        @param key: Key used for the operation, it's cycled. With an empty key the result is an empty string
        @return: The xored bytes
    '''
    keyStream = cycle(key)
    # zip() stops as soon as the data is exhausted, so the endless key stream
    # is fine; unlike izip() it exists in Python 2 and Python 3
    return ''.join(chr(ord(x) ^ ord(y)) for (x, y) in zip(bytes, keyStream))
332 |
--------------------------------------------------------------------------------
/peepdf/README:
--------------------------------------------------------------------------------
1 | ** Home page **
2 |
3 | http://peepdf.eternal-todo.com
4 | http://twitter.com/peepdf
5 |
6 |
7 | ** Dependencies **
8 |
9 | - In order to analyse Javascript code "PyV8" is needed:
10 |
11 | http://code.google.com/p/pyv8/
12 |
13 |
14 | - The "sctest" command is a wrapper of "sctest" (libemu). Besides libemu pylibemu is used and must be installed:
15 |
16 | http://libemu.carnivore.it (latest version from git repository, Sourceforge package is outdated)
17 | https://github.com/buffer/pylibemu
18 |
19 |
20 | - To support XML output "lxml" is needed:
21 |
22 | http://lxml.de/installation.html
23 |
24 |
25 | - Included modules: lzw, colorama, jsbeautifier, ccitt, pythonaes (Thanks to all the developers!!)
26 |
27 |
28 |
29 | ** Installation **
30 |
31 | No installation is needed apart from the dependencies listed above, just execute it!
32 |
33 |
34 |
35 | ** Execution **
36 |
37 | There are two important options when peepdf is executed:
38 |
39 | -f: Ignores the parsing errors. Analysing malicious files probably leads to parsing errors, so this parameter should be set.
40 | -l: Sets the loose mode, so does not search for the endobj tag because it's not obligatory. Helpful with malformed files.
41 |
42 |
43 | * Simple execution
44 |
45 | Shows the statistics of the file after being decoded/decrypted and analysed:
46 |
47 | python peepdf.py [options] pdf_file
48 |
49 |
50 | * Interactive console
51 |
52 | Executes the interactive console to let you play with the PDF file:
53 |
54 | python peepdf.py -i [options] pdf_file
55 |
56 | If no PDF file is specified it's possible to use the decode/encode/js*/sctest commands and create a new PDF file:
57 |
58 | python peepdf.py -i
59 |
60 |
61 | * Batch execution
62 |
63 | It's possible to use a commands file to specify the commands to be executed in the batch mode. This type of execution is good to automatise analysis of several files:
64 |
65 | python peepdf.py [options] -s commands_file pdf_file
66 |
67 |
68 |
69 | ** Updating **
70 |
71 | Just type this and you will be updated to the latest version from the repository:
72 |
73 | python peepdf.py -u
74 |
75 |
76 |
77 | ** Some hints **
78 |
79 | If the information shown when a PDF file is parsed is not enough to know if it's harmful or not, the following commands can help to do it:
80 |
81 | * tree
82 |
83 | Shows the tree graph of the file or specified version. Here we can see suspicious elements.
84 |
85 |
86 | * offsets
87 |
88 | Shows the physical map of the file or the specified version of the document. This is helpful to see unusually big objects or big spaces between objects.
89 |
90 |
91 | * search
92 |
93 | Search the specified string or hexadecimal string in the objects (decoded and encrypted streams included).
94 |
95 |
96 | * object/rawobject
97 |
98 | Shows the (raw) content of the object.
99 |
100 |
101 | * stream/rawstream
102 |
103 | Shows the (raw) content of the stream.
104 |
105 |
106 | * The rest of commands, of course
107 |
108 | > help
109 |
110 |
111 |
112 | ** Bugs **
113 |
114 | Send me bugs and comments, please!! ;) You can do it via mail (jesparza AT eternal-todo.com) or through Google Code (http://peepdf.googlecode.com).
115 |
116 | Thanks!!
117 |
--------------------------------------------------------------------------------
/peepdf/TODO:
--------------------------------------------------------------------------------
1 | Pending tasks:
2 |
3 | - User manual
4 | - Documentation of methods in PDFCore.py
5 | - Add the rest of supported stream filters (better testing of the existing ones)
6 | - Automatic analysis of embedded PDF files
7 | - Add AES to the encryption implementation
8 | - Improve the automatic Javascript analysis, getting code from other parts of the documents (getAnnots, etc)
9 | - GUI
10 | - ActionScript analysis?
--------------------------------------------------------------------------------
/peepdf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/peepdf/__init__.py
--------------------------------------------------------------------------------
/peepdf/aes.py:
--------------------------------------------------------------------------------
1 | #
2 | # peepdf is a tool to analyse and modify PDF files
3 | # http://peepdf.eternal-todo.com
4 | # By Jose Miguel Esparza
5 | #
6 | # Copyright (C) 2012-2014 Jose Miguel Esparza
7 | #
8 | # This file is part of peepdf.
9 | #
10 | # peepdf is free software: you can redistribute it and/or modify
11 | # it under the terms of the GNU General Public License as published by
12 | # the Free Software Foundation, either version 3 of the License, or
13 | # (at your option) any later version.
14 | #
15 | # peepdf is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 | #
20 | # You should have received a copy of the GNU General Public License
21 | # along with peepdf. If not, see <http://www.gnu.org/licenses/>.
22 | #
23 |
24 | """
25 | Created from the demonstration of the pythonaes package.
26 |
27 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
28 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
29 | """
30 |
31 | from aespython import key_expander, aes_cipher, cbc_mode
32 |
33 |
def decryptData(data, password=None, keyLength=None, mode='CBC'):
    '''
        Decrypt AES data whose first 16 bytes are the IV (method added for peepdf)

        @param data: The IV (16 bytes) followed by the ciphertext. Trailing bytes that do not fill a whole 16-byte block are ignored
        @param password: The AES key as a string, one character per key byte
        @param keyLength: The key size in bits (128, 192 or 256). When None it is derived from the password length
        @param mode: The block cipher mode. Only 'CBC' is implemented
        @return: A tuple (status,statusContent), where statusContent is the decrypted data in case status = 0 or an error message in case status = -1
        @raise RuntimeError: Raised by KeyExpander.expand when the password length does not match keyLength
    '''
    # Report bad arguments through the (-1, message) convention instead of
    # failing later with a TypeError/NameError/LookupError.
    if password is None:
        return (-1, 'Missing password in AES decryption process')
    if mode != 'CBC':
        return (-1, 'Unsupported mode in AES decryption process')
    if keyLength is None:
        keyLength = len(password) * 8
    if keyLength not in (128, 192, 256):
        return (-1, 'Bad length key in AES decryption process')

    # Real lists are needed (len() is taken on them), so do not rely on map()
    iv = [ord(char) for char in data[:16]]
    key = [ord(char) for char in password]
    data = data[16:]
    if len(data) % 16 != 0:
        # Drop the incomplete last block
        data = data[:-(len(data) % 16)]
    keyExpander = key_expander.KeyExpander(keyLength)
    expandedKey = keyExpander.expand(key)
    aesCipher = aes_cipher.AESCipher(expandedKey)
    aesMode = cbc_mode.CBCMode(aesCipher, 16)
    aesMode.set_iv(iv)
    decryptedChars = []
    for i in range(0, len(data), 16):
        ciphertext = [ord(char) for char in data[i:i + 16]]
        decryptedBytes = aesMode.decrypt_block(ciphertext)
        decryptedChars.extend(chr(byte) for byte in decryptedBytes)
    return (0, ''.join(decryptedChars))
61 |
--------------------------------------------------------------------------------
/peepdf/aespython/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/peepdf/aespython/__init__.py
--------------------------------------------------------------------------------
/peepdf/aespython/aes_cipher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | AES Block Cipher.
4 |
5 | Performs single block cipher decipher operations on a 16 element list of integers.
6 | These integers represent 8 bit bytes in a 128 bit block.
7 | The result of cipher or decipher operations is the transformed 16 element list of integers.
8 |
9 | Running this file as __main__ will result in a self-test of the algorithm.
10 |
11 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
12 |
13 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
14 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
15 | """
16 | __author__ = "Adam Newman"
17 |
18 | # Normally use relative import. In test mode use local import.
19 | try:
20 | from .aes_tables import sbox, i_sbox, galI, galNI
21 | except ValueError:
22 | from aes_tables import sbox, i_sbox, galI, galNI
# The cipher/decipher routines are generated as Python source text and compiled
# with exec in the AESCipher class body. The strings built here are the
# fragments substituted into the `ciph` template defined right after them.

# Comma-separated names of the 16 state variables: "s0,s1,...,sf".
ups = ",".join("s%x" % x for x in range(16))
# Matching names of the 16 round-key variables: "r0,r1,...,rf".
upr = ups.replace("s", "r")
# Forward column-mixing expressions, one per state byte, each XORed with its
# round-key byte. Only tables g0 and g1 end up being looked up: the g2/g3
# placeholders are renamed to "g" here and removed by the loop below, leaving
# the bare state variable (presumably the multiply-by-1 terms of MixColumns;
# confirm against the layout of galNI in aes_tables).
mix = ",".join(",".join(
    ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]^r%x" % (i + (i[0] + (0, 3, 2, 1)[j],))).format(j & 3, j + 1 & 3,
                                                                                             j + 2 & 3, j + 3 & 3) for j
    in (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15))).replace("g2",
                                                                                                      "g").replace("g3",
                                                                                                                   "g")
# Rewrite every "g[sX]" as plain "sX": drop the "g[" prefix and the closing "]".
i = mix.find("g[")
while i != -1:
    mix = mix[:i] + mix[i + 2:i + 4] + mix[i + 5:]
    i = mix.find("g[", i)
# Inverse column-mixing expressions: all four tables g0..g3 are kept and no
# round-key byte is folded in.
imix = ",".join(",".join(
    ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]" % i).format(j & 3, j + 1 & 3, j + 2 & 3, j + 3 & 3) for j in
    (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15)))
# State-variable orderings used when indexing the S-box: position x reads
# variable (x * 5) & 15 in the forward direction and (x * -3) & 15 in the
# inverse one (presumably the ShiftRows / InvShiftRows permutations).
csl = ["s%x" % (x * 5 & 15) for x in range(16)]
csr = ["s%x" % (x * -3 & 15) for x in range(16)]
# S-box lookups in forward order, without a round key.
box = ",".join("s[%s]" % i for i in csl)
# S-box lookups in inverse order, each XORed with round-key byte rN.
ibox = ",".join("s[%s]^r%x" % i for i in zip(csr, range(16)))
# S-box lookups in forward order, each XORed with round-key byte rN.
xor = ",".join("s[%s]^r%x" % i for i in zip(csl, range(16)))
# Statements "s0^=r0;s1^=r1;..." that XOR the round key into the state in place.
xori = ";".join("s%x^=r%x" % (i, i) for i in range(16))
44 | ciph = """def decipher_block(f,s):
45 | g0,g1,g2,g3=galNI;ek=f._expanded_key;S=s+[0]*(16-len(s));s=sbox;R=ek[:16];X
46 | for f in range(!16):R=ek[f:f+16];S=B;S=M
47 | R=ek[f+16:]
48 | return """.replace("S", ups).replace("R", upr).replace("X", xori)
49 |
50 |
class AESCipher:
    # Single-block AES cipher. The cipher_block and decipher_block methods are
    # not written out: their source is derived from the module-level `ciph`
    # template and compiled by the exec statements in this class body.

    def __init__(self, expanded_key):
        # expanded_key: the sequence returned by KeyExpander.expand
        self._expanded_key = expanded_key
        # Offset of the last 16-byte round key inside the expanded key
        self._Nr = len(expanded_key) - 16

    # Forward direction: the template is renamed to cipher_block, only the two
    # tables g0,g1 are unpacked from galNI, the round loop runs
    # range(16, f._Nr, 16), and the return expression is the `xor` fragment.
    exec (
        ciph.replace("g2,g3", "").replace("dec", "c").replace("!", "16,f._Nr,").replace("B", box).replace("M", mix) + xor)
    # Inverse direction: galI and i_sbox replace galNI and sbox, the first
    # round key is taken from offset f._Nr and the last from ek[:16], the round
    # loop runs range(f._Nr-16, 0, -16), and the return expression is `ibox`.
    exec (ciph.replace("NI", "I").replace(":16", "f._Nr:").replace("f+16:", ":16").replace("!", "f._Nr-16,0,-").replace(
        "sbox", "i_sbox").replace("B", ibox).replace("M", imix) + ibox)
60 |
61 |
62 | import unittest
63 |
64 |
class TestCipher(unittest.TestCase):
    def test_cipher(self):
        """Test AES cipher with all key lengths"""
        import test_keys
        import key_expander

        test_data = test_keys.TestKeys()
        for key_size in 128, 192, 256:
            test_key_expander = key_expander.KeyExpander(key_size)
            test_expanded_key = test_key_expander.expand(test_data.test_key[key_size])
            test_cipher = AESCipher(test_expanded_key)
            # Compare whole blocks (as lists, since the cipher may return a
            # tuple) so a failure reports the differing bytes. assertEqual is
            # used because assertEquals is a deprecated alias.
            test_result_ciphertext = test_cipher.cipher_block(test_data.test_block_plaintext)
            self.assertEqual(list(test_result_ciphertext),
                             test_data.test_block_ciphertext_validated[key_size],
                             msg='Test %d bit cipher' % key_size)
            test_result_plaintext = test_cipher.decipher_block(test_data.test_block_ciphertext_validated[key_size])
            self.assertEqual(list(test_result_plaintext),
                             test_data.test_block_plaintext,
                             msg='Test %d bit decipher' % key_size)


if __name__ == "__main__":
    unittest.main()
88 |
--------------------------------------------------------------------------------
/peepdf/aespython/cbc_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | CBC Mode of operation
4 |
5 | Running this file as __main__ will result in a self-test of the algorithm.
6 |
7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
8 |
9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 |
14 |
class CBCMode:
    """Apply CBC chaining around a block cipher, remembering the IV between calls"""

    def __init__(self, block_cipher, block_size):
        # The IV starts as an all-zero block until set_iv() replaces it
        self._iv = [0] * block_size
        self._block_size = block_size
        self._block_cipher = block_cipher

    def set_iv(self, iv):
        """Replace the IV; an IV whose length differs from the block size is ignored"""
        if len(iv) != self._block_size:
            return
        self._iv = iv

    def encrypt_block(self, plaintext):
        """XOR the plaintext with the current IV, encrypt it, and keep the result as the next IV"""
        chained = []
        for plain_byte, iv_byte in zip(plaintext, self._iv):
            chained.append(plain_byte ^ iv_byte)
        self._iv = self._block_cipher.cipher_block(chained)
        return self._iv

    def decrypt_block(self, ciphertext):
        """Decrypt the block, XOR it with the current IV, and keep the ciphertext as the next IV"""
        decrypted = list(self._block_cipher.decipher_block(ciphertext))
        previous = self._iv
        for index in range(len(previous)):
            decrypted[index] ^= previous[index]
        self._iv = ciphertext
        return decrypted
36 |
37 |
38 | import unittest
39 |
40 |
class TestEncryptionMode(unittest.TestCase):
    def test_mode(self):
        """Encrypt and decrypt four chained blocks in CBC mode and compare with the stored vectors"""
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        test_expanded_key = test_expander.expand(test_data.test_mode_key)

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cbc = CBCMode(test_cipher, 16)

        # Whole blocks are compared (as lists, since the cipher may return a
        # tuple) so a failure reports the differing bytes. assertEqual is used
        # because assertEquals is a deprecated alias.
        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_cbc.encrypt_block(test_data.test_mode_plaintext[k])),
                             test_data.test_cbc_ciphertext[k],
                             msg='CBC encrypt test block %d' % k)

        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_cbc.decrypt_block(test_data.test_cbc_ciphertext[k])),
                             test_data.test_mode_plaintext[k],
                             msg='CBC decrypt test block %d' % k)


if __name__ == "__main__":
    unittest.main()
76 |
--------------------------------------------------------------------------------
/peepdf/aespython/cfb_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | CFB Mode of operation
4 |
5 | Running this file as __main__ will result in a self-test of the algorithm.
6 |
7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
8 |
9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 |
14 |
class CFBMode:
    """Apply CFB feedback around a block cipher, remembering the IV between calls"""

    def __init__(self, block_cipher, block_size):
        # The IV starts as an all-zero block until set_iv() replaces it
        self._iv = [0] * block_size
        self._block_size = block_size
        self._block_cipher = block_cipher

    def set_iv(self, iv):
        """Replace the IV; an IV whose length differs from the block size is ignored"""
        if len(iv) != self._block_size:
            return
        self._iv = iv

    def encrypt_block(self, plaintext):
        """Encrypt the current IV, XOR it with the plaintext, and keep the result as the next IV"""
        keystream = self._block_cipher.cipher_block(self._iv)
        produced = []
        for plain_byte, key_byte in zip(plaintext, keystream):
            produced.append(plain_byte ^ key_byte)
        self._iv = produced
        return produced

    def decrypt_block(self, ciphertext):
        """Encrypt the current IV, XOR it with the ciphertext, and keep the ciphertext as the next IV"""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = ciphertext
        recovered = []
        for key_byte, cipher_byte in zip(keystream, ciphertext):
            recovered.append(key_byte ^ cipher_byte)
        return recovered
36 |
37 |
38 | import unittest
39 |
40 |
class TestEncryptionMode(unittest.TestCase):
    def test_mode(self):
        """Encrypt and decrypt four chained blocks in CFB mode and compare with the stored vectors"""
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        test_expanded_key = test_expander.expand(test_data.test_mode_key)

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cfb = CFBMode(test_cipher, 16)

        # Whole blocks are compared so a failure reports the differing bytes.
        # assertEqual is used because assertEquals is a deprecated alias.
        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_cfb.encrypt_block(test_data.test_mode_plaintext[k])),
                             test_data.test_cfb_ciphertext[k],
                             msg='CFB encrypt test block' + str(k))

        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_cfb.decrypt_block(test_data.test_cfb_ciphertext[k])),
                             test_data.test_mode_plaintext[k],
                             msg='CFB decrypt test block' + str(k))


if __name__ == "__main__":
    unittest.main()
76 |
--------------------------------------------------------------------------------
/peepdf/aespython/key_expander.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | AES Key Expansion.
5 |
6 | Expands 128, 192, or 256 bit key for use with AES
7 |
8 | Running this file as __main__ will result in a self-test of the algorithm.
9 |
10 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
11 |
12 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
13 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
14 | """
15 | __author__ = "Adam Newman"
16 |
17 | # Normally use relative import. In test mode use local import.
18 | try:
19 | from .aes_tables import sbox, rcon
20 | except ValueError:
21 | from aes_tables import sbox, rcon
22 | from operator import xor
23 |
24 |
class KeyExpander:
    """Perform AES Key Expansion"""

    # Size in bytes of the expanded key for each supported key size in bits
    _expanded_key_length = {128: 176, 192: 208, 256: 240}

    def __init__(self, key_length):
        """
        Prepare an expander for one key size.

        key_length is the key size in bits and must be 128, 192 or 256;
        any other value raises LookupError.
        """
        self._key_length = key_length
        # Key size in bytes
        self._n = key_length >> 3

        if key_length in self._expanded_key_length:
            self._b = self._expanded_key_length[key_length]
        else:
            raise LookupError('Invalid Key Size')

    def expand(self, new_key):
        """
        Expand the encryption key per AES key schedule specifications

        http://en.wikipedia.org/wiki/Rijndael_key_schedule#Key_schedule_description

        new_key is a sequence of byte values whose length matches the key size
        given to the constructor; otherwise RuntimeError is raised. The
        expanded key is returned as a new list: the caller's sequence is left
        untouched, so the same key can be expanded more than once.
        """
        if len(new_key) != self._n:
            raise RuntimeError('expand(): key size is invalid')

        # First n bytes are copied from key. Work on a copy so the caller's
        # list is not extended in place.
        expanded = list(new_key)
        n = self._n

        def append_word(word):
            # Append 4 bytes: `word` XORed with the 4 bytes located n bytes
            # before the current end of the expanded key
            expanded.extend(map(xor, word, expanded[-n:4 - n]))

        rcon_iter = 1
        # Grow the key until it is the correct length
        while True:
            # Core: rotate the last 4 bytes left by one, run them through the
            # sbox and XOR the first byte with rcon(iter)
            core = [sbox[byte] for byte in expanded[-3:] + expanded[-4:-3]]
            core[0] ^= rcon[rcon_iter]
            rcon_iter += 1
            append_word(core)

            # Three passes of 4 byte expansion using the 4 byte tail
            for _ in range(3):
                append_word(expanded[-4:])

            # For every supported key size the target length is reached
            # exactly here (176 = 16 + 16*10, 208 = 16 + 24*8, 240 = 16 + 32*7),
            # so this is the only exit needed.
            if len(expanded) >= self._b:
                return expanded

            # 256 bit keys add one word whose tail is run through the sbox
            if self._key_length == 256:
                append_word([sbox[byte] for byte in expanded[-4:]])

            # 192 and 256 bit keys run 2 or 3 more plain passes respectively
            if self._key_length != 128:
                for _ in range(2 if self._key_length == 192 else 3):
                    append_word(expanded[-4:])
87 |
88 |
89 | import unittest
90 |
91 |
class TestKeyExpander(unittest.TestCase):
    def test_keys(self):
        """Expand the reference key of every supported size and compare with the validated result"""
        import test_keys

        fixtures = test_keys.TestKeys()
        for bits in (128, 192, 256):
            produced = KeyExpander(bits).expand(fixtures.test_key[bits])
            expected = fixtures.test_expanded_key_validated[bits]
            # Count the positions where both sequences agree
            matching = sum(1 for got, want in zip(produced, expected) if got == want)
            self.assertEqual(matching, len(expected),
                             msg='Key expansion ' + str(bits) + ' bit')


if __name__ == "__main__":
    unittest.main()
109 |
--------------------------------------------------------------------------------
/peepdf/aespython/ofb_mode.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | OFB Mode of operation
4 |
5 | Running this file as __main__ will result in a self-test of the algorithm.
6 |
7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
8 |
9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 |
14 |
class OFBMode:
    """Apply OFB feedback around a block cipher, remembering the IV between calls"""

    def __init__(self, block_cipher, block_size):
        # The IV starts as an all-zero block until set_iv() replaces it
        self._iv = [0] * block_size
        self._block_size = block_size
        self._block_cipher = block_cipher

    def set_iv(self, iv):
        """Replace the IV; an IV whose length differs from the block size is ignored"""
        if len(iv) != self._block_size:
            return
        self._iv = iv

    def encrypt_block(self, plaintext):
        """Encrypt the current IV, keep it as the next IV, and XOR it with the plaintext"""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = keystream
        produced = []
        for plain_byte, key_byte in zip(plaintext, keystream):
            produced.append(plain_byte ^ key_byte)
        return produced

    def decrypt_block(self, ciphertext):
        """Encrypt the current IV, keep it as the next IV, and XOR it with the ciphertext"""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = keystream
        recovered = []
        for key_byte, cipher_byte in zip(keystream, ciphertext):
            recovered.append(key_byte ^ cipher_byte)
        return recovered
34 |
35 |
36 | import unittest
37 |
38 |
class TestEncryptionMode(unittest.TestCase):
    def test_mode(self):
        """Encrypt and decrypt four chained blocks in OFB mode and compare with the stored vectors"""
        # Self test
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        test_expanded_key = test_expander.expand(test_data.test_mode_key)

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_ofb = OFBMode(test_cipher, 16)

        # Whole blocks are compared so a failure reports the differing bytes.
        # assertEqual is used because assertEquals is a deprecated alias.
        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_ofb.encrypt_block(test_data.test_mode_plaintext[k])),
                             test_data.test_ofb_ciphertext[k],
                             msg='OFB encrypt test block' + str(k))

        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(list(test_ofb.decrypt_block(test_data.test_ofb_ciphertext[k])),
                             test_data.test_mode_plaintext[k],
                             msg='OFB decrypt test block' + str(k))


if __name__ == "__main__":
    unittest.main()
74 |
--------------------------------------------------------------------------------
/peepdf/aespython/test_keys.py:
--------------------------------------------------------------------------------
1 | """
2 | Test keys and data for self-test operations.
3 |
4 | Test data from:
5 | NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
6 | NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
7 |
8 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
9 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
10 | """
11 | __author__ = "Adam Newman"
12 |
13 |
14 | class TestKeys:
15 | """Test data, keys, IVs, and output to use in self-tests"""
16 | test_key = {
17 | 128: [
18 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
19 | , 192: [
20 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
21 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17]
22 | , 256: [
23 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
24 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f]
25 | }
26 |
27 | test_expanded_key_validated = {
28 | 128: [
29 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
30 | 0xd6, 0xaa, 0x74, 0xfd, 0xd2, 0xaf, 0x72, 0xfa, 0xda, 0xa6, 0x78, 0xf1, 0xd6, 0xab, 0x76, 0xfe,
31 | 0xb6, 0x92, 0xcf, 0x0b, 0x64, 0x3d, 0xbd, 0xf1, 0xbe, 0x9b, 0xc5, 0x00, 0x68, 0x30, 0xb3, 0xfe,
32 | 0xb6, 0xff, 0x74, 0x4e, 0xd2, 0xc2, 0xc9, 0xbf, 0x6c, 0x59, 0x0c, 0xbf, 0x04, 0x69, 0xbf, 0x41,
33 | 0x47, 0xf7, 0xf7, 0xbc, 0x95, 0x35, 0x3e, 0x03, 0xf9, 0x6c, 0x32, 0xbc, 0xfd, 0x05, 0x8d, 0xfd,
34 | 0x3c, 0xaa, 0xa3, 0xe8, 0xa9, 0x9f, 0x9d, 0xeb, 0x50, 0xf3, 0xaf, 0x57, 0xad, 0xf6, 0x22, 0xaa,
35 | 0x5e, 0x39, 0x0f, 0x7d, 0xf7, 0xa6, 0x92, 0x96, 0xa7, 0x55, 0x3d, 0xc1, 0x0a, 0xa3, 0x1f, 0x6b,
36 | 0x14, 0xf9, 0x70, 0x1a, 0xe3, 0x5f, 0xe2, 0x8c, 0x44, 0x0a, 0xdf, 0x4d, 0x4e, 0xa9, 0xc0, 0x26,
37 | 0x47, 0x43, 0x87, 0x35, 0xa4, 0x1c, 0x65, 0xb9, 0xe0, 0x16, 0xba, 0xf4, 0xae, 0xbf, 0x7a, 0xd2,
38 | 0x54, 0x99, 0x32, 0xd1, 0xf0, 0x85, 0x57, 0x68, 0x10, 0x93, 0xed, 0x9c, 0xbe, 0x2c, 0x97, 0x4e,
39 | 0x13, 0x11, 0x1d, 0x7f, 0xe3, 0x94, 0x4a, 0x17, 0xf3, 0x07, 0xa7, 0x8b, 0x4d, 0x2b, 0x30, 0xc5]
40 | , 192: [
41 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
42 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x58, 0x46, 0xf2, 0xf9, 0x5c, 0x43, 0xf4, 0xfe,
43 | 0x54, 0x4a, 0xfe, 0xf5, 0x58, 0x47, 0xf0, 0xfa, 0x48, 0x56, 0xe2, 0xe9, 0x5c, 0x43, 0xf4, 0xfe,
44 | 0x40, 0xf9, 0x49, 0xb3, 0x1c, 0xba, 0xbd, 0x4d, 0x48, 0xf0, 0x43, 0xb8, 0x10, 0xb7, 0xb3, 0x42,
45 | 0x58, 0xe1, 0x51, 0xab, 0x04, 0xa2, 0xa5, 0x55, 0x7e, 0xff, 0xb5, 0x41, 0x62, 0x45, 0x08, 0x0c,
46 | 0x2a, 0xb5, 0x4b, 0xb4, 0x3a, 0x02, 0xf8, 0xf6, 0x62, 0xe3, 0xa9, 0x5d, 0x66, 0x41, 0x0c, 0x08,
47 | 0xf5, 0x01, 0x85, 0x72, 0x97, 0x44, 0x8d, 0x7e, 0xbd, 0xf1, 0xc6, 0xca, 0x87, 0xf3, 0x3e, 0x3c,
48 | 0xe5, 0x10, 0x97, 0x61, 0x83, 0x51, 0x9b, 0x69, 0x34, 0x15, 0x7c, 0x9e, 0xa3, 0x51, 0xf1, 0xe0,
49 | 0x1e, 0xa0, 0x37, 0x2a, 0x99, 0x53, 0x09, 0x16, 0x7c, 0x43, 0x9e, 0x77, 0xff, 0x12, 0x05, 0x1e,
50 | 0xdd, 0x7e, 0x0e, 0x88, 0x7e, 0x2f, 0xff, 0x68, 0x60, 0x8f, 0xc8, 0x42, 0xf9, 0xdc, 0xc1, 0x54,
51 | 0x85, 0x9f, 0x5f, 0x23, 0x7a, 0x8d, 0x5a, 0x3d, 0xc0, 0xc0, 0x29, 0x52, 0xbe, 0xef, 0xd6, 0x3a,
52 | 0xde, 0x60, 0x1e, 0x78, 0x27, 0xbc, 0xdf, 0x2c, 0xa2, 0x23, 0x80, 0x0f, 0xd8, 0xae, 0xda, 0x32,
53 | 0xa4, 0x97, 0x0a, 0x33, 0x1a, 0x78, 0xdc, 0x09, 0xc4, 0x18, 0xc2, 0x71, 0xe3, 0xa4, 0x1d, 0x5d]
54 | , 256: [
55 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
56 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
57 | 0xa5, 0x73, 0xc2, 0x9f, 0xa1, 0x76, 0xc4, 0x98, 0xa9, 0x7f, 0xce, 0x93, 0xa5, 0x72, 0xc0, 0x9c,
58 | 0x16, 0x51, 0xa8, 0xcd, 0x02, 0x44, 0xbe, 0xda, 0x1a, 0x5d, 0xa4, 0xc1, 0x06, 0x40, 0xba, 0xde,
59 | 0xae, 0x87, 0xdf, 0xf0, 0x0f, 0xf1, 0x1b, 0x68, 0xa6, 0x8e, 0xd5, 0xfb, 0x03, 0xfc, 0x15, 0x67,
60 | 0x6d, 0xe1, 0xf1, 0x48, 0x6f, 0xa5, 0x4f, 0x92, 0x75, 0xf8, 0xeb, 0x53, 0x73, 0xb8, 0x51, 0x8d,
61 | 0xc6, 0x56, 0x82, 0x7f, 0xc9, 0xa7, 0x99, 0x17, 0x6f, 0x29, 0x4c, 0xec, 0x6c, 0xd5, 0x59, 0x8b,
62 | 0x3d, 0xe2, 0x3a, 0x75, 0x52, 0x47, 0x75, 0xe7, 0x27, 0xbf, 0x9e, 0xb4, 0x54, 0x07, 0xcf, 0x39,
63 | 0x0b, 0xdc, 0x90, 0x5f, 0xc2, 0x7b, 0x09, 0x48, 0xad, 0x52, 0x45, 0xa4, 0xc1, 0x87, 0x1c, 0x2f,
64 | 0x45, 0xf5, 0xa6, 0x60, 0x17, 0xb2, 0xd3, 0x87, 0x30, 0x0d, 0x4d, 0x33, 0x64, 0x0a, 0x82, 0x0a,
65 | 0x7c, 0xcf, 0xf7, 0x1c, 0xbe, 0xb4, 0xfe, 0x54, 0x13, 0xe6, 0xbb, 0xf0, 0xd2, 0x61, 0xa7, 0xdf,
66 | 0xf0, 0x1a, 0xfa, 0xfe, 0xe7, 0xa8, 0x29, 0x79, 0xd7, 0xa5, 0x64, 0x4a, 0xb3, 0xaf, 0xe6, 0x40,
67 | 0x25, 0x41, 0xfe, 0x71, 0x9b, 0xf5, 0x00, 0x25, 0x88, 0x13, 0xbb, 0xd5, 0x5a, 0x72, 0x1c, 0x0a,
68 | 0x4e, 0x5a, 0x66, 0x99, 0xa9, 0xf2, 0x4f, 0xe0, 0x7e, 0x57, 0x2b, 0xaa, 0xcd, 0xf8, 0xcd, 0xea,
69 | 0x24, 0xfc, 0x79, 0xcc, 0xbf, 0x09, 0x79, 0xe9, 0x37, 0x1a, 0xc2, 0x3c, 0x6d, 0x68, 0xde, 0x36]
70 | }
71 |
72 | test_block_ciphertext_validated = {
73 | 128: [
74 | 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a]
75 | , 192: [
76 | 0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91]
77 | , 256: [
78 | 0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89]
79 | }
80 |
81 | test_block_plaintext = [
82 | 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff]
83 |
84 | # After initial validation, these deviated from test in SP 800-38A to use same key, iv, and plaintext on tests.
85 | # Still valid, just easier to test with.
86 | test_mode_key = [
87 | 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
88 | 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4]
89 | test_mode_iv = [
90 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
91 | test_mode_plaintext = [
92 | [0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a],
93 | [0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51],
94 | [0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef],
95 | [0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10]]
96 | test_cbc_ciphertext = [
97 | [0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6],
98 | [0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d],
99 | [0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61],
100 | [0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b]]
101 | test_cfb_ciphertext = [
102 | [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
103 | [0x39, 0xff, 0xed, 0x14, 0x3b, 0x28, 0xb1, 0xc8, 0x32, 0x11, 0x3c, 0x63, 0x31, 0xe5, 0x40, 0x7b],
104 | [0xdf, 0x10, 0x13, 0x24, 0x15, 0xe5, 0x4b, 0x92, 0xa1, 0x3e, 0xd0, 0xa8, 0x26, 0x7a, 0xe2, 0xf9],
105 | [0x75, 0xa3, 0x85, 0x74, 0x1a, 0xb9, 0xce, 0xf8, 0x20, 0x31, 0x62, 0x3d, 0x55, 0xb1, 0xe4, 0x71]]
106 | test_ofb_ciphertext = [
107 | [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
108 | [0x4f, 0xeb, 0xdc, 0x67, 0x40, 0xd2, 0x0b, 0x3a, 0xc8, 0x8f, 0x6a, 0xd8, 0x2a, 0x4f, 0xb0, 0x8d],
109 | [0x71, 0xab, 0x47, 0xa0, 0x86, 0xe8, 0x6e, 0xed, 0xf3, 0x9d, 0x1c, 0x5b, 0xba, 0x97, 0xc4, 0x08],
110 | [0x01, 0x26, 0x14, 0x1d, 0x67, 0xf3, 0x7b, 0xe8, 0x53, 0x8f, 0x5a, 0x8b, 0xe7, 0x40, 0xe4, 0x84]]
111 |
112 | def hex_output(self, list):
113 | # Debugging output helper
114 | result = '['
115 | for i in list[:-1]:
116 | result += hex(i) + ','
117 | return result + hex(list[-1]) + ']'
118 |
--------------------------------------------------------------------------------
/peepdf/colorama/__init__.py:
--------------------------------------------------------------------------------
1 | from .initialise import init
2 | from .ansi import Fore, Back, Style
3 | from .ansitowin32 import AnsiToWin32
4 |
5 | VERSION = '0.1.18'
6 |
--------------------------------------------------------------------------------
/peepdf/colorama/ansi.py:
--------------------------------------------------------------------------------
1 | '''
2 | This module generates ANSI character codes to printing colors to terminals.
3 | See: http://en.wikipedia.org/wiki/ANSI_escape_code
4 | '''
5 |
# Control Sequence Introducer: the prefix of every escape sequence built here
CSI = '\033['


def code_to_chars(code):
    '''
    Build the escape sequence for a numeric code: CSI, the code, then 'm'.
    '''
    pieces = [CSI, str(code), 'm']
    return ''.join(pieces)
11 |
12 |
class AnsiCodes(object):
    '''
    Mirror the public attributes of a codes class, storing for each one the
    escape sequence produced by code_to_chars() instead of the raw number.
    '''

    def __init__(self, codes):
        public_names = [attr for attr in dir(codes) if not attr.startswith('_')]
        for attr in public_names:
            setattr(self, attr, code_to_chars(getattr(codes, attr)))
19 |
20 |
# Numeric codes for the foreground (text) colors. Wrapped by AnsiCodes to build
# the `Fore` escape strings.
class AnsiFore:
    BLACK = 30
    RED = 31
    GREEN = 32
    YELLOW = 33
    BLUE = 34
    MAGENTA = 35
    CYAN = 36
    WHITE = 37
    RESET = 39
31 |
32 |
# Numeric codes for the background colors. Wrapped by AnsiCodes to build the
# `Back` escape strings.
class AnsiBack:
    BLACK = 40
    RED = 41
    GREEN = 42
    YELLOW = 43
    BLUE = 44
    MAGENTA = 45
    CYAN = 46
    WHITE = 47
    RESET = 49
43 |
44 |
# Numeric codes for the text styles. Wrapped by AnsiCodes to build the `Style`
# escape strings.
class AnsiStyle:
    BRIGHT = 1
    DIM = 2
    NORMAL = 22
    RESET_ALL = 0
50 |
51 |
52 | Fore = AnsiCodes(AnsiFore)
53 | Back = AnsiCodes(AnsiBack)
54 | Style = AnsiCodes(AnsiStyle)
55 |
--------------------------------------------------------------------------------
/peepdf/colorama/ansitowin32.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 |
4 | from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style
5 | from .winterm import WinTerm, WinColor, WinStyle
6 | from .win32 import windll
7 |
# Always bind `winterm`, so that truthiness checks such as
# `self.convert and winterm` cannot raise NameError when the win32 API is not
# available (windll is None).
winterm = None
if windll is not None:
    winterm = WinTerm()
10 |
11 |
def is_a_tty(stream):
    '''
    Return the stream's isatty() result, or False when the stream has no
    isatty attribute.
    '''
    if not hasattr(stream, 'isatty'):
        return False
    return stream.isatty()
14 |
15 |
class StreamWrapper(object):
    '''
    Transparent proxy around a stream (such as stdout): every attribute is
    looked up on the wrapped stream, except 'write()', which is handed to the
    Converter instance given at construction time.
    '''

    def __init__(self, wrapped, converter):
        # Both attributes use double underscores (name mangling) so they
        # cannot clash with attribute names of the wrapped stream object.
        self.__convertor = converter
        self.__wrapped = wrapped

    def write(self, text):
        sink = self.__convertor
        sink.write(text)

    def __getattr__(self, name):
        # Only reached for names not found on the wrapper itself
        target = self.__wrapped
        return getattr(target, name)
34 |
35 |
class AnsiToWin32(object):
    '''
    Implements a 'write()' method which, on Windows, will strip ANSI character
    sequences from the text, and if outputting to a tty, will convert them into
    win32 function calls.
    '''
    # ESC '[' <digits and semicolons> <letter>. The ESC byte is spelled as a
    # normal string escape; the rest is a raw string so that the regex escapes
    # (\[ and \d) are not treated as invalid Python string escapes.
    ANSI_RE = re.compile('\033' + r'\[((?:\d|;)*)([a-zA-Z])')

    def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
        '''
        wrapped   -- the underlying stream (normally sys.stdout or sys.stderr)
        convert   -- translate ANSI sequences into win32 calls; None means
                     "on Windows, when the stream is a tty"
        strip     -- remove ANSI sequences from the output; None means
                     "on Windows"
        autoreset -- reset colors to their defaults after every write()
        '''
        # The wrapped stream (normally sys.stdout or sys.stderr)
        self.wrapped = wrapped

        # should we reset colors to defaults after every .write()
        self.autoreset = autoreset

        # create the proxy wrapping our output stream
        self.stream = StreamWrapper(wrapped, self)

        on_windows = sys.platform.startswith('win')

        # should we strip ANSI sequences from our output?
        if strip is None:
            strip = on_windows
        self.strip = strip

        # should we convert ANSI sequences into win32 calls?
        if convert is None:
            convert = on_windows and is_a_tty(wrapped)
        self.convert = convert

        # dict of ansi codes to win32 functions and parameters
        self.win32_calls = self.get_win32_calls()

        # are we wrapping stderr?
        self.on_stderr = self.wrapped is sys.stderr

    def should_wrap(self):
        '''
        True if this class is actually needed. If false, then the output
        stream will not be affected, nor will win32 calls be issued, so
        wrapping stdout is not actually required. This will generally be
        False on non-Windows platforms, unless optional functionality like
        autoreset has been requested using kwargs to init()
        '''
        return self.convert or self.strip or self.autoreset

    def get_win32_calls(self):
        '''
        Return the mapping from ANSI numeric code to a tuple of
        (winterm method, optional argument). The mapping is empty when no
        conversion is to be performed.
        '''
        if self.convert and winterm:
            return {
                AnsiStyle.RESET_ALL: (winterm.reset_all,),
                AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
                # DIM is mapped onto the NORMAL win32 style.
                AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
                AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
                AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
                AnsiFore.RED: (winterm.fore, WinColor.RED),
                AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
                AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
                AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
                AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
                AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
                AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
                AnsiFore.RESET: (winterm.fore,),
                AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
                AnsiBack.RED: (winterm.back, WinColor.RED),
                AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
                AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
                AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
                AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
                AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
                AnsiBack.WHITE: (winterm.back, WinColor.GREY),
                AnsiBack.RESET: (winterm.back,),
            }
        # An empty mapping (rather than None) keeps the membership test in
        # call_win32() safe even if 'convert' is switched on later.
        return {}

    def write(self, text):
        '''
        Write *text* to the wrapped stream, stripping/converting ANSI
        sequences when configured to, then reset colors if autoreset is on.
        '''
        if self.strip or self.convert:
            self.write_and_convert(text)
        else:
            self.wrapped.write(text)
            self.wrapped.flush()
        if self.autoreset:
            self.reset_all()

    def reset_all(self):
        '''
        Restore default colors: through win32 calls when converting,
        otherwise by emitting the ANSI reset sequence.
        '''
        if self.convert:
            self.call_win32('m', (0,))
        elif not self.strip:
            # When stripping, the caller asked for output free of ANSI
            # sequences, so do not inject a raw reset sequence ourselves.
            self.wrapped.write(Style.RESET_ALL)

    def write_and_convert(self, text):
        '''
        Write the given text to our wrapped stream, stripping any ANSI
        sequences from the text, and optionally converting them into win32
        calls.
        '''
        cursor = 0
        for match in self.ANSI_RE.finditer(text):
            start, end = match.span()
            self.write_plain_text(text, cursor, start)
            self.convert_ansi(*match.groups())
            cursor = end
        self.write_plain_text(text, cursor, len(text))

    def write_plain_text(self, text, start, end):
        '''Write and flush text[start:end]; do nothing for an empty span.'''
        if start < end:
            self.wrapped.write(text[start:end])
            self.wrapped.flush()

    def convert_ansi(self, paramstring, command):
        '''Issue the win32 calls for one ANSI sequence (only when converting).'''
        if self.convert:
            params = self.extract_params(paramstring)
            self.call_win32(command, params)

    def extract_params(self, paramstring):
        '''Return the ';'-separated integers of *paramstring* as a tuple.'''
        def split(paramstring):
            for p in paramstring.split(';'):
                if p != '':
                    yield int(p)

        return tuple(split(paramstring))

    def call_win32(self, command, params):
        '''
        Execute the win32 calls registered for each code in *params*.
        Only the 'm' command is handled; unknown codes are ignored.
        '''
        # extract_params() returns a tuple, so test for emptiness rather than
        # comparing with []: a bare ESC[m must behave like ESC[0m (reset).
        if not params:
            params = (0,)
        if command == 'm':
            for param in params:
                if param in self.win32_calls:
                    func_args = self.win32_calls[param]
                    func = func_args[0]
                    args = func_args[1:]
                    kwargs = dict(on_stderr=self.on_stderr)
                    func(*args, **kwargs)
167 |
--------------------------------------------------------------------------------
/peepdf/colorama/initialise.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import sys
3 |
4 | from .ansitowin32 import AnsiToWin32
5 |
6 | orig_stdout = sys.stdout
7 | orig_stderr = sys.stderr
8 |
9 | atexit_done = False
10 |
11 |
def reset_all():
    """Reset colors on the original stdout (registered with atexit by init())."""
    converter = AnsiToWin32(orig_stdout)
    converter.reset_all()
14 |
15 |
def init(autoreset=False, convert=None, strip=None, wrap=True):
    """Replace sys.stdout / sys.stderr with (possibly) wrapped streams.

    Raises ValueError when wrap is False but any of autoreset, convert or
    strip is True. On the first call, reset_all is registered to run at
    interpreter exit.
    """
    global atexit_done

    # "True in (...)" compares each option with == True, like the
    # explicit chain of comparisons it replaces.
    if wrap == False and True in (autoreset, convert, strip):
        raise ValueError('wrap=False conflicts with any other arg=True')

    sys.stdout = wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
    sys.stderr = wrap_stream(orig_stderr, convert, strip, autoreset, wrap)

    if atexit_done:
        return
    atexit.register(reset_all)
    atexit_done = True
27 |
28 |
def wrap_stream(stream, convert, strip, autoreset, wrap):
    """Return the AnsiToWin32 proxy for *stream* if wrapping is requested
    and actually needed, otherwise return *stream* unchanged."""
    if not wrap:
        return stream
    wrapper = AnsiToWin32(stream,
                          convert=convert, strip=strip, autoreset=autoreset)
    return wrapper.stream if wrapper.should_wrap() else stream
36 |
--------------------------------------------------------------------------------
/peepdf/colorama/win32.py:
--------------------------------------------------------------------------------
# Thin ctypes layer over the few kernel32 console functions used to emulate
# ANSI colors. When ctypes.windll cannot be imported, ``windll`` is None and
# SetConsoleTextAttribute is a no-op stub.

# from winbase.h
STDOUT = -11
STDERR = -12

try:
    from ctypes import windll
except ImportError:
    windll = None
    SetConsoleTextAttribute = lambda *_: None
else:
    from ctypes import (
        byref, Structure, c_char, c_short, c_uint32, c_ushort
    )

    # Console handles are looked up once, at import time.
    handles = {
        STDOUT: windll.kernel32.GetStdHandle(STDOUT),
        STDERR: windll.kernel32.GetStdHandle(STDERR),
    }

    SHORT = c_short
    WORD = c_ushort
    DWORD = c_uint32
    TCHAR = c_char

    class COORD(Structure):
        """struct in wincon.h"""
        _fields_ = [
            ('X', SHORT),
            ('Y', SHORT),
        ]

    class SMALL_RECT(Structure):
        """struct in wincon.h."""
        _fields_ = [
            ("Left", SHORT),
            ("Top", SHORT),
            ("Right", SHORT),
            ("Bottom", SHORT),
        ]

    class CONSOLE_SCREEN_BUFFER_INFO(Structure):
        """struct in wincon.h."""
        _fields_ = [
            ("dwSize", COORD),
            ("dwCursorPosition", COORD),
            ("wAttributes", WORD),
            ("srWindow", SMALL_RECT),
            ("dwMaximumWindowSize", COORD),
        ]

    def GetConsoleScreenBufferInfo(stream_id):
        """Return the CONSOLE_SCREEN_BUFFER_INFO for STDOUT or STDERR.

        The success flag of the underlying call is deliberately not checked
        (see the note below), so the structure may be left zero-filled.
        """
        handle = handles[stream_id]
        csbi = CONSOLE_SCREEN_BUFFER_INFO()
        success = windll.kernel32.GetConsoleScreenBufferInfo(
            handle, byref(csbi))
        # This fails when imported via setup.py when installing using 'pip'
        # presumably the fix is that running setup.py should not trigger all
        # this activity.
        # assert success
        return csbi

    def SetConsoleTextAttribute(stream_id, attrs):
        """Set the text attributes (colors/intensity) of the given console."""
        handle = handles[stream_id]
        success = windll.kernel32.SetConsoleTextAttribute(handle, attrs)
        assert success

    def SetConsoleCursorPosition(stream_id, position):
        """Move the console cursor; *position* is an (X, Y) pair."""
        handle = handles[stream_id]
        position = COORD(*position)
        success = windll.kernel32.SetConsoleCursorPosition(handle, position)
        assert success

    def FillConsoleOutputCharacter(stream_id, char, length, start):
        """Write *char* *length* times starting at the (X, Y) pair *start*.

        Returns the number of characters actually written.
        """
        handle = handles[stream_id]
        # c_char holds a single byte; accept a text character as well.
        if not isinstance(char, bytes):
            char = char.encode('ascii')
        char = TCHAR(char)
        length = DWORD(length)
        start = COORD(*start)
        num_written = DWORD(0)
        # kernel32 exports only the suffixed entry points; the unsuffixed
        # name raises AttributeError. TCHAR is c_char here, so use the ANSI
        # ("A") variant.
        success = windll.kernel32.FillConsoleOutputCharacterA(
            handle, char, length, start, byref(num_written))
        assert success
        return num_written.value

    if __name__ == '__main__':
        x = GetConsoleScreenBufferInfo(STDOUT)
        print(x.dwSize)
        print(x.dwCursorPosition)
        print(x.wAttributes)
        print(x.srWindow)
        print(x.dwMaximumWindowSize)
93 |
--------------------------------------------------------------------------------
/peepdf/colorama/winterm.py:
--------------------------------------------------------------------------------
1 | from . import win32
2 |
3 |
4 | # from wincon.h
class WinColor(object):
    # Console color indices (from wincon.h), numbered 0..7 in this order.
    BLACK, BLUE, GREEN, CYAN, RED, MAGENTA, YELLOW, GREY = range(8)
14 |
15 |
16 | # from wincon.h
class WinStyle(object):
    # Intensity flags (from wincon.h):
    #   NORMAL -- dim text, dim background
    #   BRIGHT -- bright text, dim background
    NORMAL, BRIGHT = 0x00, 0x08
20 |
21 |
class WinTerm(object):
    """Tracks the console's foreground, background and style values and
    pushes their combination to the console via win32.SetConsoleTextAttribute.
    """

    def __init__(self):
        # Attributes read from stdout at construction become the defaults.
        info = win32.GetConsoleScreenBufferInfo(win32.STDOUT)
        self._default = info.wAttributes
        self.set_attrs(self._default)
        self._default_fore = self._fore
        self._default_back = self._back
        self._default_style = self._style

    def get_attrs(self):
        """Combine fore, back (shifted up by four bits) and style into one value."""
        return self._fore + (self._back << 4) + self._style

    def set_attrs(self, value):
        """Split *value* into fore (low 3 bits), back (bits 4-6) and the
        WinStyle.BRIGHT flag."""
        self._style = value & WinStyle.BRIGHT
        self._back = (value >> 4) & 7
        self._fore = value & 7

    def reset_all(self, on_stderr=None):
        """Restore the defaults captured at construction (on_stderr is not used)."""
        self.set_attrs(self._default)
        self.set_console(attrs=self._default)

    def fore(self, fore=None, on_stderr=False):
        """Set the foreground color; None selects the default foreground."""
        self._fore = self._default_fore if fore is None else fore
        self.set_console(on_stderr=on_stderr)

    def back(self, back=None, on_stderr=False):
        """Set the background color; None selects the default background."""
        self._back = self._default_back if back is None else back
        self.set_console(on_stderr=on_stderr)

    def style(self, style=None, on_stderr=False):
        """Set the style flag; None selects the default style."""
        self._style = self._default_style if style is None else style
        self.set_console(on_stderr=on_stderr)

    def set_console(self, attrs=None, on_stderr=False):
        """Apply *attrs* (or the currently tracked attributes when None) to
        the stdout console, or to the stderr one when on_stderr is true."""
        if attrs is None:
            attrs = self.get_attrs()
        handle = win32.STDERR if on_stderr else win32.STDOUT
        win32.SetConsoleTextAttribute(handle, attrs)
68 |
--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/README.specs.mkd:
--------------------------------------------------------------------------------
1 | # UNPACKERS SPECIFICATIONS
2 |
3 | Nothing very difficult: an unpacker is a submodule placed in the directory
4 | where this file was found. Each unpacker must define three symbols:
5 |
6 | * `PRIORITY` : integer number expressing the priority in applying this
7 | unpacker. Lower number means higher priority.
8 | Makes sense only if a source file has been packed with
9 | more than one packer.
10 | * `detect(source)` : returns `True` if source is packed, otherwise, `False`.
11 | * `unpack(source)` : takes a `source` string and unpacks it. Must always return
12 | valid JavaScript. That is to say, your code should look
13 | like:
14 |
15 | ```
16 | if detect(source):
17 | return do_your_fancy_things_with(source)
18 | else:
19 | return source
20 | ```
21 |
22 | *You can safely define any other symbol in your module, as it will be ignored.*
23 |
24 | The package `__init__` code loads new unpackers automatically; no further
25 | steps are required. Simply drop the new module into this directory.
26 |
--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # General code for JSBeautifier unpackers infrastructure. See README.specs
3 | # written by Stefano Sanfilippo
4 | #
5 |
6 | """General code for JSBeautifier unpackers infrastructure."""
7 |
8 | import pkgutil
9 | import re
10 | # from jsbeautifier.unpackers import evalbased
11 | import evalbased
12 |
13 | # NOTE: AT THE MOMENT, IT IS DEACTIVATED FOR YOUR SECURITY: it runs js!
14 | BLACKLIST = ['jsbeautifier.unpackers.evalbased']
15 |
16 |
class UnpackingError(Exception):
    """Raised for badly packed source or any general unpacking failure.

    The single argument is a meaningful, human-readable description.
    """
21 |
22 |
def getunpackers():
    """Scan this package for unpacker modules and return them sorted by
    ascending PRIORITY.

    A module is skipped when its dotted name contains 'tests' or is listed
    in BLACKLIST. A module that cannot be imported raises UnpackingError.
    """
    required = ['unpack', 'detect', 'PRIORITY']
    dotted_prefix = __name__ + '.'
    loaded = []
    for _importer, modname, _ispkg in pkgutil.iter_modules(__path__,
                                                           dotted_prefix):
        if 'tests' in modname or modname in BLACKLIST:
            continue
        try:
            module = __import__(modname, fromlist=required)
        except ImportError:
            raise UnpackingError('Bad unpacker: %s' % modname)
        loaded.append(module)

    loaded.sort(key=lambda mod: mod.PRIORITY)
    return loaded
42 |
43 | # UNPACKERS = getunpackers()
44 | UNPACKERS = []
45 |
46 |
def run(source, evalcode=False):
    """Apply every registered unpacker that detects *source* and return the
    resulting string.

    Detection is evaluated against the original source for all unpackers
    before any of them is applied. When *evalcode* is true, the eval-based
    unpacker is tried last on the result.
    """
    applicable = [mod for mod in UNPACKERS if mod.detect(source)]
    for unpacker in applicable:
        source = unpacker.unpack(source)
    if not evalcode:
        return source
    if evalbased.detect(source):
        source = evalbased.unpack(source)
    return source
54 |
55 |
def filtercomments(source):
    """NOT USED: peel the comments off the start of *source* and return
    them, one per line, followed by the remaining code.

    Both ``/* ... */`` and ``// ...`` comments are recognised, together
    with any whitespace in front of them. An unterminated block comment
    swallows the rest of the source.
    """
    leading_comments = []

    while True:
        block_start = re.match(r'\s*/\*', source)
        if block_start:
            # Look for the terminator after the opening marker so that the
            # '*' of "/*" is never taken as part of "*/".
            end = source.find('*/', block_start.end())
            comment = source if end == -1 else source[:end + 2]
        else:
            # '.' does not match a newline, so this captures one whole line.
            line_comment = re.match(r'\s*//.*', source)
            if line_comment is None:
                break
            comment = line_comment.group(0)

        leading_comments.append(comment)
        source = re.sub(r'^\s+', '', source[len(comment):])

    # Terminate every comment with a newline so that a trailing '//'
    # comment cannot comment out the code that follows it.
    return ''.join(comment + '\n' for comment in leading_comments) + source
74 |
--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/evalbased.py:
--------------------------------------------------------------------------------
1 | #
2 | # Unpacker for eval() based packers, a part of javascript beautifier
3 | # by Einar Lielmanis
4 | #
5 | # written by Stefano Sanfilippo
6 | #
7 | # usage:
8 | #
9 | # if detect(some_string):
10 | # unpacked = unpack(some_string)
11 | #
12 |
13 | """Unpacker for eval() based packers: runs JS code and returns result.
14 | Works only if a JS interpreter (e.g. Mozilla's Rhino) is installed and
15 | properly set up on host."""
16 |
17 | from subprocess import PIPE, Popen
18 |
19 | PRIORITY = 3
20 |
21 |
def detect(source):
    """Tell whether *source*, ignoring case and surrounding whitespace,
    begins like an eval() packed script."""
    normalized = source.strip().lower()
    return normalized.startswith('eval(function(')
25 |
26 |
def unpack(source):
    """Run *source* through the JS interpreter and return the resulting code.

    Source that is not detected as eval() packed is returned unchanged.
    NOTE(security): the packed script is executed by an external JavaScript
    interpreter; only call this on input you are prepared to run.
    """
    if not detect(source):
        return source
    # detect() ignores surrounding whitespace, so strip it before cutting
    # off the 4-character "eval" prefix; otherwise the wrong characters
    # would be removed from sources that start with whitespace.
    packed = source.strip()
    return jseval('print %s;' % packed[4:])
30 |
31 |
32 | # In case of failure, we'll just return the original, without crashing on user.
def jseval(script):
    """Run *script* in the ``js`` interpreter and return what it prints.

    In case of failure the original *script* is returned instead of raising:
    when the interpreter cannot be started, exits with a non-zero status, or
    writes anything to its standard error stream.
    """
    try:
        # stderr must be piped, otherwise communicate() always reports None
        # for it; text mode lets a str script be passed on Python 3 as well.
        interpreter = Popen(['js'], stdin=PIPE, stdout=PIPE, stderr=PIPE,
                            universal_newlines=True)
    except OSError:
        return script
    result, errors = interpreter.communicate(script)
    if interpreter.returncode or errors:
        return script
    return result
43 |
--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/javascriptobfuscator.py:
--------------------------------------------------------------------------------
1 | #
2 | # simple unpacker/deobfuscator for scripts messed up with
3 | # javascriptobfuscator.com
4 | #
5 | # written by Einar Lielmanis
6 | # rewritten in Python by Stefano Sanfilippo
7 | #
8 | # Will always return valid javascript: if `detect()` is false, `code` is
9 | # returned, unmodified.
10 | #
11 | # usage:
12 | #
13 | # if javascriptobfuscator.detect(some_string):
14 | # some_string = javascriptobfuscator.unpack(some_string)
15 | #
16 |
17 | """deobfuscator for scripts messed up with JavascriptObfuscator.com"""
18 |
19 | import re
20 |
21 | PRIORITY = 1
22 |
23 |
def smartsplit(code):
    """Return the double-quoted strings found in `code`, quotes included.

    A backslash escapes the character after it, so an escaped quote does
    not end a string. An unterminated string is returned with what was
    collected so far, and a dangling backslash at the very end of `code`
    is kept as-is instead of raising IndexError.
    """
    strings = []
    length = len(code)
    pos = 0
    while pos < length:
        if code[pos] == '"':
            chars = []  # characters of the new word
            pos += 1
            while pos < length and code[pos] != '"':
                if code[pos] == '\\':
                    chars.append('\\')
                    pos += 1
                    if pos >= length:
                        # Backslash was the last character: nothing to escape.
                        break
                chars.append(code[pos])
                pos += 1
            strings.append('"%s"' % ''.join(chars))
        pos += 1
    return strings
43 |
44 |
def detect(code):
    """Tell whether `code` starts with the array declaration
    (``var _0x... = [``) produced by JavascriptObfuscator.com."""
    found = re.search(r'^var _0x[a-f0-9]+ ?\= ?\[', code)
    # compare with None so that a real boolean is returned
    return found is not None
49 |
50 |
def unpack(code):
    """Replace every ``_0x...[N]`` lookup in JavascriptObfuscator.com packed
    code with the N-th string of the leading array declaration.

    Code that is not detected as packed is returned unmodified.
    """
    if not detect(code):
        return code
    matches = re.search(r'var (_0x[a-f\d]+) ?\= ?\[(.*?)\];', code)
    if not matches:
        return code
    variable = matches.group(1)
    entries = smartsplit(matches.group(2))
    # Drop as many leading characters as the declaration is long.
    code = code[len(matches.group(0)):]
    for index, literal in enumerate(entries):
        code = code.replace(r'%s[%s]' % (variable, index), literal)
    return code
62 |
--------------------------------------------------------------------------------
/peepdf/jsbeautifier/unpackers/myobfuscate.py:
--------------------------------------------------------------------------------
1 | #
2 | # deobfuscator for scripts messed up with myobfuscate.com
3 | # by Einar Lielmanis
4 | #
5 | # written by Stefano Sanfilippo
6 | #
7 | # usage:
8 | #
9 | # if detect(some_string):
10 | # unpacked = unpack(some_string)
11 | #
12 |
13 | # CAVEAT by Einar Lielmanis
14 |
15 | #
16 | # You really don't want to obfuscate your scripts there: they're tracking
17 | # your unpackings, your script gets turned into something like this,
18 | # as of 2011-08-26:
19 | #
20 | # var _escape = 'your_script_escaped';
21 | # var _111 = document.createElement('script');
22 | # _111.src = 'http://api.www.myobfuscate.com/?getsrc=ok' +
23 | # '&ref=' + encodeURIComponent(document.referrer) +
24 | # '&url=' + encodeURIComponent(document.URL);
25 | # var 000 = document.getElementsByTagName('head')[0];
26 | # 000.appendChild(_111);
27 | # document.write(unescape(_escape));
28 | #
29 |
30 | """Deobfuscator for scripts messed up with MyObfuscate.com"""
31 |
32 | import re
33 | import base64
34 |
35 | # Python 2 retrocompatibility
36 | # pylint: disable=F0401
37 | # pylint: disable=E0611
38 | try:
39 | from urllib import unquote
40 | except ImportError:
41 | from urllib.parse import unquote
42 |
43 | from jsbeautifier.unpackers import UnpackingError
44 |
45 | PRIORITY = 1
46 |
47 | CAVEAT = """//
48 | // Unpacker warning: be careful when using myobfuscate.com for your projects:
49 | // scripts obfuscated by the free online version call back home.
50 | //
51 |
52 | """
53 |
54 | SIGNATURE = (r'["\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F'
55 | r'\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65'
56 | r'\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75'
57 | r'\x76\x77\x78\x79\x7A\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x2B'
58 | r'\x2F\x3D","","\x63\x68\x61\x72\x41\x74","\x69\x6E\x64\x65\x78'
59 | r'\x4F\x66","\x66\x72\x6F\x6D\x43\x68\x61\x72\x43\x6F\x64\x65","'
60 | r'\x6C\x65\x6E\x67\x74\x68"]')
61 |
62 |
def detect(source):
    """Tell whether *source* contains the MyObfuscate.com SIGNATURE."""
    is_packed = SIGNATURE in source
    return is_packed
66 |
67 |
68 | def unpack(source):
69 | """Unpacks js code packed with MyObfuscate.com"""
70 | if not detect(source):
71 | return source
72 | payload = unquote(_filter(source))
73 | match = re.search(r"^var _escape\=''
204 | res = re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE)
205 | if res != []:
206 | self.javascript.append('\n'.join(res))
207 | else:
208 | self.javascript.append(content)
209 | return
210 |
211 | '''
212 | Check string for flash content
213 | '''
214 |
215 | def check_swf(self, content):
216 | if isFlash(content):
217 | self.swf.append(content)
218 | return
219 |
220 | '''
221 | Create an lxml tree from the xml string
222 | '''
223 |
224 | def tree_from_xml(self, xml):
225 | try:
226 | tree = ET.fromstring(xml)
227 | return tree
228 | except Exception as e:
229 | sys.stderr.write("xml_creator cannot create tree: %s\n" % e)
230 | return 'TREE_ERROR: %s' % str(e)
231 |
232 | '''
233 | Calls edges to recursively create the graph string
234 | '''
235 |
236 | def make_graph(self, tree):
237 | res = []
238 | # Explicit check for None to avoid FutureWarning
239 | if tree is not None:
240 | self.edges(tree, res, 0)
241 | return res
242 |
243 | def edges(self, parent, output, id):
244 | """
245 |
246 | creates string showing connections between objects
247 | """
248 | for child in list(parent):
249 | if isinstance(child, str):
250 | return
251 | elif child.get("id") != None:
252 | cid = child.get("id")
253 | output.append(str(id) + ' ' + cid + '\n')
254 | self.edges(child, output, cid)
255 | else:
256 | res = self.edges(child, output, id)
257 | return
258 |
259 |
if __name__ == "__main__":
    # Usage: <script> <input dir> <output dir>
    # Every entry of the input directory is parsed; when flash content is
    # found it is written to "<output dir>/<name>.swf". Processed names are
    # appended to done.txt (and skipped on later runs), parse errors to
    # error.txt.
    try:
        dirin = sys.argv[1]
        dirout = sys.argv[2]
    except IndexError:
        sys.exit(0)
    else:
        if not os.path.isdir(dirin) or not os.path.isdir(dirout):
            sys.exit(0)

    sys.stdout.write("%s/*.pdf --> %s/*.swf\n\n" % (dirin, dirout))

    fdone = ferr = None
    try:
        try:
            fdone = open(os.path.join(dirout, "done.txt"), 'a+')
            ferr = open(os.path.join(dirout, "error.txt"), 'a')
        except IOError as e:
            sys.stderr.write("parser done file error: %s\n" % e)
        else:
            # 'a+' may position the file at its end: rewind before reading
            # the names recorded by previous runs.
            fdone.seek(0)
            completed = set(line.rstrip() for line in fdone)

            for pdf in scandir(dirin):

                if pdf.name in completed:
                    sys.stdout.write("skipping: %s\n" % pdf.name)
                    continue

                sys.stdout.write("%s\n" % pdf.name)

                try:
                    parsed = FrankenParser(pdf.path)
                except Exception as e:
                    try:
                        ferr.write("%s:%s\n" % (pdf.name, str(e)))
                    except Exception:
                        ferr.write("%s: ferr write() BIG-TIME ERROR\n" % pdf.name)
                        sys.stderr.write("ferr write error pdf: %s := %s\n" % (pdf.name, e))
                else:
                    if parsed.swf:
                        try:
                            fout = open(os.path.join(dirout, "%s.swf" % pdf.name), 'wb')
                        except IOError as e:
                            sys.stderr.write("parser output file error: %s\n" % e)
                        else:
                            # Close the output file even if the write fails.
                            with fout:
                                fout.write(''.join(parsed.swf))
                finally:
                    # The file is recorded as done whether or not it parsed.
                    try:
                        fdone.write("%s\n" % pdf.name)
                    except Exception as e:
                        sys.stderr.write("fdone write error pdf: %s := %s\n" % (pdf.name, e))
                sys.stdout.write("\n")
    finally:
        # Release the bookkeeping files on every path, including the case
        # where only the first of them could be opened.
        for handle in (fdone, ferr):
            if handle is not None:
                handle.close()
318 |
--------------------------------------------------------------------------------