├── .gitignore ├── JSAnalysis.py ├── LICENSE.md ├── README.md ├── __init__.py ├── build_pdf_objects.py ├── cfg.py ├── db └── __init__.py ├── db_mgmt.py ├── huntterp.py ├── jobs └── __init__.py ├── pdfminer ├── LICENSE ├── __init__.py ├── arcfour.py ├── ascii85.py ├── ccitt.py ├── lzw.py ├── pdfdocument.py ├── pdfparser.py ├── pdftypes.py ├── psparser.py ├── runlength.py └── utils.py ├── pdfrankenstein.py ├── peepdf ├── AUTHORS ├── CHANGELOG ├── COPYING ├── JSAnalysis.py ├── PDFConsole.py ├── PDFCore.py ├── PDFCrypto.py ├── PDFFilters.py ├── PDFUtils.py ├── README ├── TODO ├── __init__.py ├── aes.py ├── aespython │ ├── __init__.py │ ├── aes_cipher.py │ ├── aes_tables.py │ ├── cbc_mode.py │ ├── cfb_mode.py │ ├── key_expander.py │ ├── ofb_mode.py │ └── test_keys.py ├── ccitt.py ├── colorama │ ├── PKG-INFO │ ├── __init__.py │ ├── ansi.py │ ├── ansitowin32.py │ ├── initialise.py │ ├── win32.py │ └── winterm.py ├── jjdecode.py ├── jsbeautifier │ ├── __init__.py │ └── unpackers │ │ ├── README.specs.mkd │ │ ├── __init__.py │ │ ├── evalbased.py │ │ ├── javascriptobfuscator.py │ │ ├── myobfuscate.py │ │ ├── packer.py │ │ └── urlencode.py ├── lzw.py ├── peepdf.dtd └── peepdf.py ├── scripts ├── __init__.py ├── clarify.py ├── ffdec.jar ├── mapper.py └── run-jpexs.py ├── sdhasher.py ├── storage.py ├── util ├── __init__.py ├── mapper.py └── str_utils.py └── xml_creator.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion 2 | 3 | *.iml 4 | 5 | ## Directory-based project format: 6 | .idea/ 7 | # if you remove the above rule, at least ignore the following: 8 | 9 | # User-specific stuff: 10 | # .idea/workspace.xml 11 | # .idea/tasks.xml 12 | # .idea/dictionaries 13 | 14 | # Sensitive or high-churn files: 15 | # .idea/dataSources.ids 16 | # .idea/dataSources.xml 17 | # .idea/sqlDataSources.xml 18 | # .idea/dynamic.xml 19 | # .idea/uiDesigner.xml 20 | 21 | # 
Gradle: 22 | # .idea/gradle.xml 23 | # .idea/libraries 24 | 25 | # Mongo Explorer plugin: 26 | # .idea/mongoSettings.xml 27 | 28 | ## File-based project format: 29 | *.ipr 30 | *.iws 31 | 32 | ## Plugin-specific files: 33 | 34 | # IntelliJ 35 | /out/ 36 | 37 | # mpeltonen/sbt-idea plugin 38 | .idea_modules/ 39 | 40 | # JIRA plugin 41 | atlassian-ide-plugin.xml 42 | 43 | # Crashlytics plugin (for Android Studio and IntelliJ) 44 | com_crashlytics_export_strings.xml 45 | crashlytics.properties 46 | crashlytics-build.properties 47 | 48 | # OSX 49 | .DS_Store 50 | .AppleDouble 51 | .LSOverride 52 | 53 | # Icon must end with two \r 54 | Icon 55 | 56 | 57 | # Thumbnails 58 | ._* 59 | 60 | # Files that might appear in the root of a volume 61 | .DocumentRevisions-V100 62 | .fseventsd 63 | .Spotlight-V100 64 | .TemporaryItems 65 | .Trashes 66 | .VolumeIcon.icns 67 | 68 | # Directories potentially created on remote AFP share 69 | .AppleDB 70 | .AppleDesktop 71 | Network Trash Folder 72 | Temporary Items 73 | .apdisk 74 | 75 | #Python 76 | # Byte-compiled / optimized / DLL files 77 | __pycache__/ 78 | *.py[cod] 79 | *$py.class 80 | 81 | # C extensions 82 | *.so 83 | 84 | # Distribution / packaging 85 | .Python 86 | env/ 87 | build/ 88 | develop-eggs/ 89 | dist/ 90 | downloads/ 91 | eggs/ 92 | .eggs/ 93 | lib/ 94 | lib64/ 95 | parts/ 96 | sdist/ 97 | var/ 98 | *.egg-info/ 99 | .installed.cfg 100 | *.egg 101 | 102 | # PyInstaller 103 | # Usually these files are written by a python script from a template 104 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
105 | *.manifest 106 | *.spec 107 | 108 | # Installer logs 109 | pip-log.txt 110 | pip-delete-this-directory.txt 111 | 112 | # Unit test / coverage reports 113 | htmlcov/ 114 | .tox/ 115 | .coverage 116 | .coverage.* 117 | .cache 118 | nosetests.xml 119 | coverage.xml 120 | *,cover 121 | 122 | # Translations 123 | *.mo 124 | *.pot 125 | 126 | # Django stuff: 127 | *.log 128 | 129 | # Sphinx documentation 130 | docs/_build/ 131 | 132 | # PyBuilder 133 | target/ 134 | 135 | 136 | #Vi 137 | [._]*.s[a-w][a-z] 138 | [._]s[a-w][a-z] 139 | *.un~ 140 | Session.vim 141 | .netrwhist 142 | *~ 143 | 144 | frankenstein.cfg 145 | *.txt 146 | *.csv 147 | *.sqlite* 148 | -------------------------------------------------------------------------------- /JSAnalysis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2011-2015 by Carnegie Mellon University 2 | # 3 | # NO WARRANTY 4 | # 5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE 6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON 7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR 8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY 9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS 10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY 11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM 12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT. 
try:
    import PyV8
except ImportError as e:
    print(str(e))
    PyV8 = None

import re

import build_pdf_objects
from util.str_utils import unescapeHTMLEntities

# Matches a <script> element whose contentType attribute is
# application/x-javascript and captures the element body.
# NOTE(review): in the copy this was restored from, the pattern started at
# "]*?contentType" and ended at "(.*?)" -- the "<script[^>" prefix and the
# "</script>" suffix look like they were stripped as HTML tags. Without them
# the capture group can only ever match the empty string. Confirm against the
# upstream peepdf pattern.
reJSscript = r'<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'


def _message(exc):
    """Return ``exc.message`` when the attribute exists, else ``str(exc)``."""
    msg = getattr(exc, 'message', None)
    return str(exc) if msg is None else msg


def _error_line(msg):
    """Return the line number found after '@ ' in an error message, or None."""
    found = re.findall(r"@\s(\d*?)\s", msg)
    if found and found[0]:
        return int(found[0])
    return None


def _offending_source(msg):
    """Return the text following '-> ' in an error message, or None."""
    found = re.findall(r"->\s(.*)", msg)
    if found and found[0]:
        return found[0]
    return None


def _comment_out_line(code, line_num):
    """Prefix line ``line_num`` (1-based) of ``code`` with '//'.

    The line is addressed by index instead of being searched for with a regex,
    so regex metacharacters or backslashes in the JavaScript cannot corrupt the
    substitution, and identical text elsewhere in the script is left alone.
    An out-of-range line number leaves ``code`` unchanged.
    """
    lines = code.split("\n")
    if 1 <= line_num <= len(lines):
        lines[line_num - 1] = "//" + lines[line_num - 1]
    return "\n".join(lines)


def create_objs(context, tree):
    """
    Mimic native Adobe objects and add them to the context.

    Each object (app, info, event) is built independently and on a best-effort
    basis: a failure while building one of them does not prevent the others
    from being created.

    :param context: JavaScript context, like a namespace at runtime
    :param tree: XML tree of the pdf to reference objects
    :return: None
    """
    try:
        app = build_pdf_objects.create_app_obj(tree)
        context.eval("app = " + str(app) + ";")
        context.eval("app.doc.syncAnnotScan = function () {}")
        context.eval("app.doc.getAnnots = function () { return app.doc.annots;}")
        context.eval("app.eval = function (string) { eval(string);}")
        context.eval("app.newDoc = function () { return '';}")
        context.eval("app.getString = function () { ret = \"\"; for(var prop in app){ ret += app[prop]; } return ret;}")
    except Exception:
        # Deliberate best effort: run the script without an app object.
        pass
    try:
        info = build_pdf_objects.create_info_obj(tree)
        context.eval("this.info = " + str(info) + ";")
        for key in info:
            context.eval("this." + key + "= '" + re.escape(info[key]) + "';")
        context.eval("this.eval = eval")
    except Exception as e:
        # %-formatting instead of '+' so a non-string message cannot raise a
        # second exception from inside this handler.
        print("Info: %s" % (_message(e),))
    try:
        event = build_pdf_objects.create_event_obj(tree)
        context.eval("event = " + str(event) + ";")
        context.eval("event.target.info = this.info")
    except Exception:
        # Deliberate best effort: run the script without an event object.
        pass


def eval_loop(code, context, old_msg="", limit=10):
    """
    Eval the code and try to repair it when it throws.

    After every failed evaluation the code (or the context) is patched and the
    function recurses. Recursion stops when the evaluation succeeds, when the
    same error message is seen twice in a row, or when no repair is known for
    the error.

    :param code: String of code to evaluate
    :param context: JavaScript context object
    :param old_msg: Error message of the previous attempt
    :param limit: Recursive limit. NOTE(review): accepted for interface
        compatibility but not enforced -- the recursive calls never passed it
        on; enforcing it would change how many repairs are attempted.
    :return: Whatever the context holds in ``evalCode``
    """
    try:
        context.eval(code)
        return context.eval("evalCode")
    # catch exceptions and attempt to fix them
    except ReferenceError as e:
        msg = _message(e)
        if msg == old_msg:
            return context.eval("evalCode")
        if '$' in msg:
            context.eval("$ = this;")
        else:
            # try commenting out the line that raised
            line_num = _error_line(msg)
            if line_num is None:
                return context.eval("evalCode")
            code = _comment_out_line(code, line_num)
        return eval_loop(code, context, msg)
    except TypeError as te:
        msg = _message(te)
        if msg == old_msg:
            return context.eval("evalCode")
        line = _offending_source(msg)
        if line is None:
            return context.eval("evalCode")
        if "called on null or undefined" in msg:
            # in Adobe undefined objects become app object
            sub, count = re.subn(r"=\s?.\(.*?\)", "=app", line)
            if count < 1:
                sub = re.sub("=.*", "=app", line)
        elif "undefined is not a function" in msg:
            # sub in eval as a guess
            match = re.findall(r"[\s=]?(.*?)\(", line)
            if not match or not match[0]:
                return context.eval("evalCode")
            sub = line.replace(match[0], "eval")
        elif "Cannot read property" in msg:
            # undefined becomes app
            match = re.findall(r"[=\s](.*?)\[", line)
            if not match or not match[0]:
                return context.eval("evalCode")
            sub = line.replace(match[0], "app")
        else:
            return context.eval("evalCode")
        # Plain string replacement: the JavaScript text is data, not a regex
        # pattern or a replacement template.
        code = code.replace(line, sub)
        return eval_loop(code, context, msg)
    except SyntaxError as se:
        msg = _message(se)
        if msg == old_msg:
            return context.eval("evalCode")
        # try commenting out the line number with the error
        line_num = _error_line(msg)
        if line_num is None:
            return context.eval('evalCode')
        code = _comment_out_line(code, line_num)
        return eval_loop(code, context, msg)
    except Exception:
        return context.eval("evalCode")


def analyse(js, tree):
    """
    Main function called from pdfrankenstein. Analyzes javascript in order to deobfuscate the code.

    ``eval`` is replaced inside the JavaScript context by a function that
    appends its argument to ``evalCode`` instead of executing it; the
    accumulated ``evalCode`` is what gets returned.

    :param js: String of code to analyze
    :param tree: Tree xml object to use as reference for objects called from the code.
    :return: String of deobfuscated code, or '' when PyV8 is unavailable or the analysis fails
    """
    if not PyV8:
        return ''
    with PyV8.JSIsolate():
        context = PyV8.JSContext()
        context.enter()
        try:
            context.eval('evalCode = \'\';')
            context.eval('evalOverride = function (expression) { evalCode += expression; return;}')
            context.eval('eval=evalOverride')
            try:
                if tree is not None:
                    create_objs(context, tree)
                ret = eval_loop(js, context)
            except Exception:
                return ''
            return '' if ret is None else ret
        finally:
            # Leave the context on every path, including unexpected errors.
            context.leave()


def isJavascript(content):
    """
    Given an string this method looks for typical Javscript strings and try to identify if the string contains Javascript code or not.

    :param content: A string
    :return: A boolean, True if it seems to contain Javascript code or False in the other case
    """
    JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval', 'unescape', '.replace']
    keyStrings = [';', '(', ')']
    stringsFound = []
    limit = 15
    minDistinctStringsFound = 5
    results = 0
    content = unescapeHTMLEntities(content)
    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True
    # Any control character (other than common whitespace / NUL) or any
    # character at or above 127 disqualifies the content.
    for char in content:
        if (ord(char) < 32 and char not in ['\n', '\r', '\t', '\f', '\x00']) or ord(char) >= 127:
            return False

    for string in JSStrings:
        cont = content.count(string)
        results += cont
        if cont > 0 and string not in stringsFound:
            stringsFound.append(string)
        elif cont == 0 and string in keyStrings:
            # ';', '(' and ')' must all be present
            return False

    return results > limit and len(stringsFound) >= minDistinctStringsFound
-------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Use of PDFrankenstein and related source code is subject to the terms 2 | of the following licenses: 3 | 4 | GNU General Public License (GPL) Rights pursuant to Version 2, June 1991 5 | Government Purpose License Rights (GPLR) pursuant to DFARS 252.227.7013 6 | 7 | NO WARRANTY 8 | 9 | ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER 10 | PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY 11 | PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN 12 | "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY 13 | KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT 14 | LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE, 15 | MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE 16 | OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT, 17 | SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY 18 | TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF 19 | WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES. 20 | LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF 21 | CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON 22 | CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE 23 | DELIVERABLES UNDER THIS LICENSE. 
24 | 25 | Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie 26 | Mellon University, its trustees, officers, employees, and agents from 27 | all claims or demands made against them (and any related losses, 28 | expenses, or attorney's fees) arising out of, or relating to Licensee's 29 | and/or its sub licensees' negligent use or willful misuse of or 30 | negligent conduct or willful misconduct regarding the Software, 31 | facilities, or other rights or assistance granted by Carnegie Mellon 32 | University under this License, including, but not limited to, any 33 | claims of product liability, personal injury, death, damage to 34 | property, or violation of any laws or regulations. 35 | 36 | Carnegie Mellon University Software Engineering Institute authored 37 | documents are sponsored by the U.S. Department of Defense under 38 | Contract FA8721-05-C-0003. Carnegie Mellon University retains 39 | copyrights in all material produced under this contract. The U.S. 40 | Government retains a non-exclusive, royalty-free license to publish or 41 | reproduce these documents, or allow others to do so, for U.S. 42 | Government purposes only pursuant to the copyright license under the 43 | contract clause at 252.227.7013. 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PDFrankenstein 2 | ================ 3 | Python tool for bulk malicious PDF feature extraction. 4 | 5 | Dependencies 6 | ------------ 7 | * PyV8 (and V8) (optional: if you intend to use JS deobfuscation. Note: JS deobfuscation needs to be run in a safe environment, as you would treat any malware. 
 8 | * lxml 9 | * [scandir](https://github.com/benhoyt/scandir) (optional: module included in lib folder) 10 | * postgresql and psycopg2 (optional: if you intend to use postgresql backing storage) 11 | 12 | 13 | Usage 14 | ----- 15 | 16 | ``` 17 | $ pdfrankenstein.py --help 18 | ``` 19 | 20 | Output to a file in delimited plain text, parses ALL files in pdf-dir/ 21 | ``` 22 | $ pdfrankenstein.py -o file -n fileoutput.txt ~/pdf-dir 23 | ``` 24 | 25 | Output to an sqlite database 26 | ``` 27 | $ pdfrankenstein.py -o sqlite3 -n pdf-db ~/pdf-dir 28 | ``` 29 | 30 | Output to stdout after parsing all files listed inside file-with-pdfs 31 | ``` 32 | $ pdfrankenstein.py -o stdout ~/file-with-pdfs 33 | ``` 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |
`pdf_in`: PDF input for analysis. Can be a single PDF file or a directory of files.
`-d, --debug`: Print debugging messages.
`-o, --out`: Analysis output filename or type. Defaults to an 'unnamed-out.*' file in the CWD. Options: 'sqlite3' || 'postgres' || 'stdout' || [filename]
`-n, --name`: Name for the output database.
`--hasher`: Specify which type of hasher to use: PeePDF | PDFMiner (default). The PDFMiner option provides better parsing capabilities.
`-v, --verbose`: Spam the terminal, TODO.
59 | 60 | References 61 | ------------- 62 | ### Open Source PDF Tools 63 | * [PeePDF](http://eternal-todo.com/tools/peepdf-pdf-analysis-tool) 64 | * [PDFMiner](http://www.unixuser.org/~euske/python/pdfminer/index.html) 65 | * [swf mastah](https://github.com/9b/pdfxray_public/blob/master/builder/swf_mastah.py) 66 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/__init__.py -------------------------------------------------------------------------------- /build_pdf_objects.py: -------------------------------------------------------------------------------- 1 | # Copyright 2011-2015 by Carnegie Mellon University 2 | # 3 | # NO WARRANTY 4 | # 5 | # THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE 6 | # MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON 7 | # UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR 8 | # IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY 9 | # OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS 10 | # OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY 11 | # DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM 12 | # FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT. 
import re

from util.str_utils import unescapeHTMLEntities


def _sized_children(elem):
    """Return the children of ``elem``, capped at its declared ``size``.

    The ``size`` attribute may carry '%' characters, which are dropped before
    it is parsed. When the attribute is absent every child is returned, and a
    declared size larger than the real child count is tolerated.
    """
    children = list(elem)
    size = elem.get("size")
    if size is None:
        return children
    return children[:int(re.sub("%", "", size))]


# Determine the type of tag used and return its value accordingly
def get_value(elem, root):
    """Convert an XML element of the parsed PDF into a Python value.

    :param elem: element whose ``tag`` selects the conversion
    :param root: tree searched when ``elem`` is a ``ref`` to another object
    :return: a string for literal/number/keyword/string/stream elements, a
        dict or list for the container elements, the referenced object's value
        for ``ref``, and None for unsupported tags or unresolved references
    """
    tag = elem.tag
    if tag in ("literal", "number", "keyword"):
        return unescapeHTMLEntities(elem.text)
    if tag == "string":
        return unescapeHTMLEntities(elem.text.decode('base64'))
    if tag == "ref":
        # find the referenced object and return its value
        obj = get_ref_object(elem.get('id'), root)
        if obj is None or len(obj) == 0:
            return None
        return get_value(obj[0], root)
    if tag == "stream":
        return unescapeHTMLEntities(elem[1].text.decode('base64'))
    if tag == "dict":
        # build the dictionary: each entry element is named after its key and
        # wraps the element holding the value
        ret = {}
        for entry in _sized_children(elem):
            if len(entry) == 0:
                continue
            val = get_value(entry[0], root)
            if val is not None:
                ret[entry.tag] = val
        return ret
    if tag == "list":
        # build the list
        ret = []
        for item in _sized_children(elem):
            val = get_value(item, root)
            if val is not None:
                ret.append(val)
        return ret
    # some tags not accounted for: Rect, field, xfa, Media, etc
    return None


# find the object referenced in another object
def get_ref_object(id, root):
    """Return the first ``object`` element under ``root`` whose id equals ``id``.

    Every object is examined; None is returned only when none of them matches.
    """
    for obj in root.iterfind(".//object"):
        if obj.get("id") == id:
            return obj
    return None


# Get any annotation objects in the PDF and store in the app object
def get_annots(app, root):
    """Append the value of every referenced annotation to ``app['doc']['annots']``.

    Annotation dictionaries have their ``Subj`` key renamed to ``subject``.
    References that cannot be resolved, and annotations that convert to None,
    are skipped instead of aborting the scan.

    :param app: dict that already contains ``app['doc']['annots']`` (a list)
    :param root: tree to search for ``Annots`` elements
    """
    for annot in root.iterfind(".//Annots"):
        if len(annot) == 0:
            continue
        for ref in annot[0]:
            obj = get_ref_object(ref.get("id"), root)
            if obj is None or len(obj) == 0:
                continue
            new = get_value(obj[0], root)
            if new is None:
                continue
            if isinstance(new, dict) and "Subj" in new:
                new["subject"] = new.pop("Subj")
            app['doc']['annots'].append(new)
def _collect_attrs(tree, names):
    """Look up each name in ``tree`` and return the values that were found.

    The element searched for is the name with its first letter upper-cased.
    Elements without a child are skipped; when several elements share a name
    the last one with a non-None value wins.
    """
    found = {}
    for name in names:
        for elem in tree.iterfind('.//' + name[0].upper() + name[1:]):
            if len(elem) == 0:
                continue
            val = get_value(elem[0], tree)
            if val is not None:
                found[name] = val
    return found


# Mimic the Adobe event object by parsing the PDF for commonly found attributes
def create_event_obj(tree):
    """Build the ``event`` dict; the attributes found are stored under 'target'."""
    event_attrs = ["author", "calculate", "creator", "creationDate", "delay", "dirty", "external", "filesize",
                   "keywords", "modDate", "numFields", "numPages", "numTemplates", "path", "pageNum", "producer",
                   "subject", "title", "zoom", "zoomType"]
    return {"target": _collect_attrs(tree, event_attrs)}


# Mimic the Adobe app object by parsing the PDF for commonly found attributes
def create_app_obj(tree):
    """Build the ``app`` dict.

    Attributes found in the PDF are stored under ``app['doc']``; reader
    dependent values are filled in with fixed defaults, and the annotations
    are collected into ``app['doc']['annots']``.
    """
    app_attrs = ["calculate", "formsVersion", "fullscreen", "language", "numPlugins", "openInPlace", "platform",
                 "toolbar", "toolbarHorizontal", "toolbarVertical"]
    app = {'doc': _collect_attrs(tree, app_attrs)}

    # Many app values are dependent on the reader
    # set some common defaults here
    app['doc']['viewerType'] = 'Reader'
    app['viewerType'] = 'Reader'
    app['viewerVersion'] = 5.0
    app['plugIns'] = [{'version': 6.0}, {'version': 7.5}, {'version': 8.7}, {'version': 9.1}, {'version': 10}]
    if 'language' not in app:
        app['language'] = "ENU"
    if 'platform' not in app:
        app['platform'] = "WIN"

    # store the annotation objects so they can be retrieved later
    app['doc']['annots'] = []
    get_annots(app, tree)
    return app


# Mimic the Adobe info object by parsing the PDF for commonly found attributes
def create_info_obj(tree):
    """Build the ``info`` dict from the document information attributes found."""
    info_attrs = ["author", "creator", "creationDate", "Date", "keywords", "modDate", "producer", "subject", "title",
                  "trapped"]
    return _collect_attrs(tree, info_attrs)

# --------------------------------------------------------------------------------
# /cfg.py:
# --------------------------------------------------------------------------------
# Copyright 2011-2015 by Carnegie Mellon University
#
# NO WARRANTY
#
# THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
# MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
# UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
# IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
# OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
# OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
# DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
# FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
import os
import sys

try:
    from ConfigParser import SafeConfigParser
except ImportError:
    # Python 3: the module is named configparser and ConfigParser is the
    # replacement for SafeConfigParser.
    from configparser import ConfigParser as SafeConfigParser

DEFAULT_CFG = 'frankenstein.cfg'


class Config(object):
    """Settings read from an INI style configuration file.

    When the requested file cannot be read, a file with default values is
    written to ``DEFAULT_CFG`` in the current working directory and the
    process exits so the defaults can be reviewed.
    """

    def __init__(self, path='', name=''):
        """Load ``name`` (or ``DEFAULT_CFG``) from ``path``.

        :param path: directory holding the configuration file
        :param name: file name; ``DEFAULT_CFG`` when empty
        """
        cfg_file = os.path.join(path, name if name else DEFAULT_CFG)
        self.parser = SafeConfigParser()
        if not self.parser.read(cfg_file):
            print('No configuration file found: %s' % cfg_file)
            self.new_cfg()

    def new_cfg(self):
        """Write a default configuration to the CWD, then exit with status 0."""
        self.section_gen()
        self.section_db()
        with open(DEFAULT_CFG, 'w') as new_cfg:
            print('Creating new config file in CWD: %s' % DEFAULT_CFG)
            print('Please double check the default values before running again:')
            print(self)
            self.parser.write(new_cfg)
        sys.exit(0)

    def section_gen(self):
        """Add the 'general' section with its default output setting."""
        sec = 'general'
        self.parser.add_section(sec)
        # The '#output' key is written to the file as a commented-out
        # alternative value for 'output'.
        self.parser.set(sec, '#output', 'sqlite3')
        self.parser.set(sec, 'output', 'stdout')

    def section_db(self):
        """Add the 'database' section with its default values."""
        sec = 'database'
        self.parser.add_section(sec)
        self.parser.set(sec, 'path', os.getcwd())
        self.parser.set(sec, 'user', 'frankenstein')
        # NOTE(review): hard-coded default credential written to every newly
        # generated config file; it should be changed before real use.
        self.parser.set(sec, 'pw', 'PuttinOnTheRitz')
        self.parser.set(sec, 'db', 'frankenstein.sqlite')

    def setting(self, section='', option=''):
        """Return the value of ``option``, or None when it is not configured.

        :param section: section to look in; when empty, every section is
            searched in order and the first match is returned
        :param option: name of the option
        """
        if not section:
            for sect in self.parser.sections():
                if self.parser.has_option(sect, option):
                    return self.parser.get(sect, option)
            return None
        if self.parser.has_option(section, option):
            return self.parser.get(section, option)
        return None

    def __str__(self):
        parts = []
        for sect in self.parser.sections():
            parts.append('Section: %s\n' % sect)
            for opt in self.parser.options(sect):
                parts.append('\t%s\t=\t%s\n' % (opt, self.parser.get(sect, opt)))
        return ''.join(parts)


if __name__ == '__main__':
    cfg = Config()
    print(cfg)

# --------------------------------------------------------------------------------
# /db/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/db/__init__.py
# --------------------------------------------------------------------------------
# /db_mgmt.py:
# --------------------------------------------------------------------------------
# Copyright 2011-2015 by Carnegie Mellon University
#
# NO WARRANTY
#
# THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
# MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
# UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
# IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
# OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
# OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
# DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
# FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.

import os
import sys
import sqlite3

import cfg


class DBGateway(object):
    """Thin wrapper around one sqlite3 connection and cursor.

    Methods that return a boolean store the text of the last failure in
    ``self.error``; read (and clear) it with :meth:`get_error`.
    """

    def __init__(self, db='', path=''):
        """Resolve the database location and connect to it.

        :param db: database file name; empty uses the configured name, and the
            special value 'test' uses 'testdb.sqlite' in the CWD
        :param path: directory of the database; empty uses the configured path

        Exits the process with status 1 when the directory or name is missing
        or the directory does not exist.
        """
        self.error = ''
        self.cfg = cfg.Config()

        if not db:
            self.db_dir = self.cfg.setting('database', 'path')
            self.db_name = self.cfg.setting('database', 'db')
        elif db == 'test':
            # '==' rather than 'is': identity of string literals is an
            # implementation detail.
            self.db_dir = os.getcwd()
            self.db_name = 'testdb.sqlite'
        else:
            if not path:
                self.db_dir = self.cfg.setting('database', 'path')
            else:
                self.db_dir = path
            self.db_name = db

        if not self.db_dir or not os.path.isdir(self.db_dir) or not self.db_name:
            sys.stderr.write("GError in database path or name. Check frankenstein.cfg file\n")
            sys.exit(1)

        self.db_path = os.path.join(self.db_dir, self.db_name)
        print('DBGateway connecting: %s' % self.db_path)
        self.connect(self.db_path)

    def query(self, cmd, params=''):
        """Execute ``cmd`` (with ``params`` when given) and commit.

        :return: True on success, False on failure (see :meth:`get_error`)
        """
        try:
            if params:
                self.db_curr.execute(cmd, params)
            else:
                self.db_curr.execute(cmd)
            self.commit()
            return True
        except Exception as e:
            self.error = str(e)
            return False

    def queryblock(self, cmd, params='', n=30):
        """Execute ``cmd`` up to ``n`` times until it succeeds; does not commit.

        :return: True once an attempt succeeded, False after ``n`` failures
        """
        done = False
        tries = 0
        while not done and tries < n:
            tries += 1
            try:
                if params:
                    self.db_curr.execute(cmd, params)
                else:
                    self.db_curr.execute(cmd)
            except Exception as e:
                self.error = str(e)
            else:
                done = True
        return done

    def get_error(self):
        """Return the last stored error text and reset it to ''."""
        err = self.error
        self.error = ''
        return err

    def attach(self, db_name):
        """Attach the database file ``db_name`` under the schema name ``db_name``.

        NOTE(review): this used to read the directory from
        ``config.SETTINGS.get('DB_DIR')``, but no ``config`` name exists in
        this module, so every call raised NameError. The gateway's own
        database directory is used instead -- confirm that is the intended
        location.
        """
        db_file = os.path.join(self.db_dir, db_name)
        # The file name is bound as a parameter; the schema name is an
        # identifier and cannot be.
        self.db_curr.execute('ATTACH DATABASE ? AS ' + db_name, (db_file,))
        self.db_conn.commit()

    def has_table(self, table):
        """Return how many tables are named ``table``, or None if the query fails."""
        cmd = "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?"
        if self.query(cmd, (table,)):
            return self.db_curr.fetchone()[0]
        return None

    def create_table(self, table, **kwargs):
        """Create ``table`` unless it already exists.

        With ``select``/``from``/``where``/``is`` keyword arguments the table
        is created from a SELECT; otherwise from ``cols`` and ``primary``.

        :raises TypeError: when required keyword arguments are missing
        :raises sqlite3.OperationalError: when sqlite rejects the statement
        """
        try:
            kwargs = self.format_args(**kwargs)
            cmd = 'CREATE TABLE IF NOT EXISTS ' + table
            if kwargs.get('select'):
                cmd += ' AS SELECT ' + kwargs.get('select') + ' FROM ' + kwargs.get('from') + ' WHERE ' + kwargs.get(
                    'where') + '=' + kwargs.get('is')
            else:
                cmd += ' (' + kwargs.get('cols') + ', PRIMARY KEY(' + kwargs.get('primary') + '))'
        except TypeError:
            print('Invalid arguments passed to database gateway: %s' % (kwargs,))
            raise
        else:
            try:
                self.db_curr.execute(cmd)
            except sqlite3.OperationalError as error:
                print('Invalid operation in database gateway: %s' % (error,))
                print('Occurred during cmd: %s' % cmd)
                raise
            else:
                self.db_conn.commit()

    def connect(self, path):
        """Open ``path`` with a 30 second busy timeout and create the cursor.

        On failure the error is written to stderr and None is returned without
        setting ``db_conn`` / ``db_curr``.
        """
        try:
            self.db_conn = sqlite3.connect(path, 30)
        except Exception as e:
            sys.stderr.write("DBGateway connect: %s\n" % e)
            return None
        self.db_conn.text_factory = str
        self.db_conn.row_factory = sqlite3.Row
        self.db_curr = self.db_conn.cursor()

    def commit(self):
        """Commit the current transaction."""
        self.db_conn.commit()

    def disconnect(self):
        """Commit and close the connection."""
        self.commit()
        self.db_conn.close()

    def drop_tables(self):
        """Drop every table listed in sqlite_master."""
        self.db_curr.execute("SELECT name FROM sqlite_master WHERE type='table'")
        for row in self.db_curr.fetchall():
            self.drop(row[0])

    def drop(self, name):
        """Drop the table ``name`` if it exists and commit."""
        self.db_curr.execute("DROP TABLE IF EXISTS " + name)
        self.db_conn.commit()

    def format_args(self, **kwargs):
        """Join list/tuple ``primary`` and ``cols`` values into SQL fragments.

        Also sets ``subs`` to one '?' placeholder per column (a single '?'
        when ``cols`` is not a list or tuple).

        :return: the adjusted keyword dictionary
        """
        if isinstance(kwargs.get('primary'), (tuple, list)):
            kwargs['primary'] = ', '.join(kwargs['primary'])
        if isinstance(kwargs.get('cols'), (tuple, list)):
            kwargs['subs'] = ', '.join(['?'] * len(kwargs['cols']))
            kwargs['cols'] = ', '.join(kwargs['cols'])
        else:
            kwargs['subs'] = '?'
        return kwargs

    def insert(self, table, **kwargs):
        """INSERT OR REPLACE the values ``vals`` into the columns ``cols`` of ``table``.

        :return: True on success, False on failure (see :meth:`get_error`)
        """
        kwargs = self.format_args(**kwargs)
        cmd = 'INSERT OR REPLACE INTO ' + table + '(' + kwargs.get('cols') + ') VALUES (' + kwargs.get('subs') + ')'
        try:
            self.db_curr.execute(cmd, kwargs.get('vals'))
            self.db_conn.commit()
        except Exception as e:
            self.error = repr(e)
            return False
        else:
            return True

    def select(self, cmd_str):
        """Execute 'SELECT ' + ``cmd_str`` and return the cursor."""
        cmd = 'SELECT %s' % cmd_str
        self.db_curr.execute(cmd)
        return self.db_curr

    def count(self, table, key, val):
        """Return the number of rows of ``table`` whose column ``key`` is ``val``.

        ``val`` is bound as a parameter in the same text form that used to be
        interpolated into the statement, so quotes in it can no longer break
        or alter the query.
        """
        cmd = "SELECT COUNT (*) FROM %s WHERE %s IS ?" % (table, key)
        self.db_curr.execute(cmd, ('%s' % (val,),))
        return self.db_curr.fetchone()[0]

    def update(self, dic):
        """Set column ``col`` to ``val`` on the rows where ``key`` equals ``kval``.

        :param dic: mapping with the keys 'tbl', 'col', 'val', 'key', 'kval'
        :return: True on success, False on failure (see :meth:`get_error`)
        :raises KeyError: when one of the keys is missing from ``dic``
        """
        cmd = "UPDATE {tbl} SET {col} = ? WHERE {key} = ?".format(**dic)
        # Values are bound in the text form previously interpolated between
        # single quotes.
        params = ('{val}'.format(**dic), '{kval}'.format(**dic))
        print(cmd)
        try:
            self.db_curr.execute(cmd, params)
            self.db_conn.commit()
        except Exception as e:
            self.error = str(e)
            return False
        else:
            return True

    def delete(self, *ids):
        """Not implemented."""
        pass

    def dump(self, n=0):
        """Print the SQL dump of the database; stop before statement ``n`` when n > 0."""
        print(':MEMORY DB DUMP:')
        cnt = 0
        for val in self.db_conn.iterdump():
            cnt += 1
            if 0 < n <= cnt:
                break
            print(val)
        print(':MEMORY DB DUMP END:')

# --------------------------------------------------------------------------------
# /huntterp.py:
# --------------------------------------------------------------------------------
# Copyright 2011-2015 by Carnegie Mellon University
#
# NO WARRANTY
#
# THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
# MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.
# CARNEGIE MELLON
# UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
# IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
# OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
# OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY
# DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
# FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.

import sys
import re

'''
For testing run the module without arguments. (Can also be run on arbitrary files.)

'''


class Test(object):
    """Built-in sample haystacks, keyed by the needle to search for."""
    tests = ['ftp', 'http']
    ftp = "6674703a2f2f676f6f676c652e636f6d"
    http = "6674703a2f2f676f6f676c652e636f6d687474703a2f2f676f6f676c652e636f6df1"


def ascii2hex(string):
    """Return the hex encoding of `string`, two digits per character.

    Makes no assumptions about the validity of the characters. Anything
    that is not a `str` yields ''.
    """
    if isinstance(string, str):
        # '%02x' keeps the leading zero for characters below 0x10, so every
        # character occupies exactly one byte (two digits) in the output.
        return ''.join(['%02x' % ord(c) for c in string])
    return ''


def hex2ascii(string):
    """Convert a string from hex to ascii.

    Decoding starts at the first position and stops at the first
    non-printable character or invalid input, whichever comes first.
    A trailing unpaired digit is ignored.
    """
    letters = []
    for idx in range(0, len(string) - 1, 2):
        try:
            i = int(string[idx:idx + 2], 16)
        except (ValueError, TypeError):
            break
        # Printable ASCII is 0x20..0x7e; 0x7f (DEL) is a control character.
        if not 32 <= i < 127:
            break
        letters.append(chr(i))
    return ''.join(letters)


def get_unicode(h2):
    """Return the regex matches for quoted runs of %uXXXX escapes in `h2`."""
    return re.findall('[\'\"]((%u[0-9a-f]{4})*)[\'\"]', h2)


def find_in_hex(needle, hexstack):
    """Return a list of (offset, text) pairs found in the hex string.

    `needle` is ASCII and is converted to hex before searching. Matches do
    not overlap: the search resumes after the end of the previous match.
    `text` is the printable run decoded from the match position onwards.
    """
    needle = ascii2hex(needle)
    results = []
    if not needle:
        # str.find('') matches everywhere, which would never terminate.
        return results
    pos = hexstack.find(needle)
    while pos >= 0:
        results.append((pos, hex2ascii(hexstack[pos:])))
        pos = hexstack.find(needle, pos + len(needle))
    return results


def verify(vals, string):
    """Re-decode `string` at each reported offset and print pass/fail.

    `vals` is a list of (offset, text) pairs as returned by find_in_hex or
    find_unicode. A hit passes when the data at `offset` decodes, as hex or
    as %u escapes, to something starting with `text`.
    """
    for (offset, text) in vals:
        sys.stdout.write('Verifying [%s] @ [%d]...' % (text, offset))
        tail = string[offset:]
        if hex2ascii(tail).startswith(text) or uni2ascii(tail).startswith(text):
            sys.stdout.write('pass\n')
        else:
            sys.stdout.write('fail. string[%d]==[%s]...\n' % (offset, string[offset:offset + 32]))


def find_unicode(needle, haystack):
    """Return a list of (offset, url) pairs found in the %u-escaped string.

    `needle` is ASCII and is converted to %u escapes before searching.
    Each hit runs up to the nearest following quote character, or to the
    end of `haystack` when no quote follows. Matches do not overlap.
    """
    needle = ascii2uni(needle)
    results = []
    if not needle:
        # Needles shorter than two characters encode to '' (see ascii2uni).
        return results
    pos = haystack.find(needle)
    while pos >= 0:
        quotes = [q for q in (haystack.find('"', pos), haystack.find('\'', pos)) if q >= 0]
        end = min(quotes) if quotes else len(haystack)
        results.append((pos, uni2ascii(haystack[pos:end])))
        pos = haystack.find(needle, pos + len(needle))
    return results


def ascii2uni(string):
    """Convert a string from ascii to %uXXXX escapes.

    Characters are taken two at a time and byte-swapped inside each escape;
    an unpaired final character is dropped.
    """
    pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', ascii2hex(string))
    return ''.join(['%u' + second + first for (first, second) in pairs])


def uni2ascii(string):
    """Convert a string from %uXXXX escapes to ascii (see hex2ascii)."""
    string = re.sub("%u", "", string)
    pairs = re.findall('([0-9a-f]{2})([0-9a-f]{2})', string)
    return hex2ascii(''.join([second + first for (first, second) in pairs]))


def main(h1, h2):
    """Find h1 in h2 | h1 == ASCII && h2 == HEX. Prints the hits found."""
    # Single-argument print(...) produces the same output as the print
    # statement on Python 2 and also parses on Python 3.
    if not isinstance(h2, str):
        print('Invalid input: %s' % (type(h2),))
        print(str(h2))
        return

    print('Searching for "%s" in "%s"...' % (h1, h2[:32]))

    urls = find_in_hex(h1, h2)
    urls += find_unicode(h1, h2)
    print(urls)
    print('Found: %d occurrences' % len(urls))
    if urls:
        verify(urls, h2)


if __name__ == "__main__":
    try:
        needle = sys.argv[1]
        path = sys.argv[2]
    except IndexError:
        print('Invalid or no arguments. Usage: huntterp.py needle haystack.txt')
        print('Beginning tests')
        t = Test()
        for needle in t.tests:
            haystack = getattr(t, needle)
            main(needle, haystack)
    else:
        try:
            # The with-block closes the file even if reading fails.
            with open(path, 'r') as fin:
                contents = fin.read()
        except IOError as e:
            print(e)
        else:
            main(needle, contents)

# --------------------------------------------------------------------------------
# /jobs/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/jobs/__init__.py
# --------------------------------------------------------------------------------
# /pdfminer/LICENSE:
# --------------------------------------------------------------------------------
# Copyright (c) 2004-2016 Yusuke Shinyama
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the
# Software is
# furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# --------------------------------------------------------------------------------
# /pdfminer/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/pdfminer/__init__.py
# --------------------------------------------------------------------------------
# /pdfminer/arcfour.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python

""" Python implementation of Arcfour encryption algorithm.

This code is in the public domain.

"""


## Arcfour
##
class Arcfour(object):
    """Arcfour (RC4) stream cipher; encryption and decryption are the same.

    The cipher state advances with every call to process(), so a fresh
    instance is needed to decrypt what another instance encrypted.

    >>> Arcfour(b'Key').process(b'Plaintext').encode('hex')
    'bbf316e8d940af0ad3'
    >>> Arcfour(b'Wiki').process(b'pedia').encode('hex')
    '1021bf0420'
    >>> Arcfour(b'Secret').process(b'Attack at dawn').encode('hex')
    '45a01f645fc35b383552544b9bf5'
    """

    def __init__(self, key):
        """Run the key schedule for `key` (a byte string)."""
        # bytearray yields integers on both Python 2 and 3.
        key = bytearray(key)
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)

    def process(self, data):
        """XOR `data` with the next len(data) keystream bytes and return it."""
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect into a bytearray: appending is O(1) per byte.
        out = bytearray()
        for c in bytearray(data):
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i] + s[j]) % 256]
            out.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(out)

    encrypt = decrypt = process


new = Arcfour

# test
if __name__ == '__main__':
    import doctest

    doctest.testmod()

# --------------------------------------------------------------------------------
# /pdfminer/ascii85.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python

""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).

This code is in the public domain.

"""

import re
import struct


# ascii85decode(data)
def ascii85decode(data):
    """
    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    Characters outside '!'..'u', 'z' and '~' are skipped; decoding stops
    at the first '~'.

    The sample string is taken from:
    http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q')
    'Man is distinguished'
    >>> ascii85decode(b'E,9)oF*2M7/c~>')
    'pleasure.'
    """
    n = b = 0
    out = bytearray()
    for c in bytearray(data):
        if 33 <= c <= 117:  # '!' .. 'u'
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                out += struct.pack('>L', b)
                n = b = 0
        elif c == 122:  # 'z' stands for four zero bytes
            assert n == 0
            out += b'\0\0\0\0'
        elif c == 126:  # '~' starts the end-of-data marker
            if n:
                # Pad the short final group with 'u' and keep n - 1 bytes.
                for _ in range(5 - n):
                    b = b * 85 + 84
                out += struct.pack('>L', b)[:n - 1]
            break
    return bytes(out)

# asciihexdecode(data)
# Byte patterns (the b prefix is a no-op on Python 2) so they apply to bytes.
hex_re = re.compile(br'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(br'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)


def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    >>> asciihexdecode(b'61 62 2e6364 65')
    'ab.cde'
    >>> asciihexdecode(b'61 62 2e6364 657>')
    'ab.cdep'
    >>> asciihexdecode(b'7>')
    'p'
    """
    out = bytearray(int(hx, 16) for hx in hex_re.findall(data))
    m = trail_re.search(data)
    if m:
        # A lone trailing digit is the high nibble of a final byte.
        out.append(int(m.group(1), 16) << 4)
    return bytes(out)


if __name__ == '__main__':
    import doctest

    doctest.testmod()

# --------------------------------------------------------------------------------
# /pdfminer/lzw.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Neither module exists on Python 3; BytesIO is the byte buffer there.
        from io import BytesIO as StringIO


class CorruptDataError(Exception):
    pass


## LZWDecoder
##
class LZWDecoder(object):
    """Decode an LZW code stream read from a binary file-like object."""
    debug = 0

    def __init__(self, fp):
        self.fp = fp
        self.buff = 0      # current input byte
        self.bpos = 8      # bits of `buff` already consumed (8 == empty)
        self.nbits = 9     # current code width
        self.table = None  # code table; created by the first clear code (256)
        self.prevbuf = None

    def readbits(self, bits):
        """Return the next `bits` bits as an integer; raise EOFError at end."""
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8 - self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v << bits) | ((self.buff >> (r - bits)) & ((1 << bits) - 1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v << r) | (self.buff & ((1 << r) - 1))
                bits -= r
                x = self.fp.read(1)
                if not x:
                    raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Return the bytes produced by `code`, updating the code table.

        Raises CorruptDataError for a code that cannot be decoded.
        """
        x = b''
        if code == 256:
            # Clear code: reset the table to the 256 single-byte entries.
            self.table = [bytes(bytearray((c,))) for c in range(256)]  # 0-255
            self.table.append(None)  # 256
            self.table.append(None)  # 257
            self.prevbuf = b''
            self.nbits = 9
        elif code == 257:
            pass
        elif self.table is None:
            # Data code before any clear code: there is no table to index.
            raise CorruptDataError
        elif not self.prevbuf:
            if code >= len(self.table):
                raise CorruptDataError
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                self.table.append(self.prevbuf + x[:1])
            elif code == len(self.table):
                self.table.append(self.prevbuf + self.prevbuf[:1])
                x = self.table[code]
            else:
                raise CorruptDataError
            l = len(self.table)
            if l == 511:
                self.nbits = 10
            elif l == 1023:
                self.nbits = 11
            elif l == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Yield decoded chunks until the input ends or turns out corrupt."""
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            try:
                x = self.feed(code)
            except CorruptDataError:
                # just ignore corrupt data and stop yielding there
                break
            yield x
            if self.debug:
                sys.stderr.write('nbits=%d, code=%d, output=%r, table=%r\n' %
                                 (self.nbits, code, x, self.table[258:]))
        return


# lzwdecode
def lzwdecode(data):
    """
    >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
    '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    """
    fp = StringIO(data)
    return b''.join(LZWDecoder(fp).run())


if __name__ == '__main__':
    import doctest

    doctest.testmod()

# --------------------------------------------------------------------------------
# /pdfminer/pdfparser.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
import sys

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
from psparser import PSStackParser
from psparser import PSSyntaxError, PSEOF
from psparser import KWD, STRICT
from pdftypes import PDFException
from pdftypes import PDFStream, PDFObjRef
from pdftypes import int_value
from pdftypes import dict_value


## Exceptions
##
class PDFSyntaxError(PDFException):
    pass


## PDFParser
##
class PDFParser(PSStackParser):
    """
    PDFParser fetches PDF objects from a file stream.
    It can handle indirect references by referring to
    a PDF document set by set_document method.
    It also reads XRefs at the end of every PDF file.

    Typical usage:
      parser = PDFParser(fp)
      parser.read_xref()
      parser.read_xref(fallback=True) # optional
      parser.set_document(doc)
      parser.seek(offset)
      parser.nextobject()

    """

    def __init__(self, fp, dbg=False):
        PSStackParser.__init__(self, fp, dbg)
        self.doc = None
        # When true, do_keyword ignores /Length and scans for 'endstream'.
        self.fallback = False
        return

    def set_document(self, doc):
        """Associates the parser with a PDFDocument object."""
        self.doc = doc
        return

    KEYWORD_R = KWD('R')
    KEYWORD_NULL = KWD('null')
    KEYWORD_ENDOBJ = KWD('endobj')
    KEYWORD_STREAM = KWD('stream')
    KEYWORD_XREF = KWD('xref')
    KEYWORD_STARTXREF = KWD('startxref')

    def do_keyword(self, pos, token):
        """Handles PDF-related keywords.

        `pos` is the position of `token` in the input. Depending on the
        keyword this emits results, pushes a value on the parser stack, or
        reads a whole stream body and pushes a PDFStream.
        """

        if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
            self.add_results(*self.pop(1))

        elif token is self.KEYWORD_ENDOBJ:
            self.add_results(*self.pop(4))

        elif token is self.KEYWORD_NULL:
            # null object
            self.push((pos, None))

        elif token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass

        elif token is self.KEYWORD_STREAM:
            # stream object
            ((_, dic),) = self.pop(1)
            dic = dict_value(dic)
            # objlen stays 0 in fallback mode or when /Length is missing
            # (non-strict); the scan below then collects the whole body.
            objlen = 0
            if not self.fallback:
                try:
                    objlen = int_value(dic['Length'])
                except KeyError:
                    if STRICT:
                        raise PDFSyntaxError('/Length is undefined: %r' % dic)
            self.seek(pos)
            try:
                (_, line) = self.nextline()  # 'stream'
            except PSEOF:
                if STRICT:
                    raise PDFSyntaxError('Unexpected EOF')
                return
            # The stream data starts right after the 'stream' keyword line.
            pos += len(line)
            self.fp.seek(pos)
            data = self.fp.read(objlen)
            self.seek(pos + objlen)
            # Keep appending lines until 'endstream' shows up, so data
            # beyond the declared length is still captured.
            while 1:
                try:
                    (linepos, line) = self.nextline()
                except PSEOF:
                    if STRICT:
                        raise PDFSyntaxError('Unexpected EOF')
                    break
                if 'endstream' in line:
                    i = line.index('endstream')
                    objlen += i
                    data += line[:i]
                    break
                objlen += len(line)
                data += line
            self.seek(pos + objlen)
            # XXX limit objlen not to exceed object boundary
            if 2 <= self.debug:
                print >> sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                                     (pos, objlen, dic, data[:10])
            # NOTE(review): self.doc is None until set_document() is called;
            # this line presumably requires that to have happened - confirm.
            obj = PDFStream(dic, data, self.doc.decipher)
            self.push((pos, obj))

        else:
            # others
            self.push((pos, token))

        return


## PDFStreamParser
##
class PDFStreamParser(PDFParser):
    """
    PDFStreamParser is used to parse PDF content streams
    that are contained in each page and have instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data):
        # Parse from an in-memory buffer holding `data`.
        PDFParser.__init__(self, StringIO(data))
        return

    def flush(self):
        """Emit everything currently on the stack as results."""
        self.add_results(*self.popall())
        return

    def do_keyword(self, pos, token):
        """Turn 'R' into a PDFObjRef; push every other keyword unchanged."""
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except PSSyntaxError:
                pass
            return
        # others
        self.push((pos, token))
        return

# --------------------------------------------------------------------------------
# /pdfminer/pdftypes.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
import zlib

from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode
from runlength import rldecode
from ccitt import ccittfaxdecode
from psparser import PSException, PSObject
from psparser import LIT, STRICT
from utils import apply_png_predictor, isnumber

LITERAL_CRYPT = LIT('Crypt')

# Abbreviation of Filter names in PDF 4.8.6.
"Inline Images" 15 | LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl')) 16 | LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW')) 17 | LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85')) 18 | LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx')) 19 | LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL')) 20 | LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF')) 21 | LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT')) 22 | 23 | 24 | ## PDF Objects 25 | ## 26 | class PDFObject(PSObject): 27 | pass 28 | 29 | 30 | class PDFException(PSException): 31 | pass 32 | 33 | 34 | class PDFTypeError(PDFException): 35 | pass 36 | 37 | 38 | class PDFValueError(PDFException): 39 | pass 40 | 41 | 42 | class PDFObjectNotFound(PDFException): 43 | pass 44 | 45 | 46 | class PDFNotImplementedError(PDFException): 47 | pass 48 | 49 | 50 | ## PDFObjRef 51 | ## 52 | class PDFObjRef(PDFObject): 53 | def __init__(self, doc, objid, _): 54 | if objid == 0: 55 | if STRICT: 56 | raise PDFValueError('PDF object id cannot be 0.') 57 | self.doc = doc 58 | self.objid = objid 59 | # self.genno = genno # Never used. 60 | return 61 | 62 | def __repr__(self): 63 | return '' % (self.objid) 64 | 65 | def resolve(self, default=None): 66 | try: 67 | return self.doc.getobj(self.objid) 68 | except PDFObjectNotFound: 69 | return default 70 | 71 | 72 | # resolve 73 | def resolve1(x, default=None): 74 | """Resolves an object. 75 | 76 | If this is an array or dictionary, it may still contains 77 | some indirect objects inside. 78 | """ 79 | while isinstance(x, PDFObjRef): 80 | x = x.resolve(default=default) 81 | return x 82 | 83 | 84 | def resolve_all(x, default=None): 85 | """Recursively resolves the given object and all the internals. 86 | 87 | Make sure there is no indirect reference within the nested object. 88 | This procedure might be slow. 
89 | """ 90 | while isinstance(x, PDFObjRef): 91 | x = x.resolve(default=default) 92 | if isinstance(x, list): 93 | x = [resolve_all(v, default=default) for v in x] 94 | elif isinstance(x, dict): 95 | for (k, v) in x.iteritems(): 96 | x[k] = resolve_all(v, default=default) 97 | return x 98 | 99 | 100 | def decipher_all(decipher, objid, genno, x): 101 | """Recursively deciphers the given object. 102 | """ 103 | if isinstance(x, str): 104 | return decipher(objid, genno, x) 105 | if isinstance(x, list): 106 | x = [decipher_all(decipher, objid, genno, v) for v in x] 107 | elif isinstance(x, dict): 108 | for (k, v) in x.iteritems(): 109 | x[k] = decipher_all(decipher, objid, genno, v) 110 | return x 111 | 112 | 113 | # Type cheking 114 | def int_value(x): 115 | x = resolve1(x) 116 | if not isinstance(x, int): 117 | if STRICT: 118 | raise PDFTypeError('Integer required: %r' % x) 119 | return 0 120 | return x 121 | 122 | 123 | def float_value(x): 124 | x = resolve1(x) 125 | if not isinstance(x, float): 126 | if STRICT: 127 | raise PDFTypeError('Float required: %r' % x) 128 | return 0.0 129 | return x 130 | 131 | 132 | def num_value(x): 133 | x = resolve1(x) 134 | if not isnumber(x): 135 | if STRICT: 136 | raise PDFTypeError('Int or Float required: %r' % x) 137 | return 0 138 | return x 139 | 140 | 141 | def str_value(x): 142 | x = resolve1(x) 143 | if not isinstance(x, str): 144 | if STRICT: 145 | raise PDFTypeError('String required: %r' % x) 146 | return '' 147 | return x 148 | 149 | 150 | def list_value(x): 151 | x = resolve1(x) 152 | if not isinstance(x, (list, tuple)): 153 | if STRICT: 154 | raise PDFTypeError('List required: %r' % x) 155 | return [] 156 | return x 157 | 158 | 159 | def dict_value(x): 160 | x = resolve1(x) 161 | if not isinstance(x, dict): 162 | if STRICT: 163 | raise PDFTypeError('Dict required: %r' % x) 164 | return {} 165 | return x 166 | 167 | 168 | def stream_value(x): 169 | x = resolve1(x) 170 | if not isinstance(x, PDFStream): 171 | if STRICT: 
172 | raise PDFTypeError('PDFStream required: %r' % x) 173 | return PDFStream({}, '') 174 | return x 175 | 176 | 177 | ## PDFStream type 178 | ## 179 | class PDFStream(PDFObject): 180 | def __init__(self, attrs, rawdata, decipher=None): 181 | assert isinstance(attrs, dict) 182 | self.attrs = attrs 183 | self.rawdata = rawdata 184 | self.decipher = decipher 185 | self.data = None 186 | self.objid = None 187 | self.genno = None 188 | return 189 | 190 | def set_objid(self, objid, genno): 191 | self.objid = objid 192 | self.genno = genno 193 | return 194 | 195 | def __repr__(self): 196 | if self.data is None: 197 | assert self.rawdata is not None 198 | return '' % (self.objid, len(self.rawdata), self.attrs) 199 | else: 200 | assert self.data is not None 201 | return '' % (self.objid, len(self.data), self.attrs) 202 | 203 | def __contains__(self, name): 204 | return name in self.attrs 205 | 206 | def __getitem__(self, name): 207 | return self.attrs[name] 208 | 209 | def get(self, name, default=None): 210 | return self.attrs.get(name, default) 211 | 212 | def get_any(self, names, default=None): 213 | for name in names: 214 | if name in self.attrs: 215 | return self.attrs[name] 216 | return default 217 | 218 | def get_filters(self): 219 | filters = self.get_any(('F', 'Filter')) 220 | if not filters: 221 | return [] 222 | if isinstance(filters, list): 223 | return filters 224 | return [filters] 225 | 226 | def decode(self): 227 | assert self.data is None and self.rawdata is not None 228 | data = self.rawdata 229 | if self.decipher: 230 | # Handle encryption 231 | data = self.decipher(self.objid, self.genno, data) 232 | filters = self.get_filters() 233 | if not filters: 234 | self.data = data 235 | self.rawdata = None 236 | return 237 | for f in filters: 238 | if isinstance(f, PDFObjRef): 239 | filters += f.resolve() 240 | continue 241 | params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) 242 | if f in LITERALS_FLATE_DECODE: 243 | # will get errors if the 
document is encrypted. 244 | try: 245 | data = zlib.decompress(data) 246 | except zlib.error, e: 247 | if STRICT: 248 | raise PDFException('Invalid zlib bytes: %r, %r' % (e, data)) 249 | data = '' 250 | elif f in LITERALS_LZW_DECODE: 251 | data = lzwdecode(data) 252 | elif f in LITERALS_ASCII85_DECODE: 253 | data = ascii85decode(data) 254 | elif f in LITERALS_ASCIIHEX_DECODE: 255 | data = asciihexdecode(data) 256 | elif f in LITERALS_RUNLENGTH_DECODE: 257 | data = rldecode(data) 258 | elif f in LITERALS_CCITTFAX_DECODE: 259 | data = ccittfaxdecode(data, params) 260 | elif f == LITERAL_CRYPT: 261 | # not yet.. 262 | raise PDFNotImplementedError('/Crypt filter is unsupported') 263 | else: 264 | raise PDFNotImplementedError('Unsupported filter: %r' % f) 265 | # apply predictors 266 | if 'Predictor' in params: 267 | pred = int_value(params['Predictor']) 268 | if pred == 1: 269 | # no predictor 270 | pass 271 | elif 10 <= pred: 272 | # PNG predictor 273 | colors = int_value(params.get('Colors', 1)) 274 | columns = int_value(params.get('Columns', 1)) 275 | bitspercomponent = int_value(params.get('BitsPerComponent', 8)) 276 | data = apply_png_predictor(pred, colors, columns, bitspercomponent, data) 277 | else: 278 | raise PDFNotImplementedError('Unsupported predictor: %r' % pred) 279 | self.data = data 280 | self.rawdata = None 281 | return 282 | 283 | def get_data(self): 284 | if self.data is None: 285 | self.decode() 286 | return self.data 287 | 288 | def get_rawdata(self): 289 | return self.rawdata 290 | -------------------------------------------------------------------------------- /pdfminer/runlength.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # RunLength decoder (Adobe version) implementation based on PDF Reference 4 | # version 1.4 section 3.3.4. 
#
# * public domain *
#

def rldecode(data):
    """
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    Truncated input is decoded as far as it goes.

    >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
    >>> rldecode(s)
    '1234567777777abcde'
    """
    data = bytearray(data)
    decoded = bytearray()
    size = len(data)
    i = 0
    while i < size:
        length = data[i]
        if length == 128:
            break
        if length < 128:
            # Literal run: copy the next length + 1 bytes.
            decoded += data[i + 1:i + 2 + length]
            i += length + 2
        else:
            if i + 1 >= size:
                # Repeat run without its data byte: stop at the truncation.
                break
            decoded += data[i + 1:i + 2] * (257 - length)
            i += 2
    return bytes(decoded)


if __name__ == '__main__':
    import doctest

    doctest.testmod()

# --------------------------------------------------------------------------------
# /pdfminer/utils.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Miscellaneous Routines.
"""
import struct
try:
    from sys import maxint as INF
except ImportError:
    # sys.maxint does not exist on Python 3.
    from sys import maxsize as INF


## PNG Predictor
##
def _paeth(a, b, c):
    """Return the PNG Paeth predictor of left (a), above (b), upper-left (c)."""
    p = a + b - c
    pa = abs(p - a)
    pb = abs(p - b)
    pc = abs(p - c)
    if pa <= pb and pa <= pc:
        return a
    if pb <= pc:
        return b
    return c


def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
    """Undo PNG row filtering on `data` and return the raw sample bytes.

    Every row of `data` is one filter-type byte followed by
    colors * columns sample bytes. `pred` is not consulted: the filter is
    chosen per row by that leading byte. Only 8 bits per component is
    supported.

    Raises ValueError for any other bit depth or an unknown filter type.
    """
    if bitspercomponent != 8:
        # unsupported
        raise ValueError(bitspercomponent)
    nbytes = colors * columns * bitspercomponent // 8
    # Distance in bytes to the same component of the pixel to the left.
    bpp = max(1, colors * bitspercomponent // 8)
    data = bytearray(data)
    buf = bytearray()
    # The row above the first one is all zeros and spans a full row.
    line0 = bytearray(nbytes)
    for start in range(0, len(data), nbytes + 1):
        ft = data[start]
        line1 = data[start + 1:start + 1 + nbytes]
        line2 = bytearray()
        if ft == 0:
            # PNG none
            line2 += line1
        elif ft == 1:
            # PNG sub
            for (j, b) in enumerate(line1):
                left = line2[j - bpp] if j >= bpp else 0
                line2.append((b + left) & 255)
        elif ft == 2:
            # PNG up
            for (a, b) in zip(line0, line1):
                line2.append((a + b) & 255)
        elif ft == 3:
            # PNG average
            for (j, b) in enumerate(line1):
                left = line2[j - bpp] if j >= bpp else 0
                line2.append((b + (left + line0[j]) // 2) & 255)
        elif ft == 4:
            # PNG paeth
            for (j, b) in enumerate(line1):
                if j >= bpp:
                    left = line2[j - bpp]
                    upleft = line0[j - bpp]
                else:
                    left = upleft = 0
                line2.append((b + _paeth(left, line0[j], upleft)) & 255)
        else:
            # unsupported
            raise ValueError(ft)
        buf += line2
        line0 = line2
    return bytes(buf)

## Matrix operations
##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)


# The helpers below unpack their arguments in the body instead of using
# tuple parameters, which only Python 2 accepts. Callers still pass tuples.
def mult_matrix(m1, m0):
    """Returns the multiplication of two matrices."""
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0 * a1 + c0 * b1, b0 * a1 + d0 * b1,
            a0 * c1 + c0 * d1, b0 * c1 + d0 * d1,
            a0 * e1 + c0 * f1 + e0, b0 * e1 + d0 * f1 + f0)


def translate_matrix(m, v):
    """Translates a matrix by (x, y)."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a, b, c, d, x * a + y * c + e, x * b + y * d + f)


def apply_matrix_pt(m, v):
    """Applies a matrix to a point."""
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a * x + c * y + e, b * x + d * y + f)


def apply_matrix_norm(m, v):
    """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
    (a, b, c, d, e, f) = m
    (p, q) = v
    return (a * p + c * q, b * p + d * q)


##
## Utility functions
##

# isnumber
def isnumber(x):
    """Return True if `x` is an int, long or float."""
    return isinstance(x, (int, long, float))


# uniq
def uniq(objs):
    """Eliminates duplicated elements."""
    done = set()
    for obj in objs:
        if obj in done:
            continue
        done.add(obj)
        yield obj
    return


# csort
def csort(objs, key=lambda x: x):
    """Order-preserving sorting function."""
    idxs = dict((obj, i) for (i, obj) in enumerate(objs))
    return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))


# fsplit
def fsplit(pred, objs):
    """Split a list into two classes according to the predicate."""
    t = []
    f = []
    for obj in objs:
        if pred(obj):
            t.append(obj)
        else:
            f.append(obj)
    return (t, f)


# drange
def drange(v0, v1, d):
    """Returns a discrete range."""
    assert v0 < v1
    return xrange(int(v0) // d, int(v1 + d) // d)


# get_bound
def get_bound(pts):
    """Compute a minimal rectangle that covers all the points."""
    (x0, y0, x1, y1) = (INF, INF, -INF, -INF)
    for (x, y) in pts:
        x0 = min(x0, x)
        y0 = min(y0, y)
        x1 = max(x1, x)
        y1 = max(y1, y)
    return (x0, y0, x1, y1)


# pick
def pick(seq, func, maxobj=None):
    """Picks the object obj where func(obj) has the highest value."""
    maxscore = None
    for obj in seq:
        score = func(obj)
        if maxscore is None or maxscore < score:
            (maxscore, maxobj) = (score, obj)
    return maxobj


# choplist
def choplist(n, seq):
    """Groups every n elements of the list."""
    r = []
    for x in seq:
        r.append(x)
        if len(r) == n:
            yield tuple(r)
            r = []
    return


# nunpack
def nunpack(s, default=0):
    """Unpacks 1 to 4 byte integers (big endian)."""
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return struct.unpack('>H', s)[0]
    elif l == 3:
        # Pad to four bytes; a byte literal so it concatenates with bytes.
        return struct.unpack('>L', b'\x00' + s)[0]
    elif l == 4:
        return struct.unpack('>L', s)[0]
    else:
        raise TypeError('invalid length: %d' % l)

# decode_text
PDFDocEncoding = ''.join(unichr(x) for x in (
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
    0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
    0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
    0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
    0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
    0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))


def decode_text(s):
    """Decodes a PDFDocEncoding string to Unicode."""
    if s.startswith('\xfe\xff'):
        return unicode(s[2:], 'utf-16be', 'ignore')
    else:
        return ''.join(PDFDocEncoding[ord(c)] for c in s)


# enc
def enc(x, codec='ascii'):
    """Encodes a string for SGML/XML/HTML"""
    # '&' must be escaped first so the entities added below stay intact.
    x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;').replace('"', '&quot;')
    return x.encode(codec, 'xmlcharrefreplace')


# bbox2str and matrix2str unpack inside the body: tuple parameters are
# Python-2-only syntax. Callers still pass a single tuple.
def bbox2str(bbox):
    """Format a bounding box (x0, y0, x1, y1) with three decimals."""
    (x0, y0, x1, y1) = bbox
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)


def matrix2str(m):
    """Format a matrix (a, b, c, d, e, f) with two decimals."""
    (a, b, c, d, e, f) = m
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)


## Plane
##
## A set-like data structure for objects placed on a plane.
## Can efficiently find objects in a certain rectangular area.
## It maintains two parallel lists of objects, each of
## which is sorted by its x or y coordinate.
##
class Plane(object):
    """Set-like container for objects placed on a 2-D plane.

    Every stored object must expose ``x0``, ``y0``, ``x1`` and ``y1``
    attributes describing its bounding box.  Membership is tracked in a
    set; in addition each object is registered in ``_grid``, a dictionary
    keyed by the ``(x, y)`` grid points that ``_getrange`` yields for the
    object's bounding box.

    @param bbox: (x0, y0, x1, y1) extent of the plane; bounding boxes are
                 clipped to it before grid points are generated.
    @param gridsize: step passed to ``drange`` when walking a bounding box.
    """

    def __init__(self, bbox, gridsize=50):
        self._objs = set()
        self._grid = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox
        return

    def __repr__(self):
        # The format string had been reduced to '' (markup stripped), which
        # made repr() always return an empty string.
        return ('<Plane objs=%r>' % list(self))

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the grid points covered by bbox, clipped to the plane.

        Yields nothing when bbox lies completely outside the plane (boxes
        that merely touch an edge count as outside).
        """
        # Unpacked here instead of in the signature: tuple parameters are a
        # syntax error on Python 3 (PEP 3113).  Callers still pass one tuple.
        (x0, y0, x1, y1) = bbox
        if (x1 <= self.x0 or self.x1 <= x0 or
                y1 <= self.y0 or self.y1 <= y0):
            return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        # NOTE: drange is not defined in this class; it has to be available
        # at module level.
        for y in drange(y0, y1, self.gridsize):
            for x in drange(x0, x1, self.gridsize):
                yield (x, y)
        return

    # extend(objs)
    def extend(self, objs):
        """Add every object of the iterable objs to the plane."""
        for obj in objs:
            self.add(obj)
        return

    # add(obj): place an object.
    def add(self, obj):
        """Register obj in every grid bucket its bounding box covers."""
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(k, []).append(obj)
        # Objects outside the plane get no bucket but are still members.
        self._objs.add(obj)
        return

    # remove(obj): displace an object.
    def remove(self, obj):
        """Remove obj from the plane.

        Missing grid buckets are ignored; a KeyError is raised if obj is
        not a member of the plane.
        """
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                pass
        self._objs.remove(obj)
        return

    # find(): finds objects that are in a certain area.
305 | def find(self, (x0, y0, x1, y1)): 306 | done = set() 307 | for k in self._getrange((x0, y0, x1, y1)): 308 | if k not in self._grid: 309 | continue 310 | for obj in self._grid[k]: 311 | if obj in done: 312 | continue 313 | done.add(obj) 314 | if (obj.x1 <= x0 or x1 <= obj.x0 or 315 | obj.y1 <= y0 or y1 <= obj.y0): 316 | continue 317 | yield obj 318 | return 319 | -------------------------------------------------------------------------------- /peepdf/AUTHORS: -------------------------------------------------------------------------------- 1 | Jose Miguel Esparza 2 | http://eternal-todo.com 3 | http://twitter.com/EternalTodo -------------------------------------------------------------------------------- /peepdf/CHANGELOG: -------------------------------------------------------------------------------- 1 | ----------------------------------------------- 2 | peepdf Black Hat Vegas (0.2 r156), 2012-07-25 3 | ----------------------------------------------- 4 | 5 | * New features: 6 | 7 | - Added "grinch mode" execution to avoid colorized output 8 | - Added more colors in the interactive console output: warning, errors, important information... 9 | - Changed sctest command, now it's implemented with pylibemu 10 | - Added decrypt command to parse password protected documents 11 | - Modified analyseJS() to extract JS code from XDP packets and unescape HTML entities 12 | - Added function unescapeHTMLEntities() to unescape HTML entities 13 | - Added AES decryption support (128 and 256 bits). 14 | - Added hashes in objects information (info $object_id) 15 | - Added support for decoding CCITTFaxDecode filters (Thanks to @binjo) 16 | 17 | * Fixes: 18 | 19 | - Fix to show decrypt errors 20 | - Fixed silly bug with /EncryptMetadata element 21 | - Added missing binary file operations 22 | - Fixed Issue 5: Resolved false positives when monitoring some elements like actions, events, etc. 
(Thanks to @hiddenillusion) 23 | - Bug in PDFStream.decode and PDFStream.encode, dealing with an array of filter parameters (Thanks to @binjo) 24 | 25 | 26 | ----------------------------------------------- 27 | peepdf Black Hat Arsenal (0.1 r92), 2012-03-16 28 | ----------------------------------------------- 29 | 30 | * New features: 31 | 32 | - Added support for more parameters in Flate/LZW decode (stream filters) 33 | - Encryption algorithm now showing in document information 34 | - Added XML output and SHA hash to file information 35 | - Improved unescape function to support mixed escaped formats (eg. "%u6734%34%u8790") 36 | - Added xor and xor_search commands 37 | - Added easy way of redirect console output (>, >>, $>, $>>) 38 | - Added xor function by Evan Fosmark 39 | - Added detection of CVE-2011-4369 (/PRC) 40 | - Added hash command (Thanks to @binjo for code and comments) 41 | - Added js_beautify command 42 | - Update function added 43 | - Added new vulns and showing information related to non JS vulns 44 | - Added escape sequence in the limited output 45 | - Added ascii85 decode from pdfminer to improve code and avoid bugs (Thanks to Brandon Dixon!) 46 | - Added lzwdecode from pdfminer to improve code and avoid bugs 47 | 48 | * Fixes: 49 | 50 | - Update process rewritten, now based on hashing of files 51 | - Silly bug in computeUserPass function (Thanks to Christian Martorella!) 
52 | - Added binary mode in files operations 53 | - Recursion bug in update function 54 | - Minor bug in do_embed function 55 | - Bug to support encoding following PDF specifications (Issue 3 by czchen) 56 | - Bug to handle negative numbers in P element 57 | - Bug in the xref table when creating a new PDF (Issue 2) 58 | - Silly bug when parsing filter parameters 59 | - Bug related to updating objects and statistics of PDF files 60 | - Some bugs related to offsets calculation 61 | - Fixed "replace" function in PDFObjectStream 62 | - Fix in asciiHexDecode filter function 63 | 64 | 65 | ----------------------------------------------- 66 | peepdf 0.1 r15, 2011-05-05 67 | ----------------------------------------------- 68 | 69 | - Initial Release 70 | 71 | -------------------------------------------------------------------------------- /peepdf/JSAnalysis.py: -------------------------------------------------------------------------------- 1 | # 2 | # peepdf is a tool to analyse and modify PDF files 3 | # http://peepdf.eternal-todo.com 4 | # By Jose Miguel Esparza 5 | # 6 | # Copyright (C) 2011-2014 Jose Miguel Esparza 7 | # 8 | # This file is part of peepdf. 9 | # 10 | # peepdf is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation, either version 3 of the License, or 13 | # (at your option) any later version. 14 | # 15 | # peepdf is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with peepdf. If not, see . 
22 | # 23 | 24 | ''' 25 | This module contains some functions to analyse Javascript code inside the PDF file 26 | ''' 27 | 28 | import sys 29 | import re 30 | import os 31 | import traceback 32 | 33 | import jsbeautifier 34 | from PDFUtils import unescapeHTMLEntities, escapeString 35 | 36 | try: 37 | import PyV8 38 | 39 | JS_MODULE = True 40 | 41 | class Global(PyV8.JSClass): 42 | evalCode = '' 43 | 44 | def evalOverride(self, expression): 45 | self.evalCode += '\n\n// New evaluated code\n' + expression 46 | return 47 | 48 | except: 49 | JS_MODULE = False 50 | 51 | errorsFile = 'errors.txt' 52 | newLine = os.linesep 53 | reJSscript = ']*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)' 54 | preDefinedCode = 'var app = this;' 55 | 56 | 57 | def analyseJS(code, context=None, manualAnalysis=False): 58 | ''' 59 | Hooks the eval function and search for obfuscated elements in the Javascript code 60 | 61 | @param code: The Javascript code (string) 62 | @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where 63 | JSCode is a list with the several stages Javascript code, 64 | unescapedBytes is a list with the parameters of unescape functions, 65 | urlsFound is a list with the URLs found in the unescaped bytes, 66 | errors is a list of errors, 67 | context is the context of execution of the Javascript code. 
68 | ''' 69 | errors = [] 70 | JSCode = [] 71 | unescapedBytes = [] 72 | urlsFound = [] 73 | 74 | try: 75 | code = unescapeHTMLEntities(code) 76 | scriptElements = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE) 77 | if scriptElements != []: 78 | code = '' 79 | for scriptElement in scriptElements: 80 | code += scriptElement + '\n\n' 81 | code = jsbeautifier.beautify(code) 82 | JSCode.append(code) 83 | 84 | if code != None and JS_MODULE and not manualAnalysis: 85 | if context == None: 86 | context = PyV8.JSContext(Global()) 87 | context.enter() 88 | # Hooking the eval function 89 | context.eval('eval=evalOverride') 90 | # context.eval(preDefinedCode) 91 | while True: 92 | originalCode = code 93 | try: 94 | context.eval(code) 95 | evalCode = context.eval('evalCode') 96 | evalCode = jsbeautifier.beautify(evalCode) 97 | if evalCode != '' and evalCode != code: 98 | code = evalCode 99 | JSCode.append(code) 100 | else: 101 | break 102 | except: 103 | error = str(sys.exc_info()[1]) 104 | open('jserror.log', 'ab').write(error + newLine) 105 | errors.append(error) 106 | break 107 | 108 | if False: 109 | escapedVars = re.findall('(\w*?)\s*?=\s*?(unescape\((.*?)\))', code, re.DOTALL) 110 | for var in escapedVars: 111 | bytes = var[2] 112 | if bytes.find('+') != -1 or bytes.find('%') == -1: 113 | varContent = getVarContent(code, bytes) 114 | if len(varContent) > 150: 115 | ret = unescape(varContent) 116 | if ret[0] != -1: 117 | bytes = ret[1] 118 | urls = re.findall('https?://.*$', bytes, re.DOTALL) 119 | if bytes not in unescapedBytes: 120 | unescapedBytes.append(bytes) 121 | for url in urls: 122 | if url not in urlsFound: 123 | urlsFound.append(url) 124 | else: 125 | bytes = bytes[1:-1] 126 | if len(bytes) > 150: 127 | ret = unescape(bytes) 128 | if ret[0] != -1: 129 | bytes = ret[1] 130 | urls = re.findall('https?://.*$', bytes, re.DOTALL) 131 | if bytes not in unescapedBytes: 132 | unescapedBytes.append(bytes) 133 | for url in urls: 134 | if url not in urlsFound: 
135 | urlsFound.append(url) 136 | except: 137 | traceback.print_exc(file=open(errorsFile, 'a')) 138 | errors.append('Unexpected error in the JSAnalysis module!!') 139 | finally: 140 | for js in JSCode: 141 | if js == None or js == '': 142 | JSCode.remove(js) 143 | return [JSCode, unescapedBytes, urlsFound, errors, context] 144 | 145 | 146 | def getVarContent(jsCode, varContent): 147 | ''' 148 | Given the Javascript code and the content of a variable this method tries to obtain the real value of the variable, cleaning expressions like "a = eval; a(js_code);" 149 | 150 | @param jsCode: The Javascript code (string) 151 | @param varContent: The content of the variable (string) 152 | @return: A string with real value of the variable 153 | ''' 154 | clearBytes = '' 155 | varContent = varContent.replace('\n', '') 156 | varContent = varContent.replace('\r', '') 157 | varContent = varContent.replace('\t', '') 158 | varContent = varContent.replace(' ', '') 159 | parts = varContent.split('+') 160 | for part in parts: 161 | if re.match('["\'].*?["\']', part, re.DOTALL): 162 | clearBytes += part[1:-1] 163 | else: 164 | part = escapeString(part) 165 | varContent = re.findall(part + '\s*?=\s*?(.*?)[,;]', jsCode, re.DOTALL) 166 | if varContent != []: 167 | clearBytes += getVarContent(jsCode, varContent[0]) 168 | return clearBytes 169 | 170 | 171 | def isJavascript(content): 172 | ''' 173 | Given an string this method looks for typical Javscript strings and try to identify if the string contains Javascrit code or not. 
174 | 175 | @param content: A string 176 | @return: A boolean, True if it seems to contain Javascript code or False in the other case 177 | ''' 178 | JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',', 179 | 'eval'] 180 | keyStrings = [';', '(', ')'] 181 | stringsFound = [] 182 | limit = 15 183 | minDistinctStringsFound = 5 184 | results = 0 185 | 186 | if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE) != []: 187 | return True 188 | 189 | for char in content: 190 | if (ord(char) < 32 and char not in ['\n', '\r', '\t', '\f', '\x00']) or ord(char) >= 127: 191 | return False 192 | 193 | for string in JSStrings: 194 | cont = content.count(string) 195 | results += cont 196 | if cont > 0 and string not in stringsFound: 197 | stringsFound.append(string) 198 | elif cont == 0 and string in keyStrings: 199 | return False 200 | 201 | if results > limit and len(stringsFound) >= minDistinctStringsFound: 202 | return True 203 | else: 204 | return False 205 | 206 | 207 | def searchObfuscatedFunctions(jsCode, function): 208 | ''' 209 | Search for obfuscated functions in the Javascript code 210 | 211 | @param jsCode: The Javascript code (string) 212 | @param function: The function name to look for (string) 213 | @return: List with obfuscated functions information [functionName,functionCall,containsReturns] 214 | ''' 215 | obfuscatedFunctionsInfo = [] 216 | if jsCode != None: 217 | match = re.findall('\W(' + function + '\s{0,5}?\((.*?)\)\s{0,5}?;)', jsCode, re.DOTALL) 218 | if match != []: 219 | for m in match: 220 | if re.findall('return', m[1], re.IGNORECASE) != []: 221 | obfuscatedFunctionsInfo.append([function, m, True]) 222 | else: 223 | obfuscatedFunctionsInfo.append([function, m, False]) 224 | obfuscatedFunctions = re.findall('\s*?((\w*?)\s*?=\s*?' 
+ function + ')\s*?;', jsCode, re.DOTALL) 225 | for obfuscatedFunction in obfuscatedFunctions: 226 | obfuscatedElement = obfuscatedFunction[1] 227 | obfuscatedFunctionsInfo += searchObfuscatedFunctions(jsCode, obfuscatedElement) 228 | return obfuscatedFunctionsInfo 229 | 230 | 231 | def unescape(escapedBytes, unicode=True): 232 | ''' 233 | This method unescapes the given string 234 | 235 | @param escapedBytes: A string to unescape 236 | @return: A tuple (status,statusContent), where statusContent is an unescaped string in case status = 0 or an error in case status = -1 237 | ''' 238 | # TODO: modify to accept a list of escaped strings? 239 | unescapedBytes = '' 240 | if unicode: 241 | unicodePadding = '\x00' 242 | else: 243 | unicodePadding = '' 244 | try: 245 | if escapedBytes.lower().find('%u') != -1 or escapedBytes.lower().find('\u') != -1 or escapedBytes.find( 246 | '%') != -1: 247 | if escapedBytes.lower().find('\u') != -1: 248 | splitBytes = escapedBytes.split('\\') 249 | else: 250 | splitBytes = escapedBytes.split('%') 251 | for i in range(len(splitBytes)): 252 | splitByte = splitBytes[i] 253 | if splitByte == '': 254 | continue 255 | if len(splitByte) > 4 and re.match('u[0-9a-f]{4}', splitByte[:5], re.IGNORECASE): 256 | unescapedBytes += chr(int(splitByte[3] + splitByte[4], 16)) + chr( 257 | int(splitByte[1] + splitByte[2], 16)) 258 | if len(splitByte) > 5: 259 | for j in range(5, len(splitByte)): 260 | unescapedBytes += splitByte[j] + unicodePadding 261 | elif len(splitByte) > 1 and re.match('[0-9a-f]{2}', splitByte[:2], re.IGNORECASE): 262 | unescapedBytes += chr(int(splitByte[0] + splitByte[1], 16)) + unicodePadding 263 | if len(splitByte) > 2: 264 | for j in range(2, len(splitByte)): 265 | unescapedBytes += splitByte[j] + unicodePadding 266 | else: 267 | if i != 0: 268 | unescapedBytes += '%' + unicodePadding 269 | for j in range(len(splitByte)): 270 | unescapedBytes += splitByte[j] + unicodePadding 271 | else: 272 | unescapedBytes = escapedBytes 273 | 
except: 274 | return (-1, 'Error while unescaping the bytes') 275 | return (0, unescapedBytes) 276 | -------------------------------------------------------------------------------- /peepdf/PDFCrypto.py: -------------------------------------------------------------------------------- 1 | # 2 | # peepdf is a tool to analyse and modify PDF files 3 | # http://peepdf.eternal-todo.com 4 | # By Jose Miguel Esparza 5 | # 6 | # Copyright (C) 2011-2014 Jose Miguel Esparza 7 | # 8 | # This file is part of peepdf. 9 | # 10 | # peepdf is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation, either version 3 of the License, or 13 | # (at your option) any later version. 14 | # 15 | # peepdf is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with peepdf. If not, see . 
22 | # 23 | 24 | ''' 25 | Module to manage cryptographic operations with PDF files 26 | ''' 27 | 28 | import hashlib 29 | import struct 30 | import random 31 | import warnings 32 | from itertools import cycle, izip 33 | 34 | import aes 35 | 36 | warnings.filterwarnings("ignore") 37 | 38 | paddingString = '\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A' 39 | 40 | 41 | def computeEncryptionKey(password, dictOwnerPass, dictUserPass, dictOE, dictUE, fileID, pElement, dictKeyLength=128, 42 | revision=3, encryptMetadata=False, passwordType=None): 43 | ''' 44 | Compute an encryption key to encrypt/decrypt the PDF file 45 | 46 | @param password: The password entered by the user 47 | @param dictOwnerPass: The owner password from the standard security handler dictionary 48 | @param dictUserPass: The user password from the standard security handler dictionary 49 | @param dictOE: The owner encrypted string from the standard security handler dictionary 50 | @param dictUE:The user encrypted string from the standard security handler dictionary 51 | @param fileID: The /ID element in the trailer dictionary of the PDF file 52 | @param pElement: The /P element of the Encryption dictionary 53 | @param dictKeyLength: The length of the key 54 | @param revision: The algorithm revision 55 | @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not 56 | @param passwordType: It specifies the given password type. It can be 'USER', 'OWNER' or None. 
57 | @return: A tuple (status,statusContent), where statusContent is the encryption key in case status = 0 or an error message in case status = -1 58 | ''' 59 | if revision != 5: 60 | keyLength = dictKeyLength / 8 61 | lenPass = len(password) 62 | if lenPass > 32: 63 | password = password[:32] 64 | elif lenPass < 32: 65 | password += paddingString[:32 - lenPass] 66 | md5input = password + dictOwnerPass + struct.pack(' 3 and not encryptMetadata: 68 | md5input += '\xFF' * 4 69 | key = hashlib.md5(md5input).digest() 70 | if revision > 2: 71 | counter = 0 72 | while counter < 50: 73 | key = hashlib.md5(key[:keyLength]).digest() 74 | counter += 1 75 | key = key[:keyLength] 76 | elif revision == 2: 77 | key = key[:5] 78 | return (0, key) 79 | else: 80 | if passwordType == 'USER': 81 | password = password.encode('utf-8')[:127] 82 | kSalt = dictUserPass[40:48] 83 | intermediateKey = hashlib.sha256(password + kSalt).digest() 84 | ret = aes.decryptData('\0' * 16 + dictUE, intermediateKey) 85 | elif passwordType == 'OWNER': 86 | password = password.encode('utf-8')[:127] 87 | kSalt = dictOwnerPass[40:48] 88 | intermediateKey = hashlib.sha256(password + kSalt + dictUserPass).digest() 89 | ret = aes.decryptData('\0' * 16 + dictOE, intermediateKey) 90 | return ret 91 | 92 | 93 | def computeObjectKey(id, generationNum, encryptionKey, keyLengthBytes, algorithm='RC4'): 94 | ''' 95 | Compute the key necessary to encrypt each object, depending on the id and generation number. Only necessary with /V < 5. 
96 | 97 | @param id: The object id 98 | @param generationNum: The generation number of the object 99 | @param encryptionKey: The encryption key 100 | @param keyLengthBytes: The length of the encryption key in bytes 101 | @param algorithm: The algorithm used in the encryption/decryption process 102 | @return: The computed key in string format 103 | ''' 104 | key = encryptionKey + struct.pack(' 32: 130 | ownerPassString = ownerPassString[:32] 131 | elif lenPass < 32: 132 | ownerPassString += paddingString[:32 - lenPass] 133 | rc4Key = hashlib.md5(ownerPassString).digest() 134 | if revision > 2: 135 | counter = 0 136 | while counter < 50: 137 | rc4Key = hashlib.md5(rc4Key).digest() 138 | counter += 1 139 | rc4Key = rc4Key[:keyLength] 140 | lenPass = len(userPassString) 141 | if lenPass > 32: 142 | userPassString = userPassString[:32] 143 | elif lenPass < 32: 144 | userPassString += paddingString[:32 - lenPass] 145 | ownerPass = RC4(userPassString, rc4Key) 146 | if revision > 2: 147 | counter = 1 148 | while counter <= 19: 149 | newKey = '' 150 | for i in range(len(rc4Key)): 151 | newKey += chr(ord(rc4Key[i]) ^ counter) 152 | ownerPass = RC4(ownerPass, newKey) 153 | counter += 1 154 | return ownerPass 155 | 156 | 157 | def computeUserPass(userPassString, dictO, fileID, pElement, keyLength=128, revision=3, encryptMetadata=False): 158 | ''' 159 | Compute the user password of the PDF file 160 | 161 | @param userPassString: The user password entered by the user 162 | @param ownerPass: The computed owner password 163 | @param fileID: The /ID element in the trailer dictionary of the PDF file 164 | @param pElement: The /P element of the /Encryption dictionary 165 | @param keyLength: The length of the key 166 | @param revision: The algorithm revision 167 | @param encryptMetadata: A boolean extracted from the standard security handler dictionary to specify if it's necessary to encrypt the document metadata or not 168 | @return: A tuple (status,statusContent), where 
statusContent is the computed password in case status = 0 or an error message in case status = -1 169 | ''' 170 | # TODO: revision 5 171 | userPass = '' 172 | dictU = '' 173 | dictOE = '' 174 | dictUE = '' 175 | ret = computeEncryptionKey(userPassString, dictO, dictU, dictOE, dictUE, fileID, pElement, keyLength, revision, 176 | encryptMetadata) 177 | if ret[0] != -1: 178 | rc4Key = ret[1] 179 | else: 180 | return ret 181 | if revision == 2: 182 | userPass = RC4(paddingString, rc4Key) 183 | elif revision > 2: 184 | counter = 1 185 | md5Input = paddingString + fileID 186 | hashResult = hashlib.md5(md5Input).digest() 187 | userPass = RC4(hashResult, rc4Key) 188 | while counter <= 19: 189 | newKey = '' 190 | for i in range(len(rc4Key)): 191 | newKey += chr(ord(rc4Key[i]) ^ counter) 192 | userPass = RC4(userPass, newKey) 193 | counter += 1 194 | counter = 0 195 | while counter < 16: 196 | userPass += chr(random.randint(32, 255)) 197 | counter += 1 198 | return (0, userPass) 199 | 200 | 201 | def isUserPass(password, computedUserPass, dictU, revision): 202 | ''' 203 | Checks if the given password is the User password of the file 204 | 205 | @param password: The given password or the empty password 206 | @param computedUserPass: The computed user password of the file 207 | @param dictU: The /U element of the /Encrypt dictionary 208 | @param revision: The number of revision of the standard security handler 209 | @return The boolean telling if the given password is the user password or not 210 | ''' 211 | if revision == 5: 212 | vSalt = dictU[32:40] 213 | inputHash = hashlib.sha256(password + vSalt).digest() 214 | if inputHash == dictU[:32]: 215 | return True 216 | else: 217 | return False 218 | elif revision == 3 or revision == 4: 219 | if computedUserPass[:16] == dictU[:16]: 220 | return True 221 | else: 222 | return False 223 | elif revision < 3: 224 | if computedUserPass == dictU: 225 | return True 226 | else: 227 | return False 228 | 229 | 230 | def 
isOwnerPass(password, dictO, dictU, computedUserPass, keyLength, revision): 231 | ''' 232 | Checks if the given password is the owner password of the file 233 | 234 | @param password: The given password or the empty password 235 | @param dictO: The /O element of the /Encrypt dictionary 236 | @param dictU: The /U element of the /Encrypt dictionary 237 | @param computedUserPass: The computed user password of the file 238 | @param keyLength: The length of the key 239 | @param revision: The algorithm revision 240 | @return The boolean telling if the given password is the owner password or not 241 | ''' 242 | if revision == 5: 243 | vSalt = dictO[32:40] 244 | inputHash = hashlib.sha256(password + vSalt + dictU).digest() 245 | if inputHash == dictO[:32]: 246 | return True 247 | else: 248 | return False 249 | else: 250 | keyLength = keyLength / 8 251 | lenPass = len(password) 252 | if lenPass > 32: 253 | password = password[:32] 254 | elif lenPass < 32: 255 | password += paddingString[:32 - lenPass] 256 | rc4Key = hashlib.md5(password).digest() 257 | if revision > 2: 258 | counter = 0 259 | while counter < 50: 260 | rc4Key = hashlib.md5(rc4Key).digest() 261 | counter += 1 262 | rc4Key = rc4Key[:keyLength] 263 | if revision == 2: 264 | userPass = RC4(dictO, rc4Key) 265 | elif revision > 2: 266 | counter = 19 267 | while counter >= 0: 268 | newKey = '' 269 | for i in range(len(rc4Key)): 270 | newKey += chr(ord(rc4Key[i]) ^ counter) 271 | dictO = RC4(dictO, newKey) 272 | counter -= 1 273 | userPass = dictO 274 | else: 275 | # Is it possible?? 
276 | userPass = '' 277 | return isUserPass(userPass, computedUserPass, dictU, revision) 278 | 279 | 280 | def RC4(data, key): 281 | ''' 282 | RC4 implementation 283 | 284 | @param data: Bytes to be encrypyed/decrypted 285 | @param key: Key used for the algorithm 286 | @return: The encrypted/decrypted bytes 287 | ''' 288 | y = 0 289 | hash = {} 290 | box = {} 291 | ret = '' 292 | keyLength = len(key) 293 | dataLength = len(data) 294 | 295 | # Initialization 296 | for x in range(256): 297 | hash[x] = ord(key[x % keyLength]) 298 | box[x] = x 299 | for x in range(256): 300 | y = (y + int(box[x]) + int(hash[x])) % 256 301 | tmp = box[x] 302 | box[x] = box[y] 303 | box[y] = tmp 304 | 305 | z = y = 0 306 | for x in range(0, dataLength): 307 | z = (z + 1) % 256 308 | y = (y + box[z]) % 256 309 | tmp = box[z] 310 | box[z] = box[y] 311 | box[y] = tmp 312 | k = box[((box[z] + box[y]) % 256)] 313 | ret += chr(ord(data[x]) ^ k) 314 | return ret 315 | 316 | 317 | ''' 318 | Author: Evan Fosmark (http://www.evanfosmark.com/2008/06/xor-encryption-with-python/) 319 | ''' 320 | 321 | 322 | def xor(bytes, key): 323 | ''' 324 | Simple XOR implementation 325 | 326 | @param bytes: Bytes to be xored 327 | @param key: Key used for the operation, it's cycled. 328 | @return: The xored bytes 329 | ''' 330 | key = cycle(key) 331 | return ''.join(chr(ord(x) ^ ord(y)) for (x, y) in izip(bytes, key)) 332 | -------------------------------------------------------------------------------- /peepdf/README: -------------------------------------------------------------------------------- 1 | ** Home page ** 2 | 3 | http://peepdf.eternal-todo.com 4 | http://twitter.com/peepdf 5 | 6 | 7 | ** Dependencies ** 8 | 9 | - In order to analyse Javascript code "PyV8" is needed: 10 | 11 | http://code.google.com/p/pyv8/ 12 | 13 | 14 | - The "sctest" command is a wrapper of "sctest" (libemu). 
Besides libemu, pylibemu is used and must be installed: 15 | 16 | http://libemu.carnivore.it (latest version from git repository, Sourceforge package is outdated) 17 | https://github.com/buffer/pylibemu 18 | 19 | 20 | - To support XML output "lxml" is needed: 21 | 22 | http://lxml.de/installation.html 23 | 24 | 25 | - Included modules: lzw, colorama, jsbeautifier, ccitt, pythonaes (Thanks to all the developers!!) 26 | 27 | 28 | 29 | ** Installation ** 30 | 31 | No installation is needed apart from the dependencies listed above; just execute it! 32 | 33 | 34 | 35 | ** Execution ** 36 | 37 | There are two important options when peepdf is executed: 38 | 39 | -f: Ignores the parsing errors. Analysing malicious files probably leads to parsing errors, so this parameter should be set. 40 | -l: Sets the loose mode, which does not search for the endobj tag because it is not obligatory. Helpful with malformed files. 41 | 42 | 43 | * Simple execution 44 | 45 | Shows the statistics of the file after being decoded/decrypted and analysed: 46 | 47 | python peepdf.py [options] pdf_file 48 | 49 | 50 | * Interactive console 51 | 52 | Executes the interactive console to let you play with the PDF file: 53 | 54 | python peepdf.py -i [options] pdf_file 55 | 56 | If no PDF file is specified it's possible to use the decode/encode/js*/sctest commands and create a new PDF file: 57 | 58 | python peepdf.py -i 59 | 60 | 61 | * Batch execution 62 | 63 | It's possible to use a commands file to specify the commands to be executed in the batch mode. 
This type of execution is useful for automating the analysis of several files: 64 | 65 | python peepdf.py [options] -s commands_file pdf_file 66 | 67 | 68 | 69 | ** Updating ** 70 | 71 | Just type this and you will be updated to the latest version from the repository: 72 | 73 | python peepdf.py -u 74 | 75 | 76 | 77 | ** Some hints ** 78 | 79 | If the information shown when a PDF file is parsed is not enough to know if it's harmful or not, the following commands can help to determine it: 80 | 81 | * tree 82 | 83 | Shows the tree graph of the file or specified version. Here we can see suspicious elements. 84 | 85 | 86 | * offsets 87 | 88 | Shows the physical map of the file or the specified version of the document. This is helpful to see unusually big objects or big spaces between objects. 89 | 90 | 91 | * search 92 | 93 | Searches for the specified string or hexadecimal string in the objects (decoded and encrypted streams included). 94 | 95 | 96 | * object/rawobject 97 | 98 | Shows the (raw) content of the object. 99 | 100 | 101 | * stream/rawstream 102 | 103 | Shows the (raw) content of the stream. 104 | 105 | 106 | * The rest of commands, of course 107 | 108 | > help 109 | 110 | 111 | 112 | ** Bugs ** 113 | 114 | Send me bugs and comments, please!! ;) You can do it via mail (jesparza AT eternal-todo.com) or through Google Code (http://peepdf.googlecode.com). 115 | 116 | Thanks!! 117 | -------------------------------------------------------------------------------- /peepdf/TODO: -------------------------------------------------------------------------------- 1 | Pending tasks: 2 | 3 | - User manual 4 | - Documentation of methods in PDFCore.py 5 | - Add the rest of supported stream filters (better testing of the existing ones) 6 | - Automatic analysis of embedded PDF files 7 | - Add AES to the encryption implementation 8 | - Improve the automatic Javascript analysis, getting code from other parts of the documents (getAnnots, etc) 9 | - GUI 10 | - ActionScript analysis? 
-------------------------------------------------------------------------------- /peepdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-sei/pdfrankenstein/1a82844d1122ee956166123a9f5e0b8b22456747/peepdf/__init__.py -------------------------------------------------------------------------------- /peepdf/aes.py: -------------------------------------------------------------------------------- 1 | # 2 | # peepdf is a tool to analyse and modify PDF files 3 | # http://peepdf.eternal-todo.com 4 | # By Jose Miguel Esparza 5 | # 6 | # Copyright (C) 2012-2014 Jose Miguel Esparza 7 | # 8 | # This file is part of peepdf. 9 | # 10 | # peepdf is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation, either version 3 of the License, or 13 | # (at your option) any later version. 14 | # 15 | # peepdf is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with peepdf. If not, see . 22 | # 23 | 24 | """ 25 | Created from the demonstration of the pythonaes package. 
def decryptData(data, password=None, keyLength=None, mode='CBC'):
    '''
    Decrypt AES data whose first 16 bytes are the IV (method added for peepdf).

    data: encrypted content; bytes 0-15 are used as the initialisation
          vector and the remainder as ciphertext. Trailing bytes that do
          not fill a whole 16-byte block are dropped.
    password: the AES key (16, 24 or 32 characters/bytes).
    keyLength: key size in bits (128, 192 or 256); taken from the length
               of the password when omitted.
    mode: block cipher mode; only 'CBC' is implemented.

    Returns a tuple (status, content): (0, decrypted string) on success or
    (-1, error message) when the arguments cannot be used.
    '''
    # All argument checks happen before any cipher object is built, so bad
    # input is reported through the (-1, message) convention instead of a
    # TypeError (missing password) or NameError (unsupported mode).
    if password is None:
        return (-1, 'Missing password in AES decryption process')
    if keyLength is None:
        keyLength = len(password) * 8
    if keyLength not in [128, 192, 256]:
        return (-1, 'Bad length key in AES decryption process')
    if mode != 'CBC':
        return (-1, 'Unsupported mode in AES decryption process')

    def toInts(chunk):
        # Python 2 strings yield 1-char strings, Python 3 bytes yield ints.
        # A real list is returned because the cipher classes call len() on it.
        return [c if isinstance(c, int) else ord(c) for c in chunk]

    iv = toInts(data[:16])
    key = toInts(password)
    data = data[16:]
    if len(data) % 16 != 0:
        data = data[:-(len(data) % 16)]
    keyExpander = key_expander.KeyExpander(keyLength)
    expandedKey = keyExpander.expand(key)
    aesCipher = aes_cipher.AESCipher(expandedKey)
    aesMode = cbc_mode.CBCMode(aesCipher, 16)
    aesMode.set_iv(iv)
    # Collect the characters and join once instead of growing a string
    decryptedChars = []
    for i in range(0, len(data), 16):
        decryptedBytes = aesMode.decrypt_block(toInts(data[i:i + 16]))
        decryptedChars.extend(chr(byte) for byte in decryptedBytes)
    return (0, ''.join(decryptedChars))
8 | 9 | Running this file as __main__ will result in a self-test of the algorithm. 10 | 11 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf 12 | 13 | Copyright (c) 2010, Adam Newman http://www.caller9.com/ 14 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php 15 | """ 16 | __author__ = "Adam Newman" 17 | 18 | # Normally use relative import. In test mode use local import. 19 | try: 20 | from .aes_tables import sbox, i_sbox, galI, galNI 21 | except ValueError: 22 | from aes_tables import sbox, i_sbox, galI, galNI 23 | ups = ",".join("s%x" % x for x in range(16)) 24 | upr = ups.replace("s", "r") 25 | mix = ",".join(",".join( 26 | ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]^r%x" % (i + (i[0] + (0, 3, 2, 1)[j],))).format(j & 3, j + 1 & 3, 27 | j + 2 & 3, j + 3 & 3) for j 28 | in (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15))).replace("g2", 29 | "g").replace("g3", 30 | "g") 31 | i = mix.find("g[") 32 | while i != -1: 33 | mix = mix[:i] + mix[i + 2:i + 4] + mix[i + 5:] 34 | i = mix.find("g[", i) 35 | imix = ",".join(",".join( 36 | ("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]" % i).format(j & 3, j + 1 & 3, j + 2 & 3, j + 3 & 3) for j in 37 | (0, 3, 2, 1)) for i in ((0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11), (12, 13, 14, 15))) 38 | csl = ["s%x" % (x * 5 & 15) for x in range(16)] 39 | csr = ["s%x" % (x * -3 & 15) for x in range(16)] 40 | box = ",".join("s[%s]" % i for i in csl) 41 | ibox = ",".join("s[%s]^r%x" % i for i in zip(csr, range(16))) 42 | xor = ",".join("s[%s]^r%x" % i for i in zip(csl, range(16))) 43 | xori = ";".join("s%x^=r%x" % (i, i) for i in range(16)) 44 | ciph = """def decipher_block(f,s): 45 | g0,g1,g2,g3=galNI;ek=f._expanded_key;S=s+[0]*(16-len(s));s=sbox;R=ek[:16];X 46 | for f in range(!16):R=ek[f:f+16];S=B;S=M 47 | R=ek[f+16:] 48 | return """.replace("S", ups).replace("R", upr).replace("X", xori) 49 | 50 | 51 | class AESCipher: 52 | def 
__init__(self, expanded_key): 53 | self._expanded_key = expanded_key 54 | self._Nr = len(expanded_key) - 16 55 | 56 | exec ( 57 | ciph.replace("g2,g3", "").replace("dec", "c").replace("!", "16,f._Nr,").replace("B", box).replace("M", mix) + xor) 58 | exec (ciph.replace("NI", "I").replace(":16", "f._Nr:").replace("f+16:", ":16").replace("!", "f._Nr-16,0,-").replace( 59 | "sbox", "i_sbox").replace("B", ibox).replace("M", imix) + ibox) 60 | 61 | 62 | import unittest 63 | 64 | 65 | class TestCipher(unittest.TestCase): 66 | def test_cipher(self): 67 | """Test AES cipher with all key lengths""" 68 | import test_keys 69 | import key_expander 70 | 71 | test_data = test_keys.TestKeys() 72 | for key_size in 128, 192, 256: 73 | test_key_expander = key_expander.KeyExpander(key_size) 74 | test_expanded_key = test_key_expander.expand(test_data.test_key[key_size]) 75 | test_cipher = AESCipher(test_expanded_key) 76 | test_result_ciphertext = test_cipher.cipher_block(test_data.test_block_plaintext) 77 | self.assertEquals(len( 78 | [i for i, j in zip(test_result_ciphertext, test_data.test_block_ciphertext_validated[key_size]) if 79 | i == j]), 80 | 16, msg='Test %d bit cipher' % key_size) 81 | test_result_plaintext = test_cipher.decipher_block(test_data.test_block_ciphertext_validated[key_size]) 82 | self.assertEquals(len([i for i, j in zip(test_result_plaintext, test_data.test_block_plaintext) if i == j]), 83 | 16, msg='Test %d bit decipher' % key_size) 84 | 85 | 86 | if __name__ == "__main__": 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /peepdf/aespython/cbc_mode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | CBC Mode of operation 4 | 5 | Running this file as __main__ will result in a self-test of the algorithm. 
class CBCMode:
    """Perform CBC operation on a block and retain IV information for next operation

    block_cipher must provide cipher_block() and decipher_block(); blocks
    and IVs are sequences of integers of length block_size.
    """

    def __init__(self, block_cipher, block_size):
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Set the IV; a value whose length is not block_size is ignored."""
        # Copy into a list: accepts any iterable (e.g. a Python 3 map object)
        # and keeps later changes to the caller's sequence out of our state.
        iv = list(iv)
        if len(iv) == self._block_size:
            self._iv = iv

    def encrypt_block(self, plaintext):
        """Encrypt one block and keep the ciphertext as the next IV."""
        block = self._block_cipher.cipher_block([i ^ j for i, j in zip(plaintext, self._iv)])
        # Store a private copy so the caller may modify the returned block
        self._iv = list(block)
        return block

    def decrypt_block(self, ciphertext):
        """Decrypt one block and keep the ciphertext as the next IV."""
        plaintext = list(self._block_cipher.decipher_block(ciphertext))
        for i, v in enumerate(self._iv):
            plaintext[i] ^= v
        # Copy for the same reason as in set_iv()
        self._iv = list(ciphertext)
        return plaintext
class CFBMode:
    """Perform CFB operation on a block and retain IV information for next operation

    block_cipher must provide cipher_block(); blocks and IVs are sequences
    of integers of length block_size.
    """

    def __init__(self, block_cipher, block_size):
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Set the IV; a value whose length is not block_size is ignored."""
        # Copy into a list: accepts any iterable (e.g. a Python 3 map object)
        # and keeps later changes to the caller's sequence out of our state.
        iv = list(iv)
        if len(iv) == self._block_size:
            self._iv = iv

    def encrypt_block(self, plaintext):
        """Encrypt one block; the ciphertext becomes the next IV."""
        keystream = self._block_cipher.cipher_block(self._iv)
        ciphertext = [i ^ j for i, j in zip(plaintext, keystream)]
        # Keep a private copy so the caller may modify the returned list
        self._iv = list(ciphertext)
        return ciphertext

    def decrypt_block(self, ciphertext):
        """Decrypt one block; the ciphertext becomes the next IV."""
        keystream = self._block_cipher.cipher_block(self._iv)
        self._iv = list(ciphertext)
        return [i ^ j for i, j in zip(keystream, ciphertext)]
aes_cipher.AESCipher(test_expanded_key) 54 | 55 | test_cfb = CFBMode(test_cipher, 16) 56 | 57 | test_cfb.set_iv(test_data.test_mode_iv) 58 | for k in range(4): 59 | self.assertEquals(len([i for i, j in zip(test_data.test_cfb_ciphertext[k], 60 | test_cfb.encrypt_block(test_data.test_mode_plaintext[k])) if 61 | i == j]), 62 | 16, 63 | msg='CFB encrypt test block' + str(k)) 64 | 65 | test_cfb.set_iv(test_data.test_mode_iv) 66 | for k in range(4): 67 | self.assertEquals(len([i for i, j in zip(test_data.test_mode_plaintext[k], 68 | test_cfb.decrypt_block(test_data.test_cfb_ciphertext[k])) if 69 | i == j]), 70 | 16, 71 | msg='CFB decrypt test block' + str(k)) 72 | 73 | 74 | if __name__ == "__main__": 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /peepdf/aespython/key_expander.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | AES Key Expansion. 5 | 6 | Expands 128, 192, or 256 bit key for use with AES 7 | 8 | Running this file as __main__ will result in a self-test of the algorithm. 9 | 10 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf 11 | 12 | Copyright (c) 2010, Adam Newman http://www.caller9.com/ 13 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php 14 | """ 15 | __author__ = "Adam Newman" 16 | 17 | # Normally use relative import. In test mode use local import. 
class KeyExpander:
    """Perform AES Key Expansion

    Relies on the module-level names sbox and rcon (AES lookup tables) and
    xor (operator.xor).
    """

    # Expanded key size in bytes for each supported key size in bits
    _expanded_key_length = {128: 176, 192: 208, 256: 240}

    def __init__(self, key_length):
        """Prepare an expander for a 128, 192 or 256 bit key.

        Raises LookupError for any other key size.
        """
        self._key_length = key_length
        self._n = key_length >> 3  # key size in bytes

        if key_length in self._expanded_key_length:
            self._b = self._expanded_key_length[key_length]
        else:
            raise LookupError('Invalid Key Size')

    def expand(self, new_key):
        """
        Expand the encryption key per AES key schedule specifications

        http://en.wikipedia.org/wiki/Rijndael_key_schedule#Key_schedule_description

        new_key is an iterable of byte values whose length must match the
        key size given to the constructor, otherwise RuntimeError is raised.
        The expanded key is returned as a new list; the argument itself is
        left unmodified.
        """
        # Work on a private copy: the schedule is built by extending the list,
        # which previously grew the caller's key object as a side effect.
        new_key = list(new_key)

        # First n bytes are copied from key
        len_new_key = len(new_key)
        if len_new_key != self._n:
            raise RuntimeError('expand(): key size is invalid')
        rcon_iter = 1
        nex = new_key.extend

        # Grow the key until it is the correct length
        while 1:
            # Copy last 4 bytes of extended key, apply core, increment i(rcon_iter),
            # core Append the list of elements 1-3 and list comprised of element 0 (circular rotate left)
            # core For each element of this new list, put the result of sbox into output array.
            # xor with 4 bytes n bytes from end of extended key
            keyarr = [sbox[i] for i in new_key[-3:] + new_key[-4:-3]]
            # First byte of output array is XORed with rcon(iter)
            keyarr[0] ^= rcon[rcon_iter]
            nex(map(xor, keyarr, new_key[-self._n:4 - self._n]))
            rcon_iter += 1
            len_new_key += 4

            # Run three passes of 4 byte expansion using copy of 4 byte tail of extended key
            # which is then xor'd with 4 bytes n bytes from end of extended key
            for j in 0, 1, 2:
                nex(map(xor, new_key[-4:], new_key[-self._n:4 - self._n]))
                len_new_key += 4
            if len_new_key >= self._b:
                return new_key

            # If key length is 256 and key is not complete, add 4 bytes tail of extended key
            # run through sbox before xor with 4 bytes n bytes from end of extended key
            if self._key_length == 256:
                nex(map(xor, [sbox[x] for x in new_key[-4:]], new_key[-self._n:4 - self._n]))
                len_new_key += 4
                if len_new_key >= self._b:
                    return new_key

            # If key length is 192 or 256 and key is not complete, run 2 or 3 passes respectively
            # of 4 byte tail of extended key xor with 4 bytes n bytes from end of extended key
            if self._key_length != 128:
                for j in ((0, 1) if self._key_length == 192 else (0, 1, 2)):
                    nex(map(xor, new_key[-4:], new_key[-self._n:4 - self._n]))
                    len_new_key += 4
                if len_new_key >= self._b:
                    return new_key
class OFBMode:
    """Perform OFB operation on a block and retain IV information for next operation

    block_cipher must provide cipher_block(); blocks and IVs are sequences
    of integers of length block_size.
    """

    def __init__(self, block_cipher, block_size):
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Set the IV; a value whose length is not block_size is ignored."""
        # Copy into a list: accepts any iterable (e.g. a Python 3 map object)
        # and keeps later changes to the caller's sequence out of our state.
        iv = list(iv)
        if len(iv) == self._block_size:
            self._iv = iv

    def _next_keystream(self):
        # The cipher output is fed back as the next cipher input. It is stored
        # as a list because a cipher may hand back a tuple while padding its
        # input with list concatenation (the generated AESCipher.cipher_block
        # in this package does `s+[0]*(16-len(s))`), which would fail on the
        # second block if the raw result were fed back.
        self._iv = list(self._block_cipher.cipher_block(self._iv))
        return self._iv

    def encrypt_block(self, plaintext):
        """Encrypt one block with the next keystream block."""
        cipher_iv = self._next_keystream()
        return [i ^ j for i, j in zip(plaintext, cipher_iv)]

    def decrypt_block(self, ciphertext):
        """Decrypt one block with the next keystream block."""
        cipher_iv = self._next_keystream()
        return [i ^ j for i, j in zip(cipher_iv, ciphertext)]
| for k in range(4): 57 | self.assertEquals(len([i for i, j in zip(test_data.test_ofb_ciphertext[k], 58 | test_ofb.encrypt_block(test_data.test_mode_plaintext[k])) if 59 | i == j]), 60 | 16, 61 | msg='OFB encrypt test block' + str(k)) 62 | 63 | test_ofb.set_iv(test_data.test_mode_iv) 64 | for k in range(4): 65 | self.assertEquals(len([i for i, j in zip(test_data.test_mode_plaintext[k], 66 | test_ofb.decrypt_block(test_data.test_ofb_ciphertext[k])) if 67 | i == j]), 68 | 16, 69 | msg='OFB decrypt test block' + str(k)) 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /peepdf/aespython/test_keys.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test keys and data for self-test operations. 3 | 4 | Test data from: 5 | NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf 6 | NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf 7 | 8 | Copyright (c) 2010, Adam Newman http://www.caller9.com/ 9 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php 10 | """ 11 | __author__ = "Adam Newman" 12 | 13 | 14 | class TestKeys: 15 | """Test data, keys, IVs, and output to use in self-tests""" 16 | test_key = { 17 | 128: [ 18 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f] 19 | , 192: [ 20 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 21 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17] 22 | , 256: [ 23 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 24 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f] 25 | } 26 | 27 | test_expanded_key_validated = { 28 | 128: [ 29 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 30 | 
0xd6, 0xaa, 0x74, 0xfd, 0xd2, 0xaf, 0x72, 0xfa, 0xda, 0xa6, 0x78, 0xf1, 0xd6, 0xab, 0x76, 0xfe, 31 | 0xb6, 0x92, 0xcf, 0x0b, 0x64, 0x3d, 0xbd, 0xf1, 0xbe, 0x9b, 0xc5, 0x00, 0x68, 0x30, 0xb3, 0xfe, 32 | 0xb6, 0xff, 0x74, 0x4e, 0xd2, 0xc2, 0xc9, 0xbf, 0x6c, 0x59, 0x0c, 0xbf, 0x04, 0x69, 0xbf, 0x41, 33 | 0x47, 0xf7, 0xf7, 0xbc, 0x95, 0x35, 0x3e, 0x03, 0xf9, 0x6c, 0x32, 0xbc, 0xfd, 0x05, 0x8d, 0xfd, 34 | 0x3c, 0xaa, 0xa3, 0xe8, 0xa9, 0x9f, 0x9d, 0xeb, 0x50, 0xf3, 0xaf, 0x57, 0xad, 0xf6, 0x22, 0xaa, 35 | 0x5e, 0x39, 0x0f, 0x7d, 0xf7, 0xa6, 0x92, 0x96, 0xa7, 0x55, 0x3d, 0xc1, 0x0a, 0xa3, 0x1f, 0x6b, 36 | 0x14, 0xf9, 0x70, 0x1a, 0xe3, 0x5f, 0xe2, 0x8c, 0x44, 0x0a, 0xdf, 0x4d, 0x4e, 0xa9, 0xc0, 0x26, 37 | 0x47, 0x43, 0x87, 0x35, 0xa4, 0x1c, 0x65, 0xb9, 0xe0, 0x16, 0xba, 0xf4, 0xae, 0xbf, 0x7a, 0xd2, 38 | 0x54, 0x99, 0x32, 0xd1, 0xf0, 0x85, 0x57, 0x68, 0x10, 0x93, 0xed, 0x9c, 0xbe, 0x2c, 0x97, 0x4e, 39 | 0x13, 0x11, 0x1d, 0x7f, 0xe3, 0x94, 0x4a, 0x17, 0xf3, 0x07, 0xa7, 0x8b, 0x4d, 0x2b, 0x30, 0xc5] 40 | , 192: [ 41 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 42 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x58, 0x46, 0xf2, 0xf9, 0x5c, 0x43, 0xf4, 0xfe, 43 | 0x54, 0x4a, 0xfe, 0xf5, 0x58, 0x47, 0xf0, 0xfa, 0x48, 0x56, 0xe2, 0xe9, 0x5c, 0x43, 0xf4, 0xfe, 44 | 0x40, 0xf9, 0x49, 0xb3, 0x1c, 0xba, 0xbd, 0x4d, 0x48, 0xf0, 0x43, 0xb8, 0x10, 0xb7, 0xb3, 0x42, 45 | 0x58, 0xe1, 0x51, 0xab, 0x04, 0xa2, 0xa5, 0x55, 0x7e, 0xff, 0xb5, 0x41, 0x62, 0x45, 0x08, 0x0c, 46 | 0x2a, 0xb5, 0x4b, 0xb4, 0x3a, 0x02, 0xf8, 0xf6, 0x62, 0xe3, 0xa9, 0x5d, 0x66, 0x41, 0x0c, 0x08, 47 | 0xf5, 0x01, 0x85, 0x72, 0x97, 0x44, 0x8d, 0x7e, 0xbd, 0xf1, 0xc6, 0xca, 0x87, 0xf3, 0x3e, 0x3c, 48 | 0xe5, 0x10, 0x97, 0x61, 0x83, 0x51, 0x9b, 0x69, 0x34, 0x15, 0x7c, 0x9e, 0xa3, 0x51, 0xf1, 0xe0, 49 | 0x1e, 0xa0, 0x37, 0x2a, 0x99, 0x53, 0x09, 0x16, 0x7c, 0x43, 0x9e, 0x77, 0xff, 0x12, 0x05, 0x1e, 50 | 0xdd, 0x7e, 0x0e, 0x88, 0x7e, 0x2f, 0xff, 0x68, 0x60, 0x8f, 0xc8, 
0x42, 0xf9, 0xdc, 0xc1, 0x54, 51 | 0x85, 0x9f, 0x5f, 0x23, 0x7a, 0x8d, 0x5a, 0x3d, 0xc0, 0xc0, 0x29, 0x52, 0xbe, 0xef, 0xd6, 0x3a, 52 | 0xde, 0x60, 0x1e, 0x78, 0x27, 0xbc, 0xdf, 0x2c, 0xa2, 0x23, 0x80, 0x0f, 0xd8, 0xae, 0xda, 0x32, 53 | 0xa4, 0x97, 0x0a, 0x33, 0x1a, 0x78, 0xdc, 0x09, 0xc4, 0x18, 0xc2, 0x71, 0xe3, 0xa4, 0x1d, 0x5d] 54 | , 256: [ 55 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 56 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 57 | 0xa5, 0x73, 0xc2, 0x9f, 0xa1, 0x76, 0xc4, 0x98, 0xa9, 0x7f, 0xce, 0x93, 0xa5, 0x72, 0xc0, 0x9c, 58 | 0x16, 0x51, 0xa8, 0xcd, 0x02, 0x44, 0xbe, 0xda, 0x1a, 0x5d, 0xa4, 0xc1, 0x06, 0x40, 0xba, 0xde, 59 | 0xae, 0x87, 0xdf, 0xf0, 0x0f, 0xf1, 0x1b, 0x68, 0xa6, 0x8e, 0xd5, 0xfb, 0x03, 0xfc, 0x15, 0x67, 60 | 0x6d, 0xe1, 0xf1, 0x48, 0x6f, 0xa5, 0x4f, 0x92, 0x75, 0xf8, 0xeb, 0x53, 0x73, 0xb8, 0x51, 0x8d, 61 | 0xc6, 0x56, 0x82, 0x7f, 0xc9, 0xa7, 0x99, 0x17, 0x6f, 0x29, 0x4c, 0xec, 0x6c, 0xd5, 0x59, 0x8b, 62 | 0x3d, 0xe2, 0x3a, 0x75, 0x52, 0x47, 0x75, 0xe7, 0x27, 0xbf, 0x9e, 0xb4, 0x54, 0x07, 0xcf, 0x39, 63 | 0x0b, 0xdc, 0x90, 0x5f, 0xc2, 0x7b, 0x09, 0x48, 0xad, 0x52, 0x45, 0xa4, 0xc1, 0x87, 0x1c, 0x2f, 64 | 0x45, 0xf5, 0xa6, 0x60, 0x17, 0xb2, 0xd3, 0x87, 0x30, 0x0d, 0x4d, 0x33, 0x64, 0x0a, 0x82, 0x0a, 65 | 0x7c, 0xcf, 0xf7, 0x1c, 0xbe, 0xb4, 0xfe, 0x54, 0x13, 0xe6, 0xbb, 0xf0, 0xd2, 0x61, 0xa7, 0xdf, 66 | 0xf0, 0x1a, 0xfa, 0xfe, 0xe7, 0xa8, 0x29, 0x79, 0xd7, 0xa5, 0x64, 0x4a, 0xb3, 0xaf, 0xe6, 0x40, 67 | 0x25, 0x41, 0xfe, 0x71, 0x9b, 0xf5, 0x00, 0x25, 0x88, 0x13, 0xbb, 0xd5, 0x5a, 0x72, 0x1c, 0x0a, 68 | 0x4e, 0x5a, 0x66, 0x99, 0xa9, 0xf2, 0x4f, 0xe0, 0x7e, 0x57, 0x2b, 0xaa, 0xcd, 0xf8, 0xcd, 0xea, 69 | 0x24, 0xfc, 0x79, 0xcc, 0xbf, 0x09, 0x79, 0xe9, 0x37, 0x1a, 0xc2, 0x3c, 0x6d, 0x68, 0xde, 0x36] 70 | } 71 | 72 | test_block_ciphertext_validated = { 73 | 128: [ 74 | 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 
0x80, 0x70, 0xb4, 0xc5, 0x5a] 75 | , 192: [ 76 | 0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91] 77 | , 256: [ 78 | 0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89] 79 | } 80 | 81 | test_block_plaintext = [ 82 | 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff] 83 | 84 | # After initial validation, these deviated from test in SP 800-38A to use same key, iv, and plaintext on tests. 85 | # Still valid, just easier to test with. 86 | test_mode_key = [ 87 | 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, 88 | 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4] 89 | test_mode_iv = [ 90 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f] 91 | test_mode_plaintext = [ 92 | [0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a], 93 | [0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51], 94 | [0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef], 95 | [0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10]] 96 | test_cbc_ciphertext = [ 97 | [0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6], 98 | [0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d], 99 | [0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61], 100 | [0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b]] 101 | test_cfb_ciphertext = [ 102 | [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60], 103 | [0x39, 0xff, 0xed, 0x14, 0x3b, 0x28, 0xb1, 0xc8, 0x32, 
0x11, 0x3c, 0x63, 0x31, 0xe5, 0x40, 0x7b], 104 | [0xdf, 0x10, 0x13, 0x24, 0x15, 0xe5, 0x4b, 0x92, 0xa1, 0x3e, 0xd0, 0xa8, 0x26, 0x7a, 0xe2, 0xf9], 105 | [0x75, 0xa3, 0x85, 0x74, 0x1a, 0xb9, 0xce, 0xf8, 0x20, 0x31, 0x62, 0x3d, 0x55, 0xb1, 0xe4, 0x71]] 106 | test_ofb_ciphertext = [ 107 | [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60], 108 | [0x4f, 0xeb, 0xdc, 0x67, 0x40, 0xd2, 0x0b, 0x3a, 0xc8, 0x8f, 0x6a, 0xd8, 0x2a, 0x4f, 0xb0, 0x8d], 109 | [0x71, 0xab, 0x47, 0xa0, 0x86, 0xe8, 0x6e, 0xed, 0xf3, 0x9d, 0x1c, 0x5b, 0xba, 0x97, 0xc4, 0x08], 110 | [0x01, 0x26, 0x14, 0x1d, 0x67, 0xf3, 0x7b, 0xe8, 0x53, 0x8f, 0x5a, 0x8b, 0xe7, 0x40, 0xe4, 0x84]] 111 | 112 | def hex_output(self, list): 113 | # Debugging output helper 114 | result = '[' 115 | for i in list[:-1]: 116 | result += hex(i) + ',' 117 | return result + hex(list[-1]) + ']' 118 | -------------------------------------------------------------------------------- /peepdf/colorama/__init__.py: -------------------------------------------------------------------------------- 1 | from .initialise import init 2 | from .ansi import Fore, Back, Style 3 | from .ansitowin32 import AnsiToWin32 4 | 5 | VERSION = '0.1.18' 6 | -------------------------------------------------------------------------------- /peepdf/colorama/ansi.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module generates ANSI character codes to printing colors to terminals. 
3 | See: http://en.wikipedia.org/wiki/ANSI_escape_code 4 | ''' 5 | 6 | CSI = '\033[' 7 | 8 | 9 | def code_to_chars(code): 10 | return CSI + str(code) + 'm' 11 | 12 | 13 | class AnsiCodes(object): 14 | def __init__(self, codes): 15 | for name in dir(codes): 16 | if not name.startswith('_'): 17 | value = getattr(codes, name) 18 | setattr(self, name, code_to_chars(value)) 19 | 20 | 21 | class AnsiFore: 22 | BLACK = 30 23 | RED = 31 24 | GREEN = 32 25 | YELLOW = 33 26 | BLUE = 34 27 | MAGENTA = 35 28 | CYAN = 36 29 | WHITE = 37 30 | RESET = 39 31 | 32 | 33 | class AnsiBack: 34 | BLACK = 40 35 | RED = 41 36 | GREEN = 42 37 | YELLOW = 43 38 | BLUE = 44 39 | MAGENTA = 45 40 | CYAN = 46 41 | WHITE = 47 42 | RESET = 49 43 | 44 | 45 | class AnsiStyle: 46 | BRIGHT = 1 47 | DIM = 2 48 | NORMAL = 22 49 | RESET_ALL = 0 50 | 51 | 52 | Fore = AnsiCodes(AnsiFore) 53 | Back = AnsiCodes(AnsiBack) 54 | Style = AnsiCodes(AnsiStyle) 55 | -------------------------------------------------------------------------------- /peepdf/colorama/ansitowin32.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style 5 | from .winterm import WinTerm, WinColor, WinStyle 6 | from .win32 import windll 7 | 8 | if windll is not None: 9 | winterm = WinTerm() 10 | 11 | 12 | def is_a_tty(stream): 13 | return hasattr(stream, 'isatty') and stream.isatty() 14 | 15 | 16 | class StreamWrapper(object): 17 | ''' 18 | Wraps a stream (such as stdout), acting as a transparent proxy for all 19 | attribute access apart from method 'write()', which is delegated to our 20 | Converter instance. 21 | ''' 22 | 23 | def __init__(self, wrapped, converter): 24 | # double-underscore everything to prevent clashes with names of 25 | # attributes on the wrapped stream object. 
class AnsiToWin32(object):
    '''
    Implements a 'write()' method which, on Windows, will strip ANSI character
    sequences from the text, and if outputting to a tty, will convert them into
    win32 function calls.
    '''
    # Same pattern as before; the backslashes are doubled so the literal no
    # longer depends on invalid string escapes ('\[' and '\d').
    ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')

    def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
        # The wrapped stream (normally sys.stdout or sys.stderr)
        self.wrapped = wrapped

        # should we reset colors to defaults after every .write()
        self.autoreset = autoreset

        # create the proxy wrapping our output stream
        self.stream = StreamWrapper(wrapped, self)

        on_windows = sys.platform.startswith('win')

        # should we strip ANSI sequences from our output?
        if strip is None:
            strip = on_windows
        self.strip = strip

        # should we should convert ANSI sequences into win32 calls?
        if convert is None:
            convert = on_windows and is_a_tty(wrapped)
        self.convert = convert

        # dict of ansi codes to win32 functions and parameters
        self.win32_calls = self.get_win32_calls()

        # are we wrapping stderr?
        self.on_stderr = self.wrapped is sys.stderr

    def should_wrap(self):
        '''
        True if this class is actually needed. If false, then the output
        stream will not be affected, nor will win32 calls be issued, so
        wrapping stdout is not actually required. This will generally be
        False on non-Windows platforms, unless optional functionality like
        autoreset has been requested using kwargs to init()
        '''
        return self.convert or self.strip or self.autoreset

    def get_win32_calls(self):
        '''
        Return the mapping of ANSI SGR codes to (function, *args) tuples.
        The mapping is empty when conversion is off or win32 is unavailable.
        '''
        # 'winterm' only exists at module level when windll was importable,
        # so test windll first to avoid a NameError on other platforms.
        if self.convert and windll is not None and winterm:
            return {
                AnsiStyle.RESET_ALL: (winterm.reset_all,),
                AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
                AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
                AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
                AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
                AnsiFore.RED: (winterm.fore, WinColor.RED),
                AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
                AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
                AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
                AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
                AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
                AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
                AnsiFore.RESET: (winterm.fore,),
                AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
                AnsiBack.RED: (winterm.back, WinColor.RED),
                AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
                AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
                AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
                AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
                AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
                AnsiBack.WHITE: (winterm.back, WinColor.GREY),
                AnsiBack.RESET: (winterm.back,),
            }
        # An empty dict keeps the 'in' test in call_win32() valid
        return {}

    def write(self, text):
        '''
        Write text to the wrapped stream, stripping/converting ANSI sequences
        when requested, then reset the style if autoreset is on.
        '''
        if self.strip or self.convert:
            self.write_and_convert(text)
        else:
            self.wrapped.write(text)
            self.wrapped.flush()
        if self.autoreset:
            self.reset_all()

    def reset_all(self):
        '''
        Reset colors and style, via win32 calls or by writing the ANSI code.
        '''
        if self.convert:
            self.call_win32('m', (0,))
        else:
            self.wrapped.write(Style.RESET_ALL)

    def write_and_convert(self, text):
        '''
        Write the given text to our wrapped stream, stripping any ANSI
        sequences from the text, and optionally converting them into win32
        calls.
        '''
        cursor = 0
        for match in self.ANSI_RE.finditer(text):
            start, end = match.span()
            self.write_plain_text(text, cursor, start)
            self.convert_ansi(*match.groups())
            cursor = end
        self.write_plain_text(text, cursor, len(text))

    def write_plain_text(self, text, start, end):
        '''
        Write text[start:end] to the wrapped stream, if it is not empty.
        '''
        if start < end:
            self.wrapped.write(text[start:end])
            self.wrapped.flush()

    def convert_ansi(self, paramstring, command):
        '''
        Turn one matched ANSI sequence into win32 calls when converting.
        '''
        if self.convert:
            params = self.extract_params(paramstring)
            self.call_win32(command, params)

    def extract_params(self, paramstring):
        '''
        Parse '1;31' into (1, 31); empty fields are skipped.
        '''
        return tuple(int(p) for p in paramstring.split(';') if p != '')

    def call_win32(self, command, params):
        '''
        Run the win32 function registered for each SGR ('m') parameter.
        Other commands, and parameters without a mapping, are ignored.
        '''
        # A sequence without parameters ("ESC[m") means "reset all" (code 0).
        # extract_params() returns a tuple, so test for emptiness rather than
        # comparing with a list, which never matched.
        if not params:
            params = (0,)
        if command == 'm':
            for param in params:
                if param in self.win32_calls:
                    func_args = self.win32_calls[param]
                    func = func_args[0]
                    args = func_args[1:]
                    kwargs = dict(on_stderr=self.on_stderr)
                    func(*args, **kwargs)
def wrap_stream(stream, convert, strip, autoreset, wrap):
    """Return the stream to install in place of *stream*.

    When *wrap* is false the stream is handed back untouched. Otherwise an
    AnsiToWin32 wrapper is built with the given options, and its proxy
    stream is returned only if the wrapper reports that wrapping is
    actually needed; if not, the original stream is returned.
    """
    if not wrap:
        return stream

    proxy = AnsiToWin32(
        stream, convert=convert, strip=strip, autoreset=autoreset)
    return proxy.stream if proxy.should_wrap() else stream
def FillConsoleOutputCharacter(stream_id, char, length, start):
    """Write *char* to the console *length* times, beginning at *start*.

    stream_id -- STDOUT or STDERR, used to look up the console handle
    char      -- the single character to write
    length    -- number of character cells to fill
    start     -- (x, y) pair, unpacked into a COORD

    Returns the number of characters actually written.
    """
    handle = handles[stream_id]
    # TCHAR is c_char, which only accepts a one-byte bytes object; encode
    # text input so the call also works where str is not bytes.
    if not isinstance(char, bytes):
        char = char.encode()
    char = TCHAR(char)
    length = DWORD(length)
    start = COORD(*start)
    num_written = DWORD(0)
    # kernel32 exports only the ANSI ('A') and wide ('W') variants; the
    # undecorated name is a C macro, so looking it up through ctypes raised
    # "AttributeError: function 'FillConsoleOutputCharacter' not found".
    # The 'A' variant matches the c_char argument type used above.
    success = windll.kernel32.FillConsoleOutputCharacterA(
        handle, char, length, start, byref(num_written))
    assert success
    return num_written.value
# from wincon.h
class WinColor(object):
    """Console colour indices as used in win32 character attributes."""
    BLACK = 0
    BLUE = 1
    GREEN = 2
    CYAN = 3
    RED = 4
    MAGENTA = 5
    YELLOW = 6
    GREY = 7


# from wincon.h
class WinStyle(object):
    """Intensity bit of a win32 character attribute."""
    NORMAL = 0x00  # dim text, dim background
    BRIGHT = 0x08  # bright text, dim background


class WinTerm(object):
    """Tracks foreground, background and style and pushes them to the console."""

    def __init__(self):
        # Remember the attributes the console started with, both packed
        # and split into their three components.
        info = win32.GetConsoleScreenBufferInfo(win32.STDOUT)
        self._default = info.wAttributes
        self.set_attrs(self._default)
        self._default_fore = self._fore
        self._default_back = self._back
        self._default_style = self._style

    def get_attrs(self):
        """Pack the current fore/back/style into one attribute value."""
        background = self._back * 16
        return self._fore + background + self._style

    def set_attrs(self, value):
        """Split a packed attribute value into fore, back and style."""
        self._style = value & WinStyle.BRIGHT
        self._back = (value >> 4) & 7
        self._fore = value & 7

    def reset_all(self, on_stderr=None):
        """Restore the attributes captured at construction time."""
        self.set_attrs(self._default)
        self.set_console(attrs=self._default)

    def fore(self, fore=None, on_stderr=False):
        """Set the foreground colour (default colour when None)."""
        self._fore = self._default_fore if fore is None else fore
        self.set_console(on_stderr=on_stderr)

    def back(self, back=None, on_stderr=False):
        """Set the background colour (default colour when None)."""
        self._back = self._default_back if back is None else back
        self.set_console(on_stderr=on_stderr)

    def style(self, style=None, on_stderr=False):
        """Set the style (default style when None)."""
        self._style = self._default_style if style is None else style
        self.set_console(on_stderr=on_stderr)

    def set_console(self, attrs=None, on_stderr=False):
        """Apply *attrs* (or the current state when None) to the console."""
        if attrs is None:
            attrs = self.get_attrs()
        target = win32.STDERR if on_stderr else win32.STDOUT
        win32.SetConsoleTextAttribute(target, attrs)
class UnpackingError(Exception):
    """Raised for badly packed source or any general unpacking failure.

    The exception argument is a meaningful description of what went wrong.
    """
def filtercomments(source):
    """NOT USED: strips the comments that open `source` and puts them,
    one per line, at the top of the returned string.

    Both `/* ... */` blocks (possibly spanning several lines) and `// ...`
    line comments are recognised. Source that does not start with a
    complete comment is returned unchanged; an unterminated `/*` is left
    in place rather than raising.
    """
    leading_comments = []

    while True:
        # A block comment runs up to the first '*/'; a line comment runs
        # to the end of its line. Leading whitespace belongs to the match
        # so that len(comment) is the exact amount of text to drop.
        match = (re.match(r'\s*/\*.*?\*/', source, re.DOTALL) or
                 re.match(r'\s*//[^\n]*', source))
        if match is None:
            break
        comment = match.group(0)
        leading_comments.append(comment)
        source = re.sub(r'^\s+', '', source[len(comment):])

    # Terminate every comment with a newline: otherwise a trailing '//'
    # comment would swallow the first line of real code.
    return ''.join(comment + '\n' for comment in leading_comments) + source
def detect(source):
    """Detects if source is likely to be eval() packed."""
    return source.strip().lower().startswith('eval(function(')


def unpack(source):
    """Runs source and return resulting code.

    Source that is not eval() packed is returned unchanged. Otherwise the
    leading `eval` is replaced by a `print` statement and the script is
    handed to jseval().
    """
    if not detect(source):
        return source
    # detect() ignores surrounding whitespace, so strip it here as well;
    # otherwise the [4:] slice would cut the wrong four characters.
    return jseval('print %s;' % source.strip()[4:])


# In case of failure, we'll just return the original, without crashing on user.
def jseval(script):
    """Run code in the JS interpreter and return output.

    Returns `script` itself when the `js` executable cannot be started,
    exits with a non-zero status, or reports errors.
    """
    try:
        interpreter = Popen(['js'], stdin=PIPE, stdout=PIPE)
    except OSError:
        return script
    # The pipes are binary: encode text on the way in and decode on the
    # way out so the caller gets back the same type it passed.
    is_text = not isinstance(script, bytes)
    payload = script.encode('utf-8') if is_text else script
    result, errors = interpreter.communicate(payload)
    if interpreter.poll() or errors:
        return script
    return result.decode('utf-8') if is_text else result
def smartsplit(code):
    """Split `code` at " symbol, only if it is not escaped.

    Returns the list of double-quoted string literals found in `code`,
    each still wrapped in its quotes and with escape sequences kept
    verbatim. An unterminated final string is closed implicitly.
    """
    strings = []
    length = len(code)
    pos = 0
    while pos < length:
        if code[pos] == '"':
            chars = []  # characters of the new word
            pos += 1
            while pos < length and code[pos] != '"':
                if code[pos] == '\\':
                    chars.append('\\')
                    pos += 1
                    # A backslash as the very last character has nothing
                    # to escape; stop instead of indexing past the end.
                    if pos >= length:
                        break
                chars.append(code[pos])
                pos += 1
            strings.append('"%s"' % ''.join(chars))
        pos += 1
    return strings


def detect(code):
    """Detects if `code` is JavascriptObfuscator.com packed."""
    # prefer `is not` idiom, so that a true boolean is returned
    return (re.search(r'^var _0x[a-f0-9]+ ?\= ?\[', code) is not None)


def unpack(code):
    """Unpacks JavascriptObfuscator.com packed code.

    The leading `var _0x... = [...];` string table is removed and every
    `_0x...[n]` reference is replaced by the n-th string literal. Code
    that is not packed, or whose table cannot be parsed, is returned
    unchanged.
    """
    if detect(code):
        matches = re.search(r'var (_0x[a-f\d]+) ?\= ?\[(.*?)\];', code)
        if matches:
            variable = matches.group(1)
            dictionary = smartsplit(matches.group(2))
            code = code[len(matches.group(0)):]
            for key, value in enumerate(dictionary):
                code = code.replace(r'%s[%s]' % (variable, key), value)
    return code
unpackings, your script gets turned into something like this, 18 | # as of 2011-08-26: 19 | # 20 | # var _escape = 'your_script_escaped'; 21 | # var _111 = document.createElement('script'); 22 | # _111.src = 'http://api.www.myobfuscate.com/?getsrc=ok' + 23 | # '&ref=' + encodeURIComponent(document.referrer) + 24 | # '&url=' + encodeURIComponent(document.URL); 25 | # var 000 = document.getElementsByTagName('head')[0]; 26 | # 000.appendChild(_111); 27 | # document.write(unescape(_escape)); 28 | # 29 | 30 | """Deobfuscator for scripts messed up with MyObfuscate.com""" 31 | 32 | import re 33 | import base64 34 | 35 | # Python 2 retrocompatibility 36 | # pylint: disable=F0401 37 | # pylint: disable=E0611 38 | try: 39 | from urllib import unquote 40 | except ImportError: 41 | from urllib.parse import unquote 42 | 43 | from jsbeautifier.unpackers import UnpackingError 44 | 45 | PRIORITY = 1 46 | 47 | CAVEAT = """// 48 | // Unpacker warning: be careful when using myobfuscate.com for your projects: 49 | // scripts obfuscated by the free online version call back home. 
def make_graph(self, tree):
    """Build the list of edge strings for `tree` by calling edges().

    Returns an empty list when `tree` is None; otherwise a list of
    '<parent id> <child id>\\n' strings, with 0 used as the id of the root.
    """
    res = []
    # Explicit check for None to avoid FutureWarning
    if tree is not None:
        self.edges(tree, res, 0)
    return res


def edges(self, parent, output, id):
    """Append to `output` one string per parent/child connection.

    Walks the children of `parent` recursively. A child carrying an "id"
    attribute yields the edge '<id> <child id>\\n' and becomes the parent
    id for its own subtree; a child without one is walked under the
    current `id`. Returns None; results are accumulated in `output`.
    """
    for child in list(parent):
        # Iterating a plain string yields strings, which have no .get();
        # stop at once in that case.
        # NOTE(review): presumably this covers the error string that
        # tree_from_xml returns instead of a tree -- confirm with callers.
        if isinstance(child, str):
            return
        cid = child.get("id")
        if cid is not None:
            output.append(str(id) + ' ' + cid + '\n')
            self.edges(child, output, cid)
        else:
            self.edges(child, output, id)
    return