├── .gitignore
├── LICENSE.md
├── README.md
├── __init__.py
├── bin
    └── __init__.py
├── db
    └── __init__.py
├── lib
    ├── __init__.py
    ├── parse
    │   ├── __init__.py
    │   ├── pdfminer
    │   │   ├── __init__.py
    │   │   ├── arcfour.py
    │   │   ├── ascii85.py
    │   │   ├── ccitt.py
    │   │   ├── lzw.py
    │   │   ├── pdfdocument.py
    │   │   ├── pdfparser.py
    │   │   ├── pdftypes.py
    │   │   ├── psparser.py
    │   │   ├── runlength.py
    │   │   └── utils.py
    │   └── peepdf
    │   │   ├── AUTHORS
    │   │   ├── CHANGELOG
    │   │   ├── COPYING
    │   │   ├── JSAnalysis.py
    │   │   ├── PDFConsole.py
    │   │   ├── PDFCore.py
    │   │   ├── PDFCrypto.py
    │   │   ├── PDFFilters.py
    │   │   ├── PDFUtils.py
    │   │   ├── README
    │   │   ├── TODO
    │   │   ├── __init__.py
    │   │   ├── aes.py
    │   │   ├── aespython
    │   │       ├── __init__.py
    │   │       ├── aes_cipher.py
    │   │       ├── aes_tables.py
    │   │       ├── cbc_mode.py
    │   │       ├── cfb_mode.py
    │   │       ├── key_expander.py
    │   │       ├── ofb_mode.py
    │   │       └── test_keys.py
    │   │   ├── ccitt.py
    │   │   ├── colorama
    │   │       ├── PKG-INFO
    │   │       ├── __init__.py
    │   │       ├── ansi.py
    │   │       ├── ansitowin32.py
    │   │       ├── initialise.py
    │   │       ├── win32.py
    │   │       └── winterm.py
    │   │   ├── jjdecode.py
    │   │   ├── jsbeautifier
    │   │       ├── __init__.py
    │   │       └── unpackers
    │   │       │   ├── README.specs.mkd
    │   │       │   ├── __init__.py
    │   │       │   ├── evalbased.py
    │   │       │   ├── javascriptobfuscator.py
    │   │       │   ├── myobfuscate.py
    │   │       │   ├── packer.py
    │   │       │   └── urlencode.py
    │   │   ├── lzw.py
    │   │   ├── peepdf.dtd
    │   │   └── peepdf.py
    ├── scandir.py
    └── spectragraph
    │   ├── __init__.py
    │   ├── conversion.py
    │   ├── matrix.py
    │   └── spectragraph.py
├── logs
    └── __init__.py
├── main.py
├── process
    ├── __init__.py
    ├── hashers
    │   ├── __init__.py
    │   ├── hasher.py
    │   ├── pdfminer.py
    │   └── peepdf.py
    ├── parsers
    │   ├── __init__.py
    │   ├── parse.py
    │   ├── pdfminer.py
    │   └── peepdf.py
    ├── pdf.py
    ├── pdfhasher.py
    ├── run-jpexs.py
    └── sdhasher.py
├── storage
    ├── __init__.py
    ├── dbgw.py
    └── storage.py
├── util
    ├── __init__.py
    ├── huntterp.py
    └── str_utils.py
└── xml-output
    └── __init__.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .gitignore support plugin (hsz.mobi)
  2 | ### vim template
  3 | [._]*.s[a-w][a-z]
  4 | [._]s[a-w][a-z]
  5 | *.un~
  6 | Session.vim
  7 | .netrwhist
  8 | *~
  9 | 
 10 | 
 11 | ### C template
 12 | # Object files
 13 | *.o
 14 | *.ko
 15 | *.obj
 16 | *.elf
 17 | 
 18 | # Precompiled Headers
 19 | *.gch
 20 | *.pch
 21 | 
 22 | # Libraries
 23 | *.lib
 24 | *.a
 25 | *.la
 26 | *.lo
 27 | 
 28 | # Shared objects (inc. Windows DLLs)
 29 | *.dll
 30 | *.so
 31 | *.so.*
 32 | *.dylib
 33 | 
 34 | # Executables
 35 | *.exe
 36 | *.out
 37 | *.app
 38 | *.i*86
 39 | *.x86_64
 40 | *.hex
 41 | 
 42 | 
 43 | ### JetBrains template
 44 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm
 45 | 
 46 | *.iml
 47 | 
 48 | ## Directory-based project format:
 49 | .idea/
 50 | # if you remove the above rule, at least ignore the following:
 51 | 
 52 | # User-specific stuff:
 53 | # .idea/workspace.xml
 54 | # .idea/tasks.xml
 55 | # .idea/dictionaries
 56 | 
 57 | # Sensitive or high-churn files:
 58 | # .idea/dataSources.ids
 59 | # .idea/dataSources.xml
 60 | # .idea/sqlDataSources.xml
 61 | # .idea/dynamic.xml
 62 | # .idea/uiDesigner.xml
 63 | 
 64 | # Gradle:
 65 | # .idea/gradle.xml
 66 | # .idea/libraries
 67 | 
 68 | # Mongo Explorer plugin:
 69 | # .idea/mongoSettings.xml
 70 | 
 71 | ## File-based project format:
 72 | *.ipr
 73 | *.iws
 74 | 
 75 | ## Plugin-specific files:
 76 | 
 77 | # IntelliJ
 78 | out/
 79 | 
 80 | # mpeltonen/sbt-idea plugin
 81 | .idea_modules/
 82 | 
 83 | # JIRA plugin
 84 | atlassian-ide-plugin.xml
 85 | 
 86 | # Crashlytics plugin (for Android Studio and IntelliJ)
 87 | com_crashlytics_export_strings.xml
 88 | crashlytics.properties
 89 | crashlytics-build.properties
 90 | 
 91 | 
 92 | ### Xcode template
 93 | build/
 94 | *.pbxuser
 95 | !default.pbxuser
 96 | *.mode1v3
 97 | !default.mode1v3
 98 | *.mode2v3
 99 | !default.mode2v3
100 | *.perspectivev3
101 | !default.perspectivev3
102 | xcuserdata
103 | *.xccheckout
104 | *.moved-aside
105 | DerivedData
106 | *.xcuserstate
107 | 
108 | 
109 | ### Python template
110 | # Byte-compiled / optimized / DLL files
111 | __pycache__/
112 | *.py[cod]
113 | 
114 | # C extensions
115 | *.so
116 | 
117 | # Distribution / packaging
118 | .Python
119 | env/
120 | build/
121 | develop-eggs/
122 | dist/
123 | downloads/
124 | eggs/
125 | lib64/
126 | parts/
127 | sdist/
128 | var/
129 | *.egg-info/
130 | .installed.cfg
131 | *.egg
132 | 
133 | # PyInstaller
134 | #  Usually these files are written by a python script from a template
135 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
136 | *.manifest
137 | *.spec
138 | 
139 | # Installer logs
140 | pip-log.txt
141 | pip-delete-this-directory.txt
142 | 
143 | # Unit test / coverage reports
144 | htmlcov/
145 | .tox/
146 | .coverage
147 | .cache
148 | nosetests.xml
149 | coverage.xml
150 | 
151 | # Translations
152 | *.mo
153 | *.pot
154 | 
155 | # Django stuff:
156 | *.log
157 | 
158 | # Sphinx documentation
159 | _build/
160 | 
161 | # PyBuilder
162 | target/
163 | 
164 | 
165 | ### OSX template
166 | .DS_Store
167 | .AppleDouble
168 | .LSOverride
169 | 
170 | # Icon must end with two \r
171 | Icon  
172 | 
173 | # Thumbnails
174 | ._*
175 | 
176 | # Files that might appear on external disk
177 | .Spotlight-V100
178 | .Trashes
179 | 
180 | # Directories potentially created on remote AFP share
181 | .AppleDB
182 | .AppleDesktop
183 | Network Trash Folder
184 | Temporary Items
185 | .apdisk
186 | 
187 | 
188 | ### Java template
189 | *.class
190 | 
191 | # Mobile Tools for Java (J2ME)
192 | .mtj.tmp/
193 | 
194 | # Package Files #
195 | *.jar
196 | *.war
197 | *.ear
198 | 
199 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
200 | hs_err_pid*
201 | 
202 | 
203 | ### Added by nabu authors
204 | *.sqlite
205 | docs/
206 | xml-output/*.zip
207 | xml-output/*.xml
208 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Use of Nabu and related source code is subject to the terms
 2 | of the following licenses:
 3 | 
 4 | GNU General Public License (GPL) Rights pursuant to Version 2, June 1991
 5 | Government Purpose License Rights (GPLR) pursuant to DFARS 252.227.7013
 6 | 
 7 | NO WARRANTY
 8 | 
 9 | ANY INFORMATION, MATERIALS, SERVICES, INTELLECTUAL PROPERTY OR OTHER
10 | PROPERTY OR RIGHTS GRANTED OR PROVIDED BY CARNEGIE MELLON UNIVERSITY
11 | PURSUANT TO THIS LICENSE (HEREINAFTER THE "DELIVERABLES") ARE ON AN
12 | "AS-IS" BASIS. CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY
13 | KIND, EITHER EXPRESS OR IMPLIED AS TO ANY MATTER INCLUDING, BUT NOT
14 | LIMITED TO, WARRANTY OF FITNESS FOR A PARTICULAR PURPOSE,
15 | MERCHANTABILITY, INFORMATIONAL CONTENT, NONINFRINGEMENT, OR ERROR-FREE
16 | OPERATION. CARNEGIE MELLON UNIVERSITY SHALL NOT BE LIABLE FOR INDIRECT,
17 | SPECIAL OR CONSEQUENTIAL DAMAGES, SUCH AS LOSS OF PROFITS OR INABILITY
18 | TO USE SAID INTELLECTUAL PROPERTY, UNDER THIS LICENSE, REGARDLESS OF
19 | WHETHER SUCH PARTY WAS AWARE OF THE POSSIBILITY OF SUCH DAMAGES.
20 | LICENSEE AGREES THAT IT WILL NOT MAKE ANY WARRANTY ON BEHALF OF
21 | CARNEGIE MELLON UNIVERSITY, EXPRESS OR IMPLIED, TO ANY PERSON
22 | CONCERNING THE APPLICATION OF OR THE RESULTS TO BE OBTAINED WITH THE
23 | DELIVERABLES UNDER THIS LICENSE.
24 | 
25 | Licensee hereby agrees to defend, indemnify, and hold harmless Carnegie
26 | Mellon University, its trustees, officers, employees, and agents from
27 | all claims or demands made against them (and any related losses,
28 | expenses, or attorney's fees) arising out of, or relating to Licensee's
29 | and/or its sub licensees' negligent use or willful misuse of or
30 | negligent conduct or willful misconduct regarding the Software,
31 | facilities, or other rights or assistance granted by Carnegie Mellon
32 | University under this License, including, but not limited to, any
33 | claims of product liability, personal injury, death, damage to
34 | property, or violation of any laws or regulations.
35 | 
36 | Carnegie Mellon University Software Engineering Institute authored
37 | documents are sponsored by the U.S. Department of Defense under
38 | Contract FA8721-05-C-0003. Carnegie Mellon University retains
39 | copyrights in all material produced under this contract. The U.S.
40 | Government retains a non-exclusive, royalty-free license to publish or
41 | reproduce these documents, or allow others to do so, for U.S.
42 | Government purposes only pursuant to the copyright license under the
43 | contract clause at 252.227.7013.
44 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | About
 2 | =====
 3 | Nabu is a tool (work in progress) for parsing, constructing, and comparing the structural graphs of a large collection
 4 |  of PDF documents. The comparisons are based on the work of [NetSimile](http://arxiv.org/abs/1209.2684).
 5 |  
 6 | This tool grew from PDFrankenstein, and now includes JavaScript in the PDF database. To view the JS after building 
 7 | your database:
 8 | 
 9 | `sqlite3 -cmd "select js from pdfs" db/nabu-graphdb.sqlite`
10 | 
11 | Dependencies
12 | ------------
13 | * networkx
14 | * scipy
15 | * matplotlib
16 | * psycopg2 (PostgreSQL Python module; also requires PostgreSQL)
17 | 
18 | Usage
19 | -----
20 | 
21 | The workflow with Nabu will typically be:
22 | 
23 | 1. Build a graph database from a collection of PDFs
24 | 2. Score the graphs for similarity
25 | 3. Draw dendrogram clusters (TODO)
26 | 
27 | #### Building the Database
28 | 
29 | Build the graph database by parsing the specified PDFs. PDFs are given with full paths in a line separated file.
30 | `python main.py [options] build <file input>`
31 | 
32 | #### Scoring the Database
33 | 
34 | Requires a list of files to score. If the files are not present in the graph database then they will be added. Nabu will output (in CSV format): `subject, family, candidate, score`
35 | 
36 | `python main.py [options] score <file input>`
37 | 
38 | #### Drawing Clusters
39 | 
40 | Runs from the graph database. Uses scipy and matplotlib to draw the dendrogram of the set of PDFs based on the 
41 | similarity score. Currently uses Canberra distance metric.
42 | 
43 | `python main.py [options] cluster`
44 | 
45 | #### Options
46 | 
47 | ```
48 | positional arguments:
49 |     action                build | score | cluster (under construction)
50 |     fin                   line separated text file of samples to run
51 |   
52 | optional arguments:
53 |     -h, --help            show this help message and exit
54 |     -b, --beginning       Start from beginning. Don't resume job file based on completed
55 |     -c CHUNK, --chunk CHUNK
56 |                         Chunk size in jobs. Default is num_procs * 1
57 |     -d, --debug           Spam the terminal with debug output
58 |     -g GRAPHDB, --graphdb GRAPHDB
59 |                         Graph database filename. Default is nabu-
60 |                         graphdb.sqlite
61 |     -j JOBDB, --jobdb JOBDB
62 |                         Job database filename. Default is nabu-jobs.sqlite
63 |     --xmldb XMLDB         xml database filename. Default is nabu-xml.sqlite
64 |     --dbdir DBDIR         Database directory. Default is .../nabu/db/
65 |     --logdir LOGDIR       Logging directory. Default is .../nabu/logs/
66 |     --parser PARSER       Type of pdf parser to use. Default is pdfminer
67 |     -p PROCS, --procs PROCS
68 |                         Number of parallel processes. Default is 2/3 cpu core
69 |                         count
70 |     -t THRESH, --thresh THRESH
71 |                         Threshold which reports only graphs with similarities
72 |                         at or below this value.
73 |     -u, --update          Ignore completed jobs
74 | ```
75 | 
76 | References
77 | ----------
78 | [NetSimile](http://arxiv.org/abs/1209.2684)


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/bin/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'honey'
2 | 


--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/nabu/3afcab20a5ddd8a9b984d8f34756ebedfc0b45a9/db/__init__.py


--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'honey'
2 | 


--------------------------------------------------------------------------------
/lib/parse/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'honey'
2 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/nabu/3afcab20a5ddd8a9b984d8f34756ebedfc0b45a9/lib/parse/pdfminer/__init__.py


--------------------------------------------------------------------------------
/lib/parse/pdfminer/arcfour.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ Python implementation of Arcfour encryption algorithm.
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | 
10 | ##  Arcfour
11 | ##
class Arcfour(object):

    """Arcfour (RC4) stream cipher.

    The key schedule runs once in the constructor.  Each call to
    process() continues the same keystream, so a message may be fed in
    several pieces.  Encryption and decryption are the same operation.

    Keys and data are byte strings.  Text strings are accepted as long
    as every code point is below 256.  The result is a byte string
    (``str`` on Python 2, ``bytes`` on Python 3).

    >>> import binascii
    >>> binascii.hexlify(Arcfour(b'Key').process(b'Plaintext')) == b'bbf316e8d940af0ad3'
    True
    >>> binascii.hexlify(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
    True
    >>> binascii.hexlify(Arcfour(b'Secret').process(b'Attack at dawn')) == b'45a01f645fc35b383552544b9bf5'
    True
    """

    @staticmethod
    def _octets(value):
        """Return a fresh bytearray holding the octets of *value*."""
        if isinstance(value, type(u'')):
            # Text input: map code points 0-255 straight onto octets.
            value = value.encode('latin-1')
        return bytearray(value)

    def __init__(self, key):
        """Run the key-scheduling algorithm for *key*."""
        key = self._octets(key)
        klen = len(key)
        s = list(range(256))
        j = 0
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """XOR *data* with the next len(data) keystream octets.

        The cipher state advances, so consecutive calls behave like one
        call on the concatenated input.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Work on a private copy and overwrite it in place: one
        # allocation instead of one string concatenation per octet.
        out = self._octets(data)
        for (n, c) in enumerate(out):
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            out[n] = c ^ s[(s[i]+s[j]) % 256]
        (self.i, self.j) = (i, j)
        return bytes(out)

    encrypt = decrypt = process
48 | 
49 | new = Arcfour
50 | 
51 | # test
52 | if __name__ == '__main__':
53 |     import doctest
54 |     doctest.testmod()
55 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/ascii85.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | import re
10 | import struct
11 | 
12 | 
13 | # ascii85decode(data)
def ascii85decode(data):
    """
    Decode ASCII85-encoded data (Adobe variant) into a byte string.

    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    Adobe's ASCII85 implementation is slightly different from
    the original in how it handles the last characters.

    Characters outside '!'..'u', 'z' and '~' are skipped.  Decoding stops
    at the first '~'.  A trailing group that is not closed by '~' is
    dropped.

    The sample string is taken from:
      http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') == b'Man is distinguished'
    True
    >>> ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
    True
    """
    if isinstance(data, type(u'')):
        # Every significant character is ASCII, so anything else can go.
        data = data.encode('ascii', 'ignore')
    n = b = 0
    # Collect the decoded groups and join once at the end, instead of
    # growing a string with += for every group.
    out = []
    for c in bytearray(data):
        if 0x21 <= c <= 0x75:  # '!' .. 'u'
            n += 1
            b = b*85+(c-33)
            if n == 5:
                out.append(struct.pack('>L', b))
                n = b = 0
        elif c == 0x7a:  # 'z': shorthand for four zero bytes
            assert n == 0
            out.append(b'\0\0\0\0')
        elif c == 0x7e:  # '~': end of data
            if n:
                # Pad the partial group with 'u' (84) and keep only the
                # n-1 bytes that carry real data.
                for _ in range(5-n):
                    b = b*85+84
                out.append(struct.pack('>L', b)[:n-1])
            break
    return b''.join(out)
51 | 
52 | # asciihexdecode(data)
53 | hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
54 | trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
55 | 
56 | 
def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored, including white space between the two digits
    of a pair. A right angle bracket character (>) indicates EOD; nothing
    after it is decoded. If the filter encounters the EOD marker (or the
    end of the input) after reading an odd number of hexadecimal digits,
    it will behave as if a 0 followed the last digit.

    This implementation is lenient: characters that are neither hex
    digits nor the EOD marker are skipped rather than reported.

    >>> asciihexdecode(b'61 62 2e6364   65') == b'ab.cde'
    True
    >>> asciihexdecode(b'61 62 2e6364   657>') == b'ab.cdep'
    True
    >>> asciihexdecode(b'7>') == b'p'
    True
    """
    if isinstance(data, type(u'')):
        # Every significant character is ASCII, so anything else can go.
        data = data.encode('ascii', 'ignore')
    hex_digits = bytearray(b'0123456789abcdefABCDEF')
    digits = bytearray()
    for c in bytearray(data):
        if c == 0x3e:  # '>': end of data
            break
        if c in hex_digits:
            digits.append(c)
    if len(digits) % 2:
        digits.append(0x30)  # odd count: act as if a '0' followed
    return bytes(bytearray(int(bytes(digits[i:i+2]), 16)
                           for i in range(0, len(digits), 2)))
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     import doctest
84 |     doctest.testmod()
85 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/lzw.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | try:
  4 |     from cStringIO import StringIO
  5 | except ImportError:
  6 |     from StringIO import StringIO
  7 | 
  8 | 
  9 | class CorruptDataError(Exception):
 10 |     pass
 11 | 
 12 | 
 13 | ##  LZWDecoder
 14 | ##
 15 | class LZWDecoder(object):
 16 | 
 17 |     debug = 0
 18 | 
 19 |     def __init__(self, fp):
 20 |         self.fp = fp
 21 |         self.buff = 0
 22 |         self.bpos = 8
 23 |         self.nbits = 9
 24 |         self.table = None
 25 |         self.prevbuf = None
 26 |         return
 27 | 
 28 |     def readbits(self, bits):
 29 |         v = 0
 30 |         while 1:
 31 |             # the number of remaining bits we can get from the current buffer.
 32 |             r = 8-self.bpos
 33 |             if bits <= r:
 34 |                 # |-----8-bits-----|
 35 |                 # |-bpos-|-bits-|  |
 36 |                 # |      |----r----|
 37 |                 v = (v << bits) | ((self.buff >> (r-bits)) & ((1 << bits)-1))
 38 |                 self.bpos += bits
 39 |                 break
 40 |             else:
 41 |                 # |-----8-bits-----|
 42 |                 # |-bpos-|---bits----...
 43 |                 # |      |----r----|
 44 |                 v = (v << r) | (self.buff & ((1 << r)-1))
 45 |                 bits -= r
 46 |                 x = self.fp.read(1)
 47 |                 if not x:
 48 |                     raise EOFError
 49 |                 self.buff = ord(x)
 50 |                 self.bpos = 0
 51 |         return v
 52 | 
 53 |     def feed(self, code):
 54 |         x = ''
 55 |         if code == 256:
 56 |             self.table = [chr(c) for c in xrange(256)]  # 0-255
 57 |             self.table.append(None)  # 256
 58 |             self.table.append(None)  # 257
 59 |             self.prevbuf = ''
 60 |             self.nbits = 9
 61 |         elif code == 257:
 62 |             pass
 63 |         elif not self.prevbuf:
 64 |             x = self.prevbuf = self.table[code]
 65 |         else:
 66 |             if code < len(self.table):
 67 |                 x = self.table[code]
 68 |                 self.table.append(self.prevbuf+x[:1])
 69 |             elif code == len(self.table):
 70 |                 self.table.append(self.prevbuf+self.prevbuf[:1])
 71 |                 x = self.table[code]
 72 |             else:
 73 |                 raise CorruptDataError
 74 |             l = len(self.table)
 75 |             if l == 511:
 76 |                 self.nbits = 10
 77 |             elif l == 1023:
 78 |                 self.nbits = 11
 79 |             elif l == 2047:
 80 |                 self.nbits = 12
 81 |             self.prevbuf = x
 82 |         return x
 83 | 
 84 |     def run(self):
 85 |         while 1:
 86 |             try:
 87 |                 code = self.readbits(self.nbits)
 88 |             except EOFError:
 89 |                 break
 90 |             try:
 91 |                 x = self.feed(code)
 92 |             except CorruptDataError:
 93 |                 # just ignore corrupt data and stop yielding there
 94 |                 break
 95 |             yield x
 96 |             if self.debug:
 97 |                 print >>sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
 98 |                                      (self.nbits, code, x, self.table[258:]))
 99 |         return
100 | 
101 | 
102 | # lzwdecode
def lzwdecode(data):
    """
    Decode a complete LZW-compressed string in one call.

    >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
    '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    """
    decoder = LZWDecoder(StringIO(data))
    chunks = decoder.run()
    return ''.join(chunks)
110 | 
111 | if __name__ == '__main__':
112 |     import doctest
113 |     doctest.testmod()
114 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/pdfparser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | try:
  4 |     from cStringIO import StringIO
  5 | except ImportError:
  6 |     from StringIO import StringIO
  7 | from psparser import PSStackParser
  8 | from psparser import PSSyntaxError, PSEOF
  9 | from psparser import KWD, STRICT
 10 | from pdftypes import PDFException
 11 | from pdftypes import PDFStream, PDFObjRef
 12 | from pdftypes import int_value
 13 | from pdftypes import dict_value
 14 | 
 15 | 
 16 | ##  Exceptions
 17 | ##
 18 | class PDFSyntaxError(PDFException):
 19 |     pass
 20 | 
 21 | 
 22 | ##  PDFParser
 23 | ##
 24 | class PDFParser(PSStackParser):
 25 | 
 26 |     """
 27 |     PDFParser fetch PDF objects from a file stream.
 28 |     It can handle indirect references by referring to
 29 |     a PDF document set by set_document method.
 30 |     It also reads XRefs at the end of every PDF file.
 31 | 
 32 |     Typical usage:
 33 |       parser = PDFParser(fp)
 34 |       parser.read_xref()
 35 |       parser.read_xref(fallback=True) # optional
 36 |       parser.set_document(doc)
 37 |       parser.seek(offset)
 38 |       parser.nextobject()
 39 | 
 40 |     """
 41 | 
 42 |     def __init__(self, fp, dbg=False):
 43 |         PSStackParser.__init__(self, fp, dbg)
 44 |         self.doc = None
 45 |         self.fallback = False
 46 |         return
 47 | 
 48 |     def set_document(self, doc):
 49 |         """Associates the parser with a PDFDocument object."""
 50 |         self.doc = doc
 51 |         return
 52 | 
 53 |     KEYWORD_R = KWD('R')
 54 |     KEYWORD_NULL = KWD('null')
 55 |     KEYWORD_ENDOBJ = KWD('endobj')
 56 |     KEYWORD_STREAM = KWD('stream')
 57 |     KEYWORD_XREF = KWD('xref')
 58 |     KEYWORD_STARTXREF = KWD('startxref')
 59 | 
 60 |     def do_keyword(self, pos, token):
 61 |         """Handles PDF-related keywords."""
 62 | 
 63 |         if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
 64 |             self.add_results(*self.pop(1))
 65 | 
 66 |         elif token is self.KEYWORD_ENDOBJ:
 67 |             self.add_results(*self.pop(4))
 68 | 
 69 |         elif token is self.KEYWORD_NULL:
 70 |             # null object
 71 |             self.push((pos, None))
 72 | 
 73 |         elif token is self.KEYWORD_R:
 74 |             # reference to indirect object
 75 |             try:
 76 |                 ((_, objid), (_, genno)) = self.pop(2)
 77 |                 (objid, genno) = (int(objid), int(genno))
 78 |                 obj = PDFObjRef(self.doc, objid, genno)
 79 |                 self.push((pos, obj))
 80 |             except PSSyntaxError:
 81 |                 pass
 82 | 
 83 |         elif token is self.KEYWORD_STREAM:
 84 |             # stream object
 85 |             try:
 86 |                 ((_, dic),) = self.pop(1)
 87 |             except ValueError:
 88 |                 dic = []
 89 | 
 90 |             dic = dict_value(dic)
 91 |             objlen = 0
 92 |             if not self.fallback:
 93 |                 try:
 94 |                     objlen = int_value(dic['Length'])
 95 |                 except KeyError:
 96 |                     if STRICT:
 97 |                         raise PDFSyntaxError('/Length is undefined: %r' % dic)
 98 |             self.seek(pos)
 99 |             try:
100 |                 (_, line) = self.nextline()  # 'stream'
101 |             except PSEOF:
102 |                 if STRICT:
103 |                     raise PDFSyntaxError('Unexpected EOF')
104 |                 return
105 |             pos += len(line)
106 |             self.fp.seek(pos)
107 |             data = self.fp.read(objlen)
108 |             self.seek(pos+objlen)
109 |             while 1:
110 |                 try:
111 |                     (linepos, line) = self.nextline()
112 |                 except PSEOF:
113 |                     if STRICT:
114 |                         raise PDFSyntaxError('Unexpected EOF')
115 |                     break
116 |                 if 'endstream' in line:
117 |                     i = line.index('endstream')
118 |                     objlen += i
119 |                     data += line[:i]
120 |                     break
121 |                 objlen += len(line)
122 |                 data += line
123 |             self.seek(pos+objlen)
124 |             # XXX limit objlen not to exceed object boundary
125 |             if 2 <= self.debug:
126 |                 print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
127 |                                     (pos, objlen, dic, data[:10])
128 |             obj = PDFStream(dic, data, self.doc.decipher)
129 |             self.push((pos, obj))
130 | 
131 |         else:
132 |             # others
133 |             self.push((pos, token))
134 | 
135 |         return
136 | 
137 | 
138 | ##  PDFStreamParser
139 | ##
class PDFStreamParser(PDFParser):

    """
    PDFStreamParser is used to parse the PDF content streams
    that are contained in each page and hold the instructions
    for rendering the page. A reference to a PDF document is
    needed because a PDF content stream can also have
    indirect references to other objects in the same document.
    """

    def __init__(self, data):
        """Parse *data*, the content stream held in memory as a string."""
        PDFParser.__init__(self, StringIO(data))
        return

    def flush(self):
        """Report everything left on the stack as results."""
        self.add_results(*self.popall())
        return

    def do_keyword(self, pos, token):
        """Build a PDFObjRef for the 'R' keyword; push any other keyword as is."""
        if token is self.KEYWORD_R:
            # reference to indirect object
            try:
                ((_, objid), (_, genno)) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push((pos, obj))
            except (PSSyntaxError, ValueError, TypeError):
                # A malformed reference (too few operands on the stack,
                # or operands that are not numbers) is dropped instead
                # of aborting the whole parse.
                pass
            return
        # others
        self.push((pos, token))
        return
172 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/pdftypes.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import zlib
  3 | from lzw import lzwdecode
  4 | from ascii85 import ascii85decode, asciihexdecode
  5 | from runlength import rldecode
  6 | from ccitt import ccittfaxdecode
  7 | from psparser import PSException, PSObject
  8 | from psparser import LIT, STRICT
  9 | from utils import apply_png_predictor, isnumber
 10 | 
 11 | LITERAL_CRYPT = LIT('Crypt')
 12 | 
 13 | # Abbreviation of Filter names in PDF 4.8.6. "Inline Images"
 14 | LITERALS_FLATE_DECODE = (LIT('FlateDecode'), LIT('Fl'))
 15 | LITERALS_LZW_DECODE = (LIT('LZWDecode'), LIT('LZW'))
 16 | LITERALS_ASCII85_DECODE = (LIT('ASCII85Decode'), LIT('A85'))
 17 | LITERALS_ASCIIHEX_DECODE = (LIT('ASCIIHexDecode'), LIT('AHx'))
 18 | LITERALS_RUNLENGTH_DECODE = (LIT('RunLengthDecode'), LIT('RL'))
 19 | LITERALS_CCITTFAX_DECODE = (LIT('CCITTFaxDecode'), LIT('CCF'))
 20 | LITERALS_DCT_DECODE = (LIT('DCTDecode'), LIT('DCT'))
 21 | 
 22 | 
 23 | ##  PDF Objects
 24 | ##
 25 | class PDFObject(PSObject):
 26 |     pass
 27 | 
 28 | class PDFException(PSException):
 29 |     pass
 30 | 
 31 | class PDFTypeError(PDFException):
 32 |     pass
 33 | 
 34 | class PDFValueError(PDFException):
 35 |     pass
 36 | 
 37 | class PDFObjectNotFound(PDFException):
 38 |     pass
 39 | 
 40 | class PDFNotImplementedError(PDFException):
 41 |     pass
 42 | 
 43 | 
 44 | ##  PDFObjRef
 45 | ##
 46 | class PDFObjRef(PDFObject):
 47 | 
 48 |     def __init__(self, doc, objid, _):
 49 |         if objid == 0:
 50 |             if STRICT:
 51 |                 raise PDFValueError('PDF object id cannot be 0.')
 52 |         self.doc = doc
 53 |         self.objid = objid
 54 |         #self.genno = genno  # Never used.
 55 |         return
 56 | 
 57 |     def __repr__(self):
 58 |         return '<PDFObjRef:%d>' % (self.objid)
 59 | 
 60 |     def resolve(self, default=None):
 61 |         try:
 62 |             return self.doc.getobj(self.objid)
 63 |         except PDFObjectNotFound:
 64 |             return default
 65 | 
 66 | 
 67 | # resolve
 68 | def resolve1(x, default=None):
 69 |     """Resolves an object.
 70 | 
 71 |     If this is an array or dictionary, it may still contains
 72 |     some indirect objects inside.
 73 |     """
 74 |     while isinstance(x, PDFObjRef):
 75 |         x = x.resolve(default=default)
 76 |     return x
 77 | 
 78 | 
 79 | def resolve_all(x, default=None):
 80 |     """Recursively resolves the given object and all the internals.
 81 | 
 82 |     Make sure there is no indirect reference within the nested object.
 83 |     This procedure might be slow.
 84 |     """
 85 |     while isinstance(x, PDFObjRef):
 86 |         x = x.resolve(default=default)
 87 |     if isinstance(x, list):
 88 |         x = [resolve_all(v, default=default) for v in x]
 89 |     elif isinstance(x, dict):
 90 |         for (k, v) in x.iteritems():
 91 |             x[k] = resolve_all(v, default=default)
 92 |     return x
 93 | 
 94 | 
 95 | def decipher_all(decipher, objid, genno, x):
 96 |     """Recursively deciphers the given object.
 97 |     """
 98 |     if isinstance(x, str):
 99 |         return decipher(objid, genno, x)
100 |     if isinstance(x, list):
101 |         x = [decipher_all(decipher, objid, genno, v) for v in x]
102 |     elif isinstance(x, dict):
103 |         for (k, v) in x.iteritems():
104 |             x[k] = decipher_all(decipher, objid, genno, v)
105 |     return x
106 | 
107 | 
# Type checking
def int_value(x):
    """Resolve x and return it if it is an int.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    """
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
116 | 
117 | 
def float_value(x):
    """Resolve x and return it if it is a float.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.0.
    """
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
125 | 
126 | 
def num_value(x):
    """Resolve x and return it if isnumber() accepts it.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    """
    value = resolve1(x)
    if isnumber(value):
        return value
    if STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
134 | 
135 | 
def str_value(x):
    """Resolve x and return it if it is a str.

    Otherwise raise PDFTypeError when STRICT is set, else return ''.
    """
    value = resolve1(x)
    if isinstance(value, str):
        return value
    if STRICT:
        raise PDFTypeError('String required: %r' % value)
    return ''
143 | 
144 | 
def list_value(x):
    """Resolve x and return it if it is a list or tuple.

    Otherwise raise PDFTypeError when STRICT is set, else return [].
    """
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
152 | 
153 | 
def dict_value(x):
    """Resolve x and return it if it is a dict.

    Otherwise raise PDFTypeError when STRICT is set, else return {}.
    """
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if STRICT:
        raise PDFTypeError('Dict required: %r' % value)
    return {}
161 | 
162 | 
def stream_value(x):
    """Resolve x and return it if it is a PDFStream.

    Otherwise raise PDFTypeError when STRICT is set, else return a new
    empty stream (no attributes, empty raw data).
    """
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, '')
170 | 
171 | 
172 | ##  PDFStream type
173 | ##
class PDFStream(PDFObject):
    """A stream object: an attribute dictionary plus its (encoded) data.

    The raw data is decoded lazily. After decode() has run, `rawdata` is
    None and `data` holds the decoded content.
    """

    def __init__(self, attrs, rawdata, decipher=None):
        """Store the attribute dict, the raw data and an optional decipher
        callable invoked as decipher(objid, genno, data) before decoding."""
        assert isinstance(attrs, dict)
        self.attrs = attrs
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        """Record the object id and generation number (passed to decipher)."""
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        if self.data is None:
            assert self.rawdata is not None
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.attrs)
        else:
            assert self.data is not None
            return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.attrs)

    def __contains__(self, name):
        return name in self.attrs

    def __getitem__(self, name):
        return self.attrs[name]

    def get(self, name, default=None):
        """Return the attribute `name`, or `default` if it is absent."""
        return self.attrs.get(name, default)

    def get_any(self, names, default=None):
        """Return the value of the first attribute in `names` that exists,
        or `default` if none of them is present."""
        for name in names:
            if name in self.attrs:
                return self.attrs[name]
        return default

    def get_filters(self):
        """Return the 'F'/'Filter' attribute as a list (empty if unset).

        When the attribute is already a list, that same list object is
        returned, so callers must not modify it.
        """
        filters = self.get_any(('F', 'Filter'))
        if not filters:
            return []
        if isinstance(filters, list):
            return filters
        return [filters]

    def _resolve_filters(self):
        """Return a new, flat list of filters with indirect references
        replaced, at their original position, by the filter(s) they
        point to."""
        resolved = []
        for f in self.get_filters():
            f = resolve1(f)
            if isinstance(f, list):
                # A reference may point to a whole array of filters.
                resolved.extend(resolve1(item) for item in f)
            else:
                # A reference that cannot be resolved yields None, which
                # decode() reports as an unsupported filter.
                resolved.append(f)
        return resolved

    def decode(self):
        """Decipher (if needed) and decode the raw data into `self.data`.

        Must only be called once, while `data` is still None. Filters are
        applied in order; `rawdata` is cleared afterwards.

        Raises PDFNotImplementedError for unsupported filters/predictors
        and, in strict mode, PDFException for invalid zlib data.
        """
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        # Work on a private, resolved copy so self.attrs is never modified
        # and referenced filters keep their place in the chain.
        filters = self._resolve_filters()
        # NOTE(review): the same parameter object is used for every filter;
        # an array-valued DecodeParms is not split per filter - confirm
        # whether that case has to be supported.
        params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                try:
                    data = zlib.decompress(data)
                except zlib.error as e:
                    if STRICT:
                        raise PDFException('Invalid zlib bytes: %r, %r' % (e, data))
                    data = ''
            elif f in LITERALS_LZW_DECODE:
                data = lzwdecode(data)
            elif f in LITERALS_ASCII85_DECODE:
                data = ascii85decode(data)
            elif f in LITERALS_ASCIIHEX_DECODE:
                data = asciihexdecode(data)
            elif f in LITERALS_RUNLENGTH_DECODE:
                data = rldecode(data)
            elif f in LITERALS_CCITTFAX_DECODE:
                data = ccittfaxdecode(data, params)
            elif f == LITERAL_CRYPT:
                # not yet..
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred == 1:
                    # no predictor
                    pass
                elif 10 <= pred:
                    # PNG predictor
                    colors = int_value(params.get('Colors', 1))
                    columns = int_value(params.get('Columns', 1))
                    bitspercomponent = int_value(params.get('BitsPerComponent', 8))
                    data = apply_png_predictor(pred, colors, columns, bitspercomponent, data)
                else:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        """Return the decoded data, decoding it on first use."""
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        """Return the raw data (None once the stream has been decoded)."""
        return self.rawdata
286 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/runlength.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # RunLength decoder (Adobe version) implementation based on PDF Reference
 4 | # version 1.4 section 3.3.4.
 5 | #
 6 | #  * public domain *
 7 | #
 8 | 
 9 | def rldecode(data):
10 |     """
11 |     RunLength decoder (Adobe version) implementation based on PDF Reference
12 |     version 1.4 section 3.3.4:
13 |         The RunLengthDecode filter decodes data that has been encoded in a
14 |         simple byte-oriented format based on run length. The encoded data
15 |         is a sequence of runs, where each run consists of a length byte
16 |         followed by 1 to 128 bytes of data. If the length byte is in the
17 |         range 0 to 127, the following length + 1 (1 to 128) bytes are
18 |         copied literally during decompression. If length is in the range
19 |         129 to 255, the following single byte is to be copied 257 - length
20 |         (2 to 128) times during decompression. A length value of 128
21 |         denotes EOD.
22 |     >>> s = b'\x05123456\xfa7\x04abcde\x80junk'
23 |     >>> rldecode(s)
24 |     '1234567777777abcde'
25 |     """
26 |     decoded = []
27 |     i = 0
28 |     while i < len(data):
29 |         #print 'data[%d]=:%d:' % (i,ord(data[i]))
30 |         length = ord(data[i])
31 |         if length == 128:
32 |             break
33 |         if length >= 0 and length < 128:
34 |             run = data[i+1:(i+1)+(length+1)]
35 |             #print 'length=%d, run=%s' % (length+1,run)
36 |             decoded.append(run)
37 |             i = (i+1) + (length+1)
38 |         if length > 128:
39 |             run = data[i+1]*(257-length)
40 |             #print 'length=%d, run=%s' % (257-length,run)
41 |             decoded.append(run)
42 |             i = (i+1) + 1
43 |     return b''.join(decoded)
44 | 
45 | 
46 | if __name__ == '__main__':
47 |     import doctest
48 |     doctest.testmod()
49 | 


--------------------------------------------------------------------------------
/lib/parse/pdfminer/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | """
  3 | Miscellaneous Routines.
  4 | """
  5 | import struct
  6 | from sys import maxint as INF
  7 | 
  8 | 
  9 | ##  PNG Predictor
 10 | ##
 11 | def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
 12 |     if bitspercomponent != 8:
 13 |         # unsupported
 14 |         raise ValueError(bitspercomponent)
 15 |     nbytes = colors*columns*bitspercomponent//8
 16 |     i = 0
 17 |     buf = ''
 18 |     line0 = '\x00' * columns
 19 |     for i in xrange(0, len(data), nbytes+1):
 20 |         ft = data[i]
 21 |         i += 1
 22 |         line1 = data[i:i+nbytes]
 23 |         line2 = ''
 24 |         if ft == '\x00':
 25 |             # PNG none
 26 |             line2 += line1
 27 |         elif ft == '\x01':
 28 |             # PNG sub (UNTESTED)
 29 |             c = 0
 30 |             for b in line1:
 31 |                 c = (c+ord(b)) & 255
 32 |                 line2 += chr(c)
 33 |         elif ft == '\x02':
 34 |             # PNG up
 35 |             for (a, b) in zip(line0, line1):
 36 |                 c = (ord(a)+ord(b)) & 255
 37 |                 line2 += chr(c)
 38 |         elif ft == '\x03':
 39 |             # PNG average (UNTESTED)
 40 |             c = 0
 41 |             for (a, b) in zip(line0, line1):
 42 |                 c = ((c+ord(a)+ord(b))//2) & 255
 43 |                 line2 += chr(c)
 44 |         else:
 45 |             # unsupported
 46 |             raise ValueError(ft)
 47 |         buf += line2
 48 |         line0 = line2
 49 |     return buf
 50 | 
 51 | 
 52 | ##  Matrix operations
 53 | ##
 54 | MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
 55 | 
 56 | 
 57 | def mult_matrix((a1, b1, c1, d1, e1, f1), (a0, b0, c0, d0, e0, f0)):
 58 |     """Returns the multiplication of two matrices."""
 59 |     return (a0*a1+c0*b1,    b0*a1+d0*b1,
 60 |             a0*c1+c0*d1,    b0*c1+d0*d1,
 61 |             a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
 62 | 
 63 | 
 64 | def translate_matrix((a, b, c, d, e, f), (x, y)):
 65 |     """Translates a matrix by (x, y)."""
 66 |     return (a, b, c, d, x*a+y*c+e, x*b+y*d+f)
 67 | 
 68 | 
 69 | def apply_matrix_pt((a, b, c, d, e, f), (x, y)):
 70 |     """Applies a matrix to a point."""
 71 |     return (a*x+c*y+e, b*x+d*y+f)
 72 | 
 73 | 
 74 | def apply_matrix_norm((a, b, c, d, e, f), (p, q)):
 75 |     """Equivalent to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))"""
 76 |     return (a*p+c*q, b*p+d*q)
 77 | 
 78 | 
 79 | ##  Utility functions
 80 | ##
 81 | 
 82 | # isnumber
 83 | def isnumber(x):
 84 |     return isinstance(x, (int, long, float))
 85 | 
 86 | # uniq
 87 | def uniq(objs):
 88 |     """Eliminates duplicated elements."""
 89 |     done = set()
 90 |     for obj in objs:
 91 |         if obj in done:
 92 |             continue
 93 |         done.add(obj)
 94 |         yield obj
 95 |     return
 96 | 
 97 | 
 98 | # csort
 99 | def csort(objs, key=lambda x: x):
100 |     """Order-preserving sorting function."""
101 |     idxs = dict((obj, i) for (i, obj) in enumerate(objs))
102 |     return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
103 | 
104 | 
105 | # fsplit
def fsplit(pred, objs):
    """Partition objs by a predicate.

    Returns (matching, rest): two lists holding the elements for which
    pred(obj) is true and false respectively, each in input order.
    """
    matching = []
    rest = []
    for item in objs:
        bucket = matching if pred(item) else rest
        bucket.append(item)
    return (matching, rest)
116 | 
117 | 
118 | # drange
def drange(v0, v1, d):
    """Returns a discrete range.

    The result iterates over the integers from int(v0)//d up to, but not
    including, int(v1+d)//d. Requires v0 < v1.
    """
    assert v0 < v1
    try:
        lazy_range = xrange
    except NameError:
        # Python 3: range is already lazy and xrange no longer exists.
        lazy_range = range
    return lazy_range(int(v0)//d, int(v1+d)//d)
123 | 
124 | 
125 | # get_bound
def get_bound(pts):
    """Return (x0, y0, x1, y1), the smallest rectangle covering all points.

    `pts` is an iterable of (x, y) pairs. With no points the initial
    sentinels (INF, INF, -INF, -INF) are returned unchanged.
    """
    x0 = y0 = INF
    x1 = y1 = -INF
    for (x, y) in pts:
        if x < x0:
            x0 = x
        if y < y0:
            y0 = y
        if x1 < x:
            x1 = x
        if y1 < y:
            y1 = y
    return (x0, y0, x1, y1)
135 | 
136 | 
137 | # pick
def pick(seq, func, maxobj=None):
    """Return the element of seq with the highest func(obj) score.

    On ties the earliest element wins. When seq is empty, `maxobj` is
    returned as given.
    """
    best = maxobj
    best_score = None
    for candidate in seq:
        value = func(candidate)
        if best_score is not None and not (best_score < value):
            continue
        best_score, best = value, candidate
    return best
146 | 
147 | 
148 | # choplist
def choplist(n, seq):
    """Yield consecutive n-element tuples taken from seq.

    A trailing group with fewer than n elements is dropped.
    """
    chunk = []
    for item in seq:
        chunk.append(item)
        if len(chunk) != n:
            continue
        yield tuple(chunk)
        chunk = []
158 | 
159 | 
160 | # nunpack
def nunpack(s, default=0):
    """Decode a big-endian unsigned integer stored in 1 to 4 bytes.

    An empty string yields `default`; more than 4 bytes raises TypeError.
    """
    size = len(s)
    if not size:
        return default
    if size == 1:
        return ord(s)
    if size == 2:
        (value,) = struct.unpack('>H', s)
        return value
    if size == 3:
        # Pad to four bytes so the 32-bit format can be used.
        (value,) = struct.unpack('>L', '\x00'+s)
        return value
    if size == 4:
        (value,) = struct.unpack('>L', s)
        return value
    raise TypeError('invalid length: %d' % size)
176 | 
177 | 
178 | # decode_text
179 | PDFDocEncoding = ''.join(unichr(x) for x in (
180 |     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
181 |     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
182 |     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
183 |     0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
184 |     0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
185 |     0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
186 |     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
187 |     0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
188 |     0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
189 |     0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
190 |     0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
191 |     0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
192 |     0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
193 |     0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
194 |     0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
195 |     0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
196 |     0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
197 |     0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
198 |     0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
199 |     0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
200 |     0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
201 |     0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
202 |     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
203 |     0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
204 |     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
205 |     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
206 |     0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
207 |     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
208 |     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
209 |     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
210 |     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
211 |     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
212 | ))
213 | 
214 | 
def decode_text(s):
    """Convert a PDF text string to Unicode.

    Strings starting with the byte order mark FE FF are decoded as
    UTF-16BE (undecodable input is ignored); all others are mapped byte
    by byte through the PDFDocEncoding table.
    """
    if not s.startswith('\xfe\xff'):
        chars = [PDFDocEncoding[ord(c)] for c in s]
        return ''.join(chars)
    return unicode(s[2:], 'utf-16be', 'ignore')
221 | 
222 | 
223 | # enc
def enc(x, codec='ascii'):
    """Escape a string for SGML/XML/HTML and encode it.

    The characters & > < and " become entities ('&' is handled first so
    the generated entities are not escaped again). Characters the codec
    cannot represent become numeric character references.
    """
    escaped = x
    for (raw, entity) in (('&', '&amp;'), ('>', '&gt;'),
                          ('<', '&lt;'), ('"', '&quot;')):
        escaped = escaped.replace(raw, entity)
    return escaped.encode(codec, 'xmlcharrefreplace')
228 | 
229 | 
def bbox2str(bbox):
    """Format a bounding box (x0, y0, x1, y1) as 'x0,y0,x1,y1' with three
    decimals per value."""
    # Unpacked in the body: tuple parameters in the signature are not
    # valid syntax on Python 3.
    (x0, y0, x1, y1) = bbox
    return '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
232 | 
233 | 
def matrix2str(m):
    """Format a matrix (a, b, c, d, e, f) as '[a,b,c,d, (e,f)]' with two
    decimals per value."""
    # Unpacked in the body: tuple parameters in the signature are not
    # valid syntax on Python 3.
    (a, b, c, d, e, f) = m
    return '[%.2f,%.2f,%.2f,%.2f, (%.2f,%.2f)]' % (a, b, c, d, e, f)
236 | 
237 | 
##  Plane
##
##  A set-like data structure for objects placed on a plane.
##  Can efficiently find objects in a certain rectangular area.
##  Objects are kept in a set and indexed by a grid: a dictionary that
##  maps each grid cell (x, y) to the list of objects whose bounding
##  box covers that cell.
##
class Plane(object):

    def __init__(self, bbox, gridsize=50):
        """Create an empty plane covering bbox = (x0, y0, x1, y1), indexed
        with square cells of `gridsize` units."""
        self._objs = set()
        self._grid = {}
        self.gridsize = gridsize
        (self.x0, self.y0, self.x1, self.y1) = bbox
        return

    def __repr__(self):
        return ('<Plane objs=%r>' % list(self))

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def __contains__(self, obj):
        return obj in self._objs

    def _getrange(self, bbox):
        """Yield the grid cells (x, y) covered by bbox = (x0, y0, x1, y1),
        clipped to the plane; nothing is yielded if bbox lies outside it."""
        # Unpacked in the body: tuple parameters in the signature are not
        # valid syntax on Python 3.
        (x0, y0, x1, y1) = bbox
        if (x1 <= self.x0 or self.x1 <= x0 or
            y1 <= self.y0 or self.y1 <= y0): return
        x0 = max(self.x0, x0)
        y0 = max(self.y0, y0)
        x1 = min(self.x1, x1)
        y1 = min(self.y1, y1)
        for y in drange(y0, y1, self.gridsize):
            for x in drange(x0, x1, self.gridsize):
                yield (x, y)
        return

    # extend(objs)
    def extend(self, objs):
        for obj in objs:
            self.add(obj)
        return

    # add(obj): place an object.
    def add(self, obj):
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            self._grid.setdefault(k, []).append(obj)
        # Objects outside the plane are still members, just not indexed.
        self._objs.add(obj)
        return

    # remove(obj): displace an object.
    def remove(self, obj):
        for k in self._getrange((obj.x0, obj.y0, obj.x1, obj.y1)):
            try:
                self._grid[k].remove(obj)
            except (KeyError, ValueError):
                pass
        self._objs.remove(obj)
        return

    # find(): finds objects that are in a certain area.
    def find(self, bbox):
        (x0, y0, x1, y1) = bbox
        done = set()
        for k in self._getrange((x0, y0, x1, y1)):
            if k not in self._grid:
                continue
            for obj in self._grid[k]:
                if obj in done:
                    continue
                done.add(obj)
                # Sharing a cell is not enough: the boxes must overlap.
                if (obj.x1 <= x0 or x1 <= obj.x0 or
                    obj.y1 <= y0 or y1 <= obj.y0):
                    continue
                yield obj
        return
321 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/AUTHORS:
--------------------------------------------------------------------------------
1 | Jose Miguel Esparza <jesparza AT eternal-todo.com>
2 | http://eternal-todo.com
3 | http://twitter.com/EternalTodo


--------------------------------------------------------------------------------
/lib/parse/peepdf/CHANGELOG:
--------------------------------------------------------------------------------
 1 | -----------------------------------------------
 2 | peepdf Black Hat Vegas (0.2 r156), 2012-07-25
 3 | -----------------------------------------------
 4 | 
 5 |     * New features:
 6 | 
 7 |         - Added "grinch mode" execution to avoid colorized output
 8 |         - Added more colors in the interactive console output: warning, errors, important information...
 9 |         - Changed sctest command, now it's implemented with pylibemu
10 |         - Added decrypt command to parse password protected documents
11 |         - Modified analyseJS() to extract JS code from XDP packets and unescape HTML entities
12 |         - Added function unescapeHTMLEntities() to unescape HTML entities
13 |         - Added AES decryption support (128 and 256 bits).
14 |         - Added hashes in objects information (info $object_id)
15 |         - Added support for decoding CCITTFaxDecode filters (Thanks to @binjo)
16 | 
17 |     * Fixes:
18 | 
19 |         - Fix to show decrypt errors
20 |         - Fixed silly bug with /EncryptMetadata element
21 |         - Added missing binary file operations
22 |         - Fixed Issue 5: Resolved false positives when monitoring some elements like actions, events, etc. (Thanks to @hiddenillusion)
23 |         - Bug in PDFStream.decode and PDFStream.encode, dealing with an array of filter parameters (Thanks to @binjo)
24 | 
25 | 
26 | -----------------------------------------------
27 | peepdf Black Hat Arsenal (0.1 r92), 2012-03-16
28 | -----------------------------------------------
29 | 
30 |     * New features:
31 | 
32 |         - Added support for more parameters in Flate/LZW decode (stream filters)
33 |         - Encryption algorithm now showing in document information
34 |         - Added XML output and SHA hash to file information    
35 |         - Improved unescape function to support mixed escaped formats (eg. "%u6734%34%u8790")
36 |         - Added xor and xor_search commands
37 |         - Added easy way of redirect console output (>, >>, $>, $>>)
38 |         - Added xor function by Evan Fosmark
39 |         - Added detection of CVE-2011-4369 (/PRC)
40 |         - Added hash command (Thanks to @binjo for code and comments)
41 |         - Added js_beautify command
42 |         - Update function added
43 |         - Added new vulns and showing information related to non JS vulns
44 |         - Added escape sequence in the limited output
45 |         - Added ascii85 decode from pdfminer to improve code and avoid bugs (Thanks to Brandon Dixon!)
46 |         - Added lzwdecode from pdfminer to improve code and avoid bugs
47 | 
48 |     * Fixes:
49 | 
50 |         - Update process rewritten, now based on hashing of files
51 |         - Silly bug in computeUserPass function (Thanks to Christian Martorella!)
52 |         - Added binary mode in files operations
53 |         - Recursion bug in update function
54 |         - Minor bug in do_embed function
55 |         - Bug to support encoding following PDF specifications (Issue 3 by czchen)
56 |         - Bug to handle negative numbers in P element
57 |         - Bug in the xref table when creating a new PDF (Issue 2)
58 |         - Silly bug when parsing filter parameters
59 |         - Bug related to updating objects and statistics of PDF files
60 |         - Some bugs related to offsets calculation
61 |         - Fixed "replace" function in PDFObjectStream
62 |         - Fix in asciiHexDecode filter function
63 | 
64 | 
65 | -----------------------------------------------
66 | peepdf 0.1 r15, 2011-05-05
67 | -----------------------------------------------
68 | 
69 | - Initial Release
70 | 
71 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/JSAnalysis.py:
--------------------------------------------------------------------------------
  1 | #
  2 | #    peepdf is a tool to analyse and modify PDF files
  3 | #    http://peepdf.eternal-todo.com
  4 | #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
  5 | #
  6 | #    Copyright (C) 2011-2014 Jose Miguel Esparza
  7 | #
  8 | #    This file is part of peepdf.
  9 | #
 10 | #        peepdf is free software: you can redistribute it and/or modify
 11 | #        it under the terms of the GNU General Public License as published by
 12 | #        the Free Software Foundation, either version 3 of the License, or
 13 | #        (at your option) any later version.
 14 | #
 15 | #        peepdf is distributed in the hope that it will be useful,
 16 | #        but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | #        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
 18 | #        GNU General Public License for more details.
 19 | #
 20 | #        You should have received a copy of the GNU General Public License
 21 | #        along with peepdf.    If not, see <http://www.gnu.org/licenses/>.
 22 | #
 23 | 
 24 | '''
 25 |     This module contains some functions to analyse Javascript code inside the PDF file
 26 | '''
 27 | 
 28 | import sys, re , os, jsbeautifier, traceback
 29 | from PDFUtils import unescapeHTMLEntities, escapeString
# PyV8 is optional: JS_MODULE records whether it could be imported, and
# analyseJS() only executes Javascript when it is True.
try:
    import PyV8
    
    JS_MODULE = True
    
    # Global object handed to PyV8.JSContext() by analyseJS(); it collects
    # the code passed to the hooked eval function.
    class Global(PyV8.JSClass):
        # Accumulated source of every expression passed to evalOverride().
        evalCode = ''
        
        def evalOverride(self, expression):
            # analyseJS() assigns this method to Javascript's eval
            # ('eval=evalOverride'); the expression is recorded here
            # instead of being evaluated.
            self.evalCode += '\n\n// New evaluated code\n' + expression
            return
        
# NOTE: the bare except hides any error raised in the block above, not
# only a missing PyV8 module.
except:
    JS_MODULE = False
 44 | 
 45 | 
 46 | errorsFile = 'errors.txt'
 47 | newLine = os.linesep         
 48 | reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
 49 | preDefinedCode = 'var app = this;'
 50 | 
 51 | def analyseJS(code, context = None, manualAnalysis = False):
 52 |     '''
 53 |         Hooks the eval function and search for obfuscated elements in the Javascript code
 54 |         
 55 |         @param code: The Javascript code (string)
 56 |         @return: List with analysis information of the Javascript code: [JSCode,unescapedBytes,urlsFound,errors,context], where 
 57 |                 JSCode is a list with the several stages Javascript code,
 58 |                 unescapedBytes is a list with the parameters of unescape functions, 
 59 |                 urlsFound is a list with the URLs found in the unescaped bytes,
 60 |                 errors is a list of errors,
 61 |                 context is the context of execution of the Javascript code.
 62 |     '''
 63 |     errors = []
 64 |     JSCode = []
 65 |     unescapedBytes = []
 66 |     urlsFound = []
 67 |     
 68 |     try:
 69 |         code = unescapeHTMLEntities(code)
 70 |         scriptElements = re.findall(reJSscript, code, re.DOTALL | re.IGNORECASE)
 71 |         if scriptElements != []:
 72 |             code = ''
 73 |             for scriptElement in scriptElements:
 74 |                 code += scriptElement + '\n\n'
 75 |         code = jsbeautifier.beautify(code)
 76 |         JSCode.append(code)
 77 |     
 78 |         if code != None and JS_MODULE and not manualAnalysis:
 79 |             if context == None:
 80 |                 context = PyV8.JSContext(Global())
 81 |             context.enter()
 82 |             # Hooking the eval function
 83 |             context.eval('eval=evalOverride')
 84 |             #context.eval(preDefinedCode)
 85 |             while True:
 86 |                 originalCode = code
 87 |                 try:
 88 |                     context.eval(code)
 89 |                     evalCode = context.eval('evalCode')
 90 |                     evalCode = jsbeautifier.beautify(evalCode)
 91 |                     if evalCode != '' and evalCode != code:
 92 |                         code = evalCode
 93 |                         JSCode.append(code)
 94 |                     else:
 95 |                         break
 96 |                 except:
 97 |                     error = str(sys.exc_info()[1])
 98 |                     open('jserror.log','ab').write(error + newLine)
 99 |                     errors.append(error)
100 |                     break
101 |             
102 |             if False:
103 |                 escapedVars = re.findall('(\w*?)\s*?=\s*?(unescape\((.*?)\))', code, re.DOTALL)
104 |                 for var in escapedVars:
105 |                     bytes = var[2]
106 |                     if bytes.find('+') != -1 or bytes.find('%') == -1:
107 |                         varContent = getVarContent(code, bytes)
108 |                         if len(varContent) > 150:
109 |                             ret = unescape(varContent)
110 |                             if ret[0] != -1:
111 |                                 bytes = ret[1]
112 |                                 urls = re.findall('https?://.*$', bytes, re.DOTALL)
113 |                                 if bytes not in unescapedBytes:
114 |                                    unescapedBytes.append(bytes)
115 |                                 for url in urls:
116 |                                    if url not in urlsFound:
117 |                                        urlsFound.append(url)
118 |                     else:
119 |                         bytes = bytes[1:-1]
120 |                         if len(bytes) > 150:
121 |                             ret = unescape(bytes)
122 |                             if ret[0] != -1:
123 |                                 bytes = ret[1]
124 |                                 urls = re.findall('https?://.*$', bytes, re.DOTALL)
125 |                                 if bytes not in unescapedBytes:
126 |                                    unescapedBytes.append(bytes)
127 |                                 for url in urls:
128 |                                    if url not in urlsFound:
129 |                                        urlsFound.append(url)
130 |     except:
131 |         traceback.print_exc(file=open(errorsFile,'a'))
132 |         errors.append('Unexpected error in the JSAnalysis module!!')
133 |     finally:
134 |         for js in JSCode:
135 |             if js == None or js == '':
136 |                  JSCode.remove(js)
137 |     return [JSCode,unescapedBytes,urlsFound,errors,context]
138 |  
def getVarContent(jsCode, varContent):
    '''
        Rebuilds the value of a variable from an expression made of quoted strings and variable names joined with "+". Quoted parts are used without their quotes; any other part is treated as a variable name whose first assignment found in the Javascript code is resolved recursively.
        
        @param jsCode: The Javascript code (string)
        @param varContent: The content of the variable (string)
        @return: A string with real value of the variable
    '''
    clearBytes = ''
    # All whitespace is dropped, including whitespace inside quoted parts
    for blank in ('\n', '\r', '\t', ' '):
        varContent = varContent.replace(blank, '')
    for part in varContent.split('+'):
        if re.match('["\'].*?["\']', part, re.DOTALL):
            clearBytes += part[1:-1]
            continue
        escapedPart = escapeString(part)
        assignments = re.findall(escapedPart + '\s*?=\s*?(.*?)[,;]', jsCode, re.DOTALL)
        if assignments != []:
            clearBytes += getVarContent(jsCode, assignments[0])
    return clearBytes
162 | 
def isJavascript(content):
    '''
        Heuristic check that tells whether a string looks like Javascript code.

        A match of the module-level reJSscript pattern is accepted immediately.
        Otherwise the string is rejected when it contains a character with code 127 or above, or a control
        character other than newline, carriage return, tab, form feed and NUL, or when any of ";", "(" and ")"
        is missing. Finally the typical Javascript strings are counted: more than 15 occurrences in total and
        at least 5 distinct strings are needed.

        @param content: A string
        @return: A boolean, True if it seems to contain Javascript code or False in the other case
    '''
    JSStrings = ['var ',';',')','(','function ','=','{','}','if ','else','return','while ','for ',',','eval']
    keyStrings = [';','(',')']
    allowedControlChars = ['\n','\r','\t','\f','\x00']
    limit = 15
    minDistinctStringsFound = 5

    if re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE):
        return True

    for char in content:
        code = ord(char)
        if code >= 127 or (code < 32 and char not in allowedControlChars):
            return False

    # One counter per typical string, in the same order as JSStrings
    occurrences = [content.count(string) for string in JSStrings]
    for string, cont in zip(JSStrings, occurrences):
        if cont == 0 and string in keyStrings:
            return False

    distinctFound = len([cont for cont in occurrences if cont > 0])
    return sum(occurrences) > limit and distinctFound >= minDistinctStringsFound
196 |     
def searchObfuscatedFunctions(jsCode, function):
    '''
        Search for obfuscated functions in the Javascript code

        Calls of the given function ("name(...);") are collected first. Then every alias assigned from it
        ("alias = name;") is searched in the same way, depth first. Alias cycles (like "eval = eval;") and
        empty alias names are skipped, so the search always terminates.

        @param jsCode: The Javascript code (string)
        @param function: The function name to look for (string). It is inserted in a regular expression as is
        @return: List with obfuscated functions information [functionName,functionCall,containsReturns], where functionCall is the tuple (call, arguments)
    '''
    return _searchObfuscatedFunctions(jsCode, function, ())


def _searchObfuscatedFunctions(jsCode, function, ancestors):
    '''
        Recursive worker of searchObfuscatedFunctions

        @param jsCode: The Javascript code (string)
        @param function: The function name to look for (string)
        @param ancestors: Tuple with the names already being searched in the current alias chain
        @return: List with obfuscated functions information [functionName,functionCall,containsReturns]
    '''
    obfuscatedFunctionsInfo = []
    if jsCode is None:
        return obfuscatedFunctionsInfo
    calls = re.findall(r'\W(' + function + r'\s{0,5}?\((.*?)\)\s{0,5}?;)', jsCode, re.DOTALL)
    for call in calls:
        containsReturns = re.search('return', call[1], re.IGNORECASE) is not None
        obfuscatedFunctionsInfo.append([function, call, containsReturns])
    ancestors = ancestors + (function,)
    aliases = re.findall(r'\s*?((\w*?)\s*?=\s*?' + function + r')\s*?;', jsCode, re.DOTALL)
    for aliasMatch in aliases:
        alias = aliasMatch[1]
        # An empty alias (e.g. from "a[0] = eval;") would match any parenthesised expression,
        # and an alias already in the chain would recurse forever
        if alias == '' or alias in ancestors:
            continue
        obfuscatedFunctionsInfo += _searchObfuscatedFunctions(jsCode, alias, ancestors)
    return obfuscatedFunctionsInfo
219 | 
def unescape(escapedBytes, unicode = True):
    '''
        This method unescapes the given string

        Supported escapes are "%uXXXX", "\\uXXXX" and "%XX". If the string contains "\\u" (any case) it is
        split on backslashes, otherwise on "%". A "uXXXX" escape is written as two bytes, low byte first.
        Text found before the first separator is kept as is, and a separator that does not start a valid
        escape is written back unchanged.

        @param escapedBytes: A string to unescape
        @param unicode: If True a null byte is added after every character which does not come from a "uXXXX" escape
        @return: A tuple (status,statusContent), where statusContent is an unescaped string in case status = 0 or an error in case status = -1
    '''
    #TODO: modify to accept a list of escaped strings?
    if unicode:
        unicodePadding = '\x00'
    else:
        unicodePadding = ''
    try:
        # '\\u' (and not '\u') so the literal is also valid in Python 3
        if '\\u' in escapedBytes.lower():
            separator = '\\'
        elif '%' in escapedBytes:
            separator = '%'
        else:
            return (0, escapedBytes)
        pieces = []
        for i, splitByte in enumerate(escapedBytes.split(separator)):
            if splitByte == '':
                continue
            # The first chunk is not preceded by a separator, so it cannot be an escape
            escaped = i != 0
            if escaped and len(splitByte) > 4 and re.match('u[0-9a-f]{4}', splitByte[:5], re.IGNORECASE):
                pieces.append(chr(int(splitByte[3:5], 16)) + chr(int(splitByte[1:3], 16)))
                rest = splitByte[5:]
            elif escaped and len(splitByte) > 1 and re.match('[0-9a-f]{2}', splitByte[:2], re.IGNORECASE):
                pieces.append(chr(int(splitByte[:2], 16)) + unicodePadding)
                rest = splitByte[2:]
            else:
                if escaped:
                    # Not a valid escape: put back the separator that was really used
                    pieces.append(separator + unicodePadding)
                rest = splitByte
            for char in rest:
                pieces.append(char + unicodePadding)
        unescapedBytes = ''.join(pieces)
    except Exception:
        return (-1,'Error while unescaping the bytes')
    return (0,unescapedBytes)
263 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/README:
--------------------------------------------------------------------------------
  1 | ** Home page **
  2 | 
  3 | http://peepdf.eternal-todo.com
  4 | http://twitter.com/peepdf
  5 | 
  6 | 
  7 | ** Dependencies **
  8 | 
  9 | - In order to analyse Javascript code "PyV8" is needed:
 10 | 
 11 |     http://code.google.com/p/pyv8/
 12 | 
 13 | 
 14 | - The "sctest" command is a wrapper of "sctest" (libemu). Besides libemu pylibemu is used and must be installed:
 15 | 
 16 |     http://libemu.carnivore.it (latest version from git repository, Sourceforge package is outdated)
 17 |     https://github.com/buffer/pylibemu
 18 | 
 19 | 
 20 | - To support XML output "lxml" is needed:
 21 | 
 22 |     http://lxml.de/installation.html
 23 |     
 24 | 
 25 | - Included modules: lzw, colorama, jsbeautifier, ccitt, pythonaes (Thanks to all the developers!!)
 26 | 
 27 | 
 28 | 
 29 | ** Installation **
 30 | 
 31 | No installation is needed apart from the dependencies listed above, just execute it!
 32 | 
 33 | 
 34 | 
 35 | ** Execution **
 36 | 
 37 | There are two important options when peepdf is executed:
 38 | 
 39 | -f: Ignores the parsing errors. Analysing malicious files probably leads to parsing errors, so this parameter should be set.
 40 | -l: Sets the loose mode, so does not search for the endobj tag because it's not obligatory. Helpful with malformed files.
 41 | 
 42 | 
 43 | * Simple execution
 44 | 
 45 | Shows the statistics of the file after being decoded/decrypted and analysed:
 46 | 
 47 |     python peepdf.py [options] pdf_file
 48 | 
 49 | 
 50 | * Interactive console
 51 | 
 52 | Executes the interactive console to let you play with the PDF file:
 53 | 
 54 |     python peepdf.py -i [options] pdf_file
 55 | 
 56 | If no PDF file is specified it's possible to use the decode/encode/js*/sctest commands and create a new PDF file:
 57 | 
 58 |     python peepdf.py -i
 59 | 
 60 | 
 61 | * Batch execution
 62 | 
 63 | It's possible to use a commands file to specify the commands to be executed in batch mode. This type of execution is useful for automating the analysis of several files:
 64 | 
 65 |     python peepdf.py [options] -s commands_file pdf_file
 66 | 
 67 | 
 68 | 
 69 | ** Updating **
 70 | 
 71 | Just type this and you will be updated to the latest version from the repository:
 72 | 
 73 |     python peepdf.py -u
 74 | 
 75 | 
 76 | 
 77 | ** Some hints **
 78 | 
 79 | If the information shown when a PDF file is parsed is not enough to know if it's harmful or not, the following commands can help to do it:
 80 | 
 81 | * tree
 82 | 
 83 | Shows the tree graph of the file or specified version. Here we can see suspicious elements.
 84 | 
 85 | 
 86 | * offsets 
 87 | 
 88 | Shows the physical map of the file or the specified version of the document. This is helpful to spot unusually big objects or big gaps between objects.
 89 | 
 90 | 
 91 | * search
 92 | 
 93 | Searches for the specified string or hexadecimal string in the objects (decoded and encrypted streams included).
 94 | 
 95 | 
 96 | * object/rawobject
 97 | 
 98 | Shows the (raw) content of the object.
 99 | 
100 | 
101 | * stream/rawstream
102 | 
103 | Shows the (raw) content of the stream.
104 | 
105 | 
106 | * The rest of commands, of course
107 | 
108 | > help
109 | 
110 | 
111 | 
112 | ** Bugs **
113 | 
114 | Send me bugs and comments, please!! ;) You can do it via mail (jesparza AT eternal-todo.com) or through Google Code (http://peepdf.googlecode.com).
115 | 
116 | Thanks!!
117 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/TODO:
--------------------------------------------------------------------------------
 1 | Pending tasks:
 2 | 
 3 | - User manual
 4 | - Documentation of methods in PDFCore.py
 5 | - Add the rest of supported stream filters (better testing of existent)
 6 | - Automatic analysis of embedded PDF files
 7 | - Add AES to the encryption implementation
 8 | - Improve the automatic Javascript analysis, getting code from other parts of the documents (getAnnots, etc)
 9 | - GUI
10 | - ActionScript analysis?


--------------------------------------------------------------------------------
/lib/parse/peepdf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/nabu/3afcab20a5ddd8a9b984d8f34756ebedfc0b45a9/lib/parse/peepdf/__init__.py


--------------------------------------------------------------------------------
/lib/parse/peepdf/aes.py:
--------------------------------------------------------------------------------
 1 | #
 2 | #    peepdf is a tool to analyse and modify PDF files
 3 | #    http://peepdf.eternal-todo.com
 4 | #    By Jose Miguel Esparza <jesparza AT eternal-todo.com>
 5 | #
 6 | #    Copyright (C) 2012-2014 Jose Miguel Esparza
 7 | #
 8 | #    This file is part of peepdf.
 9 | #
10 | #        peepdf is free software: you can redistribute it and/or modify
11 | #        it under the terms of the GNU General Public License as published by
12 | #        the Free Software Foundation, either version 3 of the License, or
13 | #        (at your option) any later version.
14 | #
15 | #        peepdf is distributed in the hope that it will be useful,
16 | #        but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | #        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the
18 | #        GNU General Public License for more details.
19 | #
20 | #        You should have received a copy of the GNU General Public License
21 | #        along with peepdf.    If not, see <http://www.gnu.org/licenses/>.
22 | #
23 | 
24 | """
25 | Created from the demonstration of the pythonaes package.
26 | 
27 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
28 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
29 | """
30 | 
31 | import sys
32 | from aespython import key_expander, aes_cipher, cbc_mode
33 | 
def decryptData(data, password = None, keyLength =  None, mode = 'CBC'):
    '''
        Method added for peepdf

        Decrypts AES data whose first 16 bytes are the initialization vector. Trailing bytes which do not
        fill a whole 16 bytes block are discarded before decrypting.

        @param data: The encrypted data (string), starting with the 16 bytes of the IV
        @param password: The key (string)
        @param keyLength: The key length in bits (128, 192 or 256). If it is None it is taken from the password length
        @param mode: The mode of operation. Only 'CBC' is supported
        @return: A tuple (status,statusContent), where statusContent is the decrypted data in case status = 0 or an error in case status = -1
    '''
    if password is None:
        return (-1, 'Missing password in AES decryption process')
    if keyLength is None:
        keyLength = len(password)*8
    if keyLength not in [128, 192, 256]:
        return (-1, 'Bad length key in AES decryption process')
    # Checked before any work is done: there is no other mode object to fall back to
    if mode != 'CBC':
        return (-1, 'Unsupported mode in AES decryption process')

    # Lists (and not map objects) because their length is checked later
    iv = [ord(char) for char in data[:16]]
    key = [ord(char) for char in password]
    data = data[16:]
    if len(data) % 16 != 0:
        data = data[:-(len(data)%16)]
    keyExpander = key_expander.KeyExpander(keyLength)
    expandedKey = keyExpander.expand(key)
    aesCipher = aes_cipher.AESCipher(expandedKey)
    aesMode = cbc_mode.CBCMode(aesCipher, 16)
    aesMode.set_iv(iv)
    decryptedChars = []
    for i in range(0,len(data),16):
        ciphertext = [ord(char) for char in data[i:i+16]]
        decryptedBytes = aesMode.decrypt_block(ciphertext)
        decryptedChars.extend([chr(byte) for byte in decryptedBytes])
    return (0, ''.join(decryptedChars))


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/nabu/3afcab20a5ddd8a9b984d8f34756ebedfc0b45a9/lib/parse/peepdf/aespython/__init__.py


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/aes_cipher.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | AES Block Cipher.
 4 | 
 5 | Performs single block cipher decipher operations on a 16 element list of integers.
 6 | These integers represent 8 bit bytes in a 128 bit block.
 7 | The result of cipher or decipher operations is the transformed 16 element list of integers.
 8 | 
 9 | Running this file as __main__ will result in a self-test of the algorithm.
10 | 
11 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
12 | 
13 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
14 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
15 | """
16 | __author__ = "Adam Newman"
17 | 
18 | #Normally use relative import. In test mode use local import.
19 | try:from .aes_tables import sbox,i_sbox,galI,galNI
20 | except ValueError:from aes_tables import sbox,i_sbox,galI,galNI
# The statements below build, as text, the Python source of the block cipher
# methods. The class body of AESCipher finishes the substitution and compiles
# the result with exec().

# "s0,s1,...,sf": names of the 16 variables the input block is unpacked into.
ups=",".join("s%x"%x for x in range(16))
# "r0,r1,...,rf": names of the 16 variables a slice of the expanded key is unpacked into.
upr=ups.replace("s","r")
# Comma-separated list of 16 expressions, each XOR-ing four state terms and one
# r variable. The g0/g1 terms stay table lookups; the terms generated as
# g2[...]/g3[...] are renamed to g[...] here and reduced to the bare state
# variable by the loop that follows.
# NOTE(review): presumably MixColumns fused with AddRoundKey (FIPS-197) - confirm.
mix=",".join(",".join(("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]^r%x"%(i+(i[0]+(0,3,2,1)[j],))).format(j&3,j+1&3,j+2&3,j+3&3) for j in (0,3,2,1)) for i in ((0,1,2,3),(4,5,6,7),(8,9,10,11),(12,13,14,15))).replace("g2","g").replace("g3","g")
# Replace every "g[sX]" with "sX": drop the two characters "g[" and the closing "]".
i=mix.find("g[")
while i!=-1:
	mix=mix[:i]+mix[i+2:i+4]+mix[i+5:]
	i=mix.find("g[",i)
# Same layout as mix, but all four lookups g0..g3 are kept and no r term is added.
imix=",".join(",".join(("g{0}[s%x]^g{1}[s%x]^g{2}[s%x]^g{3}[s%x]"%i).format(j&3,j+1&3,j+2&3,j+3&3) for j in (0,3,2,1)) for i in ((0,1,2,3),(4,5,6,7),(8,9,10,11),(12,13,14,15)))
# Permuted state variable names: position x takes s(5x mod 16) in csl and
# s(-3x mod 16) in csr.
# NOTE(review): looks like ShiftRows and its inverse - confirm.
csl=["s%x"%(x*5&15) for x in range(16)]
csr=["s%x"%(x*-3&15) for x in range(16)]
# Lookups in the table bound to "s" inside the generated function (sbox or
# i_sbox), in permuted order; ibox and xor also XOR the matching r variable.
box=",".join("s[%s]"%i for i in csl)
ibox=",".join("s[%s]^r%x"%i for i in zip(csr,range(16)))
xor=",".join("s[%s]^r%x"%i for i in zip(csl,range(16)))
# Statements "s0^=r0;s1^=r1;...": XOR every state variable with its r variable in place.
xori=";".join("s%x^=r%x"%(i,i) for i in range(16))
# Source template of the generated method. The placeholders S, R and X are
# expanded here; "!", "B" and "M" are left for the AESCipher class body, which
# also appends the expression returned by the final "return ".
ciph="""def decipher_block(f,s):
 g0,g1,g2,g3=galNI;ek=f._expanded_key;S=s+[0]*(16-len(s));s=sbox;R=ek[:16];X
 for f in range(!16):R=ek[f:f+16];S=B;S=M
 R=ek[f+16:]
 return """.replace("S",ups).replace("R",upr).replace("X",xori)
class AESCipher:
    """Block cipher working on lists of integers with an already expanded key.

    The methods cipher_block and decipher_block are not written by hand: their
    source is derived from the module-level ``ciph`` template and compiled
    with exec() inside this class body.
    """
    def __init__(self,expanded_key):
        self._expanded_key=expanded_key
        # Index where the last 16 entries of the expanded key start; the
        # generated methods use it as loop bound and slice start.
        self._Nr=len(expanded_key)-16
    # cipher_block: "dec" -> "c" renames the function, only g0,g1 are unpacked
    # from galNI, the loop walks the expanded key forwards in steps of 16, and
    # the ``xor`` expression list is appended as the return value.
    exec(ciph.replace("g2,g3","").replace("dec","c").replace("!","16,f._Nr,").replace("B",box).replace("M",mix)+xor)
    # decipher_block: uses galI and i_sbox, starts from the last 16 key entries,
    # walks the expanded key backwards in steps of 16, and returns the ``ibox``
    # expression list.
    exec(ciph.replace("NI","I").replace(":16","f._Nr:").replace("f+16:",":16").replace("!","f._Nr-16,0,-").replace("sbox","i_sbox").replace("B",ibox).replace("M",imix)+ibox)
46 | import unittest
class TestCipher(unittest.TestCase):
    """Self-test of AESCipher against the validated vectors of test_keys."""
    def test_cipher(self):
        """Test AES cipher with all key lengths"""
        # Local imports: only resolvable when the file is run from its own directory
        import test_keys
        import key_expander
        test_data = test_keys.TestKeys()
        for key_size in 128, 192, 256:
            test_key_expander = key_expander.KeyExpander(key_size)
            test_expanded_key = test_key_expander.expand(test_data.test_key[key_size])
            test_cipher = AESCipher(test_expanded_key)
            expected_ciphertext = test_data.test_block_ciphertext_validated[key_size]

            # A block passes when all 16 positions match the expected block
            test_result_ciphertext = test_cipher.cipher_block(test_data.test_block_plaintext)
            matching = [i for i, j in zip(test_result_ciphertext, expected_ciphertext) if i == j]
            # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12
            self.assertEqual(len(matching), 16, msg='Test %d bit cipher'%key_size)

            test_result_plaintext = test_cipher.decipher_block(expected_ciphertext)
            matching = [i for i, j in zip(test_result_plaintext, test_data.test_block_plaintext) if i == j]
            self.assertEqual(len(matching), 16, msg='Test %d bit decipher'%key_size)
if __name__ == "__main__":
    unittest.main()


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/aes_tables.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Instantiate AES tables for rcon,sbox,i_sbox,and galois_lookup.
  3 | 
  4 | Copyright (c) 2010,Adam Newman http://www.caller9.com/
  5 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
  6 | """
  7 | __author__ = "Adam Newman"
  8 | rcon=(
  9 | 0x8d,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,
 10 | 0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39,
 11 | 0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,
 12 | 0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36,0x6c,0xd8,
 13 | 0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,
 14 | 0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,0x25,0x4a,0x94,0x33,0x66,0xcc,
 15 | 0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,
 16 | 0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,
 17 | 0x7d,0xfa,0xef,0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,0x25,0x4a,0x94,
 18 | 0x33,0x66,0xcc,0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,0x08,0x10,0x20,
 19 | 0x40,0x80,0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,
 20 | 0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,
 21 | 0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,
 22 | 0x08,0x10,0x20,0x40,0x80,0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,
 23 | 0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,
 24 | 0x61,0xc2,0x9f,0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,0x74,0xe8,0xcb)
 25 | sbox=(
 26 | 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
 27 | 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
 28 | 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
 29 | 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
 30 | 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
 31 | 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
 32 | 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
 33 | 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
 34 | 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
 35 | 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
 36 | 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
 37 | 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
 38 | 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
 39 | 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
 40 | 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
 41 | 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16)
 42 | i_sbox=(
 43 | 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38,0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb,
 44 | 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87,0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb,
 45 | 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d,0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e,
 46 | 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2,0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25,
 47 | 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16,0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92,
 48 | 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda,0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84,
 49 | 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a,0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06,
 50 | 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02,0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b,
 51 | 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea,0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73,
 52 | 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85,0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e,
 53 | 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89,0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b,
 54 | 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20,0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4,
 55 | 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31,0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f,
 56 | 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d,0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef,
 57 | 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0,0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61,
 58 | 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26,0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d)
 59 | galNI=((
 60 | 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e,0x10,0x12,0x14,0x16,0x18,0x1a,0x1c,0x1e,
 61 | 0x20,0x22,0x24,0x26,0x28,0x2a,0x2c,0x2e,0x30,0x32,0x34,0x36,0x38,0x3a,0x3c,0x3e,
 62 | 0x40,0x42,0x44,0x46,0x48,0x4a,0x4c,0x4e,0x50,0x52,0x54,0x56,0x58,0x5a,0x5c,0x5e,
 63 | 0x60,0x62,0x64,0x66,0x68,0x6a,0x6c,0x6e,0x70,0x72,0x74,0x76,0x78,0x7a,0x7c,0x7e,
 64 | 0x80,0x82,0x84,0x86,0x88,0x8a,0x8c,0x8e,0x90,0x92,0x94,0x96,0x98,0x9a,0x9c,0x9e,
 65 | 0xa0,0xa2,0xa4,0xa6,0xa8,0xaa,0xac,0xae,0xb0,0xb2,0xb4,0xb6,0xb8,0xba,0xbc,0xbe,
 66 | 0xc0,0xc2,0xc4,0xc6,0xc8,0xca,0xcc,0xce,0xd0,0xd2,0xd4,0xd6,0xd8,0xda,0xdc,0xde,
 67 | 0xe0,0xe2,0xe4,0xe6,0xe8,0xea,0xec,0xee,0xf0,0xf2,0xf4,0xf6,0xf8,0xfa,0xfc,0xfe,
 68 | 0x1b,0x19,0x1f,0x1d,0x13,0x11,0x17,0x15,0x0b,0x09,0x0f,0x0d,0x03,0x01,0x07,0x05,
 69 | 0x3b,0x39,0x3f,0x3d,0x33,0x31,0x37,0x35,0x2b,0x29,0x2f,0x2d,0x23,0x21,0x27,0x25,
 70 | 0x5b,0x59,0x5f,0x5d,0x53,0x51,0x57,0x55,0x4b,0x49,0x4f,0x4d,0x43,0x41,0x47,0x45,
 71 | 0x7b,0x79,0x7f,0x7d,0x73,0x71,0x77,0x75,0x6b,0x69,0x6f,0x6d,0x63,0x61,0x67,0x65,
 72 | 0x9b,0x99,0x9f,0x9d,0x93,0x91,0x97,0x95,0x8b,0x89,0x8f,0x8d,0x83,0x81,0x87,0x85,
 73 | 0xbb,0xb9,0xbf,0xbd,0xb3,0xb1,0xb7,0xb5,0xab,0xa9,0xaf,0xad,0xa3,0xa1,0xa7,0xa5,
 74 | 0xdb,0xd9,0xdf,0xdd,0xd3,0xd1,0xd7,0xd5,0xcb,0xc9,0xcf,0xcd,0xc3,0xc1,0xc7,0xc5,
 75 | 0xfb,0xf9,0xff,0xfd,0xf3,0xf1,0xf7,0xf5,0xeb,0xe9,0xef,0xed,0xe3,0xe1,0xe7,0xe5),
 76 | (0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09,0x18,0x1b,0x1e,0x1d,0x14,0x17,0x12,0x11,
 77 | 0x30,0x33,0x36,0x35,0x3c,0x3f,0x3a,0x39,0x28,0x2b,0x2e,0x2d,0x24,0x27,0x22,0x21,
 78 | 0x60,0x63,0x66,0x65,0x6c,0x6f,0x6a,0x69,0x78,0x7b,0x7e,0x7d,0x74,0x77,0x72,0x71,
 79 | 0x50,0x53,0x56,0x55,0x5c,0x5f,0x5a,0x59,0x48,0x4b,0x4e,0x4d,0x44,0x47,0x42,0x41,
 80 | 0xc0,0xc3,0xc6,0xc5,0xcc,0xcf,0xca,0xc9,0xd8,0xdb,0xde,0xdd,0xd4,0xd7,0xd2,0xd1,
 81 | 0xf0,0xf3,0xf6,0xf5,0xfc,0xff,0xfa,0xf9,0xe8,0xeb,0xee,0xed,0xe4,0xe7,0xe2,0xe1,
 82 | 0xa0,0xa3,0xa6,0xa5,0xac,0xaf,0xaa,0xa9,0xb8,0xbb,0xbe,0xbd,0xb4,0xb7,0xb2,0xb1,
 83 | 0x90,0x93,0x96,0x95,0x9c,0x9f,0x9a,0x99,0x88,0x8b,0x8e,0x8d,0x84,0x87,0x82,0x81,
 84 | 0x9b,0x98,0x9d,0x9e,0x97,0x94,0x91,0x92,0x83,0x80,0x85,0x86,0x8f,0x8c,0x89,0x8a,
 85 | 0xab,0xa8,0xad,0xae,0xa7,0xa4,0xa1,0xa2,0xb3,0xb0,0xb5,0xb6,0xbf,0xbc,0xb9,0xba,
 86 | 0xfb,0xf8,0xfd,0xfe,0xf7,0xf4,0xf1,0xf2,0xe3,0xe0,0xe5,0xe6,0xef,0xec,0xe9,0xea,
 87 | 0xcb,0xc8,0xcd,0xce,0xc7,0xc4,0xc1,0xc2,0xd3,0xd0,0xd5,0xd6,0xdf,0xdc,0xd9,0xda,
 88 | 0x5b,0x58,0x5d,0x5e,0x57,0x54,0x51,0x52,0x43,0x40,0x45,0x46,0x4f,0x4c,0x49,0x4a,
 89 | 0x6b,0x68,0x6d,0x6e,0x67,0x64,0x61,0x62,0x73,0x70,0x75,0x76,0x7f,0x7c,0x79,0x7a,
 90 | 0x3b,0x38,0x3d,0x3e,0x37,0x34,0x31,0x32,0x23,0x20,0x25,0x26,0x2f,0x2c,0x29,0x2a,
 91 | 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,0x13,0x10,0x15,0x16,0x1f,0x1c,0x19,0x1a))
 92 | galI=(
 93 | (0x00,0x0e,0x1c,0x12,0x38,0x36,0x24,0x2a,0x70,0x7e,0x6c,0x62,0x48,0x46,0x54,0x5a,
 94 | 0xe0,0xee,0xfc,0xf2,0xd8,0xd6,0xc4,0xca,0x90,0x9e,0x8c,0x82,0xa8,0xa6,0xb4,0xba,
 95 | 0xdb,0xd5,0xc7,0xc9,0xe3,0xed,0xff,0xf1,0xab,0xa5,0xb7,0xb9,0x93,0x9d,0x8f,0x81,
 96 | 0x3b,0x35,0x27,0x29,0x03,0x0d,0x1f,0x11,0x4b,0x45,0x57,0x59,0x73,0x7d,0x6f,0x61,
 97 | 0xad,0xa3,0xb1,0xbf,0x95,0x9b,0x89,0x87,0xdd,0xd3,0xc1,0xcf,0xe5,0xeb,0xf9,0xf7,
 98 | 0x4d,0x43,0x51,0x5f,0x75,0x7b,0x69,0x67,0x3d,0x33,0x21,0x2f,0x05,0x0b,0x19,0x17,
 99 | 0x76,0x78,0x6a,0x64,0x4e,0x40,0x52,0x5c,0x06,0x08,0x1a,0x14,0x3e,0x30,0x22,0x2c,
100 | 0x96,0x98,0x8a,0x84,0xae,0xa0,0xb2,0xbc,0xe6,0xe8,0xfa,0xf4,0xde,0xd0,0xc2,0xcc,
101 | 0x41,0x4f,0x5d,0x53,0x79,0x77,0x65,0x6b,0x31,0x3f,0x2d,0x23,0x09,0x07,0x15,0x1b,
102 | 0xa1,0xaf,0xbd,0xb3,0x99,0x97,0x85,0x8b,0xd1,0xdf,0xcd,0xc3,0xe9,0xe7,0xf5,0xfb,
103 | 0x9a,0x94,0x86,0x88,0xa2,0xac,0xbe,0xb0,0xea,0xe4,0xf6,0xf8,0xd2,0xdc,0xce,0xc0,
104 | 0x7a,0x74,0x66,0x68,0x42,0x4c,0x5e,0x50,0x0a,0x04,0x16,0x18,0x32,0x3c,0x2e,0x20,
105 | 0xec,0xe2,0xf0,0xfe,0xd4,0xda,0xc8,0xc6,0x9c,0x92,0x80,0x8e,0xa4,0xaa,0xb8,0xb6,
106 | 0x0c,0x02,0x10,0x1e,0x34,0x3a,0x28,0x26,0x7c,0x72,0x60,0x6e,0x44,0x4a,0x58,0x56,
107 | 0x37,0x39,0x2b,0x25,0x0f,0x01,0x13,0x1d,0x47,0x49,0x5b,0x55,0x7f,0x71,0x63,0x6d,
108 | 0xd7,0xd9,0xcb,0xc5,0xef,0xe1,0xf3,0xfd,0xa7,0xa9,0xbb,0xb5,0x9f,0x91,0x83,0x8d),
109 | (0x00,0x0b,0x16,0x1d,0x2c,0x27,0x3a,0x31,0x58,0x53,0x4e,0x45,0x74,0x7f,0x62,0x69,
110 | 0xb0,0xbb,0xa6,0xad,0x9c,0x97,0x8a,0x81,0xe8,0xe3,0xfe,0xf5,0xc4,0xcf,0xd2,0xd9,
111 | 0x7b,0x70,0x6d,0x66,0x57,0x5c,0x41,0x4a,0x23,0x28,0x35,0x3e,0x0f,0x04,0x19,0x12,
112 | 0xcb,0xc0,0xdd,0xd6,0xe7,0xec,0xf1,0xfa,0x93,0x98,0x85,0x8e,0xbf,0xb4,0xa9,0xa2,
113 | 0xf6,0xfd,0xe0,0xeb,0xda,0xd1,0xcc,0xc7,0xae,0xa5,0xb8,0xb3,0x82,0x89,0x94,0x9f,
114 | 0x46,0x4d,0x50,0x5b,0x6a,0x61,0x7c,0x77,0x1e,0x15,0x08,0x03,0x32,0x39,0x24,0x2f,
115 | 0x8d,0x86,0x9b,0x90,0xa1,0xaa,0xb7,0xbc,0xd5,0xde,0xc3,0xc8,0xf9,0xf2,0xef,0xe4,
116 | 0x3d,0x36,0x2b,0x20,0x11,0x1a,0x07,0x0c,0x65,0x6e,0x73,0x78,0x49,0x42,0x5f,0x54,
117 | 0xf7,0xfc,0xe1,0xea,0xdb,0xd0,0xcd,0xc6,0xaf,0xa4,0xb9,0xb2,0x83,0x88,0x95,0x9e,
118 | 0x47,0x4c,0x51,0x5a,0x6b,0x60,0x7d,0x76,0x1f,0x14,0x09,0x02,0x33,0x38,0x25,0x2e,
119 | 0x8c,0x87,0x9a,0x91,0xa0,0xab,0xb6,0xbd,0xd4,0xdf,0xc2,0xc9,0xf8,0xf3,0xee,0xe5,
120 | 0x3c,0x37,0x2a,0x21,0x10,0x1b,0x06,0x0d,0x64,0x6f,0x72,0x79,0x48,0x43,0x5e,0x55,
121 | 0x01,0x0a,0x17,0x1c,0x2d,0x26,0x3b,0x30,0x59,0x52,0x4f,0x44,0x75,0x7e,0x63,0x68,
122 | 0xb1,0xba,0xa7,0xac,0x9d,0x96,0x8b,0x80,0xe9,0xe2,0xff,0xf4,0xc5,0xce,0xd3,0xd8,
123 | 0x7a,0x71,0x6c,0x67,0x56,0x5d,0x40,0x4b,0x22,0x29,0x34,0x3f,0x0e,0x05,0x18,0x13,
124 | 0xca,0xc1,0xdc,0xd7,0xe6,0xed,0xf0,0xfb,0x92,0x99,0x84,0x8f,0xbe,0xb5,0xa8,0xa3),
125 | (0x00,0x0d,0x1a,0x17,0x34,0x39,0x2e,0x23,0x68,0x65,0x72,0x7f,0x5c,0x51,0x46,0x4b,
126 | 0xd0,0xdd,0xca,0xc7,0xe4,0xe9,0xfe,0xf3,0xb8,0xb5,0xa2,0xaf,0x8c,0x81,0x96,0x9b,
127 | 0xbb,0xb6,0xa1,0xac,0x8f,0x82,0x95,0x98,0xd3,0xde,0xc9,0xc4,0xe7,0xea,0xfd,0xf0,
128 | 0x6b,0x66,0x71,0x7c,0x5f,0x52,0x45,0x48,0x03,0x0e,0x19,0x14,0x37,0x3a,0x2d,0x20,
129 | 0x6d,0x60,0x77,0x7a,0x59,0x54,0x43,0x4e,0x05,0x08,0x1f,0x12,0x31,0x3c,0x2b,0x26,
130 | 0xbd,0xb0,0xa7,0xaa,0x89,0x84,0x93,0x9e,0xd5,0xd8,0xcf,0xc2,0xe1,0xec,0xfb,0xf6,
131 | 0xd6,0xdb,0xcc,0xc1,0xe2,0xef,0xf8,0xf5,0xbe,0xb3,0xa4,0xa9,0x8a,0x87,0x90,0x9d,
132 | 0x06,0x0b,0x1c,0x11,0x32,0x3f,0x28,0x25,0x6e,0x63,0x74,0x79,0x5a,0x57,0x40,0x4d,
133 | 0xda,0xd7,0xc0,0xcd,0xee,0xe3,0xf4,0xf9,0xb2,0xbf,0xa8,0xa5,0x86,0x8b,0x9c,0x91,
134 | 0x0a,0x07,0x10,0x1d,0x3e,0x33,0x24,0x29,0x62,0x6f,0x78,0x75,0x56,0x5b,0x4c,0x41,
135 | 0x61,0x6c,0x7b,0x76,0x55,0x58,0x4f,0x42,0x09,0x04,0x13,0x1e,0x3d,0x30,0x27,0x2a,
136 | 0xb1,0xbc,0xab,0xa6,0x85,0x88,0x9f,0x92,0xd9,0xd4,0xc3,0xce,0xed,0xe0,0xf7,0xfa,
137 | 0xb7,0xba,0xad,0xa0,0x83,0x8e,0x99,0x94,0xdf,0xd2,0xc5,0xc8,0xeb,0xe6,0xf1,0xfc,
138 | 0x67,0x6a,0x7d,0x70,0x53,0x5e,0x49,0x44,0x0f,0x02,0x15,0x18,0x3b,0x36,0x21,0x2c,
139 | 0x0c,0x01,0x16,0x1b,0x38,0x35,0x22,0x2f,0x64,0x69,0x7e,0x73,0x50,0x5d,0x4a,0x47,
140 | 0xdc,0xd1,0xc6,0xcb,0xe8,0xe5,0xf2,0xff,0xb4,0xb9,0xae,0xa3,0x80,0x8d,0x9a,0x97),
141 | (0x00,0x09,0x12,0x1b,0x24,0x2d,0x36,0x3f,0x48,0x41,0x5a,0x53,0x6c,0x65,0x7e,0x77,
142 | 0x90,0x99,0x82,0x8b,0xb4,0xbd,0xa6,0xaf,0xd8,0xd1,0xca,0xc3,0xfc,0xf5,0xee,0xe7,
143 | 0x3b,0x32,0x29,0x20,0x1f,0x16,0x0d,0x04,0x73,0x7a,0x61,0x68,0x57,0x5e,0x45,0x4c,
144 | 0xab,0xa2,0xb9,0xb0,0x8f,0x86,0x9d,0x94,0xe3,0xea,0xf1,0xf8,0xc7,0xce,0xd5,0xdc,
145 | 0x76,0x7f,0x64,0x6d,0x52,0x5b,0x40,0x49,0x3e,0x37,0x2c,0x25,0x1a,0x13,0x08,0x01,
146 | 0xe6,0xef,0xf4,0xfd,0xc2,0xcb,0xd0,0xd9,0xae,0xa7,0xbc,0xb5,0x8a,0x83,0x98,0x91,
147 | 0x4d,0x44,0x5f,0x56,0x69,0x60,0x7b,0x72,0x05,0x0c,0x17,0x1e,0x21,0x28,0x33,0x3a,
148 | 0xdd,0xd4,0xcf,0xc6,0xf9,0xf0,0xeb,0xe2,0x95,0x9c,0x87,0x8e,0xb1,0xb8,0xa3,0xaa,
149 | 0xec,0xe5,0xfe,0xf7,0xc8,0xc1,0xda,0xd3,0xa4,0xad,0xb6,0xbf,0x80,0x89,0x92,0x9b,
150 | 0x7c,0x75,0x6e,0x67,0x58,0x51,0x4a,0x43,0x34,0x3d,0x26,0x2f,0x10,0x19,0x02,0x0b,
151 | 0xd7,0xde,0xc5,0xcc,0xf3,0xfa,0xe1,0xe8,0x9f,0x96,0x8d,0x84,0xbb,0xb2,0xa9,0xa0,
152 | 0x47,0x4e,0x55,0x5c,0x63,0x6a,0x71,0x78,0x0f,0x06,0x1d,0x14,0x2b,0x22,0x39,0x30,
153 | 0x9a,0x93,0x88,0x81,0xbe,0xb7,0xac,0xa5,0xd2,0xdb,0xc0,0xc9,0xf6,0xff,0xe4,0xed,
154 | 0x0a,0x03,0x18,0x11,0x2e,0x27,0x3c,0x35,0x42,0x4b,0x50,0x59,0x66,0x6f,0x74,0x7d,
155 | 0xa1,0xa8,0xb3,0xba,0x85,0x8c,0x97,0x9e,0xe9,0xe0,0xfb,0xf2,0xcd,0xc4,0xdf,0xd6,
156 | 0x31,0x38,0x23,0x2a,0x15,0x1c,0x07,0x0e,0x79,0x70,0x6b,0x62,0x5d,0x54,0x4f,0x46))


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/cbc_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | CBC Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
class CBCMode:
    """CBC chaining around a block cipher; the chaining value is kept between calls"""
    def __init__(self, block_cipher, block_size):
        self._block_cipher = block_cipher
        self._block_size = block_size
        # Chaining value used until set_iv() installs another one
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Install a new IV; an IV whose length is not the block size is ignored"""
        if len(iv) != self._block_size:
            return
        self._iv = iv

    def encrypt_block(self, plaintext):
        """XOR the block with the chaining value, cipher it and keep the result as next chaining value"""
        mixed = [value ^ chained for value, chained in zip(plaintext, self._iv)]
        self._iv = self._block_cipher.cipher_block(mixed)
        return self._iv

    def decrypt_block(self, ciphertext):
        """Decipher the block, XOR it with the chaining value and keep the ciphertext as next chaining value"""
        block = list(self._block_cipher.decipher_block(ciphertext))
        for position, chained in enumerate(self._iv):
            block[position] = block[position] ^ chained
        self._iv = ciphertext
        return block
34 | 
35 | import unittest
class TestEncryptionMode(unittest.TestCase):
    """Self-test of CBCMode with a 256 bit key against the vectors of test_keys."""
    def test_mode(self):
        #Self test
        # Local imports: only resolvable when the file is run from its own directory
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        test_expanded_key = test_expander.expand(test_data.test_mode_key)

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cbc = CBCMode(test_cipher, 16)

        # A block passes when all 16 positions match the expected block.
        # assertEqual: the assertEquals alias is deprecated and removed in Python 3.12
        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            encrypted = test_cbc.encrypt_block(test_data.test_mode_plaintext[k])
            matching = [i for i, j in zip(test_data.test_cbc_ciphertext[k], encrypted) if i == j]
            self.assertEqual(len(matching), 16, msg='CBC encrypt test block %d'%k)

        # Restart the chain from the same IV before decrypting
        test_cbc.set_iv(test_data.test_mode_iv)
        for k in range(4):
            decrypted = test_cbc.decrypt_block(test_data.test_cbc_ciphertext[k])
            matching = [i for i, j in zip(test_data.test_mode_plaintext[k], decrypted) if i == j]
            self.assertEqual(len(matching), 16, msg='CBC decrypt test block %d'%k)

if __name__ == "__main__":
    unittest.main()


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/cfb_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | CFB Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
class CFBMode:
    """Perform CFB operation on a block and retain IV information for next operation.

    Only the forward ``cipher_block`` operation of the wrapped cipher is
    used, for both encryption and decryption.
    """

    def __init__(self, block_cipher, block_size):
        """Wrap *block_cipher* and start with an all-zero IV.

        block_cipher -- object providing ``cipher_block(block)``.
        block_size   -- number of elements per block; also the required
                        length of any IV passed to ``set_iv``.
        """
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Install *iv* as the feedback value.

        An IV whose length differs from the block size is ignored and the
        previous feedback value stays in effect.
        """
        if len(iv) == self._block_size:
            # Copy so later changes to the caller's sequence cannot alter
            # the feedback state.
            self._iv = list(iv)

    def encrypt_block(self, plaintext):
        """Encrypt one block and return the ciphertext as a list.

        The current feedback value is enciphered and XORed with the
        plaintext; the resulting ciphertext becomes the next feedback value.
        """
        cipher_iv = self._block_cipher.cipher_block(self._iv)
        ciphertext = [i ^ j for i, j in zip(plaintext, cipher_iv)]
        # Keep a private copy: the returned list belongs to the caller.
        self._iv = list(ciphertext)
        return ciphertext

    def decrypt_block(self, ciphertext):
        """Decrypt one block and return the plaintext as a list.

        The current feedback value is enciphered and XORed with the
        ciphertext; *ciphertext* becomes the next feedback value.
        """
        cipher_iv = self._block_cipher.cipher_block(self._iv)
        # Copy because callers commonly reuse their input buffer.
        self._iv = list(ciphertext)
        return [i ^ j for i, j in zip(cipher_iv, ciphertext)]
34 | 
35 | import unittest
class TestEncryptionMode(unittest.TestCase):
    """Self-test of CFBMode against the vectors stored in test_keys."""

    def test_mode(self):
        """Encrypt and decrypt four chained blocks and compare with the vectors."""
        # Sibling modules are imported here so that importing this module
        # as part of the package does not require them on sys.path.
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        # Pass a copy so the shared, class-level key vector is never
        # modified by key expansion.
        test_expanded_key = test_expander.expand(list(test_data.test_mode_key))

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_cfb = CFBMode(test_cipher, 16)

        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            # assertEqual: the assertEquals alias is deprecated and was
            # removed in Python 3.12.
            self.assertEqual(
                list(test_cfb.encrypt_block(test_data.test_mode_plaintext[k])),
                test_data.test_cfb_ciphertext[k],
                msg='CFB encrypt test block' + str(k))

        # Rewind the chain before checking the reverse direction.
        test_cfb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(
                list(test_cfb.decrypt_block(test_data.test_cfb_ciphertext[k])),
                test_data.test_mode_plaintext[k],
                msg='CFB decrypt test block' + str(k))
63 | 
64 | if __name__ == "__main__":
65 |     unittest.main()
66 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/key_expander.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """
 4 | AES Key Expansion.
 5 | 
 6 | Expands 128, 192, or 256 bit key for use with AES
 7 | 
 8 | Running this file as __main__ will result in a self-test of the algorithm.
 9 | 
10 | Algorithm per NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
11 | 
12 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
13 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
14 | """
15 | __author__ = "Adam Newman"
16 | 
17 | #Normally use relative import. In test mode use local import.
18 | try:from .aes_tables import sbox,rcon
19 | except ValueError:from aes_tables import sbox,rcon
20 | from operator import xor
class KeyExpander:
    """Perform AES Key Expansion"""

    # Key length in bits -> length in bytes of the fully expanded key.
    _expanded_key_length = {128 : 176, 192 : 208, 256 : 240}

    def __init__(self, key_length):
        """Prepare an expander for keys of *key_length* bits.

        Raises LookupError when *key_length* is not 128, 192 or 256.
        """
        self._key_length = key_length
        # Key length in bytes.
        self._n = key_length >> 3

        if key_length in self._expanded_key_length:
            self._b = self._expanded_key_length[key_length]
        else:
            raise LookupError('Invalid Key Size')

    def expand(self, new_key):
        """
            Expand the encryption key per AES key schedule specifications

            http://en.wikipedia.org/wiki/Rijndael_key_schedule#Key_schedule_description

            new_key is a sequence of byte values whose length must equal the
            key size in bytes. It is left unmodified; the expanded key is
            returned as a new list that starts with a copy of new_key.

            Raises RuntimeError when the length of new_key is wrong.
        """
        if len(new_key) != self._n:
            raise RuntimeError('expand(): key size is invalid')

        # Work on a private copy: extending the caller's list in place would
        # make any second expansion of the same key fail the size check.
        key = list(new_key)
        extend = key.extend
        n = self._n
        target = self._b
        rcon_iter = 1

        #Grow the key until it is the correct length
        while True:
            #Core: rotate the last 4 bytes left by one and substitute each
            #through the sbox, then XOR the first byte with rcon(iter).
            core = [sbox[i] for i in key[-3:] + key[-4:-3]]
            core[0] ^= rcon[rcon_iter]
            #XOR with the 4 bytes located n bytes from the end of the key
            extend(map(xor, core, key[-n:4 - n]))
            rcon_iter += 1

            #Three passes: 4 byte tail of the extended key XORed with the
            #4 bytes n bytes from the end of the extended key
            for _ in range(3):
                extend(map(xor, key[-4:], key[-n:4 - n]))
            if len(key) >= target:
                return key

            #256 bit keys only: one extra 4 byte pass whose tail is run
            #through the sbox before the XOR
            if self._key_length == 256:
                extend(map(xor, [sbox[x] for x in key[-4:]], key[-n:4 - n]))
                if len(key) >= target:
                    return key

            #192 and 256 bit keys: 2 or 3 further plain passes respectively
            if self._key_length != 128:
                for _ in range(2 if self._key_length == 192 else 3):
                    extend(map(xor, key[-4:], key[-n:4 - n]))
                if len(key) >= target:
                    return key
82 | 
83 | import unittest
class TestKeyExpander(unittest.TestCase):
    """Self-test of KeyExpander against the validated vectors in test_keys."""

    def test_keys(self):
        """Test All Key Expansions"""
        import test_keys
        test_data = test_keys.TestKeys()
        for key_size in 128, 192, 256:
            test_expander = KeyExpander(key_size)
            # Expand a copy so the shared, class-level test key keeps its
            # original length for any later use.
            test_expanded_key = test_expander.expand(list(test_data.test_key[key_size]))
            self.assertEqual(
                list(test_expanded_key),
                test_data.test_expanded_key_validated[key_size],
                msg='Key expansion ' + str(key_size) + ' bit')
95 | 
96 | if __name__ == "__main__":
97 |     unittest.main()


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/ofb_mode.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | OFB Mode of operation
 4 | 
 5 | Running this file as __main__ will result in a self-test of the algorithm.
 6 | 
 7 | Algorithm per NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
 8 | 
 9 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
10 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
11 | """
12 | __author__ = "Adam Newman"
13 | 
class OFBMode:
    """Perform OFB operation on a block and retain IV information for next operation.

    Each call enciphers the previous feedback value and XORs the result
    with the data, so encryption and decryption are the same operation.
    """

    def __init__(self, block_cipher, block_size):
        """Wrap *block_cipher* and start with an all-zero IV.

        block_cipher -- object providing ``cipher_block(block)``.
        block_size   -- number of elements per block; also the required
                        length of any IV passed to ``set_iv``.
        """
        self._block_cipher = block_cipher
        self._block_size = block_size
        self._iv = [0] * block_size

    def set_iv(self, iv):
        """Install *iv* as the feedback value.

        An IV whose length differs from the block size is ignored and the
        previous feedback value stays in effect.
        """
        if len(iv) == self._block_size:
            # Copy so later changes to the caller's sequence cannot alter
            # the feedback state.
            self._iv = list(iv)

    def encrypt_block(self, plaintext):
        """Encrypt one block and return the ciphertext as a list."""
        # The enciphered feedback value is both the mask for this block
        # and the feedback value for the next one.
        self._iv = cipher_iv = self._block_cipher.cipher_block(self._iv)
        return [i ^ j for i, j in zip(plaintext, cipher_iv)]

    def decrypt_block(self, ciphertext):
        """Decrypt one block and return the plaintext as a list."""
        self._iv = cipher_iv = self._block_cipher.cipher_block(self._iv)
        return [i ^ j for i, j in zip(cipher_iv, ciphertext)]
32 | 
33 | import unittest
class TestEncryptionMode(unittest.TestCase):
    """Self-test of OFBMode against the vectors stored in test_keys."""

    def test_mode(self):
        """Encrypt and decrypt four chained blocks and compare with the vectors."""
        # Sibling modules are imported here so that importing this module
        # as part of the package does not require them on sys.path.
        import key_expander
        import aes_cipher
        import test_keys

        test_data = test_keys.TestKeys()

        test_expander = key_expander.KeyExpander(256)
        # Pass a copy so the shared, class-level key vector is never
        # modified by key expansion.
        test_expanded_key = test_expander.expand(list(test_data.test_mode_key))

        test_cipher = aes_cipher.AESCipher(test_expanded_key)

        test_ofb = OFBMode(test_cipher, 16)

        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            # assertEqual: the assertEquals alias is deprecated and was
            # removed in Python 3.12.
            self.assertEqual(
                list(test_ofb.encrypt_block(test_data.test_mode_plaintext[k])),
                test_data.test_ofb_ciphertext[k],
                msg='OFB encrypt test block' + str(k))

        # Rewind the chain before checking the reverse direction.
        test_ofb.set_iv(test_data.test_mode_iv)
        for k in range(4):
            self.assertEqual(
                list(test_ofb.decrypt_block(test_data.test_ofb_ciphertext[k])),
                test_data.test_mode_plaintext[k],
                msg='OFB decrypt test block' + str(k))
61 | 
62 | if __name__ == "__main__":
63 |     unittest.main()


--------------------------------------------------------------------------------
/lib/parse/peepdf/aespython/test_keys.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test keys and data for self-test operations.
  3 | 
  4 | Test data from:
  5 | NIST SP 800-38A http://csrc.nist.gov/publications/nistpubs/800-38a/sp800-38a.pdf
  6 | NIST FIPS-197 http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
  7 | 
  8 | Copyright (c) 2010, Adam Newman http://www.caller9.com/
  9 | Licensed under the MIT license http://www.opensource.org/licenses/mit-license.php
 10 | """
 11 | __author__ = "Adam Newman"
 12 | 
 13 | class TestKeys:
 14 |     """Test data, keys, IVs, and output to use in self-tests"""
 15 |     test_key = {
 16 |         128 : [
 17 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
 18 |         , 192 : [
 19 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
 20 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17]
 21 |         , 256 : [
 22 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
 23 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f]
 24 |         }
 25 |     
 26 |     test_expanded_key_validated = {
 27 |         128 : [
 28 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
 29 |         0xd6, 0xaa, 0x74, 0xfd, 0xd2, 0xaf, 0x72, 0xfa, 0xda, 0xa6, 0x78, 0xf1, 0xd6, 0xab, 0x76, 0xfe, 
 30 |         0xb6, 0x92, 0xcf, 0x0b, 0x64, 0x3d, 0xbd, 0xf1, 0xbe, 0x9b, 0xc5, 0x00, 0x68, 0x30, 0xb3, 0xfe, 
 31 |         0xb6, 0xff, 0x74, 0x4e, 0xd2, 0xc2, 0xc9, 0xbf, 0x6c, 0x59, 0x0c, 0xbf, 0x04, 0x69, 0xbf, 0x41,
 32 |         0x47, 0xf7, 0xf7, 0xbc, 0x95, 0x35, 0x3e, 0x03, 0xf9, 0x6c, 0x32, 0xbc, 0xfd, 0x05, 0x8d, 0xfd, 
 33 |         0x3c, 0xaa, 0xa3, 0xe8, 0xa9, 0x9f, 0x9d, 0xeb, 0x50, 0xf3, 0xaf, 0x57, 0xad, 0xf6, 0x22, 0xaa,
 34 |         0x5e, 0x39, 0x0f, 0x7d, 0xf7, 0xa6, 0x92, 0x96, 0xa7, 0x55, 0x3d, 0xc1, 0x0a, 0xa3, 0x1f, 0x6b, 
 35 |         0x14, 0xf9, 0x70, 0x1a, 0xe3, 0x5f, 0xe2, 0x8c, 0x44, 0x0a, 0xdf, 0x4d, 0x4e, 0xa9, 0xc0, 0x26, 
 36 |         0x47, 0x43, 0x87, 0x35, 0xa4, 0x1c, 0x65, 0xb9, 0xe0, 0x16, 0xba, 0xf4, 0xae, 0xbf, 0x7a, 0xd2, 
 37 |         0x54, 0x99, 0x32, 0xd1, 0xf0, 0x85, 0x57, 0x68, 0x10, 0x93, 0xed, 0x9c, 0xbe, 0x2c, 0x97, 0x4e, 
 38 |         0x13, 0x11, 0x1d, 0x7f, 0xe3, 0x94, 0x4a, 0x17, 0xf3, 0x07, 0xa7, 0x8b, 0x4d, 0x2b, 0x30, 0xc5]
 39 |         , 192 : [
 40 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
 41 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x58, 0x46, 0xf2, 0xf9, 0x5c, 0x43, 0xf4, 0xfe, 
 42 |         0x54, 0x4a, 0xfe, 0xf5, 0x58, 0x47, 0xf0, 0xfa, 0x48, 0x56, 0xe2, 0xe9, 0x5c, 0x43, 0xf4, 0xfe, 
 43 |         0x40, 0xf9, 0x49, 0xb3, 0x1c, 0xba, 0xbd, 0x4d, 0x48, 0xf0, 0x43, 0xb8, 0x10, 0xb7, 0xb3, 0x42, 
 44 |         0x58, 0xe1, 0x51, 0xab, 0x04, 0xa2, 0xa5, 0x55, 0x7e, 0xff, 0xb5, 0x41, 0x62, 0x45, 0x08, 0x0c, 
 45 |         0x2a, 0xb5, 0x4b, 0xb4, 0x3a, 0x02, 0xf8, 0xf6, 0x62, 0xe3, 0xa9, 0x5d, 0x66, 0x41, 0x0c, 0x08, 
 46 |         0xf5, 0x01, 0x85, 0x72, 0x97, 0x44, 0x8d, 0x7e, 0xbd, 0xf1, 0xc6, 0xca, 0x87, 0xf3, 0x3e, 0x3c, 
 47 |         0xe5, 0x10, 0x97, 0x61, 0x83, 0x51, 0x9b, 0x69, 0x34, 0x15, 0x7c, 0x9e, 0xa3, 0x51, 0xf1, 0xe0, 
 48 |         0x1e, 0xa0, 0x37, 0x2a, 0x99, 0x53, 0x09, 0x16, 0x7c, 0x43, 0x9e, 0x77, 0xff, 0x12, 0x05, 0x1e, 
 49 |         0xdd, 0x7e, 0x0e, 0x88, 0x7e, 0x2f, 0xff, 0x68, 0x60, 0x8f, 0xc8, 0x42, 0xf9, 0xdc, 0xc1, 0x54, 
 50 |         0x85, 0x9f, 0x5f, 0x23, 0x7a, 0x8d, 0x5a, 0x3d, 0xc0, 0xc0, 0x29, 0x52, 0xbe, 0xef, 0xd6, 0x3a, 
 51 |         0xde, 0x60, 0x1e, 0x78, 0x27, 0xbc, 0xdf, 0x2c, 0xa2, 0x23, 0x80, 0x0f, 0xd8, 0xae, 0xda, 0x32, 
 52 |         0xa4, 0x97, 0x0a, 0x33, 0x1a, 0x78, 0xdc, 0x09, 0xc4, 0x18, 0xc2, 0x71, 0xe3, 0xa4, 0x1d, 0x5d]
 53 |         , 256 : [
 54 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 
 55 |         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 56 |         0xa5, 0x73, 0xc2, 0x9f, 0xa1, 0x76, 0xc4, 0x98, 0xa9, 0x7f, 0xce, 0x93, 0xa5, 0x72, 0xc0, 0x9c,
 57 |         0x16, 0x51, 0xa8, 0xcd, 0x02, 0x44, 0xbe, 0xda, 0x1a, 0x5d, 0xa4, 0xc1, 0x06, 0x40, 0xba, 0xde, 
 58 |         0xae, 0x87, 0xdf, 0xf0, 0x0f, 0xf1, 0x1b, 0x68, 0xa6, 0x8e, 0xd5, 0xfb, 0x03, 0xfc, 0x15, 0x67, 
 59 |         0x6d, 0xe1, 0xf1, 0x48, 0x6f, 0xa5, 0x4f, 0x92, 0x75, 0xf8, 0xeb, 0x53, 0x73, 0xb8, 0x51, 0x8d, 
 60 |         0xc6, 0x56, 0x82, 0x7f, 0xc9, 0xa7, 0x99, 0x17, 0x6f, 0x29, 0x4c, 0xec, 0x6c, 0xd5, 0x59, 0x8b, 
 61 |         0x3d, 0xe2, 0x3a, 0x75, 0x52, 0x47, 0x75, 0xe7, 0x27, 0xbf, 0x9e, 0xb4, 0x54, 0x07, 0xcf, 0x39, 
 62 |         0x0b, 0xdc, 0x90, 0x5f, 0xc2, 0x7b, 0x09, 0x48, 0xad, 0x52, 0x45, 0xa4, 0xc1, 0x87, 0x1c, 0x2f, 
 63 |         0x45, 0xf5, 0xa6, 0x60, 0x17, 0xb2, 0xd3, 0x87, 0x30, 0x0d, 0x4d, 0x33, 0x64, 0x0a, 0x82, 0x0a, 
 64 |         0x7c, 0xcf, 0xf7, 0x1c, 0xbe, 0xb4, 0xfe, 0x54, 0x13, 0xe6, 0xbb, 0xf0, 0xd2, 0x61, 0xa7, 0xdf,
 65 |         0xf0, 0x1a, 0xfa, 0xfe, 0xe7, 0xa8, 0x29, 0x79, 0xd7, 0xa5, 0x64, 0x4a, 0xb3, 0xaf, 0xe6, 0x40, 
 66 |         0x25, 0x41, 0xfe, 0x71, 0x9b, 0xf5, 0x00, 0x25, 0x88, 0x13, 0xbb, 0xd5, 0x5a, 0x72, 0x1c, 0x0a, 
 67 |         0x4e, 0x5a, 0x66, 0x99, 0xa9, 0xf2, 0x4f, 0xe0, 0x7e, 0x57, 0x2b, 0xaa, 0xcd, 0xf8, 0xcd, 0xea, 
 68 |         0x24, 0xfc, 0x79, 0xcc, 0xbf, 0x09, 0x79, 0xe9, 0x37, 0x1a, 0xc2, 0x3c, 0x6d, 0x68, 0xde, 0x36]
 69 |         }
 70 |     
 71 |     test_block_ciphertext_validated = {
 72 |         128 : [
 73 |         0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a]
 74 |         , 192 : [
 75 |         0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0, 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91]
 76 |         , 256 : [
 77 |         0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89]
 78 |         }
 79 |     
 80 |     test_block_plaintext = [
 81 |         0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff]
 82 |     
 83 |     #After initial validation, these deviated from test in SP 800-38A to use same key, iv, and plaintext on tests.
 84 |     #Still valid, just easier to test with.
 85 |     test_mode_key= [
 86 |         0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
 87 |         0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4]
 88 |     test_mode_iv = [
 89 |         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f]
 90 |     test_mode_plaintext = [
 91 |         [0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a],
 92 |         [0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51],
 93 |         [0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef],
 94 |         [0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10]]
 95 |     test_cbc_ciphertext = [
 96 |         [0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6],
 97 |         [0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d],
 98 |         [0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61],
 99 |         [0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b]]
100 |     test_cfb_ciphertext = [
101 |         [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
102 |         [0x39, 0xff, 0xed, 0x14, 0x3b, 0x28, 0xb1, 0xc8, 0x32, 0x11, 0x3c, 0x63, 0x31, 0xe5, 0x40, 0x7b],
103 |         [0xdf, 0x10, 0x13, 0x24, 0x15, 0xe5, 0x4b, 0x92, 0xa1, 0x3e, 0xd0, 0xa8, 0x26, 0x7a, 0xe2, 0xf9],
104 |         [0x75, 0xa3, 0x85, 0x74, 0x1a, 0xb9, 0xce, 0xf8, 0x20, 0x31, 0x62, 0x3d, 0x55, 0xb1, 0xe4, 0x71]]
105 |     test_ofb_ciphertext = [
106 |         [0xdc, 0x7e, 0x84, 0xbf, 0xda, 0x79, 0x16, 0x4b, 0x7e, 0xcd, 0x84, 0x86, 0x98, 0x5d, 0x38, 0x60],
107 |         [0x4f, 0xeb, 0xdc, 0x67, 0x40, 0xd2, 0x0b, 0x3a, 0xc8, 0x8f, 0x6a, 0xd8, 0x2a, 0x4f, 0xb0, 0x8d],
108 |         [0x71, 0xab, 0x47, 0xa0, 0x86, 0xe8, 0x6e, 0xed, 0xf3, 0x9d, 0x1c, 0x5b, 0xba, 0x97, 0xc4, 0x08],
109 |         [0x01, 0x26, 0x14, 0x1d, 0x67, 0xf3, 0x7b, 0xe8, 0x53, 0x8f, 0x5a, 0x8b, 0xe7, 0x40, 0xe4, 0x84]]
110 |         
111 |     def hex_output(self, list):
112 |         #Debugging output helper
113 |         result = '['
114 |         for i in list[:-1]:
115 |             result += hex(i) + ','
116 |         return result + hex(list[-1]) + ']'
117 |     
118 |     
119 |     


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/__init__.py:
--------------------------------------------------------------------------------
1 | from .initialise import init
2 | from .ansi import Fore, Back, Style
3 | from .ansitowin32 import AnsiToWin32
4 | 
5 | VERSION = '0.1.18'
6 | 
7 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/ansi.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | This module generates ANSI character codes for printing colors to terminals.
 3 | See: http://en.wikipedia.org/wiki/ANSI_escape_code
 4 | '''
 5 | 
 6 | CSI = '\033['
 7 | 
 8 | def code_to_chars(code):
 9 |     return CSI + str(code) + 'm'
10 | 
class AnsiCodes(object):
    """Expose every public attribute of *codes* as a ready-made escape sequence."""

    def __init__(self, codes):
        # Names starting with an underscore are skipped; every other
        # attribute value is converted by code_to_chars and stored on this
        # instance under the same name.
        public_names = [name for name in dir(codes) if not name.startswith('_')]
        for name in public_names:
            setattr(self, name, code_to_chars(getattr(codes, name)))
17 | 
class AnsiFore:
    # Numeric codes for foreground (text) colours. They are turned into
    # escape sequences when this class is passed to AnsiCodes.
    BLACK   = 30
    RED     = 31
    GREEN   = 32
    YELLOW  = 33
    BLUE    = 34
    MAGENTA = 35
    CYAN    = 36
    WHITE   = 37
    RESET   = 39
28 | 
class AnsiBack:
    # Numeric codes for background colours. They are turned into escape
    # sequences when this class is passed to AnsiCodes.
    BLACK   = 40
    RED     = 41
    GREEN   = 42
    YELLOW  = 43
    BLUE    = 44
    MAGENTA = 45
    CYAN    = 46
    WHITE   = 47
    RESET   = 49
39 | 
class AnsiStyle:
    # Numeric codes for text intensity plus the reset-everything code.
    # They are turned into escape sequences when this class is passed to
    # AnsiCodes.
    BRIGHT    = 1
    DIM       = 2
    NORMAL    = 22
    RESET_ALL = 0
45 | 
# Module-level instances whose attributes hold the escape-sequence strings
# (not the numeric codes) for each public name of the class passed in.
Fore = AnsiCodes( AnsiFore )
Back = AnsiCodes( AnsiBack )
Style = AnsiCodes( AnsiStyle )
49 | 
50 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/ansitowin32.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import re
  3 | import sys
  4 | 
  5 | from .ansi import AnsiFore, AnsiBack, AnsiStyle, Style
  6 | from .winterm import WinTerm, WinColor, WinStyle
  7 | from .win32 import windll
  8 | 
  9 | 
 10 | if windll is not None:
 11 |     winterm = WinTerm()
 12 | 
 13 | 
 14 | def is_a_tty(stream):
 15 |     return hasattr(stream, 'isatty') and stream.isatty()
 16 | 
 17 | 
 18 | class StreamWrapper(object):
 19 |     '''
 20 |     Wraps a stream (such as stdout), acting as a transparent proxy for all
 21 |     attribute access apart from method 'write()', which is delegated to our
 22 |     Converter instance.
 23 |     '''
 24 |     def __init__(self, wrapped, converter):
 25 |         # double-underscore everything to prevent clashes with names of
 26 |         # attributes on the wrapped stream object.
 27 |         self.__wrapped = wrapped
 28 |         self.__convertor = converter
 29 | 
 30 |     def __getattr__(self, name):
 31 |         return getattr(self.__wrapped, name)
 32 | 
 33 |     def write(self, text):
 34 |         self.__convertor.write(text)
 35 | 
 36 | 
 37 | class AnsiToWin32(object):
 38 |     '''
 39 |     Implements a 'write()' method which, on Windows, will strip ANSI character
 40 |     sequences from the text, and if outputting to a tty, will convert them into
 41 |     win32 function calls.
 42 |     '''
 43 |     ANSI_RE = re.compile('\033\[((?:\d|;)*)([a-zA-Z])')
 44 | 
 45 |     def __init__(self, wrapped, convert=None, strip=None, autoreset=False):
 46 |         # The wrapped stream (normally sys.stdout or sys.stderr)
 47 |         self.wrapped = wrapped
 48 | 
 49 |         # should we reset colors to defaults after every .write()
 50 |         self.autoreset = autoreset
 51 | 
 52 |         # create the proxy wrapping our output stream
 53 |         self.stream = StreamWrapper(wrapped, self)
 54 | 
 55 |         on_windows = sys.platform.startswith('win')
 56 | 
 57 |         # should we strip ANSI sequences from our output?
 58 |         if strip is None:
 59 |             strip = on_windows
 60 |         self.strip = strip
 61 | 
 62 |         # should we should convert ANSI sequences into win32 calls?
 63 |         if convert is None:
 64 |             convert = on_windows and is_a_tty(wrapped)
 65 |         self.convert = convert
 66 | 
 67 |         # dict of ansi codes to win32 functions and parameters
 68 |         self.win32_calls = self.get_win32_calls()
 69 | 
 70 |         # are we wrapping stderr?
 71 |         self.on_stderr = self.wrapped is sys.stderr
 72 | 
 73 | 
 74 |     def should_wrap(self):
 75 |         '''
 76 |         True if this class is actually needed. If false, then the output
 77 |         stream will not be affected, nor will win32 calls be issued, so
 78 |         wrapping stdout is not actually required. This will generally be
 79 |         False on non-Windows platforms, unless optional functionality like
 80 |         autoreset has been requested using kwargs to init()
 81 |         '''
 82 |         return self.convert or self.strip or self.autoreset
 83 | 
 84 | 
 85 |     def get_win32_calls(self):
 86 |         if self.convert and winterm:
 87 |             return {
 88 |                 AnsiStyle.RESET_ALL: (winterm.reset_all, ),
 89 |                 AnsiStyle.BRIGHT: (winterm.style, WinStyle.BRIGHT),
 90 |                 AnsiStyle.DIM: (winterm.style, WinStyle.NORMAL),
 91 |                 AnsiStyle.NORMAL: (winterm.style, WinStyle.NORMAL),
 92 |                 AnsiFore.BLACK: (winterm.fore, WinColor.BLACK),
 93 |                 AnsiFore.RED: (winterm.fore, WinColor.RED),
 94 |                 AnsiFore.GREEN: (winterm.fore, WinColor.GREEN),
 95 |                 AnsiFore.YELLOW: (winterm.fore, WinColor.YELLOW),
 96 |                 AnsiFore.BLUE: (winterm.fore, WinColor.BLUE),
 97 |                 AnsiFore.MAGENTA: (winterm.fore, WinColor.MAGENTA),
 98 |                 AnsiFore.CYAN: (winterm.fore, WinColor.CYAN),
 99 |                 AnsiFore.WHITE: (winterm.fore, WinColor.GREY),
100 |                 AnsiFore.RESET: (winterm.fore, ),
101 |                 AnsiBack.BLACK: (winterm.back, WinColor.BLACK),
102 |                 AnsiBack.RED: (winterm.back, WinColor.RED),
103 |                 AnsiBack.GREEN: (winterm.back, WinColor.GREEN),
104 |                 AnsiBack.YELLOW: (winterm.back, WinColor.YELLOW),
105 |                 AnsiBack.BLUE: (winterm.back, WinColor.BLUE),
106 |                 AnsiBack.MAGENTA: (winterm.back, WinColor.MAGENTA),
107 |                 AnsiBack.CYAN: (winterm.back, WinColor.CYAN),
108 |                 AnsiBack.WHITE: (winterm.back, WinColor.GREY),
109 |                 AnsiBack.RESET: (winterm.back, ),
110 |             }
111 | 
112 | 
113 |     def write(self, text):
114 |         if self.strip or self.convert:
115 |             self.write_and_convert(text)
116 |         else:
117 |             self.wrapped.write(text)
118 |             self.wrapped.flush()
119 |         if self.autoreset:
120 |             self.reset_all()
121 |         
122 | 
123 |     def reset_all(self):
124 |         if self.convert:
125 |             self.call_win32('m', (0,))
126 |         else:
127 |             self.wrapped.write(Style.RESET_ALL)
128 | 
129 | 
130 |     def write_and_convert(self, text):
131 |         '''
132 |         Write the given text to our wrapped stream, stripping any ANSI
133 |         sequences from the text, and optionally converting them into win32
134 |         calls.
135 |         '''
136 |         cursor = 0
137 |         for match in self.ANSI_RE.finditer(text):
138 |             start, end = match.span()
139 |             self.write_plain_text(text, cursor, start)
140 |             self.convert_ansi(*match.groups())
141 |             cursor = end
142 |         self.write_plain_text(text, cursor, len(text))
143 | 
144 | 
145 |     def write_plain_text(self, text, start, end):
146 |         if start < end:
147 |             self.wrapped.write(text[start:end])
148 |             self.wrapped.flush()
149 | 
150 | 
151 |     def convert_ansi(self, paramstring, command):
152 |         if self.convert:
153 |             params = self.extract_params(paramstring)
154 |             self.call_win32(command, params)
155 | 
156 | 
157 |     def extract_params(self, paramstring):
158 |         def split(paramstring):
159 |             for p in paramstring.split(';'):
160 |                 if p != '':
161 |                     yield int(p)
162 |         return tuple(split(paramstring))
163 | 
164 | 
165 |     def call_win32(self, command, params):
166 |         if params == []:
167 |             params = [0]
168 |         if command == 'm':
169 |             for param in params:
170 |                 if param in self.win32_calls:
171 |                     func_args = self.win32_calls[param]
172 |                     func = func_args[0]
173 |                     args = func_args[1:]
174 |                     kwargs = dict(on_stderr=self.on_stderr)
175 |                     func(*args, **kwargs)
176 | 
177 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/initialise.py:
--------------------------------------------------------------------------------
 1 | import atexit
 2 | import sys
 3 | 
 4 | from .ansitowin32 import AnsiToWin32
 5 | 
 6 | 
 7 | orig_stdout = sys.stdout
 8 | orig_stderr = sys.stderr
 9 | 
10 | atexit_done = False
11 | 
12 | 
def reset_all():
    """Reset colors on the stdout stream that was captured at import time."""
    wrapper = AnsiToWin32(orig_stdout)
    wrapper.reset_all()
15 | 
16 | 
def init(autoreset=False, convert=None, strip=None, wrap=True):
    """Replace sys.stdout and sys.stderr with wrapped versions of the originals.

    Raises ValueError when wrap is False but autoreset, convert or strip
    is True, since those options only take effect through the wrapper.
    On the first call, reset_all is also registered to run at exit.
    """
    global atexit_done

    feature_requested = any(
        flag == True for flag in (autoreset, convert, strip))
    if wrap == False and feature_requested:
        raise ValueError('wrap=False conflicts with any other arg=True')

    # Both streams are always derived from the originals captured at import
    # time, not from whatever sys.stdout/sys.stderr currently are.
    sys.stdout = wrap_stream(orig_stdout, convert, strip, autoreset, wrap)
    sys.stderr = wrap_stream(orig_stderr, convert, strip, autoreset, wrap)

    if atexit_done:
        return
    atexit.register(reset_all)
    atexit_done = True
29 | 
30 | 
def wrap_stream(stream, convert, strip, autoreset, wrap):
    """Return the AnsiToWin32 proxy for *stream*, or *stream* itself.

    The stream is returned unchanged when wrap is false or when the
    wrapper reports through should_wrap() that it is not needed.
    """
    if not wrap:
        return stream
    wrapper = AnsiToWin32(
        stream, convert=convert, strip=strip, autoreset=autoreset)
    return wrapper.stream if wrapper.should_wrap() else stream
38 | 
39 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/win32.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # from winbase.h
 3 | STDOUT = -11
 4 | STDERR = -12
 5 | 
 6 | try:
 7 |     from ctypes import windll
 8 | except ImportError:
 9 |     windll = None
10 |     SetConsoleTextAttribute = lambda *_: None
11 | else:
12 |     from ctypes import (
13 |         byref, Structure, c_char, c_short, c_uint32, c_ushort
14 |     )
15 | 
16 |     handles = {
17 |         STDOUT: windll.kernel32.GetStdHandle(STDOUT),
18 |         STDERR: windll.kernel32.GetStdHandle(STDERR),
19 |     }
20 | 
21 |     SHORT = c_short
22 |     WORD = c_ushort
23 |     DWORD = c_uint32
24 |     TCHAR = c_char
25 | 
    class COORD(Structure):
        """struct in wincon.h"""
        # Two signed 16-bit fields; the order of _fields_ defines the layout.
        _fields_ = [
            ('X', SHORT),
            ('Y', SHORT),
        ]
32 | 
    class  SMALL_RECT(Structure):
        """struct in wincon.h."""
        # Four signed 16-bit fields; the order of _fields_ defines the layout.
        _fields_ = [
            ("Left", SHORT),
            ("Top", SHORT),
            ("Right", SHORT),
            ("Bottom", SHORT),
        ]
41 | 
42 |     class CONSOLE_SCREEN_BUFFER_INFO(Structure):
43 |         """struct in wincon.h."""
44 |         _fields_ = [
45 |             ("dwSize", COORD),
46 |             ("dwCursorPosition", COORD),
47 |             ("wAttributes", WORD),
48 |             ("srWindow", SMALL_RECT),
49 |             ("dwMaximumWindowSize", COORD),
50 |         ]
51 | 
52 |     def GetConsoleScreenBufferInfo(stream_id):
53 |         handle = handles[stream_id]
54 |         csbi = CONSOLE_SCREEN_BUFFER_INFO()
55 |         success = windll.kernel32.GetConsoleScreenBufferInfo(
56 |             handle, byref(csbi))
57 |         # This fails when imported via setup.py when installing using 'pip'
58 |         # presumably the fix is that running setup.py should not trigger all
59 |         # this activity.
60 |         # assert success
61 |         return csbi
62 | 
63 |     def SetConsoleTextAttribute(stream_id, attrs):
64 |         handle = handles[stream_id]
65 |         success = windll.kernel32.SetConsoleTextAttribute(handle, attrs)
66 |         assert success
67 | 
68 |     def SetConsoleCursorPosition(stream_id, position):
69 |         handle = handles[stream_id]
70 |         position = COORD(*position)
71 |         success = windll.kernel32.SetConsoleCursorPosition(handle, position)
72 |         assert success
73 | 
74 |     def FillConsoleOutputCharacter(stream_id, char, length, start):
75 |         handle = handles[stream_id]
76 |         char = TCHAR(char)
77 |         length = DWORD(length)
78 |         start = COORD(*start)
79 |         num_written = DWORD(0)
80 |         # AttributeError: function 'FillConsoleOutputCharacter' not found
81 |         # could it just be that my types are wrong?
82 |         success = windll.kernel32.FillConsoleOutputCharacter(
83 |             handle, char, length, start, byref(num_written))
84 |         assert success
85 |         return num_written.value
86 | 
87 | 
if __name__=='__main__':
    # Manual smoke test: dump every field of the stdout console info.
    info = GetConsoleScreenBufferInfo(STDOUT)
    for field_name in ('dwSize', 'dwCursorPosition', 'wAttributes',
                       'srWindow', 'dwMaximumWindowSize'):
        print(getattr(info, field_name))
95 | 
96 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/colorama/winterm.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from . import win32
 3 | 
 4 | 
 5 | # from wincon.h
 6 | class WinColor(object):
 7 |     BLACK   = 0
 8 |     BLUE    = 1
 9 |     GREEN   = 2
10 |     CYAN    = 3
11 |     RED     = 4
12 |     MAGENTA = 5
13 |     YELLOW  = 6
14 |     GREY    = 7
15 | 
16 | # from wincon.h
class WinStyle(object):
    """Intensity flag values that get combined with the colour codes."""
    NORMAL = 0       # dim text, dim background
    BRIGHT = 1 << 3  # bright text, dim background
20 | 
21 | 
class WinTerm(object):
    """Tracks foreground, background and style and pushes them to the console.

    The attribute word is split as: bits 0-2 foreground colour, bit 3 the
    ``WinStyle.BRIGHT`` flag, bits 4-6 background colour.
    """

    def __init__(self):
        # Remember the attributes the console had at construction time so
        # they can be restored later.
        self._default = \
            win32.GetConsoleScreenBufferInfo(win32.STDOUT).wAttributes
        self.set_attrs(self._default)
        self._default_fore = self._fore
        self._default_back = self._back
        self._default_style = self._style

    def get_attrs(self):
        """Return the combined attribute word for the current state."""
        return self._fore + self._back * 16 + self._style

    def set_attrs(self, value):
        """Split attribute word ``value`` into fore, back and style."""
        self._fore = value & 7
        self._back = (value >> 4) & 7
        self._style = value & WinStyle.BRIGHT

    def reset_all(self, on_stderr=None):
        """Restore the attributes captured at construction time.

        ``on_stderr`` selects the stderr console handle. It is forwarded
        to ``set_console``; previously it was accepted but ignored, so the
        reset always went to the stdout handle.
        """
        self.set_attrs(self._default)
        self.set_console(attrs=self._default, on_stderr=on_stderr)

    def fore(self, fore=None, on_stderr=False):
        """Set the foreground colour (None restores the initial one)."""
        if fore is None:
            fore = self._default_fore
        self._fore = fore
        self.set_console(on_stderr=on_stderr)

    def back(self, back=None, on_stderr=False):
        """Set the background colour (None restores the initial one)."""
        if back is None:
            back = self._default_back
        self._back = back
        self.set_console(on_stderr=on_stderr)

    def style(self, style=None, on_stderr=False):
        """Set the style flag (None restores the initial one)."""
        if style is None:
            style = self._default_style
        self._style = style
        self.set_console(on_stderr=on_stderr)

    def set_console(self, attrs=None, on_stderr=False):
        """Apply ``attrs`` (default: current state) to stdout or stderr."""
        if attrs is None:
            attrs = self.get_attrs()
        handle = win32.STDOUT
        if on_stderr:
            handle = win32.STDERR
        win32.SetConsoleTextAttribute(handle, attrs)
69 | 
70 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/README.specs.mkd:
--------------------------------------------------------------------------------
 1 | # UNPACKERS SPECIFICATIONS
 2 | 
 3 | Nothing very difficult: an unpacker is a submodule placed in the directory
 4 | where this file was found. Each unpacker must define three symbols:
 5 | 
 6 |  * `PRIORITY`       : integer number expressing the priority in applying this
 7 |                       unpacker. Lower number means higher priority.
 8 |                       Makes sense only if a source file has been packed with
 9 |                       more than one packer.
10 |  * `detect(source)` : returns `True` if source is packed, otherwise, `False`.
11 |  * `unpack(source)` : takes a `source` string and unpacks it. Must always return
12 |                       valid JavaScript. That is to say, your code should look
13 |                       like:
14 | 
15 | ```
16 | if detect(source):
17 |     return do_your_fancy_things_with(source)
18 | else:
19 |     return source
20 | ```
21 | 
22 | *You can safely define any other symbol in your module, as it will be ignored.*
23 | 
24 | `__init__` code will automatically load new unpackers, without any further step
25 | to be accomplished. Simply drop it in this directory.
26 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # General code for JSBeautifier unpackers infrastructure. See README.specs
 3 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 4 | #
 5 | 
 6 | """General code for JSBeautifier unpackers infrastructure."""
 7 | 
 8 | import pkgutil
 9 | import re
10 | #from jsbeautifier.unpackers import evalbased
11 | import evalbased
12 | 
13 | # NOTE: AT THE MOMENT, IT IS DEACTIVATED FOR YOUR SECURITY: it runs js!
14 | BLACKLIST = ['jsbeautifier.unpackers.evalbased']
15 | 
class UnpackingError(Exception):
    """Raised for badly packed source or any general unpacking failure.

    The exception argument is a meaningful description of the problem.
    """
20 | 
def getunpackers():
    """Import every unpacker module of this package and return them sorted.

    Modules whose dotted name contains 'tests' or appears in BLACKLIST are
    skipped. The result is ordered by each module's PRIORITY attribute
    (lowest first).

    Raises:
        UnpackingError: if a candidate module cannot be imported.
    """
    interface = ['unpack', 'detect', 'PRIORITY']
    loaded = []
    candidates = pkgutil.iter_modules(__path__, __name__ + '.')
    for _importer, modname, _ispkg in candidates:
        if 'tests' in modname or modname in BLACKLIST:
            continue
        try:
            module = __import__(modname, fromlist=interface)
        except ImportError:
            raise UnpackingError('Bad unpacker: %s' % modname)
        loaded.append(module)

    loaded.sort(key=lambda mod: mod.PRIORITY)
    return loaded
40 | 
41 | #UNPACKERS = getunpackers()
42 | UNPACKERS = []
43 | 
def run(source, evalcode=False):
    """Apply every matching unpacker to `source` and return the result.

    Detection happens once, against the incoming source, before any
    unpacker runs. When `evalcode` is true the eval-based unpacker is
    tried afterwards on the (possibly already unpacked) source.
    """
    applicable = []
    for mod in UNPACKERS:
        if mod.detect(source):
            applicable.append(mod)

    for unpacker in applicable:
        source = unpacker.unpack(source)

    if not evalcode:
        return source
    if evalbased.detect(source):
        return evalbased.unpack(source)
    return source
51 | 
def filtercomments(source):
    """NOT USED: strips the leading comments and puts them back at the top.

    Leading ``/* ... */`` blocks and ``// ...`` lines are peeled off one
    after the other, the whitespace that follows each is dropped, and the
    comments are re-emitted one per line ahead of the remaining source.
    Source that does not start with a comment is returned unchanged. An
    unterminated ``/*`` ends the scan and is left where it is.
    """
    trailing_comments = []

    while True:
        opener = re.search(r'^\s*\/\*', source)
        if opener:
            # Look for the terminator after the opener so that '/*/' is
            # not mistaken for a complete comment.
            end = source.find('*/', opener.end())
            if end < 0:
                break
            comment = source[:end + 2]
        else:
            # Take the whole line, not only the '//' marker.
            line = re.search(r'^\s*\/\/.*', source)
            if line is None:
                break
            comment = line.group(0)

        source = re.sub(r'^\s+', '', source[len(comment):])
        trailing_comments.append(comment)

    # Each comment gets its own line so that a '//' comment cannot swallow
    # the code that follows it.
    return ''.join(comment + '\n' for comment in trailing_comments) + source
70 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/evalbased.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Unpacker for eval() based packers, a part of javascript beautifier
 3 | # by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #
 5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 6 | #
 7 | # usage:
 8 | #
 9 | # if detect(some_string):
10 | #     unpacked = unpack(some_string)
11 | #
12 | 
13 | """Unpacker for eval() based packers: runs JS code and returns result.
14 | Works only if a JS interpreter (e.g. Mozilla's Rhino) is installed and
15 | properly set up on host."""
16 | 
17 | from subprocess import PIPE, Popen
18 | 
19 | PRIORITY = 3
20 | 
def detect(source):
    """Tell whether `source` looks like an eval() packed script."""
    normalized = source.strip().lower()
    return normalized.startswith('eval(function(')
24 | 
def unpack(source):
    """Run `source` in the JS interpreter and return the resulting code.

    Source that `detect` does not recognise is returned unchanged.
    Otherwise the leading ``eval`` is replaced by ``print`` and the script
    is handed to `jseval`.
    """
    if not detect(source):
        return source
    # detect() ignores leading whitespace, so drop it before cutting off
    # the four characters of 'eval'.
    return jseval('print %s;' % source.lstrip()[4:])
28 | 
29 | # In case of failure, we'll just return the original, without crashing on user.
def jseval(script):
    """Run `script` in the ``js`` interpreter and return its output.

    `script` itself is returned when the interpreter cannot be started,
    exits with a non-zero status, or writes anything to stderr.
    """
    try:
        # stderr must be piped too; otherwise communicate() always yields
        # None for it and the error check below can never fire.
        interpreter = Popen(['js'], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    except OSError:
        return script
    result, errors = interpreter.communicate(script)
    if interpreter.poll() or errors:
        return script
    return result
40 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/javascriptobfuscator.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # simple unpacker/deobfuscator for scripts messed up with
 3 | # javascriptobfuscator.com
 4 | #
 5 | #     written by Einar Lielmanis <einar@jsbeautifier.org>
 6 | #     rewritten in Python by Stefano Sanfilippo <a.little.coder@gmail.com>
 7 | #
 8 | # Will always return valid javascript: if `detect()` is false, `code` is
 9 | # returned, unmodified.
10 | #
11 | # usage:
12 | #
13 | # if javascriptobfuscator.detect(some_string):
14 | #     some_string = javascriptobfuscator.unpack(some_string)
15 | #
16 | 
17 | """deobfuscator for scripts messed up with JavascriptObfuscator.com"""
18 | 
19 | import re
20 | 
21 | PRIORITY = 1
22 | 
def smartsplit(code):
    """Split `code` at " symbol, only if it is not escaped.

    Returns the double-quoted strings found in `code`, each re-wrapped in
    double quotes with its backslash escapes kept verbatim. Text outside
    quotes is ignored. A string left open at the end of `code` (including
    one that ends in a lone backslash) is returned with what was read.
    """
    strings = []
    length = len(code)
    pos = 0
    while pos < length:
        if code[pos] == '"':
            chars = []  # characters of the string being read
            pos += 1
            while pos < length and code[pos] != '"':
                if code[pos] == '\\':
                    chars.append('\\')
                    pos += 1
                    # A backslash as the very last character has nothing
                    # to escape; stop instead of indexing past the end.
                    if pos >= length:
                        break
                chars.append(code[pos])
                pos += 1
            strings.append('"%s"' % ''.join(chars))
        pos += 1
    return strings
42 | 
def detect(code):
    """Tell whether `code` starts like JavascriptObfuscator.com output."""
    found = re.search(r'^var _0x[a-f0-9]+ ?\= ?\[', code)
    # Compare against None so that a real boolean is returned.
    return found is not None
47 | 
def unpack(code):
    """Undo JavascriptObfuscator.com packing.

    The string table declared in the ``var _0x...=[...];`` statement is
    read, that statement's length is cut from the front of `code`, and
    every ``_0x...[n]`` reference is replaced by the n-th table string.
    Code that is not detected as packed is returned unchanged.
    """
    if not detect(code):
        return code

    table = re.search(r'var (_0x[a-f\d]+) ?\= ?\[(.*?)\];', code)
    if not table:
        return code

    name = table.group(1)
    entries = smartsplit(table.group(2))
    code = code[len(table.group(0)):]
    for index, literal in enumerate(entries):
        code = code.replace(r'%s[%s]' % (name, index), literal)
    return code
59 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/myobfuscate.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # deobfuscator for scripts messed up with myobfuscate.com
 3 | # by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #
 5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
 6 | #
 7 | # usage:
 8 | #
 9 | # if detect(some_string):
10 | #     unpacked = unpack(some_string)
11 | #
12 | 
13 | # CAVEAT by Einar Lielmanis
14 | 
15 | #
16 | # You really don't want to obfuscate your scripts there: they're tracking
17 | # your unpackings, your script gets turned into something like this,
18 | # as of 2011-08-26:
19 | #
20 | #   var _escape = 'your_script_escaped';
21 | #   var _111 = document.createElement('script');
22 | #   _111.src = 'http://api.www.myobfuscate.com/?getsrc=ok' +
23 | #              '&ref=' + encodeURIComponent(document.referrer) +
24 | #              '&url=' + encodeURIComponent(document.URL);
25 | #   var 000 = document.getElementsByTagName('head')[0];
26 | #   000.appendChild(_111);
27 | #   document.write(unescape(_escape));
28 | #
29 | 
30 | """Deobfuscator for scripts messed up with MyObfuscate.com"""
31 | 
32 | import re
33 | import base64
34 | 
35 | # Python 2 retrocompatibility
36 | # pylint: disable=F0401
37 | # pylint: disable=E0611
38 | try:
39 |     from urllib import unquote
40 | except ImportError:
41 |     from urllib.parse import unquote
42 | 
43 | from jsbeautifier.unpackers import UnpackingError
44 | 
45 | PRIORITY = 1
46 | 
47 | CAVEAT = """//
48 | // Unpacker warning: be careful when using myobfuscate.com for your projects:
49 | // scripts obfuscated by the free online version call back home.
50 | //
51 | 
52 | """
53 | 
54 | SIGNATURE = (r'["\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F'
55 |              r'\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x61\x62\x63\x64\x65'
56 |              r'\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F\x70\x71\x72\x73\x74\x75'
57 |              r'\x76\x77\x78\x79\x7A\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x2B'
58 |              r'\x2F\x3D","","\x63\x68\x61\x72\x41\x74","\x69\x6E\x64\x65\x78'
59 |              r'\x4F\x66","\x66\x72\x6F\x6D\x43\x68\x61\x72\x43\x6F\x64\x65","'
60 |              r'\x6C\x65\x6E\x67\x74\x68"]')
61 | 
def detect(source):
    """Tell whether `source` contains the MyObfuscate.com signature."""
    has_signature = SIGNATURE in source
    return has_signature
65 | 
def unpack(source):
    """Unpack js code packed with MyObfuscate.com.

    Unrecognised source is returned unchanged. Otherwise the decoded
    payload is searched for the ``<script>`` body; the CAVEAT banner is
    prepended to that body, or to the original source when it is missing.
    """
    if not detect(source):
        return source

    payload = unquote(_filter(source))
    match = re.search(r"^var _escape\='<script>(.*)<\/script>'",
                      payload, re.DOTALL)
    if match:
        polished = match.group(1)
    else:
        polished = source
    return CAVEAT + polished
75 | 
76 | def _filter(source):
77 |     """Extracts and decode payload (original file) from `source`"""
78 |     try:
79 |         varname = re.search(r'eval\(\w+\(\w+\((\w+)\)\)\);', source).group(1)
80 |         reverse = re.search(r"var +%s *\= *'(.*)';" % varname, source).group(1)
81 |     except AttributeError:
82 |         raise UnpackingError('Malformed MyObfuscate data.')
83 |     try:
84 |         return base64.b64decode(reverse[::-1].encode('utf8')).decode('utf8')
85 |     except TypeError:
86 |         raise UnpackingError('MyObfuscate payload is not base64-encoded.')
87 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/packer.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Unpacker for Dean Edward's p.a.c.k.e.r, a part of javascript beautifier
  3 | # by Einar Lielmanis <einar@jsbeautifier.org>
  4 | #
  5 | #     written by Stefano Sanfilippo <a.little.coder@gmail.com>
  6 | #
  7 | # usage:
  8 | #
  9 | # if detect(some_string):
 10 | #     unpacked = unpack(some_string)
 11 | #
 12 | 
 13 | """Unpacker for Dean Edward's p.a.c.k.e.r"""
 14 | 
 15 | import re
 16 | import string
 17 | from jsbeautifier.unpackers import UnpackingError
 18 | 
 19 | PRIORITY = 1
 20 | 
 21 | def detect(source):
 22 |     """Detects whether `source` is P.A.C.K.E.R. coded."""
 23 |     return source.replace(' ', '').startswith('eval(function(p,a,c,k,e,r')
 24 | 
 25 | def unpack(source):
 26 |     """Unpacks P.A.C.K.E.R. packed js code."""
 27 |     payload, symtab, radix, count = _filterargs(source)
 28 | 
 29 |     if count != len(symtab):
 30 |         raise UnpackingError('Malformed p.a.c.k.e.r. symtab.')
 31 | 
 32 |     try:
 33 |         unbase = Unbaser(radix)
 34 |     except TypeError:
 35 |         raise UnpackingError('Unknown p.a.c.k.e.r. encoding.')
 36 | 
 37 |     def lookup(match):
 38 |         """Look up symbols in the synthetic symtab."""
 39 |         word  = match.group(0)
 40 |         return symtab[unbase(word)] or word
 41 | 
 42 |     source = re.sub(r'\b\w+\b', lookup, payload)
 43 |     return _replacestrings(source)
 44 | 
 45 | def _filterargs(source):
 46 |     """Juice from a source file the four args needed by decoder."""
 47 |     argsregex = (r"}\('(.*)', *(\d+), *(\d+), *'(.*)'\."
 48 |                  r"split\('\|'\), *(\d+), *(.*)\)\)")
 49 |     args = re.search(argsregex, source, re.DOTALL).groups()
 50 | 
 51 |     try:
 52 |         return args[0], args[3].split('|'), int(args[1]), int(args[2])
 53 |     except ValueError:
 54 |         raise UnpackingError('Corrupted p.a.c.k.e.r. data.')
 55 | 
 56 | def _replacestrings(source):
 57 |     """Strip string lookup table (list) and replace values in source."""
 58 |     match = re.search(r'var *(_\w+)\=\["(.*?)"\];', source, re.DOTALL)
 59 | 
 60 |     if match:
 61 |         varname, strings = match.groups()
 62 |         startpoint = len(match.group(0))
 63 |         lookup = strings.split('","')
 64 |         variable = '%s[%%d]' % varname
 65 |         for index, value in enumerate(lookup):
 66 |             source = source.replace(variable % index, '"%s"' % value)
 67 |         return source[startpoint:]
 68 |     return source
 69 | 
 70 | 
 71 | class Unbaser(object):
 72 |     """Functor for a given base. Will efficiently convert
 73 |     strings to natural numbers."""
 74 |     ALPHABET  = {
 75 |         62 : '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
 76 |         95 : (' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 77 |               '[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')
 78 |     }
 79 | 
 80 |     def __init__(self, base):
 81 |         self.base = base
 82 | 
 83 |         # If base can be handled by int() builtin, let it do it for us
 84 |         if 2 <= base <= 36:
 85 |             self.unbase = lambda string: int(string, base)
 86 |         else:
 87 |             # Build conversion dictionary cache
 88 |             try:
 89 |                 self.dictionary = dict((cipher, index) for
 90 |                     index, cipher in enumerate(self.ALPHABET[base]))
 91 |             except KeyError:
 92 |                 raise TypeError('Unsupported base encoding.')
 93 | 
 94 |             self.unbase = self._dictunbaser
 95 | 
 96 |     def __call__(self, string):
 97 |         return self.unbase(string)
 98 | 
 99 |     def _dictunbaser(self, string):
100 |         """Decodes a  value to an integer."""
101 |         ret = 0
102 |         for index, cipher in enumerate(string[::-1]):
103 |             ret += (self.base ** index) * self.dictionary[cipher]
104 |         return ret
105 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/jsbeautifier/unpackers/urlencode.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Trivial bookmarklet/escaped script detector for the javascript beautifier
 3 | #     written by Einar Lielmanis <einar@jsbeautifier.org>
 4 | #     rewritten in Python by Stefano Sanfilippo <a.little.coder@gmail.com>
 5 | #
 6 | # Will always return valid javascript: if `detect()` is false, `code` is
 7 | # returned, unmodified.
 8 | #
 9 | # usage:
10 | #
11 | # some_string = urlencode.unpack(some_string)
12 | #
13 | 
14 | """Bookmarklet/escaped script unpacker."""
15 | 
16 | # Python 2 retrocompatibility
17 | # pylint: disable=F0401
18 | # pylint: disable=E0611
19 | try:
20 |     from urllib import unquote_plus
21 | except ImportError:
22 |     from urllib.parse import unquote_plus
23 | 
24 | PRIORITY = 0
25 | 
def detect(code):
    """Tell whether a scriptlet looks urlencoded."""
    # A script containing a literal space is taken as not encoded.
    if ' ' in code:
        return False
    # Otherwise an encoded space, or more than three '%' signs, is
    # considered a sufficient check for now.
    return '%20' in code or code.count('%') > 3
31 | 
def unpack(code):
    """URL-decode `code` when it is detected as encoded, else return it as is."""
    if not detect(code):
        return code
    return unquote_plus(code)
35 | 


--------------------------------------------------------------------------------
/lib/parse/peepdf/peepdf.dtd:
--------------------------------------------------------------------------------
  1 | <!ELEMENT peepdf_analysis ( date, basic, advanced ) >
  2 | <!ATTLIST peepdf_analysis author CDATA #REQUIRED >
  3 | <!ATTLIST peepdf_analysis url CDATA #REQUIRED >
  4 | <!ATTLIST peepdf_analysis version CDATA #REQUIRED >
  5 | 
  6 | 
  7 | <!ELEMENT date ( #PCDATA ) >
  8 | 
  9 | 
 10 | <!ELEMENT basic ( filename, md5, sha1, sha256, size, detection, pdf_version, binary, linearized, encrypted, updates, num_objects, num_streams, comments, errors ) >
 11 | 
 12 | <!ELEMENT filename ( #PCDATA ) >
 13 | 
 14 | <!ELEMENT md5 ( #PCDATA ) >
 15 | 
 16 | <!ELEMENT sha1 ( #PCDATA ) >
 17 | 
 18 | <!ELEMENT sha256 ( #PCDATA ) >
 19 | 
 20 | <!ELEMENT size ( #PCDATA ) >
 21 | 
 22 | <!ELEMENT detection ( rate?, report_link? ) >
 23 | 
 24 | <!ELEMENT rate ( #PCDATA ) >
 25 | 
 26 | <!ELEMENT report_link ( #PCDATA ) >
 27 | 
 28 | <!ELEMENT pdf_version ( #PCDATA ) >
 29 | 
 30 | <!ELEMENT binary EMPTY >
 31 | <!ATTLIST binary status ( false | true ) #REQUIRED >
 32 | 
 33 | <!ELEMENT linearized EMPTY >
 34 | <!ATTLIST linearized status ( false | true ) #REQUIRED >
 35 | 
 36 | <!ELEMENT encrypted ( algorithms? ) >
 37 | <!ATTLIST encrypted status ( false | true ) #REQUIRED >
 38 | 
 39 | <!ELEMENT algorithms ( algorithm+ ) >
 40 | 
 41 | <!ELEMENT algorithm ( #PCDATA ) >
 42 | <!ATTLIST algorithm bits NMTOKEN #REQUIRED >
 43 | 
 44 | <!ELEMENT updates ( #PCDATA ) >
 45 | 
 46 | <!ELEMENT num_objects ( #PCDATA ) >
 47 | 
 48 | <!ELEMENT num_streams ( #PCDATA ) >
 49 | 
 50 | <!ELEMENT comments ( #PCDATA ) >
 51 | 
 52 | <!ELEMENT errors ( error_message* ) >
 53 | <!ATTLIST errors num NMTOKEN #REQUIRED >
 54 | 
 55 | <!ELEMENT error_message ( #PCDATA ) >
 56 | 
 57 | 
 58 | <!ELEMENT advanced ( version* ) >
 59 | 
 60 | <!ELEMENT version ( catalog, info, objects, streams ,js_objects, suspicious_elements, suspicious_urls ) >
 61 | <!ATTLIST version num NMTOKEN #REQUIRED >
 62 | <!ATTLIST version type ( original | update ) #REQUIRED >
 63 | 
 64 | <!ELEMENT catalog EMPTY >
 65 | <!ATTLIST catalog object_id NMTOKEN #IMPLIED >
 66 | 
 67 | <!ELEMENT info EMPTY >
 68 | <!ATTLIST info object_id NMTOKEN #IMPLIED >
 69 | 
 70 | <!ELEMENT objects ( object* ) >
 71 | <!ATTLIST objects num NMTOKEN #REQUIRED >
 72 | 
 73 | <!ELEMENT object EMPTY >
 74 | <!ATTLIST object errors ( false | true ) #IMPLIED >
 75 | <!ATTLIST object compressed ( false | true ) #IMPLIED >
 76 | <!ATTLIST object id NMTOKEN #REQUIRED >
 77 | 
 78 | <!ELEMENT streams ( stream* ) >
 79 | <!ATTLIST streams num NMTOKEN #REQUIRED >
 80 | 
 81 | <!ELEMENT stream EMPTY >
 82 | <!ATTLIST stream encoded ( false | true ) #IMPLIED >
 83 | <!ATTLIST stream id NMTOKEN #REQUIRED >
 84 | <!ATTLIST stream object_stream ( false | true ) #IMPLIED >
 85 | <!ATTLIST stream xref_stream ( false | true ) #IMPLIED >
 86 | <!ATTLIST stream decoding_errors ( false | true ) #IMPLIED >
 87 | 
 88 | <!ELEMENT js_objects ( container_object* ) >
 89 | 
 90 | <!ELEMENT container_object EMPTY >
 91 | <!ATTLIST container_object id NMTOKEN #REQUIRED >
 92 | 
 93 | <!ELEMENT suspicious_elements ( triggers?, actions?, elements?, js_vulns? ) >
 94 | 
 95 | <!ELEMENT triggers ( trigger* ) >
 96 | 
 97 | <!ELEMENT trigger ( container_object+ ) >
 98 | <!ATTLIST trigger name CDATA #REQUIRED >
 99 | 
100 | <!ELEMENT actions ( action* ) >
101 | 
102 | <!ELEMENT action ( container_object+ ) >
103 | <!ATTLIST action name CDATA #REQUIRED >
104 | 
105 | <!ELEMENT elements ( element* ) >
106 | 
107 | <!ELEMENT element ( cve*, container_object+ ) >
108 | <!ATTLIST element name CDATA #REQUIRED >
109 | 
110 | <!ELEMENT cve ( #PCDATA ) >
111 | 
112 | <!ELEMENT js_vulns ( vulnerable_function* ) >
113 | 
114 | <!ELEMENT vulnerable_function ( cve*, container_object+ ) >
115 | <!ATTLIST vulnerable_function name CDATA #REQUIRED >
116 | 
117 | <!ELEMENT suspicious_urls ( url* ) >
118 | 
119 | <!ELEMENT url ( #PCDATA ) >
120 | 


--------------------------------------------------------------------------------
/lib/spectragraph/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'honey'
2 | 


--------------------------------------------------------------------------------
/lib/spectragraph/conversion.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     Contains functions for the conversion of the parsed edge list to the different graph algorithms.
 3 | """
 4 | 
 5 | """
 6 |     Create a normalized edge list where vertices are numbered 0-n rather than using object numbers
 7 | """
 8 | def normalize_edge_list (e, v = None):
 9 |     """ the normalized edge list """
10 |     n = []
11 |     
12 |     """
13 |         Get the unique vertices of the graph. The index of a vertex in this 
14 |         list will be used as its normalized label.
15 |     """
16 |     if not v:
17 |         print "not v"
18 |         v = find_v_list (e)
19 | 
20 |     for a, b in e:
21 |         n.append((v.index(a), v.index(b)))
22 | 
23 |     return n
24 | 
def find_v_list (e) :
    """Return the distinct vertices of edge list `e` in first-seen order."""
    vertices = []
    for source, target in e:
        for vertex in (source, target):
            if vertex not in vertices:
                vertices.append(vertex)
    return vertices
34 | 
35 | 
36 | 
# Quick manual test: normalize a small edge list and print the result.
if __name__ == "__main__":
    e = [(0,1), (1,3), (0, 5)]
    v = find_v_list(e)
    # Pass the vertex list that was just computed (it used to be unused);
    # the parenthesized print works as a statement and as a function call.
    print(normalize_edge_list(e, v))


--------------------------------------------------------------------------------
/lib/spectragraph/matrix.py:
--------------------------------------------------------------------------------
 1 | from array import array
 2 | 
 3 | 
 4 | class SimpleMatrix(object):
 5 | 
 6 |     def __init__(self, r, c=None):
 7 |         if not c:
 8 |             c = r
 9 |         self.rows = r
10 |         self.cols = c
11 |         self.size = self.rows * self.cols
12 |         self.elements = [array('B', (0 for col in range(self.cols))) for row in range(self.rows)]
13 | 
14 |     def grow(self, r, c=None):
15 |         if not c:
16 |             c = r
17 |         self.rows += r
18 |         self.cols += c
19 |         for row in range(self.rows):
20 |             try:
21 |                 self.elements[row].extend([0 for i in range(c)])
22 |             except IndexError:
23 |                 self.elements.append(array('B', (0 for col in range(self.cols))))
24 | 
25 |     def __len__(self):
26 |         return self.size
27 | 
28 |     def __getitem__(self, key):
29 |         return self.elements[key]
30 | 
31 |     def __setitem__(self, key, value):
32 |         self.elements[key] = value
33 | 
34 |     def __delitem__(self, key):
35 |         self.elements[key] = array('B', [0 for col in range(self.cols)])
36 | 
37 |     def __iter__(self):
38 |         return iter(self.elements)
39 | 
40 |     def __str__(self):
41 |         rv = ["  %s" % ' '.join([str(i) for i in range(self.cols)])]
42 |         for idx, row in enumerate(self.elements):
43 |             rv.append("%d %s" % (idx, ' '.join([str(i) for i in row])))
44 |         return '\n'.join(rv)
45 | 
46 | 
# Manual self-test and micro-benchmark; runs only when the module is
# executed as a script.
if __name__ == "__main__":
    from timeit import timeit
    from random import randint

    m = SimpleMatrix(100)

    # size and len() must agree.
    print "Size:", m.size
    print " Len:", len(m)
    assert(m.size == len(m))

    # Fill every cell with a random byte value and read it back.
    for i in range(100):
        for j in range(100):
            r = randint(0, 255)
            m[i][j] = r
            assert (m[i][j] == r)

    # Dump the matrix, one row per output line.
    for i in range(100):
        for j in range(100):
            print m[i][j],
        print

    # Deleting a row replaces it with zeros.
    for i in range(100):
        del m[i]

    # Every cell must now be zero.
    for row in m:
        for col in row:
            print col,
            assert(not col)
        print

    def create_matrix(r, c):
        # Construction only; the instance is discarded.
        SimpleMatrix(r, c)

    # Time 10 constructions each for sizes 0, 10, 20, 30 and 40.
    for i in range(5):
        i *= 10
        print timeit("create_matrix(%d, %d)" % (i, i), setup="from __main__ import create_matrix", number=10)
83 | 
84 | 


--------------------------------------------------------------------------------
/logs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cmu-sei/nabu/3afcab20a5ddd8a9b984d8f34756ebedfc0b45a9/logs/__init__.py


--------------------------------------------------------------------------------
/process/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/process/hashers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/process/hashers/hasher.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | 
 15 | class Hasher(multiprocessing.Process):
 16 |     """
 17 |     Hashers generally make hashes of things
 18 |     """
 19 | 
 20 |     def __init__(self, qin, qout, counter, debug):
 21 |         multiprocessing.Process.__init__(self)
 22 |         self.qin = qin
 23 |         self.qout = qout
 24 |         self.counter = counter
 25 |         self.debug = debug
 26 | 
 27 |     '''
 28 |     This loop is the main process of the hasher. It is automatically called
 29 |     when you call multiprocessing.Process.start()
 30 | 
 31 |     All variables should be local to the loop, and returned as strings
 32 |     suitable for inserting into the database.
 33 |     '''
 34 | 
 35 |     def run(self):
 36 |         while True:
 37 |             pdf = self.qin.get()
 38 |             if not pdf:
 39 |                 '''
 40 |                 This terminates the process by receiving a poison sentinel, None.
 41 |                 '''
 42 |                 self.qout.put(None)
 43 |                 # self.qin.task_done()
 44 |                 return 0
 45 | 
 46 |             '''
 47 |             Reset the values on each pdf.
 48 |             '''
 49 |             err = []
 50 |             urls = ''
 51 |             t_hash = ''
 52 |             t_str = ''
 53 |             graph = ''
 54 |             obf_js = ''
 55 |             de_js = ''
 56 |             obf_js_sdhash = ''
 57 |             de_js_sdhash = ''
 58 |             swf_sdhash = ''
 59 |             swf = ''
 60 |             fsize = ''
 61 |             pdfsize = ''
 62 |             bin_blob = ''
 63 |             malformed = {}
 64 | 
 65 |             '''
 66 |             Arguments are validated when Jobber adds them to the queue based
 67 |             on the Validators valid() return value. We can assume these will
 68 |             succeed. However, this process must reach the task_done() call,
 69 |             and so we try/catch everything
 70 |             '''
 71 |             try:
 72 |                 pdf_name = pdf.rstrip(os.path.sep).rpartition(os.path.sep)[2]
 73 |             except Exception as e:
 74 |                 err.append('UNEXPECTED OS ERROR:\n%s' % traceback.format_exc())
 75 |                 pdf_name = pdf
 76 |             write('H\t#%d\t(%d / %d)\t%s\n' % (self.pid, self.counter.value(), self.counter.ceil(), pdf_name))
 77 |             '''
 78 |             The parse_pdf call will return a value that evaluates to false if it
 79 |             did not succeed. Error messages will appended to the err list.
 80 |             '''
 81 |             parsed_pdf = self.parse_pdf(pdf, err)
 82 | 
 83 |             if parsed_pdf:
 84 |                 try:
 85 |                     fsize = self.get_file_size(pdf)
 86 |                     pdfsize = self.get_pdf_size(parsed_pdf, err)
 87 |                     graph = self.make_graph(parsed_pdf, err)
 88 |                     t_str = self.make_tree_string(parsed_pdf, err)
 89 |                     t_hash = self.make_tree_hash(graph, err)
 90 |                     obf_js = self.get_js(parsed_pdf, err)
 91 |                     de_js = self.get_deobf_js(obf_js, parsed_pdf, err)
 92 |                     obf_js_sdhash = make_sdhash(obf_js, err)
 93 |                     de_js_sdhash = make_sdhash(de_js, err)
 94 |                     urls = self.get_urls(obf_js, err)
 95 |                     urls += self.get_urls(de_js, err)
 96 |                     swf = self.get_swf(parsed_pdf, err)
 97 |                     swf_sdhash = make_sdhash(swf, err)
 98 |                     bin_blob = parsed_pdf.bin_blob
 99 |                     malformed = parsed_pdf.getmalformed()
100 |                     self.get_errors(parsed_pdf, err)
101 |                 except Exception as e:
102 |                     err.append('UNCAUGHT PARSING EXCEPTION:\n%s' % traceback.format_exc())
103 | 
104 |             err = 'Error: '.join(err)
105 |             malformed['skipkeys'] = False
106 |             try:
107 |                 json_malformed = json.dumps(malformed)
108 |             except (TypeError, ValueError):
109 |                 malformed['skipkeys'] = True
110 |                 json_malformed = json.dumps(malformed, skipkeys=True)
111 | 
112 |             self.qout.put({'fsize': fsize,
113 |                            'pdf_md5': pdf_name,
114 |                            'tree_md5': t_hash,
115 |                            'tree': t_str,
116 |                            'obf_js': obf_js,
117 |                            'de_js': de_js,
118 |                            'swf': swf,
119 |                            'graph': graph,
120 |                            'pdfsize': pdfsize,
121 |                            'urls': urls,
122 |                            'bin_blob': bin_blob,
123 |                            'obf_js_sdhash': obf_js_sdhash,
124 |                            'de_js_sdhash': de_js_sdhash,
125 |                            'swf_sdhash': swf_sdhash,
126 |                            'malformed': json_malformed,
127 |                            'errors': err})
128 |             self.counter.inc()
129 |             # self.qin.task_done()
130 | 
131 |     def parse_pdf(self, pdf, err=''):
132 |         return None, 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
133 | 
134 |     def get_file_size(self, pdf):
135 |         try:
136 |             size = os.path.getsize(pdf)
137 |         except OSError:
138 |             '''
139 |             This should never actually happen if we were able to parse it
140 |             '''
141 |             size = 0
142 |         return str(size)
143 | 
144 |     def get_pdf_size(self, pdf):
145 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
146 | 
147 |     def make_graph(self, pdf, err=''):
148 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
149 | 
150 |     def make_tree_string(self, pdf, err=''):
151 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
152 | 
153 |     def make_tree_hash(self, t_str, err=''):
154 |         t_hash = ''
155 |         m = hashlib.md5()
156 |         try:
157 |             m.update(t_str)
158 |             t_hash = m.hexdigest()
159 |         except TypeError:
160 |             err.append('<HashException>%s</HashException>' % traceback.format_exc())
161 |         return t_hash
162 | 
163 |     def get_js(self, pdf, err=''):
164 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
165 | 
166 |     def get_debof_js(self, js, pdf, err=''):
167 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
168 | 
169 |     def get_swf(self, pdf, err=''):
170 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
171 | 
172 |     def get_errors(self, pdf, err=''):
173 |         return 'Hasher: Unimplemented method, %s' % sys._getframe().f_code.co_name
174 | 
175 |     def get_urls(self, haystack, err='', needle=''):
176 |         urls = ''
177 |         if not needle:
178 |             for needle in huntterp.Test.tests:
179 |                 urls = huntterp.find_in_hex(needle, haystack)
180 |                 urls += huntterp.find_unicode(needle, haystack)
181 |         else:
182 |             urls = huntterp.find_in_hex(needle, haystack)
183 |             urls += huntterp.find_unicode(haystack)
184 |         return '\n'.join([u[1] for u in urls])
185 | 


--------------------------------------------------------------------------------
/process/hashers/pdfminer.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | #
 3 | #  NO WARRANTY
 4 | #
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | 
15 | import re
16 | import traceback
17 | from hasher import Hasher
18 | 
19 | 
class PDFMinerHasher(Hasher):
    """
    Hasher whose steps read their results from an xml_creator.FrankenParser.

    NOTE(review): ``xml_creator`` and ``unescapeHTML`` are used below but are
    not imported in this module - confirm where they are expected to come
    from.
    """

    def parse_pdf(self, pdf, err):
        """
        Parse ``pdf`` with xml_creator.FrankenParser.

        :return: the parser object, or False if construction raised (the
            traceback is appended to ``err``)
        """
        parsed = False
        try:
            parsed = xml_creator.FrankenParser(pdf, self.debug)
        except Exception:
            err.append('<ParseException><pdf="%s">"%s"</ParseException>' % (str(pdf), traceback.format_exc()))
        return parsed

    def make_tree_string(self, pdf, err):
        """Return ``pdf.xml``, or a TreeException marker when it is empty."""
        if pdf.xml:
            return pdf.xml
        return '<TreeException>EMPTY TREE</TreeException>'

    def get_js(self, pdf, err):
        """
        Return every script in ``pdf.javascript``, commented and joined by
        blank lines. On failure the traceback goes to ``err`` and '' is
        returned.
        """
        js = ''
        try:
            # The loop variable must not be called ``js``: in Python 2 a list
            # comprehension leaks its variable, so a failure part-way through
            # would return one raw script instead of ''.
            js_list = [self.comment_out(script) for script in pdf.javascript]
            js = '\n\n'.join(js_list)
        except Exception:
            err.append('<GetJSException>%s</GetJSException>' % traceback.format_exc())
        return js

    def get_deobf_js(self, js, pdf, err):
        """
        Deobfuscation is currently disabled, so this always returns ''.

        The only remaining effect is to report a ``pdf.tree`` that starts
        with 'TREE_ERROR' through ``err``.
        """
        de_js = ''
        try:
            if pdf.tree.startswith('TREE_ERROR'):
                err.append('<DeobfuscateJSException>%s</DeobfuscateJSException>' % pdf.tree)
        except AttributeError:
            # pdf.tree is missing or has no startswith(); nothing to report.
            pass
        return de_js

    def get_swf(self, pdf, err):
        """Return ``pdf.swf`` as one string (a list is concatenated); '' otherwise."""
        swf = ''
        if pdf.swf:
            if isinstance(pdf.swf, list):
                swf = ''.join(pdf.swf)
            elif isinstance(pdf.swf, str):
                swf = pdf.swf
        return swf

    def get_pdf_size(self, pdf, err):
        """Return ``pdf.bytes_read`` as a string."""
        return str(pdf.bytes_read)

    def get_errors(self, pdf, err):
        """Append the parser's own ``pdf.errors`` to ``err``."""
        err.extend(pdf.errors)

    def make_graph(self, pdf, err):
        """
        Return ``pdf.make_graph(pdf.tree)`` joined with newlines.

        On failure the traceback is appended to ``err`` and '' is returned.
        """
        graph = ''
        try:
            graph = pdf.make_graph(pdf.tree)
            graph = '\n'.join(graph)
        except Exception:
            # Tagged as a graph failure; the old GetJSException tag was a
            # copy-paste slip that misattributed these errors to get_js.
            err.append('<MakeGraphException>%s</MakeGraphException>' % traceback.format_exc())
        return graph

    def comment_out(self, js):
        """Unescape HTML in ``js`` and turn a '<' at the start of any line into '//'."""
        return re.sub("^(<)", "//", unescapeHTML(js), flags=re.M)
84 | 


--------------------------------------------------------------------------------
/process/hashers/peepdf.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | 
import os
import traceback

from hasher import Hasher
from lib.parse.peepdf.PDFCore import PDFParser
 18 | 
 19 | 
 20 | class PeePDFHasher(Hasher):
 21 | 
 22 |     def parse_pdf(self, pdf, err):
 23 |         retval = True
 24 |         try:
 25 |             _, pdffile = self.PDFParser().parse(pdf, forceMode=True, manualAnalysis=True)
 26 |         except Exception as e:
 27 |             retval = False
 28 |             pdffile = '\n'.join([traceback.format_exc(), repr(e)])
 29 |         return pdffile
 30 | 
 31 |     def get_swf(self, pdf, err):
 32 |         swf = ''
 33 |         for version in range(pdf.updates + 1):
 34 |             for idx, obj in pdf.body[version].objects.items():
 35 |                 if obj.object.type == 'stream':
 36 |                     stream_ident = obj.object.decodedStream[:3]
 37 |                     if stream_ident in ['CWS', 'FWS']:
 38 |                         swf += obj.object.decodedStream.strip()
 39 |         return swf
 40 | 
 41 |     def get_js(self, pdf, err):
 42 |         js = ''
 43 |         for version in range(pdf.updates+1):
 44 |             for obj_id in pdf.body[version].getContainingJS():
 45 |                 js += self.do_js_code(obj_id, pdf)
 46 |         return js
 47 | 
 48 |     def make_tree_string(self, pdf, err):
 49 |         try:
 50 |             t_str = self.do_tree(pdf)
 51 |         except Exception as e:
 52 |             t_str = 'ERROR: ' + repr(e)
 53 |         return t_str
 54 | 
 55 |     def do_js_code(self, obj_id, pdf):
 56 |         consoleOutput = ''
 57 |         obj_id = int(obj_id)
 58 |         pdfobject = pdf.getObject(obj_id, None)
 59 |         if pdfobject.containsJS():
 60 |             jsCode = pdfobject.getJSCode()
 61 |             for js in jsCode:
 62 |                 consoleOutput += js
 63 |         return consoleOutput
 64 | 
 65 |     def do_tree(self, pdfFile):
 66 |         version = None
 67 |         treeOutput = ''
 68 |         tree = pdfFile.getTree()
 69 |         for i in range(len(tree)):
 70 |             nodesPrinted = []
 71 |             root = tree[i][0]
 72 |             objectsInfo = tree[i][1]
 73 |             if i != 0:
 74 |                 treeOutput += os.linesep + ' Version '+str(i)+':' + os.linesep*2
 75 |             if root != None:
 76 |                 nodesPrinted, nodeOutput = self.printTreeNode(root, objectsInfo, nodesPrinted)
 77 |                 treeOutput += nodeOutput
 78 |             for object in objectsInfo:
 79 |                 nodesPrinted, nodeOutput = self.printTreeNode(object, objectsInfo, nodesPrinted)
 80 |                 treeOutput += nodeOutput
 81 |         return treeOutput
 82 | 
 83 |     def printTreeNode(self, node, nodesInfo, expandedNodes = [], depth = 0, recursive = True):
 84 |         '''
 85 |             Given a tree prints the whole tree and its dependencies
 86 | 
 87 |             @param node: Root of the tree
 88 |             @param nodesInfo: Information abour the nodes of the tree
 89 |             @param expandedNodes: Already expanded nodes
 90 |             @param depth: Actual depth of the tree
 91 |             @param recursive: Boolean to specify if it's a recursive call or not
 92 |             @return: A tuple (expandedNodes,output), where expandedNodes is a list with the distinct nodes and output is the string representation of the tree
 93 |         '''
 94 |         output = ''
 95 |         if nodesInfo.has_key(node):
 96 |             if node not in expandedNodes or (node in expandedNodes and depth > 0):
 97 |                 output += '\t'*depth + nodesInfo[node][0] + ' (' +str(node) + ')' + os.linesep
 98 |             if node not in expandedNodes:
 99 |                 expandedNodes.append(node)
100 |                 children = nodesInfo[node][1]
101 |                 if children != []:
102 |                     for child in children:
103 |                         if nodesInfo.has_key(child):
104 |                             childType = nodesInfo[child][0]
105 |                         else:
106 |                             childType = 'Unknown'
107 |                         if childType != 'Unknown' and recursive:
108 |                             expChildrenNodes, childrenOutput = self.printTreeNode(child, nodesInfo, expandedNodes, depth+1)
109 |                             output += childrenOutput
110 |                             expandedNodes = expChildrenNodes
111 |                         else:
112 |                             output += '\t'*(depth+1) + childType + ' (' +str(child) + ')' + os.linesep
113 |                 else:
114 |                     return expandedNodes, output
115 |         return expandedNodes, output


--------------------------------------------------------------------------------
/process/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'mappel'
2 | 


--------------------------------------------------------------------------------
/process/parsers/parse.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | #
 3 | #  NO WARRANTY
 4 | #
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | 
def get_pdfminer():
    """
    Factory for the pdfminer-based parser.

    The import is done lazily, inside the function, and the module's
    ``parse_and_hash`` callable is returned.
    """
    import pdfminer as pdfminer_module
    parse_func = pdfminer_module.parse_and_hash
    return parse_func
18 | 
19 | 
def get_peepdf():
    """Placeholder factory for a peepdf parser; nothing is implemented, so it returns None."""
    return None
22 | 
23 | 
# Maps a parser name to the zero-argument factory that returns its parse callable.
PARSER_FACTORY_FUNCS = {
    'pdfminer': get_pdfminer,
    'peepdf': get_peepdf,
}
25 | 
26 | 
def get_parser(type_):
    """
    Return the parse callable registered under ``type_``.

    :param type_: a key of PARSER_FACTORY_FUNCS
    :return: whatever the registered factory returns
    :raises TypeError: if ``type_`` is not a registered parser name. An
        unknown name used to fail with "'NoneType' object is not callable";
        the exception type is kept so existing handlers still match, but
        the message now names the bad value and the valid choices.
    """
    factory = PARSER_FACTORY_FUNCS.get(type_)
    if factory is None:
        raise TypeError("Unknown parser type %r; expected one of: %s"
                        % (type_, ', '.join(sorted(PARSER_FACTORY_FUNCS))))
    return factory()
30 | 


--------------------------------------------------------------------------------
/process/parsers/pdfminer.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | # Remove logging support to workaround issue23278
 15 | # import logging
 16 | 
 17 | 
 18 | import gzip
 19 | import numpy
 20 | import os
 21 | import re
 22 | import pickle
 23 | import signal
 24 | import sys
 25 | """
 26 | It's tempting to use cElementTree, but don't. It is not compatible with multiprocessing because it cannot be pickled.
 27 | from xml.etree.cElementTree import tostring, ElementTree
 28 | """
 29 | from xml.etree.ElementTree import TreeBuilder, tostring, dump
 30 | 
 31 | from lib.parse.pdfminer import pdftypes
 32 | from lib.parse.pdfminer.pdfdocument import PDFDocument
 33 | from lib.parse.pdfminer.pdfparser import PDFParser
 34 | from lib.parse.pdfminer.psparser import PSKeyword, PSLiteral, PSEOF, PSException
 35 | 
 36 | from process.pdf import PDF
 37 | from util.str_utils import getJavascript, isFlash
 38 | 
 39 | 
# Ask numpy to take no action on any floating-point error condition.
numpy.seterr(all='ignore')


# Characters that PDFMinerParser.esc() rewrites as "&#N;" references: control
# characters (\000-\037), & < > ( ) and the double quote, the octal-escaped
# double quote (\042), single quote (\047) and backslash (\134), and
# everything from DEL up to 0xFF (\177-\377).
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
# Codec name passed to str.encode() for string/stream payloads; it is also
# written to the "enc" attribute of the XML nodes that carry them.
ENC = 'base64'


# Used only for saving XML, which is currently a disabled feature.
# NOTE(review): parse_and_hash() in this module still writes "<name>.xml.zip"
# files into this directory - confirm whether the feature is really disabled.
OUTPUTDIR = 'xml-output'
 49 | 
 50 | 
 51 | def sigint_handler(signal, frame):
 52 |     """
 53 | 
 54 |     Avoid issue 8296.
 55 |     :param signal:
 56 |     :param frame:
 57 |     :return: Does not.
 58 |     """
 59 |     sys.stderr.write("%d Caught SIGINT\n" % os.getpid())
 60 |     sys.exit(0)
 61 | 
 62 | 
 63 | def pdf_error_xml(pdfpath, err):
 64 |     tb = TreeBuilder()
 65 |     tb.start("pdf", {"path": pdfpath, "type": "error"})
 66 |     tb.data("%s" % err)
 67 |     return tb.end("pdf")
 68 | 
 69 | def check_pdf_retval(pdf):
 70 |     """
 71 |     Certain PDFs will be unpickleable, which means they cannot be sent back through the multiprocessing.Pool for
 72 |     handling in the main function. If you try they will raise a MaybeEncodingError, and there is currently no
 73 |     good way to recover from these exceptions and continue processing the rest of the samples. So, the check is done
 74 |     here.
 75 |     """
 76 |     try:
 77 |         if len(pickle.dumps(pdf)) >= pow(2, 31) - 1:
 78 |             pdf = PDF(pdf.path, pdf.name)
 79 |             pdf.xml = pdf_error_xml(pdf.path, "Parsed PDF size overlows 32 bit int result capacity")
 80 |     except Exception as e:
 81 |         sys.stderr.write("PDF cannot be returned to main process for storage in database: %s: %s\n" % (pdf.name, e))
 82 |         pdf = PDF(pdf.path, pdf.name)
 83 |         pdf.xml = pdf_error_xml(pdf.path, str(e))
 84 |     return pdf
 85 | 
 86 | 
 87 | def parse_and_hash(pdfpath):
 88 |     signal.signal(signal.SIGINT, sigint_handler)
 89 |     parser = PDFMinerParser()
 90 |     pdf = PDF(pdfpath, os.path.basename(pdfpath))
 91 | 
 92 |     try:
 93 |         parser.parse(pdf)
 94 |     except PSException as e:
 95 |         sys.stderr.write("PDFMiner failed to parse: %s\n" % pdf.name)
 96 |     except Exception as e:
 97 |         sys.stderr.write("PDFMiner uncaught error: %s: %s\n" % (pdf.name, e))
 98 | 
 99 |     if pdf.parsed and pdf.xml != '':
100 |         try:
101 |             pdf.set_feature_vector()
102 |         except AttributeError as e:
103 |             sys.stderr.write("Attribute Error: %s\n" % e)
104 |         fout = os.path.join(OUTPUTDIR, "%s.xml.zip" % pdf.name)
105 |         try:
106 |             gzfp = gzip.open(fout, "wb", compresslevel=4)
107 |         except IOError as e:
108 |             # logging.error("Parse and hash error opening xml output file: %s\n\t%s" % (fout, e))
109 |             sys.stderr.write("Parse and hash error opening xml output file: %s\n\t%s\n" % (fout, e))
110 |         else:
111 |             pdf.save_xml(gzfp)
112 |             gzfp.close()
113 | 
114 |     return check_pdf_retval(pdf)
115 | 
116 | 
class PDFMinerParser(object):
    """
    Walks a PDF with the bundled pdfminer and records it as an XML tree.

    The tree is accumulated in ``self.treebuild`` (an ElementTree
    TreeBuilder). parse() stores the finished tree, the document errors and
    the byte count on the PDF record it is given.
    """

    def __init__(self):
        self.treebuild = TreeBuilder()

    @staticmethod
    def esc(s):
        """Replace every character matched by ESC_PAT with its decimal '&#N;' reference."""
        return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)

    def add_xml_node(self, tag, attrs, data):
        """
        Append a complete <tag attrs>data</tag> element at the current position.

        A falsy ``attrs`` becomes an empty dict and a ``data`` of None
        becomes the text "MISSING".
        """
        if not attrs:
            attrs = {}
        if data is None:
            data = "MISSING"
        self.treebuild.start(tag, attrs)
        self.treebuild.data(data)
        self.treebuild.end(tag)

    def dump(self, obj):
        """
        Serialise one pdfminer object (recursively) into the XML tree.

        Handles None, dict, list, str, PDFStream, PDFObjRef, PSKeyword,
        PSLiteral and numbers.

        :raises TypeError: for any other kind of object
        """
        # Containers and strings get a "size" attribute; unsized objects
        # simply go without.
        try:
            obj_attrs = {"size": str(len(obj))}
        except TypeError:
            obj_attrs = {}

        if obj is None:
            self.add_xml_node("null", {}, '')

        elif isinstance(obj, dict):
            self.treebuild.start("dict", obj_attrs)
            for key, val in obj.iteritems():
                # Replace non word characters in key
                key = re.sub(r'\W+', '', key)
                if key.isdigit() or not key:
                    key = 'KEYERROR'
                self.treebuild.start(key, {})
                try:
                    self.dump(val)
                except Exception as e:
                    sys.stderr.write("DUMP excpetion: %s\n" % e)
                self.treebuild.end(key)
            self.treebuild.end("dict")

        elif isinstance(obj, list):
            self.treebuild.start("list", obj_attrs)
            for listobj in obj:
                try:
                    self.dump(listobj)
                except Exception as e:
                    sys.stderr.write("DUMP excpetion: %s\n" % e)
            self.treebuild.end("list")

        elif isinstance(obj, str):
            # dict.update() returns None, so passing its result as the
            # attributes dropped both "size" and "enc"; build the merged
            # dict explicitly instead.
            string_attrs = dict(obj_attrs, enc=ENC)
            self.add_xml_node("string", string_attrs, self.esc(obj).encode(ENC))

        elif isinstance(obj, pdftypes.PDFStream):
            self.treebuild.start("stream", obj_attrs)

            self.treebuild.start("props", {})
            try:
                self.dump(obj.attrs)
            except Exception as e:
                sys.stderr.write("DUMP excpetion: %s\n" % e)
            self.treebuild.end("props")

            try:
                data = obj.get_data()
            except pdftypes.PDFNotImplementedError as e:
                self.add_xml_node("error", {"type": "PDFNotImplementedError"}, e.message)
            except pdftypes.PDFException as e:
                self.add_xml_node("error", {"type": "PDFException"}, e.message)
            except Exception as e:
                self.add_xml_node("error", {"type": "Uncaught"}, str(e))
            else:
                # A stream that getJavascript() recognises is stored as a
                # <js> node; anything else becomes an escaped, encoded <data>.
                js = getJavascript(str(data))
                if js:
                    self.add_xml_node("js", {"enc": ENC, "size": str(len(js))}, js)
                else:
                    self.add_xml_node("data", {"enc": ENC, "size": str(len(data))}, self.esc(data).encode(ENC))

            self.treebuild.end("stream")

        elif isinstance(obj, pdftypes.PDFObjRef):
            self.add_xml_node("ref", {"id": str(obj.objid)}, '')

        elif isinstance(obj, PSKeyword):
            self.add_xml_node("keyword", {}, obj.name)

        elif isinstance(obj, PSLiteral):
            self.add_xml_node("literal", {}, obj.name)

        elif isinstance(obj, (int, long, float)):
            self.add_xml_node("number", {}, str(obj))

        else:
            raise TypeError(obj)

    def get_obj_loc(self, xref, objid):
        """
        Return element [1] of ``xref.get_pos(objid)``.

        :return: that value, "FREE" if the xref raises KeyError for the
            object, or "UNKNOWN" on any other error
        """
        # Plain except clauses instead of ``return`` inside ``finally``: the
        # latter also swallowed SystemExit/KeyboardInterrupt, which defeats
        # the process's own SIGINT handler.
        try:
            return xref.get_pos(objid)[1]
        except KeyError:
            return "FREE"
        except Exception:
            return "UNKNOWN"

    def read_pdf_block(self, parser, pos, length=512):
        """
        Read ``length`` bytes at ``pos`` via ``parser.read_n_from``.

        :return: the data, an "ERROR: ..." message if the read raises
            TypeError, or "UNKNOWN" on any other error
        """
        try:
            return parser.read_n_from(pos, length)
        except TypeError:
            return "ERROR: Could not read PDF data from pos: %s for %s bytes" % (pos, length)
        except Exception:
            return "UNKNOWN"

    def end_xml_node(self, tag):
        """
        Close ``tag``; on a mismatch, close the tag the builder expected instead.

        The expected tag name is parsed out of the TreeBuilder assertion
        message ("... (expected X, ...").
        """
        try:
            self.treebuild.end(tag)
        except AssertionError as e:
            if 'mismatch' in e.message:
                expected_tag = e.message.partition("(expected ")[2]
                expected_tag = expected_tag.partition(",")[0]
                if expected_tag:
                    self.end_xml_node(expected_tag)

    def parse(self, pdf):
        """
        Parse the file at ``pdf.path`` and fill in the PDF record.

        On success ``pdf.xml``, ``pdf.errors``, ``pdf.bytes_read`` and
        ``pdf.parsed`` are set. If the file cannot be opened a message is
        written to stderr and the record is left untouched.
        """
        try:
            fp = open(pdf.path, 'rb')
        except IOError as e:
            # logging.error("PDFMinerParser.parse unable to open PDF: %s" % e)
            sys.stderr.write("PDFMinerParser.parse unable to open PDF: %s\n" % e)
            return

        # Close the file on every exit path (early returns and exceptions
        # included), not only after a complete parse.
        try:
            self._parse_open_file(fp, pdf)
        finally:
            fp.close()

    def _parse_open_file(self, fp, pdf):
        """Build the XML tree for the already-open file ``fp`` and store the results on ``pdf``."""
        visited = set()
        self.treebuild.start("pdf", {"path": pdf.path})

        try:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
        except PSEOF:
            # Truncated file: counted as parsed, but pdf.xml stays unset.
            self.add_xml_node("PSException", {}, "Unexpected end of PDF")
            self.treebuild.end("pdf")
            pdf.parsed = True
            return

        # Keep what parser.read_from_end() returns for this distance when it
        # is more than 3.
        # NOTE(review): presumably the bytes trailing the EOF marker - confirm.
        if doc.found_eof and doc.eof_distance > 3:
            pdf.blob = parser.read_from_end(doc.eof_distance).encode("base64")

        for xref in doc.xrefs:
            for objid in xref.get_objids():

                # An object id can be listed by more than one xref; emit it once.
                if objid in visited:
                    continue

                visited.add(objid)

                obj_attrs = {"id": str(objid), "type": "normal"}
                obj_data = ''
                obj_xml = self.treebuild.start("object", obj_attrs)
                obj_loc = self.get_obj_loc(xref, objid)
                obj_xml.set("location", str(obj_loc))

                # When the object cannot be dumped, fall back to a raw block
                # read at its location, with "<" neutralised.
                try:
                    self.dump(doc.getobj(objid))
                except pdftypes.PDFObjectNotFound:
                    obj_xml.set("type", "malformed")
                    obj_data = self.read_pdf_block(parser, obj_loc, 4096).replace("<", "0x3C")
                except TypeError:
                    obj_xml.set("type", "unknown")
                    obj_data = self.read_pdf_block(parser, obj_loc).replace("<", "0x3C")
                except Exception as e:
                    obj_xml.set("type", "exception")
                    obj_data = self.read_pdf_block(parser, obj_loc).replace("<", "0x3C")
                    self.add_xml_node("exception", {}, str(e))

                # Raw data that is not plain ASCII is stored encoded.
                try:
                    obj_data.decode("ascii")
                except UnicodeDecodeError:
                    obj_data = obj_data.encode("base64")

                self.treebuild.data(obj_data)

                # If the builder cannot close the object, give up on the
                # document; pdf.parsed stays False.
                try:
                    self.treebuild.end("object")
                except (AssertionError, TypeError):
                    return

            self.treebuild.start("trailer", {})
            self.dump(xref.trailer)
            self.treebuild.end("trailer")

        self.treebuild.end("pdf")

        pdf.xml = self.treebuild.close()

        pdf.errors = doc.errors
        pdf.bytes_read = parser.BYTES
        pdf.parsed = True
316 | 


--------------------------------------------------------------------------------
/process/parsers/peepdf.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/process/pdf.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | """
 15 | Logging with the logging module is disabled. See http://bugs.python.org/issue23278
 16 | Using sys.stderr instead, but only for errors
 17 | 
 18 | #import#logging
 19 | """
 20 | import sys
 21 | import traceback
 22 | from xml.parsers.expat import ExpatError
 23 | from xml.dom import minidom
 24 | from util.str_utils import prettify_dict, check_decoding
 25 | 
 26 | """
 27 | It's tempting to use cElementTree, but don't. It is not compatible with multiprocessing because it cannot be pickled.
 28 | from xml.etree.cElementTree import tostring, ElementTree
 29 | """
 30 | from xml.etree.ElementTree import tostring, ElementTree, Element, dump
 31 | 
 32 | import networkx
 33 | import numpy
 34 | from scipy.stats import stats
 35 | 
 36 | """
 37 | Number of features used in the NetSimile paper
 38 | """
# Length of the per-node feature vector: get_node_features() returns this many
# values and get_graph_features() allocates one row per feature.
NUMFEATURES = 7
 40 | 
 41 | 
 42 | class PDF(object):
 43 |     """
 44 |     :type xml: xml.etree.ElementTree.Element
 45 |     """
 46 |     def __init__(self, path, name='unnamed'):
 47 |         self.name = name
 48 |         self.path = path
 49 |         self.parsed = False
 50 |         self.size = 0
 51 |         self.js = ''
 52 |         self.swf = ''
 53 |         self.xml = ''
 54 |         self.blob = ''
 55 |         self.errors = ''
 56 |         self.bytes_read = 0
 57 |         self.v = []
 58 |         self.e = []
 59 |         self.ftr_vec = []
 60 | 
 61 |     def set_feature_vector(self):
 62 |         verts, edges = self.get_nodes_edges()
 63 |         ftr_matrix = self.get_graph_features(verts, edges)
 64 |         self.ftr_vec = self.aggregate_ftr_matrix(ftr_matrix)
 65 | 
 66 |     def get_root(self):
 67 |         rootid = None
 68 |         if self.xml is not None:
 69 |             obj = self.xml.find(".//Root")
 70 |             if obj is not None and isinstance(obj, Element):
 71 |                 try:
 72 |                     rootid = obj.find(".//ref").get("id")
 73 |                 except AttributeError:
 74 |                     sys.stderr.write("PDF.get_root: %s\nPDF obj: %s\nRoot missing reference object: %s\n" % (
 75 |                         self.name, obj, "HOLDER"))
 76 |             else:
 77 |                 """
 78 |                 This output gets a bit heavy. Malicious docs often don't have a root
 79 |                 sys.stderr.write("PDF Missing root node: %s\n" % self.name)
 80 |                 """
 81 |                 pass
 82 |         return rootid
 83 | 
 84 |     def get_nodes_edges(self):
 85 |         if not self.v or not self.e:
 86 |             """
 87 |             Setup the PDF start and root object. The start object is simply the start state for the graph, it is not
 88 |             actually a part of the PDF document on disk or the PDF document specifications.
 89 |             """
 90 |             self.v.append(("PDF", ["start"]))
 91 |             rootid = self.get_root()
 92 |             if not rootid:
 93 |                 rootid = 'missing_root'
 94 |                 self.v.append((rootid, ["root"]))
 95 |             self.e.append(("PDF", rootid))
 96 | 
 97 |             """
 98 |             Walk the PDF graph from the parsed XML
 99 |             """
100 |             visited = {()}
101 |             new_v = []
102 |             if isinstance(self.xml, Element):
103 |                 for obj in self.xml.iterfind("object"):
104 |                     src_id = obj.get("id")
105 |                     while src_id in visited:
106 |                         src_id += '_'
107 |                     visited.add(src_id)
108 |                     self.v.append((src_id, [item.tag for item in obj.iter()]))
109 |                     for ref in obj.iter("ref"):
110 |                         dst_id = ref.get("id")
111 |                         if dst_id not in visited:
112 |                             new_v.append(dst_id)
113 |                         self.e.append((src_id, dst_id))
114 |                 for v in new_v:
115 |                     if v not in visited:
116 |                         self.v.append((v, ['missing_target']))
117 |         return self.v, self.e
118 | 
119 |     def get_graph_features(self, v, e):
120 |         """ Graph features based on NetSimile paper
121 | 
122 |         :param v: set of vertices (label, [attrib])
123 |         :type v:  list
124 |         :param e: edges in the graph (vertex, vertex)
125 |         :type e: list
126 |         :return: a vector of features
127 |         :rtype: list
128 |         """
129 |         graph = networkx.Graph()
130 |         for label, attrs in v:
131 |             graph.add_node(label, contains=attrs)
132 |         for edge in e:
133 |             graph.add_edge(*edge)
134 | 
135 |         """
136 |         Transforms matrix from paper, so that each row is a feature, and each col is a node
137 |         """
138 |         features = [[] for i in range(NUMFEATURES)]
139 |         for node in graph.nodes_iter():
140 |             for idx, ftr in enumerate(self.get_node_features(graph, node)):
141 |                 features[idx].append(ftr)
142 | 
143 |         return features
144 | 
145 |     def get_node_features(self, graph, node):
146 |         """  Node features based on NetSimile paper
147 |         :param node:
148 |         :type node:
149 |         :return:
150 |         :rtype:
151 |         """
152 |         """
153 |         degree of node
154 |         cluserting coef of node
155 |         avg number of node's two-hop away neighbors
156 |         avg clustering coef of Neighbors(node)
157 |         number of edges in node i's egonet
158 |         number of outgoing edges from ego(node)
159 |         number of neighbors(ego(node))
160 |         """
161 |         neighbors = graph.neighbors(node)
162 | 
163 |         degree = graph.degree(node)
164 | 
165 |         cl_coef = networkx.clustering(graph, node)
166 | 
167 |         nbrs_two_hops = 0.0
168 |         nbrs_cl_coef = 0.0
169 |         for neighbor in neighbors:
170 |             nbrs_two_hops += graph.degree(neighbor)
171 |             nbrs_cl_coef += networkx.clustering(graph, neighbor)
172 | 
173 |         try:
174 |             avg_two_hops = nbrs_two_hops / degree
175 |             avg_cl_coef = nbrs_cl_coef / degree
176 |         except ZeroDivisionError:
177 |             avg_two_hops = 0.0
178 |             avg_cl_coef = 0.0
179 | 
180 |         egonet = networkx.ego_graph(graph, node)
181 | 
182 |         ego_size = egonet.size()
183 | 
184 |         ego_out = 0
185 |         ego_nbrs = set()
186 |         for ego_node in egonet:
187 |             for nbr in graph.neighbors(ego_node):
188 |                 if nbr not in neighbors:
189 |                     ego_out += 1
190 |                     ego_nbrs.add(nbr)
191 | 
192 |         return [degree, cl_coef, avg_two_hops, avg_cl_coef, ego_size, ego_out, len(ego_nbrs)]
193 | 
194 |     def aggregate_ftr_matrix(self, ftr_matrix):
195 |         sig = []
196 |         for ftr in ftr_matrix:
197 |             try:
198 |                 median = stats.nanmedian(ftr)
199 |                 mean = stats.nanmean(ftr)
200 |                 std = stats.nanstd(ftr)
201 |             except AttributeError:
202 |                 median = numpy.nanmedian(ftr)
203 |                 mean = numpy.nanmean(ftr)
204 |                 std = numpy.nanstd(ftr)
205 |             # Invalid double scalars warning appears here
206 |             skew = stats.skew(ftr) if any(ftr) else 0.0
207 |             kurtosis = stats.kurtosis(ftr)
208 |             sig.extend([median, mean, std, skew, kurtosis])
209 |         return sig
210 | 
211 |     def get_javascript(self):
212 |         if not self.js and isinstance(self.xml, Element):
213 |             matches = self.xml.findall(".//js")
214 |             if matches:
215 |                 self.js = "\n".join([match.text for match in matches])
216 |         return self.js
217 | 
218 |     def get_xml_str(self):
219 |         try:
220 |             rv = tostring(self.xml)
221 |         except AttributeError as e:
222 |             sys.stderr.write("PDF xml element object error: %s: %s\n" % (self.name, e))
223 |             rv = ''
224 |         except (UnicodeDecodeError, UnicodeEncodeError) as e:
225 |             sys.stderr.write("PDF to xml string encode/decode error: %s\n" % e)
226 |             rv = ''
227 |         return rv
228 | 
229 |     def dump_xml(self, elem):
230 |         """
231 | 
232 |         :param elem:
233 |         :type elem: xml.etree.ElementTree.Element
234 |         :return:
235 |         """
236 |         xml = "<dumped_pdf>"
237 |         for o in elem.iter():
238 |             tag, attrib, text = o.tag, o.attrib, o.text
239 |             tag = check_decoding(tag)
240 |             text = check_decoding(text)
241 |             xml += "<%s %s>%s<%s>\n" % (tag, prettify_dict(attrib), text, tag)
242 |         return xml + "</dumped_pdf>"
243 | 
244 |     def save_xml(self, fp):
245 |         xml_str = new_xml_str = "<xml_str>pdf.save_xml initialized value. Should not see this message</xml_str>"
246 |         try:
247 |             xml_str = tostring(self.xml)
248 |             new_xml_str = minidom.parseString(xml_str)
249 |             new_xml_str = new_xml_str.toprettyxml(indent="    ")
250 |         except AttributeError as e:
251 |             sys.stderr.write("Save XML AttributeError, missing xml likely: %s: %s\n" % (self.name, e))
252 |             if xml_str != new_xml_str:
253 |                 new_xml_str = xml_str
254 |             else:
255 |                 new_xml_str = str(e)
256 |         except IOError as e:
257 |             sys.stderr.write("PDF save xml unable to write out xml: %s: %s\n" % (self.name, e))
258 |             new_xml_str = str(e)
259 |         except UnicodeEncodeError as e:
260 |             sys.stderr.write("Unicode ENCODE error saving XML file: %s: %s\n" % (self.name, e))
261 |             new_xml_str = str(e)
262 |         except UnicodeDecodeError as e:
263 |             xml_str = self.dump_xml(self.xml)
264 |             try:
265 |                 new_xml_str = minidom.parseString(xml_str)
266 |                 new_xml_str = new_xml_str.toprettyxml(indent="    ")
267 |             except ExpatError as e:
268 |                 new_xml_str = xml_str
269 |         except ExpatError as e:
270 |             new_xml_str = "%s\n%s\n%s" % (e, '-'*80, xml_str)
271 |         except OverflowError as e:
272 |             new_xml_str = str(e)
273 |         except Exception as e:
274 |             sys.stderr.write("%s\n" % traceback.format_exc())
275 |             sys.stderr.write("PDF.save_xml: UNCAUGHT EXCEPTION: %s: %s\n" % (self.name, e))
276 |             new_xml_str = str(e)
277 |         finally:
278 |             fp.write(new_xml_str)
279 |             fp.flush()
280 | 


--------------------------------------------------------------------------------
/process/pdfhasher.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | #
 3 | #  NO WARRANTY
 4 | #
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | 
15 | 
16 | from process.parsers import parse
17 | from process.pdf import PDF
18 | 
19 | 
# Parser type passed to parse.get_parser() by parse_and_hash().
# NOTE(review): how get_parser() treats None is not visible here — confirm.
PTYPE = None
21 | 
22 | 
def parse_and_hash(pdfpath):
    """
    Parse the PDF at *pdfpath* with the parser selected by PTYPE.

    :param pdfpath: path handed to the PDF constructor
    :return: the PDF object after the parser has processed it
    """
    document = PDF(pdfpath)
    parse.get_parser(PTYPE).parse(document)
    return document
28 | 


--------------------------------------------------------------------------------
/process/run-jpexs.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | 
 15 | 
 16 | import sys
 17 | import os
 18 | import re
 19 | import time
 20 | import glob
 21 | import shutil
 22 | import select
 23 | import binascii
 24 | import fileinput
 25 | import itertools
 26 | import multiprocessing as mp
 27 | from subprocess import Popen, PIPE, DEVNULL
 28 | 
# Most recent error text recorded by the extractors; read and cleared by errmsg().
ERRMSG = ''
 30 | 
 31 | def errmsg():
 32 |     global ERRMSG
 33 |     tmp = ERRMSG
 34 |     ERRMSG = ''
 35 |     return tmp
 36 | 
 37 | def simple_name(filepath):
 38 |     return os.path.splitext(os.path.basename(filepath))[0]
 39 | 
 40 | def frame_id(string):
 41 |     mobj = re.match(r'.*([\d]+)\).*"(\w*)"', string, re.U)
 42 |     if mobj:
 43 |         return (mobj.groups())
 44 |     else:
 45 |         return None
 46 | 
 47 | def get_frame_ids(fin):
 48 |     frame_nums = []
 49 |     listcmd = 'furnace-swf -i %s abclist' % fin
 50 |     proc = Popen(listcmd, shell=True, stdout=PIPE, stderr=PIPE, close_fds=True)
 51 |     for line in iter(proc.stdout.readline, b''):
 52 |         line = str(line, encoding='utf-8')
 53 |         num_name = frame_id(line)
 54 |         if num_name:
 55 |             frame_nums.append(num_name)
 56 |     proc.communicate()
 57 |     return frame_nums
 58 | 
 59 | def extract_frame(fin, dout, num_name):
 60 |     if num_name[1]:
 61 |         abcfile = "%s-%s.abc" % (simple_name(fin), num_name[1])
 62 |     else:
 63 |         abcfile = "%s-%s.abc" % (simple_name(fin), num_name[0])
 64 |     extractcmd = 'furnace-swf -i %s abcextract -o %s -n %s' % (fin, os.path.join(dout, abcfile), num_name[0])
 65 |     proc = Popen(extractcmd, shell=True, close_fds=True)
 66 |     proc.wait()
 67 |     return abcfile
 68 | 
 69 | def furnace_bytecode(fin, dout):
 70 |     abcfiles = []
 71 |     name = simple_name(fin)
 72 |     frames = get_frame_ids(fin)
 73 |     for num_name in frames:
 74 |         abcfiles.append(extract_frame(fin, dout, num_name))
 75 |     return abcfiles
 76 | 
 77 | def furnace_actionscript(abcfiles, dout):
 78 |     asfiles = []
 79 |     for bytecode in abcfiles:
 80 |         name = "%s.as" % simple_name(bytecode)
 81 |         decompilecmd = 'furnace-avm2-decompiler -d -i %s > %s' % (os.path.join(dout, bytecode), os.path.join(dout, name))
 82 |         proc = Popen(decompilecmd, shell=True, close_fds=True)
 83 |         proc.wait()
 84 |         asfiles.append(os.path.join(dout, name))
 85 |     return asfiles
 86 | 
 87 | def concat_scripts(scripts, fout):
 88 |     with open(fout, 'w') as fout:
 89 |         for line in fileinput.input(scripts, mode='rb'):
 90 |             fout.write(line)
 91 | 
 92 | def furnace_extract(fin, dirname):
 93 |     name = simple_name(fin)
 94 |     dout = os.path.join(dirname, name + '-furnace')
 95 |     try:
 96 |         os.mkdir(dout)
 97 |     except OSError as e:
 98 |         if e.errno == 17:
 99 |             pass
100 |         else:
101 |             print(e)
102 |             ERRMSG = str(e)
103 |             return None
104 |     abcfiles = furnace_bytecode(fin, dout)
105 |     asfiles = furnace_actionscript(abcfiles, dout)
106 |     if len(asfiles) > 1:
107 |         concat_scripts(asfiles, os.path.join(dout, "%s-all.as" % name))
108 |     return True
109 | 
def jpexs_extract(fin, dirname):
    """
    Export the scripts of one SWF with the JPEXS decompiler (ffdec.jar).

    Scripts are exported into ``<dirname>/<name>-jpexs`` and concatenated into
    ``<name>-all.as`` in that directory. The jar location defaults to the
    original hard-coded path and can be overridden with the ``FFDEC_JAR``
    environment variable.

    :param fin: path of the SWF file
    :param dirname: directory under which the output directory is created
    :return: True on success; None on any failure or when nothing was
             exported (the reason, if known, is stored in ERRMSG)
    """
    global ERRMSG
    # The child runs with cwd=dout, so both paths must be absolute.
    fname = os.path.abspath(fin)
    base = os.path.splitext(os.path.basename(fname))[0]
    dout = os.path.abspath(os.path.join(dirname, base + '-jpexs'))
    jar = os.environ.get('FFDEC_JAR',
                         '/Users/honey/src/work/pdf/thisneedsacoolname/ffdec.jar')
    # Argument list instead of a shell string so paths need no quoting.
    javacmd = ['java', '-Djava.awt.headless=true', '-jar', jar,
               '-export', 'script', dout, fname]

    try:
        os.mkdir(dout)
    except FileExistsError:
        pass
    except OSError as e:
        print('jpexs_extract mkdir(%s): %s' % (dout, e))
        ERRMSG = str(e)
        return None

    try:
        proc = Popen(javacmd, stdout=PIPE, stderr=PIPE, close_fds=True, cwd=dout)
    except OSError as e:
        print('jpexs_extract java: %s' % e)
        ERRMSG = str(e)
        return None

    extracted = []
    poller = select.poll()
    poller.register(proc.stderr.fileno(), select.POLLIN | select.POLLPRI)
    for line in iter(proc.stdout.readline, b''):
        line = str(line, encoding='utf-8')
        if line.startswith('Exported'):
            mobj = re.match(r"^Exported\s[^\d]*[\d]+/[\d]+\s([^,]+)\,\s", line, re.U)
            if mobj:
                # Dotted package name -> relative path. The name is only ever
                # passed to open(), so spaces must not be backslash-escaped.
                extracted.append(mobj.group(1).replace('.', '/'))
        # Peek at stderr between stdout lines so a fatal error stops the run early.
        if poller.poll(1):
            err = str(proc.stderr.readline(), encoding='utf-8')
            if err.startswith(('FAIL', 'SEVERE')):
                proc.kill()
                proc.stdout.close()
                proc.stderr.close()
                proc.wait()
                sys.stderr.write("jpexs_extract error %s\n" % err)
                shutil.rmtree(dout)
                ERRMSG = err
                return None

    out, err = proc.communicate()
    if err:
        # Keep ERRMSG a str like every other place that sets it.
        ERRMSG = err.decode('utf-8', 'replace')

    script = ''
    for srcfile in extracted:
        try:
            with open('%s/%s.as' % (dout, srcfile), 'r') as source:
                script += '\n'.join([text.rstrip() for text in source.readlines()])
                script += '\n'
        except IOError as e:
            print('jpexs_extract open(srcfile): %s' % e)
            ERRMSG = str(e)
            return None

    if not script:
        return None

    try:
        with open('%s/%s-all.as' % (dout, base), 'w') as combined:
            combined.write(script)
    except IOError as e:
        print('jpexs_extract open(fout): %s' % e)
        ERRMSG = str(e)
        return None
    return True
180 | 
181 | 
def main(din, dout='', tool='jpexs'):
    """
    Extract scripts from every ``*.swf`` in *din* that is not yet marked done.

    Completed samples are appended to ``<dout>/done.txt`` and skipped on later
    runs; failures are appended with their error text to ``<dout>/err.txt``.
    A summary is printed at the end.

    :param din: directory scanned for ``*.swf`` files
    :param dout: output directory; defaults to *din*
    :param tool: ``'jpexs'`` or ``'furnace'``
    :return: None
    """
    if not dout:
        dout = din
    if not os.path.isdir(dout):
        sys.stderr.write("Invalid directory: %s\n" % dout)
        return None
    # Reject unknown tools up front; otherwise the loop below would hit an
    # unbound result variable on the first file.
    if tool not in ('jpexs', 'furnace'):
        sys.stderr.write("Unknown extraction tool: %s\n" % tool)
        return None
    files = glob.glob(os.path.join(din, '*.swf'))

    completed = set()
    try:
        with open("%s/done.txt" % dout, "r") as fdone:
            completed = set(entry.rstrip() for entry in fdone)
    except FileNotFoundError:
        # First run: nothing has been completed yet.
        pass
    except IOError as e:
        sys.stderr.write("%s\n" % e)
        sys.exit(0)

    total = 0
    errors = 0
    for f in files:
        md5name = os.path.splitext(os.path.basename(f))[0]
        if md5name in completed:
            sys.stdout.write("Skipping:\t%s\n" % md5name)
            continue

        total += 1
        sys.stdout.write("Processing:\t%s\t" % md5name)
        if tool == 'jpexs':
            rv = jpexs_extract(f, dout)
        else:
            rv = furnace_extract(f, dout)

        if rv:
            sys.stdout.write("complete\n")
            try:
                with open("%s/done.txt" % dout, "a") as fdone:
                    fdone.write("%s\n" % md5name)
            except IOError as e:
                sys.stderr.write("Unable to write to log file, done.txt: %s\n" % e)
        else:
            errors += 1
            sys.stdout.write("error\n")
            try:
                with open("%s/err.txt" % dout, "a") as ferr:
                    ferr.write("%s\n%s\n\n" % (md5name, errmsg()))
            except IOError as e:
                sys.stderr.write("Unable to write to log file, err.txt: %s\n" % e)

    sys.stdout.write("Complete:\t%d\nFailures:\t%d\nTotal jobs:\t%d\n" % (total - errors, errors, total))
238 | 
if __name__ == '__main__':
    # Usage: run-jpexs.py [dir_in [dir_out [tool]]]; every argument is optional.
    cli = sys.argv[1:]

    dir_in = cli[0] if len(cli) > 0 else './'

    if len(cli) > 1:
        dir_out = os.path.abspath(os.path.expandvars(os.path.expanduser(cli[1])))
    else:
        dir_out = './'

    tool = cli[2] if len(cli) > 2 else 'jpexs'

    # Nothing happens (silently) when the input directory does not exist.
    if os.path.isdir(dir_in):
        main(dir_in, dir_out, tool)
257 | 


--------------------------------------------------------------------------------
/process/sdhasher.py:
--------------------------------------------------------------------------------
 1 | #  Copyright 2011-2015 by Carnegie Mellon University
 2 | #
 3 | #  NO WARRANTY
 4 | #
 5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
 6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
 7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
 8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
 9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
13 | 
14 | 
15 | 
16 | import tempfile
17 | from subprocess import Popen, PIPE
18 | from sys import stderr
19 | 
def make_sdhash(data, log=None):
    """
    Compute the sdhash digest of *data* by running the external ``sdhash`` tool.

    The data is written to a temporary file which is handed to ``sdhash``.

    :param data: raw content to hash; anything shorter than 512 bytes is skipped
    :param log: optional sink for error messages: a list (messages are
                appended) or an object with ``write``; otherwise sys.stderr
    :return: whatever ``sdhash`` printed on stdout, or '' when the input is
             too short or the hash could not be produced
    """
    # Function-scope import: the module only does ``from sys import stderr``.
    import sys

    def report(message):
        # An empty list is still a valid log, so test the type, not truthiness.
        if isinstance(log, list):
            log.append(message)
        elif log is not None and hasattr(log, 'write'):
            log.write(message)
        else:
            sys.stderr.write(message)

    if not data or len(data) < 512:
        return ''

    try:
        tmpfile = tempfile.NamedTemporaryFile(delete=True)
    except (IOError, OSError) as e:
        report('sdhash: %s\n' % str(e))
        return ''

    try:
        tmpfile.write(data)
        tmpfile.flush()
        proc = Popen(['sdhash', tmpfile.name], stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
    except Exception as e:
        # Hashing is best effort (e.g. sdhash not installed): report, don't raise.
        report('sdhash: %s\n' % str(e))
        return ''
    finally:
        # Closing the temporary file also deletes it.
        tmpfile.close()

    if err:
        sys.stdout.write('%s\n' % err)
    return out if out else ''
48 | 


--------------------------------------------------------------------------------
/storage/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/storage/dbgw.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import cPickle
 15 | import json
 16 | import logging
 17 | import sqlite3
 18 | import sys
 19 | 
 20 | from util.str_utils import get_hash
 21 | 
 22 | class NabuDb(object):
 23 | 
 24 |     table = "unknown"
 25 |     cols = []
 26 | 
 27 |     def __init__(self, dbpath):
 28 |         self.dbpath = dbpath
 29 |         self.conn = None
 30 | 
 31 |     def init(self, table, cols):
 32 |         cmd = "create table if not exists %s(%s)" % (table, ','.join(cols))
 33 |         try:
 34 |             self.conn = sqlite3.connect(self.dbpath)
 35 |             self.conn.execute(cmd)
 36 |         except sqlite3.Error as e:
 37 |             logging.error("NabuDb.init error (%s): %s\n%s" % (self.dbpath, e, cmd))
 38 |             return False
 39 |         else:
 40 |             self.conn.text_factory = str
 41 |             return True
 42 | 
 43 |     def connected(self):
 44 |         return self.conn is not None
 45 | 
 46 |     def query(self, cmd, subs):
 47 |         try:
 48 |             c = self.conn.cursor()
 49 |             c.execute(cmd, subs)
 50 |         except sqlite3.Error as e:
 51 |             logging.error("NabuDb.query error: %s" % e)
 52 |             return []
 53 |         else:
 54 |             rows = c.fetchall()
 55 |             self.conn.commit()
 56 |             c.close()
 57 |             return rows
 58 | 
 59 |     def size(self):
 60 |         cmd = "select count(*) from %s" % self.table
 61 |         rows = self.query(cmd, ())
 62 |         if rows:
 63 |             return rows[0][0]
 64 |         else:
 65 |             return -1
 66 | 
 67 |     def get_unique(self, field):
 68 |         cmd = "select distinct %s from %s" % (field, self.table)
 69 |         rows = self.query(cmd, ())
 70 |         return rows
 71 | 
 72 |     def close(self):
 73 |         self.conn.close()
 74 | 
 75 |     @staticmethod
 76 |     def serialize(data):
 77 |         try:
 78 |             pickled = cPickle.dumps(data, protocol=2)
 79 |         except cPickle.PicklingError as e:
 80 |             logging.error("DB serialize error: %s" % e)
 81 |             return ''
 82 |         else:
 83 |             return pickled
 84 | 
 85 |     @staticmethod
 86 |     def deserialize(pickled):
 87 |         try:
 88 |             data = cPickle.loads(pickled)
 89 |         except cPickle.UnpicklingError as e:
 90 |             logging.error("DB deserialize error: %s" % e)
 91 |             return ''
 92 |         else:
 93 |             return data
 94 | 
 95 |     @staticmethod
 96 |     def serializeJSON(data):
 97 |         """
 98 | 
 99 |         :param data: data to be serialized
100 |         :return: JSON encoded string of data for serialization
101 |         """
102 |         try:
103 |             json_data = json.dumps(data)
104 |         except (TypeError, ValueError) as e:
105 |             logging.error("DB JSON serialize error: %s" % e)
106 |             return ''
107 |         else:
108 |             return json_data
109 | 
110 |     @staticmethod
111 |     def deserializeJSON(jsondata):
112 |         """
113 | 
114 |         :param jsondata: JSON encoded string
115 |         :return: Python data primitive or object
116 |         """
117 |         try:
118 |             data = json.loads(jsondata)
119 |         except (TypeError, ValueError) as e:
120 |             logging.error("DB JSON serialize error: %s" % e)
121 |             return ''
122 |         else:
123 |             return data
124 | 
125 | 
class JobDb(NabuDb):
    """Records which sample paths have been processed under which job name."""

    table = "jobs"
    cols = ["job_name text", "sample_path text"]

    def get_completed(self, job_name):
        """Return the set of sample paths recorded for *job_name*."""
        found = self.query("select sample_path from %s where job_name=?" % self.table, (job_name,))
        return {record[0] for record in found}

    def mark_complete(self, job_name, sample):
        """Insert a (job_name, sample) row and return the result of query()."""
        statement = "insert into %s values(?, ?)" % self.table
        return self.query(statement, (job_name, sample))
140 | 
141 | 
class XmlDb(NabuDb):
    """Stores one XML string per pdf_id."""

    table = "xml"
    cols = ["pdf_id primary key", "xml"]

    def save(self, pdf_id, xml_str):
        """Insert or replace the XML stored for *pdf_id*; return the result of query()."""
        statement = "insert or replace into %s values(?, ?)" % self.table
        return self.query(statement, (pdf_id, xml_str))

    def load(self, pdf_id):
        """Return the XML stored for *pdf_id*, or '' when it cannot be fetched."""
        found = self.query("select xml from %s where pdf_id=?" % self.table, (pdf_id,))
        try:
            return found[0][0]
        except Exception:
            return ''
159 | 
160 | 
class PdfDb(NabuDb):
    """Stores the graph, JavaScript and feature vector of each parsed PDF."""

    table = "pdfs"
    cols = ["pdf_id primary key", "v_md5", "e_md5", "vertices", "edges", "js", "features"]

    def save(self, pdf):
        """
        Insert or replace the row for *pdf*.

        Vertices, edges and features are pickled; the JavaScript is JSON
        encoded; the two hashes are taken over the pickled vertices and edges.

        :param pdf: pdf.PDF object that has been parsed
        :type pdf: process.pdf.PDF
        :return: the result of query()
        """
        vertices = self.serialize(pdf.v)
        vertices_hash = get_hash(vertices)
        edges = self.serialize(pdf.e)
        edges_hash = get_hash(edges)
        features = self.serialize(pdf.ftr_vec)
        javascript = self.serializeJSON(pdf.get_javascript())
        record = (pdf.name, vertices_hash, edges_hash, vertices, edges, javascript, features)
        return self.query("insert or replace into %s values(?, ?, ?, ?, ?, ?, ?)" % self.table, record)

    def load_family_features(self, edge_md5):
        """
        Return ``(pdf_id, features)`` of one PDF whose edge hash is *edge_md5*.

        Both values are '' when no row matches.
        """
        found = self.query("select pdf_id, features from %s where e_md5=? limit 1" % self.table, (edge_md5,))
        if not found:
            return '', ''
        pdf_id, stored = found[0]
        return pdf_id, self.deserialize(stored)

    def load_pdf_graph(self, pdf):
        """
        Load the stored graph of the PDF with id *pdf*.

        :return: tuple ``(pdf_id, v_md5, e_md5, vertices, edges, features)``,
                 or a list of six '' when the id is unknown
        """
        found = self.query(
            "select pdf_id, v_md5, e_md5, vertices, edges, features from %s where pdf_id=?" % self.table,
            (pdf,))
        if not found:
            logging.debug("PDF not found: %s" % pdf)
            return [''] * 6
        graph_md5, v_md5, e_md5, raw_v, raw_e, raw_f = found[0]
        vertices = self.deserialize(raw_v)
        edges = self.deserialize(raw_e)
        features = self.deserialize(raw_f)
        return graph_md5, v_md5, e_md5, vertices, edges, features

    def chunk(self, limit, offset):
        """
        Return up to *limit* rows starting at *offset* as
        ``[pdf_id, vertices, edges]`` lists with the graph parts unpickled.
        """
        found = self.query(
            "select pdf_id, vertices, edges from %s limit %d offset %d" % (self.table, limit, offset), ())
        return [[pdf_id, self.deserialize(raw_v), self.deserialize(raw_e)]
                for pdf_id, raw_v, raw_e in found]
212 | 
if __name__ == "__main__":
    # Usage: dbgw.py <database>; prints "pdf_id,features" for one PDF per edge hash.
    if len(sys.argv) < 2:
        sys.stdout.write("Need database\n")
        sys.exit(1)

    gdb = PdfDb(sys.argv[1])
    if not gdb.init(gdb.table, gdb.cols):
        sys.stdout.write("Database error\n")
        sys.exit(1)

    for (family,) in ((row[0],) for row in gdb.get_unique('e_md5')):
        sys.stdout.write("%s,%s\n" % gdb.load_family_features(family))
226 | 


--------------------------------------------------------------------------------
/storage/storage.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | import sys
 15 | 
 16 | import psycopg2
 17 | from db_mgmt import DBGateway
 18 | 
# Table name DbStorage passes to the gateway for create/insert/count calls.
TABLE = 'parsed_pdfs'
# Column DbStorage.open passes as the primary key when creating the table.
PRIMARY = 'pdf_md5'
# Ordered column names. Storage.align_kwargs emits one value per entry, in this
# order, and the backends build their CREATE/INSERT column lists from it.
COLUMNS = ('category',
           'pdf_md5',
           'tree_md5',
           'tree',
           'graph',
           'obf_js',
           'obf_js_sdhash',
           'de_js',
           'de_js_sdhash',
           'swf',
           'swf_sdhash',
           'abc',
           'abc_sdhash',
           'actionscript',
           'as_sdhash',
           'shellcode',
           'fsize',
           'pdfsize',
           'bin_blob',
           'urls',
           'malformed',
           'errors')
 43 | 
 44 | 
 45 | class StorageFactory(object):
 46 |     def new_storage(self, typ, name='500k-test', user='honey'):
 47 |         if typ == 'stdout':
 48 |             return StdoutStorage()
 49 |         if typ == 'sqlite3':
 50 |             return DbStorage(name)
 51 |         if typ == 'postgres':
 52 |             return PostgresStorage(name, user)
 53 |         if typ == 'neo4j':
 54 |             return NeoStorage()
 55 |         if typ == 'file':
 56 |             return FileStorage(name + '.txt')
 57 | 
 58 | 
 59 | class Storage(object):
 60 |     def __init__(self):
 61 |         pass
 62 | 
 63 |     def open(self):
 64 |         return False
 65 | 
 66 |     def store(self):
 67 |         pass
 68 | 
 69 |     def close(self):
 70 |         pass
 71 | 
 72 |     def align_kwargs(self, data):
 73 |         aligned = []
 74 |         for col in COLUMNS:
 75 |             aligned.append(data.get(col, ''))
 76 |         return tuple(aligned)
 77 | 
 78 | 
 79 | class PostgresStorage(Storage):
 80 |     insert = "INSERT INTO parsed_pdfs (%s) VALUES (%s)"
 81 |     create = "CREATE TABLE IF NOT EXISTS parsed_pdfs (rowid serial, %s TEXT, PRIMARY KEY (rowid, pdf_md5))"
 82 | 
 83 |     def __init__(self, dbname, user, pw=''):
 84 |         self.dbname = dbname
 85 |         self.user = user
 86 |         self.pw = pw
 87 |         ccols = ' TEXT, '.join(COLUMNS)
 88 |         icols = ', '.join(COLUMNS)
 89 |         markers = ', '.join(['%s' for x in COLUMNS])
 90 |         self.create = self.create % (ccols)
 91 |         self.insert = self.insert % (icols, markers)
 92 | 
 93 |     def open(self):
 94 |         try:
 95 |             self.conn = psycopg2.connect(database=self.dbname, user=self.user, password=self.pw)
 96 |             cur = self.conn.cursor()
 97 |             cur.execute(self.create)
 98 |             self.conn.commit()
 99 |         except Exception as e:
100 |             sys.stderr.write("Postgres Connect Error\t%s\n" % (str(e)))
101 |             sys.stderr.flush()
102 |             return False
103 |         else:
104 |             cur.close()
105 |             return True
106 | 
107 |     def store(self, data_dict):
108 |         data_tuple = self.align_kwargs(data_dict)
109 |         try:
110 |             cur = self.conn.cursor()
111 |             cur.execute(self.insert, data_tuple)
112 |         except Exception as e:
113 |             self.conn.rollback()
114 |         else:
115 |             self.conn.commit()
116 |             cur.close()
117 | 
118 |     def close(self):
119 |         self.conn.commit()
120 |         self.conn.close()
121 | 
122 | 
class NeoStorage(Storage):
    """Placeholder backend that StorageFactory returns for 'neo4j'; it adds nothing to Storage."""
    pass
125 | 
126 | 
class StdoutStorage(Storage):
    """Placeholder backend that StorageFactory returns for 'stdout'; it adds nothing to Storage."""
    pass
129 | 
130 | 
class DbStorage(Storage):
    """Storage backend that keeps records in a database reached through DBGateway."""

    def __init__(self, db=''):
        """
        :param db: database name; '.sqlite' is appended to form the gateway path
        """
        self.db = DBGateway(db + '.sqlite')

    def open(self):
        """
        Create the records table with one TEXT column per COLUMNS entry.

        :return: True when the gateway call completes, False if it raises
        """
        column_defs = [col + ' TEXT' for col in COLUMNS]
        try:
            self.db.create_table(TABLE, cols=column_defs, primary=PRIMARY)
        except Exception:
            return False
        return True

    def store(self, data_dict):
        """
        Insert one record; when that fails, insert a row that carries only the
        pdf_md5 and the gateway's error text instead.

        :param data_dict: mapping of column name to value
        """
        values = self.align_kwargs(data_dict)
        if self.db.insert(TABLE, cols=COLUMNS, vals=values):
            return
        md5 = data_dict.get('pdf_md5')
        message = 'DB_ERROR: %s' % self.db.get_error()
        self.db.insert(TABLE, cols=['pdf_md5', 'errors'], vals=(md5, message))

    def close(self):
        """Disconnect the gateway."""
        self.db.disconnect()

    def contains(self, key, val):
        """
        :param key: column name to match on
        :param val: value to look for
        :return: whatever the gateway's count() returns for that column/value
        """
        return self.db.count(TABLE, key, val)
154 | 
155 | 
class FileStorage(Storage):
    """Storage backend that appends each record as plain text to one file."""

    def __init__(self, path):
        """
        Remember the output path and check that the file can be created.

        The file is opened for writing (which truncates it) and closed again
        immediately; the process exits with status 1 when that fails.

        :param path: output file path
        """
        self.path = path
        try:
            self.fd = open(path, 'wb')
        except IOError as e:
            print(e)
            print('Unable to create output. Exiting.')
            sys.exit(1)
        else:
            self.fd.close()

    def open(self):
        """
        Open the output file for writing.

        :return: True on success, False when the file cannot be opened
        """
        try:
            self.fd = open(self.path, 'wb')
        except IOError:
            return False
        else:
            return True

    def store(self, data_dict):
        """
        Write one record: a ruled header carrying the pdf_md5, then every
        key/value pair as "__key" followed by the value, then a blank line.

        Any I/O error is reported on stderr and ends the process with a
        non-zero status so the failure is visible to the caller's shell.

        :param data_dict: mapping of field name to the text to write
        """
        header = '%s\n%s\n%s\n' % ('-' * 80, data_dict.get('pdf_md5', 'N/A'), '-' * 80)
        footer = '\n'

        try:
            self.fd.write(header)
            for k, v in data_dict.items():
                self.fd.write("__%s\n" % k)
                self.fd.write(v)
                self.fd.write("\n\n")
            self.fd.write(footer)
        except IOError as e:
            sys.stderr.write("FileStorage store IO error: %s\n" % e)
            sys.exit(1)

    def close(self):
        """Close the output file."""
        self.fd.close()
194 | 
195 | 
if __name__ == '__main__':
    # Smoke test: build every backend the factory knows and report whether it
    # opens. The names must match the ones StorageFactory.new_storage checks
    # for; an unrecognised name yields None, which is reported instead of
    # being dereferenced.
    tests = ['stdout', 'file', 'sqlite3', 'postgres', 'neo4j']
    for test in tests:
        storage = StorageFactory().new_storage(test, "500k-test")
        if storage is None:
            print("%s\tunsupported storage type" % test)
            continue
        print("%s.open()\t%s" % (test, storage.open()))
201 | 


--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------
/util/huntterp.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | 
 15 | import sys
 16 | import os
 17 | import re
 18 | 
 19 | '''
 20 | For testing run the module without arguments. (Can also be run on arbitrary files.)
 21 | 
 22 | '''
class Test(object):
    """
    Built-in self-test data. Each name in `tests` is used both as the search
    needle and as the attribute that holds the hex haystack to search.
    """
    tests = [ 'ftp', 'http' ]
    # Hex encoding of "ftp://google.com".
    ftp = "6674703a2f2f676f6f676c652e636f6d"
    # Hex encoding of "ftp://google.com" + "http://google.com" followed by byte f1.
    http = "6674703a2f2f676f6f676c652e636f6d687474703a2f2f676f6f676c652e636f6df1"
 27 | 
 28 | '''
 29 | This function makes no assumptions on the validity of the string values
 30 | '''
 31 | def ascii2hex(string):
 32 |     if isinstance(string, str):
 33 |         return ''.join( [hex(ord(c))[2:] for c in string] )
 34 |     else:
 35 |         return ''
 36 | '''
 37 | Convert a string from hex to ascii. Starting from the first position, and
 38 | stopping on the first invalid (not-printable) character or invalid input,
 39 | whichever comes first.
 40 | '''
 41 | def hex2ascii(string):
 42 |     letters = ''
 43 |     for idx in range(0, len(string), 2):
 44 |         try:
 45 |             c1 = string[idx]
 46 |             c2 = string[idx+1]
 47 |             i = int(c1+c2, 16)
 48 |             if i < 32 or i > 127:
 49 |                 break 
 50 |             ch = chr(i)
 51 |         except (ValueError, TypeError, IndexError):
 52 |             break
 53 |         else:
 54 |             letters += ch
 55 |     return letters
 56 | 
 57 | def get_unicode(h2):
 58 |     res = []
 59 |     res = re.findall('[\'\"]((%u[0-9a-f]{4})*)[\'\"]', h2)
 60 |     return res
 61 | 
 62 | '''
 63 | Return a list of strings found in the hexstring. Should not return overlapping
 64 | results. Needle is converted from ASCII to HEX on the first line.
 65 | '''
 66 | def find_in_hex(needle, hexstack):
 67 |     needle = ascii2hex(needle)
 68 |     results = []
 69 |     total = 0
 70 |     while True:
 71 |         idx = hexstack.find(needle)
 72 |         if idx < 0:
 73 |             break
 74 |         total += idx
 75 |         results.append((total, hex2ascii(hexstack[idx:])))
 76 |         hexstack = hexstack[idx + 1:]
 77 |         total += 1
 78 |     return results
 79 | 
 80 | def verify(vals, string):
 81 |     for val in vals:
 82 |         sys.stdout.write('Verifying [%s] @ [%d]...' % (val[1], val[0]))
 83 |         if string[val[0]:len(val[1])].startswith(hex2ascii(val[1])):
 84 |             sys.stdout.write('pass\n')
 85 |         else:
 86 |             sys.stdout.write('fail. string[%d]==[%s]...\n' % (val[0], val[1][val[0]:val[0]+32]))
 87 | 
 88 | '''
 89 | Return a list of urls found in the unicode string. Should not return overlapping
 90 | results. Needle is converted from ASCII to UNICODE on the first line.
 91 | '''
 92 | def find_unicode(needle, haystack):
 93 |     needle = ascii2uni(needle)
 94 |     results = []
 95 |     total = 0
 96 |     while True:
 97 |         idx = haystack.find(needle)
 98 |         if idx < 0:
 99 |             break
100 |         total += idx
101 |         quote_2 = haystack[idx:].find('"')
102 |         quote_1 = haystack[idx:].find('\'')
103 |         if quote_1 < quote_2 and quote_1 > -1:
104 |             quote = quote_1
105 |         else:
106 |             quote = quote_2
107 |         results.append((total, haystack[idx:idx+quote]))
108 |         haystack = haystack[idx+1:]
109 |         total += 1
110 |     res = []
111 |     for r in results:
112 |         res.append((r[0],uni2ascii(r[1])))
113 |     return res
114 | 
def ascii2uni(string):
    """
    Convert a string from ascii to %u escapes.

    The hex encoding of the input is consumed four digits at a time and each
    group is emitted as '%u' followed by its two digit pairs in swapped
    order. Digits left over after the last full group are dropped.

    :param string: ASCII text
    :return: the %u-escaped form
    """
    hex_text = ascii2hex(string)
    groups = re.findall('([0-9a-f]{2})([0-9a-f]{2})', hex_text)
    return ''.join(['%u' + second + first for first, second in groups])
125 | 
def uni2ascii(string):
    """
    Convert a string from %u escapes to ascii.

    Every '%u' marker is removed, the remaining text is consumed four hex
    digits at a time with the two digit pairs of each group swapped, and the
    result is decoded with hex2ascii.

    :param string: text containing %uXXXX escapes
    :return: the decoded printable text
    """
    stripped = re.sub("%u", "", string)
    groups = re.findall('([0-9a-f]{2})([0-9a-f]{2})', stripped)
    swapped = ''.join([second + first for first, second in groups])
    return hex2ascii(swapped)
136 | 
def main(h1, h2):
    """
    Find h1 in h2 | h1 == ASCII && h2 == HEX

    Prints the hits found by the hex search followed by those of the %u
    search, their count, and the verification outcome of each hit.

    :param h1: ASCII needle
    :param h2: text to search; anything but a str is reported and skipped
    """
    if not isinstance(h2, str):
        print('Invalid input: %s' % (type(h2),))
        print(str(h2))
        return

    print('Searching for "%s" in "%s"...' % (h1, h2[:32]))

    urls = find_in_hex(h1, h2) + find_unicode(h1, h2)
    print(urls)
    print('Found: %d occurrences' % len(urls))
    if urls:
        verify(urls, h2)
154 | 
155 |     
if __name__ == "__main__":
    # With "needle haystack-file" arguments, search that file; with missing
    # arguments, fall back to the built-in Test data.
    try:
        needle = sys.argv[1]
        fin = open(sys.argv[2], 'r')
    except IndexError:
        print('Invalid or no arguments. Usage: huntterp.py needle haystack.txt')
        print('Beginning tests')
        t = Test()
        for needle in t.tests:
            haystack = getattr(t, needle)
            main(needle, haystack)
    except IOError as e:
        print(e)
    else:
        # Make sure the input file is closed even if the search fails.
        try:
            main(needle, fin.read())
        finally:
            fin.close()
171 | 


--------------------------------------------------------------------------------
/util/str_utils.py:
--------------------------------------------------------------------------------
  1 | #  Copyright 2011-2015 by Carnegie Mellon University
  2 | #
  3 | #  NO WARRANTY
  4 | #
  5 | #  THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING INSTITUTE
  6 | #  MATERIAL IS FURNISHED ON AN "AS-IS" BASIS.  CARNEGIE MELLON
  7 | #  UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
  8 | #  IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY
  9 | #  OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
 10 | #  OBTAINED FROM USE OF THE MATERIAL.  CARNEGIE MELLON UNIVERSITY
 11 | #  DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM
 12 | #  FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 13 | 
 14 | 
 15 | import hashlib
 16 | import re
 17 | 
 18 | import htmlentitydefs
 19 | 
 20 | ENTITY_PAT = re.compile(r'&#?\w+')
 21 | 
 22 | 
 23 | def check_decoding(string):
 24 |     rv = ""
 25 |     try:
 26 |         rv = string.decode("ascii")
 27 |     except UnicodeDecodeError:
 28 |         rv = string.encode("base64")
 29 |     except AttributeError:
 30 |         pass
 31 |     finally:
 32 |         return rv
 33 | 
 34 | 
 35 | def prettify_dict(dic):
 36 |     rv = ""
 37 |     for k, v in dic.items():
 38 |         k = check_decoding(k)
 39 |         if not k:
 40 |             k = "key_print_error"
 41 |         v = check_decoding(v)
 42 |         if not v:
 43 |             v = "value_print_error"
 44 |         rv += '%s="%s" ' % (k, v)
 45 |     return rv
 46 | 
 47 | 
 48 | def unescapeHTMLEntities(text):
 49 |     """
 50 |         Removes HTML or XML character references and entities from a text string.
 51 |         @param text The HTML (or XML) source text.
 52 |         @return The plain text, as a Unicode string, if necessary.
 53 |         Author: Fredrik Lundh
 54 |         Source: http://effbot.org/zone/re-sub.htm#unescape-html
 55 |     """
 56 |     def fixup(m):
 57 |         text = m.group(0)
 58 |         if text[:2] == '&#':
 59 |             # character reference
 60 |             try:
 61 |                 if text[:3] == '&#x':
 62 |                     return unichr(int(text[3:-1], 16))
 63 |                 else:
 64 |                     return unichr(int(text[2:-1]))
 65 |             except ValueError:
 66 |                 pass
 67 |         else:
 68 |             # named entity
 69 |             try:
 70 |                 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 71 |             except KeyError:
 72 |                 pass
 73 |         return text # leave as is
 74 |     #return str(re.sub('&#?\w+;', fixup, text))
 75 |     return ENTITY_PAT.sub(fixup, text)
 76 | 
 77 | 
 78 | def get_hash(data):
 79 |     """
 80 | 
 81 |     :param data: Any byte or 2.X str
 82 |     :return: the MD5 hash of the data as a hexidecimal string
 83 |     :rtype: str
 84 |     """
 85 |     md5 = hashlib.md5()
 86 |     md5.update(data)
 87 |     return md5.hexdigest()
 88 | 
 89 | 
 90 | def isFlash (content):
 91 |     """
 92 |     Check for swf content in a string by searching for CWS or FWS
 93 |     in the first three characters
 94 | 
 95 |     :param content: String from PDF stream that needs to be checked for flash
 96 |     :type content: str
 97 |     :return: Whether or not the string starts with a valid flash signature
 98 |     :rtype: bool
 99 |     """
100 |     content = unescapeHTMLEntities(content)
101 |     return content.startswith("CWS") or content.startswith("FWS")
102 | 
103 | 
def getJavascript(content):
    """
    Decide whether a string holds Javascript and return the code if it does.

    The bodies of <script contentType="application/x-javascript"> elements are
    returned when present. Otherwise the whole string is returned if it has no
    unexpected control or non-ASCII characters, contains each of ';', '(' and
    ')', and shows enough typical Javascript tokens.

    :param content: A string
    :type content: str
    :return: A string of suspected Javascript or an empty string if none.
    :rtype: str
    """
    reJSscript = '<script[^>]*?contentType\s*?=\s*?[\'"]application/x-javascript[\'"][^>]*?>(.*?)</script>'
    JSStrings = ['var ', ';', ')', '(', 'function ', '=', '{', '}', 'if ', 'else', 'return', 'while ', 'for ', ',',
                 'eval', 'unescape', '.replace']
    keyStrings = [';', '(', ')']
    allowedControls = ['\n', '\r', '\t', '\f', '\x00']
    limit = 15
    minDistinctStringsFound = 5

    try:
        content = unescapeHTMLEntities(content)
    except UnicodeDecodeError:
        content = unescapeHTMLEntities(content.decode("latin1", errors="xmlcharrefreplace"))

    scripts = re.findall(reJSscript, content, re.DOTALL | re.IGNORECASE)
    if scripts:
        return "\n".join(scripts)

    # Anything with stray control characters or non-ASCII is treated as not code.
    for char in content:
        code = ord(char)
        if code >= 127:
            return ''
        if code < 32 and char not in allowedControls:
            return ''

    totalHits = 0
    distinctHits = 0
    for token in JSStrings:
        occurrences = content.count(token)
        if occurrences == 0:
            # A missing mandatory token rules the content out immediately.
            if token in keyStrings:
                return ''
            continue
        totalHits += occurrences
        distinctHits += 1

    if totalHits > limit and distinctHits >= minDistinctStringsFound:
        return content
    return ''
147 | 


--------------------------------------------------------------------------------
/xml-output/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'sei-mappel'
2 | 


--------------------------------------------------------------------------------