├── .gitignore ├── LICENSE ├── README.md ├── officeparser.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .dump 2 | *.pyc 3 | *.xls* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 John William Davison 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # officeparser.py 2 | 3 | officeparser.py is a Python script that parses the format of OLE compound documents used by Microsoft Office applications. 4 | 5 | Some useful features of this script include: 6 | * macro extraction 7 | * embedded file extraction 8 | * format analysis 9 | 10 | Execute the script with the --help option to view all available options. 11 | 12 | ## useful options 13 | 14 | __--extract-macros__ 15 | 16 | This extracts all macro code that can be detected in an Office document. Use the -o or --output-dir option to specify the directory to store the files.
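For example, the following invocation is a minimal sketch of macro extraction (`suspicious.doc` and `extracted_macros` are placeholder names, not part of the project):

```
python officeparser.py --extract-macros -o extracted_macros suspicious.doc
```

Each recovered module is saved into the output directory under its VBA module name, with a .bas, .cls or .frm extension depending on the module type.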
17 | 18 | -------------------------------------------------------------------------------- /officeparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # CHANGELOG: 4 | # 2023-08-12: - VBA: Added check for optional PROJECTCOMPATVERSION Record (MS OVBA 2.3.4.2.1.2 Revision 11.0) 5 | # - VBA: fixed infinite loop if REFERENCE record is corrupt 6 | 7 | import sys 8 | from struct import unpack 9 | from optparse import OptionParser 10 | from cStringIO import StringIO 11 | import logging 12 | import re 13 | import os 14 | import zipfile 15 | import tempfile 16 | 17 | OLE_SIGNATURE = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" 18 | DIFSECT = 0xFFFFFFFC; 19 | FATSECT = 0xFFFFFFFD; 20 | ENDOFCHAIN = 0xFFFFFFFE; 21 | FREESECT = 0xFFFFFFFF; 22 | 23 | MODULE_EXTENSION = "bas" 24 | CLASS_EXTENSION = "cls" 25 | FORM_EXTENSION = "frm" 26 | 27 | BINFILE_NAME = "/vbaProject.bin" 28 | 29 | def fat_value_to_str(value): 30 | if value == DIFSECT: 31 | return '0xFFFFFFFC (DIF)' 32 | elif value == FATSECT: 33 | return '0xFFFFFFFD (FAT)' 34 | elif value == ENDOFCHAIN: 35 | return '0xFFFFFFFE (EOC)' 36 | elif value == FREESECT: 37 | return '0xFFFFFFFF (FREE)' 38 | else: 39 | return '{0:08X} (PTR)'.format(value) 40 | 41 | def copytoken_help(difference): 42 | from math import ceil, log 43 | bit_count = int(ceil(log(difference, 2))) 44 | bit_count = max([bit_count, 4]) 45 | length_mask = 0xFFFF >> bit_count 46 | offset_mask = ~length_mask 47 | maximum_length = (0xFFFF >> bit_count) + 3 48 | return length_mask, offset_mask, bit_count, maximum_length 49 | 50 | def decompress_stream(compressed_container): 51 | # MS-OVBA 52 | # 2.4.1.2 53 | decompressed_container = '' # result 54 | compressed_current = 0 55 | compressed_chunk_start = None 56 | decompressed_chunk_start = None 57 | 58 | sig_byte = ord(compressed_container[compressed_current]) 59 | if sig_byte != 0x01: 60 | logging.error('invalid signature byte {0:02X}'.format(sig_byte)) 61 | return None 62 | 63 | compressed_current += 1 64 | 65 | while compressed_current < len(compressed_container): 66 | # 2.4.1.1.5 67 | compressed_chunk_start = compressed_current 68 | compressed_chunk_header = unpack("> 15 # 1 == compressed, 0 == uncompressed 72 | 73 | if chunk_is_compressed != 0 and chunk_size > 4095: 74 | logging.warning('CompressedChunkSize > 4095 but CompressedChunkFlag == 1') 75 | if chunk_is_compressed == 0 and chunk_size != 4095: 76 | logging.warning('CompressedChunkSize != 4095 but CompressedChunkFlag == 0') 77 | #if chunk_sign != 0b0110: 78 | #logging.warning('invalid CompressedChunkSignature') 79 | 80 | logging.debug("chunk size = {0}".format(chunk_size)) 81 | 82 | compressed_end = min([len(compressed_container), compressed_current + chunk_size]) 83 | compressed_current += 2 84 | 85 | if chunk_is_compressed == 0: # uncompressed 86 | decompressed_container += compressed_container[compressed_current:compressed_current + 4096] 87 | compressed_current += 4096 88 | continue 89 | 90 | decompressed_chunk_start = len(decompressed_container) 91 | while compressed_current < compressed_end: 92 | flag_byte = ord(compressed_container[compressed_current]) 93 | compressed_current += 1 94 | for bit_index in xrange(0, 8): 95 | if compressed_current >= compressed_end: 96 | break 97 | if (1 << bit_index) & flag_byte == 0: # LiteralToken 98 | decompressed_container += compressed_container[compressed_current] 99 | compressed_current += 1 100 | continue 101 | 102 | # 103 | # copy tokens 104 | # 105 | 106 | copy_token = 
unpack("> temp2) + 1 112 | copy_source = len(decompressed_container) - offset 113 | for index in xrange(copy_source, copy_source + length): 114 | decompressed_container += decompressed_container[index] 115 | compressed_current += 2 116 | 117 | return decompressed_container 118 | 119 | class ParserOptions: 120 | def __init__( 121 | self, 122 | fail_on_invalid_sig=False): 123 | self.fail_on_invalid_sig = fail_on_invalid_sig 124 | 125 | class CompoundBinaryFile: 126 | def __init__(self, file, parser_options=None): 127 | self.file = file 128 | 129 | self.f = open(self.file, 'rb') 130 | sig = self.f.read(8) 131 | is_ole_document = sig == OLE_SIGNATURE 132 | if not is_ole_document and parser_options.fail_on_invalid_sig: 133 | logging.warning('invalid OLE signature (not an office document?)') 134 | sys.exit(1) 135 | self.f.close() 136 | 137 | # if the file is a zipfile, extract the binary part to a tempfile and continue, 138 | # otherwise, proceed as if a real binary file. 139 | if not is_ole_document and zipfile.is_zipfile(self.file): 140 | zfile = zipfile.ZipFile(self.file, "r") 141 | for name in zfile.namelist(): 142 | if name.endswith(BINFILE_NAME): 143 | data = zfile.read(name) 144 | self.f = tempfile.TemporaryFile() 145 | self.f.write(data) 146 | self.f.seek(0) # rewind the data file to the beginning 147 | else: 148 | self.f = open(self.file, 'rb') 149 | 150 | # load the header 151 | self.header = Header(self.f.read(512), parser_options) 152 | self.sector_size = 2 ** self.header._uSectorShift 153 | self.mini_sector_size = 2 ** self.header._uMiniSectorShift 154 | 155 | # get a sector count 156 | if (os.path.getsize(file) - 512) % self.sector_size != 0: 157 | logging.warning("last sector has invalid size") 158 | 159 | self.sector_count = int((os.path.getsize(file) - 512) / self.sector_size) 160 | logging.debug("sector count = {0}".format(self.sector_size)) 161 | logging.debug('sector size = {0}'.format(self.sector_size)) 162 | logging.debug('mini sector size = {0}'.format(self.mini_sector_size)) 163 | 164 | # load the sectors marked as FAT 165 | self.fat_sectors = [] 166 | for fat_sect in self.header._sectFat: 167 | if fat_sect != FREESECT: 168 | self.fat_sectors.append(fat_sect) 169 | 170 | # load any DIF sectors 171 | sector = self.header._sectDifStart 172 | buffer = [sector] 173 | # NOTE I've seen this have an initial value of FREESECT -- not sure why 174 | while sector != FREESECT and sector != ENDOFCHAIN: 175 | data = self.read_sector(sector) 176 | dif_values = [x for x in unpack('<{0}L'.format(self.sector_size / 4), data)] 177 | # the last entry is actually a pointer to next DIF 178 | next = dif_values.pop() 179 | for value in dif_values: 180 | if value != FREESECT: 181 | self.fat_sectors.append(value) 182 | if next in buffer: 183 | logging.error('infinite loop detected at {0} to {1} starting at DIF'.format(sector, next)) 184 | break 185 | buffer.append(next) 186 | sector = next 187 | 188 | # load the FAT 189 | self.fat = [] 190 | for fat_sect in self.fat_sectors: 191 | data = self.read_sector(fat_sect) 192 | if len(data) != self.sector_size: 193 | logging.error('broken FAT (invalid sector size {0} != {1})'.format(len(data), self.sector_size)) 194 | else: 195 | for value in unpack('<{0}L'.format(self.sector_size / 4), data): 196 | self.fat.append(value) 197 | 198 | # get the list of directory sectors 199 | self.directory = [] 200 | buffer = self.read_chain(self.header._sectDirStart) 201 | directory_index = 0 202 | for chunk in unpack("128s" * (len(buffer) / 128), buffer): 203 | 
self.directory.append(Directory(chunk, directory_index)) 204 | directory_index += 1 205 | 206 | # load the ministream 207 | self.minifat = [] 208 | if self.directory[0]._sectStart != ENDOFCHAIN: 209 | self.ministream = self.read_chain(self.directory[0]._sectStart) 210 | #logging.debug("mini stream specified size = {0}".format(self.directory[0]._ulSize)) 211 | if len(self.ministream) < self.directory[0]._ulSize: 212 | logging.warning("specified size is larger than actual stream length {0}".format(len(self.ministream))) 213 | self.ministream = self.ministream[0:self.directory[0]._ulSize] 214 | 215 | # 2.3 The locations for MiniFat sectors are stored in a standard 216 | # chain in the Fat, with the beginning of the chain stored in the 217 | # header. 218 | 219 | data = StringIO(self.read_chain(self.header._sectMiniFatStart)) 220 | while True: 221 | chunk = data.read(self.sector_size) 222 | if chunk == '': 223 | break 224 | if len(chunk) != self.sector_size: 225 | logging.warning("encountered EOF while parsing minifat") 226 | continue 227 | for value in unpack('<{0}L'.format(self.sector_size / 4), chunk): 228 | self.minifat.append(value) 229 | 230 | def read_sector(self, sector): 231 | if sector >= self.sector_count: 232 | logging.warning("reference to invalid sector {0:04X} ({0})".format(sector)) 233 | self.f.seek(512 + (self.sector_size * sector)) 234 | return self.f.read(self.sector_size) 235 | 236 | def read_mini_sector(self, sector): 237 | offset = sector * self.mini_sector_size 238 | return self.ministream[offset:offset + self.mini_sector_size] 239 | 240 | def read_fat(self, sector): 241 | return self.fat[sector] 242 | 243 | def read_mini_fat(self, sector): 244 | return self.minifat[sector] 245 | 246 | def __impl_read_chain(self, start, read_sector_f, read_fat_f): 247 | """Returns the entire contents of a chain starting at the given sector.""" 248 | sector = start 249 | check = [ sector ] # keep a list of sectors we've already read 250 | buffer = StringIO() 251 | while sector != ENDOFCHAIN: 252 | buffer.write(read_sector_f(sector)) 253 | next = read_fat_f(sector) 254 | if next in check: 255 | logging.error('infinite loop detected at {0} to {1} starting at {2}'.format( 256 | sector, next, sector_start)) 257 | return buffer.getvalue() 258 | check.append(next) 259 | sector = next 260 | return buffer.getvalue() 261 | 262 | def read_mini_chain(self, sector_start): 263 | return self.__impl_read_chain(sector_start, self.read_mini_sector, self.read_mini_fat) 264 | 265 | def read_chain(self, sector_start): 266 | return self.__impl_read_chain(sector_start, self.read_sector, self.read_fat) 267 | 268 | def print_fat_sectors(self): 269 | for sector in self.fat_sectors: 270 | print '{0:08X}'.format(sector) 271 | 272 | def get_stream(self, index): 273 | d = self.directory[index] 274 | if d._ulSize < self.header._ulMiniSectorCutoff: 275 | data = self.read_mini_chain(d._sectStart) 276 | else: 277 | data = self.read_chain(d._sectStart) 278 | data = data[0:d._ulSize] 279 | return data 280 | 281 | def find_stream_by_name(self, name): 282 | for d in self.directory: 283 | if d.name == name: 284 | return d 285 | return None 286 | 287 | # TODO newer office documents can have 4096 byte headers 288 | class Header: 289 | def __init__(self, data, parser_options=None): 290 | # sanity checks 291 | if len(data) < 512: 292 | logging.warning('document is less than 512 bytes') 293 | 294 | self.data = data 295 | self.header = unpack("<8s16sHHHHHHLLLLLLLLLL109L", data) 296 | self._abSig = self.header[0] 297 | self._clid = 
self.header[1] 298 | self._uMinorVersion = self.header[2] 299 | self._uDllVersion = self.header[3] 300 | self._uByteOrder = self.header[4] 301 | self._uSectorShift = self.header[5] 302 | self._uMiniSectorShift = self.header[6] 303 | self._usReserved = self.header[7] 304 | self._usReserved1 = self.header[8] 305 | self._usReserved2 = self.header[9] 306 | self._csectFat = self.header[10] # number of sects in FAT chain 307 | self._sectDirStart = self.header[11] # first sect in Directory chain 308 | self._signature = self.header[12] 309 | self._ulMiniSectorCutoff = self.header[13] 310 | self._sectMiniFatStart = self.header[14] # first mini-FAT sect 311 | self._csectMiniFat = self.header[15] # number of sects in mini-FAT 312 | self._sectDifStart = self.header[16] # first sect in DIF chain 313 | self._csectDif = self.header[17] # number of sets in DIF chain 314 | self._sectFat = self.header[18:] # sects of first 109 FAT sectors 315 | 316 | def pretty_print(self): 317 | print """HEADER DUMP 318 | _abSig = {0} 319 | _clid = {1} 320 | _uMinorVersion = {2} 321 | _uDllVersion = {3} 322 | _uByteOrder = {4} 323 | _uSectorShift = {5} 324 | _uMiniSectorShift = {6} 325 | _usReserved = {7} 326 | _usReserved1 = {8} 327 | _usReserved2 = {9} 328 | _csectFat = {10} 329 | _sectDirStart = {11} 330 | _signature = {12} 331 | _ulMiniSectorCutoff = {13} 332 | _sectMiniFatStart = {14} 333 | _csectMiniFat = {15} 334 | _sectDifStart = {16} 335 | _csectDif = {17}""".format( 336 | ' '.join(['{0:02X}'.format(ord(x)) for x in self._abSig]), 337 | ' '.join(['{0:02X}'.format(ord(x)) for x in self._clid]), 338 | '{0:04X}'.format(self._uMinorVersion), 339 | '{0}'.format(self._uDllVersion), 340 | '{0:04X}'.format(self._uByteOrder), 341 | '{0} ({1} bytes)'.format(self._uSectorShift, 2 ** self._uSectorShift), 342 | '{0} ({1} bytes)'.format(self._uMiniSectorShift, 343 | 2 ** self._uMiniSectorShift), 344 | '{0:04X}'.format(self._usReserved), 345 | '{0:08X}'.format(self._usReserved1), 346 | '{0:08X}'.format(self._usReserved2), 347 | '{0:08X}'.format(self._csectFat), 348 | '{0:08X}'.format(self._sectDirStart), 349 | '{0:08X}'.format(self._signature), 350 | '{0:08X} ({1} bytes)'.format(self._ulMiniSectorCutoff, 351 | self._ulMiniSectorCutoff), 352 | '{0:08X}'.format(self._sectMiniFatStart), 353 | '{0:08X}'.format(self._csectMiniFat), 354 | '{0:08X}'.format(self._sectDifStart), 355 | '{0:08X}'.format(self._csectDif)) 356 | 357 | for fat in self._sectFat: 358 | if fat != FREESECT: 359 | print '_sectFat = {0:08X}'.format(fat) 360 | 361 | STGTY_INVALID = 0 362 | STGTY_STORAGE = 1 363 | STGTY_STREAM = 2 364 | STGTY_LOCKBYTES = 3 365 | STGTY_PROPERTY = 4 366 | STGTY_ROOT = 5 367 | 368 | def stgty_to_str(value): 369 | if value == STGTY_INVALID: 370 | return "STGTY_INVALID" 371 | elif value == STGTY_STORAGE: 372 | return "STGTY_STORAGE" 373 | elif value == STGTY_STREAM: 374 | return "STGTY_STREAM" 375 | elif value == STGTY_LOCKBYTES: 376 | return "STGTY_LOCKBYTES" 377 | elif value == STGTY_PROPERTY: 378 | return "STGTY_PROPERTY" 379 | elif value == STGTY_ROOT: 380 | return "STGTY_ROOT" 381 | else: 382 | return "UNKNOWN VALUE {0}".format(value) 383 | 384 | DE_RED = 0 385 | DE_BLACK = 1 386 | 387 | def de_to_str(value): 388 | if value == DE_RED: 389 | return "DE_RED" 390 | elif value == DE_BLACK: 391 | return "DE_BLACK" 392 | else: 393 | return "UNKNOWN VALUE {0}".format(value) 394 | 395 | class Directory: 396 | def __init__(self, data, index): 397 | self.data = data 398 | self.index = index 399 | self.directory = unpack("<64sHbbLLL16sLQQLLHH", 
data) 400 | self._ab = self.directory[0] 401 | self._cb = self.directory[1] 402 | # convert wide chars into ASCII 403 | self.name = ''.join([x for x in self._ab[0:self._cb] if ord(x) != 0]) 404 | self._mse = self.directory[2] 405 | self._bflags = self.directory[3] 406 | self._sidLeftSib = self.directory[4] 407 | self._sidRightSib = self.directory[5] 408 | self._sidChild = self.directory[6] 409 | self._clsId = self.directory[7] 410 | self._dwUserFlags = self.directory[8] 411 | self._time = [ self.directory[9], self.directory[10] ] 412 | self._sectStart = self.directory[11] 413 | self._ulSize = self.directory[12] 414 | self._dptPropType = self.directory[13] 415 | # last two bytes are padding 416 | 417 | def pretty_print(self): 418 | print """ 419 | _ab = {0} 420 | _cb = {1} 421 | _mse = {2} 422 | _bflags = {3} 423 | _sidLeftSib = {4} 424 | _sidRightSib = {5} 425 | _sidChild = {6} 426 | _clsId = {7} 427 | _dwUserFlags = {8} 428 | _time[0] = {9} 429 | _time[1] = {10} 430 | _sectStart = {11} 431 | _ulSize = {12} 432 | _dptPropType = {13}""".format( 433 | "{0}\n {1}".format(self.name, 434 | ' '.join(['{0:02X}'.format(ord(x)) for x in self._ab[0:self._cb]])), 435 | #unicode(self._ab).encode('us-ascii', 'ignore'), 436 | '{0:04X}'.format(self._cb), 437 | stgty_to_str(self._mse), 438 | de_to_str(self._bflags), 439 | '{0:04X}'.format(self._sidLeftSib), 440 | '{0:04X}'.format(self._sidRightSib), 441 | '{0:04X}'.format(self._sidChild), 442 | ' '.join(['{0:02X}'.format(ord(x)) for x in self._clsId]), 443 | '{0:04X}'.format(self._dwUserFlags), 444 | '{0}'.format(self._time[0]), 445 | '{0}'.format(self._time[1]), 446 | '{0:08X}'.format(self._sectStart), 447 | '{0:08X} ({0} bytes)'.format(self._ulSize), 448 | '{0:04X}'.format(self._dptPropType)) 449 | 450 | def _main(): 451 | 452 | parser = OptionParser() 453 | 454 | parser.add_option('-l', '--log-level', dest='log_level', 455 | type='string', default='INFO', 456 | help='Sets logging level to DEBUG, INFO (default), WARNING or ERROR.') 457 | 458 | parser.add_option('-x', '--fail-on-invalid-signature', dest='fail_on_invalid_sig', 459 | action='store_true', default=False, 460 | help='Stop processing if the document is missing the required header signature.') 461 | 462 | parser.add_option('-H', "--print-header", dest="print_header", 463 | action="store_true", default=False, 464 | help="Print header section.") 465 | 466 | parser.add_option('-d', "--print-directory", dest="print_directory", 467 | action="store_true", default=False, 468 | help="Print directory structure.") 469 | 470 | parser.add_option('-f', "--print-fat", dest="print_fat", 471 | action="store_true", default=False, 472 | help="Print FAT structure.") 473 | 474 | parser.add_option('-m', "--print-mini-fat", dest="print_mini_fat", 475 | action="store_true", default=False, 476 | help="Print mini-FAT structure.") 477 | 478 | parser.add_option('-s', '--print-expected-file-size', dest='print_expected_file_size', 479 | action='store_true', default=False, 480 | help='Print the expected file size based on the number of FAT sectors and sector size.') 481 | 482 | parser.add_option('-t', "--print-streams", dest="print_streams", 483 | action="store_true", default=False, 484 | help="Print the index and names of the streams contained in the document.") 485 | 486 | parser.add_option('-i', "--print-invalid-fat-count", dest="print_invalid_fat_count", 487 | action="store_true", default=False, 488 | help="Prints the number of invalid FAT entries.") 489 | 490 | parser.add_option('--create-manifest', 
dest='create_manifest', 491 | action='store_true', default=False, 492 | help="Create a manifest file that contains a list of all created files.") 493 | 494 | parser.add_option('-o', '--output-dir', dest='output_dir', 495 | type='string', default='.', 496 | help="Directory to store all extracted files.") 497 | 498 | parser.add_option("--dump-sector", dest="dump_sector", 499 | type="int", default=None, 500 | help="Dump the contents of the given sector.") 501 | 502 | parser.add_option('--dump-stream', dest='dump_stream', 503 | type='int', default=None, 504 | help="Dump the contents of the given stream identified by directory index.") 505 | 506 | parser.add_option('--dump-stream-by-name', dest='dump_stream_by_name', 507 | type='string', default=None, 508 | help="Dump the contents of the given stream identified by name.") 509 | 510 | parser.add_option('--dump-ministream', dest='dump_ministream', 511 | action='store_true', default=False, 512 | help='Dump the entire contents of the ministream to standard output.') 513 | 514 | parser.add_option('--extract-streams', dest='extract_streams', 515 | action='store_true', default=False, 516 | help='Store all streams as the specified files. The string {0} in the file name is replaced with the directory index.') 517 | 518 | parser.add_option('--extract-ole-streams', dest='extract_ole_streams', 519 | action='store_true', default=False, 520 | help="Extract all Ole10Native streams.") 521 | 522 | parser.add_option('--extract-macros', dest='extract_macros', 523 | action='store_true', default=False, 524 | help='Extract all macros into .vbs files.') 525 | 526 | parser.add_option('--extract-unknown-sectors', dest='extract_unknown_sectors', 527 | action='store_true', default=False, 528 | help='Extract any sectors that are not represented in the FAT to unknown_sectors.dat.') 529 | 530 | parser.add_option('--check-stream-continuity', dest='check_stream_cont', 531 | action='store_true', default=False, 532 | help='Checks that sectors beloning to FAT chains are stored in sequential order.') 533 | 534 | parser.add_option('--check-fat', dest='check_fat', 535 | action='store_true', default=False, 536 | help='Checks for FAT values that point to sectors that do not exist.') 537 | 538 | parser.add_option('--check-orphaned-chains', dest='check_orphaned_chains', 539 | action='store_true', default=False, 540 | help='Checks for chains that are not accesible from any directory entry.') 541 | 542 | (options, args) = parser.parse_args() 543 | 544 | logging.basicConfig(format='%(levelname)s: %(message)s', 545 | level=logging.__dict__[options.log_level]) 546 | 547 | parser_options = ParserOptions( 548 | fail_on_invalid_sig=options.fail_on_invalid_sig) 549 | 550 | ofdoc = CompoundBinaryFile(args[0], parser_options) 551 | 552 | if options.create_manifest: 553 | manifest = open(os.path.join(options.output_dir, 'manifest'), 'wb') 554 | 555 | # 556 | # print options 557 | # 558 | if options.print_header: 559 | ofdoc.header.pretty_print() 560 | 561 | if options.print_directory: 562 | for x in xrange(0, len(ofdoc.directory)): 563 | print "Directory Index {0:08X} ({0})".format(x) 564 | ofdoc.directory[x].pretty_print() 565 | print 566 | 567 | if options.print_fat: 568 | for sector in xrange(0, len(ofdoc.fat)): 569 | print '{0:08X}: {1}'.format(sector, fat_value_to_str(ofdoc.fat[sector])) 570 | 571 | if options.print_mini_fat: 572 | for sector in xrange(0, len(ofdoc.minifat)): 573 | print '{0:08X}: {1}'.format(sector, fat_value_to_str(ofdoc.minifat[sector])) 574 | 575 | if 
options.print_streams: 576 | for d in ofdoc.directory: 577 | if d._mse == STGTY_STREAM: 578 | print '{0}: {1}'.format(d.index, d.name) 579 | 580 | if options.print_expected_file_size: 581 | expected_file_size = (len([x for x in ofdoc.fat if x != FREESECT]) * ofdoc.sector_size) + 512 582 | actual_file_size = os.path.getsize(args[0]) 583 | size_diff = abs(expected_file_size - actual_file_size) 584 | percent_diff = (float(size_diff) / float(expected_file_size)) * 100.0 585 | 586 | print "expected file size {0} actual {1} difference {2} ({3:0.2f}%)".format( 587 | expected_file_size, actual_file_size, size_diff, percent_diff) 588 | 589 | # 590 | # analysis options 591 | # 592 | if options.check_stream_cont: 593 | for d in ofdoc.directory[1:]: 594 | if d._mse == STGTY_INVALID: 595 | continue 596 | # ignore streams in the ministream 597 | if d._ulSize < ofdoc.header._ulMiniSectorCutoff: 598 | continue 599 | 600 | d.pretty_print() 601 | if d._sectStart != ENDOFCHAIN: 602 | current = d._sectStart 603 | while True: 604 | next = ofdoc.read_fat(current) 605 | logging.debug("next = {0:08X}".format(next)) 606 | if next == ENDOFCHAIN: 607 | break 608 | if next - current != 1: 609 | logging.warning('directory index {0} non-continuous at sector {1:08X} to {2:08X}'.format( 610 | d.index, current, next)) 611 | current = next 612 | 613 | invalid_fat_sectors = 0 614 | if options.check_fat or options.print_invalid_fat_count: 615 | for value in ofdoc.fat_sectors: 616 | if value > ofdoc.sector_count: 617 | invalid_fat_sectors += 1 618 | if options.check_fat: 619 | logging.warning('invalid FAT sector reference {0:08X}'.format(value)) 620 | 621 | if options.print_invalid_fat_count: 622 | print "invalid FAT sector references: {0}".format(invalid_fat_sectors) 623 | 624 | invalid_fat_entries = 0 625 | if options.check_fat or options.print_invalid_fat_count: 626 | for value in xrange(0, len(ofdoc.fat)): 627 | ptr = ofdoc.read_fat(value) 628 | if ptr == DIFSECT or ptr == FATSECT or ptr == ENDOFCHAIN or ptr == FREESECT: 629 | continue 630 | if ptr > len(ofdoc.fat): 631 | invalid_fat_entries += 1 632 | if options.check_fat: 633 | logging.warning('invalid FAT sector {0:08X} value {1:08X}'.format(value, ptr)) 634 | 635 | if options.print_invalid_fat_count: 636 | print "invalid FAT entries: {0}".format(invalid_fat_entries) 637 | 638 | if options.check_orphaned_chains: 639 | buffer = [False for fat in ofdoc.fat] 640 | # directory sectors 641 | index = ofdoc.header._sectDirStart 642 | while index != ENDOFCHAIN: 643 | buffer[index] = True 644 | index = ofdoc.read_fat(index) 645 | # minifat sectors 646 | index = ofdoc.header._sectMiniFatStart 647 | while index != ENDOFCHAIN: 648 | buffer[index] = True 649 | index = ofdoc.read_fat(index) 650 | # fat sectors specified in the header 651 | for index in ofdoc.header._sectFat: 652 | if index != FREESECT: 653 | buffer[index] = True 654 | # stream sectors 655 | for d in ofdoc.directory: 656 | if d._mse == STGTY_INVALID: 657 | continue 658 | # ignore streams in the ministream 659 | if d.index > 0 and d._ulSize < ofdoc.header._ulMiniSectorCutoff: 660 | continue 661 | 662 | index = d._sectStart 663 | while index != ENDOFCHAIN: 664 | #logging.debug('checking index {0:08X}'.format(index)) 665 | buffer[index] = True 666 | index = ofdoc.read_fat(index) 667 | 668 | for index in xrange(0, len(buffer)): 669 | #logging.debug('{0:08X} {1} {2}'.format(index, buffer[index], fat_value_to_str(ofdoc.read_fat(index)))) 670 | if ofdoc.read_fat(index) == FREESECT and buffer[index] == True: 671 | 
logging.warning('FREESECT is marked as used') 672 | if ofdoc.read_fat(index) != FREESECT and buffer[index] == False: 673 | logging.warning('non-FREESECT is not used') 674 | 675 | # 676 | # dump options 677 | # 678 | if options.dump_sector: 679 | sys.stdout.write(ofdoc.read_sector(options.dump_sector)) 680 | sys.exit(0) 681 | 682 | if options.dump_ministream: 683 | sys.stdout.write(ofdoc.ministream) 684 | sys.exit(0) 685 | 686 | if options.dump_stream: 687 | sys.stdout.write(ofdoc.get_stream(options.dump_stream)) 688 | sys.exit(0) 689 | 690 | if options.dump_stream_by_name: 691 | d = ofdoc.find_stream_by_name(options.dump_stream_by_name) 692 | sys.stdout.write(ofdoc.get_stream(d.index)) 693 | sys.exit(0) 694 | 695 | # 696 | # extraction options 697 | # 698 | if options.extract_ole_streams: 699 | for d in ofdoc.directory: 700 | if d.name == "\x01Ole10Native": 701 | data = ofdoc.get_stream(d.index) 702 | size = unpack(' 128: 911 | logging.error("PROJECTNAME_SizeOfProjectName value not in range: {0}".format(PROJECTNAME_SizeOfProjectName)) 912 | PROJECTNAME_ProjectName = dir_stream.read(PROJECTNAME_SizeOfProjectName) 913 | 914 | # PROJECTDOCSTRING Record 915 | PROJECTDOCSTRING_Id = unpack(" 2000: 919 | logging.error("PROJECTDOCSTRING_SizeOfDocString value not in range: {0}".format(PROJECTDOCSTRING_SizeOfDocString)) 920 | PROJECTDOCSTRING_DocString = dir_stream.read(PROJECTDOCSTRING_SizeOfDocString) 921 | PROJECTDOCSTRING_Reserved = unpack(" 260: 933 | logging.error("PROJECTHELPFILEPATH_SizeOfHelpFile1 value not in range: {0}".format(PROJECTHELPFILEPATH_SizeOfHelpFile1)) 934 | PROJECTHELPFILEPATH_HelpFile1 = dir_stream.read(PROJECTHELPFILEPATH_SizeOfHelpFile1) 935 | PROJECTHELPFILEPATH_Reserved = unpack(" 1015: 972 | logging.error("PROJECTCONSTANTS_SizeOfConstants value not in range: {0}".format(PROJECTCONSTANTS_SizeOfConstants)) 973 | PROJECTCONSTANTS_Constants = dir_stream.read(PROJECTCONSTANTS_SizeOfConstants) 974 | PROJECTCONSTANTS_Reserved = unpack(" 0: 1175 | code_data = decompress_stream(code_data) 1176 | # build filename 1177 | filext = code_modules[MODULENAME_ModuleName] 1178 | filename = os.path.join(options.output_dir, '{0}.{1}'.format(MODULENAME_ModuleName, filext)) 1179 | # if the file already exists, add a counter until it's unused: 1180 | counter = 1 1181 | while os.path.exists(filename): 1182 | logging.debug('Filename %s already exists' % filename) 1183 | filename = os.path.join(options.output_dir, '%s_%d.%s' % (MODULENAME_ModuleName, counter, filext)) 1184 | counter += 1 1185 | logging.info('Saving VBA code to %s' % filename) 1186 | f = open(filename, 'wb') 1187 | f.write(code_data) 1188 | f.close() 1189 | 1190 | if options.create_manifest: 1191 | manifest.write(os.path.basename(filename)) 1192 | manifest.write("\n") 1193 | 1194 | logging.debug('created file {0}'.format(filename)) 1195 | else: 1196 | logging.warning("module stream {0} has code data length 0".format(MODULESTREAMNAME_StreamName)) 1197 | break 1198 | 1199 | if options.extract_unknown_sectors: 1200 | i = 0 1201 | while True: 1202 | filename = os.path.join(options.output_dir, 'unknown_sectors_{0}.dat'.format(i)) 1203 | if not os.path.exists(filename): 1204 | break 1205 | i += 1 1206 | f_in = open(args[0], 'rb') 1207 | f_in.seek(512 + (len(ofdoc.fat) * ofdoc.sector_size)) 1208 | f_out = open(filename, 'wb') 1209 | f_out.write(f_in.read()) 1210 | f_out.close() 1211 | f_in.close() 1212 | 1213 | if options.create_manifest: 1214 | manifest.write(os.path.basename(filename)) 1215 | manifest.write("\n") 1216 | 1217 | 
logging.debug('created file {0} size = {1}'.format(filename, os.path.getsize(filename))) 1218 | logging.debug('header + fat allocation = {0}'.format(512 + (len(ofdoc.fat) * ofdoc.sector_size))) 1219 | logging.debug('file size = {0}'.format(os.path.getsize(args[0]))) 1220 | 1221 | if options.create_manifest: 1222 | manifest.close() 1223 | 1224 | if __name__ == '__main__': 1225 | _main() 1226 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """A setuptools based setup module. 5 | See: 6 | https://packaging.python.org/en/latest/distributing.html 7 | https://github.com/pypa/sampleproject 8 | """ 9 | 10 | 11 | # Always prefer setuptools over distutils 12 | from setuptools import setup 13 | # To use a consistent encoding 14 | from codecs import open 15 | from os import path 16 | 17 | __version__ = "1.0.1" 18 | description = "A python script that parses the format of OLE compound documents used by Microsoft Office applications." 19 | 20 | here = path.abspath(path.dirname(__file__)) 21 | 22 | # Get the long description from the README file 23 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 24 | long_description = f.read() 25 | 26 | setup( 27 | name='officeparser', 28 | 29 | # Versions should comply with PEP440. For a discussion on single-sourcing 30 | # the version across setup.py and the project code, see 31 | # https://packaging.python.org/en/latest/single_source_version.html 32 | version=__version__, 33 | 34 | description=description, 35 | long_description=long_description, 36 | 37 | # The project's main homepage. 38 | url='https://github.com/unixfreak0037/officeparser', 39 | 40 | # Author details 41 | author='John Davison', 42 | author_email='unixfreak0037@gmail.com', 43 | 44 | # Choose your license 45 | license='MIT', 46 | 47 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 48 | classifiers=[ 49 | # How mature is this project? Common values are 50 | # 3 - Alpha 51 | # 4 - Beta 52 | # 5 - Production/Stable 53 | 'Development Status :: 5 - Production/Stable', 54 | 55 | # Indicate who your project is intended for 56 | 'Intended Audience :: Developers', 57 | "Intended Audience :: Information Technology", 58 | 'Operating System :: OS Independent', 59 | 60 | 61 | # Pick your license as you wish (should match "license" above) 62 | 'License :: OSI Approved :: MIT License', 63 | 64 | # Specify the Python versions you support here. In particular, ensure 65 | # that you indicate whether you support Python 2, Python 3 or both. 66 | 'Programming Language :: Python :: 3 :: Only', 67 | 'Programming Language :: Python :: 3', 68 | 'Programming Language :: Python :: 3.0', 69 | 'Programming Language :: Python :: 3.1', 70 | 'Programming Language :: Python :: 3.2', 71 | 'Programming Language :: Python :: 3.3', 72 | 'Programming Language :: Python :: 3.4', 73 | 'Programming Language :: Python :: 3.5', 74 | 'Programming Language :: Python :: 3.6', 75 | ], 76 | 77 | # What does your project relate to? 78 | keywords='Microsoft Office, ole, vba, macro', 79 | 80 | # You can just specify the packages manually here if your project is 81 | # simple. Or you can use find_packages(). 82 | # packages=["msoffice_decrypt"], 83 | 84 | 85 | # Alternatively, if you want to distribute just a my_module.py, uncomment 86 | # this: 87 | py_modules=["officeparser"], 88 | 89 | # List run-time dependencies here. 
These will be installed by pip when 90 | # your project is installed. For an analysis of "install_requires" vs pip's 91 | # requirements files see: 92 | # https://packaging.python.org/en/latest/requirements.html 93 | install_requires=[], 94 | 95 | entry_points={ 96 | 'console_scripts': ['officeparser=officeparser:_main'], 97 | } 98 | ) 99 | --------------------------------------------------------------------------------
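Because setup.py declares a `console_scripts` entry point mapping `officeparser` to `officeparser:_main`, the parser can also be installed and run as a standalone command. A minimal sketch from a local checkout, assuming a Python 2 interpreter (officeparser.py itself still relies on `cStringIO`, `print` statements and `xrange`); `sample.doc` is a placeholder file name:

```
pip install .
officeparser --print-streams sample.doc
```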