├── pdfminer ├── cmap │ ├── __init__.py │ └── Makefile ├── __init__.py ├── pdfcolor.py ├── arcfour.py ├── encodingdb.py ├── runlength.py ├── ascii85.py └── lzw.py ├── hachoir_metadata ├── qt │ ├── __init__.py │ └── dialog.ui ├── config.py ├── version.py ├── __init__.py ├── misc.py.rej ├── formatter.py ├── safe.py ├── timezone.py ├── file_system.py ├── filter.py ├── history.patch └── program.py ├── hachoir_parser ├── common │ ├── __init__.py │ ├── tracker.py │ ├── deflate.py │ ├── msdos.py │ └── win32_lang_id.py ├── network │ ├── __init__.py │ └── common.py ├── version.py ├── game │ ├── __init__.py │ ├── spider_man_video.py │ └── laf.py ├── video │ ├── __init__.py │ ├── amf.py │ └── mpeg_ts.py ├── program │ ├── __init__.py │ ├── prc.py │ └── exe_ne.py ├── container │ └── __init__.py ├── __init__.py ├── file_system │ ├── __init__.py │ └── linux_swap.py ├── misc │ ├── common.py │ ├── __init__.py │ ├── ole2_util.py │ └── hlp.py ├── image │ ├── __init__.py │ ├── common.py │ ├── tiff.py │ ├── psd.py │ ├── pcx.py │ ├── tga.py │ └── iptc.py ├── audio │ ├── __init__.py │ ├── au.py │ ├── real_audio.py │ └── aiff.py ├── archive │ ├── __init__.py │ ├── ar.py │ ├── mar.py │ ├── mozilla_ar.py │ └── tar.py ├── template.py └── guess.py ├── discovery ├── __init__.py └── googlesearch.py ├── lib └── __init__.py ├── hachoir_core ├── __init__.py ├── stream │ ├── stream.py │ ├── __init__.py │ └── input_helper.py ├── version.py ├── field │ ├── field_set.py │ ├── character.py │ ├── enum.py │ ├── vector.py │ ├── parser.py │ ├── integer.py │ ├── bit_field.py │ ├── static_field_set.py │ ├── helper.py │ ├── byte_field.py │ ├── __init__.py │ ├── fake_array.py │ ├── sub_file.py │ ├── timestamp.py │ ├── seekable_field_set.py │ ├── link.py │ ├── float.py │ └── padding.py ├── endian.py ├── language.py ├── event_handler.py ├── profiler.py ├── config.py ├── error.py ├── cmd_line.py ├── text_handler.py ├── timeout.py ├── memory.py └── log.py ├── extractors ├── __init__.py ├── metadataExtractor.py ├── metadataMSOffice.py └── metadataPDF.py ├── LICENSES ├── downloader.py ├── processor.py ├── README.md ├── parser.py ├── myparser.py ├── unzip.py └── htmlExport.py /pdfminer/cmap/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hachoir_metadata/qt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hachoir_parser/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /discovery/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["googlesearch"] 2 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["markup","graphs"] 2 | -------------------------------------------------------------------------------- /hachoir_metadata/config.py: -------------------------------------------------------------------------------- 1 | MAX_STR_LENGTH = 300 # characters 2 | RAW_OUTPUT = False 3 | -------------------------------------------------------------------------------- /hachoir_parser/network/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.network.tcpdump import TcpdumpFile 2 | 3 | -------------------------------------------------------------------------------- /hachoir_core/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.version import VERSION as __version__, PACKAGE, WEBSITE, LICENSE 2 | 3 | -------------------------------------------------------------------------------- /pdfminer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | __version__ = '20110227' 3 | 4 | if __name__ == '__main__': print __version__ 5 | -------------------------------------------------------------------------------- /extractors/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["metadataExtractor","metadataMSOffice","metadataMSOfficeXML","metadataOpenOffice","metadataPDF"] 2 | -------------------------------------------------------------------------------- /hachoir_core/stream/stream.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.error import HachoirError 2 | 3 | class StreamError(HachoirError): 4 | pass 5 | 6 | -------------------------------------------------------------------------------- /pdfminer/cmap/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for pdfminer.cmap 2 | 3 | all: 4 | 5 | clean: 6 | -rm *.pyc *.pyo 7 | 8 | cmap_clean: 9 | rm -f *.pickle.gz 10 | -------------------------------------------------------------------------------- /hachoir_core/version.py: -------------------------------------------------------------------------------- 1 | PACKAGE = "hachoir-core" 2 | VERSION = "1.3.4" 3 | WEBSITE = 'http://bitbucket.org/haypo/hachoir/wiki/hachoir-core' 4 | LICENSE = 'GNU GPL v2' 5 | 6 | -------------------------------------------------------------------------------- /hachoir_metadata/version.py: -------------------------------------------------------------------------------- 1 | PACKAGE = "hachoir-metadata" 2 | VERSION = "1.3.3" 3 | WEBSITE = "http://bitbucket.org/haypo/hachoir/wiki/hachoir-metadata" 4 | LICENSE = "GNU GPL v2" 5 | 6 | -------------------------------------------------------------------------------- /hachoir_parser/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.3.5" 2 | PACKAGE = "hachoir-parser" 3 | WEBSITE = "http://bitbucket.org/haypo/hachoir/wiki/hachoir-parser" 4 | LICENSE = 'GNU GPL v2' 5 | 6 | -------------------------------------------------------------------------------- /hachoir_parser/game/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.game.zsnes import ZSNESFile 2 | from hachoir_parser.game.spider_man_video import SpiderManVideoFile 3 | from hachoir_parser.game.laf import LafFile 4 | from hachoir_parser.game.blp import BLP1File, BLP2File -------------------------------------------------------------------------------- /hachoir_parser/video/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.video.asf import AsfFile 2 | from hachoir_parser.video.flv import FlvFile 3 | from hachoir_parser.video.mov import MovFile 4 | from hachoir_parser.video.mpeg_video import MPEGVideoFile 5 | from hachoir_parser.video.mpeg_ts import MPEG_TS 6 | 7 | -------------------------------------------------------------------------------- /hachoir_parser/program/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.program.elf import ElfFile 2 | from hachoir_parser.program.exe import ExeFile 3 | from hachoir_parser.program.python import PythonCompiledFile 4 | from hachoir_parser.program.java import JavaCompiledClassFile 5 | from hachoir_parser.program.prc import PRCFile 6 | 7 | -------------------------------------------------------------------------------- /hachoir_core/field/field_set.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import BasicFieldSet, GenericFieldSet 2 | 3 | class FieldSet(GenericFieldSet): 4 | def __init__(self, parent, name, *args, **kw): 5 | assert issubclass(parent.__class__, BasicFieldSet) 6 | GenericFieldSet.__init__(self, parent, name, parent.stream, *args, **kw) 7 | 8 | -------------------------------------------------------------------------------- /hachoir_parser/common/tracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared code for tracker parser. 3 | """ 4 | 5 | NOTE_NAME = {} 6 | NOTES = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "G#", "A", "A#", "B") 7 | for octave in xrange(10): 8 | for index, note in enumerate(NOTES): 9 | NOTE_NAME[octave*12+index] = "%s (octave %s)" % (note, octave) 10 | 11 | -------------------------------------------------------------------------------- /hachoir_parser/container/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.container.asn1 import ASN1File 2 | from hachoir_parser.container.mkv import MkvFile 3 | from hachoir_parser.container.ogg import OggFile, OggStream 4 | from hachoir_parser.container.riff import RiffFile 5 | from hachoir_parser.container.swf import SwfFile 6 | from hachoir_parser.container.realmedia import RealMediaFile 7 | 8 | -------------------------------------------------------------------------------- /hachoir_core/endian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Constant values about endian. 3 | """ 4 | 5 | from hachoir_core.i18n import _ 6 | 7 | BIG_ENDIAN = "ABCD" 8 | LITTLE_ENDIAN = "DCBA" 9 | MIDDLE_ENDIAN = "BADC" 10 | NETWORK_ENDIAN = BIG_ENDIAN 11 | 12 | endian_name = { 13 | BIG_ENDIAN: _("Big endian"), 14 | LITTLE_ENDIAN: _("Little endian"), 15 | MIDDLE_ENDIAN: _("Middle endian"), 16 | } 17 | -------------------------------------------------------------------------------- /hachoir_parser/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.version import __version__ 2 | from hachoir_parser.parser import ValidateError, HachoirParser, Parser 3 | from hachoir_parser.parser_list import ParserList, HachoirParserList 4 | from hachoir_parser.guess import (QueryParser, guessParser, createParser) 5 | from hachoir_parser import (archive, audio, container, 6 | file_system, image, game, misc, network, program, video) 7 | 8 | -------------------------------------------------------------------------------- /hachoir_parser/file_system/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.file_system.ext2 import EXT2_FS 2 | from hachoir_parser.file_system.fat import FAT12, FAT16, FAT32 3 | from hachoir_parser.file_system.mbr import MSDos_HardDrive 4 | from hachoir_parser.file_system.ntfs import NTFS 5 | from hachoir_parser.file_system.iso9660 import ISO9660 6 | from hachoir_parser.file_system.reiser_fs import REISER_FS 7 | from hachoir_parser.file_system.linux_swap import LinuxSwapFile 8 | 9 | -------------------------------------------------------------------------------- /hachoir_parser/misc/common.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import StaticFieldSet, Float32 2 | 3 | class Vertex(StaticFieldSet): 4 | format = ((Float32, "x"), (Float32, "y"), (Float32, "z")) 5 | 6 | def createValue(self): 7 | return (self["x"].value, self["y"].value, self["z"].value) 8 | 9 | class MapUV(StaticFieldSet): 10 | format = ((Float32, "u"), (Float32, "v")) 11 | 12 | def createValue(self): 13 | return (self["u"].value, self["v"].value) 14 | -------------------------------------------------------------------------------- /hachoir_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_metadata.version import VERSION as __version__ 2 | from hachoir_metadata.metadata import extractMetadata 3 | 4 | # Just import the module, 5 | # each module use registerExtractor() method 6 | import hachoir_metadata.archive 7 | import hachoir_metadata.audio 8 | import hachoir_metadata.file_system 9 | import hachoir_metadata.image 10 | import hachoir_metadata.jpeg 11 | import hachoir_metadata.misc 12 | import hachoir_metadata.program 13 | import hachoir_metadata.riff 14 | import hachoir_metadata.video 15 | 16 | -------------------------------------------------------------------------------- /hachoir_parser/image/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.image.bmp import BmpFile 2 | from hachoir_parser.image.gif import GifFile 3 | from hachoir_parser.image.ico import IcoFile 4 | from hachoir_parser.image.jpeg import JpegFile 5 | from hachoir_parser.image.pcx import PcxFile 6 | from hachoir_parser.image.psd import PsdFile 7 | from hachoir_parser.image.png import PngFile 8 | from hachoir_parser.image.tga import TargaFile 9 | from hachoir_parser.image.tiff import TiffFile 10 | from hachoir_parser.image.wmf import WMF_File 11 | from hachoir_parser.image.xcf import XcfFile 12 | 13 | -------------------------------------------------------------------------------- /hachoir_core/stream/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN 2 | from hachoir_core.stream.stream import StreamError 3 | from hachoir_core.stream.input import ( 4 | InputStreamError, 5 | InputStream, InputIOStream, StringInputStream, 6 | InputSubStream, InputFieldStream, 7 | FragmentedStream, ConcatStream) 8 | from hachoir_core.stream.input_helper import FileInputStream, guessStreamCharset 9 | from hachoir_core.stream.output import (OutputStreamError, 10 | FileOutputStream, StringOutputStream, OutputStream) 11 | 12 | -------------------------------------------------------------------------------- /hachoir_parser/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.audio.aiff import AiffFile 2 | from hachoir_parser.audio.au import AuFile 3 | from hachoir_parser.audio.itunesdb import ITunesDBFile 4 | from hachoir_parser.audio.midi import MidiFile 5 | from hachoir_parser.audio.mpeg_audio import MpegAudioFile 6 | from hachoir_parser.audio.real_audio import RealAudioFile 7 | from hachoir_parser.audio.xm import XMModule 8 | from hachoir_parser.audio.s3m import S3MModule 9 | from hachoir_parser.audio.s3m import PTMModule 10 | from hachoir_parser.audio.mod import AmigaModule 11 | from hachoir_parser.audio.flac import FlacParser 12 | 13 | -------------------------------------------------------------------------------- /hachoir_metadata/misc.py.rej: -------------------------------------------------------------------------------- 1 | *************** 2 | *** 125,130 **** 3 | summary = self.getField(fieldset, "summary[0]") 4 | if summary: 5 | self.useSummary(summary, False) 6 | 7 | def getFragment(self, frag): 8 | stream = frag.getSubIStream() 9 | --- 125,133 ---- 10 | summary = self.getField(fieldset, "summary[0]") 11 | if summary: 12 | self.useSummary(summary, False) 13 | + table = self.getField(fieldset, "table1[0]") 14 | + if table: 15 | + self.useTable(table) 16 | 17 | def getFragment(self, frag): 18 | stream = frag.getSubIStream() 19 | -------------------------------------------------------------------------------- /LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | If you did not recieve a copy of the GPL, try http://www.gnu.org/. 3 | 4 | Copyright 2011 Christian Martorella 5 | 6 | Metagoofil is free software; you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation version 2 of the License. 9 | 10 | Metagoofil is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 15 | 16 | 17 | -------------------------------------------------------------------------------- /hachoir_core/language.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.iso639 import ISO639_2 2 | 3 | class Language: 4 | def __init__(self, code): 5 | code = str(code) 6 | if code not in ISO639_2: 7 | raise ValueError("Invalid language code: %r" % code) 8 | self.code = code 9 | 10 | def __cmp__(self, other): 11 | if other.__class__ != Language: 12 | return 1 13 | return cmp(self.code, other.code) 14 | 15 | def __unicode__(self): 16 | return ISO639_2[self.code] 17 | 18 | def __str__(self): 19 | return self.__unicode__() 20 | 21 | def __repr__(self): 22 | return "" % (unicode(self), self.code) 23 | 24 | -------------------------------------------------------------------------------- /hachoir_metadata/formatter.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.i18n import _, ngettext 2 | 3 | NB_CHANNEL_NAME = {1: _("mono"), 2: _("stereo")} 4 | 5 | def humanAudioChannel(value): 6 | return NB_CHANNEL_NAME.get(value, unicode(value)) 7 | 8 | def humanFrameRate(value): 9 | if isinstance(value, (int, long, float)): 10 | return _("%.1f fps") % value 11 | else: 12 | return value 13 | 14 | def humanComprRate(rate): 15 | return u"%.1fx" % rate 16 | 17 | def humanAltitude(value): 18 | return ngettext("%.1f meter", "%.1f meters", value) % value 19 | 20 | def humanPixelSize(value): 21 | return ngettext("%s pixel", "%s pixels", value) % value 22 | 23 | def humanDPI(value): 24 | return u"%s DPI" % value 25 | 26 | -------------------------------------------------------------------------------- /hachoir_parser/archive/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.archive.ace import AceFile 2 | from hachoir_parser.archive.ar import ArchiveFile 3 | from hachoir_parser.archive.bzip2_parser import Bzip2Parser 4 | from hachoir_parser.archive.cab import CabFile 5 | from hachoir_parser.archive.gzip_parser import GzipParser 6 | from hachoir_parser.archive.tar import TarFile 7 | from hachoir_parser.archive.zip import ZipFile 8 | from hachoir_parser.archive.rar import RarFile 9 | from hachoir_parser.archive.rpm import RpmFile 10 | from hachoir_parser.archive.sevenzip import SevenZipParser 11 | from hachoir_parser.archive.mar import MarFile 12 | from hachoir_parser.archive.mozilla_ar import MozillaArchive 13 | from hachoir_parser.archive.zlib import ZlibData 14 | -------------------------------------------------------------------------------- /hachoir_core/event_handler.py: -------------------------------------------------------------------------------- 1 | class EventHandler(object): 2 | """ 3 | Class to connect events to event handlers. 4 | """ 5 | 6 | def __init__(self): 7 | self.handlers = {} 8 | 9 | def connect(self, event_name, handler): 10 | """ 11 | Connect an event handler to an event. Append it to handlers list. 12 | """ 13 | try: 14 | self.handlers[event_name].append(handler) 15 | except KeyError: 16 | self.handlers[event_name] = [handler] 17 | 18 | def raiseEvent(self, event_name, *args): 19 | """ 20 | Raiser an event: call each handler for this event_name. 21 | """ 22 | if event_name not in self.handlers: 23 | return 24 | for handler in self.handlers[event_name]: 25 | handler(*args) 26 | 27 | -------------------------------------------------------------------------------- /pdfminer/pdfcolor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | from psparser import LIT 3 | 4 | 5 | ## PDFColorSpace 6 | ## 7 | LITERAL_DEVICE_GRAY = LIT('DeviceGray') 8 | LITERAL_DEVICE_RGB = LIT('DeviceRGB') 9 | LITERAL_DEVICE_CMYK = LIT('DeviceCMYK') 10 | 11 | class PDFColorSpace(object): 12 | 13 | def __init__(self, name, ncomponents): 14 | self.name = name 15 | self.ncomponents = ncomponents 16 | return 17 | 18 | def __repr__(self): 19 | return '' % (self.name, self.ncomponents) 20 | 21 | 22 | PREDEFINED_COLORSPACE = dict( 23 | (name, PDFColorSpace(name,n)) for (name,n) in { 24 | 'CalRGB': 3, 25 | 'CalGray': 1, 26 | 'Lab': 3, 27 | 'DeviceRGB': 3, 28 | 'DeviceCMYK': 4, 29 | 'DeviceGray': 1, 30 | 'Separation': 1, 31 | 'Indexed': 1, 32 | 'Pattern': 1, 33 | }.iteritems()) 34 | -------------------------------------------------------------------------------- /downloader.py: -------------------------------------------------------------------------------- 1 | import urllib, os, sys 2 | 3 | class downloader(): 4 | def __init__(self, url, dir): 5 | self.url = url 6 | self.dir = dir 7 | self.filename = str(url.split("/")[-1]) 8 | 9 | # def dlProgress(count, blockSize, totalSize): 10 | # percent = int(count*blockSize*100/totalSize) 11 | # sys.stdout.write("\r" +"test" + "...%d%%" % percent) 12 | # sys.stdout.flush() 13 | 14 | def down(self): 15 | if os.path.exists(self.dir + "/" + self.filename): 16 | pass 17 | else: 18 | try: 19 | urllib.urlretrieve(self.url, self.dir + "/" + self.filename) 20 | except: 21 | print "\t [x] Error downloading " + self.url 22 | self.filename = "" 23 | 24 | def name(self): 25 | return self.filename 26 | -------------------------------------------------------------------------------- /hachoir_core/field/character.py: -------------------------------------------------------------------------------- 1 | """ 2 | Character field class: a 8-bit character 3 | """ 4 | 5 | from hachoir_core.field import Bits 6 | from hachoir_core.endian import BIG_ENDIAN 7 | from hachoir_core.tools import makePrintable 8 | 9 | class Character(Bits): 10 | """ 11 | A 8-bit character using ASCII charset for display attribute. 12 | """ 13 | static_size = 8 14 | 15 | def __init__(self, parent, name, description=None): 16 | Bits.__init__(self, parent, name, 8, description=description) 17 | 18 | def createValue(self): 19 | return chr(self._parent.stream.readBits( 20 | self.absolute_address, 8, BIG_ENDIAN)) 21 | 22 | def createRawDisplay(self): 23 | return unicode(Bits.createValue(self)) 24 | 25 | def createDisplay(self): 26 | return makePrintable(self.value, "ASCII", quote="'", to_unicode=True) 27 | -------------------------------------------------------------------------------- /hachoir_core/field/enum.py: -------------------------------------------------------------------------------- 1 | def Enum(field, enum, key_func=None): 2 | """ 3 | Enum is an adapter to another field: it will just change its display 4 | attribute. It uses a dictionary to associate a value to another. 5 | 6 | key_func is an optional function with prototype "def func(key)->key" 7 | which is called to transform key. 8 | """ 9 | display = field.createDisplay 10 | if key_func: 11 | def createDisplay(): 12 | try: 13 | key = key_func(field.value) 14 | return enum[key] 15 | except LookupError: 16 | return display() 17 | else: 18 | def createDisplay(): 19 | try: 20 | return enum[field.value] 21 | except LookupError: 22 | return display() 23 | field.createDisplay = createDisplay 24 | field.getEnum = lambda: enum 25 | return field 26 | -------------------------------------------------------------------------------- /hachoir_parser/misc/__init__.py: -------------------------------------------------------------------------------- 1 | from hachoir_parser.misc.file_3do import File3do 2 | from hachoir_parser.misc.file_3ds import File3ds 3 | from hachoir_parser.misc.torrent import TorrentFile 4 | from hachoir_parser.misc.ttf import TrueTypeFontFile 5 | from hachoir_parser.misc.chm import ChmFile 6 | from hachoir_parser.misc.lnk import LnkFile 7 | from hachoir_parser.misc.pcf import PcfFile 8 | from hachoir_parser.misc.ole2 import OLE2_File 9 | from hachoir_parser.misc.pdf import PDFDocument 10 | from hachoir_parser.misc.pifv import PIFVFile 11 | from hachoir_parser.misc.hlp import HlpFile 12 | from hachoir_parser.misc.gnome_keyring import GnomeKeyring 13 | from hachoir_parser.misc.bplist import BPList 14 | from hachoir_parser.misc.dsstore import DSStore 15 | from hachoir_parser.misc.word_doc import WordDocumentParser 16 | from hachoir_parser.misc.word_2 import Word2DocumentParser 17 | from hachoir_parser.misc.mstask import MSTaskFile 18 | -------------------------------------------------------------------------------- /hachoir_metadata/safe.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.error import HACHOIR_ERRORS, warning 2 | 3 | def fault_tolerant(func, *args): 4 | def safe_func(*args, **kw): 5 | try: 6 | func(*args, **kw) 7 | except HACHOIR_ERRORS, err: 8 | warning("Error when calling function %s(): %s" % ( 9 | func.__name__, err)) 10 | return safe_func 11 | 12 | def getFieldAttribute(fieldset, key, attrname): 13 | try: 14 | field = fieldset[key] 15 | if field.hasValue(): 16 | return getattr(field, attrname) 17 | except HACHOIR_ERRORS, err: 18 | warning("Unable to get %s of field %s/%s: %s" % ( 19 | attrname, fieldset.path, key, err)) 20 | return None 21 | 22 | def getValue(fieldset, key): 23 | return getFieldAttribute(fieldset, key, "value") 24 | 25 | def getDisplay(fieldset, key): 26 | return getFieldAttribute(fieldset, key, "display") 27 | 28 | -------------------------------------------------------------------------------- /hachoir_core/profiler.py: -------------------------------------------------------------------------------- 1 | from hotshot import Profile 2 | from hotshot.stats import load as loadStats 3 | from os import unlink 4 | 5 | def runProfiler(func, args=tuple(), kw={}, verbose=True, nb_func=25, sort_by=('cumulative', 'calls')): 6 | profile_filename = "/tmp/profiler" 7 | prof = Profile(profile_filename) 8 | try: 9 | if verbose: 10 | print "[+] Run profiler" 11 | result = prof.runcall(func, *args, **kw) 12 | prof.close() 13 | if verbose: 14 | print "[+] Stop profiler" 15 | print "[+] Process data..." 16 | stat = loadStats(profile_filename) 17 | if verbose: 18 | print "[+] Strip..." 19 | stat.strip_dirs() 20 | if verbose: 21 | print "[+] Sort data..." 22 | stat.sort_stats(*sort_by) 23 | if verbose: 24 | print 25 | print "[+] Display statistics" 26 | print 27 | stat.print_stats(nb_func) 28 | return result 29 | finally: 30 | unlink(profile_filename) 31 | 32 | -------------------------------------------------------------------------------- /hachoir_core/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration of Hachoir 3 | """ 4 | 5 | import os 6 | 7 | # UI: display options 8 | max_string_length = 40 # Max. length in characters of GenericString.display 9 | max_byte_length = 14 # Max. length in bytes of RawBytes.display 10 | max_bit_length = 256 # Max. length in bits of RawBits.display 11 | unicode_stdout = True # Replace stdout and stderr with Unicode compatible objects 12 | # Disable it for readline or ipython 13 | 14 | # Global options 15 | debug = False # Display many informations usefull to debug 16 | verbose = False # Display more informations 17 | quiet = True # Don't display warni 18 | 19 | # Use internationalization and localization (gettext)? 20 | if os.name == "nt": 21 | # TODO: Remove this hack and make i18n works on Windows :-) 22 | use_i18n = False 23 | else: 24 | use_i18n = True 25 | 26 | # Parser global options 27 | autofix = True # Enable Autofix? see hachoir_core.field.GenericFieldSet 28 | check_padding_pattern = True # Check padding fields pattern? 29 | 30 | -------------------------------------------------------------------------------- /hachoir_metadata/timezone.py: -------------------------------------------------------------------------------- 1 | from datetime import tzinfo, timedelta 2 | 3 | class TimezoneUTC(tzinfo): 4 | """UTC timezone""" 5 | ZERO = timedelta(0) 6 | 7 | def utcoffset(self, dt): 8 | return TimezoneUTC.ZERO 9 | 10 | def tzname(self, dt): 11 | return u"UTC" 12 | 13 | def dst(self, dt): 14 | return TimezoneUTC.ZERO 15 | 16 | def __repr__(self): 17 | return "" 18 | 19 | class Timezone(TimezoneUTC): 20 | """Fixed offset in hour from UTC.""" 21 | def __init__(self, offset): 22 | self._offset = timedelta(minutes=offset*60) 23 | self._name = u"%+03u00" % offset 24 | 25 | def utcoffset(self, dt): 26 | return self._offset 27 | 28 | def tzname(self, dt): 29 | return self._name 30 | 31 | def __repr__(self): 32 | return "" % ( 33 | self._offset, self._name) 34 | 35 | UTC = TimezoneUTC() 36 | 37 | def createTimezone(offset): 38 | if offset: 39 | return Timezone(offset) 40 | else: 41 | return UTC 42 | 43 | -------------------------------------------------------------------------------- /hachoir_parser/common/deflate.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import CompressedField 2 | 3 | try: 4 | from zlib import decompressobj, MAX_WBITS 5 | 6 | class DeflateStream: 7 | def __init__(self, stream, wbits=None): 8 | if wbits: 9 | self.gzip = decompressobj(-MAX_WBITS) 10 | else: 11 | self.gzip = decompressobj() 12 | 13 | def __call__(self, size, data=None): 14 | if data is None: 15 | data = '' 16 | return self.gzip.decompress(self.gzip.unconsumed_tail+data, size) 17 | 18 | class DeflateStreamWbits(DeflateStream): 19 | def __init__(self, stream): 20 | DeflateStream.__init__(self, stream, True) 21 | 22 | def Deflate(field, wbits=True): 23 | if wbits: 24 | CompressedField(field, DeflateStreamWbits) 25 | else: 26 | CompressedField(field, DeflateStream) 27 | return field 28 | has_deflate = True 29 | except ImportError: 30 | def Deflate(field, wbits=True): 31 | return field 32 | has_deflate = False 33 | 34 | -------------------------------------------------------------------------------- /hachoir_metadata/file_system.py: -------------------------------------------------------------------------------- 1 | from hachoir_metadata.metadata import RootMetadata, registerExtractor 2 | from hachoir_metadata.safe import fault_tolerant 3 | from hachoir_parser.file_system import ISO9660 4 | from datetime import datetime 5 | 6 | class ISO9660_Metadata(RootMetadata): 7 | def extract(self, iso): 8 | desc = iso['volume[0]/content'] 9 | self.title = desc['volume_id'].value 10 | self.title = desc['vol_set_id'].value 11 | self.author = desc['publisher'].value 12 | self.author = desc['data_preparer'].value 13 | self.producer = desc['application'].value 14 | self.copyright = desc['copyright'].value 15 | self.readTimestamp('creation_date', desc['creation_ts'].value) 16 | self.readTimestamp('last_modification', desc['modification_ts'].value) 17 | 18 | @fault_tolerant 19 | def readTimestamp(self, key, value): 20 | if value.startswith("0000"): 21 | return 22 | value = datetime( 23 | int(value[0:4]), int(value[4:6]), int(value[6:8]), 24 | int(value[8:10]), int(value[10:12]), int(value[12:14])) 25 | setattr(self, key, value) 26 | 27 | registerExtractor(ISO9660, ISO9660_Metadata) 28 | 29 | -------------------------------------------------------------------------------- /pdfminer/arcfour.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | """ Python implementation of Arcfour encryption algorithm. 4 | 5 | This code is in the public domain. 6 | 7 | """ 8 | 9 | ## Arcfour 10 | ## 11 | class Arcfour(object): 12 | 13 | """ 14 | >>> Arcfour('Key').process('Plaintext').encode('hex') 15 | 'bbf316e8d940af0ad3' 16 | >>> Arcfour('Wiki').process('pedia').encode('hex') 17 | '1021bf0420' 18 | >>> Arcfour('Secret').process('Attack at dawn').encode('hex') 19 | '45a01f645fc35b383552544b9bf5' 20 | """ 21 | 22 | def __init__(self, key): 23 | s = range(256) 24 | j = 0 25 | klen = len(key) 26 | for i in xrange(256): 27 | j = (j + s[i] + ord(key[i % klen])) % 256 28 | (s[i], s[j]) = (s[j], s[i]) 29 | self.s = s 30 | (self.i, self.j) = (0, 0) 31 | return 32 | 33 | def process(self, data): 34 | (i, j) = (self.i, self.j) 35 | s = self.s 36 | r = '' 37 | for c in data: 38 | i = (i+1) % 256 39 | j = (j+s[i]) % 256 40 | (s[i], s[j]) = (s[j], s[i]) 41 | k = s[(s[i]+s[j]) % 256] 42 | r += chr(ord(c) ^ k) 43 | (self.i, self.j) = (i, j) 44 | return r 45 | 46 | # test 47 | if __name__ == '__main__': 48 | import doctest 49 | doctest.testmod() 50 | -------------------------------------------------------------------------------- /extractors/metadataExtractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | import sys, re, os, subprocess 3 | 4 | class metaExtractor: 5 | def __init__(self,fname): 6 | self.fname=fname 7 | self.command="extract" #If any error put the full path 8 | self.data="" 9 | self.paths=[] 10 | self.users=[] 11 | 12 | 13 | def runExtract(self): 14 | comm=self.command+" "+self.fname 15 | try: 16 | process = subprocess.Popen([self.command,self.fname], shell=False, stdout=subprocess.PIPE) 17 | res=process.communicate() 18 | self.data=res[0] 19 | return "ok" 20 | except: 21 | return "error" 22 | 23 | def getData(self): 24 | pathre= re.compile('worked on .*') 25 | pathre2= re.compile('template -.*') 26 | for reg in (pathre,pathre2): 27 | path=reg.findall(self.data) 28 | if path !=[]: 29 | for x in path: 30 | try: 31 | temp=x.split('\'')[1] 32 | if self.paths.count(temp) == 0: 33 | self.paths.append(temp) 34 | except: 35 | pass 36 | 37 | author= re.compile(': Author \'.*\'') 38 | authors=author.findall(self.data) 39 | if authors !=[]: 40 | for x in authors: 41 | temp=x.split('\'')[1] 42 | temp=temp.replace('\'','') 43 | if self.users.count(temp) == 0: 44 | self.users.append(temp) 45 | 46 | def getUsers(self): 47 | return self.users 48 | 49 | def getPaths(self): 50 | return self.paths 51 | -------------------------------------------------------------------------------- /processor.py: -------------------------------------------------------------------------------- 1 | #Christian Martorella 2011 2 | ''' 3 | This class will sort the results and create unique list of software, users and paths 4 | ''' 5 | 6 | class processor(): 7 | def __init__(self,list): 8 | self.list = list 9 | self.unique_users = [] 10 | self.unique_soft = [] 11 | self.stat_soft = [] 12 | self.unique_paths = [] 13 | 14 | def print_all(self): 15 | for x in self.list: 16 | print x[0] 17 | if x[1] != []: 18 | print x[1] 19 | if x[2] != []: 20 | print x[2] 21 | 22 | def sort_users(self): 23 | for x in self.list: 24 | if x[1]!=[]: 25 | for y in x[1]: 26 | if self.unique_users.count(y) != 0: 27 | pass 28 | else: 29 | try: 30 | self.unique_users.append(y.lstrip()) 31 | except: 32 | pass 33 | else: 34 | pass 35 | return self.unique_users 36 | 37 | def sort_software(self): 38 | for x in self.list: 39 | if x[3]!=[]: 40 | for y in x[3]: 41 | if self.unique_soft.count(y) != 0: 42 | pass 43 | else: 44 | try: 45 | self.unique_soft.append(y.lstrip()) 46 | except Exception, e: 47 | pass 48 | else: 49 | pass 50 | return self.unique_soft 51 | 52 | def sort_paths(self): 53 | for x in self.list: 54 | if x[2]!=[]: 55 | for y in x[2]: 56 | if self.unique_paths.count(y) != 0: 57 | pass 58 | else: 59 | self.unique_paths.append(y) 60 | return self.unique_paths 61 | -------------------------------------------------------------------------------- /hachoir_parser/misc/ole2_util.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN 2 | from hachoir_core.field import RawBytes, RootSeekableFieldSet, ParserError 3 | from hachoir_parser import HachoirParser 4 | 5 | class OLE2FragmentParser(HachoirParser,RootSeekableFieldSet): 6 | tags = { 7 | "description": "Microsoft Office document subfragments", 8 | } 9 | endian = LITTLE_ENDIAN 10 | 11 | ENDIAN_CHECK=False 12 | 13 | def __init__(self, stream, **args): 14 | RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self)) 15 | HachoirParser.__init__(self, stream, **args) 16 | if self.ENDIAN_CHECK: 17 | if self["endian"].value == "\xFF\xFE": 18 | self.endian = BIG_ENDIAN 19 | elif self["endian"].value == "\xFE\xFF": 20 | self.endian = LITTLE_ENDIAN 21 | else: 22 | raise ParserError("OLE2: Invalid endian value") 23 | 24 | def validate(self): 25 | if self.ENDIAN_CHECK: 26 | if self["endian"].value not in ["\xFF\xFE", "\xFE\xFF"]: 27 | return "Unknown endian value %s"%self["endian"].value.encode('hex') 28 | return True 29 | 30 | class RawParser(OLE2FragmentParser): 31 | ENDIAN_CHECK=False 32 | OS_CHECK=False 33 | def createFields(self): 34 | yield RawBytes(self,"rawdata",self.datasize) 35 | if self.datasize 2 | Form 3 | 4 | 5 | 6 | 0 7 | 0 8 | 441 9 | 412 10 | 11 | 12 | 13 | hachoir-metadata 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | Open 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 0 30 | 0 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | true 41 | 42 | 43 | false 44 | 45 | 46 | 0 47 | 48 | 49 | 0 50 | 51 | 52 | 53 | 54 | 55 | 56 | Quit 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /pdfminer/runlength.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # 3 | # RunLength decoder (Adobe version) implementation based on PDF Reference 4 | # version 1.4 section 3.3.4. 5 | # 6 | # * public domain * 7 | # 8 | 9 | import sys 10 | 11 | def rldecode(data): 12 | """ 13 | RunLength decoder (Adobe version) implementation based on PDF Reference 14 | version 1.4 section 3.3.4: 15 | The RunLengthDecode filter decodes data that has been encoded in a 16 | simple byte-oriented format based on run length. The encoded data 17 | is a sequence of runs, where each run consists of a length byte 18 | followed by 1 to 128 bytes of data. If the length byte is in the 19 | range 0 to 127, the following length + 1 (1 to 128) bytes are 20 | copied literally during decompression. If length is in the range 21 | 129 to 255, the following single byte is to be copied 257 - length 22 | (2 to 128) times during decompression. A length value of 128 23 | denotes EOD. 24 | >>> s = "\x05123456\xfa7\x04abcde\x80junk" 25 | >>> rldecode(s) 26 | '1234567777777abcde' 27 | """ 28 | decoded = [] 29 | i=0 30 | while i < len(data): 31 | #print "data[%d]=:%d:" % (i,ord(data[i])) 32 | length = ord(data[i]) 33 | if length == 128: 34 | break 35 | if length >= 0 and length < 128: 36 | run = data[i+1:(i+1)+(length+1)] 37 | #print "length=%d, run=%s" % (length+1,run) 38 | decoded.append(run) 39 | i = (i+1) + (length+1) 40 | if length > 128: 41 | run = data[i+1]*(257-length) 42 | #print "length=%d, run=%s" % (257-length,run) 43 | decoded.append(run) 44 | i = (i+1) + 1 45 | return ''.join(decoded) 46 | 47 | 48 | if __name__ == '__main__': 49 | import doctest 50 | doctest.testmod() 51 | -------------------------------------------------------------------------------- /hachoir_metadata/filter.py: -------------------------------------------------------------------------------- 1 | from hachoir_metadata.timezone import UTC 2 | from datetime import date, datetime 3 | 4 | # Year in 1850..2030 5 | MIN_YEAR = 1850 6 | MAX_YEAR = 2030 7 | 8 | class Filter: 9 | def __init__(self, valid_types, min=None, max=None): 10 | self.types = valid_types 11 | self.min = min 12 | self.max = max 13 | 14 | def __call__(self, value): 15 | if not isinstance(value, self.types): 16 | return True 17 | if self.min is not None and value < self.min: 18 | return False 19 | if self.max is not None and self.max < value: 20 | return False 21 | return True 22 | 23 | class NumberFilter(Filter): 24 | def __init__(self, min=None, max=None): 25 | Filter.__init__(self, (int, long, float), min, max) 26 | 27 | class DatetimeFilter(Filter): 28 | def __init__(self, min=None, max=None): 29 | Filter.__init__(self, (date, datetime), 30 | datetime(MIN_YEAR, 1, 1), 31 | datetime(MAX_YEAR, 12, 31)) 32 | self.min_date = date(MIN_YEAR, 1, 1) 33 | self.max_date = date(MAX_YEAR, 12, 31) 34 | self.min_tz = datetime(MIN_YEAR, 1, 1, tzinfo=UTC) 35 | self.max_tz = datetime(MAX_YEAR, 12, 31, tzinfo=UTC) 36 | 37 | def __call__(self, value): 38 | """ 39 | Use different min/max values depending on value type 40 | (datetime with timezone, datetime or date). 41 | """ 42 | if not isinstance(value, self.types): 43 | return True 44 | if hasattr(value, "tzinfo") and value.tzinfo: 45 | return (self.min_tz <= value <= self.max_tz) 46 | elif isinstance(value, datetime): 47 | return (self.min <= value <= self.max) 48 | else: 49 | return (self.min_date <= value <= self.max_date) 50 | 51 | DATETIME_FILTER = DatetimeFilter() 52 | 53 | -------------------------------------------------------------------------------- /hachoir_parser/archive/ar.py: -------------------------------------------------------------------------------- 1 | """ 2 | GNU ar archive : archive file (.a) and Debian (.deb) archive. 3 | """ 4 | 5 | from hachoir_parser import Parser 6 | from hachoir_core.field import (FieldSet, ParserError, 7 | String, RawBytes, UnixLine) 8 | from hachoir_core.endian import BIG_ENDIAN 9 | 10 | class ArchiveFileEntry(FieldSet): 11 | def createFields(self): 12 | yield UnixLine(self, "header", "Header") 13 | info = self["header"].value.split() 14 | if len(info) != 7: 15 | raise ParserError("Invalid file entry header") 16 | size = int(info[5]) 17 | if 0 < size: 18 | yield RawBytes(self, "content", size, "File data") 19 | 20 | def createDescription(self): 21 | return "File entry (%s)" % self["header"].value.split()[0] 22 | 23 | class ArchiveFile(Parser): 24 | endian = BIG_ENDIAN 25 | MAGIC = '!\n' 26 | PARSER_TAGS = { 27 | "id": "unix_archive", 28 | "category": "archive", 29 | "file_ext": ("a", "deb"), 30 | "mime": 31 | (u"application/x-debian-package", 32 | u"application/x-archive", 33 | u"application/x-dpkg"), 34 | "min_size": (8 + 13)*8, # file signature + smallest file as possible 35 | "magic": ((MAGIC, 0),), 36 | "description": "Unix archive" 37 | } 38 | 39 | def validate(self): 40 | if self.stream.readBytes(0, len(self.MAGIC)) != self.MAGIC: 41 | return "Invalid magic string" 42 | return True 43 | 44 | def createFields(self): 45 | yield String(self, "id", 8, "Unix archive identifier (\"\")", charset="ASCII") 46 | while not self.eof: 47 | data = self.stream.readBytes(self.current_size, 1) 48 | if data == "\n": 49 | yield RawBytes(self, "empty_line[]", 1, "Empty line") 50 | else: 51 | yield ArchiveFileEntry(self, "file[]", "File") 52 | 53 | -------------------------------------------------------------------------------- /hachoir_metadata/history.patch: -------------------------------------------------------------------------------- 1 | Index: misc.py 2 | =================================================================== 3 | --- misc.py (revision 2947) 4 | +++ misc.py (working copy) 5 | @@ -125,6 +125,9 @@ 6 | summary = self.getField(fieldset, "summary[0]") 7 | if summary: 8 | self.useSummary(summary, False) 9 | + table = self.getField(fieldset, "table1[0]") 10 | + if table: 11 | + self.useTable(table) 12 | 13 | def getFragment(self, frag): 14 | stream = frag.getSubIStream() 15 | @@ -161,6 +164,13 @@ 16 | self.comment = "Encrypted: %s" % doc["FIB/fEncrypted"].value 17 | 18 | @fault_tolerant 19 | + def useTable(self, table): 20 | + if 'SttbSavedBy' in table: 21 | + arr = list(table['SttbSavedBy'].array('string')) 22 | + for i in xrange(0, len(arr), 2): 23 | + self.revision_history = "Revision #%d: Author '%s', file '%s'"%(i//2, arr[i].value, arr[i+1].value) 24 | + 25 | + @fault_tolerant 26 | def useProperty(self, summary, property, is_doc_summary): 27 | field = summary.getFieldByAddress(property["offset"].value*8) 28 | if not field \ 29 | Index: register.py 30 | =================================================================== 31 | --- register.py (revision 2986) 32 | +++ register.py (working copy) 33 | @@ -78,6 +78,7 @@ 34 | filter=DATETIME_FILTER, type=(datetime, date), conversion=setDatetime)) 35 | meta.register(Data("last_modification", 501, _("Last modification"), text_handler=humanDatetime, 36 | filter=DATETIME_FILTER, type=(datetime, date), conversion=setDatetime)) 37 | + meta.register(Data("revision_history", 502, _("Revision history"), type=unicode)) 38 | meta.register(Data("latitude", 510, _("Latitude"), type=float)) 39 | meta.register(Data("longitude", 511, _("Longitude"), type=float)) 40 | meta.register(Data("altitude", 511, _("Altitude"), type=float, text_handler=humanAltitude)) 41 | -------------------------------------------------------------------------------- /hachoir_core/field/integer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Integer field classes: 3 | - UInt8, UInt16, UInt24, UInt32, UInt64: unsigned integer of 8, 16, 32, 64 bits ; 4 | - Int8, Int16, Int24, Int32, Int64: signed integer of 8, 16, 32, 64 bits. 5 | """ 6 | 7 | from hachoir_core.field import Bits, FieldError 8 | 9 | class GenericInteger(Bits): 10 | """ 11 | Generic integer class used to generate other classes. 12 | """ 13 | def __init__(self, parent, name, signed, size, description=None): 14 | if not (8 <= size <= 16384): 15 | raise FieldError("Invalid integer size (%s): have to be in 8..16384" % size) 16 | Bits.__init__(self, parent, name, size, description) 17 | self.signed = signed 18 | 19 | def createValue(self): 20 | return self._parent.stream.readInteger( 21 | self.absolute_address, self.signed, self._size, self._parent.endian) 22 | 23 | def integerFactory(name, is_signed, size, doc): 24 | class Integer(GenericInteger): 25 | __doc__ = doc 26 | static_size = size 27 | def __init__(self, parent, name, description=None): 28 | GenericInteger.__init__(self, parent, name, is_signed, size, description) 29 | cls = Integer 30 | cls.__name__ = name 31 | return cls 32 | 33 | UInt8 = integerFactory("UInt8", False, 8, "Unsigned integer of 8 bits") 34 | UInt16 = integerFactory("UInt16", False, 16, "Unsigned integer of 16 bits") 35 | UInt24 = integerFactory("UInt24", False, 24, "Unsigned integer of 24 bits") 36 | UInt32 = integerFactory("UInt32", False, 32, "Unsigned integer of 32 bits") 37 | UInt64 = integerFactory("UInt64", False, 64, "Unsigned integer of 64 bits") 38 | 39 | Int8 = integerFactory("Int8", True, 8, "Signed integer of 8 bits") 40 | Int16 = integerFactory("Int16", True, 16, "Signed integer of 16 bits") 41 | Int24 = integerFactory("Int24", True, 24, "Signed integer of 24 bits") 42 | Int32 = integerFactory("Int32", True, 32, "Signed integer of 32 bits") 43 | Int64 = integerFactory("Int64", True, 64, "Signed integer of 64 bits") 44 | 45 | -------------------------------------------------------------------------------- /hachoir_core/field/bit_field.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bit sized classes: 3 | - Bit: Single bit, value is False or True ; 4 | - Bits: Integer with a size in bits ; 5 | - RawBits: unknown content with a size in bits. 6 | """ 7 | 8 | from hachoir_core.field import Field 9 | from hachoir_core.i18n import _ 10 | from hachoir_core import config 11 | 12 | class RawBits(Field): 13 | """ 14 | Unknown content with a size in bits. 15 | """ 16 | static_size = staticmethod(lambda *args, **kw: args[1]) 17 | 18 | def __init__(self, parent, name, size, description=None): 19 | """ 20 | Constructor: see L{Field.__init__} for parameter description 21 | """ 22 | Field.__init__(self, parent, name, size, description) 23 | 24 | def hasValue(self): 25 | return True 26 | 27 | def createValue(self): 28 | return self._parent.stream.readBits( 29 | self.absolute_address, self._size, self._parent.endian) 30 | 31 | def createDisplay(self): 32 | if self._size < config.max_bit_length: 33 | return unicode(self.value) 34 | else: 35 | return _("<%s size=%u>" % 36 | (self.__class__.__name__, self._size)) 37 | createRawDisplay = createDisplay 38 | 39 | class Bits(RawBits): 40 | """ 41 | Positive integer with a size in bits 42 | 43 | @see: L{Bit} 44 | @see: L{RawBits} 45 | """ 46 | pass 47 | 48 | class Bit(RawBits): 49 | """ 50 | Single bit: value can be False or True, and size is exactly one bit. 51 | 52 | @see: L{Bits} 53 | """ 54 | static_size = 1 55 | 56 | def __init__(self, parent, name, description=None): 57 | """ 58 | Constructor: see L{Field.__init__} for parameter description 59 | """ 60 | RawBits.__init__(self, parent, name, 1, description=description) 61 | 62 | def createValue(self): 63 | return 1 == self._parent.stream.readBits( 64 | self.absolute_address, 1, self._parent.endian) 65 | 66 | def createRawDisplay(self): 67 | return unicode(int(self.value)) 68 | 69 | -------------------------------------------------------------------------------- /hachoir_core/field/static_field_set.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import FieldSet, ParserError 2 | 3 | class StaticFieldSet(FieldSet): 4 | """ 5 | Static field set: format class attribute is a tuple of all fields 6 | in syntax like: 7 | format = ( 8 | (TYPE1, ARG1, ARG2, ...), 9 | (TYPE2, ARG1, ARG2, ..., {KEY1=VALUE1, ...}), 10 | ... 11 | ) 12 | 13 | Types with dynamic size are forbidden, eg. CString, PascalString8, etc. 14 | """ 15 | format = None # You have to redefine this class variable 16 | _class = None 17 | 18 | def __new__(cls, *args, **kw): 19 | assert cls.format is not None, "Class attribute 'format' is not set" 20 | if cls._class is not cls.__name__: 21 | cls._class = cls.__name__ 22 | cls.static_size = cls._computeStaticSize() 23 | return object.__new__(cls, *args, **kw) 24 | 25 | @staticmethod 26 | def _computeItemSize(item): 27 | item_class = item[0] 28 | if item_class.static_size is None: 29 | raise ParserError("Unable to get static size of field type: %s" 30 | % item_class.__name__) 31 | if callable(item_class.static_size): 32 | if isinstance(item[-1], dict): 33 | return item_class.static_size(*item[1:-1], **item[-1]) 34 | else: 35 | return item_class.static_size(*item[1:]) 36 | else: 37 | assert isinstance(item_class.static_size, (int, long)) 38 | return item_class.static_size 39 | 40 | def createFields(self): 41 | for item in self.format: 42 | if isinstance(item[-1], dict): 43 | yield item[0](self, *item[1:-1], **item[-1]) 44 | else: 45 | yield item[0](self, *item[1:]) 46 | 47 | @classmethod 48 | def _computeStaticSize(cls, *args): 49 | return sum(cls._computeItemSize(item) for item in cls.format) 50 | 51 | # Initial value of static_size, it changes when first instance 52 | # is created (see __new__) 53 | static_size = _computeStaticSize 54 | 55 | -------------------------------------------------------------------------------- /hachoir_core/field/helper.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import (FieldError, 2 | RawBits, RawBytes, 3 | PaddingBits, PaddingBytes, 4 | NullBits, NullBytes, 5 | GenericString, GenericInteger) 6 | from hachoir_core.stream import FileOutputStream 7 | 8 | def createRawField(parent, size, name="raw[]", description=None): 9 | if size <= 0: 10 | raise FieldError("Unable to create raw field of %s bits" % size) 11 | if (size % 8) == 0: 12 | return RawBytes(parent, name, size/8, description) 13 | else: 14 | return RawBits(parent, name, size, description) 15 | 16 | def createPaddingField(parent, nbits, name="padding[]", description=None): 17 | if nbits <= 0: 18 | raise FieldError("Unable to create padding of %s bits" % nbits) 19 | if (nbits % 8) == 0: 20 | return PaddingBytes(parent, name, nbits/8, description) 21 | else: 22 | return PaddingBits(parent, name, nbits, description) 23 | 24 | def createNullField(parent, nbits, name="padding[]", description=None): 25 | if nbits <= 0: 26 | raise FieldError("Unable to create null padding of %s bits" % nbits) 27 | if (nbits % 8) == 0: 28 | return NullBytes(parent, name, nbits/8, description) 29 | else: 30 | return NullBits(parent, name, nbits, description) 31 | 32 | def isString(field): 33 | return issubclass(field.__class__, GenericString) 34 | 35 | def isInteger(field): 36 | return issubclass(field.__class__, GenericInteger) 37 | 38 | def writeIntoFile(fieldset, filename): 39 | output = FileOutputStream(filename) 40 | fieldset.writeInto(output) 41 | 42 | def createOrphanField(fieldset, address, field_cls, *args, **kw): 43 | """ 44 | Create an orphan field at specified address: 45 | field_cls(fieldset, *args, **kw) 46 | 47 | The field uses the fieldset properties but it isn't added to the 48 | field set. 49 | """ 50 | save_size = fieldset._current_size 51 | try: 52 | fieldset._current_size = address 53 | field = field_cls(fieldset, *args, **kw) 54 | finally: 55 | fieldset._current_size = save_size 56 | return field 57 | 58 | -------------------------------------------------------------------------------- /hachoir_parser/template.py: -------------------------------------------------------------------------------- 1 | """ 2 | ====================== 8< ============================ 3 | This file is an Hachoir parser template. Make a copy 4 | of it, and adapt it to your needs. 5 | 6 | You have to replace all "TODO" with you code. 7 | ====================== 8< ============================ 8 | 9 | TODO parser. 10 | 11 | Author: TODO TODO 12 | Creation date: YYYY-mm-DD 13 | """ 14 | 15 | # TODO: Just keep what you need 16 | from hachoir_parser import Parser 17 | from hachoir_core.field import (ParserError, 18 | UInt8, UInt16, UInt32, String, RawBytes) 19 | from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN 20 | 21 | class TODOFile(Parser): 22 | PARSER_TAGS = { 23 | "id": "TODO", 24 | "category": "TODO", # "archive", "audio", "container", ... 25 | "file_ext": ("TODO",), # TODO: Example ("bmp",) to parse the file "image.bmp" 26 | "mime": (u"TODO"), # TODO: Example: "image/png" 27 | "min_size": 0, # TODO: Minimum file size (x bits, or x*8 in bytes) 28 | "description": "TODO", # TODO: Example: "A bitmap picture" 29 | } 30 | 31 | # TODO: Choose between little or big endian 32 | # endian = LITTLE_ENDIAN 33 | # endian = BIG_ENDIAN 34 | 35 | def validate(self): 36 | # TODO: Check that file looks like your format 37 | # Example: check first two bytes 38 | # return (self.stream.readBytes(0, 2) == 'BM') 39 | return False 40 | 41 | def createFields(self): 42 | # TODO: Write your parser using this model: 43 | # yield UInt8(self, "name1", "description1") 44 | # yield UInt16(self, "name2", "description2") 45 | # yield UInt32(self, "name3", "description3") 46 | # yield String(self, "name4", 1, "description4") # TODO: add ", charset="ASCII")" 47 | # yield String(self, "name5", 1, "description5", charset="ASCII") 48 | # yield String(self, "name6", 1, "description6", charset="ISO-8859-1") 49 | 50 | # Read rest of the file (if any) 51 | # TODO: You may remove this code 52 | if self.current_size < self._size: 53 | yield self.seekBit(self._size, "end") 54 | 55 | -------------------------------------------------------------------------------- /hachoir_core/text_handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities used to convert a field to human classic reprentation of data. 3 | """ 4 | 5 | from hachoir_core.tools import ( 6 | humanDuration, humanFilesize, alignValue, 7 | durationWin64 as doDurationWin64, 8 | deprecated) 9 | from types import FunctionType, MethodType 10 | from hachoir_core.field import Field 11 | 12 | def textHandler(field, handler): 13 | assert isinstance(handler, (FunctionType, MethodType)) 14 | assert issubclass(field.__class__, Field) 15 | field.createDisplay = lambda: handler(field) 16 | return field 17 | 18 | def displayHandler(field, handler): 19 | assert isinstance(handler, (FunctionType, MethodType)) 20 | assert issubclass(field.__class__, Field) 21 | field.createDisplay = lambda: handler(field.value) 22 | return field 23 | 24 | @deprecated("Use TimedeltaWin64 field type") 25 | def durationWin64(field): 26 | """ 27 | Convert Windows 64-bit duration to string. The timestamp format is 28 | a 64-bit number: number of 100ns. See also timestampWin64(). 29 | 30 | >>> durationWin64(type("", (), dict(value=2146280000, size=64))) 31 | u'3 min 34 sec 628 ms' 32 | >>> durationWin64(type("", (), dict(value=(1 << 64)-1, size=64))) 33 | u'58494 years 88 days 5 hours' 34 | """ 35 | assert hasattr(field, "value") and hasattr(field, "size") 36 | assert field.size == 64 37 | delta = doDurationWin64(field.value) 38 | return humanDuration(delta) 39 | 40 | def filesizeHandler(field): 41 | """ 42 | Format field value using humanFilesize() 43 | """ 44 | return displayHandler(field, humanFilesize) 45 | 46 | def hexadecimal(field): 47 | """ 48 | Convert an integer to hexadecimal in lower case. Returns unicode string. 49 | 50 | >>> hexadecimal(type("", (), dict(value=412, size=16))) 51 | u'0x019c' 52 | >>> hexadecimal(type("", (), dict(value=0, size=32))) 53 | u'0x00000000' 54 | """ 55 | assert hasattr(field, "value") and hasattr(field, "size") 56 | size = field.size 57 | padding = alignValue(size, 4) // 4 58 | pattern = u"0x%%0%ux" % padding 59 | return pattern % field.value 60 | 61 | -------------------------------------------------------------------------------- /extractors/metadataMSOffice.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.error import HachoirError 2 | from hachoir_core.cmd_line import unicodeFilename 3 | from hachoir_parser import createParser 4 | from hachoir_core.tools import makePrintable 5 | from hachoir_metadata import extractMetadata 6 | from hachoir_core.i18n import getTerminalCharset 7 | from sys import argv, stderr, exit 8 | 9 | class metaMs2k: 10 | def __init__(self,filename): 11 | self.filename=filename 12 | self.users=[] 13 | self.paths=[] 14 | self.software=[] 15 | self.modification=[] 16 | self.creationDate=[] 17 | self.lastPrinted=[] 18 | self.raw="" 19 | 20 | def getData(self): 21 | filename, realname = unicodeFilename(self.filename), self.filename 22 | try: 23 | parser = createParser(filename, realname) 24 | except: 25 | return "error" 26 | try: 27 | metadata = extractMetadata(parser) 28 | except HachoirError, err: 29 | print "Metadata extraction error: %s" % unicode(err) 30 | metadata = None 31 | if not metadata: 32 | print "Unable to extract metadata on file: " + self.filename 33 | else: 34 | text = metadata.exportPlaintext() 35 | charset = getTerminalCharset() 36 | for line in text: 37 | res=line.split(":") 38 | if res[0]=="- Author": 39 | self.users.append(res[1]) 40 | elif res[1]==" Author:": 41 | self.users.append(res[2]) 42 | elif res[0]=="- Producer": 43 | self.software.append(res[1]) 44 | elif res[0]=="- Creation date": 45 | self.creationDate.append(res[1]) 46 | elif res[0]=="- Last modification": 47 | self.modification.append(res[1]) 48 | elif res[1]==" Template": 49 | xres= line.replace("- Comment: Template:","") 50 | self.paths.append(xres) 51 | elif res[1]==" LastSavedBy": 52 | # print res[1] + res[2] 53 | self.users.append(res[2]) 54 | elif res[1]==" LastPrinted": 55 | self.lastPrinted.append(res[2]) 56 | elif res[0]=="- Revision history": 57 | #self.paths.append(res[2]) 58 | res2=line.split(",") 59 | self.paths.append(res2[1].split("file ")[1]) 60 | self.raw=text 61 | return "ok" 62 | 63 | def getUsers(self): 64 | return self.users 65 | def getSoftware(self): 66 | return self.software 67 | def getPaths(self): 68 | return self.paths 69 | def getRaw(self): 70 | return self.raw 71 | -------------------------------------------------------------------------------- /hachoir_core/timeout.py: -------------------------------------------------------------------------------- 1 | """ 2 | limitedTime(): set a timeout in seconds when calling a function, 3 | raise a Timeout error if time exceed. 4 | """ 5 | from math import ceil 6 | 7 | IMPLEMENTATION = None 8 | 9 | class Timeout(RuntimeError): 10 | """ 11 | Timeout error, inherits from RuntimeError 12 | """ 13 | pass 14 | 15 | def signalHandler(signum, frame): 16 | """ 17 | Signal handler to catch timeout signal: raise Timeout exception. 18 | """ 19 | raise Timeout("Timeout exceed!") 20 | 21 | def limitedTime(second, func, *args, **kw): 22 | """ 23 | Call func(*args, **kw) with a timeout of second seconds. 24 | """ 25 | return func(*args, **kw) 26 | 27 | def fixTimeout(second): 28 | """ 29 | Fix timeout value: convert to integer with a minimum of 1 second 30 | """ 31 | if isinstance(second, float): 32 | second = int(ceil(second)) 33 | assert isinstance(second, (int, long)) 34 | return max(second, 1) 35 | 36 | if not IMPLEMENTATION: 37 | try: 38 | from signal import signal, alarm, SIGALRM 39 | 40 | # signal.alarm() implementation 41 | def limitedTime(second, func, *args, **kw): 42 | second = fixTimeout(second) 43 | old_alarm = signal(SIGALRM, signalHandler) 44 | try: 45 | alarm(second) 46 | return func(*args, **kw) 47 | finally: 48 | alarm(0) 49 | signal(SIGALRM, old_alarm) 50 | 51 | IMPLEMENTATION = "signal.alarm()" 52 | except ImportError: 53 | pass 54 | 55 | if not IMPLEMENTATION: 56 | try: 57 | from signal import signal, SIGXCPU 58 | from resource import getrlimit, setrlimit, RLIMIT_CPU 59 | 60 | # resource.setrlimit(RLIMIT_CPU) implementation 61 | # "Bug": timeout is 'CPU' time so sleep() are not part of the timeout 62 | def limitedTime(second, func, *args, **kw): 63 | second = fixTimeout(second) 64 | old_alarm = signal(SIGXCPU, signalHandler) 65 | current = getrlimit(RLIMIT_CPU) 66 | try: 67 | setrlimit(RLIMIT_CPU, (second, current[1])) 68 | return func(*args, **kw) 69 | finally: 70 | setrlimit(RLIMIT_CPU, current) 71 | signal(SIGXCPU, old_alarm) 72 | 73 | IMPLEMENTATION = "resource.setrlimit(RLIMIT_CPU)" 74 | except ImportError: 75 | pass 76 | 77 | -------------------------------------------------------------------------------- /hachoir_parser/archive/mar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Microsoft Archive parser 3 | 4 | Author: Victor Stinner 5 | Creation date: 2007-03-04 6 | """ 7 | 8 | MAX_NB_FILE = 100000 9 | 10 | from hachoir_parser import Parser 11 | from hachoir_core.field import FieldSet, String, UInt32, SubFile 12 | from hachoir_core.endian import LITTLE_ENDIAN 13 | from hachoir_core.text_handler import textHandler, filesizeHandler, hexadecimal 14 | 15 | class FileIndex(FieldSet): 16 | static_size = 68*8 17 | 18 | def createFields(self): 19 | yield String(self, "filename", 56, truncate="\0", charset="ASCII") 20 | yield filesizeHandler(UInt32(self, "filesize")) 21 | yield textHandler(UInt32(self, "crc32"), hexadecimal) 22 | yield UInt32(self, "offset") 23 | 24 | def createDescription(self): 25 | return "File %s (%s) at %s" % ( 26 | self["filename"].value, self["filesize"].display, self["offset"].value) 27 | 28 | class MarFile(Parser): 29 | MAGIC = "MARC" 30 | PARSER_TAGS = { 31 | "id": "mar", 32 | "category": "archive", 33 | "file_ext": ("mar",), 34 | "min_size": 80*8, # At least one file index 35 | "magic": ((MAGIC, 0),), 36 | "description": "Microsoft Archive", 37 | } 38 | endian = LITTLE_ENDIAN 39 | 40 | def validate(self): 41 | if self.stream.readBytes(0, 4) != self.MAGIC: 42 | return "Invalid magic" 43 | if self["version"].value != 3: 44 | return "Invalid version" 45 | if not(1 <= self["nb_file"].value <= MAX_NB_FILE): 46 | return "Invalid number of file" 47 | return True 48 | 49 | def createFields(self): 50 | yield String(self, "magic", 4, "File signature (MARC)", charset="ASCII") 51 | yield UInt32(self, "version") 52 | yield UInt32(self, "nb_file") 53 | files = [] 54 | for index in xrange(self["nb_file"].value): 55 | item = FileIndex(self, "file[]") 56 | yield item 57 | if item["filesize"].value: 58 | files.append(item) 59 | files.sort(key=lambda item: item["offset"].value) 60 | for index in files: 61 | padding = self.seekByte(index["offset"].value) 62 | if padding: 63 | yield padding 64 | size = index["filesize"].value 65 | desc = "File %s" % index["filename"].value 66 | yield SubFile(self, "data[]", size, desc, filename=index["filename"].value) 67 | 68 | -------------------------------------------------------------------------------- /hachoir_core/field/byte_field.py: -------------------------------------------------------------------------------- 1 | """ 2 | Very basic field: raw content with a size in byte. Use this class for 3 | unknown content. 4 | """ 5 | 6 | from hachoir_core.field import Field, FieldError 7 | from hachoir_core.tools import makePrintable 8 | from hachoir_core.bits import str2hex 9 | from hachoir_core import config 10 | 11 | MAX_LENGTH = (2**64) 12 | 13 | class RawBytes(Field): 14 | """ 15 | Byte vector of unknown content 16 | 17 | @see: L{Bytes} 18 | """ 19 | static_size = staticmethod(lambda *args, **kw: args[1]*8) 20 | 21 | def __init__(self, parent, name, length, description="Raw data"): 22 | assert issubclass(parent.__class__, Field) 23 | if not(0 < length <= MAX_LENGTH): 24 | raise FieldError("Invalid RawBytes length (%s)!" % length) 25 | Field.__init__(self, parent, name, length*8, description) 26 | self._display = None 27 | 28 | def _createDisplay(self, human): 29 | max_bytes = config.max_byte_length 30 | if type(self._getValue) is type(lambda: None): 31 | display = self.value[:max_bytes] 32 | else: 33 | if self._display is None: 34 | address = self.absolute_address 35 | length = min(self._size / 8, max_bytes) 36 | self._display = self._parent.stream.readBytes(address, length) 37 | display = self._display 38 | truncated = (8 * len(display) < self._size) 39 | if human: 40 | if truncated: 41 | display += "(...)" 42 | return makePrintable(display, "latin-1", quote='"', to_unicode=True) 43 | else: 44 | display = str2hex(display, format=r"\x%02x") 45 | if truncated: 46 | return '"%s(...)"' % display 47 | else: 48 | return '"%s"' % display 49 | 50 | def createDisplay(self): 51 | return self._createDisplay(True) 52 | 53 | def createRawDisplay(self): 54 | return self._createDisplay(False) 55 | 56 | def hasValue(self): 57 | return True 58 | 59 | def createValue(self): 60 | assert (self._size % 8) == 0 61 | if self._display: 62 | self._display = None 63 | return self._parent.stream.readBytes( 64 | self.absolute_address, self._size / 8) 65 | 66 | class Bytes(RawBytes): 67 | """ 68 | Byte vector: can be used for magic number or GUID/UUID for example. 69 | 70 | @see: L{RawBytes} 71 | """ 72 | pass 73 | 74 | -------------------------------------------------------------------------------- /hachoir_core/field/__init__.py: -------------------------------------------------------------------------------- 1 | # Field classes 2 | from hachoir_core.field.field import Field, FieldError, MissingField, joinPath 3 | from hachoir_core.field.bit_field import Bit, Bits, RawBits 4 | from hachoir_core.field.byte_field import Bytes, RawBytes 5 | from hachoir_core.field.sub_file import SubFile, CompressedField 6 | from hachoir_core.field.character import Character 7 | from hachoir_core.field.integer import ( 8 | Int8, Int16, Int24, Int32, Int64, 9 | UInt8, UInt16, UInt24, UInt32, UInt64, 10 | GenericInteger) 11 | from hachoir_core.field.enum import Enum 12 | from hachoir_core.field.string_field import (GenericString, 13 | String, CString, UnixLine, 14 | PascalString8, PascalString16, PascalString32) 15 | from hachoir_core.field.padding import (PaddingBits, PaddingBytes, 16 | NullBits, NullBytes) 17 | 18 | # Functions 19 | from hachoir_core.field.helper import (isString, isInteger, 20 | createPaddingField, createNullField, createRawField, 21 | writeIntoFile, createOrphanField) 22 | 23 | # FieldSet classes 24 | from hachoir_core.field.fake_array import FakeArray 25 | from hachoir_core.field.basic_field_set import (BasicFieldSet, 26 | ParserError, MatchError) 27 | from hachoir_core.field.generic_field_set import GenericFieldSet 28 | from hachoir_core.field.seekable_field_set import SeekableFieldSet, RootSeekableFieldSet 29 | from hachoir_core.field.field_set import FieldSet 30 | from hachoir_core.field.static_field_set import StaticFieldSet 31 | from hachoir_core.field.parser import Parser 32 | from hachoir_core.field.vector import GenericVector, UserVector 33 | 34 | # Complex types 35 | from hachoir_core.field.float import Float32, Float64, Float80 36 | from hachoir_core.field.timestamp import (GenericTimestamp, 37 | TimestampUnix32, TimestampUnix64, TimestampMac32, TimestampUUID60, TimestampWin64, 38 | DateTimeMSDOS32, TimeDateMSDOS32, TimedeltaWin64) 39 | 40 | # Special Field classes 41 | from hachoir_core.field.link import Link, Fragment 42 | 43 | available_types = ( 44 | Bit, Bits, RawBits, 45 | Bytes, RawBytes, 46 | SubFile, 47 | Character, 48 | Int8, Int16, Int24, Int32, Int64, 49 | UInt8, UInt16, UInt24, UInt32, UInt64, 50 | String, CString, UnixLine, 51 | PascalString8, PascalString16, PascalString32, 52 | Float32, Float64, 53 | PaddingBits, PaddingBytes, 54 | NullBits, NullBytes, 55 | TimestampUnix32, TimestampMac32, TimestampWin64, 56 | DateTimeMSDOS32, TimeDateMSDOS32, 57 | # GenericInteger, GenericString, 58 | ) 59 | 60 | -------------------------------------------------------------------------------- /hachoir_parser/game/spider_man_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parser for an obscure FMV file format: bin files from the game 3 | "The Amazing Spider-Man vs. The Kingpin" (Sega CD) 4 | 5 | Author: Mike Melanson 6 | Creation date: 2006-09-30 7 | File samples: http://samples.mplayerhq.hu/game-formats/spiderman-segacd-bin/ 8 | """ 9 | 10 | from hachoir_parser import Parser 11 | from hachoir_core.field import FieldSet, UInt32, String, RawBytes 12 | from hachoir_core.endian import BIG_ENDIAN 13 | from hachoir_core.text_handler import textHandler, hexadecimal 14 | 15 | class Chunk(FieldSet): 16 | tag_info = { 17 | "CONF" : ("conf[]", None, "Configuration header"), 18 | "AUDI" : ("audio[]", None, "Audio chunk"), 19 | "SYNC" : ("sync[]", None, "Start of video frame data"), 20 | "IVRA" : ("ivra[]", None, "Vector codebook (?)"), 21 | "VRAM" : ("video[]", None, "Video RAM tile pattern"), 22 | "CRAM" : ("color[]", None, "Color RAM (palette)"), 23 | "CEND" : ("video_end[]", None, "End of video data"), 24 | "MEND" : ("end_file", None, "End of file"), 25 | } 26 | 27 | def __init__(self, *args): 28 | FieldSet.__init__(self, *args) 29 | self._size = self["length"].value * 8 30 | fourcc = self["fourcc"].value 31 | if fourcc in self.tag_info: 32 | self._name, self._parser, self._description = self.tag_info[fourcc] 33 | else: 34 | self._parser = None 35 | self._description = "Unknown chunk: fourcc %s" % self["fourcc"].display 36 | 37 | def createFields(self): 38 | yield String(self, "fourcc", 4, "FourCC", charset="ASCII") 39 | yield textHandler(UInt32(self, "length", "length"), hexadecimal) 40 | size = self["length"].value - 8 41 | if 0 < size: 42 | if self._parser: 43 | for field in self._parser(self, size): 44 | yield field 45 | else: 46 | yield RawBytes(self, "data", size) 47 | 48 | class SpiderManVideoFile(Parser): 49 | PARSER_TAGS = { 50 | "id": "spiderman_video", 51 | "category": "game", 52 | "file_ext": ("bin",), 53 | "min_size": 8*8, 54 | "description": "The Amazing Spider-Man vs. The Kingpin (Sega CD) FMV video" 55 | } 56 | 57 | endian = BIG_ENDIAN 58 | 59 | def validate(self): 60 | return (self.stream.readBytes(0, 4) == 'CONF') 61 | 62 | def createFields(self): 63 | while not self.eof: 64 | yield Chunk(self, "chunk[]") 65 | 66 | -------------------------------------------------------------------------------- /hachoir_parser/image/tiff.py: -------------------------------------------------------------------------------- 1 | """ 2 | TIFF image parser. 3 | 4 | Authors: Victor Stinner, Sebastien Ponce, Robert Xiao 5 | Creation date: 30 september 2006 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import FieldSet, SeekableFieldSet, RootSeekableFieldSet, Bytes 10 | from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN 11 | from hachoir_parser.image.exif import TIFF 12 | 13 | def getStrips(ifd): 14 | data = {} 15 | for i, entry in enumerate(ifd.array('entry')): 16 | data[entry['tag'].display] = entry 17 | # image data 18 | if "StripOffsets" in data and "StripByteCounts" in data: 19 | offs = ifd.getEntryValues(data["StripOffsets"]) 20 | bytes = ifd.getEntryValues(data["StripByteCounts"]) 21 | for off, byte in zip(offs, bytes): 22 | yield off.value, byte.value 23 | 24 | class ImageFile(SeekableFieldSet): 25 | def __init__(self, parent, name, description, ifd): 26 | SeekableFieldSet.__init__(self, parent, name, description, None) 27 | self._ifd = ifd 28 | 29 | def createFields(self): 30 | for off, byte in getStrips(self._ifd): 31 | self.seekByte(off, relative=False) 32 | yield Bytes(self, "strip[]", byte) 33 | 34 | class TiffFile(RootSeekableFieldSet, Parser): 35 | PARSER_TAGS = { 36 | "id": "tiff", 37 | "category": "image", 38 | "file_ext": ("tif", "tiff"), 39 | "mime": (u"image/tiff",), 40 | "min_size": 8*8, 41 | "magic": (("II\x2A\0", 0), ("MM\0\x2A", 0)), 42 | "description": "TIFF picture" 43 | } 44 | 45 | # Correct endian is set in constructor 46 | endian = LITTLE_ENDIAN 47 | 48 | def __init__(self, stream, **args): 49 | RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self)) 50 | if self.stream.readBytes(0, 2) == "MM": 51 | self.endian = BIG_ENDIAN 52 | Parser.__init__(self, stream, **args) 53 | 54 | def validate(self): 55 | endian = self.stream.readBytes(0, 2) 56 | if endian not in ("MM", "II"): 57 | return "Invalid endian (%r)" % endian 58 | if self["version"].value != 42: 59 | return "Unknown TIFF version" 60 | return True 61 | 62 | def createFields(self): 63 | for field in TIFF(self): 64 | yield field 65 | 66 | for ifd in self.array('ifd'): 67 | offs = (off for off, byte in getStrips(ifd)) 68 | self.seekByte(min(offs), relative=False) 69 | image = ImageFile(self, "image[]", "Image File", ifd) 70 | yield image 71 | -------------------------------------------------------------------------------- /hachoir_core/field/fake_array.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from hachoir_core.field import MissingField 3 | 4 | class FakeArray: 5 | """ 6 | Simulate an array for GenericFieldSet.array(): fielset.array("item")[0] is 7 | equivalent to fielset.array("item[0]"). 8 | 9 | It's possible to iterate over the items using:: 10 | 11 | for element in fieldset.array("item"): 12 | ... 13 | 14 | And to get array size using len(fieldset.array("item")). 15 | """ 16 | def __init__(self, fieldset, name): 17 | pos = name.rfind("/") 18 | if pos != -1: 19 | self.fieldset = fieldset[name[:pos]] 20 | self.name = name[pos+1:] 21 | else: 22 | self.fieldset = fieldset 23 | self.name = name 24 | self._format = "%s[%%u]" % self.name 25 | self._cache = {} 26 | self._known_size = False 27 | self._max_index = -1 28 | 29 | def __nonzero__(self): 30 | "Is the array empty or not?" 31 | if self._cache: 32 | return True 33 | else: 34 | return (0 in self) 35 | 36 | def __len__(self): 37 | "Number of fields in the array" 38 | total = self._max_index+1 39 | if not self._known_size: 40 | for index in itertools.count(total): 41 | try: 42 | field = self[index] 43 | total += 1 44 | except MissingField: 45 | break 46 | return total 47 | 48 | def __contains__(self, index): 49 | try: 50 | field = self[index] 51 | return True 52 | except MissingField: 53 | return False 54 | 55 | def __getitem__(self, index): 56 | """ 57 | Get a field of the array. Returns a field, or raise MissingField 58 | exception if the field doesn't exist. 59 | """ 60 | try: 61 | value = self._cache[index] 62 | except KeyError: 63 | try: 64 | value = self.fieldset[self._format % index] 65 | except MissingField: 66 | self._known_size = True 67 | raise 68 | self._cache[index] = value 69 | self._max_index = max(index, self._max_index) 70 | return value 71 | 72 | def __iter__(self): 73 | """ 74 | Iterate in the fields in their index order: field[0], field[1], ... 75 | """ 76 | for index in itertools.count(0): 77 | try: 78 | yield self[index] 79 | except MissingField: 80 | raise StopIteration() 81 | 82 | -------------------------------------------------------------------------------- /hachoir_parser/archive/mozilla_ar.py: -------------------------------------------------------------------------------- 1 | """MAR (Mozilla ARchive) parser 2 | 3 | Author: Robert Xiao 4 | Creation date: July 10, 2007 5 | 6 | """ 7 | 8 | from hachoir_core.endian import BIG_ENDIAN 9 | from hachoir_core.field import (RootSeekableFieldSet, FieldSet, 10 | String, CString, UInt32, RawBytes) 11 | from hachoir_core.text_handler import displayHandler, filesizeHandler 12 | from hachoir_core.tools import humanUnixAttributes 13 | from hachoir_parser import HachoirParser 14 | 15 | class IndexEntry(FieldSet): 16 | def createFields(self): 17 | yield UInt32(self, "offset", "Offset in bytes relative to start of archive") 18 | yield filesizeHandler(UInt32(self, "length", "Length in bytes")) 19 | yield displayHandler(UInt32(self, "flags"), humanUnixAttributes) 20 | yield CString(self, "name", "Filename (byte array)") 21 | 22 | def createDescription(self): 23 | return 'File %s, Size %s, Mode %s'%( 24 | self["name"].display, self["length"].display, self["flags"].display) 25 | 26 | class MozillaArchive(HachoirParser, RootSeekableFieldSet): 27 | MAGIC = "MAR1" 28 | PARSER_TAGS = { 29 | "id": "mozilla_ar", 30 | "category": "archive", 31 | "file_ext": ("mar",), 32 | "min_size": (8+4+13)*8, # Header, Index Header, 1 Index Entry 33 | "magic": ((MAGIC, 0),), 34 | "description": "Mozilla Archive", 35 | } 36 | endian = BIG_ENDIAN 37 | 38 | def __init__(self, stream, **args): 39 | RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self)) 40 | HachoirParser.__init__(self, stream, **args) 41 | 42 | def validate(self): 43 | if self.stream.readBytes(0, 4) != self.MAGIC: 44 | return "Invalid magic" 45 | return True 46 | 47 | def createFields(self): 48 | yield String(self, "magic", 4, "File signature (MAR1)", charset="ASCII") 49 | yield UInt32(self, "index_offset", "Offset to index relative to file start") 50 | self.seekByte(self["index_offset"].value, False) 51 | yield UInt32(self, "index_size", "size of index in bytes") 52 | current_index_size = 0 # bytes 53 | while current_index_size < self["index_size"].value: 54 | # plus 4 compensates for index_size 55 | self.seekByte(self["index_offset"].value + current_index_size + 4, False) 56 | entry = IndexEntry(self, "index_entry[]") 57 | yield entry 58 | current_index_size += entry.size // 8 59 | self.seekByte(entry["offset"].value, False) 60 | yield RawBytes(self, "file[]", entry["length"].value) 61 | -------------------------------------------------------------------------------- /pdfminer/ascii85.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version). 4 | 5 | This code is in the public domain. 6 | 7 | """ 8 | 9 | import re 10 | import struct 11 | 12 | # ascii85decode(data) 13 | def ascii85decode(data): 14 | """ 15 | In ASCII85 encoding, every four bytes are encoded with five ASCII 16 | letters, using 85 different types of characters (as 256**4 < 85**5). 17 | When the length of the original bytes is not a multiple of 4, a special 18 | rule is used for round up. 19 | 20 | The Adobe's ASCII85 implementation is slightly different from 21 | its original in handling the last characters. 22 | 23 | The sample string is taken from: 24 | http://en.wikipedia.org/w/index.php?title=Ascii85 25 | 26 | >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q') 27 | 'Man is distinguished' 28 | >>> ascii85decode('E,9)oF*2M7/c~>') 29 | 'pleasure.' 30 | """ 31 | n = b = 0 32 | out = '' 33 | for c in data: 34 | if '!' <= c and c <= 'u': 35 | n += 1 36 | b = b*85+(ord(c)-33) 37 | if n == 5: 38 | out += struct.pack('>L',b) 39 | n = b = 0 40 | elif c == 'z': 41 | assert n == 0 42 | out += '\0\0\0\0' 43 | elif c == '~': 44 | if n: 45 | for _ in range(5-n): 46 | b = b*85+84 47 | out += struct.pack('>L',b)[:n-1] 48 | break 49 | return out 50 | 51 | # asciihexdecode(data) 52 | hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE) 53 | trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE) 54 | def asciihexdecode(data): 55 | """ 56 | ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1 57 | For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the 58 | ASCIIHexDecode filter produces one byte of binary data. All white-space 59 | characters are ignored. A right angle bracket character (>) indicates 60 | EOD. Any other characters will cause an error. If the filter encounters 61 | the EOD marker after reading an odd number of hexadecimal digits, it 62 | will behave as if a 0 followed the last digit. 63 | 64 | >>> asciihexdecode('61 62 2e6364 65') 65 | 'ab.cde' 66 | >>> asciihexdecode('61 62 2e6364 657>') 67 | 'ab.cdep' 68 | >>> asciihexdecode('7>') 69 | 'p' 70 | """ 71 | decode = (lambda hx: chr(int(hx, 16))) 72 | out = map(decode, hex_re.findall(data)) 73 | m = trail_re.search(data) 74 | if m: 75 | out.append(decode("%c0" % m.group(1))) 76 | return ''.join(out) 77 | 78 | 79 | if __name__ == '__main__': 80 | import doctest 81 | doctest.testmod() 82 | -------------------------------------------------------------------------------- /hachoir_parser/image/psd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Photoshop parser (.psd file). 3 | 4 | Creation date: 8 january 2006 5 | Author: Victor Stinner 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import (FieldSet, 10 | UInt16, UInt32, String, NullBytes, Enum, RawBytes) 11 | from hachoir_core.endian import BIG_ENDIAN 12 | from hachoir_parser.image.photoshop_metadata import Photoshop8BIM 13 | 14 | class Config(FieldSet): 15 | def __init__(self, *args): 16 | FieldSet.__init__(self, *args) 17 | self._size = (4 + self["size"].value) * 8 18 | 19 | def createFields(self): 20 | yield UInt32(self, "size") 21 | while not self.eof: 22 | yield Photoshop8BIM(self, "item[]") 23 | 24 | class PsdFile(Parser): 25 | endian = BIG_ENDIAN 26 | PARSER_TAGS = { 27 | "id": "psd", 28 | "category": "image", 29 | "file_ext": ("psd",), 30 | "mime": (u"image/psd", u"image/photoshop", u"image/x-photoshop"), 31 | "min_size": 4*8, 32 | "magic": (("8BPS\0\1",0),), 33 | "description": "Photoshop (PSD) picture", 34 | } 35 | COLOR_MODE = { 36 | 0: u"Bitmap", 37 | 1: u"Grayscale", 38 | 2: u"Indexed", 39 | 3: u"RGB color", 40 | 4: u"CMYK color", 41 | 7: u"Multichannel", 42 | 8: u"Duotone", 43 | 9: u"Lab Color", 44 | } 45 | COMPRESSION_NAME = { 46 | 0: "Raw data", 47 | 1: "RLE", 48 | } 49 | 50 | def validate(self): 51 | if self.stream.readBytes(0, 4) != "8BPS": 52 | return "Invalid signature" 53 | return True 54 | 55 | def createFields(self): 56 | yield String(self, "signature", 4, "PSD signature (8BPS)", charset="ASCII") 57 | yield UInt16(self, "version") 58 | yield NullBytes(self, "reserved[]", 6) 59 | yield UInt16(self, "nb_channels") 60 | yield UInt32(self, "width") 61 | yield UInt32(self, "height") 62 | yield UInt16(self, "depth") 63 | yield Enum(UInt16(self, "color_mode"), self.COLOR_MODE) 64 | 65 | # Mode data 66 | yield UInt32(self, "mode_data_size") 67 | size = self["mode_data_size"].value 68 | if size: 69 | yield RawBytes(self, "mode_data", size) 70 | 71 | # Resources 72 | yield Config(self, "config") 73 | 74 | # Reserved 75 | yield UInt32(self, "reserved_data_size") 76 | size = self["reserved_data_size"].value 77 | if size: 78 | yield RawBytes(self, "reserved_data", size) 79 | 80 | yield Enum(UInt16(self, "compression"), self.COMPRESSION_NAME) 81 | 82 | size = (self.size - self.current_size) // 8 83 | if size: 84 | yield RawBytes(self, "end", size) 85 | 86 | -------------------------------------------------------------------------------- /hachoir_core/memory.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | #---- Default implementation when resource is missing ---------------------- 4 | PAGE_SIZE = 4096 5 | 6 | def getMemoryLimit(): 7 | """ 8 | Get current memory limit in bytes. 9 | 10 | Return None on error. 11 | """ 12 | return None 13 | 14 | def setMemoryLimit(max_mem): 15 | """ 16 | Set memory limit in bytes. 17 | Use value 'None' to disable memory limit. 18 | 19 | Return True if limit is set, False on error. 20 | """ 21 | return False 22 | 23 | def getMemorySize(): 24 | """ 25 | Read currenet process memory size: size of available virtual memory. 26 | This value is NOT the real memory usage. 27 | 28 | This function only works on Linux (use /proc/self/statm file). 29 | """ 30 | try: 31 | statm = open('/proc/self/statm').readline().split() 32 | except IOError: 33 | return None 34 | return int(statm[0]) * PAGE_SIZE 35 | 36 | def clearCaches(): 37 | """ 38 | Try to clear all caches: call gc.collect() (Python garbage collector). 39 | """ 40 | gc.collect() 41 | #import re; re.purge() 42 | 43 | try: 44 | #---- 'resource' implementation --------------------------------------------- 45 | from resource import getpagesize, getrlimit, setrlimit, RLIMIT_AS 46 | 47 | PAGE_SIZE = getpagesize() 48 | 49 | def getMemoryLimit(): 50 | try: 51 | limit = getrlimit(RLIMIT_AS)[0] 52 | if 0 < limit: 53 | limit *= PAGE_SIZE 54 | return limit 55 | except ValueError: 56 | return None 57 | 58 | def setMemoryLimit(max_mem): 59 | if max_mem is None: 60 | max_mem = -1 61 | try: 62 | setrlimit(RLIMIT_AS, (max_mem, -1)) 63 | return True 64 | except ValueError: 65 | return False 66 | except ImportError: 67 | pass 68 | 69 | def limitedMemory(limit, func, *args, **kw): 70 | """ 71 | Limit memory grow when calling func(*args, **kw): 72 | restrict memory grow to 'limit' bytes. 73 | 74 | Use try/except MemoryError to catch the error. 75 | """ 76 | # First step: clear cache to gain memory 77 | clearCaches() 78 | 79 | # Get total program size 80 | max_rss = getMemorySize() 81 | if max_rss is not None: 82 | # Get old limit and then set our new memory limit 83 | old_limit = getMemoryLimit() 84 | limit = max_rss + limit 85 | limited = setMemoryLimit(limit) 86 | else: 87 | limited = False 88 | 89 | try: 90 | # Call function 91 | return func(*args, **kw) 92 | finally: 93 | # and unset our memory limit 94 | if limited: 95 | setMemoryLimit(old_limit) 96 | 97 | # After calling the function: clear all caches 98 | clearCaches() 99 | 100 | -------------------------------------------------------------------------------- /hachoir_core/field/sub_file.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import Bytes 2 | from hachoir_core.tools import makePrintable, humanFilesize 3 | from hachoir_core.stream import InputIOStream 4 | 5 | class SubFile(Bytes): 6 | """ 7 | File stored in another file 8 | """ 9 | def __init__(self, parent, name, length, description=None, 10 | parser=None, filename=None, mime_type=None, parser_class=None): 11 | if filename: 12 | if not isinstance(filename, unicode): 13 | filename = makePrintable(filename, "ISO-8859-1") 14 | if not description: 15 | description = 'File "%s" (%s)' % (filename, humanFilesize(length)) 16 | Bytes.__init__(self, parent, name, length, description) 17 | def createInputStream(cis, **args): 18 | tags = args.setdefault("tags",[]) 19 | if parser_class: 20 | tags.append(( "class", parser_class )) 21 | if parser is not None: 22 | tags.append(( "id", parser.PARSER_TAGS["id"] )) 23 | if mime_type: 24 | tags.append(( "mime", mime_type )) 25 | if filename: 26 | tags.append(( "filename", filename )) 27 | return cis(**args) 28 | self.setSubIStream(createInputStream) 29 | 30 | class CompressedStream: 31 | offset = 0 32 | 33 | def __init__(self, stream, decompressor): 34 | self.stream = stream 35 | self.decompressor = decompressor(stream) 36 | self._buffer = '' 37 | 38 | def read(self, size): 39 | d = self._buffer 40 | data = [ d[:size] ] 41 | size -= len(d) 42 | if size > 0: 43 | d = self.decompressor(size) 44 | data.append(d[:size]) 45 | size -= len(d) 46 | while size > 0: 47 | n = 4096 48 | if self.stream.size: 49 | n = min(self.stream.size - self.offset, n) 50 | if not n: 51 | break 52 | d = self.stream.read(self.offset, n)[1] 53 | self.offset += 8 * len(d) 54 | d = self.decompressor(size, d) 55 | data.append(d[:size]) 56 | size -= len(d) 57 | self._buffer = d[size+len(d):] 58 | return ''.join(data) 59 | 60 | def CompressedField(field, decompressor): 61 | def createInputStream(cis, source=None, **args): 62 | if field._parent: 63 | stream = cis(source=source) 64 | args.setdefault("tags", []).extend(stream.tags) 65 | else: 66 | stream = field.stream 67 | input = CompressedStream(stream, decompressor) 68 | if source is None: 69 | source = "Compressed source: '%s' (offset=%s)" % (stream.source, field.absolute_address) 70 | return InputIOStream(input, source=source, **args) 71 | field.setSubIStream(createInputStream) 72 | return field 73 | -------------------------------------------------------------------------------- /hachoir_parser/image/pcx.py: -------------------------------------------------------------------------------- 1 | """ 2 | PCX picture filter. 3 | """ 4 | 5 | from hachoir_parser import Parser 6 | from hachoir_core.field import ( 7 | UInt8, UInt16, 8 | PaddingBytes, RawBytes, 9 | Enum) 10 | from hachoir_parser.image.common import PaletteRGB 11 | from hachoir_core.endian import LITTLE_ENDIAN 12 | 13 | class PcxFile(Parser): 14 | endian = LITTLE_ENDIAN 15 | PARSER_TAGS = { 16 | "id": "pcx", 17 | "category": "image", 18 | "file_ext": ("pcx",), 19 | "mime": (u"image/x-pcx",), 20 | "min_size": 128*8, 21 | "description": "PC Paintbrush (PCX) picture" 22 | } 23 | compression_name = { 1: "Run-length encoding (RLE)" } 24 | version_name = { 25 | 0: u"Version 2.5 of PC Paintbrush", 26 | 2: u"Version 2.8 with palette information", 27 | 3: u"Version 2.8 without palette information", 28 | 4: u"PC Paintbrush for Windows", 29 | 5: u"Version 3.0 (or greater) of PC Paintbrush" 30 | } 31 | 32 | def validate(self): 33 | if self["id"].value != 10: 34 | return "Wrong signature" 35 | if self["version"].value not in self.version_name: 36 | return "Unknown format version" 37 | if self["bpp"].value not in (1, 2, 4, 8, 24, 32): 38 | return "Unknown bits/pixel" 39 | if self["reserved[0]"].value != "\0": 40 | return "Invalid reserved value" 41 | return True 42 | 43 | def createFields(self): 44 | yield UInt8(self, "id", "PCX identifier (10)") 45 | yield Enum(UInt8(self, "version", "PCX version"), self.version_name) 46 | yield Enum(UInt8(self, "compression", "Compression method"), self.compression_name) 47 | yield UInt8(self, "bpp", "Bits / pixel") 48 | yield UInt16(self, "xmin", "Minimum X") 49 | yield UInt16(self, "ymin", "Minimum Y") 50 | yield UInt16(self, "xmax", "Width minus one") # value + 1 51 | yield UInt16(self, "ymax", "Height minus one") # value + 1 52 | yield UInt16(self, "horiz_dpi", "Horizontal DPI") 53 | yield UInt16(self, "vert_dpi", "Vertical DPI") 54 | yield PaletteRGB(self, "palette_4bits", 16, "Palette (4 bits)") 55 | yield PaddingBytes(self, "reserved[]", 1) 56 | yield UInt8(self, "nb_color_plan", "Number of color plans") 57 | yield UInt16(self, "bytes_per_line", "Bytes per line") 58 | yield UInt16(self, "color_mode", "Color mode") 59 | yield PaddingBytes(self, "reserved[]", 58) 60 | 61 | if self._size is None: # TODO: is it possible to handle piped input? 62 | raise NotImplementedError 63 | 64 | nb_colors = 256 65 | size = (self._size - self.current_size)/8 66 | has_palette = self["bpp"].value == 8 67 | if has_palette: 68 | size -= nb_colors*3 69 | yield RawBytes(self, "image_data", size, "Image data") 70 | 71 | if has_palette: 72 | yield PaletteRGB(self, "palette_8bits", nb_colors, "Palette (8 bit)") 73 | 74 | -------------------------------------------------------------------------------- /hachoir_parser/program/prc.py: -------------------------------------------------------------------------------- 1 | """ 2 | PRC (Palm resource) parser. 3 | 4 | Author: Sebastien Ponce 5 | Creation date: 29 october 2008 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import (FieldSet, 10 | UInt16, UInt32, TimestampMac32, 11 | String, RawBytes) 12 | from hachoir_core.endian import BIG_ENDIAN 13 | 14 | class PRCHeader(FieldSet): 15 | static_size = 78*8 16 | 17 | def createFields(self): 18 | yield String(self, "name", 32, "Name") 19 | yield UInt16(self, "flags", "Flags") 20 | yield UInt16(self, "version", "Version") 21 | yield TimestampMac32(self, "create_time", "Creation time") 22 | yield TimestampMac32(self, "mod_time", "Modification time") 23 | yield TimestampMac32(self, "backup_time", "Backup time") 24 | yield UInt32(self, "mod_num", "mod num") 25 | yield UInt32(self, "app_info", "app info") 26 | yield UInt32(self, "sort_info", "sort info") 27 | yield UInt32(self, "type", "type") 28 | yield UInt32(self, "id", "id") 29 | yield UInt32(self, "unique_id_seed", "unique_id_seed") 30 | yield UInt32(self, "next_record_list", "next_record_list") 31 | yield UInt16(self, "num_records", "num_records") 32 | 33 | class ResourceHeader(FieldSet): 34 | static_size = 10*8 35 | 36 | def createFields(self): 37 | yield String(self, "name", 4, "Name of the resource") 38 | yield UInt16(self, "flags", "ID number of the resource") 39 | yield UInt32(self, "offset", "Pointer to the resource data") 40 | 41 | def createDescription(self): 42 | return "Resource Header (%s)" % self["name"] 43 | 44 | class PRCFile(Parser): 45 | PARSER_TAGS = { 46 | "id": "prc", 47 | "category": "program", 48 | "file_ext": ("prc", ""), 49 | "min_size": ResourceHeader.static_size, # At least one program header 50 | "mime": ( 51 | u"application/x-pilot-prc", 52 | u"application/x-palmpilot"), 53 | "description": "Palm Resource File" 54 | } 55 | endian = BIG_ENDIAN 56 | 57 | def validate(self): 58 | # FIXME: Implement the validation function! 59 | return False 60 | 61 | def createFields(self): 62 | # Parse header and program headers 63 | yield PRCHeader(self, "header", "Header") 64 | lens = [] 65 | firstOne = True 66 | poff = 0 67 | for index in xrange(self["header/num_records"].value): 68 | r = ResourceHeader(self, "res_header[]") 69 | if firstOne: 70 | firstOne = False 71 | else: 72 | lens.append(r["offset"].value - poff) 73 | poff = r["offset"].value 74 | yield r 75 | lens.append(self.size/8 - poff) 76 | yield UInt16(self, "placeholder", "Place holder bytes") 77 | for i in range(len(lens)): 78 | yield RawBytes(self, "res[]", lens[i], '"'+self["res_header["+str(i)+"]/name"].value+"\" Resource") 79 | 80 | def createDescription(self): 81 | return "Palm Resource file" 82 | 83 | -------------------------------------------------------------------------------- /hachoir_parser/misc/hlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Microsoft Windows Help (HLP) parser for Hachoir project. 3 | 4 | Documents: 5 | - Windows Help File Format / Annotation File Format / SHG and MRB File Format 6 | written by M. Winterhoff (100326.2776@compuserve.com) 7 | found on http://www.wotsit.org/ 8 | 9 | Author: Victor Stinner 10 | Creation date: 2007-09-03 11 | """ 12 | 13 | from hachoir_parser import Parser 14 | from hachoir_core.field import (FieldSet, 15 | Bits, Int32, UInt16, UInt32, 16 | NullBytes, RawBytes, PaddingBytes, String) 17 | from hachoir_core.endian import LITTLE_ENDIAN 18 | from hachoir_core.text_handler import (textHandler, hexadecimal, 19 | displayHandler, humanFilesize) 20 | 21 | class FileEntry(FieldSet): 22 | def __init__(self, *args, **kw): 23 | FieldSet.__init__(self, *args, **kw) 24 | self._size = self["res_space"].value * 8 25 | 26 | def createFields(self): 27 | yield displayHandler(UInt32(self, "res_space", "Reserved space"), humanFilesize) 28 | yield displayHandler(UInt32(self, "used_space", "Used space"), humanFilesize) 29 | yield Bits(self, "file_flags", 8, "(=4)") 30 | 31 | yield textHandler(UInt16(self, "magic"), hexadecimal) 32 | yield Bits(self, "flags", 16) 33 | yield displayHandler(UInt16(self, "page_size", "Page size in bytes"), humanFilesize) 34 | yield String(self, "structure", 16, strip="\0", charset="ASCII") 35 | yield NullBytes(self, "zero", 2) 36 | yield UInt16(self, "nb_page_splits", "Number of page splits B+ tree has suffered") 37 | yield UInt16(self, "root_page", "Page number of B+ tree root page") 38 | yield PaddingBytes(self, "one", 2, pattern="\xFF") 39 | yield UInt16(self, "nb_page", "Number of B+ tree pages") 40 | yield UInt16(self, "nb_level", "Number of levels of B+ tree") 41 | yield UInt16(self, "nb_entry", "Number of entries in B+ tree") 42 | 43 | size = (self.size - self.current_size)//8 44 | if size: 45 | yield PaddingBytes(self, "reserved_space", size) 46 | 47 | class HlpFile(Parser): 48 | PARSER_TAGS = { 49 | "id": "hlp", 50 | "category": "misc", 51 | "file_ext": ("hlp",), 52 | "min_size": 32, 53 | "description": "Microsoft Windows Help (HLP)", 54 | } 55 | endian = LITTLE_ENDIAN 56 | 57 | def validate(self): 58 | if self["magic"].value != 0x00035F3F: 59 | return "Invalid magic" 60 | if self["filesize"].value != self.stream.size//8: 61 | return "Invalid magic" 62 | return True 63 | 64 | def createFields(self): 65 | yield textHandler(UInt32(self, "magic"), hexadecimal) 66 | yield UInt32(self, "dir_start", "Directory start") 67 | yield Int32(self, "first_free_block", "First free block") 68 | yield UInt32(self, "filesize", "File size in bytes") 69 | 70 | yield self.seekByte(self["dir_start"].value) 71 | yield FileEntry(self, "file[]") 72 | 73 | size = (self.size - self.current_size)//8 74 | if size: 75 | yield RawBytes(self, "end", size) 76 | 77 | -------------------------------------------------------------------------------- /hachoir_parser/image/tga.py: -------------------------------------------------------------------------------- 1 | """ 2 | Truevision Targa Graphic (TGA) picture parser. 3 | 4 | Author: Victor Stinner 5 | Creation: 18 december 2006 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import FieldSet, UInt8, UInt16, Enum, RawBytes 10 | from hachoir_core.endian import LITTLE_ENDIAN 11 | from hachoir_parser.image.common import PaletteRGB 12 | 13 | class Line(FieldSet): 14 | def __init__(self, *args): 15 | FieldSet.__init__(self, *args) 16 | self._size = self["/width"].value * self["/bpp"].value 17 | 18 | def createFields(self): 19 | for x in xrange(self["/width"].value): 20 | yield UInt8(self, "pixel[]") 21 | 22 | class Pixels(FieldSet): 23 | def __init__(self, *args): 24 | FieldSet.__init__(self, *args) 25 | self._size = self["/width"].value * self["/height"].value * self["/bpp"].value 26 | 27 | def createFields(self): 28 | if self["/options"].value == 0: 29 | RANGE = xrange(self["/height"].value-1,-1,-1) 30 | else: 31 | RANGE = xrange(self["/height"].value) 32 | for y in RANGE: 33 | yield Line(self, "line[%u]" % y) 34 | 35 | class TargaFile(Parser): 36 | PARSER_TAGS = { 37 | "id": "targa", 38 | "category": "image", 39 | "file_ext": ("tga",), 40 | "mime": (u"image/targa", u"image/tga", u"image/x-tga"), 41 | "min_size": 18*8, 42 | "description": u"Truevision Targa Graphic (TGA)" 43 | } 44 | CODEC_NAME = { 45 | 1: u"8-bit uncompressed", 46 | 2: u"24-bit uncompressed", 47 | 9: u"8-bit RLE", 48 | 10: u"24-bit RLE", 49 | } 50 | endian = LITTLE_ENDIAN 51 | 52 | def validate(self): 53 | if self["version"].value != 1: 54 | return "Unknown version" 55 | if self["codec"].value not in self.CODEC_NAME: 56 | return "Unknown codec" 57 | if self["x_min"].value != 0 or self["y_min"].value != 0: 58 | return "(x_min, y_min) is not (0,0)" 59 | if self["bpp"].value not in (8, 24): 60 | return "Unknown bits/pixel value" 61 | return True 62 | 63 | def createFields(self): 64 | yield UInt8(self, "hdr_size", "Header size in bytes") 65 | yield UInt8(self, "version", "Targa version (always one)") 66 | yield Enum(UInt8(self, "codec", "Pixels encoding"), self.CODEC_NAME) 67 | yield UInt16(self, "palette_ofs", "Palette absolute file offset") 68 | yield UInt16(self, "nb_color", "Number of color") 69 | yield UInt8(self, "color_map_size", "Color map entry size") 70 | yield UInt16(self, "x_min") 71 | yield UInt16(self, "y_min") 72 | yield UInt16(self, "width") 73 | yield UInt16(self, "height") 74 | yield UInt8(self, "bpp", "Bits per pixel") 75 | yield UInt8(self, "options", "Options (0: vertical mirror)") 76 | if self["bpp"].value == 8: 77 | yield PaletteRGB(self, "palette", 256) 78 | if self["codec"].value == 1: 79 | yield Pixels(self, "pixels") 80 | else: 81 | size = (self.size - self.current_size) // 8 82 | if size: 83 | yield RawBytes(self, "raw_pixels", size) 84 | 85 | 86 | -------------------------------------------------------------------------------- /hachoir_parser/game/laf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | LucasArts Font parser. 5 | 6 | Author: Cyril Zorin 7 | Creation date: 1 January 2007 8 | """ 9 | 10 | from hachoir_parser import Parser 11 | from hachoir_core.field import (FieldSet, 12 | UInt8, UInt16, UInt32, GenericVector) 13 | from hachoir_core.endian import LITTLE_ENDIAN 14 | 15 | class CharData(FieldSet): 16 | def __init__(self, chars, *args): 17 | FieldSet.__init__(self, *args) 18 | self.chars = chars 19 | 20 | def createFields(self): 21 | for char in self.chars: 22 | yield CharBitmap(char, self, "char_bitmap[]") 23 | 24 | class CharBitmap(FieldSet): 25 | def __init__(self, char, *args): 26 | FieldSet.__init__(self, *args) 27 | self.char = char 28 | 29 | def createFields(self): 30 | width = self.char["width_pixels"].value 31 | for line in xrange(self.char["height_pixels"].value): 32 | yield GenericVector(self, "line[]", width, 33 | UInt8, "pixel") 34 | 35 | class CharInfo(FieldSet): 36 | static_size = 16 * 8 37 | 38 | def createFields(self): 39 | yield UInt32(self, "data_offset") 40 | yield UInt8(self, "logical_width") 41 | yield UInt8(self, "unknown[]") 42 | yield UInt8(self, "unknown[]") 43 | yield UInt8(self, "unknown[]") 44 | yield UInt32(self, "width_pixels") 45 | yield UInt32(self, "height_pixels") 46 | 47 | class LafFile(Parser): 48 | PARSER_TAGS = { 49 | "id": "lucasarts_font", 50 | "category": "game", 51 | "file_ext" : ("laf",), 52 | "min_size" : 32*8, 53 | "description" : "LucasArts Font" 54 | } 55 | 56 | endian = LITTLE_ENDIAN 57 | 58 | def validate(self): 59 | if self["num_chars"].value != 256: 60 | return "Invalid number of characters (%u)" % self["num_chars"].value 61 | if self["first_char_code"].value != 0: 62 | return "Invalid of code of first character code (%u)" % self["first_char_code"].value 63 | if self["last_char_code"].value != 255: 64 | return "Invalid of code of last character code (%u)" % self["last_char_code"].value 65 | if self["char_codes/char[0]"].value != 0: 66 | return "Invalid character code #0 (%u)" % self["char_codes/char[0]"].value 67 | if self["chars/char[0]/data_offset"].value != 0: 68 | return "Invalid character #0 offset" 69 | return True 70 | 71 | def createFields(self): 72 | yield UInt32(self, "num_chars") 73 | yield UInt32(self, "raw_font_data_size") 74 | yield UInt32(self, "max_char_width") 75 | yield UInt32(self, "min_char_width") 76 | yield UInt32(self, "unknown[]", 4) 77 | yield UInt32(self, "unknown[]", 4) 78 | yield UInt32(self, "first_char_code") 79 | yield UInt32(self, "last_char_code") 80 | 81 | yield GenericVector(self, "char_codes", self["num_chars"].value, 82 | UInt16, "char") 83 | 84 | yield GenericVector(self, "chars", self["num_chars"].value, 85 | CharInfo, "char") 86 | 87 | # character data. we make an effort to provide 88 | # something more meaningful than "RawBytes: 89 | # character bitmap data" 90 | yield CharData(self["chars"], self, "char_data") 91 | 92 | # read to the end 93 | if self.current_size < self._size: 94 | yield self.seekBit(self._size, "unknown[]") 95 | -------------------------------------------------------------------------------- /hachoir_core/field/timestamp.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.tools import (humanDatetime, humanDuration, 2 | timestampUNIX, timestampMac32, timestampUUID60, 3 | timestampWin64, durationWin64) 4 | from hachoir_core.field import Bits, FieldSet 5 | from datetime import datetime 6 | 7 | class GenericTimestamp(Bits): 8 | def __init__(self, parent, name, size, description=None): 9 | Bits.__init__(self, parent, name, size, description) 10 | 11 | def createDisplay(self): 12 | return humanDatetime(self.value) 13 | 14 | def createRawDisplay(self): 15 | value = Bits.createValue(self) 16 | return unicode(value) 17 | 18 | def __nonzero__(self): 19 | return Bits.createValue(self) != 0 20 | 21 | def timestampFactory(cls_name, handler, size): 22 | class Timestamp(GenericTimestamp): 23 | def __init__(self, parent, name, description=None): 24 | GenericTimestamp.__init__(self, parent, name, size, description) 25 | 26 | def createValue(self): 27 | value = Bits.createValue(self) 28 | return handler(value) 29 | cls = Timestamp 30 | cls.__name__ = cls_name 31 | return cls 32 | 33 | TimestampUnix32 = timestampFactory("TimestampUnix32", timestampUNIX, 32) 34 | TimestampUnix64 = timestampFactory("TimestampUnix64", timestampUNIX, 64) 35 | TimestampMac32 = timestampFactory("TimestampUnix32", timestampMac32, 32) 36 | TimestampUUID60 = timestampFactory("TimestampUUID60", timestampUUID60, 60) 37 | TimestampWin64 = timestampFactory("TimestampWin64", timestampWin64, 64) 38 | 39 | class TimeDateMSDOS32(FieldSet): 40 | """ 41 | 32-bit MS-DOS timestamp (16-bit time, 16-bit date) 42 | """ 43 | static_size = 32 44 | 45 | def createFields(self): 46 | # TODO: Create type "MSDOS_Second" : value*2 47 | yield Bits(self, "second", 5, "Second/2") 48 | yield Bits(self, "minute", 6) 49 | yield Bits(self, "hour", 5) 50 | 51 | yield Bits(self, "day", 5) 52 | yield Bits(self, "month", 4) 53 | # TODO: Create type "MSDOS_Year" : value+1980 54 | yield Bits(self, "year", 7, "Number of year after 1980") 55 | 56 | def createValue(self): 57 | return datetime( 58 | 1980+self["year"].value, self["month"].value, self["day"].value, 59 | self["hour"].value, self["minute"].value, 2*self["second"].value) 60 | 61 | def createDisplay(self): 62 | return humanDatetime(self.value) 63 | 64 | class DateTimeMSDOS32(TimeDateMSDOS32): 65 | """ 66 | 32-bit MS-DOS timestamp (16-bit date, 16-bit time) 67 | """ 68 | def createFields(self): 69 | yield Bits(self, "day", 5) 70 | yield Bits(self, "month", 4) 71 | yield Bits(self, "year", 7, "Number of year after 1980") 72 | yield Bits(self, "second", 5, "Second/2") 73 | yield Bits(self, "minute", 6) 74 | yield Bits(self, "hour", 5) 75 | 76 | class TimedeltaWin64(GenericTimestamp): 77 | def __init__(self, parent, name, description=None): 78 | GenericTimestamp.__init__(self, parent, name, 64, description) 79 | 80 | def createDisplay(self): 81 | return humanDuration(self.value) 82 | 83 | def createValue(self): 84 | value = Bits.createValue(self) 85 | return durationWin64(value) 86 | 87 | -------------------------------------------------------------------------------- /pdfminer/lzw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | import sys 3 | try: 4 | from cStringIO import StringIO 5 | except ImportError: 6 | from StringIO import StringIO 7 | 8 | 9 | ## LZWDecoder 10 | ## 11 | class LZWDecoder(object): 12 | 13 | debug = 0 14 | 15 | def __init__(self, fp): 16 | self.fp = fp 17 | self.buff = 0 18 | self.bpos = 8 19 | self.nbits = 9 20 | self.table = None 21 | self.prevbuf = None 22 | return 23 | 24 | def readbits(self, bits): 25 | v = 0 26 | while 1: 27 | # the number of remaining bits we can get from the current buffer. 28 | r = 8-self.bpos 29 | if bits <= r: 30 | # |-----8-bits-----| 31 | # |-bpos-|-bits-| | 32 | # | |----r----| 33 | v = (v<>(r-bits)) & ((1<>sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' % 87 | (self.nbits, code, x, self.table[258:])) 88 | return 89 | 90 | # lzwdecode 91 | def lzwdecode(data): 92 | """ 93 | >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') 94 | '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' 95 | """ 96 | fp = StringIO(data) 97 | return ''.join(LZWDecoder(fp).run()) 98 | 99 | if __name__ == '__main__': 100 | import doctest 101 | doctest.testmod() 102 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | import string 2 | import re 3 | 4 | class parser: 5 | def __init__(self,results,word,file): 6 | self.results=results 7 | self.word=word 8 | self.temp=[] 9 | self.file=file 10 | 11 | def genericClean(self): 12 | self.results = re.sub('', '', self.results) 13 | self.results = re.sub('', '', self.results) 14 | self.results = re.sub('', '', self.results) 15 | self.results = re.sub('', '', self.results) 16 | self.results = re.sub('%2f', ' ', self.results) 17 | self.results = re.sub('%3a', ' ', self.results) 18 | self.results = re.sub('', '', self.results) 19 | self.results = re.sub('', '', self.results) 20 | 21 | 22 | for e in ('>',':','=', '<', '/', '\\',';','&','%3A','%3D','%3C'): 23 | self.results = string.replace(self.results, e, ' ') 24 | 25 | def urlClean(self): 26 | self.results = re.sub('', '', self.results) 27 | self.results = re.sub('', '', self.results) 28 | self.results = re.sub('%2f', ' ', self.results) 29 | self.results = re.sub('%3a', ' ', self.results) 30 | for e in ('<','>',':','=',';','&','%3A','%3D','%3C'): 31 | self.results = string.replace(self.results, e, ' ') 32 | 33 | def emails(self): 34 | self.genericClean() 35 | reg_emails = re.compile('[a-zA-Z0-9.-_]*' + '@' + '[a-zA-Z0-9.-]*' + self.word) 36 | self.temp = reg_emails.findall(self.results) 37 | emails=self.unique() 38 | return emails 39 | 40 | def fileurls(self): 41 | urls=[] 42 | reg_urls = re.compile('[a-zA-Z0-9._ -]* profiles | LinkedIn') 54 | 55 | self.temp = reg_people.findall(self.results) 56 | resul = [] 57 | for x in self.temp: 58 | y = string.replace(x, ' LinkedIn', '') 59 | y = string.replace(y, ' profiles ', '') 60 | y = string.replace(y, 'LinkedIn', '') 61 | y = string.replace(y, '"', '') 62 | y = string.replace(y, '>', '') 63 | if y !=" ": 64 | resul.append(y) 65 | return resul 66 | 67 | def profiles(self): 68 | reg_people = re.compile('">[a-zA-Z0-9._ -]* - Google Profile') 69 | self.temp = reg_people.findall(self.results) 70 | resul = [] 71 | for x in self.temp: 72 | y = string.replace(x, ' Google Profile', '') 73 | y = string.replace(y, '-', '') 74 | y = string.replace(y, '">', '') 75 | if y !=" ": 76 | resul.append(y) 77 | return resul 78 | 79 | 80 | def hostnames(self): 81 | self.genericClean() 82 | reg_hosts = re.compile('[a-zA-Z0-9.-]*\.'+ self.word) 83 | self.temp = reg_hosts.findall(self.results) 84 | hostnames=self.unique() 85 | return hostnames 86 | 87 | def hostnames_all(self): 88 | reg_hosts = re.compile('(.*?)') 89 | temp = reg_hosts.findall(self.results) 90 | for x in temp: 91 | if x.count(':'): 92 | res=x.split(':')[1].split('/')[2] 93 | else: 94 | res=x.split("/")[0] 95 | self.temp.append(res) 96 | hostnames=self.unique() 97 | return hostnames 98 | 99 | def unique(self): 100 | self.new=[] 101 | for x in self.temp: 102 | if x not in self.new: 103 | self.new.append(x) 104 | return self.new 105 | -------------------------------------------------------------------------------- /hachoir_core/field/seekable_field_set.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import BasicFieldSet, GenericFieldSet, ParserError, createRawField 2 | from hachoir_core.error import HACHOIR_ERRORS 3 | 4 | # getgaps(int, int, [listof (int, int)]) -> generator of (int, int) 5 | # Gets all the gaps not covered by a block in `blocks` from `start` for `length` units. 6 | def getgaps(start, length, blocks): 7 | ''' 8 | Example: 9 | >>> list(getgaps(0, 20, [(15,3), (6,2), (6,2), (1,2), (2,3), (11,2), (9,5)])) 10 | [(0, 1), (5, 1), (8, 1), (14, 1), (18, 2)] 11 | ''' 12 | # done this way to avoid mutating the original 13 | blocks = sorted(blocks, key=lambda b: b[0]) 14 | end = start+length 15 | for s, l in blocks: 16 | if s > start: 17 | yield (start, s-start) 18 | start = s 19 | if s+l > start: 20 | start = s+l 21 | if start < end: 22 | yield (start, end-start) 23 | 24 | class RootSeekableFieldSet(GenericFieldSet): 25 | def seekBit(self, address, relative=True): 26 | if not relative: 27 | address -= self.absolute_address 28 | if address < 0: 29 | raise ParserError("Seek below field set start (%s.%s)" % divmod(address, 8)) 30 | self._current_size = address 31 | return None 32 | 33 | def seekByte(self, address, relative=True): 34 | return self.seekBit(address*8, relative) 35 | 36 | def _fixLastField(self): 37 | """ 38 | Try to fix last field when we know current field set size. 39 | Returns new added field if any, or None. 40 | """ 41 | assert self._size is not None 42 | 43 | # Stop parser 44 | message = ["stop parser"] 45 | self._field_generator = None 46 | 47 | # If last field is too big, delete it 48 | while self._size < self._current_size: 49 | field = self._deleteField(len(self._fields)-1) 50 | message.append("delete field %s" % field.path) 51 | assert self._current_size <= self._size 52 | 53 | blocks = [(x.absolute_address, x.size) for x in self._fields] 54 | fields = [] 55 | self._size = max(self._size, max(a+b for a,b in blocks) - self.absolute_address) 56 | for start, length in getgaps(self.absolute_address, self._size, blocks): 57 | self.seekBit(start, relative=False) 58 | field = createRawField(self, length, "unparsed[]") 59 | self.setUniqueFieldName(field) 60 | self._fields.append(field.name, field) 61 | fields.append(field) 62 | message.append("found unparsed segment: start %s, length %s" % (start, length)) 63 | self.seekBit(self._size + self.absolute_address, relative=False) 64 | message = ", ".join(message) 65 | if fields: 66 | self.warning("[Autofix] Fix parser error: " + message) 67 | return fields 68 | 69 | def _stopFeeding(self): 70 | new_field = None 71 | if self._size is None: 72 | if self._parent: 73 | self._size = self._current_size 74 | 75 | new_field = self._fixLastField() 76 | self._field_generator = None 77 | return new_field 78 | 79 | class SeekableFieldSet(RootSeekableFieldSet): 80 | def __init__(self, parent, name, description=None, size=None): 81 | assert issubclass(parent.__class__, BasicFieldSet) 82 | RootSeekableFieldSet.__init__(self, parent, name, parent.stream, description, size) 83 | -------------------------------------------------------------------------------- /hachoir_parser/audio/au.py: -------------------------------------------------------------------------------- 1 | """ 2 | AU audio file parser 3 | 4 | Author: Victor Stinner 5 | Creation: 12 july 2006 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import UInt32, Enum, String, RawBytes 10 | from hachoir_core.endian import BIG_ENDIAN 11 | from hachoir_core.text_handler import displayHandler, filesizeHandler 12 | from hachoir_core.tools import createDict, humanFrequency 13 | 14 | class AuFile(Parser): 15 | PARSER_TAGS = { 16 | "id": "sun_next_snd", 17 | "category": "audio", 18 | "file_ext": ("au", "snd"), 19 | "mime": (u"audio/basic",), 20 | "min_size": 24*8, 21 | "magic": ((".snd", 0),), 22 | "description": "Sun/NeXT audio" 23 | } 24 | endian = BIG_ENDIAN 25 | 26 | CODEC_INFO = { 27 | 1: (8, u"8-bit ISDN u-law"), 28 | 2: (8, u"8-bit linear PCM"), 29 | 3: (16, u"16-bit linear PCM"), 30 | 4: (24, u"24-bit linear PCM"), 31 | 5: (32, u"32-bit linear PCM"), 32 | 6: (32, u"32-bit IEEE floating point"), 33 | 7: (64, u"64-bit IEEE floating point"), 34 | 8: (None, u"Fragmented sample data"), 35 | 9: (None, u"DSP program"), 36 | 10: (8, u"8-bit fixed point"), 37 | 11: (16, u"16-bit fixed point"), 38 | 12: (24, u"24-bit fixed point"), 39 | 13: (32, u"32-bit fixed point"), 40 | 18: (16, u"16-bit linear with emphasis"), 41 | 19: (16, u"16-bit linear compressed"), 42 | 20: (16, u"16-bit linear with emphasis and compression"), 43 | 21: (None, u"Music kit DSP commands"), 44 | 23: (None, u"4-bit ISDN u-law compressed (CCITT G.721 ADPCM)"), 45 | 24: (None, u"ITU-T G.722 ADPCM"), 46 | 25: (None, u"ITU-T G.723 3-bit ADPCM"), 47 | 26: (None, u"ITU-T G.723 5-bit ADPCM"), 48 | 27: (8, u"8-bit ISDN A-law"), 49 | } 50 | 51 | # Create bit rate and codec name dictionnaries 52 | BITS_PER_SAMPLE = createDict(CODEC_INFO, 0) 53 | CODEC_NAME = createDict(CODEC_INFO, 1) 54 | 55 | VALID_NB_CHANNEL = set((1,2)) # FIXME: 4, 5, 7, 8 channels are supported? 56 | 57 | def validate(self): 58 | if self.stream.readBytes(0, 4) != ".snd": 59 | return "Wrong file signature" 60 | if self["channels"].value not in self.VALID_NB_CHANNEL: 61 | return "Invalid number of channel" 62 | return True 63 | 64 | def getBitsPerSample(self): 65 | """ 66 | Get bit rate (number of bit per sample per channel), 67 | may returns None if you unable to compute it. 68 | """ 69 | return self.BITS_PER_SAMPLE.get(self["codec"].value) 70 | 71 | def createFields(self): 72 | yield String(self, "signature", 4, 'Format signature (".snd")', charset="ASCII") 73 | yield UInt32(self, "data_ofs", "Data offset") 74 | yield filesizeHandler(UInt32(self, "data_size", "Data size")) 75 | yield Enum(UInt32(self, "codec", "Audio codec"), self.CODEC_NAME) 76 | yield displayHandler(UInt32(self, "sample_rate", "Number of samples/second"), humanFrequency) 77 | yield UInt32(self, "channels", "Number of interleaved channels") 78 | 79 | size = self["data_ofs"].value - self.current_size // 8 80 | if 0 < size: 81 | yield String(self, "info", size, "Information", strip=" \0", charset="ISO-8859-1") 82 | 83 | size = min(self["data_size"].value, (self.size - self.current_size) // 8) 84 | yield RawBytes(self, "audio_data", size, "Audio data") 85 | 86 | def createContentSize(self): 87 | return (self["data_ofs"].value + self["data_size"].value) * 8 88 | 89 | -------------------------------------------------------------------------------- /hachoir_parser/video/amf.py: -------------------------------------------------------------------------------- 1 | """ 2 | AMF metadata (inside Flash video, FLV file) parser. 3 | 4 | Documentation: 5 | 6 | - flashticle: Python project to read Flash (formats SWF, FLV and AMF) 7 | http://undefined.org/python/#flashticle 8 | 9 | Author: Victor Stinner 10 | Creation date: 4 november 2006 11 | """ 12 | 13 | from hachoir_core.field import (FieldSet, ParserError, 14 | UInt8, UInt16, UInt32, PascalString16, Float64) 15 | from hachoir_core.tools import timestampUNIX 16 | 17 | def parseUTF8(parent): 18 | yield PascalString16(parent, "value", charset="UTF-8") 19 | 20 | def parseDouble(parent): 21 | yield Float64(parent, "value") 22 | 23 | def parseBool(parent): 24 | yield UInt8(parent, "value") 25 | 26 | def parseArray(parent): 27 | yield UInt32(parent, "count") 28 | for index in xrange(parent["count"].value): 29 | yield AMFObject(parent, "item[]") 30 | 31 | def parseObjectAttributes(parent): 32 | while True: 33 | item = Attribute(parent, "attr[]") 34 | yield item 35 | if item["key"].value == "": 36 | break 37 | 38 | def parseMixedArray(parent): 39 | yield UInt32(parent, "count") 40 | for index in xrange(parent["count"].value + 1): 41 | item = Attribute(parent, "item[]") 42 | yield item 43 | if not item['key'].value: 44 | break 45 | 46 | def parseDate(parent): 47 | yield Float64(parent, "timestamp_microsec") 48 | yield UInt16(parent, "timestamp_sec") 49 | 50 | def parseNothing(parent): 51 | raise StopIteration() 52 | 53 | class AMFObject(FieldSet): 54 | CODE_DATE = 11 55 | tag_info = { 56 | # http://osflash.org/amf/astypes 57 | 0: (parseDouble, "Double"), 58 | 1: (parseBool, "Boolean"), 59 | 2: (parseUTF8, "UTF-8 string"), 60 | 3: (parseObjectAttributes, "Object attributes"), 61 | #MOVIECLIP = '\x04', 62 | #NULL = '\x05', 63 | #UNDEFINED = '\x06', 64 | #REFERENCE = '\x07', 65 | 8: (parseMixedArray, "Mixed array"), 66 | 9: (parseNothing, "End of object"), 67 | 10: (parseArray, "Array"), 68 | CODE_DATE: (parseDate, "Date"), 69 | #LONGUTF8 = '\x0c', 70 | #UNSUPPORTED = '\x0d', 71 | ## Server-to-client only 72 | #RECORDSET = '\x0e', 73 | #XML = '\x0f', 74 | #TYPEDOBJECT = '\x10', 75 | } 76 | 77 | def __init__(self, *args, **kw): 78 | FieldSet.__init__(self, *args, **kw) 79 | code = self["type"].value 80 | try: 81 | self.parser, desc = self.tag_info[code] 82 | if code == self.CODE_DATE: 83 | self.createValue = self.createValueDate 84 | except KeyError: 85 | raise ParserError("AMF: Unable to parse type %s" % code) 86 | 87 | def createFields(self): 88 | yield UInt8(self, "type") 89 | for field in self.parser(self): 90 | yield field 91 | 92 | def createValueDate(self): 93 | value = (self["timestamp_microsec"].value * 0.001) \ 94 | - (self["timestamp_sec"].value * 60) 95 | return timestampUNIX(value) 96 | 97 | class Attribute(AMFObject): 98 | def __init__(self, *args): 99 | AMFObject.__init__(self, *args) 100 | self._description = None 101 | 102 | def createFields(self): 103 | yield PascalString16(self, "key", charset="UTF-8") 104 | yield UInt8(self, "type") 105 | for field in self.parser(self): 106 | yield field 107 | 108 | def createDescription(self): 109 | return 'Attribute "%s"' % self["key"].value 110 | 111 | -------------------------------------------------------------------------------- /myparser.py: -------------------------------------------------------------------------------- 1 | import string 2 | import re 3 | 4 | class parser: 5 | def __init__(self,results,word=""): 6 | self.results=results 7 | self.word=word 8 | self.temp=[] 9 | self.file=file 10 | 11 | def genericClean(self): 12 | self.results = re.sub('', '', self.results) 13 | self.results = re.sub('', '', self.results) 14 | self.results = re.sub('', '', self.results) 15 | self.results = re.sub('', '', self.results) 16 | self.results = re.sub('%2f', ' ', self.results) 17 | self.results = re.sub('%3a', ' ', self.results) 18 | self.results = re.sub('', '', self.results) 19 | self.results = re.sub('', '', self.results) 20 | self.results = re.sub('',' ',self.results) 21 | 22 | 23 | for e in ('>',':','=', '<', '/', '\\',';','&','%3A','%3D','%3C'): 24 | self.results = string.replace(self.results, e, ' ') 25 | 26 | def urlClean(self): 27 | self.results = re.sub('', '', self.results) 28 | self.results = re.sub('', '', self.results) 29 | self.results = re.sub('%2f', ' ', self.results) 30 | self.results = re.sub('%3a', ' ', self.results) 31 | for e in ('<','>',':','=',';','&','%3A','%3D','%3C'): 32 | self.results = string.replace(self.results, e, ' ') 33 | 34 | def emails(self): 35 | self.genericClean() 36 | reg_emails = re.compile('[a-zA-Z0-9.-_]+' + '@' + '[a-zA-Z0-9.-]+') 37 | self.temp = reg_emails.findall(self.results) 38 | emails=self.unique() 39 | return emails 40 | 41 | def fileurls(self): 42 | urls=[] 43 | reg_urls = re.compile('[a-zA-Z0-9._ -]* profiles | LinkedIn') 57 | 58 | self.temp = reg_people.findall(self.results) 59 | resul = [] 60 | for x in self.temp: 61 | y = string.replace(x, ' LinkedIn', '') 62 | y = string.replace(y, ' profiles ', '') 63 | y = string.replace(y, 'LinkedIn', '') 64 | y = string.replace(y, '"', '') 65 | y = string.replace(y, '>', '') 66 | if y !=" ": 67 | resul.append(y) 68 | return resul 69 | 70 | def profiles(self): 71 | reg_people = re.compile('">[a-zA-Z0-9._ -]* - Google Profile') 72 | self.temp = reg_people.findall(self.results) 73 | resul = [] 74 | for x in self.temp: 75 | y = string.replace(x, ' Google Profile', '') 76 | y = string.replace(y, '-', '') 77 | y = string.replace(y, '">', '') 78 | if y !=" ": 79 | resul.append(y) 80 | return resul 81 | 82 | 83 | def hostnames(self): 84 | self.genericClean() 85 | reg_hosts = re.compile('[a-zA-Z0-9.-]*\.'+ self.word) 86 | self.temp = reg_hosts.findall(self.results) 87 | hosts=self.unique() 88 | return hosts 89 | 90 | def hostnames_all(self): 91 | reg_hosts = re.compile('(.*?)') 92 | temp = reg_hosts.findall(self.results) 93 | for x in temp: 94 | if x.count(':'): 95 | res=x.split(':')[1].split('/')[2] 96 | else: 97 | res=x.split("/")[0] 98 | self.temp.append(res) 99 | hostnames=self.unique() 100 | return hostnames 101 | 102 | def unique(self): 103 | self.new=[] 104 | for x in self.temp: 105 | if x not in self.new: 106 | self.new.append(x) 107 | return self.new 108 | -------------------------------------------------------------------------------- /extractors/metadataPDF.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # 3 | # metadataPDF.py - dump pdf metadata 4 | # 5 | # Copy of Yusuke's dumppdf to add dumpmeta 6 | import sys, re, os 7 | from pdfminer.psparser import PSKeyword, PSLiteral 8 | from pdfminer.pdfparser import PDFDocument, PDFParser 9 | from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value 10 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf 11 | from pdfminer.pdfdevice import PDFDevice, TagExtractor 12 | from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter 13 | from pdfminer.cmapdb import CMapDB 14 | from pdfminer.layout import LAParams 15 | import myparser 16 | 17 | 18 | # dumpmeta 19 | class metapdf: 20 | def __init__(self,fname, password=''): 21 | self.fname=fname 22 | self.password=password 23 | self.metadata='' 24 | self.users=[] 25 | self.software=[] 26 | self.paths=[] 27 | self.raw="" 28 | self.company=[] 29 | self.text="" 30 | 31 | def getTexts(self): 32 | try: 33 | password ='' 34 | pagenos = set() 35 | maxpages = 0 36 | codec = 'utf-8' 37 | caching = True 38 | laparams = LAParams() 39 | rsrcmgr = PDFResourceManager(caching=caching) 40 | outfp = file('temppdf.txt','w') 41 | device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) 42 | fname= self.fname 43 | fp = file(fname, 'rb') 44 | process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True) 45 | fp.close() 46 | device.close() 47 | outfp.close() 48 | infp = file('temppdf.txt','rb') 49 | test=infp.read() 50 | infp.close() 51 | os.remove('temppdf.txt') 52 | self.text=test 53 | return "ok" 54 | except Exception,e: 55 | return e 56 | 57 | def getData(self): 58 | doc = PDFDocument() 59 | fp = file(self.fname, 'rb') 60 | parser = PDFParser(fp) 61 | try: 62 | parser.set_document(doc) 63 | doc.set_parser(parser) 64 | doc.initialize(self.password) 65 | except: 66 | return "error" 67 | 68 | parser.close() 69 | fp.close() 70 | #try: 71 | # metadata = resolve1(doc.catalog['Metadata']) 72 | # return "ok" 73 | #except: 74 | # print "[x] Error in PDF extractor, Metadata catalog" 75 | try: 76 | for xref in doc.xrefs: 77 | info_ref=xref.trailer.get('Info') 78 | if info_ref: 79 | info=resolve1(info_ref) 80 | self.metadata=info 81 | self.raw = info 82 | if self.raw == None: 83 | return "Empty metadata" 84 | else: 85 | return "ok" 86 | except Exception,e: 87 | return e 88 | print "\t [x] Error in PDF extractor, Trailer Info" 89 | 90 | def getEmails(self): 91 | em=myparser.parser(self.text) 92 | return em.emails() 93 | 94 | def getHosts(self,domain): 95 | em=myparser.parser(self.text,domain) 96 | return em.hostnames() 97 | 98 | def getUsers(self): 99 | if self.metadata.has_key('Author'): 100 | self.users.append(self.metadata['Author']) 101 | return self.users 102 | def getCompany(self): 103 | try: 104 | self.users.append(self.metadata['Company']) 105 | except: 106 | print "\t [x] Error in PDF metadata Company" 107 | return self.company 108 | 109 | 110 | def getSoftware(self): 111 | try: 112 | self.software.append(self.metadata['Producer']) 113 | except: 114 | print "\t [x] Error in PDF metadata Software" 115 | try: 116 | self.software.append(self.metadata['Creator']) 117 | except: 118 | print "\t [x] Error in PDF metadata Creator" 119 | return self.software 120 | 121 | def getPaths(self): 122 | return self.paths 123 | 124 | def getRaw(self): 125 | return self.raw 126 | -------------------------------------------------------------------------------- /hachoir_core/field/link.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import Field, FieldSet, ParserError, Bytes, MissingField 2 | from hachoir_core.stream import FragmentedStream 3 | 4 | 5 | class Link(Field): 6 | def __init__(self, parent, name, *args, **kw): 7 | Field.__init__(self, parent, name, 0, *args, **kw) 8 | 9 | def hasValue(self): 10 | return True 11 | 12 | def createValue(self): 13 | return self._parent[self.display] 14 | 15 | def createDisplay(self): 16 | value = self.value 17 | if value is None: 18 | return "<%s>" % MissingField.__name__ 19 | return value.path 20 | 21 | def _getField(self, name, const): 22 | target = self.value 23 | assert self != target 24 | return target._getField(name, const) 25 | 26 | 27 | class Fragments: 28 | def __init__(self, first): 29 | self.first = first 30 | 31 | def __iter__(self): 32 | fragment = self.first 33 | while fragment is not None: 34 | data = fragment.getData() 35 | yield data and data.size 36 | fragment = fragment.next 37 | 38 | 39 | class Fragment(FieldSet): 40 | _first = None 41 | 42 | def __init__(self, *args, **kw): 43 | FieldSet.__init__(self, *args, **kw) 44 | self._field_generator = self._createFields(self._field_generator) 45 | if self.__class__.createFields == Fragment.createFields: 46 | self._getData = lambda: self 47 | 48 | def getData(self): 49 | try: 50 | return self._getData() 51 | except MissingField, e: 52 | self.error(str(e)) 53 | return None 54 | 55 | def setLinks(self, first, next=None): 56 | self._first = first or self 57 | self._next = next 58 | self._feedLinks = lambda: self 59 | return self 60 | 61 | def _feedLinks(self): 62 | while self._first is None and self.readMoreFields(1): 63 | pass 64 | if self._first is None: 65 | raise ParserError("first is None") 66 | return self 67 | first = property(lambda self: self._feedLinks()._first) 68 | 69 | def _getNext(self): 70 | next = self._feedLinks()._next 71 | if callable(next): 72 | self._next = next = next() 73 | return next 74 | next = property(_getNext) 75 | 76 | def _createInputStream(self, **args): 77 | first = self.first 78 | if first is self and hasattr(first, "_getData"): 79 | return FragmentedStream(first, packets=Fragments(first), **args) 80 | return FieldSet._createInputStream(self, **args) 81 | 82 | def _createFields(self, field_generator): 83 | if self._first is None: 84 | for field in field_generator: 85 | if self._first is not None: 86 | break 87 | yield field 88 | else: 89 | raise ParserError("Fragment.setLinks not called") 90 | else: 91 | field = None 92 | if self._first is not self: 93 | link = Link(self, "first", None) 94 | link._getValue = lambda: self._first 95 | yield link 96 | if self._next: 97 | link = Link(self, "next", None) 98 | link.createValue = self._getNext 99 | yield link 100 | if field: 101 | yield field 102 | for field in field_generator: 103 | yield field 104 | 105 | def createFields(self): 106 | if self._size is None: 107 | self._size = self._getSize() 108 | yield Bytes(self, "data", self._size/8) 109 | 110 | -------------------------------------------------------------------------------- /hachoir_parser/program/exe_ne.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import (FieldSet, 2 | Bit, UInt8, UInt16, UInt32, Bytes, 3 | PaddingBits, PaddingBytes, NullBits, NullBytes) 4 | from hachoir_core.text_handler import textHandler, hexadecimal, filesizeHandler 5 | 6 | class NE_Header(FieldSet): 7 | static_size = 64*8 8 | def createFields(self): 9 | yield Bytes(self, "signature", 2, "New executable signature (NE)") 10 | yield UInt8(self, "link_ver", "Linker version number") 11 | yield UInt8(self, "link_rev", "Linker revision number") 12 | yield UInt16(self, "entry_table_ofst", "Offset to the entry table") 13 | yield UInt16(self, "entry_table_size", "Length (in bytes) of the entry table") 14 | yield PaddingBytes(self, "reserved[]", 4) 15 | 16 | yield Bit(self, "is_dll", "Is a dynamic-link library (DLL)?") 17 | yield Bit(self, "is_win_app", "Is a Windows application?") 18 | yield PaddingBits(self, "reserved[]", 9) 19 | yield Bit(self, "first_seg_code", "First segment contains code that loads the application?") 20 | yield NullBits(self, "reserved[]", 1) 21 | yield Bit(self, "link_error", "Load even if linker detects errors?") 22 | yield NullBits(self, "reserved[]", 1) 23 | yield Bit(self, "is_lib", "Is a library module?") 24 | 25 | yield UInt16(self, "auto_data_seg", "Automatic data segment number") 26 | yield filesizeHandler(UInt16(self, "local_heap_size", "Initial size (in bytes) of the local heap")) 27 | yield filesizeHandler(UInt16(self, "stack_size", "Initial size (in bytes) of the stack")) 28 | yield textHandler(UInt32(self, "cs_ip", "Value of CS:IP"), hexadecimal) 29 | yield textHandler(UInt32(self, "ss_sp", "Value of SS:SP"), hexadecimal) 30 | 31 | yield UInt16(self, "nb_entry_seg_tab", "Number of entries in the segment table") 32 | yield UInt16(self, "nb_entry_modref_tab", "Number of entries in the module-reference table") 33 | yield filesizeHandler(UInt16(self, "size_nonres_name_tab", "Number of bytes in the nonresident-name table")) 34 | yield UInt16(self, "seg_tab_ofs", "Segment table offset") 35 | yield UInt16(self, "rsrc_ofs", "Resource offset") 36 | 37 | yield UInt16(self, "res_name_tab_ofs", "Resident-name table offset") 38 | yield UInt16(self, "mod_ref_tab_ofs", "Module-reference table offset") 39 | yield UInt16(self, "import_tab_ofs", "Imported-name table offset") 40 | 41 | yield UInt32(self, "non_res_name_tab_ofs", "Nonresident-name table offset") 42 | yield UInt16(self, "nb_mov_ent_pt", "Number of movable entry points") 43 | yield UInt16(self, "log2_sector_size", "Log2 of the segment sector size") 44 | yield UInt16(self, "nb_rsrc_seg", "Number of resource segments") 45 | 46 | yield Bit(self, "unknown_os_format", "Operating system format is unknown") 47 | yield PaddingBits(self, "reserved[]", 1) 48 | yield Bit(self, "os_windows", "Operating system is Microsoft Windows") 49 | yield NullBits(self, "reserved[]", 6) 50 | yield Bit(self, "is_win20_prot", "Is Windows 2.x application running in version 3.x protected mode") 51 | yield Bit(self, "is_win20_font", "Is Windows 2.x application supporting proportional fonts") 52 | yield Bit(self, "fast_load", "Contains a fast-load area?") 53 | yield NullBits(self, "reserved[]", 4) 54 | 55 | yield UInt16(self, "fastload_ofs", "Fast-load area offset (in sector)") 56 | yield UInt16(self, "fastload_size", "Fast-load area length (in sector)") 57 | 58 | yield NullBytes(self, "reserved[]", 2) 59 | yield textHandler(UInt16(self, "win_version", "Expected Windows version number"), hexadecimal) 60 | 61 | -------------------------------------------------------------------------------- /hachoir_core/field/float.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import Bit, Bits, FieldSet 2 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN 3 | import struct 4 | 5 | # Make sure that we use right struct types 6 | assert struct.calcsize("f") == 4 7 | assert struct.calcsize("d") == 8 8 | assert struct.unpack("d", "\xc0\0\0\0\0\0\0\0")[0] == -2.0 10 | 11 | class FloatMantissa(Bits): 12 | def createValue(self): 13 | value = Bits.createValue(self) 14 | return 1 + float(value) / (2 ** self.size) 15 | 16 | def createRawDisplay(self): 17 | return unicode(Bits.createValue(self)) 18 | 19 | class FloatExponent(Bits): 20 | def __init__(self, parent, name, size): 21 | Bits.__init__(self, parent, name, size) 22 | self.bias = 2 ** (size-1) - 1 23 | 24 | def createValue(self): 25 | return Bits.createValue(self) - self.bias 26 | 27 | def createRawDisplay(self): 28 | return unicode(self.value + self.bias) 29 | 30 | def floatFactory(name, format, mantissa_bits, exponent_bits, doc): 31 | size = 1 + mantissa_bits + exponent_bits 32 | 33 | class Float(FieldSet): 34 | static_size = size 35 | __doc__ = doc 36 | 37 | def __init__(self, parent, name, description=None): 38 | assert parent.endian in (BIG_ENDIAN, LITTLE_ENDIAN) 39 | FieldSet.__init__(self, parent, name, description, size) 40 | if format: 41 | if self._parent.endian == BIG_ENDIAN: 42 | self.struct_format = ">"+format 43 | else: 44 | self.struct_format = "<"+format 45 | else: 46 | self.struct_format = None 47 | 48 | def createValue(self): 49 | """ 50 | Create float value: use struct.unpack() when it's possible 51 | (32 and 64-bit float) or compute it with : 52 | mantissa * (2.0 ** exponent) 53 | 54 | This computation may raise an OverflowError. 55 | """ 56 | if self.struct_format: 57 | raw = self._parent.stream.readBytes( 58 | self.absolute_address, self._size//8) 59 | try: 60 | return struct.unpack(self.struct_format, raw)[0] 61 | except struct.error, err: 62 | raise ValueError("[%s] conversion error: %s" % 63 | (self.__class__.__name__, err)) 64 | else: 65 | try: 66 | value = self["mantissa"].value * (2.0 ** float(self["exponent"].value)) 67 | if self["negative"].value: 68 | return -(value) 69 | else: 70 | return value 71 | except OverflowError: 72 | raise ValueError("[%s] floating point overflow" % 73 | self.__class__.__name__) 74 | 75 | def createFields(self): 76 | yield Bit(self, "negative") 77 | yield FloatExponent(self, "exponent", exponent_bits) 78 | if 64 <= mantissa_bits: 79 | yield Bit(self, "one") 80 | yield FloatMantissa(self, "mantissa", mantissa_bits-1) 81 | else: 82 | yield FloatMantissa(self, "mantissa", mantissa_bits) 83 | 84 | cls = Float 85 | cls.__name__ = name 86 | return cls 87 | 88 | # 32-bit float (standard: IEEE 754/854) 89 | Float32 = floatFactory("Float32", "f", 23, 8, 90 | "Floating point number: format IEEE 754 int 32 bit") 91 | 92 | # 64-bit float (standard: IEEE 754/854) 93 | Float64 = floatFactory("Float64", "d", 52, 11, 94 | "Floating point number: format IEEE 754 in 64 bit") 95 | 96 | # 80-bit float (standard: IEEE 754/854) 97 | Float80 = floatFactory("Float80", None, 64, 15, 98 | "Floating point number: format IEEE 754 in 80 bit") 99 | 100 | -------------------------------------------------------------------------------- /hachoir_parser/video/mpeg_ts.py: -------------------------------------------------------------------------------- 1 | """ 2 | MPEG-2 Transport Stream parser. 3 | 4 | Documentation: 5 | - MPEG-2 Transmission 6 | http://erg.abdn.ac.uk/research/future-net/digital-video/mpeg2-trans.html 7 | 8 | Author: Victor Stinner 9 | Creation date: 13 january 2007 10 | """ 11 | 12 | from hachoir_parser import Parser 13 | from hachoir_core.field import (FieldSet, ParserError, MissingField, 14 | UInt8, Enum, Bit, Bits, RawBytes) 15 | from hachoir_core.endian import BIG_ENDIAN 16 | from hachoir_core.text_handler import textHandler, hexadecimal 17 | 18 | class Packet(FieldSet): 19 | def __init__(self, *args): 20 | FieldSet.__init__(self, *args) 21 | if self["has_error"].value: 22 | self._size = 204*8 23 | else: 24 | self._size = 188*8 25 | 26 | PID = { 27 | 0x0000: "Program Association Table (PAT)", 28 | 0x0001: "Conditional Access Table (CAT)", 29 | # 0x0002..0x000f: reserved 30 | # 0x0010..0x1FFE: network PID, program map PID, elementary PID, etc. 31 | # TODO: Check above values 32 | #0x0044: "video", 33 | #0x0045: "audio", 34 | 0x1FFF: "Null packet", 35 | } 36 | 37 | def createFields(self): 38 | yield textHandler(UInt8(self, "sync", 8), hexadecimal) 39 | if self["sync"].value != 0x47: 40 | raise ParserError("MPEG-2 TS: Invalid synchronization byte") 41 | yield Bit(self, "has_error") 42 | yield Bit(self, "payload_unit_start") 43 | yield Bit(self, "priority") 44 | yield Enum(textHandler(Bits(self, "pid", 13, "Program identifier"), hexadecimal), self.PID) 45 | yield Bits(self, "scrambling_control", 2) 46 | yield Bit(self, "has_adaptation") 47 | yield Bit(self, "has_payload") 48 | yield Bits(self, "counter", 4) 49 | yield RawBytes(self, "payload", 184) 50 | if self["has_error"].value: 51 | yield RawBytes(self, "error_correction", 16) 52 | 53 | def createDescription(self): 54 | text = "Packet: PID %s" % self["pid"].display 55 | if self["payload_unit_start"].value: 56 | text += ", start of payload" 57 | return text 58 | 59 | def isValid(self): 60 | if not self["has_payload"].value and not self["has_adaptation"].value: 61 | return u"No payload and no adaptation" 62 | pid = self["pid"].value 63 | if (0x0002 <= pid <= 0x000f) or (0x2000 <= pid): 64 | return u"Invalid program identifier (%s)" % self["pid"].display 65 | return "" 66 | 67 | class MPEG_TS(Parser): 68 | PARSER_TAGS = { 69 | "id": "mpeg_ts", 70 | "category": "video", 71 | "file_ext": ("ts",), 72 | "min_size": 188*8, 73 | "description": u"MPEG-2 Transport Stream" 74 | } 75 | endian = BIG_ENDIAN 76 | 77 | def validate(self): 78 | sync = self.stream.searchBytes("\x47", 0, 204*8) 79 | if sync is None: 80 | return "Unable to find synchronization byte" 81 | for index in xrange(5): 82 | try: 83 | packet = self["packet[%u]" % index] 84 | except (ParserError, MissingField): 85 | if index and self.eof: 86 | return True 87 | else: 88 | return "Unable to get packet #%u" % index 89 | err = packet.isValid() 90 | if err: 91 | return "Packet #%u is invalid: %s" % (index, err) 92 | return True 93 | 94 | def createFields(self): 95 | while not self.eof: 96 | sync = self.stream.searchBytes("\x47", self.current_size, self.current_size+204*8) 97 | if sync is None: 98 | raise ParserError("Unable to find synchronization byte") 99 | elif sync: 100 | yield RawBytes(self, "incomplete_packet[]", (sync-self.current_size)//8) 101 | yield Packet(self, "packet[]") 102 | 103 | -------------------------------------------------------------------------------- /hachoir_parser/network/common.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import FieldSet, Field, Bits 2 | from hachoir_core.bits import str2hex 3 | from hachoir_parser.network.ouid import REGISTERED_OUID 4 | from hachoir_core.endian import BIG_ENDIAN 5 | from socket import gethostbyaddr, herror as socket_host_error 6 | 7 | def ip2name(addr): 8 | if not ip2name.resolve: 9 | return addr 10 | try: 11 | if addr in ip2name.cache: 12 | return ip2name.cache[addr] 13 | # FIXME: Workaround Python bug 14 | # Need double try/except to catch the bug 15 | try: 16 | name = gethostbyaddr(addr)[0] 17 | except KeyboardInterrupt: 18 | raise 19 | except (socket_host_error, ValueError): 20 | name = addr 21 | except (socket_host_error, KeyboardInterrupt, ValueError): 22 | ip2name.resolve = False 23 | name = addr 24 | ip2name.cache[addr] = name 25 | return name 26 | ip2name.cache = {} 27 | ip2name.resolve = True 28 | 29 | class IPv4_Address(Field): 30 | def __init__(self, parent, name, description=None): 31 | Field.__init__(self, parent, name, 32, description) 32 | 33 | def createValue(self): 34 | value = self._parent.stream.readBytes(self.absolute_address, 4) 35 | return ".".join(( "%u" % ord(byte) for byte in value )) 36 | 37 | def createDisplay(self): 38 | return ip2name(self.value) 39 | 40 | class IPv6_Address(Field): 41 | def __init__(self, parent, name, description=None): 42 | Field.__init__(self, parent, name, 128, description) 43 | 44 | def createValue(self): 45 | value = self._parent.stream.readBits(self.absolute_address, 128, self.parent.endian) 46 | parts = [] 47 | for index in xrange(8): 48 | part = "%04x" % (value & 0xffff) 49 | value >>= 16 50 | parts.append(part) 51 | return ':'.join(reversed(parts)) 52 | 53 | def createDisplay(self): 54 | return self.value 55 | 56 | class OrganizationallyUniqueIdentifier(Bits): 57 | """ 58 | IEEE 24-bit Organizationally unique identifier 59 | """ 60 | static_size = 24 61 | 62 | def __init__(self, parent, name, description=None): 63 | Bits.__init__(self, parent, name, 24, description=None) 64 | 65 | def createDisplay(self, human=True): 66 | if human: 67 | key = self.value 68 | if key in REGISTERED_OUID: 69 | return REGISTERED_OUID[key] 70 | else: 71 | return self.raw_display 72 | else: 73 | return self.raw_display 74 | 75 | def createRawDisplay(self): 76 | value = self.value 77 | a = value >> 16 78 | b = (value >> 8) & 0xFF 79 | c = value & 0xFF 80 | return "%02X-%02X-%02X" % (a, b, c) 81 | 82 | class NIC24(Bits): 83 | static_size = 24 84 | 85 | def __init__(self, parent, name, description=None): 86 | Bits.__init__(self, parent, name, 24, description=None) 87 | 88 | def createDisplay(self): 89 | value = self.value 90 | a = value >> 16 91 | b = (value >> 8) & 0xFF 92 | c = value & 0xFF 93 | return "%02x:%02x:%02x" % (a, b, c) 94 | 95 | def createRawDisplay(self): 96 | return "0x%06X" % self.value 97 | 98 | class MAC48_Address(FieldSet): 99 | """ 100 | IEEE 802 48-bit MAC address 101 | """ 102 | static_size = 48 103 | endian = BIG_ENDIAN 104 | 105 | def createFields(self): 106 | yield OrganizationallyUniqueIdentifier(self, "organization") 107 | yield NIC24(self, "nic") 108 | 109 | def hasValue(self): 110 | return True 111 | 112 | def createValue(self): 113 | bytes = self.stream.readBytes(self.absolute_address, 6) 114 | return str2hex(bytes, format="%02x:")[:-1] 115 | 116 | def createDisplay(self): 117 | return "%s [%s]" % (self["organization"].display, self["nic"].display) 118 | 119 | -------------------------------------------------------------------------------- /hachoir_parser/audio/real_audio.py: -------------------------------------------------------------------------------- 1 | """ 2 | RealAudio (.ra) parser 3 | 4 | Author: Mike Melanson 5 | References: 6 | http://wiki.multimedia.cx/index.php?title=RealMedia 7 | Samples: 8 | http://samples.mplayerhq.hu/real/RA/ 9 | """ 10 | 11 | from hachoir_parser import Parser 12 | from hachoir_core.field import (FieldSet, 13 | UInt8, UInt16, UInt32, 14 | Bytes, RawBytes, String, 15 | PascalString8) 16 | from hachoir_core.tools import humanFrequency 17 | from hachoir_core.text_handler import displayHandler 18 | from hachoir_core.endian import BIG_ENDIAN 19 | 20 | class Metadata(FieldSet): 21 | def createFields(self): 22 | yield PascalString8(self, "title", charset="ISO-8859-1") 23 | yield PascalString8(self, "author", charset="ISO-8859-1") 24 | yield PascalString8(self, "copyright", charset="ISO-8859-1") 25 | yield PascalString8(self, "comment", charset="ISO-8859-1") 26 | 27 | class RealAudioFile(Parser): 28 | MAGIC = ".ra\xFD" 29 | PARSER_TAGS = { 30 | "id": "real_audio", 31 | "category": "audio", 32 | "file_ext": ["ra"], 33 | "mime": (u"audio/x-realaudio", u"audio/x-pn-realaudio"), 34 | "min_size": 6*8, 35 | "magic": ((MAGIC, 0),), 36 | "description": u"Real audio (.ra)", 37 | } 38 | endian = BIG_ENDIAN 39 | 40 | def validate(self): 41 | if self["signature"].value != self.MAGIC: 42 | return "Invalid signature" 43 | if self["version"].value not in (3, 4): 44 | return "Unknown version" 45 | return True 46 | 47 | def createFields(self): 48 | yield Bytes(self, "signature", 4, r"RealAudio identifier ('.ra\xFD')") 49 | yield UInt16(self, "version", "Version") 50 | if self["version"].value == 3: 51 | yield UInt16(self, "header_size", "Header size") 52 | yield RawBytes(self, "Unknown1", 10) 53 | yield UInt32(self, "data_size", "Data size") 54 | yield Metadata(self, "metadata") 55 | yield UInt8(self, "Unknown2") 56 | yield PascalString8(self, "FourCC") 57 | audio_size = self["data_size"].value 58 | else: # version = 4 59 | yield UInt16(self, "reserved1", "Reserved, should be 0") 60 | yield String(self, "ra4sig", 4, "'.ra4' signature") 61 | yield UInt32(self, "filesize", "File size (minus 40 bytes)") 62 | yield UInt16(self, "version2", "Version 2 (always equal to version)") 63 | yield UInt32(self, "headersize", "Header size (minus 16)") 64 | yield UInt16(self, "codec_flavor", "Codec flavor") 65 | yield UInt32(self, "coded_frame_size", "Coded frame size") 66 | yield RawBytes(self, "unknown1", 12) 67 | yield UInt16(self, "subpacketh", "Subpacket h (?)") 68 | yield UInt16(self, "frame_size", "Frame size") 69 | yield UInt16(self, "sub_packet_size", "Subpacket size") 70 | yield UInt16(self, "unknown2", "Unknown") 71 | yield displayHandler(UInt16(self, "sample_rate", "Sample rate"), humanFrequency) 72 | yield UInt16(self, "unknown3", "Unknown") 73 | yield UInt16(self, "sample_size", "Sample size") 74 | yield UInt16(self, "channels", "Channels") 75 | yield PascalString8(self, "Interleaving ID String") 76 | yield PascalString8(self, "FourCC") 77 | yield RawBytes(self, "unknown4", 3) 78 | yield Metadata(self, "metadata") 79 | audio_size = (self["filesize"].value + 40) - (self["headersize"].value + 16) 80 | if 0 < audio_size: 81 | yield RawBytes(self, "audio_data", audio_size) 82 | 83 | def createDescription(self): 84 | if (self["version"].value == 3): 85 | return "RealAudio v3 file, '%s' codec" % self["FourCC"].value 86 | elif (self["version"].value == 4): 87 | return "RealAudio v4 file, '%s' codec, %s, %u channels" % ( 88 | self["FourCC"].value, self["sample_rate"].display, self["channels"].value) 89 | else: 90 | return "Real audio" 91 | -------------------------------------------------------------------------------- /hachoir_metadata/program.py: -------------------------------------------------------------------------------- 1 | from hachoir_metadata.metadata import RootMetadata, registerExtractor 2 | from hachoir_parser.program import ExeFile 3 | from hachoir_metadata.safe import fault_tolerant, getValue 4 | 5 | class ExeMetadata(RootMetadata): 6 | KEY_TO_ATTR = { 7 | u"ProductName": "title", 8 | u"LegalCopyright": "copyright", 9 | u"LegalTrademarks": "copyright", 10 | u"LegalTrademarks1": "copyright", 11 | u"LegalTrademarks2": "copyright", 12 | u"CompanyName": "author", 13 | u"BuildDate": "creation_date", 14 | u"FileDescription": "title", 15 | u"ProductVersion": "version", 16 | } 17 | SKIP_KEY = set((u"InternalName", u"OriginalFilename", u"FileVersion", u"BuildVersion")) 18 | 19 | def extract(self, exe): 20 | if exe.isPE(): 21 | self.extractPE(exe) 22 | elif exe.isNE(): 23 | self.extractNE(exe) 24 | 25 | def extractNE(self, exe): 26 | if "ne_header" in exe: 27 | self.useNE_Header(exe["ne_header"]) 28 | if "info" in exe: 29 | self.useNEInfo(exe["info"]) 30 | 31 | @fault_tolerant 32 | def useNEInfo(self, info): 33 | for node in info.array("node"): 34 | if node["name"].value == "StringFileInfo": 35 | self.readVersionInfo(node["node[0]"]) 36 | 37 | def extractPE(self, exe): 38 | # Read information from headers 39 | if "pe_header" in exe: 40 | self.usePE_Header(exe["pe_header"]) 41 | if "pe_opt_header" in exe: 42 | self.usePE_OptHeader(exe["pe_opt_header"]) 43 | 44 | # Use PE resource 45 | resource = exe.getResource() 46 | if resource and "version_info/node[0]" in resource: 47 | for node in resource.array("version_info/node[0]/node"): 48 | if getValue(node, "name") == "StringFileInfo" \ 49 | and "node[0]" in node: 50 | self.readVersionInfo(node["node[0]"]) 51 | 52 | @fault_tolerant 53 | def useNE_Header(self, hdr): 54 | if hdr["is_dll"].value: 55 | self.format_version = u"New-style executable: Dynamic-link library (DLL)" 56 | elif hdr["is_win_app"].value: 57 | self.format_version = u"New-style executable: Windows 3.x application" 58 | else: 59 | self.format_version = u"New-style executable for Windows 3.x" 60 | 61 | @fault_tolerant 62 | def usePE_Header(self, hdr): 63 | self.creation_date = hdr["creation_date"].value 64 | self.comment = "CPU: %s" % hdr["cpu"].display 65 | if hdr["is_dll"].value: 66 | self.format_version = u"Portable Executable: Dynamic-link library (DLL)" 67 | else: 68 | self.format_version = u"Portable Executable: Windows application" 69 | 70 | @fault_tolerant 71 | def usePE_OptHeader(self, hdr): 72 | self.comment = "Subsystem: %s" % hdr["subsystem"].display 73 | 74 | def readVersionInfo(self, info): 75 | values = {} 76 | for node in info.array("node"): 77 | if "value" not in node or "name" not in node: 78 | continue 79 | value = node["value"].value.strip(" \0") 80 | if not value: 81 | continue 82 | key = node["name"].value 83 | values[key] = value 84 | 85 | if "ProductName" in values and "FileDescription" in values: 86 | # Make sure that FileDescription is set before ProductName 87 | # as title value 88 | self.title = values["FileDescription"] 89 | self.title = values["ProductName"] 90 | del values["FileDescription"] 91 | del values["ProductName"] 92 | 93 | for key, value in values.iteritems(): 94 | if key in self.KEY_TO_ATTR: 95 | setattr(self, self.KEY_TO_ATTR[key], value) 96 | elif key not in self.SKIP_KEY: 97 | self.comment = "%s=%s" % (key, value) 98 | 99 | registerExtractor(ExeFile, ExeMetadata) 100 | 101 | -------------------------------------------------------------------------------- /hachoir_parser/file_system/linux_swap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linux swap file. 3 | 4 | Documentation: Linux kernel source code, files: 5 | - mm/swapfile.c 6 | - include/linux/swap.h 7 | 8 | Author: Victor Stinner 9 | Creation date: 25 december 2006 (christmas ;-)) 10 | """ 11 | 12 | from hachoir_parser import Parser 13 | from hachoir_core.field import (ParserError, GenericVector, 14 | UInt32, String, 15 | Bytes, NullBytes, RawBytes) 16 | from hachoir_core.endian import LITTLE_ENDIAN 17 | from hachoir_core.tools import humanFilesize 18 | from hachoir_core.bits import str2hex 19 | 20 | PAGE_SIZE = 4096 21 | 22 | # Definition of MAX_SWAP_BADPAGES in Linux kernel: 23 | # (__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int) 24 | MAX_SWAP_BADPAGES = ((PAGE_SIZE - 10) - 1536) // 4 25 | 26 | class Page(RawBytes): 27 | static_size = PAGE_SIZE*8 28 | def __init__(self, parent, name): 29 | RawBytes.__init__(self, parent, name, PAGE_SIZE) 30 | 31 | class UUID(Bytes): 32 | static_size = 16*8 33 | def __init__(self, parent, name): 34 | Bytes.__init__(self, parent, name, 16) 35 | def createDisplay(self): 36 | text = str2hex(self.value, format=r"%02x") 37 | return "%s-%s-%s-%s-%s" % ( 38 | text[:8], text[8:12], text[12:16], text[16:20], text[20:]) 39 | 40 | class LinuxSwapFile(Parser): 41 | PARSER_TAGS = { 42 | "id": "linux_swap", 43 | "file_ext": ("",), 44 | "category": "file_system", 45 | "min_size": PAGE_SIZE*8, 46 | "description": "Linux swap file", 47 | "magic": ( 48 | ("SWAP-SPACE", (PAGE_SIZE-10)*8), 49 | ("SWAPSPACE2", (PAGE_SIZE-10)*8), 50 | ("S1SUSPEND\0", (PAGE_SIZE-10)*8), 51 | ), 52 | } 53 | endian = LITTLE_ENDIAN 54 | 55 | def validate(self): 56 | magic = self.stream.readBytes((PAGE_SIZE-10)*8, 10) 57 | if magic not in ("SWAP-SPACE", "SWAPSPACE2", "S1SUSPEND\0"): 58 | return "Unknown magic string" 59 | if MAX_SWAP_BADPAGES < self["nb_badpage"].value: 60 | return "Invalid number of bad page (%u)" % self["nb_badpage"].value 61 | return True 62 | 63 | def getPageCount(self): 64 | """ 65 | Number of pages which can really be used for swapping: 66 | number of page minus bad pages minus one page (used for the header) 67 | """ 68 | # -1 because first page is used for the header 69 | return self["last_page"].value - self["nb_badpage"].value - 1 70 | 71 | def createDescription(self): 72 | if self["magic"].value == "S1SUSPEND\0": 73 | text = "Suspend swap file version 1" 74 | elif self["magic"].value == "SWAPSPACE2": 75 | text = "Linux swap file version 2" 76 | else: 77 | text = "Linux swap file version 1" 78 | nb_page = self.getPageCount() 79 | return "%s, page size: %s, %s pages" % ( 80 | text, humanFilesize(PAGE_SIZE), nb_page) 81 | 82 | def createFields(self): 83 | # First kilobyte: boot sectors 84 | yield RawBytes(self, "boot", 1024, "Space for disklabel etc.") 85 | 86 | # Header 87 | yield UInt32(self, "version") 88 | yield UInt32(self, "last_page") 89 | yield UInt32(self, "nb_badpage") 90 | yield UUID(self, "sws_uuid") 91 | yield UUID(self, "sws_volume") 92 | yield NullBytes(self, "reserved", 117*4) 93 | 94 | # Read bad pages (if any) 95 | count = self["nb_badpage"].value 96 | if count: 97 | if MAX_SWAP_BADPAGES < count: 98 | raise ParserError("Invalid number of bad page (%u)" % count) 99 | yield GenericVector(self, "badpages", count, UInt32, "badpage") 100 | 101 | # Read magic 102 | padding = self.seekByte(PAGE_SIZE - 10, "padding", null=True) 103 | if padding: 104 | yield padding 105 | yield String(self, "magic", 10, charset="ASCII") 106 | 107 | # Read all pages 108 | yield GenericVector(self, "pages", self["last_page"].value, Page, "page") 109 | 110 | # Padding at the end 111 | padding = self.seekBit(self.size, "end_padding", null=True) 112 | if padding: 113 | yield padding 114 | 115 | -------------------------------------------------------------------------------- /unzip.py: -------------------------------------------------------------------------------- 1 | """ unzip.py 2 | Version: 1.1 3 | 4 | Extract a zipfile to the directory provided 5 | It first creates the directory structure to house the files 6 | then it extracts the files to it. 7 | 8 | Sample usage: 9 | command line 10 | unzip.py -p 10 -z c:\testfile.zip -o c:\testoutput 11 | 12 | python class 13 | import unzip 14 | un = unzip.unzip() 15 | un.extract(r'c:\testfile.zip', 'c:\testoutput') 16 | 17 | 18 | By Doug Tolton 19 | """ 20 | 21 | import sys 22 | import zipfile 23 | import os 24 | import os.path 25 | import getopt 26 | 27 | class unzip: 28 | def __init__(self, verbose = False, percent = 10): 29 | self.verbose = False 30 | self.percent = percent 31 | 32 | def extract(self, file, dir): 33 | if not dir.endswith(':') and not os.path.exists(dir): 34 | os.mkdir(dir) 35 | 36 | zf = zipfile.ZipFile(file) 37 | 38 | # create directory structure to house files 39 | self._createstructure(file, dir) 40 | 41 | num_files = len(zf.namelist()) 42 | percent = self.percent 43 | divisions = 100 / percent 44 | perc = int(num_files / divisions) 45 | 46 | # extract files to directory structure 47 | for i, name in enumerate(zf.namelist()): 48 | 49 | if self.verbose == True: 50 | print "Extracting %s" % name 51 | elif perc > 0 and (i % perc) == 0 and i > 0: 52 | complete = int (i / perc) * percent 53 | #print "%s%% complete" % complete 54 | 55 | if not name.endswith('/'): 56 | outfile = open(os.path.join(dir, name), 'wb') 57 | outfile.write(zf.read(name)) 58 | outfile.flush() 59 | outfile.close() 60 | 61 | 62 | def _createstructure(self, file, dir): 63 | self._makedirs(self._listdirs(file), dir) 64 | 65 | 66 | def _makedirs(self, directories, basedir): 67 | """ Create any directories that don't currently exist """ 68 | for dir in directories: 69 | curdir = os.path.join(basedir, dir) 70 | if not os.path.exists(curdir): 71 | os.mkdir(curdir) 72 | #print("dir-->"+str(curdir)) 73 | 74 | def _listdirs(self, file): 75 | """ Grabs all the directories in the zip structure 76 | This is necessary to create the structure before trying 77 | to extract the file to it. """ 78 | zf = zipfile.ZipFile(file) 79 | 80 | dirs = [] 81 | #print str(zf.namelist()) 82 | 83 | for name in zf.namelist(): 84 | dirsname = name.split("/") 85 | ant="" 86 | for dirname in dirsname[:-1]: 87 | dirs.append(ant+dirname) 88 | #print "anadiendo:"+(ant+dirname) 89 | ant=ant+dirname+"/" 90 | 91 | dirs.sort() 92 | return dirs 93 | 94 | def usage(): 95 | print """usage: unzip.py -z -o 96 | is the source zipfile to extract 97 | is the target destination 98 | 99 | -z zipfile to extract 100 | -o target location 101 | -p sets the percentage notification 102 | -v sets the extraction to verbose (overrides -p) 103 | 104 | long options also work: 105 | --verbose 106 | --percent=10 107 | --zipfile= 108 | --outdir=""" 109 | 110 | 111 | def main(): 112 | shortargs = 'vhp:z:o:' 113 | longargs = ['verbose', 'help', 'percent=', 'zipfile=', 'outdir='] 114 | 115 | unzipper = unzip() 116 | 117 | try: 118 | opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs) 119 | except getopt.GetoptError: 120 | usage() 121 | sys.exit(2) 122 | 123 | zipsource = "" 124 | zipdest = "" 125 | 126 | for o, a in opts: 127 | if o in ("-v", "--verbose"): 128 | unzipper.verbose = True 129 | if o in ("-p", "--percent"): 130 | if not unzipper.verbose == True: 131 | unzipper.percent = int(a) 132 | if o in ("-z", "--zipfile"): 133 | zipsource = a 134 | if o in ("-o", "--outdir"): 135 | zipdest = a 136 | if o in ("-h", "--help"): 137 | usage() 138 | sys.exit() 139 | 140 | if zipsource == "" or zipdest == "": 141 | usage() 142 | sys.exit() 143 | 144 | unzipper.extract(zipsource, zipdest) 145 | 146 | if __name__ == '__main__': main() -------------------------------------------------------------------------------- /hachoir_parser/common/win32_lang_id.py: -------------------------------------------------------------------------------- 1 | """ 2 | Windows 2000 - List of Locale IDs and Language Groups 3 | 4 | Original data table: 5 | http://www.microsoft.com/globaldev/reference/win2k/setup/lcid.mspx 6 | """ 7 | 8 | LANGUAGE_ID = { 9 | 0x0436: u"Afrikaans", 10 | 0x041c: u"Albanian", 11 | 0x0401: u"Arabic Saudi Arabia", 12 | 0x0801: u"Arabic Iraq", 13 | 0x0c01: u"Arabic Egypt", 14 | 0x1001: u"Arabic Libya", 15 | 0x1401: u"Arabic Algeria", 16 | 0x1801: u"Arabic Morocco", 17 | 0x1c01: u"Arabic Tunisia", 18 | 0x2001: u"Arabic Oman", 19 | 0x2401: u"Arabic Yemen", 20 | 0x2801: u"Arabic Syria", 21 | 0x2c01: u"Arabic Jordan", 22 | 0x3001: u"Arabic Lebanon", 23 | 0x3401: u"Arabic Kuwait", 24 | 0x3801: u"Arabic UAE", 25 | 0x3c01: u"Arabic Bahrain", 26 | 0x4001: u"Arabic Qatar", 27 | 0x042b: u"Armenian", 28 | 0x042c: u"Azeri Latin", 29 | 0x082c: u"Azeri Cyrillic", 30 | 0x042d: u"Basque", 31 | 0x0423: u"Belarusian", 32 | 0x0402: u"Bulgarian", 33 | 0x0403: u"Catalan", 34 | 0x0404: u"Chinese Taiwan", 35 | 0x0804: u"Chinese PRC", 36 | 0x0c04: u"Chinese Hong Kong", 37 | 0x1004: u"Chinese Singapore", 38 | 0x1404: u"Chinese Macau", 39 | 0x041a: u"Croatian", 40 | 0x0405: u"Czech", 41 | 0x0406: u"Danish", 42 | 0x0413: u"Dutch Standard", 43 | 0x0813: u"Dutch Belgian", 44 | 0x0409: u"English United States", 45 | 0x0809: u"English United Kingdom", 46 | 0x0c09: u"English Australian", 47 | 0x1009: u"English Canadian", 48 | 0x1409: u"English New Zealand", 49 | 0x1809: u"English Irish", 50 | 0x1c09: u"English South Africa", 51 | 0x2009: u"English Jamaica", 52 | 0x2409: u"English Caribbean", 53 | 0x2809: u"English Belize", 54 | 0x2c09: u"English Trinidad", 55 | 0x3009: u"English Zimbabwe", 56 | 0x3409: u"English Philippines", 57 | 0x0425: u"Estonian", 58 | 0x0438: u"Faeroese", 59 | 0x0429: u"Farsi", 60 | 0x040b: u"Finnish", 61 | 0x040c: u"French Standard", 62 | 0x080c: u"French Belgian", 63 | 0x0c0c: u"French Canadian", 64 | 0x100c: u"French Swiss", 65 | 0x140c: u"French Luxembourg", 66 | 0x180c: u"French Monaco", 67 | 0x0437: u"Georgian", 68 | 0x0407: u"German Standard", 69 | 0x0807: u"German Swiss", 70 | 0x0c07: u"German Austrian", 71 | 0x1007: u"German Luxembourg", 72 | 0x1407: u"German Liechtenstein", 73 | 0x0408: u"Greek", 74 | 0x040d: u"Hebrew", 75 | 0x0439: u"Hindi", 76 | 0x040e: u"Hungarian", 77 | 0x040f: u"Icelandic", 78 | 0x0421: u"Indonesian", 79 | 0x0410: u"Italian Standard", 80 | 0x0810: u"Italian Swiss", 81 | 0x0411: u"Japanese", 82 | 0x043f: u"Kazakh", 83 | 0x0457: u"Konkani", 84 | 0x0412: u"Korean", 85 | 0x0426: u"Latvian", 86 | 0x0427: u"Lithuanian", 87 | 0x042f: u"Macedonian", 88 | 0x043e: u"Malay Malaysia", 89 | 0x083e: u"Malay Brunei Darussalam", 90 | 0x044e: u"Marathi", 91 | 0x0414: u"Norwegian Bokmal", 92 | 0x0814: u"Norwegian Nynorsk", 93 | 0x0415: u"Polish", 94 | 0x0416: u"Portuguese Brazilian", 95 | 0x0816: u"Portuguese Standard", 96 | 0x0418: u"Romanian", 97 | 0x0419: u"Russian", 98 | 0x044f: u"Sanskrit", 99 | 0x081a: u"Serbian Latin", 100 | 0x0c1a: u"Serbian Cyrillic", 101 | 0x041b: u"Slovak", 102 | 0x0424: u"Slovenian", 103 | 0x040a: u"Spanish Traditional Sort", 104 | 0x080a: u"Spanish Mexican", 105 | 0x0c0a: u"Spanish Modern Sort", 106 | 0x100a: u"Spanish Guatemala", 107 | 0x140a: u"Spanish Costa Rica", 108 | 0x180a: u"Spanish Panama", 109 | 0x1c0a: u"Spanish Dominican Republic", 110 | 0x200a: u"Spanish Venezuela", 111 | 0x240a: u"Spanish Colombia", 112 | 0x280a: u"Spanish Peru", 113 | 0x2c0a: u"Spanish Argentina", 114 | 0x300a: u"Spanish Ecuador", 115 | 0x340a: u"Spanish Chile", 116 | 0x380a: u"Spanish Uruguay", 117 | 0x3c0a: u"Spanish Paraguay", 118 | 0x400a: u"Spanish Bolivia", 119 | 0x440a: u"Spanish El Salvador", 120 | 0x480a: u"Spanish Honduras", 121 | 0x4c0a: u"Spanish Nicaragua", 122 | 0x500a: u"Spanish Puerto Rico", 123 | 0x0441: u"Swahili", 124 | 0x041d: u"Swedish", 125 | 0x081d: u"Swedish Finland", 126 | 0x0449: u"Tamil", 127 | 0x0444: u"Tatar", 128 | 0x041e: u"Thai", 129 | 0x041f: u"Turkish", 130 | 0x0422: u"Ukrainian", 131 | 0x0420: u"Urdu", 132 | 0x0443: u"Uzbek Latin", 133 | 0x0843: u"Uzbek Cyrillic", 134 | 0x042a: u"Vietnamese", 135 | } 136 | 137 | -------------------------------------------------------------------------------- /hachoir_parser/audio/aiff.py: -------------------------------------------------------------------------------- 1 | """ 2 | Audio Interchange File Format (AIFF) parser. 3 | 4 | Author: Victor Stinner 5 | Creation: 27 december 2006 6 | """ 7 | 8 | from hachoir_parser import Parser 9 | from hachoir_core.field import (FieldSet, 10 | UInt16, UInt32, Float80, TimestampMac32, 11 | RawBytes, NullBytes, 12 | String, Enum, PascalString32) 13 | from hachoir_core.endian import BIG_ENDIAN 14 | from hachoir_core.text_handler import filesizeHandler 15 | from hachoir_core.tools import alignValue 16 | from hachoir_parser.audio.id3 import ID3v2 17 | 18 | CODEC_NAME = { 19 | 'ACE2': u"ACE 2-to-1", 20 | 'ACE8': u"ACE 8-to-3", 21 | 'MAC3': u"MAC 3-to-1", 22 | 'MAC6': u"MAC 6-to-1", 23 | 'NONE': u"None", 24 | 'sowt': u"Little-endian, no compression", 25 | } 26 | 27 | class Comment(FieldSet): 28 | def createFields(self): 29 | yield TimestampMac32(self, "timestamp") 30 | yield PascalString32(self, "text") 31 | 32 | def parseText(self): 33 | yield String(self, "text", self["size"].value) 34 | 35 | def parseID3(self): 36 | yield ID3v2(self, "id3v2", size=self["size"].value*8) 37 | 38 | def parseComment(self): 39 | yield UInt16(self, "nb_comment") 40 | for index in xrange(self["nb_comment"].value): 41 | yield Comment(self, "comment[]") 42 | 43 | def parseCommon(self): 44 | yield UInt16(self, "nb_channel") 45 | yield UInt32(self, "nb_sample") 46 | yield UInt16(self, "sample_size") 47 | yield Float80(self, "sample_rate") 48 | yield Enum(String(self, "codec", 4, strip="\0", charset="ASCII"), CODEC_NAME) 49 | 50 | def parseVersion(self): 51 | yield TimestampMac32(self, "timestamp") 52 | 53 | def parseSound(self): 54 | yield UInt32(self, "offset") 55 | yield UInt32(self, "block_size") 56 | size = (self.size - self.current_size) // 8 57 | if size: 58 | yield RawBytes(self, "data", size) 59 | 60 | class Chunk(FieldSet): 61 | TAG_INFO = { 62 | 'COMM': ('common', "Common chunk", parseCommon), 63 | 'COMT': ('comment', "Comment", parseComment), 64 | 'NAME': ('name', "Name", parseText), 65 | 'AUTH': ('author', "Author", parseText), 66 | 'FVER': ('version', "Version", parseVersion), 67 | 'SSND': ('sound', "Sound data", parseSound), 68 | 'ID3 ': ('id3', "ID3", parseID3), 69 | } 70 | 71 | def __init__(self, *args): 72 | FieldSet.__init__(self, *args) 73 | self._size = (8 + alignValue(self["size"].value, 2)) * 8 74 | tag = self["type"].value 75 | if tag in self.TAG_INFO: 76 | self._name, self._description, self._parser = self.TAG_INFO[tag] 77 | else: 78 | self._parser = None 79 | 80 | def createFields(self): 81 | yield String(self, "type", 4, "Signature (FORM)", charset="ASCII") 82 | yield filesizeHandler(UInt32(self, "size")) 83 | size = self["size"].value 84 | if size: 85 | if self._parser: 86 | for field in self._parser(self): 87 | yield field 88 | if size % 2: 89 | yield NullBytes(self, "padding", 1) 90 | else: 91 | yield RawBytes(self, "data", size) 92 | 93 | class AiffFile(Parser): 94 | PARSER_TAGS = { 95 | "id": "aiff", 96 | "category": "audio", 97 | "file_ext": ("aif", "aiff", "aifc"), 98 | "mime": (u"audio/x-aiff",), 99 | "magic_regex": (("FORM.{4}AIF[CF]", 0),), 100 | "min_size": 12*8, 101 | "description": "Audio Interchange File Format (AIFF)" 102 | } 103 | endian = BIG_ENDIAN 104 | 105 | def validate(self): 106 | if self.stream.readBytes(0, 4) != "FORM": 107 | return "Invalid signature" 108 | if self.stream.readBytes(8*8, 4) not in ("AIFF", "AIFC"): 109 | return "Invalid type" 110 | return True 111 | 112 | def createFields(self): 113 | yield String(self, "signature", 4, "Signature (FORM)", charset="ASCII") 114 | yield filesizeHandler(UInt32(self, "filesize")) 115 | yield String(self, "type", 4, "Form type (AIFF or AIFC)", charset="ASCII") 116 | while not self.eof: 117 | yield Chunk(self, "chunk[]") 118 | 119 | def createDescription(self): 120 | if self["type"].value == "AIFC": 121 | return "Audio Interchange File Format Compressed (AIFC)" 122 | else: 123 | return "Audio Interchange File Format (AIFF)" 124 | 125 | def createContentSize(self): 126 | return self["filesize"].value * 8 127 | 128 | -------------------------------------------------------------------------------- /htmlExport.py: -------------------------------------------------------------------------------- 1 | from lib import markup 2 | from lib import graphs 3 | 4 | class htmlExport(): 5 | def __init__(self,users,softs,paths,allinfo,fname,dirs,failed,domain,emails): 6 | self.users=users 7 | self.softs=softs 8 | self.paths=paths 9 | self.allinfo=allinfo 10 | self.fname=fname 11 | self.dir=dirs 12 | self.failed=failed 13 | self.style="" 14 | self.domain=domain 15 | self.emails=emails 16 | 17 | def styler(self): 18 | a=""" 78 | """ 79 | self.style=a 80 | 81 | def writehtml(self): 82 | page = markup.page() 83 | page.title("Metagoofil results") 84 | page.html() 85 | self.styler() 86 | page.head(self.style) 87 | page.head.close() 88 | page.body() 89 | page.h2("Metagoofil results") 90 | page.h3("Results for: " + self.domain) 91 | graph = graphs.BarGraph('vBar') 92 | try: 93 | graph.values = [len(self.users),len(self.softs),len(self.emails),len(self.paths)] 94 | graph.labels = ["Usernames","Software","Emails","Paths/Servers"] 95 | graph.showValues = 1 96 | page.body(graph.create()) 97 | except: 98 | print "graph" 99 | try: 100 | page.h3("User names found:") 101 | page.ul( class_="userslist") 102 | page.li( self.users, class_="useritem") 103 | page.ul.close( ) 104 | page.h3("Software versions found:") 105 | except: 106 | print "user" 107 | try: 108 | page.ul( class_="softlist") 109 | page.li(self.softs, class_="softitem") 110 | page.ul.close( ) 111 | except: 112 | print "email" 113 | page.h3("E-mails found:") 114 | if self.emails!=[]: 115 | page.ul( class_="emailslist") 116 | page.li(self.emails, class_="emailitem") 117 | page.ul.close( ) 118 | else: 119 | page.p("0 results") 120 | page.h3("Servers and paths found:") 121 | if self.paths!=[]: 122 | page.ul( class_="pathslist") 123 | page.li(self.paths, class_="pathitem") 124 | page.ul.close( ) 125 | else: 126 | page.p("0 results") 127 | page.h3("Files analyzed:") 128 | page.ul( class_="files") 129 | for x in self.allinfo: 130 | page.li(x[0], class_="file") 131 | page.ul.close() 132 | page.h2("Files and metadata found:") 133 | for x in self.allinfo: 134 | page.h3(x[0]) 135 | page.a("Local copy", class_="link", href=self.dir+"/"+x[0]) 136 | page.pre(x[1]) 137 | page.pre(x[2]) 138 | page.pre(x[3]) 139 | page.pre(x[5]) 140 | page.pre.close() 141 | page.h2("Failed extractions and reasons") 142 | for x in self.failed: 143 | page.pre(x) 144 | page.body.close() 145 | page.html.close() 146 | file = open(self.fname,'w') 147 | for x in page.content: 148 | try: 149 | file.write(x) 150 | except: 151 | #print "Exception" + x # send to logs 152 | pass 153 | file.close 154 | return "ok" 155 | -------------------------------------------------------------------------------- /hachoir_parser/image/iptc.py: -------------------------------------------------------------------------------- 1 | """ 2 | IPTC metadata parser (can be found in a JPEG picture for example) 3 | 4 | Sources: 5 | - Image-MetaData Perl module: 6 | http://www.annocpan.org/~BETTELLI/Image-MetaData-JPEG-0.15/... 7 | ...lib/Image/MetaData/JPEG/TagLists.pod 8 | - IPTC tag name and description: 9 | http://peccatte.karefil.com/software/IPTCTableau.pdf 10 | 11 | Author: Victor Stinner 12 | """ 13 | 14 | from hachoir_core.field import (FieldSet, ParserError, 15 | UInt8, UInt16, String, RawBytes, NullBytes) 16 | from hachoir_core.text_handler import textHandler, hexadecimal 17 | 18 | def IPTC_String(parent, name, desc=None): 19 | # Charset may be utf-8, ISO-8859-1, or ... 20 | return String(parent, name, parent["size"].value, desc, 21 | strip=" ") 22 | 23 | dataset1 = { 24 | } 25 | dataset2 = { 26 | 0: ("record_version", "Record version (2 for JPEG)", UInt16), 27 | 5: ("obj_name", "Object name", None), 28 | 7: ("edit_stat", "Edit status", None), 29 | 10: ("urgency", "Urgency", UInt8), 30 | 15: ("category[]", "Category", None), 31 | 22: ("fixture", "Fixture identifier", IPTC_String), 32 | 25: ("keyword[]", "Keywords", IPTC_String), 33 | 30: ("release_date", "Release date", IPTC_String), 34 | 35: ("release_time", "Release time", IPTC_String), 35 | 40: ("instruction", "Special instructions", IPTC_String), 36 | 55: ("date_created", "Date created", IPTC_String), 37 | 60: ("time_created", "Time created (ISO 8601)", IPTC_String), 38 | 65: ("originating_prog", "Originating program", IPTC_String), 39 | 70: ("prog_ver", "Program version", IPTC_String), 40 | 80: ("author", "By-line (Author)", IPTC_String), 41 | 85: ("author_job", "By-line (Author precision)", IPTC_String), 42 | 90: ("city", "City", IPTC_String), 43 | 95: ("state", "Province / State", IPTC_String), 44 | 100: ("country_code", "Country / Primary location code", IPTC_String), 45 | 101: ("country_name", "Country / Primary location name", IPTC_String), 46 | 103: ("trans_ref", "Original transmission reference", IPTC_String), 47 | 105: ("headline", "Headline", IPTC_String), 48 | 110: ("credit", "Credit", IPTC_String), 49 | 115: ("source", "Source", IPTC_String), 50 | 116: ("copyright", "Copyright notice", IPTC_String), 51 | 120: ("caption", "Caption/Abstract", IPTC_String), 52 | 122: ("writer", "Writer/editor", IPTC_String), 53 | 231: ("history[]", "Document history (timestamp)", IPTC_String) 54 | } 55 | datasets = {1: dataset1, 2: dataset2} 56 | 57 | class IPTC_Size(FieldSet): 58 | def __init__(self, *args, **kw): 59 | FieldSet.__init__(self, *args, **kw) 60 | value = 0 61 | for field in self: 62 | value <<= 15 63 | value += (field.value & 0x7fff) 64 | self.createValue = lambda: value 65 | 66 | def createFields(self): 67 | while True: 68 | field = UInt16(self, "value[]") 69 | yield field 70 | if field.value < 0x8000: 71 | break 72 | 73 | class IPTC_Chunk(FieldSet): 74 | def __init__(self, *args, **kw): 75 | FieldSet.__init__(self, *args, **kw) 76 | number = self["dataset_nb"].value 77 | self.dataset_info = None 78 | if number in datasets: 79 | tag = self["tag"].value 80 | if tag in datasets[number]: 81 | self.dataset_info = datasets[number][tag] 82 | self._name = self.dataset_info[0] 83 | self._description = self.dataset_info[1] 84 | size_chunk = self["size"] 85 | self._size = 3*8 + size_chunk.size + size_chunk.value*8 86 | 87 | def createFields(self): 88 | yield textHandler(UInt8(self, "signature", "IPTC signature (0x1c)"), hexadecimal) 89 | if self["signature"].value != 0x1C: 90 | raise ParserError("Wrong IPTC signature") 91 | yield textHandler(UInt8(self, "dataset_nb", "Dataset number"), hexadecimal) 92 | yield UInt8(self, "tag", "Tag") 93 | yield IPTC_Size(self, "size", "Content size") 94 | 95 | size = self["size"].value 96 | if 0 < size: 97 | if self.dataset_info: 98 | cls = self.dataset_info[2] 99 | else: 100 | cls = None 101 | if cls: 102 | yield cls(self, "content") 103 | else: 104 | yield RawBytes(self, "content", size) 105 | 106 | class IPTC(FieldSet): 107 | def createFields(self): 108 | while 5 <= (self._size - self.current_size)/8: 109 | yield IPTC_Chunk(self, "chunk[]") 110 | size = (self._size - self.current_size) / 8 111 | if 0 < size: 112 | yield NullBytes(self, "padding", size) 113 | 114 | -------------------------------------------------------------------------------- /hachoir_core/log.py: -------------------------------------------------------------------------------- 1 | import os, sys, time 2 | import hachoir_core.config as config 3 | from hachoir_core.i18n import _ 4 | 5 | class Log: 6 | LOG_INFO = 0 7 | LOG_WARN = 1 8 | LOG_ERROR = 2 9 | 10 | level_name = { 11 | LOG_WARN: "[warn]", 12 | LOG_ERROR: "[err!]", 13 | LOG_INFO: "[info]" 14 | } 15 | 16 | def __init__(self): 17 | self.__buffer = {} 18 | self.__file = None 19 | self.use_print = True 20 | self.use_buffer = False 21 | self.on_new_message = None # Prototype: def func(level, prefix, text, context) 22 | 23 | def shutdown(self): 24 | if self.__file: 25 | self._writeIntoFile(_("Stop Hachoir")) 26 | 27 | def setFilename(self, filename, append=True): 28 | """ 29 | Use a file to store all messages. The 30 | UTF-8 encoding will be used. Write an informative 31 | message if the file can't be created. 32 | 33 | @param filename: C{L{string}} 34 | """ 35 | 36 | # Look if file already exists or not 37 | filename = os.path.expanduser(filename) 38 | filename = os.path.realpath(filename) 39 | append = os.access(filename, os.F_OK) 40 | 41 | # Create log file (or open it in append mode, if it already exists) 42 | try: 43 | import codecs 44 | if append: 45 | self.__file = codecs.open(filename, "a", "utf-8") 46 | else: 47 | self.__file = codecs.open(filename, "w", "utf-8") 48 | self._writeIntoFile(_("Starting Hachoir")) 49 | except IOError, err: 50 | if err.errno == 2: 51 | self.__file = None 52 | self.info(_("[Log] setFilename(%s) fails: no such file") % filename) 53 | else: 54 | raise 55 | 56 | def _writeIntoFile(self, message): 57 | timestamp = time.strftime("%Y-%m-%d %H:%M:%S") 58 | self.__file.write(u"%s - %s\n" % (timestamp, message)) 59 | self.__file.flush() 60 | 61 | def newMessage(self, level, text, ctxt=None): 62 | """ 63 | Write a new message : append it in the buffer, 64 | display it to the screen (if needed), and write 65 | it in the log file (if needed). 66 | 67 | @param level: Message level. 68 | @type level: C{int} 69 | @param text: Message content. 70 | @type text: C{str} 71 | @param ctxt: The caller instance. 72 | """ 73 | 74 | if level < self.LOG_ERROR and config.quiet or \ 75 | level <= self.LOG_INFO and not config.verbose: 76 | return 77 | if config.debug: 78 | from hachoir_core.error import getBacktrace 79 | backtrace = getBacktrace(None) 80 | if backtrace: 81 | text += "\n\n" + backtrace 82 | 83 | _text = text 84 | if hasattr(ctxt, "_logger"): 85 | _ctxt = ctxt._logger() 86 | if _ctxt is not None: 87 | text = "[%s] %s" % (_ctxt, text) 88 | 89 | # Add message to log buffer 90 | if self.use_buffer: 91 | if not self.__buffer.has_key(level): 92 | self.__buffer[level] = [text] 93 | else: 94 | self.__buffer[level].append(text) 95 | 96 | # Add prefix 97 | prefix = self.level_name.get(level, "[info]") 98 | 99 | # Display on stdout (if used) 100 | if self.use_print: 101 | sys.stdout.flush() 102 | sys.stderr.write("%s %s\n" % (prefix, text)) 103 | sys.stderr.flush() 104 | 105 | # Write into outfile (if used) 106 | if self.__file: 107 | self._writeIntoFile("%s %s" % (prefix, text)) 108 | 109 | # Use callback (if used) 110 | if self.on_new_message: 111 | self.on_new_message (level, prefix, _text, ctxt) 112 | 113 | def info(self, text): 114 | """ 115 | New informative message. 116 | @type text: C{str} 117 | """ 118 | self.newMessage(Log.LOG_INFO, text) 119 | 120 | def warning(self, text): 121 | """ 122 | New warning message. 123 | @type text: C{str} 124 | """ 125 | self.newMessage(Log.LOG_WARN, text) 126 | 127 | def error(self, text): 128 | """ 129 | New error message. 130 | @type text: C{str} 131 | """ 132 | self.newMessage(Log.LOG_ERROR, text) 133 | 134 | log = Log() 135 | 136 | class Logger(object): 137 | def _logger(self): 138 | return "<%s>" % self.__class__.__name__ 139 | def info(self, text): 140 | log.newMessage(Log.LOG_INFO, text, self) 141 | def warning(self, text): 142 | log.newMessage(Log.LOG_WARN, text, self) 143 | def error(self, text): 144 | log.newMessage(Log.LOG_ERROR, text, self) 145 | -------------------------------------------------------------------------------- /hachoir_parser/archive/tar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tar archive parser. 3 | 4 | Author: Victor Stinner 5 | """ 6 | 7 | from hachoir_parser import Parser 8 | from hachoir_core.field import (FieldSet, 9 | Enum, UInt8, SubFile, String, NullBytes) 10 | from hachoir_core.tools import humanFilesize, paddingSize, timestampUNIX 11 | from hachoir_core.endian import BIG_ENDIAN 12 | import re 13 | 14 | class FileEntry(FieldSet): 15 | type_name = { 16 | # 48 is "0", 49 is "1", ... 17 | 0: u"Normal disk file (old format)", 18 | 48: u"Normal disk file", 19 | 49: u"Link to previously dumped file", 20 | 50: u"Symbolic link", 21 | 51: u"Character special file", 22 | 52: u"Block special file", 23 | 53: u"Directory", 24 | 54: u"FIFO special file", 25 | 55: u"Contiguous file" 26 | } 27 | 28 | def getOctal(self, name): 29 | return self.octal2int(self[name].value) 30 | 31 | def getDatetime(self): 32 | """ 33 | Create modification date as Unicode string, may raise ValueError. 34 | """ 35 | timestamp = self.getOctal("mtime") 36 | return timestampUNIX(timestamp) 37 | 38 | def createFields(self): 39 | yield String(self, "name", 100, "Name", strip="\0", charset="ISO-8859-1") 40 | yield String(self, "mode", 8, "Mode", strip=" \0", charset="ASCII") 41 | yield String(self, "uid", 8, "User ID", strip=" \0", charset="ASCII") 42 | yield String(self, "gid", 8, "Group ID", strip=" \0", charset="ASCII") 43 | yield String(self, "size", 12, "Size", strip=" \0", charset="ASCII") 44 | yield String(self, "mtime", 12, "Modification time", strip=" \0", charset="ASCII") 45 | yield String(self, "check_sum", 8, "Check sum", strip=" \0", charset="ASCII") 46 | yield Enum(UInt8(self, "type", "Type"), self.type_name) 47 | yield String(self, "lname", 100, "Link name", strip=" \0", charset="ISO-8859-1") 48 | yield String(self, "magic", 8, "Magic", strip=" \0", charset="ASCII") 49 | yield String(self, "uname", 32, "User name", strip=" \0", charset="ISO-8859-1") 50 | yield String(self, "gname", 32, "Group name", strip=" \0", charset="ISO-8859-1") 51 | yield String(self, "devmajor", 8, "Dev major", strip=" \0", charset="ASCII") 52 | yield String(self, "devminor", 8, "Dev minor", strip=" \0", charset="ASCII") 53 | yield NullBytes(self, "padding", 167, "Padding (zero)") 54 | 55 | filesize = self.getOctal("size") 56 | if filesize: 57 | yield SubFile(self, "content", filesize, filename=self["name"].value) 58 | 59 | size = paddingSize(self.current_size//8, 512) 60 | if size: 61 | yield NullBytes(self, "padding_end", size, "Padding (512 align)") 62 | 63 | def convertOctal(self, chunk): 64 | return self.octal2int(chunk.value) 65 | 66 | def isEmpty(self): 67 | return self["name"].value == "" 68 | 69 | def octal2int(self, text): 70 | try: 71 | return int(text, 8) 72 | except ValueError: 73 | return 0 74 | 75 | def createDescription(self): 76 | if self.isEmpty(): 77 | desc = "(terminator, empty header)" 78 | else: 79 | filename = self["name"].value 80 | filesize = humanFilesize(self.getOctal("size")) 81 | desc = "(%s: %s, %s)" % \ 82 | (filename, self["type"].display, filesize) 83 | return "Tar File " + desc 84 | 85 | class TarFile(Parser): 86 | endian = BIG_ENDIAN 87 | PARSER_TAGS = { 88 | "id": "tar", 89 | "category": "archive", 90 | "file_ext": ("tar",), 91 | "mime": (u"application/x-tar", u"application/x-gtar"), 92 | "min_size": 512*8, 93 | "magic": (("ustar \0", 257*8),), 94 | "subfile": "skip", 95 | "description": "TAR archive", 96 | } 97 | _sign = re.compile("ustar *\0|[ \0]*$") 98 | 99 | def validate(self): 100 | if not self._sign.match(self.stream.readBytes(257*8, 8)): 101 | return "Invalid magic number" 102 | if self[0].name == "terminator": 103 | return "Don't contain any file" 104 | try: 105 | int(self["file[0]/uid"].value, 8) 106 | int(self["file[0]/gid"].value, 8) 107 | int(self["file[0]/size"].value, 8) 108 | except ValueError: 109 | return "Invalid file size" 110 | return True 111 | 112 | def createFields(self): 113 | while not self.eof: 114 | field = FileEntry(self, "file[]") 115 | if field.isEmpty(): 116 | yield NullBytes(self, "terminator", 512) 117 | break 118 | yield field 119 | if self.current_size < self._size: 120 | yield self.seekBit(self._size, "end") 121 | 122 | def createContentSize(self): 123 | return self["terminator"].address + self["terminator"].size 124 | 125 | -------------------------------------------------------------------------------- /hachoir_parser/guess.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parser list managment: 3 | - createParser() find the best parser for a file. 4 | """ 5 | 6 | import os 7 | from hachoir_core.error import warning, info, HACHOIR_ERRORS 8 | from hachoir_parser import ValidateError, HachoirParserList 9 | from hachoir_core.stream import FileInputStream 10 | from hachoir_core.i18n import _ 11 | import weakref 12 | 13 | 14 | class QueryParser(object): 15 | fallback = None 16 | other = None 17 | 18 | def __init__(self, tags): 19 | self.validate = True 20 | self.use_fallback = False 21 | self.parser_args = None 22 | self.db = HachoirParserList.getInstance() 23 | self.parsers = set(self.db) 24 | parsers = [] 25 | for tag in tags: 26 | if not self.parsers: 27 | break 28 | parsers += self._getByTag(tag) 29 | if self.fallback is None: 30 | self.fallback = len(parsers) == 1 31 | if self.parsers: 32 | other = len(parsers) 33 | parsers += list(self.parsers) 34 | self.other = parsers[other] 35 | self.parsers = parsers 36 | 37 | def __iter__(self): 38 | return iter(self.parsers) 39 | 40 | def translate(self, name, value): 41 | if name == "filename": 42 | filename = os.path.basename(value).split(".") 43 | if len(filename) <= 1: 44 | value = "" 45 | else: 46 | value = filename[-1].lower() 47 | name = "file_ext" 48 | return name, value 49 | 50 | def _getByTag(self, tag): 51 | if tag is None: 52 | self.parsers.clear() 53 | return [] 54 | elif callable(tag): 55 | parsers = [ parser for parser in self.parsers if tag(parser) ] 56 | for parser in parsers: 57 | self.parsers.remove(parser) 58 | elif tag[0] == "class": 59 | self.validate = False 60 | return [ tag[1] ] 61 | elif tag[0] == "args": 62 | self.parser_args = tag[1] 63 | return [] 64 | else: 65 | tag = self.translate(*tag) 66 | parsers = [] 67 | if tag is not None: 68 | key = tag[0] 69 | byname = self.db.bytag.get(key,{}) 70 | if tag[1] is None: 71 | values = byname.itervalues() 72 | else: 73 | values = byname.get(tag[1],()), 74 | if key == "id" and values: 75 | self.validate = False 76 | for value in values: 77 | for parser in value: 78 | if parser in self.parsers: 79 | parsers.append(parser) 80 | self.parsers.remove(parser) 81 | return parsers 82 | 83 | def parse(self, stream, fallback=True): 84 | if hasattr(stream, "_cached_parser"): 85 | parser = stream._cached_parser() 86 | else: 87 | parser = None 88 | if parser is not None: 89 | if parser.__class__ in self.parsers: 90 | return parser 91 | if self.use_fallback and parser.__class__ == fb: 92 | return parser 93 | parser = self.doparse(stream, fallback) 94 | stream._cached_parser = weakref.ref(parser) 95 | return parser 96 | 97 | def doparse(self, stream, fallback=True): 98 | fb = None 99 | warn = warning 100 | for parser in self.parsers: 101 | try: 102 | parser_obj = parser(stream, validate=self.validate) 103 | if self.parser_args: 104 | for key, value in self.parser_args.iteritems(): 105 | setattr(parser_obj, key, value) 106 | return parser_obj 107 | except ValidateError, err: 108 | res = unicode(err) 109 | if fallback and self.fallback: 110 | fb = parser 111 | except HACHOIR_ERRORS, err: 112 | res = unicode(err) 113 | if warn: 114 | if parser == self.other: 115 | warn = info 116 | warn(_("Skip parser '%s': %s") % (parser.__name__, res)) 117 | fallback = False 118 | if self.use_fallback and fb: 119 | warning(_("Force use of parser '%s'") % fb.__name__) 120 | return fb(stream) 121 | 122 | 123 | def guessParser(stream): 124 | return QueryParser(stream.tags).parse(stream) 125 | 126 | 127 | def createParser(filename, real_filename=None, tags=None): 128 | """ 129 | Create a parser from a file or returns None on error. 130 | 131 | Options: 132 | - filename (unicode): Input file name ; 133 | - real_filename (str|unicode): Real file name. 134 | """ 135 | if not tags: 136 | tags = [] 137 | stream = FileInputStream(filename, real_filename, tags=tags) 138 | return guessParser(stream) 139 | -------------------------------------------------------------------------------- /hachoir_core/field/padding.py: -------------------------------------------------------------------------------- 1 | from hachoir_core.field import Bits, Bytes 2 | from hachoir_core.tools import makePrintable, humanFilesize 3 | from hachoir_core import config 4 | 5 | class PaddingBits(Bits): 6 | """ 7 | Padding bits used, for example, to align address (of next field). 8 | See also NullBits and PaddingBytes types. 9 | 10 | Arguments: 11 | * nbits: Size of the field in bits 12 | 13 | Optional arguments: 14 | * pattern (int): Content pattern, eg. 0 if all bits are set to 0 15 | """ 16 | static_size = staticmethod(lambda *args, **kw: args[1]) 17 | MAX_SIZE = 128 18 | 19 | def __init__(self, parent, name, nbits, description="Padding", pattern=None): 20 | Bits.__init__(self, parent, name, nbits, description) 21 | self.pattern = pattern 22 | self._display_pattern = self.checkPattern() 23 | 24 | def checkPattern(self): 25 | if not(config.check_padding_pattern): 26 | return False 27 | if self.pattern != 0: 28 | return False 29 | 30 | if self.MAX_SIZE < self._size: 31 | value = self._parent.stream.readBits( 32 | self.absolute_address, self.MAX_SIZE, self._parent.endian) 33 | else: 34 | value = self.value 35 | if value != 0: 36 | self.warning("padding contents doesn't look normal (invalid pattern)") 37 | return False 38 | if self.MAX_SIZE < self._size: 39 | self.info("only check first %u bits" % self.MAX_SIZE) 40 | return True 41 | 42 | def createDisplay(self): 43 | if self._display_pattern: 44 | return u"" % self.pattern 45 | else: 46 | return Bits.createDisplay(self) 47 | 48 | class PaddingBytes(Bytes): 49 | """ 50 | Padding bytes used, for example, to align address (of next field). 51 | See also NullBytes and PaddingBits types. 52 | 53 | Arguments: 54 | * nbytes: Size of the field in bytes 55 | 56 | Optional arguments: 57 | * pattern (str): Content pattern, eg. "\0" for nul bytes 58 | """ 59 | 60 | static_size = staticmethod(lambda *args, **kw: args[1]*8) 61 | MAX_SIZE = 4096 62 | 63 | def __init__(self, parent, name, nbytes, 64 | description="Padding", pattern=None): 65 | """ pattern is None or repeated string """ 66 | assert (pattern is None) or (isinstance(pattern, str)) 67 | Bytes.__init__(self, parent, name, nbytes, description) 68 | self.pattern = pattern 69 | self._display_pattern = self.checkPattern() 70 | 71 | def checkPattern(self): 72 | if not(config.check_padding_pattern): 73 | return False 74 | if self.pattern is None: 75 | return False 76 | 77 | if self.MAX_SIZE < self._size/8: 78 | self.info("only check first %s of padding" % humanFilesize(self.MAX_SIZE)) 79 | content = self._parent.stream.readBytes( 80 | self.absolute_address, self.MAX_SIZE) 81 | else: 82 | content = self.value 83 | index = 0 84 | pattern_len = len(self.pattern) 85 | while index < len(content): 86 | if content[index:index+pattern_len] != self.pattern: 87 | self.warning( 88 | "padding contents doesn't look normal" 89 | " (invalid pattern at byte %u)!" 90 | % index) 91 | return False 92 | index += pattern_len 93 | return True 94 | 95 | def createDisplay(self): 96 | if self._display_pattern: 97 | return u"" % makePrintable(self.pattern, "ASCII", quote="'") 98 | else: 99 | return Bytes.createDisplay(self) 100 | 101 | def createRawDisplay(self): 102 | return Bytes.createDisplay(self) 103 | 104 | class NullBits(PaddingBits): 105 | """ 106 | Null padding bits used, for example, to align address (of next field). 107 | See also PaddingBits and NullBytes types. 108 | 109 | Arguments: 110 | * nbits: Size of the field in bits 111 | """ 112 | 113 | def __init__(self, parent, name, nbits, description=None): 114 | PaddingBits.__init__(self, parent, name, nbits, description, pattern=0) 115 | 116 | def createDisplay(self): 117 | if self._display_pattern: 118 | return "" 119 | else: 120 | return Bits.createDisplay(self) 121 | 122 | class NullBytes(PaddingBytes): 123 | """ 124 | Null padding bytes used, for example, to align address (of next field). 125 | See also PaddingBytes and NullBits types. 126 | 127 | Arguments: 128 | * nbytes: Size of the field in bytes 129 | """ 130 | def __init__(self, parent, name, nbytes, description=None): 131 | PaddingBytes.__init__(self, parent, name, nbytes, description, pattern="\0") 132 | 133 | def createDisplay(self): 134 | if self._display_pattern: 135 | return "" 136 | else: 137 | return Bytes.createDisplay(self) 138 | 139 | --------------------------------------------------------------------------------