├── pdfminer
    ├── cmap
    │   ├── __init__.py
    │   └── Makefile
    ├── __init__.py
    ├── pdfcolor.py
    ├── arcfour.py
    ├── encodingdb.py
    ├── runlength.py
    ├── ascii85.py
    └── lzw.py
├── hachoir_metadata
    ├── qt
    │   ├── __init__.py
    │   └── dialog.ui
    ├── config.py
    ├── version.py
    ├── __init__.py
    ├── misc.py.rej
    ├── formatter.py
    ├── safe.py
    ├── timezone.py
    ├── file_system.py
    ├── filter.py
    ├── history.patch
    └── program.py
├── hachoir_parser
    ├── common
    │   ├── __init__.py
    │   ├── tracker.py
    │   ├── deflate.py
    │   ├── msdos.py
    │   └── win32_lang_id.py
    ├── network
    │   ├── __init__.py
    │   └── common.py
    ├── version.py
    ├── game
    │   ├── __init__.py
    │   ├── spider_man_video.py
    │   └── laf.py
    ├── video
    │   ├── __init__.py
    │   ├── amf.py
    │   └── mpeg_ts.py
    ├── program
    │   ├── __init__.py
    │   ├── prc.py
    │   └── exe_ne.py
    ├── container
    │   └── __init__.py
    ├── __init__.py
    ├── file_system
    │   ├── __init__.py
    │   └── linux_swap.py
    ├── misc
    │   ├── common.py
    │   ├── __init__.py
    │   ├── ole2_util.py
    │   └── hlp.py
    ├── image
    │   ├── __init__.py
    │   ├── common.py
    │   ├── tiff.py
    │   ├── psd.py
    │   ├── pcx.py
    │   ├── tga.py
    │   └── iptc.py
    ├── audio
    │   ├── __init__.py
    │   ├── au.py
    │   ├── real_audio.py
    │   └── aiff.py
    ├── archive
    │   ├── __init__.py
    │   ├── ar.py
    │   ├── mar.py
    │   ├── mozilla_ar.py
    │   └── tar.py
    ├── template.py
    └── guess.py
├── discovery
    ├── __init__.py
    └── googlesearch.py
├── lib
    └── __init__.py
├── hachoir_core
    ├── __init__.py
    ├── stream
    │   ├── stream.py
    │   ├── __init__.py
    │   └── input_helper.py
    ├── version.py
    ├── field
    │   ├── field_set.py
    │   ├── character.py
    │   ├── enum.py
    │   ├── vector.py
    │   ├── parser.py
    │   ├── integer.py
    │   ├── bit_field.py
    │   ├── static_field_set.py
    │   ├── helper.py
    │   ├── byte_field.py
    │   ├── __init__.py
    │   ├── fake_array.py
    │   ├── sub_file.py
    │   ├── timestamp.py
    │   ├── seekable_field_set.py
    │   ├── link.py
    │   ├── float.py
    │   └── padding.py
    ├── endian.py
    ├── language.py
    ├── event_handler.py
    ├── profiler.py
    ├── config.py
    ├── error.py
    ├── cmd_line.py
    ├── text_handler.py
    ├── timeout.py
    ├── memory.py
    └── log.py
├── extractors
    ├── __init__.py
    ├── metadataExtractor.py
    ├── metadataMSOffice.py
    └── metadataPDF.py
├── LICENSES
├── downloader.py
├── processor.py
├── README.md
├── parser.py
├── myparser.py
├── unzip.py
└── htmlExport.py


/pdfminer/cmap/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/hachoir_metadata/qt/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/hachoir_parser/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/discovery/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["googlesearch"]
2 | 


--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["markup","graphs"]
2 | 


--------------------------------------------------------------------------------
/hachoir_metadata/config.py:
--------------------------------------------------------------------------------
1 | MAX_STR_LENGTH = 300  # characters
2 | RAW_OUTPUT = False
3 | 


--------------------------------------------------------------------------------
/hachoir_parser/network/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.network.tcpdump import TcpdumpFile
2 | 
3 | 


--------------------------------------------------------------------------------
/hachoir_core/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_core.version import VERSION as __version__, PACKAGE, WEBSITE, LICENSE
2 | 
3 | 


--------------------------------------------------------------------------------
/pdfminer/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | __version__ = '20110227'
3 | 
4 | if __name__ == '__main__': print __version__
5 | 


--------------------------------------------------------------------------------
/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["metadataExtractor","metadataMSOffice","metadataMSOfficeXML","metadataOpenOffice","metadataPDF"]
2 | 


--------------------------------------------------------------------------------
/hachoir_core/stream/stream.py:
--------------------------------------------------------------------------------
1 | from hachoir_core.error import HachoirError
2 | 
3 | class StreamError(HachoirError):
4 |     pass
5 | 
6 | 


--------------------------------------------------------------------------------
/pdfminer/cmap/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for pdfminer.cmap
 2 | 
 3 | all:
 4 | 
 5 | clean:
 6 | 	-rm *.pyc *.pyo
 7 | 
 8 | cmap_clean:
 9 | 	rm -f *.pickle.gz
10 | 


--------------------------------------------------------------------------------
/hachoir_core/version.py:
--------------------------------------------------------------------------------
1 | PACKAGE = "hachoir-core"
2 | VERSION = "1.3.4"
3 | WEBSITE = 'http://bitbucket.org/haypo/hachoir/wiki/hachoir-core'
4 | LICENSE = 'GNU GPL v2'
5 | 
6 | 


--------------------------------------------------------------------------------
/hachoir_metadata/version.py:
--------------------------------------------------------------------------------
1 | PACKAGE = "hachoir-metadata"
2 | VERSION = "1.3.3"
3 | WEBSITE = "http://bitbucket.org/haypo/hachoir/wiki/hachoir-metadata"
4 | LICENSE = "GNU GPL v2"
5 | 
6 | 


--------------------------------------------------------------------------------
/hachoir_parser/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.3.5"
2 | PACKAGE = "hachoir-parser"
3 | WEBSITE = "http://bitbucket.org/haypo/hachoir/wiki/hachoir-parser"
4 | LICENSE = 'GNU GPL v2'
5 | 
6 | 


--------------------------------------------------------------------------------
/hachoir_parser/game/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.game.zsnes import ZSNESFile
2 | from hachoir_parser.game.spider_man_video import SpiderManVideoFile
3 | from hachoir_parser.game.laf import LafFile
4 | from hachoir_parser.game.blp import BLP1File, BLP2File


--------------------------------------------------------------------------------
/hachoir_parser/video/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.video.asf import AsfFile
2 | from hachoir_parser.video.flv import FlvFile
3 | from hachoir_parser.video.mov import MovFile
4 | from hachoir_parser.video.mpeg_video import MPEGVideoFile
5 | from hachoir_parser.video.mpeg_ts import MPEG_TS
6 | 
7 | 


--------------------------------------------------------------------------------
/hachoir_parser/program/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.program.elf import ElfFile
2 | from hachoir_parser.program.exe import ExeFile
3 | from hachoir_parser.program.python import PythonCompiledFile
4 | from hachoir_parser.program.java import JavaCompiledClassFile
5 | from hachoir_parser.program.prc import PRCFile
6 | 
7 | 


--------------------------------------------------------------------------------
/hachoir_core/field/field_set.py:
--------------------------------------------------------------------------------
1 | from hachoir_core.field import BasicFieldSet, GenericFieldSet
2 | 
3 | class FieldSet(GenericFieldSet):
4 |     def __init__(self, parent, name, *args, **kw):
5 |         assert issubclass(parent.__class__, BasicFieldSet)
6 |         GenericFieldSet.__init__(self, parent, name, parent.stream, *args, **kw)
7 | 
8 | 


--------------------------------------------------------------------------------
/hachoir_parser/common/tracker.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Shared code for tracker parser.
 3 | """
 4 | 
 5 | NOTE_NAME = {}
 6 | NOTES = ("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "G#", "A", "A#", "B")
 7 | for octave in xrange(10):
 8 |     for index, note in enumerate(NOTES):
 9 |         NOTE_NAME[octave*12+index] = "%s (octave %s)" % (note, octave)
10 | 
11 | 


--------------------------------------------------------------------------------
/hachoir_parser/container/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.container.asn1 import ASN1File
2 | from hachoir_parser.container.mkv import MkvFile
3 | from hachoir_parser.container.ogg import OggFile, OggStream
4 | from hachoir_parser.container.riff import RiffFile
5 | from hachoir_parser.container.swf import SwfFile
6 | from hachoir_parser.container.realmedia import RealMediaFile
7 | 
8 | 


--------------------------------------------------------------------------------
/hachoir_core/endian.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Constant values about endian.
 3 | """
 4 | 
 5 | from hachoir_core.i18n import _
 6 | 
 7 | BIG_ENDIAN = "ABCD"
 8 | LITTLE_ENDIAN = "DCBA"
 9 | MIDDLE_ENDIAN = "BADC"
10 | NETWORK_ENDIAN = BIG_ENDIAN
11 | 
12 | endian_name = {
13 |     BIG_ENDIAN: _("Big endian"),
14 |     LITTLE_ENDIAN: _("Little endian"),
15 |     MIDDLE_ENDIAN: _("Middle endian"),
16 | }
17 | 


--------------------------------------------------------------------------------
/hachoir_parser/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.version import __version__
2 | from hachoir_parser.parser import ValidateError, HachoirParser, Parser
3 | from hachoir_parser.parser_list import ParserList, HachoirParserList
4 | from hachoir_parser.guess import (QueryParser, guessParser, createParser)
5 | from hachoir_parser import (archive, audio, container,
6 |     file_system, image, game, misc, network, program, video)
7 | 
8 | 


--------------------------------------------------------------------------------
/hachoir_parser/file_system/__init__.py:
--------------------------------------------------------------------------------
1 | from hachoir_parser.file_system.ext2 import EXT2_FS
2 | from hachoir_parser.file_system.fat import FAT12, FAT16, FAT32
3 | from hachoir_parser.file_system.mbr import MSDos_HardDrive
4 | from hachoir_parser.file_system.ntfs import NTFS
5 | from hachoir_parser.file_system.iso9660 import ISO9660
6 | from hachoir_parser.file_system.reiser_fs import REISER_FS
7 | from hachoir_parser.file_system.linux_swap import LinuxSwapFile
8 | 
9 | 


--------------------------------------------------------------------------------
/hachoir_parser/misc/common.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import StaticFieldSet, Float32
 2 | 
 3 | class Vertex(StaticFieldSet):
 4 |     format = ((Float32, "x"), (Float32, "y"), (Float32, "z"))
 5 | 
 6 |     def createValue(self):
 7 |         return (self["x"].value, self["y"].value, self["z"].value)
 8 | 
 9 | class MapUV(StaticFieldSet):
10 |     format = ((Float32, "u"), (Float32, "v"))
11 | 
12 |     def createValue(self):
13 |         return (self["u"].value, self["v"].value)
14 | 


--------------------------------------------------------------------------------
/hachoir_metadata/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_metadata.version import VERSION as __version__
 2 | from hachoir_metadata.metadata import extractMetadata
 3 | 
 4 | # Just import the module,
 5 | # each module use registerExtractor() method
 6 | import hachoir_metadata.archive
 7 | import hachoir_metadata.audio
 8 | import hachoir_metadata.file_system
 9 | import hachoir_metadata.image
10 | import hachoir_metadata.jpeg
11 | import hachoir_metadata.misc
12 | import hachoir_metadata.program
13 | import hachoir_metadata.riff
14 | import hachoir_metadata.video
15 | 
16 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_parser.image.bmp import BmpFile
 2 | from hachoir_parser.image.gif import GifFile
 3 | from hachoir_parser.image.ico import IcoFile
 4 | from hachoir_parser.image.jpeg import JpegFile
 5 | from hachoir_parser.image.pcx import PcxFile
 6 | from hachoir_parser.image.psd import PsdFile
 7 | from hachoir_parser.image.png import PngFile
 8 | from hachoir_parser.image.tga import TargaFile
 9 | from hachoir_parser.image.tiff import TiffFile
10 | from hachoir_parser.image.wmf import WMF_File
11 | from hachoir_parser.image.xcf import XcfFile
12 | 
13 | 


--------------------------------------------------------------------------------
/hachoir_core/stream/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN
 2 | from hachoir_core.stream.stream import StreamError
 3 | from hachoir_core.stream.input import (
 4 |         InputStreamError,
 5 |         InputStream, InputIOStream, StringInputStream,
 6 |         InputSubStream, InputFieldStream,
 7 |         FragmentedStream, ConcatStream)
 8 | from hachoir_core.stream.input_helper import FileInputStream, guessStreamCharset
 9 | from hachoir_core.stream.output import (OutputStreamError,
10 |         FileOutputStream, StringOutputStream, OutputStream)
11 | 
12 | 


--------------------------------------------------------------------------------
/hachoir_parser/audio/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_parser.audio.aiff import AiffFile
 2 | from hachoir_parser.audio.au import AuFile
 3 | from hachoir_parser.audio.itunesdb import ITunesDBFile
 4 | from hachoir_parser.audio.midi import MidiFile
 5 | from hachoir_parser.audio.mpeg_audio import MpegAudioFile
 6 | from hachoir_parser.audio.real_audio import RealAudioFile
 7 | from hachoir_parser.audio.xm import XMModule
 8 | from hachoir_parser.audio.s3m import S3MModule
 9 | from hachoir_parser.audio.s3m import PTMModule
10 | from hachoir_parser.audio.mod import AmigaModule
11 | from hachoir_parser.audio.flac import FlacParser
12 | 
13 | 


--------------------------------------------------------------------------------
/hachoir_metadata/misc.py.rej:
--------------------------------------------------------------------------------
 1 | ***************
 2 | *** 125,130 ****
 3 |           summary = self.getField(fieldset, "summary[0]")
 4 |           if summary:
 5 |               self.useSummary(summary, False)
 6 |   
 7 |       def getFragment(self, frag):
 8 |           stream = frag.getSubIStream()
 9 | --- 125,133 ----
10 |           summary = self.getField(fieldset, "summary[0]")
11 |           if summary:
12 |               self.useSummary(summary, False)
13 | +         table = self.getField(fieldset, "table1[0]")
14 | +         if table:
15 | +             self.useTable(table)
16 |   
17 |       def getFragment(self, frag):
18 |           stream = frag.getSubIStream()
19 | 


--------------------------------------------------------------------------------
/LICENSES:
--------------------------------------------------------------------------------
 1 | Released under the GPL v 2.0.
 2 | If you did not recieve a copy of the GPL, try http://www.gnu.org/.
 3 | 
 4 | Copyright 2011 Christian Martorella 
 5 | 
 6 | Metagoofil is free software; you can redistribute it and/or modify
 7 | it under the terms of the GNU General Public License as published by
 8 | the Free Software Foundation version 2 of the License.
 9 | 
10 | Metagoofil is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | GNU General Public License for more details.
14 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/hachoir_core/language.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.iso639 import ISO639_2
 2 | 
 3 | class Language:
 4 |     def __init__(self, code):
 5 |         code = str(code)
 6 |         if code not in ISO639_2:
 7 |             raise ValueError("Invalid language code: %r" % code)
 8 |         self.code = code
 9 | 
10 |     def __cmp__(self, other):
11 |         if other.__class__ != Language:
12 |             return 1
13 |         return cmp(self.code, other.code)
14 | 
15 |     def __unicode__(self):
16 |        return ISO639_2[self.code]
17 | 
18 |     def __str__(self):
19 |        return self.__unicode__()
20 | 
21 |     def __repr__(self):
22 |         return "<Language '%s', code=%r>" % (unicode(self), self.code)
23 | 
24 | 


--------------------------------------------------------------------------------
/hachoir_metadata/formatter.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.i18n import _, ngettext
 2 | 
 3 | NB_CHANNEL_NAME = {1: _("mono"), 2: _("stereo")}
 4 | 
 5 | def humanAudioChannel(value):
 6 |     return NB_CHANNEL_NAME.get(value, unicode(value))
 7 | 
 8 | def humanFrameRate(value):
 9 |     if isinstance(value, (int, long, float)):
10 |         return _("%.1f fps") % value
11 |     else:
12 |         return value
13 | 
14 | def humanComprRate(rate):
15 |     return u"%.1fx" % rate
16 | 
17 | def humanAltitude(value):
18 |     return ngettext("%.1f meter", "%.1f meters", value) % value
19 | 
20 | def humanPixelSize(value):
21 |     return ngettext("%s pixel", "%s pixels", value) % value
22 | 
23 | def humanDPI(value):
24 |     return u"%s DPI" % value
25 | 
26 | 


--------------------------------------------------------------------------------
/hachoir_parser/archive/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_parser.archive.ace import AceFile
 2 | from hachoir_parser.archive.ar import ArchiveFile
 3 | from hachoir_parser.archive.bzip2_parser import Bzip2Parser
 4 | from hachoir_parser.archive.cab import CabFile
 5 | from hachoir_parser.archive.gzip_parser import GzipParser
 6 | from hachoir_parser.archive.tar import TarFile
 7 | from hachoir_parser.archive.zip import ZipFile
 8 | from hachoir_parser.archive.rar import RarFile
 9 | from hachoir_parser.archive.rpm import RpmFile
10 | from hachoir_parser.archive.sevenzip import SevenZipParser
11 | from hachoir_parser.archive.mar import MarFile
12 | from hachoir_parser.archive.mozilla_ar import MozillaArchive
13 | from hachoir_parser.archive.zlib import ZlibData
14 | 


--------------------------------------------------------------------------------
/hachoir_core/event_handler.py:
--------------------------------------------------------------------------------
 1 | class EventHandler(object):
 2 |     """
 3 |     Class to connect events to event handlers.
 4 |     """
 5 | 
 6 |     def __init__(self):
 7 |         self.handlers = {}
 8 | 
 9 |     def connect(self, event_name, handler):
10 |         """
11 |         Connect an event handler to an event. Append it to handlers list.
12 |         """
13 |         try:
14 |             self.handlers[event_name].append(handler)
15 |         except KeyError:
16 |             self.handlers[event_name] = [handler]
17 | 
18 |     def raiseEvent(self, event_name, *args):
19 |         """
20 |         Raiser an event: call each handler for this event_name.
21 |         """
22 |         if event_name not in self.handlers:
23 |             return
24 |         for handler in self.handlers[event_name]:
25 |             handler(*args)
26 | 
27 | 


--------------------------------------------------------------------------------
/pdfminer/pdfcolor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | from psparser import LIT
 3 | 
 4 | 
 5 | ##  PDFColorSpace
 6 | ##
 7 | LITERAL_DEVICE_GRAY = LIT('DeviceGray')
 8 | LITERAL_DEVICE_RGB = LIT('DeviceRGB')
 9 | LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')
10 | 
11 | class PDFColorSpace(object):
12 | 
13 |     def __init__(self, name, ncomponents):
14 |         self.name = name
15 |         self.ncomponents = ncomponents
16 |         return
17 | 
18 |     def __repr__(self):
19 |         return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
20 | 
21 | 
22 | PREDEFINED_COLORSPACE = dict(
23 |   (name, PDFColorSpace(name,n)) for (name,n) in {
24 |   'CalRGB': 3,
25 |   'CalGray': 1,
26 |   'Lab': 3,
27 |   'DeviceRGB': 3,
28 |   'DeviceCMYK': 4,
29 |   'DeviceGray': 1,
30 |   'Separation': 1,
31 |   'Indexed': 1,
32 |   'Pattern': 1,
33 |   }.iteritems())
34 | 


--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
 1 | import urllib, os, sys
 2 | 
 3 | class downloader():
 4 |     def __init__(self, url, dir):
 5 |         self.url = url
 6 |         self.dir = dir
 7 |         self.filename = str(url.split("/")[-1])
 8 | 
 9 | #       def dlProgress(count, blockSize, totalSize):
10 | #               percent = int(count*blockSize*100/totalSize)
11 | #               sys.stdout.write("\r" +"test" + "...%d%%" % percent)
12 | #               sys.stdout.flush()
13 | 
14 |     def down(self):
15 |         if os.path.exists(self.dir + "/" + self.filename):
16 |             pass
17 |         else:
18 |             try:
19 |                 urllib.urlretrieve(self.url, self.dir + "/" + self.filename)
20 |             except:
21 |                 print "\t [x] Error downloading " + self.url
22 |                 self.filename = ""
23 | 
24 |     def name(self):
25 |         return self.filename
26 | 


--------------------------------------------------------------------------------
/hachoir_core/field/character.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Character field class: a 8-bit character
 3 | """
 4 | 
 5 | from hachoir_core.field import Bits
 6 | from hachoir_core.endian import BIG_ENDIAN
 7 | from hachoir_core.tools import makePrintable
 8 | 
 9 | class Character(Bits):
10 |     """
11 |     A 8-bit character using ASCII charset for display attribute.
12 |     """
13 |     static_size = 8
14 | 
15 |     def __init__(self, parent, name, description=None):
16 |         Bits.__init__(self, parent, name, 8, description=description)
17 | 
18 |     def createValue(self):
19 |         return chr(self._parent.stream.readBits(
20 |             self.absolute_address, 8, BIG_ENDIAN))
21 | 
22 |     def createRawDisplay(self):
23 |         return unicode(Bits.createValue(self))
24 | 
25 |     def createDisplay(self):
26 |         return makePrintable(self.value, "ASCII", quote="'", to_unicode=True)
27 | 


--------------------------------------------------------------------------------
/hachoir_core/field/enum.py:
--------------------------------------------------------------------------------
 1 | def Enum(field, enum, key_func=None):
 2 |     """
 3 |     Enum is an adapter to another field: it will just change its display
 4 |     attribute. It uses a dictionary to associate a value to another.
 5 | 
 6 |     key_func is an optional function with prototype "def func(key)->key"
 7 |     which is called to transform key.
 8 |     """
 9 |     display = field.createDisplay
10 |     if key_func:
11 |         def createDisplay():
12 |             try:
13 |                 key = key_func(field.value)
14 |                 return enum[key]
15 |             except LookupError:
16 |                 return display()
17 |     else:
18 |         def createDisplay():
19 |             try:
20 |                 return enum[field.value]
21 |             except LookupError:
22 |                 return display()
23 |     field.createDisplay = createDisplay
24 |     field.getEnum = lambda: enum
25 |     return field
26 | 


--------------------------------------------------------------------------------
/hachoir_parser/misc/__init__.py:
--------------------------------------------------------------------------------
 1 | from hachoir_parser.misc.file_3do import File3do
 2 | from hachoir_parser.misc.file_3ds import File3ds
 3 | from hachoir_parser.misc.torrent import TorrentFile
 4 | from hachoir_parser.misc.ttf import TrueTypeFontFile
 5 | from hachoir_parser.misc.chm import ChmFile
 6 | from hachoir_parser.misc.lnk import LnkFile
 7 | from hachoir_parser.misc.pcf import PcfFile
 8 | from hachoir_parser.misc.ole2 import OLE2_File
 9 | from hachoir_parser.misc.pdf import PDFDocument
10 | from hachoir_parser.misc.pifv import PIFVFile
11 | from hachoir_parser.misc.hlp import HlpFile
12 | from hachoir_parser.misc.gnome_keyring import GnomeKeyring
13 | from hachoir_parser.misc.bplist import BPList
14 | from hachoir_parser.misc.dsstore import DSStore
15 | from hachoir_parser.misc.word_doc import WordDocumentParser
16 | from hachoir_parser.misc.word_2 import Word2DocumentParser
17 | from hachoir_parser.misc.mstask import MSTaskFile
18 | 


--------------------------------------------------------------------------------
/hachoir_metadata/safe.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.error import HACHOIR_ERRORS, warning
 2 | 
 3 | def fault_tolerant(func, *args):
 4 |     def safe_func(*args, **kw):
 5 |         try:
 6 |             func(*args, **kw)
 7 |         except HACHOIR_ERRORS, err:
 8 |             warning("Error when calling function %s(): %s" % (
 9 |                 func.__name__, err))
10 |     return safe_func
11 | 
12 | def getFieldAttribute(fieldset, key, attrname):
13 |     try:
14 |         field = fieldset[key]
15 |         if field.hasValue():
16 |             return getattr(field, attrname)
17 |     except HACHOIR_ERRORS, err:
18 |         warning("Unable to get %s of field %s/%s: %s" % (
19 |             attrname, fieldset.path, key, err))
20 |     return None
21 | 
22 | def getValue(fieldset, key):
23 |     return getFieldAttribute(fieldset, key, "value")
24 | 
25 | def getDisplay(fieldset, key):
26 |     return getFieldAttribute(fieldset, key, "display")
27 | 
28 | 


--------------------------------------------------------------------------------
/hachoir_core/profiler.py:
--------------------------------------------------------------------------------
 1 | from hotshot import Profile
 2 | from hotshot.stats import load as loadStats
 3 | from os import unlink
 4 | 
 5 | def runProfiler(func, args=tuple(), kw={}, verbose=True, nb_func=25, sort_by=('cumulative', 'calls')):
 6 |     profile_filename = "/tmp/profiler"
 7 |     prof = Profile(profile_filename)
 8 |     try:
 9 |         if verbose:
10 |             print "[+] Run profiler"
11 |         result = prof.runcall(func, *args, **kw)
12 |         prof.close()
13 |         if verbose:
14 |             print "[+] Stop profiler"
15 |             print "[+] Process data..."
16 |         stat = loadStats(profile_filename)
17 |         if verbose:
18 |             print "[+] Strip..."
19 |         stat.strip_dirs()
20 |         if verbose:
21 |             print "[+] Sort data..."
22 |         stat.sort_stats(*sort_by)
23 |         if verbose:
24 |             print
25 |             print "[+] Display statistics"
26 |             print
27 |         stat.print_stats(nb_func)
28 |         return result
29 |     finally:
30 |         unlink(profile_filename)
31 | 
32 | 


--------------------------------------------------------------------------------
/hachoir_core/config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Configuration of Hachoir
 3 | """
 4 | 
 5 | import os
 6 | 
 7 | # UI: display options
 8 | max_string_length = 40    # Max. length in characters of GenericString.display
 9 | max_byte_length = 14      # Max. length in bytes of RawBytes.display
10 | max_bit_length = 256      # Max. length in bits of RawBits.display
11 | unicode_stdout = True     # Replace stdout and stderr with Unicode compatible objects
12 |                           # Disable it for readline or ipython
13 | 
14 | # Global options
15 | debug = False            # Display many informations usefull to debug
16 | verbose = False          # Display more informations
17 | quiet = True             # Don't display warni
18 | 
19 | # Use internationalization and localization (gettext)?
20 | if os.name == "nt":
21 |     # TODO: Remove this hack and make i18n works on Windows :-)
22 |     use_i18n = False
23 | else:
24 |     use_i18n = True
25 | 
26 | # Parser global options
27 | autofix = True            # Enable Autofix? see hachoir_core.field.GenericFieldSet
28 | check_padding_pattern = True   # Check padding fields pattern?
29 | 
30 | 


--------------------------------------------------------------------------------
/hachoir_metadata/timezone.py:
--------------------------------------------------------------------------------
 1 | from datetime import tzinfo, timedelta
 2 | 
 3 | class TimezoneUTC(tzinfo):
 4 |     """UTC timezone"""
 5 |     ZERO = timedelta(0)
 6 | 
 7 |     def utcoffset(self, dt):
 8 |         return TimezoneUTC.ZERO
 9 | 
10 |     def tzname(self, dt):
11 |         return u"UTC"
12 | 
13 |     def dst(self, dt):
14 |         return TimezoneUTC.ZERO
15 | 
16 |     def __repr__(self):
17 |         return "<TimezoneUTC delta=0, name=u'UTC'>"
18 | 
19 | class Timezone(TimezoneUTC):
20 |     """Fixed offset in hour from UTC."""
21 |     def __init__(self, offset):
22 |         self._offset = timedelta(minutes=offset*60)
23 |         self._name = u"%+03u00" % offset
24 | 
25 |     def utcoffset(self, dt):
26 |         return self._offset
27 | 
28 |     def tzname(self, dt):
29 |         return self._name
30 | 
31 |     def __repr__(self):
32 |         return "<Timezone delta=%s, name='%s'>" % (
33 |             self._offset, self._name)
34 | 
35 | UTC = TimezoneUTC()
36 | 
37 | def createTimezone(offset):
38 |     if offset:
39 |         return Timezone(offset)
40 |     else:
41 |         return UTC
42 | 
43 | 


--------------------------------------------------------------------------------
/hachoir_parser/common/deflate.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import CompressedField
 2 | 
 3 | try:
 4 |     from zlib import decompressobj, MAX_WBITS
 5 | 
 6 |     class DeflateStream:
 7 |         def __init__(self, stream, wbits=None):
 8 |             if wbits:
 9 |                 self.gzip = decompressobj(-MAX_WBITS)
10 |             else:
11 |                 self.gzip = decompressobj()
12 | 
13 |         def __call__(self, size, data=None):
14 |             if data is None:
15 |                 data = ''
16 |             return self.gzip.decompress(self.gzip.unconsumed_tail+data, size)
17 | 
18 |     class DeflateStreamWbits(DeflateStream):
19 |         def __init__(self, stream):
20 |             DeflateStream.__init__(self, stream, True)
21 | 
22 |     def Deflate(field, wbits=True):
23 |         if wbits:
24 |             CompressedField(field, DeflateStreamWbits)
25 |         else:
26 |             CompressedField(field, DeflateStream)
27 |         return field
28 |     has_deflate = True
29 | except ImportError:
30 |     def Deflate(field, wbits=True):
31 |         return field
32 |     has_deflate = False
33 | 
34 | 


--------------------------------------------------------------------------------
/hachoir_metadata/file_system.py:
--------------------------------------------------------------------------------
 1 | from hachoir_metadata.metadata import RootMetadata, registerExtractor
 2 | from hachoir_metadata.safe import fault_tolerant
 3 | from hachoir_parser.file_system import ISO9660
 4 | from datetime import datetime
 5 | 
 6 | class ISO9660_Metadata(RootMetadata):
 7 |     def extract(self, iso):
 8 |         desc = iso['volume[0]/content']
 9 |         self.title = desc['volume_id'].value
10 |         self.title = desc['vol_set_id'].value
11 |         self.author = desc['publisher'].value
12 |         self.author = desc['data_preparer'].value
13 |         self.producer = desc['application'].value
14 |         self.copyright = desc['copyright'].value
15 |         self.readTimestamp('creation_date', desc['creation_ts'].value)
16 |         self.readTimestamp('last_modification', desc['modification_ts'].value)
17 | 
18 |     @fault_tolerant
19 |     def readTimestamp(self, key, value):
20 |         if value.startswith("0000"):
21 |             return
22 |         value = datetime(
23 |             int(value[0:4]), int(value[4:6]), int(value[6:8]),
24 |             int(value[8:10]), int(value[10:12]), int(value[12:14]))
25 |         setattr(self, key, value)
26 | 
27 | registerExtractor(ISO9660, ISO9660_Metadata)
28 | 
29 | 


--------------------------------------------------------------------------------
/pdfminer/arcfour.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | 
 3 | """ Python implementation of Arcfour encryption algorithm.
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | ##  Arcfour
10 | ##
11 | class Arcfour(object):
12 | 
13 |     """
14 |     >>> Arcfour('Key').process('Plaintext').encode('hex')
15 |     'bbf316e8d940af0ad3'
16 |     >>> Arcfour('Wiki').process('pedia').encode('hex')
17 |     '1021bf0420'
18 |     >>> Arcfour('Secret').process('Attack at dawn').encode('hex')
19 |     '45a01f645fc35b383552544b9bf5'
20 |     """
21 | 
22 |     def __init__(self, key):
23 |         s = range(256)
24 |         j = 0
25 |         klen = len(key)
26 |         for i in xrange(256):
27 |             j = (j + s[i] + ord(key[i % klen])) % 256
28 |             (s[i], s[j]) = (s[j], s[i])
29 |         self.s = s
30 |         (self.i, self.j) = (0, 0)
31 |         return
32 | 
33 |     def process(self, data):
34 |         (i, j) = (self.i, self.j)
35 |         s = self.s
36 |         r = ''
37 |         for c in data:
38 |             i = (i+1) % 256
39 |             j = (j+s[i]) % 256
40 |             (s[i], s[j]) = (s[j], s[i])
41 |             k = s[(s[i]+s[j]) % 256]
42 |             r += chr(ord(c) ^ k)
43 |         (self.i, self.j) = (i, j)
44 |         return r
45 | 
46 | # test
47 | if __name__ == '__main__':
48 |     import doctest
49 |     doctest.testmod()
50 | 


--------------------------------------------------------------------------------
/extractors/metadataExtractor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | import sys, re, os, subprocess
 3 | 
 4 | class metaExtractor:
 5 | 	def __init__(self,fname):
 6 | 		self.fname=fname
 7 | 		self.command="extract" #If any error put the full path
 8 | 		self.data=""
 9 | 		self.paths=[]
10 | 		self.users=[]
11 | 
12 | 
13 | 	def runExtract(self):
14 | 		comm=self.command+" "+self.fname
15 | 		try:
16 | 			process = subprocess.Popen([self.command,self.fname], shell=False, stdout=subprocess.PIPE)
17 | 			res=process.communicate()
18 | 			self.data=res[0]
19 | 			return "ok"
20 | 		except:
21 | 			return "error"
22 | 
23 | 	def getData(self):
24 | 		pathre= re.compile('worked on .*')
25 | 		pathre2= re.compile('template -.*')
26 | 		for reg in (pathre,pathre2):
27 | 			path=reg.findall(self.data)
28 | 			if path !=[]:
29 | 				for x in path:
30 | 					try:
31 | 						temp=x.split('\'')[1]
32 | 						if self.paths.count(temp) == 0:
33 | 							self.paths.append(temp)
34 | 					except:
35 | 						pass
36 | 
37 | 		author= re.compile(': Author \'.*\'')
38 | 		authors=author.findall(self.data)
39 | 		if authors !=[]:
40 | 			for x in authors:
41 | 				temp=x.split('\'')[1]
42 | 				temp=temp.replace('\'','')
43 | 				if self.users.count(temp) == 0:
44 | 					self.users.append(temp)
45 | 
46 | 	def getUsers(self):
47 | 		return self.users
48 | 
49 | 	def getPaths(self):
50 | 		return self.paths		
51 | 


--------------------------------------------------------------------------------
/processor.py:
--------------------------------------------------------------------------------
 1 | #Christian Martorella 2011
 2 | '''
 3 | This class will sort the results and create unique list of software, users and paths
 4 | '''
 5 | 
 6 | class processor():
 7 | 		def __init__(self,list):
 8 | 			self.list = list
 9 | 			self.unique_users = []
10 | 			self.unique_soft = []
11 | 			self.stat_soft = []
12 | 			self.unique_paths = []
13 | 
14 | 		def print_all(self):
15 | 			for x in self.list:
16 |  				print x[0]
17 |  				if x[1] != []:
18 | 					print x[1]
19 |  				if x[2] != []:
20 | 					print x[2]
21 |   
22 | 		def sort_users(self):
23 | 			for x in self.list:
24 |  				if x[1]!=[]: 
25 |  					for y in x[1]:
26 |  						if self.unique_users.count(y) != 0:
27 | 							pass
28 | 						else:
29 | 							try:
30 | 								self.unique_users.append(y.lstrip())
31 | 							except:
32 | 								pass
33 | 				else:
34 | 					pass 
35 | 			return self.unique_users
36 | 
37 |  		def sort_software(self):
38 | 			for x in self.list:
39 | 				if x[3]!=[]:
40 |  					for y in x[3]:
41 | 						if self.unique_soft.count(y) != 0:
42 | 							pass
43 | 						else:
44 | 							try:
45 | 								self.unique_soft.append(y.lstrip())
46 | 							except Exception, e:
47 | 								pass
48 | 				else:
49 | 					pass
50 | 			return self.unique_soft
51 | 
52 | 		def sort_paths(self):
53 | 			for x in self.list:
54 | 				if x[2]!=[]:
55 |  					for y in x[2]:
56 | 						if self.unique_paths.count(y) != 0:
57 | 							pass
58 | 						else:
59 | 							self.unique_paths.append(y)
60 | 			return self.unique_paths
61 | 


--------------------------------------------------------------------------------
/hachoir_parser/misc/ole2_util.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN
 2 | from hachoir_core.field import RawBytes, RootSeekableFieldSet, ParserError
 3 | from hachoir_parser import HachoirParser
 4 | 
 5 | class OLE2FragmentParser(HachoirParser,RootSeekableFieldSet):
 6 |     tags = {
 7 |         "description": "Microsoft Office document subfragments",
 8 |     }
 9 |     endian = LITTLE_ENDIAN
10 | 
11 |     ENDIAN_CHECK=False
12 | 
13 |     def __init__(self, stream, **args):
14 |         RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
15 |         HachoirParser.__init__(self, stream, **args)
16 |         if self.ENDIAN_CHECK:
17 |             if self["endian"].value == "\xFF\xFE":
18 |                 self.endian = BIG_ENDIAN
19 |             elif self["endian"].value == "\xFE\xFF":
20 |                 self.endian = LITTLE_ENDIAN
21 |             else:
22 |                 raise ParserError("OLE2: Invalid endian value")
23 | 
24 |     def validate(self):
25 |         if self.ENDIAN_CHECK:
26 |             if self["endian"].value not in ["\xFF\xFE", "\xFE\xFF"]:
27 |                 return "Unknown endian value %s"%self["endian"].value.encode('hex')
28 |         return True
29 | 
30 | class RawParser(OLE2FragmentParser):
31 |     ENDIAN_CHECK=False
32 |     OS_CHECK=False
33 |     def createFields(self):
34 |         yield RawBytes(self,"rawdata",self.datasize)
35 |         if self.datasize<self.size//8: yield RawBytes(self,"slack_space",(self.size//8)-self.datasize)
36 | 


--------------------------------------------------------------------------------
/hachoir_core/field/vector.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import Field, FieldSet, ParserError
 2 | 
 3 | class GenericVector(FieldSet):
 4 |     def __init__(self, parent, name, nb_items, item_class, item_name="item", description=None):
 5 |         # Sanity checks
 6 |         assert issubclass(item_class, Field)
 7 |         assert isinstance(item_class.static_size, (int, long))
 8 |         if not(0 < nb_items):
 9 |             raise ParserError('Unable to create empty vector "%s" in %s' \
10 |                 % (name, parent.path))
11 |         size = nb_items * item_class.static_size
12 |         self.__nb_items = nb_items
13 |         self._item_class = item_class
14 |         self._item_name = item_name
15 |         FieldSet.__init__(self, parent, name, description, size=size)
16 | 
17 |     def __len__(self):
18 |         return self.__nb_items
19 | 
20 |     def createFields(self):
21 |         name = self._item_name + "[]"
22 |         parser = self._item_class
23 |         for index in xrange(len(self)):
24 |             yield parser(self, name)
25 | 
26 | class UserVector(GenericVector):
27 |     """
28 |     To implement:
29 |     - item_name: name of a field without [] (eg. "color" becomes "color[0]"),
30 |       default value is "item"
31 |     - item_class: class of an item
32 |     """
33 |     item_class = None
34 |     item_name = "item"
35 | 
36 |     def __init__(self, parent, name, nb_items, description=None):
37 |         GenericVector.__init__(self, parent, name, nb_items, self.item_class, self.item_name, description)
38 | 
39 | 


--------------------------------------------------------------------------------
/hachoir_core/error.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Functions to display an error (error, warning or information) message.
 3 | """
 4 | 
 5 | from hachoir_core.log import log
 6 | from hachoir_core.tools import makePrintable
 7 | import sys, traceback
 8 | 
 9 | def getBacktrace(empty="Empty backtrace."):
10 |     """
11 |     Try to get backtrace as string.
12 |     Returns "Error while trying to get backtrace" on failure.
13 |     """
14 |     try:
15 |         info = sys.exc_info()
16 |         trace = traceback.format_exception(*info)
17 |         sys.exc_clear()
18 |         if trace[0] != "None\n":
19 |             return "".join(trace)
20 |     except:
21 |         # No i18n here (imagine if i18n function calls error...)
22 |         return "Error while trying to get backtrace"
23 |     return empty
24 | 
25 | class HachoirError(Exception):
26 |     """
27 |     Parent of all errors in Hachoir library
28 |     """
29 |     def __init__(self, message):
30 |         message_bytes = makePrintable(message, "ASCII")
31 |         Exception.__init__(self, message_bytes)
32 |         self.text = message
33 | 
34 |     def __unicode__(self):
35 |         return self.text
36 | 
37 | # Error classes which may be raised by Hachoir core
38 | # FIXME: Add EnvironmentError (IOError or OSError) and AssertionError?
39 | # FIXME: Remove ArithmeticError and RuntimeError?
40 | HACHOIR_ERRORS = (HachoirError, LookupError, NameError, AttributeError,
41 |     TypeError, ValueError, ArithmeticError, RuntimeError)
42 | 
43 | info    = log.info
44 | warning = log.warning
45 | error   = log.error
46 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Metagoofil
 2 | ==========
 3 | 
 4 | This is an unofficial fork of Metagoofil by Edge Security
 5 | 
 6 | What is this?
 7 | -------------
 8 | 
 9 | Metagoofil is a tool for extracting metadata of public documents (pdf,doc,xls,ppt,etc) availables in the target websites.This information could be useful because you can get valid usernames, people names, for using later in bruteforce password attacks (vpn, ftp, webapps), the tool will also extracts interesting "paths" of the documents, where we can get shared resources names, server names, etc.
10 | 
11 | This new version will also extract emails addresses from PDF and Word documents content.
12 | 
13 | How it works?
14 | -------------
15 | 
16 | The tool first perform a query in Google requesting different filetypes that can have useful metadata (pdf, doc, xls,ppt,etc), then will download those documents to the disk and extracts the metadata of the file using specific libraries for parsing different file types (Hachoir, Pdfminer, etc)
17 | 
18 | 
19 | Dependencies:
20 | -------------
21 | Python2
22 | 
23 | Usage
24 | -----
25 | 	-d: domain to search
26 | 	-t: filetype to download (pdf,doc,xls,ppt,odp,ods,docx,xlsx,pptx)
27 | 	-l: limit of results to search (default 200)
28 | 	-h: work with documents in directory (use "yes" for local analysis)
29 | 	-n: limit of files to download
30 | 	-o: working directory (location to save downloaded files)
31 | 	-f: output file
32 | 
33 | 	Examples:
34 | 		metagoofil.py -d apple.com -t doc,pdf -l 200 -n 50 -o applefiles -f results.html
35 | 		metagoofil.py -h yes -o applefiles -f results.html (local dir analysis)
36 | 


--------------------------------------------------------------------------------
/discovery/googlesearch.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | import httplib, sys
 3 | import myparser
 4 | import re
 5 | import time
 6 | 
 7 | class search_google:
 8 | 	def __init__(self,word,limit,start,filetype):
 9 | 		self.word=word
10 | 		self.results=""
11 | 		self.totalresults=""
12 | 		self.filetype=filetype
13 | 		self.server="www.google.com"
14 | 		self.hostname="www.google.com"
15 | 		self.userAgent="(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6"
16 | 		self.quantity="100"
17 | 		self.limit=limit
18 | 		self.counter=start
19 | 		
20 | 	def do_search_files(self):
21 | 		h = httplib.HTTP(self.server)
22 | 		h.putrequest('GET', "/search?num="+self.quantity+"&start=" + str(self.counter) + "&hl=en&meta=&q=filetype:"+self.filetype+"%20site:" + self.word)
23 | 		h.putheader('Host', self.hostname)
24 | 		h.putheader('User-agent', self.userAgent)	
25 | 		h.endheaders()
26 | 		returncode, returnmsg, headers = h.getreply()
27 | 		self.results = h.getfile().read()
28 | 		self.totalresults+= self.results
29 | 
30 | 	def get_emails(self):
31 | 		rawres=myparser.parser(self.totalresults,self.word)
32 | 		return rawres.emails()
33 | 	
34 | 	def get_hostnames(self):
35 | 		rawres=myparser.parser(self.totalresults,self.word)
36 | 		return rawres.hostnames()
37 | 	
38 | 	def get_files(self):
39 | 		rawres=myparser.parser(self.totalresults,self.word)
40 | 		return rawres.fileurls()
41 | 	
42 | 	def process_files(self):
43 | 		while self.counter < self.limit:
44 | 			self.do_search_files()
45 | 			time.sleep(1)
46 | 			self.counter+=100
47 | 			print "\tSearching "+ str(self.counter) + " results..."
48 | 
49 | 


--------------------------------------------------------------------------------
/hachoir_core/cmd_line.py:
--------------------------------------------------------------------------------
 1 | from optparse import OptionGroup
 2 | from hachoir_core.log import log
 3 | from hachoir_core.i18n import _, getTerminalCharset
 4 | from hachoir_core.tools import makePrintable
 5 | import hachoir_core.config as config
 6 | 
 7 | def getHachoirOptions(parser):
 8 |     """
 9 |     Create an option group (type optparse.OptionGroup) of Hachoir
10 |     library options.
11 |     """
12 |     def setLogFilename(*args):
13 |         log.setFilename(args[2])
14 | 
15 |     common = OptionGroup(parser, _("Hachoir library"), \
16 |         "Configure Hachoir library")
17 |     common.add_option("--verbose", help=_("Verbose mode"),
18 |         default=False, action="store_true")
19 |     common.add_option("--log", help=_("Write log in a file"),
20 |         type="string", action="callback", callback=setLogFilename)
21 |     common.add_option("--quiet", help=_("Quiet mode (don't display warning)"),
22 |         default=False, action="store_true")
23 |     common.add_option("--debug", help=_("Debug mode"),
24 |         default=False, action="store_true")
25 |     return common
26 | 
27 | def configureHachoir(option):
28 |     # Configure Hachoir using "option" (value from optparse)
29 |     if option.quiet:
30 |       config.quiet = True
31 |     if option.verbose:
32 |       config.verbose = True
33 |     if option.debug:
34 |       config.debug = True
35 | 
36 | def unicodeFilename(filename, charset=None):
37 |     if not charset:
38 |         charset = getTerminalCharset()
39 |     try:
40 |         return unicode(filename, charset)
41 |     except UnicodeDecodeError:
42 |         return makePrintable(filename, charset, to_unicode=True)
43 | 
44 | 


--------------------------------------------------------------------------------
/hachoir_core/field/parser.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN, MIDDLE_ENDIAN
 2 | from hachoir_core.field import GenericFieldSet
 3 | from hachoir_core.log import Logger
 4 | import hachoir_core.config as config
 5 | 
 6 | class Parser(GenericFieldSet):
 7 |     """
 8 |     A parser is the root of all other fields. It create first level of fields
 9 |     and have special attributes and methods:
10 |     - endian: Byte order (L{BIG_ENDIAN}, L{LITTLE_ENDIAN} or L{MIDDLE_ENDIAN}) of input data ;
11 |     - stream: Data input stream (set in L{__init__()}) ;
12 |     - size: Field set size will be size of input stream.
13 |     """
14 | 
15 |     def __init__(self, stream, description=None):
16 |         """
17 |         Parser constructor
18 | 
19 |         @param stream: Data input stream (see L{InputStream})
20 |         @param description: (optional) String description
21 |         """
22 |         # Check arguments
23 |         assert hasattr(self, "endian") \
24 |             and self.endian in (BIG_ENDIAN, LITTLE_ENDIAN, MIDDLE_ENDIAN)
25 | 
26 |         # Call parent constructor
27 |         GenericFieldSet.__init__(self, None, "root", stream, description, stream.askSize(self))
28 | 
29 |     def _logger(self):
30 |         return Logger._logger(self)
31 | 
32 |     def _setSize(self, size):
33 |         self._truncate(size)
34 |         self.raiseEvent("field-resized", self)
35 |     size = property(lambda self: self._size, doc="Size in bits")
36 | 
37 |     path = property(lambda self: "/")
38 | 
39 |     # dummy definition to prevent hachoir-core from depending on hachoir-parser
40 |     autofix = property(lambda self: config.autofix)
41 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/common.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import FieldSet, UserVector, UInt8
 2 | 
 3 | class RGB(FieldSet):
 4 |     color_name = {
 5 |         (  0,   0,   0): "Black",
 6 |         (255,   0,   0): "Red",
 7 |         (  0, 255,   0): "Green",
 8 |         (  0,   0, 255): "Blue",
 9 |         (255, 255, 255): "White",
10 |     }
11 |     static_size = 24
12 | 
13 |     def createFields(self):
14 |         yield UInt8(self, "red", "Red")
15 |         yield UInt8(self, "green", "Green")
16 |         yield UInt8(self, "blue", "Blue")
17 | 
18 |     def createDescription(self):
19 |         rgb = self["red"].value, self["green"].value, self["blue"].value
20 |         name = self.color_name.get(rgb)
21 |         if not name:
22 |             name = "#%02X%02X%02X" % rgb
23 |         return "RGB color: " + name
24 | 
25 | class RGBA(RGB):
26 |     static_size = 32
27 | 
28 |     def createFields(self):
29 |         yield UInt8(self, "red", "Red")
30 |         yield UInt8(self, "green", "Green")
31 |         yield UInt8(self, "blue", "Blue")
32 |         yield UInt8(self, "alpha", "Alpha")
33 | 
34 |     def createDescription(self):
35 |         description = RGB.createDescription(self)
36 |         opacity = self["alpha"].value*100/255
37 |         return "%s (opacity: %s%%)" % (description, opacity)
38 | 
39 | class PaletteRGB(UserVector):
40 |     item_class = RGB
41 |     item_name = "color"
42 |     def createDescription(self):
43 |         return "Palette of %u RGB colors" % len(self)
44 | 
45 | class PaletteRGBA(PaletteRGB):
46 |     item_class = RGBA
47 |     def createDescription(self):
48 |         return "Palette of %u RGBA colors" % len(self)
49 | 
50 | 


--------------------------------------------------------------------------------
/hachoir_core/stream/input_helper.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.i18n import getTerminalCharset, guessBytesCharset, _
 2 | from hachoir_core.stream import InputIOStream, InputSubStream, InputStreamError
 3 | 
 4 | def FileInputStream(filename, real_filename=None, **args):
 5 |     """
 6 |     Create an input stream of a file. filename must be unicode.
 7 | 
 8 |     real_filename is an optional argument used to specify the real filename,
 9 |     its type can be 'str' or 'unicode'. Use real_filename when you are
10 |     not able to convert filename to real unicode string (ie. you have to
11 |     use unicode(name, 'replace') or unicode(name, 'ignore')).
12 |     """
13 |     assert isinstance(filename, unicode)
14 |     if not real_filename:
15 |         real_filename = filename
16 |     try:
17 |         inputio = open(real_filename, 'rb')
18 |     except IOError, err:
19 |         charset = getTerminalCharset()
20 |         errmsg = unicode(str(err), charset)
21 |         raise InputStreamError(_("Unable to open file %s: %s") % (filename, errmsg))
22 |     source = "file:" + filename
23 |     offset = args.pop("offset", 0)
24 |     size = args.pop("size", None)
25 |     if offset or size:
26 |         if size:
27 |             size = 8 * size
28 |         stream = InputIOStream(inputio, source=source, **args)
29 |         return InputSubStream(stream, 8 * offset, size, **args)
30 |     else:
31 |         args.setdefault("tags",[]).append(("filename", filename))
32 |         return InputIOStream(inputio, source=source, **args)
33 | 
34 | def guessStreamCharset(stream, address, size, default=None):
35 |     size = min(size, 1024*8)
36 |     bytes = stream.readBytes(address, size//8)
37 |     return guessBytesCharset(bytes, default)
38 | 
39 | 


--------------------------------------------------------------------------------
/hachoir_parser/common/msdos.py:
--------------------------------------------------------------------------------
 1 | """
 2 | MS-DOS structures.
 3 | 
 4 | Documentation:
 5 | - File attributes:
 6 |   http://www.cs.colorado.edu/~main/cs1300/include/ddk/winddk.h
 7 | """
 8 | 
 9 | from hachoir_core.field import StaticFieldSet
10 | from hachoir_core.field import Bit, NullBits
11 | 
12 | _FIELDS = (
13 |     (Bit, "read_only"),
14 |     (Bit, "hidden"),
15 |     (Bit, "system"),
16 |     (NullBits, "reserved[]", 1),
17 |     (Bit, "directory"),
18 |     (Bit, "archive"),
19 |     (Bit, "device"),
20 |     (Bit, "normal"),
21 |     (Bit, "temporary"),
22 |     (Bit, "sparse_file"),
23 |     (Bit, "reparse_file"),
24 |     (Bit, "compressed"),
25 |     (Bit, "offline"),
26 |     (Bit, "dont_index_content"),
27 |     (Bit, "encrypted"),
28 | )
29 | 
30 | class MSDOSFileAttr16(StaticFieldSet):
31 |     """
32 |     MSDOS 16-bit file attributes
33 |     """
34 |     format = _FIELDS + ((NullBits, "reserved[]", 1),)
35 | 
36 |     _text_keys = (
37 |         # Sort attributes by importance
38 |         "directory", "read_only", "compressed",
39 |         "hidden", "system",
40 |         "normal", "device",
41 |         "temporary", "archive")
42 | 
43 |     def createValue(self):
44 |         mode = []
45 |         for name in self._text_keys:
46 |             if self[name].value:
47 |                 if 4 <= len(mode):
48 |                     mode.append("...")
49 |                     break
50 |                 else:
51 |                     mode.append(name)
52 |         if mode:
53 |             return ", ".join(mode)
54 |         else:
55 |             return "(none)"
56 | 
57 | class MSDOSFileAttr32(MSDOSFileAttr16):
58 |     """
59 |     MSDOS 32-bit file attributes
60 |     """
61 |     format = _FIELDS + ((NullBits, "reserved[]", 17),)
62 | 
63 | 


--------------------------------------------------------------------------------
/pdfminer/encodingdb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | 
 3 | import re
 4 | from psparser import PSLiteral
 5 | from glyphlist import glyphname2unicode
 6 | from latin_enc import ENCODING
 7 | 
 8 | 
 9 | ##  name2unicode
10 | ##
11 | STRIP_NAME = re.compile(r'[0-9]+')
12 | def name2unicode(name):
13 |     """Converts Adobe glyph names to Unicode numbers."""
14 |     if name in glyphname2unicode:
15 |         return glyphname2unicode[name]
16 |     m = STRIP_NAME.search(name)
17 |     if not m: raise KeyError(name)
18 |     return unichr(int(m.group(0)))
19 | 
20 | 
21 | ##  EncodingDB
22 | ##
23 | class EncodingDB(object):
24 | 
25 |     std2unicode = {}
26 |     mac2unicode = {}
27 |     win2unicode = {}
28 |     pdf2unicode = {}
29 |     for (name,std,mac,win,pdf) in ENCODING:
30 |         c = name2unicode(name)
31 |         if std: std2unicode[std] = c
32 |         if mac: mac2unicode[mac] = c
33 |         if win: win2unicode[win] = c
34 |         if pdf: pdf2unicode[pdf] = c
35 | 
36 |     encodings = {
37 |       'StandardEncoding': std2unicode,
38 |       'MacRomanEncoding': mac2unicode,
39 |       'WinAnsiEncoding': win2unicode,
40 |       'PDFDocEncoding': pdf2unicode,
41 |       }
42 | 
43 |     @classmethod
44 |     def get_encoding(klass, name, diff=None):
45 |         cid2unicode = klass.encodings.get(name, klass.std2unicode)
46 |         if diff:
47 |             cid2unicode = cid2unicode.copy()
48 |             cid = 0
49 |             for x in diff:
50 |                 if isinstance(x, int):
51 |                     cid = x
52 |                 elif isinstance(x, PSLiteral):
53 |                     try:
54 |                         cid2unicode[cid] = name2unicode(x.name)
55 |                     except KeyError:
56 |                         pass
57 |                     cid += 1
58 |         return cid2unicode
59 | 


--------------------------------------------------------------------------------
/hachoir_metadata/qt/dialog.ui:
--------------------------------------------------------------------------------
 1 | <ui version="4.0" >
 2 |  <class>Form</class>
 3 |  <widget class="QWidget" name="Form" >
 4 |   <property name="geometry" >
 5 |    <rect>
 6 |     <x>0</x>
 7 |     <y>0</y>
 8 |     <width>441</width>
 9 |     <height>412</height>
10 |    </rect>
11 |   </property>
12 |   <property name="windowTitle" >
13 |    <string>hachoir-metadata</string>
14 |   </property>
15 |   <layout class="QVBoxLayout" name="verticalLayout" >
16 |    <item>
17 |     <layout class="QHBoxLayout" name="horizontalLayout_2" >
18 |      <item>
19 |       <widget class="QPushButton" name="open_button" >
20 |        <property name="text" >
21 |         <string>Open</string>
22 |        </property>
23 |       </widget>
24 |      </item>
25 |      <item>
26 |       <widget class="QComboBox" name="files_combo" >
27 |        <property name="sizePolicy" >
28 |         <sizepolicy vsizetype="Fixed" hsizetype="Expanding" >
29 |          <horstretch>0</horstretch>
30 |          <verstretch>0</verstretch>
31 |         </sizepolicy>
32 |        </property>
33 |       </widget>
34 |      </item>
35 |     </layout>
36 |    </item>
37 |    <item>
38 |     <widget class="QTableWidget" name="metadata_table" >
39 |      <property name="alternatingRowColors" >
40 |       <bool>true</bool>
41 |      </property>
42 |      <property name="showGrid" >
43 |       <bool>false</bool>
44 |      </property>
45 |      <property name="rowCount" >
46 |       <number>0</number>
47 |      </property>
48 |      <property name="columnCount" >
49 |       <number>0</number>
50 |      </property>
51 |     </widget>
52 |    </item>
53 |    <item>
54 |     <widget class="QPushButton" name="quit_button" >
55 |      <property name="text" >
56 |       <string>Quit</string>
57 |      </property>
58 |     </widget>
59 |    </item>
60 |   </layout>
61 |  </widget>
62 |  <resources/>
63 |  <connections/>
64 | </ui>
65 | 


--------------------------------------------------------------------------------
/pdfminer/runlength.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | #
 3 | # RunLength decoder (Adobe version) implementation based on PDF Reference
 4 | # version 1.4 section 3.3.4.
 5 | #
 6 | #  * public domain *
 7 | #
 8 | 
 9 | import sys
10 | 
11 | def rldecode(data):
12 |     """
13 |     RunLength decoder (Adobe version) implementation based on PDF Reference
14 |     version 1.4 section 3.3.4:
15 |         The RunLengthDecode filter decodes data that has been encoded in a
16 |         simple byte-oriented format based on run length. The encoded data
17 |         is a sequence of runs, where each run consists of a length byte
18 |         followed by 1 to 128 bytes of data. If the length byte is in the
19 |         range 0 to 127, the following length + 1 (1 to 128) bytes are
20 |         copied literally during decompression. If length is in the range
21 |         129 to 255, the following single byte is to be copied 257 - length
22 |         (2 to 128) times during decompression. A length value of 128
23 |         denotes EOD.
24 |     >>> s = "\x05123456\xfa7\x04abcde\x80junk"
25 |     >>> rldecode(s)
26 |     '1234567777777abcde'
27 |     """
28 |     decoded = []
29 |     i=0
30 |     while i < len(data):
31 |         #print "data[%d]=:%d:" % (i,ord(data[i]))
32 |         length = ord(data[i])
33 |         if length == 128:
34 |             break
35 |         if length >= 0 and length < 128:
36 |             run = data[i+1:(i+1)+(length+1)]
37 |             #print "length=%d, run=%s" % (length+1,run)
38 |             decoded.append(run)
39 |             i = (i+1) + (length+1)
40 |         if length > 128:
41 |             run = data[i+1]*(257-length)
42 |             #print "length=%d, run=%s" % (257-length,run)
43 |             decoded.append(run)
44 |             i = (i+1) + 1
45 |     return ''.join(decoded)
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     import doctest
50 |     doctest.testmod()
51 | 


--------------------------------------------------------------------------------
/hachoir_metadata/filter.py:
--------------------------------------------------------------------------------
 1 | from hachoir_metadata.timezone import UTC
 2 | from datetime import date, datetime
 3 | 
 4 | # Year in 1850..2030
 5 | MIN_YEAR = 1850
 6 | MAX_YEAR = 2030
 7 | 
 8 | class Filter:
 9 |     def __init__(self, valid_types, min=None, max=None):
10 |         self.types = valid_types
11 |         self.min = min
12 |         self.max = max
13 | 
14 |     def __call__(self, value):
15 |         if not isinstance(value, self.types):
16 |             return True
17 |         if self.min is not None and value < self.min:
18 |             return False
19 |         if self.max is not None and self.max < value:
20 |             return False
21 |         return True
22 | 
23 | class NumberFilter(Filter):
24 |     def __init__(self, min=None, max=None):
25 |         Filter.__init__(self, (int, long, float), min, max)
26 | 
27 | class DatetimeFilter(Filter):
28 |     def __init__(self, min=None, max=None):
29 |         Filter.__init__(self, (date, datetime),
30 |             datetime(MIN_YEAR, 1, 1),
31 |             datetime(MAX_YEAR, 12, 31))
32 |         self.min_date = date(MIN_YEAR, 1, 1)
33 |         self.max_date = date(MAX_YEAR, 12, 31)
34 |         self.min_tz = datetime(MIN_YEAR, 1, 1, tzinfo=UTC)
35 |         self.max_tz = datetime(MAX_YEAR, 12, 31, tzinfo=UTC)
36 | 
37 |     def __call__(self, value):
38 |         """
39 |         Use different min/max values depending on value type
40 |         (datetime with timezone, datetime or date).
41 |         """
42 |         if not isinstance(value, self.types):
43 |             return True
44 |         if hasattr(value, "tzinfo") and value.tzinfo:
45 |             return (self.min_tz <= value <= self.max_tz)
46 |         elif isinstance(value, datetime):
47 |             return (self.min <= value <= self.max)
48 |         else:
49 |             return (self.min_date <= value <= self.max_date)
50 | 
51 | DATETIME_FILTER = DatetimeFilter()
52 | 
53 | 


--------------------------------------------------------------------------------
/hachoir_parser/archive/ar.py:
--------------------------------------------------------------------------------
 1 | """
 2 | GNU ar archive : archive file (.a) and Debian (.deb) archive.
 3 | """
 4 | 
 5 | from hachoir_parser import Parser
 6 | from hachoir_core.field import (FieldSet, ParserError,
 7 |     String, RawBytes, UnixLine)
 8 | from hachoir_core.endian import BIG_ENDIAN
 9 | 
10 | class ArchiveFileEntry(FieldSet):
11 |     def createFields(self):
12 |         yield UnixLine(self, "header", "Header")
13 |         info = self["header"].value.split()
14 |         if len(info) != 7:
15 |             raise ParserError("Invalid file entry header")
16 |         size = int(info[5])
17 |         if 0 < size:
18 |             yield RawBytes(self, "content", size, "File data")
19 | 
20 |     def createDescription(self):
21 |         return "File entry (%s)" % self["header"].value.split()[0]
22 | 
23 | class ArchiveFile(Parser):
24 |     endian = BIG_ENDIAN
25 |     MAGIC = '!<arch>\n'
26 |     PARSER_TAGS = {
27 |         "id": "unix_archive",
28 |         "category": "archive",
29 |         "file_ext": ("a", "deb"),
30 |         "mime":
31 |             (u"application/x-debian-package",
32 |              u"application/x-archive",
33 |              u"application/x-dpkg"),
34 |         "min_size": (8 + 13)*8, # file signature + smallest file as possible
35 |         "magic": ((MAGIC, 0),),
36 |         "description": "Unix archive"
37 |     }
38 | 
39 |     def validate(self):
40 |         if self.stream.readBytes(0, len(self.MAGIC)) != self.MAGIC:
41 |             return "Invalid magic string"
42 |         return True
43 | 
44 |     def createFields(self):
45 |         yield String(self, "id", 8, "Unix archive identifier (\"<!arch>\")", charset="ASCII")
46 |         while not self.eof:
47 |             data = self.stream.readBytes(self.current_size, 1)
48 |             if data == "\n":
49 |                 yield RawBytes(self, "empty_line[]", 1, "Empty line")
50 |             else:
51 |                 yield ArchiveFileEntry(self, "file[]", "File")
52 | 
53 | 


--------------------------------------------------------------------------------
/hachoir_metadata/history.patch:
--------------------------------------------------------------------------------
 1 | Index: misc.py
 2 | ===================================================================
 3 | --- misc.py	(revision 2947)
 4 | +++ misc.py	(working copy)
 5 | @@ -125,6 +125,9 @@
 6 |          summary = self.getField(fieldset, "summary[0]")
 7 |          if summary:
 8 |              self.useSummary(summary, False)
 9 | +        table = self.getField(fieldset, "table1[0]")
10 | +        if table:
11 | +            self.useTable(table)
12 |  
13 |      def getFragment(self, frag):
14 |          stream = frag.getSubIStream()
15 | @@ -161,6 +164,13 @@
16 |          self.comment = "Encrypted: %s" % doc["FIB/fEncrypted"].value
17 |  
18 |      @fault_tolerant
19 | +    def useTable(self, table):
20 | +        if 'SttbSavedBy' in table:
21 | +            arr = list(table['SttbSavedBy'].array('string'))
22 | +            for i in xrange(0, len(arr), 2):
23 | +                self.revision_history = "Revision #%d: Author '%s', file '%s'"%(i//2, arr[i].value, arr[i+1].value)
24 | +
25 | +    @fault_tolerant
26 |      def useProperty(self, summary, property, is_doc_summary):
27 |          field = summary.getFieldByAddress(property["offset"].value*8)
28 |          if not field \
29 | Index: register.py
30 | ===================================================================
31 | --- register.py	(revision 2986)
32 | +++ register.py	(working copy)
33 | @@ -78,6 +78,7 @@
34 |          filter=DATETIME_FILTER, type=(datetime, date), conversion=setDatetime))
35 |      meta.register(Data("last_modification", 501, _("Last modification"), text_handler=humanDatetime,
36 |          filter=DATETIME_FILTER, type=(datetime, date), conversion=setDatetime))
37 | +    meta.register(Data("revision_history", 502, _("Revision history"), type=unicode))
38 |      meta.register(Data("latitude", 510, _("Latitude"), type=float))
39 |      meta.register(Data("longitude", 511, _("Longitude"), type=float))
40 |      meta.register(Data("altitude", 511, _("Altitude"), type=float, text_handler=humanAltitude))
41 | 


--------------------------------------------------------------------------------
/hachoir_core/field/integer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Integer field classes:
 3 | - UInt8, UInt16, UInt24, UInt32, UInt64: unsigned integer of 8, 16, 32, 64 bits ;
 4 | - Int8, Int16, Int24, Int32, Int64: signed integer of 8, 16, 32, 64 bits.
 5 | """
 6 | 
 7 | from hachoir_core.field import Bits, FieldError
 8 | 
 9 | class GenericInteger(Bits):
10 |     """
11 |     Generic integer class used to generate other classes.
12 |     """
13 |     def __init__(self, parent, name, signed, size, description=None):
14 |         if not (8 <= size <= 16384):
15 |             raise FieldError("Invalid integer size (%s): have to be in 8..16384" % size)
16 |         Bits.__init__(self, parent, name, size, description)
17 |         self.signed = signed
18 | 
19 |     def createValue(self):
20 |         return self._parent.stream.readInteger(
21 |             self.absolute_address, self.signed, self._size, self._parent.endian)
22 | 
23 | def integerFactory(name, is_signed, size, doc):
24 |     class Integer(GenericInteger):
25 |         __doc__ = doc
26 |         static_size = size
27 |         def __init__(self, parent, name, description=None):
28 |             GenericInteger.__init__(self, parent, name, is_signed, size, description)
29 |     cls = Integer
30 |     cls.__name__ = name
31 |     return cls
32 | 
33 | UInt8 = integerFactory("UInt8", False, 8, "Unsigned integer of 8 bits")
34 | UInt16 = integerFactory("UInt16", False, 16, "Unsigned integer of 16 bits")
35 | UInt24 = integerFactory("UInt24", False, 24, "Unsigned integer of 24 bits")
36 | UInt32 = integerFactory("UInt32", False, 32, "Unsigned integer of 32 bits")
37 | UInt64 = integerFactory("UInt64", False, 64, "Unsigned integer of 64 bits")
38 | 
39 | Int8 = integerFactory("Int8", True, 8, "Signed integer of 8 bits")
40 | Int16 = integerFactory("Int16", True, 16, "Signed integer of 16 bits")
41 | Int24 = integerFactory("Int24", True, 24, "Signed integer of 24 bits")
42 | Int32 = integerFactory("Int32", True, 32, "Signed integer of 32 bits")
43 | Int64 = integerFactory("Int64", True, 64, "Signed integer of 64 bits")
44 | 
45 | 


--------------------------------------------------------------------------------
/hachoir_core/field/bit_field.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Bit sized classes:
 3 | - Bit: Single bit, value is False or True ;
 4 | - Bits: Integer with a size in bits ;
 5 | - RawBits: unknown content with a size in bits.
 6 | """
 7 | 
 8 | from hachoir_core.field import Field
 9 | from hachoir_core.i18n import _
10 | from hachoir_core import config
11 | 
12 | class RawBits(Field):
13 |     """
14 |     Unknown content with a size in bits.
15 |     """
16 |     static_size = staticmethod(lambda *args, **kw: args[1])
17 | 
18 |     def __init__(self, parent, name, size, description=None):
19 |         """
20 |         Constructor: see L{Field.__init__} for parameter description
21 |         """
22 |         Field.__init__(self, parent, name, size, description)
23 | 
24 |     def hasValue(self):
25 |         return True
26 | 
27 |     def createValue(self):
28 |         return self._parent.stream.readBits(
29 |             self.absolute_address, self._size, self._parent.endian)
30 | 
31 |     def createDisplay(self):
32 |         if self._size < config.max_bit_length:
33 |             return unicode(self.value)
34 |         else:
35 |             return _("<%s size=%u>" %
36 |                 (self.__class__.__name__, self._size))
37 |     createRawDisplay = createDisplay
38 | 
39 | class Bits(RawBits):
40 |     """
41 |     Positive integer with a size in bits
42 | 
43 |     @see: L{Bit}
44 |     @see: L{RawBits}
45 |     """
46 |     pass
47 | 
48 | class Bit(RawBits):
49 |     """
50 |     Single bit: value can be False or True, and size is exactly one bit.
51 | 
52 |     @see: L{Bits}
53 |     """
54 |     static_size = 1
55 | 
56 |     def __init__(self, parent, name, description=None):
57 |         """
58 |         Constructor: see L{Field.__init__} for parameter description
59 |         """
60 |         RawBits.__init__(self, parent, name, 1, description=description)
61 | 
62 |     def createValue(self):
63 |         return 1 == self._parent.stream.readBits(
64 |                 self.absolute_address, 1, self._parent.endian)
65 | 
66 |     def createRawDisplay(self):
67 |         return unicode(int(self.value))
68 | 
69 | 


--------------------------------------------------------------------------------
/hachoir_core/field/static_field_set.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import FieldSet, ParserError
 2 | 
 3 | class StaticFieldSet(FieldSet):
 4 |     """
 5 |     Static field set: format class attribute is a tuple of all fields
 6 |     in syntax like:
 7 |        format = (
 8 |           (TYPE1, ARG1, ARG2, ...),
 9 |           (TYPE2, ARG1, ARG2, ..., {KEY1=VALUE1, ...}),
10 |           ...
11 |        )
12 | 
13 |     Types with dynamic size are forbidden, eg. CString, PascalString8, etc.
14 |     """
15 |     format = None  # You have to redefine this class variable
16 |     _class = None
17 | 
18 |     def __new__(cls, *args, **kw):
19 |         assert cls.format is not None, "Class attribute 'format' is not set"
20 |         if cls._class is not cls.__name__:
21 |             cls._class = cls.__name__
22 |             cls.static_size = cls._computeStaticSize()
23 |         return object.__new__(cls, *args, **kw)
24 | 
25 |     @staticmethod
26 |     def _computeItemSize(item):
27 |         item_class = item[0]
28 |         if item_class.static_size is None:
29 |             raise ParserError("Unable to get static size of field type: %s"
30 |                 % item_class.__name__)
31 |         if callable(item_class.static_size):
32 |             if isinstance(item[-1], dict):
33 |                 return item_class.static_size(*item[1:-1], **item[-1])
34 |             else:
35 |                 return item_class.static_size(*item[1:])
36 |         else:
37 |             assert isinstance(item_class.static_size, (int, long))
38 |             return item_class.static_size
39 | 
40 |     def createFields(self):
41 |         for item in self.format:
42 |             if isinstance(item[-1], dict):
43 |                 yield item[0](self, *item[1:-1], **item[-1])
44 |             else:
45 |                 yield item[0](self, *item[1:])
46 | 
47 |     @classmethod
48 |     def _computeStaticSize(cls, *args):
49 |         return sum(cls._computeItemSize(item) for item in cls.format)
50 | 
51 |     # Initial value of static_size, it changes when first instance
52 |     # is created (see __new__)
53 |     static_size = _computeStaticSize
54 | 
55 | 


--------------------------------------------------------------------------------
/hachoir_core/field/helper.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import (FieldError,
 2 |     RawBits, RawBytes,
 3 |     PaddingBits, PaddingBytes,
 4 |     NullBits, NullBytes,
 5 |     GenericString, GenericInteger)
 6 | from hachoir_core.stream import FileOutputStream
 7 | 
 8 | def createRawField(parent, size, name="raw[]", description=None):
 9 |     if size <= 0:
10 |         raise FieldError("Unable to create raw field of %s bits" % size)
11 |     if (size % 8) == 0:
12 |         return RawBytes(parent, name, size/8, description)
13 |     else:
14 |         return RawBits(parent, name, size, description)
15 | 
16 | def createPaddingField(parent, nbits, name="padding[]", description=None):
17 |     if nbits <= 0:
18 |         raise FieldError("Unable to create padding of %s bits" % nbits)
19 |     if (nbits % 8) == 0:
20 |         return PaddingBytes(parent, name, nbits/8, description)
21 |     else:
22 |         return PaddingBits(parent, name, nbits, description)
23 | 
24 | def createNullField(parent, nbits, name="padding[]", description=None):
25 |     if nbits <= 0:
26 |         raise FieldError("Unable to create null padding of %s bits" % nbits)
27 |     if (nbits % 8) == 0:
28 |         return NullBytes(parent, name, nbits/8, description)
29 |     else:
30 |         return NullBits(parent, name, nbits, description)
31 | 
32 | def isString(field):
33 |     return issubclass(field.__class__, GenericString)
34 | 
35 | def isInteger(field):
36 |     return issubclass(field.__class__, GenericInteger)
37 | 
38 | def writeIntoFile(fieldset, filename):
39 |     output = FileOutputStream(filename)
40 |     fieldset.writeInto(output)
41 | 
42 | def createOrphanField(fieldset, address, field_cls, *args, **kw):
43 |     """
44 |     Create an orphan field at specified address:
45 |       field_cls(fieldset, *args, **kw)
46 | 
47 |     The field uses the fieldset properties but it isn't added to the
48 |     field set.
49 |     """
50 |     save_size = fieldset._current_size
51 |     try:
52 |         fieldset._current_size = address
53 |         field = field_cls(fieldset, *args, **kw)
54 |     finally:
55 |         fieldset._current_size = save_size
56 |     return field
57 | 
58 | 


--------------------------------------------------------------------------------
/hachoir_parser/template.py:
--------------------------------------------------------------------------------
 1 | """
 2 | ====================== 8< ============================
 3 | This file is an Hachoir parser template. Make a copy
 4 | of it, and adapt it to your needs.
 5 | 
 6 | You have to replace all "TODO" with you code.
 7 | ====================== 8< ============================
 8 | 
 9 | TODO parser.
10 | 
11 | Author: TODO TODO
12 | Creation date: YYYY-mm-DD
13 | """
14 | 
15 | # TODO: Just keep what you need
16 | from hachoir_parser import Parser
17 | from hachoir_core.field import (ParserError,
18 |     UInt8, UInt16, UInt32, String, RawBytes)
19 | from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
20 | 
21 | class TODOFile(Parser):
22 |     PARSER_TAGS = {
23 |         "id": "TODO",
24 |         "category": "TODO",    # "archive", "audio", "container", ...
25 |         "file_ext": ("TODO",), # TODO: Example ("bmp",) to parse the file "image.bmp"
26 |         "mime": (u"TODO"),      # TODO: Example: "image/png"
27 |         "min_size": 0,         # TODO: Minimum file size (x bits, or x*8 in bytes)
28 |         "description": "TODO", # TODO: Example: "A bitmap picture"
29 |     }
30 | 
31 | #    TODO: Choose between little or big endian
32 | #    endian = LITTLE_ENDIAN
33 | #    endian = BIG_ENDIAN
34 | 
35 |     def validate(self):
36 |         # TODO: Check that file looks like your format
37 |         # Example: check first two bytes
38 |         # return (self.stream.readBytes(0, 2) == 'BM')
39 |         return False
40 | 
41 |     def createFields(self):
42 |         # TODO: Write your parser using this model:
43 |         # yield UInt8(self, "name1", "description1")
44 |         # yield UInt16(self, "name2", "description2")
45 |         # yield UInt32(self, "name3", "description3")
46 |         # yield String(self, "name4", 1, "description4") # TODO: add ", charset="ASCII")"
47 |         # yield String(self, "name5", 1, "description5", charset="ASCII")
48 |         # yield String(self, "name6", 1, "description6", charset="ISO-8859-1")
49 | 
50 |         # Read rest of the file (if any)
51 |         # TODO: You may remove this code
52 |         if self.current_size < self._size:
53 |             yield self.seekBit(self._size, "end")
54 | 
55 | 


--------------------------------------------------------------------------------
/hachoir_core/text_handler.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utilities used to convert a field to human classic reprentation of data.
 3 | """
 4 | 
 5 | from hachoir_core.tools import (
 6 |     humanDuration, humanFilesize, alignValue,
 7 |     durationWin64 as doDurationWin64,
 8 |     deprecated)
 9 | from types import FunctionType, MethodType
10 | from hachoir_core.field import Field
11 | 
12 | def textHandler(field, handler):
13 |     assert isinstance(handler, (FunctionType, MethodType))
14 |     assert issubclass(field.__class__, Field)
15 |     field.createDisplay = lambda: handler(field)
16 |     return field
17 | 
18 | def displayHandler(field, handler):
19 |     assert isinstance(handler, (FunctionType, MethodType))
20 |     assert issubclass(field.__class__, Field)
21 |     field.createDisplay = lambda: handler(field.value)
22 |     return field
23 | 
24 | @deprecated("Use TimedeltaWin64 field type")
25 | def durationWin64(field):
26 |     """
27 |     Convert Windows 64-bit duration to string. The timestamp format is
28 |     a 64-bit number: number of 100ns. See also timestampWin64().
29 | 
30 |     >>> durationWin64(type("", (), dict(value=2146280000, size=64)))
31 |     u'3 min 34 sec 628 ms'
32 |     >>> durationWin64(type("", (), dict(value=(1 << 64)-1, size=64)))
33 |     u'58494 years 88 days 5 hours'
34 |     """
35 |     assert hasattr(field, "value") and hasattr(field, "size")
36 |     assert field.size == 64
37 |     delta = doDurationWin64(field.value)
38 |     return humanDuration(delta)
39 | 
40 | def filesizeHandler(field):
41 |     """
42 |     Format field value using humanFilesize()
43 |     """
44 |     return displayHandler(field, humanFilesize)
45 | 
46 | def hexadecimal(field):
47 |     """
48 |     Convert an integer to hexadecimal in lower case. Returns unicode string.
49 | 
50 |     >>> hexadecimal(type("", (), dict(value=412, size=16)))
51 |     u'0x019c'
52 |     >>> hexadecimal(type("", (), dict(value=0, size=32)))
53 |     u'0x00000000'
54 |     """
55 |     assert hasattr(field, "value") and hasattr(field, "size")
56 |     size = field.size
57 |     padding = alignValue(size, 4) // 4
58 |     pattern = u"0x%%0%ux" % padding
59 |     return pattern % field.value
60 | 
61 | 


--------------------------------------------------------------------------------
/extractors/metadataMSOffice.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.error import HachoirError
 2 | from hachoir_core.cmd_line import unicodeFilename
 3 | from hachoir_parser import createParser
 4 | from hachoir_core.tools import makePrintable
 5 | from hachoir_metadata import extractMetadata
 6 | from hachoir_core.i18n import getTerminalCharset
 7 | from sys import argv, stderr, exit
 8 | 
 9 | class metaMs2k:
10 | 	def __init__(self,filename):
11 | 		self.filename=filename
12 | 		self.users=[]
13 | 		self.paths=[]
14 | 		self.software=[]
15 | 		self.modification=[]
16 | 		self.creationDate=[]
17 | 		self.lastPrinted=[]
18 | 		self.raw=""
19 | 
20 | 	def getData(self):
21 | 		filename, realname = unicodeFilename(self.filename), self.filename
22 | 		try:
23 | 			parser = createParser(filename, realname)
24 | 		except:
25 | 			return "error"
26 | 		try:
27 | 			metadata = extractMetadata(parser)
28 | 		except HachoirError, err:
29 | 			print "Metadata extraction error: %s" % unicode(err)
30 | 			metadata = None
31 | 		if not metadata:
32 | 			print "Unable to extract metadata on file: " + self.filename
33 | 		else:
34 | 			text = metadata.exportPlaintext()
35 | 			charset = getTerminalCharset()
36 | 			for line in text:
37 | 				res=line.split(":")
38 | 				if res[0]=="- Author":
39 | 						self.users.append(res[1])
40 | 				elif res[1]==" Author:":
41 | 						self.users.append(res[2])
42 | 				elif res[0]=="- Producer":
43 | 						self.software.append(res[1])
44 | 				elif res[0]=="- Creation date":
45 | 						self.creationDate.append(res[1])
46 | 				elif res[0]=="- Last modification":
47 | 						self.modification.append(res[1])
48 | 				elif res[1]==" Template":
49 | 						xres= line.replace("- Comment: Template:","")
50 | 						self.paths.append(xres)
51 | 				elif res[1]==" LastSavedBy":
52 | 				#		print res[1] + res[2]
53 | 						self.users.append(res[2])
54 | 				elif res[1]==" LastPrinted":
55 | 						self.lastPrinted.append(res[2])
56 | 				elif res[0]=="- Revision history":
57 | 						#self.paths.append(res[2])
58 | 						res2=line.split(",")
59 | 						self.paths.append(res2[1].split("file ")[1])
60 | 				self.raw=text
61 | 		return "ok"
62 | 
63 | 	def getUsers(self):
64 | 		return self.users
65 | 	def getSoftware(self):
66 | 		return self.software
67 | 	def getPaths(self):
68 | 		return self.paths
69 | 	def getRaw(self):
70 | 		return self.raw
71 | 


--------------------------------------------------------------------------------
/hachoir_core/timeout.py:
--------------------------------------------------------------------------------
 1 | """
 2 | limitedTime(): set a timeout in seconds when calling a function,
 3 | raise a Timeout error if time exceed.
 4 | """
 5 | from math import ceil
 6 | 
 7 | IMPLEMENTATION = None
 8 | 
 9 | class Timeout(RuntimeError):
10 |     """
11 |     Timeout error, inherits from RuntimeError
12 |     """
13 |     pass
14 | 
15 | def signalHandler(signum, frame):
16 |     """
17 |     Signal handler to catch timeout signal: raise Timeout exception.
18 |     """
19 |     raise Timeout("Timeout exceed!")
20 | 
21 | def limitedTime(second, func, *args, **kw):
22 |     """
23 |     Call func(*args, **kw) with a timeout of second seconds.
24 |     """
25 |     return func(*args, **kw)
26 | 
27 | def fixTimeout(second):
28 |     """
29 |     Fix timeout value: convert to integer with a minimum of 1 second
30 |     """
31 |     if isinstance(second, float):
32 |         second = int(ceil(second))
33 |     assert isinstance(second, (int, long))
34 |     return max(second, 1)
35 | 
36 | if not IMPLEMENTATION:
37 |     try:
38 |         from signal import signal, alarm, SIGALRM
39 | 
40 |         # signal.alarm() implementation
41 |         def limitedTime(second, func, *args, **kw):
42 |             second = fixTimeout(second)
43 |             old_alarm = signal(SIGALRM, signalHandler)
44 |             try:
45 |                 alarm(second)
46 |                 return func(*args, **kw)
47 |             finally:
48 |                 alarm(0)
49 |                 signal(SIGALRM, old_alarm)
50 | 
51 |         IMPLEMENTATION = "signal.alarm()"
52 |     except ImportError:
53 |         pass
54 | 
55 | if not IMPLEMENTATION:
56 |     try:
57 |         from signal import signal, SIGXCPU
58 |         from resource import getrlimit, setrlimit, RLIMIT_CPU
59 | 
60 |         # resource.setrlimit(RLIMIT_CPU) implementation
61 |         # "Bug": timeout is 'CPU' time so sleep() are not part of the timeout
62 |         def limitedTime(second, func, *args, **kw):
63 |             second = fixTimeout(second)
64 |             old_alarm = signal(SIGXCPU, signalHandler)
65 |             current = getrlimit(RLIMIT_CPU)
66 |             try:
67 |                 setrlimit(RLIMIT_CPU, (second, current[1]))
68 |                 return func(*args, **kw)
69 |             finally:
70 |                 setrlimit(RLIMIT_CPU, current)
71 |                 signal(SIGXCPU, old_alarm)
72 | 
73 |         IMPLEMENTATION = "resource.setrlimit(RLIMIT_CPU)"
74 |     except ImportError:
75 |         pass
76 | 
77 | 


--------------------------------------------------------------------------------
/hachoir_parser/archive/mar.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Microsoft Archive parser
 3 | 
 4 | Author: Victor Stinner
 5 | Creation date: 2007-03-04
 6 | """
 7 | 
 8 | MAX_NB_FILE = 100000
 9 | 
10 | from hachoir_parser import Parser
11 | from hachoir_core.field import FieldSet, String, UInt32, SubFile
12 | from hachoir_core.endian import LITTLE_ENDIAN
13 | from hachoir_core.text_handler import textHandler, filesizeHandler, hexadecimal
14 | 
15 | class FileIndex(FieldSet):
16 |     static_size = 68*8
17 | 
18 |     def createFields(self):
19 |         yield String(self, "filename", 56, truncate="\0", charset="ASCII")
20 |         yield filesizeHandler(UInt32(self, "filesize"))
21 |         yield textHandler(UInt32(self, "crc32"), hexadecimal)
22 |         yield UInt32(self, "offset")
23 | 
24 |     def createDescription(self):
25 |         return "File %s (%s) at %s" % (
26 |             self["filename"].value, self["filesize"].display, self["offset"].value)
27 | 
28 | class MarFile(Parser):
29 |     MAGIC = "MARC"
30 |     PARSER_TAGS = {
31 |         "id": "mar",
32 |         "category": "archive",
33 |         "file_ext": ("mar",),
34 |         "min_size": 80*8,  # At least one file index
35 |         "magic": ((MAGIC, 0),),
36 |         "description": "Microsoft Archive",
37 |     }
38 |     endian = LITTLE_ENDIAN
39 | 
40 |     def validate(self):
41 |         if self.stream.readBytes(0, 4) != self.MAGIC:
42 |             return "Invalid magic"
43 |         if self["version"].value != 3:
44 |             return "Invalid version"
45 |         if not(1 <= self["nb_file"].value <= MAX_NB_FILE):
46 |             return "Invalid number of file"
47 |         return True
48 | 
49 |     def createFields(self):
50 |         yield String(self, "magic", 4, "File signature (MARC)", charset="ASCII")
51 |         yield UInt32(self, "version")
52 |         yield UInt32(self, "nb_file")
53 |         files = []
54 |         for index in xrange(self["nb_file"].value):
55 |             item = FileIndex(self, "file[]")
56 |             yield item
57 |             if item["filesize"].value:
58 |                 files.append(item)
59 |         files.sort(key=lambda item: item["offset"].value)
60 |         for index in files:
61 |             padding = self.seekByte(index["offset"].value)
62 |             if padding:
63 |                 yield padding
64 |             size = index["filesize"].value
65 |             desc = "File %s" % index["filename"].value
66 |             yield SubFile(self, "data[]", size, desc, filename=index["filename"].value)
67 | 
68 | 


--------------------------------------------------------------------------------
/hachoir_core/field/byte_field.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Very basic field: raw content with a size in byte. Use this class for
 3 | unknown content.
 4 | """
 5 | 
 6 | from hachoir_core.field import Field, FieldError
 7 | from hachoir_core.tools import makePrintable
 8 | from hachoir_core.bits import str2hex
 9 | from hachoir_core import config
10 | 
11 | MAX_LENGTH = (2**64)
12 | 
13 | class RawBytes(Field):
14 |     """
15 |     Byte vector of unknown content
16 | 
17 |     @see: L{Bytes}
18 |     """
19 |     static_size = staticmethod(lambda *args, **kw: args[1]*8)
20 | 
21 |     def __init__(self, parent, name, length, description="Raw data"):
22 |         assert issubclass(parent.__class__, Field)
23 |         if not(0 < length <= MAX_LENGTH):
24 |             raise FieldError("Invalid RawBytes length (%s)!" % length)
25 |         Field.__init__(self, parent, name, length*8, description)
26 |         self._display = None
27 | 
28 |     def _createDisplay(self, human):
29 |         max_bytes = config.max_byte_length
30 |         if type(self._getValue) is type(lambda: None):
31 |             display = self.value[:max_bytes]
32 |         else:
33 |             if self._display is None:
34 |                 address = self.absolute_address
35 |                 length = min(self._size / 8, max_bytes)
36 |                 self._display = self._parent.stream.readBytes(address, length)
37 |             display = self._display
38 |         truncated = (8 * len(display) < self._size)
39 |         if human:
40 |             if truncated:
41 |                 display += "(...)"
42 |             return makePrintable(display, "latin-1", quote='"', to_unicode=True)
43 |         else:
44 |             display = str2hex(display, format=r"\x%02x")
45 |             if truncated:
46 |                 return '"%s(...)"' % display
47 |             else:
48 |                 return '"%s"' % display
49 | 
50 |     def createDisplay(self):
51 |         return self._createDisplay(True)
52 | 
53 |     def createRawDisplay(self):
54 |         return self._createDisplay(False)
55 | 
56 |     def hasValue(self):
57 |         return True
58 | 
59 |     def createValue(self):
60 |         assert (self._size % 8) == 0
61 |         if self._display:
62 |             self._display = None
63 |         return self._parent.stream.readBytes(
64 |             self.absolute_address, self._size / 8)
65 | 
66 | class Bytes(RawBytes):
67 |     """
68 |     Byte vector: can be used for magic number or GUID/UUID for example.
69 | 
70 |     @see: L{RawBytes}
71 |     """
72 |     pass
73 | 
74 | 


--------------------------------------------------------------------------------
/hachoir_core/field/__init__.py:
--------------------------------------------------------------------------------
 1 | # Field classes
 2 | from hachoir_core.field.field import Field, FieldError, MissingField, joinPath
 3 | from hachoir_core.field.bit_field import Bit, Bits, RawBits
 4 | from hachoir_core.field.byte_field import Bytes, RawBytes
 5 | from hachoir_core.field.sub_file import SubFile, CompressedField
 6 | from hachoir_core.field.character import Character
 7 | from hachoir_core.field.integer import (
 8 |     Int8,  Int16,  Int24,  Int32,  Int64,
 9 |     UInt8, UInt16, UInt24, UInt32, UInt64,
10 |     GenericInteger)
11 | from hachoir_core.field.enum import Enum
12 | from hachoir_core.field.string_field import (GenericString,
13 |     String, CString, UnixLine,
14 |     PascalString8, PascalString16, PascalString32)
15 | from hachoir_core.field.padding import (PaddingBits, PaddingBytes,
16 |     NullBits, NullBytes)
17 | 
18 | # Functions
19 | from hachoir_core.field.helper import (isString, isInteger,
20 |     createPaddingField, createNullField, createRawField,
21 |     writeIntoFile, createOrphanField)
22 | 
23 | # FieldSet classes
24 | from hachoir_core.field.fake_array import FakeArray
25 | from hachoir_core.field.basic_field_set import (BasicFieldSet,
26 |     ParserError, MatchError)
27 | from hachoir_core.field.generic_field_set import GenericFieldSet
28 | from hachoir_core.field.seekable_field_set import SeekableFieldSet, RootSeekableFieldSet
29 | from hachoir_core.field.field_set import FieldSet
30 | from hachoir_core.field.static_field_set import StaticFieldSet
31 | from hachoir_core.field.parser import Parser
32 | from hachoir_core.field.vector import GenericVector, UserVector
33 | 
34 | # Complex types
35 | from hachoir_core.field.float import Float32, Float64, Float80
36 | from hachoir_core.field.timestamp import (GenericTimestamp,
37 |     TimestampUnix32, TimestampUnix64, TimestampMac32, TimestampUUID60, TimestampWin64,
38 |     DateTimeMSDOS32, TimeDateMSDOS32, TimedeltaWin64)
39 | 
40 | # Special Field classes
41 | from hachoir_core.field.link import Link, Fragment
42 | 
43 | available_types = (
44 |     Bit, Bits, RawBits,
45 |     Bytes, RawBytes,
46 |     SubFile,
47 |     Character,
48 |     Int8, Int16, Int24, Int32, Int64,
49 |     UInt8, UInt16, UInt24, UInt32, UInt64,
50 |     String, CString, UnixLine,
51 |     PascalString8, PascalString16, PascalString32,
52 |     Float32, Float64,
53 |     PaddingBits, PaddingBytes,
54 |     NullBits, NullBytes,
55 |     TimestampUnix32, TimestampMac32, TimestampWin64,
56 |     DateTimeMSDOS32, TimeDateMSDOS32,
57 | #    GenericInteger, GenericString,
58 | )
59 | 
60 | 


--------------------------------------------------------------------------------
/hachoir_parser/game/spider_man_video.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Parser for an obscure FMV file format: bin files from the game
 3 | "The Amazing Spider-Man vs. The Kingpin" (Sega CD)
 4 | 
 5 | Author: Mike Melanson
 6 | Creation date: 2006-09-30
 7 | File samples: http://samples.mplayerhq.hu/game-formats/spiderman-segacd-bin/
 8 | """
 9 | 
10 | from hachoir_parser import Parser
11 | from hachoir_core.field import FieldSet, UInt32, String, RawBytes
12 | from hachoir_core.endian import BIG_ENDIAN
13 | from hachoir_core.text_handler import textHandler, hexadecimal
14 | 
15 | class Chunk(FieldSet):
16 |     tag_info = {
17 |         "CONF" : ("conf[]", None, "Configuration header"),
18 |         "AUDI" : ("audio[]", None, "Audio chunk"),
19 |         "SYNC" : ("sync[]", None, "Start of video frame data"),
20 |         "IVRA" : ("ivra[]", None, "Vector codebook (?)"),
21 |         "VRAM" : ("video[]", None, "Video RAM tile pattern"),
22 |         "CRAM" : ("color[]", None, "Color RAM (palette)"),
23 |         "CEND" : ("video_end[]", None, "End of video data"),
24 |         "MEND" : ("end_file", None, "End of file"),
25 |     }
26 | 
27 |     def __init__(self, *args):
28 |         FieldSet.__init__(self, *args)
29 |         self._size = self["length"].value * 8
30 |         fourcc = self["fourcc"].value
31 |         if fourcc in self.tag_info:
32 |             self._name, self._parser, self._description = self.tag_info[fourcc]
33 |         else:
34 |             self._parser = None
35 |             self._description = "Unknown chunk: fourcc %s" % self["fourcc"].display
36 | 
37 |     def createFields(self):
38 |         yield String(self, "fourcc", 4, "FourCC", charset="ASCII")
39 |         yield textHandler(UInt32(self, "length", "length"), hexadecimal)
40 |         size = self["length"].value - 8
41 |         if 0 < size:
42 |             if self._parser:
43 |                 for field in self._parser(self, size):
44 |                     yield field
45 |             else:
46 |                 yield RawBytes(self, "data", size)
47 | 
48 | class SpiderManVideoFile(Parser):
49 |     PARSER_TAGS = {
50 |         "id": "spiderman_video",
51 |         "category": "game",
52 |         "file_ext": ("bin",),
53 |         "min_size": 8*8,
54 |         "description": "The Amazing Spider-Man vs. The Kingpin (Sega CD) FMV video"
55 |     }
56 | 
57 |     endian = BIG_ENDIAN
58 | 
59 |     def validate(self):
60 |         return (self.stream.readBytes(0, 4) == 'CONF')
61 | 
62 |     def createFields(self):
63 |         while not self.eof:
64 |             yield Chunk(self, "chunk[]")
65 | 
66 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/tiff.py:
--------------------------------------------------------------------------------
 1 | """
 2 | TIFF image parser.
 3 | 
 4 | Authors: Victor Stinner, Sebastien Ponce, Robert Xiao
 5 | Creation date: 30 september 2006
 6 | """
 7 | 
 8 | from hachoir_parser import Parser
 9 | from hachoir_core.field import FieldSet, SeekableFieldSet, RootSeekableFieldSet, Bytes
10 | from hachoir_core.endian import LITTLE_ENDIAN, BIG_ENDIAN
11 | from hachoir_parser.image.exif import TIFF
12 | 
13 | def getStrips(ifd):
14 |     data = {}
15 |     for i, entry in enumerate(ifd.array('entry')):
16 |         data[entry['tag'].display] = entry
17 |     # image data
18 |     if "StripOffsets" in data and "StripByteCounts" in data:
19 |         offs = ifd.getEntryValues(data["StripOffsets"])
20 |         bytes = ifd.getEntryValues(data["StripByteCounts"])
21 |         for off, byte in zip(offs, bytes):
22 |             yield off.value, byte.value
23 | 
24 | class ImageFile(SeekableFieldSet):
25 |     def __init__(self, parent, name, description, ifd):
26 |         SeekableFieldSet.__init__(self, parent, name, description, None)
27 |         self._ifd = ifd
28 | 
29 |     def createFields(self):
30 |         for off, byte in getStrips(self._ifd):
31 |             self.seekByte(off, relative=False)
32 |             yield Bytes(self, "strip[]", byte)
33 | 
34 | class TiffFile(RootSeekableFieldSet, Parser):
35 |     PARSER_TAGS = {
36 |         "id": "tiff",
37 |         "category": "image",
38 |         "file_ext": ("tif", "tiff"),
39 |         "mime": (u"image/tiff",),
40 |         "min_size": 8*8,
41 |         "magic": (("II\x2A\0", 0), ("MM\0\x2A", 0)),
42 |         "description": "TIFF picture"
43 |     }
44 | 
45 |     # Correct endian is set in constructor
46 |     endian = LITTLE_ENDIAN
47 | 
48 |     def __init__(self, stream, **args):
49 |         RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
50 |         if self.stream.readBytes(0, 2) == "MM":
51 |             self.endian = BIG_ENDIAN
52 |         Parser.__init__(self, stream, **args)
53 | 
54 |     def validate(self):
55 |         endian = self.stream.readBytes(0, 2)
56 |         if endian not in ("MM", "II"):
57 |             return "Invalid endian (%r)" % endian
58 |         if self["version"].value != 42:
59 |             return "Unknown TIFF version"
60 |         return True
61 | 
62 |     def createFields(self):
63 |         for field in TIFF(self):
64 |             yield field
65 | 
66 |         for ifd in self.array('ifd'):
67 |             offs = (off for off, byte in getStrips(ifd))
68 |             self.seekByte(min(offs), relative=False)
69 |             image = ImageFile(self, "image[]", "Image File", ifd)
70 |             yield image
71 | 


--------------------------------------------------------------------------------
/hachoir_core/field/fake_array.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | from hachoir_core.field import MissingField
 3 | 
 4 | class FakeArray:
 5 |     """
 6 |     Simulate an array for GenericFieldSet.array(): fielset.array("item")[0] is
 7 |     equivalent to fielset.array("item[0]").
 8 | 
 9 |     It's possible to iterate over the items using::
10 | 
11 |         for element in fieldset.array("item"):
12 |             ...
13 | 
14 |     And to get array size using len(fieldset.array("item")).
15 |     """
16 |     def __init__(self, fieldset, name):
17 |         pos = name.rfind("/")
18 |         if pos != -1:
19 |             self.fieldset = fieldset[name[:pos]]
20 |             self.name = name[pos+1:]
21 |         else:
22 |             self.fieldset = fieldset
23 |             self.name = name
24 |         self._format = "%s[%%u]" % self.name
25 |         self._cache = {}
26 |         self._known_size = False
27 |         self._max_index = -1
28 | 
29 |     def __nonzero__(self):
30 |         "Is the array empty or not?"
31 |         if self._cache:
32 |             return True
33 |         else:
34 |             return (0 in self)
35 | 
36 |     def __len__(self):
37 |         "Number of fields in the array"
38 |         total = self._max_index+1
39 |         if not self._known_size:
40 |             for index in itertools.count(total):
41 |                 try:
42 |                     field = self[index]
43 |                     total += 1
44 |                 except MissingField:
45 |                     break
46 |         return total
47 | 
48 |     def __contains__(self, index):
49 |         try:
50 |             field = self[index]
51 |             return True
52 |         except MissingField:
53 |             return False
54 | 
55 |     def __getitem__(self, index):
56 |         """
57 |         Get a field of the array. Returns a field, or raise MissingField
58 |         exception if the field doesn't exist.
59 |         """
60 |         try:
61 |             value = self._cache[index]
62 |         except KeyError:
63 |             try:
64 |                 value = self.fieldset[self._format % index]
65 |             except MissingField:
66 |                 self._known_size = True
67 |                 raise
68 |             self._cache[index] = value
69 |             self._max_index = max(index, self._max_index)
70 |         return value
71 | 
72 |     def __iter__(self):
73 |         """
74 |         Iterate in the fields in their index order: field[0], field[1], ...
75 |         """
76 |         for index in itertools.count(0):
77 |             try:
78 |                 yield self[index]
79 |             except MissingField:
80 |                 raise StopIteration()
81 | 
82 | 


--------------------------------------------------------------------------------
/hachoir_parser/archive/mozilla_ar.py:
--------------------------------------------------------------------------------
 1 | """MAR (Mozilla ARchive) parser
 2 | 
 3 | Author: Robert Xiao
 4 | Creation date: July 10, 2007
 5 | 
 6 | """
 7 | 
 8 | from hachoir_core.endian import BIG_ENDIAN
 9 | from hachoir_core.field import (RootSeekableFieldSet, FieldSet,
10 |     String, CString, UInt32, RawBytes)
11 | from hachoir_core.text_handler import displayHandler, filesizeHandler
12 | from hachoir_core.tools import humanUnixAttributes
13 | from hachoir_parser import HachoirParser
14 | 
15 | class IndexEntry(FieldSet):
16 |     def createFields(self):
17 |         yield UInt32(self, "offset", "Offset in bytes relative to start of archive")
18 |         yield filesizeHandler(UInt32(self, "length", "Length in bytes"))
19 |         yield displayHandler(UInt32(self, "flags"), humanUnixAttributes)
20 |         yield CString(self, "name", "Filename (byte array)")
21 | 
22 |     def createDescription(self):
23 |         return 'File %s, Size %s, Mode %s'%(
24 |             self["name"].display, self["length"].display, self["flags"].display)
25 | 
26 | class MozillaArchive(HachoirParser, RootSeekableFieldSet):
27 |     MAGIC = "MAR1"
28 |     PARSER_TAGS = {
29 |         "id": "mozilla_ar",
30 |         "category": "archive",
31 |         "file_ext": ("mar",),
32 |         "min_size": (8+4+13)*8,  # Header, Index Header, 1 Index Entry
33 |         "magic": ((MAGIC, 0),),
34 |         "description": "Mozilla Archive",
35 |     }
36 |     endian = BIG_ENDIAN
37 |     
38 |     def __init__(self, stream, **args):
39 |         RootSeekableFieldSet.__init__(self, None, "root", stream, None, stream.askSize(self))
40 |         HachoirParser.__init__(self, stream, **args)
41 |         
42 |     def validate(self):
43 |         if self.stream.readBytes(0, 4) != self.MAGIC:
44 |             return "Invalid magic"
45 |         return True
46 | 
47 |     def createFields(self):
48 |         yield String(self, "magic", 4, "File signature (MAR1)", charset="ASCII")
49 |         yield UInt32(self, "index_offset", "Offset to index relative to file start")
50 |         self.seekByte(self["index_offset"].value, False)
51 |         yield UInt32(self, "index_size", "size of index in bytes")
52 |         current_index_size = 0 # bytes
53 |         while current_index_size < self["index_size"].value:
54 |             # plus 4 compensates for index_size
55 |             self.seekByte(self["index_offset"].value + current_index_size + 4, False)
56 |             entry = IndexEntry(self, "index_entry[]")
57 |             yield entry
58 |             current_index_size += entry.size // 8
59 |             self.seekByte(entry["offset"].value, False)
60 |             yield RawBytes(self, "file[]", entry["length"].value)
61 | 


--------------------------------------------------------------------------------
/pdfminer/ascii85.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2
 2 | 
 3 | """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
 4 | 
 5 | This code is in the public domain.
 6 | 
 7 | """
 8 | 
 9 | import re
10 | import struct
11 | 
12 | # ascii85decode(data)
13 | def ascii85decode(data):
14 |     """
15 |     In ASCII85 encoding, every four bytes are encoded with five ASCII
16 |     letters, using 85 different types of characters (as 256**4 < 85**5).
17 |     When the length of the original bytes is not a multiple of 4, a special
18 |     rule is used for round up.
19 |     
20 |     The Adobe's ASCII85 implementation is slightly different from
21 |     its original in handling the last characters.
22 |     
23 |     The sample string is taken from:
24 |       http://en.wikipedia.org/w/index.php?title=Ascii85
25 |     
26 |     >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
27 |     'Man is distinguished'
28 |     >>> ascii85decode('E,9)oF*2M7/c~>')
29 |     'pleasure.'
30 |     """
31 |     n = b = 0
32 |     out = ''
33 |     for c in data:
34 |         if '!' <= c and c <= 'u':
35 |             n += 1
36 |             b = b*85+(ord(c)-33)
37 |             if n == 5:
38 |                 out += struct.pack('>L',b)
39 |                 n = b = 0
40 |         elif c == 'z':
41 |             assert n == 0
42 |             out += '\0\0\0\0'
43 |         elif c == '~':
44 |             if n:
45 |                 for _ in range(5-n):
46 |                     b = b*85+84
47 |                 out += struct.pack('>L',b)[:n-1]
48 |             break
49 |     return out
50 | 
51 | # asciihexdecode(data)
52 | hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
53 | trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
54 | def asciihexdecode(data):
55 |     """
56 |     ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
57 |     For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
58 |     ASCIIHexDecode filter produces one byte of binary data. All white-space
59 |     characters are ignored. A right angle bracket character (>) indicates
60 |     EOD. Any other characters will cause an error. If the filter encounters
61 |     the EOD marker after reading an odd number of hexadecimal digits, it
62 |     will behave as if a 0 followed the last digit.
63 |     
64 |     >>> asciihexdecode('61 62 2e6364   65')
65 |     'ab.cde'
66 |     >>> asciihexdecode('61 62 2e6364   657>')
67 |     'ab.cdep'
68 |     >>> asciihexdecode('7>')
69 |     'p'
70 |     """
71 |     decode = (lambda hx: chr(int(hx, 16)))
72 |     out = map(decode, hex_re.findall(data))
73 |     m = trail_re.search(data)
74 |     if m:
75 |         out.append(decode("%c0" % m.group(1)))
76 |     return ''.join(out)
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     import doctest
81 |     doctest.testmod()
82 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/psd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Photoshop parser (.psd file).
 3 | 
 4 | Creation date: 8 january 2006
 5 | Author: Victor Stinner
 6 | """
 7 | 
 8 | from hachoir_parser import Parser
 9 | from hachoir_core.field import (FieldSet,
10 |     UInt16, UInt32, String, NullBytes, Enum, RawBytes)
11 | from hachoir_core.endian import BIG_ENDIAN
12 | from hachoir_parser.image.photoshop_metadata import Photoshop8BIM
13 | 
14 | class Config(FieldSet):
15 |     def __init__(self, *args):
16 |         FieldSet.__init__(self, *args)
17 |         self._size = (4 + self["size"].value) * 8
18 | 
19 |     def createFields(self):
20 |         yield UInt32(self, "size")
21 |         while not self.eof:
22 |             yield Photoshop8BIM(self, "item[]")
23 | 
24 | class PsdFile(Parser):
25 |     endian = BIG_ENDIAN
26 |     PARSER_TAGS = {
27 |         "id": "psd",
28 |         "category": "image",
29 |         "file_ext": ("psd",),
30 |         "mime": (u"image/psd", u"image/photoshop", u"image/x-photoshop"),
31 |         "min_size": 4*8,
32 |         "magic": (("8BPS\0\1",0),),
33 |         "description": "Photoshop (PSD) picture",
34 |     }
35 |     COLOR_MODE = {
36 |         0: u"Bitmap",
37 |         1: u"Grayscale",
38 |         2: u"Indexed",
39 |         3: u"RGB color",
40 |         4: u"CMYK color",
41 |         7: u"Multichannel",
42 |         8: u"Duotone",
43 |         9: u"Lab Color",
44 |     }
45 |     COMPRESSION_NAME = {
46 |         0: "Raw data",
47 |         1: "RLE",
48 |     }
49 | 
50 |     def validate(self):
51 |         if self.stream.readBytes(0, 4) != "8BPS":
52 |             return "Invalid signature"
53 |         return True
54 | 
55 |     def createFields(self):
56 |         yield String(self, "signature", 4, "PSD signature (8BPS)", charset="ASCII")
57 |         yield UInt16(self, "version")
58 |         yield NullBytes(self, "reserved[]", 6)
59 |         yield UInt16(self, "nb_channels")
60 |         yield UInt32(self, "width")
61 |         yield UInt32(self, "height")
62 |         yield UInt16(self, "depth")
63 |         yield Enum(UInt16(self, "color_mode"), self.COLOR_MODE)
64 | 
65 |         # Mode data
66 |         yield UInt32(self, "mode_data_size")
67 |         size = self["mode_data_size"].value
68 |         if size:
69 |             yield RawBytes(self, "mode_data", size)
70 | 
71 |         # Resources
72 |         yield Config(self, "config")
73 | 
74 |         # Reserved
75 |         yield UInt32(self, "reserved_data_size")
76 |         size = self["reserved_data_size"].value
77 |         if size:
78 |             yield RawBytes(self, "reserved_data", size)
79 | 
80 |         yield Enum(UInt16(self, "compression"), self.COMPRESSION_NAME)
81 | 
82 |         size = (self.size - self.current_size) // 8
83 |         if size:
84 |             yield RawBytes(self, "end", size)
85 | 
86 | 


--------------------------------------------------------------------------------
/hachoir_core/memory.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | 
  3 | #---- Default implementation when resource is missing ----------------------
  4 | PAGE_SIZE = 4096
  5 | 
  6 | def getMemoryLimit():
  7 |     """
  8 |     Get current memory limit in bytes.
  9 | 
 10 |     Return None on error.
 11 |     """
 12 |     return None
 13 | 
 14 | def setMemoryLimit(max_mem):
 15 |     """
 16 |     Set memory limit in bytes.
 17 |     Use value 'None' to disable memory limit.
 18 | 
 19 |     Return True if limit is set, False on error.
 20 |     """
 21 |     return False
 22 | 
 23 | def getMemorySize():
 24 |     """
 25 |     Read currenet process memory size: size of available virtual memory.
 26 |     This value is NOT the real memory usage.
 27 | 
 28 |     This function only works on Linux (use /proc/self/statm file).
 29 |     """
 30 |     try:
 31 |         statm = open('/proc/self/statm').readline().split()
 32 |     except IOError:
 33 |         return None
 34 |     return int(statm[0]) * PAGE_SIZE
 35 | 
 36 | def clearCaches():
 37 |     """
 38 |     Try to clear all caches: call gc.collect() (Python garbage collector).
 39 |     """
 40 |     gc.collect()
 41 |     #import re; re.purge()
 42 | 
 43 | try:
 44 | #---- 'resource' implementation ---------------------------------------------
 45 |     from resource import getpagesize, getrlimit, setrlimit, RLIMIT_AS
 46 | 
 47 |     PAGE_SIZE = getpagesize()
 48 | 
 49 |     def getMemoryLimit():
 50 |         try:
 51 |             limit = getrlimit(RLIMIT_AS)[0]
 52 |             if 0 < limit:
 53 |                 limit *= PAGE_SIZE
 54 |             return limit
 55 |         except ValueError:
 56 |             return None
 57 | 
 58 |     def setMemoryLimit(max_mem):
 59 |         if max_mem is None:
 60 |             max_mem = -1
 61 |         try:
 62 |             setrlimit(RLIMIT_AS, (max_mem, -1))
 63 |             return True
 64 |         except ValueError:
 65 |             return False
 66 | except ImportError:
 67 |     pass
 68 | 
 69 | def limitedMemory(limit, func, *args, **kw):
 70 |     """
 71 |     Limit memory grow when calling func(*args, **kw):
 72 |     restrict memory grow to 'limit' bytes.
 73 | 
 74 |     Use try/except MemoryError to catch the error.
 75 |     """
 76 |     # First step: clear cache to gain memory
 77 |     clearCaches()
 78 | 
 79 |     # Get total program size
 80 |     max_rss = getMemorySize()
 81 |     if max_rss is not None:
 82 |         # Get old limit and then set our new memory limit
 83 |         old_limit = getMemoryLimit()
 84 |         limit = max_rss + limit
 85 |         limited = setMemoryLimit(limit)
 86 |     else:
 87 |         limited = False
 88 | 
 89 |     try:
 90 |         # Call function
 91 |         return func(*args, **kw)
 92 |     finally:
 93 |         # and unset our memory limit
 94 |         if limited:
 95 |             setMemoryLimit(old_limit)
 96 | 
 97 |         # After calling the function: clear all caches
 98 |         clearCaches()
 99 | 
100 | 


--------------------------------------------------------------------------------
/hachoir_core/field/sub_file.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import Bytes
 2 | from hachoir_core.tools import makePrintable, humanFilesize
 3 | from hachoir_core.stream import InputIOStream
 4 | 
 5 | class SubFile(Bytes):
 6 |     """
 7 |     File stored in another file
 8 |     """
 9 |     def __init__(self, parent, name, length, description=None,
10 |     parser=None, filename=None, mime_type=None, parser_class=None):
11 |         if filename:
12 |             if not isinstance(filename, unicode):
13 |                 filename = makePrintable(filename, "ISO-8859-1")
14 |             if not description:
15 |                 description = 'File "%s" (%s)' % (filename, humanFilesize(length))
16 |         Bytes.__init__(self, parent, name, length, description)
17 |         def createInputStream(cis, **args):
18 |             tags = args.setdefault("tags",[])
19 |             if parser_class:
20 |                 tags.append(( "class", parser_class ))
21 |             if parser is not None:
22 |                 tags.append(( "id", parser.PARSER_TAGS["id"] ))
23 |             if mime_type:
24 |                 tags.append(( "mime", mime_type ))
25 |             if filename:
26 |                 tags.append(( "filename", filename ))
27 |             return cis(**args)
28 |         self.setSubIStream(createInputStream)
29 | 
30 | class CompressedStream:
31 |     offset = 0
32 | 
33 |     def __init__(self, stream, decompressor):
34 |         self.stream = stream
35 |         self.decompressor = decompressor(stream)
36 |         self._buffer = ''
37 | 
38 |     def read(self, size):
39 |         d = self._buffer
40 |         data = [ d[:size] ]
41 |         size -= len(d)
42 |         if size > 0:
43 |             d = self.decompressor(size)
44 |             data.append(d[:size])
45 |             size -= len(d)
46 |             while size > 0:
47 |                 n = 4096
48 |                 if self.stream.size:
49 |                     n = min(self.stream.size - self.offset, n)
50 |                     if not n:
51 |                         break
52 |                 d = self.stream.read(self.offset, n)[1]
53 |                 self.offset += 8 * len(d)
54 |                 d = self.decompressor(size, d)
55 |                 data.append(d[:size])
56 |                 size -= len(d)
57 |         self._buffer = d[size+len(d):]
58 |         return ''.join(data)
59 | 
60 | def CompressedField(field, decompressor):
61 |     def createInputStream(cis, source=None, **args):
62 |         if field._parent:
63 |             stream = cis(source=source)
64 |             args.setdefault("tags", []).extend(stream.tags)
65 |         else:
66 |             stream = field.stream
67 |         input = CompressedStream(stream, decompressor)
68 |         if source is None:
69 |             source = "Compressed source: '%s' (offset=%s)" % (stream.source, field.absolute_address)
70 |         return InputIOStream(input, source=source, **args)
71 |     field.setSubIStream(createInputStream)
72 |     return field
73 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/pcx.py:
--------------------------------------------------------------------------------
 1 | """
 2 | PCX picture filter.
 3 | """
 4 | 
 5 | from hachoir_parser import Parser
 6 | from hachoir_core.field import (
 7 |     UInt8, UInt16,
 8 |     PaddingBytes, RawBytes,
 9 |     Enum)
10 | from hachoir_parser.image.common import PaletteRGB
11 | from hachoir_core.endian import LITTLE_ENDIAN
12 | 
13 | class PcxFile(Parser):
14 |     endian = LITTLE_ENDIAN
15 |     PARSER_TAGS = {
16 |         "id": "pcx",
17 |         "category": "image",
18 |         "file_ext": ("pcx",),
19 |         "mime": (u"image/x-pcx",),
20 |         "min_size": 128*8,
21 |         "description": "PC Paintbrush (PCX) picture"
22 |     }
23 |     compression_name = { 1: "Run-length encoding (RLE)" }
24 |     version_name = {
25 |         0: u"Version 2.5 of PC Paintbrush",
26 |         2: u"Version 2.8 with palette information",
27 |         3: u"Version 2.8 without palette information",
28 |         4: u"PC Paintbrush for Windows",
29 |         5: u"Version 3.0 (or greater) of PC Paintbrush"
30 |     }
31 | 
32 |     def validate(self):
33 |         if self["id"].value != 10:
34 |             return "Wrong signature"
35 |         if self["version"].value not in self.version_name:
36 |             return "Unknown format version"
37 |         if self["bpp"].value not in (1, 2, 4, 8, 24, 32):
38 |             return "Unknown bits/pixel"
39 |         if self["reserved[0]"].value != "\0":
40 |             return "Invalid reserved value"
41 |         return True
42 | 
43 |     def createFields(self):
44 |         yield UInt8(self, "id", "PCX identifier (10)")
45 |         yield Enum(UInt8(self, "version", "PCX version"), self.version_name)
46 |         yield Enum(UInt8(self, "compression", "Compression method"), self.compression_name)
47 |         yield UInt8(self, "bpp", "Bits / pixel")
48 |         yield UInt16(self, "xmin", "Minimum X")
49 |         yield UInt16(self, "ymin", "Minimum Y")
50 |         yield UInt16(self, "xmax", "Width minus one") # value + 1
51 |         yield UInt16(self, "ymax", "Height minus one") # value + 1
52 |         yield UInt16(self, "horiz_dpi", "Horizontal DPI")
53 |         yield UInt16(self, "vert_dpi", "Vertical DPI")
54 |         yield PaletteRGB(self, "palette_4bits", 16, "Palette (4 bits)")
55 |         yield PaddingBytes(self, "reserved[]", 1)
56 |         yield UInt8(self, "nb_color_plan", "Number of color plans")
57 |         yield UInt16(self, "bytes_per_line", "Bytes per line")
58 |         yield UInt16(self, "color_mode", "Color mode")
59 |         yield PaddingBytes(self, "reserved[]", 58)
60 | 
61 |         if self._size is None: # TODO: is it possible to handle piped input?
62 |             raise NotImplementedError
63 | 
64 |         nb_colors = 256
65 |         size = (self._size - self.current_size)/8
66 |         has_palette = self["bpp"].value == 8
67 |         if has_palette:
68 |             size -= nb_colors*3
69 |         yield RawBytes(self, "image_data", size, "Image data")
70 | 
71 |         if has_palette:
72 |             yield PaletteRGB(self, "palette_8bits", nb_colors, "Palette (8 bit)")
73 | 
74 | 


--------------------------------------------------------------------------------
/hachoir_parser/program/prc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | PRC (Palm resource) parser.
 3 | 
 4 | Author: Sebastien Ponce
 5 | Creation date: 29 october 2008
 6 | """
 7 | 
 8 | from hachoir_parser import Parser
 9 | from hachoir_core.field import (FieldSet,
10 |     UInt16, UInt32, TimestampMac32,
11 |     String, RawBytes)
12 | from hachoir_core.endian import BIG_ENDIAN
13 | 
14 | class PRCHeader(FieldSet):
15 |     static_size = 78*8
16 | 
17 |     def createFields(self):
18 |         yield String(self, "name", 32, "Name")
19 |         yield UInt16(self, "flags", "Flags")
20 |         yield UInt16(self, "version", "Version")
21 |         yield TimestampMac32(self, "create_time", "Creation time")
22 |         yield TimestampMac32(self, "mod_time", "Modification time")
23 |         yield TimestampMac32(self, "backup_time", "Backup time")
24 |         yield UInt32(self, "mod_num", "mod num")
25 |         yield UInt32(self, "app_info", "app info")
26 |         yield UInt32(self, "sort_info", "sort info")
27 |         yield UInt32(self, "type", "type")
28 |         yield UInt32(self, "id", "id")
29 |         yield UInt32(self, "unique_id_seed", "unique_id_seed")
30 |         yield UInt32(self, "next_record_list", "next_record_list")
31 |         yield UInt16(self, "num_records", "num_records")
32 | 
33 | class ResourceHeader(FieldSet):
34 |     static_size = 10*8
35 | 
36 |     def createFields(self):
37 |         yield String(self, "name", 4, "Name of the resource")
38 |         yield UInt16(self, "flags", "ID number of the resource")
39 |         yield UInt32(self, "offset", "Pointer to the resource data")
40 | 
41 |     def createDescription(self):
42 |         return "Resource Header (%s)" % self["name"]
43 | 
44 | class PRCFile(Parser):
45 |     PARSER_TAGS = {
46 |         "id": "prc",
47 |         "category": "program",
48 |         "file_ext": ("prc", ""),
49 |         "min_size": ResourceHeader.static_size,  # At least one program header
50 |         "mime": (
51 |             u"application/x-pilot-prc",
52 |             u"application/x-palmpilot"),
53 |         "description": "Palm Resource File"
54 |     }
55 |     endian = BIG_ENDIAN
56 | 
57 |     def validate(self):
58 |         # FIXME: Implement the validation function!
59 |         return False
60 | 
61 |     def createFields(self):
62 |         # Parse header and program headers
63 |         yield PRCHeader(self, "header", "Header")
64 |         lens = []
65 |         firstOne = True
66 |         poff = 0
67 |         for index in xrange(self["header/num_records"].value):
68 |             r = ResourceHeader(self, "res_header[]")
69 |             if firstOne:
70 |                 firstOne = False
71 |             else:
72 |                 lens.append(r["offset"].value - poff)
73 |             poff = r["offset"].value
74 |             yield r
75 |         lens.append(self.size/8 - poff)
76 |         yield UInt16(self, "placeholder", "Place holder bytes")
77 |         for i in range(len(lens)):
78 |             yield RawBytes(self, "res[]", lens[i], '"'+self["res_header["+str(i)+"]/name"].value+"\" Resource")
79 | 
80 |     def createDescription(self):
81 |         return "Palm Resource file"
82 | 
83 | 


--------------------------------------------------------------------------------
/hachoir_parser/misc/hlp.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Microsoft Windows Help (HLP) parser for Hachoir project.
 3 | 
 4 | Documents:
 5 | - Windows Help File Format / Annotation File Format / SHG and MRB File Format
 6 |   written by M. Winterhoff (100326.2776@compuserve.com)
 7 |   found on http://www.wotsit.org/
 8 | 
 9 | Author: Victor Stinner
10 | Creation date: 2007-09-03
11 | """
12 | 
13 | from hachoir_parser import Parser
14 | from hachoir_core.field import (FieldSet,
15 |     Bits, Int32, UInt16, UInt32,
16 |     NullBytes, RawBytes, PaddingBytes, String)
17 | from hachoir_core.endian import LITTLE_ENDIAN
18 | from hachoir_core.text_handler import (textHandler, hexadecimal,
19 |     displayHandler, humanFilesize)
20 | 
21 | class FileEntry(FieldSet):
22 |     def __init__(self, *args, **kw):
23 |         FieldSet.__init__(self, *args, **kw)
24 |         self._size = self["res_space"].value * 8
25 | 
26 |     def createFields(self):
27 |         yield displayHandler(UInt32(self, "res_space", "Reserved space"), humanFilesize)
28 |         yield displayHandler(UInt32(self, "used_space", "Used space"), humanFilesize)
29 |         yield Bits(self, "file_flags", 8, "(=4)")
30 | 
31 |         yield textHandler(UInt16(self, "magic"), hexadecimal)
32 |         yield Bits(self, "flags", 16)
33 |         yield displayHandler(UInt16(self, "page_size", "Page size in bytes"), humanFilesize)
34 |         yield String(self, "structure", 16, strip="\0", charset="ASCII")
35 |         yield NullBytes(self, "zero", 2)
36 |         yield UInt16(self, "nb_page_splits", "Number of page splits B+ tree has suffered")
37 |         yield UInt16(self, "root_page", "Page number of B+ tree root page")
38 |         yield PaddingBytes(self, "one", 2, pattern="\xFF")
39 |         yield UInt16(self, "nb_page", "Number of B+ tree pages")
40 |         yield UInt16(self, "nb_level", "Number of levels of B+ tree")
41 |         yield UInt16(self, "nb_entry", "Number of entries in B+ tree")
42 | 
43 |         size = (self.size - self.current_size)//8
44 |         if size:
45 |             yield PaddingBytes(self, "reserved_space", size)
46 | 
47 | class HlpFile(Parser):
48 |     PARSER_TAGS = {
49 |         "id": "hlp",
50 |         "category": "misc",
51 |         "file_ext": ("hlp",),
52 |         "min_size": 32,
53 |         "description": "Microsoft Windows Help (HLP)",
54 |     }
55 |     endian = LITTLE_ENDIAN
56 | 
57 |     def validate(self):
58 |         if self["magic"].value != 0x00035F3F:
59 |             return "Invalid magic"
60 |         if self["filesize"].value != self.stream.size//8:
61 |             return "Invalid magic"
62 |         return True
63 | 
64 |     def createFields(self):
65 |         yield textHandler(UInt32(self, "magic"), hexadecimal)
66 |         yield UInt32(self, "dir_start", "Directory start")
67 |         yield Int32(self, "first_free_block", "First free block")
68 |         yield UInt32(self, "filesize", "File size in bytes")
69 | 
70 |         yield self.seekByte(self["dir_start"].value)
71 |         yield FileEntry(self, "file[]")
72 | 
73 |         size = (self.size - self.current_size)//8
74 |         if size:
75 |             yield RawBytes(self, "end", size)
76 | 
77 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/tga.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Truevision Targa Graphic (TGA) picture parser.
 3 | 
 4 | Author: Victor Stinner
 5 | Creation: 18 december 2006
 6 | """
 7 | 
 8 | from hachoir_parser import Parser
 9 | from hachoir_core.field import FieldSet, UInt8, UInt16, Enum, RawBytes
10 | from hachoir_core.endian import LITTLE_ENDIAN
11 | from hachoir_parser.image.common import PaletteRGB
12 | 
13 | class Line(FieldSet):
14 |     def __init__(self, *args):
15 |         FieldSet.__init__(self, *args)
16 |         self._size = self["/width"].value * self["/bpp"].value
17 | 
18 |     def createFields(self):
19 |         for x in xrange(self["/width"].value):
20 |             yield UInt8(self, "pixel[]")
21 | 
22 | class Pixels(FieldSet):
23 |     def __init__(self, *args):
24 |         FieldSet.__init__(self, *args)
25 |         self._size = self["/width"].value * self["/height"].value * self["/bpp"].value
26 | 
27 |     def createFields(self):
28 |         if self["/options"].value == 0:
29 |             RANGE = xrange(self["/height"].value-1,-1,-1)
30 |         else:
31 |             RANGE = xrange(self["/height"].value)
32 |         for y in RANGE:
33 |             yield Line(self, "line[%u]" % y)
34 | 
35 | class TargaFile(Parser):
36 |     PARSER_TAGS = {
37 |         "id": "targa",
38 |         "category": "image",
39 |         "file_ext": ("tga",),
40 |         "mime": (u"image/targa", u"image/tga", u"image/x-tga"),
41 |         "min_size": 18*8,
42 |         "description": u"Truevision Targa Graphic (TGA)"
43 |     }
44 |     CODEC_NAME = {
45 |          1: u"8-bit uncompressed",
46 |          2: u"24-bit uncompressed",
47 |          9: u"8-bit RLE",
48 |         10: u"24-bit RLE",
49 |     }
50 |     endian = LITTLE_ENDIAN
51 | 
52 |     def validate(self):
53 |         if self["version"].value != 1:
54 |             return "Unknown version"
55 |         if self["codec"].value not in self.CODEC_NAME:
56 |             return "Unknown codec"
57 |         if self["x_min"].value != 0 or self["y_min"].value != 0:
58 |             return "(x_min, y_min) is not (0,0)"
59 |         if self["bpp"].value not in (8, 24):
60 |             return "Unknown bits/pixel value"
61 |         return True
62 | 
63 |     def createFields(self):
64 |         yield UInt8(self, "hdr_size", "Header size in bytes")
65 |         yield UInt8(self, "version", "Targa version (always one)")
66 |         yield Enum(UInt8(self, "codec", "Pixels encoding"), self.CODEC_NAME)
67 |         yield UInt16(self, "palette_ofs", "Palette absolute file offset")
68 |         yield UInt16(self, "nb_color", "Number of color")
69 |         yield UInt8(self, "color_map_size", "Color map entry size")
70 |         yield UInt16(self, "x_min")
71 |         yield UInt16(self, "y_min")
72 |         yield UInt16(self, "width")
73 |         yield UInt16(self, "height")
74 |         yield UInt8(self, "bpp", "Bits per pixel")
75 |         yield UInt8(self, "options", "Options (0: vertical mirror)")
76 |         if self["bpp"].value == 8:
77 |             yield PaletteRGB(self, "palette", 256)
78 |         if self["codec"].value == 1:
79 |             yield Pixels(self, "pixels")
80 |         else:
81 |             size = (self.size - self.current_size) // 8
82 |             if size:
83 |                 yield RawBytes(self, "raw_pixels", size)
84 | 
85 | 
86 | 


--------------------------------------------------------------------------------
/hachoir_parser/game/laf.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 | LucasArts Font parser.
 5 | 
 6 | Author: Cyril Zorin
 7 | Creation date: 1 January 2007
 8 | """
 9 | 
10 | from hachoir_parser import Parser
11 | from hachoir_core.field import (FieldSet,
12 |         UInt8, UInt16, UInt32, GenericVector)
13 | from hachoir_core.endian import LITTLE_ENDIAN
14 | 
15 | class CharData(FieldSet):
16 |   def __init__(self, chars, *args):
17 |     FieldSet.__init__(self, *args)
18 |     self.chars = chars
19 | 
20 |   def createFields(self):
21 |     for char in self.chars:
22 |       yield CharBitmap(char, self, "char_bitmap[]")
23 | 
24 | class CharBitmap(FieldSet):
25 |   def __init__(self, char, *args):
26 |     FieldSet.__init__(self, *args)
27 |     self.char = char
28 | 
29 |   def createFields(self):
30 |     width = self.char["width_pixels"].value
31 |     for line in xrange(self.char["height_pixels"].value):
32 |       yield GenericVector(self, "line[]", width,
33 |                           UInt8, "pixel")
34 | 
35 | class CharInfo(FieldSet):
36 |   static_size = 16 * 8
37 | 
38 |   def createFields(self):
39 |     yield UInt32(self, "data_offset")
40 |     yield UInt8(self, "logical_width")
41 |     yield UInt8(self, "unknown[]")
42 |     yield UInt8(self, "unknown[]")
43 |     yield UInt8(self, "unknown[]")
44 |     yield UInt32(self, "width_pixels")
45 |     yield UInt32(self, "height_pixels")
46 | 
47 | class LafFile(Parser):
48 |   PARSER_TAGS = {
49 |     "id": "lucasarts_font",
50 |     "category": "game",
51 |     "file_ext" : ("laf",),
52 |     "min_size" : 32*8,
53 |     "description" : "LucasArts Font"
54 |     }
55 | 
56 |   endian = LITTLE_ENDIAN
57 | 
58 |   def validate(self):
59 |     if self["num_chars"].value != 256:
60 |         return "Invalid number of characters (%u)" % self["num_chars"].value
61 |     if self["first_char_code"].value != 0:
62 |         return "Invalid of code of first character code (%u)" % self["first_char_code"].value
63 |     if self["last_char_code"].value != 255:
64 |         return "Invalid of code of last character code (%u)" % self["last_char_code"].value
65 |     if self["char_codes/char[0]"].value != 0:
66 |         return "Invalid character code #0 (%u)" % self["char_codes/char[0]"].value
67 |     if self["chars/char[0]/data_offset"].value != 0:
68 |         return "Invalid character #0 offset"
69 |     return True
70 | 
71 |   def createFields(self):
72 |     yield UInt32(self, "num_chars")
73 |     yield UInt32(self, "raw_font_data_size")
74 |     yield UInt32(self, "max_char_width")
75 |     yield UInt32(self, "min_char_width")
76 |     yield UInt32(self, "unknown[]", 4)
77 |     yield UInt32(self, "unknown[]", 4)
78 |     yield UInt32(self, "first_char_code")
79 |     yield UInt32(self, "last_char_code")
80 | 
81 |     yield GenericVector(self, "char_codes", self["num_chars"].value,
82 |             UInt16, "char")
83 | 
84 |     yield GenericVector(self, "chars", self["num_chars"].value,
85 |             CharInfo, "char")
86 | 
87 |     # character data. we make an effort to provide
88 |     # something more meaningful than "RawBytes:
89 |     # character bitmap data"
90 |     yield CharData(self["chars"], self, "char_data")
91 | 
92 |     # read to the end
93 |     if self.current_size < self._size:
94 |       yield self.seekBit(self._size, "unknown[]")
95 | 


--------------------------------------------------------------------------------
/hachoir_core/field/timestamp.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.tools import (humanDatetime, humanDuration,
 2 |     timestampUNIX, timestampMac32, timestampUUID60,
 3 |     timestampWin64, durationWin64)
 4 | from hachoir_core.field import Bits, FieldSet
 5 | from datetime import datetime
 6 | 
 7 | class GenericTimestamp(Bits):
 8 |     def __init__(self, parent, name, size, description=None):
 9 |         Bits.__init__(self, parent, name, size, description)
10 | 
11 |     def createDisplay(self):
12 |         return humanDatetime(self.value)
13 | 
14 |     def createRawDisplay(self):
15 |         value = Bits.createValue(self)
16 |         return unicode(value)
17 | 
18 |     def __nonzero__(self):
19 |         return Bits.createValue(self) != 0
20 | 
21 | def timestampFactory(cls_name, handler, size):
22 |     class Timestamp(GenericTimestamp):
23 |         def __init__(self, parent, name, description=None):
24 |             GenericTimestamp.__init__(self, parent, name, size, description)
25 | 
26 |         def createValue(self):
27 |             value = Bits.createValue(self)
28 |             return handler(value)
29 |     cls = Timestamp
30 |     cls.__name__ = cls_name
31 |     return cls
32 | 
33 | TimestampUnix32 = timestampFactory("TimestampUnix32", timestampUNIX, 32)
34 | TimestampUnix64 = timestampFactory("TimestampUnix64", timestampUNIX, 64)
35 | TimestampMac32 = timestampFactory("TimestampUnix32", timestampMac32, 32)
36 | TimestampUUID60 = timestampFactory("TimestampUUID60", timestampUUID60, 60)
37 | TimestampWin64 = timestampFactory("TimestampWin64", timestampWin64, 64)
38 | 
39 | class TimeDateMSDOS32(FieldSet):
40 |     """
41 |     32-bit MS-DOS timestamp (16-bit time, 16-bit date)
42 |     """
43 |     static_size = 32
44 | 
45 |     def createFields(self):
46 |         # TODO: Create type "MSDOS_Second" : value*2
47 |         yield Bits(self, "second", 5, "Second/2")
48 |         yield Bits(self, "minute", 6)
49 |         yield Bits(self, "hour", 5)
50 | 
51 |         yield Bits(self, "day", 5)
52 |         yield Bits(self, "month", 4)
53 |         # TODO: Create type "MSDOS_Year" : value+1980
54 |         yield Bits(self, "year", 7, "Number of year after 1980")
55 | 
56 |     def createValue(self):
57 |         return datetime(
58 |             1980+self["year"].value, self["month"].value, self["day"].value,
59 |             self["hour"].value, self["minute"].value, 2*self["second"].value)
60 | 
61 |     def createDisplay(self):
62 |         return humanDatetime(self.value)
63 | 
64 | class DateTimeMSDOS32(TimeDateMSDOS32):
65 |     """
66 |     32-bit MS-DOS timestamp (16-bit date, 16-bit time)
67 |     """
68 |     def createFields(self):
69 |         yield Bits(self, "day", 5)
70 |         yield Bits(self, "month", 4)
71 |         yield Bits(self, "year", 7, "Number of year after 1980")
72 |         yield Bits(self, "second", 5, "Second/2")
73 |         yield Bits(self, "minute", 6)
74 |         yield Bits(self, "hour", 5)
75 | 
76 | class TimedeltaWin64(GenericTimestamp):
77 |     def __init__(self, parent, name, description=None):
78 |         GenericTimestamp.__init__(self, parent, name, 64, description)
79 | 
80 |     def createDisplay(self):
81 |         return humanDuration(self.value)
82 | 
83 |     def createValue(self):
84 |         value = Bits.createValue(self)
85 |         return durationWin64(value)
86 | 
87 | 


--------------------------------------------------------------------------------
/pdfminer/lzw.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2
  2 | import sys
  3 | try:
  4 |     from cStringIO import StringIO
  5 | except ImportError:
  6 |     from StringIO import StringIO
  7 | 
  8 | 
  9 | ##  LZWDecoder
 10 | ##
 11 | class LZWDecoder(object):
 12 | 
 13 |     debug = 0
 14 | 
 15 |     def __init__(self, fp):
 16 |         self.fp = fp
 17 |         self.buff = 0
 18 |         self.bpos = 8
 19 |         self.nbits = 9
 20 |         self.table = None
 21 |         self.prevbuf = None
 22 |         return
 23 | 
 24 |     def readbits(self, bits):
 25 |         v = 0
 26 |         while 1:
 27 |             # the number of remaining bits we can get from the current buffer.
 28 |             r = 8-self.bpos
 29 |             if bits <= r:
 30 |                 # |-----8-bits-----|
 31 |                 # |-bpos-|-bits-|  |
 32 |                 # |      |----r----|
 33 |                 v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
 34 |                 self.bpos += bits
 35 |                 break
 36 |             else:
 37 |                 # |-----8-bits-----|
 38 |                 # |-bpos-|---bits----...
 39 |                 # |      |----r----|
 40 |                 v = (v<<r) | (self.buff & ((1<<r)-1))
 41 |                 bits -= r
 42 |                 x = self.fp.read(1)
 43 |                 if not x: raise EOFError
 44 |                 self.buff = ord(x)
 45 |                 self.bpos = 0
 46 |         return v
 47 | 
 48 |     def feed(self, code):
 49 |         x = ''
 50 |         if code == 256:
 51 |             self.table = [ chr(c) for c in xrange(256) ] # 0-255
 52 |             self.table.append(None) # 256
 53 |             self.table.append(None) # 257
 54 |             self.prevbuf = ''
 55 |             self.nbits = 9
 56 |         elif code == 257:
 57 |             pass
 58 |         elif not self.prevbuf:
 59 |             x = self.prevbuf = self.table[code]
 60 |         else:
 61 |             if code < len(self.table):
 62 |                 x = self.table[code]
 63 |                 self.table.append(self.prevbuf+x[0])
 64 |             else:
 65 |                 self.table.append(self.prevbuf+self.prevbuf[0])
 66 |                 x = self.table[code]
 67 |             l = len(self.table)
 68 |             if l == 511:
 69 |                 self.nbits = 10
 70 |             elif l == 1023:
 71 |                 self.nbits = 11
 72 |             elif l == 2047:
 73 |                 self.nbits = 12
 74 |             self.prevbuf = x
 75 |         return x
 76 | 
 77 |     def run(self):
 78 |         while 1:
 79 |             try:
 80 |                 code = self.readbits(self.nbits)
 81 |             except EOFError:
 82 |                 break
 83 |             x = self.feed(code)
 84 |             yield x
 85 |             if self.debug:
 86 |                 print >>sys.stderr, ('nbits=%d, code=%d, output=%r, table=%r' %
 87 |                                      (self.nbits, code, x, self.table[258:]))
 88 |         return
 89 | 
 90 | # lzwdecode
 91 | def lzwdecode(data):
 92 |     """
 93 |     >>> lzwdecode('\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01')
 94 |     '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
 95 |     """
 96 |     fp = StringIO(data)
 97 |     return ''.join(LZWDecoder(fp).run())
 98 | 
 99 | if __name__ == '__main__':
100 |     import doctest
101 |     doctest.testmod()
102 | 


--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
  1 | import string
  2 | import re
  3 | 
  4 | class parser:
  5 | 	def __init__(self,results,word,file):
  6 | 		self.results=results
  7 | 		self.word=word
  8 | 		self.temp=[]
  9 | 		self.file=file
 10 | 		
 11 | 	def genericClean(self):
 12 | 		self.results = re.sub('<em>', '', self.results)
 13 | 		self.results = re.sub('<b>', '', self.results)
 14 | 		self.results = re.sub('</b>', '', self.results)
 15 | 		self.results = re.sub('</em>', '', self.results)
 16 | 		self.results = re.sub('%2f', ' ', self.results)
 17 | 		self.results = re.sub('%3a', ' ', self.results)
 18 | 		self.results = re.sub('<strong>', '', self.results)
 19 | 		self.results = re.sub('</strong>', '', self.results)
 20 | 
 21 | 
 22 | 		for e in ('>',':','=', '<', '/', '\\',';','&','%3A','%3D','%3C'):
 23 | 			self.results = string.replace(self.results, e, ' ')
 24 | 			
 25 | 	def urlClean(self):
 26 | 		self.results = re.sub('<em>', '', self.results)
 27 | 		self.results = re.sub('</em>', '', self.results)
 28 | 		self.results = re.sub('%2f', ' ', self.results)
 29 | 		self.results = re.sub('%3a', ' ', self.results)
 30 | 		for e in ('<','>',':','=',';','&','%3A','%3D','%3C'):
 31 | 			self.results = string.replace(self.results, e, ' ')
 32 | 		
 33 | 	def emails(self):
 34 | 		self.genericClean()
 35 | 		reg_emails = re.compile('[a-zA-Z0-9.-_]*' + '@' + '[a-zA-Z0-9.-]*' + self.word)
 36 | 		self.temp = reg_emails.findall(self.results)
 37 | 		emails=self.unique()
 38 | 		return emails
 39 | 	
 40 | 	def fileurls(self):
 41 | 		urls=[]
 42 | 		reg_urls = re.compile('<a href="(.*?)"')
 43 | 		self.temp = reg_urls.findall(self.results)
 44 | 		allurls=self.unique()
 45 | 		for x in allurls:
 46 | 			if x.count('webcache') or x.count('google.com') or x.count('search?'):
 47 | 				pass
 48 | 			else:
 49 | 				urls.append(x)
 50 | 		return urls
 51 | 	
 52 | 	def people_linkedin(self):
 53 | 		reg_people = re.compile('">[a-zA-Z0-9._ -]* profiles | LinkedIn')
 54 | 		
 55 | 		self.temp = reg_people.findall(self.results)
 56 | 		resul = []
 57 | 		for x in self.temp:
 58 | 				y = string.replace(x, '  LinkedIn', '')
 59 | 				y = string.replace(y, ' profiles ', '')
 60 | 				y = string.replace(y, 'LinkedIn', '')
 61 | 				y = string.replace(y, '"', '')
 62 | 				y = string.replace(y, '>', '')
 63 | 				if y !=" ":
 64 | 					resul.append(y)
 65 | 		return resul
 66 | 
 67 | 	def profiles(self):
 68 | 		reg_people = re.compile('">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
 69 | 		self.temp = reg_people.findall(self.results)
 70 | 		resul = []
 71 | 		for x in self.temp:
 72 | 				y = string.replace(x, ' <em>Google Profile</em>', '')
 73 | 				y = string.replace(y, '-', '')
 74 | 				y = string.replace(y, '">', '')
 75 | 				if y !=" ":
 76 | 					resul.append(y)
 77 | 		return resul
 78 | 	
 79 | 	
 80 | 	def hostnames(self):
 81 | 		self.genericClean()
 82 | 		reg_hosts = re.compile('[a-zA-Z0-9.-]*\.'+ self.word)
 83 | 		self.temp = reg_hosts.findall(self.results)
 84 | 		hostnames=self.unique()
 85 | 		return hostnames
 86 | 
 87 | 	def hostnames_all(self):
 88 | 		reg_hosts = re.compile('<cite>(.*?)</cite>')
 89 | 		temp = reg_hosts.findall(self.results)
 90 | 		for x in temp:
 91 | 			if x.count(':'):
 92 | 				res=x.split(':')[1].split('/')[2]
 93 | 			else:
 94 | 				res=x.split("/")[0]
 95 | 			self.temp.append(res)
 96 | 		hostnames=self.unique()
 97 | 		return hostnames
 98 | 		
 99 | 	def unique(self):
100 | 		self.new=[]
101 | 		for x in self.temp:
102 | 			if x not in self.new:
103 | 				self.new.append(x)
104 | 		return self.new
105 | 


--------------------------------------------------------------------------------
/hachoir_core/field/seekable_field_set.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import BasicFieldSet, GenericFieldSet, ParserError, createRawField
 2 | from hachoir_core.error import HACHOIR_ERRORS
 3 | 
 4 | # getgaps(int, int, [listof (int, int)]) -> generator of (int, int)
 5 | # Gets all the gaps not covered by a block in `blocks` from `start` for `length` units.
 6 | def getgaps(start, length, blocks):
 7 |     '''
 8 |     Example:
 9 |     >>> list(getgaps(0, 20, [(15,3), (6,2), (6,2), (1,2), (2,3), (11,2), (9,5)]))
10 |     [(0, 1), (5, 1), (8, 1), (14, 1), (18, 2)]
11 |     '''
12 |     # done this way to avoid mutating the original
13 |     blocks = sorted(blocks, key=lambda b: b[0])
14 |     end = start+length
15 |     for s, l in blocks:
16 |         if s > start:
17 |             yield (start, s-start)
18 |             start = s
19 |         if s+l > start:
20 |             start = s+l
21 |     if start < end:
22 |         yield (start, end-start)
23 | 
24 | class RootSeekableFieldSet(GenericFieldSet):
25 |     def seekBit(self, address, relative=True):
26 |         if not relative:
27 |             address -= self.absolute_address
28 |         if address < 0:
29 |             raise ParserError("Seek below field set start (%s.%s)" % divmod(address, 8))
30 |         self._current_size = address
31 |         return None
32 | 
33 |     def seekByte(self, address, relative=True):
34 |         return self.seekBit(address*8, relative)
35 | 
36 |     def _fixLastField(self):
37 |         """
38 |         Try to fix last field when we know current field set size.
39 |         Returns new added field if any, or None.
40 |         """
41 |         assert self._size is not None
42 | 
43 |         # Stop parser
44 |         message = ["stop parser"]
45 |         self._field_generator = None
46 | 
47 |         # If last field is too big, delete it
48 |         while self._size < self._current_size:
49 |             field = self._deleteField(len(self._fields)-1)
50 |             message.append("delete field %s" % field.path)
51 |         assert self._current_size <= self._size
52 | 
53 |         blocks = [(x.absolute_address, x.size) for x in self._fields]
54 |         fields = []
55 |         self._size = max(self._size, max(a+b for a,b in blocks) - self.absolute_address)
56 |         for start, length in getgaps(self.absolute_address, self._size, blocks):
57 |             self.seekBit(start, relative=False)
58 |             field = createRawField(self, length, "unparsed[]")
59 |             self.setUniqueFieldName(field)
60 |             self._fields.append(field.name, field)
61 |             fields.append(field)
62 |             message.append("found unparsed segment: start %s, length %s" % (start, length))
63 |         self.seekBit(self._size + self.absolute_address, relative=False)
64 |         message = ", ".join(message)
65 |         if fields:
66 |             self.warning("[Autofix] Fix parser error: " + message)
67 |         return fields
68 | 
69 |     def _stopFeeding(self):
70 |         new_field = None
71 |         if self._size is None:
72 |             if self._parent:
73 |                 self._size = self._current_size
74 | 
75 |         new_field = self._fixLastField()
76 |         self._field_generator = None
77 |         return new_field
78 | 
79 | class SeekableFieldSet(RootSeekableFieldSet):
80 |     def __init__(self, parent, name, description=None, size=None):
81 |         assert issubclass(parent.__class__, BasicFieldSet)
82 |         RootSeekableFieldSet.__init__(self, parent, name, parent.stream, description, size)
83 | 


--------------------------------------------------------------------------------
/hachoir_parser/audio/au.py:
--------------------------------------------------------------------------------
 1 | """
 2 | AU audio file parser
 3 | 
 4 | Author: Victor Stinner
 5 | Creation: 12 july 2006
 6 | """
 7 | 
 8 | from hachoir_parser import Parser
 9 | from hachoir_core.field import UInt32, Enum, String, RawBytes
10 | from hachoir_core.endian import BIG_ENDIAN
11 | from hachoir_core.text_handler import displayHandler, filesizeHandler
12 | from hachoir_core.tools import createDict, humanFrequency
13 | 
14 | class AuFile(Parser):
15 |     PARSER_TAGS = {
16 |         "id": "sun_next_snd",
17 |         "category": "audio",
18 |         "file_ext": ("au", "snd"),
19 |         "mime": (u"audio/basic",),
20 |         "min_size": 24*8,
21 |         "magic": ((".snd", 0),),
22 |         "description": "Sun/NeXT audio"
23 |     }
24 |     endian = BIG_ENDIAN
25 | 
26 |     CODEC_INFO = {
27 |         1: (8,    u"8-bit ISDN u-law"),
28 |         2: (8,    u"8-bit linear PCM"),
29 |         3: (16,   u"16-bit linear PCM"),
30 |         4: (24,   u"24-bit linear PCM"),
31 |         5: (32,   u"32-bit linear PCM"),
32 |         6: (32,   u"32-bit IEEE floating point"),
33 |         7: (64,   u"64-bit IEEE floating point"),
34 |         8: (None, u"Fragmented sample data"),
35 |         9: (None, u"DSP program"),
36 |        10: (8,    u"8-bit fixed point"),
37 |        11: (16,   u"16-bit fixed point"),
38 |        12: (24,   u"24-bit fixed point"),
39 |        13: (32,   u"32-bit fixed point"),
40 |        18: (16,   u"16-bit linear with emphasis"),
41 |        19: (16,   u"16-bit linear compressed"),
42 |        20: (16,   u"16-bit linear with emphasis and compression"),
43 |        21: (None, u"Music kit DSP commands"),
44 |        23: (None, u"4-bit ISDN u-law compressed (CCITT G.721 ADPCM)"),
45 |        24: (None, u"ITU-T G.722 ADPCM"),
46 |        25: (None, u"ITU-T G.723 3-bit ADPCM"),
47 |        26: (None, u"ITU-T G.723 5-bit ADPCM"),
48 |        27: (8,    u"8-bit ISDN A-law"),
49 |     }
50 | 
51 |     # Create bit rate and codec name dictionnaries
52 |     BITS_PER_SAMPLE = createDict(CODEC_INFO, 0)
53 |     CODEC_NAME = createDict(CODEC_INFO, 1)
54 | 
55 |     VALID_NB_CHANNEL = set((1,2))   # FIXME: 4, 5, 7, 8 channels are supported?
56 | 
57 |     def validate(self):
58 |         if self.stream.readBytes(0, 4) != ".snd":
59 |             return "Wrong file signature"
60 |         if self["channels"].value not in self.VALID_NB_CHANNEL:
61 |             return "Invalid number of channel"
62 |         return True
63 | 
64 |     def getBitsPerSample(self):
65 |         """
66 |         Get bit rate (number of bit per sample per channel),
67 |         may returns None if you unable to compute it.
68 |         """
69 |         return self.BITS_PER_SAMPLE.get(self["codec"].value)
70 | 
71 |     def createFields(self):
72 |         yield String(self, "signature", 4, 'Format signature (".snd")', charset="ASCII")
73 |         yield UInt32(self, "data_ofs", "Data offset")
74 |         yield filesizeHandler(UInt32(self, "data_size", "Data size"))
75 |         yield Enum(UInt32(self, "codec", "Audio codec"), self.CODEC_NAME)
76 |         yield displayHandler(UInt32(self, "sample_rate", "Number of samples/second"), humanFrequency)
77 |         yield UInt32(self, "channels", "Number of interleaved channels")
78 | 
79 |         size = self["data_ofs"].value - self.current_size // 8
80 |         if 0 < size:
81 |             yield String(self, "info", size, "Information", strip=" \0", charset="ISO-8859-1")
82 | 
83 |         size = min(self["data_size"].value, (self.size - self.current_size) // 8)
84 |         yield RawBytes(self, "audio_data", size, "Audio data")
85 | 
86 |     def createContentSize(self):
87 |         return (self["data_ofs"].value + self["data_size"].value) * 8
88 | 
89 | 


--------------------------------------------------------------------------------
/hachoir_parser/video/amf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | AMF metadata (inside Flash video, FLV file) parser.
  3 | 
  4 | Documentation:
  5 | 
  6 |  - flashticle: Python project to read Flash (formats SWF, FLV and AMF)
  7 |    http://undefined.org/python/#flashticle
  8 | 
  9 | Author: Victor Stinner
 10 | Creation date: 4 november 2006
 11 | """
 12 | 
 13 | from hachoir_core.field import (FieldSet, ParserError,
 14 |     UInt8, UInt16, UInt32, PascalString16, Float64)
 15 | from hachoir_core.tools import timestampUNIX
 16 | 
 17 | def parseUTF8(parent):
 18 |     yield PascalString16(parent, "value", charset="UTF-8")
 19 | 
 20 | def parseDouble(parent):
 21 |     yield Float64(parent, "value")
 22 | 
 23 | def parseBool(parent):
 24 |     yield UInt8(parent, "value")
 25 | 
 26 | def parseArray(parent):
 27 |     yield UInt32(parent, "count")
 28 |     for index in xrange(parent["count"].value):
 29 |         yield AMFObject(parent, "item[]")
 30 | 
 31 | def parseObjectAttributes(parent):
 32 |     while True:
 33 |         item = Attribute(parent, "attr[]")
 34 |         yield item
 35 |         if item["key"].value == "":
 36 |             break
 37 | 
 38 | def parseMixedArray(parent):
 39 |     yield UInt32(parent, "count")
 40 |     for index in xrange(parent["count"].value + 1):
 41 |         item = Attribute(parent, "item[]")
 42 |         yield item
 43 |         if not item['key'].value:
 44 |             break
 45 | 
 46 | def parseDate(parent):
 47 |     yield Float64(parent, "timestamp_microsec")
 48 |     yield UInt16(parent, "timestamp_sec")
 49 | 
 50 | def parseNothing(parent):
 51 |     raise StopIteration()
 52 | 
 53 | class AMFObject(FieldSet):
 54 |     CODE_DATE = 11
 55 |     tag_info = {
 56 |         # http://osflash.org/amf/astypes
 57 |          0: (parseDouble, "Double"),
 58 |          1: (parseBool, "Boolean"),
 59 |          2: (parseUTF8, "UTF-8 string"),
 60 |          3: (parseObjectAttributes, "Object attributes"),
 61 |         #MOVIECLIP = '\x04',
 62 |         #NULL = '\x05',
 63 |         #UNDEFINED = '\x06',
 64 |         #REFERENCE = '\x07',
 65 |          8: (parseMixedArray, "Mixed array"),
 66 |          9: (parseNothing, "End of object"),
 67 |         10: (parseArray, "Array"),
 68 |         CODE_DATE: (parseDate, "Date"),
 69 |         #LONGUTF8 = '\x0c',
 70 |         #UNSUPPORTED = '\x0d',
 71 |         ## Server-to-client only
 72 |         #RECORDSET = '\x0e',
 73 |         #XML = '\x0f',
 74 |         #TYPEDOBJECT = '\x10',
 75 |     }
 76 | 
 77 |     def __init__(self, *args, **kw):
 78 |         FieldSet.__init__(self, *args, **kw)
 79 |         code = self["type"].value
 80 |         try:
 81 |             self.parser, desc = self.tag_info[code]
 82 |             if code == self.CODE_DATE:
 83 |                 self.createValue = self.createValueDate
 84 |         except KeyError:
 85 |             raise ParserError("AMF: Unable to parse type %s" % code)
 86 | 
 87 |     def createFields(self):
 88 |         yield UInt8(self, "type")
 89 |         for field in self.parser(self):
 90 |             yield field
 91 | 
 92 |     def createValueDate(self):
 93 |         value = (self["timestamp_microsec"].value * 0.001) \
 94 |             - (self["timestamp_sec"].value * 60)
 95 |         return timestampUNIX(value)
 96 | 
 97 | class Attribute(AMFObject):
 98 |     def __init__(self, *args):
 99 |         AMFObject.__init__(self, *args)
100 |         self._description = None
101 | 
102 |     def createFields(self):
103 |         yield PascalString16(self, "key", charset="UTF-8")
104 |         yield UInt8(self, "type")
105 |         for field in self.parser(self):
106 |             yield field
107 | 
108 |     def createDescription(self):
109 |         return 'Attribute "%s"' % self["key"].value
110 | 
111 | 


--------------------------------------------------------------------------------
/myparser.py:
--------------------------------------------------------------------------------
  1 | import string
  2 | import re
  3 | 
  4 | class parser:
  5 | 	def __init__(self,results,word=""):
  6 | 		self.results=results
  7 | 		self.word=word
  8 | 		self.temp=[]
  9 | 		self.file=file
 10 | 		
 11 | 	def genericClean(self):
 12 | 		self.results = re.sub('<em>', '', self.results)
 13 | 		self.results = re.sub('<b>', '', self.results)
 14 | 		self.results = re.sub('</b>', '', self.results)
 15 | 		self.results = re.sub('</em>', '', self.results)
 16 | 		self.results = re.sub('%2f', ' ', self.results)
 17 | 		self.results = re.sub('%3a', ' ', self.results)
 18 | 		self.results = re.sub('<strong>', '', self.results)
 19 | 		self.results = re.sub('</strong>', '', self.results)	
 20 | 		self.results = re.sub('<w:t>',' ',self.results)
 21 | 
 22 | 
 23 | 		for e in ('>',':','=', '<', '/', '\\',';','&','%3A','%3D','%3C'):
 24 | 			self.results = string.replace(self.results, e, ' ')
 25 | 			
 26 | 	def urlClean(self):
 27 | 		self.results = re.sub('<em>', '', self.results)
 28 | 		self.results = re.sub('</em>', '', self.results)
 29 | 		self.results = re.sub('%2f', ' ', self.results)
 30 | 		self.results = re.sub('%3a', ' ', self.results)
 31 | 		for e in ('<','>',':','=',';','&','%3A','%3D','%3C'):
 32 | 			self.results = string.replace(self.results, e, ' ')
 33 | 		
 34 | 	def emails(self):
 35 | 		self.genericClean()
 36 | 		reg_emails = re.compile('[a-zA-Z0-9.-_]+' + '@' + '[a-zA-Z0-9.-]+')
 37 | 		self.temp = reg_emails.findall(self.results)
 38 | 		emails=self.unique()
 39 | 		return emails
 40 | 	
 41 | 	def fileurls(self):
 42 | 		urls=[]
 43 | 		reg_urls = re.compile('<a href="(.*?)"')
 44 | 		self.temp = reg_urls.findall(self.results)
 45 | 		allurls=self.unique()
 46 | 		for z in allurls:
 47 | 			y = string.replace(z,'/url?q=','')
 48 | 			x = y.split('&')[0]
 49 | 			if x.count('webcache') or x.count('google.com') or x.count('search?') or x.count('about.html') or x.count('privacy.html') or x.count('ads/') or x.count('services/') or x == "#" or x=="/":
 50 | 				pass
 51 | 			else:
 52 | 				urls.append(x)
 53 | 		return urls
 54 | 	
 55 | 	def people_linkedin(self):
 56 | 		reg_people = re.compile('">[a-zA-Z0-9._ -]* profiles | LinkedIn')
 57 | 		
 58 | 		self.temp = reg_people.findall(self.results)
 59 | 		resul = []
 60 | 		for x in self.temp:
 61 | 				y = string.replace(x, '  LinkedIn', '')
 62 | 				y = string.replace(y, ' profiles ', '')
 63 | 				y = string.replace(y, 'LinkedIn', '')
 64 | 				y = string.replace(y, '"', '')
 65 | 				y = string.replace(y, '>', '')
 66 | 				if y !=" ":
 67 | 					resul.append(y)
 68 | 		return resul
 69 | 
 70 | 	def profiles(self):
 71 | 		reg_people = re.compile('">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
 72 | 		self.temp = reg_people.findall(self.results)
 73 | 		resul = []
 74 | 		for x in self.temp:
 75 | 				y = string.replace(x, ' <em>Google Profile</em>', '')
 76 | 				y = string.replace(y, '-', '')
 77 | 				y = string.replace(y, '">', '')
 78 | 				if y !=" ":
 79 | 					resul.append(y)
 80 | 		return resul
 81 | 	
 82 | 	
 83 | 	def hostnames(self):
 84 | 		self.genericClean()
 85 | 		reg_hosts = re.compile('[a-zA-Z0-9.-]*\.'+ self.word)
 86 | 		self.temp = reg_hosts.findall(self.results)
 87 | 		hosts=self.unique()
 88 | 		return hosts
 89 | 
 90 | 	def hostnames_all(self):
 91 | 		reg_hosts = re.compile('<cite>(.*?)</cite>')
 92 | 		temp = reg_hosts.findall(self.results)
 93 | 		for x in temp:
 94 | 			if x.count(':'):
 95 | 				res=x.split(':')[1].split('/')[2]
 96 | 			else:
 97 | 				res=x.split("/")[0]
 98 | 			self.temp.append(res)
 99 | 		hostnames=self.unique()
100 | 		return hostnames
101 | 		
102 | 	def unique(self):
103 | 		self.new=[]
104 | 		for x in self.temp:
105 | 			if x not in self.new:
106 | 				self.new.append(x)
107 | 		return self.new
108 | 


--------------------------------------------------------------------------------
/extractors/metadataPDF.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2
  2 | #
  3 | # metadataPDF.py - dump pdf metadata 
  4 | #
  5 | # Copy of Yusuke's dumppdf to add dumpmeta
  6 | import sys, re, os
  7 | from pdfminer.psparser import PSKeyword, PSLiteral
  8 | from pdfminer.pdfparser import PDFDocument, PDFParser
  9 | from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
 10 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
 11 | from pdfminer.pdfdevice import PDFDevice, TagExtractor
 12 | from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
 13 | from pdfminer.cmapdb import CMapDB
 14 | from pdfminer.layout import LAParams
 15 | import myparser
 16 | 
 17 | 
 18 | # dumpmeta
 19 | class metapdf:
 20 | 	def __init__(self,fname, password=''):
 21 | 		self.fname=fname
 22 | 		self.password=password
 23 | 		self.metadata=''
 24 | 		self.users=[]
 25 | 		self.software=[]
 26 | 		self.paths=[]
 27 | 		self.raw=""
 28 | 		self.company=[]
 29 | 		self.text=""
 30 | 	
 31 | 	def getTexts(self):
 32 | 		try:
 33 | 			password =''
 34 | 			pagenos = set()
 35 | 			maxpages = 0
 36 | 			codec = 'utf-8'
 37 | 			caching = True
 38 | 			laparams = LAParams()
 39 | 			rsrcmgr = PDFResourceManager(caching=caching)
 40 | 			outfp = file('temppdf.txt','w')
 41 | 			device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
 42 | 			fname= self.fname
 43 | 			fp = file(fname, 'rb')
 44 | 			process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True)
 45 | 			fp.close()
 46 | 			device.close()
 47 | 			outfp.close()
 48 | 			infp = file('temppdf.txt','rb')
 49 | 			test=infp.read()
 50 | 			infp.close()
 51 | 			os.remove('temppdf.txt')
 52 | 			self.text=test
 53 | 			return "ok"
 54 | 		except Exception,e:
 55 | 			return e
 56 | 
 57 | 	def getData(self):
 58 | 		doc = PDFDocument()
 59 | 		fp = file(self.fname, 'rb')
 60 | 		parser = PDFParser(fp)
 61 | 		try:
 62 | 			parser.set_document(doc)
 63 | 			doc.set_parser(parser)
 64 | 			doc.initialize(self.password)
 65 | 		except:
 66 | 			return "error"
 67 | 		
 68 | 		parser.close()
 69 | 		fp.close()
 70 | 		#try:
 71 | 		#	metadata = resolve1(doc.catalog['Metadata'])
 72 | 		#	return "ok"
 73 | 		#except:
 74 | 		#	print "[x] Error in PDF extractor, Metadata catalog"
 75 | 		try:
 76 | 			for xref in doc.xrefs:
 77 | 				info_ref=xref.trailer.get('Info')
 78 | 				if info_ref:
 79 | 					info=resolve1(info_ref)
 80 | 				self.metadata=info
 81 | 				self.raw = info
 82 | 			if self.raw == None:
 83 | 				return "Empty metadata"
 84 | 			else:
 85 | 				return "ok"
 86 | 		except Exception,e:
 87 | 			return e 
 88 | 			print "\t [x] Error in PDF extractor, Trailer Info"
 89 | 
 90 | 	def getEmails(self):
 91 | 		em=myparser.parser(self.text)
 92 | 		return em.emails()
 93 | 		
 94 | 	def getHosts(self,domain):
 95 | 		em=myparser.parser(self.text,domain)
 96 | 		return em.hostnames()
 97 | 		
 98 | 	def getUsers(self):
 99 | 		if self.metadata.has_key('Author'):
100 | 			self.users.append(self.metadata['Author'])
101 | 		return self.users	
102 | 	def getCompany(self):
103 | 		try:
104 | 			self.users.append(self.metadata['Company'])
105 | 		except:
106 | 			print "\t [x] Error in PDF metadata Company"	
107 | 		return self.company
108 | 
109 | 
110 | 	def getSoftware(self):
111 | 		try:
112 | 			self.software.append(self.metadata['Producer'])
113 | 		except:	
114 | 			print "\t [x] Error in PDF metadata Software"
115 | 		try:
116 | 			self.software.append(self.metadata['Creator'])
117 | 		except:
118 | 			print "\t [x] Error in PDF metadata Creator"
119 | 		return self.software
120 | 
121 | 	def getPaths(self):
122 | 		return self.paths
123 | 
124 | 	def getRaw(self):
125 | 		return self.raw
126 | 


--------------------------------------------------------------------------------
/hachoir_core/field/link.py:
--------------------------------------------------------------------------------
  1 | from hachoir_core.field import Field, FieldSet, ParserError, Bytes, MissingField
  2 | from hachoir_core.stream import FragmentedStream
  3 | 
  4 | 
  5 | class Link(Field):
  6 |     def __init__(self, parent, name, *args, **kw):
  7 |         Field.__init__(self, parent, name, 0, *args, **kw)
  8 | 
  9 |     def hasValue(self):
 10 |         return True
 11 | 
 12 |     def createValue(self):
 13 |         return self._parent[self.display]
 14 | 
 15 |     def createDisplay(self):
 16 |         value = self.value
 17 |         if value is None:
 18 |             return "<%s>" % MissingField.__name__
 19 |         return value.path
 20 | 
 21 |     def _getField(self, name, const):
 22 |         target = self.value
 23 |         assert self != target
 24 |         return target._getField(name, const)
 25 | 
 26 | 
 27 | class Fragments:
 28 |     def __init__(self, first):
 29 |         self.first = first
 30 | 
 31 |     def __iter__(self):
 32 |         fragment = self.first
 33 |         while fragment is not None:
 34 |             data = fragment.getData()
 35 |             yield data and data.size
 36 |             fragment = fragment.next
 37 | 
 38 | 
 39 | class Fragment(FieldSet):
 40 |     _first = None
 41 | 
 42 |     def __init__(self, *args, **kw):
 43 |         FieldSet.__init__(self, *args, **kw)
 44 |         self._field_generator = self._createFields(self._field_generator)
 45 |         if self.__class__.createFields == Fragment.createFields:
 46 |             self._getData = lambda: self
 47 | 
 48 |     def getData(self):
 49 |         try:
 50 |             return self._getData()
 51 |         except MissingField, e:
 52 |             self.error(str(e))
 53 |         return None
 54 | 
 55 |     def setLinks(self, first, next=None):
 56 |         self._first = first or self
 57 |         self._next = next
 58 |         self._feedLinks = lambda: self
 59 |         return self
 60 | 
 61 |     def _feedLinks(self):
 62 |         while self._first is None and self.readMoreFields(1):
 63 |             pass
 64 |         if self._first is None:
 65 |             raise ParserError("first is None")
 66 |         return self
 67 |     first = property(lambda self: self._feedLinks()._first)
 68 | 
 69 |     def _getNext(self):
 70 |         next = self._feedLinks()._next
 71 |         if callable(next):
 72 |             self._next = next = next()
 73 |         return next
 74 |     next  = property(_getNext)
 75 | 
 76 |     def _createInputStream(self, **args):
 77 |         first = self.first
 78 |         if first is self and hasattr(first, "_getData"):
 79 |             return FragmentedStream(first, packets=Fragments(first), **args)
 80 |         return FieldSet._createInputStream(self, **args)
 81 | 
 82 |     def _createFields(self, field_generator):
 83 |         if self._first is None:
 84 |             for field in field_generator:
 85 |                 if self._first is not None:
 86 |                     break
 87 |                 yield field
 88 |             else:
 89 |                 raise ParserError("Fragment.setLinks not called")
 90 |         else:
 91 |             field = None
 92 |         if self._first is not self:
 93 |             link = Link(self, "first", None)
 94 |             link._getValue = lambda: self._first
 95 |             yield link
 96 |         if self._next:
 97 |             link = Link(self, "next", None)
 98 |             link.createValue = self._getNext
 99 |             yield link
100 |         if field:
101 |             yield field
102 |         for field in field_generator:
103 |             yield field
104 | 
105 |     def createFields(self):
106 |         if self._size is None:
107 |             self._size = self._getSize()
108 |         yield Bytes(self, "data", self._size/8)
109 | 
110 | 


--------------------------------------------------------------------------------
/hachoir_parser/program/exe_ne.py:
--------------------------------------------------------------------------------
 1 | from hachoir_core.field import (FieldSet,
 2 |     Bit, UInt8, UInt16, UInt32, Bytes,
 3 |     PaddingBits, PaddingBytes, NullBits, NullBytes)
 4 | from hachoir_core.text_handler import textHandler, hexadecimal, filesizeHandler
 5 | 
 6 | class NE_Header(FieldSet):
 7 |     static_size = 64*8
 8 |     def createFields(self):
 9 |         yield Bytes(self, "signature", 2, "New executable signature (NE)")
10 |         yield UInt8(self, "link_ver", "Linker version number")
11 |         yield UInt8(self, "link_rev", "Linker revision number")
12 |         yield UInt16(self, "entry_table_ofst", "Offset to the entry table")
13 |         yield UInt16(self, "entry_table_size", "Length (in bytes) of the entry table")
14 |         yield PaddingBytes(self, "reserved[]", 4)
15 | 
16 |         yield Bit(self, "is_dll", "Is a dynamic-link library (DLL)?")
17 |         yield Bit(self, "is_win_app", "Is a Windows application?")
18 |         yield PaddingBits(self, "reserved[]", 9)
19 |         yield Bit(self, "first_seg_code", "First segment contains code that loads the application?")
20 |         yield NullBits(self, "reserved[]", 1)
21 |         yield Bit(self, "link_error", "Load even if linker detects errors?")
22 |         yield NullBits(self, "reserved[]", 1)
23 |         yield Bit(self, "is_lib", "Is a library module?")
24 | 
25 |         yield UInt16(self, "auto_data_seg", "Automatic data segment number")
26 |         yield filesizeHandler(UInt16(self, "local_heap_size", "Initial size (in bytes) of the local heap"))
27 |         yield filesizeHandler(UInt16(self, "stack_size", "Initial size (in bytes) of the stack"))
28 |         yield textHandler(UInt32(self, "cs_ip", "Value of CS:IP"), hexadecimal)
29 |         yield textHandler(UInt32(self, "ss_sp", "Value of SS:SP"), hexadecimal)
30 | 
31 |         yield UInt16(self, "nb_entry_seg_tab", "Number of entries in the segment table")
32 |         yield UInt16(self, "nb_entry_modref_tab", "Number of entries in the module-reference table")
33 |         yield filesizeHandler(UInt16(self, "size_nonres_name_tab", "Number of bytes in the nonresident-name table"))
34 |         yield UInt16(self, "seg_tab_ofs", "Segment table offset")
35 |         yield UInt16(self, "rsrc_ofs", "Resource offset")
36 | 
37 |         yield UInt16(self, "res_name_tab_ofs", "Resident-name table offset")
38 |         yield UInt16(self, "mod_ref_tab_ofs", "Module-reference table offset")
39 |         yield UInt16(self, "import_tab_ofs", "Imported-name table offset")
40 | 
41 |         yield UInt32(self, "non_res_name_tab_ofs", "Nonresident-name table offset")
42 |         yield UInt16(self, "nb_mov_ent_pt", "Number of movable entry points")
43 |         yield UInt16(self, "log2_sector_size", "Log2 of the segment sector size")
44 |         yield UInt16(self, "nb_rsrc_seg", "Number of resource segments")
45 | 
46 |         yield Bit(self, "unknown_os_format", "Operating system format is unknown")
47 |         yield PaddingBits(self, "reserved[]", 1)
48 |         yield Bit(self, "os_windows", "Operating system is Microsoft Windows")
49 |         yield NullBits(self, "reserved[]", 6)
50 |         yield Bit(self, "is_win20_prot", "Is Windows 2.x application running in version 3.x protected mode")
51 |         yield Bit(self, "is_win20_font", "Is Windows 2.x application supporting proportional fonts")
52 |         yield Bit(self, "fast_load", "Contains a fast-load area?")
53 |         yield NullBits(self, "reserved[]", 4)
54 | 
55 |         yield UInt16(self, "fastload_ofs", "Fast-load area offset (in sector)")
56 |         yield UInt16(self, "fastload_size", "Fast-load area length (in sector)")
57 | 
58 |         yield NullBytes(self, "reserved[]", 2)
59 |         yield textHandler(UInt16(self, "win_version", "Expected Windows version number"), hexadecimal)
60 | 
61 | 


--------------------------------------------------------------------------------
/hachoir_core/field/float.py:
--------------------------------------------------------------------------------
  1 | from hachoir_core.field import Bit, Bits, FieldSet
  2 | from hachoir_core.endian import BIG_ENDIAN, LITTLE_ENDIAN
  3 | import struct
  4 | 
  5 | # Make sure that we use right struct types
  6 | assert struct.calcsize("f") == 4
  7 | assert struct.calcsize("d") == 8
  8 | assert struct.unpack("<d", "\x1f\x85\xebQ\xb8\x1e\t@")[0] == 3.14
  9 | assert struct.unpack(">d", "\xc0\0\0\0\0\0\0\0")[0] == -2.0
 10 | 
 11 | class FloatMantissa(Bits):
 12 |     def createValue(self):
 13 |         value = Bits.createValue(self)
 14 |         return 1 + float(value) / (2 ** self.size)
 15 | 
 16 |     def createRawDisplay(self):
 17 |         return unicode(Bits.createValue(self))
 18 | 
 19 | class FloatExponent(Bits):
 20 |     def __init__(self, parent, name, size):
 21 |         Bits.__init__(self, parent, name, size)
 22 |         self.bias = 2 ** (size-1) - 1
 23 | 
 24 |     def createValue(self):
 25 |         return Bits.createValue(self) - self.bias
 26 | 
 27 |     def createRawDisplay(self):
 28 |         return unicode(self.value + self.bias)
 29 | 
 30 | def floatFactory(name, format, mantissa_bits, exponent_bits, doc):
 31 |     size = 1 + mantissa_bits + exponent_bits
 32 | 
 33 |     class Float(FieldSet):
 34 |         static_size = size
 35 |         __doc__ = doc
 36 | 
 37 |         def __init__(self, parent, name, description=None):
 38 |             assert parent.endian in (BIG_ENDIAN, LITTLE_ENDIAN)
 39 |             FieldSet.__init__(self, parent, name, description, size)
 40 |             if format:
 41 |                 if self._parent.endian == BIG_ENDIAN:
 42 |                     self.struct_format = ">"+format
 43 |                 else:
 44 |                     self.struct_format = "<"+format
 45 |             else:
 46 |                 self.struct_format = None
 47 | 
 48 |         def createValue(self):
 49 |             """
 50 |             Create float value: use struct.unpack() when it's possible
 51 |             (32 and 64-bit float) or compute it with :
 52 |                mantissa * (2.0 ** exponent)
 53 | 
 54 |             This computation may raise an OverflowError.
 55 |             """
 56 |             if self.struct_format:
 57 |                 raw = self._parent.stream.readBytes(
 58 |                     self.absolute_address, self._size//8)
 59 |                 try:
 60 |                     return struct.unpack(self.struct_format, raw)[0]
 61 |                 except struct.error, err:
 62 |                     raise ValueError("[%s] conversion error: %s" %
 63 |                         (self.__class__.__name__, err))
 64 |             else:
 65 |                 try:
 66 |                     value = self["mantissa"].value * (2.0 ** float(self["exponent"].value))
 67 |                     if self["negative"].value:
 68 |                         return -(value)
 69 |                     else:
 70 |                         return value
 71 |                 except OverflowError:
 72 |                     raise ValueError("[%s] floating point overflow" %
 73 |                         self.__class__.__name__)
 74 | 
 75 |         def createFields(self):
 76 |             yield Bit(self, "negative")
 77 |             yield FloatExponent(self, "exponent", exponent_bits)
 78 |             if 64 <= mantissa_bits:
 79 |                 yield Bit(self, "one")
 80 |                 yield FloatMantissa(self, "mantissa", mantissa_bits-1)
 81 |             else:
 82 |                 yield FloatMantissa(self, "mantissa", mantissa_bits)
 83 | 
 84 |     cls = Float
 85 |     cls.__name__ = name
 86 |     return cls
 87 | 
 88 | # 32-bit float (standard: IEEE 754/854)
 89 | Float32 = floatFactory("Float32", "f", 23, 8,
 90 |     "Floating point number: format IEEE 754 int 32 bit")
 91 | 
 92 | # 64-bit float (standard: IEEE 754/854)
 93 | Float64 = floatFactory("Float64", "d", 52, 11,
 94 |     "Floating point number: format IEEE 754 in 64 bit")
 95 | 
 96 | # 80-bit float (standard: IEEE 754/854)
 97 | Float80 = floatFactory("Float80", None, 64, 15,
 98 |     "Floating point number: format IEEE 754 in 80 bit")
 99 | 
100 | 


--------------------------------------------------------------------------------
/hachoir_parser/video/mpeg_ts.py:
--------------------------------------------------------------------------------
  1 | """
  2 | MPEG-2 Transport Stream parser.
  3 | 
  4 | Documentation:
  5 | - MPEG-2 Transmission
  6 |   http://erg.abdn.ac.uk/research/future-net/digital-video/mpeg2-trans.html
  7 | 
  8 | Author: Victor Stinner
  9 | Creation date: 13 january 2007
 10 | """
 11 | 
 12 | from hachoir_parser import Parser
 13 | from hachoir_core.field import (FieldSet, ParserError, MissingField,
 14 |     UInt8, Enum, Bit, Bits, RawBytes)
 15 | from hachoir_core.endian import BIG_ENDIAN
 16 | from hachoir_core.text_handler import textHandler, hexadecimal
 17 | 
 18 | class Packet(FieldSet):
 19 |     def __init__(self, *args):
 20 |         FieldSet.__init__(self, *args)
 21 |         if self["has_error"].value:
 22 |             self._size = 204*8
 23 |         else:
 24 |             self._size = 188*8
 25 | 
 26 |     PID = {
 27 |         0x0000: "Program Association Table (PAT)",
 28 |         0x0001: "Conditional Access Table (CAT)",
 29 |         # 0x0002..0x000f: reserved
 30 |         # 0x0010..0x1FFE: network PID, program map PID, elementary PID, etc.
 31 |         # TODO: Check above values
 32 |         #0x0044: "video",
 33 |         #0x0045: "audio",
 34 |         0x1FFF: "Null packet",
 35 |     }
 36 | 
 37 |     def createFields(self):
 38 |         yield textHandler(UInt8(self, "sync", 8), hexadecimal)
 39 |         if self["sync"].value != 0x47:
 40 |             raise ParserError("MPEG-2 TS: Invalid synchronization byte")
 41 |         yield Bit(self, "has_error")
 42 |         yield Bit(self, "payload_unit_start")
 43 |         yield Bit(self, "priority")
 44 |         yield Enum(textHandler(Bits(self, "pid", 13, "Program identifier"), hexadecimal), self.PID)
 45 |         yield Bits(self, "scrambling_control", 2)
 46 |         yield Bit(self, "has_adaptation")
 47 |         yield Bit(self, "has_payload")
 48 |         yield Bits(self, "counter", 4)
 49 |         yield RawBytes(self, "payload", 184)
 50 |         if self["has_error"].value:
 51 |             yield RawBytes(self, "error_correction", 16)
 52 | 
 53 |     def createDescription(self):
 54 |         text = "Packet: PID %s" % self["pid"].display
 55 |         if self["payload_unit_start"].value:
 56 |             text += ", start of payload"
 57 |         return text
 58 | 
 59 |     def isValid(self):
 60 |         if not self["has_payload"].value and not self["has_adaptation"].value:
 61 |             return u"No payload and no adaptation"
 62 |         pid = self["pid"].value
 63 |         if (0x0002 <= pid <= 0x000f) or (0x2000 <= pid):
 64 |             return u"Invalid program identifier (%s)" % self["pid"].display
 65 |         return ""
 66 | 
 67 | class MPEG_TS(Parser):
 68 |     PARSER_TAGS = {
 69 |         "id": "mpeg_ts",
 70 |         "category": "video",
 71 |         "file_ext": ("ts",),
 72 |         "min_size": 188*8,
 73 |         "description": u"MPEG-2 Transport Stream"
 74 |     }
 75 |     endian = BIG_ENDIAN
 76 | 
 77 |     def validate(self):
 78 |         sync = self.stream.searchBytes("\x47", 0, 204*8)
 79 |         if sync is None:
 80 |             return "Unable to find synchronization byte"
 81 |         for index in xrange(5):
 82 |             try:
 83 |                 packet = self["packet[%u]" % index]
 84 |             except (ParserError, MissingField):
 85 |                 if index and self.eof:
 86 |                     return True
 87 |                 else:
 88 |                     return "Unable to get packet #%u" % index
 89 |             err = packet.isValid()
 90 |             if err:
 91 |                 return "Packet #%u is invalid: %s" % (index, err)
 92 |         return True
 93 | 
 94 |     def createFields(self):
 95 |         while not self.eof:
 96 |             sync = self.stream.searchBytes("\x47", self.current_size, self.current_size+204*8)
 97 |             if sync is None:
 98 |                 raise ParserError("Unable to find synchronization byte")
 99 |             elif sync:
100 |                 yield RawBytes(self, "incomplete_packet[]", (sync-self.current_size)//8)
101 |             yield Packet(self, "packet[]")
102 | 
103 | 


--------------------------------------------------------------------------------
/hachoir_parser/network/common.py:
--------------------------------------------------------------------------------
  1 | from hachoir_core.field import FieldSet, Field, Bits
  2 | from hachoir_core.bits import str2hex
  3 | from hachoir_parser.network.ouid import REGISTERED_OUID
  4 | from hachoir_core.endian import BIG_ENDIAN
  5 | from socket import gethostbyaddr, herror as socket_host_error
  6 | 
  7 | def ip2name(addr):
  8 |     if not ip2name.resolve:
  9 |         return addr
 10 |     try:
 11 |         if addr in ip2name.cache:
 12 |             return ip2name.cache[addr]
 13 |         # FIXME: Workaround Python bug
 14 |         # Need double try/except to catch the bug
 15 |         try:
 16 |             name = gethostbyaddr(addr)[0]
 17 |         except KeyboardInterrupt:
 18 |             raise
 19 |     except (socket_host_error, ValueError):
 20 |         name = addr
 21 |     except (socket_host_error, KeyboardInterrupt, ValueError):
 22 |         ip2name.resolve = False
 23 |         name = addr
 24 |     ip2name.cache[addr] = name
 25 |     return name
 26 | ip2name.cache = {}
 27 | ip2name.resolve = True
 28 | 
 29 | class IPv4_Address(Field):
 30 |     def __init__(self, parent, name, description=None):
 31 |         Field.__init__(self, parent, name, 32, description)
 32 | 
 33 |     def createValue(self):
 34 |         value = self._parent.stream.readBytes(self.absolute_address, 4)
 35 |         return ".".join(( "%u" % ord(byte) for byte in value ))
 36 | 
 37 |     def createDisplay(self):
 38 |         return ip2name(self.value)
 39 | 
 40 | class IPv6_Address(Field):
 41 |     def __init__(self, parent, name, description=None):
 42 |         Field.__init__(self, parent, name, 128, description)
 43 | 
 44 |     def createValue(self):
 45 |         value = self._parent.stream.readBits(self.absolute_address, 128, self.parent.endian)
 46 |         parts = []
 47 |         for index in xrange(8):
 48 |             part = "%04x" % (value & 0xffff)
 49 |             value >>= 16
 50 |             parts.append(part)
 51 |         return ':'.join(reversed(parts))
 52 | 
 53 |     def createDisplay(self):
 54 |         return self.value
 55 | 
 56 | class OrganizationallyUniqueIdentifier(Bits):
 57 |     """
 58 |     IEEE 24-bit Organizationally unique identifier
 59 |     """
 60 |     static_size = 24
 61 | 
 62 |     def __init__(self, parent, name, description=None):
 63 |         Bits.__init__(self, parent, name, 24, description=None)
 64 | 
 65 |     def createDisplay(self, human=True):
 66 |         if human:
 67 |             key = self.value
 68 |             if key in REGISTERED_OUID:
 69 |                 return REGISTERED_OUID[key]
 70 |             else:
 71 |                 return self.raw_display
 72 |         else:
 73 |             return self.raw_display
 74 | 
 75 |     def createRawDisplay(self):
 76 |         value = self.value
 77 |         a = value >> 16
 78 |         b = (value >> 8) & 0xFF
 79 |         c = value & 0xFF
 80 |         return "%02X-%02X-%02X" % (a, b, c)
 81 | 
 82 | class NIC24(Bits):
 83 |     static_size = 24
 84 | 
 85 |     def __init__(self, parent, name, description=None):
 86 |         Bits.__init__(self, parent, name, 24, description=None)
 87 | 
 88 |     def createDisplay(self):
 89 |         value = self.value
 90 |         a = value >> 16
 91 |         b = (value >> 8) & 0xFF
 92 |         c = value & 0xFF
 93 |         return "%02x:%02x:%02x" % (a, b, c)
 94 | 
 95 |     def createRawDisplay(self):
 96 |         return "0x%06X" % self.value
 97 | 
 98 | class MAC48_Address(FieldSet):
 99 |     """
100 |     IEEE 802 48-bit MAC address
101 |     """
102 |     static_size = 48
103 |     endian = BIG_ENDIAN
104 | 
105 |     def createFields(self):
106 |         yield OrganizationallyUniqueIdentifier(self, "organization")
107 |         yield NIC24(self, "nic")
108 | 
109 |     def hasValue(self):
110 |         return True
111 | 
112 |     def createValue(self):
113 |         bytes = self.stream.readBytes(self.absolute_address, 6)
114 |         return str2hex(bytes, format="%02x:")[:-1]
115 | 
116 |     def createDisplay(self):
117 |         return "%s [%s]" % (self["organization"].display, self["nic"].display)
118 | 
119 | 


--------------------------------------------------------------------------------
/hachoir_parser/audio/real_audio.py:
--------------------------------------------------------------------------------
 1 | """
 2 | RealAudio (.ra) parser
 3 | 
 4 | Author: Mike Melanson
 5 | References:
 6 |   http://wiki.multimedia.cx/index.php?title=RealMedia
 7 | Samples:
 8 |   http://samples.mplayerhq.hu/real/RA/
 9 | """
10 | 
11 | from hachoir_parser import Parser
12 | from hachoir_core.field import (FieldSet,
13 |     UInt8, UInt16, UInt32,
14 |     Bytes, RawBytes, String,
15 |     PascalString8)
16 | from hachoir_core.tools import humanFrequency
17 | from hachoir_core.text_handler import displayHandler
18 | from hachoir_core.endian import BIG_ENDIAN
19 | 
20 | class Metadata(FieldSet):
21 |     def createFields(self):
22 |         yield PascalString8(self, "title", charset="ISO-8859-1")
23 |         yield PascalString8(self, "author", charset="ISO-8859-1")
24 |         yield PascalString8(self, "copyright", charset="ISO-8859-1")
25 |         yield PascalString8(self, "comment", charset="ISO-8859-1")
26 | 
27 | class RealAudioFile(Parser):
28 |     MAGIC = ".ra\xFD"
29 |     PARSER_TAGS = {
30 |         "id": "real_audio",
31 |         "category": "audio",
32 |         "file_ext": ["ra"],
33 |         "mime": (u"audio/x-realaudio", u"audio/x-pn-realaudio"),
34 |         "min_size": 6*8,
35 |         "magic": ((MAGIC, 0),),
36 |         "description": u"Real audio (.ra)",
37 |     }
38 |     endian = BIG_ENDIAN
39 | 
40 |     def validate(self):
41 |         if self["signature"].value != self.MAGIC:
42 |             return "Invalid signature"
43 |         if self["version"].value not in (3, 4):
44 |             return "Unknown version"
45 |         return True
46 | 
47 |     def createFields(self):
48 |         yield Bytes(self, "signature", 4, r"RealAudio identifier ('.ra\xFD')")
49 |         yield UInt16(self, "version", "Version")
50 |         if self["version"].value == 3:
51 |             yield UInt16(self, "header_size", "Header size")
52 |             yield RawBytes(self, "Unknown1", 10)
53 |             yield UInt32(self, "data_size", "Data size")
54 |             yield Metadata(self, "metadata")
55 |             yield UInt8(self, "Unknown2")
56 |             yield PascalString8(self, "FourCC")
57 |             audio_size = self["data_size"].value
58 |         else: # version = 4
59 |             yield UInt16(self, "reserved1", "Reserved, should be 0")
60 |             yield String(self, "ra4sig", 4, "'.ra4' signature")
61 |             yield UInt32(self, "filesize", "File size (minus 40 bytes)")
62 |             yield UInt16(self, "version2", "Version 2 (always equal to version)")
63 |             yield UInt32(self, "headersize", "Header size (minus 16)")
64 |             yield UInt16(self, "codec_flavor", "Codec flavor")
65 |             yield UInt32(self, "coded_frame_size", "Coded frame size")
66 |             yield RawBytes(self, "unknown1", 12)
67 |             yield UInt16(self, "subpacketh", "Subpacket h (?)")
68 |             yield UInt16(self, "frame_size", "Frame size")
69 |             yield UInt16(self, "sub_packet_size", "Subpacket size")
70 |             yield UInt16(self, "unknown2", "Unknown")
71 |             yield displayHandler(UInt16(self, "sample_rate", "Sample rate"), humanFrequency)
72 |             yield UInt16(self, "unknown3", "Unknown")
73 |             yield UInt16(self, "sample_size", "Sample size")
74 |             yield UInt16(self, "channels", "Channels")
75 |             yield PascalString8(self, "Interleaving ID String")
76 |             yield PascalString8(self, "FourCC")
77 |             yield RawBytes(self, "unknown4", 3)
78 |             yield Metadata(self, "metadata")
79 |             audio_size = (self["filesize"].value + 40) - (self["headersize"].value + 16)
80 |         if 0 < audio_size:
81 |             yield RawBytes(self, "audio_data", audio_size)
82 | 
83 |     def createDescription(self):
84 |         if (self["version"].value == 3):
85 |             return "RealAudio v3 file, '%s' codec" % self["FourCC"].value
86 |         elif (self["version"].value == 4):
87 |             return "RealAudio v4 file, '%s' codec, %s, %u channels" % (
88 |                 self["FourCC"].value, self["sample_rate"].display, self["channels"].value)
89 |         else:
90 |             return "Real audio"
91 | 


--------------------------------------------------------------------------------
/hachoir_metadata/program.py:
--------------------------------------------------------------------------------
  1 | from hachoir_metadata.metadata import RootMetadata, registerExtractor
  2 | from hachoir_parser.program import ExeFile
  3 | from hachoir_metadata.safe import fault_tolerant, getValue
  4 | 
  5 | class ExeMetadata(RootMetadata):
  6 |     KEY_TO_ATTR = {
  7 |         u"ProductName": "title",
  8 |         u"LegalCopyright": "copyright",
  9 |         u"LegalTrademarks": "copyright",
 10 |         u"LegalTrademarks1": "copyright",
 11 |         u"LegalTrademarks2": "copyright",
 12 |         u"CompanyName": "author",
 13 |         u"BuildDate": "creation_date",
 14 |         u"FileDescription": "title",
 15 |         u"ProductVersion": "version",
 16 |     }
 17 |     SKIP_KEY = set((u"InternalName", u"OriginalFilename", u"FileVersion", u"BuildVersion"))
 18 | 
 19 |     def extract(self, exe):
 20 |         if exe.isPE():
 21 |             self.extractPE(exe)
 22 |         elif exe.isNE():
 23 |             self.extractNE(exe)
 24 | 
 25 |     def extractNE(self, exe):
 26 |         if "ne_header" in exe:
 27 |             self.useNE_Header(exe["ne_header"])
 28 |         if "info" in exe:
 29 |             self.useNEInfo(exe["info"])
 30 | 
 31 |     @fault_tolerant
 32 |     def useNEInfo(self, info):
 33 |         for node in info.array("node"):
 34 |             if node["name"].value == "StringFileInfo":
 35 |                 self.readVersionInfo(node["node[0]"])
 36 | 
 37 |     def extractPE(self, exe):
 38 |         # Read information from headers
 39 |         if "pe_header" in exe:
 40 |             self.usePE_Header(exe["pe_header"])
 41 |         if "pe_opt_header" in exe:
 42 |             self.usePE_OptHeader(exe["pe_opt_header"])
 43 | 
 44 |         # Use PE resource
 45 |         resource = exe.getResource()
 46 |         if resource and "version_info/node[0]" in resource:
 47 |             for node in resource.array("version_info/node[0]/node"):
 48 |                 if getValue(node, "name") == "StringFileInfo" \
 49 |                 and "node[0]" in node:
 50 |                     self.readVersionInfo(node["node[0]"])
 51 | 
 52 |     @fault_tolerant
 53 |     def useNE_Header(self, hdr):
 54 |         if hdr["is_dll"].value:
 55 |             self.format_version = u"New-style executable: Dynamic-link library (DLL)"
 56 |         elif hdr["is_win_app"].value:
 57 |             self.format_version = u"New-style executable: Windows 3.x application"
 58 |         else:
 59 |             self.format_version = u"New-style executable for Windows 3.x"
 60 | 
 61 |     @fault_tolerant
 62 |     def usePE_Header(self, hdr):
 63 |         self.creation_date = hdr["creation_date"].value
 64 |         self.comment = "CPU: %s" % hdr["cpu"].display
 65 |         if hdr["is_dll"].value:
 66 |             self.format_version = u"Portable Executable: Dynamic-link library (DLL)"
 67 |         else:
 68 |             self.format_version = u"Portable Executable: Windows application"
 69 | 
 70 |     @fault_tolerant
 71 |     def usePE_OptHeader(self, hdr):
 72 |         self.comment = "Subsystem: %s" % hdr["subsystem"].display
 73 | 
 74 |     def readVersionInfo(self, info):
 75 |         values = {}
 76 |         for node in info.array("node"):
 77 |             if "value" not in node or "name" not in node:
 78 |                 continue
 79 |             value = node["value"].value.strip(" \0")
 80 |             if not value:
 81 |                 continue
 82 |             key = node["name"].value
 83 |             values[key] = value
 84 | 
 85 |         if "ProductName" in values and "FileDescription" in values:
 86 |             # Make sure that FileDescription is set before ProductName
 87 |             # as title value
 88 |             self.title = values["FileDescription"]
 89 |             self.title = values["ProductName"]
 90 |             del values["FileDescription"]
 91 |             del values["ProductName"]
 92 | 
 93 |         for key, value in values.iteritems():
 94 |             if key in self.KEY_TO_ATTR:
 95 |                 setattr(self, self.KEY_TO_ATTR[key], value)
 96 |             elif key not in self.SKIP_KEY:
 97 |                 self.comment = "%s=%s" % (key, value)
 98 | 
 99 | registerExtractor(ExeFile, ExeMetadata)
100 | 
101 | 


--------------------------------------------------------------------------------
/hachoir_parser/file_system/linux_swap.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Linux swap file.
  3 | 
  4 | Documentation: Linux kernel source code, files:
  5 |  - mm/swapfile.c
  6 |  - include/linux/swap.h
  7 | 
  8 | Author: Victor Stinner
  9 | Creation date: 25 december 2006 (christmas ;-))
 10 | """
 11 | 
 12 | from hachoir_parser import Parser
 13 | from hachoir_core.field import (ParserError, GenericVector,
 14 |     UInt32, String,
 15 |     Bytes, NullBytes, RawBytes)
 16 | from hachoir_core.endian import LITTLE_ENDIAN
 17 | from hachoir_core.tools import humanFilesize
 18 | from hachoir_core.bits import str2hex
 19 | 
 20 | PAGE_SIZE = 4096
 21 | 
 22 | # Definition of MAX_SWAP_BADPAGES in Linux kernel:
 23 | #  (__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)
 24 | MAX_SWAP_BADPAGES = ((PAGE_SIZE - 10) - 1536) // 4
 25 | 
 26 | class Page(RawBytes):
 27 |     static_size = PAGE_SIZE*8
 28 |     def __init__(self, parent, name):
 29 |         RawBytes.__init__(self, parent, name, PAGE_SIZE)
 30 | 
 31 | class UUID(Bytes):
 32 |     static_size = 16*8
 33 |     def __init__(self, parent, name):
 34 |         Bytes.__init__(self, parent, name, 16)
 35 |     def createDisplay(self):
 36 |         text = str2hex(self.value, format=r"%02x")
 37 |         return "%s-%s-%s-%s-%s" % (
 38 |             text[:8], text[8:12], text[12:16], text[16:20], text[20:])
 39 | 
 40 | class LinuxSwapFile(Parser):
 41 |     PARSER_TAGS = {
 42 |         "id": "linux_swap",
 43 |         "file_ext": ("",),
 44 |         "category": "file_system",
 45 |         "min_size": PAGE_SIZE*8,
 46 |         "description": "Linux swap file",
 47 |         "magic": (
 48 |             ("SWAP-SPACE", (PAGE_SIZE-10)*8),
 49 |             ("SWAPSPACE2", (PAGE_SIZE-10)*8),
 50 |             ("S1SUSPEND\0", (PAGE_SIZE-10)*8),
 51 |         ),
 52 |     }
 53 |     endian = LITTLE_ENDIAN
 54 | 
 55 |     def validate(self):
 56 |         magic = self.stream.readBytes((PAGE_SIZE-10)*8, 10)
 57 |         if magic not in ("SWAP-SPACE", "SWAPSPACE2", "S1SUSPEND\0"):
 58 |             return "Unknown magic string"
 59 |         if MAX_SWAP_BADPAGES < self["nb_badpage"].value:
 60 |             return "Invalid number of bad page (%u)" % self["nb_badpage"].value
 61 |         return True
 62 | 
 63 |     def getPageCount(self):
 64 |         """
 65 |         Number of pages which can really be used for swapping:
 66 |         number of page minus bad pages minus one page (used for the header)
 67 |         """
 68 |         # -1 because first page is used for the header
 69 |         return self["last_page"].value - self["nb_badpage"].value - 1
 70 | 
 71 |     def createDescription(self):
 72 |         if self["magic"].value == "S1SUSPEND\0":
 73 |             text = "Suspend swap file version 1"
 74 |         elif self["magic"].value == "SWAPSPACE2":
 75 |             text = "Linux swap file version 2"
 76 |         else:
 77 |             text = "Linux swap file version 1"
 78 |         nb_page = self.getPageCount()
 79 |         return "%s, page size: %s, %s pages" % (
 80 |             text, humanFilesize(PAGE_SIZE), nb_page)
 81 | 
 82 |     def createFields(self):
 83 |         # First kilobyte: boot sectors
 84 |         yield RawBytes(self, "boot", 1024, "Space for disklabel etc.")
 85 | 
 86 |         # Header
 87 |         yield UInt32(self, "version")
 88 |         yield UInt32(self, "last_page")
 89 |         yield UInt32(self, "nb_badpage")
 90 |         yield UUID(self, "sws_uuid")
 91 |         yield UUID(self, "sws_volume")
 92 |         yield NullBytes(self, "reserved", 117*4)
 93 | 
 94 |         # Read bad pages (if any)
 95 |         count = self["nb_badpage"].value
 96 |         if count:
 97 |             if MAX_SWAP_BADPAGES < count:
 98 |                 raise ParserError("Invalid number of bad page (%u)" % count)
 99 |             yield GenericVector(self, "badpages", count, UInt32, "badpage")
100 | 
101 |         # Read magic
102 |         padding = self.seekByte(PAGE_SIZE - 10, "padding", null=True)
103 |         if padding:
104 |             yield padding
105 |         yield String(self, "magic", 10, charset="ASCII")
106 | 
107 |         # Read all pages
108 |         yield GenericVector(self, "pages", self["last_page"].value, Page, "page")
109 | 
110 |         # Padding at the end
111 |         padding = self.seekBit(self.size, "end_padding", null=True)
112 |         if padding:
113 |             yield padding
114 | 
115 | 


--------------------------------------------------------------------------------
/unzip.py:
--------------------------------------------------------------------------------
  1 | """ unzip.py
  2 |     Version: 1.1
  3 | 
  4 |     Extract a zipfile to the directory provided
  5 |     It first creates the directory structure to house the files
  6 |     then it extracts the files to it.
  7 | 
  8 |     Sample usage:
  9 |     command line
 10 |     unzip.py -p 10 -z c:\testfile.zip -o c:\testoutput
 11 | 
 12 |     python class
 13 |     import unzip
 14 |     un = unzip.unzip()
 15 |     un.extract(r'c:\testfile.zip', 'c:\testoutput')
 16 |     
 17 | 
 18 |     By Doug Tolton
 19 | """
 20 | 
 21 | import sys
 22 | import zipfile
 23 | import os
 24 | import os.path
 25 | import getopt
 26 | 
 27 | class unzip:
 28 |     def __init__(self, verbose = False, percent = 10):
 29 |         self.verbose = False
 30 |         self.percent = percent
 31 |         
 32 |     def extract(self, file, dir):
 33 |         if not dir.endswith(':') and not os.path.exists(dir):
 34 |             os.mkdir(dir)
 35 | 
 36 |         zf = zipfile.ZipFile(file)
 37 | 
 38 |         # create directory structure to house files
 39 |         self._createstructure(file, dir)
 40 | 
 41 |         num_files = len(zf.namelist())
 42 |         percent = self.percent
 43 |         divisions = 100 / percent
 44 |         perc = int(num_files / divisions)
 45 | 
 46 |         # extract files to directory structure
 47 |         for i, name in enumerate(zf.namelist()):
 48 | 
 49 |             if self.verbose == True:
 50 |                 print "Extracting %s" % name
 51 |             elif perc > 0 and (i % perc) == 0 and i > 0:
 52 |                 complete = int (i / perc) * percent
 53 |                 #print "%s%% complete" % complete
 54 | 
 55 |             if not name.endswith('/'):
 56 |                 outfile = open(os.path.join(dir, name), 'wb')
 57 |                 outfile.write(zf.read(name))
 58 |                 outfile.flush()
 59 |                 outfile.close()
 60 | 
 61 | 
 62 |     def _createstructure(self, file, dir):
 63 |         self._makedirs(self._listdirs(file), dir)
 64 | 
 65 | 
 66 |     def _makedirs(self, directories, basedir):
 67 |         """ Create any directories that don't currently exist """
 68 |         for dir in directories:
 69 |             curdir = os.path.join(basedir, dir)
 70 |             if not os.path.exists(curdir):
 71 |                 os.mkdir(curdir)
 72 |                 #print("dir-->"+str(curdir))
 73 | 
 74 |     def _listdirs(self, file):
 75 |         """ Grabs all the directories in the zip structure
 76 |         This is necessary to create the structure before trying
 77 |         to extract the file to it. """
 78 |         zf = zipfile.ZipFile(file)
 79 | 
 80 |         dirs = []
 81 |         #print str(zf.namelist())
 82 | 
 83 |         for name in zf.namelist():
 84 |             dirsname = name.split("/")
 85 |             ant=""
 86 |             for dirname in dirsname[:-1]:
 87 | 				dirs.append(ant+dirname)
 88 | 				#print "anadiendo:"+(ant+dirname)
 89 | 				ant=ant+dirname+"/"
 90 | 
 91 |         dirs.sort()
 92 |         return dirs
 93 | 
 94 | def usage():
 95 |     print """usage: unzip.py -z <zipfile> -o <targetdir>
 96 |     <zipfile> is the source zipfile to extract
 97 |     <targetdir> is the target destination
 98 | 
 99 |     -z zipfile to extract
100 |     -o target location
101 |     -p sets the percentage notification
102 |     -v sets the extraction to verbose (overrides -p)
103 | 
104 |     long options also work:
105 |     --verbose
106 |     --percent=10
107 |     --zipfile=<zipfile>
108 |     --outdir=<targetdir>"""
109 |     
110 | 
111 | def main():
112 |     shortargs = 'vhp:z:o:'
113 |     longargs = ['verbose', 'help', 'percent=', 'zipfile=', 'outdir=']
114 | 
115 |     unzipper = unzip()
116 | 
117 |     try:
118 |         opts, args = getopt.getopt(sys.argv[1:], shortargs, longargs)
119 |     except getopt.GetoptError:
120 |         usage()
121 |         sys.exit(2)
122 | 
123 |     zipsource = ""
124 |     zipdest = ""
125 | 
126 |     for o, a in opts:
127 |         if o in ("-v", "--verbose"):
128 |             unzipper.verbose = True
129 |         if o in ("-p", "--percent"):
130 |             if not unzipper.verbose == True:
131 |                 unzipper.percent = int(a)
132 |         if o in ("-z", "--zipfile"):
133 |             zipsource = a
134 |         if o in ("-o", "--outdir"):
135 |             zipdest = a
136 |         if o in ("-h", "--help"):
137 |             usage()
138 |             sys.exit()
139 | 
140 |     if zipsource == "" or zipdest == "":
141 |         usage()
142 |         sys.exit()
143 |             
144 |     unzipper.extract(zipsource, zipdest)
145 | 
146 | if __name__ == '__main__': main()


--------------------------------------------------------------------------------
/hachoir_parser/common/win32_lang_id.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Windows 2000 - List of Locale IDs and Language Groups
  3 | 
  4 | Original data table:
  5 | http://www.microsoft.com/globaldev/reference/win2k/setup/lcid.mspx
  6 | """
  7 | 
  8 | LANGUAGE_ID = {
  9 |     0x0436: u"Afrikaans",
 10 |     0x041c: u"Albanian",
 11 |     0x0401: u"Arabic Saudi Arabia",
 12 |     0x0801: u"Arabic Iraq",
 13 |     0x0c01: u"Arabic Egypt",
 14 |     0x1001: u"Arabic Libya",
 15 |     0x1401: u"Arabic Algeria",
 16 |     0x1801: u"Arabic Morocco",
 17 |     0x1c01: u"Arabic Tunisia",
 18 |     0x2001: u"Arabic Oman",
 19 |     0x2401: u"Arabic Yemen",
 20 |     0x2801: u"Arabic Syria",
 21 |     0x2c01: u"Arabic Jordan",
 22 |     0x3001: u"Arabic Lebanon",
 23 |     0x3401: u"Arabic Kuwait",
 24 |     0x3801: u"Arabic UAE",
 25 |     0x3c01: u"Arabic Bahrain",
 26 |     0x4001: u"Arabic Qatar",
 27 |     0x042b: u"Armenian",
 28 |     0x042c: u"Azeri Latin",
 29 |     0x082c: u"Azeri Cyrillic",
 30 |     0x042d: u"Basque",
 31 |     0x0423: u"Belarusian",
 32 |     0x0402: u"Bulgarian",
 33 |     0x0403: u"Catalan",
 34 |     0x0404: u"Chinese Taiwan",
 35 |     0x0804: u"Chinese PRC",
 36 |     0x0c04: u"Chinese Hong Kong",
 37 |     0x1004: u"Chinese Singapore",
 38 |     0x1404: u"Chinese Macau",
 39 |     0x041a: u"Croatian",
 40 |     0x0405: u"Czech",
 41 |     0x0406: u"Danish",
 42 |     0x0413: u"Dutch Standard",
 43 |     0x0813: u"Dutch Belgian",
 44 |     0x0409: u"English United States",
 45 |     0x0809: u"English United Kingdom",
 46 |     0x0c09: u"English Australian",
 47 |     0x1009: u"English Canadian",
 48 |     0x1409: u"English New Zealand",
 49 |     0x1809: u"English Irish",
 50 |     0x1c09: u"English South Africa",
 51 |     0x2009: u"English Jamaica",
 52 |     0x2409: u"English Caribbean",
 53 |     0x2809: u"English Belize",
 54 |     0x2c09: u"English Trinidad",
 55 |     0x3009: u"English Zimbabwe",
 56 |     0x3409: u"English Philippines",
 57 |     0x0425: u"Estonian",
 58 |     0x0438: u"Faeroese",
 59 |     0x0429: u"Farsi",
 60 |     0x040b: u"Finnish",
 61 |     0x040c: u"French Standard",
 62 |     0x080c: u"French Belgian",
 63 |     0x0c0c: u"French Canadian",
 64 |     0x100c: u"French Swiss",
 65 |     0x140c: u"French Luxembourg",
 66 |     0x180c: u"French Monaco",
 67 |     0x0437: u"Georgian",
 68 |     0x0407: u"German Standard",
 69 |     0x0807: u"German Swiss",
 70 |     0x0c07: u"German Austrian",
 71 |     0x1007: u"German Luxembourg",
 72 |     0x1407: u"German Liechtenstein",
 73 |     0x0408: u"Greek",
 74 |     0x040d: u"Hebrew",
 75 |     0x0439: u"Hindi",
 76 |     0x040e: u"Hungarian",
 77 |     0x040f: u"Icelandic",
 78 |     0x0421: u"Indonesian",
 79 |     0x0410: u"Italian Standard",
 80 |     0x0810: u"Italian Swiss",
 81 |     0x0411: u"Japanese",
 82 |     0x043f: u"Kazakh",
 83 |     0x0457: u"Konkani",
 84 |     0x0412: u"Korean",
 85 |     0x0426: u"Latvian",
 86 |     0x0427: u"Lithuanian",
 87 |     0x042f: u"Macedonian",
 88 |     0x043e: u"Malay Malaysia",
 89 |     0x083e: u"Malay Brunei Darussalam",
 90 |     0x044e: u"Marathi",
 91 |     0x0414: u"Norwegian Bokmal",
 92 |     0x0814: u"Norwegian Nynorsk",
 93 |     0x0415: u"Polish",
 94 |     0x0416: u"Portuguese Brazilian",
 95 |     0x0816: u"Portuguese Standard",
 96 |     0x0418: u"Romanian",
 97 |     0x0419: u"Russian",
 98 |     0x044f: u"Sanskrit",
 99 |     0x081a: u"Serbian Latin",
100 |     0x0c1a: u"Serbian Cyrillic",
101 |     0x041b: u"Slovak",
102 |     0x0424: u"Slovenian",
103 |     0x040a: u"Spanish Traditional Sort",
104 |     0x080a: u"Spanish Mexican",
105 |     0x0c0a: u"Spanish Modern Sort",
106 |     0x100a: u"Spanish Guatemala",
107 |     0x140a: u"Spanish Costa Rica",
108 |     0x180a: u"Spanish Panama",
109 |     0x1c0a: u"Spanish Dominican Republic",
110 |     0x200a: u"Spanish Venezuela",
111 |     0x240a: u"Spanish Colombia",
112 |     0x280a: u"Spanish Peru",
113 |     0x2c0a: u"Spanish Argentina",
114 |     0x300a: u"Spanish Ecuador",
115 |     0x340a: u"Spanish Chile",
116 |     0x380a: u"Spanish Uruguay",
117 |     0x3c0a: u"Spanish Paraguay",
118 |     0x400a: u"Spanish Bolivia",
119 |     0x440a: u"Spanish El Salvador",
120 |     0x480a: u"Spanish Honduras",
121 |     0x4c0a: u"Spanish Nicaragua",
122 |     0x500a: u"Spanish Puerto Rico",
123 |     0x0441: u"Swahili",
124 |     0x041d: u"Swedish",
125 |     0x081d: u"Swedish Finland",
126 |     0x0449: u"Tamil",
127 |     0x0444: u"Tatar",
128 |     0x041e: u"Thai",
129 |     0x041f: u"Turkish",
130 |     0x0422: u"Ukrainian",
131 |     0x0420: u"Urdu",
132 |     0x0443: u"Uzbek Latin",
133 |     0x0843: u"Uzbek Cyrillic",
134 |     0x042a: u"Vietnamese",
135 | }
136 | 
137 | 


--------------------------------------------------------------------------------
/hachoir_parser/audio/aiff.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Audio Interchange File Format (AIFF) parser.
  3 | 
  4 | Author: Victor Stinner
  5 | Creation: 27 december 2006
  6 | """
  7 | 
  8 | from hachoir_parser import Parser
  9 | from hachoir_core.field import (FieldSet,
 10 |     UInt16, UInt32, Float80, TimestampMac32,
 11 |     RawBytes, NullBytes,
 12 |     String, Enum, PascalString32)
 13 | from hachoir_core.endian import BIG_ENDIAN
 14 | from hachoir_core.text_handler import filesizeHandler
 15 | from hachoir_core.tools import alignValue
 16 | from hachoir_parser.audio.id3 import ID3v2
 17 | 
 18 | CODEC_NAME = {
 19 |     'ACE2': u"ACE 2-to-1",
 20 |     'ACE8': u"ACE 8-to-3",
 21 |     'MAC3': u"MAC 3-to-1",
 22 |     'MAC6': u"MAC 6-to-1",
 23 |     'NONE': u"None",
 24 |     'sowt': u"Little-endian, no compression",
 25 | }
 26 | 
 27 | class Comment(FieldSet):
 28 |     def createFields(self):
 29 |         yield TimestampMac32(self, "timestamp")
 30 |         yield PascalString32(self, "text")
 31 | 
 32 | def parseText(self):
 33 |     yield String(self, "text", self["size"].value)
 34 | 
 35 | def parseID3(self):
 36 |     yield ID3v2(self, "id3v2", size=self["size"].value*8)
 37 | 
 38 | def parseComment(self):
 39 |     yield UInt16(self, "nb_comment")
 40 |     for index in xrange(self["nb_comment"].value):
 41 |         yield Comment(self, "comment[]")
 42 | 
 43 | def parseCommon(self):
 44 |     yield UInt16(self, "nb_channel")
 45 |     yield UInt32(self, "nb_sample")
 46 |     yield UInt16(self, "sample_size")
 47 |     yield Float80(self, "sample_rate")
 48 |     yield Enum(String(self, "codec", 4, strip="\0", charset="ASCII"), CODEC_NAME)
 49 | 
 50 | def parseVersion(self):
 51 |     yield TimestampMac32(self, "timestamp")
 52 | 
 53 | def parseSound(self):
 54 |     yield UInt32(self, "offset")
 55 |     yield UInt32(self, "block_size")
 56 |     size = (self.size - self.current_size) // 8
 57 |     if size:
 58 |         yield RawBytes(self, "data", size)
 59 | 
 60 | class Chunk(FieldSet):
 61 |     TAG_INFO = {
 62 |         'COMM': ('common', "Common chunk", parseCommon),
 63 |         'COMT': ('comment', "Comment", parseComment),
 64 |         'NAME': ('name', "Name", parseText),
 65 |         'AUTH': ('author', "Author", parseText),
 66 |         'FVER': ('version', "Version", parseVersion),
 67 |         'SSND': ('sound', "Sound data", parseSound),
 68 |         'ID3 ': ('id3', "ID3", parseID3),
 69 |     }
 70 | 
 71 |     def __init__(self, *args):
 72 |         FieldSet.__init__(self, *args)
 73 |         self._size = (8 + alignValue(self["size"].value, 2)) * 8
 74 |         tag = self["type"].value
 75 |         if tag in self.TAG_INFO:
 76 |             self._name, self._description, self._parser = self.TAG_INFO[tag]
 77 |         else:
 78 |             self._parser = None
 79 | 
 80 |     def createFields(self):
 81 |         yield String(self, "type", 4, "Signature (FORM)", charset="ASCII")
 82 |         yield filesizeHandler(UInt32(self, "size"))
 83 |         size = self["size"].value
 84 |         if size:
 85 |             if self._parser:
 86 |                 for field in self._parser(self):
 87 |                     yield field
 88 |                 if size % 2:
 89 |                     yield NullBytes(self, "padding", 1)
 90 |             else:
 91 |                 yield RawBytes(self, "data", size)
 92 | 
 93 | class AiffFile(Parser):
 94 |     PARSER_TAGS = {
 95 |         "id": "aiff",
 96 |         "category": "audio",
 97 |         "file_ext": ("aif", "aiff", "aifc"),
 98 |         "mime": (u"audio/x-aiff",),
 99 |         "magic_regex": (("FORM.{4}AIF[CF]", 0),),
100 |         "min_size": 12*8,
101 |         "description": "Audio Interchange File Format (AIFF)"
102 |     }
103 |     endian = BIG_ENDIAN
104 | 
105 |     def validate(self):
106 |         if self.stream.readBytes(0, 4) != "FORM":
107 |             return "Invalid signature"
108 |         if self.stream.readBytes(8*8, 4) not in ("AIFF", "AIFC"):
109 |             return "Invalid type"
110 |         return True
111 | 
112 |     def createFields(self):
113 |         yield String(self, "signature", 4, "Signature (FORM)", charset="ASCII")
114 |         yield filesizeHandler(UInt32(self, "filesize"))
115 |         yield String(self, "type", 4, "Form type (AIFF or AIFC)", charset="ASCII")
116 |         while not self.eof:
117 |             yield Chunk(self, "chunk[]")
118 | 
119 |     def createDescription(self):
120 |         if self["type"].value == "AIFC":
121 |             return "Audio Interchange File Format Compressed (AIFC)"
122 |         else:
123 |             return "Audio Interchange File Format (AIFF)"
124 | 
125 |     def createContentSize(self):
126 |         return self["filesize"].value * 8
127 | 
128 | 


--------------------------------------------------------------------------------
/htmlExport.py:
--------------------------------------------------------------------------------
  1 | from lib import markup
  2 | from lib import graphs
  3 | 
  4 | class htmlExport():
  5 |     def __init__(self,users,softs,paths,allinfo,fname,dirs,failed,domain,emails):
  6 |         self.users=users
  7 |         self.softs=softs
  8 |         self.paths=paths
  9 |         self.allinfo=allinfo
 10 |         self.fname=fname
 11 |         self.dir=dirs
 12 |         self.failed=failed
 13 |         self.style=""
 14 |         self.domain=domain
 15 |         self.emails=emails
 16 | 
 17 |     def styler(self):
 18 |         a="""<style type='text/css'>body {
 19 |      background: #e1e5e4  top no-repeat;
 20 |  }
 21 | 
 22 | h1 { font-family: times, Times New Roman, times-roman, georgia, serif;
 23 |     color: #680000;
 24 |     margin: 0;
 25 |     padding: 0px 0px 6px 0px;
 26 |     font-size: 51px;
 27 |     line-height: 44px;
 28 |     letter-spacing: -2px;
 29 |     font-weight: bold;
 30 | }
 31 | 
 32 | h3 { font-family: times, Times New Roman, times-roman, georgia, serif;
 33 |     color: #444;
 34 |     margin: 0;
 35 |     padding: 0px 0px 6px 0px;
 36 |     font-size: 30px;
 37 |     line-height: 44px;
 38 |     letter-spacing: -2px;
 39 |     font-weight: bold;
 40 | }
 41 | 
 42 | li { font-family: times, Times New Roman, times-roman, georgia, serif;
 43 |     color: #444;
 44 |     margin: 0;
 45 |     padding: 0px 0px 6px 0px;
 46 |     font-size: 15px;
 47 |     line-height: 15px;
 48 |     letter-spacing: 0.4px;
 49 | 
 50 | }
 51 | 
 52 | h2{
 53 | font-family: times, Times New Roman, times-roman, georgia, serif;
 54 |         font-size: 48px;
 55 |         line-height: 40px;
 56 |         letter-spacing: -1px;
 57 |         color: #680000 ;
 58 |         margin: 0 0 0 0;
 59 |         padding: 0 0 0 0;
 60 |         font-weight: 100;
 61 | 
 62 | }
 63 | 
 64 | pre {
 65 | overflow: auto;
 66 | padding-left: 15px;
 67 | padding-right: 15px;
 68 | font-size: 11px;
 69 | line-height: 15px;
 70 | margin-top: 10px;
 71 | width: 93%;
 72 | display: block;
 73 | background-color: #eeeeee;
 74 | color: #000000;
 75 | max-height: 300px;
 76 | }
 77 | </style>
 78 |         """
 79 |         self.style=a
 80 | 
 81 |     def writehtml(self):
 82 |         page = markup.page()
 83 |         page.title("Metagoofil results")
 84 |         page.html()
 85 |         self.styler()
 86 |         page.head(self.style)
 87 |         page.head.close()
 88 |         page.body()
 89 |         page.h2("Metagoofil results")
 90 |         page.h3("Results for: " + self.domain)
 91 |         graph = graphs.BarGraph('vBar')
 92 | 	try:
 93 |         	graph.values = [len(self.users),len(self.softs),len(self.emails),len(self.paths)]
 94 | 		graph.labels = ["Usernames","Software","Emails","Paths/Servers"]
 95 | 		graph.showValues = 1
 96 |         	page.body(graph.create())
 97 | 	except:
 98 | 		print "graph"
 99 | 	try:
100 | 		page.h3("User names found:")
101 | 		page.ul( class_="userslist")
102 | 		page.li( self.users, class_="useritem")
103 | 		page.ul.close( )
104 | 		page.h3("Software versions found:")
105 | 	except:
106 | 		print "user"
107 |         try:
108 | 		page.ul( class_="softlist")
109 | 		page.li(self.softs, class_="softitem")
110 | 		page.ul.close( )
111 | 	except:
112 | 		print "email"
113 | 	page.h3("E-mails found:")
114 |         if self.emails!=[]:
115 |             page.ul( class_="emailslist")
116 |             page.li(self.emails, class_="emailitem")
117 |             page.ul.close( )
118 |         else:
119 |             page.p("0 results")
120 |         page.h3("Servers and paths found:")
121 |         if self.paths!=[]:
122 |             page.ul( class_="pathslist")
123 |             page.li(self.paths, class_="pathitem")
124 |             page.ul.close( )
125 |         else:
126 |             page.p("0 results")
127 |         page.h3("Files analyzed:")
128 |         page.ul( class_="files")
129 |         for x in self.allinfo:
130 |             page.li(x[0], class_="file")
131 |         page.ul.close()
132 |         page.h2("Files and metadata found:")
133 |         for x in self.allinfo:
134 |             page.h3(x[0])
135 |             page.a("Local copy", class_="link", href=self.dir+"/"+x[0])
136 |             page.pre(x[1])
137 |             page.pre(x[2])
138 |             page.pre(x[3])
139 |             page.pre(x[5])
140 |             page.pre.close()
141 |         page.h2("Failed extractions and reasons")
142 |         for x in self.failed:
143 |             page.pre(x)
144 |         page.body.close()
145 |         page.html.close()
146 |         file = open(self.fname,'w')
147 |         for x in page.content:
148 |             try:
149 |                 file.write(x)
150 |             except:
151 |                 #print "Exception" +  x # send to logs
152 |                 pass
153 |         file.close
154 |         return "ok"
155 | 


--------------------------------------------------------------------------------
/hachoir_parser/image/iptc.py:
--------------------------------------------------------------------------------
  1 | """
  2 | IPTC metadata parser (can be found in a JPEG picture for example)
  3 | 
  4 | Sources:
  5 | - Image-MetaData Perl module:
  6 |   http://www.annocpan.org/~BETTELLI/Image-MetaData-JPEG-0.15/...
  7 |   ...lib/Image/MetaData/JPEG/TagLists.pod
  8 | - IPTC tag name and description:
  9 |   http://peccatte.karefil.com/software/IPTCTableau.pdf
 10 | 
 11 | Author: Victor Stinner
 12 | """
 13 | 
 14 | from hachoir_core.field import (FieldSet, ParserError,
 15 |     UInt8, UInt16, String, RawBytes, NullBytes)
 16 | from hachoir_core.text_handler import textHandler, hexadecimal
 17 | 
 18 | def IPTC_String(parent, name, desc=None):
 19 |     # Charset may be utf-8, ISO-8859-1, or ...
 20 |     return String(parent, name, parent["size"].value, desc,
 21 |         strip=" ")
 22 | 
 23 | dataset1 = {
 24 | }
 25 | dataset2 = {
 26 |       0: ("record_version", "Record version (2 for JPEG)", UInt16),
 27 |       5: ("obj_name", "Object name", None),
 28 |       7: ("edit_stat", "Edit status", None),
 29 |      10: ("urgency", "Urgency", UInt8),
 30 |      15: ("category[]", "Category", None),
 31 |      22: ("fixture", "Fixture identifier", IPTC_String),
 32 |      25: ("keyword[]", "Keywords", IPTC_String),
 33 |      30: ("release_date", "Release date", IPTC_String),
 34 |      35: ("release_time", "Release time", IPTC_String),
 35 |      40: ("instruction", "Special instructions", IPTC_String),
 36 |      55: ("date_created", "Date created", IPTC_String),
 37 |      60: ("time_created", "Time created (ISO 8601)", IPTC_String),
 38 |      65: ("originating_prog", "Originating program", IPTC_String),
 39 |      70: ("prog_ver", "Program version", IPTC_String),
 40 |      80: ("author", "By-line (Author)", IPTC_String),
 41 |      85: ("author_job", "By-line (Author precision)", IPTC_String),
 42 |      90: ("city", "City", IPTC_String),
 43 |      95: ("state", "Province / State", IPTC_String),
 44 |     100: ("country_code", "Country / Primary location code", IPTC_String),
 45 |     101: ("country_name", "Country / Primary location name", IPTC_String),
 46 |     103: ("trans_ref", "Original transmission reference", IPTC_String),
 47 |     105: ("headline", "Headline", IPTC_String),
 48 |     110: ("credit", "Credit", IPTC_String),
 49 |     115: ("source", "Source", IPTC_String),
 50 |     116: ("copyright", "Copyright notice", IPTC_String),
 51 |     120: ("caption", "Caption/Abstract", IPTC_String),
 52 |     122: ("writer", "Writer/editor", IPTC_String),
 53 |     231: ("history[]", "Document history (timestamp)", IPTC_String)
 54 | }
 55 | datasets = {1: dataset1, 2: dataset2}
 56 | 
 57 | class IPTC_Size(FieldSet):
 58 |     def __init__(self, *args, **kw):
 59 |         FieldSet.__init__(self, *args, **kw)
 60 |         value = 0
 61 |         for field in self:
 62 |             value <<= 15
 63 |             value  += (field.value & 0x7fff)
 64 |         self.createValue = lambda: value
 65 | 
 66 |     def createFields(self):
 67 |         while True:
 68 |             field = UInt16(self, "value[]")
 69 |             yield field
 70 |             if field.value < 0x8000:
 71 |                 break
 72 | 
 73 | class IPTC_Chunk(FieldSet):
 74 |     def __init__(self, *args, **kw):
 75 |         FieldSet.__init__(self, *args, **kw)
 76 |         number = self["dataset_nb"].value
 77 |         self.dataset_info = None
 78 |         if number in datasets:
 79 |             tag = self["tag"].value
 80 |             if tag in datasets[number]:
 81 |                 self.dataset_info = datasets[number][tag]
 82 |                 self._name = self.dataset_info[0]
 83 |                 self._description = self.dataset_info[1]
 84 |         size_chunk = self["size"]
 85 |         self._size = 3*8 + size_chunk.size + size_chunk.value*8
 86 | 
 87 |     def createFields(self):
 88 |         yield textHandler(UInt8(self, "signature", "IPTC signature (0x1c)"), hexadecimal)
 89 |         if self["signature"].value != 0x1C:
 90 |             raise ParserError("Wrong IPTC signature")
 91 |         yield textHandler(UInt8(self, "dataset_nb", "Dataset number"), hexadecimal)
 92 |         yield UInt8(self, "tag", "Tag")
 93 |         yield IPTC_Size(self, "size", "Content size")
 94 | 
 95 |         size = self["size"].value
 96 |         if 0 < size:
 97 |             if self.dataset_info:
 98 |                 cls = self.dataset_info[2]
 99 |             else:
100 |                 cls = None
101 |             if cls:
102 |                 yield cls(self, "content")
103 |             else:
104 |                 yield RawBytes(self, "content", size)
105 | 
106 | class IPTC(FieldSet):
107 |     def createFields(self):
108 |         while 5 <= (self._size - self.current_size)/8:
109 |             yield IPTC_Chunk(self, "chunk[]")
110 |         size = (self._size - self.current_size) / 8
111 |         if 0 < size:
112 |             yield NullBytes(self, "padding", size)
113 | 
114 | 


--------------------------------------------------------------------------------
/hachoir_core/log.py:
--------------------------------------------------------------------------------
  1 | import os, sys, time
  2 | import hachoir_core.config as config
  3 | from hachoir_core.i18n import _
  4 | 
  5 | class Log:
  6 |     LOG_INFO   = 0
  7 |     LOG_WARN   = 1
  8 |     LOG_ERROR  = 2
  9 | 
 10 |     level_name = {
 11 |         LOG_WARN: "[warn]",
 12 |         LOG_ERROR: "[err!]",
 13 |         LOG_INFO: "[info]"
 14 |     }
 15 | 
 16 |     def __init__(self):
 17 |         self.__buffer = {}
 18 |         self.__file = None
 19 |         self.use_print = True
 20 |         self.use_buffer = False
 21 |         self.on_new_message = None # Prototype: def func(level, prefix, text, context)
 22 | 
 23 |     def shutdown(self):
 24 |         if self.__file:
 25 |             self._writeIntoFile(_("Stop Hachoir"))
 26 | 
 27 |     def setFilename(self, filename, append=True):
 28 |         """
 29 |         Use a file to store all messages. The
 30 |         UTF-8 encoding will be used. Write an informative
 31 |         message if the file can't be created.
 32 | 
 33 |         @param filename: C{L{string}}
 34 |         """
 35 | 
 36 |         # Look if file already exists or not
 37 |         filename = os.path.expanduser(filename)
 38 |         filename = os.path.realpath(filename)
 39 |         append = os.access(filename, os.F_OK)
 40 | 
 41 |         # Create log file (or open it in append mode, if it already exists)
 42 |         try:
 43 |             import codecs
 44 |             if append:
 45 |                 self.__file = codecs.open(filename, "a", "utf-8")
 46 |             else:
 47 |                 self.__file = codecs.open(filename, "w", "utf-8")
 48 |             self._writeIntoFile(_("Starting Hachoir"))
 49 |         except IOError, err:
 50 |             if err.errno == 2:
 51 |                 self.__file = None
 52 |                 self.info(_("[Log] setFilename(%s) fails: no such file") % filename)
 53 |             else:
 54 |                 raise
 55 | 
 56 |     def _writeIntoFile(self, message):
 57 |         timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
 58 |         self.__file.write(u"%s - %s\n" % (timestamp, message))
 59 |         self.__file.flush()
 60 | 
 61 |     def newMessage(self, level, text, ctxt=None):
 62 |         """
 63 |         Write a new message : append it in the buffer,
 64 |         display it to the screen (if needed), and write
 65 |         it in the log file (if needed).
 66 | 
 67 |         @param level: Message level.
 68 |         @type level: C{int}
 69 |         @param text: Message content.
 70 |         @type text: C{str}
 71 |         @param ctxt: The caller instance.
 72 |         """
 73 | 
 74 |         if level < self.LOG_ERROR and config.quiet or \
 75 |            level <= self.LOG_INFO and not config.verbose:
 76 |             return
 77 |         if config.debug:
 78 |             from hachoir_core.error import getBacktrace
 79 |             backtrace = getBacktrace(None)
 80 |             if backtrace:
 81 |                 text += "\n\n" + backtrace
 82 | 
 83 |         _text = text
 84 |         if hasattr(ctxt, "_logger"):
 85 |             _ctxt = ctxt._logger()
 86 |             if _ctxt is not None:
 87 |                 text = "[%s] %s" % (_ctxt, text)
 88 | 
 89 |         # Add message to log buffer
 90 |         if self.use_buffer:
 91 |             if not self.__buffer.has_key(level):
 92 |                 self.__buffer[level] = [text]
 93 |             else:
 94 |                 self.__buffer[level].append(text)
 95 | 
 96 |         # Add prefix
 97 |         prefix = self.level_name.get(level, "[info]")
 98 | 
 99 |         # Display on stdout (if used)
100 |         if self.use_print:
101 |             sys.stdout.flush()
102 |             sys.stderr.write("%s %s\n" % (prefix, text))
103 |             sys.stderr.flush()
104 | 
105 |         # Write into outfile (if used)
106 |         if self.__file:
107 |             self._writeIntoFile("%s %s" % (prefix, text))
108 | 
109 |         # Use callback (if used)
110 |         if self.on_new_message:
111 |             self.on_new_message (level, prefix, _text, ctxt)
112 | 
113 |     def info(self, text):
114 |         """
115 |         New informative message.
116 |         @type text: C{str}
117 |         """
118 |         self.newMessage(Log.LOG_INFO, text)
119 | 
120 |     def warning(self, text):
121 |         """
122 |         New warning message.
123 |         @type text: C{str}
124 |         """
125 |         self.newMessage(Log.LOG_WARN, text)
126 | 
127 |     def error(self, text):
128 |         """
129 |         New error message.
130 |         @type text: C{str}
131 |         """
132 |         self.newMessage(Log.LOG_ERROR, text)
133 | 
134 | log = Log()
135 | 
136 | class Logger(object):
137 |     def _logger(self):
138 |         return "<%s>" % self.__class__.__name__
139 |     def info(self, text):
140 |         log.newMessage(Log.LOG_INFO, text, self)
141 |     def warning(self, text):
142 |         log.newMessage(Log.LOG_WARN, text, self)
143 |     def error(self, text):
144 |         log.newMessage(Log.LOG_ERROR, text, self)
145 | 


--------------------------------------------------------------------------------
/hachoir_parser/archive/tar.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Tar archive parser.
  3 | 
  4 | Author: Victor Stinner
  5 | """
  6 | 
  7 | from hachoir_parser import Parser
  8 | from hachoir_core.field import (FieldSet,
  9 |     Enum, UInt8, SubFile, String, NullBytes)
 10 | from hachoir_core.tools import humanFilesize, paddingSize, timestampUNIX
 11 | from hachoir_core.endian import BIG_ENDIAN
 12 | import re
 13 | 
 14 | class FileEntry(FieldSet):
 15 |     type_name = {
 16 |         # 48 is "0", 49 is "1", ...
 17 |          0: u"Normal disk file (old format)",
 18 |         48: u"Normal disk file",
 19 |         49: u"Link to previously dumped file",
 20 |         50: u"Symbolic link",
 21 |         51: u"Character special file",
 22 |         52: u"Block special file",
 23 |         53: u"Directory",
 24 |         54: u"FIFO special file",
 25 |         55: u"Contiguous file"
 26 |     }
 27 | 
 28 |     def getOctal(self, name):
 29 |         return self.octal2int(self[name].value)
 30 | 
 31 |     def getDatetime(self):
 32 |         """
 33 |         Create modification date as Unicode string, may raise ValueError.
 34 |         """
 35 |         timestamp = self.getOctal("mtime")
 36 |         return timestampUNIX(timestamp)
 37 | 
 38 |     def createFields(self):
 39 |         yield String(self, "name", 100, "Name", strip="\0", charset="ISO-8859-1")
 40 |         yield String(self, "mode", 8, "Mode", strip=" \0", charset="ASCII")
 41 |         yield String(self, "uid", 8, "User ID", strip=" \0", charset="ASCII")
 42 |         yield String(self, "gid", 8, "Group ID", strip=" \0", charset="ASCII")
 43 |         yield String(self, "size", 12, "Size", strip=" \0", charset="ASCII")
 44 |         yield String(self, "mtime", 12, "Modification time", strip=" \0", charset="ASCII")
 45 |         yield String(self, "check_sum", 8, "Check sum", strip=" \0", charset="ASCII")
 46 |         yield Enum(UInt8(self, "type", "Type"), self.type_name)
 47 |         yield String(self, "lname", 100, "Link name", strip=" \0", charset="ISO-8859-1")
 48 |         yield String(self, "magic", 8, "Magic", strip=" \0", charset="ASCII")
 49 |         yield String(self, "uname", 32, "User name", strip=" \0", charset="ISO-8859-1")
 50 |         yield String(self, "gname", 32, "Group name", strip=" \0", charset="ISO-8859-1")
 51 |         yield String(self, "devmajor", 8, "Dev major", strip=" \0", charset="ASCII")
 52 |         yield String(self, "devminor", 8, "Dev minor", strip=" \0", charset="ASCII")
 53 |         yield NullBytes(self, "padding", 167, "Padding (zero)")
 54 | 
 55 |         filesize = self.getOctal("size")
 56 |         if filesize:
 57 |             yield SubFile(self, "content", filesize, filename=self["name"].value)
 58 | 
 59 |         size = paddingSize(self.current_size//8, 512)
 60 |         if size:
 61 |             yield NullBytes(self, "padding_end", size, "Padding (512 align)")
 62 | 
 63 |     def convertOctal(self, chunk):
 64 |         return self.octal2int(chunk.value)
 65 | 
 66 |     def isEmpty(self):
 67 |         return self["name"].value == ""
 68 | 
 69 |     def octal2int(self, text):
 70 |         try:
 71 |             return int(text, 8)
 72 |         except ValueError:
 73 |             return 0
 74 | 
 75 |     def createDescription(self):
 76 |         if self.isEmpty():
 77 |             desc = "(terminator, empty header)"
 78 |         else:
 79 |             filename = self["name"].value
 80 |             filesize = humanFilesize(self.getOctal("size"))
 81 |             desc = "(%s: %s, %s)" % \
 82 |                 (filename, self["type"].display, filesize)
 83 |         return "Tar File " + desc
 84 | 
 85 | class TarFile(Parser):
 86 |     endian = BIG_ENDIAN
 87 |     PARSER_TAGS = {
 88 |         "id": "tar",
 89 |         "category": "archive",
 90 |         "file_ext": ("tar",),
 91 |         "mime": (u"application/x-tar", u"application/x-gtar"),
 92 |         "min_size": 512*8,
 93 |         "magic": (("ustar  \0", 257*8),),
 94 |         "subfile": "skip",
 95 |         "description": "TAR archive",
 96 |     }
 97 |     _sign = re.compile("ustar *\0|[ \0]*$")
 98 | 
 99 |     def validate(self):
100 |         if not self._sign.match(self.stream.readBytes(257*8, 8)):
101 |             return "Invalid magic number"
102 |         if self[0].name == "terminator":
103 |             return "Don't contain any file"
104 |         try:
105 |             int(self["file[0]/uid"].value, 8)
106 |             int(self["file[0]/gid"].value, 8)
107 |             int(self["file[0]/size"].value, 8)
108 |         except ValueError:
109 |             return "Invalid file size"
110 |         return True
111 | 
112 |     def createFields(self):
113 |         while not self.eof:
114 |             field = FileEntry(self, "file[]")
115 |             if field.isEmpty():
116 |                 yield NullBytes(self, "terminator", 512)
117 |                 break
118 |             yield field
119 |         if self.current_size < self._size:
120 |             yield self.seekBit(self._size, "end")
121 | 
122 |     def createContentSize(self):
123 |         return self["terminator"].address + self["terminator"].size
124 | 
125 | 


--------------------------------------------------------------------------------
/hachoir_parser/guess.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Parser list managment:
  3 | - createParser() find the best parser for a file.
  4 | """
  5 | 
  6 | import os
  7 | from hachoir_core.error import warning, info, HACHOIR_ERRORS
  8 | from hachoir_parser import ValidateError, HachoirParserList
  9 | from hachoir_core.stream import FileInputStream
 10 | from hachoir_core.i18n import _
 11 | import weakref
 12 | 
 13 | 
 14 | class QueryParser(object):
 15 |     fallback = None
 16 |     other = None
 17 | 
 18 |     def __init__(self, tags):
 19 |         self.validate = True
 20 |         self.use_fallback = False
 21 |         self.parser_args = None
 22 |         self.db = HachoirParserList.getInstance()
 23 |         self.parsers = set(self.db)
 24 |         parsers = []
 25 |         for tag in tags:
 26 |             if not self.parsers:
 27 |                 break
 28 |             parsers += self._getByTag(tag)
 29 |             if self.fallback is None:
 30 |                 self.fallback = len(parsers) == 1
 31 |         if self.parsers:
 32 |             other = len(parsers)
 33 |             parsers += list(self.parsers)
 34 |             self.other = parsers[other]
 35 |         self.parsers = parsers
 36 | 
 37 |     def __iter__(self):
 38 |         return iter(self.parsers)
 39 | 
 40 |     def translate(self, name, value):
 41 |         if name == "filename":
 42 |             filename = os.path.basename(value).split(".")
 43 |             if len(filename) <= 1:
 44 |                 value = ""
 45 |             else:
 46 |                 value = filename[-1].lower()
 47 |             name = "file_ext"
 48 |         return name, value
 49 | 
 50 |     def _getByTag(self, tag):
 51 |         if tag is None:
 52 |             self.parsers.clear()
 53 |             return []
 54 |         elif callable(tag):
 55 |             parsers = [ parser for parser in self.parsers if tag(parser) ]
 56 |             for parser in parsers:
 57 |                 self.parsers.remove(parser)
 58 |         elif tag[0] == "class":
 59 |             self.validate = False
 60 |             return [ tag[1] ]
 61 |         elif tag[0] == "args":
 62 |             self.parser_args = tag[1]
 63 |             return []
 64 |         else:
 65 |             tag = self.translate(*tag)
 66 |             parsers = []
 67 |             if tag is not None:
 68 |                 key = tag[0]
 69 |                 byname = self.db.bytag.get(key,{})
 70 |                 if tag[1] is None:
 71 |                     values = byname.itervalues()
 72 |                 else:
 73 |                     values = byname.get(tag[1],()),
 74 |                 if key == "id" and values:
 75 |                     self.validate = False
 76 |                 for value in values:
 77 |                     for parser in value:
 78 |                         if parser in self.parsers:
 79 |                             parsers.append(parser)
 80 |                             self.parsers.remove(parser)
 81 |         return parsers
 82 | 
 83 |     def parse(self, stream, fallback=True):
 84 |         if hasattr(stream, "_cached_parser"):
 85 |             parser = stream._cached_parser()
 86 |         else:
 87 |             parser = None
 88 |         if parser is not None:
 89 |             if parser.__class__ in self.parsers:
 90 |                 return parser
 91 |             if self.use_fallback and parser.__class__ == fb:
 92 |                 return parser
 93 |         parser = self.doparse(stream, fallback)
 94 |         stream._cached_parser = weakref.ref(parser)
 95 |         return parser
 96 | 
 97 |     def doparse(self, stream, fallback=True):
 98 |         fb = None
 99 |         warn = warning
100 |         for parser in self.parsers:
101 |             try:
102 |                 parser_obj = parser(stream, validate=self.validate)
103 |                 if self.parser_args:
104 |                     for key, value in self.parser_args.iteritems():
105 |                         setattr(parser_obj, key, value)
106 |                 return parser_obj
107 |             except ValidateError, err:
108 |                 res = unicode(err)
109 |                 if fallback and self.fallback:
110 |                     fb = parser
111 |             except HACHOIR_ERRORS, err:
112 |                 res = unicode(err)
113 |             if warn:
114 |                 if parser == self.other:
115 |                     warn = info
116 |                 warn(_("Skip parser '%s': %s") % (parser.__name__, res))
117 |             fallback = False
118 |         if self.use_fallback and fb:
119 |             warning(_("Force use of parser '%s'") % fb.__name__)
120 |             return fb(stream)
121 | 
122 | 
123 | def guessParser(stream):
124 |     return QueryParser(stream.tags).parse(stream)
125 | 
126 | 
127 | def createParser(filename, real_filename=None, tags=None):
128 |     """
129 |     Create a parser from a file or returns None on error.
130 | 
131 |     Options:
132 |     - filename (unicode): Input file name ;
133 |     - real_filename (str|unicode): Real file name.
134 |     """
135 |     if not tags:
136 |         tags = []
137 |     stream = FileInputStream(filename, real_filename, tags=tags)
138 |     return guessParser(stream)
139 | 


--------------------------------------------------------------------------------
/hachoir_core/field/padding.py:
--------------------------------------------------------------------------------
  1 | from hachoir_core.field import Bits, Bytes
  2 | from hachoir_core.tools import makePrintable, humanFilesize
  3 | from hachoir_core import config
  4 | 
  5 | class PaddingBits(Bits):
  6 |     """
  7 |     Padding bits used, for example, to align address (of next field).
  8 |     See also NullBits and PaddingBytes types.
  9 | 
 10 |     Arguments:
 11 |      * nbits: Size of the field in bits
 12 | 
 13 |     Optional arguments:
 14 |      * pattern (int): Content pattern, eg. 0 if all bits are set to 0
 15 |     """
 16 |     static_size = staticmethod(lambda *args, **kw: args[1])
 17 |     MAX_SIZE = 128
 18 | 
 19 |     def __init__(self, parent, name, nbits, description="Padding", pattern=None):
 20 |         Bits.__init__(self, parent, name, nbits, description)
 21 |         self.pattern = pattern
 22 |         self._display_pattern = self.checkPattern()
 23 | 
 24 |     def checkPattern(self):
 25 |         if not(config.check_padding_pattern):
 26 |             return False
 27 |         if self.pattern != 0:
 28 |             return False
 29 | 
 30 |         if self.MAX_SIZE < self._size:
 31 |             value = self._parent.stream.readBits(
 32 |                 self.absolute_address, self.MAX_SIZE, self._parent.endian)
 33 |         else:
 34 |             value = self.value
 35 |         if value != 0:
 36 |             self.warning("padding contents doesn't look normal (invalid pattern)")
 37 |             return False
 38 |         if self.MAX_SIZE < self._size:
 39 |             self.info("only check first %u bits" % self.MAX_SIZE)
 40 |         return True
 41 | 
 42 |     def createDisplay(self):
 43 |         if self._display_pattern:
 44 |             return u"<padding pattern=%s>" % self.pattern
 45 |         else:
 46 |             return Bits.createDisplay(self)
 47 | 
 48 | class PaddingBytes(Bytes):
 49 |     """
 50 |     Padding bytes used, for example, to align address (of next field).
 51 |     See also NullBytes and PaddingBits types.
 52 | 
 53 |     Arguments:
 54 |      * nbytes: Size of the field in bytes
 55 | 
 56 |     Optional arguments:
 57 |      * pattern (str): Content pattern, eg. "\0" for nul bytes
 58 |     """
 59 | 
 60 |     static_size = staticmethod(lambda *args, **kw: args[1]*8)
 61 |     MAX_SIZE = 4096
 62 | 
 63 |     def __init__(self, parent, name, nbytes,
 64 |     description="Padding", pattern=None):
 65 |         """ pattern is None or repeated string """
 66 |         assert (pattern is None) or (isinstance(pattern, str))
 67 |         Bytes.__init__(self, parent, name, nbytes, description)
 68 |         self.pattern = pattern
 69 |         self._display_pattern = self.checkPattern()
 70 | 
 71 |     def checkPattern(self):
 72 |         if not(config.check_padding_pattern):
 73 |             return False
 74 |         if self.pattern is None:
 75 |             return False
 76 | 
 77 |         if self.MAX_SIZE < self._size/8:
 78 |             self.info("only check first %s of padding" % humanFilesize(self.MAX_SIZE))
 79 |             content = self._parent.stream.readBytes(
 80 |                 self.absolute_address, self.MAX_SIZE)
 81 |         else:
 82 |             content = self.value
 83 |         index = 0
 84 |         pattern_len = len(self.pattern)
 85 |         while index < len(content):
 86 |             if content[index:index+pattern_len] != self.pattern:
 87 |                 self.warning(
 88 |                     "padding contents doesn't look normal"
 89 |                     " (invalid pattern at byte %u)!"
 90 |                     % index)
 91 |                 return False
 92 |             index += pattern_len
 93 |         return True
 94 | 
 95 |     def createDisplay(self):
 96 |         if self._display_pattern:
 97 |             return u"<padding pattern=%s>" % makePrintable(self.pattern, "ASCII", quote="'")
 98 |         else:
 99 |             return Bytes.createDisplay(self)
100 | 
101 |     def createRawDisplay(self):
102 |         return Bytes.createDisplay(self)
103 | 
104 | class NullBits(PaddingBits):
105 |     """
106 |     Null padding bits used, for example, to align address (of next field).
107 |     See also PaddingBits and NullBytes types.
108 | 
109 |     Arguments:
110 |      * nbits: Size of the field in bits
111 |     """
112 | 
113 |     def __init__(self, parent, name, nbits, description=None):
114 |         PaddingBits.__init__(self, parent, name, nbits, description, pattern=0)
115 | 
116 |     def createDisplay(self):
117 |         if self._display_pattern:
118 |             return "<null>"
119 |         else:
120 |             return Bits.createDisplay(self)
121 | 
122 | class NullBytes(PaddingBytes):
123 |     """
124 |     Null padding bytes used, for example, to align address (of next field).
125 |     See also PaddingBytes and NullBits types.
126 | 
127 |     Arguments:
128 |      * nbytes: Size of the field in bytes
129 |     """
130 |     def __init__(self, parent, name, nbytes, description=None):
131 |         PaddingBytes.__init__(self, parent, name, nbytes, description, pattern="\0")
132 | 
133 |     def createDisplay(self):
134 |         if self._display_pattern:
135 |             return "<null>"
136 |         else:
137 |             return Bytes.createDisplay(self)
138 | 
139 | 


--------------------------------------------------------------------------------