├── .gitignore ├── README.md ├── kindledecrypt.py ├── mobidedrm.py ├── process.py ├── screenshot.png ├── setup-macosx.py ├── setup-win32.py └── topaz ├── __init__.py ├── cmbtc.py ├── convert2xml.py ├── decode_meta.py ├── flatxml2html.py ├── genhtml.py ├── gensvg.py ├── getpagedim.py └── stylexml2css.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Kindle Book Decrypter 2 | ===================== 3 | A simple GUI to remove the restrictions put in place to prevent copying books, 4 | listening to some books, or transfering books to other devices, etc. All it 5 | requires it the original book file and your Kindle serial number. 6 | 7 | * Backup your purchased Kindle books on your own media 8 | * Use other e-reading devices and/or software to read your purchased books 9 | * Re-enable audio playback of books that have disabled it 10 | * Allow a friend to borrow a book you are no longer reading (if this falls 11 | under fair use and is legal where you live) 12 | * Supports Mobipocket and Topaz book formats (azw, mobi, prc, azw1, tpz) 13 | 14 | ![Screenshot](screenshot.png) 15 | 16 | Pre-Built Binaries 17 | ------------------ 18 | The following pre-built binaries are available and kept up to date with the 19 | latest changes: 20 | 21 | * [Microsoft Windows](http://programmer-art.org/dropbox/kindledecrypt-1.1-win32.exe) 22 | * [Mac OS X](http://programmer-art.org/dropbox/kindledecrypt-1.1-macosx.zip) 23 | 24 | Dependencies 25 | ------------ 26 | The Kindle Book Decrypter depends on the following when not using the pre-built binaries: 27 | 28 | * Python 29 | * wxWidgets (and Python bindings) 30 | 31 | Usage 32 | ----- 33 | You can use the application by running it in a terminal or double clicking it: 34 | 35 | cd kindledecrypt 36 | ./kindledecrypt.py 37 | 38 | License 39 | ------- 40 | Original reverse engineering and Kindle PIN code is copyright the respective 41 | authors. The GUI is copyright 2010 Daniel G. Taylor and released under the MIT 42 | License. See the script itself for details. 43 | 44 | -------------------------------------------------------------------------------- /kindledecrypt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Kindle Book Decrypter 5 | ===================== 6 | Simple GUI for MobiDeDRM code written with wxWidgets. This GUI takes a 7 | serial number and encrypted book file and outputs an unencrypted book 8 | that can be used to backup your data or legally remove audio and other 9 | restrictions by allowing you to convert to other formats. 10 | 11 | License 12 | ------- 13 | Copyright (C) 2010 Daniel G. Taylor 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in 23 | all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 31 | THE SOFTWARE. 32 | """ 33 | 34 | __author__ = "Daniel G. Taylor" 35 | __version__ = 1.1 36 | 37 | import ConfigParser 38 | import optparse 39 | import os 40 | import sys 41 | import wx 42 | 43 | import mobidedrm 44 | import process 45 | import topaz 46 | 47 | CONFIG = os.path.expanduser("~/.mobidedrmwx.cfg") 48 | 49 | class MobiDeDrmApp(wx.App): 50 | """ 51 | The main application holding all windows, controls, etc. 52 | """ 53 | def __init__(self, redir=False): 54 | super(MobiDeDrmApp, self).__init__(redir) 55 | 56 | self.config = ConfigParser.SafeConfigParser() 57 | if os.path.exists(CONFIG): 58 | self.config.read(CONFIG) 59 | 60 | if not self.config.has_section("General"): 61 | self.config.add_section("General") 62 | 63 | if self.config.has_option("General", "Serial"): 64 | default_serial = self.config.get("General", "Serial") 65 | else: 66 | # This is just a random example serial 67 | default_serial = "B002A1C457493453" 68 | 69 | self.frame = wx.Frame(None, wx.ID_ANY, "Kindle Book Decrypter", size=(400, 130)) 70 | 71 | self.panel = wx.Panel(self.frame) 72 | self.vbox = wx.BoxSizer(wx.VERTICAL) 73 | 74 | self.grid = wx.GridBagSizer(3, 3) 75 | self.serial_label = wx.StaticText(self.panel, label="Serial:") 76 | self.serial = wx.TextCtrl(self.panel, value=default_serial) 77 | self.serial_help = wx.StaticText(self.panel, label="Kindle or Kindle for iPhone serial number") 78 | font = self.serial_help.GetFont() 79 | font.SetPointSize(8) 80 | self.serial_help.SetFont(font) 81 | self.input_label = wx.StaticText(self.panel, label="Book:") 82 | self.input = wx.FilePickerCtrl(self.panel, wildcard="Kindle Books|*.azw;*.mobi;*.prc;*.azw1|All Files|*.*") 83 | self.button = wx.Button(self.panel, label="Decrypt") 84 | 85 | self.grid.Add(self.serial_label, (0, 0), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALL) 86 | self.grid.Add(self.serial, (0, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND) 87 | self.grid.Add(self.serial_help, (1, 1)) 88 | self.grid.Add(self.input_label, (2, 0), flag=wx.ALIGN_CENTER_VERTICAL) 89 | self.grid.Add(self.input, (2, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND) 90 | self.grid.Add(self.button, (3, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALIGN_RIGHT) 91 | 92 | self.grid.AddGrowableCol(1, 1) 93 | 94 | self.vbox.Add(self.grid, 1, wx.ALL | wx.EXPAND, border=5) 95 | 96 | self.panel.SetSizer(self.vbox) 97 | self.vbox.Fit(self.frame) 98 | 99 | self.frame.Bind(wx.EVT_BUTTON, self.on_process, self.button) 100 | self.frame.Bind(wx.EVT_TEXT, self.on_serial_changed, self.serial) 101 | 102 | self.frame.Centre() 103 | self.frame.Show(True) 104 | 105 | def on_serial_changed(self, event): 106 | """ 107 | The serial number has changed. If it is the correct number of 108 | characters then enable the decrypt button, otherwise disable it 109 | until a valid serial is entered. 110 | """ 111 | serial = self.serial.GetValue() 112 | if len(serial) in [16, 40]: 113 | self.button.Enable() 114 | self.config.set("General", "Serial", self.serial.GetValue()) 115 | self.config.write(open(CONFIG, "w")) 116 | else: 117 | self.button.Disable() 118 | 119 | def on_process(self, event): 120 | """ 121 | The decrypt button was clicked, so start the decrypting process. 122 | This shows a pulsing progress dialog while the book is decrypted, 123 | displaying a dialog for any errors that are encountered. 124 | """ 125 | infile = self.input.GetPath() 126 | 127 | if not os.path.exists(infile): 128 | error_dialog = wx.MessageDialog(self.panel, "Error: Input file doesn't exist!", "Error procesesing file!", wx.OK | wx.ICON_ERROR) 129 | error_dialog.ShowModal() 130 | error_dialog.Destroy() 131 | return 132 | 133 | # Which type of book is this? 134 | ext = "" 135 | try: 136 | topaz.cmbtc.bookFile = topaz.cmbtc.openBook(infile) 137 | topaz.cmbtc.parseTopazHeader() 138 | except topaz.cmbtc.CMBDTCFatal: 139 | ext = ".mobi" 140 | 141 | outfile = os.path.splitext(infile)[0] + "-decrypted" + ext 142 | pid = mobidedrm.getPid(self.serial.GetValue()) 143 | dialog = wx.ProgressDialog("Progress", "Decrypting...") 144 | dialog.Pulse() 145 | dialog.Show() 146 | for error in process.decrypt(infile, outfile, pid): 147 | dialog.Pulse() 148 | wx.Yield() 149 | 150 | if error: 151 | error_dialog = wx.MessageDialog(self.panel, "Error: %s" % error, "Error processing file!", wx.OK | wx.ICON_ERROR) 152 | error_dialog.ShowModal() 153 | error_dialog.Destroy() 154 | 155 | dialog.Destroy() 156 | 157 | if __name__ == "__main__": 158 | parser = optparse.OptionParser("%prog [options]", version="Kindle Book Decrypter %s" % __version__) 159 | 160 | options, args = parser.parse_args() 161 | 162 | app = MobiDeDrmApp() 163 | app.MainLoop() 164 | 165 | -------------------------------------------------------------------------------- /mobidedrm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # This is a python script. You need a Python interpreter to run it. 4 | # For example, ActiveState Python, which exists for windows. 5 | # 6 | # It can run standalone to convert files, or it can be installed as a 7 | # plugin for Calibre (http://calibre-ebook.com/about) so that 8 | # importing files with DRM 'Just Works'. 9 | # 10 | # To create a Calibre plugin, rename this file so that the filename 11 | # ends in '_plugin.py', put it into a ZIP file and import that Calibre 12 | # using its plugin configuration GUI. 13 | # 14 | # Changelog 15 | # 0.01 - Initial version 16 | # 0.02 - Huffdic compressed books were not properly decrypted 17 | # 0.03 - Wasn't checking MOBI header length 18 | # 0.04 - Wasn't sanity checking size of data record 19 | # 0.05 - It seems that the extra data flags take two bytes not four 20 | # 0.06 - And that low bit does mean something after all :-) 21 | # 0.07 - The extra data flags aren't present in MOBI header < 0xE8 in size 22 | # 0.08 - ...and also not in Mobi header version < 6 23 | # 0.09 - ...but they are there with Mobi header version 6, header size 0xE4! 24 | # 0.10 - Outputs unencrypted files as-is, so that when run as a Calibre 25 | # import filter it works when importing unencrypted files. 26 | # Also now handles encrypted files that don't need a specific PID. 27 | # 0.11 - use autoflushed stdout and proper return values 28 | # 0.12 - Fix for problems with metadata import as Calibre plugin, report errors 29 | # 0.13 - Formatting fixes: retabbed file, removed trailing whitespace 30 | # and extra blank lines, converted CR/LF pairs at ends of each line, 31 | # and other cosmetic fixes. 32 | # 0.14 - Working out when the extra data flags are present has been problematic 33 | # Versions 7 through 9 have tried to tweak the conditions, but have been 34 | # only partially successful. Closer examination of lots of sample 35 | # files reveals that a confusin has arisen because trailing data entries 36 | # are not encrypted, but it turns out that the multibyte entries 37 | # in utf8 file are encrypted. (Although neither kind gets compressed.) 38 | # This knowledge leads to a simplification of the test for the 39 | # trailing data byte flags - version 5 and higher AND header 40 | # size >= 0xE4. 41 | # 0.15 - Now outputs 'hearbeat', and is also quicker for long files. 42 | # 0.16 - And reverts to 'done' not 'done.' at the end for unswindle 43 | # compatibility. 44 | # 0.17 - Added ability to extract PID given a Kindle serial number, added 45 | # OptionParser interface to argument processing, allow import as a 46 | # library without assuming Calibre is importing it 47 | 48 | __version__ = '0.17' 49 | 50 | import sys 51 | import struct 52 | import binascii 53 | 54 | from optparse import OptionParser 55 | 56 | letters = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789" 57 | 58 | class Unbuffered: 59 | def __init__(self, stream): 60 | self.stream = stream 61 | def write(self, data): 62 | self.stream.write(data) 63 | self.stream.flush() 64 | def __getattr__(self, attr): 65 | return getattr(self.stream, attr) 66 | 67 | class DrmException(Exception): 68 | pass 69 | 70 | # Implementation of Pukall Cipher 1 71 | def PC1(key, src, decryption=True): 72 | sum1 = 0; 73 | sum2 = 0; 74 | keyXorVal = 0; 75 | if len(key)!=16: 76 | print "Bad key length!" 77 | return None 78 | wkey = [] 79 | for i in xrange(8): 80 | wkey.append(ord(key[i*2])<<8 | ord(key[i*2+1])) 81 | 82 | dst = "" 83 | for i in xrange(len(src)): 84 | temp1 = 0; 85 | byteXorVal = 0; 86 | for j in xrange(8): 87 | temp1 ^= wkey[j] 88 | sum2 = (sum2+j)*20021 + sum1 89 | sum1 = (temp1*346)&0xFFFF 90 | sum2 = (sum2+sum1)&0xFFFF 91 | temp1 = (temp1*20021+1)&0xFFFF 92 | byteXorVal ^= temp1 ^ sum2 93 | curByte = ord(src[i]) 94 | if not decryption: 95 | keyXorVal = curByte * 257; 96 | curByte = ((curByte ^ (byteXorVal >> 8)) ^ byteXorVal) & 0xFF 97 | if decryption: 98 | keyXorVal = curByte * 257; 99 | for j in xrange(8): 100 | wkey[j] ^= keyXorVal; 101 | dst+=chr(curByte) 102 | return dst 103 | 104 | def checksumPid(s): 105 | crc = (~binascii.crc32(s,-1))&0xFFFFFFFF 106 | crc = crc ^ (crc >> 16) 107 | res = s 108 | l = len(letters) 109 | for i in (0,1): 110 | b = crc & 0xff 111 | pos = (b // l) ^ (b % l) 112 | res += letters[pos%l] 113 | crc >>= 8 114 | return res 115 | 116 | def pidFromSerial(s, l): 117 | crc = (~binascii.crc32(s,-1))&0xFFFFFFFF 118 | 119 | arr1 = [0]*l 120 | for i in xrange(len(s)): 121 | arr1[i%l] ^= ord(s[i]) 122 | 123 | crc_bytes = [crc >> 24 & 0xff, crc >> 16 & 0xff, crc >> 8 & 0xff, crc & 0xff] 124 | for i in xrange(l): 125 | arr1[i] ^= crc_bytes[i&3] 126 | 127 | pid = "" 128 | for i in xrange(l): 129 | b = arr1[i] & 0xff 130 | pid+=letters[(b >> 7) + ((b >> 5 & 3) ^ (b & 0x1f))] 131 | 132 | return pid 133 | 134 | def getPid(serial): 135 | pid = "" 136 | if len(serial) == 16: 137 | pid = checksumPid(pidFromSerial(serial, 7) + "*") 138 | elif len(serial) == 40: 139 | pid = checksumPid(pidFromSerial(serial, 8)) 140 | 141 | return pid 142 | 143 | def getSizeOfTrailingDataEntries(ptr, size, flags): 144 | def getSizeOfTrailingDataEntry(ptr, size): 145 | bitpos, result = 0, 0 146 | if size <= 0: 147 | return result 148 | while True: 149 | v = ord(ptr[size-1]) 150 | result |= (v & 0x7F) << bitpos 151 | bitpos += 7 152 | size -= 1 153 | if (v & 0x80) != 0 or (bitpos >= 28) or (size == 0): 154 | return result 155 | num = 0 156 | testflags = flags >> 1 157 | while testflags: 158 | if testflags & 1: 159 | num += getSizeOfTrailingDataEntry(ptr, size - num) 160 | testflags >>= 1 161 | # Multibyte data, if present, is included in the encryption, so 162 | # we do not need to check the low bit. 163 | # if flags & 1: 164 | # num += (ord(ptr[size - num - 1]) & 0x3) + 1 165 | return num 166 | 167 | class DrmStripper: 168 | def loadSection(self, section): 169 | if (section + 1 == self.num_sections): 170 | endoff = len(self.data_file) 171 | else: 172 | endoff = self.sections[section + 1][0] 173 | off = self.sections[section][0] 174 | return self.data_file[off:endoff] 175 | 176 | def patch(self, off, new): 177 | self.data_file = self.data_file[:off] + new + self.data_file[off+len(new):] 178 | 179 | def patchSection(self, section, new, in_off = 0): 180 | if (section + 1 == self.num_sections): 181 | endoff = len(self.data_file) 182 | else: 183 | endoff = self.sections[section + 1][0] 184 | off = self.sections[section][0] 185 | assert off + in_off + len(new) <= endoff 186 | self.patch(off + in_off, new) 187 | 188 | def parseDRM(self, data, count, pid): 189 | pid = pid.ljust(16,'\0') 190 | keyvec1 = "\x72\x38\x33\xB0\xB4\xF2\xE3\xCA\xDF\x09\x01\xD6\xE2\xE0\x3F\x96" 191 | temp_key = PC1(keyvec1, pid, False) 192 | temp_key_sum = sum(map(ord,temp_key)) & 0xff 193 | found_key = None 194 | for i in xrange(count): 195 | verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30]) 196 | cookie = PC1(temp_key, cookie) 197 | ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie) 198 | if verification == ver and cksum == temp_key_sum and (flags & 0x1F) == 1: 199 | found_key = finalkey 200 | break 201 | if not found_key: 202 | # Then try the default encoding that doesn't require a PID 203 | temp_key = keyvec1 204 | temp_key_sum = sum(map(ord,temp_key)) & 0xff 205 | for i in xrange(count): 206 | verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30]) 207 | cookie = PC1(temp_key, cookie) 208 | ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie) 209 | if verification == ver and cksum == temp_key_sum: 210 | found_key = finalkey 211 | break 212 | return found_key 213 | 214 | def __init__(self, data_file, pid): 215 | if checksumPid(pid[0:-2]) != pid: 216 | raise DrmException("invalid PID checksum") 217 | pid = pid[0:-2] 218 | 219 | self.data_file = data_file 220 | header = data_file[0:72] 221 | if header[0x3C:0x3C+8] != 'BOOKMOBI': 222 | raise DrmException("invalid file format") 223 | self.num_sections, = struct.unpack('>H', data_file[76:78]) 224 | 225 | self.sections = [] 226 | for i in xrange(self.num_sections): 227 | offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', data_file[78+i*8:78+i*8+8]) 228 | flags, val = a1, a2<<16|a3<<8|a4 229 | self.sections.append( (offset, flags, val) ) 230 | 231 | sect = self.loadSection(0) 232 | records, = struct.unpack('>H', sect[0x8:0x8+2]) 233 | mobi_length, = struct.unpack('>L',sect[0x14:0x18]) 234 | mobi_version, = struct.unpack('>L',sect[0x68:0x6C]) 235 | extra_data_flags = 0 236 | print "MOBI header version = %d, length = %d" %(mobi_version, mobi_length) 237 | if (mobi_length >= 0xE4) and (mobi_version >= 5): 238 | extra_data_flags, = struct.unpack('>H', sect[0xF2:0xF4]) 239 | print "Extra Data Flags = %d" %extra_data_flags 240 | 241 | crypto_type, = struct.unpack('>H', sect[0xC:0xC+2]) 242 | if crypto_type == 0: 243 | print "This book is not encrypted." 244 | else: 245 | if crypto_type == 1: 246 | raise DrmException("cannot decode Mobipocket encryption type 1") 247 | if crypto_type != 2: 248 | raise DrmException("unknown encryption type: %d" % crypto_type) 249 | 250 | # calculate the keys 251 | drm_ptr, drm_count, drm_size, drm_flags = struct.unpack('>LLLL', sect[0xA8:0xA8+16]) 252 | if drm_count == 0: 253 | raise DrmException("no PIDs found in this file") 254 | found_key = self.parseDRM(sect[drm_ptr:drm_ptr+drm_size], drm_count, pid) 255 | if not found_key: 256 | raise DrmException("no key found. maybe the PID is incorrect") 257 | 258 | # kill the drm keys 259 | self.patchSection(0, "\0" * drm_size, drm_ptr) 260 | # kill the drm pointers 261 | self.patchSection(0, "\xff" * 4 + "\0" * 12, 0xA8) 262 | # clear the crypto type 263 | self.patchSection(0, "\0" * 2, 0xC) 264 | 265 | # decrypt sections 266 | print "Decrypting. Please wait . . .", 267 | new_data = self.data_file[:self.sections[1][0]] 268 | for i in xrange(1, records+1): 269 | data = self.loadSection(i) 270 | extra_size = getSizeOfTrailingDataEntries(data, len(data), extra_data_flags) 271 | if i%100 == 0: 272 | print ".", 273 | # print "record %d, extra_size %d" %(i,extra_size) 274 | new_data += PC1(found_key, data[0:len(data) - extra_size]) 275 | if extra_size > 0: 276 | new_data += data[-extra_size:] 277 | #self.patchSection(i, PC1(found_key, data[0:len(data) - extra_size])) 278 | if self.num_sections > records+1: 279 | new_data += self.data_file[self.sections[records+1][0]:] 280 | self.data_file = new_data 281 | print "done" 282 | 283 | def getResult(self): 284 | return self.data_file 285 | 286 | if __name__ == "__main__": 287 | sys.stdout=Unbuffered(sys.stdout) 288 | print ('MobiDeDrm v%(__version__)s. ' 289 | 'Copyright 2008-2010 The Dark Reverser.' % globals()) 290 | 291 | parser = OptionParser("Usage: %prog [options] input.azw output.mobi PID", version=__version__) 292 | parser.add_option("-s", "--serial", dest="serial", default="", help="Get the PID from a Kindle or Kindle for iPhone serial number") 293 | 294 | options, args = parser.parse_args() 295 | 296 | if options.serial: 297 | print "Mobipocket PID: " + getPid(options.serial) 298 | sys.exit(0) 299 | 300 | if len(args) < 4: 301 | print "Removes protection from Mobipocket books" 302 | parser.print_help() 303 | sys.exit(1) 304 | else: 305 | infile = args[1] 306 | outfile = args[2] 307 | pid = args[3] 308 | data_file = file(infile, 'rb').read() 309 | try: 310 | strippedFile = DrmStripper(data_file, pid) 311 | file(outfile, 'wb').write(strippedFile.getResult()) 312 | except DrmException, e: 313 | print "Error: %s" % e 314 | sys.exit(1) 315 | sys.exit(0) 316 | elif "calibre" in globals(): 317 | from calibre.customize import FileTypePlugin 318 | 319 | class MobiDeDRM(FileTypePlugin): 320 | name = 'MobiDeDRM' # Name of the plugin 321 | description = 'Removes DRM from secure Mobi files' 322 | supported_platforms = ['linux', 'osx', 'windows'] # Platforms this plugin will run on 323 | author = 'The Dark Reverser' # The author of this plugin 324 | version = (0, 1, 6) # The version number of this plugin 325 | file_types = set(['prc','mobi','azw']) # The file types that this plugin will be applied to 326 | on_import = True # Run this plugin during the import 327 | 328 | def run(self, path_to_ebook): 329 | from calibre.gui2 import is_ok_to_use_qt 330 | from PyQt4.Qt import QMessageBox 331 | PID = self.site_customization 332 | data_file = file(path_to_ebook, 'rb').read() 333 | ar = PID.split(',') 334 | for i in ar: 335 | try: 336 | unlocked_file = DrmStripper(data_file, i).getResult() 337 | except DrmException: 338 | # ignore the error 339 | pass 340 | else: 341 | of = self.temporary_file('.mobi') 342 | of.write(unlocked_file) 343 | of.close() 344 | return of.name 345 | if is_ok_to_use_qt(): 346 | d = QMessageBox(QMessageBox.Warning, "MobiDeDRM Plugin", "Couldn't decode: %s\n\nImporting encrypted version." % path_to_ebook) 347 | d.show() 348 | d.raise_() 349 | d.exec_() 350 | return path_to_ebook 351 | 352 | def customization_help(self, gui=False): 353 | return 'Enter PID (separate multiple PIDs with comma)' 354 | 355 | -------------------------------------------------------------------------------- /process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Utilities for decrypting a book in a separate process. This gets its own 5 | module as the multiprocessing module duplicates the global namespace when 6 | spawning new processes. This separate module limits the amount of stuff 7 | that gets duplicated and prevents serialization errors on certain platforms 8 | with e.g. wxWidgets. 9 | """ 10 | 11 | import mobidedrm 12 | import multiprocessing 13 | import os 14 | import shutil 15 | import tempfile 16 | import time 17 | import topaz 18 | 19 | multiprocessing.freeze_support() 20 | 21 | def _process(infile, outfile, pid, error): 22 | try: 23 | if outfile.endswith(".mobi"): 24 | # Mobi file 25 | data_file = open(infile, "rb").read() 26 | strippedFile = mobidedrm.DrmStripper(data_file, pid) 27 | file(outfile, 'wb').write(strippedFile.getResult()) 28 | else: 29 | # Topaz file 30 | tmp = tempfile.mkdtemp() 31 | args = ['./cmbtc.py', '-v', '-p', pid[:8], '-d', '-o', tmp, infile] 32 | topaz.cmbtc.main(argv=args) 33 | topaz.gensvg.main(['./gensvg.py', tmp]) 34 | topaz.genhtml.main(['./genhtml.py', tmp]) 35 | 36 | if not os.path.exists(outfile): 37 | os.mkdir(outfile) 38 | 39 | for filename in ["img", "style.css", "book.html"]: 40 | shutil.move(os.path.join(tmp, filename), os.path.join(outfile, filename)) 41 | 42 | shutil.rmtree(tmp) 43 | except Exception, e: 44 | error.value = str(e) 45 | 46 | def decrypt(infile, outfile, pid): 47 | """ 48 | Decrypt a Kindle book in a different process. This periodically yields 49 | so that status information can be shown. Use like: 50 | 51 | >>> for error in decrypt(infile, outfile, pid): 52 | >>> progress_update() 53 | >>> if error: 54 | >>> print error 55 | 56 | """ 57 | error = None 58 | 59 | errorobj = multiprocessing.Array("c", 512) 60 | proc = multiprocessing.Process(target=_process, args=(infile, outfile, pid, errorobj)) 61 | proc.start() 62 | while proc.is_alive(): 63 | yield "" 64 | time.sleep(0.1) 65 | proc.join() 66 | 67 | if errorobj.value: 68 | error = errorobj.value 69 | 70 | yield error 71 | 72 | -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lrobot/kindledecrypt/46052aeb076081ab1027de0f7dc2033d4a6304d9/screenshot.png -------------------------------------------------------------------------------- /setup-macosx.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a setup.py script generated by py2applet 3 | 4 | Usage: 5 | python setup.py py2app 6 | """ 7 | 8 | from setuptools import setup 9 | 10 | APP = ['kindledecrypt.py'] 11 | DATA_FILES = [] 12 | OPTIONS = {'argv_emulation': True} 13 | 14 | setup( 15 | app=APP, 16 | data_files=DATA_FILES, 17 | options={'py2app': OPTIONS}, 18 | setup_requires=['py2app'], 19 | ) 20 | -------------------------------------------------------------------------------- /setup-win32.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from distutils.core import setup 4 | import py2exe, sys, os 5 | 6 | sys.argv.append('py2exe') 7 | 8 | setup( 9 | options = {'py2exe': {'bundle_files':1}}, 10 | windows = [{'script': 'kindledecrypt.py'}], 11 | zipfile = None, 12 | ) 13 | -------------------------------------------------------------------------------- /topaz/__init__.py: -------------------------------------------------------------------------------- 1 | import cmbtc 2 | import gensvg 3 | import genhtml 4 | -------------------------------------------------------------------------------- /topaz/cmbtc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # For use with Topaz Scripts Version 2.6 3 | 4 | class Unbuffered: 5 | def __init__(self, stream): 6 | self.stream = stream 7 | def write(self, data): 8 | self.stream.write(data) 9 | self.stream.flush() 10 | def __getattr__(self, attr): 11 | return getattr(self.stream, attr) 12 | 13 | import sys 14 | sys.stdout=Unbuffered(sys.stdout) 15 | 16 | import csv 17 | import os 18 | import getopt 19 | import zlib 20 | from struct import pack 21 | from struct import unpack 22 | 23 | MAX_PATH = 255 24 | 25 | # Put the first 8 characters of your Kindle PID here 26 | # or supply it with the -p option in the command line 27 | #################################################### 28 | kindlePID = "12345678" 29 | #################################################### 30 | 31 | global bookFile 32 | global bookPayloadOffset 33 | global bookHeaderRecords 34 | global bookMetadata 35 | global bookKey 36 | global command 37 | 38 | # 39 | # Exceptions for all the problems that might happen during the script 40 | # 41 | 42 | class CMBDTCError(Exception): 43 | pass 44 | 45 | class CMBDTCFatal(Exception): 46 | pass 47 | 48 | # 49 | # Open the book file at path 50 | # 51 | 52 | def openBook(path): 53 | try: 54 | return open(path,'rb') 55 | except: 56 | raise CMBDTCFatal("Could not open book file: " + path) 57 | 58 | # 59 | # Get a 7 bit encoded number from the book file 60 | # 61 | 62 | def bookReadEncodedNumber(): 63 | flag = False 64 | data = ord(bookFile.read(1)) 65 | 66 | if data == 0xFF: 67 | flag = True 68 | data = ord(bookFile.read(1)) 69 | 70 | if data >= 0x80: 71 | datax = (data & 0x7F) 72 | while data >= 0x80 : 73 | data = ord(bookFile.read(1)) 74 | datax = (datax <<7) + (data & 0x7F) 75 | data = datax 76 | 77 | if flag: 78 | data = -data 79 | return data 80 | 81 | # 82 | # Encode a number in 7 bit format 83 | # 84 | 85 | def encodeNumber(number): 86 | result = "" 87 | negative = False 88 | flag = 0 89 | 90 | if number < 0 : 91 | number = -number + 1 92 | negative = True 93 | 94 | while True: 95 | byte = number & 0x7F 96 | number = number >> 7 97 | byte += flag 98 | result += chr(byte) 99 | flag = 0x80 100 | if number == 0 : 101 | if (byte == 0xFF and negative == False) : 102 | result += chr(0x80) 103 | break 104 | 105 | if negative: 106 | result += chr(0xFF) 107 | 108 | return result[::-1] 109 | 110 | # 111 | # Get a length prefixed string from the file 112 | # 113 | 114 | def bookReadString(): 115 | stringLength = bookReadEncodedNumber() 116 | return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0] 117 | 118 | # 119 | # Returns a length prefixed string 120 | # 121 | 122 | def lengthPrefixString(data): 123 | return encodeNumber(len(data))+data 124 | 125 | 126 | # 127 | # Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...] 128 | # 129 | 130 | def bookReadHeaderRecordData(): 131 | nbValues = bookReadEncodedNumber() 132 | values = [] 133 | for i in range (0,nbValues): 134 | values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()]) 135 | return values 136 | 137 | # 138 | # Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...] 139 | # 140 | 141 | def parseTopazHeaderRecord(): 142 | if ord(bookFile.read(1)) != 0x63: 143 | raise CMBDTCFatal("Parse Error : Invalid Header") 144 | 145 | tag = bookReadString() 146 | record = bookReadHeaderRecordData() 147 | return [tag,record] 148 | 149 | # 150 | # Parse the header of a Topaz file, get all the header records and the offset for the payload 151 | # 152 | 153 | def parseTopazHeader(): 154 | global bookHeaderRecords 155 | global bookPayloadOffset 156 | magic = unpack("4s",bookFile.read(4))[0] 157 | 158 | if magic != 'TPZ0': 159 | raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file") 160 | 161 | nbRecords = bookReadEncodedNumber() 162 | bookHeaderRecords = {} 163 | 164 | for i in range (0,nbRecords): 165 | result = parseTopazHeaderRecord() 166 | #print result[0], result[1] 167 | bookHeaderRecords[result[0]] = result[1] 168 | 169 | if ord(bookFile.read(1)) != 0x64 : 170 | raise CMBDTCFatal("Parse Error : Invalid Header") 171 | 172 | bookPayloadOffset = bookFile.tell() 173 | 174 | # 175 | # Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed 176 | # Correction, the record is correctly decompressed too 177 | # 178 | 179 | def getBookPayloadRecord(name, index): 180 | encrypted = False 181 | compressed = False 182 | 183 | try: 184 | recordOffset = bookHeaderRecords[name][index][0] 185 | except: 186 | raise CMBDTCFatal("Parse Error : Invalid Record, record not found") 187 | 188 | bookFile.seek(bookPayloadOffset + recordOffset) 189 | 190 | tag = bookReadString() 191 | if tag != name : 192 | raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match") 193 | 194 | recordIndex = bookReadEncodedNumber() 195 | 196 | if recordIndex < 0 : 197 | encrypted = True 198 | recordIndex = -recordIndex -1 199 | 200 | if recordIndex != index : 201 | raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match") 202 | 203 | if (bookHeaderRecords[name][index][2] > 0): 204 | compressed = True 205 | record = bookFile.read(bookHeaderRecords[name][index][2]) 206 | else: 207 | record = bookFile.read(bookHeaderRecords[name][index][1]) 208 | 209 | if encrypted: 210 | ctx = topazCryptoInit(bookKey) 211 | record = topazCryptoDecrypt(record,ctx) 212 | 213 | if compressed: 214 | record = zlib.decompress(record) 215 | 216 | return record 217 | 218 | # 219 | # Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename" 220 | # 221 | 222 | def extractBookPayloadRecord(name, index, filename): 223 | compressed = False 224 | 225 | try: 226 | compressed = bookHeaderRecords[name][index][2] != 0 227 | record = getBookPayloadRecord(name,index) 228 | except: 229 | print("Could not find record") 230 | 231 | # if compressed: 232 | # try: 233 | # record = zlib.decompress(record) 234 | # except: 235 | # raise CMBDTCFatal("Could not decompress record") 236 | 237 | if filename != "": 238 | try: 239 | file = open(filename,"wb") 240 | file.write(record) 241 | file.close() 242 | except: 243 | raise CMBDTCFatal("Could not write to destination file") 244 | else: 245 | print(record) 246 | 247 | # 248 | # return next record [key,value] from the book metadata from the current book position 249 | # 250 | 251 | def readMetadataRecord(): 252 | return [bookReadString(),bookReadString()] 253 | 254 | # 255 | # Parse the metadata record from the book payload and return a list of [key,values] 256 | # 257 | 258 | def parseMetadata(): 259 | global bookHeaderRecords 260 | global bookPayloadAddress 261 | global bookMetadata 262 | bookMetadata = {} 263 | bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0]) 264 | tag = bookReadString() 265 | if tag != "metadata" : 266 | raise CMBDTCFatal("Parse Error : Record Names Don't Match") 267 | 268 | flags = ord(bookFile.read(1)) 269 | nbRecords = ord(bookFile.read(1)) 270 | 271 | for i in range (0,nbRecords) : 272 | record =readMetadataRecord() 273 | bookMetadata[record[0]] = record[1] 274 | 275 | # 276 | # Context initialisation for the Topaz Crypto 277 | # 278 | 279 | def topazCryptoInit(key): 280 | ctx1 = 0x0CAFFE19E 281 | 282 | for keyChar in key: 283 | keyByte = ord(keyChar) 284 | ctx2 = ctx1 285 | ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF ) 286 | return [ctx1,ctx2] 287 | 288 | # 289 | # decrypt data with the context prepared by topazCryptoInit() 290 | # 291 | 292 | def topazCryptoDecrypt(data, ctx): 293 | ctx1 = ctx[0] 294 | ctx2 = ctx[1] 295 | 296 | plainText = "" 297 | 298 | for dataChar in data: 299 | dataByte = ord(dataChar) 300 | m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF 301 | ctx2 = ctx1 302 | ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF) 303 | plainText += chr(m) 304 | 305 | return plainText 306 | 307 | # 308 | # Decrypt a payload record with the PID 309 | # 310 | 311 | def decryptRecord(data,PID): 312 | ctx = topazCryptoInit(PID) 313 | return topazCryptoDecrypt(data, ctx) 314 | 315 | # 316 | # Try to decrypt a dkey record (contains the book PID) 317 | # 318 | 319 | def decryptDkeyRecord(data,PID): 320 | record = decryptRecord(data,PID) 321 | fields = unpack("3sB8sB8s3s",record) 322 | 323 | if fields[0] != "PID" or fields[5] != "pid" : 324 | raise CMBDTCError("Didn't find PID magic numbers in record") 325 | elif fields[1] != 8 or fields[3] != 8 : 326 | raise CMBDTCError("Record didn't contain correct length fields") 327 | elif fields[2] != PID : 328 | raise CMBDTCError("Record didn't contain PID") 329 | 330 | return fields[4] 331 | 332 | # 333 | # Decrypt all the book's dkey records (contain the book PID) 334 | # 335 | 336 | def decryptDkeyRecords(data,PID): 337 | nbKeyRecords = ord(data[0]) 338 | records = [] 339 | data = data[1:] 340 | for i in range (0,nbKeyRecords): 341 | length = ord(data[0]) 342 | try: 343 | key = decryptDkeyRecord(data[1:length+1],PID) 344 | records.append(key) 345 | except CMBDTCError: 346 | pass 347 | data = data[1+length:] 348 | 349 | return records 350 | 351 | 352 | def createDecryptedPayload(payload): 353 | for headerRecord in bookHeaderRecords: 354 | name = headerRecord 355 | if name != "dkey" : 356 | ext = '.dat' 357 | if name == 'img' : ext = '.jpg' 358 | for index in range (0,len(bookHeaderRecords[name])) : 359 | fnum = "%04d" % index 360 | fname = name + fnum + ext 361 | destdir = payload 362 | if name == 'img': 363 | destdir = os.path.join(payload,'img') 364 | if name == 'page': 365 | destdir = os.path.join(payload,'page') 366 | if name == 'glyphs': 367 | destdir = os.path.join(payload,'glyphs') 368 | outputFile = os.path.join(destdir,fname) 369 | file(outputFile, 'wb').write(getBookPayloadRecord(name, index)) 370 | 371 | 372 | # Create decrypted book 373 | # 374 | 375 | def createDecryptedBook(outdir): 376 | if not os.path.exists(outdir): 377 | os.makedirs(outdir) 378 | 379 | destdir = os.path.join(outdir,'img') 380 | if not os.path.exists(destdir): 381 | os.makedirs(destdir) 382 | 383 | destdir = os.path.join(outdir,'page') 384 | if not os.path.exists(destdir): 385 | os.makedirs(destdir) 386 | 387 | destdir = os.path.join(outdir,'glyphs') 388 | if not os.path.exists(destdir): 389 | os.makedirs(destdir) 390 | 391 | createDecryptedPayload(outdir) 392 | 393 | 394 | # 395 | # Set the command to execute by the programm according to cmdLine parameters 396 | # 397 | 398 | def setCommand(name) : 399 | global command 400 | if command != "" : 401 | raise CMBDTCFatal("Invalid command line parameters") 402 | else : 403 | command = name 404 | 405 | # 406 | # Program usage 407 | # 408 | 409 | def usage(): 410 | print("\nUsage:") 411 | print("\ncmbtc_dump_linux.py [options] bookFileName\n") 412 | print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)") 413 | print("-d Dumps the unencrypted book as files to outdir") 414 | print("-o Output directory to save book files to") 415 | print("-v Verbose (can be used several times)") 416 | 417 | 418 | # 419 | # Main 420 | # 421 | 422 | def main(argv=sys.argv): 423 | global bookMetadata 424 | global bookKey 425 | global bookFile 426 | global command 427 | 428 | print argv 429 | 430 | progname = os.path.basename(argv[0]) 431 | 432 | verbose = 0 433 | recordName = "" 434 | recordIndex = 0 435 | outdir = "" 436 | PIDs = [] 437 | command = "" 438 | 439 | # Preloads your Kindle pid from the top of the program. 440 | PIDs.append(kindlePID) 441 | 442 | try: 443 | opts, args = getopt.getopt(argv[1:], "vo:p:d") 444 | except getopt.GetoptError, err: 445 | # print help information and exit: 446 | print str(err) # will print something like "option -a not recognized" 447 | usage() 448 | sys.exit(2) 449 | 450 | if len(opts) == 0 and len(args) == 0 : 451 | usage() 452 | sys.exit(2) 453 | 454 | for o, a in opts: 455 | if o == "-v": 456 | verbose+=1 457 | if o =="-o": 458 | if a == None : 459 | raise CMBDTCFatal("Invalid parameter for -o") 460 | outdir = a 461 | if o =="-p": 462 | PIDs.append(a) 463 | if o =="-d": 464 | setCommand("doit") 465 | 466 | if command == "" : 467 | raise CMBDTCFatal("No action supplied on command line") 468 | 469 | # 470 | # Open book and parse metadata 471 | # 472 | 473 | if len(args) == 1: 474 | 475 | bookFile = openBook(args[0]) 476 | parseTopazHeader() 477 | parseMetadata() 478 | 479 | # 480 | # Decrypt book key 481 | # 482 | 483 | dkey = getBookPayloadRecord('dkey', 0) 484 | 485 | bookKeys = [] 486 | for PID in PIDs : 487 | bookKeys+=decryptDkeyRecords(dkey,PID) 488 | 489 | if len(bookKeys) == 0 : 490 | if verbose > 0 : 491 | print ("Book key could not be found. Maybe this book is not registered with this device.") 492 | return 1 493 | else : 494 | bookKey = bookKeys[0] 495 | if verbose > 0: 496 | print("Book key: " + bookKey.encode('hex')) 497 | 498 | if command == "printRecord" : 499 | extractBookPayloadRecord(recordName,int(recordIndex),outputFile) 500 | if outputFile != "" and verbose>0 : 501 | print("Wrote record to file: "+outputFile) 502 | elif command == "doit" : 503 | if outdir != "" : 504 | createDecryptedBook(outdir) 505 | if verbose >0 : 506 | print ("Decrypted book saved. Don't pirate!") 507 | elif verbose > 0: 508 | print("Output directory name was not supplied.") 509 | return 1 510 | 511 | return 0 512 | 513 | if __name__ == '__main__': 514 | sys.exit(main()) 515 | -------------------------------------------------------------------------------- /topaz/convert2xml.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | class Unbuffered: 6 | def __init__(self, stream): 7 | self.stream = stream 8 | def write(self, data): 9 | self.stream.write(data) 10 | self.stream.flush() 11 | def __getattr__(self, attr): 12 | return getattr(self.stream, attr) 13 | 14 | import sys 15 | sys.stdout=Unbuffered(sys.stdout) 16 | 17 | import csv 18 | import os 19 | import getopt 20 | from struct import pack 21 | from struct import unpack 22 | 23 | 24 | # Get a 7 bit encoded number from string. The most 25 | # significant byte comes first and has the high bit (8th) set 26 | 27 | def readEncodedNumber(file): 28 | flag = False 29 | c = file.read(1) 30 | if (len(c) == 0): 31 | return None 32 | data = ord(c) 33 | 34 | if data == 0xFF: 35 | flag = True 36 | c = file.read(1) 37 | if (len(c) == 0): 38 | return None 39 | data = ord(c) 40 | 41 | if data >= 0x80: 42 | datax = (data & 0x7F) 43 | while data >= 0x80 : 44 | c = file.read(1) 45 | if (len(c) == 0): 46 | return None 47 | data = ord(c) 48 | datax = (datax <<7) + (data & 0x7F) 49 | data = datax 50 | 51 | if flag: 52 | data = -data 53 | return data 54 | 55 | 56 | # returns a binary string that encodes a number into 7 bits 57 | # most significant byte first which has the high bit set 58 | 59 | def encodeNumber(number): 60 | result = "" 61 | negative = False 62 | flag = 0 63 | 64 | if number < 0 : 65 | number = -number + 1 66 | negative = True 67 | 68 | while True: 69 | byte = number & 0x7F 70 | number = number >> 7 71 | byte += flag 72 | result += chr(byte) 73 | flag = 0x80 74 | if number == 0 : 75 | if (byte == 0xFF and negative == False) : 76 | result += chr(0x80) 77 | break 78 | 79 | if negative: 80 | result += chr(0xFF) 81 | 82 | return result[::-1] 83 | 84 | 85 | 86 | # create / read a length prefixed string from the file 87 | 88 | def lengthPrefixString(data): 89 | return encodeNumber(len(data))+data 90 | 91 | def readString(file): 92 | stringLength = readEncodedNumber(file) 93 | if (stringLength == None): 94 | return "" 95 | sv = file.read(stringLength) 96 | if (len(sv) != stringLength): 97 | return "" 98 | return unpack(str(stringLength)+"s",sv)[0] 99 | 100 | 101 | # convert a binary string generated by encodeNumber (7 bit encoded number) 102 | # to the value you would find inside the page*.dat files to be processed 103 | 104 | def convert(i): 105 | result = '' 106 | val = encodeNumber(i) 107 | for j in xrange(len(val)): 108 | c = ord(val[j:j+1]) 109 | result += '%02x' % c 110 | return result 111 | 112 | 113 | 114 | # the complete string table used to store all book text content 115 | # as well as the xml tokens and values that make sense out of it 116 | 117 | class Dictionary(object): 118 | def __init__(self, dictFile): 119 | self.filename = dictFile 120 | self.size = 0 121 | self.fo = file(dictFile,'rb') 122 | self.stable = [] 123 | self.size = readEncodedNumber(self.fo) 124 | for i in xrange(self.size): 125 | self.stable.append(self.escapestr(readString(self.fo))) 126 | self.pos = 0 127 | 128 | def escapestr(self, str): 129 | str = str.replace('&','&') 130 | str = str.replace('<','<') 131 | str = str.replace('>','>') 132 | str = str.replace('=','=') 133 | return str 134 | 135 | def lookup(self,val): 136 | if ((val >= 0) and (val < self.size)) : 137 | self.pos = val 138 | return self.stable[self.pos] 139 | else: 140 | print "Error - %d outside of string table limits" % val 141 | sys.exit(-1) 142 | 143 | def getSize(self): 144 | return self.size 145 | 146 | def getPos(self): 147 | return self.pos 148 | 149 | def dumpDict(self): 150 | for i in xrange(self.size): 151 | print "%d %s %s" % (i, convert(i), self.stable[i]) 152 | return 153 | 154 | # parses the xml snippets that are represented by each page*.dat file. 155 | # also parses the other0.dat file - the main stylesheet 156 | # and information used to inject the xml snippets into page*.dat files 157 | 158 | class PageParser(object): 159 | def __init__(self, filename, dict, debug, flat_xml): 160 | self.fo = file(filename,'rb') 161 | self.id = os.path.basename(filename).replace('.dat','') 162 | self.dict = dict 163 | self.debug = debug 164 | self.flat_xml = flat_xml 165 | self.tagpath = [] 166 | self.doc = [] 167 | self.snippetList = [] 168 | 169 | 170 | # hash table used to enable the decoding process 171 | # This has all been developed by trial and error so it may still have omissions or 172 | # contain errors 173 | # Format: 174 | # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped) 175 | 176 | token_tags = { 177 | 'x' : (1, 'scalar_number', 0, 0), 178 | 'y' : (1, 'scalar_number', 0, 0), 179 | 'h' : (1, 'scalar_number', 0, 0), 180 | 'w' : (1, 'scalar_number', 0, 0), 181 | 'firstWord' : (1, 'scalar_number', 0, 0), 182 | 'lastWord' : (1, 'scalar_number', 0, 0), 183 | 'rootID' : (1, 'scalar_number', 0, 0), 184 | 'stemID' : (1, 'scalar_number', 0, 0), 185 | 'type' : (1, 'scalar_text', 0, 0), 186 | 187 | 'info' : (0, 'number', 1, 0), 188 | 189 | 'info.word' : (0, 'number', 1, 1), 190 | 'info.word.ocrText' : (1, 'text', 0, 0), 191 | 'info.word.firstGlyph' : (1, 'raw', 0, 0), 192 | 'info.word.lastGlyph' : (1, 'raw', 0, 0), 193 | 'info.word.bl' : (1, 'raw', 0, 0), 194 | 'info.word.link_id' : (1, 'number', 0, 0), 195 | 196 | 'glyph' : (0, 'number', 1, 1), 197 | 'glyph.x' : (1, 'number', 0, 0), 198 | 'glyph.y' : (1, 'number', 0, 0), 199 | 'glyph.glyphID' : (1, 'number', 0, 0), 200 | 201 | 'dehyphen' : (0, 'number', 1, 1), 202 | 'dehyphen.rootID' : (1, 'number', 0, 0), 203 | 'dehyphen.stemID' : (1, 'number', 0, 0), 204 | 'dehyphen.stemPage' : (1, 'number', 0, 0), 205 | 'dehyphen.sh' : (1, 'number', 0, 0), 206 | 207 | 'links' : (0, 'number', 1, 1), 208 | 'links.page' : (1, 'number', 0, 0), 209 | 'links.rel' : (1, 'number', 0, 0), 210 | 'links.row' : (1, 'number', 0, 0), 211 | 'links.title' : (1, 'text', 0, 0), 212 | 'links.href' : (1, 'text', 0, 0), 213 | 'links.type' : (1, 'text', 0, 0), 214 | 215 | 'paraCont' : (0, 'number', 1, 1), 216 | 'paraCont.rootID' : (1, 'number', 0, 0), 217 | 'paraCont.stemID' : (1, 'number', 0, 0), 218 | 'paraCont.stemPage' : (1, 'number', 0, 0), 219 | 220 | 'paraStems' : (0, 'number', 1, 1), 221 | 'paraStems.stemID' : (1, 'number', 0, 0), 222 | 223 | 'wordStems' : (0, 'number', 1, 1), 224 | 'wordStems.stemID' : (1, 'number', 0, 0), 225 | 226 | 'empty' : (1, 'snippets', 1, 0), 227 | 228 | 'page' : (1, 'snippets', 1, 0), 229 | 'page.pageid' : (1, 'scalar_text', 0, 0), 230 | 'page.pagelabel' : (1, 'scalar_text', 0, 0), 231 | 'page.type' : (1, 'scalar_text', 0, 0), 232 | 'page.h' : (1, 'scalar_number', 0, 0), 233 | 'page.w' : (1, 'scalar_number', 0, 0), 234 | 'page.startID' : (1, 'scalar_number', 0, 0), 235 | 236 | 'group' : (1, 'snippets', 1, 0), 237 | 'group.type' : (1, 'scalar_text', 0, 0), 238 | 239 | 'region' : (1, 'snippets', 1, 0), 240 | 'region.type' : (1, 'scalar_text', 0, 0), 241 | 'region.x' : (1, 'scalar_number', 0, 0), 242 | 'region.y' : (1, 'scalar_number', 0, 0), 243 | 'region.h' : (1, 'scalar_number', 0, 0), 244 | 'region.w' : (1, 'scalar_number', 0, 0), 245 | 246 | 'empty_text_region' : (1, 'snippets', 1, 0), 247 | 248 | 'img' : (1, 'snippets', 1, 0), 249 | 'img.x' : (1, 'scalar_number', 0, 0), 250 | 'img.y' : (1, 'scalar_number', 0, 0), 251 | 'img.h' : (1, 'scalar_number', 0, 0), 252 | 'img.w' : (1, 'scalar_number', 0, 0), 253 | 'img.src' : (1, 'scalar_number', 0, 0), 254 | 'img.color_src' : (1, 'scalar_number', 0, 0), 255 | 256 | 'paragraph' : (1, 'snippets', 1, 0), 257 | 'paragraph.class' : (1, 'scalar_text', 0, 0), 258 | 'paragraph.firstWord' : (1, 'scalar_number', 0, 0), 259 | 'paragraph.lastWord' : (1, 'scalar_number', 0, 0), 260 | 261 | 'word_semantic' : (1, 'snippets', 1, 1), 262 | 'word_semantic.type' : (1, 'scalar_text', 0, 0), 263 | 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0), 264 | 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0), 265 | 266 | 'word' : (1, 'snippets', 1, 0), 267 | 'word.type' : (1, 'scalar_text', 0, 0), 268 | 'word.class' : (1, 'scalar_text', 0, 0), 269 | 'word.firstGlyph' : (1, 'scalar_number', 0, 0), 270 | 'word.lastGlyph' : (1, 'scalar_number', 0, 0), 271 | 272 | '_span' : (1, 'snippets', 1, 0), 273 | '_span.firstWord' : (1, 'scalar_number', 0, 0), 274 | '-span.lastWord' : (1, 'scalar_number', 0, 0), 275 | 276 | 'span' : (1, 'snippets', 1, 0), 277 | 'span.firstWord' : (1, 'scalar_number', 0, 0), 278 | 'span.lastWord' : (1, 'scalar_number', 0, 0), 279 | 280 | 'extratokens' : (1, 'snippets', 1, 0), 281 | 'extratokens.type' : (1, 'scalar_text', 0, 0), 282 | 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0), 283 | 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0), 284 | 285 | 'glyph.h' : (1, 'number', 0, 0), 286 | 'glyph.w' : (1, 'number', 0, 0), 287 | 'glyph.use' : (1, 'number', 0, 0), 288 | 'glyph.vtx' : (1, 'number', 0, 1), 289 | 'glyph.len' : (1, 'number', 0, 1), 290 | 'glyph.dpi' : (1, 'number', 0, 0), 291 | 'vtx' : (0, 'number', 1, 1), 292 | 'vtx.x' : (1, 'number', 0, 0), 293 | 'vtx.y' : (1, 'number', 0, 0), 294 | 'len' : (0, 'number', 1, 1), 295 | 'len.n' : (1, 'number', 0, 0), 296 | 297 | 'book' : (1, 'snippets', 1, 0), 298 | 'version' : (1, 'snippets', 1, 0), 299 | 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0), 300 | 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0), 301 | 'version.Schema_id' : (1, 'scalar_text', 0, 0), 302 | 'version.Schema_version' : (1, 'scalar_text', 0, 0), 303 | 'version.Topaz_version' : (1, 'scalar_text', 0, 0), 304 | 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0), 305 | 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0), 306 | 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0), 307 | 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0), 308 | 'version.chapterheaders' : (1, 'scalar_text', 0, 0), 309 | 'version.creation_date' : (1, 'scalar_text', 0, 0), 310 | 'version.header_footer' : (1, 'scalar_text', 0, 0), 311 | 'version.init_from_ocr' : (1, 'scalar_text', 0, 0), 312 | 'version.letter_insertion' : (1, 'scalar_text', 0, 0), 313 | 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0), 314 | 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0), 315 | 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0), 316 | 'version.findlists' : (1, 'scalar_text', 0, 0), 317 | 'version.page_num' : (1, 'scalar_text', 0, 0), 318 | 'version.page_type' : (1, 'scalar_text', 0, 0), 319 | 'version.bad_text' : (1, 'scalar_text', 0, 0), 320 | 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0), 321 | 'version.margins' : (1, 'scalar_text', 0, 0), 322 | 'version.staggered_lines' : (1, 'scalar_text', 0, 0), 323 | 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0), 324 | 'version.toc' : (1, 'scalar_text', 0, 0), 325 | 326 | 'stylesheet' : (1, 'snippets', 1, 0), 327 | 'style' : (1, 'snippets', 1, 0), 328 | 'style._tag' : (1, 'scalar_text', 0, 0), 329 | 'style.type' : (1, 'scalar_text', 0, 0), 330 | 'style._parent_type' : (1, 'scalar_text', 0, 0), 331 | 'style.class' : (1, 'scalar_text', 0, 0), 332 | 'style._after_class' : (1, 'scalar_text', 0, 0), 333 | 'rule' : (1, 'snippets', 1, 0), 334 | 'rule.attr' : (1, 'scalar_text', 0, 0), 335 | 'rule.value' : (1, 'scalar_text', 0, 0), 336 | 337 | 'original' : (0, 'number', 1, 1), 338 | 'original.pnum' : (1, 'number', 0, 0), 339 | 'original.pid' : (1, 'text', 0, 0), 340 | 'pages' : (0, 'number', 1, 1), 341 | 'pages.ref' : (1, 'number', 0, 0), 342 | 'pages.id' : (1, 'number', 0, 0), 343 | 'startID' : (0, 'number', 1, 1), 344 | 'startID.page' : (1, 'number', 0, 0), 345 | 'startID.id' : (1, 'number', 0, 0), 346 | 347 | } 348 | 349 | 350 | # full tag path record keeping routines 351 | def tag_push(self, token): 352 | self.tagpath.append(token) 353 | def tag_pop(self): 354 | if len(self.tagpath) > 0 : 355 | self.tagpath.pop() 356 | def tagpath_len(self): 357 | return len(self.tagpath) 358 | def get_tagpath(self, i): 359 | cnt = len(self.tagpath) 360 | if i < cnt : result = self.tagpath[i] 361 | for j in xrange(i+1, cnt) : 362 | result += '.' + self.tagpath[j] 363 | return result 364 | 365 | 366 | # list of absolute command byte values values that indicate 367 | # various types of loop meachanisms typically used to generate vectors 368 | 369 | cmd_list = (0x76, 0x76) 370 | 371 | # peek at and return 1 byte that is ahead by i bytes 372 | def peek(self, aheadi): 373 | c = self.fo.read(aheadi) 374 | if (len(c) == 0): 375 | return None 376 | self.fo.seek(-aheadi,1) 377 | c = c[-1:] 378 | return ord(c) 379 | 380 | 381 | # get the next value from the file being processed 382 | def getNext(self): 383 | nbyte = self.peek(1); 384 | if (nbyte == None): 385 | return None 386 | val = readEncodedNumber(self.fo) 387 | return val 388 | 389 | 390 | # format an arg by argtype 391 | def formatArg(self, arg, argtype): 392 | if (argtype == 'text') or (argtype == 'scalar_text') : 393 | result = self.dict.lookup(arg) 394 | elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') : 395 | result = arg 396 | elif (argtype == 'snippets') : 397 | result = arg 398 | else : 399 | print "Error Unknown argtype %s" % argtype 400 | sys.exit(-2) 401 | return result 402 | 403 | 404 | # process the next tag token, recursively handling subtags, 405 | # arguments, and commands 406 | def procToken(self, token): 407 | 408 | known_token = False 409 | self.tag_push(token) 410 | 411 | if self.debug : print 'Processing: ', self.get_tagpath(0) 412 | cnt = self.tagpath_len() 413 | for j in xrange(cnt): 414 | tkn = self.get_tagpath(j) 415 | if tkn in self.token_tags : 416 | num_args = self.token_tags[tkn][0] 417 | argtype = self.token_tags[tkn][1] 418 | subtags = self.token_tags[tkn][2] 419 | splcase = self.token_tags[tkn][3] 420 | ntags = -1 421 | known_token = True 422 | break 423 | 424 | if known_token : 425 | 426 | # handle subtags if present 427 | subtagres = [] 428 | if (splcase == 1): 429 | # this type of tag uses of escape marker 0x74 indicate subtag count 430 | if self.peek(1) == 0x74: 431 | skip = readEncodedNumber(self.fo) 432 | subtags = 1 433 | num_args = 0 434 | 435 | if (subtags == 1): 436 | ntags = readEncodedNumber(self.fo) 437 | if self.debug : print 'subtags: ' + token + ' has ' + str(ntags) 438 | for j in xrange(ntags): 439 | val = readEncodedNumber(self.fo) 440 | subtagres.append(self.procToken(self.dict.lookup(val))) 441 | 442 | # arguments can be scalars or vectors of text or numbers 443 | argres = [] 444 | if num_args > 0 : 445 | firstarg = self.peek(1) 446 | if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'): 447 | # single argument is a variable length vector of data 448 | arg = readEncodedNumber(self.fo) 449 | argres = self.decodeCMD(arg,argtype) 450 | else : 451 | # num_arg scalar arguments 452 | for i in xrange(num_args): 453 | argres.append(self.formatArg(readEncodedNumber(self.fo), argtype)) 454 | 455 | # build the return tag 456 | result = [] 457 | tkn = self.get_tagpath(0) 458 | result.append(tkn) 459 | result.append(subtagres) 460 | result.append(argtype) 461 | result.append(argres) 462 | self.tag_pop() 463 | return result 464 | 465 | # all tokens that need to be processed should be in the hash 466 | # table if it may indicate a problem, either new token 467 | # or an out of sync condition 468 | else: 469 | result = [] 470 | if (self.debug): 471 | print 'Unknown Token:', token 472 | self.tag_pop() 473 | return result 474 | 475 | 476 | # special loop used to process code snippets 477 | # it is NEVER used to format arguments. 478 | # builds the snippetList 479 | def doLoop72(self, argtype): 480 | cnt = readEncodedNumber(self.fo) 481 | if self.debug : 482 | result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n' 483 | result += 'of the document is indicated by snippet number sets at the\n' 484 | result += 'end of each snippet. \n' 485 | print result 486 | for i in xrange(cnt): 487 | if self.debug: print 'Snippet:',str(i) 488 | snippet = [] 489 | snippet.append(i) 490 | val = readEncodedNumber(self.fo) 491 | snippet.append(self.procToken(self.dict.lookup(val))) 492 | self.snippetList.append(snippet) 493 | return 494 | 495 | 496 | 497 | # general loop code gracisouly submitted by "skindle" - thank you! 498 | def doLoop76Mode(self, argtype, cnt, mode): 499 | result = [] 500 | adj = 0 501 | if mode & 1: 502 | adj = readEncodedNumber(self.fo) 503 | mode = mode >> 1 504 | x = [] 505 | for i in xrange(cnt): 506 | x.append(readEncodedNumber(self.fo) - adj) 507 | for i in xrange(mode): 508 | for j in xrange(1, cnt): 509 | x[j] = x[j] + x[j - 1] 510 | for i in xrange(cnt): 511 | result.append(self.formatArg(x[i],argtype)) 512 | return result 513 | 514 | 515 | # dispatches loop commands bytes with various modes 516 | # The 0x76 style loops are used to build vectors 517 | 518 | # This was all derived by trial and error and 519 | # new loop types may exist that are not handled here 520 | # since they did not appear in the test cases 521 | 522 | def decodeCMD(self, cmd, argtype): 523 | if (cmd == 0x76): 524 | 525 | # loop with cnt, and mode to control loop styles 526 | cnt = readEncodedNumber(self.fo) 527 | mode = readEncodedNumber(self.fo) 528 | 529 | if self.debug : print 'Loop for', cnt, 'with mode', mode, ': ' 530 | return self.doLoop76Mode(argtype, cnt, mode) 531 | 532 | if self.dbug: print "Unknown command", cmd 533 | result = [] 534 | return result 535 | 536 | 537 | 538 | # add full tag path to injected snippets 539 | def updateName(self, tag, prefix): 540 | name = tag[0] 541 | subtagList = tag[1] 542 | argtype = tag[2] 543 | argList = tag[3] 544 | nname = prefix + '.' + name 545 | nsubtaglist = [] 546 | for j in subtagList: 547 | nsubtaglist.append(self.updateName(j,prefix)) 548 | ntag = [] 549 | ntag.append(nname) 550 | ntag.append(nsubtaglist) 551 | ntag.append(argtype) 552 | ntag.append(argList) 553 | return ntag 554 | 555 | 556 | 557 | # perform depth first injection of specified snippets into this one 558 | def injectSnippets(self, snippet): 559 | snipno, tag = snippet 560 | name = tag[0] 561 | subtagList = tag[1] 562 | argtype = tag[2] 563 | argList = tag[3] 564 | nsubtagList = [] 565 | if len(argList) > 0 : 566 | for j in argList: 567 | asnip = self.snippetList[j] 568 | aso, atag = self.injectSnippets(asnip) 569 | atag = self.updateName(atag, name) 570 | nsubtagList.append(atag) 571 | argtype='number' 572 | argList=[] 573 | if len(nsubtagList) > 0 : 574 | subtagList.extend(nsubtagList) 575 | tag = [] 576 | tag.append(name) 577 | tag.append(subtagList) 578 | tag.append(argtype) 579 | tag.append(argList) 580 | snippet = [] 581 | snippet.append(snipno) 582 | snippet.append(tag) 583 | return snippet 584 | 585 | 586 | 587 | # format the tag for output 588 | def formatTag(self, node): 589 | name = node[0] 590 | subtagList = node[1] 591 | argtype = node[2] 592 | argList = node[3] 593 | fullpathname = name.split('.') 594 | nodename = fullpathname.pop() 595 | ilvl = len(fullpathname) 596 | indent = ' ' * (3 * ilvl) 597 | result = indent + '<' + nodename + '>' 598 | if len(argList) > 0: 599 | argres = '' 600 | for j in argList: 601 | if (argtype == 'text') or (argtype == 'scalar_text') : 602 | argres += j + '|' 603 | else : 604 | argres += str(j) + ',' 605 | argres = argres[0:-1] 606 | if argtype == 'snippets' : 607 | result += 'snippets:' + argres 608 | else : 609 | result += argres 610 | if len(subtagList) > 0 : 611 | result += '\n' 612 | for j in subtagList: 613 | if len(j) > 0 : 614 | result += self.formatTag(j) 615 | result += indent + '\n' 616 | else: 617 | result += '\n' 618 | return result 619 | 620 | 621 | # flatten tag 622 | def flattenTag(self, node): 623 | name = node[0] 624 | subtagList = node[1] 625 | argtype = node[2] 626 | argList = node[3] 627 | result = name 628 | if (len(argList) > 0): 629 | argres = '' 630 | for j in argList: 631 | if (argtype == 'text') or (argtype == 'scalar_text') : 632 | argres += j + '|' 633 | else : 634 | argres += str(j) + '|' 635 | argres = argres[0:-1] 636 | if argtype == 'snippets' : 637 | result += '.snippets=' + argres 638 | else : 639 | result += '=' + argres 640 | result += '\n' 641 | for j in subtagList: 642 | if len(j) > 0 : 643 | result += self.flattenTag(j) 644 | return result 645 | 646 | 647 | # reduce create xml output 648 | def formatDoc(self, flat_xml): 649 | result = '' 650 | for j in self.doc : 651 | if len(j) > 0: 652 | if flat_xml: 653 | result += self.flattenTag(j) 654 | else: 655 | result += self.formatTag(j) 656 | if self.debug : print result 657 | return result 658 | 659 | 660 | 661 | # main loop - parse the page.dat files 662 | # to create structured document and snippets 663 | 664 | # FIXME: value at end of magic appears to be a subtags count 665 | # but for what? For now, inject an 'info" tag as it is in 666 | # every dictionary and seems close to what is meant 667 | # The alternative is to special case the last _ "0x5f" to mean something 668 | 669 | def process(self): 670 | 671 | # peek at the first bytes to see what type of file it is 672 | magic = self.fo.read(9) 673 | if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'): 674 | first_token = 'info' 675 | elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'): 676 | skip = self.fo.read(2) 677 | first_token = 'info' 678 | elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'): 679 | first_token = 'info' 680 | elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'): 681 | skip = self.fo.read(3) 682 | first_token = 'info' 683 | else : 684 | # other0.dat file 685 | first_token = None 686 | self.fo.seek(-9,1) 687 | 688 | 689 | # main loop to read and build the document tree 690 | while True: 691 | 692 | if first_token != None : 693 | # use "inserted" first token 'info' for page and glyph files 694 | tag = self.procToken(first_token) 695 | if len(tag) > 0 : 696 | self.doc.append(tag) 697 | first_token = None 698 | 699 | v = self.getNext() 700 | if (v == None): 701 | break 702 | 703 | if (v == 0x72): 704 | self.doLoop72('number') 705 | elif (v > 0) and (v < self.dict.getSize()) : 706 | tag = self.procToken(self.dict.lookup(v)) 707 | if len(tag) > 0 : 708 | self.doc.append(tag) 709 | else: 710 | if self.debug: 711 | print "Main Loop: Unknown value: %x" % v 712 | if (v == 0): 713 | if (self.peek(1) == 0x5f): 714 | skip = self.fo.read(1) 715 | first_token = 'info' 716 | 717 | # now do snippet injection 718 | if len(self.snippetList) > 0 : 719 | if self.debug : print 'Injecting Snippets:' 720 | snippet = self.injectSnippets(self.snippetList[0]) 721 | snipno = snippet[0] 722 | tag_add = snippet[1] 723 | if self.debug : print self.formatTag(tag_add) 724 | if len(tag_add) > 0: 725 | self.doc.append(tag_add) 726 | 727 | # handle generation of xml output 728 | xmlpage = self.formatDoc(self.flat_xml) 729 | 730 | return xmlpage 731 | 732 | 733 | 734 | def usage(): 735 | print 'Usage: ' 736 | print ' convert2xml.py dict0000.dat infile.dat ' 737 | print ' ' 738 | print ' Options:' 739 | print ' -h print this usage help message ' 740 | print ' -d turn on debug output to check for potential errors ' 741 | print ' --flat-xml output the flattened xml page description only ' 742 | print ' ' 743 | print ' This program will attempt to convert a page*.dat file or ' 744 | print ' glyphs*.dat file, using the dict0000.dat file, to its xml description. ' 745 | print ' ' 746 | print ' Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump ' 747 | print ' the *.dat files from a Topaz format e-book.' 748 | 749 | # 750 | # Main 751 | # 752 | 753 | def main(argv): 754 | dictFile = "" 755 | pageFile = "" 756 | debug = False 757 | flat_xml = False 758 | printOutput = False 759 | if len(argv) == 0: 760 | printOutput = True 761 | argv = sys.argv 762 | 763 | try: 764 | opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"]) 765 | 766 | except getopt.GetoptError, err: 767 | 768 | # print help information and exit: 769 | print str(err) # will print something like "option -a not recognized" 770 | usage() 771 | sys.exit(2) 772 | 773 | if len(opts) == 0 and len(args) == 0 : 774 | usage() 775 | sys.exit(2) 776 | 777 | for o, a in opts: 778 | if o =="-d": 779 | debug=True 780 | if o =="-h": 781 | usage() 782 | sys.exit(0) 783 | if o =="--flat-xml": 784 | flat_xml = True 785 | 786 | dictFile, pageFile = args[0], args[1] 787 | 788 | # read in the string table dictionary 789 | dict = Dictionary(dictFile) 790 | # dict.dumpDict() 791 | 792 | # create a page parser 793 | pp = PageParser(pageFile, dict, debug, flat_xml) 794 | 795 | xmlpage = pp.process() 796 | 797 | if printOutput: 798 | print xmlpage 799 | return 0 800 | 801 | return xmlpage 802 | 803 | if __name__ == '__main__': 804 | sys.exit(main('')) -------------------------------------------------------------------------------- /topaz/decode_meta.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | import csv 6 | import sys 7 | import os 8 | import getopt 9 | from struct import pack 10 | from struct import unpack 11 | 12 | # 13 | # Get a 7 bit encoded number from string 14 | # 15 | 16 | def readEncodedNumber(file): 17 | flag = False 18 | c = file.read(1) 19 | if (len(c) == 0): 20 | return None 21 | data = ord(c) 22 | 23 | if data == 0xFF: 24 | flag = True 25 | c = file.read(1) 26 | if (len(c) == 0): 27 | return None 28 | data = ord(c) 29 | 30 | if data >= 0x80: 31 | datax = (data & 0x7F) 32 | while data >= 0x80 : 33 | c = file.read(1) 34 | if (len(c) == 0): 35 | return None 36 | data = ord(c) 37 | datax = (datax <<7) + (data & 0x7F) 38 | data = datax 39 | 40 | if flag: 41 | data = -data 42 | return data 43 | 44 | # 45 | # Encode a number in 7 bit format 46 | # 47 | 48 | def encodeNumber(number): 49 | result = "" 50 | negative = False 51 | flag = 0 52 | 53 | if number < 0 : 54 | number = -number + 1 55 | negative = True 56 | 57 | while True: 58 | byte = number & 0x7F 59 | number = number >> 7 60 | byte += flag 61 | result += chr(byte) 62 | flag = 0x80 63 | if number == 0 : 64 | if (byte == 0xFF and negative == False) : 65 | result += chr(0x80) 66 | break 67 | 68 | if negative: 69 | result += chr(0xFF) 70 | 71 | return result[::-1] 72 | 73 | # 74 | # Get a length prefixed string from the file 75 | # 76 | def lengthPrefixString(data): 77 | return encodeNumber(len(data))+data 78 | 79 | def readString(file): 80 | stringLength = readEncodedNumber(file) 81 | if (stringLength == None): 82 | return None 83 | sv = file.read(stringLength) 84 | if (len(sv) != stringLength): 85 | return "" 86 | return unpack(str(stringLength)+"s",sv)[0] 87 | 88 | 89 | 90 | def getMetaArray(metaFile): 91 | # parse the meta file into a Python dictionary (associative array) 92 | result = {} 93 | fo = file(metaFile,'rb') 94 | size = readEncodedNumber(fo) 95 | for i in xrange(size): 96 | temp = readString(fo) 97 | result[temp] = readString(fo) 98 | fo.close() 99 | return result 100 | 101 | 102 | 103 | def getMetaData(metaFile): 104 | # parse the meta file 105 | result = '' 106 | fo = file(metaFile,'rb') 107 | size = readEncodedNumber(fo) 108 | for i in xrange(size): 109 | result += readString(fo) + '|' 110 | result += readString(fo) + '\n' 111 | fo.close() 112 | return result 113 | -------------------------------------------------------------------------------- /topaz/flatxml2html.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | import sys 6 | import csv 7 | import os 8 | import math 9 | import getopt 10 | from struct import pack 11 | from struct import unpack 12 | 13 | 14 | class DocParser(object): 15 | def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage): 16 | self.id = os.path.basename(fileid).replace('.dat','') 17 | self.svgcount = 0 18 | self.docList = flatxml.split('\n') 19 | self.docSize = len(self.docList) 20 | self.classList = {} 21 | self.bookDir = bookDir 22 | self.glyphPaths = { } 23 | self.numPaths = 0 24 | tmpList = classlst.split('\n') 25 | for pclass in tmpList: 26 | if pclass != '': 27 | # remove the leading period from the css name 28 | cname = pclass[1:] 29 | self.classList[cname] = True 30 | self.fixedimage = fixedimage 31 | self.ocrtext = [] 32 | self.link_id = [] 33 | self.link_title = [] 34 | self.link_page = [] 35 | self.link_href = [] 36 | self.link_type = [] 37 | self.dehyphen_rootid = [] 38 | self.paracont_stemid = [] 39 | self.parastems_stemid = [] 40 | 41 | 42 | def getGlyph(self, gid): 43 | result = '' 44 | id='gl%d' % gid 45 | return self.glyphPaths[id] 46 | 47 | 48 | def glyphs_to_image(self, glyphList): 49 | 50 | def extract(path, key): 51 | b = path.find(key) + len(key) 52 | e = path.find(' ',b) 53 | return int(path[b:e]) 54 | 55 | def extractID(path, key): 56 | b = path.find(key) + len(key) 57 | e = path.find('"',b) 58 | return path[b:e] 59 | 60 | 61 | svgDir = os.path.join(self.bookDir,'svg') 62 | glyfile = os.path.join(svgDir,'glyphs.svg') 63 | 64 | imgDir = os.path.join(self.bookDir,'img') 65 | imgname = self.id + '_%04d.svg' % self.svgcount 66 | imgfile = os.path.join(imgDir,imgname) 67 | 68 | # build hashtable of glyph paths keyed by glyph id 69 | if self.numPaths == 0: 70 | gfile = open(glyfile, 'r') 71 | while True: 72 | path = gfile.readline() 73 | if (path == ''): break 74 | glyphid = extractID(path,'id="') 75 | self.glyphPaths[glyphid] = path 76 | self.numPaths += 1 77 | gfile.close() 78 | 79 | 80 | # get glyph information 81 | gxList = self.getData('info.glyph.x',0,-1) 82 | gyList = self.getData('info.glyph.y',0,-1) 83 | gidList = self.getData('info.glyph.glyphID',0,-1) 84 | 85 | gids = [] 86 | maxws = [] 87 | maxhs = [] 88 | xs = [] 89 | ys = [] 90 | gdefs = [] 91 | 92 | # get path defintions, positions, dimensions for ecah glyph 93 | # that makes up the image, and find min x and min y to reposition origin 94 | minx = -1 95 | miny = -1 96 | for j in glyphList: 97 | gid = gidList[j] 98 | gids.append(gid) 99 | 100 | xs.append(gxList[j]) 101 | if minx == -1: minx = gxList[j] 102 | else : minx = min(minx, gxList[j]) 103 | 104 | ys.append(gyList[j]) 105 | if miny == -1: miny = gyList[j] 106 | else : miny = min(miny, gyList[j]) 107 | 108 | path = self.getGlyph(gid) 109 | gdefs.append(path) 110 | 111 | maxws.append(extract(path,'width=')) 112 | maxhs.append(extract(path,'height=')) 113 | 114 | 115 | # change the origin to minx, miny and calc max height and width 116 | maxw = maxws[0] + xs[0] - minx 117 | maxh = maxhs[0] + ys[0] - miny 118 | for j in xrange(0, len(xs)): 119 | xs[j] = xs[j] - minx 120 | ys[j] = ys[j] - miny 121 | maxw = max( maxw, (maxws[j] + xs[j]) ) 122 | maxh = max( maxh, (maxhs[j] + ys[j]) ) 123 | 124 | # open the image file for output 125 | ifile = open(imgfile,'w') 126 | ifile.write('\n') 127 | ifile.write('\n') 128 | ifile.write('\n' % (math.floor(maxw/10), math.floor(maxh/10), maxw, maxh)) 129 | ifile.write('\n') 130 | for j in xrange(0,len(gdefs)): 131 | ifile.write(gdefs[j]) 132 | ifile.write('\n') 133 | for j in xrange(0,len(gids)): 134 | ifile.write('\n' % (gids[j], xs[j], ys[j])) 135 | ifile.write('') 136 | ifile.close() 137 | 138 | return 0 139 | 140 | 141 | 142 | # return tag at line pos in document 143 | def lineinDoc(self, pos) : 144 | if (pos >= 0) and (pos < self.docSize) : 145 | item = self.docList[pos] 146 | if item.find('=') >= 0: 147 | (name, argres) = item.split('=',1) 148 | else : 149 | name = item 150 | argres = '' 151 | return name, argres 152 | 153 | 154 | # find tag in doc if within pos to end inclusive 155 | def findinDoc(self, tagpath, pos, end) : 156 | result = None 157 | if end == -1 : 158 | end = self.docSize 159 | else: 160 | end = min(self.docSize, end) 161 | foundat = -1 162 | for j in xrange(pos, end): 163 | item = self.docList[j] 164 | if item.find('=') >= 0: 165 | (name, argres) = item.split('=',1) 166 | else : 167 | name = item 168 | argres = '' 169 | if name.endswith(tagpath) : 170 | result = argres 171 | foundat = j 172 | break 173 | return foundat, result 174 | 175 | 176 | # return list of start positions for the tagpath 177 | def posinDoc(self, tagpath): 178 | startpos = [] 179 | pos = 0 180 | res = "" 181 | while res != None : 182 | (foundpos, res) = self.findinDoc(tagpath, pos, -1) 183 | if res != None : 184 | startpos.append(foundpos) 185 | pos = foundpos + 1 186 | return startpos 187 | 188 | 189 | # returns a vector of integers for the tagpath 190 | def getData(self, tagpath, pos, end): 191 | argres=[] 192 | (foundat, argt) = self.findinDoc(tagpath, pos, end) 193 | if (argt != None) and (len(argt) > 0) : 194 | argList = argt.split('|') 195 | argres = [ int(strval) for strval in argList] 196 | return argres 197 | 198 | 199 | # get the class 200 | def getClass(self, pclass): 201 | nclass = pclass 202 | 203 | # class names are an issue given topaz may start them with numerals (not allowed), 204 | # use a mix of cases (which cause some browsers problems), and actually 205 | # attach numbers after "_reclustered*" to the end to deal classeses that inherit 206 | # from a base class (but then not actually provide all of these _reclustereed 207 | # classes in the stylesheet! 208 | 209 | # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass 210 | # that exists in the stylesheet first, and then adding this specific class 211 | # after 212 | 213 | # also some class names have spaces in them so need to convert to dashes 214 | if nclass != None : 215 | nclass = nclass.replace(' ','-') 216 | classres = '' 217 | nclass = nclass.lower() 218 | nclass = 'cl-' + nclass 219 | baseclass = '' 220 | # graphic is the base class for captions 221 | if nclass.find('cl-cap-') >=0 : 222 | classres = 'graphic' + ' ' 223 | else : 224 | # strip to find baseclass 225 | p = nclass.find('_') 226 | if p > 0 : 227 | baseclass = nclass[0:p] 228 | if baseclass in self.classList: 229 | classres += baseclass + ' ' 230 | classres += nclass 231 | nclass = classres 232 | return nclass 233 | 234 | 235 | # develop a sorted description of the starting positions of 236 | # groups and regions on the page, as well as the page type 237 | def PageDescription(self): 238 | 239 | def compare(x, y): 240 | (xtype, xval) = x 241 | (ytype, yval) = y 242 | if xval > yval: 243 | return 1 244 | if xval == yval: 245 | return 0 246 | return -1 247 | 248 | result = [] 249 | (pos, pagetype) = self.findinDoc('page.type',0,-1) 250 | 251 | groupList = self.posinDoc('page.group') 252 | groupregionList = self.posinDoc('page.group.region') 253 | pageregionList = self.posinDoc('page.region') 254 | # integrate into one list 255 | for j in groupList: 256 | result.append(('grpbeg',j)) 257 | for j in groupregionList: 258 | result.append(('gregion',j)) 259 | for j in pageregionList: 260 | result.append(('pregion',j)) 261 | result.sort(compare) 262 | 263 | # insert group end and page end indicators 264 | inGroup = False 265 | j = 0 266 | while True: 267 | if j == len(result): break 268 | rtype = result[j][0] 269 | rval = result[j][1] 270 | if not inGroup and (rtype == 'grpbeg') : 271 | inGroup = True 272 | j = j + 1 273 | elif inGroup and (rtype in ('grpbeg', 'pregion')): 274 | result.insert(j,('grpend',rval)) 275 | inGroup = False 276 | else: 277 | j = j + 1 278 | if inGroup: 279 | result.append(('grpend',-1)) 280 | result.append(('pageend', -1)) 281 | return pagetype, result 282 | 283 | 284 | 285 | # build a description of the paragraph 286 | def getParaDescription(self, start, end, regtype): 287 | 288 | result = [] 289 | 290 | # paragraph 291 | (pos, pclass) = self.findinDoc('paragraph.class',start,end) 292 | 293 | pclass = self.getClass(pclass) 294 | 295 | # build up a description of the paragraph in result and return it 296 | # first check for the basic - all words paragraph 297 | (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end) 298 | (pos, slast) = self.findinDoc('paragraph.lastWord',start,end) 299 | if (sfirst != None) and (slast != None) : 300 | first = int(sfirst) 301 | last = int(slast) 302 | 303 | makeImage = (regtype == 'vertical') or (regtype == 'table') 304 | if self.fixedimage: 305 | makeImage = makeImage or (regtype == 'fixed') 306 | 307 | if (pclass != None): 308 | makeImage = makeImage or (pclass.find('.inverted') >= 0) 309 | if self.fixedimage : 310 | makeImage = makeImage or (pclass.find('cl-f-') >= 0) 311 | 312 | if not makeImage : 313 | # standard all word paragraph 314 | for wordnum in xrange(first, last): 315 | result.append(('ocr', wordnum)) 316 | return pclass, result 317 | 318 | # convert paragraph to svg image 319 | # translate first and last word into first and last glyphs 320 | # and generate inline image and include it 321 | glyphList = [] 322 | firstglyphList = self.getData('word.firstGlyph',0,-1) 323 | gidList = self.getData('info.glyph.glyphID',0,-1) 324 | firstGlyph = firstglyphList[first] 325 | if last < len(firstglyphList): 326 | lastGlyph = firstglyphList[last] 327 | else : 328 | lastGlyph = len(gidList) 329 | for glyphnum in xrange(firstGlyph, lastGlyph): 330 | glyphList.append(glyphnum) 331 | # include any extratokens if they exist 332 | (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end) 333 | (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end) 334 | if (sfg != None) and (slg != None): 335 | for glyphnum in xrange(int(sfg), int(slg)): 336 | glyphList.append(glyphnum) 337 | num = self.svgcount 338 | self.glyphs_to_image(glyphList) 339 | self.svgcount += 1 340 | result.append(('svg', num)) 341 | return pclass, result 342 | 343 | # this type of paragraph may be made up of multiple spans, inline 344 | # word monograms (images), and words with semantic meaning, 345 | # plus glyphs used to form starting letter of first word 346 | 347 | # need to parse this type line by line 348 | line = start + 1 349 | word_class = '' 350 | 351 | # if end is -1 then we must search to end of document 352 | if end == -1 : 353 | end = self.docSize 354 | 355 | # seems some xml has last* coming before first* so we have to 356 | # handle any order 357 | sp_first = -1 358 | sp_last = -1 359 | 360 | gl_first = -1 361 | gl_last = -1 362 | 363 | ws_first = -1 364 | ws_last = -1 365 | 366 | word_class = '' 367 | 368 | while (line < end) : 369 | 370 | (name, argres) = self.lineinDoc(line) 371 | 372 | if name.endswith('span.firstWord') : 373 | sp_first = int(argres) 374 | 375 | elif name.endswith('span.lastWord') : 376 | sp_last = int(argres) 377 | 378 | elif name.endswith('word.firstGlyph') : 379 | gl_first = int(argres) 380 | 381 | elif name.endswith('word.lastGlyph') : 382 | gl_last = int(argres) 383 | 384 | elif name.endswith('word_semantic.firstWord'): 385 | ws_first = int(argres) 386 | 387 | elif name.endswith('word_semantic.lastWord'): 388 | ws_last = int(argres) 389 | 390 | elif name.endswith('word.class'): 391 | (cname, space) = argres.split('-',1) 392 | if space == '' : space = '0' 393 | if (cname == 'spaceafter') and (int(space) > 0) : 394 | word_class = 'sa' 395 | 396 | elif name.endswith('word.img.src'): 397 | result.append(('img' + word_class, int(argres))) 398 | word_class = '' 399 | 400 | elif name.endswith('region.img.src'): 401 | result.append(('img' + word_class, int(argres))) 402 | 403 | if (sp_first != -1) and (sp_last != -1): 404 | for wordnum in xrange(sp_first, sp_last): 405 | result.append(('ocr', wordnum)) 406 | sp_first = -1 407 | sp_last = -1 408 | 409 | if (gl_first != -1) and (gl_last != -1): 410 | glyphList = [] 411 | for glyphnum in xrange(gl_first, gl_last): 412 | glyphList.append(glyphnum) 413 | num = self.svgcount 414 | self.glyphs_to_image(glyphList) 415 | self.svgcount += 1 416 | result.append(('svg', num)) 417 | gl_first = -1 418 | gl_last = -1 419 | 420 | if (ws_first != -1) and (ws_last != -1): 421 | for wordnum in xrange(ws_first, ws_last): 422 | result.append(('ocr', wordnum)) 423 | ws_first = -1 424 | ws_last = -1 425 | 426 | line += 1 427 | 428 | return pclass, result 429 | 430 | 431 | def buildParagraph(self, pclass, pdesc, type, regtype) : 432 | parares = '' 433 | sep ='' 434 | 435 | classres = '' 436 | if pclass : 437 | classres = ' class="' + pclass + '"' 438 | 439 | br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical') 440 | 441 | handle_links = len(self.link_id) > 0 442 | 443 | if (type == 'full') or (type == 'begin') : 444 | parares += '' 445 | 446 | if (type == 'end'): 447 | parares += ' ' 448 | 449 | lstart = len(parares) 450 | 451 | cnt = len(pdesc) 452 | 453 | for j in xrange( 0, cnt) : 454 | 455 | (wtype, num) = pdesc[j] 456 | 457 | if wtype == 'ocr' : 458 | word = self.ocrtext[num] 459 | sep = ' ' 460 | 461 | if handle_links: 462 | link = self.link_id[num] 463 | if (link > 0): 464 | linktype = self.link_type[link-1] 465 | title = self.link_title[link-1] 466 | if (title == "") or (parares.rfind(title) < 0): 467 | title=parares[lstart:] 468 | if linktype == 'external' : 469 | linkhref = self.link_href[link-1] 470 | linkhtml = '' % linkhref 471 | else : 472 | if len(self.link_page) >= link : 473 | ptarget = self.link_page[link-1] - 1 474 | linkhtml = '' % ptarget 475 | else : 476 | # just link to the current page 477 | linkhtml = '' 478 | linkhtml += title + '' 479 | pos = parares.rfind(title) 480 | if pos >= 0: 481 | parares = parares[0:pos] + linkhtml + parares[pos+len(title):] 482 | else : 483 | parares += linkhtml 484 | lstart = len(parares) 485 | if word == '_link_' : word = '' 486 | elif (link < 0) : 487 | if word == '_link_' : word = '' 488 | 489 | if word == '_lb_': 490 | if ((num-1) in self.dehyphen_rootid ) or handle_links: 491 | word = '' 492 | sep = '' 493 | elif br_lb : 494 | word = '
\n' 495 | sep = '' 496 | else : 497 | word = '\n' 498 | sep = '' 499 | 500 | if num in self.dehyphen_rootid : 501 | word = word[0:-1] 502 | sep = '' 503 | 504 | parares += word + sep 505 | 506 | elif wtype == 'img' : 507 | sep = '' 508 | parares += '' % num 509 | parares += sep 510 | 511 | elif wtype == 'imgsa' : 512 | sep = ' ' 513 | parares += '' % num 514 | parares += sep 515 | 516 | elif wtype == 'svg' : 517 | sep = '' 518 | parares += '' % num 519 | parares += sep 520 | 521 | if len(sep) > 0 : parares = parares[0:-1] 522 | if (type == 'full') or (type == 'end') : 523 | parares += '

' 524 | return parares 525 | 526 | 527 | 528 | # walk the document tree collecting the information needed 529 | # to build an html page using the ocrText 530 | 531 | def process(self): 532 | 533 | htmlpage = '' 534 | 535 | # get the ocr text 536 | (pos, argres) = self.findinDoc('info.word.ocrText',0,-1) 537 | if argres : self.ocrtext = argres.split('|') 538 | 539 | # get information to dehyphenate the text 540 | self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1) 541 | 542 | # determine if first paragraph is continued from previous page 543 | (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1) 544 | first_para_continued = (self.parastems_stemid != None) 545 | 546 | # determine if last paragraph is continued onto the next page 547 | (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1) 548 | last_para_continued = (self.paracont_stemid != None) 549 | 550 | # collect link ids 551 | self.link_id = self.getData('info.word.link_id',0,-1) 552 | 553 | # collect link destination page numbers 554 | self.link_page = self.getData('info.links.page',0,-1) 555 | 556 | # collect link types (container versus external) 557 | (pos, argres) = self.findinDoc('info.links.type',0,-1) 558 | if argres : self.link_type = argres.split('|') 559 | 560 | # collect link destinations 561 | (pos, argres) = self.findinDoc('info.links.href',0,-1) 562 | if argres : self.link_href = argres.split('|') 563 | 564 | # collect link titles 565 | (pos, argres) = self.findinDoc('info.links.title',0,-1) 566 | if argres : 567 | self.link_title = argres.split('|') 568 | else: 569 | self.link_title.append('') 570 | 571 | # get a descriptions of the starting points of the regions 572 | # and groups on the page 573 | (pagetype, pageDesc) = self.PageDescription() 574 | regcnt = len(pageDesc) - 1 575 | 576 | anchorSet = False 577 | breakSet = False 578 | inGroup = False 579 | 580 | # process each region on the page and convert what you can to html 581 | 582 | for j in xrange(regcnt): 583 | 584 | (etype, start) = pageDesc[j] 585 | (ntype, end) = pageDesc[j+1] 586 | 587 | 588 | # set anchor for link target on this page 589 | if not anchorSet and not first_para_continued: 590 | htmlpage += '\n' 592 | anchorSet = True 593 | 594 | # handle groups of graphics with text captions 595 | if (etype == 'grpbeg'): 596 | (pos, grptype) = self.findinDoc('group.type', start, end) 597 | if grptype != None: 598 | if grptype == 'graphic': 599 | gcstr = ' class="' + grptype + '"' 600 | htmlpage += '' 601 | inGroup = True 602 | 603 | elif (etype == 'grpend'): 604 | if inGroup: 605 | htmlpage += '\n' 606 | inGroup = False 607 | 608 | else: 609 | (pos, regtype) = self.findinDoc('region.type',start,end) 610 | 611 | if regtype == 'graphic' : 612 | (pos, simgsrc) = self.findinDoc('img.src',start,end) 613 | if simgsrc: 614 | if inGroup: 615 | htmlpage += '' % int(simgsrc) 616 | else: 617 | htmlpage += '
' % int(simgsrc) 618 | 619 | elif regtype == 'chapterheading' : 620 | (pclass, pdesc) = self.getParaDescription(start,end, regtype) 621 | if not breakSet: 622 | htmlpage += '
 
\n' 623 | breakSet = True 624 | tag = 'h1' 625 | if pclass and (len(pclass) >= 7): 626 | if pclass[3:7] == 'ch1-' : tag = 'h1' 627 | if pclass[3:7] == 'ch2-' : tag = 'h2' 628 | if pclass[3:7] == 'ch3-' : tag = 'h3' 629 | htmlpage += '<' + tag + ' class="' + pclass + '">' 630 | else: 631 | htmlpage += '<' + tag + '>' 632 | htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) 633 | htmlpage += '' 634 | 635 | elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'): 636 | ptype = 'full' 637 | # check to see if this is a continution from the previous page 638 | if first_para_continued : 639 | ptype = 'end' 640 | first_para_continued = False 641 | (pclass, pdesc) = self.getParaDescription(start,end, regtype) 642 | if pclass and (len(pclass) >= 6) and (ptype == 'full'): 643 | tag = 'p' 644 | if pclass[3:6] == 'h1-' : tag = 'h4' 645 | if pclass[3:6] == 'h2-' : tag = 'h5' 646 | if pclass[3:6] == 'h3-' : tag = 'h6' 647 | htmlpage += '<' + tag + ' class="' + pclass + '">' 648 | htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) 649 | htmlpage += '' 650 | else : 651 | htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) 652 | 653 | elif (regtype == 'tocentry') : 654 | ptype = 'full' 655 | if first_para_continued : 656 | ptype = 'end' 657 | first_para_continued = False 658 | (pclass, pdesc) = self.getParaDescription(start,end, regtype) 659 | htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) 660 | 661 | 662 | elif (regtype == 'vertical') or (regtype == 'table') : 663 | ptype = 'full' 664 | if inGroup: 665 | ptype = 'middle' 666 | if first_para_continued : 667 | ptype = 'end' 668 | first_para_continued = False 669 | (pclass, pdesc) = self.getParaDescription(start, end, regtype) 670 | htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) 671 | 672 | 673 | elif (regtype == 'synth_fcvr.center'): 674 | (pos, simgsrc) = self.findinDoc('img.src',start,end) 675 | if simgsrc: 676 | htmlpage += '
' % int(simgsrc) 677 | 678 | else : 679 | print ' Making region type', regtype, 680 | (pos, temp) = self.findinDoc('paragraph',start,end) 681 | (pos2, temp) = self.findinDoc('span',start,end) 682 | if pos != -1 or pos2 != -1: 683 | print ' a "text" region' 684 | orig_regtype = regtype 685 | regtype = 'fixed' 686 | ptype = 'full' 687 | # check to see if this is a continution from the previous page 688 | if first_para_continued : 689 | ptype = 'end' 690 | first_para_continued = False 691 | (pclass, pdesc) = self.getParaDescription(start,end, regtype) 692 | if not pclass: 693 | if orig_regtype.endswith('.right') : pclass = 'cl-right' 694 | elif orig_regtype.endswith('.center') : pclass = 'cl-center' 695 | elif orig_regtype.endswith('.left') : pclass = 'cl-left' 696 | elif orig_regtype.endswith('.justify') : pclass = 'cl-justify' 697 | if pclass and (ptype == 'full') and (len(pclass) >= 6): 698 | tag = 'p' 699 | if pclass[3:6] == 'h1-' : tag = 'h4' 700 | if pclass[3:6] == 'h2-' : tag = 'h5' 701 | if pclass[3:6] == 'h3-' : tag = 'h6' 702 | htmlpage += '<' + tag + ' class="' + pclass + '">' 703 | htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype) 704 | htmlpage += '' 705 | else : 706 | htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype) 707 | else : 708 | print ' a "graphic" region' 709 | (pos, simgsrc) = self.findinDoc('img.src',start,end) 710 | if simgsrc: 711 | htmlpage += '
' % int(simgsrc) 712 | 713 | 714 | if last_para_continued : 715 | if htmlpage[-4:] == '

': 716 | htmlpage = htmlpage[0:-4] 717 | last_para_continued = False 718 | 719 | return htmlpage 720 | 721 | 722 | 723 | def convert2HTML(flatxml, classlst, fileid, bookDir, fixedimage): 724 | 725 | # create a document parser 726 | dp = DocParser(flatxml, classlst, fileid, bookDir, fixedimage) 727 | 728 | htmlpage = dp.process() 729 | 730 | return htmlpage 731 | -------------------------------------------------------------------------------- /topaz/genhtml.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | class Unbuffered: 6 | def __init__(self, stream): 7 | self.stream = stream 8 | def write(self, data): 9 | self.stream.write(data) 10 | self.stream.flush() 11 | def __getattr__(self, attr): 12 | return getattr(self.stream, attr) 13 | 14 | import sys 15 | sys.stdout=Unbuffered(sys.stdout) 16 | 17 | 18 | import os, getopt 19 | 20 | # local routines 21 | import convert2xml 22 | import flatxml2html 23 | import decode_meta 24 | import stylexml2css 25 | import getpagedim 26 | 27 | def usage(): 28 | print 'Usage: ' 29 | print ' ' 30 | print ' genhtml.py [--fixed-image] unencryptedBookDir' 31 | print ' ' 32 | print ' Options: ' 33 | print ' --fixed-image : force translation of fixed regions into svg images ' 34 | print ' ' 35 | 36 | 37 | def main(argv): 38 | bookDir = '' 39 | fixedimage = False 40 | 41 | if len(argv) == 0: 42 | argv = sys.argv 43 | 44 | try: 45 | opts, args = getopt.getopt(argv[1:], "h:",["fixed-image"]) 46 | 47 | except getopt.GetoptError, err: 48 | print str(err) 49 | usage() 50 | sys.exit(1) 51 | 52 | if len(opts) == 0 and len(args) == 0 : 53 | usage() 54 | sys.exit(1) 55 | 56 | for o, a in opts: 57 | if o =="-h": 58 | usage() 59 | sys.exit(0) 60 | if o =="--fixed-image": 61 | fixedimage = True 62 | 63 | bookDir = args[0] 64 | 65 | if not os.path.exists(bookDir) : 66 | print "Can not find directory with unencrypted book" 67 | sys.exit(1) 68 | 69 | dictFile = os.path.join(bookDir,'dict0000.dat') 70 | 71 | if not os.path.exists(dictFile) : 72 | print "Can not find dict0000.dat file" 73 | sys.exit(1) 74 | 75 | pageDir = os.path.join(bookDir,'page') 76 | if not os.path.exists(pageDir) : 77 | print "Can not find page directory in unencrypted book" 78 | sys.exit(1) 79 | 80 | imgDir = os.path.join(bookDir,'img') 81 | if not os.path.exists(imgDir) : 82 | print "Can not find image directory in unencrypted book" 83 | sys.exit(1) 84 | 85 | svgDir = os.path.join(bookDir,'svg') 86 | if not os.path.exists(svgDir) : 87 | print "Can not find svg directory in unencrypted book" 88 | print "please run gensvg.py before running genhtml.py" 89 | sys.exit(1) 90 | 91 | otherFile = os.path.join(bookDir,'other0000.dat') 92 | if not os.path.exists(otherFile) : 93 | print "Can not find other0000.dat in unencrypted book" 94 | sys.exit(1) 95 | 96 | metaFile = os.path.join(bookDir,'metadata0000.dat') 97 | if not os.path.exists(metaFile) : 98 | print "Can not find metadata0000.dat in unencrypted book" 99 | sys.exit(1) 100 | 101 | htmlFileName = "book.html" 102 | htmlstr = '\n' 103 | htmlstr += '\n' 104 | 105 | filenames = os.listdir(pageDir) 106 | filenames = sorted(filenames) 107 | 108 | print 'Processing ... ' 109 | 110 | htmlstr += '\n' 111 | htmlstr += '\n' 112 | 113 | # process metadata and retrieve fontSize info 114 | print ' ', 'metadata0000.dat' 115 | fname = os.path.join(bookDir,'metadata0000.dat') 116 | xname = os.path.join(bookDir, 'metadata.txt') 117 | metastr = decode_meta.getMetaData(fname) 118 | file(xname, 'wb').write(metastr) 119 | meta_array = decode_meta.getMetaArray(fname) 120 | 121 | htmlstr += '' + meta_array['Title'] + ' by ' + meta_array['Authors'] + '\n' 122 | htmlstr += '\n' 123 | htmlstr += '\n' 124 | 125 | # get some scaling info from metadata to use while processing styles 126 | fontsize = '135' 127 | if 'fontSize' in meta_array: 128 | fontsize = meta_array['fontSize'] 129 | 130 | # also get the size of a normal text page 131 | spage = '1' 132 | if 'firstTextPage' in meta_array: 133 | spage = meta_array['firstTextPage'] 134 | pnum = int(spage) 135 | 136 | # get page height and width from first text page for use in stylesheet scaling 137 | pname = 'page%04d.dat' % (pnum + 1) 138 | fname = os.path.join(pageDir,pname) 139 | pargv=[] 140 | pargv.append('convert2xml.py') 141 | pargv.append('--flat-xml') 142 | pargv.append(dictFile) 143 | pargv.append(fname) 144 | flat_xml = convert2xml.main(pargv) 145 | (ph, pw) = getpagedim.getPageDim(flat_xml) 146 | if (ph == '-1') or (ph == '0') : ph = '11000' 147 | if (pw == '-1') or (pw == '0') : pw = '8500' 148 | 149 | # now build up the style sheet 150 | print ' ', 'other0000.dat' 151 | fname = os.path.join(bookDir,'other0000.dat') 152 | xname = os.path.join(bookDir, 'style.css') 153 | pargv=[] 154 | pargv.append('convert2xml.py') 155 | pargv.append('--flat-xml') 156 | pargv.append(dictFile) 157 | pargv.append(fname) 158 | xmlstr = convert2xml.main(pargv) 159 | cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize, ph, pw) 160 | file(xname, 'wb').write(cssstr) 161 | htmlstr += '\n' 162 | htmlstr += '\n\n' 163 | 164 | for filename in filenames: 165 | print ' ', filename 166 | fname = os.path.join(pageDir,filename) 167 | pargv=[] 168 | pargv.append('convert2xml.py') 169 | pargv.append('--flat-xml') 170 | pargv.append(dictFile) 171 | pargv.append(fname) 172 | flat_xml = convert2xml.main(pargv) 173 | htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, fixedimage) 174 | 175 | htmlstr += '\n\n' 176 | 177 | file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr) 178 | print 'Processing Complete' 179 | 180 | return 0 181 | 182 | if __name__ == '__main__': 183 | sys.exit(main('')) 184 | 185 | 186 | -------------------------------------------------------------------------------- /topaz/gensvg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | class Unbuffered: 6 | def __init__(self, stream): 7 | self.stream = stream 8 | def write(self, data): 9 | self.stream.write(data) 10 | self.stream.flush() 11 | def __getattr__(self, attr): 12 | return getattr(self.stream, attr) 13 | 14 | import sys 15 | sys.stdout=Unbuffered(sys.stdout) 16 | 17 | import os, getopt 18 | 19 | # local routines 20 | import convert2xml 21 | import decode_meta 22 | 23 | 24 | class GParser(object): 25 | def __init__(self, flatxml): 26 | self.flatdoc = flatxml.split('\n') 27 | self.dpi = 1440 28 | self.gh = self.getData('info.glyph.h') 29 | self.gw = self.getData('info.glyph.w') 30 | self.guse = self.getData('info.glyph.use') 31 | if self.guse : 32 | self.count = len(self.guse) 33 | else : 34 | self.count = 0 35 | self.gvtx = self.getData('info.glyph.vtx') 36 | self.glen = self.getData('info.glyph.len') 37 | self.gdpi = self.getData('info.glyph.dpi') 38 | self.vx = self.getData('info.vtx.x') 39 | self.vy = self.getData('info.vtx.y') 40 | self.vlen = self.getData('info.len.n') 41 | if self.vlen : 42 | self.glen.append(len(self.vlen)) 43 | elif self.glen: 44 | self.glen.append(0) 45 | if self.vx : 46 | self.gvtx.append(len(self.vx)) 47 | elif self.gvtx : 48 | self.gvtx.append(0) 49 | 50 | def getData(self, path): 51 | result = None 52 | cnt = len(self.flatdoc) 53 | for j in xrange(cnt): 54 | item = self.flatdoc[j] 55 | if item.find('=') >= 0: 56 | (name, argt) = item.split('=') 57 | argres = argt.split('|') 58 | else: 59 | name = item 60 | argres = [] 61 | if (name == path): 62 | result = argres 63 | break 64 | if (len(argres) > 0) : 65 | for j in xrange(0,len(argres)): 66 | argres[j] = int(argres[j]) 67 | return result 68 | 69 | 70 | def getGlyphDim(self, gly): 71 | maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly] 72 | maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly] 73 | return maxh, maxw 74 | 75 | 76 | def getPath(self, gly): 77 | path = '' 78 | if (gly < 0) or (gly >= self.count): 79 | return path 80 | tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]] 81 | ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]] 82 | p = 0 83 | for k in xrange(self.glen[gly], self.glen[gly+1]): 84 | if (p == 0): 85 | zx = tx[0:self.vlen[k]+1] 86 | zy = ty[0:self.vlen[k]+1] 87 | else: 88 | zx = tx[self.vlen[k-1]+1:self.vlen[k]+1] 89 | zy = ty[self.vlen[k-1]+1:self.vlen[k]+1] 90 | p += 1 91 | j = 0 92 | while ( j < len(zx) ): 93 | if (j == 0): 94 | # Start Position. 95 | path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly]) 96 | elif (j <= len(zx)-3): 97 | # Cubic Bezier Curve 98 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly]) 99 | j += 2 100 | elif (j == len(zx)-2): 101 | # Cubic Bezier Curve to Start Position 102 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly]) 103 | j += 1 104 | elif (j == len(zx)-1): 105 | # Quadratic Bezier Curve to Start Position 106 | path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly]) 107 | 108 | j += 1 109 | path += 'z' 110 | return path 111 | 112 | class PParser(object): 113 | def __init__(self, flatxml): 114 | self.flatdoc = flatxml.split('\n') 115 | self.temp = [] 116 | foo = self.getData('page.h') or self.getData('book.h') 117 | self.ph = foo[0] 118 | foo = self.getData('page.w') or self.getData('book.w') 119 | self.pw = foo[0] 120 | self.gx = self.getData('info.glyph.x') 121 | self.gy = self.getData('info.glyph.y') 122 | self.gid = self.getData('info.glyph.glyphID') 123 | 124 | def getData(self, path): 125 | result = None 126 | cnt = len(self.flatdoc) 127 | for j in xrange(cnt): 128 | item = self.flatdoc[j] 129 | if item.find('=') >= 0: 130 | (name, argt) = item.split('=') 131 | argres = argt.split('|') 132 | else: 133 | name = item 134 | argres = [] 135 | if (name.endswith(path)): 136 | result = argres 137 | break 138 | if (len(argres) > 0) : 139 | for j in xrange(0,len(argres)): 140 | argres[j] = int(argres[j]) 141 | return result 142 | 143 | def getDataTemp(self, path): 144 | result = None 145 | cnt = len(self.temp) 146 | for j in xrange(cnt): 147 | item = self.temp[j] 148 | if item.find('=') >= 0: 149 | (name, argt) = item.split('=') 150 | argres = argt.split('|') 151 | else: 152 | name = item 153 | argres = [] 154 | if (name.endswith(path)): 155 | result = argres 156 | self.temp.pop(j) 157 | break 158 | if (len(argres) > 0) : 159 | for j in xrange(0,len(argres)): 160 | argres[j] = int(argres[j]) 161 | return result 162 | 163 | def getImages(self): 164 | result = [] 165 | self.temp = self.flatdoc 166 | while (self.getDataTemp('img') != None): 167 | h = self.getDataTemp('img.h')[0] 168 | w = self.getDataTemp('img.w')[0] 169 | x = self.getDataTemp('img.x')[0] 170 | y = self.getDataTemp('img.y')[0] 171 | src = self.getDataTemp('img.src')[0] 172 | result.append('\n' % (src, x, y, w, h)) 173 | return result 174 | 175 | def getGlyphs(self,glyfname): 176 | result = [] 177 | if (self.gid != None) and (len(self.gid) > 0): 178 | glyphs = [] 179 | for j in set(self.gid): 180 | glyphs.append(j) 181 | glyphs.sort() 182 | gfile = open(glyfname, 'r') 183 | j = 0 184 | while True : 185 | inp = gfile.readline() 186 | if (inp == ''): 187 | break 188 | id='id="gl%d"' % glyphs[j] 189 | if (inp.find(id) > 0): 190 | result.append(inp) 191 | j += 1 192 | if (j == len(glyphs)): 193 | break 194 | gfile.close() 195 | return result 196 | 197 | 198 | 199 | 200 | def usage(): 201 | print 'Usage: ' 202 | print ' ' 203 | print ' gensvg.py [options] unencryptedBookDir' 204 | print ' ' 205 | print ' -x : output browseable XHTML+SVG pages (default)' 206 | print ' -r : output raw SVG images' 207 | 208 | 209 | def main(argv): 210 | bookDir = '' 211 | 212 | if len(argv) == 0: 213 | argv = sys.argv 214 | 215 | try: 216 | opts, args = getopt.getopt(argv[1:], "xrh") 217 | 218 | except getopt.GetoptError, err: 219 | print str(err) 220 | usage() 221 | sys.exit(1) 222 | 223 | if len(opts) == 0 and len(args) == 0 : 224 | usage() 225 | sys.exit(1) 226 | 227 | raw = 0 228 | for o, a in opts: 229 | if o =="-h": 230 | usage() 231 | sys.exit(0) 232 | if o =="-x": 233 | raw = 0 234 | if o =="-r": 235 | raw = 1 236 | 237 | bookDir = args[0] 238 | 239 | if not os.path.exists(bookDir) : 240 | print "Can not find directory with unencrypted book" 241 | sys.exit(1) 242 | 243 | dictFile = os.path.join(bookDir,'dict0000.dat') 244 | 245 | if not os.path.exists(dictFile) : 246 | print "Can not find dict0000.dat file" 247 | sys.exit(1) 248 | 249 | pageDir = os.path.join(bookDir,'page') 250 | if not os.path.exists(pageDir) : 251 | print "Can not find page directory in unencrypted book" 252 | sys.exit(1) 253 | 254 | imgDir = os.path.join(bookDir,'img') 255 | if not os.path.exists(imgDir) : 256 | print "Can not find image directory in unencrypted book" 257 | sys.exit(1) 258 | 259 | glyphsDir = os.path.join(bookDir,'glyphs') 260 | if not os.path.exists(glyphsDir) : 261 | print "Can not find glyphs directory in unencrypted book" 262 | sys.exit(1) 263 | 264 | metaFile = os.path.join(bookDir,'metadata0000.dat') 265 | if not os.path.exists(metaFile) : 266 | print "Can not find metadata0000.dat in unencrypted book" 267 | sys.exit(1) 268 | 269 | svgDir = os.path.join(bookDir,'svg') 270 | if not os.path.exists(svgDir) : 271 | os.makedirs(svgDir) 272 | 273 | 274 | print 'Processing Meta Data ... ' 275 | 276 | print ' ', 'metadata0000.dat' 277 | fname = os.path.join(bookDir,'metadata0000.dat') 278 | metadata = decode_meta.getMetaArray(fname) 279 | 280 | print 'Processing Glyphs ... ' 281 | 282 | filenames = os.listdir(glyphsDir) 283 | filenames = sorted(filenames) 284 | 285 | glyfname = os.path.join(svgDir,'glyphs.svg') 286 | glyfile = open(glyfname, 'w') 287 | glyfile.write('\n') 288 | glyfile.write('\n') 289 | glyfile.write('\n') 290 | glyfile.write('Glyphs for %s\n' % metadata['Title']) 291 | glyfile.write('\n') 292 | counter = 0 293 | for filename in filenames: 294 | print ' ', filename 295 | fname = os.path.join(glyphsDir,filename) 296 | pargv=[] 297 | pargv.append('convert2xml.py') 298 | pargv.append('--flat-xml') 299 | pargv.append(dictFile) 300 | pargv.append(fname) 301 | flat_xml = convert2xml.main(pargv) 302 | gp = GParser(flat_xml) 303 | for i in xrange(0, gp.count): 304 | path = gp.getPath(i) 305 | maxh, maxw = gp.getGlyphDim(i) 306 | # glyfile.write('\n' % (counter * 256 + i, path)) 307 | glyfile.write('\n' % (counter * 256 + i, path, maxw, maxh )) 308 | counter += 1 309 | glyfile.write('\n') 310 | glyfile.write('\n') 311 | glyfile.close() 312 | 313 | print 'Processing Pages ... ' 314 | 315 | # Books are at 1440 DPI. This is rendering at twice that size for 316 | # readability when rendering to the screen. 317 | scaledpi = 1440 318 | filenames = os.listdir(pageDir) 319 | filenames = sorted(filenames) 320 | counter = 0 321 | for filename in filenames: 322 | print ' ', filename 323 | fname = os.path.join(pageDir,filename) 324 | pargv=[] 325 | pargv.append('convert2xml.py') 326 | pargv.append('--flat-xml') 327 | pargv.append(dictFile) 328 | pargv.append(fname) 329 | flat_xml = convert2xml.main(pargv) 330 | pp = PParser(flat_xml) 331 | if (raw) : 332 | pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w') 333 | else : 334 | pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w') 335 | 336 | pfile.write('\n') 337 | if (raw): 338 | pfile.write('\n') 339 | pfile.write('\n' % (pp.pw / scaledpi, pp.ph / scaledpi, pp.pw -1, pp.ph -1)) 340 | pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) 341 | else: 342 | pfile.write('\n'); 343 | pfile.write('\n'); 344 | pfile.write('Page %d - %s by %s\n' % (counter, metadata['Title'],metadata['Authors'])) 345 | pfile.write('\n') 361 | pfile.write('\n') 362 | pfile.write('\n') 363 | pfile.write('
\n') 364 | if (counter == 0) : 365 | pfile.write('\n') 366 | else: 367 | pfile.write('\n') 368 | pfile.write('' % (pp.pw, pp.ph)) 369 | 370 | if (pp.gid != None): 371 | pfile.write('\n') 372 | gdefs = pp.getGlyphs(glyfname) 373 | for j in xrange(0,len(gdefs)): 374 | pfile.write(gdefs[j]) 375 | pfile.write('\n') 376 | img = pp.getImages() 377 | if (img != None): 378 | for j in xrange(0,len(img)): 379 | pfile.write(img[j]) 380 | if (pp.gid != None): 381 | for j in xrange(0,len(pp.gid)): 382 | pfile.write('\n' % (pp.gid[j], pp.gx[j], pp.gy[j])) 383 | if (img == None or len(img) == 0) and (pp.gid == None or len(pp.gid) == 0): 384 | pfile.write('This page intentionally left blank.\nUntil this notice unintentionally gave it content. (gensvg.py)\n'); 385 | if (raw) : 386 | pfile.write('') 387 | else : 388 | pfile.write('\n') 389 | if (counter == len(filenames) - 1) : 390 | pfile.write('\n') 391 | else : 392 | pfile.write('\n') 393 | pfile.write('
\n') 394 | pfile.write('
zoom in - zoom out
\n') 395 | pfile.write('\n') 396 | pfile.write('\n') 397 | pfile.close() 398 | counter += 1 399 | 400 | print 'Processing Complete' 401 | 402 | return 0 403 | 404 | if __name__ == '__main__': 405 | sys.exit(main('')) 406 | -------------------------------------------------------------------------------- /topaz/getpagedim.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | import csv 6 | import sys 7 | import os 8 | import getopt 9 | from struct import pack 10 | from struct import unpack 11 | 12 | 13 | class DocParser(object): 14 | def __init__(self, flatxml): 15 | self.flatdoc = flatxml.split('\n') 16 | 17 | 18 | # find tag if within pos to end inclusive 19 | def findinDoc(self, tagpath, pos, end) : 20 | result = None 21 | docList = self.flatdoc 22 | cnt = len(docList) 23 | if end == -1 : 24 | end = cnt 25 | else: 26 | end = min(cnt,end) 27 | foundat = -1 28 | for j in xrange(pos, end): 29 | item = docList[j] 30 | if item.find('=') >= 0: 31 | (name, argres) = item.split('=') 32 | else : 33 | name = item 34 | argres = '' 35 | if name.endswith(tagpath) : 36 | result = argres 37 | foundat = j 38 | break 39 | return foundat, result 40 | 41 | def process(self): 42 | (pos, sph) = self.findinDoc('page.h',0,-1) 43 | (pos, spw) = self.findinDoc('page.w',0,-1) 44 | if (sph == None): sph = '-1' 45 | if (spw == None): spw = '-1' 46 | return sph, spw 47 | 48 | 49 | def getPageDim(flatxml): 50 | # create a document parser 51 | dp = DocParser(flatxml) 52 | (ph, pw) = dp.process() 53 | return ph, pw 54 | -------------------------------------------------------------------------------- /topaz/stylexml2css.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | # For use with Topaz Scripts Version 2.6 4 | 5 | import csv 6 | import sys 7 | import os 8 | import getopt 9 | from struct import pack 10 | from struct import unpack 11 | 12 | 13 | class DocParser(object): 14 | def __init__(self, flatxml, fontsize, ph, pw): 15 | self.flatdoc = flatxml.split('\n') 16 | self.fontsize = int(fontsize) 17 | self.ph = int(ph) * 1.0 18 | self.pw = int(pw) * 1.0 19 | 20 | stags = { 21 | 'paragraph' : 'p', 22 | 'graphic' : '.graphic' 23 | } 24 | 25 | attr_val_map = { 26 | 'hang' : 'text-indent: ', 27 | 'indent' : 'text-indent: ', 28 | 'line-space' : 'line-height: ', 29 | 'margin-bottom' : 'margin-bottom: ', 30 | 'margin-left' : 'margin-left: ', 31 | 'margin-right' : 'margin-right: ', 32 | 'margin-top' : 'margin-top: ', 33 | 'space-after' : 'padding-bottom: ', 34 | } 35 | 36 | attr_str_map = { 37 | 'align-center' : 'text-align: center; margin-left: auto; margin-right: auto;', 38 | 'align-left' : 'text-align: left;', 39 | 'align-right' : 'text-align: right;', 40 | 'align-justify' : 'text-align: justify;', 41 | 'display-inline' : 'display: inline;', 42 | 'pos-left' : 'text-align: left;', 43 | 'pos-right' : 'text-align: right;', 44 | 'pos-center' : 'text-align: center; margin-left: auto; margin-right: auto;', 45 | } 46 | 47 | 48 | # find tag if within pos to end inclusive 49 | def findinDoc(self, tagpath, pos, end) : 50 | result = None 51 | docList = self.flatdoc 52 | cnt = len(docList) 53 | if end == -1 : 54 | end = cnt 55 | else: 56 | end = min(cnt,end) 57 | foundat = -1 58 | for j in xrange(pos, end): 59 | item = docList[j] 60 | if item.find('=') >= 0: 61 | (name, argres) = item.split('=',1) 62 | else : 63 | name = item 64 | argres = '' 65 | if name.endswith(tagpath) : 66 | result = argres 67 | foundat = j 68 | break 69 | return foundat, result 70 | 71 | 72 | # return list of start positions for the tagpath 73 | def posinDoc(self, tagpath): 74 | startpos = [] 75 | pos = 0 76 | res = "" 77 | while res != None : 78 | (foundpos, res) = self.findinDoc(tagpath, pos, -1) 79 | if res != None : 80 | startpos.append(foundpos) 81 | pos = foundpos + 1 82 | return startpos 83 | 84 | 85 | def process(self): 86 | 87 | classlst = '' 88 | csspage = '.cl-center { text-align: center; margin-left: auto; margin-right: auto; }\n' 89 | csspage += '.cl-right { text-align: right; }\n' 90 | csspage += '.cl-left { text-align: left; }\n' 91 | csspage += '.cl-justify { text-align: justify; }\n' 92 | 93 | # generate a list of each