├── .gitignore
├── README.md
├── kindledecrypt.py
├── mobidedrm.py
├── process.py
├── screenshot.png
├── setup-macosx.py
├── setup-win32.py
└── topaz
├── __init__.py
├── cmbtc.py
├── convert2xml.py
├── decode_meta.py
├── flatxml2html.py
├── genhtml.py
├── gensvg.py
├── getpagedim.py
└── stylexml2css.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Kindle Book Decrypter
2 | =====================
3 | A simple GUI to remove the restrictions put in place to prevent copying books,
listening to some books, or transferring books to other devices, etc. All it
requires is the original book file and your Kindle serial number.
6 |
7 | * Backup your purchased Kindle books on your own media
8 | * Use other e-reading devices and/or software to read your purchased books
9 | * Re-enable audio playback of books that have disabled it
10 | * Allow a friend to borrow a book you are no longer reading (if this falls
11 | under fair use and is legal where you live)
12 | * Supports Mobipocket and Topaz book formats (azw, mobi, prc, azw1, tpz)
13 |
14 | 
15 |
16 | Pre-Built Binaries
17 | ------------------
18 | The following pre-built binaries are available and kept up to date with the
19 | latest changes:
20 |
21 | * [Microsoft Windows](http://programmer-art.org/dropbox/kindledecrypt-1.1-win32.exe)
22 | * [Mac OS X](http://programmer-art.org/dropbox/kindledecrypt-1.1-macosx.zip)
23 |
24 | Dependencies
25 | ------------
26 | The Kindle Book Decrypter depends on the following when not using the pre-built binaries:
27 |
28 | * Python
29 | * wxWidgets (and Python bindings)
30 |
31 | Usage
32 | -----
33 | You can use the application by running it in a terminal or double clicking it:
34 |
35 | cd kindledecrypt
36 | ./kindledecrypt.py
37 |
38 | License
39 | -------
40 | Original reverse engineering and Kindle PIN code is copyright the respective
41 | authors. The GUI is copyright 2010 Daniel G. Taylor and released under the MIT
42 | License. See the script itself for details.
43 |
44 |
--------------------------------------------------------------------------------
/kindledecrypt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """
4 | Kindle Book Decrypter
5 | =====================
6 | Simple GUI for MobiDeDRM code written with wxWidgets. This GUI takes a
7 | serial number and encrypted book file and outputs an unencrypted book
8 | that can be used to backup your data or legally remove audio and other
9 | restrictions by allowing you to convert to other formats.
10 |
11 | License
12 | -------
13 | Copyright (C) 2010 Daniel G. Taylor
14 |
15 | Permission is hereby granted, free of charge, to any person obtaining a copy
16 | of this software and associated documentation files (the "Software"), to deal
17 | in the Software without restriction, including without limitation the rights
18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 | copies of the Software, and to permit persons to whom the Software is
20 | furnished to do so, subject to the following conditions:
21 |
22 | The above copyright notice and this permission notice shall be included in
23 | all copies or substantial portions of the Software.
24 |
25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 | THE SOFTWARE.
32 | """
33 |
34 | __author__ = "Daniel G. Taylor"
35 | __version__ = 1.1
36 |
37 | import ConfigParser
38 | import optparse
39 | import os
40 | import sys
41 | import wx
42 |
43 | import mobidedrm
44 | import process
45 | import topaz
46 |
47 | CONFIG = os.path.expanduser("~/.mobidedrmwx.cfg")
48 |
class MobiDeDrmApp(wx.App):
    """
    The main application holding all windows, controls, etc.

    Presents a serial-number field and a book file picker; clicking
    "Decrypt" runs the decryption in a background process (see
    process.decrypt) while pulsing a progress dialog.
    """
    def __init__(self, redir=False):
        super(MobiDeDrmApp, self).__init__(redir)

        # Load the saved configuration (remembers the last-used serial).
        self.config = ConfigParser.SafeConfigParser()
        if os.path.exists(CONFIG):
            self.config.read(CONFIG)

        if not self.config.has_section("General"):
            self.config.add_section("General")

        if self.config.has_option("General", "Serial"):
            default_serial = self.config.get("General", "Serial")
        else:
            # This is just a random example serial
            default_serial = "B002A1C457493453"

        self.frame = wx.Frame(None, wx.ID_ANY, "Kindle Book Decrypter", size=(400, 130))

        self.panel = wx.Panel(self.frame)
        self.vbox = wx.BoxSizer(wx.VERTICAL)

        self.grid = wx.GridBagSizer(3, 3)
        self.serial_label = wx.StaticText(self.panel, label="Serial:")
        self.serial = wx.TextCtrl(self.panel, value=default_serial)
        self.serial_help = wx.StaticText(self.panel, label="Kindle or Kindle for iPhone serial number")
        font = self.serial_help.GetFont()
        font.SetPointSize(8)
        self.serial_help.SetFont(font)
        self.input_label = wx.StaticText(self.panel, label="Book:")
        # *.tpz added: Topaz books are handled by on_process and the
        # README advertises tpz support, but the filter omitted them.
        self.input = wx.FilePickerCtrl(self.panel, wildcard="Kindle Books|*.azw;*.mobi;*.prc;*.azw1;*.tpz|All Files|*.*")
        self.button = wx.Button(self.panel, label="Decrypt")

        self.grid.Add(self.serial_label, (0, 0), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALL)
        self.grid.Add(self.serial, (0, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND)
        self.grid.Add(self.serial_help, (1, 1))
        self.grid.Add(self.input_label, (2, 0), flag=wx.ALIGN_CENTER_VERTICAL)
        self.grid.Add(self.input, (2, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND)
        self.grid.Add(self.button, (3, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALIGN_RIGHT)

        self.grid.AddGrowableCol(1, 1)

        self.vbox.Add(self.grid, 1, wx.ALL | wx.EXPAND, border=5)

        self.panel.SetSizer(self.vbox)
        self.vbox.Fit(self.frame)

        self.frame.Bind(wx.EVT_BUTTON, self.on_process, self.button)
        self.frame.Bind(wx.EVT_TEXT, self.on_serial_changed, self.serial)

        self.frame.Centre()
        self.frame.Show(True)

    def on_serial_changed(self, event):
        """
        The serial number has changed. If it is the correct number of
        characters then enable the decrypt button, otherwise disable it
        until a valid serial is entered.
        """
        serial = self.serial.GetValue()
        # 16 characters = Kindle device serial, 40 = Kindle for iPhone.
        if len(serial) in [16, 40]:
            self.button.Enable()
            self.config.set("General", "Serial", serial)
            # Close the config file promptly; the old code leaked the
            # open handle returned by open(CONFIG, "w").
            config_file = open(CONFIG, "w")
            try:
                self.config.write(config_file)
            finally:
                config_file.close()
        else:
            self.button.Disable()

    def on_process(self, event):
        """
        The decrypt button was clicked, so start the decrypting process.
        This shows a pulsing progress dialog while the book is decrypted,
        displaying a dialog for any errors that are encountered.
        """
        infile = self.input.GetPath()

        if not os.path.exists(infile):
            error_dialog = wx.MessageDialog(self.panel, "Error: Input file doesn't exist!", "Error processing file!", wx.OK | wx.ICON_ERROR)
            error_dialog.ShowModal()
            error_dialog.Destroy()
            return

        # Which type of book is this? If the Topaz header parses, the
        # output is a directory (no extension); otherwise assume Mobi.
        # NOTE(review): the file handle assigned to topaz.cmbtc.bookFile
        # is never explicitly closed here — confirm whether the topaz
        # module relies on it staying open.
        ext = ""
        try:
            topaz.cmbtc.bookFile = topaz.cmbtc.openBook(infile)
            topaz.cmbtc.parseTopazHeader()
        except topaz.cmbtc.CMBDTCFatal:
            ext = ".mobi"

        outfile = os.path.splitext(infile)[0] + "-decrypted" + ext
        pid = mobidedrm.getPid(self.serial.GetValue())
        dialog = wx.ProgressDialog("Progress", "Decrypting...")
        dialog.Pulse()
        dialog.Show()
        for error in process.decrypt(infile, outfile, pid):
            dialog.Pulse()
            wx.Yield()

        if error:
            error_dialog = wx.MessageDialog(self.panel, "Error: %s" % error, "Error processing file!", wx.OK | wx.ICON_ERROR)
            error_dialog.ShowModal()
            error_dialog.Destroy()

        dialog.Destroy()
156 |
157 | if __name__ == "__main__":
158 | parser = optparse.OptionParser("%prog [options]", version="Kindle Book Decrypter %s" % __version__)
159 |
160 | options, args = parser.parse_args()
161 |
162 | app = MobiDeDrmApp()
163 | app.MainLoop()
164 |
165 |
--------------------------------------------------------------------------------
/mobidedrm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # This is a python script. You need a Python interpreter to run it.
4 | # For example, ActiveState Python, which exists for windows.
5 | #
6 | # It can run standalone to convert files, or it can be installed as a
7 | # plugin for Calibre (http://calibre-ebook.com/about) so that
8 | # importing files with DRM 'Just Works'.
9 | #
10 | # To create a Calibre plugin, rename this file so that the filename
11 | # ends in '_plugin.py', put it into a ZIP file and import that Calibre
12 | # using its plugin configuration GUI.
13 | #
14 | # Changelog
15 | # 0.01 - Initial version
16 | # 0.02 - Huffdic compressed books were not properly decrypted
17 | # 0.03 - Wasn't checking MOBI header length
18 | # 0.04 - Wasn't sanity checking size of data record
19 | # 0.05 - It seems that the extra data flags take two bytes not four
20 | # 0.06 - And that low bit does mean something after all :-)
21 | # 0.07 - The extra data flags aren't present in MOBI header < 0xE8 in size
22 | # 0.08 - ...and also not in Mobi header version < 6
23 | # 0.09 - ...but they are there with Mobi header version 6, header size 0xE4!
24 | # 0.10 - Outputs unencrypted files as-is, so that when run as a Calibre
25 | # import filter it works when importing unencrypted files.
26 | # Also now handles encrypted files that don't need a specific PID.
27 | # 0.11 - use autoflushed stdout and proper return values
28 | # 0.12 - Fix for problems with metadata import as Calibre plugin, report errors
29 | # 0.13 - Formatting fixes: retabbed file, removed trailing whitespace
30 | # and extra blank lines, converted CR/LF pairs at ends of each line,
31 | # and other cosmetic fixes.
32 | # 0.14 - Working out when the extra data flags are present has been problematic
33 | # Versions 7 through 9 have tried to tweak the conditions, but have been
34 | # only partially successful. Closer examination of lots of sample
#        files reveals that a confusion has arisen because trailing data entries
36 | # are not encrypted, but it turns out that the multibyte entries
37 | # in utf8 file are encrypted. (Although neither kind gets compressed.)
38 | # This knowledge leads to a simplification of the test for the
39 | # trailing data byte flags - version 5 and higher AND header
40 | # size >= 0xE4.
# 0.15 - Now outputs 'heartbeat', and is also quicker for long files.
42 | # 0.16 - And reverts to 'done' not 'done.' at the end for unswindle
43 | # compatibility.
44 | # 0.17 - Added ability to extract PID given a Kindle serial number, added
45 | # OptionParser interface to argument processing, allow import as a
46 | # library without assuming Calibre is importing it
47 |
48 | __version__ = '0.17'
49 |
50 | import sys
51 | import struct
52 | import binascii
53 |
54 | from optparse import OptionParser
55 |
56 | letters = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
57 |
class Unbuffered:
    """Stream wrapper that flushes the underlying stream on every write."""

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        # Write-through, then push the bytes out immediately.
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Everything else is delegated to the wrapped stream.
        return getattr(self.stream, attr)
66 |
class DrmException(Exception):
    """Raised when DRM removal fails (bad PID, bad format, missing key)."""
    pass
69 |
# Implementation of Pukall Cipher 1
def PC1(key, src, decryption=True):
    """Pukall Cipher 1 stream cipher over `src` with 16-byte `key`.

    The same routine serves both directions: with decryption=True the
    key schedule is updated from the output byte, otherwise from the
    input byte.  Returns the transformed string, or None (after
    printing a message) if the key length is wrong.
    """
    sum1 = 0;
    sum2 = 0;
    keyXorVal = 0;
    if len(key)!=16:
        print "Bad key length!"
        return None
    # Pack the key into eight big-endian 16-bit words.
    wkey = []
    for i in xrange(8):
        wkey.append(ord(key[i*2])<<8 | ord(key[i*2+1]))

    dst = ""
    for i in xrange(len(src)):
        temp1 = 0;
        byteXorVal = 0;
        # Derive this byte's keystream value from the key words.
        for j in xrange(8):
            temp1 ^= wkey[j]
            sum2 = (sum2+j)*20021 + sum1
            sum1 = (temp1*346)&0xFFFF
            sum2 = (sum2+sum1)&0xFFFF
            temp1 = (temp1*20021+1)&0xFFFF
            byteXorVal ^= temp1 ^ sum2
        curByte = ord(src[i])
        if not decryption:
            keyXorVal = curByte * 257;
        curByte = ((curByte ^ (byteXorVal >> 8)) ^ byteXorVal) & 0xFF
        if decryption:
            keyXorVal = curByte * 257;
        # Mix the plaintext byte back into the key schedule.
        for j in xrange(8):
            wkey[j] ^= keyXorVal;
        dst+=chr(curByte)
    return dst
103 |
def checksumPid(s):
    """Return PID string `s` with its two checksum characters appended."""
    crc = (~binascii.crc32(s,-1))&0xFFFFFFFF
    crc = crc ^ (crc >> 16)
    res = s
    l = len(letters)
    # Each checksum character is derived from one byte of the CRC.
    for i in (0,1):
        b = crc & 0xff
        pos = (b // l) ^ (b % l)
        res += letters[pos%l]
        crc >>= 8
    return res
115 |
def pidFromSerial(s, l):
    """Derive an `l`-character PID fragment from serial number string `s`."""
    crc = (~binascii.crc32(s,-1))&0xFFFFFFFF

    # Fold the serial's bytes into an l-byte accumulator.
    arr1 = [0]*l
    for i in xrange(len(s)):
        arr1[i%l] ^= ord(s[i])

    # XOR in the CRC, cycling through its four bytes.
    crc_bytes = [crc >> 24 & 0xff, crc >> 16 & 0xff, crc >> 8 & 0xff, crc & 0xff]
    for i in xrange(l):
        arr1[i] ^= crc_bytes[i&3]

    # Map each byte into the restricted PID alphabet.
    pid = ""
    for i in xrange(l):
        b = arr1[i] & 0xff
        pid+=letters[(b >> 7) + ((b >> 5 & 3) ^ (b & 0x1f))]

    return pid
133 |
def getPid(serial):
    """Derive the Mobipocket PID from a Kindle serial number.

    16-character serials are treated as Kindle devices, 40-character
    serials as Kindle for iPhone.  Anything else yields "".
    """
    length = len(serial)
    if length == 16:
        # Kindle device: 7-character fragment plus the '*' marker.
        return checksumPid(pidFromSerial(serial, 7) + "*")
    if length == 40:
        # Kindle for iPhone: 8-character fragment, no marker.
        return checksumPid(pidFromSerial(serial, 8))
    return ""
142 |
def getSizeOfTrailingDataEntries(ptr, size, flags):
    """Return the total size of the trailing data entries of a record.

    `ptr` is the record data, `size` its length and `flags` the MOBI
    extra-data-flags word.  Bit 0 (multibyte data) is deliberately
    ignored — see the comment at the bottom.
    """
    # Decode one backward 7-bit variable-length size field ending at
    # offset `size` in `ptr`.
    def getSizeOfTrailingDataEntry(ptr, size):
        bitpos, result = 0, 0
        if size <= 0:
            return result
        while True:
            v = ord(ptr[size-1])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            size -= 1
            # Stop on the continuation bit, after 4 bytes, or at offset 0.
            if (v & 0x80) != 0 or (bitpos >= 28) or (size == 0):
                return result
    num = 0
    testflags = flags >> 1
    while testflags:
        if testflags & 1:
            num += getSizeOfTrailingDataEntry(ptr, size - num)
        testflags >>= 1
    # Multibyte data, if present, is included in the encryption, so
    # we do not need to check the low bit.
    # if flags & 1:
    #     num += (ord(ptr[size - num - 1]) & 0x3) + 1
    return num
166 |
class DrmStripper:
    """Parses a Mobipocket (BOOKMOBI) file and strips its DRM in memory.

    Construct with the raw file contents and a PID (including its
    2-character checksum); call getResult() for the decrypted data.
    """

    def loadSection(self, section):
        """Return the raw bytes of PDB section `section`."""
        if (section + 1 == self.num_sections):
            endoff = len(self.data_file)
        else:
            endoff = self.sections[section + 1][0]
        off = self.sections[section][0]
        return self.data_file[off:endoff]

    def patch(self, off, new):
        """Overwrite len(new) bytes of the file data at absolute offset `off`."""
        self.data_file = self.data_file[:off] + new + self.data_file[off+len(new):]

    def patchSection(self, section, new, in_off = 0):
        """Overwrite bytes at offset `in_off` within section `section`."""
        if (section + 1 == self.num_sections):
            endoff = len(self.data_file)
        else:
            endoff = self.sections[section + 1][0]
        off = self.sections[section][0]
        # The patch must not spill into the following section.
        assert off + in_off + len(new) <= endoff
        self.patch(off + in_off, new)

    def parseDRM(self, data, count, pid):
        """Search the `count` DRM records in `data` for the book key.

        Tries the supplied PID first, then the default key that needs
        no PID.  Returns the 16-byte key, or None if nothing verifies.
        """
        pid = pid.ljust(16,'\0')
        keyvec1 = "\x72\x38\x33\xB0\xB4\xF2\xE3\xCA\xDF\x09\x01\xD6\xE2\xE0\x3F\x96"
        temp_key = PC1(keyvec1, pid, False)
        temp_key_sum = sum(map(ord,temp_key)) & 0xff
        found_key = None
        for i in xrange(count):
            verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
            cookie = PC1(temp_key, cookie)
            ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
            if verification == ver and cksum == temp_key_sum and (flags & 0x1F) == 1:
                found_key = finalkey
                break
        if not found_key:
            # Then try the default encoding that doesn't require a PID
            temp_key = keyvec1
            temp_key_sum = sum(map(ord,temp_key)) & 0xff
            for i in xrange(count):
                verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
                cookie = PC1(temp_key, cookie)
                ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
                if verification == ver and cksum == temp_key_sum:
                    found_key = finalkey
                    break
        return found_key

    def __init__(self, data_file, pid):
        # Validate and strip the 2-character PID checksum.
        if checksumPid(pid[0:-2]) != pid:
            raise DrmException("invalid PID checksum")
        pid = pid[0:-2]

        self.data_file = data_file
        header = data_file[0:72]
        if header[0x3C:0x3C+8] != 'BOOKMOBI':
            raise DrmException("invalid file format")
        self.num_sections, = struct.unpack('>H', data_file[76:78])

        # Read the PDB section table: (offset, flags, value) triples.
        self.sections = []
        for i in xrange(self.num_sections):
            offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', data_file[78+i*8:78+i*8+8])
            flags, val = a1, a2<<16|a3<<8|a4
            self.sections.append( (offset, flags, val) )

        sect = self.loadSection(0)
        records, = struct.unpack('>H', sect[0x8:0x8+2])
        mobi_length, = struct.unpack('>L',sect[0x14:0x18])
        mobi_version, = struct.unpack('>L',sect[0x68:0x6C])
        extra_data_flags = 0
        print "MOBI header version = %d, length = %d" %(mobi_version, mobi_length)
        # Trailing-data flags only exist for header version >= 5 with
        # header size >= 0xE4 (see changelog entry 0.14).
        if (mobi_length >= 0xE4) and (mobi_version >= 5):
            extra_data_flags, = struct.unpack('>H', sect[0xF2:0xF4])
            print "Extra Data Flags = %d" %extra_data_flags

        crypto_type, = struct.unpack('>H', sect[0xC:0xC+2])
        if crypto_type == 0:
            print "This book is not encrypted."
        else:
            if crypto_type == 1:
                raise DrmException("cannot decode Mobipocket encryption type 1")
            if crypto_type != 2:
                raise DrmException("unknown encryption type: %d" % crypto_type)

            # calculate the keys
            drm_ptr, drm_count, drm_size, drm_flags = struct.unpack('>LLLL', sect[0xA8:0xA8+16])
            if drm_count == 0:
                raise DrmException("no PIDs found in this file")
            found_key = self.parseDRM(sect[drm_ptr:drm_ptr+drm_size], drm_count, pid)
            if not found_key:
                raise DrmException("no key found. maybe the PID is incorrect")

            # kill the drm keys
            self.patchSection(0, "\0" * drm_size, drm_ptr)
            # kill the drm pointers
            self.patchSection(0, "\xff" * 4 + "\0" * 12, 0xA8)
            # clear the crypto type
            self.patchSection(0, "\0" * 2, 0xC)

            # decrypt sections
            print "Decrypting. Please wait . . .",
            new_data = self.data_file[:self.sections[1][0]]
            for i in xrange(1, records+1):
                data = self.loadSection(i)
                # Trailing entries are stored unencrypted; exclude them
                # from the PC1 pass and copy them through verbatim.
                extra_size = getSizeOfTrailingDataEntries(data, len(data), extra_data_flags)
                if i%100 == 0:
                    print ".",
                # print "record %d, extra_size %d" %(i,extra_size)
                new_data += PC1(found_key, data[0:len(data) - extra_size])
                if extra_size > 0:
                    new_data += data[-extra_size:]
                #self.patchSection(i, PC1(found_key, data[0:len(data) - extra_size]))
            if self.num_sections > records+1:
                new_data += self.data_file[self.sections[records+1][0]:]
            self.data_file = new_data
            print "done"

    def getResult(self):
        """Return the (now decrypted) file contents as a string."""
        return self.data_file
285 |
286 | if __name__ == "__main__":
287 | sys.stdout=Unbuffered(sys.stdout)
288 | print ('MobiDeDrm v%(__version__)s. '
289 | 'Copyright 2008-2010 The Dark Reverser.' % globals())
290 |
291 | parser = OptionParser("Usage: %prog [options] input.azw output.mobi PID", version=__version__)
292 | parser.add_option("-s", "--serial", dest="serial", default="", help="Get the PID from a Kindle or Kindle for iPhone serial number")
293 |
294 | options, args = parser.parse_args()
295 |
296 | if options.serial:
297 | print "Mobipocket PID: " + getPid(options.serial)
298 | sys.exit(0)
299 |
300 | if len(args) < 4:
301 | print "Removes protection from Mobipocket books"
302 | parser.print_help()
303 | sys.exit(1)
304 | else:
305 | infile = args[1]
306 | outfile = args[2]
307 | pid = args[3]
308 | data_file = file(infile, 'rb').read()
309 | try:
310 | strippedFile = DrmStripper(data_file, pid)
311 | file(outfile, 'wb').write(strippedFile.getResult())
312 | except DrmException, e:
313 | print "Error: %s" % e
314 | sys.exit(1)
315 | sys.exit(0)
316 | elif "calibre" in globals():
317 | from calibre.customize import FileTypePlugin
318 |
319 | class MobiDeDRM(FileTypePlugin):
320 | name = 'MobiDeDRM' # Name of the plugin
321 | description = 'Removes DRM from secure Mobi files'
322 | supported_platforms = ['linux', 'osx', 'windows'] # Platforms this plugin will run on
323 | author = 'The Dark Reverser' # The author of this plugin
324 | version = (0, 1, 6) # The version number of this plugin
325 | file_types = set(['prc','mobi','azw']) # The file types that this plugin will be applied to
326 | on_import = True # Run this plugin during the import
327 |
328 | def run(self, path_to_ebook):
329 | from calibre.gui2 import is_ok_to_use_qt
330 | from PyQt4.Qt import QMessageBox
331 | PID = self.site_customization
332 | data_file = file(path_to_ebook, 'rb').read()
333 | ar = PID.split(',')
334 | for i in ar:
335 | try:
336 | unlocked_file = DrmStripper(data_file, i).getResult()
337 | except DrmException:
338 | # ignore the error
339 | pass
340 | else:
341 | of = self.temporary_file('.mobi')
342 | of.write(unlocked_file)
343 | of.close()
344 | return of.name
345 | if is_ok_to_use_qt():
346 | d = QMessageBox(QMessageBox.Warning, "MobiDeDRM Plugin", "Couldn't decode: %s\n\nImporting encrypted version." % path_to_ebook)
347 | d.show()
348 | d.raise_()
349 | d.exec_()
350 | return path_to_ebook
351 |
352 | def customization_help(self, gui=False):
353 | return 'Enter PID (separate multiple PIDs with comma)'
354 |
355 |
--------------------------------------------------------------------------------
/process.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """
4 | Utilities for decrypting a book in a separate process. This gets its own
5 | module as the multiprocessing module duplicates the global namespace when
6 | spawning new processes. This separate module limits the amount of stuff
7 | that gets duplicated and prevents serialization errors on certain platforms
8 | with e.g. wxWidgets.
9 | """
10 |
11 | import mobidedrm
12 | import multiprocessing
13 | import os
14 | import shutil
15 | import tempfile
16 | import time
17 | import topaz
18 |
19 | multiprocessing.freeze_support()
20 |
21 | def _process(infile, outfile, pid, error):
22 | try:
23 | if outfile.endswith(".mobi"):
24 | # Mobi file
25 | data_file = open(infile, "rb").read()
26 | strippedFile = mobidedrm.DrmStripper(data_file, pid)
27 | file(outfile, 'wb').write(strippedFile.getResult())
28 | else:
29 | # Topaz file
30 | tmp = tempfile.mkdtemp()
31 | args = ['./cmbtc.py', '-v', '-p', pid[:8], '-d', '-o', tmp, infile]
32 | topaz.cmbtc.main(argv=args)
33 | topaz.gensvg.main(['./gensvg.py', tmp])
34 | topaz.genhtml.main(['./genhtml.py', tmp])
35 |
36 | if not os.path.exists(outfile):
37 | os.mkdir(outfile)
38 |
39 | for filename in ["img", "style.css", "book.html"]:
40 | shutil.move(os.path.join(tmp, filename), os.path.join(outfile, filename))
41 |
42 | shutil.rmtree(tmp)
43 | except Exception, e:
44 | error.value = str(e)
45 |
def decrypt(infile, outfile, pid):
    """
    Run the decryption in a separate process, periodically yielding ""
    so a GUI can pulse a progress indicator.  The final yielded value
    is the child's error message string, or None on success.

    >>> for error in decrypt(infile, outfile, pid):
    >>>     progress_update()
    >>>     if error:
    >>>         print error

    """
    # Shared 512-byte buffer the child fills in on failure.
    shared_error = multiprocessing.Array("c", 512)
    worker = multiprocessing.Process(target=_process,
                                     args=(infile, outfile, pid, shared_error))
    worker.start()

    # Poll the child, yielding so the caller can update its UI.
    while worker.is_alive():
        yield ""
        time.sleep(0.1)
    worker.join()

    yield shared_error.value if shared_error.value else None
71 |
72 |
--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrobot/kindledecrypt/46052aeb076081ab1027de0f7dc2033d4a6304d9/screenshot.png
--------------------------------------------------------------------------------
/setup-macosx.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a setup.py script generated by py2applet
3 |
4 | Usage:
5 | python setup.py py2app
6 | """
7 |
8 | from setuptools import setup
9 |
10 | APP = ['kindledecrypt.py']
11 | DATA_FILES = []
12 | OPTIONS = {'argv_emulation': True}
13 |
14 | setup(
15 | app=APP,
16 | data_files=DATA_FILES,
17 | options={'py2app': OPTIONS},
18 | setup_requires=['py2app'],
19 | )
20 |
--------------------------------------------------------------------------------
/setup-win32.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

from distutils.core import setup
import py2exe, sys, os

# Behave as if "python setup-win32.py py2exe" was invoked.
sys.argv.append('py2exe')

setup(
    # bundle_files=1: pack everything into a single executable.
    options = {'py2exe': {'bundle_files':1}},
    # A GUI (no console) application built from the main script.
    windows = [{'script': 'kindledecrypt.py'}],
    zipfile = None,
)
13 |
--------------------------------------------------------------------------------
/topaz/__init__.py:
--------------------------------------------------------------------------------
1 | import cmbtc
2 | import gensvg
3 | import genhtml
4 |
--------------------------------------------------------------------------------
/topaz/cmbtc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # For use with Topaz Scripts Version 2.6
3 |
class Unbuffered:
    """File-object proxy that flushes after each write."""

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        # Push every write straight through to the device.
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        # Undefined attributes fall back to the wrapped stream.
        return getattr(self.stream, attr)
12 |
13 | import sys
14 | sys.stdout=Unbuffered(sys.stdout)
15 |
16 | import csv
17 | import os
18 | import getopt
19 | import zlib
20 | from struct import pack
21 | from struct import unpack
22 |
# Maximum file path length used by the script.
MAX_PATH = 255

# Put the first 8 characters of your Kindle PID here
# or supply it with the -p option in the command line
####################################################
kindlePID = "12345678"
####################################################

# NOTE(review): `global` at module level is a no-op in Python; these
# statements only document the module-level state that the functions
# below share (each function re-declares the names it assigns).
global bookFile
global bookPayloadOffset
global bookHeaderRecords
global bookMetadata
global bookKey
global command
38 | #
39 | # Exceptions for all the problems that might happen during the script
40 | #
41 |
class CMBDTCError(Exception):
    """Error while processing a Topaz book (non-fatal variant)."""
    pass

class CMBDTCFatal(Exception):
    """Fatal error; processing of the Topaz book cannot continue."""
    pass
47 |
48 | #
49 | # Open the book file at path
50 | #
51 |
def openBook(path):
    """Open the Topaz book file at `path` for binary reading.

    Raises CMBDTCFatal if the file cannot be opened.
    """
    try:
        return open(path,'rb')
    except (IOError, OSError):
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        raise CMBDTCFatal("Could not open book file: " + path)
57 |
58 | #
59 | # Get a 7 bit encoded number from the book file
60 | #
61 |
def bookReadEncodedNumber():
    """Read a 7-bit variable-length encoded number from bookFile.

    A leading 0xFF byte marks a negative value; in multi-byte values
    every byte except the last has its high bit set.
    """
    flag = False
    data = ord(bookFile.read(1))

    if data == 0xFF:
        flag = True
        data = ord(bookFile.read(1))

    # Accumulate 7 bits per byte while the continuation bit is set.
    if data >= 0x80:
        datax = (data & 0x7F)
        while data >= 0x80 :
            data = ord(bookFile.read(1))
            datax = (datax <<7) + (data & 0x7F)
        data = datax

    if flag:
        data = -data
    return data
80 |
81 | #
82 | # Encode a number in 7 bit format
83 | #
84 |
def encodeNumber(number):
    """Encode `number` in the 7-bit variable-length Topaz format.

    Negative values are stored as -(number) + 1 behind a 0xFF prefix
    byte; a 0x80 pad byte is emitted when the leading chunk would be
    0xFF and could be mistaken for the negative marker.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    # Emit 7-bit chunks least-significant first; all but the first
    # carry the 0x80 continuation bit.  Reversed at the end.
    chunks = []
    continuation = 0
    while True:
        piece = (number & 0x7F) | continuation
        number >>= 7
        chunks.append(chr(piece))
        continuation = 0x80
        if number == 0:
            if piece == 0xFF and not negative:
                chunks.append(chr(0x80))
            break

    if negative:
        chunks.append(chr(0xFF))

    return "".join(reversed(chunks))
109 |
110 | #
111 | # Get a length prefixed string from the file
112 | #
113 |
def bookReadString():
    """Read a length-prefixed string from the current bookFile position."""
    stringLength = bookReadEncodedNumber()
    return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
117 |
118 | #
119 | # Returns a length prefixed string
120 | #
121 |
def lengthPrefixString(data):
    """Return `data` prefixed with its 7-bit encoded length."""
    return encodeNumber(len(data))+data
124 |
125 |
126 | #
127 | # Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
128 | #
129 |
def bookReadHeaderRecordData():
    """Read one header record's data at the current bookFile position.

    Returns [[offset, decompressedLength, compressedLength], ...],
    one triple per value in the record.
    """
    nbValues = bookReadEncodedNumber()
    values = []
    for i in range (0,nbValues):
        values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
    return values
136 |
137 | #
138 | # Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
139 | #
140 |
def parseTopazHeaderRecord():
    """Parse one header record at the current position; return [tag, data]."""
    # 0x63 marks the start of a header record.
    if ord(bookFile.read(1)) != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")

    tag = bookReadString()
    record = bookReadHeaderRecordData()
    return [tag,record]
148 |
149 | #
150 | # Parse the header of a Topaz file, get all the header records and the offset for the payload
151 | #
152 |
def parseTopazHeader():
    """Parse the Topaz file header.

    Fills bookHeaderRecords (tag -> record data) and sets
    bookPayloadOffset to the file offset where the payload starts.
    """
    global bookHeaderRecords
    global bookPayloadOffset
    magic = unpack("4s",bookFile.read(4))[0]

    if magic != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")

    nbRecords = bookReadEncodedNumber()
    bookHeaderRecords = {}

    for i in range (0,nbRecords):
        result = parseTopazHeaderRecord()
        #print result[0], result[1]
        bookHeaderRecords[result[0]] = result[1]

    # 0x64 terminates the header block.
    if ord(bookFile.read(1)) != 0x64 :
        raise CMBDTCFatal("Parse Error : Invalid Header")

    bookPayloadOffset = bookFile.tell()
173 |
174 | #
175 | # Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
176 | # Correction, the record is correctly decompressed too
177 | #
178 |
def getBookPayloadRecord(name, index):
    """Return payload record `name`[`index`], decrypted and decompressed.

    Offsets come from bookHeaderRecords; a negative stored index marks
    an encrypted record, a non-zero compressed length a zlib record.
    """
    encrypted = False
    compressed = False

    try:
        recordOffset = bookHeaderRecords[name][index][0]
    except:
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")

    bookFile.seek(bookPayloadOffset + recordOffset)

    tag = bookReadString()
    if tag != name :
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")

    recordIndex = bookReadEncodedNumber()

    # A negative index means the record payload is encrypted.
    if recordIndex < 0 :
        encrypted = True
        recordIndex = -recordIndex -1

    if recordIndex != index :
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")

    # Read compressedLength bytes if compressed, else decompressedLength.
    if (bookHeaderRecords[name][index][2] > 0):
        compressed = True
        record = bookFile.read(bookHeaderRecords[name][index][2])
    else:
        record = bookFile.read(bookHeaderRecords[name][index][1])

    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record,ctx)

    if compressed:
        record = zlib.decompress(record)

    return record
217 |
218 | #
219 | # Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
220 | #
221 |
def extractBookPayloadRecord(name, index, filename):
    """Extract, decrypt and decompress one book record.

    Writes the record to `filename`, or prints it when `filename` is
    the empty string.  Raises CMBDTCFatal if the output file cannot
    be written.
    """
    try:
        record = getBookPayloadRecord(name,index)
    except Exception:
        # The old code fell through here and then referenced the
        # undefined `record`, raising a NameError; return instead.
        print("Could not find record")
        return

    if filename != "":
        try:
            # Don't shadow the `file` builtin; close the handle even
            # if write() fails.
            out = open(filename,"wb")
            try:
                out.write(record)
            finally:
                out.close()
        except (IOError, OSError):
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
246 |
247 | #
248 | # return next record [key,value] from the book metadata from the current book position
249 | #
250 |
251 | def readMetadataRecord():
252 | return [bookReadString(),bookReadString()]
253 |
254 | #
255 | # Parse the metadata record from the book payload and return a list of [key,values]
256 | #
257 |
258 | def parseMetadata():
259 | global bookHeaderRecords
260 | global bookPayloadAddress
261 | global bookMetadata
262 | bookMetadata = {}
263 | bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
264 | tag = bookReadString()
265 | if tag != "metadata" :
266 | raise CMBDTCFatal("Parse Error : Record Names Don't Match")
267 |
268 | flags = ord(bookFile.read(1))
269 | nbRecords = ord(bookFile.read(1))
270 |
271 | for i in range (0,nbRecords) :
272 | record =readMetadataRecord()
273 | bookMetadata[record[0]] = record[1]
274 |
275 | #
276 | # Context initialisation for the Topaz Crypto
277 | #
278 |
279 | def topazCryptoInit(key):
280 | ctx1 = 0x0CAFFE19E
281 |
282 | for keyChar in key:
283 | keyByte = ord(keyChar)
284 | ctx2 = ctx1
285 | ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
286 | return [ctx1,ctx2]
287 |
288 | #
289 | # decrypt data with the context prepared by topazCryptoInit()
290 | #
291 |
292 | def topazCryptoDecrypt(data, ctx):
293 | ctx1 = ctx[0]
294 | ctx2 = ctx[1]
295 |
296 | plainText = ""
297 |
298 | for dataChar in data:
299 | dataByte = ord(dataChar)
300 | m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
301 | ctx2 = ctx1
302 | ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
303 | plainText += chr(m)
304 |
305 | return plainText
306 |
307 | #
308 | # Decrypt a payload record with the PID
309 | #
310 |
311 | def decryptRecord(data,PID):
312 | ctx = topazCryptoInit(PID)
313 | return topazCryptoDecrypt(data, ctx)
314 |
315 | #
316 | # Try to decrypt a dkey record (contains the book PID)
317 | #
318 |
319 | def decryptDkeyRecord(data,PID):
320 | record = decryptRecord(data,PID)
321 | fields = unpack("3sB8sB8s3s",record)
322 |
323 | if fields[0] != "PID" or fields[5] != "pid" :
324 | raise CMBDTCError("Didn't find PID magic numbers in record")
325 | elif fields[1] != 8 or fields[3] != 8 :
326 | raise CMBDTCError("Record didn't contain correct length fields")
327 | elif fields[2] != PID :
328 | raise CMBDTCError("Record didn't contain PID")
329 |
330 | return fields[4]
331 |
332 | #
333 | # Decrypt all the book's dkey records (contain the book PID)
334 | #
335 |
336 | def decryptDkeyRecords(data,PID):
337 | nbKeyRecords = ord(data[0])
338 | records = []
339 | data = data[1:]
340 | for i in range (0,nbKeyRecords):
341 | length = ord(data[0])
342 | try:
343 | key = decryptDkeyRecord(data[1:length+1],PID)
344 | records.append(key)
345 | except CMBDTCError:
346 | pass
347 | data = data[1+length:]
348 |
349 | return records
350 |
351 |
def createDecryptedPayload(payload):
    """Decrypt every payload record (except 'dkey') and write each one to a
    file under *payload*, with img/page/glyphs records in their own
    subdirectories (which createDecryptedBook creates beforehand)."""
    subdirs = {'img': 'img', 'page': 'page', 'glyphs': 'glyphs'}
    for name in bookHeaderRecords:
        if name == "dkey":
            continue  # key records are not part of the decrypted book
        ext = '.jpg' if name == 'img' else '.dat'
        for index in range(len(bookHeaderRecords[name])):
            fname = "%s%04d%s" % (name, index, ext)
            destdir = payload
            if name in subdirs:
                destdir = os.path.join(payload, subdirs[name])
            outputFile = os.path.join(destdir, fname)
            # bug fix: the old file(...).write(...) never explicitly closed
            # the handle; 'with' guarantees closure even on write errors
            with open(outputFile, 'wb') as out:
                out.write(getBookPayloadRecord(name, index))
370 |
371 |
# Create decrypted book
#

def createDecryptedBook(outdir):
    """Create *outdir* and its img/page/glyphs subdirectories when missing,
    then dump all decrypted payload records into them."""
    for sub in ('', 'img', 'page', 'glyphs'):
        target = os.path.join(outdir, sub) if sub else outdir
        if not os.path.exists(target):
            os.makedirs(target)

    createDecryptedPayload(outdir)
392 |
393 |
394 | #
395 | # Set the command to execute by the programm according to cmdLine parameters
396 | #
397 |
398 | def setCommand(name) :
399 | global command
400 | if command != "" :
401 | raise CMBDTCFatal("Invalid command line parameters")
402 | else :
403 | command = name
404 |
405 | #
406 | # Program usage
407 | #
408 |
409 | def usage():
410 | print("\nUsage:")
411 | print("\ncmbtc_dump_linux.py [options] bookFileName\n")
412 | print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
413 | print("-d Dumps the unencrypted book as files to outdir")
414 | print("-o Output directory to save book files to")
415 | print("-v Verbose (can be used several times)")
416 |
417 |
418 | #
419 | # Main
420 | #
421 |
422 | def main(argv=sys.argv):
423 | global bookMetadata
424 | global bookKey
425 | global bookFile
426 | global command
427 |
428 | print argv
429 |
430 | progname = os.path.basename(argv[0])
431 |
432 | verbose = 0
433 | recordName = ""
434 | recordIndex = 0
435 | outdir = ""
436 | PIDs = []
437 | command = ""
438 |
439 | # Preloads your Kindle pid from the top of the program.
440 | PIDs.append(kindlePID)
441 |
442 | try:
443 | opts, args = getopt.getopt(argv[1:], "vo:p:d")
444 | except getopt.GetoptError, err:
445 | # print help information and exit:
446 | print str(err) # will print something like "option -a not recognized"
447 | usage()
448 | sys.exit(2)
449 |
450 | if len(opts) == 0 and len(args) == 0 :
451 | usage()
452 | sys.exit(2)
453 |
454 | for o, a in opts:
455 | if o == "-v":
456 | verbose+=1
457 | if o =="-o":
458 | if a == None :
459 | raise CMBDTCFatal("Invalid parameter for -o")
460 | outdir = a
461 | if o =="-p":
462 | PIDs.append(a)
463 | if o =="-d":
464 | setCommand("doit")
465 |
466 | if command == "" :
467 | raise CMBDTCFatal("No action supplied on command line")
468 |
469 | #
470 | # Open book and parse metadata
471 | #
472 |
473 | if len(args) == 1:
474 |
475 | bookFile = openBook(args[0])
476 | parseTopazHeader()
477 | parseMetadata()
478 |
479 | #
480 | # Decrypt book key
481 | #
482 |
483 | dkey = getBookPayloadRecord('dkey', 0)
484 |
485 | bookKeys = []
486 | for PID in PIDs :
487 | bookKeys+=decryptDkeyRecords(dkey,PID)
488 |
489 | if len(bookKeys) == 0 :
490 | if verbose > 0 :
491 | print ("Book key could not be found. Maybe this book is not registered with this device.")
492 | return 1
493 | else :
494 | bookKey = bookKeys[0]
495 | if verbose > 0:
496 | print("Book key: " + bookKey.encode('hex'))
497 |
498 | if command == "printRecord" :
499 | extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
500 | if outputFile != "" and verbose>0 :
501 | print("Wrote record to file: "+outputFile)
502 | elif command == "doit" :
503 | if outdir != "" :
504 | createDecryptedBook(outdir)
505 | if verbose >0 :
506 | print ("Decrypted book saved. Don't pirate!")
507 | elif verbose > 0:
508 | print("Output directory name was not supplied.")
509 | return 1
510 |
511 | return 0
512 |
513 | if __name__ == '__main__':
514 | sys.exit(main())
515 |
--------------------------------------------------------------------------------
/topaz/convert2xml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Wrap a file-like object so every write is flushed immediately.

    All other attribute access is delegated to the wrapped stream.
    """

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        # fall through to the underlying stream for everything else
        return getattr(self.stream, attr)
13 |
import sys
# Replace stdout with the auto-flushing wrapper above so progress output is
# visible immediately even when piped to a file.
sys.stdout=Unbuffered(sys.stdout)
16 |
17 | import csv
18 | import os
19 | import getopt
20 | from struct import pack
21 | from struct import unpack
22 |
23 |
# Get a 7 bit encoded number from a file-like object. The most significant
# byte comes first; every byte except the last has its high bit (0x80) set.
# A leading 0xFF byte marks a negative value. Returns None on EOF.

def readEncodedNumber(file):
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)

    negative = False
    if data == 0xFF:          # negative-value marker precedes the digits
        negative = True
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)

    if data >= 0x80:          # multi-byte value: accumulate 7 bits at a time
        value = data & 0x7F
        while data >= 0x80:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            value = (value << 7) | (data & 0x7F)
        data = value

    return -data if negative else data
54 |
55 |
# returns a binary string that encodes a number into 7-bit groups,
# most significant byte first; every byte except the last carries the
# high bit, and negative values gain a leading 0xFF marker

def encodeNumber(number):
    negative = number < 0
    if negative:
        number = -number + 1

    chunks = []
    flag = 0
    while True:
        low = (number & 0x7F) + flag
        number >>= 7
        chunks.append(chr(low))
        flag = 0x80           # all bytes after the first get the high bit
        if number == 0:
            # a positive value whose leading byte would read as 0xFF needs an
            # extra 0x80 so it cannot be mistaken for the negative marker
            if low == 0xFF and not negative:
                chunks.append(chr(0x80))
            break

    if negative:
        chunks.append(chr(0xFF))

    return ''.join(reversed(chunks))
83 |
84 |
85 |
# create a length prefixed string

def lengthPrefixString(data):
    """Return *data* preceded by its 7-bit-encoded length."""
    prefix = encodeNumber(len(data))
    return prefix + data
90 |
def readString(file):
    """Read a length-prefixed string from *file*; '' on EOF or truncation."""
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return ""
    sv = file.read(stringLength)
    if len(sv) != stringLength:   # truncated read
        return ""
    return unpack(str(stringLength) + "s", sv)[0]
99 |
100 |
# convert a number to the hex dump of its 7-bit encoding, i.e. the byte
# values you would find inside the page*.dat files being processed

def convert(i):
    return ''.join('%02x' % ord(ch) for ch in encodeNumber(i))
111 |
112 |
113 |
114 | # the complete string table used to store all book text content
115 | # as well as the xml tokens and values that make sense out of it
116 |
117 | class Dictionary(object):
118 | def __init__(self, dictFile):
119 | self.filename = dictFile
120 | self.size = 0
121 | self.fo = file(dictFile,'rb')
122 | self.stable = []
123 | self.size = readEncodedNumber(self.fo)
124 | for i in xrange(self.size):
125 | self.stable.append(self.escapestr(readString(self.fo)))
126 | self.pos = 0
127 |
128 | def escapestr(self, str):
129 | str = str.replace('&','&')
130 | str = str.replace('<','<')
131 | str = str.replace('>','>')
132 | str = str.replace('=','=')
133 | return str
134 |
135 | def lookup(self,val):
136 | if ((val >= 0) and (val < self.size)) :
137 | self.pos = val
138 | return self.stable[self.pos]
139 | else:
140 | print "Error - %d outside of string table limits" % val
141 | sys.exit(-1)
142 |
143 | def getSize(self):
144 | return self.size
145 |
146 | def getPos(self):
147 | return self.pos
148 |
149 | def dumpDict(self):
150 | for i in xrange(self.size):
151 | print "%d %s %s" % (i, convert(i), self.stable[i])
152 | return
153 |
# parses the xml snippets that are represented by each page*.dat file.
# also parses the other0.dat file - the main stylesheet
# and information used to inject the xml snippets into page*.dat files

class PageParser(object):
    def __init__(self, filename, dict, debug, flat_xml):
        # filename: page*.dat / glyphs*.dat / other0.dat file to parse
        # dict:     the book's Dictionary (string table) instance
        #           (NOTE: the parameter name shadows the builtin 'dict')
        # debug:    verbose tracing flag
        # flat_xml: emit the flattened 'name=value' form instead of xml
        self.fo = file(filename,'rb')
        self.id = os.path.basename(filename).replace('.dat','')
        self.dict = dict
        self.debug = debug
        self.flat_xml = flat_xml
        self.tagpath = []      # stack of tag names forming the current path
        self.doc = []          # top-level parsed tags
        self.snippetList = []  # numbered xml snippets collected by doLoop72
168 |
169 |
170 | # hash table used to enable the decoding process
171 | # This has all been developed by trial and error so it may still have omissions or
172 | # contain errors
173 | # Format:
174 | # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
175 |
176 | token_tags = {
177 | 'x' : (1, 'scalar_number', 0, 0),
178 | 'y' : (1, 'scalar_number', 0, 0),
179 | 'h' : (1, 'scalar_number', 0, 0),
180 | 'w' : (1, 'scalar_number', 0, 0),
181 | 'firstWord' : (1, 'scalar_number', 0, 0),
182 | 'lastWord' : (1, 'scalar_number', 0, 0),
183 | 'rootID' : (1, 'scalar_number', 0, 0),
184 | 'stemID' : (1, 'scalar_number', 0, 0),
185 | 'type' : (1, 'scalar_text', 0, 0),
186 |
187 | 'info' : (0, 'number', 1, 0),
188 |
189 | 'info.word' : (0, 'number', 1, 1),
190 | 'info.word.ocrText' : (1, 'text', 0, 0),
191 | 'info.word.firstGlyph' : (1, 'raw', 0, 0),
192 | 'info.word.lastGlyph' : (1, 'raw', 0, 0),
193 | 'info.word.bl' : (1, 'raw', 0, 0),
194 | 'info.word.link_id' : (1, 'number', 0, 0),
195 |
196 | 'glyph' : (0, 'number', 1, 1),
197 | 'glyph.x' : (1, 'number', 0, 0),
198 | 'glyph.y' : (1, 'number', 0, 0),
199 | 'glyph.glyphID' : (1, 'number', 0, 0),
200 |
201 | 'dehyphen' : (0, 'number', 1, 1),
202 | 'dehyphen.rootID' : (1, 'number', 0, 0),
203 | 'dehyphen.stemID' : (1, 'number', 0, 0),
204 | 'dehyphen.stemPage' : (1, 'number', 0, 0),
205 | 'dehyphen.sh' : (1, 'number', 0, 0),
206 |
207 | 'links' : (0, 'number', 1, 1),
208 | 'links.page' : (1, 'number', 0, 0),
209 | 'links.rel' : (1, 'number', 0, 0),
210 | 'links.row' : (1, 'number', 0, 0),
211 | 'links.title' : (1, 'text', 0, 0),
212 | 'links.href' : (1, 'text', 0, 0),
213 | 'links.type' : (1, 'text', 0, 0),
214 |
215 | 'paraCont' : (0, 'number', 1, 1),
216 | 'paraCont.rootID' : (1, 'number', 0, 0),
217 | 'paraCont.stemID' : (1, 'number', 0, 0),
218 | 'paraCont.stemPage' : (1, 'number', 0, 0),
219 |
220 | 'paraStems' : (0, 'number', 1, 1),
221 | 'paraStems.stemID' : (1, 'number', 0, 0),
222 |
223 | 'wordStems' : (0, 'number', 1, 1),
224 | 'wordStems.stemID' : (1, 'number', 0, 0),
225 |
226 | 'empty' : (1, 'snippets', 1, 0),
227 |
228 | 'page' : (1, 'snippets', 1, 0),
229 | 'page.pageid' : (1, 'scalar_text', 0, 0),
230 | 'page.pagelabel' : (1, 'scalar_text', 0, 0),
231 | 'page.type' : (1, 'scalar_text', 0, 0),
232 | 'page.h' : (1, 'scalar_number', 0, 0),
233 | 'page.w' : (1, 'scalar_number', 0, 0),
234 | 'page.startID' : (1, 'scalar_number', 0, 0),
235 |
236 | 'group' : (1, 'snippets', 1, 0),
237 | 'group.type' : (1, 'scalar_text', 0, 0),
238 |
239 | 'region' : (1, 'snippets', 1, 0),
240 | 'region.type' : (1, 'scalar_text', 0, 0),
241 | 'region.x' : (1, 'scalar_number', 0, 0),
242 | 'region.y' : (1, 'scalar_number', 0, 0),
243 | 'region.h' : (1, 'scalar_number', 0, 0),
244 | 'region.w' : (1, 'scalar_number', 0, 0),
245 |
246 | 'empty_text_region' : (1, 'snippets', 1, 0),
247 |
248 | 'img' : (1, 'snippets', 1, 0),
249 | 'img.x' : (1, 'scalar_number', 0, 0),
250 | 'img.y' : (1, 'scalar_number', 0, 0),
251 | 'img.h' : (1, 'scalar_number', 0, 0),
252 | 'img.w' : (1, 'scalar_number', 0, 0),
253 | 'img.src' : (1, 'scalar_number', 0, 0),
254 | 'img.color_src' : (1, 'scalar_number', 0, 0),
255 |
256 | 'paragraph' : (1, 'snippets', 1, 0),
257 | 'paragraph.class' : (1, 'scalar_text', 0, 0),
258 | 'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
259 | 'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
260 |
261 | 'word_semantic' : (1, 'snippets', 1, 1),
262 | 'word_semantic.type' : (1, 'scalar_text', 0, 0),
263 | 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
264 | 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
265 |
266 | 'word' : (1, 'snippets', 1, 0),
267 | 'word.type' : (1, 'scalar_text', 0, 0),
268 | 'word.class' : (1, 'scalar_text', 0, 0),
269 | 'word.firstGlyph' : (1, 'scalar_number', 0, 0),
270 | 'word.lastGlyph' : (1, 'scalar_number', 0, 0),
271 |
272 | '_span' : (1, 'snippets', 1, 0),
273 | '_span.firstWord' : (1, 'scalar_number', 0, 0),
274 | '-span.lastWord' : (1, 'scalar_number', 0, 0),
275 |
276 | 'span' : (1, 'snippets', 1, 0),
277 | 'span.firstWord' : (1, 'scalar_number', 0, 0),
278 | 'span.lastWord' : (1, 'scalar_number', 0, 0),
279 |
280 | 'extratokens' : (1, 'snippets', 1, 0),
281 | 'extratokens.type' : (1, 'scalar_text', 0, 0),
282 | 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
283 | 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
284 |
285 | 'glyph.h' : (1, 'number', 0, 0),
286 | 'glyph.w' : (1, 'number', 0, 0),
287 | 'glyph.use' : (1, 'number', 0, 0),
288 | 'glyph.vtx' : (1, 'number', 0, 1),
289 | 'glyph.len' : (1, 'number', 0, 1),
290 | 'glyph.dpi' : (1, 'number', 0, 0),
291 | 'vtx' : (0, 'number', 1, 1),
292 | 'vtx.x' : (1, 'number', 0, 0),
293 | 'vtx.y' : (1, 'number', 0, 0),
294 | 'len' : (0, 'number', 1, 1),
295 | 'len.n' : (1, 'number', 0, 0),
296 |
297 | 'book' : (1, 'snippets', 1, 0),
298 | 'version' : (1, 'snippets', 1, 0),
299 | 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
300 | 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
301 | 'version.Schema_id' : (1, 'scalar_text', 0, 0),
302 | 'version.Schema_version' : (1, 'scalar_text', 0, 0),
303 | 'version.Topaz_version' : (1, 'scalar_text', 0, 0),
304 | 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
305 | 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
306 | 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
307 | 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
308 | 'version.chapterheaders' : (1, 'scalar_text', 0, 0),
309 | 'version.creation_date' : (1, 'scalar_text', 0, 0),
310 | 'version.header_footer' : (1, 'scalar_text', 0, 0),
311 | 'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
312 | 'version.letter_insertion' : (1, 'scalar_text', 0, 0),
313 | 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
314 | 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
315 | 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
316 | 'version.findlists' : (1, 'scalar_text', 0, 0),
317 | 'version.page_num' : (1, 'scalar_text', 0, 0),
318 | 'version.page_type' : (1, 'scalar_text', 0, 0),
319 | 'version.bad_text' : (1, 'scalar_text', 0, 0),
320 | 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
321 | 'version.margins' : (1, 'scalar_text', 0, 0),
322 | 'version.staggered_lines' : (1, 'scalar_text', 0, 0),
323 | 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
324 | 'version.toc' : (1, 'scalar_text', 0, 0),
325 |
326 | 'stylesheet' : (1, 'snippets', 1, 0),
327 | 'style' : (1, 'snippets', 1, 0),
328 | 'style._tag' : (1, 'scalar_text', 0, 0),
329 | 'style.type' : (1, 'scalar_text', 0, 0),
330 | 'style._parent_type' : (1, 'scalar_text', 0, 0),
331 | 'style.class' : (1, 'scalar_text', 0, 0),
332 | 'style._after_class' : (1, 'scalar_text', 0, 0),
333 | 'rule' : (1, 'snippets', 1, 0),
334 | 'rule.attr' : (1, 'scalar_text', 0, 0),
335 | 'rule.value' : (1, 'scalar_text', 0, 0),
336 |
337 | 'original' : (0, 'number', 1, 1),
338 | 'original.pnum' : (1, 'number', 0, 0),
339 | 'original.pid' : (1, 'text', 0, 0),
340 | 'pages' : (0, 'number', 1, 1),
341 | 'pages.ref' : (1, 'number', 0, 0),
342 | 'pages.id' : (1, 'number', 0, 0),
343 | 'startID' : (0, 'number', 1, 1),
344 | 'startID.page' : (1, 'number', 0, 0),
345 | 'startID.id' : (1, 'number', 0, 0),
346 |
347 | }
348 |
349 |
    # full tag path record keeping routines
    def tag_push(self, token):
        # push a tag name onto the current path stack
        self.tagpath.append(token)
    def tag_pop(self):
        # pop the most recent tag name (no-op when the stack is empty)
        if len(self.tagpath) > 0 :
            self.tagpath.pop()
    def tagpath_len(self):
        return len(self.tagpath)
    def get_tagpath(self, i):
        # dotted path from stack depth i to the top, e.g. 'word.class'
        # NOTE(review): if i >= len(self.tagpath), 'result' is unbound and
        # this raises NameError -- callers appear to always pass a valid depth
        cnt = len(self.tagpath)
        if i < cnt : result = self.tagpath[i]
        for j in xrange(i+1, cnt) :
            result += '.' + self.tagpath[j]
        return result


    # absolute command byte values that indicate loop mechanisms typically
    # used to generate vectors; membership tests make the duplicated 0x76
    # harmless
    cmd_list = (0x76, 0x76)
370 |
371 | # peek at and return 1 byte that is ahead by i bytes
372 | def peek(self, aheadi):
373 | c = self.fo.read(aheadi)
374 | if (len(c) == 0):
375 | return None
376 | self.fo.seek(-aheadi,1)
377 | c = c[-1:]
378 | return ord(c)
379 |
380 |
381 | # get the next value from the file being processed
382 | def getNext(self):
383 | nbyte = self.peek(1);
384 | if (nbyte == None):
385 | return None
386 | val = readEncodedNumber(self.fo)
387 | return val
388 |
389 |
    # format an arg by argtype: text args are indices into the string table,
    # everything else passes through unchanged
    def formatArg(self, arg, argtype):
        if (argtype == 'text') or (argtype == 'scalar_text') :
            result = self.dict.lookup(arg)
        elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') :
            result = arg
        elif (argtype == 'snippets') :
            result = arg
        else :
            print "Error Unknown argtype %s" % argtype
            sys.exit(-2)
        return result


    # process the next tag token, recursively handling subtags,
    # arguments, and commands; returns [tagpath, subtag-list, argtype, args]
    # (or [] for an unknown token)
    def procToken(self, token):

        known_token = False
        self.tag_push(token)

        if self.debug : print 'Processing: ', self.get_tagpath(0)
        # search from the longest dotted path down to the bare token name
        cnt = self.tagpath_len()
        for j in xrange(cnt):
            tkn = self.get_tagpath(j)
            if tkn in self.token_tags :
                num_args = self.token_tags[tkn][0]
                argtype = self.token_tags[tkn][1]
                subtags = self.token_tags[tkn][2]
                splcase = self.token_tags[tkn][3]
                ntags = -1
                known_token = True
                break

        if known_token :

            # handle subtags if present
            subtagres = []
            if (splcase == 1):
                # this type of tag uses an escape marker 0x74 to indicate a subtag count
                if self.peek(1) == 0x74:
                    skip = readEncodedNumber(self.fo)
                    subtags = 1
                    num_args = 0

            if (subtags == 1):
                ntags = readEncodedNumber(self.fo)
                if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
                for j in xrange(ntags):
                    val = readEncodedNumber(self.fo)
                    subtagres.append(self.procToken(self.dict.lookup(val)))

            # arguments can be scalars or vectors of text or numbers
            argres = []
            if num_args > 0 :
                firstarg = self.peek(1)
                if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
                    # single argument is a variable length vector of data
                    arg = readEncodedNumber(self.fo)
                    argres = self.decodeCMD(arg,argtype)
                else :
                    # num_arg scalar arguments
                    for i in xrange(num_args):
                        argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))

            # build the return tag
            result = []
            tkn = self.get_tagpath(0)
            result.append(tkn)
            result.append(subtagres)
            result.append(argtype)
            result.append(argres)
            self.tag_pop()
            return result

        # all tokens that need to be processed should be in the hash
        # table; if not it may indicate a problem, either a new token
        # or an out of sync condition
        else:
            result = []
            if (self.debug):
                print 'Unknown Token:', token
            self.tag_pop()
            return result


    # special loop used to process code snippets
    # it is NEVER used to format arguments.
    # builds the snippetList as [snippet-number, parsed-tag] pairs
    def doLoop72(self, argtype):
        cnt = readEncodedNumber(self.fo)
        if self.debug :
            result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print result
        for i in xrange(cnt):
            if self.debug: print 'Snippet:',str(i)
            snippet = []
            snippet.append(i)
            val = readEncodedNumber(self.fo)
            snippet.append(self.procToken(self.dict.lookup(val)))
            self.snippetList.append(snippet)
        return
494 |
495 |
496 |
497 | # general loop code gracisouly submitted by "skindle" - thank you!
498 | def doLoop76Mode(self, argtype, cnt, mode):
499 | result = []
500 | adj = 0
501 | if mode & 1:
502 | adj = readEncodedNumber(self.fo)
503 | mode = mode >> 1
504 | x = []
505 | for i in xrange(cnt):
506 | x.append(readEncodedNumber(self.fo) - adj)
507 | for i in xrange(mode):
508 | for j in xrange(1, cnt):
509 | x[j] = x[j] + x[j - 1]
510 | for i in xrange(cnt):
511 | result.append(self.formatArg(x[i],argtype))
512 | return result
513 |
514 |
515 | # dispatches loop commands bytes with various modes
516 | # The 0x76 style loops are used to build vectors
517 |
518 | # This was all derived by trial and error and
519 | # new loop types may exist that are not handled here
520 | # since they did not appear in the test cases
521 |
522 | def decodeCMD(self, cmd, argtype):
523 | if (cmd == 0x76):
524 |
525 | # loop with cnt, and mode to control loop styles
526 | cnt = readEncodedNumber(self.fo)
527 | mode = readEncodedNumber(self.fo)
528 |
529 | if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
530 | return self.doLoop76Mode(argtype, cnt, mode)
531 |
532 | if self.dbug: print "Unknown command", cmd
533 | result = []
534 | return result
535 |
536 |
537 |
538 | # add full tag path to injected snippets
539 | def updateName(self, tag, prefix):
540 | name = tag[0]
541 | subtagList = tag[1]
542 | argtype = tag[2]
543 | argList = tag[3]
544 | nname = prefix + '.' + name
545 | nsubtaglist = []
546 | for j in subtagList:
547 | nsubtaglist.append(self.updateName(j,prefix))
548 | ntag = []
549 | ntag.append(nname)
550 | ntag.append(nsubtaglist)
551 | ntag.append(argtype)
552 | ntag.append(argList)
553 | return ntag
554 |
555 |
556 |
    # perform depth first injection of specified snippets into this one
    # snippet is [snipno, tag]; each value in the tag's argList is treated as
    # an index into self.snippetList, and the referenced snippets are
    # (recursively) injected as subtags renamed under this tag's name, after
    # which the arg list is cleared and argtype reset to 'number'
    def injectSnippets(self, snippet):
        snipno, tag = snippet
        name = tag[0]
        subtagList = tag[1]
        argtype = tag[2]
        argList = tag[3]
        nsubtagList = []
        if len(argList) > 0 :
            for j in argList:
                asnip = self.snippetList[j]
                aso, atag = self.injectSnippets(asnip)
                atag = self.updateName(atag, name)
                nsubtagList.append(atag)
            argtype='number'
            argList=[]
        if len(nsubtagList) > 0 :
            # note: extends the existing subtag list in place
            subtagList.extend(nsubtagList)
        tag = []
        tag.append(name)
        tag.append(subtagList)
        tag.append(argtype)
        tag.append(argList)
        snippet = []
        snippet.append(snipno)
        snippet.append(tag)
        return snippet
584 |
585 |
586 |
587 | # format the tag for output
588 | def formatTag(self, node):
589 | name = node[0]
590 | subtagList = node[1]
591 | argtype = node[2]
592 | argList = node[3]
593 | fullpathname = name.split('.')
594 | nodename = fullpathname.pop()
595 | ilvl = len(fullpathname)
596 | indent = ' ' * (3 * ilvl)
597 | result = indent + '<' + nodename + '>'
598 | if len(argList) > 0:
599 | argres = ''
600 | for j in argList:
601 | if (argtype == 'text') or (argtype == 'scalar_text') :
602 | argres += j + '|'
603 | else :
604 | argres += str(j) + ','
605 | argres = argres[0:-1]
606 | if argtype == 'snippets' :
607 | result += 'snippets:' + argres
608 | else :
609 | result += argres
610 | if len(subtagList) > 0 :
611 | result += '\n'
612 | for j in subtagList:
613 | if len(j) > 0 :
614 | result += self.formatTag(j)
615 | result += indent + '' + nodename + '>\n'
616 | else:
617 | result += '' + nodename + '>\n'
618 | return result
619 |
620 |
621 | # flatten tag
622 | def flattenTag(self, node):
623 | name = node[0]
624 | subtagList = node[1]
625 | argtype = node[2]
626 | argList = node[3]
627 | result = name
628 | if (len(argList) > 0):
629 | argres = ''
630 | for j in argList:
631 | if (argtype == 'text') or (argtype == 'scalar_text') :
632 | argres += j + '|'
633 | else :
634 | argres += str(j) + '|'
635 | argres = argres[0:-1]
636 | if argtype == 'snippets' :
637 | result += '.snippets=' + argres
638 | else :
639 | result += '=' + argres
640 | result += '\n'
641 | for j in subtagList:
642 | if len(j) > 0 :
643 | result += self.flattenTag(j)
644 | return result
645 |
646 |
    # render every top-level parsed tag, either flattened or as xml
    def formatDoc(self, flat_xml):
        result = ''
        for j in self.doc :
            if len(j) > 0:
                if flat_xml:
                    result += self.flattenTag(j)
                else:
                    result += self.formatTag(j)
        if self.debug : print result
        return result



    # main loop - parse the page.dat files
    # to create structured document and snippets

    # FIXME: value at end of magic appears to be a subtags count
    # but for what? For now, inject an 'info' tag as it is in
    # every dictionary and seems close to what is meant
    # The alternative is to special case the last _ "0x5f" to mean something

    def process(self):
        # Parse the whole file and return its xml (or flattened) description.

        # peek at the first bytes to see what type of file it is
        magic = self.fo.read(9)
        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
            first_token = 'info'
        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
            skip = self.fo.read(2)
            first_token = 'info'
        elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
            first_token = 'info'
        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
            skip = self.fo.read(3)
            first_token = 'info'
        else :
            # other0.dat file
            first_token = None
            self.fo.seek(-9,1)


        # main loop to read and build the document tree
        while True:

            if first_token != None :
                # use "inserted" first token 'info' for page and glyph files
                tag = self.procToken(first_token)
                if len(tag) > 0 :
                    self.doc.append(tag)
                first_token = None

            v = self.getNext()
            if (v == None):
                break

            if (v == 0x72):
                # snippet-set loop marker
                self.doLoop72('number')
            elif (v > 0) and (v < self.dict.getSize()) :
                # a string table index names the next tag to process
                tag = self.procToken(self.dict.lookup(v))
                if len(tag) > 0 :
                    self.doc.append(tag)
            else:
                if self.debug:
                    print "Main Loop:  Unknown value: %x" % v
                if (v == 0):
                    # a 0 followed by 0x5f ('_') reinserts the 'info' token
                    if (self.peek(1) == 0x5f):
                        skip = self.fo.read(1)
                        first_token = 'info'

        # now do snippet injection
        if len(self.snippetList) > 0 :
            if self.debug : print 'Injecting Snippets:'
            snippet = self.injectSnippets(self.snippetList[0])
            snipno = snippet[0]
            tag_add = snippet[1]
            if self.debug : print self.formatTag(tag_add)
            if len(tag_add) > 0:
                self.doc.append(tag_add)

        # handle generation of xml output
        xmlpage = self.formatDoc(self.flat_xml)

        return xmlpage
731 |
732 |
733 |
def usage():
    # Print the command-line help text.
    print 'Usage: '
    print '    convert2xml.py dict0000.dat infile.dat '
    print ' '
    print ' Options:'
    print '   -h            print this usage help message '
    print '   -d            turn on debug output to check for potential errors '
    print '   --flat-xml    output the flattened xml page description only '
    print ' '
    print '     This program will attempt to convert a page*.dat file or '
    print '     glyphs*.dat file, using the dict0000.dat file, to its xml description. '
    print ' '
    print '     Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump '
    print '     the *.dat files from a Topaz format e-book.'

#
# Main
#

def main(argv):
    # When called with an empty argv (as the __main__ guard does), fall back
    # to sys.argv and print the generated xml; when called programmatically
    # with a real argv, return the xml string instead.
    dictFile = ""
    pageFile = ""
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])

    except getopt.GetoptError, err:

        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)

    for o, a in opts:
        if o =="-d":
            debug=True
        if o =="-h":
            usage()
            sys.exit(0)
        if o =="--flat-xml":
            flat_xml = True

    # NOTE(review): args length is not validated; fewer than two positional
    # arguments raises IndexError here
    dictFile, pageFile = args[0], args[1]

    # read in the string table dictionary
    dict = Dictionary(dictFile)
    # dict.dumpDict()

    # create a page parser
    pp = PageParser(pageFile, dict, debug, flat_xml)

    xmlpage = pp.process()

    if printOutput:
        print xmlpage
        return 0

    return xmlpage

if __name__ == '__main__':
    sys.exit(main(''))
--------------------------------------------------------------------------------
/topaz/decode_meta.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
5 | import csv
6 | import sys
7 | import os
8 | import getopt
9 | from struct import pack
10 | from struct import unpack
11 |
12 | #
13 | # Get a 7 bit encoded number from string
14 | #
15 |
def readEncodedNumber(file):
    """Decode one 7-bit variable-length integer from an open file.

    A leading 0xFF byte marks a negative value.  Bytes with the high bit
    set each carry 7 payload bits; the first byte without the high bit
    ends the number.  Returns None on end of file (including mid-number).
    """
    negative = False
    byte = file.read(1)
    if (len(byte) == 0):
        return None
    value = ord(byte)

    if value == 0xFF:
        # negative marker -- the magnitude follows in the next bytes
        negative = True
        byte = file.read(1)
        if (len(byte) == 0):
            return None
        value = ord(byte)

    if value >= 0x80:
        # multi-byte form: accumulate 7 bits per continuation byte
        accum = value & 0x7F
        while value >= 0x80 :
            byte = file.read(1)
            if (len(byte) == 0):
                return None
            value = ord(byte)
            accum = (accum << 7) + (value & 0x7F)
        value = accum

    if negative:
        value = -value
    return value
43 |
44 | #
45 | # Encode a number in 7 bit format
46 | #
47 |
def encodeNumber(number):
    """Encode an integer in the Topaz 7-bit variable-length format.

    Continuation bytes carry the high bit; negative values are prefixed
    with 0xFF.  NOTE(review): negatives are stored as -number + 1, so
    readEncodedNumber returns n-1 for a negative n -- kept as-is for
    format parity with the rest of the Topaz tooling.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    pieces = []
    continuation = 0
    while True:
        low = (number & 0x7F) + continuation
        number = number >> 7
        pieces.append(chr(low))
        continuation = 0x80
        if number == 0:
            # a bare terminal 0xFF would look like the negative marker,
            # so pad it with an extra 0x80 continuation byte
            if low == 0xFF and not negative:
                pieces.append(chr(0x80))
            break

    if negative:
        pieces.append(chr(0xFF))

    # bytes were produced least-significant first; emit high byte first
    return ''.join(reversed(pieces))
72 |
73 | #
74 | # Get a length prefixed string from the file
75 | #
def lengthPrefixString(data):
    """Return *data* preceded by its length as a 7-bit encoded number."""
    prefix = encodeNumber(len(data))
    return prefix + data
78 |
def readString(file):
    """Read one length-prefixed string from the file.

    Returns None on end of file, and '' when the payload is truncated.
    """
    count = readEncodedNumber(file)
    if (count == None):
        return None
    raw = file.read(count)
    if (len(raw) != count):
        # truncated file: signal with an empty string
        return ""
    return unpack(str(count) + "s", raw)[0]
87 |
88 |
89 |
def getMetaArray(metaFile):
    """Parse the Topaz metadata file into a dict mapping name -> value.

    The file holds a 7-bit encoded entry count followed by that many
    (key, value) length-prefixed string pairs.
    """
    result = {}
    fo = file(metaFile,'rb')
    try:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            temp = readString(fo)
            result[temp] = readString(fo)
    finally:
        # close the handle even if a read raises mid-file (the original
        # leaked the descriptor on any exception)
        fo.close()
    return result
100 |
101 |
102 |
def getMetaData(metaFile):
    """Render the Topaz metadata file as 'key|value' lines, one per entry."""
    result = ''
    fo = file(metaFile,'rb')
    try:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            result += readString(fo) + '|'
            result += readString(fo) + '\n'
    finally:
        # close the handle even if a read raises mid-file (the original
        # leaked the descriptor on any exception)
        fo.close()
    return result
113 |
--------------------------------------------------------------------------------
/topaz/flatxml2html.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
5 | import sys
6 | import csv
7 | import os
8 | import math
9 | import getopt
10 | from struct import pack
11 | from struct import unpack
12 |
13 |
14 | class DocParser(object):
15 | def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage):
16 | self.id = os.path.basename(fileid).replace('.dat','')
17 | self.svgcount = 0
18 | self.docList = flatxml.split('\n')
19 | self.docSize = len(self.docList)
20 | self.classList = {}
21 | self.bookDir = bookDir
22 | self.glyphPaths = { }
23 | self.numPaths = 0
24 | tmpList = classlst.split('\n')
25 | for pclass in tmpList:
26 | if pclass != '':
27 | # remove the leading period from the css name
28 | cname = pclass[1:]
29 | self.classList[cname] = True
30 | self.fixedimage = fixedimage
31 | self.ocrtext = []
32 | self.link_id = []
33 | self.link_title = []
34 | self.link_page = []
35 | self.link_href = []
36 | self.link_type = []
37 | self.dehyphen_rootid = []
38 | self.paracont_stemid = []
39 | self.parastems_stemid = []
40 |
41 |
42 | def getGlyph(self, gid):
43 | result = ''
44 | id='gl%d' % gid
45 | return self.glyphPaths[id]
46 |
47 |
    # Render a list of glyph indices as a standalone svg image file
    # (img/<pageid>_NNNN.svg) built from the shared svg/glyphs.svg table.
    def glyphs_to_image(self, glyphList):

        # pull an integer attribute value (e.g. width=NN) out of a path line
        def extract(path, key):
            b = path.find(key) + len(key)
            e = path.find(' ',b)
            return int(path[b:e])

        # pull a quoted attribute value (e.g. id="...") out of a path line
        def extractID(path, key):
            b = path.find(key) + len(key)
            e = path.find('"',b)
            return path[b:e]


        svgDir = os.path.join(self.bookDir,'svg')
        glyfile = os.path.join(svgDir,'glyphs.svg')

        imgDir = os.path.join(self.bookDir,'img')
        imgname = self.id + '_%04d.svg' % self.svgcount
        imgfile = os.path.join(imgDir,imgname)

        # build hashtable of glyph paths keyed by glyph id
        # (done once per DocParser; later calls reuse self.glyphPaths)
        if self.numPaths == 0:
            gfile = open(glyfile, 'r')
            while True:
                path = gfile.readline()
                if (path == ''): break
                glyphid = extractID(path,'id="')
                self.glyphPaths[glyphid] = path
                self.numPaths += 1
            gfile.close()


        # get glyph information
        gxList = self.getData('info.glyph.x',0,-1)
        gyList = self.getData('info.glyph.y',0,-1)
        gidList = self.getData('info.glyph.glyphID',0,-1)

        gids = []
        maxws = []
        maxhs = []
        xs = []
        ys = []
        gdefs = []

        # get path definitions, positions, dimensions for each glyph
        # that makes up the image, and find min x and min y to reposition origin
        minx = -1
        miny = -1
        for j in glyphList:
            gid = gidList[j]
            gids.append(gid)

            xs.append(gxList[j])
            if minx == -1: minx = gxList[j]
            else : minx = min(minx, gxList[j])

            ys.append(gyList[j])
            if miny == -1: miny = gyList[j]
            else : miny = min(miny, gyList[j])

            path = self.getGlyph(gid)
            gdefs.append(path)

            maxws.append(extract(path,'width='))
            maxhs.append(extract(path,'height='))


        # change the origin to minx, miny and calc max height and width
        maxw = maxws[0] + xs[0] - minx
        maxh = maxhs[0] + ys[0] - miny
        for j in xrange(0, len(xs)):
            xs[j] = xs[j] - minx
            ys[j] = ys[j] - miny
            maxw = max( maxw, (maxws[j] + xs[j]) )
            maxh = max( maxh, (maxhs[j] + ys[j]) )

        # open the image file for output
        # NOTE(review): the svg markup written below appears truncated in
        # this copy of the file (the literals lost their tag text and
        # several source lines are missing); verify against the original
        # topaz scripts before editing.
        ifile = open(imgfile,'w')
        ifile.write('\n')
        ifile.write('\n')
        ifile.write('')
        ifile.close()

        return 0
139 |
140 |
141 |
142 | # return tag at line pos in document
143 | def lineinDoc(self, pos) :
144 | if (pos >= 0) and (pos < self.docSize) :
145 | item = self.docList[pos]
146 | if item.find('=') >= 0:
147 | (name, argres) = item.split('=',1)
148 | else :
149 | name = item
150 | argres = ''
151 | return name, argres
152 |
153 |
154 | # find tag in doc if within pos to end inclusive
155 | def findinDoc(self, tagpath, pos, end) :
156 | result = None
157 | if end == -1 :
158 | end = self.docSize
159 | else:
160 | end = min(self.docSize, end)
161 | foundat = -1
162 | for j in xrange(pos, end):
163 | item = self.docList[j]
164 | if item.find('=') >= 0:
165 | (name, argres) = item.split('=',1)
166 | else :
167 | name = item
168 | argres = ''
169 | if name.endswith(tagpath) :
170 | result = argres
171 | foundat = j
172 | break
173 | return foundat, result
174 |
175 |
176 | # return list of start positions for the tagpath
177 | def posinDoc(self, tagpath):
178 | startpos = []
179 | pos = 0
180 | res = ""
181 | while res != None :
182 | (foundpos, res) = self.findinDoc(tagpath, pos, -1)
183 | if res != None :
184 | startpos.append(foundpos)
185 | pos = foundpos + 1
186 | return startpos
187 |
188 |
189 | # returns a vector of integers for the tagpath
190 | def getData(self, tagpath, pos, end):
191 | argres=[]
192 | (foundat, argt) = self.findinDoc(tagpath, pos, end)
193 | if (argt != None) and (len(argt) > 0) :
194 | argList = argt.split('|')
195 | argres = [ int(strval) for strval in argList]
196 | return argres
197 |
198 |
199 | # get the class
200 | def getClass(self, pclass):
201 | nclass = pclass
202 |
203 | # class names are an issue given topaz may start them with numerals (not allowed),
204 | # use a mix of cases (which cause some browsers problems), and actually
205 | # attach numbers after "_reclustered*" to the end to deal classeses that inherit
206 | # from a base class (but then not actually provide all of these _reclustereed
207 | # classes in the stylesheet!
208 |
209 | # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
210 | # that exists in the stylesheet first, and then adding this specific class
211 | # after
212 |
213 | # also some class names have spaces in them so need to convert to dashes
214 | if nclass != None :
215 | nclass = nclass.replace(' ','-')
216 | classres = ''
217 | nclass = nclass.lower()
218 | nclass = 'cl-' + nclass
219 | baseclass = ''
220 | # graphic is the base class for captions
221 | if nclass.find('cl-cap-') >=0 :
222 | classres = 'graphic' + ' '
223 | else :
224 | # strip to find baseclass
225 | p = nclass.find('_')
226 | if p > 0 :
227 | baseclass = nclass[0:p]
228 | if baseclass in self.classList:
229 | classres += baseclass + ' '
230 | classres += nclass
231 | nclass = classres
232 | return nclass
233 |
234 |
    # develop a sorted description of the starting positions of
    # groups and regions on the page, as well as the page type
    def PageDescription(self):

        # py2 cmp-style comparator: order entries by line position only
        def compare(x, y):
            (xtype, xval) = x
            (ytype, yval) = y
            if xval > yval:
                return 1
            if xval == yval:
                return 0
            return -1

        result = []
        (pos, pagetype) = self.findinDoc('page.type',0,-1)

        groupList = self.posinDoc('page.group')
        groupregionList = self.posinDoc('page.group.region')
        pageregionList = self.posinDoc('page.region')
        # integrate into one list of (kind, line-position) entries
        for j in groupList:
            result.append(('grpbeg',j))
        for j in groupregionList:
            result.append(('gregion',j))
        for j in pageregionList:
            result.append(('pregion',j))
        result.sort(compare)

        # insert group end and page end indicators: a group closes when
        # another group begins or a page-level region appears
        inGroup = False
        j = 0
        while True:
            if j == len(result): break
            rtype = result[j][0]
            rval = result[j][1]
            if not inGroup and (rtype == 'grpbeg') :
                inGroup = True
                j = j + 1
            elif inGroup and (rtype in ('grpbeg', 'pregion')):
                result.insert(j,('grpend',rval))
                inGroup = False
            else:
                j = j + 1
        if inGroup:
            result.append(('grpend',-1))
        result.append(('pageend', -1))
        return pagetype, result
282 |
283 |
284 |
    # build a description of the paragraph
    # returns (css class, list of (type, index) entries) where type is
    # 'ocr' (word index), 'img'/'imgsa' (image index) or 'svg' (svg index)
    def getParaDescription(self, start, end, regtype):

        result = []

        # paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end)

        pclass = self.getClass(pclass)

        # build up a description of the paragraph in result and return it
        # first check for the basic - all words paragraph
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            first = int(sfirst)
            last = int(slast)

            # vertical/table (and optionally fixed or inverted) text can
            # not be rendered as ocr words; convert it to an svg image
            makeImage = (regtype == 'vertical') or (regtype == 'table')
            if self.fixedimage:
                makeImage = makeImage or (regtype == 'fixed')

            if (pclass != None):
                makeImage = makeImage or (pclass.find('.inverted') >= 0)
                if self.fixedimage :
                    makeImage = makeImage or (pclass.find('cl-f-') >= 0)

            if not makeImage :
                # standard all word paragraph
                for wordnum in xrange(first, last):
                    result.append(('ocr', wordnum))
                return pclass, result

            # convert paragraph to svg image
            # translate first and last word into first and last glyphs
            # and generate inline image and include it
            glyphList = []
            firstglyphList = self.getData('word.firstGlyph',0,-1)
            gidList = self.getData('info.glyph.glyphID',0,-1)
            firstGlyph = firstglyphList[first]
            if last < len(firstglyphList):
                lastGlyph = firstglyphList[last]
            else :
                lastGlyph = len(gidList)
            for glyphnum in xrange(firstGlyph, lastGlyph):
                glyphList.append(glyphnum)
            # include any extratokens if they exist
            (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
            (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
            if (sfg != None) and (slg != None):
                for glyphnum in xrange(int(sfg), int(slg)):
                    glyphList.append(glyphnum)
            num = self.svgcount
            self.glyphs_to_image(glyphList)
            self.svgcount += 1
            result.append(('svg', num))
            return pclass, result

        # this type of paragraph may be made up of multiple spans, inline
        # word monograms (images), and words with semantic meaning,
        # plus glyphs used to form starting letter of first word

        # need to parse this type line by line
        line = start + 1
        word_class = ''

        # if end is -1 then we must search to end of document
        if end == -1 :
            end = self.docSize

        # seems some xml has last* coming before first* so we have to
        # handle any order
        sp_first = -1
        sp_last = -1

        gl_first = -1
        gl_last = -1

        ws_first = -1
        ws_last = -1

        word_class = ''

        while (line < end) :

            (name, argres) = self.lineinDoc(line)

            if name.endswith('span.firstWord') :
                sp_first = int(argres)

            elif name.endswith('span.lastWord') :
                sp_last = int(argres)

            elif name.endswith('word.firstGlyph') :
                gl_first = int(argres)

            elif name.endswith('word.lastGlyph') :
                gl_last = int(argres)

            elif name.endswith('word_semantic.firstWord'):
                ws_first = int(argres)

            elif name.endswith('word_semantic.lastWord'):
                ws_last = int(argres)

            elif name.endswith('word.class'):
                # e.g. 'spaceafter-1': remember to append a space ('sa')
                (cname, space) = argres.split('-',1)
                if space == '' : space = '0'
                if (cname == 'spaceafter') and (int(space) > 0) :
                    word_class = 'sa'

            elif name.endswith('word.img.src'):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

            elif name.endswith('region.img.src'):
                result.append(('img' + word_class, int(argres)))

            # flush a completed span / glyph run / semantic-word run as
            # soon as both of its ends have been seen (in either order)
            if (sp_first != -1) and (sp_last != -1):
                for wordnum in xrange(sp_first, sp_last):
                    result.append(('ocr', wordnum))
                sp_first = -1
                sp_last = -1

            if (gl_first != -1) and (gl_last != -1):
                glyphList = []
                for glyphnum in xrange(gl_first, gl_last):
                    glyphList.append(glyphnum)
                num = self.svgcount
                self.glyphs_to_image(glyphList)
                self.svgcount += 1
                result.append(('svg', num))
                gl_first = -1
                gl_last = -1

            if (ws_first != -1) and (ws_last != -1):
                for wordnum in xrange(ws_first, ws_last):
                    result.append(('ocr', wordnum))
                ws_first = -1
                ws_last = -1

            line += 1

        return pclass, result
429 |
430 |
    # render one paragraph description (from getParaDescription) as html;
    # type is 'full' | 'begin' | 'middle' | 'end' for paragraphs that are
    # split across pages.
    # NOTE(review): several html literals in this method appear truncated
    # in this copy of the file (tag text missing); verify against the
    # original topaz scripts before editing them.
    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''

        classres = ''
        if pclass :
            classres = ' class="' + pclass + '"'

        # these region types turn '_lb_' markers into hard line breaks
        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')

        handle_links = len(self.link_id) > 0

        if (type == 'full') or (type == 'begin') :
            parares += ''

        if (type == 'end'):
            parares += ' '

        lstart = len(parares)

        cnt = len(pdesc)

        for j in xrange( 0, cnt) :

            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
                word = self.ocrtext[num]
                sep = ' '

                if handle_links:
                    # link ids are 1-based; 0 means no link at this word
                    link = self.link_id[num]
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
                        if (title == "") or (parares.rfind(title) < 0):
                            title=parares[lstart:]
                        if linktype == 'external' :
                            linkhref = self.link_href[link-1]
                            linkhtml = '' % linkhref
                        else :
                            if len(self.link_page) >= link :
                                ptarget = self.link_page[link-1] - 1
                                linkhtml = '' % ptarget
                            else :
                                # just link to the current page
                                linkhtml = ''
                        linkhtml += title + ''
                        # splice the link markup in over the title text
                        pos = parares.rfind(title)
                        if pos >= 0:
                            parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                        else :
                            parares += linkhtml
                        lstart = len(parares)
                        if word == '_link_' : word = ''
                    elif (link < 0) :
                        if word == '_link_' : word = ''

                if word == '_lb_':
                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
                        word = ''
                        sep = ''
                    elif br_lb :
                        word = '
\n'
                        sep = ''
                    else :
                        word = '\n'
                        sep = ''

                if num in self.dehyphen_rootid :
                    # drop the trailing hyphen and join with the next word
                    word = word[0:-1]
                    sep = ''

                parares += word + sep

            elif wtype == 'img' :
                sep = ''
                parares += '
' % num
                parares += sep

            elif wtype == 'imgsa' :
                sep = ' '
                parares += '
' % num
                parares += sep

            elif wtype == 'svg' :
                sep = ''
                parares += '
' % num
                parares += sep

        # drop the trailing separator, then close the paragraph if needed
        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '
'
        return parares
525 |
526 |
527 |
    # walk the document tree collecting the information needed
    # to build an html page using the ocrText
    # NOTE(review): many html literals in this method appear truncated in
    # this copy of the file (tag text missing, some source lines absent);
    # verify against the original topaz scripts before editing them.
    def process(self):

        htmlpage = ''

        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres : self.ocrtext = argres.split('|')

        # get information to dehyphenate the text
        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)

        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
        first_para_continued = (self.parastems_stemid != None)

        # determine if last paragraph is continued onto the next page
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
        last_para_continued = (self.paracont_stemid != None)

        # collect link ids
        self.link_id = self.getData('info.word.link_id',0,-1)

        # collect link destination page numbers
        self.link_page = self.getData('info.links.page',0,-1)

        # collect link types (container versus external)
        (pos, argres) = self.findinDoc('info.links.type',0,-1)
        if argres : self.link_type = argres.split('|')

        # collect link destinations
        (pos, argres) = self.findinDoc('info.links.href',0,-1)
        if argres : self.link_href = argres.split('|')

        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')

        # get a descriptions of the starting points of the regions
        # and groups on the page
        (pagetype, pageDesc) = self.PageDescription()
        regcnt = len(pageDesc) - 1

        anchorSet = False
        breakSet = False
        inGroup = False

        # process each region on the page and convert what you can to html

        for j in xrange(regcnt):

            (etype, start) = pageDesc[j]
            (ntype, end) = pageDesc[j+1]


            # set anchor for link target on this page
            if not anchorSet and not first_para_continued:
                htmlpage += '\n'
                anchorSet = True

            # handle groups of graphics with text captions
            if (etype == 'grpbeg'):
                (pos, grptype) = self.findinDoc('group.type', start, end)
                if grptype != None:
                    if grptype == 'graphic':
                        gcstr = ' class="' + grptype + '"'
                        htmlpage += ''
                        inGroup = True

            elif (etype == 'grpend'):
                if inGroup:
                    htmlpage += '
\n'
                    inGroup = False

            else:
                (pos, regtype) = self.findinDoc('region.type',start,end)

                if regtype == 'graphic' :
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        if inGroup:
                            htmlpage += '
' % int(simgsrc)
                        else:
                            htmlpage += '' % int(simgsrc)

                elif regtype == 'chapterheading' :
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if not breakSet:
                        htmlpage += '
\n'
                        breakSet = True
                    # pick the heading level from the chN- class prefix
                    tag = 'h1'
                    if pclass and (len(pclass) >= 7):
                        if pclass[3:7] == 'ch1-' : tag = 'h1'
                        if pclass[3:7] == 'ch2-' : tag = 'h2'
                        if pclass[3:7] == 'ch3-' : tag = 'h3'
                        htmlpage += '<' + tag + ' class="' + pclass + '">'
                    else:
                        htmlpage += '<' + tag + '>'
                    htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                    htmlpage += '' + tag + '>'

                elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
                    ptype = 'full'
                    # check to see if this is a continution from the previous page
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                        tag = 'p'
                        if pclass[3:6] == 'h1-' : tag = 'h4'
                        if pclass[3:6] == 'h2-' : tag = 'h5'
                        if pclass[3:6] == 'h3-' : tag = 'h6'
                        htmlpage += '<' + tag + ' class="' + pclass + '">'
                        htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                        htmlpage += '' + tag + '>'
                    else :
                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)

                elif (regtype == 'tocentry') :
                    ptype = 'full'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


                elif (regtype == 'vertical') or (regtype == 'table') :
                    ptype = 'full'
                    if inGroup:
                        ptype = 'middle'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start, end, regtype)
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


                elif (regtype == 'synth_fcvr.center'):
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        htmlpage += '' % int(simgsrc)

                else :
                    # unknown region: guess text vs graphic from its contents
                    print ' Making region type', regtype,
                    (pos, temp) = self.findinDoc('paragraph',start,end)
                    (pos2, temp) = self.findinDoc('span',start,end)
                    if pos != -1 or pos2 != -1:
                        print ' a "text" region'
                        orig_regtype = regtype
                        regtype = 'fixed'
                        ptype = 'full'
                        # check to see if this is a continution from the previous page
                        if first_para_continued :
                            ptype = 'end'
                            first_para_continued = False
                        (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                        if not pclass:
                            # derive alignment from the region type suffix
                            if orig_regtype.endswith('.right') : pclass = 'cl-right'
                            elif orig_regtype.endswith('.center') : pclass = 'cl-center'
                            elif orig_regtype.endswith('.left') : pclass = 'cl-left'
                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
                        if pclass and (ptype == 'full') and (len(pclass) >= 6):
                            tag = 'p'
                            if pclass[3:6] == 'h1-' : tag = 'h4'
                            if pclass[3:6] == 'h2-' : tag = 'h5'
                            if pclass[3:6] == 'h3-' : tag = 'h6'
                            htmlpage += '<' + tag + ' class="' + pclass + '">'
                            htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                            htmlpage += '' + tag + '>'
                        else :
                            htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                    else :
                        print ' a "graphic" region'
                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
                        if simgsrc:
                            htmlpage += '' % int(simgsrc)


        if last_para_continued :
            # trim the trailing close tag so the next page can continue it
            if htmlpage[-4:] == '
':
                htmlpage = htmlpage[0:-4]
            last_para_continued = False

        return htmlpage
720 |
721 |
722 |
def convert2HTML(flatxml, classlst, fileid, bookDir, fixedimage):
    """Convert one page of flattened xml into an html fragment."""
    # all of the work is delegated to a per-page DocParser
    return DocParser(flatxml, classlst, fileid, bookDir, fixedimage).process()
731 |
--------------------------------------------------------------------------------
/topaz/genhtml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Stream proxy that flushes after every write."""
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        # push the data through and force it out immediately
        self.stream.write(data)
        self.stream.flush()
    def __getattr__(self, attr):
        # everything else behaves exactly like the wrapped stream
        return getattr(self.stream, attr)
13 |
14 | import sys
15 | sys.stdout=Unbuffered(sys.stdout)
16 |
17 |
18 | import os, getopt
19 |
20 | # local routines
21 | import convert2xml
22 | import flatxml2html
23 | import decode_meta
24 | import stylexml2css
25 | import getpagedim
26 |
27 | def usage():
28 | print 'Usage: '
29 | print ' '
30 | print ' genhtml.py [--fixed-image] unencryptedBookDir'
31 | print ' '
32 | print ' Options: '
33 | print ' --fixed-image : force translation of fixed regions into svg images '
34 | print ' '
35 |
36 |
37 | def main(argv):
38 | bookDir = ''
39 | fixedimage = False
40 |
41 | if len(argv) == 0:
42 | argv = sys.argv
43 |
44 | try:
45 | opts, args = getopt.getopt(argv[1:], "h:",["fixed-image"])
46 |
47 | except getopt.GetoptError, err:
48 | print str(err)
49 | usage()
50 | sys.exit(1)
51 |
52 | if len(opts) == 0 and len(args) == 0 :
53 | usage()
54 | sys.exit(1)
55 |
56 | for o, a in opts:
57 | if o =="-h":
58 | usage()
59 | sys.exit(0)
60 | if o =="--fixed-image":
61 | fixedimage = True
62 |
63 | bookDir = args[0]
64 |
65 | if not os.path.exists(bookDir) :
66 | print "Can not find directory with unencrypted book"
67 | sys.exit(1)
68 |
69 | dictFile = os.path.join(bookDir,'dict0000.dat')
70 |
71 | if not os.path.exists(dictFile) :
72 | print "Can not find dict0000.dat file"
73 | sys.exit(1)
74 |
75 | pageDir = os.path.join(bookDir,'page')
76 | if not os.path.exists(pageDir) :
77 | print "Can not find page directory in unencrypted book"
78 | sys.exit(1)
79 |
80 | imgDir = os.path.join(bookDir,'img')
81 | if not os.path.exists(imgDir) :
82 | print "Can not find image directory in unencrypted book"
83 | sys.exit(1)
84 |
85 | svgDir = os.path.join(bookDir,'svg')
86 | if not os.path.exists(svgDir) :
87 | print "Can not find svg directory in unencrypted book"
88 | print "please run gensvg.py before running genhtml.py"
89 | sys.exit(1)
90 |
91 | otherFile = os.path.join(bookDir,'other0000.dat')
92 | if not os.path.exists(otherFile) :
93 | print "Can not find other0000.dat in unencrypted book"
94 | sys.exit(1)
95 |
96 | metaFile = os.path.join(bookDir,'metadata0000.dat')
97 | if not os.path.exists(metaFile) :
98 | print "Can not find metadata0000.dat in unencrypted book"
99 | sys.exit(1)
100 |
101 | htmlFileName = "book.html"
102 | htmlstr = '\n'
103 | htmlstr += '\n'
104 |
105 | filenames = os.listdir(pageDir)
106 | filenames = sorted(filenames)
107 |
108 | print 'Processing ... '
109 |
110 | htmlstr += '\n'
111 | htmlstr += '\n'
112 |
113 | # process metadata and retrieve fontSize info
114 | print ' ', 'metadata0000.dat'
115 | fname = os.path.join(bookDir,'metadata0000.dat')
116 | xname = os.path.join(bookDir, 'metadata.txt')
117 | metastr = decode_meta.getMetaData(fname)
118 | file(xname, 'wb').write(metastr)
119 | meta_array = decode_meta.getMetaArray(fname)
120 |
121 | htmlstr += '' + meta_array['Title'] + ' by ' + meta_array['Authors'] + '\n'
122 | htmlstr += '\n'
123 | htmlstr += '\n'
124 |
125 | # get some scaling info from metadata to use while processing styles
126 | fontsize = '135'
127 | if 'fontSize' in meta_array:
128 | fontsize = meta_array['fontSize']
129 |
130 | # also get the size of a normal text page
131 | spage = '1'
132 | if 'firstTextPage' in meta_array:
133 | spage = meta_array['firstTextPage']
134 | pnum = int(spage)
135 |
136 | # get page height and width from first text page for use in stylesheet scaling
137 | pname = 'page%04d.dat' % (pnum + 1)
138 | fname = os.path.join(pageDir,pname)
139 | pargv=[]
140 | pargv.append('convert2xml.py')
141 | pargv.append('--flat-xml')
142 | pargv.append(dictFile)
143 | pargv.append(fname)
144 | flat_xml = convert2xml.main(pargv)
145 | (ph, pw) = getpagedim.getPageDim(flat_xml)
146 | if (ph == '-1') or (ph == '0') : ph = '11000'
147 | if (pw == '-1') or (pw == '0') : pw = '8500'
148 |
149 | # now build up the style sheet
150 | print ' ', 'other0000.dat'
151 | fname = os.path.join(bookDir,'other0000.dat')
152 | xname = os.path.join(bookDir, 'style.css')
153 | pargv=[]
154 | pargv.append('convert2xml.py')
155 | pargv.append('--flat-xml')
156 | pargv.append(dictFile)
157 | pargv.append(fname)
158 | xmlstr = convert2xml.main(pargv)
159 | cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize, ph, pw)
160 | file(xname, 'wb').write(cssstr)
161 | htmlstr += '\n'
162 | htmlstr += '\n\n'
163 |
164 | for filename in filenames:
165 | print ' ', filename
166 | fname = os.path.join(pageDir,filename)
167 | pargv=[]
168 | pargv.append('convert2xml.py')
169 | pargv.append('--flat-xml')
170 | pargv.append(dictFile)
171 | pargv.append(fname)
172 | flat_xml = convert2xml.main(pargv)
173 | htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, fixedimage)
174 |
175 | htmlstr += '\n\n'
176 |
177 | file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
178 | print 'Processing Complete'
179 |
180 | return 0
181 |
182 | if __name__ == '__main__':
183 | sys.exit(main(''))
184 |
185 |
186 |
--------------------------------------------------------------------------------
/topaz/gensvg.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Wrap a stream so every write is flushed immediately."""
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        self.stream.write(data)
        self.stream.flush()   # make output visible right away
    def __getattr__(self, attr):
        # delegate any other attribute to the underlying stream
        return getattr(self.stream, attr)
13 |
14 | import sys
15 | sys.stdout=Unbuffered(sys.stdout)
16 |
17 | import os, getopt
18 |
19 | # local routines
20 | import convert2xml
21 | import decode_meta
22 |
23 |
24 | class GParser(object):
    def __init__(self, flatxml):
        """Collect glyph geometry tables from a flattened glyph xml doc.

        The lists are indexed by glyph number; gvtx/glen get an extra
        sentinel entry appended so slices [g] : [g+1] also cover the
        final glyph.
        """
        self.flatdoc = flatxml.split('\n')
        # output resolution; per-glyph coordinates are rescaled to this
        self.dpi = 1440
        self.gh = self.getData('info.glyph.h')
        self.gw = self.getData('info.glyph.w')
        self.guse = self.getData('info.glyph.use')
        if self.guse :
            self.count = len(self.guse)
        else :
            self.count = 0
        self.gvtx = self.getData('info.glyph.vtx')
        self.glen = self.getData('info.glyph.len')
        self.gdpi = self.getData('info.glyph.dpi')
        self.vx = self.getData('info.vtx.x')
        self.vy = self.getData('info.vtx.y')
        self.vlen = self.getData('info.len.n')
        # sentinel end offsets (see docstring)
        if self.vlen :
            self.glen.append(len(self.vlen))
        elif self.glen:
            self.glen.append(0)
        if self.vx :
            self.gvtx.append(len(self.vx))
        elif self.gvtx :
            self.gvtx.append(0)
49 |
50 | def getData(self, path):
51 | result = None
52 | cnt = len(self.flatdoc)
53 | for j in xrange(cnt):
54 | item = self.flatdoc[j]
55 | if item.find('=') >= 0:
56 | (name, argt) = item.split('=')
57 | argres = argt.split('|')
58 | else:
59 | name = item
60 | argres = []
61 | if (name == path):
62 | result = argres
63 | break
64 | if (len(argres) > 0) :
65 | for j in xrange(0,len(argres)):
66 | argres[j] = int(argres[j])
67 | return result
68 |
69 |
70 | def getGlyphDim(self, gly):
71 | maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly]
72 | maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly]
73 | return maxh, maxw
74 |
75 |
76 | def getPath(self, gly):
77 | path = ''
78 | if (gly < 0) or (gly >= self.count):
79 | return path
80 | tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]]
81 | ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]]
82 | p = 0
83 | for k in xrange(self.glen[gly], self.glen[gly+1]):
84 | if (p == 0):
85 | zx = tx[0:self.vlen[k]+1]
86 | zy = ty[0:self.vlen[k]+1]
87 | else:
88 | zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
89 | zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
90 | p += 1
91 | j = 0
92 | while ( j < len(zx) ):
93 | if (j == 0):
94 | # Start Position.
95 | path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
96 | elif (j <= len(zx)-3):
97 | # Cubic Bezier Curve
98 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly])
99 | j += 2
100 | elif (j == len(zx)-2):
101 | # Cubic Bezier Curve to Start Position
102 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
103 | j += 1
104 | elif (j == len(zx)-1):
105 | # Quadratic Bezier Curve to Start Position
106 | path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
107 |
108 | j += 1
109 | path += 'z'
110 | return path
111 |
class PParser(object):
    """Parser for the flattened page XML of a Topaz book: page size,
    image placements, and glyph placements.

    Entries are matched by name *suffix* (endswith), unlike GParser's
    exact-name lookup.
    """

    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
        self.temp = []
        # Page dimensions; some books store them under book.h/book.w.
        foo = self.getData('page.h') or self.getData('book.h')
        self.ph = foo[0]
        foo = self.getData('page.w') or self.getData('book.w')
        self.pw = foo[0]
        self.gx = self.getData('info.glyph.x')        # glyph x positions
        self.gy = self.getData('info.glyph.y')        # glyph y positions
        self.gid = self.getData('info.glyph.glyphID') # glyph ids, in draw order

    def getData(self, path):
        """Return the integer argument list of the first entry whose name
        ends with *path*, or None when no entry matches."""
        for item in self.flatdoc:
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                if argres:
                    return [int(v) for v in argres]
                return argres
        return None

    def getDataTemp(self, path):
        """Like getData, but searches self.temp and pops the matched entry
        so repeated calls consume successive records."""
        for j in range(len(self.temp)):
            item = self.temp[j]
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                self.temp.pop(j)
                if argres:
                    return [int(v) for v in argres]
                return argres
        return None

    def getImages(self):
        """Return SVG <image> elements (one string per img record on the
        page), consuming a scratch copy of the flat document."""
        result = []
        # Work on a copy: getDataTemp pops entries, and the original code
        # aliased flatdoc here, destroying the parsed document as a side
        # effect of rendering the images.
        self.temp = self.flatdoc[:]
        while self.getDataTemp('img') != None:
            h = self.getDataTemp('img.h')[0]
            w = self.getDataTemp('img.w')[0]
            x = self.getDataTemp('img.x')[0]
            y = self.getDataTemp('img.y')[0]
            src = self.getDataTemp('img.src')[0]
            # NOTE(review): the format string was corrupted in this copy of
            # the file ("'\n' % (src, x, y, w, h)" raises TypeError); this
            # is the conventional Topaz-scripts <image> element referencing
            # the extracted img directory -- confirm against an upstream copy.
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result

    def getGlyphs(self, glyfname):
        """Return the lines of the shared glyph file *glyfname* whose
        id="gl<N>" marker matches a glyph id used on this page.

        Assumes the glyph file lists glyphs in ascending id order, as
        written by the glyph-generation pass.
        """
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
            glyphs = sorted(set(self.gid))   # distinct ids, ascending
            gfile = open(glyfname, 'r')
            j = 0
            while True:
                inp = gfile.readline()
                if inp == '':
                    break
                marker = 'id="gl%d"' % glyphs[j]
                if inp.find(marker) > 0:
                    result.append(inp)
                    j += 1
                    if j == len(glyphs):
                        break
            gfile.close()
        return result
196 |
197 |
198 |
199 |
def usage():
    """Print command-line help for gensvg.py to stdout."""
    # Function-call form: each print has a single argument, so the output
    # is identical under Python 2 while also running under Python 3.
    print('Usage: ')
    print(' ')
    print(' gensvg.py [options] unencryptedBookDir')
    print(' ')
    print(' -x : output browseable XHTML+SVG pages (default)')
    print(' -r : output raw SVG images')
208 |
209 | def main(argv):
210 | bookDir = ''
211 |
212 | if len(argv) == 0:
213 | argv = sys.argv
214 |
215 | try:
216 | opts, args = getopt.getopt(argv[1:], "xrh")
217 |
218 | except getopt.GetoptError, err:
219 | print str(err)
220 | usage()
221 | sys.exit(1)
222 |
223 | if len(opts) == 0 and len(args) == 0 :
224 | usage()
225 | sys.exit(1)
226 |
227 | raw = 0
228 | for o, a in opts:
229 | if o =="-h":
230 | usage()
231 | sys.exit(0)
232 | if o =="-x":
233 | raw = 0
234 | if o =="-r":
235 | raw = 1
236 |
237 | bookDir = args[0]
238 |
239 | if not os.path.exists(bookDir) :
240 | print "Can not find directory with unencrypted book"
241 | sys.exit(1)
242 |
243 | dictFile = os.path.join(bookDir,'dict0000.dat')
244 |
245 | if not os.path.exists(dictFile) :
246 | print "Can not find dict0000.dat file"
247 | sys.exit(1)
248 |
249 | pageDir = os.path.join(bookDir,'page')
250 | if not os.path.exists(pageDir) :
251 | print "Can not find page directory in unencrypted book"
252 | sys.exit(1)
253 |
254 | imgDir = os.path.join(bookDir,'img')
255 | if not os.path.exists(imgDir) :
256 | print "Can not find image directory in unencrypted book"
257 | sys.exit(1)
258 |
259 | glyphsDir = os.path.join(bookDir,'glyphs')
260 | if not os.path.exists(glyphsDir) :
261 | print "Can not find glyphs directory in unencrypted book"
262 | sys.exit(1)
263 |
264 | metaFile = os.path.join(bookDir,'metadata0000.dat')
265 | if not os.path.exists(metaFile) :
266 | print "Can not find metadata0000.dat in unencrypted book"
267 | sys.exit(1)
268 |
269 | svgDir = os.path.join(bookDir,'svg')
270 | if not os.path.exists(svgDir) :
271 | os.makedirs(svgDir)
272 |
273 |
274 | print 'Processing Meta Data ... '
275 |
276 | print ' ', 'metadata0000.dat'
277 | fname = os.path.join(bookDir,'metadata0000.dat')
278 | metadata = decode_meta.getMetaArray(fname)
279 |
280 | print 'Processing Glyphs ... '
281 |
282 | filenames = os.listdir(glyphsDir)
283 | filenames = sorted(filenames)
284 |
285 | glyfname = os.path.join(svgDir,'glyphs.svg')
286 | glyfile = open(glyfname, 'w')
287 | glyfile.write('\n')
288 | glyfile.write('\n')
289 | glyfile.write('\n')
311 | glyfile.close()
312 |
313 | print 'Processing Pages ... '
314 |
315 | # Books are at 1440 DPI. This is rendering at twice that size for
316 | # readability when rendering to the screen.
317 | scaledpi = 1440
318 | filenames = os.listdir(pageDir)
319 | filenames = sorted(filenames)
320 | counter = 0
321 | for filename in filenames:
322 | print ' ', filename
323 | fname = os.path.join(pageDir,filename)
324 | pargv=[]
325 | pargv.append('convert2xml.py')
326 | pargv.append('--flat-xml')
327 | pargv.append(dictFile)
328 | pargv.append(fname)
329 | flat_xml = convert2xml.main(pargv)
330 | pp = PParser(flat_xml)
331 | if (raw) :
332 | pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
333 | else :
334 | pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
335 |
336 | pfile.write('\n')
337 | if (raw):
338 | pfile.write('\n')
339 | pfile.write('