├── .gitignore
├── README.md
├── kindledecrypt.py
├── mobidedrm.py
├── process.py
├── screenshot.png
├── setup-macosx.py
├── setup-win32.py
└── topaz
├── __init__.py
├── cmbtc.py
├── convert2xml.py
├── decode_meta.py
├── flatxml2html.py
├── genhtml.py
├── gensvg.py
├── getpagedim.py
└── stylexml2css.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Kindle Book Decrypter
2 | =====================
3 | A simple GUI to remove the restrictions put in place to prevent copying books,
listening to some books, or transferring books to other devices, etc. All it
requires is the original book file and your Kindle serial number.
6 |
7 | * Backup your purchased Kindle books on your own media
8 | * Use other e-reading devices and/or software to read your purchased books
9 | * Re-enable audio playback of books that have disabled it
10 | * Allow a friend to borrow a book you are no longer reading (if this falls
11 | under fair use and is legal where you live)
12 | * Supports Mobipocket and Topaz book formats (azw, mobi, prc, azw1, tpz)
13 |
14 | 
15 |
16 | Pre-Built Binaries
17 | ------------------
18 | The following pre-built binaries are available and kept up to date with the
19 | latest changes:
20 |
21 | * [Microsoft Windows](http://programmer-art.org/dropbox/kindledecrypt-1.1-win32.exe)
22 | * [Mac OS X](http://programmer-art.org/dropbox/kindledecrypt-1.1-macosx.zip)
23 |
24 | Dependencies
25 | ------------
26 | The Kindle Book Decrypter depends on the following when not using the pre-built binaries:
27 |
28 | * Python
29 | * wxWidgets (and Python bindings)
30 |
31 | Usage
32 | -----
33 | You can use the application by running it in a terminal or double clicking it:
34 |
35 | cd kindledecrypt
36 | ./kindledecrypt.py
37 |
38 | License
39 | -------
40 | Original reverse engineering and Kindle PIN code is copyright the respective
41 | authors. The GUI is copyright 2010 Daniel G. Taylor and released under the MIT
42 | License. See the script itself for details.
43 |
44 |
--------------------------------------------------------------------------------
/kindledecrypt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """
4 | Kindle Book Decrypter
5 | =====================
6 | Simple GUI for MobiDeDRM code written with wxWidgets. This GUI takes a
7 | serial number and encrypted book file and outputs an unencrypted book
8 | that can be used to backup your data or legally remove audio and other
9 | restrictions by allowing you to convert to other formats.
10 |
11 | License
12 | -------
13 | Copyright (C) 2010 Daniel G. Taylor
14 |
15 | Permission is hereby granted, free of charge, to any person obtaining a copy
16 | of this software and associated documentation files (the "Software"), to deal
17 | in the Software without restriction, including without limitation the rights
18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19 | copies of the Software, and to permit persons to whom the Software is
20 | furnished to do so, subject to the following conditions:
21 |
22 | The above copyright notice and this permission notice shall be included in
23 | all copies or substantial portions of the Software.
24 |
25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
31 | THE SOFTWARE.
32 | """
33 |
34 | __author__ = "Daniel G. Taylor"
35 | __version__ = 1.1
36 |
37 | import ConfigParser
38 | import optparse
39 | import os
40 | import sys
41 | import wx
42 |
43 | import mobidedrm
44 | import process
45 | import topaz
46 |
47 | CONFIG = os.path.expanduser("~/.mobidedrmwx.cfg")
48 |
class MobiDeDrmApp(wx.App):
    """
    The main application holding all windows, controls, etc.

    Presents a serial-number field and a book file picker; clicking
    "Decrypt" runs the decryption in a background process (see
    process.decrypt) while pulsing a progress dialog.
    """
    def __init__(self, redir=False):
        super(MobiDeDrmApp, self).__init__(redir)

        # Load the saved configuration (remembers the last-used serial).
        self.config = ConfigParser.SafeConfigParser()
        if os.path.exists(CONFIG):
            self.config.read(CONFIG)

        if not self.config.has_section("General"):
            self.config.add_section("General")

        if self.config.has_option("General", "Serial"):
            default_serial = self.config.get("General", "Serial")
        else:
            # This is just a random example serial
            default_serial = "B002A1C457493453"

        self.frame = wx.Frame(None, wx.ID_ANY, "Kindle Book Decrypter", size=(400, 130))

        self.panel = wx.Panel(self.frame)
        self.vbox = wx.BoxSizer(wx.VERTICAL)

        self.grid = wx.GridBagSizer(3, 3)
        self.serial_label = wx.StaticText(self.panel, label="Serial:")
        self.serial = wx.TextCtrl(self.panel, value=default_serial)
        self.serial_help = wx.StaticText(self.panel, label="Kindle or Kindle for iPhone serial number")
        font = self.serial_help.GetFont()
        font.SetPointSize(8)
        self.serial_help.SetFont(font)
        self.input_label = wx.StaticText(self.panel, label="Book:")
        # *.tpz added: Topaz books are handled by on_process and the
        # README advertises tpz support, but the filter omitted them.
        self.input = wx.FilePickerCtrl(self.panel, wildcard="Kindle Books|*.azw;*.mobi;*.prc;*.azw1;*.tpz|All Files|*.*")
        self.button = wx.Button(self.panel, label="Decrypt")

        self.grid.Add(self.serial_label, (0, 0), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALL)
        self.grid.Add(self.serial, (0, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND)
        self.grid.Add(self.serial_help, (1, 1))
        self.grid.Add(self.input_label, (2, 0), flag=wx.ALIGN_CENTER_VERTICAL)
        self.grid.Add(self.input, (2, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.EXPAND)
        self.grid.Add(self.button, (3, 1), flag=wx.ALIGN_CENTER_VERTICAL | wx.ALIGN_RIGHT)

        self.grid.AddGrowableCol(1, 1)

        self.vbox.Add(self.grid, 1, wx.ALL | wx.EXPAND, border=5)

        self.panel.SetSizer(self.vbox)
        self.vbox.Fit(self.frame)

        self.frame.Bind(wx.EVT_BUTTON, self.on_process, self.button)
        self.frame.Bind(wx.EVT_TEXT, self.on_serial_changed, self.serial)

        self.frame.Centre()
        self.frame.Show(True)

    def on_serial_changed(self, event):
        """
        The serial number has changed. If it is the correct number of
        characters then enable the decrypt button, otherwise disable it
        until a valid serial is entered.
        """
        serial = self.serial.GetValue()
        # 16 characters = Kindle device serial, 40 = Kindle for iPhone.
        if len(serial) in [16, 40]:
            self.button.Enable()
            self.config.set("General", "Serial", serial)
            # Close the config file promptly; the old code leaked the
            # open handle returned by open(CONFIG, "w").
            config_file = open(CONFIG, "w")
            try:
                self.config.write(config_file)
            finally:
                config_file.close()
        else:
            self.button.Disable()

    def on_process(self, event):
        """
        The decrypt button was clicked, so start the decrypting process.
        This shows a pulsing progress dialog while the book is decrypted,
        displaying a dialog for any errors that are encountered.
        """
        infile = self.input.GetPath()

        if not os.path.exists(infile):
            error_dialog = wx.MessageDialog(self.panel, "Error: Input file doesn't exist!", "Error processing file!", wx.OK | wx.ICON_ERROR)
            error_dialog.ShowModal()
            error_dialog.Destroy()
            return

        # Which type of book is this? If the Topaz header parses, the
        # output is a directory (no extension); otherwise assume Mobi.
        # NOTE(review): the file handle assigned to topaz.cmbtc.bookFile
        # is never explicitly closed here — confirm whether the topaz
        # module relies on it staying open.
        ext = ""
        try:
            topaz.cmbtc.bookFile = topaz.cmbtc.openBook(infile)
            topaz.cmbtc.parseTopazHeader()
        except topaz.cmbtc.CMBDTCFatal:
            ext = ".mobi"

        outfile = os.path.splitext(infile)[0] + "-decrypted" + ext
        pid = mobidedrm.getPid(self.serial.GetValue())
        dialog = wx.ProgressDialog("Progress", "Decrypting...")
        dialog.Pulse()
        dialog.Show()
        for error in process.decrypt(infile, outfile, pid):
            dialog.Pulse()
            wx.Yield()

        if error:
            error_dialog = wx.MessageDialog(self.panel, "Error: %s" % error, "Error processing file!", wx.OK | wx.ICON_ERROR)
            error_dialog.ShowModal()
            error_dialog.Destroy()

        dialog.Destroy()
156 |
157 | if __name__ == "__main__":
158 | parser = optparse.OptionParser("%prog [options]", version="Kindle Book Decrypter %s" % __version__)
159 |
160 | options, args = parser.parse_args()
161 |
162 | app = MobiDeDrmApp()
163 | app.MainLoop()
164 |
165 |
--------------------------------------------------------------------------------
/mobidedrm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # This is a python script. You need a Python interpreter to run it.
4 | # For example, ActiveState Python, which exists for windows.
5 | #
6 | # It can run standalone to convert files, or it can be installed as a
7 | # plugin for Calibre (http://calibre-ebook.com/about) so that
8 | # importing files with DRM 'Just Works'.
9 | #
10 | # To create a Calibre plugin, rename this file so that the filename
11 | # ends in '_plugin.py', put it into a ZIP file and import that Calibre
12 | # using its plugin configuration GUI.
13 | #
14 | # Changelog
15 | # 0.01 - Initial version
16 | # 0.02 - Huffdic compressed books were not properly decrypted
17 | # 0.03 - Wasn't checking MOBI header length
18 | # 0.04 - Wasn't sanity checking size of data record
19 | # 0.05 - It seems that the extra data flags take two bytes not four
20 | # 0.06 - And that low bit does mean something after all :-)
21 | # 0.07 - The extra data flags aren't present in MOBI header < 0xE8 in size
22 | # 0.08 - ...and also not in Mobi header version < 6
23 | # 0.09 - ...but they are there with Mobi header version 6, header size 0xE4!
24 | # 0.10 - Outputs unencrypted files as-is, so that when run as a Calibre
25 | # import filter it works when importing unencrypted files.
26 | # Also now handles encrypted files that don't need a specific PID.
27 | # 0.11 - use autoflushed stdout and proper return values
28 | # 0.12 - Fix for problems with metadata import as Calibre plugin, report errors
29 | # 0.13 - Formatting fixes: retabbed file, removed trailing whitespace
30 | # and extra blank lines, converted CR/LF pairs at ends of each line,
31 | # and other cosmetic fixes.
32 | # 0.14 - Working out when the extra data flags are present has been problematic
33 | # Versions 7 through 9 have tried to tweak the conditions, but have been
34 | # only partially successful. Closer examination of lots of sample
#        files reveals that a confusion has arisen because trailing data entries
36 | # are not encrypted, but it turns out that the multibyte entries
37 | # in utf8 file are encrypted. (Although neither kind gets compressed.)
38 | # This knowledge leads to a simplification of the test for the
39 | # trailing data byte flags - version 5 and higher AND header
40 | # size >= 0xE4.
# 0.15 - Now outputs 'heartbeat', and is also quicker for long files.
42 | # 0.16 - And reverts to 'done' not 'done.' at the end for unswindle
43 | # compatibility.
44 | # 0.17 - Added ability to extract PID given a Kindle serial number, added
45 | # OptionParser interface to argument processing, allow import as a
46 | # library without assuming Calibre is importing it
47 |
48 | __version__ = '0.17'
49 |
50 | import sys
51 | import struct
52 | import binascii
53 |
54 | from optparse import OptionParser
55 |
56 | letters = "ABCDEFGHIJKLMNPQRSTUVWXYZ123456789"
57 |
class Unbuffered:
    """Stream wrapper that flushes the underlying stream on every write."""

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        # Write-through, then push the bytes out immediately.
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Everything else is delegated to the wrapped stream.
        return getattr(self.stream, attr)
66 |
class DrmException(Exception):
    """Raised when DRM removal fails (bad PID, bad format, missing key)."""
    pass
69 |
# Implementation of Pukall Cipher 1
def PC1(key, src, decryption=True):
    """Pukall Cipher 1 stream cipher over `src` with 16-byte `key`.

    The same routine serves both directions: with decryption=True the
    key schedule is updated from the output byte, otherwise from the
    input byte.  Returns the transformed string, or None (after
    printing a message) if the key length is wrong.
    """
    sum1 = 0;
    sum2 = 0;
    keyXorVal = 0;
    if len(key)!=16:
        print "Bad key length!"
        return None
    # Pack the key into eight big-endian 16-bit words.
    wkey = []
    for i in xrange(8):
        wkey.append(ord(key[i*2])<<8 | ord(key[i*2+1]))

    dst = ""
    for i in xrange(len(src)):
        temp1 = 0;
        byteXorVal = 0;
        # Derive this byte's keystream value from the key words.
        for j in xrange(8):
            temp1 ^= wkey[j]
            sum2 = (sum2+j)*20021 + sum1
            sum1 = (temp1*346)&0xFFFF
            sum2 = (sum2+sum1)&0xFFFF
            temp1 = (temp1*20021+1)&0xFFFF
            byteXorVal ^= temp1 ^ sum2
        curByte = ord(src[i])
        if not decryption:
            keyXorVal = curByte * 257;
        curByte = ((curByte ^ (byteXorVal >> 8)) ^ byteXorVal) & 0xFF
        if decryption:
            keyXorVal = curByte * 257;
        # Mix the plaintext byte back into the key schedule.
        for j in xrange(8):
            wkey[j] ^= keyXorVal;
        dst+=chr(curByte)
    return dst
103 |
def checksumPid(s):
    """Return PID string `s` with its two checksum characters appended."""
    crc = (~binascii.crc32(s,-1))&0xFFFFFFFF
    crc = crc ^ (crc >> 16)
    res = s
    l = len(letters)
    # Each checksum character is derived from one byte of the CRC.
    for i in (0,1):
        b = crc & 0xff
        pos = (b // l) ^ (b % l)
        res += letters[pos%l]
        crc >>= 8
    return res
115 |
def pidFromSerial(s, l):
    """Derive an `l`-character PID fragment from serial number string `s`."""
    crc = (~binascii.crc32(s,-1))&0xFFFFFFFF

    # Fold the serial's bytes into an l-byte accumulator.
    arr1 = [0]*l
    for i in xrange(len(s)):
        arr1[i%l] ^= ord(s[i])

    # XOR in the CRC, cycling through its four bytes.
    crc_bytes = [crc >> 24 & 0xff, crc >> 16 & 0xff, crc >> 8 & 0xff, crc & 0xff]
    for i in xrange(l):
        arr1[i] ^= crc_bytes[i&3]

    # Map each byte into the restricted PID alphabet.
    pid = ""
    for i in xrange(l):
        b = arr1[i] & 0xff
        pid+=letters[(b >> 7) + ((b >> 5 & 3) ^ (b & 0x1f))]

    return pid
133 |
def getPid(serial):
    """Derive the Mobipocket PID from a Kindle serial number.

    16-character serials are treated as Kindle devices, 40-character
    serials as Kindle for iPhone.  Anything else yields "".
    """
    length = len(serial)
    if length == 16:
        # Kindle device: 7-character fragment plus the '*' marker.
        return checksumPid(pidFromSerial(serial, 7) + "*")
    if length == 40:
        # Kindle for iPhone: 8-character fragment, no marker.
        return checksumPid(pidFromSerial(serial, 8))
    return ""
142 |
def getSizeOfTrailingDataEntries(ptr, size, flags):
    """Return the total size of the trailing data entries of a record.

    `ptr` is the record data, `size` its length and `flags` the MOBI
    extra-data-flags word.  Bit 0 (multibyte data) is deliberately
    ignored — see the comment at the bottom.
    """
    # Decode one backward 7-bit variable-length size field ending at
    # offset `size` in `ptr`.
    def getSizeOfTrailingDataEntry(ptr, size):
        bitpos, result = 0, 0
        if size <= 0:
            return result
        while True:
            v = ord(ptr[size-1])
            result |= (v & 0x7F) << bitpos
            bitpos += 7
            size -= 1
            # Stop on the continuation bit, after 4 bytes, or at offset 0.
            if (v & 0x80) != 0 or (bitpos >= 28) or (size == 0):
                return result
    num = 0
    testflags = flags >> 1
    while testflags:
        if testflags & 1:
            num += getSizeOfTrailingDataEntry(ptr, size - num)
        testflags >>= 1
    # Multibyte data, if present, is included in the encryption, so
    # we do not need to check the low bit.
    # if flags & 1:
    #     num += (ord(ptr[size - num - 1]) & 0x3) + 1
    return num
166 |
class DrmStripper:
    """Parses a Mobipocket (BOOKMOBI) file and strips its DRM in memory.

    Construct with the raw file contents and a PID (including its
    2-character checksum); call getResult() for the decrypted data.
    """

    def loadSection(self, section):
        """Return the raw bytes of PDB section `section`."""
        if (section + 1 == self.num_sections):
            endoff = len(self.data_file)
        else:
            endoff = self.sections[section + 1][0]
        off = self.sections[section][0]
        return self.data_file[off:endoff]

    def patch(self, off, new):
        """Overwrite len(new) bytes of the file data at absolute offset `off`."""
        self.data_file = self.data_file[:off] + new + self.data_file[off+len(new):]

    def patchSection(self, section, new, in_off = 0):
        """Overwrite bytes at offset `in_off` within section `section`."""
        if (section + 1 == self.num_sections):
            endoff = len(self.data_file)
        else:
            endoff = self.sections[section + 1][0]
        off = self.sections[section][0]
        # The patch must not spill into the following section.
        assert off + in_off + len(new) <= endoff
        self.patch(off + in_off, new)

    def parseDRM(self, data, count, pid):
        """Search the `count` DRM records in `data` for the book key.

        Tries the supplied PID first, then the default key that needs
        no PID.  Returns the 16-byte key, or None if nothing verifies.
        """
        pid = pid.ljust(16,'\0')
        keyvec1 = "\x72\x38\x33\xB0\xB4\xF2\xE3\xCA\xDF\x09\x01\xD6\xE2\xE0\x3F\x96"
        temp_key = PC1(keyvec1, pid, False)
        temp_key_sum = sum(map(ord,temp_key)) & 0xff
        found_key = None
        for i in xrange(count):
            verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
            cookie = PC1(temp_key, cookie)
            ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
            if verification == ver and cksum == temp_key_sum and (flags & 0x1F) == 1:
                found_key = finalkey
                break
        if not found_key:
            # Then try the default encoding that doesn't require a PID
            temp_key = keyvec1
            temp_key_sum = sum(map(ord,temp_key)) & 0xff
            for i in xrange(count):
                verification, size, type, cksum, cookie = struct.unpack('>LLLBxxx32s', data[i*0x30:i*0x30+0x30])
                cookie = PC1(temp_key, cookie)
                ver,flags,finalkey,expiry,expiry2 = struct.unpack('>LL16sLL', cookie)
                if verification == ver and cksum == temp_key_sum:
                    found_key = finalkey
                    break
        return found_key

    def __init__(self, data_file, pid):
        # Validate and strip the 2-character PID checksum.
        if checksumPid(pid[0:-2]) != pid:
            raise DrmException("invalid PID checksum")
        pid = pid[0:-2]

        self.data_file = data_file
        header = data_file[0:72]
        if header[0x3C:0x3C+8] != 'BOOKMOBI':
            raise DrmException("invalid file format")
        self.num_sections, = struct.unpack('>H', data_file[76:78])

        # Read the PDB section table: (offset, flags, value) triples.
        self.sections = []
        for i in xrange(self.num_sections):
            offset, a1,a2,a3,a4 = struct.unpack('>LBBBB', data_file[78+i*8:78+i*8+8])
            flags, val = a1, a2<<16|a3<<8|a4
            self.sections.append( (offset, flags, val) )

        sect = self.loadSection(0)
        records, = struct.unpack('>H', sect[0x8:0x8+2])
        mobi_length, = struct.unpack('>L',sect[0x14:0x18])
        mobi_version, = struct.unpack('>L',sect[0x68:0x6C])
        extra_data_flags = 0
        print "MOBI header version = %d, length = %d" %(mobi_version, mobi_length)
        # Trailing-data flags only exist for header version >= 5 with
        # header size >= 0xE4 (see changelog entry 0.14).
        if (mobi_length >= 0xE4) and (mobi_version >= 5):
            extra_data_flags, = struct.unpack('>H', sect[0xF2:0xF4])
            print "Extra Data Flags = %d" %extra_data_flags

        crypto_type, = struct.unpack('>H', sect[0xC:0xC+2])
        if crypto_type == 0:
            print "This book is not encrypted."
        else:
            if crypto_type == 1:
                raise DrmException("cannot decode Mobipocket encryption type 1")
            if crypto_type != 2:
                raise DrmException("unknown encryption type: %d" % crypto_type)

            # calculate the keys
            drm_ptr, drm_count, drm_size, drm_flags = struct.unpack('>LLLL', sect[0xA8:0xA8+16])
            if drm_count == 0:
                raise DrmException("no PIDs found in this file")
            found_key = self.parseDRM(sect[drm_ptr:drm_ptr+drm_size], drm_count, pid)
            if not found_key:
                raise DrmException("no key found. maybe the PID is incorrect")

            # kill the drm keys
            self.patchSection(0, "\0" * drm_size, drm_ptr)
            # kill the drm pointers
            self.patchSection(0, "\xff" * 4 + "\0" * 12, 0xA8)
            # clear the crypto type
            self.patchSection(0, "\0" * 2, 0xC)

            # decrypt sections
            print "Decrypting. Please wait . . .",
            new_data = self.data_file[:self.sections[1][0]]
            for i in xrange(1, records+1):
                data = self.loadSection(i)
                # Trailing entries are stored unencrypted; exclude them
                # from the PC1 pass and copy them through verbatim.
                extra_size = getSizeOfTrailingDataEntries(data, len(data), extra_data_flags)
                if i%100 == 0:
                    print ".",
                # print "record %d, extra_size %d" %(i,extra_size)
                new_data += PC1(found_key, data[0:len(data) - extra_size])
                if extra_size > 0:
                    new_data += data[-extra_size:]
                #self.patchSection(i, PC1(found_key, data[0:len(data) - extra_size]))
            if self.num_sections > records+1:
                new_data += self.data_file[self.sections[records+1][0]:]
            self.data_file = new_data
            print "done"

    def getResult(self):
        """Return the (now decrypted) file contents as a string."""
        return self.data_file
285 |
286 | if __name__ == "__main__":
287 | sys.stdout=Unbuffered(sys.stdout)
288 | print ('MobiDeDrm v%(__version__)s. '
289 | 'Copyright 2008-2010 The Dark Reverser.' % globals())
290 |
291 | parser = OptionParser("Usage: %prog [options] input.azw output.mobi PID", version=__version__)
292 | parser.add_option("-s", "--serial", dest="serial", default="", help="Get the PID from a Kindle or Kindle for iPhone serial number")
293 |
294 | options, args = parser.parse_args()
295 |
296 | if options.serial:
297 | print "Mobipocket PID: " + getPid(options.serial)
298 | sys.exit(0)
299 |
300 | if len(args) < 4:
301 | print "Removes protection from Mobipocket books"
302 | parser.print_help()
303 | sys.exit(1)
304 | else:
305 | infile = args[1]
306 | outfile = args[2]
307 | pid = args[3]
308 | data_file = file(infile, 'rb').read()
309 | try:
310 | strippedFile = DrmStripper(data_file, pid)
311 | file(outfile, 'wb').write(strippedFile.getResult())
312 | except DrmException, e:
313 | print "Error: %s" % e
314 | sys.exit(1)
315 | sys.exit(0)
316 | elif "calibre" in globals():
317 | from calibre.customize import FileTypePlugin
318 |
319 | class MobiDeDRM(FileTypePlugin):
320 | name = 'MobiDeDRM' # Name of the plugin
321 | description = 'Removes DRM from secure Mobi files'
322 | supported_platforms = ['linux', 'osx', 'windows'] # Platforms this plugin will run on
323 | author = 'The Dark Reverser' # The author of this plugin
324 | version = (0, 1, 6) # The version number of this plugin
325 | file_types = set(['prc','mobi','azw']) # The file types that this plugin will be applied to
326 | on_import = True # Run this plugin during the import
327 |
328 | def run(self, path_to_ebook):
329 | from calibre.gui2 import is_ok_to_use_qt
330 | from PyQt4.Qt import QMessageBox
331 | PID = self.site_customization
332 | data_file = file(path_to_ebook, 'rb').read()
333 | ar = PID.split(',')
334 | for i in ar:
335 | try:
336 | unlocked_file = DrmStripper(data_file, i).getResult()
337 | except DrmException:
338 | # ignore the error
339 | pass
340 | else:
341 | of = self.temporary_file('.mobi')
342 | of.write(unlocked_file)
343 | of.close()
344 | return of.name
345 | if is_ok_to_use_qt():
346 | d = QMessageBox(QMessageBox.Warning, "MobiDeDRM Plugin", "Couldn't decode: %s\n\nImporting encrypted version." % path_to_ebook)
347 | d.show()
348 | d.raise_()
349 | d.exec_()
350 | return path_to_ebook
351 |
352 | def customization_help(self, gui=False):
353 | return 'Enter PID (separate multiple PIDs with comma)'
354 |
355 |
--------------------------------------------------------------------------------
/process.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | """
4 | Utilities for decrypting a book in a separate process. This gets its own
5 | module as the multiprocessing module duplicates the global namespace when
6 | spawning new processes. This separate module limits the amount of stuff
7 | that gets duplicated and prevents serialization errors on certain platforms
8 | with e.g. wxWidgets.
9 | """
10 |
11 | import mobidedrm
12 | import multiprocessing
13 | import os
14 | import shutil
15 | import tempfile
16 | import time
17 | import topaz
18 |
19 | multiprocessing.freeze_support()
20 |
21 | def _process(infile, outfile, pid, error):
22 | try:
23 | if outfile.endswith(".mobi"):
24 | # Mobi file
25 | data_file = open(infile, "rb").read()
26 | strippedFile = mobidedrm.DrmStripper(data_file, pid)
27 | file(outfile, 'wb').write(strippedFile.getResult())
28 | else:
29 | # Topaz file
30 | tmp = tempfile.mkdtemp()
31 | args = ['./cmbtc.py', '-v', '-p', pid[:8], '-d', '-o', tmp, infile]
32 | topaz.cmbtc.main(argv=args)
33 | topaz.gensvg.main(['./gensvg.py', tmp])
34 | topaz.genhtml.main(['./genhtml.py', tmp])
35 |
36 | if not os.path.exists(outfile):
37 | os.mkdir(outfile)
38 |
39 | for filename in ["img", "style.css", "book.html"]:
40 | shutil.move(os.path.join(tmp, filename), os.path.join(outfile, filename))
41 |
42 | shutil.rmtree(tmp)
43 | except Exception, e:
44 | error.value = str(e)
45 |
def decrypt(infile, outfile, pid):
    """
    Run the decryption in a separate process, periodically yielding ""
    so a GUI can pulse a progress indicator.  The final yielded value
    is the child's error message string, or None on success.

    >>> for error in decrypt(infile, outfile, pid):
    >>>     progress_update()
    >>>     if error:
    >>>         print error

    """
    # Shared 512-byte buffer the child fills in on failure.
    shared_error = multiprocessing.Array("c", 512)
    worker = multiprocessing.Process(target=_process,
                                     args=(infile, outfile, pid, shared_error))
    worker.start()

    # Poll the child, yielding so the caller can update its UI.
    while worker.is_alive():
        yield ""
        time.sleep(0.1)
    worker.join()

    yield shared_error.value if shared_error.value else None
71 |
72 |
--------------------------------------------------------------------------------
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lrobot/kindledecrypt/46052aeb076081ab1027de0f7dc2033d4a6304d9/screenshot.png
--------------------------------------------------------------------------------
/setup-macosx.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a setup.py script generated by py2applet
3 |
4 | Usage:
5 | python setup.py py2app
6 | """
7 |
8 | from setuptools import setup
9 |
10 | APP = ['kindledecrypt.py']
11 | DATA_FILES = []
12 | OPTIONS = {'argv_emulation': True}
13 |
14 | setup(
15 | app=APP,
16 | data_files=DATA_FILES,
17 | options={'py2app': OPTIONS},
18 | setup_requires=['py2app'],
19 | )
20 |
--------------------------------------------------------------------------------
/setup-win32.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

from distutils.core import setup
import py2exe, sys, os

# Behave as if "python setup-win32.py py2exe" was invoked.
sys.argv.append('py2exe')

setup(
    # bundle_files=1: pack everything into a single executable.
    options = {'py2exe': {'bundle_files':1}},
    # A GUI (no console) application built from the main script.
    windows = [{'script': 'kindledecrypt.py'}],
    zipfile = None,
)
13 |
--------------------------------------------------------------------------------
/topaz/__init__.py:
--------------------------------------------------------------------------------
1 | import cmbtc
2 | import gensvg
3 | import genhtml
4 |
--------------------------------------------------------------------------------
/topaz/cmbtc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # For use with Topaz Scripts Version 2.6
3 |
class Unbuffered:
    """File-object proxy that flushes after each write."""

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        # Push every write straight through to the device.
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        # Undefined attributes fall back to the wrapped stream.
        return getattr(self.stream, attr)
12 |
13 | import sys
14 | sys.stdout=Unbuffered(sys.stdout)
15 |
16 | import csv
17 | import os
18 | import getopt
19 | import zlib
20 | from struct import pack
21 | from struct import unpack
22 |
# Maximum file path length used by the script.
MAX_PATH = 255

# Put the first 8 characters of your Kindle PID here
# or supply it with the -p option in the command line
####################################################
kindlePID = "12345678"
####################################################

# NOTE(review): `global` at module level is a no-op in Python; these
# statements only document the module-level state that the functions
# below share (each function re-declares the names it assigns).
global bookFile
global bookPayloadOffset
global bookHeaderRecords
global bookMetadata
global bookKey
global command
38 | #
39 | # Exceptions for all the problems that might happen during the script
40 | #
41 |
class CMBDTCError(Exception):
    """Error while processing a Topaz book (non-fatal variant)."""
    pass

class CMBDTCFatal(Exception):
    """Fatal error; processing of the Topaz book cannot continue."""
    pass
47 |
48 | #
49 | # Open the book file at path
50 | #
51 |
def openBook(path):
    """Open the Topaz book file at `path` for binary reading.

    Raises CMBDTCFatal if the file cannot be opened.
    """
    try:
        return open(path,'rb')
    except (IOError, OSError):
        # Narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        raise CMBDTCFatal("Could not open book file: " + path)
57 |
58 | #
59 | # Get a 7 bit encoded number from the book file
60 | #
61 |
def bookReadEncodedNumber():
    """Read a 7-bit variable-length encoded number from bookFile.

    A leading 0xFF byte marks a negative value; in multi-byte values
    every byte except the last has its high bit set.
    """
    flag = False
    data = ord(bookFile.read(1))

    if data == 0xFF:
        flag = True
        data = ord(bookFile.read(1))

    # Accumulate 7 bits per byte while the continuation bit is set.
    if data >= 0x80:
        datax = (data & 0x7F)
        while data >= 0x80 :
            data = ord(bookFile.read(1))
            datax = (datax <<7) + (data & 0x7F)
        data = datax

    if flag:
        data = -data
    return data
80 |
81 | #
82 | # Encode a number in 7 bit format
83 | #
84 |
def encodeNumber(number):
    """Encode `number` in the 7-bit variable-length Topaz format.

    Negative values are stored as -(number) + 1 behind a 0xFF prefix
    byte; a 0x80 pad byte is emitted when the leading chunk would be
    0xFF and could be mistaken for the negative marker.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    # Emit 7-bit chunks least-significant first; all but the first
    # carry the 0x80 continuation bit.  Reversed at the end.
    chunks = []
    continuation = 0
    while True:
        piece = (number & 0x7F) | continuation
        number >>= 7
        chunks.append(chr(piece))
        continuation = 0x80
        if number == 0:
            if piece == 0xFF and not negative:
                chunks.append(chr(0x80))
            break

    if negative:
        chunks.append(chr(0xFF))

    return "".join(reversed(chunks))
109 |
110 | #
111 | # Get a length prefixed string from the file
112 | #
113 |
def bookReadString():
    """Read a length-prefixed string from the current bookFile position."""
    stringLength = bookReadEncodedNumber()
    return unpack(str(stringLength)+"s",bookFile.read(stringLength))[0]
117 |
118 | #
119 | # Returns a length prefixed string
120 | #
121 |
def lengthPrefixString(data):
    """Return `data` prefixed with its 7-bit encoded length."""
    return encodeNumber(len(data))+data
124 |
125 |
126 | #
127 | # Read and return the data of one header record at the current book file position [[offset,decompressedLength,compressedLength],...]
128 | #
129 |
def bookReadHeaderRecordData():
    """Read one header record's data at the current bookFile position.

    Returns [[offset, decompressedLength, compressedLength], ...],
    one triple per value in the record.
    """
    nbValues = bookReadEncodedNumber()
    values = []
    for i in range (0,nbValues):
        values.append([bookReadEncodedNumber(),bookReadEncodedNumber(),bookReadEncodedNumber()])
    return values
136 |
137 | #
138 | # Read and parse one header record at the current book file position and return the associated data [[offset,decompressedLength,compressedLength],...]
139 | #
140 |
def parseTopazHeaderRecord():
    """Parse one header record at the current position; return [tag, data]."""
    # 0x63 marks the start of a header record.
    if ord(bookFile.read(1)) != 0x63:
        raise CMBDTCFatal("Parse Error : Invalid Header")

    tag = bookReadString()
    record = bookReadHeaderRecordData()
    return [tag,record]
148 |
149 | #
150 | # Parse the header of a Topaz file, get all the header records and the offset for the payload
151 | #
152 |
def parseTopazHeader():
    """Parse the Topaz file header.

    Fills bookHeaderRecords (tag -> record data) and sets
    bookPayloadOffset to the file offset where the payload starts.
    """
    global bookHeaderRecords
    global bookPayloadOffset
    magic = unpack("4s",bookFile.read(4))[0]

    if magic != 'TPZ0':
        raise CMBDTCFatal("Parse Error : Invalid Header, not a Topaz file")

    nbRecords = bookReadEncodedNumber()
    bookHeaderRecords = {}

    for i in range (0,nbRecords):
        result = parseTopazHeaderRecord()
        #print result[0], result[1]
        bookHeaderRecords[result[0]] = result[1]

    # 0x64 terminates the header block.
    if ord(bookFile.read(1)) != 0x64 :
        raise CMBDTCFatal("Parse Error : Invalid Header")

    bookPayloadOffset = bookFile.tell()
173 |
174 | #
175 | # Get a record in the book payload, given its name and index. If necessary the record is decrypted. The record is not decompressed
176 | # Correction, the record is correctly decompressed too
177 | #
178 |
def getBookPayloadRecord(name, index):
    """Return payload record `name`[`index`], decrypted and decompressed.

    Offsets come from bookHeaderRecords; a negative stored index marks
    an encrypted record, a non-zero compressed length a zlib record.
    """
    encrypted = False
    compressed = False

    try:
        recordOffset = bookHeaderRecords[name][index][0]
    except:
        raise CMBDTCFatal("Parse Error : Invalid Record, record not found")

    bookFile.seek(bookPayloadOffset + recordOffset)

    tag = bookReadString()
    if tag != name :
        raise CMBDTCFatal("Parse Error : Invalid Record, record name doesn't match")

    recordIndex = bookReadEncodedNumber()

    # A negative index means the record payload is encrypted.
    if recordIndex < 0 :
        encrypted = True
        recordIndex = -recordIndex -1

    if recordIndex != index :
        raise CMBDTCFatal("Parse Error : Invalid Record, index doesn't match")

    # Read compressedLength bytes if compressed, else decompressedLength.
    if (bookHeaderRecords[name][index][2] > 0):
        compressed = True
        record = bookFile.read(bookHeaderRecords[name][index][2])
    else:
        record = bookFile.read(bookHeaderRecords[name][index][1])

    if encrypted:
        ctx = topazCryptoInit(bookKey)
        record = topazCryptoDecrypt(record,ctx)

    if compressed:
        record = zlib.decompress(record)

    return record
217 |
218 | #
219 | # Extract, decrypt and decompress a book record indicated by name and index and print it or save it in "filename"
220 | #
221 |
def extractBookPayloadRecord(name, index, filename):
    """Extract, decrypt and decompress one book record.

    Writes the record to `filename`, or prints it when `filename` is
    the empty string.  Raises CMBDTCFatal if the output file cannot
    be written.
    """
    try:
        record = getBookPayloadRecord(name,index)
    except Exception:
        # The old code fell through here and then referenced the
        # undefined `record`, raising a NameError; return instead.
        print("Could not find record")
        return

    if filename != "":
        try:
            # Don't shadow the `file` builtin; close the handle even
            # if write() fails.
            out = open(filename,"wb")
            try:
                out.write(record)
            finally:
                out.close()
        except (IOError, OSError):
            raise CMBDTCFatal("Could not write to destination file")
    else:
        print(record)
246 |
247 | #
248 | # return next record [key,value] from the book metadata from the current book position
249 | #
250 |
251 | def readMetadataRecord():
252 | return [bookReadString(),bookReadString()]
253 |
254 | #
255 | # Parse the metadata record from the book payload and return a list of [key,values]
256 | #
257 |
258 | def parseMetadata():
259 | global bookHeaderRecords
260 | global bookPayloadAddress
261 | global bookMetadata
262 | bookMetadata = {}
263 | bookFile.seek(bookPayloadOffset + bookHeaderRecords["metadata"][0][0])
264 | tag = bookReadString()
265 | if tag != "metadata" :
266 | raise CMBDTCFatal("Parse Error : Record Names Don't Match")
267 |
268 | flags = ord(bookFile.read(1))
269 | nbRecords = ord(bookFile.read(1))
270 |
271 | for i in range (0,nbRecords) :
272 | record =readMetadataRecord()
273 | bookMetadata[record[0]] = record[1]
274 |
275 | #
276 | # Context initialisation for the Topaz Crypto
277 | #
278 |
279 | def topazCryptoInit(key):
280 | ctx1 = 0x0CAFFE19E
281 |
282 | for keyChar in key:
283 | keyByte = ord(keyChar)
284 | ctx2 = ctx1
285 | ctx1 = ((((ctx1 >>2) * (ctx1 >>7))&0xFFFFFFFF) ^ (keyByte * keyByte * 0x0F902007)& 0xFFFFFFFF )
286 | return [ctx1,ctx2]
287 |
288 | #
289 | # decrypt data with the context prepared by topazCryptoInit()
290 | #
291 |
292 | def topazCryptoDecrypt(data, ctx):
293 | ctx1 = ctx[0]
294 | ctx2 = ctx[1]
295 |
296 | plainText = ""
297 |
298 | for dataChar in data:
299 | dataByte = ord(dataChar)
300 | m = (dataByte ^ ((ctx1 >> 3) &0xFF) ^ ((ctx2<<3) & 0xFF)) &0xFF
301 | ctx2 = ctx1
302 | ctx1 = (((ctx1 >> 2) * (ctx1 >> 7)) &0xFFFFFFFF) ^((m * m * 0x0F902007) &0xFFFFFFFF)
303 | plainText += chr(m)
304 |
305 | return plainText
306 |
307 | #
308 | # Decrypt a payload record with the PID
309 | #
310 |
311 | def decryptRecord(data,PID):
312 | ctx = topazCryptoInit(PID)
313 | return topazCryptoDecrypt(data, ctx)
314 |
315 | #
316 | # Try to decrypt a dkey record (contains the book PID)
317 | #
318 |
319 | def decryptDkeyRecord(data,PID):
320 | record = decryptRecord(data,PID)
321 | fields = unpack("3sB8sB8s3s",record)
322 |
323 | if fields[0] != "PID" or fields[5] != "pid" :
324 | raise CMBDTCError("Didn't find PID magic numbers in record")
325 | elif fields[1] != 8 or fields[3] != 8 :
326 | raise CMBDTCError("Record didn't contain correct length fields")
327 | elif fields[2] != PID :
328 | raise CMBDTCError("Record didn't contain PID")
329 |
330 | return fields[4]
331 |
332 | #
333 | # Decrypt all the book's dkey records (contain the book PID)
334 | #
335 |
336 | def decryptDkeyRecords(data,PID):
337 | nbKeyRecords = ord(data[0])
338 | records = []
339 | data = data[1:]
340 | for i in range (0,nbKeyRecords):
341 | length = ord(data[0])
342 | try:
343 | key = decryptDkeyRecord(data[1:length+1],PID)
344 | records.append(key)
345 | except CMBDTCError:
346 | pass
347 | data = data[1+length:]
348 |
349 | return records
350 |
351 |
def createDecryptedPayload(payload):
    """Decrypt every payload record (except 'dkey') and write each one to a
    file under *payload*, with img/page/glyphs records in their own
    subdirectories (which createDecryptedBook creates beforehand)."""
    subdirs = {'img': 'img', 'page': 'page', 'glyphs': 'glyphs'}
    for name in bookHeaderRecords:
        if name == "dkey":
            continue  # key records are not part of the decrypted book
        ext = '.jpg' if name == 'img' else '.dat'
        for index in range(len(bookHeaderRecords[name])):
            fname = "%s%04d%s" % (name, index, ext)
            destdir = payload
            if name in subdirs:
                destdir = os.path.join(payload, subdirs[name])
            outputFile = os.path.join(destdir, fname)
            # bug fix: the old file(...).write(...) never explicitly closed
            # the handle; 'with' guarantees closure even on write errors
            with open(outputFile, 'wb') as out:
                out.write(getBookPayloadRecord(name, index))
370 |
371 |
# Create decrypted book
#

def createDecryptedBook(outdir):
    """Create *outdir* and its img/page/glyphs subdirectories when missing,
    then dump all decrypted payload records into them."""
    for sub in ('', 'img', 'page', 'glyphs'):
        target = os.path.join(outdir, sub) if sub else outdir
        if not os.path.exists(target):
            os.makedirs(target)

    createDecryptedPayload(outdir)
392 |
393 |
394 | #
395 | # Set the command to execute by the programm according to cmdLine parameters
396 | #
397 |
398 | def setCommand(name) :
399 | global command
400 | if command != "" :
401 | raise CMBDTCFatal("Invalid command line parameters")
402 | else :
403 | command = name
404 |
405 | #
406 | # Program usage
407 | #
408 |
409 | def usage():
410 | print("\nUsage:")
411 | print("\ncmbtc_dump_linux.py [options] bookFileName\n")
412 | print("-p Adds a PID to the list of PIDs that are tried to decrypt the book key (can be used several times)")
413 | print("-d Dumps the unencrypted book as files to outdir")
414 | print("-o Output directory to save book files to")
415 | print("-v Verbose (can be used several times)")
416 |
417 |
418 | #
419 | # Main
420 | #
421 |
422 | def main(argv=sys.argv):
423 | global bookMetadata
424 | global bookKey
425 | global bookFile
426 | global command
427 |
428 | print argv
429 |
430 | progname = os.path.basename(argv[0])
431 |
432 | verbose = 0
433 | recordName = ""
434 | recordIndex = 0
435 | outdir = ""
436 | PIDs = []
437 | command = ""
438 |
439 | # Preloads your Kindle pid from the top of the program.
440 | PIDs.append(kindlePID)
441 |
442 | try:
443 | opts, args = getopt.getopt(argv[1:], "vo:p:d")
444 | except getopt.GetoptError, err:
445 | # print help information and exit:
446 | print str(err) # will print something like "option -a not recognized"
447 | usage()
448 | sys.exit(2)
449 |
450 | if len(opts) == 0 and len(args) == 0 :
451 | usage()
452 | sys.exit(2)
453 |
454 | for o, a in opts:
455 | if o == "-v":
456 | verbose+=1
457 | if o =="-o":
458 | if a == None :
459 | raise CMBDTCFatal("Invalid parameter for -o")
460 | outdir = a
461 | if o =="-p":
462 | PIDs.append(a)
463 | if o =="-d":
464 | setCommand("doit")
465 |
466 | if command == "" :
467 | raise CMBDTCFatal("No action supplied on command line")
468 |
469 | #
470 | # Open book and parse metadata
471 | #
472 |
473 | if len(args) == 1:
474 |
475 | bookFile = openBook(args[0])
476 | parseTopazHeader()
477 | parseMetadata()
478 |
479 | #
480 | # Decrypt book key
481 | #
482 |
483 | dkey = getBookPayloadRecord('dkey', 0)
484 |
485 | bookKeys = []
486 | for PID in PIDs :
487 | bookKeys+=decryptDkeyRecords(dkey,PID)
488 |
489 | if len(bookKeys) == 0 :
490 | if verbose > 0 :
491 | print ("Book key could not be found. Maybe this book is not registered with this device.")
492 | return 1
493 | else :
494 | bookKey = bookKeys[0]
495 | if verbose > 0:
496 | print("Book key: " + bookKey.encode('hex'))
497 |
498 | if command == "printRecord" :
499 | extractBookPayloadRecord(recordName,int(recordIndex),outputFile)
500 | if outputFile != "" and verbose>0 :
501 | print("Wrote record to file: "+outputFile)
502 | elif command == "doit" :
503 | if outdir != "" :
504 | createDecryptedBook(outdir)
505 | if verbose >0 :
506 | print ("Decrypted book saved. Don't pirate!")
507 | elif verbose > 0:
508 | print("Output directory name was not supplied.")
509 | return 1
510 |
511 | return 0
512 |
513 | if __name__ == '__main__':
514 | sys.exit(main())
515 |
--------------------------------------------------------------------------------
/topaz/convert2xml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Wrap a file-like object so every write is flushed immediately.

    All other attribute access is delegated to the wrapped stream.
    """

    def __init__(self, stream):
        self.stream = stream

    def write(self, data):
        self.stream.write(data)
        self.stream.flush()

    def __getattr__(self, attr):
        # fall through to the underlying stream for everything else
        return getattr(self.stream, attr)
13 |
import sys
# Replace stdout with the auto-flushing wrapper above so progress output is
# visible immediately even when piped to a file.
sys.stdout=Unbuffered(sys.stdout)
16 |
17 | import csv
18 | import os
19 | import getopt
20 | from struct import pack
21 | from struct import unpack
22 |
23 |
# Get a 7 bit encoded number from a file-like object. The most significant
# byte comes first; every byte except the last has its high bit (0x80) set.
# A leading 0xFF byte marks a negative value. Returns None on EOF.

def readEncodedNumber(file):
    c = file.read(1)
    if len(c) == 0:
        return None
    data = ord(c)

    negative = False
    if data == 0xFF:          # negative-value marker precedes the digits
        negative = True
        c = file.read(1)
        if len(c) == 0:
            return None
        data = ord(c)

    if data >= 0x80:          # multi-byte value: accumulate 7 bits at a time
        value = data & 0x7F
        while data >= 0x80:
            c = file.read(1)
            if len(c) == 0:
                return None
            data = ord(c)
            value = (value << 7) | (data & 0x7F)
        data = value

    return -data if negative else data
54 |
55 |
# returns a binary string that encodes a number into 7-bit groups,
# most significant byte first; every byte except the last carries the
# high bit, and negative values gain a leading 0xFF marker

def encodeNumber(number):
    negative = number < 0
    if negative:
        number = -number + 1

    chunks = []
    flag = 0
    while True:
        low = (number & 0x7F) + flag
        number >>= 7
        chunks.append(chr(low))
        flag = 0x80           # all bytes after the first get the high bit
        if number == 0:
            # a positive value whose leading byte would read as 0xFF needs an
            # extra 0x80 so it cannot be mistaken for the negative marker
            if low == 0xFF and not negative:
                chunks.append(chr(0x80))
            break

    if negative:
        chunks.append(chr(0xFF))

    return ''.join(reversed(chunks))
83 |
84 |
85 |
# create a length prefixed string

def lengthPrefixString(data):
    """Return *data* preceded by its 7-bit-encoded length."""
    prefix = encodeNumber(len(data))
    return prefix + data
90 |
def readString(file):
    """Read a length-prefixed string from *file*; '' on EOF or truncation."""
    stringLength = readEncodedNumber(file)
    if stringLength is None:
        return ""
    sv = file.read(stringLength)
    if len(sv) != stringLength:   # truncated read
        return ""
    return unpack(str(stringLength) + "s", sv)[0]
99 |
100 |
# convert a number to the hex dump of its 7-bit encoding, i.e. the byte
# values you would find inside the page*.dat files being processed

def convert(i):
    return ''.join('%02x' % ord(ch) for ch in encodeNumber(i))
111 |
112 |
113 |
114 | # the complete string table used to store all book text content
115 | # as well as the xml tokens and values that make sense out of it
116 |
117 | class Dictionary(object):
118 | def __init__(self, dictFile):
119 | self.filename = dictFile
120 | self.size = 0
121 | self.fo = file(dictFile,'rb')
122 | self.stable = []
123 | self.size = readEncodedNumber(self.fo)
124 | for i in xrange(self.size):
125 | self.stable.append(self.escapestr(readString(self.fo)))
126 | self.pos = 0
127 |
128 | def escapestr(self, str):
129 | str = str.replace('&','&')
130 | str = str.replace('<','<')
131 | str = str.replace('>','>')
132 | str = str.replace('=','=')
133 | return str
134 |
135 | def lookup(self,val):
136 | if ((val >= 0) and (val < self.size)) :
137 | self.pos = val
138 | return self.stable[self.pos]
139 | else:
140 | print "Error - %d outside of string table limits" % val
141 | sys.exit(-1)
142 |
143 | def getSize(self):
144 | return self.size
145 |
146 | def getPos(self):
147 | return self.pos
148 |
149 | def dumpDict(self):
150 | for i in xrange(self.size):
151 | print "%d %s %s" % (i, convert(i), self.stable[i])
152 | return
153 |
# parses the xml snippets that are represented by each page*.dat file.
# also parses the other0.dat file - the main stylesheet
# and information used to inject the xml snippets into page*.dat files

class PageParser(object):
    def __init__(self, filename, dict, debug, flat_xml):
        # filename: page*.dat / glyphs*.dat / other0.dat file to parse
        # dict:     the book's Dictionary (string table) instance
        #           (NOTE: the parameter name shadows the builtin 'dict')
        # debug:    verbose tracing flag
        # flat_xml: emit the flattened 'name=value' form instead of xml
        self.fo = file(filename,'rb')
        self.id = os.path.basename(filename).replace('.dat','')
        self.dict = dict
        self.debug = debug
        self.flat_xml = flat_xml
        self.tagpath = []      # stack of tag names forming the current path
        self.doc = []          # top-level parsed tags
        self.snippetList = []  # numbered xml snippets collected by doLoop72
168 |
169 |
170 | # hash table used to enable the decoding process
171 | # This has all been developed by trial and error so it may still have omissions or
172 | # contain errors
173 | # Format:
174 | # tag : (number of arguments, argument type, subtags present, special case of subtags presents when escaped)
175 |
176 | token_tags = {
177 | 'x' : (1, 'scalar_number', 0, 0),
178 | 'y' : (1, 'scalar_number', 0, 0),
179 | 'h' : (1, 'scalar_number', 0, 0),
180 | 'w' : (1, 'scalar_number', 0, 0),
181 | 'firstWord' : (1, 'scalar_number', 0, 0),
182 | 'lastWord' : (1, 'scalar_number', 0, 0),
183 | 'rootID' : (1, 'scalar_number', 0, 0),
184 | 'stemID' : (1, 'scalar_number', 0, 0),
185 | 'type' : (1, 'scalar_text', 0, 0),
186 |
187 | 'info' : (0, 'number', 1, 0),
188 |
189 | 'info.word' : (0, 'number', 1, 1),
190 | 'info.word.ocrText' : (1, 'text', 0, 0),
191 | 'info.word.firstGlyph' : (1, 'raw', 0, 0),
192 | 'info.word.lastGlyph' : (1, 'raw', 0, 0),
193 | 'info.word.bl' : (1, 'raw', 0, 0),
194 | 'info.word.link_id' : (1, 'number', 0, 0),
195 |
196 | 'glyph' : (0, 'number', 1, 1),
197 | 'glyph.x' : (1, 'number', 0, 0),
198 | 'glyph.y' : (1, 'number', 0, 0),
199 | 'glyph.glyphID' : (1, 'number', 0, 0),
200 |
201 | 'dehyphen' : (0, 'number', 1, 1),
202 | 'dehyphen.rootID' : (1, 'number', 0, 0),
203 | 'dehyphen.stemID' : (1, 'number', 0, 0),
204 | 'dehyphen.stemPage' : (1, 'number', 0, 0),
205 | 'dehyphen.sh' : (1, 'number', 0, 0),
206 |
207 | 'links' : (0, 'number', 1, 1),
208 | 'links.page' : (1, 'number', 0, 0),
209 | 'links.rel' : (1, 'number', 0, 0),
210 | 'links.row' : (1, 'number', 0, 0),
211 | 'links.title' : (1, 'text', 0, 0),
212 | 'links.href' : (1, 'text', 0, 0),
213 | 'links.type' : (1, 'text', 0, 0),
214 |
215 | 'paraCont' : (0, 'number', 1, 1),
216 | 'paraCont.rootID' : (1, 'number', 0, 0),
217 | 'paraCont.stemID' : (1, 'number', 0, 0),
218 | 'paraCont.stemPage' : (1, 'number', 0, 0),
219 |
220 | 'paraStems' : (0, 'number', 1, 1),
221 | 'paraStems.stemID' : (1, 'number', 0, 0),
222 |
223 | 'wordStems' : (0, 'number', 1, 1),
224 | 'wordStems.stemID' : (1, 'number', 0, 0),
225 |
226 | 'empty' : (1, 'snippets', 1, 0),
227 |
228 | 'page' : (1, 'snippets', 1, 0),
229 | 'page.pageid' : (1, 'scalar_text', 0, 0),
230 | 'page.pagelabel' : (1, 'scalar_text', 0, 0),
231 | 'page.type' : (1, 'scalar_text', 0, 0),
232 | 'page.h' : (1, 'scalar_number', 0, 0),
233 | 'page.w' : (1, 'scalar_number', 0, 0),
234 | 'page.startID' : (1, 'scalar_number', 0, 0),
235 |
236 | 'group' : (1, 'snippets', 1, 0),
237 | 'group.type' : (1, 'scalar_text', 0, 0),
238 |
239 | 'region' : (1, 'snippets', 1, 0),
240 | 'region.type' : (1, 'scalar_text', 0, 0),
241 | 'region.x' : (1, 'scalar_number', 0, 0),
242 | 'region.y' : (1, 'scalar_number', 0, 0),
243 | 'region.h' : (1, 'scalar_number', 0, 0),
244 | 'region.w' : (1, 'scalar_number', 0, 0),
245 |
246 | 'empty_text_region' : (1, 'snippets', 1, 0),
247 |
248 | 'img' : (1, 'snippets', 1, 0),
249 | 'img.x' : (1, 'scalar_number', 0, 0),
250 | 'img.y' : (1, 'scalar_number', 0, 0),
251 | 'img.h' : (1, 'scalar_number', 0, 0),
252 | 'img.w' : (1, 'scalar_number', 0, 0),
253 | 'img.src' : (1, 'scalar_number', 0, 0),
254 | 'img.color_src' : (1, 'scalar_number', 0, 0),
255 |
256 | 'paragraph' : (1, 'snippets', 1, 0),
257 | 'paragraph.class' : (1, 'scalar_text', 0, 0),
258 | 'paragraph.firstWord' : (1, 'scalar_number', 0, 0),
259 | 'paragraph.lastWord' : (1, 'scalar_number', 0, 0),
260 |
261 | 'word_semantic' : (1, 'snippets', 1, 1),
262 | 'word_semantic.type' : (1, 'scalar_text', 0, 0),
263 | 'word_semantic.firstWord' : (1, 'scalar_number', 0, 0),
264 | 'word_semantic.lastWord' : (1, 'scalar_number', 0, 0),
265 |
266 | 'word' : (1, 'snippets', 1, 0),
267 | 'word.type' : (1, 'scalar_text', 0, 0),
268 | 'word.class' : (1, 'scalar_text', 0, 0),
269 | 'word.firstGlyph' : (1, 'scalar_number', 0, 0),
270 | 'word.lastGlyph' : (1, 'scalar_number', 0, 0),
271 |
272 | '_span' : (1, 'snippets', 1, 0),
273 | '_span.firstWord' : (1, 'scalar_number', 0, 0),
274 | '-span.lastWord' : (1, 'scalar_number', 0, 0),
275 |
276 | 'span' : (1, 'snippets', 1, 0),
277 | 'span.firstWord' : (1, 'scalar_number', 0, 0),
278 | 'span.lastWord' : (1, 'scalar_number', 0, 0),
279 |
280 | 'extratokens' : (1, 'snippets', 1, 0),
281 | 'extratokens.type' : (1, 'scalar_text', 0, 0),
282 | 'extratokens.firstGlyph' : (1, 'scalar_number', 0, 0),
283 | 'extratokens.lastGlyph' : (1, 'scalar_number', 0, 0),
284 |
285 | 'glyph.h' : (1, 'number', 0, 0),
286 | 'glyph.w' : (1, 'number', 0, 0),
287 | 'glyph.use' : (1, 'number', 0, 0),
288 | 'glyph.vtx' : (1, 'number', 0, 1),
289 | 'glyph.len' : (1, 'number', 0, 1),
290 | 'glyph.dpi' : (1, 'number', 0, 0),
291 | 'vtx' : (0, 'number', 1, 1),
292 | 'vtx.x' : (1, 'number', 0, 0),
293 | 'vtx.y' : (1, 'number', 0, 0),
294 | 'len' : (0, 'number', 1, 1),
295 | 'len.n' : (1, 'number', 0, 0),
296 |
297 | 'book' : (1, 'snippets', 1, 0),
298 | 'version' : (1, 'snippets', 1, 0),
299 | 'version.FlowEdit_1_id' : (1, 'scalar_text', 0, 0),
300 | 'version.FlowEdit_1_version' : (1, 'scalar_text', 0, 0),
301 | 'version.Schema_id' : (1, 'scalar_text', 0, 0),
302 | 'version.Schema_version' : (1, 'scalar_text', 0, 0),
303 | 'version.Topaz_version' : (1, 'scalar_text', 0, 0),
304 | 'version.WordDetailEdit_1_id' : (1, 'scalar_text', 0, 0),
305 | 'version.WordDetailEdit_1_version' : (1, 'scalar_text', 0, 0),
306 | 'version.ZoneEdit_1_id' : (1, 'scalar_text', 0, 0),
307 | 'version.ZoneEdit_1_version' : (1, 'scalar_text', 0, 0),
308 | 'version.chapterheaders' : (1, 'scalar_text', 0, 0),
309 | 'version.creation_date' : (1, 'scalar_text', 0, 0),
310 | 'version.header_footer' : (1, 'scalar_text', 0, 0),
311 | 'version.init_from_ocr' : (1, 'scalar_text', 0, 0),
312 | 'version.letter_insertion' : (1, 'scalar_text', 0, 0),
313 | 'version.xmlinj_convert' : (1, 'scalar_text', 0, 0),
314 | 'version.xmlinj_reflow' : (1, 'scalar_text', 0, 0),
315 | 'version.xmlinj_transform' : (1, 'scalar_text', 0, 0),
316 | 'version.findlists' : (1, 'scalar_text', 0, 0),
317 | 'version.page_num' : (1, 'scalar_text', 0, 0),
318 | 'version.page_type' : (1, 'scalar_text', 0, 0),
319 | 'version.bad_text' : (1, 'scalar_text', 0, 0),
320 | 'version.glyph_mismatch' : (1, 'scalar_text', 0, 0),
321 | 'version.margins' : (1, 'scalar_text', 0, 0),
322 | 'version.staggered_lines' : (1, 'scalar_text', 0, 0),
323 | 'version.paragraph_continuation' : (1, 'scalar_text', 0, 0),
324 | 'version.toc' : (1, 'scalar_text', 0, 0),
325 |
326 | 'stylesheet' : (1, 'snippets', 1, 0),
327 | 'style' : (1, 'snippets', 1, 0),
328 | 'style._tag' : (1, 'scalar_text', 0, 0),
329 | 'style.type' : (1, 'scalar_text', 0, 0),
330 | 'style._parent_type' : (1, 'scalar_text', 0, 0),
331 | 'style.class' : (1, 'scalar_text', 0, 0),
332 | 'style._after_class' : (1, 'scalar_text', 0, 0),
333 | 'rule' : (1, 'snippets', 1, 0),
334 | 'rule.attr' : (1, 'scalar_text', 0, 0),
335 | 'rule.value' : (1, 'scalar_text', 0, 0),
336 |
337 | 'original' : (0, 'number', 1, 1),
338 | 'original.pnum' : (1, 'number', 0, 0),
339 | 'original.pid' : (1, 'text', 0, 0),
340 | 'pages' : (0, 'number', 1, 1),
341 | 'pages.ref' : (1, 'number', 0, 0),
342 | 'pages.id' : (1, 'number', 0, 0),
343 | 'startID' : (0, 'number', 1, 1),
344 | 'startID.page' : (1, 'number', 0, 0),
345 | 'startID.id' : (1, 'number', 0, 0),
346 |
347 | }
348 |
349 |
    # full tag path record keeping routines
    def tag_push(self, token):
        # push a tag name onto the current path stack
        self.tagpath.append(token)
    def tag_pop(self):
        # pop the most recent tag name (no-op when the stack is empty)
        if len(self.tagpath) > 0 :
            self.tagpath.pop()
    def tagpath_len(self):
        return len(self.tagpath)
    def get_tagpath(self, i):
        # dotted path from stack depth i to the top, e.g. 'word.class'
        # NOTE(review): if i >= len(self.tagpath), 'result' is unbound and
        # this raises NameError -- callers appear to always pass a valid depth
        cnt = len(self.tagpath)
        if i < cnt : result = self.tagpath[i]
        for j in xrange(i+1, cnt) :
            result += '.' + self.tagpath[j]
        return result


    # absolute command byte values that indicate loop mechanisms typically
    # used to generate vectors; membership tests make the duplicated 0x76
    # harmless
    cmd_list = (0x76, 0x76)
370 |
371 | # peek at and return 1 byte that is ahead by i bytes
372 | def peek(self, aheadi):
373 | c = self.fo.read(aheadi)
374 | if (len(c) == 0):
375 | return None
376 | self.fo.seek(-aheadi,1)
377 | c = c[-1:]
378 | return ord(c)
379 |
380 |
381 | # get the next value from the file being processed
382 | def getNext(self):
383 | nbyte = self.peek(1);
384 | if (nbyte == None):
385 | return None
386 | val = readEncodedNumber(self.fo)
387 | return val
388 |
389 |
    # format an arg by argtype: text args are indices into the string table,
    # everything else passes through unchanged
    def formatArg(self, arg, argtype):
        if (argtype == 'text') or (argtype == 'scalar_text') :
            result = self.dict.lookup(arg)
        elif (argtype == 'raw') or (argtype == 'number') or (argtype == 'scalar_number') :
            result = arg
        elif (argtype == 'snippets') :
            result = arg
        else :
            print "Error Unknown argtype %s" % argtype
            sys.exit(-2)
        return result


    # process the next tag token, recursively handling subtags,
    # arguments, and commands; returns [tagpath, subtag-list, argtype, args]
    # (or [] for an unknown token)
    def procToken(self, token):

        known_token = False
        self.tag_push(token)

        if self.debug : print 'Processing: ', self.get_tagpath(0)
        # search from the longest dotted path down to the bare token name
        cnt = self.tagpath_len()
        for j in xrange(cnt):
            tkn = self.get_tagpath(j)
            if tkn in self.token_tags :
                num_args = self.token_tags[tkn][0]
                argtype = self.token_tags[tkn][1]
                subtags = self.token_tags[tkn][2]
                splcase = self.token_tags[tkn][3]
                ntags = -1
                known_token = True
                break

        if known_token :

            # handle subtags if present
            subtagres = []
            if (splcase == 1):
                # this type of tag uses an escape marker 0x74 to indicate a subtag count
                if self.peek(1) == 0x74:
                    skip = readEncodedNumber(self.fo)
                    subtags = 1
                    num_args = 0

            if (subtags == 1):
                ntags = readEncodedNumber(self.fo)
                if self.debug : print 'subtags: ' + token + ' has ' + str(ntags)
                for j in xrange(ntags):
                    val = readEncodedNumber(self.fo)
                    subtagres.append(self.procToken(self.dict.lookup(val)))

            # arguments can be scalars or vectors of text or numbers
            argres = []
            if num_args > 0 :
                firstarg = self.peek(1)
                if (firstarg in self.cmd_list) and (argtype != 'scalar_number') and (argtype != 'scalar_text'):
                    # single argument is a variable length vector of data
                    arg = readEncodedNumber(self.fo)
                    argres = self.decodeCMD(arg,argtype)
                else :
                    # num_arg scalar arguments
                    for i in xrange(num_args):
                        argres.append(self.formatArg(readEncodedNumber(self.fo), argtype))

            # build the return tag
            result = []
            tkn = self.get_tagpath(0)
            result.append(tkn)
            result.append(subtagres)
            result.append(argtype)
            result.append(argres)
            self.tag_pop()
            return result

        # all tokens that need to be processed should be in the hash
        # table; if not it may indicate a problem, either a new token
        # or an out of sync condition
        else:
            result = []
            if (self.debug):
                print 'Unknown Token:', token
            self.tag_pop()
            return result


    # special loop used to process code snippets
    # it is NEVER used to format arguments.
    # builds the snippetList as [snippet-number, parsed-tag] pairs
    def doLoop72(self, argtype):
        cnt = readEncodedNumber(self.fo)
        if self.debug :
            result = 'Set of '+ str(cnt) + ' xml snippets. The overall structure \n'
            result += 'of the document is indicated by snippet number sets at the\n'
            result += 'end of each snippet. \n'
            print result
        for i in xrange(cnt):
            if self.debug: print 'Snippet:',str(i)
            snippet = []
            snippet.append(i)
            val = readEncodedNumber(self.fo)
            snippet.append(self.procToken(self.dict.lookup(val)))
            self.snippetList.append(snippet)
        return
494 |
495 |
496 |
497 | # general loop code gracisouly submitted by "skindle" - thank you!
498 | def doLoop76Mode(self, argtype, cnt, mode):
499 | result = []
500 | adj = 0
501 | if mode & 1:
502 | adj = readEncodedNumber(self.fo)
503 | mode = mode >> 1
504 | x = []
505 | for i in xrange(cnt):
506 | x.append(readEncodedNumber(self.fo) - adj)
507 | for i in xrange(mode):
508 | for j in xrange(1, cnt):
509 | x[j] = x[j] + x[j - 1]
510 | for i in xrange(cnt):
511 | result.append(self.formatArg(x[i],argtype))
512 | return result
513 |
514 |
515 | # dispatches loop commands bytes with various modes
516 | # The 0x76 style loops are used to build vectors
517 |
518 | # This was all derived by trial and error and
519 | # new loop types may exist that are not handled here
520 | # since they did not appear in the test cases
521 |
522 | def decodeCMD(self, cmd, argtype):
523 | if (cmd == 0x76):
524 |
525 | # loop with cnt, and mode to control loop styles
526 | cnt = readEncodedNumber(self.fo)
527 | mode = readEncodedNumber(self.fo)
528 |
529 | if self.debug : print 'Loop for', cnt, 'with mode', mode, ': '
530 | return self.doLoop76Mode(argtype, cnt, mode)
531 |
532 | if self.dbug: print "Unknown command", cmd
533 | result = []
534 | return result
535 |
536 |
537 |
538 | # add full tag path to injected snippets
539 | def updateName(self, tag, prefix):
540 | name = tag[0]
541 | subtagList = tag[1]
542 | argtype = tag[2]
543 | argList = tag[3]
544 | nname = prefix + '.' + name
545 | nsubtaglist = []
546 | for j in subtagList:
547 | nsubtaglist.append(self.updateName(j,prefix))
548 | ntag = []
549 | ntag.append(nname)
550 | ntag.append(nsubtaglist)
551 | ntag.append(argtype)
552 | ntag.append(argList)
553 | return ntag
554 |
555 |
556 |
    # perform depth first injection of specified snippets into this one
    # snippet is [snipno, tag]; each value in the tag's argList is treated as
    # an index into self.snippetList, and the referenced snippets are
    # (recursively) injected as subtags renamed under this tag's name, after
    # which the arg list is cleared and argtype reset to 'number'
    def injectSnippets(self, snippet):
        snipno, tag = snippet
        name = tag[0]
        subtagList = tag[1]
        argtype = tag[2]
        argList = tag[3]
        nsubtagList = []
        if len(argList) > 0 :
            for j in argList:
                asnip = self.snippetList[j]
                aso, atag = self.injectSnippets(asnip)
                atag = self.updateName(atag, name)
                nsubtagList.append(atag)
            argtype='number'
            argList=[]
        if len(nsubtagList) > 0 :
            # note: extends the existing subtag list in place
            subtagList.extend(nsubtagList)
        tag = []
        tag.append(name)
        tag.append(subtagList)
        tag.append(argtype)
        tag.append(argList)
        snippet = []
        snippet.append(snipno)
        snippet.append(tag)
        return snippet
584 |
585 |
586 |
587 | # format the tag for output
588 | def formatTag(self, node):
589 | name = node[0]
590 | subtagList = node[1]
591 | argtype = node[2]
592 | argList = node[3]
593 | fullpathname = name.split('.')
594 | nodename = fullpathname.pop()
595 | ilvl = len(fullpathname)
596 | indent = ' ' * (3 * ilvl)
597 | result = indent + '<' + nodename + '>'
598 | if len(argList) > 0:
599 | argres = ''
600 | for j in argList:
601 | if (argtype == 'text') or (argtype == 'scalar_text') :
602 | argres += j + '|'
603 | else :
604 | argres += str(j) + ','
605 | argres = argres[0:-1]
606 | if argtype == 'snippets' :
607 | result += 'snippets:' + argres
608 | else :
609 | result += argres
610 | if len(subtagList) > 0 :
611 | result += '\n'
612 | for j in subtagList:
613 | if len(j) > 0 :
614 | result += self.formatTag(j)
615 | result += indent + '' + nodename + '>\n'
616 | else:
617 | result += '' + nodename + '>\n'
618 | return result
619 |
620 |
621 | # flatten tag
622 | def flattenTag(self, node):
623 | name = node[0]
624 | subtagList = node[1]
625 | argtype = node[2]
626 | argList = node[3]
627 | result = name
628 | if (len(argList) > 0):
629 | argres = ''
630 | for j in argList:
631 | if (argtype == 'text') or (argtype == 'scalar_text') :
632 | argres += j + '|'
633 | else :
634 | argres += str(j) + '|'
635 | argres = argres[0:-1]
636 | if argtype == 'snippets' :
637 | result += '.snippets=' + argres
638 | else :
639 | result += '=' + argres
640 | result += '\n'
641 | for j in subtagList:
642 | if len(j) > 0 :
643 | result += self.flattenTag(j)
644 | return result
645 |
646 |
    # render every top-level parsed tag, either flattened or as xml
    def formatDoc(self, flat_xml):
        result = ''
        for j in self.doc :
            if len(j) > 0:
                if flat_xml:
                    result += self.flattenTag(j)
                else:
                    result += self.formatTag(j)
        if self.debug : print result
        return result



    # main loop - parse the page.dat files
    # to create structured document and snippets

    # FIXME: value at end of magic appears to be a subtags count
    # but for what? For now, inject an 'info' tag as it is in
    # every dictionary and seems close to what is meant
    # The alternative is to special case the last _ "0x5f" to mean something

    def process(self):
        # Parse the whole file and return its xml (or flattened) description.

        # peek at the first bytes to see what type of file it is
        magic = self.fo.read(9)
        if (magic[0:1] == 'p') and (magic[2:9] == 'marker_'):
            first_token = 'info'
        elif (magic[0:1] == 'p') and (magic[2:9] == '__PAGE_'):
            skip = self.fo.read(2)
            first_token = 'info'
        elif (magic[0:1] == 'p') and (magic[2:8] == '_PAGE_'):
            first_token = 'info'
        elif (magic[0:1] == 'g') and (magic[2:9] == '__GLYPH'):
            skip = self.fo.read(3)
            first_token = 'info'
        else :
            # other0.dat file
            first_token = None
            self.fo.seek(-9,1)


        # main loop to read and build the document tree
        while True:

            if first_token != None :
                # use "inserted" first token 'info' for page and glyph files
                tag = self.procToken(first_token)
                if len(tag) > 0 :
                    self.doc.append(tag)
                first_token = None

            v = self.getNext()
            if (v == None):
                break

            if (v == 0x72):
                # snippet-set loop marker
                self.doLoop72('number')
            elif (v > 0) and (v < self.dict.getSize()) :
                # a string table index names the next tag to process
                tag = self.procToken(self.dict.lookup(v))
                if len(tag) > 0 :
                    self.doc.append(tag)
            else:
                if self.debug:
                    print "Main Loop:  Unknown value: %x" % v
                if (v == 0):
                    # a 0 followed by 0x5f ('_') reinserts the 'info' token
                    if (self.peek(1) == 0x5f):
                        skip = self.fo.read(1)
                        first_token = 'info'

        # now do snippet injection
        if len(self.snippetList) > 0 :
            if self.debug : print 'Injecting Snippets:'
            snippet = self.injectSnippets(self.snippetList[0])
            snipno = snippet[0]
            tag_add = snippet[1]
            if self.debug : print self.formatTag(tag_add)
            if len(tag_add) > 0:
                self.doc.append(tag_add)

        # handle generation of xml output
        xmlpage = self.formatDoc(self.flat_xml)

        return xmlpage
731 |
732 |
733 |
def usage():
    # Print the command-line help text.
    print 'Usage: '
    print '    convert2xml.py dict0000.dat infile.dat '
    print ' '
    print ' Options:'
    print '   -h            print this usage help message '
    print '   -d            turn on debug output to check for potential errors '
    print '   --flat-xml    output the flattened xml page description only '
    print ' '
    print '     This program will attempt to convert a page*.dat file or '
    print '     glyphs*.dat file, using the dict0000.dat file, to its xml description. '
    print ' '
    print '     Use "cmbtc_dump.py" first to unencrypt, uncompress, and dump '
    print '     the *.dat files from a Topaz format e-book.'

#
# Main
#

def main(argv):
    # When called with an empty argv (as the __main__ guard does), fall back
    # to sys.argv and print the generated xml; when called programmatically
    # with a real argv, return the xml string instead.
    dictFile = ""
    pageFile = ""
    debug = False
    flat_xml = False
    printOutput = False
    if len(argv) == 0:
        printOutput = True
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "hd", ["flat-xml"])

    except getopt.GetoptError, err:

        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    if len(opts) == 0 and len(args) == 0 :
        usage()
        sys.exit(2)

    for o, a in opts:
        if o =="-d":
            debug=True
        if o =="-h":
            usage()
            sys.exit(0)
        if o =="--flat-xml":
            flat_xml = True

    # NOTE(review): args length is not validated; fewer than two positional
    # arguments raises IndexError here
    dictFile, pageFile = args[0], args[1]

    # read in the string table dictionary
    dict = Dictionary(dictFile)
    # dict.dumpDict()

    # create a page parser
    pp = PageParser(pageFile, dict, debug, flat_xml)

    xmlpage = pp.process()

    if printOutput:
        print xmlpage
        return 0

    return xmlpage

if __name__ == '__main__':
    sys.exit(main(''))
--------------------------------------------------------------------------------
/topaz/decode_meta.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
5 | import csv
6 | import sys
7 | import os
8 | import getopt
9 | from struct import pack
10 | from struct import unpack
11 |
12 | #
13 | # Get a 7 bit encoded number from string
14 | #
15 |
def readEncodedNumber(file):
    """Decode one 7-bit variable-length integer from an open file.

    A leading 0xFF byte marks a negative value.  Bytes with the high bit
    set each carry 7 payload bits; the first byte without the high bit
    ends the number.  Returns None on end of file (including mid-number).
    """
    negative = False
    byte = file.read(1)
    if (len(byte) == 0):
        return None
    value = ord(byte)

    if value == 0xFF:
        # negative marker -- the magnitude follows in the next bytes
        negative = True
        byte = file.read(1)
        if (len(byte) == 0):
            return None
        value = ord(byte)

    if value >= 0x80:
        # multi-byte form: accumulate 7 bits per continuation byte
        accum = value & 0x7F
        while value >= 0x80 :
            byte = file.read(1)
            if (len(byte) == 0):
                return None
            value = ord(byte)
            accum = (accum << 7) + (value & 0x7F)
        value = accum

    if negative:
        value = -value
    return value
43 |
44 | #
45 | # Encode a number in 7 bit format
46 | #
47 |
def encodeNumber(number):
    """Encode an integer in the Topaz 7-bit variable-length format.

    Continuation bytes carry the high bit; negative values are prefixed
    with 0xFF.  NOTE(review): negatives are stored as -number + 1, so
    readEncodedNumber returns n-1 for a negative n -- kept as-is for
    format parity with the rest of the Topaz tooling.
    """
    negative = number < 0
    if negative:
        number = -number + 1

    pieces = []
    continuation = 0
    while True:
        low = (number & 0x7F) + continuation
        number = number >> 7
        pieces.append(chr(low))
        continuation = 0x80
        if number == 0:
            # a bare terminal 0xFF would look like the negative marker,
            # so pad it with an extra 0x80 continuation byte
            if low == 0xFF and not negative:
                pieces.append(chr(0x80))
            break

    if negative:
        pieces.append(chr(0xFF))

    # bytes were produced least-significant first; emit high byte first
    return ''.join(reversed(pieces))
72 |
73 | #
74 | # Get a length prefixed string from the file
75 | #
def lengthPrefixString(data):
    """Return *data* preceded by its length as a 7-bit encoded number."""
    prefix = encodeNumber(len(data))
    return prefix + data
78 |
def readString(file):
    """Read one length-prefixed string from the file.

    Returns None on end of file, and '' when the payload is truncated.
    """
    count = readEncodedNumber(file)
    if (count == None):
        return None
    raw = file.read(count)
    if (len(raw) != count):
        # truncated file: signal with an empty string
        return ""
    return unpack(str(count) + "s", raw)[0]
87 |
88 |
89 |
def getMetaArray(metaFile):
    """Parse the Topaz metadata file into a dict mapping name -> value.

    The file holds a 7-bit encoded entry count followed by that many
    (key, value) length-prefixed string pairs.
    """
    result = {}
    fo = file(metaFile,'rb')
    try:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            temp = readString(fo)
            result[temp] = readString(fo)
    finally:
        # close the handle even if a read raises mid-file (the original
        # leaked the descriptor on any exception)
        fo.close()
    return result
100 |
101 |
102 |
def getMetaData(metaFile):
    """Render the Topaz metadata file as 'key|value' lines, one per entry."""
    result = ''
    fo = file(metaFile,'rb')
    try:
        size = readEncodedNumber(fo)
        for i in xrange(size):
            result += readString(fo) + '|'
            result += readString(fo) + '\n'
    finally:
        # close the handle even if a read raises mid-file (the original
        # leaked the descriptor on any exception)
        fo.close()
    return result
113 |
--------------------------------------------------------------------------------
/topaz/flatxml2html.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
5 | import sys
6 | import csv
7 | import os
8 | import math
9 | import getopt
10 | from struct import pack
11 | from struct import unpack
12 |
13 |
14 | class DocParser(object):
15 | def __init__(self, flatxml, classlst, fileid, bookDir, fixedimage):
16 | self.id = os.path.basename(fileid).replace('.dat','')
17 | self.svgcount = 0
18 | self.docList = flatxml.split('\n')
19 | self.docSize = len(self.docList)
20 | self.classList = {}
21 | self.bookDir = bookDir
22 | self.glyphPaths = { }
23 | self.numPaths = 0
24 | tmpList = classlst.split('\n')
25 | for pclass in tmpList:
26 | if pclass != '':
27 | # remove the leading period from the css name
28 | cname = pclass[1:]
29 | self.classList[cname] = True
30 | self.fixedimage = fixedimage
31 | self.ocrtext = []
32 | self.link_id = []
33 | self.link_title = []
34 | self.link_page = []
35 | self.link_href = []
36 | self.link_type = []
37 | self.dehyphen_rootid = []
38 | self.paracont_stemid = []
39 | self.parastems_stemid = []
40 |
41 |
42 | def getGlyph(self, gid):
43 | result = ''
44 | id='gl%d' % gid
45 | return self.glyphPaths[id]
46 |
47 |
    # Render a list of glyph indices as a standalone svg image file
    # (img/<pageid>_NNNN.svg) built from the shared svg/glyphs.svg table.
    def glyphs_to_image(self, glyphList):

        # pull an integer attribute value (e.g. width=NN) out of a path line
        def extract(path, key):
            b = path.find(key) + len(key)
            e = path.find(' ',b)
            return int(path[b:e])

        # pull a quoted attribute value (e.g. id="...") out of a path line
        def extractID(path, key):
            b = path.find(key) + len(key)
            e = path.find('"',b)
            return path[b:e]


        svgDir = os.path.join(self.bookDir,'svg')
        glyfile = os.path.join(svgDir,'glyphs.svg')

        imgDir = os.path.join(self.bookDir,'img')
        imgname = self.id + '_%04d.svg' % self.svgcount
        imgfile = os.path.join(imgDir,imgname)

        # build hashtable of glyph paths keyed by glyph id
        # (done once per DocParser; later calls reuse self.glyphPaths)
        if self.numPaths == 0:
            gfile = open(glyfile, 'r')
            while True:
                path = gfile.readline()
                if (path == ''): break
                glyphid = extractID(path,'id="')
                self.glyphPaths[glyphid] = path
                self.numPaths += 1
            gfile.close()


        # get glyph information
        gxList = self.getData('info.glyph.x',0,-1)
        gyList = self.getData('info.glyph.y',0,-1)
        gidList = self.getData('info.glyph.glyphID',0,-1)

        gids = []
        maxws = []
        maxhs = []
        xs = []
        ys = []
        gdefs = []

        # get path definitions, positions, dimensions for each glyph
        # that makes up the image, and find min x and min y to reposition origin
        minx = -1
        miny = -1
        for j in glyphList:
            gid = gidList[j]
            gids.append(gid)

            xs.append(gxList[j])
            if minx == -1: minx = gxList[j]
            else : minx = min(minx, gxList[j])

            ys.append(gyList[j])
            if miny == -1: miny = gyList[j]
            else : miny = min(miny, gyList[j])

            path = self.getGlyph(gid)
            gdefs.append(path)

            maxws.append(extract(path,'width='))
            maxhs.append(extract(path,'height='))


        # change the origin to minx, miny and calc max height and width
        maxw = maxws[0] + xs[0] - minx
        maxh = maxhs[0] + ys[0] - miny
        for j in xrange(0, len(xs)):
            xs[j] = xs[j] - minx
            ys[j] = ys[j] - miny
            maxw = max( maxw, (maxws[j] + xs[j]) )
            maxh = max( maxh, (maxhs[j] + ys[j]) )

        # open the image file for output
        # NOTE(review): the svg markup written below appears truncated in
        # this copy of the file (the literals lost their tag text and
        # several source lines are missing); verify against the original
        # topaz scripts before editing.
        ifile = open(imgfile,'w')
        ifile.write('\n')
        ifile.write('\n')
        ifile.write('')
        ifile.close()

        return 0
139 |
140 |
141 |
142 | # return tag at line pos in document
143 | def lineinDoc(self, pos) :
144 | if (pos >= 0) and (pos < self.docSize) :
145 | item = self.docList[pos]
146 | if item.find('=') >= 0:
147 | (name, argres) = item.split('=',1)
148 | else :
149 | name = item
150 | argres = ''
151 | return name, argres
152 |
153 |
154 | # find tag in doc if within pos to end inclusive
155 | def findinDoc(self, tagpath, pos, end) :
156 | result = None
157 | if end == -1 :
158 | end = self.docSize
159 | else:
160 | end = min(self.docSize, end)
161 | foundat = -1
162 | for j in xrange(pos, end):
163 | item = self.docList[j]
164 | if item.find('=') >= 0:
165 | (name, argres) = item.split('=',1)
166 | else :
167 | name = item
168 | argres = ''
169 | if name.endswith(tagpath) :
170 | result = argres
171 | foundat = j
172 | break
173 | return foundat, result
174 |
175 |
176 | # return list of start positions for the tagpath
177 | def posinDoc(self, tagpath):
178 | startpos = []
179 | pos = 0
180 | res = ""
181 | while res != None :
182 | (foundpos, res) = self.findinDoc(tagpath, pos, -1)
183 | if res != None :
184 | startpos.append(foundpos)
185 | pos = foundpos + 1
186 | return startpos
187 |
188 |
189 | # returns a vector of integers for the tagpath
190 | def getData(self, tagpath, pos, end):
191 | argres=[]
192 | (foundat, argt) = self.findinDoc(tagpath, pos, end)
193 | if (argt != None) and (len(argt) > 0) :
194 | argList = argt.split('|')
195 | argres = [ int(strval) for strval in argList]
196 | return argres
197 |
198 |
199 | # get the class
200 | def getClass(self, pclass):
201 | nclass = pclass
202 |
203 | # class names are an issue given topaz may start them with numerals (not allowed),
204 | # use a mix of cases (which cause some browsers problems), and actually
205 | # attach numbers after "_reclustered*" to the end to deal classeses that inherit
206 | # from a base class (but then not actually provide all of these _reclustereed
207 | # classes in the stylesheet!
208 |
209 | # so we clean this up by lowercasing, prepend 'cl-', and getting any baseclass
210 | # that exists in the stylesheet first, and then adding this specific class
211 | # after
212 |
213 | # also some class names have spaces in them so need to convert to dashes
214 | if nclass != None :
215 | nclass = nclass.replace(' ','-')
216 | classres = ''
217 | nclass = nclass.lower()
218 | nclass = 'cl-' + nclass
219 | baseclass = ''
220 | # graphic is the base class for captions
221 | if nclass.find('cl-cap-') >=0 :
222 | classres = 'graphic' + ' '
223 | else :
224 | # strip to find baseclass
225 | p = nclass.find('_')
226 | if p > 0 :
227 | baseclass = nclass[0:p]
228 | if baseclass in self.classList:
229 | classres += baseclass + ' '
230 | classres += nclass
231 | nclass = classres
232 | return nclass
233 |
234 |
    # develop a sorted description of the starting positions of
    # groups and regions on the page, as well as the page type
    def PageDescription(self):

        # py2 cmp-style comparator: order entries by line position only
        def compare(x, y):
            (xtype, xval) = x
            (ytype, yval) = y
            if xval > yval:
                return 1
            if xval == yval:
                return 0
            return -1

        result = []
        (pos, pagetype) = self.findinDoc('page.type',0,-1)

        groupList = self.posinDoc('page.group')
        groupregionList = self.posinDoc('page.group.region')
        pageregionList = self.posinDoc('page.region')
        # integrate into one list of (kind, line-position) entries
        for j in groupList:
            result.append(('grpbeg',j))
        for j in groupregionList:
            result.append(('gregion',j))
        for j in pageregionList:
            result.append(('pregion',j))
        result.sort(compare)

        # insert group end and page end indicators: a group closes when
        # another group begins or a page-level region appears
        inGroup = False
        j = 0
        while True:
            if j == len(result): break
            rtype = result[j][0]
            rval = result[j][1]
            if not inGroup and (rtype == 'grpbeg') :
                inGroup = True
                j = j + 1
            elif inGroup and (rtype in ('grpbeg', 'pregion')):
                result.insert(j,('grpend',rval))
                inGroup = False
            else:
                j = j + 1
        if inGroup:
            result.append(('grpend',-1))
        result.append(('pageend', -1))
        return pagetype, result
282 |
283 |
284 |
    # build a description of the paragraph
    # returns (css class, list of (type, index) entries) where type is
    # 'ocr' (word index), 'img'/'imgsa' (image index) or 'svg' (svg index)
    def getParaDescription(self, start, end, regtype):

        result = []

        # paragraph
        (pos, pclass) = self.findinDoc('paragraph.class',start,end)

        pclass = self.getClass(pclass)

        # build up a description of the paragraph in result and return it
        # first check for the basic - all words paragraph
        (pos, sfirst) = self.findinDoc('paragraph.firstWord',start,end)
        (pos, slast) = self.findinDoc('paragraph.lastWord',start,end)
        if (sfirst != None) and (slast != None) :
            first = int(sfirst)
            last = int(slast)

            # vertical/table (and optionally fixed or inverted) text can
            # not be rendered as ocr words; convert it to an svg image
            makeImage = (regtype == 'vertical') or (regtype == 'table')
            if self.fixedimage:
                makeImage = makeImage or (regtype == 'fixed')

            if (pclass != None):
                makeImage = makeImage or (pclass.find('.inverted') >= 0)
                if self.fixedimage :
                    makeImage = makeImage or (pclass.find('cl-f-') >= 0)

            if not makeImage :
                # standard all word paragraph
                for wordnum in xrange(first, last):
                    result.append(('ocr', wordnum))
                return pclass, result

            # convert paragraph to svg image
            # translate first and last word into first and last glyphs
            # and generate inline image and include it
            glyphList = []
            firstglyphList = self.getData('word.firstGlyph',0,-1)
            gidList = self.getData('info.glyph.glyphID',0,-1)
            firstGlyph = firstglyphList[first]
            if last < len(firstglyphList):
                lastGlyph = firstglyphList[last]
            else :
                lastGlyph = len(gidList)
            for glyphnum in xrange(firstGlyph, lastGlyph):
                glyphList.append(glyphnum)
            # include any extratokens if they exist
            (pos, sfg) = self.findinDoc('extratokens.firstGlyph',start,end)
            (pos, slg) = self.findinDoc('extratokens.lastGlyph',start,end)
            if (sfg != None) and (slg != None):
                for glyphnum in xrange(int(sfg), int(slg)):
                    glyphList.append(glyphnum)
            num = self.svgcount
            self.glyphs_to_image(glyphList)
            self.svgcount += 1
            result.append(('svg', num))
            return pclass, result

        # this type of paragraph may be made up of multiple spans, inline
        # word monograms (images), and words with semantic meaning,
        # plus glyphs used to form starting letter of first word

        # need to parse this type line by line
        line = start + 1
        word_class = ''

        # if end is -1 then we must search to end of document
        if end == -1 :
            end = self.docSize

        # seems some xml has last* coming before first* so we have to
        # handle any order
        sp_first = -1
        sp_last = -1

        gl_first = -1
        gl_last = -1

        ws_first = -1
        ws_last = -1

        word_class = ''

        while (line < end) :

            (name, argres) = self.lineinDoc(line)

            if name.endswith('span.firstWord') :
                sp_first = int(argres)

            elif name.endswith('span.lastWord') :
                sp_last = int(argres)

            elif name.endswith('word.firstGlyph') :
                gl_first = int(argres)

            elif name.endswith('word.lastGlyph') :
                gl_last = int(argres)

            elif name.endswith('word_semantic.firstWord'):
                ws_first = int(argres)

            elif name.endswith('word_semantic.lastWord'):
                ws_last = int(argres)

            elif name.endswith('word.class'):
                # e.g. 'spaceafter-1': remember to append a space ('sa')
                (cname, space) = argres.split('-',1)
                if space == '' : space = '0'
                if (cname == 'spaceafter') and (int(space) > 0) :
                    word_class = 'sa'

            elif name.endswith('word.img.src'):
                result.append(('img' + word_class, int(argres)))
                word_class = ''

            elif name.endswith('region.img.src'):
                result.append(('img' + word_class, int(argres)))

            # flush a completed span / glyph run / semantic-word run as
            # soon as both of its ends have been seen (in either order)
            if (sp_first != -1) and (sp_last != -1):
                for wordnum in xrange(sp_first, sp_last):
                    result.append(('ocr', wordnum))
                sp_first = -1
                sp_last = -1

            if (gl_first != -1) and (gl_last != -1):
                glyphList = []
                for glyphnum in xrange(gl_first, gl_last):
                    glyphList.append(glyphnum)
                num = self.svgcount
                self.glyphs_to_image(glyphList)
                self.svgcount += 1
                result.append(('svg', num))
                gl_first = -1
                gl_last = -1

            if (ws_first != -1) and (ws_last != -1):
                for wordnum in xrange(ws_first, ws_last):
                    result.append(('ocr', wordnum))
                ws_first = -1
                ws_last = -1

            line += 1

        return pclass, result
429 |
430 |
    # render one paragraph description (from getParaDescription) as html;
    # type is 'full' | 'begin' | 'middle' | 'end' for paragraphs that are
    # split across pages.
    # NOTE(review): several html literals in this method appear truncated
    # in this copy of the file (tag text missing); verify against the
    # original topaz scripts before editing them.
    def buildParagraph(self, pclass, pdesc, type, regtype) :
        parares = ''
        sep =''

        classres = ''
        if pclass :
            classres = ' class="' + pclass + '"'

        # these region types turn '_lb_' markers into hard line breaks
        br_lb = (regtype == 'fixed') or (regtype == 'chapterheading') or (regtype == 'vertical')

        handle_links = len(self.link_id) > 0

        if (type == 'full') or (type == 'begin') :
            parares += ''

        if (type == 'end'):
            parares += ' '

        lstart = len(parares)

        cnt = len(pdesc)

        for j in xrange( 0, cnt) :

            (wtype, num) = pdesc[j]

            if wtype == 'ocr' :
                word = self.ocrtext[num]
                sep = ' '

                if handle_links:
                    # link ids are 1-based; 0 means no link at this word
                    link = self.link_id[num]
                    if (link > 0):
                        linktype = self.link_type[link-1]
                        title = self.link_title[link-1]
                        if (title == "") or (parares.rfind(title) < 0):
                            title=parares[lstart:]
                        if linktype == 'external' :
                            linkhref = self.link_href[link-1]
                            linkhtml = '' % linkhref
                        else :
                            if len(self.link_page) >= link :
                                ptarget = self.link_page[link-1] - 1
                                linkhtml = '' % ptarget
                            else :
                                # just link to the current page
                                linkhtml = ''
                        linkhtml += title + ''
                        # splice the link markup in over the title text
                        pos = parares.rfind(title)
                        if pos >= 0:
                            parares = parares[0:pos] + linkhtml + parares[pos+len(title):]
                        else :
                            parares += linkhtml
                        lstart = len(parares)
                        if word == '_link_' : word = ''
                    elif (link < 0) :
                        if word == '_link_' : word = ''

                if word == '_lb_':
                    if ((num-1) in self.dehyphen_rootid ) or handle_links:
                        word = ''
                        sep = ''
                    elif br_lb :
                        word = '
\n'
                        sep = ''
                    else :
                        word = '\n'
                        sep = ''

                if num in self.dehyphen_rootid :
                    # drop the trailing hyphen and join with the next word
                    word = word[0:-1]
                    sep = ''

                parares += word + sep

            elif wtype == 'img' :
                sep = ''
                parares += '
' % num
                parares += sep

            elif wtype == 'imgsa' :
                sep = ' '
                parares += '
' % num
                parares += sep

            elif wtype == 'svg' :
                sep = ''
                parares += '
' % num
                parares += sep

        # drop the trailing separator, then close the paragraph if needed
        if len(sep) > 0 : parares = parares[0:-1]
        if (type == 'full') or (type == 'end') :
            parares += '
'
        return parares
525 |
526 |
527 |
    # walk the document tree collecting the information needed
    # to build an html page using the ocrText
    # NOTE(review): many html literals in this method appear truncated in
    # this copy of the file (tag text missing, some source lines absent);
    # verify against the original topaz scripts before editing them.
    def process(self):

        htmlpage = ''

        # get the ocr text
        (pos, argres) = self.findinDoc('info.word.ocrText',0,-1)
        if argres : self.ocrtext = argres.split('|')

        # get information to dehyphenate the text
        self.dehyphen_rootid = self.getData('info.dehyphen.rootID',0,-1)

        # determine if first paragraph is continued from previous page
        (pos, self.parastems_stemid) = self.findinDoc('info.paraStems.stemID',0,-1)
        first_para_continued = (self.parastems_stemid != None)

        # determine if last paragraph is continued onto the next page
        (pos, self.paracont_stemid) = self.findinDoc('info.paraCont.stemID',0,-1)
        last_para_continued = (self.paracont_stemid != None)

        # collect link ids
        self.link_id = self.getData('info.word.link_id',0,-1)

        # collect link destination page numbers
        self.link_page = self.getData('info.links.page',0,-1)

        # collect link types (container versus external)
        (pos, argres) = self.findinDoc('info.links.type',0,-1)
        if argres : self.link_type = argres.split('|')

        # collect link destinations
        (pos, argres) = self.findinDoc('info.links.href',0,-1)
        if argres : self.link_href = argres.split('|')

        # collect link titles
        (pos, argres) = self.findinDoc('info.links.title',0,-1)
        if argres :
            self.link_title = argres.split('|')
        else:
            self.link_title.append('')

        # get a descriptions of the starting points of the regions
        # and groups on the page
        (pagetype, pageDesc) = self.PageDescription()
        regcnt = len(pageDesc) - 1

        anchorSet = False
        breakSet = False
        inGroup = False

        # process each region on the page and convert what you can to html

        for j in xrange(regcnt):

            (etype, start) = pageDesc[j]
            (ntype, end) = pageDesc[j+1]


            # set anchor for link target on this page
            if not anchorSet and not first_para_continued:
                htmlpage += '\n'
                anchorSet = True

            # handle groups of graphics with text captions
            if (etype == 'grpbeg'):
                (pos, grptype) = self.findinDoc('group.type', start, end)
                if grptype != None:
                    if grptype == 'graphic':
                        gcstr = ' class="' + grptype + '"'
                        htmlpage += ''
                        inGroup = True

            elif (etype == 'grpend'):
                if inGroup:
                    htmlpage += '
\n'
                    inGroup = False

            else:
                (pos, regtype) = self.findinDoc('region.type',start,end)

                if regtype == 'graphic' :
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        if inGroup:
                            htmlpage += '
' % int(simgsrc)
                        else:
                            htmlpage += '' % int(simgsrc)

                elif regtype == 'chapterheading' :
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if not breakSet:
                        htmlpage += '
\n'
                        breakSet = True
                    # pick the heading level from the chN- class prefix
                    tag = 'h1'
                    if pclass and (len(pclass) >= 7):
                        if pclass[3:7] == 'ch1-' : tag = 'h1'
                        if pclass[3:7] == 'ch2-' : tag = 'h2'
                        if pclass[3:7] == 'ch3-' : tag = 'h3'
                        htmlpage += '<' + tag + ' class="' + pclass + '">'
                    else:
                        htmlpage += '<' + tag + '>'
                    htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                    htmlpage += '' + tag + '>'

                elif (regtype == 'text') or (regtype == 'fixed') or (regtype == 'insert') or (regtype == 'listitem'):
                    ptype = 'full'
                    # check to see if this is a continution from the previous page
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    if pclass and (len(pclass) >= 6) and (ptype == 'full'):
                        tag = 'p'
                        if pclass[3:6] == 'h1-' : tag = 'h4'
                        if pclass[3:6] == 'h2-' : tag = 'h5'
                        if pclass[3:6] == 'h3-' : tag = 'h6'
                        htmlpage += '<' + tag + ' class="' + pclass + '">'
                        htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                        htmlpage += '' + tag + '>'
                    else :
                        htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)

                elif (regtype == 'tocentry') :
                    ptype = 'full'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


                elif (regtype == 'vertical') or (regtype == 'table') :
                    ptype = 'full'
                    if inGroup:
                        ptype = 'middle'
                    if first_para_continued :
                        ptype = 'end'
                        first_para_continued = False
                    (pclass, pdesc) = self.getParaDescription(start, end, regtype)
                    htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)


                elif (regtype == 'synth_fcvr.center'):
                    (pos, simgsrc) = self.findinDoc('img.src',start,end)
                    if simgsrc:
                        htmlpage += '' % int(simgsrc)

                else :
                    # unknown region: guess text vs graphic from its contents
                    print ' Making region type', regtype,
                    (pos, temp) = self.findinDoc('paragraph',start,end)
                    (pos2, temp) = self.findinDoc('span',start,end)
                    if pos != -1 or pos2 != -1:
                        print ' a "text" region'
                        orig_regtype = regtype
                        regtype = 'fixed'
                        ptype = 'full'
                        # check to see if this is a continution from the previous page
                        if first_para_continued :
                            ptype = 'end'
                            first_para_continued = False
                        (pclass, pdesc) = self.getParaDescription(start,end, regtype)
                        if not pclass:
                            # derive alignment from the region type suffix
                            if orig_regtype.endswith('.right') : pclass = 'cl-right'
                            elif orig_regtype.endswith('.center') : pclass = 'cl-center'
                            elif orig_regtype.endswith('.left') : pclass = 'cl-left'
                            elif orig_regtype.endswith('.justify') : pclass = 'cl-justify'
                        if pclass and (ptype == 'full') and (len(pclass) >= 6):
                            tag = 'p'
                            if pclass[3:6] == 'h1-' : tag = 'h4'
                            if pclass[3:6] == 'h2-' : tag = 'h5'
                            if pclass[3:6] == 'h3-' : tag = 'h6'
                            htmlpage += '<' + tag + ' class="' + pclass + '">'
                            htmlpage += self.buildParagraph(pclass, pdesc, 'middle', regtype)
                            htmlpage += '' + tag + '>'
                        else :
                            htmlpage += self.buildParagraph(pclass, pdesc, ptype, regtype)
                    else :
                        print ' a "graphic" region'
                        (pos, simgsrc) = self.findinDoc('img.src',start,end)
                        if simgsrc:
                            htmlpage += '' % int(simgsrc)


        if last_para_continued :
            # trim the trailing close tag so the next page can continue it
            if htmlpage[-4:] == '
':
                htmlpage = htmlpage[0:-4]
            last_para_continued = False

        return htmlpage
720 |
721 |
722 |
def convert2HTML(flatxml, classlst, fileid, bookDir, fixedimage):
    """Convert one page of flattened xml into an html fragment."""
    # all of the work is delegated to a per-page DocParser
    return DocParser(flatxml, classlst, fileid, bookDir, fixedimage).process()
731 |
--------------------------------------------------------------------------------
/topaz/genhtml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Stream proxy that flushes after every write."""
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        # push the data through and force it out immediately
        self.stream.write(data)
        self.stream.flush()
    def __getattr__(self, attr):
        # everything else behaves exactly like the wrapped stream
        return getattr(self.stream, attr)
13 |
14 | import sys
15 | sys.stdout=Unbuffered(sys.stdout)
16 |
17 |
18 | import os, getopt
19 |
20 | # local routines
21 | import convert2xml
22 | import flatxml2html
23 | import decode_meta
24 | import stylexml2css
25 | import getpagedim
26 |
27 | def usage():
28 | print 'Usage: '
29 | print ' '
30 | print ' genhtml.py [--fixed-image] unencryptedBookDir'
31 | print ' '
32 | print ' Options: '
33 | print ' --fixed-image : force translation of fixed regions into svg images '
34 | print ' '
35 |
36 |
37 | def main(argv):
38 | bookDir = ''
39 | fixedimage = False
40 |
41 | if len(argv) == 0:
42 | argv = sys.argv
43 |
44 | try:
45 | opts, args = getopt.getopt(argv[1:], "h:",["fixed-image"])
46 |
47 | except getopt.GetoptError, err:
48 | print str(err)
49 | usage()
50 | sys.exit(1)
51 |
52 | if len(opts) == 0 and len(args) == 0 :
53 | usage()
54 | sys.exit(1)
55 |
56 | for o, a in opts:
57 | if o =="-h":
58 | usage()
59 | sys.exit(0)
60 | if o =="--fixed-image":
61 | fixedimage = True
62 |
63 | bookDir = args[0]
64 |
65 | if not os.path.exists(bookDir) :
66 | print "Can not find directory with unencrypted book"
67 | sys.exit(1)
68 |
69 | dictFile = os.path.join(bookDir,'dict0000.dat')
70 |
71 | if not os.path.exists(dictFile) :
72 | print "Can not find dict0000.dat file"
73 | sys.exit(1)
74 |
75 | pageDir = os.path.join(bookDir,'page')
76 | if not os.path.exists(pageDir) :
77 | print "Can not find page directory in unencrypted book"
78 | sys.exit(1)
79 |
80 | imgDir = os.path.join(bookDir,'img')
81 | if not os.path.exists(imgDir) :
82 | print "Can not find image directory in unencrypted book"
83 | sys.exit(1)
84 |
85 | svgDir = os.path.join(bookDir,'svg')
86 | if not os.path.exists(svgDir) :
87 | print "Can not find svg directory in unencrypted book"
88 | print "please run gensvg.py before running genhtml.py"
89 | sys.exit(1)
90 |
91 | otherFile = os.path.join(bookDir,'other0000.dat')
92 | if not os.path.exists(otherFile) :
93 | print "Can not find other0000.dat in unencrypted book"
94 | sys.exit(1)
95 |
96 | metaFile = os.path.join(bookDir,'metadata0000.dat')
97 | if not os.path.exists(metaFile) :
98 | print "Can not find metadata0000.dat in unencrypted book"
99 | sys.exit(1)
100 |
101 | htmlFileName = "book.html"
102 | htmlstr = '\n'
103 | htmlstr += '\n'
104 |
105 | filenames = os.listdir(pageDir)
106 | filenames = sorted(filenames)
107 |
108 | print 'Processing ... '
109 |
110 | htmlstr += '\n'
111 | htmlstr += '\n'
112 |
113 | # process metadata and retrieve fontSize info
114 | print ' ', 'metadata0000.dat'
115 | fname = os.path.join(bookDir,'metadata0000.dat')
116 | xname = os.path.join(bookDir, 'metadata.txt')
117 | metastr = decode_meta.getMetaData(fname)
118 | file(xname, 'wb').write(metastr)
119 | meta_array = decode_meta.getMetaArray(fname)
120 |
121 | htmlstr += '' + meta_array['Title'] + ' by ' + meta_array['Authors'] + '\n'
122 | htmlstr += '\n'
123 | htmlstr += '\n'
124 |
125 | # get some scaling info from metadata to use while processing styles
126 | fontsize = '135'
127 | if 'fontSize' in meta_array:
128 | fontsize = meta_array['fontSize']
129 |
130 | # also get the size of a normal text page
131 | spage = '1'
132 | if 'firstTextPage' in meta_array:
133 | spage = meta_array['firstTextPage']
134 | pnum = int(spage)
135 |
136 | # get page height and width from first text page for use in stylesheet scaling
137 | pname = 'page%04d.dat' % (pnum + 1)
138 | fname = os.path.join(pageDir,pname)
139 | pargv=[]
140 | pargv.append('convert2xml.py')
141 | pargv.append('--flat-xml')
142 | pargv.append(dictFile)
143 | pargv.append(fname)
144 | flat_xml = convert2xml.main(pargv)
145 | (ph, pw) = getpagedim.getPageDim(flat_xml)
146 | if (ph == '-1') or (ph == '0') : ph = '11000'
147 | if (pw == '-1') or (pw == '0') : pw = '8500'
148 |
149 | # now build up the style sheet
150 | print ' ', 'other0000.dat'
151 | fname = os.path.join(bookDir,'other0000.dat')
152 | xname = os.path.join(bookDir, 'style.css')
153 | pargv=[]
154 | pargv.append('convert2xml.py')
155 | pargv.append('--flat-xml')
156 | pargv.append(dictFile)
157 | pargv.append(fname)
158 | xmlstr = convert2xml.main(pargv)
159 | cssstr , classlst = stylexml2css.convert2CSS(xmlstr, fontsize, ph, pw)
160 | file(xname, 'wb').write(cssstr)
161 | htmlstr += '\n'
162 | htmlstr += '\n\n'
163 |
164 | for filename in filenames:
165 | print ' ', filename
166 | fname = os.path.join(pageDir,filename)
167 | pargv=[]
168 | pargv.append('convert2xml.py')
169 | pargv.append('--flat-xml')
170 | pargv.append(dictFile)
171 | pargv.append(fname)
172 | flat_xml = convert2xml.main(pargv)
173 | htmlstr += flatxml2html.convert2HTML(flat_xml, classlst, fname, bookDir, fixedimage)
174 |
175 | htmlstr += '\n\n'
176 |
177 | file(os.path.join(bookDir, htmlFileName), 'wb').write(htmlstr)
178 | print 'Processing Complete'
179 |
180 | return 0
181 |
182 | if __name__ == '__main__':
183 | sys.exit(main(''))
184 |
185 |
186 |
--------------------------------------------------------------------------------
/topaz/gensvg.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 | # For use with Topaz Scripts Version 2.6
4 |
class Unbuffered:
    """Wrap a stream so every write is flushed immediately."""
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        self.stream.write(data)
        self.stream.flush()   # make output visible right away
    def __getattr__(self, attr):
        # delegate any other attribute to the underlying stream
        return getattr(self.stream, attr)
13 |
14 | import sys
15 | sys.stdout=Unbuffered(sys.stdout)
16 |
17 | import os, getopt
18 |
19 | # local routines
20 | import convert2xml
21 | import decode_meta
22 |
23 |
24 | class GParser(object):
    def __init__(self, flatxml):
        """Collect glyph geometry tables from a flattened glyph xml doc.

        The lists are indexed by glyph number; gvtx/glen get an extra
        sentinel entry appended so slices [g] : [g+1] also cover the
        final glyph.
        """
        self.flatdoc = flatxml.split('\n')
        # output resolution; per-glyph coordinates are rescaled to this
        self.dpi = 1440
        self.gh = self.getData('info.glyph.h')
        self.gw = self.getData('info.glyph.w')
        self.guse = self.getData('info.glyph.use')
        if self.guse :
            self.count = len(self.guse)
        else :
            self.count = 0
        self.gvtx = self.getData('info.glyph.vtx')
        self.glen = self.getData('info.glyph.len')
        self.gdpi = self.getData('info.glyph.dpi')
        self.vx = self.getData('info.vtx.x')
        self.vy = self.getData('info.vtx.y')
        self.vlen = self.getData('info.len.n')
        # sentinel end offsets (see docstring)
        if self.vlen :
            self.glen.append(len(self.vlen))
        elif self.glen:
            self.glen.append(0)
        if self.vx :
            self.gvtx.append(len(self.vx))
        elif self.gvtx :
            self.gvtx.append(0)
49 |
50 | def getData(self, path):
51 | result = None
52 | cnt = len(self.flatdoc)
53 | for j in xrange(cnt):
54 | item = self.flatdoc[j]
55 | if item.find('=') >= 0:
56 | (name, argt) = item.split('=')
57 | argres = argt.split('|')
58 | else:
59 | name = item
60 | argres = []
61 | if (name == path):
62 | result = argres
63 | break
64 | if (len(argres) > 0) :
65 | for j in xrange(0,len(argres)):
66 | argres[j] = int(argres[j])
67 | return result
68 |
69 |
70 | def getGlyphDim(self, gly):
71 | maxh = (self.gh[gly] * self.dpi) / self.gdpi[gly]
72 | maxw = (self.gw[gly] * self.dpi) / self.gdpi[gly]
73 | return maxh, maxw
74 |
75 |
76 | def getPath(self, gly):
77 | path = ''
78 | if (gly < 0) or (gly >= self.count):
79 | return path
80 | tx = self.vx[self.gvtx[gly]:self.gvtx[gly+1]]
81 | ty = self.vy[self.gvtx[gly]:self.gvtx[gly+1]]
82 | p = 0
83 | for k in xrange(self.glen[gly], self.glen[gly+1]):
84 | if (p == 0):
85 | zx = tx[0:self.vlen[k]+1]
86 | zy = ty[0:self.vlen[k]+1]
87 | else:
88 | zx = tx[self.vlen[k-1]+1:self.vlen[k]+1]
89 | zy = ty[self.vlen[k-1]+1:self.vlen[k]+1]
90 | p += 1
91 | j = 0
92 | while ( j < len(zx) ):
93 | if (j == 0):
94 | # Start Position.
95 | path += 'M %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly])
96 | elif (j <= len(zx)-3):
97 | # Cubic Bezier Curve
98 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[j+2] * self.dpi / self.gdpi[gly], zy[j+2] * self.dpi / self.gdpi[gly])
99 | j += 2
100 | elif (j == len(zx)-2):
101 | # Cubic Bezier Curve to Start Position
102 | path += 'C %d %d %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[j+1] * self.dpi / self.gdpi[gly], zy[j+1] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
103 | j += 1
104 | elif (j == len(zx)-1):
105 | # Quadratic Bezier Curve to Start Position
106 | path += 'Q %d %d %d %d ' % (zx[j] * self.dpi / self.gdpi[gly], zy[j] * self.dpi / self.gdpi[gly], zx[0] * self.dpi / self.gdpi[gly], zy[0] * self.dpi / self.gdpi[gly])
107 |
108 | j += 1
109 | path += 'z'
110 | return path
111 |
class PParser(object):
    """Parser for the flattened page XML of a Topaz book: page size,
    image placements, and glyph placements.

    Entries are matched by name *suffix* (endswith), unlike GParser's
    exact-name lookup.
    """

    def __init__(self, flatxml):
        self.flatdoc = flatxml.split('\n')
        self.temp = []
        # Page dimensions; some books store them under book.h/book.w.
        foo = self.getData('page.h') or self.getData('book.h')
        self.ph = foo[0]
        foo = self.getData('page.w') or self.getData('book.w')
        self.pw = foo[0]
        self.gx = self.getData('info.glyph.x')        # glyph x positions
        self.gy = self.getData('info.glyph.y')        # glyph y positions
        self.gid = self.getData('info.glyph.glyphID') # glyph ids, in draw order

    def getData(self, path):
        """Return the integer argument list of the first entry whose name
        ends with *path*, or None when no entry matches."""
        for item in self.flatdoc:
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                if argres:
                    return [int(v) for v in argres]
                return argres
        return None

    def getDataTemp(self, path):
        """Like getData, but searches self.temp and pops the matched entry
        so repeated calls consume successive records."""
        for j in range(len(self.temp)):
            item = self.temp[j]
            if item.find('=') >= 0:
                (name, argt) = item.split('=')
                argres = argt.split('|')
            else:
                name = item
                argres = []
            if name.endswith(path):
                self.temp.pop(j)
                if argres:
                    return [int(v) for v in argres]
                return argres
        return None

    def getImages(self):
        """Return SVG <image> elements (one string per img record on the
        page), consuming a scratch copy of the flat document."""
        result = []
        # Work on a copy: getDataTemp pops entries, and the original code
        # aliased flatdoc here, destroying the parsed document as a side
        # effect of rendering the images.
        self.temp = self.flatdoc[:]
        while self.getDataTemp('img') != None:
            h = self.getDataTemp('img.h')[0]
            w = self.getDataTemp('img.w')[0]
            x = self.getDataTemp('img.x')[0]
            y = self.getDataTemp('img.y')[0]
            src = self.getDataTemp('img.src')[0]
            # NOTE(review): the format string was corrupted in this copy of
            # the file ("'\n' % (src, x, y, w, h)" raises TypeError); this
            # is the conventional Topaz-scripts <image> element referencing
            # the extracted img directory -- confirm against an upstream copy.
            result.append('<image xlink:href="../img/img%04d.jpg" x="%d" y="%d" width="%d" height="%d" />\n' % (src, x, y, w, h))
        return result

    def getGlyphs(self, glyfname):
        """Return the lines of the shared glyph file *glyfname* whose
        id="gl<N>" marker matches a glyph id used on this page.

        Assumes the glyph file lists glyphs in ascending id order, as
        written by the glyph-generation pass.
        """
        result = []
        if (self.gid != None) and (len(self.gid) > 0):
            glyphs = sorted(set(self.gid))   # distinct ids, ascending
            gfile = open(glyfname, 'r')
            j = 0
            while True:
                inp = gfile.readline()
                if inp == '':
                    break
                marker = 'id="gl%d"' % glyphs[j]
                if inp.find(marker) > 0:
                    result.append(inp)
                    j += 1
                    if j == len(glyphs):
                        break
            gfile.close()
        return result
196 |
197 |
198 |
199 |
def usage():
    """Print command-line help for gensvg.py to stdout."""
    # Function-call form: each print has a single argument, so the output
    # is identical under Python 2 while also running under Python 3.
    print('Usage: ')
    print(' ')
    print(' gensvg.py [options] unencryptedBookDir')
    print(' ')
    print(' -x : output browseable XHTML+SVG pages (default)')
    print(' -r : output raw SVG images')
208 |
209 | def main(argv):
210 | bookDir = ''
211 |
212 | if len(argv) == 0:
213 | argv = sys.argv
214 |
215 | try:
216 | opts, args = getopt.getopt(argv[1:], "xrh")
217 |
218 | except getopt.GetoptError, err:
219 | print str(err)
220 | usage()
221 | sys.exit(1)
222 |
223 | if len(opts) == 0 and len(args) == 0 :
224 | usage()
225 | sys.exit(1)
226 |
227 | raw = 0
228 | for o, a in opts:
229 | if o =="-h":
230 | usage()
231 | sys.exit(0)
232 | if o =="-x":
233 | raw = 0
234 | if o =="-r":
235 | raw = 1
236 |
237 | bookDir = args[0]
238 |
239 | if not os.path.exists(bookDir) :
240 | print "Can not find directory with unencrypted book"
241 | sys.exit(1)
242 |
243 | dictFile = os.path.join(bookDir,'dict0000.dat')
244 |
245 | if not os.path.exists(dictFile) :
246 | print "Can not find dict0000.dat file"
247 | sys.exit(1)
248 |
249 | pageDir = os.path.join(bookDir,'page')
250 | if not os.path.exists(pageDir) :
251 | print "Can not find page directory in unencrypted book"
252 | sys.exit(1)
253 |
254 | imgDir = os.path.join(bookDir,'img')
255 | if not os.path.exists(imgDir) :
256 | print "Can not find image directory in unencrypted book"
257 | sys.exit(1)
258 |
259 | glyphsDir = os.path.join(bookDir,'glyphs')
260 | if not os.path.exists(glyphsDir) :
261 | print "Can not find glyphs directory in unencrypted book"
262 | sys.exit(1)
263 |
264 | metaFile = os.path.join(bookDir,'metadata0000.dat')
265 | if not os.path.exists(metaFile) :
266 | print "Can not find metadata0000.dat in unencrypted book"
267 | sys.exit(1)
268 |
269 | svgDir = os.path.join(bookDir,'svg')
270 | if not os.path.exists(svgDir) :
271 | os.makedirs(svgDir)
272 |
273 |
274 | print 'Processing Meta Data ... '
275 |
276 | print ' ', 'metadata0000.dat'
277 | fname = os.path.join(bookDir,'metadata0000.dat')
278 | metadata = decode_meta.getMetaArray(fname)
279 |
280 | print 'Processing Glyphs ... '
281 |
282 | filenames = os.listdir(glyphsDir)
283 | filenames = sorted(filenames)
284 |
285 | glyfname = os.path.join(svgDir,'glyphs.svg')
286 | glyfile = open(glyfname, 'w')
287 | glyfile.write('\n')
288 | glyfile.write('\n')
289 | glyfile.write('\n')
311 | glyfile.close()
312 |
313 | print 'Processing Pages ... '
314 |
315 | # Books are at 1440 DPI. This is rendering at twice that size for
316 | # readability when rendering to the screen.
317 | scaledpi = 1440
318 | filenames = os.listdir(pageDir)
319 | filenames = sorted(filenames)
320 | counter = 0
321 | for filename in filenames:
322 | print ' ', filename
323 | fname = os.path.join(pageDir,filename)
324 | pargv=[]
325 | pargv.append('convert2xml.py')
326 | pargv.append('--flat-xml')
327 | pargv.append(dictFile)
328 | pargv.append(fname)
329 | flat_xml = convert2xml.main(pargv)
330 | pp = PParser(flat_xml)
331 | if (raw) :
332 | pfile = open(os.path.join(svgDir,filename.replace('.dat','.svg')), 'w')
333 | else :
334 | pfile = open(os.path.join(svgDir,'page%04d.xhtml' % counter), 'w')
335 |
336 | pfile.write('\n')
337 | if (raw):
338 | pfile.write('\n')
339 | pfile.write('