├── .gitattributes
├── .gitignore
├── COPYING.txt
├── DumpMobiHeader_v023.py
├── KindleUnpack.pyw
├── KindleUnpack_ReadMe.htm
├── README.md
├── lib
├── __init__.py
├── compatibility_utils.py
├── imghdr.py
├── kindleunpack.py
├── mobi_cover.py
├── mobi_dict.py
├── mobi_header.py
├── mobi_html.py
├── mobi_index.py
├── mobi_k8proc.py
├── mobi_k8resc.py
├── mobi_nav.py
├── mobi_ncx.py
├── mobi_opf.py
├── mobi_pagemap.py
├── mobi_sectioner.py
├── mobi_split.py
├── mobi_uncompress.py
├── mobi_utils.py
├── mobiml2xhtml.py
├── unipath.py
└── unpack_structure.py
└── libgui
├── __init__.py
├── askfolder_ed.py
├── prefs.py
└── scrolltextwidget.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.txt text
2 | *.py text
3 | *.cfg text
4 |
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | *.DS_Store
6 |
7 | # Vim temp/swap files
8 | *~
9 | *.orig
10 | *.keep
11 | *.swp
12 | *.swo
13 |
14 | # PyInstaller
15 | # Usually these files are written by a python script from a template
16 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
17 | *.manifest
18 | *.spec
19 |
20 |
21 | # Translations
22 | *.mo
23 | *.pot
24 |
25 | *.log
26 |
27 | # PyBuilder
28 | target/
29 |
30 | # Files/folders used/produced when testing Kindleunpack
31 | HDImages/
32 | mobi7/
33 | mobi8/
34 |
35 | *.bak
36 | *.dat
37 | *.data
38 | *.pdf
39 | *.ini
40 | *.json
41 | *.mobi
42 | *.prc
43 | *.azw
44 | *.azw[34]
45 |
46 | # Folder to direct output to when testing command-line (will be ignored by git)
47 | testout/
48 |
--------------------------------------------------------------------------------
/KindleUnpack.pyw:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import sys
8 |
9 | from lib.compatibility_utils import PY2, text_type, unicode_str
10 | from lib.compatibility_utils import unicode_argv, add_cp65001_codec
11 |
12 | import lib.unipath as unipath
13 | from lib.unipath import pathof
14 |
15 | import os
16 | import traceback
17 |
18 | import codecs
19 | add_cp65001_codec()
20 |
21 | try:
22 | from queue import Full
23 | from queue import Empty
24 | except ImportError:
25 | from Queue import Full
26 | from Queue import Empty
27 |
28 | if PY2 and sys.platform.startswith("win"):
29 | from libgui.askfolder_ed import AskFolder
30 |
31 | from multiprocessing import Process, Queue
32 |
33 | if PY2:
34 | import Tkinter as tkinter
35 | import Tkconstants as tkinter_constants
36 | import tkFileDialog as tkinter_filedialog
37 | import ttk as tkinter_ttk
38 | else:
39 | import tkinter
40 | import tkinter.constants as tkinter_constants
41 | import tkinter.filedialog as tkinter_filedialog
42 | import tkinter.ttk as tkinter_ttk
43 |
44 | from libgui.scrolltextwidget import ScrolledText
45 |
46 | import lib.kindleunpack as kindleunpack
47 |
48 | # Set to false to NOT save preferences to an ini file.
49 | # Starting directories for file dialogs will still persist
50 | # for the current KindleUnpack session.
51 | #
52 | # Need to delete the ini file after setting to false, of course.
53 | PERSISTENT_PREFS = True
54 |
55 | from inspect import getfile, currentframe
56 | from libgui.prefs import getprefs, saveprefs
57 |
58 | # Probably overkill, but to ensure cross-platform success no matter how the script is called/run...
59 | SCRIPT_NAME = unicode_str(getfile(currentframe()))
60 | SCRIPT_DIR = unicode_str(os.path.dirname(unipath.abspath(getfile(currentframe()))))
61 | PROGNAME = unicode_str(os.path.splitext(SCRIPT_NAME)[0])
62 |
63 | # Include platform in the ini file name. That way, settings can still persist
64 | # in the event that different OSs access the same script via a network share/flash-drive.
65 | CONFIGFILE = unicode_str(os.path.join(SCRIPT_DIR, '{0}_{1}.json'.format(PROGNAME, sys.platform[:3])))
66 |
# Wrap a stream so that output gets appended to shared queue
# using utf-8 encoding
class QueuedStream:
    """File-like wrapper that mirrors writes onto a shared queue as utf-8 bytes.

    Used to replace a child process's stdout/stderr so its output can be
    collected by the gui process (see unpackEbook / MainDialog.processQueue).
    """

    def __init__(self, stream, q):
        """Wrap *stream*; every write() is pushed onto queue *q* as utf-8 bytes."""
        self.stream = stream
        self.encoding = stream.encoding
        self.q = q
        # detached/redirected streams may report no encoding at all;
        # fall back to utf-8 in that case
        # (fix: use identity comparison with None, not ==)
        if self.encoding is None:
            self.encoding = 'utf-8'

    def write(self, data):
        """Normalize *data* to utf-8 bytes and append it to the queue."""
        if isinstance(data, text_type):
            data = data.encode('utf-8')
        elif self.encoding not in ('utf-8', 'UTF-8', 'cp65001', 'CP65001'):
            # bytes in some other known encoding: transcode to utf-8
            udata = data.decode(self.encoding)
            data = udata.encode('utf-8')
        self.q.put(data)

    def __getattr__(self, attr):
        # present a binary, utf-8 persona regardless of the wrapped stream;
        # anything else is delegated to the underlying stream
        if attr == 'mode':
            return 'wb'
        if attr == 'encoding':
            return 'utf-8'
        return getattr(self.stream, attr)
89 |
90 |
class MainDialog(tkinter.Frame):
    """Main application frame.

    Gathers the input ebook, output folder and unpack options, then runs
    the actual unpack in a child process (unpackEbook), polling a shared
    queue on a Tk "after" timer to relay the child's output into the log.
    """

    def __init__(self, root):
        """Build all widgets, restore saved preferences, size/position the window."""
        tkinter.Frame.__init__(self, root, border=5)
        self.root = root
        # milliseconds between polls of the child-process output queue
        self.interval = 50
        # handle of the spawned unpack process; None whenever idle
        self.p2 = None
        # queue shared with the child process for its stdout/stderr
        self.q = Queue()
        # To keep things simple for possible future preference additions/deletions:
        # Try to stick to - TK Widget name = prefs dictionary key = ini.get|set name.
        # EX: mobipath = prefs['mobipath'] = config.get('Defaults', mobipath).
        self.prefs = getprefs(CONFIGFILE, self.root, PERSISTENT_PREFS)

        self.status = tkinter.StringVar()
        tkinter.Label(self, textvariable=self.status, justify='center').grid(row=0, columnspan=3, sticky=tkinter_constants.N)
        # NOTE(review): 'Upack' appears to be a typo for 'Unpack' in this
        # user-facing message (left untouched in this documentation pass).
        self.status.set('Upack a non-DRM Kindle eBook')
        sticky = tkinter_constants.E + tkinter_constants.W
        ALL = tkinter_constants.E+tkinter_constants.W+tkinter_constants.N+tkinter_constants.S
        # Set to the column the textentry boxes are in.
        self.grid_columnconfigure(1, weight=1)
        # Set to the row the debug log widget is in.
        self.grid_rowconfigure(10, weight=1)

        tkinter.Label(self, text='').grid(row=1, sticky=tkinter_constants.E)
        tkinter.Label(self, text='Unencrypted Kindle eBook input file', wraplength=200).grid(row=2, sticky=tkinter_constants.E)
        self.mobipath = tkinter.Entry(self, width=50)
        self.mobipath.grid(row=2, column=1, sticky=sticky)
        self.mobipath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_mobipath)
        button.grid(row=2, column=2, sticky=sticky)

        tkinter.Label(self, text='Output Directory', wraplength=200).grid(row=3, sticky=tkinter_constants.E)
        self.outpath = tkinter.Entry(self, width=50)
        self.outpath.grid(row=3, column=1, sticky=sticky)
        # pre-fill the output folder from saved prefs when persistence is on
        if self.prefs['outpath'] and PERSISTENT_PREFS and unipath.exists(CONFIGFILE):
            outpath = pathof(os.path.normpath(self.prefs['outpath']))
            self.outpath.insert(0, outpath)
        else:
            self.outpath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_outpath)
        button.grid(row=3, column=2, sticky=sticky)

        tkinter.Label(self, text='OPTIONAL: APNX file Associated with AZW3', wraplength=200).grid(row=4, sticky=tkinter_constants.E)
        self.apnxpath = tkinter.Entry(self, width=50)
        self.apnxpath.grid(row=4, column=1, sticky=sticky)
        self.apnxpath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_apnxpath)
        button.grid(row=4, column=2, sticky=sticky)

        # option checkboxes; each restores its saved state from prefs
        self.splitvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Split Combination Kindlegen eBooks", variable=self.splitvar)
        if self.prefs['splitvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=5, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.rawvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Write Raw Data", variable=self.rawvar)
        if self.prefs['rawvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=6, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.dbgvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Dump Mode", variable=self.dbgvar)
        if self.prefs['dbgvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=7, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.hdvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Use HD Images If Present", variable=self.hdvar)
        if self.prefs['hdvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=8, column=1, columnspan=2, sticky=tkinter_constants.W)

        tkinter.Label(self, text='ePub Output Type:').grid(row=9, sticky=tkinter_constants.E)
        self.epubver_val = tkinter.StringVar()
        self.epubver = tkinter_ttk.Combobox(self, textvariable=self.epubver_val, state='readonly')
        self.epubver['values'] = ('ePub 2', 'ePub 3', 'Auto-detect', 'Force ePub 2')
        self.epubver.current(0)
        if self.prefs['epubver'] and PERSISTENT_PREFS:
            self.epubver.current(self.prefs['epubver'])
        self.epubver.grid(row=9, column=1, columnspan=2, pady=(3,5), sticky=tkinter_constants.W)

        msg1 = 'Conversion Log \n\n'
        self.stext = ScrolledText(self, bd=5, relief=tkinter_constants.RIDGE, wrap=tkinter_constants.WORD)
        self.stext.grid(row=10, column=0, columnspan=3, sticky=ALL)
        self.stext.insert(tkinter_constants.END,msg1)

        # NOTE(review): 'sbotton' looks like a typo for 'sbutton', but it is
        # used consistently across this class, so renaming is a wider change.
        self.sbotton = tkinter.Button(
            self, text="Start", width=10, command=self.convertit)
        self.sbotton.grid(row=11, column=1, sticky=tkinter_constants.S+tkinter_constants.E)
        self.qbutton = tkinter.Button(
            self, text="Quit", width=10, command=self.quitting)
        self.qbutton.grid(row=11, column=2, sticky=tkinter_constants.S+tkinter_constants.W)
        # restore last saved window geometry, or center a default-size window
        if self.prefs['windowgeometry'] and PERSISTENT_PREFS:
            self.root.geometry(self.prefs['windowgeometry'])
        else:
            self.root.update_idletasks()
            w = self.root.winfo_screenwidth()
            h = self.root.winfo_screenheight()
            rootsize = (605, 575)
            x = w//2 - rootsize[0]//2
            y = h//2 - rootsize[1]//2
            self.root.geometry('%dx%d+%d+%d' % (rootsize + (x, y)))
        # route the window-manager close button through the same cleanup path
        self.root.protocol('WM_DELETE_WINDOW', self.quitting)

    # read queue shared between this main process and spawned child processes
    def readQueueUntilEmpty(self):
        """Drain the shared queue and return everything read as one unicode string."""
        done = False
        text = ''
        while not done:
            try:
                data = self.q.get_nowait()
                text += unicode_str(data, 'utf-8')
            except Empty:
                done = True
                pass
        return text

    # read from subprocess pipe without blocking
    # invoked every interval via the widget "after"
    # option being used, so need to reset it for the next time
    def processQueue(self):
        """Poll the child process: relay queued output; finish up once it exits."""
        # exitcode is None while the child is still running
        # NOTE(review): `poll != None` would read better as `poll is not None`
        poll = self.p2.exitcode
        if poll != None:
            # child has exited: flush any remaining output and report outcome
            text = self.readQueueUntilEmpty()
            msg = text + '\n\n' + 'eBook successfully unpacked\n'
            if poll != 0:
                msg = text + '\n\n' + 'Error: Unpacking Failed\n'
            self.p2.join()
            self.showCmdOutput(msg)
            self.p2 = None
            # re-enable the Start button for the next run
            self.sbotton.configure(state='normal')
            return
        text = self.readQueueUntilEmpty()
        self.showCmdOutput(text)
        # make sure we get invoked again by event loop after interval
        self.stext.after(self.interval,self.processQueue)
        return

    # post output from subprocess in scrolled text widget
    def showCmdOutput(self, msg):
        """Append *msg* to the log widget and scroll it into view."""
        if msg and msg !='':
            # normalize Windows line endings before display
            if sys.platform.startswith('win'):
                msg = msg.replace('\r\n','\n')
            self.stext.insert(tkinter_constants.END,msg)
            self.stext.yview_pickplace(tkinter_constants.END)
        return

    def get_mobipath(self):
        """File-open dialog for the input ebook; remembers its folder in prefs."""
        cwd = unipath.getcwd()
        mobipath = tkinter_filedialog.askopenfilename(
            parent=None, title='Select Unencrypted Kindle eBook File',
            initialdir=self.prefs['mobipath'] or cwd,
            initialfile=None,
            defaultextension=('.mobi', '.prc', '.azw', '.azw4', '.azw3'),
            filetypes=[('All Kindle formats', ('.mobi', '.prc', '.azw', '.azw4', '.azw3')),
                       ('Kindle Mobi eBook File', '.mobi'), ('Kindle PRC eBook File', '.prc'),
                       ('Kindle AZW eBook File', '.azw'), ('Kindle AZW4 Print Replica', '.azw4'),
                       ('Kindle Version 8', '.azw3'),('All Files', '.*')])
        if mobipath:
            self.prefs['mobipath'] = pathof(os.path.dirname(mobipath))
            mobipath = pathof(os.path.normpath(mobipath))
            self.mobipath.delete(0, tkinter_constants.END)
            self.mobipath.insert(0, mobipath)
        return

    def get_apnxpath(self):
        """File-open dialog for an optional .apnx page-map file; remembers its folder."""
        cwd = unipath.getcwd()
        apnxpath = tkinter_filedialog.askopenfilename(
            parent=None, title='Optional APNX file associated with AZW3',
            initialdir=self.prefs['apnxpath'] or cwd,
            initialfile=None,
            defaultextension='.apnx', filetypes=[('Kindle APNX Page Information File', '.apnx'), ('All Files', '.*')])
        if apnxpath:
            self.prefs['apnxpath'] = pathof(os.path.dirname(apnxpath))
            apnxpath = pathof(os.path.normpath(apnxpath))
            self.apnxpath.delete(0, tkinter_constants.END)
            self.apnxpath.insert(0, apnxpath)
        return

    def get_outpath(self):
        """Folder-chooser for the output directory (unicode-safe on windows/py2)."""
        cwd = unipath.getcwd()
        if sys.platform.startswith("win") and PY2:
            # tk_chooseDirectory is horribly broken for unicode paths
            # on windows - bug has been reported but not fixed for years
            # workaround by using our own unicode aware version
            outpath = AskFolder(message="Folder to Store Output into",
                                defaultLocation=self.prefs['outpath'] or unipath.getcwd())
        else:
            outpath = tkinter_filedialog.askdirectory(
                parent=None, title='Folder to Store Output into',
                initialdir=self.prefs['outpath'] or cwd, initialfile=None)
        if outpath:
            self.prefs['outpath'] = outpath
            outpath = pathof(os.path.normpath(outpath))
            self.outpath.delete(0, tkinter_constants.END)
            self.outpath.insert(0, outpath)
        return

    def quitting(self):
        """Shut down: terminate a running child, save prefs, destroy the window."""
        # kill any still running subprocess
        if self.p2 != None:
            if (self.p2.exitcode == None):
                self.p2.terminate()
        if PERSISTENT_PREFS:
            if not saveprefs(CONFIGFILE, self.prefs, self):
                print("Couldn't save INI file.")
        self.root.destroy()
        self.quit()

    # run in a child process and collect its output
    def convertit(self):
        """Validate the form, then launch unpackEbook in a child process."""
        # now disable the button to prevent multiple launches
        self.sbotton.configure(state='disabled')
        mobipath = unicode_str(self.mobipath.get())
        apnxpath = unicode_str(self.apnxpath.get())
        outdir = unicode_str(self.outpath.get())
        if not mobipath or not unipath.exists(mobipath):
            self.status.set('Specified eBook file does not exist')
            self.sbotton.configure(state='normal')
            return
        apnxfile = None
        if apnxpath != "" and unipath.exists(apnxpath):
            apnxfile = apnxpath
        if not outdir:
            self.status.set('No output directory specified')
            self.sbotton.configure(state='normal')
            return
        q = self.q
        # build a human-readable echo of the chosen settings for the log
        log = 'Input Path = "'+ mobipath + '"\n'
        log += 'Output Path = "' + outdir + '"\n'
        if apnxfile != None:
            log += 'APNX Path = "' + apnxfile + '"\n'
        dump = False
        writeraw = False
        splitcombos = False
        use_hd = False
        if self.dbgvar.get() == 1:
            dump = True
            log += 'Debug = True\n'
        if self.rawvar.get() == 1:
            writeraw = True
            log += 'WriteRawML = True\n'
        if self.splitvar.get() == 1:
            splitcombos = True
            log += 'Split Combo KF8 Kindle eBooks = True\n'
        # map the combobox index onto the single-character epub version code
        # that kindleunpack.unpackBook expects ('2', '3', 'A'uto, 'F'orce-2)
        if self.epubver.current() == 0:
            epubversion = '2'
        elif self.epubver.current() == 1:
            epubversion = '3'
        elif self.epubver.current() == 2:
            epubversion = 'A'
        else:
            epubversion = 'F'
        log += 'Epub Output Type Set To: {0}\n'.format(self.epubver_val.get())
        if self.hdvar.get():
            use_hd = True
            # stub for processing the Use HD Images setting
            log += 'Use HD Images If Present = True\n'
        log += '\n\n'
        log += 'Please Wait ...\n\n'
        self.stext.insert(tkinter_constants.END,log)
        self.p2 = Process(target=unpackEbook, args=(q, mobipath, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos))
        self.p2.start()

        # python does not seem to allow you to create
        # your own eventloop which every other gui does - strange
        # so need to use the widget "after" command to force
        # event loop to run non-gui events every interval
        self.stext.after(self.interval,self.processQueue)
        return
362 |
363 |
# child process / multiprocessing thread starts here
def unpackEbook(q, infile, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos):
    """Entry point of the unpack child process.

    Redirects this process's stdout/stderr onto the shared queue *q* so the
    gui can display progress, runs the actual unpack, and exits with status
    0 on success or 1 on any failure.
    """
    # mirror all output onto the queue shared with the gui process
    sys.stdout = QueuedStream(sys.stdout, q)
    sys.stderr = QueuedStream(sys.stderr, q)
    try:
        kindleunpack.unpackBook(infile, outdir, apnxfile, epubversion, use_hd,
                                dodump=dump, dowriteraw=writeraw, dosplitcombos=splitcombos)
    except Exception as e:
        # report the failure through the (redirected) streams, then signal it
        # via a non-zero exit code that the gui polls for
        print("Error: %s" % e)
        print(traceback.format_exc())
        sys.exit(1)
    sys.exit(0)
376 |
377 |
def main(argv=None):
    """Launch the gui and run the Tk event loop.

    *argv* is accepted for call-site compatibility but is not otherwise used
    here; it defaults lazily to unicode_argv().  Returns 0 so the result can
    be passed straight to sys.exit().
    """
    # fix: the old default `argv=unicode_argv()` was evaluated once at import
    # time; resolve it lazily at call time instead
    if argv is None:
        argv = unicode_argv()
    root = tkinter.Tk()
    root.title('Kindle eBook Unpack Tool')
    root.minsize(440, 350)
    root.resizable(True, True)
    MainDialog(root).pack(fill=tkinter_constants.BOTH, expand=tkinter_constants.YES)
    root.mainloop()
    return 0

if __name__ == "__main__":
    sys.exit(main())
389 |
--------------------------------------------------------------------------------
/KindleUnpack_ReadMe.htm:
--------------------------------------------------------------------------------
1 | T
2 |
3 | KindleUnpack ReadMe
4 |
5 |
6 | KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts which change depending on the type of Kindle/Mobi ebook being processed.
7 |
8 |
9 | MobiPocket and early Kindle version 7 or less ebooks are unpacked to the original html 3.2 and images folder that can then be edited and reprocessed by MobiPocketCreator.
10 | Kindle Print Replica ebooks are unpacked to the original PDF and any associated images.
11 | Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or may not be a fully valid epub depending on whether a fully valid epub was originally provided to kindlegen as input.
NOTE: The generated epub should be validated using an epub validator and should changes be needed, it should load properly into Sigil and Calibre either of which can be used to edit the result to create a fully valid epub.
12 | Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into two different parts: the first being the older MobiPocket format ebook parts (see #1 above) and the second being an epub-like structure that can be edited using Sigil (see #3 above).
13 |
14 |
15 | The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly.
16 | On Windows machines we strongly recommend you install the free version of ActiveState's Active Python 2.7.3 or later 2.7.X version as it properly installs all of the required parts including the tk widget kit and updates the system path on Windows machines. The official installer from python.org does not properly handle this for Windows machines.
17 | On Mac OS X 10.6.X and later and almost all recent Linux versions the required version of Python is already installed as part of the official OS installation so Mac OS X and Linux users need install nothing extra.
18 |
19 | To install KindleUnpack, simply find a nice location on your machine and fully unzip it. Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you have a proper Python 2.7 or later installation on your machine, you should be able to simply double-click the KindleUnpack.pyw icon and the gui interface should start.
20 |
21 | If you would prefer a command-line interface, simply look inside KindleUnpack's "lib" folder for the KindleUnpack.py python program and its support modules. You should then be able to run KindleUnpack.py by the following command:
22 |
23 |
24 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER
25 |
26 |
27 | where you replace:
28 |
29 |
30 | INPUT_FILE - path to the desired Kindle/MobiPocket ebook
31 |
32 | OUTPUT_FOLDER - path to folder where the ebook will be unpacked
33 |
34 | Options:
35 | -h print this help message
36 | -i use HDImages to overwrite lower resolution versions, if present
37 | -s split combination mobis into older mobi and mobi KF8 ebooks
38 | -p APNX_FILE path to a .apnx file that contains real page numbers associated with an azw3 ebook (optional)
39 | Note: many apnx files have arbitrarily assigned page offsets that will confuse KindleUnpack if used
40 | --epub_version= specify epub version to unpack to: 2, 3 or A (for automatic) or
41 | F for Force to epub2, default is 2
42 | -r write raw data to the output folder
43 | -d dump headers and other debug info to output and extra files
44 |
45 |
46 |
47 | Please report any bugs or comments/requests in our sticky forum on the Mobileread website. It can be found at http://www.mobileread.com/forums. Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack).
48 |
49 |
50 | License Information
51 |
52 | KindleUnpack
53 | Based on initial mobipocket version Copyright © 2009 Charles M. Hannum
54 | Extensive Extensions and Improvements Copyright © 2009-2014
55 | By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.
56 | This program is free software: you can redistribute it and/or modify
57 | it under the terms of the GNU General Public License as published by
58 | the Free Software Foundation, version 3.
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | KindleUnpack
2 | ============
3 |
4 | python based software to unpack Amazon / Kindlegen generated ebooks
5 |
6 | KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts
7 | which change depending on the type of Kindle/Mobi ebook being processed
8 |
9 | - MobiPocket and early Kindle version 7 or less ebooks are unpacked to the
10 | original html 3.2 and images folder that can then be edited and reprocessed by
11 | MobiPocketCreator.
12 |
13 | - Kindle Print Replica ebook are unpacked to the original PDF and any associated images.
14 |
15 | - Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or
16 | may not be a fully valid epub depending on if a fully valid epub was
17 | originally provided to kindlegen as input. NOTE: The generated epub should be
18 | validated using an epub validator and should changes be needed, it should load
19 | properly into Sigil and Calibre either of which can be used to edit the result
20 | to create a fully valid epub.
21 |
22 | - Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into
23 | two different parts: the first being the older MobiPocket format ebook parts
24 | and the second being an epub-like structure that can be edited using Sigil
25 |
26 | The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly.
27 |
28 | On Windows machines we strongly recommend you install the free version of ActiveState's
29 | Active Python 2.7.X or 3.4.X or later as it properly installs all of the required parts
30 | including the tk widget kit and updates the system path on Windows machines. The official
31 | installer from python.org sometimes does not properly handle this for Windows machines.
32 |
33 | On Mac OS X 10.6.X and later and almost all recent Linux versions, the required version
34 | of Python is already installed as part of the official OS installation so Mac OS X and
35 | Linux users need install nothing extra.
36 |
37 | To install KindleUnpack, simply find a nice location on your machine and fully unzip it.
38 | Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you
39 | have a proper Python 2.7 or later installation on your machine, you should be able to
40 | simply double-click the KindleUnpack.pyw icon and the gui interface should start
41 |
42 | If you would prefer a command-line interface, simply look inside KindleUnpack's "lib"
43 | folder for the KindleUnpack.py python program and its support modules. You should
44 | then be able to run KindleUnpack.py by the following command:
45 |
46 | ```sh
47 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER
48 | ```
49 |
50 | where you replace:
51 |
52 | `INPUT_FILE` - path to the desired Kindle/MobiPocket ebook
53 |
54 | `OUTPUT_FOLDER` - path to folder where the ebook will be unpacked
55 |
56 | ### Options
57 |
58 | `-h` print this help message
59 |
60 | `-i` use HDImages to overwrite lower resolution versions, if present
61 |
62 | `-s` split combination mobis into older mobi and mobi KF8 ebooks
63 |
64 | `-p APNX_FILE` path to a .apnx file that contains real page numbers associated
65 | with an azw3 ebook (optional). Note: many apnx files have
66 | arbitrarily assigned page offsets that will confuse KindleUnpack
67 | if used
68 |
69 | `--epub_version=` specify EPUB version to unpack to: 2, 3 or A (for automatic) or
70 | F for Force to EPUB2, default is 2
71 |
72 | `-r` write raw data to the output folder
73 |
74 | `-d` dump headers and other debug info to output and extra files
75 |
76 | Please report any bugs or comments/requests in our sticky forum on the Mobileread website.
77 | It can be found at http://www.mobileread.com/forums.
78 |
79 | Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack).
80 |
81 | License Information
82 |
83 | KindleUnpack
84 | Based on initial mobipocket version Copyright © 2009 Charles M. Hannum
85 | Extensive Extensions and Improvements Copyright © 2009-2014
86 | By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.
87 | This program is free software: you can redistribute it and/or modify
88 | it under the terms of the GNU General Public License as published by
89 | the Free Software Foundation, version 3.
90 |
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3 |
--------------------------------------------------------------------------------
/lib/compatibility_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
6 | # All rights reserved.
7 | #
8 | # Redistribution and use in source and binary forms, with or without modification,
9 | # are permitted provided that the following conditions are met:
10 | #
11 | # 1. Redistributions of source code must retain the above copyright notice, this list of
12 | # conditions and the following disclaimer.
13 | #
14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list
15 | # of conditions and the following disclaimer in the documentation and/or other materials
16 | # provided with the distribution.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | from __future__ import unicode_literals, division, absolute_import, print_function
29 |
30 | import sys
31 | import codecs
32 |
33 | PY2 = sys.version_info[0] == 2
34 | PY3 = sys.version_info[0] == 3
35 |
36 | iswindows = sys.platform.startswith('win')
37 |
38 | try:
39 | from urllib.parse import unquote
40 | except ImportError:
41 | from urllib import unquote
42 |
43 | if PY2:
44 | from HTMLParser import HTMLParser
45 | _h = HTMLParser()
46 | elif sys.version_info[1] < 4:
47 | import html.parser
48 | _h = html.parser.HTMLParser()
49 | else:
50 | import html as _h
51 |
52 | if PY3:
53 | text_type = str
54 | binary_type = bytes
55 | # if will be printing arbitraty binary data to stdout on python 3
56 | # sys.stdin = sys.stdin.detach()
57 | # sys.stdout = sys.stdout.detach()
58 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
59 | else:
60 | range = xrange
61 | text_type = unicode
62 | binary_type = str
63 | # if will be printing unicode under python 2 need to protect
64 | # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
65 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
66 | # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
67 |
68 | # NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
69 | # (and they amazingly claim by design and no bug!)
70 |
71 | # To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
72 | # >>> o = '123456789'
73 | # >>> o[-3]
74 | # '7'
75 | # >>> type(o[-3])
76 | #
77 | # >>> type(o)
78 | #
79 |
80 | # Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
81 | # >>> o = b'123456789'
82 | # >>> o[-3]
83 | # 55
84 | # >>> type(o[-3])
85 | #
86 | # >>> type(o)
87 | #
88 |
89 | # This mind boggling behaviour also happens when indexing a bytestring and/or
90 | # iteratoring over a bytestring. In other words it will return an int but not
91 | # the byte itself!!!!!!!
92 |
93 | # The only way to access a single byte as a byte in bytestring and get the byte in both
94 | # Python 2 and Python 3 is to use a slice
95 |
96 | # This problem is so common there are horrible hacks floating around the net to **try**
97 | # to work around it, so that code that works on both Python 2 and Python 3 is possible.
98 |
99 | # So in order to write code that works on both Python 2 and Python 3
100 | # if you index or access a single byte and want its ord() then use the bord() function.
101 | # If instead you want it as a single character byte use the bchar() function
102 | # both of which are defined below.
103 |
if PY3:
    # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
    # in place of ascii you will get a byte value to half-word or integer value
    # one-to-one mapping (in the 0 - 255 range)

    def bchr(s):
        # int byte value -> single-byte bytestring
        return bytes([s])

    def bstr(s):
        # str -> bytes via latin-1 (one-to-one codepoint/byte mapping);
        # anything else is handed to the bytes() constructor unchanged
        if isinstance(s, str):
            return bytes(s, 'latin-1')
        else:
            return bytes(s)

    def bord(s):
        # indexing a py3 bytestring already yields an int, so identity
        return s

    def bchar(s):
        # int (from indexing a bytestring) -> single-byte bytestring
        return bytes([s])

else:
    def bchr(s):
        # int byte value -> single-char str (py2 bytestring)
        return chr(s)

    def bstr(s):
        return str(s)

    def bord(s):
        # single-char byte -> int ordinal
        return ord(s)

    def bchar(s):
        # py2 indexing already yields a one-char bytestring, so identity
        return s
136 |
if PY3:
    # list-producing versions of the major Python iterating functions
    # (py3 builtins return lazy iterators; these force a concrete list)
    def lrange(*args, **kwargs):
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        return list(filter(*args, **kwargs))
else:
    import __builtin__
    # Python 2-builtin ranges produce lists
    # (the py2 builtins already return lists, so alias them directly)
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter
157 |
# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii
def hexlify(bdata):
    """Return the lowercase hex representation of *bdata* as a unicode string."""
    hexbytes = binascii.hexlify(bdata)
    return hexbytes.decode('ascii')
163 |
164 | # If you: import struct
165 | # Note: struct pack, unpack, unpack_from all *require* bytestring format
166 | # data all the way up to at least Python 2.7.5, Python 3 is okay with either
167 |
168 | # If you: import re
169 | # note: Python 3 "re" requires the pattern to be the exact same type as the data to be
170 | # searched ... but u"" is not allowed for the pattern itself only b""
171 | # Python 2.X allows the pattern to be any type and converts it to match the data
172 | # and returns the same type as the data
173 |
# convert string to be utf-8 encoded
def utf8_str(p, enc='utf-8'):
    """Return *p* as a utf-8 encoded bytestring.

    None passes through; unicode is encoded; bytes in another encoding
    (*enc*) are transcoded to utf-8; utf-8 bytes are returned unchanged.
    """
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode('utf-8')
    if enc == 'utf-8':
        return p
    return p.decode(enc).encode('utf-8')
183 |
# convert string to be unicode encoded
def unicode_str(p, enc='utf-8'):
    """Return *p* as unicode, decoding bytes with *enc*; None passes through."""
    if p is None:
        return None
    return p if isinstance(p, text_type) else p.decode(enc)
191 |
# All 7-bit ASCII characters.
ASCII_CHARS = {chr(code) for code in range(128)}
# ASCII characters that may appear unescaped in an IRI.
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789#_.-/~')
# ASCII characters that must be percent-escaped by quoteurl().
IRI_UNSAFE = ASCII_CHARS - URL_SAFE
197 |
# returns a quoted IRI (not a URI)
def quoteurl(href):
    """Percent-escape the IRI-unsafe ASCII characters of *href*.

    Bytes input is decoded as utf-8 first; non-ASCII characters are left
    as-is (IRI quoting, not URI quoting).
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    quoted = []
    for ch in href:
        quoted.append("%%%02x" % ord(ch) if ch in IRI_UNSAFE else ch)
    return ''.join(quoted)
208 |
# unquotes url/iri
def unquoteurl(href):
    """Return *href* with percent-escapes expanded; bytes are decoded as utf-8 first."""
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return unquote(href)
215 |
# unescape html
def unescapeit(sval):
    """Expand HTML character entities in *sval* via the module-level ``_h`` helper."""
    # _h is created elsewhere in this module (not visible here); presumably an
    # HTMLParser/html helper instance — confirm at the top of the file.
    return _h.unescape(sval)
219 |
# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# ie. To get sys.argv arguments and properly encode them as unicode

def unicode_argv():
    """Return sys.argv as a list of unicode strings on both Python 2 and 3.

    On Python 3 the interpreter already provides unicode argv. On Python 2 +
    Windows the native Windows API is queried directly; elsewhere each byte
    argument is decoded with the best available encoding guess.
    """
    global iswindows
    global PY3
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            # (the full command line includes them; sys.argv does not).
            start = argc.value - len(sys.argv)
            return [argv[i] for i in
                    range(start, argc.value)]
        # this should never happen
        return None
    else:
        argv = []
        # Fall back through stdin encoding -> filesystem encoding -> utf-8.
        argvencoding = sys.stdin.encoding
        if argvencoding is None:
            argvencoding = sys.getfilesystemencoding()
        if argvencoding is None:
            argvencoding = 'utf-8'
        for arg in sys.argv:
            if isinstance(arg, text_type):
                argv.append(arg)
            else:
                argv.append(arg.decode(argvencoding))
        return argv
268 |
269 |
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    """Register 'cp65001' as an alias of utf-8 on Python 2 (no-op on Python 3)."""
    if not PY2:
        return
    try:
        codecs.lookup('cp65001')
    except LookupError:
        # Unknown codec: answer lookups for 'cp65001' with the utf-8 codec.
        codecs.register(
            lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
    return
279 |
--------------------------------------------------------------------------------
/lib/imghdr.py:
--------------------------------------------------------------------------------
1 | """Recognize image file formats based on their first few bytes."""
2 |
3 | # Python software and documentation are licensed under the
4 | # Python Software Foundation License Version 2.
5 |
6 | # Starting with Python 3.8.6, examples, recipes, and other code in
7 | # the documentation are dual licensed under the PSF License Version 2
8 | # and the Zero-Clause BSD license.
9 |
10 | # Some software incorporated into Python is under different licenses.
11 | # The licenses are listed with code falling under that license.
12 |
13 | # PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
14 | # --------------------------------------------
15 |
16 | # 1. This LICENSE AGREEMENT is between the Python Software Foundation
17 | # ("PSF"), and the Individual or Organization ("Licensee") accessing and
18 | # otherwise using this software ("Python") in source or binary form and
19 | # its associated documentation.
20 |
21 | # 2. Subject to the terms and conditions of this License Agreement, PSF hereby
22 | # grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
23 | # analyze, test, perform and/or display publicly, prepare derivative works,
24 | # distribute, and otherwise use Python alone or in any derivative version,
25 | # provided, however, that PSF's License Agreement and PSF's notice of copyright,
26 | # i.e., "Copyright (c) 2001 Python Software Foundation; All Rights Reserved"
27 | # are retained in Python alone or in any derivative version prepared by Licensee.
28 |
29 | # 3. In the event Licensee prepares a derivative work that is based on
30 | # or incorporates Python or any part thereof, and wants to make
31 | # the derivative work available to others as provided herein, then
32 | # Licensee hereby agrees to include in any such work a brief summary of
33 | # the changes made to Python.
34 |
35 | # 4. PSF is making Python available to Licensee on an "AS IS"
36 | # basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
37 | # IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
38 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
39 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
40 | # INFRINGE ANY THIRD PARTY RIGHTS.
41 |
42 | # 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
43 | # FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
44 | # A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
45 | # OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
46 |
47 | # 6. This License Agreement will automatically terminate upon a material
48 | # breach of its terms and conditions.
49 |
50 | # 7. Nothing in this License Agreement shall be deemed to create any
51 | # relationship of agency, partnership, or joint venture between PSF and
52 | # Licensee. This License Agreement does not grant permission to use PSF
53 | # trademarks or trade name in a trademark sense to endorse or promote
54 | # products or services of Licensee, or any third party.
55 |
56 | # 8. By copying, installing or otherwise using Python, Licensee
57 | # agrees to be bound by the terms and conditions of this License
58 | # Agreement.
59 |
60 | from os import PathLike
61 |
62 | __all__ = ["what"]
63 |
64 | #-------------------------#
65 | # Recognize image headers #
66 | #-------------------------#
67 |
def what(file, h=None):
    """Detect the image format of *file* (a path or a seekable binary stream).

    If *h* is given it is used as the header bytes instead of reading the
    file. Returns a format name such as 'jpeg' or 'png', or None when no
    registered probe recognizes the header.
    """
    f = None
    try:
        if h is None:
            if isinstance(file, (str, PathLike)):
                f = open(file, 'rb')
                h = f.read(32)
            else:
                # File-like object: peek at the header, then restore the cursor.
                pos = file.tell()
                h = file.read(32)
                file.seek(pos)
        for probe in tests:
            fmt = probe(h, f)
            if fmt:
                return fmt
        return None
    finally:
        if f:
            f.close()
86 |
87 |
88 | #---------------------------------#
89 | # Subroutines per image file type #
90 | #---------------------------------#
91 |
92 | tests = []
93 |
94 | def test_jpeg(h, f):
95 | """JPEG data in JFIF or Exif format"""
96 | if h[6:10] in (b'JFIF', b'Exif'):
97 | return 'jpeg'
98 |
99 | tests.append(test_jpeg)
100 |
101 | def test_png(h, f):
102 | if h.startswith(b'\211PNG\r\n\032\n'):
103 | return 'png'
104 |
105 | tests.append(test_png)
106 |
107 | def test_gif(h, f):
108 | """GIF ('87 and '89 variants)"""
109 | if h[:6] in (b'GIF87a', b'GIF89a'):
110 | return 'gif'
111 |
112 | tests.append(test_gif)
113 |
114 | def test_tiff(h, f):
115 | """TIFF (can be in Motorola or Intel byte order)"""
116 | if h[:2] in (b'MM', b'II'):
117 | return 'tiff'
118 |
119 | tests.append(test_tiff)
120 |
121 | def test_rgb(h, f):
122 | """SGI image library"""
123 | if h.startswith(b'\001\332'):
124 | return 'rgb'
125 |
126 | tests.append(test_rgb)
127 |
128 | def test_pbm(h, f):
129 | """PBM (portable bitmap)"""
130 | if len(h) >= 3 and \
131 | h[0] == ord(b'P') and h[1] in b'14' and h[2] in b' \t\n\r':
132 | return 'pbm'
133 |
134 | tests.append(test_pbm)
135 |
136 | def test_pgm(h, f):
137 | """PGM (portable graymap)"""
138 | if len(h) >= 3 and \
139 | h[0] == ord(b'P') and h[1] in b'25' and h[2] in b' \t\n\r':
140 | return 'pgm'
141 |
142 | tests.append(test_pgm)
143 |
144 | def test_ppm(h, f):
145 | """PPM (portable pixmap)"""
146 | if len(h) >= 3 and \
147 | h[0] == ord(b'P') and h[1] in b'36' and h[2] in b' \t\n\r':
148 | return 'ppm'
149 |
150 | tests.append(test_ppm)
151 |
152 | def test_rast(h, f):
153 | """Sun raster file"""
154 | if h.startswith(b'\x59\xA6\x6A\x95'):
155 | return 'rast'
156 |
157 | tests.append(test_rast)
158 |
159 | def test_xbm(h, f):
160 | """X bitmap (X10 or X11)"""
161 | if h.startswith(b'#define '):
162 | return 'xbm'
163 |
164 | tests.append(test_xbm)
165 |
166 | def test_bmp(h, f):
167 | if h.startswith(b'BM'):
168 | return 'bmp'
169 |
170 | tests.append(test_bmp)
171 |
172 | def test_webp(h, f):
173 | if h.startswith(b'RIFF') and h[8:12] == b'WEBP':
174 | return 'webp'
175 |
176 | tests.append(test_webp)
177 |
178 | def test_exr(h, f):
179 | if h.startswith(b'\x76\x2f\x31\x01'):
180 | return 'exr'
181 |
182 | tests.append(test_exr)
183 |
184 | #--------------------#
185 | # Small test program #
186 | #--------------------#
187 |
def test():
    """Command-line driver: identify the files/directories named on argv.

    A leading '-r' flag requests recursion into subdirectories; with no
    arguments the current directory is scanned.
    """
    import sys
    recursive = 0
    if sys.argv[1:] and sys.argv[1] == '-r':
        del sys.argv[1:2]
        recursive = 1
    try:
        targets = sys.argv[1:] if sys.argv[1:] else ['.']
        testall(targets, recursive, 1)
    except KeyboardInterrupt:
        sys.stderr.write('\n[Interrupted]\n')
        sys.exit(1)
202 |
def testall(list, recursive, toplevel):
    """Print the detected image type for every name in *list*.

    Directories are descended into when *recursive* or *toplevel* is set.
    """
    import sys
    import os
    for filename in list:
        if not os.path.isdir(filename):
            print(filename + ':', end=' ')
            sys.stdout.flush()
            try:
                print(what(filename))
            except OSError:
                print('*** not found ***')
            continue
        print(filename + '/:', end=' ')
        if not (recursive or toplevel):
            print('*** directory (use -r) ***')
            continue
        print('recursing down:')
        import glob
        # glob.escape protects directory names containing wildcard characters.
        names = glob.glob(os.path.join(glob.escape(filename), '*'))
        testall(names, recursive, 0)

if __name__ == '__main__':
    test()
226 |
--------------------------------------------------------------------------------
/lib/mobi_cover.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import unicode_str
8 |
9 | from .unipath import pathof
10 | import os
11 | from . import imghdr
12 |
13 | import struct
14 | # note: struct pack, unpack, unpack_from all require bytestring format
15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
16 |
USE_SVG_WRAPPER = True
""" Set to True to use svg wrapper for default. """

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

# NOTE(review): "FINENAME" is presumably a typo for "FILENAME"; left as-is
# because other modules may reference this constant by name.
COVER_PAGE_FINENAME = 'cover_page.xhtml'
""" The name for the cover page. """

DEFAULT_TITLE = 'Cover'
""" The default title for the cover page. """

MAX_WIDTH = 4096
""" The max width for the svg cover page. """

MAX_HEIGHT = 4096
""" The max height for the svg cover page. """
34 |
35 |
def get_image_type(imgname, imgdata=None):
    """Return the image format name of *imgname* (or raw *imgdata*), e.g. 'jpeg'.

    Falls back to sniffing bare SOI/EOI JPEG markers that imghdr does not
    recognize. Returns None when the format cannot be determined.
    """
    imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))

    if imgtype is None:
        # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are
        # some with only the magic JPEG bytes out there; ImageMagick handles
        # those, so accept them here as well.
        if imgdata is None:
            with open(pathof(imgname), 'rb') as f:
                imgdata = f.read()
        if imgdata[0:2] == b'\xFF\xD8':
            # Trim trailing NUL padding before checking the EOI marker.
            end = len(imgdata)
            while imgdata[end-1:end] == b'\x00':
                end -= 1
            # Be extra safe: require the JPEG end-of-image trailer too.
            if imgdata[end-2:end] == b'\xFF\xD9':
                imgtype = "jpeg"
    return imgtype
55 |
56 |
def get_image_size(imgname, imgdata=None):
    '''Determine the image type of imgname (or imgdata) and return its size.

    Returns a (width, height) tuple, or None when the header is too short,
    the format is unrecognized, or the dimensions cannot be parsed.

    Originally,
    Determine the image type of fhandle and return its size.
    from draco'''
    fhandle = None
    if imgdata is None:
        fhandle = open(pathof(imgname), 'rb')
        head = fhandle.read(24)
    else:
        head = imgdata[0:24]
    try:
        if len(head) != 24:
            return

        imgtype = get_image_type(imgname, imgdata)
        if imgtype == 'png':
            # Verify the remainder of the PNG signature before trusting IHDR.
            check = struct.unpack(b'>i', head[4:8])[0]
            if check != 0x0d0a1a0a:
                return
            width, height = struct.unpack(b'>ii', head[16:24])
        elif imgtype == 'gif':
            # GIF logical screen descriptor: two little-endian 16-bit values.
            width, height = struct.unpack(b'<HH', head[6:10])
        elif imgtype == 'jpeg' and imgdata is None:
            # Walk the JPEG marker segments in the file until a SOFn header.
            try:
                fhandle.seek(0)  # Read 0xff next
                size = 2
                ftype = 0
                while not 0xc0 <= ftype <= 0xcf:
                    fhandle.seek(size, 1)
                    byte = fhandle.read(1)
                    while ord(byte) == 0xff:
                        byte = fhandle.read(1)
                    ftype = ord(byte)
                    size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
                # We are at a SOFn block
                fhandle.seek(1, 1)  # Skip `precision' byte.
                height, width = struct.unpack(b'>HH', fhandle.read(4))
            except Exception:  # IGNORE:W0703
                return
        elif imgtype == 'jpeg' and imgdata is not None:
            # Same SOFn walk as above, but over the in-memory buffer.
            try:
                pos = 0
                size = 2
                ftype = 0
                while not 0xc0 <= ftype <= 0xcf:
                    pos += size
                    byte = imgdata[pos:pos+1]
                    pos += 1
                    while ord(byte) == 0xff:
                        byte = imgdata[pos:pos+1]
                        pos += 1
                    ftype = ord(byte)
                    size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
                    pos += 2
                # We are at a SOFn block
                pos += 1  # Skip `precision' byte.
                height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
                pos += 4
            except Exception:  # IGNORE:W0703
                return
        else:
            return
        return width, height
    finally:
        # Fix: the file handle was previously leaked on every call.
        if fhandle is not None:
            fhandle.close()
120 |
# XXX experimental
class CoverProcessor(object):

    """Create a cover page.

    """
    def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
        # files: unpack-directory layout object (k8text/k8images/... paths)
        # metadata: dict of metadata value lists (Title, Language, CoverOffset, ...)
        # rscnames: resource file names indexed by resource number
        # imgname/imgdata: explicit cover image name and/or raw bytes, if known
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        self.cover_page = COVER_PAGE_FINENAME
        self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
        self.lang = metadata.get('Language', ['en'])[0]
        # This should ensure that if the methods to find the cover image's
        # dimensions should fail for any reason, the SVG routine will not be used.
        [self.width, self.height] = (-1,-1)
        if FORCE_DEFAULT_TITLE:
            self.title = DEFAULT_TITLE
        else:
            self.title = metadata.get('Title', [DEFAULT_TITLE])[0]

        self.cover_image = None
        if imgname is not None:
            self.cover_image = imgname
        elif 'CoverOffset' in metadata:
            # CoverOffset indexes into rscnames to locate the cover resource.
            imageNumber = int(metadata['CoverOffset'][0])
            cover_image = self.rscnames[imageNumber]
            if cover_image is not None:
                self.cover_image = cover_image
            else:
                print('Warning: Cannot identify the cover image.')
        if self.use_svg:
            try:
                if imgdata is None:
                    fname = os.path.join(files.imgdir, self.cover_image)
                    [self.width, self.height] = get_image_size(fname)
                else:
                    [self.width, self.height] = get_image_size(None, imgdata)
            except:
                # Any failure to measure the image disables the SVG wrapper.
                self.use_svg = False
            width = self.width
            height = self.height
            if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
                self.use_svg = False
        return

    def getImageName(self):
        """Return the cover image file name (or None if not identified)."""
        return self.cover_image

    def getXHTMLName(self):
        """Return the file name of the generated cover page."""
        return self.cover_page

    def buildXHTML(self):
        # NOTE(review): the markup string literals in this method appear
        # truncated in this copy of the file (tag text stripped, some lines
        # missing); verify the generated XHTML against upstream KindleUnpack
        # before relying on it.
        print('Building a cover page.')
        files = self.files
        cover_image = self.cover_image
        title = self.title
        lang = self.lang

        # Path from the text directory to the image, with forward slashes.
        image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
        image_path = os.path.join(image_dir, cover_image).replace('\\', '/')

        if not self.use_svg:
            data = ''
            data += ''
            data += '\n'.format(lang)
            data += '\n{:s} \n'.format(title)
            data += '\n\n'
            data += '\n'
            data += '
\n'.format(image_path)
            data += '
\n'
        else:
            width = self.width
            height = self.height
            # SVG viewBox sized to the measured cover image dimensions.
            viewBox = "0 0 {0:d} {1:d}".format(width, height)

            data = ''
            data += ''
            data += '\n'.format(lang)
            data += '\n {:s} \n'.format(title)
            data += '\n\n'
            data += '\n \n'
            data += ' \n'.format(viewBox)
            data += ' \n'.format(height, width, image_path)
            data += ' \n'
            data += '
\n\n'
        return data

    def writeXHTML(self):
        """Write the cover page to the text directory, replacing any existing one."""
        files = self.files
        cover_page = self.cover_page

        data = self.buildXHTML()

        outfile = os.path.join(files.k8text, cover_page)
        if os.path.exists(pathof(outfile)):
            print('Warning: {:s} already exists.'.format(cover_page))
            os.remove(pathof(outfile))
        with open(pathof(outfile), 'wb') as f:
            f.write(data.encode('utf-8'))
        return

    def guide_toxml(self):
        """Return the OPF guide entry for the cover page."""
        # NOTE(review): the format string below has no placeholders in this
        # copy, yet two arguments are supplied — the literal looks stripped;
        # confirm against upstream.
        files = self.files
        text_dir = os.path.relpath(files.k8text, files.k8oebps)
        data = ' \n'.format(
            text_dir, self.cover_page)
        return data
239 |
--------------------------------------------------------------------------------
/lib/mobi_dict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
8 |
if PY2:
    # Python 2: lazy xrange as range; array.array type codes must be bytes.
    range = xrange
    array_format = b'B'
if PY3:
    # Python 3: chr already returns unicode (no unichr); type codes are str.
    unichr = chr
    array_format = "B"
15 |
16 | import array
17 |
18 | import struct
19 | # note: struct pack, unpack, unpack_from all require bytestring format
20 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
21 |
22 | from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
23 | from .mobi_utils import toHex
24 |
#python 3.9 dropped support for array tostring()
def convert_to_bytes(ar):
    """Return the raw contents of array *ar* as a bytestring, portably."""
    return ar.tostring() if PY2 else ar.tobytes()

# Master switch for the verbose dictionary-parsing trace output below.
DEBUG_DICT = False
32 |
class InflectionData(object):
    """Wraps one or more raw inflection data sections and resolves entry
    lookups across them as if they formed one continuous table."""

    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for section in self.infldatas:
            # Each section stores its offset table start at 0x14 and its
            # entry count at 0x18, both big-endian 32-bit.
            tbl_start, = struct.unpack_from(b'>L', section, 0x14)
            tbl_count, = struct.unpack_from(b'>L', section, 0x18)
            self.starts.append(tbl_start)
            self.counts.append(tbl_count)

    def lookup(self, lookupvalue):
        """Map a global entry index to (local index, start, count, section data)."""
        remaining = lookupvalue
        section = 0
        while remaining >= self.counts[section]:
            remaining -= self.counts[section]
            section += 1
            if section == len(self.counts):
                # Out of range: report and fall back to the first section.
                print("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return remaining, self.starts[section], self.counts[section], self.infldatas[section]

    def offsets(self, value):
        """Return (offset, nextOffset, data) for entry *value*.

        nextOffset is None when *value* is the last entry of its section.
        """
        local, tbl_start, tbl_count, data = self.lookup(value)
        offset, = struct.unpack_from(b'>H', data, tbl_start + 4 + (2 * local))
        nextOffset = None
        if local + 1 < tbl_count:
            nextOffset, = struct.unpack_from(b'>H', data, tbl_start + 4 + (2 * (local + 1)))
        return offset, nextOffset, data
64 |
65 |
class dictSupport(object):
    """Parse a MOBI dictionary's orthographic and inflection indexes and build
    a map of raw-text positions to the index markup to inject there."""

    def __init__(self, mh, sect):
        # mh: mobi header object supplying the orth/infl index section numbers
        # sect: section loader with loadSection(n) -> bytes
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        "read INDX header"
        # Returns (header dict, ordt1, ordt2), or False if the magic is wrong.
        if not data[:4] == b'INDX':
            print("Warning: index section is not INDX")
            return False
        # Thirteen big-endian 32-bit fields follow the 4-byte INDX magic.
        words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
        header['otype'] = otype
        header['oentries'] = oentries

        if DEBUG_DICT:
            print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))

        if header['code'] == 0xfdea or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if DEBUG_DICT:
            print("parsed INDX header:")
            for key in header:
                print(key, "%x" % header[key],)
            print("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        """Build and return {text position: markup bytes} for the dictionary.

        NOTE(review): several byte-string literals below appear truncated in
        this copy of the file (idx-entry tag text stripped); verify them
        against upstream KindleUnpack before relying on the emitted markup.
        """
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        # 0xFFFFFFFF marks "no index present".
        if metaOrthIndex != 0xFFFFFFFF:
            print("Info: Document contains orthographic index, handle as dictionary")
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                # Inflection data sections follow the inflection index header.
                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr['count']
            print("orthIndexCount is", orthIndexCount)
            if DEBUG_DICT:
                print("orthTagTable: %s" % tagTable)
            if hordt2 is not None:
                print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print("Info: Index doesn't contain entry length tags")

            print("Read dictionary index data")
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                # IDXT: table of 16-bit offsets, one per entry in this section.
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    # Entry starts with a 1-byte length followed by the word text.
                    textLength = ord(data[startPos:startPos+1])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        # Remap the text through the ORDT2 lookup table
                        # (16-bit offsets when otype == 0, 8-bit otherwise).
                        utext = u""
                        if idxhdr['otype'] == 0:
                            pattern = b'>H'
                            inc = 2
                        else:
                            pattern = b'>B'
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            off, = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode('utf-8')

                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
                                                                        dinfl, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = b''
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = b'' + inflectionGroups + b' '
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                # Closing markup is prepended so it precedes any
                                # opening markup already queued at this position.
                                positionMap[entryEndPosition] = b" " + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = b""

                        else:
                            indexTags = b'\n\n' + inflectionGroups + b' \n'
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        '''
        Test if tag table contains given tag.

        @param tagTable: The tag table.
        @param tag: The tag to search.
        @return: True if tag table contains given tag; False otherwise.
        '''
        for currentTag, _, _, _ in tagTable:
            if currentTag == tag:
                return True
        return False

    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
        '''
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param data: The Inflection data object to properly select the right inflection data section to use
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        '''
        result = b""
        for value in groupList:
            offset, nextOffset, data = dinfl.offsets(value)

            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset:offset+1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                # NOTE(review): returns a unicode str here but bytes everywhere
                # else in this method; callers concatenate the result with
                # bytes — this looks like a latent type bug.
                return ""
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return b''

            result += b''

            for i in range(len(tagMap[0x05])):

                # Get name of inflection rule.
                value = tagMap[0x05][i]
                consumed, textLength = getVariableWidthValue(inflectionNames, value)
                inflectionName = inflectionNames[value+consumed:value+consumed+textLength]

                # Get and apply inflection rule across possibly multiple inflection data sections
                value = tagMap[0x1a][i]
                rvalue, start, count, data = dinfl.lookup(value)
                offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
                textLength = ord(data[offset:offset+1])
                inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
                if inflection is not None:
                    result += b' '

            result += b' '
        return result

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        '''
        # The rule is a byte program: 0x01-0x04 select an edit mode/cursor
        # position, 0x0a-0x13 move the cursor backwards, and bytes > 0x13 are
        # literal characters to insert or verify-delete at the cursor.
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset:charOffset+1]
            abyte = ord(char)
            if abyte >= 0x0a and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0a
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                elif position == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                # NOTE(review): debug label says 0x03 but this is the 0x04 branch.
                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    else:
                        print("Error: Inflection rule mode %x is not implemented" % mode)
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                # Delete at word start
                mode = abyte
            else:
                print("Error: Inflection rule mode %x is not implemented" % abyte)
                return None
        return utf8_str(convert_to_bytes(byteArray))
384 |
--------------------------------------------------------------------------------
/lib/mobi_html.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, utf8_str
8 |
9 | if PY2:
10 | range = xrange
11 |
12 | import re
13 | # note: re requires the pattern to be the exact same type as the data to be searched in python3
14 | # but u"" is not allowed for the pattern itself only b""
15 |
16 | from .mobi_utils import fromBase32
17 |
class HTMLProcessor:
    """Post-process the raw html text of an old-style (MOBI7) book.

    Inserts anchors for filepos targets, converts filepos links into hrefs,
    and rewrites recindex image references to the extracted image files.

    NOTE(review): several byte-string literals in this copy of the file look
    truncated (anchor/img/meta tag text stripped, leaving things like ' ' and
    '()'); verify them against the upstream source before relying on them.
    """

    def __init__(self, files, metadata, rscnames):
        # files: unpack output path helper; metadata: metadata dict;
        # rscnames: resource (image/font) file names indexed by record number.
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        # for original style mobis, default to including all image files in the opf manifest
        self.used = {}
        for name in rscnames:
            self.used[name] = 'used'

    def findAnchors(self, rawtext, indx_data, positionMap):
        # Collect every filepos target (from links in the text and from the
        # NCX index entries) and record an anchor for each in positionMap,
        # then splice the accumulated positionMap strings into the text.
        # process the raw text
        # find anchors...
        print("Find link anchors")
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
        # TEST NCX: merge in filepos from indx
        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
        if indx_data:
            pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
            pos_links = list(set(pos_links + pos_indx))

        for position in pos_links:
            # NOTE(review): the anchor literal below appears truncated to ' '
            # in this copy (presumably an <a id=...> literal was stripped); as
            # written, ' ' % position raises TypeError — verify upstream.
            if position in positionMap:
                positionMap[position] = positionMap[position] + utf8_str(' ' % position)
            else:
                positionMap[position] = utf8_str(' ' % position)

        # apply dictionary metadata and anchors
        print("Insert data into html")
        pos = 0
        lastPos = len(rawtext)
        dataList = []
        for end in sorted(positionMap.keys()):
            if end == 0 or end > lastPos:
                continue  # something's up - can't put a tag outside the document
            dataList.append(rawtext[pos:end])
            dataList.append(positionMap[end])
            pos = end
        dataList.append(rawtext[pos:])
        srctext = b"".join(dataList)
        rawtext = None
        dataList = None
        # keep the processed text for insertHREFS()
        self.srctext = srctext
        self.indx_data = indx_data
        return srctext

    def insertHREFS(self):
        # Convert filepos-style links into ordinary hrefs, drop empty anchors,
        # rewrite recindex image references, and (if a codec is known) splice
        # a charset meta element into the html header.
        srctext = self.srctext
        rscnames = self.rscnames
        metadata = self.metadata

        # put in the hrefs
        print("Insert hrefs into html")
        # There doesn't seem to be a standard, so search as best as we can

        # NOTE(review): the pattern and replacement literals below look
        # truncated in this copy (leading '<a' fragments stripped) — verify.
        link_pattern = re.compile(br''']*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
        srctext = link_pattern.sub(br''' ''', srctext)

        # remove empty anchors
        print("Remove empty anchors from html")
        srctext = re.sub(br" ",br"", srctext)
        srctext = re.sub(br"\s* ",br"", srctext)

        # convert image references
        print("Insert image references into html")
        # split string into image tag pieces and other pieces
        # NOTE(review): the split pattern is an empty group here — likely a
        # stripped '<img...>' pattern; verify upstream.
        image_pattern = re.compile(br'''()''', re.IGNORECASE)
        image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
        srcpieces = image_pattern.split(srctext)
        srctext = self.srctext = None

        # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext)
        for i in range(1, len(srcpieces), 2):
            tag = srcpieces[i]
            for m in image_index_pattern.finditer(tag):
                imageNumber = int(m.group(1))
                # recindex values are 1-based; rscnames is 0-based
                imageName = rscnames[imageNumber-1]
                if imageName is None:
                    print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
                else:
                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
                    tag = image_index_pattern.sub(replacement, tag, 1)
            srcpieces[i] = tag
        srctext = b"".join(srcpieces)

        # add in character set meta into the html header if needed
        if 'Codec' in metadata:
            # NOTE(review): the spliced-in literal is just b' ' here — the
            # original meta charset element appears stripped; verify upstream.
            srctext = srctext[0:12]+b' '+srctext[12:]
        return srctext, self.used
108 |
109 |
class XHTMLK8Processor:
    """Turn KF8 (MOBI8) skeleton parts and flows into final xhtml.

    Resolves kindle:pos:fid internal links, kindle:flow and kindle:embed
    resource links, strips kindlegen 'aid' attributes, rewrites
    data-AmznPageBreak attributes, and records used resources in self.used.

    NOTE(review): this copy of the file is damaged — at least three spans of
    the original source (roughly original lines 228-321, 395-413 and 426-438)
    were collapsed into single garbled lines inside buildXHTML, so this text
    does not compile as-is; restore from upstream before editing.
    """

    def __init__(self, rscnames, k8proc, viewport=None):
        # rscnames: resource file names indexed by resource number.
        # k8proc: the K8 processor holding the parts/flows to rewrite.
        # viewport: optional viewport meta content to inject — assumed to come
        # from book metadata, TODO confirm against caller.
        self.rscnames = rscnames
        self.k8proc = k8proc
        self.viewport = viewport
        self.used = {}

    def buildXHTML(self):
        # Rewrite every part and flow held by self.k8proc in place and
        # return the map of used resource names.

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
        # XXXX is the offset in records into divtbl
        # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        # NOTE(review): this split pattern is an empty group in this copy —
        # presumably a stripped '<a...>' tag pattern; verify upstream.
        posfid_pattern = re.compile(br'''()''', re.IGNORECASE)
        posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

        parts = []
        print("Building proper xhtml for each file")
        for i in range(self.k8proc.getNumberOfParts()):
            part = self.k8proc.getPart(i)
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)

            # internal links
            srcpieces = posfid_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in posfid_index_pattern.finditer(tag):
                        posfid = m.group(1)
                        offset = m.group(2)
                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                        if idtag == b'':
                            replacement= b'"' + utf8_str(filename) + b'"'
                        else:
                            replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
                        tag = posfid_index_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts.append(part)

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        # change aid ids that are in k8proc.linked_aids to xhtml ids
        find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_aid_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in within_tag_aid_position_pattern.finditer(tag):
                        try:
                            aid = m.group(1)
                        except IndexError:
                            aid = None
                        replacement = b''
                        if aid in self.k8proc.linked_aids:
                            # something links to this aid, so keep it as an id
                            replacement = b' id="aid-' + aid + b'"'
                        tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts[i] = part

        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
        # with page-break-after style patterns
        find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                        lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
            part = b"".join(srcpieces)
            parts[i] = part

        # we have to handle substitutions for the flows pieces first as they may
        # be inlined into the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        # kindle:embed:XXXX (used for fonts)

        flows = []
        flows.append(None)
        flowinfo = []
        flowinfo.append([None, None, None, None])

        # regular expression search patterns
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        tag_pattern = re.compile(br'''(<[^>]*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

        url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
        url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
        font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
        url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
        url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)

        for i in range(1, self.k8proc.getNumberOfFlows()):
            [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
            flowpart = self.k8proc.getFlow(i)

            # links to raster image files from image tags
            # image_pattern
            srcpieces = img_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~228-321 (the flow
                # image/css/svg link rewriting and storing of flows) are missing
                # from this copy and were collapsed into this broken line.
                if tag.startswith(b']*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
            # flow pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in flow_pattern.finditer(tag):
                        num = fromBase32(m.group(1))
                        if num > 0 and num < len(self.k8proc.flowinfo):
                            [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                            flowpart = flows[num]
                            if fmt == b'inline':
                                # inline flows are spliced directly into the part
                                tag = flowpart
                            else:
                                replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                                tag = flow_pattern.sub(replacement, tag, 1)
                                self.used[fnm] = 'used'
                        else:
                            print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
                    srcpieces[j] = tag
            part = b''.join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in style= attributes urls
        style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # replace urls in style attributes
            srcpieces = style_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if b'kindle:embed' in tag:
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        # embed numbers are 1-based; rscnames is 0-based
                        imageName = self.rscnames[imageNumber-1]
                        # keep the original opening/closing separator characters
                        osep = m.group()[0:1]
                        csep = m.group()[-1:]
                        if imageName is not None:
                            replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
                    srcpieces[j] = tag
            part = b"".join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # links to raster image files
            # image_pattern
            srcpieces = img_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~395-413 (the img
                # tag rewrite loop body) were collapsed into this broken line.
                if tag.startswith(b' remove value="XX" attributes since these are illegal
        tag_pattern = re.compile(br'''(<[^>]*>)''')
        li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # tag pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~426-438 (the li
                # value removal and the construction of injected_meta, used
                # below) were collapsed into this broken line.
                if tag.startswith(b' \n'
        viewport_pattern = re.compile(br''' ]*name\s*=\s*["'][^"'>]*viewport["'][^>]*>''', re.IGNORECASE)
        for i in range(len(parts)):
            part = parts[i]
            # only inject if a viewport meta item does not already exist in that part
            if not viewport_pattern.search(part):
                # NOTE(review): the b'' search target below looks truncated —
                # presumably the closing head tag; verify upstream.
                endheadpos = part.find(b'')
                if endheadpos >= 0:
                    part = part[0:endheadpos] + injected_meta + part[endheadpos:]
                    parts[i] = part

        self.k8proc.setFlows(flows)
        self.k8proc.setParts(parts)

        return self.used
454 |
--------------------------------------------------------------------------------
/lib/mobi_index.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, bchr, bstr, bord
8 | if PY2:
9 | range = xrange
10 |
11 | import struct
12 | # note: struct pack, unpack, unpack_from all require bytestring format
13 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
14 |
15 | from .mobi_utils import toHex
16 |
class MobiIndex:
    """Parser for MOBI INDX index section chains (NCX, skeleton, guide, ...)."""

    def __init__(self, sect, DEBUG=False):
        # sect: the sectionizer granting access to the PalmDB sections.
        self.sect = sect
        self.DEBUG = DEBUG

    def getIndexData(self, idx, label="Unknown"):
        """Parse the INDX chain whose main header is at section *idx*.

        @param idx: section number of the main INDX section, or 0xffffffff for none.
        @param label: human-readable label used in section descriptions.
        @return: tuple (outtbl, ctoc_text) where outtbl is a list of
                 [entry_text_bytes, tagMap] pairs and ctoc_text maps CTOC
                 offsets to their text.
        """
        sect = self.sect
        outtbl = []
        ctoc_text = {}
        if idx != 0xffffffff:
            sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
            data = sect.loadSection(idx)
            idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
            IndexCount = idxhdr['count']
            # handle the case of multiple sections used for CTOC
            rec_off = 0
            off = idx + IndexCount + 1
            for j in range(idxhdr['nctoc']):
                cdata = sect.loadSection(off + j)
                sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
                ctocdict = self.readCTOC(cdata)
                # offsets in later CTOC sections are biased by 0x10000 per section
                for k in ctocdict:
                    ctoc_text[k + rec_off] = ctocdict[k]
                rec_off += 0x10000
            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            if self.DEBUG:
                print("ControlByteCount is", controlByteCount)
                print("IndexCount is", IndexCount)
                print("TagTable: %s" % tagTable)
            for i in range(idx + 1, idx + 1 + IndexCount):
                sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                if self.DEBUG:
                    print(idxtPos, entryCount)
                # loop through to build up the IDXT position starts
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                # for each entry in the IDXT build up the tagMap and any associated text
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    # entry starts with a 1-byte length followed by the entry text
                    textLength = ord(data[startPos:startPos+1])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        # remap entry text through the ORDT2 table when present
                        text = b''.join(bchr(hordt2[bord(x)]) for x in text)
                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    outtbl.append([text, tagMap])
                    if self.DEBUG:
                        print(tagMap)
                        print(text)
        return outtbl, ctoc_text

    def parseINDXHeader(self, data):
        "read INDX header"
        if not data[:4] == b'INDX':
            print("Warning: index section is not INDX")
            return False
        words = (
                'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
                'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
        if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
            # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
            # them in the proper place in the header. They seem to be codepage 65002 which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings

            # so we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            assert(ocnt == 1)
            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if self.DEBUG:
            print("parsed INDX header:")
            for n in words:
                print(n, "%X" % header[n],)
            print("")
        return header, ordt1, ordt2

    def readCTOC(self, txtdata):
        # read all blocks from CTOC
        ctoc_data = {}
        offset = 0
        # NOTE(review): the next line is damaged in this copy — the original
        # loop header (iterate while offset is inside txtdata, break on a nul
        # byte, record idx_offs, and read the name length ilen as a variable
        # width value) was collapsed into this broken line, which leaves
        # idx_offs and ilen undefined below; restore from upstream.
        while offset next bytes: name
            name = txtdata[offset:offset+ilen]
            offset += ilen
            if self.DEBUG:
                print("name length is ", ilen)
                print(idx_offs, name)
            ctoc_data[idx_offs] = name
        return ctoc_data
141 |
142 |
def getVariableWidthValue(data, offset):
    '''
    Decode variable width value from given bytes.

    Bytes are consumed big-endian, seven payload bits at a time; a set
    high bit (0x80) marks the final byte of the value.

    @param data: The bytes to decode.
    @param offset: The start offset into data.
    @return: Tuple of consumed bytes count and decoded value.
    '''
    value = 0
    consumed = 0
    while True:
        byte = ord(data[offset + consumed:offset + consumed + 1])
        consumed += 1
        value = (value << 7) | (byte & 0x7f)
        if byte & 0x80:
            break
    return consumed, value
161 |
162 |
def readTagSection(start, data):
    '''
    Read tag section from given data.

    @param start: The start position in the data.
    @param data: The data to process.
    @return: Tuple of control byte count and list of tag 4-tuples
             (tag, values per entry, mask, end-of-control-bytes flag).
    '''
    tags = []
    # A tag section must begin with the TAGX magic; anything else yields
    # zero control bytes and no tags.
    if data[start:start + 4] != b"TAGX":
        return 0, tags
    # The 12-byte TAGX header: magic, first entry offset, control byte count.
    firstEntryOffset, controlByteCount = struct.unpack_from(b'>LL', data, start + 0x04)
    # Every entry after the header is four single-byte fields.
    for entryStart in range(12, firstEntryOffset, 4):
        pos = start + entryStart
        tags.append((
            ord(data[pos:pos + 1]),
            ord(data[pos + 1:pos + 2]),
            ord(data[pos + 2:pos + 3]),
            ord(data[pos + 3:pos + 4]),
        ))
    return controlByteCount, tags
182 |
183 |
def countSetBits(value, bits=8):
    '''
    Count the set bits in the given value.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits within the lowest *bits* bits.
    '''
    # Mask down to the requested width first (this also handles negative
    # values via Python's two's-complement semantics for &), then let
    # bin() do the per-bit work instead of a manual shift loop.
    return bin(value & ((1 << bits) - 1)).count('1')
198 |
199 |
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table, a list of (tag, valuesPerEntry, mask,
                     endFlag) tuples as produced by readTagSection().
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    # variable width values begin immediately after the control bytes
    dataStart = startPos + controlByteCount

    # Pass 1: inspect the control bytes to work out, per tag, either how many
    # values follow (valueCount) or how many value bytes follow (valueBytes).
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # end-of-control-byte marker: move on to the next control byte
            controlByteIndex += 1
            continue
        cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
        if 0:
            print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))

        value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
        if value != 0:
            if value == mask:
                if countSetBits(mask) > 1:
                    # If all bits of masked value are set and the mask has more than one bit, a variable width value
                    # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                    # which will contain the corresponding variable width values.
                    consumed, value = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    tags.append((tag, None, value, valuesPerEntry))
                else:
                    # single-bit mask fully set: exactly one entry follows
                    tags.append((tag, 1, None, valuesPerEntry))
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                tags.append((tag, value, None, valuesPerEntry))
    # Pass 2: decode the variable width values for each recorded tag.
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount is not None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
        tagHashMap[tag] = values
    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        for char in entryData[dataStart:endPos]:
            if bord(char) != 0:
                print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
                if 0:
                    print("controlByteCount: %s" % controlByteCount)
                    print("tagTable: %s" % tagTable)
                    print("data: %s" % toHex(entryData[startPos:endPos]))
                    print("tagHashMap: %s" % tagHashMap)
                break

    return tagHashMap
277 |
--------------------------------------------------------------------------------
/lib/mobi_k8resc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7.
8 | """ set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
9 |
10 | if DEBUG_USE_ORDERED_DICTIONARY:
11 | from collections import OrderedDict as dict_
12 | else:
13 | dict_ = dict
14 |
15 | from .compatibility_utils import unicode_str
16 |
17 | from .mobi_utils import fromBase32
18 |
19 | _OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
20 | 'x-metadata', 'manifest', 'spine', 'tours', 'guide']
21 |
class K8RESCProcessor(object):
    """Parse the RESC (resource container) section of a KF8 book.

    Extracts the spine order, spine item attributes, cover name and extra
    OPF metadata so they can be merged into the rebuilt epub OPF.

    NOTE(review): this copy of the file is damaged — the comment-handling
    code in parseresc() and the closing-tag literal in taginfo_toxml() were
    garbled by tag stripping; restore those spots from upstream before use.
    """

    def __init__(self, data, debug=False):
        # data: the raw RESC section bytes, starting with a small header
        # before the first '<' of the embedded OPF-like xml.
        self._debug = debug
        self.resc = None           # unicode text of the RESC xml once parsed
        self.opos = 0              # current parse offset into self.resc
        self.extrameta = []        # [tname, tattr, tcontent] triples of extra metadata
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageattributes = {}
        self.spine_ppd = None
        # need3 indicate the book has fields which require epub3.
        # but the estimation of the source epub version from the fields is difficult.
        self.need3 = False
        self.package_ver = None
        self.extra_metadata = []
        self.refines_metadata = []
        self.extra_attributes = []
        # get header
        start_pos = data.find(b'<')
        self.resc_header = data[:start_pos]
        # get resc data length (base32 value between '=' and '&' in the header)
        start = self.resc_header.find(b'=') + 1
        end = self.resc_header.find(b'&', start)
        resc_size = 0
        if end > 0:
            resc_size = fromBase32(self.resc_header[start:end])
        resc_rawbytes = len(data) - start_pos
        if resc_rawbytes == resc_size:
            self.resc_length = resc_size
        else:
            # Most RESC has a nul string at its tail but some do not.
            end_pos = data.find(b'\x00', start_pos)
            if end_pos < 0:
                self.resc_length = resc_rawbytes
            else:
                self.resc_length = end_pos - start_pos
            if self.resc_length != resc_size:
                print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
        # now parse RESC after converting it to unicode from utf-8
        try:
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
        except UnicodeDecodeError:
            # fall back to latin-1 so parsing can still proceed
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
        self.parseData()

    def prepend_to_spine(self, key, idref, linear, properties):
        # Insert a spine entry at the front (used e.g. for a coverpage).
        self.spine_order = [key] + self.spine_order
        self.spine_idrefs[key] = idref
        attributes = {}
        if linear is not None:
            attributes['linear'] = linear
        if properties is not None:
            attributes['properties'] = properties
        self.spine_pageattributes[key] = attributes

    # RESC tag iterator
    def resc_tag_iter(self):
        # Yields (prefix, tname, tattr, tcontent) for every interesting tag,
        # tracking the nesting path of OPF parent tags in 'prefix'.
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(' \r\n')
            else:  # we have a tag
                ttype, tname, tattr = self.parsetag(tag)
                if ttype == 'begin':
                    tcontent = None
                    prefix.append(tname + '.')
                    if tname in _OPF_PARENT_TAGS:
                        yield ''.join(prefix), tname, tattr, tcontent
                    else:
                        # defer: attributes belong to the matching end tag
                        last_tattr = tattr
                else:  # single or end
                    if ttype == 'end':
                        prefix.pop()
                        tattr = last_tattr
                        last_tattr = None
                    if tname in _OPF_PARENT_TAGS:
                        tname += '-end'
                    yield ''.join(prefix), tname, tattr, tcontent
                    tcontent = None

    # now parse the RESC to extract spine and extra metadata info
    def parseData(self):
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            if self._debug:
                print("    Parsing RESC: ", prefix, tname, tattr, tcontent)
            if tname == 'package':
                self.package_ver = tattr.get('version', '2.0')
                package_prefix = tattr.get('prefix','')
                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
                    self.need3 = True
            if tname == 'spine':
                # NOTE(review): 'page-progession-direction' is misspelled
                # ('progression') so this lookup may never match — verify
                # against upstream/kindlegen output before changing.
                self.spine_ppd = tattr.get('page-progession-direction', None)
                if self.spine_ppd is not None and self.spine_ppd == 'rtl':
                    self.need3 = True
            if tname == 'itemref':
                skelid = tattr.pop('skelid', None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = 'coverpage'
                    tattr['linear'] = 'no'
                self.spine_order.append(skelid)
                idref = tattr.pop('idref', None)
                if idref is not None:
                    # 'x_' prefix keeps RESC ids distinct from generated ids
                    idref = 'x_' + idref
                self.spine_idrefs[skelid] = idref
                if 'id' in tattr:
                    del tattr['id']
                # tattr["id"] = 'x_' + tattr["id"]
                if 'properties' in tattr:
                    self.need3 = True
                self.spine_pageattributes[skelid] = tattr
            if tname == 'meta' or tname.startswith('dc:'):
                if 'refines' in tattr or 'property' in tattr:
                    self.need3 = True
                if tattr.get('name','') == 'cover':
                    cover_name = tattr.get('content',None)
                    if cover_name is not None:
                        cover_name = 'x_' + cover_name
                    self.cover_name = cover_name
                else:
                    self.extrameta.append([tname, tattr, tcontent])

    # parse and return either leading text or the next tag
    def parseresc(self):
        p = self.opos
        if p >= len(self.resc):
            return None, None
        if self.resc[p] != '<':
            # plain text up to the next tag (or end of data)
            res = self.resc.find('<',p)
            if res == -1 :
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None
        # handle comment as a special case
        # NOTE(review): the next line is damaged in this copy — the original
        # comparison against the xml comment opener and the search for its
        # closer were collapsed into this broken line; restore from upstream.
        if self.resc[p:p+4] == '',p+1)
        if te != -1:
            te = te+2
        else:
            te = self.resc.find('>',p+1)
            ntb = self.resc.find('<',p+1)
            if ntb != -1 and ntb < te:
                # stray '<' before the tag closes: treat the span as text
                self.opos = ntb
                return self.resc[p:ntb], None
        self.opos = te + 1
        return None, self.resc[p:te+1]

    # parses tag to identify:  [tname, ttype, tattr]
    #    tname: tag name
    #    ttype: tag type ('begin', 'end' or 'single');
    #    tattr: dictionary of tag attributes
    def parsetag(self, s):
        p = 1
        tname = None
        ttype = None
        tattr = dict_()
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
            p += 1
        tname=s[b:p].lower()
        # some special cases
        if tname == '?xml':
            tname = 'xml'
        if tname == '!--':
            ttype = 'single'
            comment = s[p:-3].strip()
            tattr['comment'] = comment
        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    # quoted attribute value
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'"):
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    # unquoted attribute value
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        if ttype is None:
            ttype = 'begin'
            if s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def taginfo_toxml(self, taginfo):
        # Serialize a (tname, tattr, tcontent) triple back into an xml string.
        res = []
        tname, tattr, tcontent = taginfo
        res.append('<' + tname)
        if tattr is not None:
            for key in tattr:
                res.append(' ' + key + '="'+tattr[key]+'"')
        if tcontent is not None:
            # NOTE(review): the closing-tag literal looks truncated here; the
            # '' before tname was presumably '</' before this copy was mangled.
            res.append('>' + tcontent + '' + tname + '>\n')
        else:
            res.append('/>\n')
        return "".join(res)

    def hasSpine(self):
        # True when the RESC carried any spine itemrefs.
        return len(self.spine_order) > 0

    def needEPUB3(self):
        # True when any parsed field requires epub3 output.
        return self.need3

    def hasRefines(self):
        # True when any extra metadata entry carries a 'refines' attribute.
        for [tname, tattr, tcontent] in self.extrameta:
            if 'refines' in tattr:
                return True
        return False

    def createMetadata(self, epubver):
        # Split the collected extra metadata into refines metadata, plain
        # extra metadata and (for epubver 'F') extra opf: attributes.
        for taginfo in self.extrameta:
            tname, tattr, tcontent = taginfo
            if 'refines' in tattr:
                if epubver == 'F' and 'property' in tattr:
                    attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
                    self.extra_attributes.append(attr)
                else:
                    tag = self.taginfo_toxml(taginfo)
                    self.refines_metadata.append(tag)
            else:
                tag = self.taginfo_toxml(taginfo)
                self.extra_metadata.append(tag)
272 |
--------------------------------------------------------------------------------
/lib/mobi_nav.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import unicode_str
8 | import os
9 | from .unipath import pathof
10 |
11 | import re
12 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
13 | # but u"" is not allowed for the pattern itself only b""
14 |
# Module tuning flags for navigation-document generation.
DEBUG_NAV = False

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

# NOTE(review): "FINENAME" looks like a typo for "FILENAME"; kept as-is
# because other modules may reference this constant by name.
NAVIGATION_FINENAME = 'nav.xhtml'
""" The name for the navigation document. """

DEFAULT_TITLE = 'Navigation'
""" The default title for the navigation document. """
25 |
class NAVProcessor(object):
    """Builds the EPUB3 navigation document (nav.xhtml) from parsed NCX
    entries plus the mobi guide text.

    NOTE(review): several markup string literals in this class appear to
    have lost their tags in transit (empty templates, format strings with
    fewer placeholders than arguments); confirm against the upstream
    source before relying on the generated markup.
    """

    def __init__(self, files):
        # files: object exposing the output layout (k8text / k8oebps paths)
        self.files = files
        self.navname = NAVIGATION_FINENAME

    def buildLandmarks(self, guidetext):
        """Convert guide reference entries to the nav landmarks section.

        Returns '' when no reference yields a usable (type, title, href).
        """
        header = ''
        header += ' \n'
        header += ' Guide \n'
        header += ' \n'
        element = ' {:s} \n'
        footer = ''
        footer += ' \n'
        footer += ' \n'

        # maps guide reference types to epub:type landmark names;
        # None means the type has no epub3 equivalent and is dropped
        type_map = {
            'cover' : 'cover',
            'title-page' : 'title-page',
            # ?: 'frontmatter',
            'text' : 'bodymatter',
            # ?: 'backmatter',
            'toc' : 'toc',
            'loi' : 'loi',
            'lot' : 'lot',
            'preface' : 'preface',
            'bibliography' : 'bibliography',
            'index' : 'index',
            'glossary' : 'glossary',
            'acknowledgements' : 'acknowledgements',
            'colophon' : None,
            'copyright-page' : None,
            'dedication' : None,
            'epigraph' : None,
            'foreword' : None,
            'notes' : None
        }

        re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
        re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
        re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
        # hrefs in the guide are relative to k8oebps; links in nav.xhtml must
        # be relative to the text directory
        dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')

        data = ''
        # NOTE(review): this findall pattern is empty -- it looks like the
        # original reference-matching pattern was stripped; verify upstream
        references = re.findall(r'', unicode_str(guidetext), re.I)
        for reference in references:
            mo_type = re_type.search(reference)
            mo_title = re_title.search(reference)
            mo_link = re_link.search(reference)
            if mo_type is not None:
                type_ = type_map.get(mo_type.group(1), None)
            else:
                type_ = None
            if mo_title is not None:
                title = mo_title.group(1)
            else:
                title = None
            if mo_link is not None:
                link = mo_link.group(1)
            else:
                link = None

            if type_ is not None and title is not None and link is not None:
                link = os.path.relpath(link, dir_).replace('\\', '/')
                data += element.format(type_, link, title)
        if len(data) > 0:
            return header + data + footer
        else:
            return ''

    def buildTOC(self, indx_data):
        """Render the parsed NCX entry list as the nav toc section."""
        header = ''
        header += ' \n'
        header += ' Table of contents \n'
        footer = ' \n'

        # recursive part: walks one heading level, recursing into children
        # via each entry's child1/childn range; returns (xhtml, max_lvl, num)
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): callers unpack a 3-tuple, so this '' return
                # would raise on a malformed index -- confirm intent upstream
                return ''
            if DEBUG_NAV:
                print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
            xhtml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl

            indent1 = ' ' * (2 + lvl * 2)
            indent2 = ' ' * (3 + lvl * 2)
            xhtml += indent1 + '\n'
            for i in range(start, end):
                e = indx_data[i]
                htmlfile = e['filename']
                desttag = e['idtag']
                text = e['text']
                # only emit entries belonging to the current heading level
                if not e['hlvl'] == lvl:
                    continue
                num += 1
                if desttag == '':
                    link = htmlfile
                else:
                    link = '{:s}#{:s}'.format(htmlfile, desttag)
                xhtml += indent2 + ''
                entry = '{:s} '.format(link, text)
                xhtml += entry
                # recurs
                if e['child1'] >= 0:
                    xhtml += '\n'
                    xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                        e['child1'], e['childn'] + 1)
                    xhtml += xhtmlrec
                    xhtml += indent2
                # close entry
                xhtml += ' \n'
            xhtml += indent1 + ' \n'
            return xhtml, max_lvl, num

        data, max_lvl, num = recursINDX()
        if not len(indx_data) == num:
            print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
        return header + data + footer

    def buildNAV(self, ncx_data, guidetext, title, lang):
        """Assemble the complete nav.xhtml text: header + landmarks + toc + footer."""
        print("Building Navigation Document.")
        if FORCE_DEFAULT_TITLE:
            title = DEFAULT_TITLE
        nav_header = ''
        nav_header += '\n'
        nav_header += '\n'.format(lang)
        nav_header += '\n{:s} \n'.format(title)
        nav_header += ' \n'
        nav_header += '\n\n\n'
        nav_footer = '\n\n'

        landmarks = self.buildLandmarks(guidetext)
        toc = self.buildTOC(ncx_data)

        data = nav_header
        data += landmarks
        data += toc
        data += nav_footer
        return data

    def getNAVName(self):
        """Return the filename used for the navigation document."""
        return self.navname

    def writeNAV(self, ncx_data, guidetext, metadata):
        # build the xhtml
        # print("Write Navigation Document.")
        xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
        fname = os.path.join(self.files.k8text, self.navname)
        with open(pathof(fname), 'wb') as f:
            f.write(xhtml.encode('utf-8'))
188 |
--------------------------------------------------------------------------------
/lib/mobi_ncx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import os
8 | from .unipath import pathof
9 | from .compatibility_utils import unescapeit
10 |
11 |
12 | import re
13 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
14 | # but u"" is not allowed for the pattern itself only b""
15 |
16 | from xml.sax.saxutils import escape as xmlescape
17 |
18 | from .mobi_utils import toBase32
19 | from .mobi_index import MobiIndex
20 |
21 | DEBUG_NCX = False
22 |
class ncxExtract:
    """Extracts the NCX (table of contents) index from a mobi file and
    serializes it as a toc.ncx document, in both mobi7 and K8 flavours.

    NOTE(review): the triple-quoted ncx template literals below appear to
    have lost their XML tags in transit (mostly blank lines, '%' templates
    with fewer placeholders than arguments); confirm against the upstream
    source before relying on the generated documents.
    """

    def __init__(self, mh, files):
        # mh: parsed mobi header (provides sect, ncxidx, codec)
        self.mh = mh
        self.sect = self.mh.sect
        self.files = files
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        self.indx_data = None

    def parseNCX(self):
        """Decode the NCX INDX section into a list of entry dicts.

        Each dict carries position/length, display text, kind, heading
        level and parent/child links.  The list is cached on self.indx_data
        and returned; it is empty when no NCX index exists (0xffffffff).
        """
        indx_data = []
        # INDX tag id -> [result field name, slot within the tag's values]
        tag_fieldname_map = {
            1: ['pos',0],
            2: ['len',0],
            3: ['noffs',0],
            4: ['hlvl',0],
            5: ['koffs',0],
            6: ['pos_fid',0],
            21: ['parent',0],
            22: ['child1',0],
            23: ['childn',0]
        }
        if self.ncxidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                print(ctoc_text)
                print(outtbl)
            num = 0
            for [text, tagMap] in outtbl:
                # defaults used when a tag is absent from this entry
                tmp = {
                    'name': text.decode('utf-8'),
                    'pos': -1,
                    'len': 0,
                    'noffs': -1,
                    'text' : "Unknown Text",
                    'hlvl' : -1,
                    'kind' : "Unknown Kind",
                    'pos_fid' : None,
                    'parent' : -1,
                    'child1' : -1,
                    'childn' : -1,
                    'num' : num
                }
                for tag in tag_fieldname_map:
                    [fieldname, i] = tag_fieldname_map[tag]
                    if tag in tagMap:
                        fieldvalue = tagMap[tag][i]
                        if tag == 6:
                            # tag 6 is a (fid, offset) pair -> kindle position url
                            pos_fid = toBase32(fieldvalue,4).decode('utf-8')
                            fieldvalue2 = tagMap[tag][i+1]
                            pos_off = toBase32(fieldvalue2,10).decode('utf-8')
                            fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
                        tmp[fieldname] = fieldvalue
                        if tag == 3:
                            # tag 3 value is an offset into the CTOC text block
                            toctext = ctoc_text.get(fieldvalue, b'Unknown Text')
                            toctext = toctext.decode(self.mh.codec)
                            tmp['text'] = toctext
                        if tag == 5:
                            # tag 5 value is a CTOC offset for the entry kind
                            kindtext = ctoc_text.get(fieldvalue, b'Unknown Kind')
                            kindtext = kindtext.decode(self.mh.codec)
                            tmp['kind'] = kindtext
                indx_data.append(tmp)
                if DEBUG_NCX:
                    print("record number: ", num)
                    print("name: ", tmp['name'],)
                    print("position", tmp['pos']," length: ", tmp['len'])
                    print("text: ", tmp['text'])
                    print("kind: ", tmp['kind'])
                    print("heading level: ", tmp['hlvl'])
                    print("parent:", tmp['parent'])
                    print("first child: ",tmp['child1']," last child: ", tmp['childn'])
                    print("pos_fid is ", tmp['pos_fid'])
                    print("\n\n")
                num += 1
        self.indx_data = indx_data
        return indx_data

    def buildNCX(self, htmlfile, title, ident, lang):
        """Build a mobi7 toc.ncx document string; links use filepos anchors
        into the single html file."""
        indx_data = self.indx_data

        ncx_header = \
'''









%s


'''

        ncx_footer = \
'''

'''

        ncx_entry = \
'''

%s

'''

        # recursive part: emits the navPoints of one heading level and
        # recurses into children; returns (xml, max_lvl, num)
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning: missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): callers unpack a 3-tuple; this '' return
                # would raise for a malformed index -- confirm upstream
                return ''
            if DEBUG_NCX:
                print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = ' ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                if not e['hlvl'] == lvl:
                    continue
                # open entry
                num += 1
                link = '%s#filepos%d' % (htmlfile, e['pos'])
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                # recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                      e['child1'], e['childn'] + 1)
                    xml += xmlrec
                # close entry
                xml += indent + ' \n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title)))
        ncx = header + body + ncx_footer
        if not len(indx_data) == num:
            print("Warning: different number of entries in NCX", len(indx_data), num)
        return ncx

    def writeNCX(self, metadata):
        """Build the mobi7 ncx from cached indx_data and write toc.ncx."""
        # build the xml
        self.isNCX = True
        print("Write ncx")
        # htmlname = os.path.basename(self.files.outbase)
        # htmlname += '.html'
        htmlname = 'book.html'
        xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        # write the ncx file
        # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
        ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
        with open(pathof(ncxname), 'wb') as f:
            f.write(xml.encode('utf-8'))

    def buildK8NCX(self, indx_data, title, ident, lang):
        """Build a K8 (mobi8/epub) toc.ncx; links point at Text/<file>#<id>."""
        ncx_header = \
'''









%s


'''

        ncx_footer = \
'''

'''

        ncx_entry = \
'''

%s

'''

        # recursive part: same shape as buildNCX's walker but links into
        # the per-file Text/ layout of a K8 book
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning: missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): see buildNCX -- '' return vs 3-tuple unpack
                return ''
            if DEBUG_NCX:
                print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = ' ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                htmlfile = e['filename']
                desttag = e['idtag']
                if not e['hlvl'] == lvl:
                    continue
                # open entry
                num += 1
                if desttag == '':
                    link = 'Text/%s' % htmlfile
                else:
                    link = 'Text/%s#%s' % (htmlfile, desttag)
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                # recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                      e['child1'], e['childn'] + 1)
                    xml += xmlrec
                # close entry
                xml += indent + ' \n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title)))
        ncx = header + body + ncx_footer
        if not len(indx_data) == num:
            print("Warning: different number of entries in NCX", len(indx_data), num)
        return ncx

    def writeK8NCX(self, ncx_data, metadata):
        """Build the K8 ncx from the supplied entry list and write toc.ncx."""
        # build the xml
        self.isNCX = True
        print("Write K8 ncx")
        xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        bname = 'toc.ncx'
        ncxname = os.path.join(self.files.k8oebps,bname)
        with open(pathof(ncxname), 'wb') as f:
            f.write(xml.encode('utf-8'))
276 |
--------------------------------------------------------------------------------
/lib/mobi_pagemap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, unicode_str
8 |
# Py2 compatibility: substitute the lazy xrange so range behaves as on Py3.
if PY2:
    range = xrange
11 |
12 | import struct
13 | # note: struct pack, unpack, unpack_from all require bytestring format
14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
15 |
16 | import re
17 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
18 | # but u"" is not allowed for the pattern itself only b""
19 |
20 |
# Conversion table shared by both converters, ordered largest-to-smallest so
# a greedy scan always consumes the biggest numeral (including the
# subtractive pairs cm, cd, xc, xl, ix, iv) first.
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]

def int_to_roman(i):
    """Render an integer as a lowercase roman numeral by greedy subtraction."""
    numeral = ''
    left = i
    for glyph, worth in _TABLE:
        while worth <= left:
            left -= worth
            numeral += glyph
    return numeral

def roman_to_int(s):
    """Parse a lowercase roman numeral string back into an integer."""
    total = 0
    rest = s
    for glyph, worth in _TABLE:
        while rest.startswith(glyph):
            total += worth
            rest = rest[len(glyph):]
    return total
40 |
# Matches one parenthesized tuple such as "(12,r,3)" inside the page-name
# map specification string.
_pattern = r'''\(([^\)]*)\)'''
_tup_pattern = re.compile(_pattern,re.IGNORECASE)
43 |
44 |
def _parseNames(numpages, data):
    """Expand a page-map naming spec into one display name per page.

    data holds parenthesized tuples "(startpos,type,value)" where type is
    'r' (roman numerals counting up from value), 'a' (arabic numbers
    counting up) or 'c' (explicit '|'-separated names).  Each tuple
    (re)numbers pages from startpos to the last page, so later tuples
    override earlier ones.  Returns (pagenames, pageMap) where pageMap is
    the normalized "(..),(..)" spec string.
    """
    data = unicode_str(data)
    pagenames = [None] * numpages
    pageMap = ''
    for m in re.finditer(_tup_pattern, data):
        tup = m.group(1)
        if pageMap != '':
            pageMap += ','
        pageMap += '(' + tup + ')'
        spos, nametype, svalue = tup.split(",")
        # print(spos, nametype, svalue)
        if nametype not in ('a', 'r', 'c'):
            # previously an unknown type printed an error once per page and
            # then read an unbound/stale pname; report once and skip the tuple
            print("Error: unknown page numbering type", nametype)
            continue
        if nametype == 'a' or nametype == 'r':
            svalue = int(svalue)
        spos = int(spos)
        for i in range(spos - 1, numpages):
            if nametype == 'r':
                pname = int_to_roman(svalue)
                svalue += 1
            elif nametype == 'a':
                pname = "%s" % svalue
                svalue += 1
            else:
                # 'c': take names from the '|'-separated list; the last
                # name repeats for any remaining pages
                sp = svalue.find('|')
                if sp == -1:
                    pname = svalue
                else:
                    pname = svalue[0:sp]
                    svalue = svalue[sp+1:]
            pagenames[i] = pname
    return pagenames, pageMap
79 |
80 |
class PageMapProcessor:
    """Parses a mobi page-map record into per-page names and offsets, and
    can emit the KF8 page-map.xml and Amazon .apnx representations."""

    def __init__(self, mh, data):
        """Decode the raw page-map record *data*.

        Layout (as read below): a u32 revision-string length at 0x10, the
        revision string, then four u16 fields (unknown, name-spec length,
        page count, offset width in bits), the name spec, and finally one
        offset per page (u32, or u16 when pm_bits == 16).
        """
        self.mh = mh
        self.data = data
        self.pagenames = []
        self.pageoffsets = []
        self.pageMap = ''
        self.pm_len = 0
        self.pm_nn = 0
        # NOTE(review): initialized as pn_bits but assigned below as
        # pm_bits -- looks like a typo; harmless but confirm no caller
        # reads pn_bits
        self.pn_bits = 0
        self.pmoff = None
        self.pmstr = ''
        print("Extracting Page Map Information")
        rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
        # skip over header, revision string length data, and revision string
        ptr = 0x14 + rev_len
        pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
        # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
        self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
        self.pmoff = self.data[ptr+8+self.pm_len:]
        offsize = b">L"
        offwidth = 4
        if self.pm_bits == 16:
            offsize = b">H"
            offwidth = 2
        ptr = 0
        for i in range(self.pm_nn):
            od, = struct.unpack_from(offsize, self.pmoff, ptr)
            ptr += offwidth
            self.pageoffsets.append(od)
        self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)

    def getPageMap(self):
        """Return the normalized page-name spec string."""
        return self.pageMap

    def getNames(self):
        """Return the per-page display names (entries may be None)."""
        return self.pagenames

    def getOffsets(self):
        """Return the per-page text offsets."""
        return self.pageoffsets

    # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
    def generateKF8PageMapXML(self, k8proc):
        """Build the page-map.xml document for a KF8 (mobi8) book.

        NOTE(review): the template literals here look tag-stripped (a '%'
        format receives three arguments but shows no placeholders); verify
        against the upstream source.
        """
        pagemapxml = '\n'
        for i in range(len(self.pagenames)):
            pos = self.pageoffsets[i]
            name = self.pagenames[i]
            if name is not None and name != "":
                # resolve the raw offset to a skeleton file and an id anchor
                [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
                idtext = unicode_str(k8proc.getPageIDTag(pos))
                linktgt = unicode_str(filename)
                if idtext != '':
                    linktgt += '#' + idtext
                pagemapxml += ' \n' % (name, dir, linktgt)
        pagemapxml += " \n"
        return pagemapxml

    def generateAPNX(self, apnx_meta):
        """Build an Amazon .apnx page-number sidecar from apnx_meta and the
        parsed page offsets; returns the complete file as bytes."""
        if apnx_meta['format'] == 'MOBI_8':
            content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
        else:
            # MOBI_7 sidecars omit format/acr fields
            content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
        content_header = content_header.encode('utf-8')
        page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
        page_header = page_header.encode('utf-8')
        # container: version fields, the two json headers with explicit
        # lengths, then one big-endian u32 offset per page
        apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
        apnx += struct.pack(b'>I', 12 + len(content_header))
        apnx += struct.pack(b'>I', len(content_header))
        apnx += content_header
        apnx += struct.pack(b'>H', 1)
        apnx += struct.pack(b'>H', len(page_header))
        apnx += struct.pack(b'>H', self.pm_nn)
        apnx += struct.pack(b'>H', 32)
        apnx += page_header
        for page in self.pageoffsets:
            apnx += struct.pack(b'>L', page)
        return apnx
159 |
--------------------------------------------------------------------------------
/lib/mobi_sectioner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
8 |
9 | import datetime
10 |
# Py2 compatibility: substitute the lazy xrange so range behaves as on Py3.
if PY2:
    range = xrange
13 |
14 | # note: struct pack, unpack, unpack_from all require bytestring format
15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
16 | import struct
17 |
18 | from .unipath import pathof
19 |
# Debug switch for verbose dumping.
DUMP = False
""" Set to True to dump all possible information. """
22 |
class unpackException(Exception):
    """Error type raised by the unpack code when a file cannot be processed."""
    pass
25 |
26 |
def describe(data):
    """Summarize a byte string as '"printable-text" 0xhex'.

    Bytes outside the printable ascii range 32..127 are shown as '?'.
    """
    hexdump = hexlify(data)
    chars = []
    for byte in data:
        code = bord(byte)
        if 32 <= code <= 127:
            chars.append(bchar(byte).decode('latin-1'))
        else:
            chars.append('?')
    return '"' + ''.join(chars) + '"' + ' 0x' + hexdump
36 |
def datetimefrompalmtime(palmtime):
    """Convert a PalmOS timestamp to a naive datetime.

    Values above 0x7FFFFFFF are seconds since the Palm epoch (1904-01-01);
    smaller values are seconds since the Unix epoch (1970-01-01).
    """
    epoch_year = 1904 if palmtime > 0x7FFFFFFF else 1970
    return datetime.datetime(year=epoch_year, month=1, day=1) + datetime.timedelta(seconds=palmtime)
43 |
44 |
class Sectionizer:
    """Loads a PalmDB container (.mobi/.prc) and exposes its 78-byte palm
    header plus the individual sections (records) it contains."""

    def __init__(self, filename):
        """Read the entire file and parse the palm header and record table."""
        self.data = b''
        with open(pathof(filename), 'rb') as f:
            self.data = f.read()
        self.palmheader = self.data[:78]
        self.palmname = self.data[:32]
        # 8 bytes at 0x3C cover the type/creator pair (e.g. b'BOOKMOBI')
        self.ident = self.palmheader[0x3C:0x3C+8]
        self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
        self.filelength = len(self.data)
        # record table holds (offset, attributes) pairs; append a sentinel
        # (file length, 0) so section i spans offsets[i]:offsets[i+1]
        sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
        self.sectionoffsets = sectionsdata[::2]
        self.sectionattributes = sectionsdata[1::2]
        self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
        self.sectiondescriptions[-1] = "File Length Only"
        return

    def dumpsectionsinfo(self):
        """Print a table of every section: offset, length, uid, attributes."""
        print("Section Offset Length UID Attribs Description")
        for i in range(self.num_sections):
            # low 24 bits of the attribute word are the uid, high 8 the flags
            print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
                i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
        print("%3d %3X 0x%07X %s" %
              (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))

    def setsectiondescription(self, section, description):
        """Attach a human-readable description to a section for dumps."""
        if section < len(self.sectiondescriptions):
            self.sectiondescriptions[section] = description
        else:
            print("Section out of range: %d, description %s" % (section,description))

    def dumppalmheader(self):
        """Print every field of the 78-byte Palm Database header."""
        print("Palm Database Header")
        print("Database name: " + repr(self.palmheader[:32]))
        dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
        print("Bitfield attributes: 0x%0X" % dbattributes,)
        if dbattributes != 0:
            # decode the individual attribute flag bits
            print(" (",)
            if (dbattributes & 2):
                print("Read-only; ",)
            if (dbattributes & 4):
                print("Dirty AppInfoArea; ",)
            if (dbattributes & 8):
                print("Needs to be backed up; ",)
            if (dbattributes & 16):
                print("OK to install over newer; ",)
            if (dbattributes & 32):
                print("Reset after installation; ",)
            if (dbattributes & 64):
                print("No copying by PalmPilot beaming; ",)
            print(")")
        else:
            print("")
        print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
        dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
        print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
        dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
        print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
        dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
        if dbbackup != 0:
            print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
        print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
        print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
        print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
        print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
        print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
        expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
        if expectedzero != 0:
            print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
        print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
        return

    def loadSection(self, section):
        """Return the raw bytes of one section (uses the sentinel for the last)."""
        before, after = self.sectionoffsets[section:section+2]
        return self.data[before:after]
121 |
--------------------------------------------------------------------------------
/lib/mobi_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import struct
8 | # note: struct pack, unpack, unpack_from all require bytestring format
9 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
10 |
11 | from .unipath import pathof
12 |
13 |
# Byte offsets into the PalmDB container and MOBI record 0 used by the
# section-editing helpers below.

# important pdb header offsets
unique_id_seed = 68
number_of_pdb_records = 76

# important palmdoc header offsets
book_length = 4
book_record_count = 8
first_pdb_record = 78

# important rec0 offsets
length_of_book = 4
mobi_header_base = 16
mobi_header_length = 20
mobi_type = 24
mobi_version = 36
first_non_text = 80
title_offset = 84
first_resc_record = 108
first_content_index = 192
last_content_index = 194
kf8_fdst_index = 192  # for KF8 mobi headers
fcis_index = 200
flis_index = 208
srcs_index = 224
srcs_count = 228
primary_index = 244
datp_index = 256
huffoff = 112
hufftbloff = 120
43 |
def getint(datain, ofs, sz=b'L'):
    """Read one big-endian unsigned integer (struct code *sz*) at offset *ofs*."""
    value = struct.unpack_from(b'>' + sz, datain, ofs)[0]
    return value
47 |
def writeint(datain, ofs, n, len=b'L'):
    """Return a copy of datain with n stored big-endian at ofs.

    b'L' (the default) writes 4 bytes; any other code writes 2 bytes.
    (Parameter name 'len' kept for caller compatibility.)
    """
    width, code = (4, b'>L') if len == b'L' else (2, b'>H')
    return datain[:ofs] + struct.pack(code, n) + datain[ofs + width:]
53 |
54 | def getsecaddr(datain,secno):
55 | nsec = getint(datain,number_of_pdb_records,b'H')
56 | assert secno>=0 & secnoL',2*nsec+1))
78 | datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
79 | datalst.append(struct.pack(b'>H',nsec))
80 | newstart = zerosecstart
81 | for i in range(0,secno):
82 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
83 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
84 | datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
85 | for i in range(secno+1,nsec):
86 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
87 | ofs = ofs + dif
88 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
89 | lpad = newstart - (first_pdb_record + 8*nsec)
90 | if lpad > 0:
91 | datalst.append(b'\0' * lpad)
92 | datalst.append(datain[zerosecstart:secstart])
93 | datalst.append(secdata)
94 | datalst.append(datain[secend:])
95 | dataout = b''.join(datalst)
96 | return dataout
97 |
def nullsection(datain,secno):   # make it zero-length without deleting it
    """Return a copy of the PalmDB with section *secno* emptied.

    The record count is unchanged; the section keeps its slot but becomes
    zero-length, and every later section's offset shifts down by its size.
    """
    datalst = []
    nsec = getint(datain,number_of_pdb_records,b'H')
    secstart, secend = getsecaddr(datain,secno)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    dif = secend-secstart
    datalst.append(datain[:first_pdb_record])
    # records up to and including secno keep their offsets
    for i in range(0,secno+1):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # later records shift back by the removed payload size
    for i in range(secno+1, nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs - dif
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    # preserve any gap between the record table and the first section
    lpad = zerosecstart - (first_pdb_record + 8*nsec)
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart: secstart])
    datalst.append(datain[secend:])
    dataout = b''.join(datalst)
    return dataout
119 |
def deletesectionrange(datain,firstsec,lastsec):  # delete a range of sections
    """Return a copy of the PalmDB with sections firstsec..lastsec removed.

    Both the section payloads and their 8-byte record-table entries are
    deleted, so all surviving offsets and uids are renumbered.
    """
    datalst = []
    firstsecstart,firstsecend = getsecaddr(datain,firstsec)
    lastsecstart,lastsecend = getsecaddr(datain,lastsec)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    # total shrinkage: deleted payload plus the deleted record entries
    dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
    nsec = getint(datain,number_of_pdb_records,b'H')
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
    newstart = zerosecstart - 8*(lastsec-firstsec+1)
    # sections before the range only shift by the shrunken record table
    for i in range(0,firstsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs-8*(lastsec-firstsec+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # sections after the range shift by the full difference and are renumbered
    for i in range(lastsec+1,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs - dif
        flgval = 2*(i-(lastsec-firstsec+1))
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:firstsecstart])
    datalst.append(datain[lastsecend:])
    dataout = b''.join(datalst)
    return dataout
148 |
def insertsection(datain,secno,secdata):  # insert a new section
    """Return a copy of the PalmDB with *secdata* inserted as section *secno*.

    Adds one 8-byte record-table entry (shifting every payload by 8) and
    splices the new payload in before the old section secno.
    """
    datalst = []
    nsec = getint(datain,number_of_pdb_records,b'H')
    # print("inserting secno" , secno, "into" ,nsec, "sections")
    secstart,secend = getsecaddr(datain,secno)
    zerosecstart,zerosecend = getsecaddr(datain,0)
    dif = len(secdata)
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+1))
    newstart = zerosecstart + 8
    # earlier sections only shift by the extra record-table entry
    for i in range(0,secno):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs += 8
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # the new record entry for the inserted section
    datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
    # later sections shift by the new payload plus the table entry, renumbered
    for i in range(secno,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs + dif + 8
        flgval = 2*(i+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec + 1))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:secstart])
    datalst.append(secdata)
    datalst.append(datain[secstart:])
    dataout = b''.join(datalst)
    return dataout
179 |
180 |
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec):  # insert a range of sections
    """Copy sections firstsec..lastsec of *sectionsource* into *sectiontarget*
    starting at slot *targetsec*, returning the rebuilt target bytes.

    Equivalent to repeated insertsection calls (see the commented-out
    reference implementation) but rebuilds the record table in one pass.
    """
    # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
    # dataout = sectiontarget
    # for idx in range(lastsec,firstsec-1,-1):
    #     dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
    # return dataout
    datalst = []
    nsec = getint(sectiontarget,number_of_pdb_records,b'H')
    zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
    insstart, nul = getsecaddr(sectiontarget,targetsec)
    nins = lastsec - firstsec + 1
    srcstart, nul = getsecaddr(sectionsource,firstsec)
    nul, srcend = getsecaddr(sectionsource,lastsec)
    newstart = zerosecstart + 8*nins

    datalst.append(sectiontarget[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
    datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+nins))
    # target sections before the insertion point shift by the grown table
    for i in range(0,targetsec):
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + 8*nins
        flgvalnew = flgval
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    # record entries for the copied sections, offsets relative to insstart
    srcstart0, nul = getsecaddr(sectionsource,firstsec)
    for i in range(nins):
        isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
        ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
        flgvalnew = 2*(targetsec+i)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew)
    dif = srcend - srcstart
    # remaining target sections shift by the copied payload plus the table
    for i in range(targetsec,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + dif + 8*nins
        flgvalnew = 2*(i+nins)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    lpad = newstart - (first_pdb_record + 8*(nsec + nins))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(sectiontarget[zerosecstart:insstart])
    datalst.append(sectionsource[srcstart:srcend])
    datalst.append(sectiontarget[insstart:])
    dataout = b''.join(datalst)
    return dataout
228 |
def get_exth_params(rec0):
    """Locate the EXTH block inside MOBI header record 0.

    Returns a tuple (base offset of the EXTH header, total EXTH block
    length, number of EXTH records).
    """
    exth_base = mobi_header_base + getint(rec0, mobi_header_length)
    exth_length = getint(rec0, exth_base + 4)
    exth_count = getint(rec0, exth_base + 8)
    return exth_base, exth_length, exth_count
234 |
def add_exth(rec0,exth_num,exth_bytes):
    """Append a new EXTH record (id *exth_num*, payload *exth_bytes*).

    The EXTH length and record-count fields grow accordingly, and the
    stored full-title offset is shifted by the added size so it still
    points at the title text.  Returns the new record-0 bytestring.
    """
    ebase, elen, enum = get_exth_params(rec0)
    added = 8 + len(exth_bytes)  # 4-byte id + 4-byte size + payload
    pieces = [
        rec0[0:ebase+4],
        struct.pack(b'>L', elen + added),   # new EXTH block length
        struct.pack(b'>L', enum + 1),       # new EXTH record count
        struct.pack(b'>L', exth_num),
        struct.pack(b'>L', added),
        exth_bytes,
        rec0[ebase+12:],
    ]
    updated = b''.join(pieces)
    return writeint(updated, title_offset, getint(updated, title_offset) + added)
242 |
def read_exth(rec0,exth_num):
    """Collect the payloads of every EXTH record whose id is *exth_num*.

    Returns a (possibly empty) list of payload bytestrings — duplicates of
    the same EXTH id are legal, so all matches are gathered.
    """
    ebase, _elen, enum = get_exth_params(rec0)
    matches = []
    pos = ebase + 12  # first EXTH record follows the 12-byte EXTH header
    for _ in range(enum):
        rec_id = getint(rec0, pos)
        rec_size = getint(rec0, pos + 4)
        if rec_id == exth_num:
            matches.append(rec0[pos+8:pos+rec_size])
        pos += rec_size
    return matches
255 |
def write_exth(rec0,exth_num,exth_bytes):
    """Replace the payload of the first EXTH record whose id is *exth_num*.

    Returns a new record-0 bytestring; if no record matches, *rec0* is
    returned unchanged.  The stored full-title offset is shifted by the
    size difference so it still points at the title text.
    """
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = enum
    while enum_idx>0:
        exth_id = getint(rec0,ebase_idx)
        if exth_id == exth_num:
            # size delta between the new payload and the existing record
            dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
            newrec0 = rec0
            if dif != 0:
                # the title text sits after the EXTH block; move its offset
                newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
            # stitch together: header with updated EXTH length, unchanged
            # record count, records before the match (slice ends after the
            # matched record's id word), the new size+payload, then
            # everything after the old record
            return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
                struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
                struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
                rec0[ebase_idx+getint(rec0,ebase_idx+4):]
        enum_idx = enum_idx-1
        ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
    return rec0
274 |
def del_exth(rec0,exth_num):
    """Delete the first EXTH record whose id is *exth_num*.

    Returns a new record-0 bytestring with the EXTH length/count header
    fields and the stored full-title offset adjusted; returns *rec0*
    unchanged when no record matches.
    """
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = 0
    while enum_idx < enum:
        exth_id = getint(rec0,ebase_idx)
        exth_size = getint(rec0,ebase_idx+4)
        if exth_id == exth_num:
            newrec0 = rec0
            # the title text follows the EXTH block, so its offset shrinks
            newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
            # cut the record out, then rewrite the EXTH length and count
            newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
            newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
            return newrec0
        enum_idx += 1
        ebase_idx = ebase_idx+exth_size
    return rec0
291 |
292 |
class mobi_split:
    """Split a combined MOBI6+KF8 ("combo") ebook into standalone parts.

    On construction the input file is read; when it is a combo file two new
    PDB images are built: result_file7 (MOBI6/MOBI7-only) and result_file8
    (standalone KF8/MOBI8).  self.combo is False when the input is not a
    combo file, in which case neither result is produced.
    """

    def __init__(self, infile):
        # infile: path to the .mobi/.azw file to split
        datain = b''
        with open(pathof(infile), 'rb') as f:
            datain = f.read()
        datain_rec0 = readsection(datain,0)
        ver = getint(datain_rec0,mobi_version)
        # a bare mobi8 file (version 8) needs no splitting
        self.combo = (ver!=8)
        if not self.combo:
            return
        # EXTH 121 holds the KF8 boundary record number
        exth121 = read_exth(datain_rec0,121)
        if len(exth121) == 0:
            self.combo = False
            return
        else:
            # only pay attention to first exth121
            # (there should only be one)
            datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
            if datain_kf8 == 0xffffffff:
                self.combo = False
                return
        datain_kfrec0 =readsection(datain,datain_kf8)

        # create the standalone mobi7
        num_sec = getint(datain,number_of_pdb_records,b'H')
        # remove BOUNDARY up to but not including ELF record
        self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
        # check if there are SRCS records and delete them
        srcs = getint(datain_rec0,srcs_index)
        num_srcs = getint(datain_rec0,srcs_count)
        if srcs != 0xffffffff and num_srcs > 0:
            self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
            datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
            datain_rec0 = writeint(datain_rec0,srcs_count,0)
        # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
        datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
        # datain_rec0 = del_exth(datain_rec0,121)
        # datain_rec0 = del_exth(datain_rec0,534)
        # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
        # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
        datain_rec0 = write_exth(datain_rec0,129, b'')
        # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
        # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
        fval = fval & 0x07FF
        datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]

        self.result_file7 = writesection(self.result_file7,0,datain_rec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(datain, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)

        firstimage = getint(datain_rec0,first_resc_record)
        lastimage = getint(datain_rec0,last_content_index,b'H')
        # print("Old First Image, last Image", firstimage,lastimage)
        if lastimage == 0xffff:
            # find the lowest of the next sections and copy up to that.
            ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
            for ofs,sz in ofs_list:
                n = getint(datain_rec0,ofs,sz)
                # print("n",n)
                if n > 0 and n < lastimage:
                    lastimage = n-1
        print("First Image, last Image", firstimage,lastimage)

        # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
        # NOTE(review): range(firstimage,lastimage) never inspects section
        # `lastimage` itself — confirm against upstream whether the upper
        # bound should be lastimage+1
        for i in range(firstimage,lastimage):
            imgsec = readsection(self.result_file7,i)
            if imgsec[0:4] in [b'RESC',b'FONT']:
                self.result_file7 = nullsection(self.result_file7,i)

        # mobi7 finished

        # create standalone mobi8
        self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
        target = getint(datain_kfrec0,first_resc_record)
        self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
        datain_kfrec0 =readsection(self.result_file8,0)

        # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
        kf8starts = read_exth(datain_kfrec0,116)
        # If we have multiple StartOffset, keep only the last one
        kf8start_count = len(kf8starts)
        while kf8start_count > 1:
            kf8start_count -= 1
            datain_kfrec0 = del_exth(datain_kfrec0,116)

        # update the EXTH 125 KF8 Count of Images/Fonts/Resources
        datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # standalone mobi8 with exth: 0x0050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        # NOTE(review): '>L' below is the only non-bytes struct format in
        # this file (everywhere else uses b'>L') — harmless, but inconsistent
        fval, = struct.unpack_from('>L',datain_kfrec0, 0x80)
        fval = fval & 0x1FFF
        fval |= 0x0800
        datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]

        # properly update other index pointers that have been shifted by the insertion of images
        ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
        for ofs,sz in ofs_list:
            n = getint(datain_kfrec0,ofs,sz)
            if n != 0xffffffff:
                datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
        self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(self.result_file8, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)

        # mobi8 finished

    def getResult8(self):
        # standalone KF8 image (only meaningful when self.combo was True)
        return self.result_file8

    def getResult7(self):
        # standalone MOBI7 image (only meaningful when self.combo was True)
        return self.result_file7
439 |
--------------------------------------------------------------------------------
/lib/mobi_uncompress.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, bchr, lmap, bstr
8 |
9 | if PY2:
10 | range = xrange
11 |
12 | import struct
13 | # note: struct pack, unpack, unpack_from all require bytestring format
14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
15 |
16 |
class unpackException(Exception):
    """Raised when compressed MOBI data or its HUFF/CDIC tables are malformed."""
19 |
class UncompressedReader:
    """Reader for compression type 1: section data is stored verbatim."""

    def unpack(self, data):
        """Return *data* unchanged (no compression to undo)."""
        return data
24 |
class PalmdocReader:
    """Decompressor for PalmDoc (compression type 2) LZ77-style sections."""

    def unpack(self, i):
        """Decompress bytestring *i* and return the expanded bytestring."""
        o, p = b'', 0
        while p < len(i):
            # for python 3 must use slice since i[p] returns int while slice returns character
            c = ord(i[p:p+1])
            p += 1
            if (c >= 1 and c <= 8):
                # 0x01-0x08: copy the next c bytes through literally
                o += i[p:p+c]
                p += c
            elif (c < 128):
                # 0x00, 0x09-0x7f: single literal byte
                o += bchr(c)
            elif (c >= 192):
                # 0xc0-0xff: a space followed by the byte with bit 7 cleared
                o += b' ' + bchr(c ^ 128)
            else:
                # 0x80-0xbf: two-byte pair encoding an 11-bit back-distance
                # m and a 3-bit length n (plus 3)
                if p < len(i):
                    c = (c << 8) | ord(i[p:p+1])
                    p += 1
                    m = (c >> 3) & 0x07ff
                    n = (c & 7) + 3
                    if (m > n):
                        # no overlap with the bytes being produced: copy the
                        # whole run in one slice
                        o += o[-m:n-m]
                    else:
                        # overlapping copy: the run reads bytes it has just
                        # written, so emit one byte at a time
                        for _ in range(n):
                            # because of completely ass-backwards decision by python mainters for python 3
                            # we must use slice for bytes as i[p] returns int while slice returns character
                            if m == 1:
                                o += o[-m:]
                            else:
                                o += o[-m:-m+1]
        return o
57 |
class HuffcdicReader:
    """Decompressor for HUFF/CDIC (compression type 17480) sections.

    loadHuff() must be called once with the HUFF record and loadCdic() once
    per CDIC record before unpack() can decode text sections.
    """

    # cached big-endian 64-bit reader used by unpack()
    q = struct.Struct(b'>Q').unpack_from

    def loadHuff(self, huff):
        """Parse a HUFF record: code-length table and min/max code tables.

        Raises unpackException when the magic header does not match.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise unpackException('invalid huff header')
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # each entry packs: code length (5 bits), terminal flag (bit 7),
            # and the max code value (upper 24 bits)
            codelen, term, maxcode = v&0x1f, v&0x80, v>>8
            assert codelen != 0
            if codelen <= 8:
                assert term
            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)
        self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))

        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode, self.maxcode = (), ()
        for codelen, mincode in enumerate((0,) + dict2[0::2]):
            self.mincode += (mincode << (32 - codelen), )
        for codelen, maxcode in enumerate((0,) + dict2[1::2]):
            self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )

        self.dictionary = []

    def loadCdic(self, cdic):
        """Parse a CDIC record and append its phrase slices to the dictionary.

        Raises unpackException when the magic header does not match.
        """
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise unpackException('invalid cdic header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        # the two lines below were garbled in this copy of the file (the
        # span "<<bits ... b'>" was stripped as markup); restored from the
        # standard HUFF/CDIC layout: each CDIC holds at most 1<<bits phrases
        n = min(1<<bits, phrases-len(self.dictionary))
        h = struct.Struct(b'>H').unpack_from
        def getslice(off):
            # high bit of the length word flags a fully-expanded phrase;
            # low 15 bits are the phrase length
            blen, = h(cdic, 16+off)
            chunk = cdic[18+off:18+off+(blen&0x7fff)]
            return (chunk, blen&0x8000)
        self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))

    def unpack(self, data):
        """Huffman-decode *data* using the loaded HUFF/CDIC tables."""
        q = HuffcdicReader.q

        bitsleft = len(data) * 8
        # pad so the 64-bit reads below never run off the end
        data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
        pos = 0
        x, = q(data, pos)
        n = 32

        s = b''
        while True:
            if n <= 0:
                pos += 4
                x, = q(data, pos)
                n += 32
            code = (x >> n) & ((1 << 32) - 1)

            # first 8 bits index dict1; non-terminal entries fall through to
            # the min/max code tables to find the true code length
            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            n -= codelen
            bitsleft -= codelen
            if bitsleft < 0:
                break

            r = (maxcode - code) >> (32 - codelen)
            phrase, flag = self.dictionary[r]
            if not flag:
                # phrase is itself compressed: expand once and cache the
                # result (the temporary None guards against self-reference)
                self.dictionary[r] = None
                phrase = self.unpack(phrase)
                self.dictionary[r] = (phrase, 1)
            s += phrase
        return s
132 |
--------------------------------------------------------------------------------
/lib/mobi_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 | # flake8: noqa
5 |
6 | from __future__ import unicode_literals, division, absolute_import, print_function
7 |
8 | from .compatibility_utils import PY2, text_type, bchr, bord
9 |
10 | import binascii
11 |
12 | if PY2:
13 | range = xrange
14 |
15 | from itertools import cycle
16 |
def getLanguage(langID, sublangID):
    """Map a MOBI language id / sub-language id pair to an IANA-style tag.

    Falls back to the language's generic tag (sub-id 0) when the
    sub-language is unknown, and to "en" when the language id itself is
    unknown.  (The English entry originally contained a duplicate key
    ``1``; the dead first binding has been removed — behaviour unchanged.)
    """
    mobilangdict = {
        54 : {0 : 'af'}, # Afrikaans
        28 : {0 : 'sq'}, # Albanian
        1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
             6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
        # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
        # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
        # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
        # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
        # Emirates), Arabic (Yemen)
        43 : {0 : 'hy'}, # Armenian
        77 : {0 : 'as'}, # Assamese
        44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani)
        45 : {0 : 'eu'}, # Basque
        35 : {0 : 'be'}, # Belarusian
        69 : {0 : 'bn'}, # Bengali
        2 : {0 : 'bg'}, # Bulgarian
        3 : {0 : 'ca'}, # Catalan
        4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
        # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
        26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
        5 : {0 : 'cs'}, # Czech
        6 : {0 : 'da'}, # Danish
        19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
        9 : {0: 'en', 3 : 'en-au' , 10 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
             7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
        # English, English (Australia), English (Belize), English (Canada),
        # English (Ireland), English (Jamaica), English (New Zealand), English
        # (Philippines), English (South Africa), English (Trinidad), English
        # (United Kingdom), English (United States), English (Zimbabwe)
        37 : {0 : 'et'}, # Estonian
        56 : {0 : 'fo'}, # Faroese
        41 : {0 : 'fa'}, # Farsi / Persian
        11 : {0 : 'fi'}, # Finnish
        12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
        # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
        55 : {0 : 'ka'}, # Georgian
        7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
        # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
        8 : {0 : 'el'}, # Greek, Modern (1453-)
        71 : {0 : 'gu'}, # Gujarati
        13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
        57 : {0 : 'hi'}, # Hindi
        14 : {0 : 'hu'}, # Hungarian
        15 : {0 : 'is'}, # Icelandic
        33 : {0 : 'id'}, # Indonesian
        16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
        17 : {0 : 'ja'}, # Japanese
        75 : {0 : 'kn'}, # Kannada
        63 : {0 : 'kk'}, # Kazakh
        87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
        18 : {0 : 'ko'}, # Korean
        38 : {0 : 'lv'}, # Latvian
        39 : {0 : 'lt'}, # Lithuanian
        47 : {0 : 'mk'}, # Macedonian
        62 : {0 : 'ms'}, # Malay
        76 : {0 : 'ml'}, # Malayalam
        58 : {0 : 'mt'}, # Maltese
        78 : {0 : 'mr'}, # Marathi
        97 : {0 : 'ne'}, # Nepali
        20 : {0 : 'no'}, # Norwegian
        72 : {0 : 'or'}, # Oriya
        21 : {0 : 'pl'}, # Polish
        22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
        70 : {0 : 'pa'}, # Punjabi
        23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
        24 : {0 : 'ro'}, # Romanian
        25 : {0 : 'ru'}, # Russian
        59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
        # IANA code for "Northern Sami" is 'se'
        # 'SZ' is the IANA region code for Swaziland
        79 : {0 : 'sa'}, # Sanskrit
        27 : {0 : 'sk'}, # Slovak
        36 : {0 : 'sl'}, # Slovenian
        46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
        # 'SB' is IANA region code for 'Solomon Islands'
        # Lower Sorbian = 'dsb'
        # Upper Sorbian = 'hsb'
        # Sorbian Languages = 'wen'
        10 : {0 : 'es' , 1 : 'es' , 11 : 'es-ar' , 16 : 'es-bo' , 13 : 'es-cl' , 9 : 'es-co' , 5 : 'es-cr' , 7 : 'es-do' ,
              12 : 'es-ec' , 17 : 'es-sv' , 4 : 'es-gt' , 18 : 'es-hn' , 2 : 'es-mx' , 19 : 'es-ni' , 6 : 'es-pa' ,
              15 : 'es-py' , 10 : 'es-pe' , 20 : 'es-pr' , 14 : 'es-uy' , 8 : 'es-ve'},
        # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
        # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
        # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
        # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
        # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
        # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
        48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
        # "Sutu" is another name for "Southern Sotho"?
        # IANA code for "Southern Sotho" is 'st'
        65 : {0 : 'sw'}, # Swahili
        29 : {0 : 'sv' , 1 : 'sv' , 2 : 'sv-fi'}, # Swedish, Swedish (Finland)
        73 : {0 : 'ta'}, # Tamil
        68 : {0 : 'tt'}, # Tatar
        74 : {0 : 'te'}, # Telugu
        30 : {0 : 'th'}, # Thai
        49 : {0 : 'ts'}, # Tsonga
        50 : {0 : 'tn'}, # Tswana
        31 : {0 : 'tr'}, # Turkish
        34 : {0 : 'uk'}, # Ukrainian
        32 : {0 : 'ur'}, # Urdu
        67 : {0 : 'uz', 1 : 'uz'}, # Uzbek
        42 : {0 : 'vi'}, # Vietnamese
        52 : {0 : 'xh'}, # Xhosa
        53 : {0 : 'zu'}, # Zulu
    }
    if langID not in mobilangdict:
        # unknown language id: default to English
        return "en"
    subdict = mobilangdict[langID]
    # every entry carries a generic tag under key 0 to fall back on
    return subdict.get(sublangID, subdict[0])
132 |
133 |
def toHex(byteList):
    """Return the hex-encoded form of bytestring *byteList* (via binascii)."""
    return binascii.hexlify(byteList)
136 |
# returns base32 bytestring
def toBase32(value, npad=4):
    """Encode non-negative integer *value* as a base-32 bytestring.

    Digits are 0-9 followed by A-V; the result is left-padded with b'0'
    up to *npad* characters (longer values are returned unpadded).
    The original local named ``next`` shadowed the builtin; renamed.
    """
    digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
    encoded = b''
    remaining = value
    while remaining != 0:
        remaining, remainder = divmod(remaining, 32)
        # slice (not index) so this stays a bytestring on python 3
        encoded = digits[remainder:remainder+1] + encoded
    if encoded == b'':
        encoded = b'0'
    # pad on the left with b'0' out to npad characters
    pad = npad - len(encoded)
    if pad > 0:
        encoded = b'0' * pad + encoded
    return encoded
153 |
154 |
# converts base32 string to value
def fromBase32(str_num):
    """Decode a base-32 string or bytestring (digits 0-9, A-V) to an int.

    Empty input decodes to 0, matching the historical behaviour of the
    hand-rolled accumulator loop this replaces.  Python's int() with base
    32 uses exactly the same digit alphabet (0-9 then A-V), so the result
    is identical for all valid inputs.
    """
    if not isinstance(str_num, bytes):
        # text input is byte-mapped exactly as before
        str_num = str_num.encode('latin-1')
    if not str_num:
        return 0
    return int(str_num, 32)
178 |
179 |
180 | # note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
181 | # in place of ascii you will get a byte to half-word or integer
182 | # one to one mapping of values from 0 - 255
183 |
def mangle_fonts(encryption_key, data):
    """XOR-obfuscate the first 1024 bytes of *data* with *encryption_key*.

    The key is cycled over the first kilobyte; the remainder of the data
    is passed through untouched.  This is the standard (symmetric) MOBI
    font de/obfuscation step.
    """
    if isinstance(encryption_key, text_type):
        encryption_key = encryption_key.encode('latin-1')
    keystream = cycle(map(bord, encryption_key))
    head = data[:1024]
    scrambled = b''.join(bchr(bord(ch) ^ next(keystream)) for ch in head)
    return scrambled + data[1024:]
192 |
--------------------------------------------------------------------------------
/lib/mobiml2xhtml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 |
4 |
5 | # this program works in concert with the output from KindleUnpack
6 |
7 | '''
8 | Convert from Mobi ML to XHTML
9 | '''
10 |
11 | from __future__ import division, absolute_import, print_function
12 |
13 | import os
14 | import sys
15 | import re
16 |
# tags needing special parsing: maps tag key -> (type label, negative index
# used to trim the tag's closing characters when capturing its raw content)
SPECIAL_HANDLING_TAGS = {
    '?xml' : ('xmlheader', -1),
    '!--' : ('comment', -3),
    '!DOCTYPE' : ('doctype', -1),
}

# the type labels produced by SPECIAL_HANDLING_TAGS
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']

# tags that never receive a closing tag and so are emitted self-closed
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
26 |
27 | class MobiMLConverter(object):
28 |
29 | PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
30 | IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
31 |
32 | def __init__(self, filename):
33 | self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
34 | self.base_css_rules += 'p { margin: 0em }\n'
35 | self.base_css_rules += '.bold { font-weight: bold }\n'
36 | self.base_css_rules += '.italic { font-style: italic }\n'
37 | self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
38 | self.tag_css_rules = {}
39 | self.tag_css_rule_cnt = 0
40 | self.path = []
41 | self.filename = filename
42 | self.wipml = open(self.filename, 'r').read()
43 | self.pos = 0
44 | self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
45 | self.opos = 0
46 | self.meta = ''
47 | self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
48 | self.current_font_size = 3
49 | self.font_history = []
50 |
    def cleanup_html(self):
        # NOTE(review): the body of this method is corrupted in this copy of
        # the file — the pattern strings (which contained literal '<...>'
        # markup) were stripped by whatever produced this dump, leaving
        # unterminated string literals below.  Restore the patterns from the
        # upstream KindleUnpack source before using this module.
        self.wipml = re.sub(r'
', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace(']*>', '', self.wipml)
        self.wipml = self.wipml.replace(' ',' ')
58 |
59 | def replace_page_breaks(self):
60 | self.wipml = self.PAGE_BREAK_PAT.sub(
61 | '
',
62 | self.wipml)
63 |
64 | # parse leading text of ml and tag
65 | def parseml(self):
66 | p = self.pos
67 | if p >= len(self.wipml):
68 | return None
69 | if self.wipml[p] != '<':
70 | res = self.wipml.find('<',p)
71 | if res == -1 :
72 | res = len(self.wipml)
73 | self.pos = res
74 | return self.wipml[p:res], None
75 | # handle comment as a special case to deal with multi-line comments
76 | if self.wipml[p:p+4] == '',p+1)
78 | if te != -1:
79 | te = te+2
80 | else :
81 | te = self.wipml.find('>',p+1)
82 | ntb = self.wipml.find('<',p+1)
83 | if ntb != -1 and ntb < te:
84 | self.pos = ntb
85 | return self.wipml[p:ntb], None
86 | self.pos = te + 1
87 | return None, self.wipml[p:te+1]
88 |
    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possiblity of very poor formating
    def parsetag(self, s):
        """Parse one raw tag string *s* into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext', or one of the
        special-handling types; tname is the lower-cased tag name; tattr
        maps attribute names to their (unquoted) values.
        """
        p = 1
        # get the tag name
        tname = None
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            # leading slash marks a closing tag
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
            p += 1
        tname=s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            # capture the raw interior, trimming the closer via backstep
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    # quoted value: scan to the matching quote
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'") :
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    # unquoted value: scan to the next delimiter
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /',p) >= 0:
                ttype = 'single_ext'
            elif s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr
150 |
151 | # main routine to convert from mobi markup language to html
152 | def processml(self):
153 |
154 | # are these really needed
155 | html_done = False
156 | head_done = False
157 | body_done = False
158 |
159 | skip = False
160 |
161 | htmlstr = ''
162 | self.replace_page_breaks()
163 | self.cleanup_html()
164 |
165 | # now parse the cleaned up ml into standard xhtml
166 | while True:
167 |
168 | r = self.parseml()
169 | if not r:
170 | break
171 |
172 | text, tag = r
173 |
174 | if text:
175 | if not skip:
176 | htmlstr += text
177 |
178 | if tag:
179 | ttype, tname, tattr = self.parsetag(tag)
180 |
181 | # If we run into a DTD or xml declarations inside the body ... bail.
182 | if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
183 | htmlstr += '\n