├── .gitattributes
├── .gitignore
├── COPYING.txt
├── DumpMobiHeader_v023.py
├── KindleUnpack.pyw
├── KindleUnpack_ReadMe.htm
├── README.md
├── lib
├── __init__.py
├── compatibility_utils.py
├── imghdr.py
├── kindleunpack.py
├── mobi_cover.py
├── mobi_dict.py
├── mobi_header.py
├── mobi_html.py
├── mobi_index.py
├── mobi_k8proc.py
├── mobi_k8resc.py
├── mobi_nav.py
├── mobi_ncx.py
├── mobi_opf.py
├── mobi_pagemap.py
├── mobi_sectioner.py
├── mobi_split.py
├── mobi_uncompress.py
├── mobi_utils.py
├── mobiml2xhtml.py
├── unipath.py
└── unpack_structure.py
└── libgui
├── __init__.py
├── askfolder_ed.py
├── prefs.py
└── scrolltextwidget.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.txt text
2 | *.py text
3 | *.cfg text
4 |
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | *.DS_Store
6 |
7 | # Vim temp/swap files
8 | *~
9 | *.orig
10 | *.keep
11 | *.swp
12 | *.swo
13 |
14 | # PyInstaller
15 | # Usually these files are written by a python script from a template
16 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
17 | *.manifest
18 | *.spec
19 |
20 |
21 | # Translations
22 | *.mo
23 | *.pot
24 |
25 | *.log
26 |
27 | # PyBuilder
28 | target/
29 |
30 | # Files/folders used/produced when testing Kindleunpack
31 | HDImages/
32 | mobi7/
33 | mobi8/
34 |
35 | *.bak
36 | *.dat
37 | *.data
38 | *.pdf
39 | *.ini
40 | *.json
41 | *.mobi
42 | *.prc
43 | *.azw
44 | *.azw[34]
45 |
46 | # Folder to direct output to when testing command-line (will be ignored by git)
47 | testout/
48 |
--------------------------------------------------------------------------------
/KindleUnpack.pyw:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import sys
8 |
9 | from lib.compatibility_utils import PY2, text_type, unicode_str
10 | from lib.compatibility_utils import unicode_argv, add_cp65001_codec
11 |
12 | import lib.unipath as unipath
13 | from lib.unipath import pathof
14 |
15 | import os
16 | import traceback
17 |
18 | import codecs
19 | add_cp65001_codec()
20 |
21 | try:
22 | from queue import Full
23 | from queue import Empty
24 | except ImportError:
25 | from Queue import Full
26 | from Queue import Empty
27 |
28 | if PY2 and sys.platform.startswith("win"):
29 | from libgui.askfolder_ed import AskFolder
30 |
31 | from multiprocessing import Process, Queue
32 |
33 | if PY2:
34 | import Tkinter as tkinter
35 | import Tkconstants as tkinter_constants
36 | import tkFileDialog as tkinter_filedialog
37 | import ttk as tkinter_ttk
38 | else:
39 | import tkinter
40 | import tkinter.constants as tkinter_constants
41 | import tkinter.filedialog as tkinter_filedialog
42 | import tkinter.ttk as tkinter_ttk
43 |
44 | from libgui.scrolltextwidget import ScrolledText
45 |
46 | import lib.kindleunpack as kindleunpack
47 |
48 | # Set to false to NOT save preferences to an ini file.
49 | # Starting directories for file dialogs will still persist
50 | # for the current KindleUnpack session.
51 | #
52 | # Need to delete the ini file after setting to false, of course.
53 | PERSISTENT_PREFS = True
54 |
55 | from inspect import getfile, currentframe
56 | from libgui.prefs import getprefs, saveprefs
57 |
58 | # Probably overkill, but to ensure cross-platform success no matter how the script is called/run...
59 | SCRIPT_NAME = unicode_str(getfile(currentframe()))
60 | SCRIPT_DIR = unicode_str(os.path.dirname(unipath.abspath(getfile(currentframe()))))
61 | PROGNAME = unicode_str(os.path.splitext(SCRIPT_NAME)[0])
62 |
63 | # Include platform in the ini file name. That way, settings can still persist
64 | # in the event that different OSs access the same script via a network share/flash-drive.
65 | CONFIGFILE = unicode_str(os.path.join(SCRIPT_DIR, '{0}_{1}.json'.format(PROGNAME, sys.platform[:3])))
66 |
# Wrap a stream so that output gets appended to shared queue
# using utf-8 encoding
class QueuedStream:
    """File-like wrapper that mirrors writes onto a shared queue as utf-8 bytes.

    Used to replace a child process's stdout/stderr so its output can be
    collected by the gui process (see unpackEbook / MainDialog.processQueue).
    """

    def __init__(self, stream, q):
        """Wrap *stream*; every write() is pushed onto queue *q* as utf-8 bytes."""
        self.stream = stream
        self.encoding = stream.encoding
        self.q = q
        # detached/redirected streams may report no encoding at all;
        # fall back to utf-8 in that case
        # (fix: use identity comparison with None, not ==)
        if self.encoding is None:
            self.encoding = 'utf-8'

    def write(self, data):
        """Normalize *data* to utf-8 bytes and append it to the queue."""
        if isinstance(data, text_type):
            data = data.encode('utf-8')
        elif self.encoding not in ('utf-8', 'UTF-8', 'cp65001', 'CP65001'):
            # bytes in some other known encoding: transcode to utf-8
            udata = data.decode(self.encoding)
            data = udata.encode('utf-8')
        self.q.put(data)

    def __getattr__(self, attr):
        # present a binary, utf-8 persona regardless of the wrapped stream;
        # anything else is delegated to the underlying stream
        if attr == 'mode':
            return 'wb'
        if attr == 'encoding':
            return 'utf-8'
        return getattr(self.stream, attr)
89 |
90 |
class MainDialog(tkinter.Frame):
    """Main application frame.

    Gathers the input ebook, output folder and unpack options, then runs
    the actual unpack in a child process (unpackEbook), polling a shared
    queue on a Tk "after" timer to relay the child's output into the log.
    """

    def __init__(self, root):
        """Build all widgets, restore saved preferences, size/position the window."""
        tkinter.Frame.__init__(self, root, border=5)
        self.root = root
        # milliseconds between polls of the child-process output queue
        self.interval = 50
        # handle of the spawned unpack process; None whenever idle
        self.p2 = None
        # queue shared with the child process for its stdout/stderr
        self.q = Queue()
        # To keep things simple for possible future preference additions/deletions:
        # Try to stick to - TK Widget name = prefs dictionary key = ini.get|set name.
        # EX: mobipath = prefs['mobipath'] = config.get('Defaults', mobipath).
        self.prefs = getprefs(CONFIGFILE, self.root, PERSISTENT_PREFS)

        self.status = tkinter.StringVar()
        tkinter.Label(self, textvariable=self.status, justify='center').grid(row=0, columnspan=3, sticky=tkinter_constants.N)
        # NOTE(review): 'Upack' appears to be a typo for 'Unpack' in this
        # user-facing message (left untouched in this documentation pass).
        self.status.set('Upack a non-DRM Kindle eBook')
        sticky = tkinter_constants.E + tkinter_constants.W
        ALL = tkinter_constants.E+tkinter_constants.W+tkinter_constants.N+tkinter_constants.S
        # Set to the column the textentry boxes are in.
        self.grid_columnconfigure(1, weight=1)
        # Set to the row the debug log widget is in.
        self.grid_rowconfigure(10, weight=1)

        tkinter.Label(self, text='').grid(row=1, sticky=tkinter_constants.E)
        tkinter.Label(self, text='Unencrypted Kindle eBook input file', wraplength=200).grid(row=2, sticky=tkinter_constants.E)
        self.mobipath = tkinter.Entry(self, width=50)
        self.mobipath.grid(row=2, column=1, sticky=sticky)
        self.mobipath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_mobipath)
        button.grid(row=2, column=2, sticky=sticky)

        tkinter.Label(self, text='Output Directory', wraplength=200).grid(row=3, sticky=tkinter_constants.E)
        self.outpath = tkinter.Entry(self, width=50)
        self.outpath.grid(row=3, column=1, sticky=sticky)
        # pre-fill the output folder from saved prefs when persistence is on
        if self.prefs['outpath'] and PERSISTENT_PREFS and unipath.exists(CONFIGFILE):
            outpath = pathof(os.path.normpath(self.prefs['outpath']))
            self.outpath.insert(0, outpath)
        else:
            self.outpath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_outpath)
        button.grid(row=3, column=2, sticky=sticky)

        tkinter.Label(self, text='OPTIONAL: APNX file Associated with AZW3', wraplength=200).grid(row=4, sticky=tkinter_constants.E)
        self.apnxpath = tkinter.Entry(self, width=50)
        self.apnxpath.grid(row=4, column=1, sticky=sticky)
        self.apnxpath.insert(0, '')
        button = tkinter.Button(self, text="Browse...", command=self.get_apnxpath)
        button.grid(row=4, column=2, sticky=sticky)

        # option checkboxes; each restores its saved state from prefs
        self.splitvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Split Combination Kindlegen eBooks", variable=self.splitvar)
        if self.prefs['splitvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=5, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.rawvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Write Raw Data", variable=self.rawvar)
        if self.prefs['rawvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=6, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.dbgvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Dump Mode", variable=self.dbgvar)
        if self.prefs['dbgvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=7, column=1, columnspan=2, sticky=tkinter_constants.W)

        self.hdvar = tkinter.IntVar()
        checkbox = tkinter.Checkbutton(self, text="Use HD Images If Present", variable=self.hdvar)
        if self.prefs['hdvar'] and PERSISTENT_PREFS:
            checkbox.select()
        checkbox.grid(row=8, column=1, columnspan=2, sticky=tkinter_constants.W)

        tkinter.Label(self, text='ePub Output Type:').grid(row=9, sticky=tkinter_constants.E)
        self.epubver_val = tkinter.StringVar()
        self.epubver = tkinter_ttk.Combobox(self, textvariable=self.epubver_val, state='readonly')
        self.epubver['values'] = ('ePub 2', 'ePub 3', 'Auto-detect', 'Force ePub 2')
        self.epubver.current(0)
        if self.prefs['epubver'] and PERSISTENT_PREFS:
            self.epubver.current(self.prefs['epubver'])
        self.epubver.grid(row=9, column=1, columnspan=2, pady=(3,5), sticky=tkinter_constants.W)

        msg1 = 'Conversion Log \n\n'
        self.stext = ScrolledText(self, bd=5, relief=tkinter_constants.RIDGE, wrap=tkinter_constants.WORD)
        self.stext.grid(row=10, column=0, columnspan=3, sticky=ALL)
        self.stext.insert(tkinter_constants.END,msg1)

        # NOTE(review): 'sbotton' looks like a typo for 'sbutton', but it is
        # used consistently across this class, so renaming is a wider change.
        self.sbotton = tkinter.Button(
            self, text="Start", width=10, command=self.convertit)
        self.sbotton.grid(row=11, column=1, sticky=tkinter_constants.S+tkinter_constants.E)
        self.qbutton = tkinter.Button(
            self, text="Quit", width=10, command=self.quitting)
        self.qbutton.grid(row=11, column=2, sticky=tkinter_constants.S+tkinter_constants.W)
        # restore last saved window geometry, or center a default-size window
        if self.prefs['windowgeometry'] and PERSISTENT_PREFS:
            self.root.geometry(self.prefs['windowgeometry'])
        else:
            self.root.update_idletasks()
            w = self.root.winfo_screenwidth()
            h = self.root.winfo_screenheight()
            rootsize = (605, 575)
            x = w//2 - rootsize[0]//2
            y = h//2 - rootsize[1]//2
            self.root.geometry('%dx%d+%d+%d' % (rootsize + (x, y)))
        # route the window-manager close button through the same cleanup path
        self.root.protocol('WM_DELETE_WINDOW', self.quitting)

    # read queue shared between this main process and spawned child processes
    def readQueueUntilEmpty(self):
        """Drain the shared queue and return everything read as one unicode string."""
        done = False
        text = ''
        while not done:
            try:
                data = self.q.get_nowait()
                text += unicode_str(data, 'utf-8')
            except Empty:
                done = True
                pass
        return text

    # read from subprocess pipe without blocking
    # invoked every interval via the widget "after"
    # option being used, so need to reset it for the next time
    def processQueue(self):
        """Poll the child process: relay queued output; finish up once it exits."""
        # exitcode is None while the child is still running
        # NOTE(review): `poll != None` would read better as `poll is not None`
        poll = self.p2.exitcode
        if poll != None:
            # child has exited: flush any remaining output and report outcome
            text = self.readQueueUntilEmpty()
            msg = text + '\n\n' + 'eBook successfully unpacked\n'
            if poll != 0:
                msg = text + '\n\n' + 'Error: Unpacking Failed\n'
            self.p2.join()
            self.showCmdOutput(msg)
            self.p2 = None
            # re-enable the Start button for the next run
            self.sbotton.configure(state='normal')
            return
        text = self.readQueueUntilEmpty()
        self.showCmdOutput(text)
        # make sure we get invoked again by event loop after interval
        self.stext.after(self.interval,self.processQueue)
        return

    # post output from subprocess in scrolled text widget
    def showCmdOutput(self, msg):
        """Append *msg* to the log widget and scroll it into view."""
        if msg and msg !='':
            # normalize Windows line endings before display
            if sys.platform.startswith('win'):
                msg = msg.replace('\r\n','\n')
            self.stext.insert(tkinter_constants.END,msg)
            self.stext.yview_pickplace(tkinter_constants.END)
        return

    def get_mobipath(self):
        """File-open dialog for the input ebook; remembers its folder in prefs."""
        cwd = unipath.getcwd()
        mobipath = tkinter_filedialog.askopenfilename(
            parent=None, title='Select Unencrypted Kindle eBook File',
            initialdir=self.prefs['mobipath'] or cwd,
            initialfile=None,
            defaultextension=('.mobi', '.prc', '.azw', '.azw4', '.azw3'),
            filetypes=[('All Kindle formats', ('.mobi', '.prc', '.azw', '.azw4', '.azw3')),
                       ('Kindle Mobi eBook File', '.mobi'), ('Kindle PRC eBook File', '.prc'),
                       ('Kindle AZW eBook File', '.azw'), ('Kindle AZW4 Print Replica', '.azw4'),
                       ('Kindle Version 8', '.azw3'),('All Files', '.*')])
        if mobipath:
            self.prefs['mobipath'] = pathof(os.path.dirname(mobipath))
            mobipath = pathof(os.path.normpath(mobipath))
            self.mobipath.delete(0, tkinter_constants.END)
            self.mobipath.insert(0, mobipath)
        return

    def get_apnxpath(self):
        """File-open dialog for an optional .apnx page-map file; remembers its folder."""
        cwd = unipath.getcwd()
        apnxpath = tkinter_filedialog.askopenfilename(
            parent=None, title='Optional APNX file associated with AZW3',
            initialdir=self.prefs['apnxpath'] or cwd,
            initialfile=None,
            defaultextension='.apnx', filetypes=[('Kindle APNX Page Information File', '.apnx'), ('All Files', '.*')])
        if apnxpath:
            self.prefs['apnxpath'] = pathof(os.path.dirname(apnxpath))
            apnxpath = pathof(os.path.normpath(apnxpath))
            self.apnxpath.delete(0, tkinter_constants.END)
            self.apnxpath.insert(0, apnxpath)
        return

    def get_outpath(self):
        """Folder-chooser for the output directory (unicode-safe on windows/py2)."""
        cwd = unipath.getcwd()
        if sys.platform.startswith("win") and PY2:
            # tk_chooseDirectory is horribly broken for unicode paths
            # on windows - bug has been reported but not fixed for years
            # workaround by using our own unicode aware version
            outpath = AskFolder(message="Folder to Store Output into",
                                defaultLocation=self.prefs['outpath'] or unipath.getcwd())
        else:
            outpath = tkinter_filedialog.askdirectory(
                parent=None, title='Folder to Store Output into',
                initialdir=self.prefs['outpath'] or cwd, initialfile=None)
        if outpath:
            self.prefs['outpath'] = outpath
            outpath = pathof(os.path.normpath(outpath))
            self.outpath.delete(0, tkinter_constants.END)
            self.outpath.insert(0, outpath)
        return

    def quitting(self):
        """Shut down: terminate a running child, save prefs, destroy the window."""
        # kill any still running subprocess
        if self.p2 != None:
            if (self.p2.exitcode == None):
                self.p2.terminate()
        if PERSISTENT_PREFS:
            if not saveprefs(CONFIGFILE, self.prefs, self):
                print("Couldn't save INI file.")
        self.root.destroy()
        self.quit()

    # run in a child process and collect its output
    def convertit(self):
        """Validate the form, then launch unpackEbook in a child process."""
        # now disable the button to prevent multiple launches
        self.sbotton.configure(state='disabled')
        mobipath = unicode_str(self.mobipath.get())
        apnxpath = unicode_str(self.apnxpath.get())
        outdir = unicode_str(self.outpath.get())
        if not mobipath or not unipath.exists(mobipath):
            self.status.set('Specified eBook file does not exist')
            self.sbotton.configure(state='normal')
            return
        apnxfile = None
        if apnxpath != "" and unipath.exists(apnxpath):
            apnxfile = apnxpath
        if not outdir:
            self.status.set('No output directory specified')
            self.sbotton.configure(state='normal')
            return
        q = self.q
        # build a human-readable echo of the chosen settings for the log
        log = 'Input Path = "'+ mobipath + '"\n'
        log += 'Output Path = "' + outdir + '"\n'
        if apnxfile != None:
            log += 'APNX Path = "' + apnxfile + '"\n'
        dump = False
        writeraw = False
        splitcombos = False
        use_hd = False
        if self.dbgvar.get() == 1:
            dump = True
            log += 'Debug = True\n'
        if self.rawvar.get() == 1:
            writeraw = True
            log += 'WriteRawML = True\n'
        if self.splitvar.get() == 1:
            splitcombos = True
            log += 'Split Combo KF8 Kindle eBooks = True\n'
        # map the combobox index onto the single-character epub version code
        # that kindleunpack.unpackBook expects ('2', '3', 'A'uto, 'F'orce-2)
        if self.epubver.current() == 0:
            epubversion = '2'
        elif self.epubver.current() == 1:
            epubversion = '3'
        elif self.epubver.current() == 2:
            epubversion = 'A'
        else:
            epubversion = 'F'
        log += 'Epub Output Type Set To: {0}\n'.format(self.epubver_val.get())
        if self.hdvar.get():
            use_hd = True
            # stub for processing the Use HD Images setting
            log += 'Use HD Images If Present = True\n'
        log += '\n\n'
        log += 'Please Wait ...\n\n'
        self.stext.insert(tkinter_constants.END,log)
        self.p2 = Process(target=unpackEbook, args=(q, mobipath, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos))
        self.p2.start()

        # python does not seem to allow you to create
        # your own eventloop which every other gui does - strange
        # so need to use the widget "after" command to force
        # event loop to run non-gui events every interval
        self.stext.after(self.interval,self.processQueue)
        return
362 |
363 |
# child process / multiprocessing thread starts here
def unpackEbook(q, infile, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos):
    """Entry point of the unpack child process.

    Redirects this process's stdout/stderr onto the shared queue *q* so the
    gui can display progress, runs the actual unpack, and exits with status
    0 on success or 1 on any failure.
    """
    # mirror all output onto the queue shared with the gui process
    sys.stdout = QueuedStream(sys.stdout, q)
    sys.stderr = QueuedStream(sys.stderr, q)
    try:
        kindleunpack.unpackBook(infile, outdir, apnxfile, epubversion, use_hd,
                                dodump=dump, dowriteraw=writeraw, dosplitcombos=splitcombos)
    except Exception as e:
        # report the failure through the (redirected) streams, then signal it
        # via a non-zero exit code that the gui polls for
        print("Error: %s" % e)
        print(traceback.format_exc())
        sys.exit(1)
    sys.exit(0)
376 |
377 |
def main(argv=None):
    """Launch the gui and run the Tk event loop.

    *argv* is accepted for call-site compatibility but is not otherwise used
    here; it defaults lazily to unicode_argv().  Returns 0 so the result can
    be passed straight to sys.exit().
    """
    # fix: the old default `argv=unicode_argv()` was evaluated once at import
    # time; resolve it lazily at call time instead
    if argv is None:
        argv = unicode_argv()
    root = tkinter.Tk()
    root.title('Kindle eBook Unpack Tool')
    root.minsize(440, 350)
    root.resizable(True, True)
    MainDialog(root).pack(fill=tkinter_constants.BOTH, expand=tkinter_constants.YES)
    root.mainloop()
    return 0

if __name__ == "__main__":
    sys.exit(main())
389 |
--------------------------------------------------------------------------------
/KindleUnpack_ReadMe.htm:
--------------------------------------------------------------------------------
1 | T
2 |
3 | KindleUnpack ReadMe
4 |
5 |
6 | KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts which change depending on the type of Kindle/Mobi ebook being processed.
7 |
8 |
9 | MobiPocket and early Kindle version 7 or less ebooks are unpacked to the original html 3.2 and images folder that can then be edited and reprocessed by MobiPocketCreator.
10 | Kindle Print Replica ebooks are unpacked to the original PDF and any associated images.
11 | Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or may not be a fully valid epub depending on whether a fully valid epub was originally provided to kindlegen as input.
NOTE: The generated epub should be validated using an epub validator and should changes be needed, it should load properly into Sigil and Calibre either of which can be used to edit the result to create a fully valid epub.
12 | Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into two different parts: the first being the older MobiPocket format ebook parts (see #1 above) and the second being an epub-like structure that can be edited using Sigil (see #3 above).
13 |
14 |
15 | The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly.
16 | On Windows machines we strongly recommend you install the free version of ActiveState's Active Python 2.7.3 or later 2.7.X version as it properly installs all of the required parts including the tk widget kit and updates the system path on Windows machines. The official installer from python.org does not properly handle this for Windows machines.
17 | On Mac OS X 10.6.X and later and almost all recent Linux versions the required version of Python is already installed as part of the official OS installation so Mac OS X and Linux users need install nothing extra.
18 |
19 | To install KindleUnpack, simply find a nice location on your machine and fully unzip it. Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you have a proper Python 2.7 or later installation on your machine, you should be able to simply double-click the KindleUnpack.pyw icon and the gui interface should start.
20 |
21 | If you would prefer a command-line interface, simply look inside KindleUnpack's "lib" folder for the KindleUnpack.py python program and its support modules. You should then be able to run KindleUnpack.py by the following command:
22 |
23 |
24 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER
25 |
26 |
27 | where you replace:
28 |
29 |
30 | INPUT_FILE - path to the desired Kindle/MobiPocket ebook
31 |
32 | OUTPUT_FOLDER - path to folder where the ebook will be unpacked
33 |
34 | Options:
35 | -h print this help message
36 | -i use HDImages to overwrite lower resolution versions, if present
37 | -s split combination mobis into older mobi and mobi KF8 ebooks
38 | -p APNX_FILE path to a .apnx file that contains real page numbers associated with an azw3 ebook (optional)
39 | Note: many apnx files have arbitrarily assigned page offsets that will confuse KindleUnpack if used
40 | --epub_version= specify epub version to unpack to: 2, 3 or A (for automatic) or
41 | F for Force to epub2, default is 2
42 | -r write raw data to the output folder
43 | -d dump headers and other debug info to output and extra files
44 |
45 |
46 |
47 | Please report any bugs or comments/requests in our sticky forum on the Mobileread website. It can be found at http://www.mobileread.com/forums. Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack).
48 |
49 |
50 | License Information
51 |
52 | KindleUnpack
53 | Based on initial mobipocket version Copyright © 2009 Charles M. Hannum
54 | Extensive Extensions and Improvements Copyright © 2009-2014
55 | By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.
56 | This program is free software: you can redistribute it and/or modify
57 | it under the terms of the GNU General Public License as published by
58 | the Free Software Foundation, version 3.
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | KindleUnpack
2 | ============
3 |
4 | python based software to unpack Amazon / Kindlegen generated ebooks
5 |
6 | KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts
7 | which change depending on the type of Kindle/Mobi ebook being processed
8 |
9 | - MobiPocket and early Kindle version 7 or less ebooks are unpacked to the
10 | original html 3.2 and images folder that can then be edited and reprocessed by
11 | MobiPocketCreator.
12 |
13 | - Kindle Print Replica ebook are unpacked to the original PDF and any associated images.
14 |
15 | - Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or
16 | may not be a fully valid epub depending on if a fully valid epub was
17 | originally provided to kindlegen as input. NOTE: The generated epub should be
18 | validated using an epub validator and should changes be needed, it should load
19 | properly into Sigil and Calibre either of which can be used to edit the result
20 | to create a fully valid epub.
21 |
22 | - Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into
23 | two different parts: the first being the older MobiPocket format ebook parts
24 | and the second being an epub-like structure that can be edited using Sigil
25 |
26 | The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly.
27 |
28 | On Windows machines we strongly recommend you install the free version of ActiveState's
29 | Active Python 2.7.X or 3.4.X or later as it properly installs all of the required parts
30 | including the tk widget kit and updates the system path on Windows machines. The official
31 | installer from python.org sometimes does not properly handle this for Windows machines.
32 |
33 | On Mac OS X 10.6.X and later and almost all recent Linux versions, the required version
34 | of Python is already installed as part of the official OS installation so Mac OS X and
35 | Linux users need install nothing extra.
36 |
37 | To install KindleUnpack, simply find a nice location on your machine and fully unzip it.
38 | Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you
39 | have a proper Python 2.7 or later installation on your machine, you should be able to
40 | simply double-click the KindleUnpack.pyw icon and the gui interface should start
41 |
42 | If you would prefer a command-line interface, simply look inside KindleUnpack's "lib"
43 | folder for the KindleUnpack.py python program and its support modules. You should
44 | then be able to run KindleUnpack.py by the following command:
45 |
46 | ```sh
47 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER
48 | ```
49 |
50 | where you replace:
51 |
52 | `INPUT_FILE` - path to the desired Kindle/MobiPocket ebook
53 |
54 | `OUTPUT_FOLDER` - path to folder where the ebook will be unpacked
55 |
56 | ### Options
57 |
58 | `-h` print this help message
59 |
60 | `-i` use HDImages to overwrite lower resolution versions, if present
61 |
62 | `-s` split combination mobis into older mobi and mobi KF8 ebooks
63 |
64 | `-p APNX_FILE` path to a .apnx file that contains real page numbers associated
65 | with an azw3 ebook (optional). Note: many apnx files have
66 | arbitrarily assigned page offsets that will confuse KindleUnpack
67 | if used
68 |
69 | `--epub_version=` specify EPUB version to unpack to: 2, 3 or A (for automatic) or
70 | F for Force to EPUB2, default is 2
71 |
72 | `-r` write raw data to the output folder
73 |
74 | `-d` dump headers and other debug info to output and extra files
75 |
76 | Please report any bugs or comments/requests in our sticky forum on the Mobileread website.
77 | It can be found at http://www.mobileread.com/forums.
78 |
79 | Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack).
80 |
81 | License Information
82 |
83 | KindleUnpack
84 | Based on initial mobipocket version Copyright © 2009 Charles M. Hannum
85 | Extensive Extensions and Improvements Copyright © 2009-2014
86 | By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.
87 | This program is free software: you can redistribute it and/or modify
88 | it under the terms of the GNU General Public License as published by
89 | the Free Software Foundation, version 3.
90 |
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3 |
--------------------------------------------------------------------------------
/lib/compatibility_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
6 | # All rights reserved.
7 | #
8 | # Redistribution and use in source and binary forms, with or without modification,
9 | # are permitted provided that the following conditions are met:
10 | #
11 | # 1. Redistributions of source code must retain the above copyright notice, this list of
12 | # conditions and the following disclaimer.
13 | #
14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list
15 | # of conditions and the following disclaimer in the documentation and/or other materials
16 | # provided with the distribution.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
28 | from __future__ import unicode_literals, division, absolute_import, print_function
29 |
30 | import sys
31 | import codecs
32 |
33 | PY2 = sys.version_info[0] == 2
34 | PY3 = sys.version_info[0] == 3
35 |
36 | iswindows = sys.platform.startswith('win')
37 |
38 | try:
39 | from urllib.parse import unquote
40 | except ImportError:
41 | from urllib import unquote
42 |
43 | if PY2:
44 | from HTMLParser import HTMLParser
45 | _h = HTMLParser()
46 | elif sys.version_info[1] < 4:
47 | import html.parser
48 | _h = html.parser.HTMLParser()
49 | else:
50 | import html as _h
51 |
52 | if PY3:
53 | text_type = str
54 | binary_type = bytes
55 | # if will be printing arbitraty binary data to stdout on python 3
56 | # sys.stdin = sys.stdin.detach()
57 | # sys.stdout = sys.stdout.detach()
58 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
59 | else:
60 | range = xrange
61 | text_type = unicode
62 | binary_type = str
63 | # if will be printing unicode under python 2 need to protect
64 | # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
65 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
66 | # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8
67 |
68 | # NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
69 | # (and they amazingly claim by design and no bug!)
70 |
71 | # To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
72 | # >>> o = '123456789'
73 | # >>> o[-3]
74 | # '7'
75 | # >>> type(o[-3])
76 | #
77 | # >>> type(o)
78 | #
79 |
80 | # Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
81 | # >>> o = b'123456789'
82 | # >>> o[-3]
83 | # 55
84 | # >>> type(o[-3])
85 | #
86 | # >>> type(o)
87 | #
88 |
89 | # This mind boggling behaviour also happens when indexing a bytestring and/or
90 | # iteratoring over a bytestring. In other words it will return an int but not
91 | # the byte itself!!!!!!!
92 |
93 | # The only way to access a single byte as a byte in bytestring and get the byte in both
94 | # Python 2 and Python 3 is to use a slice
95 |
96 | # This problem is so common there are horrible hacks floating around the net to **try**
97 | # to work around it, so that code that works on both Python 2 and Python 3 is possible.
98 |
99 | # So in order to write code that works on both Python 2 and Python 3
100 | # if you index or access a single byte and want its ord() then use the bord() function.
101 | # If instead you want it as a single character byte use the bchar() function
102 | # both of which are defined below.
103 |
if PY3:
    # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
    # in place of ascii you will get a byte value to half-word or integer value
    # one-to-one mapping (in the 0 - 255 range)

    def bchr(s):
        # int byte value -> single-byte bytestring
        return bytes([s])

    def bstr(s):
        # str -> bytes via latin-1 (one-to-one codepoint/byte mapping);
        # anything else is handed to the bytes() constructor unchanged
        if isinstance(s, str):
            return bytes(s, 'latin-1')
        else:
            return bytes(s)

    def bord(s):
        # indexing a py3 bytestring already yields an int, so identity
        return s

    def bchar(s):
        # int (from indexing a bytestring) -> single-byte bytestring
        return bytes([s])

else:
    def bchr(s):
        # int byte value -> single-char str (py2 bytestring)
        return chr(s)

    def bstr(s):
        return str(s)

    def bord(s):
        # single-char byte -> int ordinal
        return ord(s)

    def bchar(s):
        # py2 indexing already yields a one-char bytestring, so identity
        return s
136 |
if PY3:
    # list-producing versions of the major Python iterating functions
    # (py3 builtins return lazy iterators; these force a concrete list)
    def lrange(*args, **kwargs):
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        return list(filter(*args, **kwargs))
else:
    import __builtin__
    # Python 2-builtin ranges produce lists
    # (the py2 builtins already return lists, so alias them directly)
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter
157 |
# In Python 3 you can no longer use .encode('hex') on a bytestring
# instead use the following on both platforms
import binascii
def hexlify(bdata):
    """Return the lowercase hex representation of *bdata* as a unicode string."""
    hexbytes = binascii.hexlify(bdata)
    return hexbytes.decode('ascii')
163 |
164 | # If you: import struct
165 | # Note: struct pack, unpack, unpack_from all *require* bytestring format
166 | # data all the way up to at least Python 2.7.5, Python 3 is okay with either
167 |
168 | # If you: import re
169 | # note: Python 3 "re" requires the pattern to be the exact same type as the data to be
170 | # searched ... but u"" is not allowed for the pattern itself only b""
171 | # Python 2.X allows the pattern to be any type and converts it to match the data
172 | # and returns the same type as the data
173 |
# convert string to be utf-8 encoded
def utf8_str(p, enc='utf-8'):
    """Return *p* as a utf-8 encoded bytestring.

    None passes through; unicode is encoded; bytes in another encoding
    (*enc*) are transcoded to utf-8; utf-8 bytes are returned unchanged.
    """
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode('utf-8')
    if enc == 'utf-8':
        return p
    return p.decode(enc).encode('utf-8')
183 |
# convert string to be unicode encoded
def unicode_str(p, enc='utf-8'):
    """Return *p* as unicode, decoding bytes with *enc*; None passes through."""
    if p is None:
        return None
    return p if isinstance(p, text_type) else p.decode(enc)
191 |
# All 7-bit ASCII characters.
ASCII_CHARS = {chr(code) for code in range(128)}
# ASCII characters that may appear unescaped in an IRI.
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789#_.-/~')
# ASCII characters that must be percent-escaped by quoteurl().
IRI_UNSAFE = ASCII_CHARS - URL_SAFE
197 |
# returns a quoted IRI (not a URI)
def quoteurl(href):
    """Percent-escape the IRI-unsafe ASCII characters of *href*.

    Bytes input is decoded as utf-8 first; non-ASCII characters are left
    as-is (IRI quoting, not URI quoting).
    """
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    quoted = []
    for ch in href:
        quoted.append("%%%02x" % ord(ch) if ch in IRI_UNSAFE else ch)
    return ''.join(quoted)
208 |
# unquotes url/iri
def unquoteurl(href):
    """Return *href* with percent-escapes expanded; bytes are decoded as utf-8 first."""
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return unquote(href)
215 |
# unescape html
def unescapeit(sval):
    """Expand HTML character entities in *sval* via the module-level ``_h`` helper."""
    # _h is created elsewhere in this module (not visible here); presumably an
    # HTMLParser/html helper instance — confirm at the top of the file.
    return _h.unescape(sval)
219 |
# Python 2.X commandline parsing under Windows has been horribly broken for years!
# Use the following code to emulate full unicode commandline parsing on Python 2
# ie. To get sys.argv arguments and properly encode them as unicode

def unicode_argv():
    """Return sys.argv as a list of unicode strings on both Python 2 and 3.

    On Python 3 the interpreter already provides unicode argv. On Python 2 +
    Windows the native Windows API is queried directly; elsewhere each byte
    argument is decoded with the best available encoding guess.
    """
    global iswindows
    global PY3
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            # (the full command line includes them; sys.argv does not).
            start = argc.value - len(sys.argv)
            return [argv[i] for i in
                    range(start, argc.value)]
        # this should never happen
        return None
    else:
        argv = []
        # Fall back through stdin encoding -> filesystem encoding -> utf-8.
        argvencoding = sys.stdin.encoding
        if argvencoding is None:
            argvencoding = sys.getfilesystemencoding()
        if argvencoding is None:
            argvencoding = 'utf-8'
        for arg in sys.argv:
            if isinstance(arg, text_type):
                argv.append(arg)
            else:
                argv.append(arg.decode(argvencoding))
        return argv
268 |
269 |
# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    """Register 'cp65001' as an alias of utf-8 on Python 2 (no-op on Python 3)."""
    if not PY2:
        return
    try:
        codecs.lookup('cp65001')
    except LookupError:
        # Unknown codec: answer lookups for 'cp65001' with the utf-8 codec.
        codecs.register(
            lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
    return
279 |
--------------------------------------------------------------------------------
/lib/imghdr.py:
--------------------------------------------------------------------------------
1 | """Recognize image file formats based on their first few bytes."""
2 |
3 | # Python software and documentation are licensed under the
4 | # Python Software Foundation License Version 2.
5 |
6 | # Starting with Python 3.8.6, examples, recipes, and other code in
7 | # the documentation are dual licensed under the PSF License Version 2
8 | # and the Zero-Clause BSD license.
9 |
10 | # Some software incorporated into Python is under different licenses.
11 | # The licenses are listed with code falling under that license.
12 |
13 | # PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
14 | # --------------------------------------------
15 |
16 | # 1. This LICENSE AGREEMENT is between the Python Software Foundation
17 | # ("PSF"), and the Individual or Organization ("Licensee") accessing and
18 | # otherwise using this software ("Python") in source or binary form and
19 | # its associated documentation.
20 |
21 | # 2. Subject to the terms and conditions of this License Agreement, PSF hereby
22 | # grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
23 | # analyze, test, perform and/or display publicly, prepare derivative works,
24 | # distribute, and otherwise use Python alone or in any derivative version,
25 | # provided, however, that PSF's License Agreement and PSF's notice of copyright,
26 | # i.e., "Copyright (c) 2001 Python Software Foundation; All Rights Reserved"
27 | # are retained in Python alone or in any derivative version prepared by Licensee.
28 |
29 | # 3. In the event Licensee prepares a derivative work that is based on
30 | # or incorporates Python or any part thereof, and wants to make
31 | # the derivative work available to others as provided herein, then
32 | # Licensee hereby agrees to include in any such work a brief summary of
33 | # the changes made to Python.
34 |
35 | # 4. PSF is making Python available to Licensee on an "AS IS"
36 | # basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
37 | # IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
38 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
39 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
40 | # INFRINGE ANY THIRD PARTY RIGHTS.
41 |
42 | # 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
43 | # FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
44 | # A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
45 | # OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
46 |
47 | # 6. This License Agreement will automatically terminate upon a material
48 | # breach of its terms and conditions.
49 |
50 | # 7. Nothing in this License Agreement shall be deemed to create any
51 | # relationship of agency, partnership, or joint venture between PSF and
52 | # Licensee. This License Agreement does not grant permission to use PSF
53 | # trademarks or trade name in a trademark sense to endorse or promote
54 | # products or services of Licensee, or any third party.
55 |
56 | # 8. By copying, installing or otherwise using Python, Licensee
57 | # agrees to be bound by the terms and conditions of this License
58 | # Agreement.
59 |
60 | from os import PathLike
61 |
62 | __all__ = ["what"]
63 |
64 | #-------------------------#
65 | # Recognize image headers #
66 | #-------------------------#
67 |
def what(file, h=None):
    """Detect the image format of *file* (a path or a seekable binary stream).

    If *h* is given it is used as the header bytes instead of reading the
    file. Returns a format name such as 'jpeg' or 'png', or None when no
    registered probe recognizes the header.
    """
    f = None
    try:
        if h is None:
            if isinstance(file, (str, PathLike)):
                f = open(file, 'rb')
                h = f.read(32)
            else:
                # File-like object: peek at the header, then restore the cursor.
                pos = file.tell()
                h = file.read(32)
                file.seek(pos)
        for probe in tests:
            fmt = probe(h, f)
            if fmt:
                return fmt
        return None
    finally:
        if f:
            f.close()
86 |
87 |
88 | #---------------------------------#
89 | # Subroutines per image file type #
90 | #---------------------------------#
91 |
92 | tests = []
93 |
94 | def test_jpeg(h, f):
95 | """JPEG data in JFIF or Exif format"""
96 | if h[6:10] in (b'JFIF', b'Exif'):
97 | return 'jpeg'
98 |
99 | tests.append(test_jpeg)
100 |
101 | def test_png(h, f):
102 | if h.startswith(b'\211PNG\r\n\032\n'):
103 | return 'png'
104 |
105 | tests.append(test_png)
106 |
107 | def test_gif(h, f):
108 | """GIF ('87 and '89 variants)"""
109 | if h[:6] in (b'GIF87a', b'GIF89a'):
110 | return 'gif'
111 |
112 | tests.append(test_gif)
113 |
114 | def test_tiff(h, f):
115 | """TIFF (can be in Motorola or Intel byte order)"""
116 | if h[:2] in (b'MM', b'II'):
117 | return 'tiff'
118 |
119 | tests.append(test_tiff)
120 |
121 | def test_rgb(h, f):
122 | """SGI image library"""
123 | if h.startswith(b'\001\332'):
124 | return 'rgb'
125 |
126 | tests.append(test_rgb)
127 |
128 | def test_pbm(h, f):
129 | """PBM (portable bitmap)"""
130 | if len(h) >= 3 and \
131 | h[0] == ord(b'P') and h[1] in b'14' and h[2] in b' \t\n\r':
132 | return 'pbm'
133 |
134 | tests.append(test_pbm)
135 |
136 | def test_pgm(h, f):
137 | """PGM (portable graymap)"""
138 | if len(h) >= 3 and \
139 | h[0] == ord(b'P') and h[1] in b'25' and h[2] in b' \t\n\r':
140 | return 'pgm'
141 |
142 | tests.append(test_pgm)
143 |
144 | def test_ppm(h, f):
145 | """PPM (portable pixmap)"""
146 | if len(h) >= 3 and \
147 | h[0] == ord(b'P') and h[1] in b'36' and h[2] in b' \t\n\r':
148 | return 'ppm'
149 |
150 | tests.append(test_ppm)
151 |
152 | def test_rast(h, f):
153 | """Sun raster file"""
154 | if h.startswith(b'\x59\xA6\x6A\x95'):
155 | return 'rast'
156 |
157 | tests.append(test_rast)
158 |
159 | def test_xbm(h, f):
160 | """X bitmap (X10 or X11)"""
161 | if h.startswith(b'#define '):
162 | return 'xbm'
163 |
164 | tests.append(test_xbm)
165 |
166 | def test_bmp(h, f):
167 | if h.startswith(b'BM'):
168 | return 'bmp'
169 |
170 | tests.append(test_bmp)
171 |
172 | def test_webp(h, f):
173 | if h.startswith(b'RIFF') and h[8:12] == b'WEBP':
174 | return 'webp'
175 |
176 | tests.append(test_webp)
177 |
178 | def test_exr(h, f):
179 | if h.startswith(b'\x76\x2f\x31\x01'):
180 | return 'exr'
181 |
182 | tests.append(test_exr)
183 |
184 | #--------------------#
185 | # Small test program #
186 | #--------------------#
187 |
def test():
    """Command-line driver: identify the files/directories named on argv.

    A leading '-r' flag requests recursion into subdirectories; with no
    arguments the current directory is scanned.
    """
    import sys
    recursive = 0
    if sys.argv[1:] and sys.argv[1] == '-r':
        del sys.argv[1:2]
        recursive = 1
    try:
        targets = sys.argv[1:] if sys.argv[1:] else ['.']
        testall(targets, recursive, 1)
    except KeyboardInterrupt:
        sys.stderr.write('\n[Interrupted]\n')
        sys.exit(1)
202 |
def testall(list, recursive, toplevel):
    """Print the detected image type for every name in *list*.

    Directories are descended into when *recursive* or *toplevel* is set.
    """
    import sys
    import os
    for filename in list:
        if not os.path.isdir(filename):
            print(filename + ':', end=' ')
            sys.stdout.flush()
            try:
                print(what(filename))
            except OSError:
                print('*** not found ***')
            continue
        print(filename + '/:', end=' ')
        if not (recursive or toplevel):
            print('*** directory (use -r) ***')
            continue
        print('recursing down:')
        import glob
        # glob.escape protects directory names containing wildcard characters.
        names = glob.glob(os.path.join(glob.escape(filename), '*'))
        testall(names, recursive, 0)

if __name__ == '__main__':
    test()
226 |
--------------------------------------------------------------------------------
/lib/mobi_cover.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import unicode_str
8 |
9 | from .unipath import pathof
10 | import os
11 | from . import imghdr
12 |
13 | import struct
14 | # note: struct pack, unpack, unpack_from all require bytestring format
15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
16 |
USE_SVG_WRAPPER = True
""" Set to True to use svg wrapper for default. """

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

# NOTE(review): "FINENAME" is presumably a typo for "FILENAME"; left as-is
# because other modules may reference this constant by name.
COVER_PAGE_FINENAME = 'cover_page.xhtml'
""" The name for the cover page. """

DEFAULT_TITLE = 'Cover'
""" The default title for the cover page. """

MAX_WIDTH = 4096
""" The max width for the svg cover page. """

MAX_HEIGHT = 4096
""" The max height for the svg cover page. """
34 |
35 |
def get_image_type(imgname, imgdata=None):
    """Return the image format name of *imgname* (or raw *imgdata*), e.g. 'jpeg'.

    Falls back to sniffing bare SOI/EOI JPEG markers that imghdr does not
    recognize. Returns None when the format cannot be determined.
    """
    imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))

    if imgtype is None:
        # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are
        # some with only the magic JPEG bytes out there; ImageMagick handles
        # those, so accept them here as well.
        if imgdata is None:
            with open(pathof(imgname), 'rb') as f:
                imgdata = f.read()
        if imgdata[0:2] == b'\xFF\xD8':
            # Trim trailing NUL padding before checking the EOI marker.
            end = len(imgdata)
            while imgdata[end-1:end] == b'\x00':
                end -= 1
            # Be extra safe: require the JPEG end-of-image trailer too.
            if imgdata[end-2:end] == b'\xFF\xD9':
                imgtype = "jpeg"
    return imgtype
55 |
56 |
def get_image_size(imgname, imgdata=None):
    '''Determine the image type of imgname (or imgdata) and return its size.

    Returns a (width, height) tuple, or None when the header is too short,
    the format is unrecognized, or the dimensions cannot be parsed.

    Originally,
    Determine the image type of fhandle and return its size.
    from draco'''
    fhandle = None
    if imgdata is None:
        fhandle = open(pathof(imgname), 'rb')
        head = fhandle.read(24)
    else:
        head = imgdata[0:24]
    try:
        if len(head) != 24:
            return

        imgtype = get_image_type(imgname, imgdata)
        if imgtype == 'png':
            # Verify the remainder of the PNG signature before trusting IHDR.
            check = struct.unpack(b'>i', head[4:8])[0]
            if check != 0x0d0a1a0a:
                return
            width, height = struct.unpack(b'>ii', head[16:24])
        elif imgtype == 'gif':
            # GIF logical screen descriptor: two little-endian 16-bit values.
            width, height = struct.unpack(b'<HH', head[6:10])
        elif imgtype == 'jpeg' and imgdata is None:
            # Walk the JPEG marker segments in the file until a SOFn header.
            try:
                fhandle.seek(0)  # Read 0xff next
                size = 2
                ftype = 0
                while not 0xc0 <= ftype <= 0xcf:
                    fhandle.seek(size, 1)
                    byte = fhandle.read(1)
                    while ord(byte) == 0xff:
                        byte = fhandle.read(1)
                    ftype = ord(byte)
                    size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
                # We are at a SOFn block
                fhandle.seek(1, 1)  # Skip `precision' byte.
                height, width = struct.unpack(b'>HH', fhandle.read(4))
            except Exception:  # IGNORE:W0703
                return
        elif imgtype == 'jpeg' and imgdata is not None:
            # Same SOFn walk as above, but over the in-memory buffer.
            try:
                pos = 0
                size = 2
                ftype = 0
                while not 0xc0 <= ftype <= 0xcf:
                    pos += size
                    byte = imgdata[pos:pos+1]
                    pos += 1
                    while ord(byte) == 0xff:
                        byte = imgdata[pos:pos+1]
                        pos += 1
                    ftype = ord(byte)
                    size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
                    pos += 2
                # We are at a SOFn block
                pos += 1  # Skip `precision' byte.
                height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
                pos += 4
            except Exception:  # IGNORE:W0703
                return
        else:
            return
        return width, height
    finally:
        # Fix: the file handle was previously leaked on every call.
        if fhandle is not None:
            fhandle.close()
120 |
# XXX experimental
class CoverProcessor(object):

    """Create a cover page.

    """
    def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
        # files: unpack-directory layout object (k8text/k8images/... paths)
        # metadata: dict of metadata value lists (Title, Language, CoverOffset, ...)
        # rscnames: resource file names indexed by resource number
        # imgname/imgdata: explicit cover image name and/or raw bytes, if known
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        self.cover_page = COVER_PAGE_FINENAME
        self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
        self.lang = metadata.get('Language', ['en'])[0]
        # This should ensure that if the methods to find the cover image's
        # dimensions should fail for any reason, the SVG routine will not be used.
        [self.width, self.height] = (-1,-1)
        if FORCE_DEFAULT_TITLE:
            self.title = DEFAULT_TITLE
        else:
            self.title = metadata.get('Title', [DEFAULT_TITLE])[0]

        self.cover_image = None
        if imgname is not None:
            self.cover_image = imgname
        elif 'CoverOffset' in metadata:
            # CoverOffset indexes into rscnames to locate the cover resource.
            imageNumber = int(metadata['CoverOffset'][0])
            cover_image = self.rscnames[imageNumber]
            if cover_image is not None:
                self.cover_image = cover_image
            else:
                print('Warning: Cannot identify the cover image.')
        if self.use_svg:
            try:
                if imgdata is None:
                    fname = os.path.join(files.imgdir, self.cover_image)
                    [self.width, self.height] = get_image_size(fname)
                else:
                    [self.width, self.height] = get_image_size(None, imgdata)
            except:
                # Any failure to measure the image disables the SVG wrapper.
                self.use_svg = False
            width = self.width
            height = self.height
            if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
                self.use_svg = False
        return

    def getImageName(self):
        """Return the cover image file name (or None if not identified)."""
        return self.cover_image

    def getXHTMLName(self):
        """Return the file name of the generated cover page."""
        return self.cover_page

    def buildXHTML(self):
        # NOTE(review): the markup string literals in this method appear
        # truncated in this copy of the file (tag text stripped, some lines
        # missing); verify the generated XHTML against upstream KindleUnpack
        # before relying on it.
        print('Building a cover page.')
        files = self.files
        cover_image = self.cover_image
        title = self.title
        lang = self.lang

        # Path from the text directory to the image, with forward slashes.
        image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
        image_path = os.path.join(image_dir, cover_image).replace('\\', '/')

        if not self.use_svg:
            data = ''
            data += ''
            data += '\n'.format(lang)
            data += '\n{:s} \n'.format(title)
            data += '\n\n'
            data += '\n'
            data += '
\n'.format(image_path)
            data += '
\n'
        else:
            width = self.width
            height = self.height
            # SVG viewBox sized to the measured cover image dimensions.
            viewBox = "0 0 {0:d} {1:d}".format(width, height)

            data = ''
            data += ''
            data += '\n'.format(lang)
            data += '\n {:s} \n'.format(title)
            data += '\n\n'
            data += '\n \n'
            data += ' \n'.format(viewBox)
            data += ' \n'.format(height, width, image_path)
            data += ' \n'
            data += '
\n\n'
        return data

    def writeXHTML(self):
        """Write the cover page to the text directory, replacing any existing one."""
        files = self.files
        cover_page = self.cover_page

        data = self.buildXHTML()

        outfile = os.path.join(files.k8text, cover_page)
        if os.path.exists(pathof(outfile)):
            print('Warning: {:s} already exists.'.format(cover_page))
            os.remove(pathof(outfile))
        with open(pathof(outfile), 'wb') as f:
            f.write(data.encode('utf-8'))
        return

    def guide_toxml(self):
        """Return the OPF guide entry for the cover page."""
        # NOTE(review): the format string below has no placeholders in this
        # copy, yet two arguments are supplied — the literal looks stripped;
        # confirm against upstream.
        files = self.files
        text_dir = os.path.relpath(files.k8text, files.k8oebps)
        data = ' \n'.format(
            text_dir, self.cover_page)
        return data
239 |
--------------------------------------------------------------------------------
/lib/mobi_dict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
8 |
if PY2:
    # Python 2: lazy xrange as range; array.array type codes must be bytes.
    range = xrange
    array_format = b'B'
if PY3:
    # Python 3: chr already returns unicode (no unichr); type codes are str.
    unichr = chr
    array_format = "B"
15 |
16 | import array
17 |
18 | import struct
19 | # note: struct pack, unpack, unpack_from all require bytestring format
20 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
21 |
22 | from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
23 | from .mobi_utils import toHex
24 |
#python 3.9 dropped support for array tostring()
def convert_to_bytes(ar):
    """Return the raw contents of array *ar* as a bytestring, portably."""
    return ar.tostring() if PY2 else ar.tobytes()

# Master switch for the verbose dictionary-parsing trace output below.
DEBUG_DICT = False
32 |
class InflectionData(object):
    """Wraps one or more raw inflection data sections and resolves entry
    lookups across them as if they formed one continuous table."""

    def __init__(self, infldatas):
        self.infldatas = infldatas
        self.starts = []
        self.counts = []
        for section in self.infldatas:
            # Each section stores its offset table start at 0x14 and its
            # entry count at 0x18, both big-endian 32-bit.
            tbl_start, = struct.unpack_from(b'>L', section, 0x14)
            tbl_count, = struct.unpack_from(b'>L', section, 0x18)
            self.starts.append(tbl_start)
            self.counts.append(tbl_count)

    def lookup(self, lookupvalue):
        """Map a global entry index to (local index, start, count, section data)."""
        remaining = lookupvalue
        section = 0
        while remaining >= self.counts[section]:
            remaining -= self.counts[section]
            section += 1
            if section == len(self.counts):
                # Out of range: report and fall back to the first section.
                print("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return remaining, self.starts[section], self.counts[section], self.infldatas[section]

    def offsets(self, value):
        """Return (offset, nextOffset, data) for entry *value*.

        nextOffset is None when *value* is the last entry of its section.
        """
        local, tbl_start, tbl_count, data = self.lookup(value)
        offset, = struct.unpack_from(b'>H', data, tbl_start + 4 + (2 * local))
        nextOffset = None
        if local + 1 < tbl_count:
            nextOffset, = struct.unpack_from(b'>H', data, tbl_start + 4 + (2 * (local + 1)))
        return offset, nextOffset, data
64 |
65 |
class dictSupport(object):
    """Parse a MOBI dictionary's orthographic and inflection indexes and build
    a map of raw-text positions to the index markup to inject there."""

    def __init__(self, mh, sect):
        # mh: mobi header object supplying the orth/infl index section numbers
        # sect: section loader with loadSection(n) -> bytes
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        "read INDX header"
        # Returns (header dict, ordt1, ordt2), or False if the magic is wrong.
        if not data[:4] == b'INDX':
            print("Warning: index section is not INDX")
            return False
        # Thirteen big-endian 32-bit fields follow the 4-byte INDX magic.
        words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
        header['otype'] = otype
        header['oentries'] = oentries

        if DEBUG_DICT:
            print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))

        if header['code'] == 0xfdea or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if DEBUG_DICT:
            print("parsed INDX header:")
            for key in header:
                print(key, "%x" % header[key],)
            print("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        """Build and return {text position: markup bytes} for the dictionary.

        NOTE(review): several byte-string literals below appear truncated in
        this copy of the file (idx-entry tag text stripped); verify them
        against upstream KindleUnpack before relying on the emitted markup.
        """
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        # 0xFFFFFFFF marks "no index present".
        if metaOrthIndex != 0xFFFFFFFF:
            print("Info: Document contains orthographic index, handle as dictionary")
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                # Inflection data sections follow the inflection index header.
                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)

            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            orthIndexCount = idxhdr['count']
            print("orthIndexCount is", orthIndexCount)
            if DEBUG_DICT:
                print("orthTagTable: %s" % tagTable)
            if hordt2 is not None:
                print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
            hasEntryLength = self.hasTag(tagTable, 0x02)
            if not hasEntryLength:
                print("Info: Index doesn't contain entry length tags")

            print("Read dictionary index data")
            for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                # IDXT: table of 16-bit offsets, one per entry in this section.
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    # Entry starts with a 1-byte length followed by the word text.
                    textLength = ord(data[startPos:startPos+1])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        # Remap the text through the ORDT2 lookup table
                        # (16-bit offsets when otype == 0, 8-bit otherwise).
                        utext = u""
                        if idxhdr['otype'] == 0:
                            pattern = b'>H'
                            inc = 2
                        else:
                            pattern = b'>B'
                            inc = 1
                        pos = 0
                        while pos < textLength:
                            off, = struct.unpack_from(pattern, text, pos)
                            if off < len(hordt2):
                                utext += unichr(hordt2[off])
                            else:
                                utext += unichr(off)
                            pos += inc
                        text = utext.encode('utf-8')

                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    if 0x01 in tagMap:
                        if decodeInflection and 0x2a in tagMap:
                            inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
                                                                        dinfl, inflNameData, tagMap[0x2a])
                        else:
                            inflectionGroups = b''
                        assert len(tagMap[0x01]) == 1
                        entryStartPosition = tagMap[0x01][0]
                        if hasEntryLength:
                            # The idx:entry attribute "scriptable" must be present to create entry length tags.
                            ml = b'' + inflectionGroups + b' '
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
                            else:
                                positionMap[entryStartPosition] = ml
                            assert len(tagMap[0x02]) == 1
                            entryEndPosition = entryStartPosition + tagMap[0x02][0]
                            if entryEndPosition in positionMap:
                                # Closing markup is prepended so it precedes any
                                # opening markup already queued at this position.
                                positionMap[entryEndPosition] = b" " + positionMap[entryEndPosition]
                            else:
                                positionMap[entryEndPosition] = b""

                        else:
                            indexTags = b'\n\n' + inflectionGroups + b' \n'
                            if entryStartPosition in positionMap:
                                positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
                            else:
                                positionMap[entryStartPosition] = indexTags
        return positionMap

    def hasTag(self, tagTable, tag):
        '''
        Test if tag table contains given tag.

        @param tagTable: The tag table.
        @param tag: The tag to search.
        @return: True if tag table contains given tag; False otherwise.
        '''
        for currentTag, _, _, _ in tagTable:
            if currentTag == tag:
                return True
        return False

    def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
        '''
        Create string which contains the inflection groups with inflection rules as mobipocket tags.

        @param mainEntry: The word to inflect.
        @param controlByteCount: The number of control bytes.
        @param tagTable: The tag table.
        @param data: The Inflection data object to properly select the right inflection data section to use
        @param inflectionNames: The inflection rule name data.
        @param groupList: The list of inflection groups to process.
        @return: String with inflection groups and rules or empty string if required tags are not available.
        '''
        result = b""
        for value in groupList:
            offset, nextOffset, data = dinfl.offsets(value)

            # First byte seems to be always 0x00 and must be skipped.
            assert ord(data[offset:offset+1]) == 0x00
            tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)

            # Make sure that the required tags are available.
            if 0x05 not in tagMap:
                print("Error: Required tag 0x05 not found in tagMap")
                # NOTE(review): returns a unicode str here but bytes everywhere
                # else in this method; callers concatenate the result with
                # bytes — this looks like a latent type bug.
                return ""
            if 0x1a not in tagMap:
                print("Error: Required tag 0x1a not found in tagMap")
                return b''

            result += b''

            for i in range(len(tagMap[0x05])):

                # Get name of inflection rule.
                value = tagMap[0x05][i]
                consumed, textLength = getVariableWidthValue(inflectionNames, value)
                inflectionName = inflectionNames[value+consumed:value+consumed+textLength]

                # Get and apply inflection rule across possibly multiple inflection data sections
                value = tagMap[0x1a][i]
                rvalue, start, count, data = dinfl.lookup(value)
                offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
                textLength = ord(data[offset:offset+1])
                inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
                if inflection is not None:
                    result += b' '

            result += b' '
        return result

    def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
        '''
        Apply inflection rule.

        @param mainEntry: The word to inflect.
        @param inflectionRuleData: The inflection rules.
        @param start: The start position of the inflection rule to use.
        @param end: The end position of the inflection rule to use.
        @return: The string with the inflected word or None if an error occurs.
        '''
        # The rule is a byte program: 0x01-0x04 select an edit mode/cursor
        # position, 0x0a-0x13 move the cursor backwards, and bytes > 0x13 are
        # literal characters to insert or verify-delete at the cursor.
        mode = -1
        byteArray = array.array(array_format, mainEntry)
        position = len(byteArray)
        for charOffset in range(start, end):
            char = inflectionRuleData[charOffset:charOffset+1]
            abyte = ord(char)
            if abyte >= 0x0a and abyte <= 0x13:
                # Move cursor backwards
                offset = abyte - 0x0a
                if mode not in [0x02, 0x03]:
                    mode = 0x02
                    position = len(byteArray)
                position -= offset
            elif abyte > 0x13:
                if mode == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                elif position == -1:
                    print("Error: Unexpected first byte %i of inflection rule" % abyte)
                    return None
                else:
                    if mode == 0x01:
                        # Insert at word start
                        byteArray.insert(position, abyte)
                        position += 1
                    elif mode == 0x02:
                        # Insert at word end
                        byteArray.insert(position, abyte)
                    elif mode == 0x03:
                        # Delete at word end
                        position -= 1
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    elif mode == 0x04:
                        # Delete at word start
                        deleted = byteArray.pop(position)
                        if bchr(deleted) != char:
                            if DEBUG_DICT:
                                # NOTE(review): debug label says 0x03 but this is the 0x04 branch.
                                print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
                            print("Error: Delete operation of inflection rule failed")
                            return None
                    else:
                        print("Error: Inflection rule mode %x is not implemented" % mode)
                        return None
            elif abyte == 0x01:
                # Insert at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                mode = abyte
            elif abyte == 0x02:
                # Insert at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x03:
                # Delete at word end
                if mode not in [0x02, 0x03]:
                    position = len(byteArray)
                mode = abyte
            elif abyte == 0x04:
                # Delete at word start
                if mode not in [0x01, 0x04]:
                    position = 0
                # Delete at word start
                mode = abyte
            else:
                print("Error: Inflection rule mode %x is not implemented" % abyte)
                return None
        return utf8_str(convert_to_bytes(byteArray))
384 |
--------------------------------------------------------------------------------
/lib/mobi_html.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, utf8_str
8 |
9 | if PY2:
10 | range = xrange
11 |
12 | import re
13 | # note: re requires the pattern to be the exact same type as the data to be searched in python3
14 | # but u"" is not allowed for the pattern itself only b""
15 |
16 | from .mobi_utils import fromBase32
17 |
class HTMLProcessor:
    """Post-process the raw html text of an old-style (MOBI7) book.

    Inserts anchors for filepos targets, converts filepos links into hrefs,
    and rewrites recindex image references to the extracted image files.

    NOTE(review): several byte-string literals in this copy of the file look
    truncated (anchor/img/meta tag text stripped, leaving things like ' ' and
    '()'); verify them against the upstream source before relying on them.
    """

    def __init__(self, files, metadata, rscnames):
        # files: unpack output path helper; metadata: metadata dict;
        # rscnames: resource (image/font) file names indexed by record number.
        self.files = files
        self.metadata = metadata
        self.rscnames = rscnames
        # for original style mobis, default to including all image files in the opf manifest
        self.used = {}
        for name in rscnames:
            self.used[name] = 'used'

    def findAnchors(self, rawtext, indx_data, positionMap):
        # Collect every filepos target (from links in the text and from the
        # NCX index entries) and record an anchor for each in positionMap,
        # then splice the accumulated positionMap strings into the text.
        # process the raw text
        # find anchors...
        print("Find link anchors")
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE)
        # TEST NCX: merge in filepos from indx
        pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)]
        if indx_data:
            pos_indx = [e['pos'] for e in indx_data if e['pos']>0]
            pos_links = list(set(pos_links + pos_indx))

        for position in pos_links:
            # NOTE(review): the anchor literal below appears truncated to ' '
            # in this copy (presumably an <a id=...> literal was stripped); as
            # written, ' ' % position raises TypeError — verify upstream.
            if position in positionMap:
                positionMap[position] = positionMap[position] + utf8_str(' ' % position)
            else:
                positionMap[position] = utf8_str(' ' % position)

        # apply dictionary metadata and anchors
        print("Insert data into html")
        pos = 0
        lastPos = len(rawtext)
        dataList = []
        for end in sorted(positionMap.keys()):
            if end == 0 or end > lastPos:
                continue  # something's up - can't put a tag outside the document
            dataList.append(rawtext[pos:end])
            dataList.append(positionMap[end])
            pos = end
        dataList.append(rawtext[pos:])
        srctext = b"".join(dataList)
        rawtext = None
        dataList = None
        # keep the processed text for insertHREFS()
        self.srctext = srctext
        self.indx_data = indx_data
        return srctext

    def insertHREFS(self):
        # Convert filepos-style links into ordinary hrefs, drop empty anchors,
        # rewrite recindex image references, and (if a codec is known) splice
        # a charset meta element into the html header.
        srctext = self.srctext
        rscnames = self.rscnames
        metadata = self.metadata

        # put in the hrefs
        print("Insert hrefs into html")
        # There doesn't seem to be a standard, so search as best as we can

        # NOTE(review): the pattern and replacement literals below look
        # truncated in this copy (leading '<a' fragments stripped) — verify.
        link_pattern = re.compile(br''']*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE)
        srctext = link_pattern.sub(br''' ''', srctext)

        # remove empty anchors
        print("Remove empty anchors from html")
        srctext = re.sub(br" ",br"", srctext)
        srctext = re.sub(br"\s* ",br"", srctext)

        # convert image references
        print("Insert image references into html")
        # split string into image tag pieces and other pieces
        # NOTE(review): the split pattern is an empty group here — likely a
        # stripped '<img...>' pattern; verify upstream.
        image_pattern = re.compile(br'''()''', re.IGNORECASE)
        image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE)
        srcpieces = image_pattern.split(srctext)
        srctext = self.srctext = None

        # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext)
        for i in range(1, len(srcpieces), 2):
            tag = srcpieces[i]
            for m in image_index_pattern.finditer(tag):
                imageNumber = int(m.group(1))
                # recindex values are 1-based; rscnames is 0-based
                imageName = rscnames[imageNumber-1]
                if imageName is None:
                    print("Error: Referenced image %s was not recognized as a valid image" % imageNumber)
                else:
                    replacement = b'src="Images/' + utf8_str(imageName) + b'"'
                    tag = image_index_pattern.sub(replacement, tag, 1)
            srcpieces[i] = tag
        srctext = b"".join(srcpieces)

        # add in character set meta into the html header if needed
        if 'Codec' in metadata:
            # NOTE(review): the spliced-in literal is just b' ' here — the
            # original meta charset element appears stripped; verify upstream.
            srctext = srctext[0:12]+b' '+srctext[12:]
        return srctext, self.used
108 |
109 |
class XHTMLK8Processor:
    """Turn KF8 (MOBI8) skeleton parts and flows into final xhtml.

    Resolves kindle:pos:fid internal links, kindle:flow and kindle:embed
    resource links, strips kindlegen 'aid' attributes, rewrites
    data-AmznPageBreak attributes, and records used resources in self.used.

    NOTE(review): this copy of the file is damaged — at least three spans of
    the original source (roughly original lines 228-321, 395-413 and 426-438)
    were collapsed into single garbled lines inside buildXHTML, so this text
    does not compile as-is; restore from upstream before editing.
    """

    def __init__(self, rscnames, k8proc, viewport=None):
        # rscnames: resource file names indexed by resource number.
        # k8proc: the K8 processor holding the parts/flows to rewrite.
        # viewport: optional viewport meta content to inject — assumed to come
        # from book metadata, TODO confirm against caller.
        self.rscnames = rscnames
        self.k8proc = k8proc
        self.viewport = viewport
        self.used = {}

    def buildXHTML(self):
        # Rewrite every part and flow held by self.k8proc in place and
        # return the map of used resource names.

        # first need to update all links that are internal which
        # are based on positions within the xhtml files **BEFORE**
        # cutting and pasting any pieces into the xhtml text files

        # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml)
        # XXXX is the offset in records into divtbl
        # YYYYYYYYYYYY is a base32 number you add to the divtbl insertpos to get final position

        # pos:fid pattern
        # NOTE(review): this split pattern is an empty group in this copy —
        # presumably a stripped '<a...>' tag pattern; verify upstream.
        posfid_pattern = re.compile(br'''()''', re.IGNORECASE)
        posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''')

        parts = []
        print("Building proper xhtml for each file")
        for i in range(self.k8proc.getNumberOfParts()):
            part = self.k8proc.getPart(i)
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i)

            # internal links
            srcpieces = posfid_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in posfid_index_pattern.finditer(tag):
                        posfid = m.group(1)
                        offset = m.group(2)
                        filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset)
                        if idtag == b'':
                            replacement= b'"' + utf8_str(filename) + b'"'
                        else:
                            replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"'
                        tag = posfid_index_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts.append(part)

        # we are free to cut and paste as we see fit
        # we can safely remove all of the Kindlegen generated aid tags
        # change aid ids that are in k8proc.linked_aids to xhtml ids
        find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE)
        within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_aid_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in within_tag_aid_position_pattern.finditer(tag):
                        try:
                            aid = m.group(1)
                        except IndexError:
                            aid = None
                        replacement = b''
                        if aid in self.k8proc.linked_aids:
                            # something links to this aid, so keep it as an id
                            replacement = b' id="aid-' + aid + b'"'
                        tag = within_tag_aid_position_pattern.sub(replacement, tag, 1)
                    srcpieces[j] = tag
            part = b"".join(srcpieces)
            parts[i] = part

        # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags
        # with page-break-after style patterns
        find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE)
        within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''')
        for i in range(len(parts)):
            part = parts[i]
            srcpieces = find_tag_with_AmznPageBreak_pattern.split(part)
            for j in range(len(srcpieces)):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub(
                        lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag)
            part = b"".join(srcpieces)
            parts[i] = part

        # we have to handle substitutions for the flows pieces first as they may
        # be inlined into the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc)
        # kindle:embed:XXXX (used for fonts)

        flows = []
        flows.append(None)
        flowinfo = []
        flowinfo.append([None, None, None, None])

        # regular expression search patterns
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        tag_pattern = re.compile(br'''(<[^>]*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)

        url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE)
        url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE)
        font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE)
        url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE)
        url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE)

        for i in range(1, self.k8proc.getNumberOfFlows()):
            [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i)
            flowpart = self.k8proc.getFlow(i)

            # links to raster image files from image tags
            # image_pattern
            srcpieces = img_pattern.split(flowpart)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~228-321 (the flow
                # image/css/svg link rewriting and storing of flows) are missing
                # from this copy and were collapsed into this broken line.
                if tag.startswith(b']*>)''')
        flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE)
        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]
            # flow pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if tag.startswith(b'<'):
                    for m in flow_pattern.finditer(tag):
                        num = fromBase32(m.group(1))
                        if num > 0 and num < len(self.k8proc.flowinfo):
                            [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num)
                            flowpart = flows[num]
                            if fmt == b'inline':
                                # inline flows are spliced directly into the part
                                tag = flowpart
                            else:
                                replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"'
                                tag = flow_pattern.sub(replacement, tag, 1)
                                self.used[fnm] = 'used'
                        else:
                            print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num)
                    srcpieces[j] = tag
            part = b''.join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in style= attributes urls
        style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # replace urls in style attributes
            srcpieces = style_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                if b'kindle:embed' in tag:
                    for m in img_index_pattern.finditer(tag):
                        imageNumber = fromBase32(m.group(1))
                        # embed numbers are 1-based; rscnames is 0-based
                        imageName = self.rscnames[imageNumber-1]
                        # keep the original opening/closing separator characters
                        osep = m.group()[0:1]
                        csep = m.group()[-1:]
                        if imageName is not None:
                            replacement = osep + b'../Images/'+ utf8_str(imageName) + csep
                            self.used[imageName] = 'used'
                            tag = img_index_pattern.sub(replacement, tag, 1)
                        else:
                            print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag))
                    srcpieces[j] = tag
            part = b"".join(srcpieces)

            # store away modified version
            parts[i] = part

        # Handle any embedded raster images links in the xhtml text
        # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images)
        img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE)
        img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''')

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # links to raster image files
            # image_pattern
            srcpieces = img_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~395-413 (the img
                # tag rewrite loop body) were collapsed into this broken line.
                if tag.startswith(b' remove value="XX" attributes since these are illegal
        tag_pattern = re.compile(br'''(<[^>]*>)''')
        li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE)

        for i in range(len(parts)):
            part = parts[i]
            [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i]

            # tag pattern
            srcpieces = tag_pattern.split(part)
            for j in range(1, len(srcpieces),2):
                tag = srcpieces[j]
                # NOTE(review): damaged line — original lines ~426-438 (the li
                # value removal and the construction of injected_meta, used
                # below) were collapsed into this broken line.
                if tag.startswith(b' \n'
        viewport_pattern = re.compile(br''' ]*name\s*=\s*["'][^"'>]*viewport["'][^>]*>''', re.IGNORECASE)
        for i in range(len(parts)):
            part = parts[i]
            # only inject if a viewport meta item does not already exist in that part
            if not viewport_pattern.search(part):
                # NOTE(review): the b'' search target below looks truncated —
                # presumably the closing head tag; verify upstream.
                endheadpos = part.find(b'')
                if endheadpos >= 0:
                    part = part[0:endheadpos] + injected_meta + part[endheadpos:]
                    parts[i] = part

        self.k8proc.setFlows(flows)
        self.k8proc.setParts(parts)

        return self.used
454 |
--------------------------------------------------------------------------------
/lib/mobi_index.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, bchr, bstr, bord
8 | if PY2:
9 | range = xrange
10 |
11 | import struct
12 | # note: struct pack, unpack, unpack_from all require bytestring format
13 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
14 |
15 | from .mobi_utils import toHex
16 |
class MobiIndex:
    """Parser for MOBI INDX index section chains (NCX, skeleton, guide, ...)."""

    def __init__(self, sect, DEBUG=False):
        # sect: the sectionizer granting access to the PalmDB sections.
        self.sect = sect
        self.DEBUG = DEBUG

    def getIndexData(self, idx, label="Unknown"):
        """Parse the INDX chain whose main header is at section *idx*.

        @param idx: section number of the main INDX section, or 0xffffffff for none.
        @param label: human-readable label used in section descriptions.
        @return: tuple (outtbl, ctoc_text) where outtbl is a list of
                 [entry_text_bytes, tagMap] pairs and ctoc_text maps CTOC
                 offsets to their text.
        """
        sect = self.sect
        outtbl = []
        ctoc_text = {}
        if idx != 0xffffffff:
            sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
            data = sect.loadSection(idx)
            idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
            IndexCount = idxhdr['count']
            # handle the case of multiple sections used for CTOC
            rec_off = 0
            off = idx + IndexCount + 1
            for j in range(idxhdr['nctoc']):
                cdata = sect.loadSection(off + j)
                sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
                ctocdict = self.readCTOC(cdata)
                # offsets in later CTOC sections are biased by 0x10000 per section
                for k in ctocdict:
                    ctoc_text[k + rec_off] = ctocdict[k]
                rec_off += 0x10000
            tagSectionStart = idxhdr['len']
            controlByteCount, tagTable = readTagSection(tagSectionStart, data)
            if self.DEBUG:
                print("ControlByteCount is", controlByteCount)
                print("IndexCount is", IndexCount)
                print("TagTable: %s" % tagTable)
            for i in range(idx + 1, idx + 1 + IndexCount):
                sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
                data = sect.loadSection(i)
                hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
                idxtPos = hdrinfo['start']
                entryCount = hdrinfo['count']
                if self.DEBUG:
                    print(idxtPos, entryCount)
                # loop through to build up the IDXT position starts
                idxPositions = []
                for j in range(entryCount):
                    pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
                    idxPositions.append(pos)
                # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
                idxPositions.append(idxtPos)
                # for each entry in the IDXT build up the tagMap and any associated text
                for j in range(entryCount):
                    startPos = idxPositions[j]
                    endPos = idxPositions[j+1]
                    # entry starts with a 1-byte length followed by the entry text
                    textLength = ord(data[startPos:startPos+1])
                    text = data[startPos+1:startPos+1+textLength]
                    if hordt2 is not None:
                        # remap entry text through the ORDT2 table when present
                        text = b''.join(bchr(hordt2[bord(x)]) for x in text)
                    tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
                    outtbl.append([text, tagMap])
                    if self.DEBUG:
                        print(tagMap)
                        print(text)
        return outtbl, ctoc_text

    def parseINDXHeader(self, data):
        "read INDX header"
        if not data[:4] == b'INDX':
            print("Warning: index section is not INDX")
            return False
        words = (
                'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
                'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4)
        if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
            # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
            # them in the proper place in the header. They seem to be codepage 65002 which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings

            # so we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            assert(ocnt == 1)
            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if self.DEBUG:
            print("parsed INDX header:")
            for n in words:
                print(n, "%X" % header[n],)
            print("")
        return header, ordt1, ordt2

    def readCTOC(self, txtdata):
        # read all blocks from CTOC
        ctoc_data = {}
        offset = 0
        # NOTE(review): the next line is damaged in this copy — the original
        # loop header (iterate while offset is inside txtdata, break on a nul
        # byte, record idx_offs, and read the name length ilen as a variable
        # width value) was collapsed into this broken line, which leaves
        # idx_offs and ilen undefined below; restore from upstream.
        while offset next bytes: name
            name = txtdata[offset:offset+ilen]
            offset += ilen
            if self.DEBUG:
                print("name length is ", ilen)
                print(idx_offs, name)
            ctoc_data[idx_offs] = name
        return ctoc_data
141 |
142 |
def getVariableWidthValue(data, offset):
    '''
    Decode variable width value from given bytes.

    Bytes are consumed big-endian, seven payload bits at a time; a set
    high bit (0x80) marks the final byte of the value.

    @param data: The bytes to decode.
    @param offset: The start offset into data.
    @return: Tuple of consumed bytes count and decoded value.
    '''
    value = 0
    consumed = 0
    while True:
        byte = ord(data[offset + consumed:offset + consumed + 1])
        consumed += 1
        value = (value << 7) | (byte & 0x7f)
        if byte & 0x80:
            break
    return consumed, value
161 |
162 |
def readTagSection(start, data):
    '''
    Read tag section from given data.

    @param start: The start position in the data.
    @param data: The data to process.
    @return: Tuple of control byte count and list of tag 4-tuples
             (tag, values per entry, mask, end-of-control-bytes flag).
    '''
    tags = []
    # A tag section must begin with the TAGX magic; anything else yields
    # zero control bytes and no tags.
    if data[start:start + 4] != b"TAGX":
        return 0, tags
    # The 12-byte TAGX header: magic, first entry offset, control byte count.
    firstEntryOffset, controlByteCount = struct.unpack_from(b'>LL', data, start + 0x04)
    # Every entry after the header is four single-byte fields.
    for entryStart in range(12, firstEntryOffset, 4):
        pos = start + entryStart
        tags.append((
            ord(data[pos:pos + 1]),
            ord(data[pos + 1:pos + 2]),
            ord(data[pos + 2:pos + 3]),
            ord(data[pos + 3:pos + 4]),
        ))
    return controlByteCount, tags
182 |
183 |
def countSetBits(value, bits=8):
    '''
    Count the set bits in the given value.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits within the lowest *bits* bits.
    '''
    # Mask down to the requested width first (this also handles negative
    # values via Python's two's-complement semantics for &), then let
    # bin() do the per-bit work instead of a manual shift loop.
    return bin(value & ((1 << bits) - 1)).count('1')
198 |
199 |
def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table, a list of (tag, valuesPerEntry, mask,
                     endFlag) tuples as produced by readTagSection().
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    # variable width values begin immediately after the control bytes
    dataStart = startPos + controlByteCount

    # Pass 1: inspect the control bytes to work out, per tag, either how many
    # values follow (valueCount) or how many value bytes follow (valueBytes).
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # end-of-control-byte marker: move on to the next control byte
            controlByteIndex += 1
            continue
        cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
        if 0:
            print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))

        value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
        if value != 0:
            if value == mask:
                if countSetBits(mask) > 1:
                    # If all bits of masked value are set and the mask has more than one bit, a variable width value
                    # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                    # which will contain the corresponding variable width values.
                    consumed, value = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    tags.append((tag, None, value, valuesPerEntry))
                else:
                    # single-bit mask fully set: exactly one entry follows
                    tags.append((tag, 1, None, valuesPerEntry))
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                tags.append((tag, value, None, valuesPerEntry))
    # Pass 2: decode the variable width values for each recorded tag.
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount is not None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
        tagHashMap[tag] = values
    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        for char in entryData[dataStart:endPos]:
            if bord(char) != 0:
                print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
                if 0:
                    print("controlByteCount: %s" % controlByteCount)
                    print("tagTable: %s" % tagTable)
                    print("data: %s" % toHex(entryData[startPos:endPos]))
                    print("tagHashMap: %s" % tagHashMap)
                break

    return tagHashMap
277 |
--------------------------------------------------------------------------------
/lib/mobi_k8resc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supported >= python 2.7.
8 | """ set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
9 |
10 | if DEBUG_USE_ORDERED_DICTIONARY:
11 | from collections import OrderedDict as dict_
12 | else:
13 | dict_ = dict
14 |
15 | from .compatibility_utils import unicode_str
16 |
17 | from .mobi_utils import fromBase32
18 |
19 | _OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
20 | 'x-metadata', 'manifest', 'spine', 'tours', 'guide']
21 |
class K8RESCProcessor(object):
    """Parse the RESC (resource container) section of a KF8 book.

    Extracts the spine order, spine item attributes, cover name and extra
    OPF metadata so they can be merged into the rebuilt epub OPF.

    NOTE(review): this copy of the file is damaged — the comment-handling
    code in parseresc() and the closing-tag literal in taginfo_toxml() were
    garbled by tag stripping; restore those spots from upstream before use.
    """

    def __init__(self, data, debug=False):
        # data: the raw RESC section bytes, starting with a small header
        # before the first '<' of the embedded OPF-like xml.
        self._debug = debug
        self.resc = None           # unicode text of the RESC xml once parsed
        self.opos = 0              # current parse offset into self.resc
        self.extrameta = []        # [tname, tattr, tcontent] triples of extra metadata
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageattributes = {}
        self.spine_ppd = None
        # need3 indicate the book has fields which require epub3.
        # but the estimation of the source epub version from the fields is difficult.
        self.need3 = False
        self.package_ver = None
        self.extra_metadata = []
        self.refines_metadata = []
        self.extra_attributes = []
        # get header
        start_pos = data.find(b'<')
        self.resc_header = data[:start_pos]
        # get resc data length (base32 value between '=' and '&' in the header)
        start = self.resc_header.find(b'=') + 1
        end = self.resc_header.find(b'&', start)
        resc_size = 0
        if end > 0:
            resc_size = fromBase32(self.resc_header[start:end])
        resc_rawbytes = len(data) - start_pos
        if resc_rawbytes == resc_size:
            self.resc_length = resc_size
        else:
            # Most RESC has a nul string at its tail but some do not.
            end_pos = data.find(b'\x00', start_pos)
            if end_pos < 0:
                self.resc_length = resc_rawbytes
            else:
                self.resc_length = end_pos - start_pos
            if self.resc_length != resc_size:
                print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
        # now parse RESC after converting it to unicode from utf-8
        try:
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
        except UnicodeDecodeError:
            # fall back to latin-1 so parsing can still proceed
            self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
        self.parseData()

    def prepend_to_spine(self, key, idref, linear, properties):
        # Insert a spine entry at the front (used e.g. for a coverpage).
        self.spine_order = [key] + self.spine_order
        self.spine_idrefs[key] = idref
        attributes = {}
        if linear is not None:
            attributes['linear'] = linear
        if properties is not None:
            attributes['properties'] = properties
        self.spine_pageattributes[key] = attributes

    # RESC tag iterator
    def resc_tag_iter(self):
        # Yields (prefix, tname, tattr, tcontent) for every interesting tag,
        # tracking the nesting path of OPF parent tags in 'prefix'.
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(' \r\n')
            else:  # we have a tag
                ttype, tname, tattr = self.parsetag(tag)
                if ttype == 'begin':
                    tcontent = None
                    prefix.append(tname + '.')
                    if tname in _OPF_PARENT_TAGS:
                        yield ''.join(prefix), tname, tattr, tcontent
                    else:
                        # defer: attributes belong to the matching end tag
                        last_tattr = tattr
                else:  # single or end
                    if ttype == 'end':
                        prefix.pop()
                        tattr = last_tattr
                        last_tattr = None
                    if tname in _OPF_PARENT_TAGS:
                        tname += '-end'
                    yield ''.join(prefix), tname, tattr, tcontent
                    tcontent = None

    # now parse the RESC to extract spine and extra metadata info
    def parseData(self):
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            if self._debug:
                print("    Parsing RESC: ", prefix, tname, tattr, tcontent)
            if tname == 'package':
                self.package_ver = tattr.get('version', '2.0')
                package_prefix = tattr.get('prefix','')
                if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
                    self.need3 = True
            if tname == 'spine':
                # NOTE(review): 'page-progession-direction' is misspelled
                # ('progression') so this lookup may never match — verify
                # against upstream/kindlegen output before changing.
                self.spine_ppd = tattr.get('page-progession-direction', None)
                if self.spine_ppd is not None and self.spine_ppd == 'rtl':
                    self.need3 = True
            if tname == 'itemref':
                skelid = tattr.pop('skelid', None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = 'coverpage'
                    tattr['linear'] = 'no'
                self.spine_order.append(skelid)
                idref = tattr.pop('idref', None)
                if idref is not None:
                    # 'x_' prefix keeps RESC ids distinct from generated ids
                    idref = 'x_' + idref
                self.spine_idrefs[skelid] = idref
                if 'id' in tattr:
                    del tattr['id']
                # tattr["id"] = 'x_' + tattr["id"]
                if 'properties' in tattr:
                    self.need3 = True
                self.spine_pageattributes[skelid] = tattr
            if tname == 'meta' or tname.startswith('dc:'):
                if 'refines' in tattr or 'property' in tattr:
                    self.need3 = True
                if tattr.get('name','') == 'cover':
                    cover_name = tattr.get('content',None)
                    if cover_name is not None:
                        cover_name = 'x_' + cover_name
                    self.cover_name = cover_name
                else:
                    self.extrameta.append([tname, tattr, tcontent])

    # parse and return either leading text or the next tag
    def parseresc(self):
        p = self.opos
        if p >= len(self.resc):
            return None, None
        if self.resc[p] != '<':
            # plain text up to the next tag (or end of data)
            res = self.resc.find('<',p)
            if res == -1 :
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None
        # handle comment as a special case
        # NOTE(review): the next line is damaged in this copy — the original
        # comparison against the xml comment opener and the search for its
        # closer were collapsed into this broken line; restore from upstream.
        if self.resc[p:p+4] == '',p+1)
        if te != -1:
            te = te+2
        else:
            te = self.resc.find('>',p+1)
            ntb = self.resc.find('<',p+1)
            if ntb != -1 and ntb < te:
                # stray '<' before the tag closes: treat the span as text
                self.opos = ntb
                return self.resc[p:ntb], None
        self.opos = te + 1
        return None, self.resc[p:te+1]

    # parses tag to identify:  [tname, ttype, tattr]
    #    tname: tag name
    #    ttype: tag type ('begin', 'end' or 'single');
    #    tattr: dictionary of tag attributes
    def parsetag(self, s):
        p = 1
        tname = None
        ttype = None
        tattr = dict_()
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
            p += 1
        tname=s[b:p].lower()
        # some special cases
        if tname == '?xml':
            tname = 'xml'
        if tname == '!--':
            ttype = 'single'
            comment = s[p:-3].strip()
            tattr['comment'] = comment
        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    # quoted attribute value
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'"):
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    # unquoted attribute value
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        if ttype is None:
            ttype = 'begin'
            if s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    def taginfo_toxml(self, taginfo):
        # Serialize a (tname, tattr, tcontent) triple back into an xml string.
        res = []
        tname, tattr, tcontent = taginfo
        res.append('<' + tname)
        if tattr is not None:
            for key in tattr:
                res.append(' ' + key + '="'+tattr[key]+'"')
        if tcontent is not None:
            # NOTE(review): the closing-tag literal looks truncated here; the
            # '' before tname was presumably '</' before this copy was mangled.
            res.append('>' + tcontent + '' + tname + '>\n')
        else:
            res.append('/>\n')
        return "".join(res)

    def hasSpine(self):
        # True when the RESC carried any spine itemrefs.
        return len(self.spine_order) > 0

    def needEPUB3(self):
        # True when any parsed field requires epub3 output.
        return self.need3

    def hasRefines(self):
        # True when any extra metadata entry carries a 'refines' attribute.
        for [tname, tattr, tcontent] in self.extrameta:
            if 'refines' in tattr:
                return True
        return False

    def createMetadata(self, epubver):
        # Split the collected extra metadata into refines metadata, plain
        # extra metadata and (for epubver 'F') extra opf: attributes.
        for taginfo in self.extrameta:
            tname, tattr, tcontent = taginfo
            if 'refines' in tattr:
                if epubver == 'F' and 'property' in tattr:
                    attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
                    self.extra_attributes.append(attr)
                else:
                    tag = self.taginfo_toxml(taginfo)
                    self.refines_metadata.append(tag)
            else:
                tag = self.taginfo_toxml(taginfo)
                self.extra_metadata.append(tag)
272 |
--------------------------------------------------------------------------------
/lib/mobi_nav.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import unicode_str
8 | import os
9 | from .unipath import pathof
10 |
11 | import re
12 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
13 | # but u"" is not allowed for the pattern itself only b""
14 |
# Module tuning flags for navigation-document generation.
DEBUG_NAV = False

FORCE_DEFAULT_TITLE = False
""" Set to True to force to use the default title. """

# NOTE(review): "FINENAME" looks like a typo for "FILENAME"; kept as-is
# because other modules may reference this constant by name.
NAVIGATION_FINENAME = 'nav.xhtml'
""" The name for the navigation document. """

DEFAULT_TITLE = 'Navigation'
""" The default title for the navigation document. """
25 |
class NAVProcessor(object):
    """Builds the EPUB3 navigation document (nav.xhtml) from parsed NCX
    entries plus the mobi guide text.

    NOTE(review): several markup string literals in this class appear to
    have lost their tags in transit (empty templates, format strings with
    fewer placeholders than arguments); confirm against the upstream
    source before relying on the generated markup.
    """

    def __init__(self, files):
        # files: object exposing the output layout (k8text / k8oebps paths)
        self.files = files
        self.navname = NAVIGATION_FINENAME

    def buildLandmarks(self, guidetext):
        """Convert guide reference entries to the nav landmarks section.

        Returns '' when no reference yields a usable (type, title, href).
        """
        header = ''
        header += ' \n'
        header += ' Guide \n'
        header += ' \n'
        element = ' {:s} \n'
        footer = ''
        footer += ' \n'
        footer += ' \n'

        # maps guide reference types to epub:type landmark names;
        # None means the type has no epub3 equivalent and is dropped
        type_map = {
            'cover' : 'cover',
            'title-page' : 'title-page',
            # ?: 'frontmatter',
            'text' : 'bodymatter',
            # ?: 'backmatter',
            'toc' : 'toc',
            'loi' : 'loi',
            'lot' : 'lot',
            'preface' : 'preface',
            'bibliography' : 'bibliography',
            'index' : 'index',
            'glossary' : 'glossary',
            'acknowledgements' : 'acknowledgements',
            'colophon' : None,
            'copyright-page' : None,
            'dedication' : None,
            'epigraph' : None,
            'foreword' : None,
            'notes' : None
        }

        re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
        re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
        re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
        # hrefs in the guide are relative to k8oebps; links in nav.xhtml must
        # be relative to the text directory
        dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')

        data = ''
        # NOTE(review): this findall pattern is empty -- it looks like the
        # original reference-matching pattern was stripped; verify upstream
        references = re.findall(r'', unicode_str(guidetext), re.I)
        for reference in references:
            mo_type = re_type.search(reference)
            mo_title = re_title.search(reference)
            mo_link = re_link.search(reference)
            if mo_type is not None:
                type_ = type_map.get(mo_type.group(1), None)
            else:
                type_ = None
            if mo_title is not None:
                title = mo_title.group(1)
            else:
                title = None
            if mo_link is not None:
                link = mo_link.group(1)
            else:
                link = None

            if type_ is not None and title is not None and link is not None:
                link = os.path.relpath(link, dir_).replace('\\', '/')
                data += element.format(type_, link, title)
        if len(data) > 0:
            return header + data + footer
        else:
            return ''

    def buildTOC(self, indx_data):
        """Render the parsed NCX entry list as the nav toc section."""
        header = ''
        header += ' \n'
        header += ' Table of contents \n'
        footer = ' \n'

        # recursive part: walks one heading level, recursing into children
        # via each entry's child1/childn range; returns (xhtml, max_lvl, num)
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): callers unpack a 3-tuple, so this '' return
                # would raise on a malformed index -- confirm intent upstream
                return ''
            if DEBUG_NAV:
                print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
            xhtml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl

            indent1 = ' ' * (2 + lvl * 2)
            indent2 = ' ' * (3 + lvl * 2)
            xhtml += indent1 + '\n'
            for i in range(start, end):
                e = indx_data[i]
                htmlfile = e['filename']
                desttag = e['idtag']
                text = e['text']
                # only emit entries belonging to the current heading level
                if not e['hlvl'] == lvl:
                    continue
                num += 1
                if desttag == '':
                    link = htmlfile
                else:
                    link = '{:s}#{:s}'.format(htmlfile, desttag)
                xhtml += indent2 + ''
                entry = '{:s} '.format(link, text)
                xhtml += entry
                # recurs
                if e['child1'] >= 0:
                    xhtml += '\n'
                    xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                        e['child1'], e['childn'] + 1)
                    xhtml += xhtmlrec
                    xhtml += indent2
                # close entry
                xhtml += ' \n'
            xhtml += indent1 + ' \n'
            return xhtml, max_lvl, num

        data, max_lvl, num = recursINDX()
        if not len(indx_data) == num:
            print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
        return header + data + footer

    def buildNAV(self, ncx_data, guidetext, title, lang):
        """Assemble the complete nav.xhtml text: header + landmarks + toc + footer."""
        print("Building Navigation Document.")
        if FORCE_DEFAULT_TITLE:
            title = DEFAULT_TITLE
        nav_header = ''
        nav_header += '\n'
        nav_header += '\n'.format(lang)
        nav_header += '\n{:s} \n'.format(title)
        nav_header += ' \n'
        nav_header += '\n\n\n'
        nav_footer = '\n\n'

        landmarks = self.buildLandmarks(guidetext)
        toc = self.buildTOC(ncx_data)

        data = nav_header
        data += landmarks
        data += toc
        data += nav_footer
        return data

    def getNAVName(self):
        """Return the filename used for the navigation document."""
        return self.navname

    def writeNAV(self, ncx_data, guidetext, metadata):
        # build the xhtml
        # print("Write Navigation Document.")
        xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
        fname = os.path.join(self.files.k8text, self.navname)
        with open(pathof(fname), 'wb') as f:
            f.write(xhtml.encode('utf-8'))
188 |
--------------------------------------------------------------------------------
/lib/mobi_ncx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import os
8 | from .unipath import pathof
9 | from .compatibility_utils import unescapeit
10 |
11 |
12 | import re
13 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
14 | # but u"" is not allowed for the pattern itself only b""
15 |
16 | from xml.sax.saxutils import escape as xmlescape
17 |
18 | from .mobi_utils import toBase32
19 | from .mobi_index import MobiIndex
20 |
21 | DEBUG_NCX = False
22 |
class ncxExtract:
    """Extracts the NCX (table of contents) index from a mobi file and
    serializes it as a toc.ncx document, in both mobi7 and K8 flavours.

    NOTE(review): the triple-quoted ncx template literals below appear to
    have lost their XML tags in transit (mostly blank lines, '%' templates
    with fewer placeholders than arguments); confirm against the upstream
    source before relying on the generated documents.
    """

    def __init__(self, mh, files):
        # mh: parsed mobi header (provides sect, ncxidx, codec)
        self.mh = mh
        self.sect = self.mh.sect
        self.files = files
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        self.indx_data = None

    def parseNCX(self):
        """Decode the NCX INDX section into a list of entry dicts.

        Each dict carries position/length, display text, kind, heading
        level and parent/child links.  The list is cached on self.indx_data
        and returned; it is empty when no NCX index exists (0xffffffff).
        """
        indx_data = []
        # INDX tag id -> [result field name, slot within the tag's values]
        tag_fieldname_map = {
            1: ['pos',0],
            2: ['len',0],
            3: ['noffs',0],
            4: ['hlvl',0],
            5: ['koffs',0],
            6: ['pos_fid',0],
            21: ['parent',0],
            22: ['child1',0],
            23: ['childn',0]
        }
        if self.ncxidx != 0xffffffff:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                print(ctoc_text)
                print(outtbl)
            num = 0
            for [text, tagMap] in outtbl:
                # defaults used when a tag is absent from this entry
                tmp = {
                    'name': text.decode('utf-8'),
                    'pos': -1,
                    'len': 0,
                    'noffs': -1,
                    'text' : "Unknown Text",
                    'hlvl' : -1,
                    'kind' : "Unknown Kind",
                    'pos_fid' : None,
                    'parent' : -1,
                    'child1' : -1,
                    'childn' : -1,
                    'num' : num
                }
                for tag in tag_fieldname_map:
                    [fieldname, i] = tag_fieldname_map[tag]
                    if tag in tagMap:
                        fieldvalue = tagMap[tag][i]
                        if tag == 6:
                            # tag 6 is a (fid, offset) pair -> kindle position url
                            pos_fid = toBase32(fieldvalue,4).decode('utf-8')
                            fieldvalue2 = tagMap[tag][i+1]
                            pos_off = toBase32(fieldvalue2,10).decode('utf-8')
                            fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
                        tmp[fieldname] = fieldvalue
                        if tag == 3:
                            # tag 3 value is an offset into the CTOC text block
                            toctext = ctoc_text.get(fieldvalue, b'Unknown Text')
                            toctext = toctext.decode(self.mh.codec)
                            tmp['text'] = toctext
                        if tag == 5:
                            # tag 5 value is a CTOC offset for the entry kind
                            kindtext = ctoc_text.get(fieldvalue, b'Unknown Kind')
                            kindtext = kindtext.decode(self.mh.codec)
                            tmp['kind'] = kindtext
                indx_data.append(tmp)
                if DEBUG_NCX:
                    print("record number: ", num)
                    print("name: ", tmp['name'],)
                    print("position", tmp['pos']," length: ", tmp['len'])
                    print("text: ", tmp['text'])
                    print("kind: ", tmp['kind'])
                    print("heading level: ", tmp['hlvl'])
                    print("parent:", tmp['parent'])
                    print("first child: ",tmp['child1']," last child: ", tmp['childn'])
                    print("pos_fid is ", tmp['pos_fid'])
                    print("\n\n")
                num += 1
        self.indx_data = indx_data
        return indx_data

    def buildNCX(self, htmlfile, title, ident, lang):
        """Build a mobi7 toc.ncx document string; links use filepos anchors
        into the single html file."""
        indx_data = self.indx_data

        ncx_header = \
'''









%s


'''

        ncx_footer = \
'''

'''

        ncx_entry = \
'''

%s

'''

        # recursive part: emits the navPoints of one heading level and
        # recurses into children; returns (xml, max_lvl, num)
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning: missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): callers unpack a 3-tuple; this '' return
                # would raise for a malformed index -- confirm upstream
                return ''
            if DEBUG_NCX:
                print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = ' ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                if not e['hlvl'] == lvl:
                    continue
                # open entry
                num += 1
                link = '%s#filepos%d' % (htmlfile, e['pos'])
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                # recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                      e['child1'], e['childn'] + 1)
                    xml += xmlrec
                # close entry
                xml += indent + ' \n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title)))
        ncx = header + body + ncx_footer
        if not len(indx_data) == num:
            print("Warning: different number of entries in NCX", len(indx_data), num)
        return ncx

    def writeNCX(self, metadata):
        """Build the mobi7 ncx from cached indx_data and write toc.ncx."""
        # build the xml
        self.isNCX = True
        print("Write ncx")
        # htmlname = os.path.basename(self.files.outbase)
        # htmlname += '.html'
        htmlname = 'book.html'
        xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        # write the ncx file
        # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
        ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
        with open(pathof(ncxname), 'wb') as f:
            f.write(xml.encode('utf-8'))

    def buildK8NCX(self, indx_data, title, ident, lang):
        """Build a K8 (mobi8/epub) toc.ncx; links point at Text/<file>#<id>."""
        ncx_header = \
'''









%s


'''

        ncx_footer = \
'''

'''

        ncx_entry = \
'''

%s

'''

        # recursive part: same shape as buildNCX's walker but links into
        # the per-file Text/ layout of a K8 book
        def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
            if start>len(indx_data) or end>len(indx_data):
                print("Warning: missing INDX child entries", start, end, len(indx_data))
                # NOTE(review): see buildNCX -- '' return vs 3-tuple unpack
                return ''
            if DEBUG_NCX:
                print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
            xml = ''
            if start <= 0:
                start = 0
            if end <= 0:
                end = len(indx_data)
            if lvl > max_lvl:
                max_lvl = lvl
            indent = ' ' * (2 + lvl)

            for i in range(start, end):
                e = indx_data[i]
                htmlfile = e['filename']
                desttag = e['idtag']
                if not e['hlvl'] == lvl:
                    continue
                # open entry
                num += 1
                if desttag == '':
                    link = 'Text/%s' % htmlfile
                else:
                    link = 'Text/%s#%s' % (htmlfile, desttag)
                tagid = 'np_%d' % num
                entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
                entry = re.sub(re.compile('^', re.M), indent, entry, 0)
                xml += entry + '\n'
                # recurs
                if e['child1']>=0:
                    xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
                                                      e['child1'], e['childn'] + 1)
                    xml += xmlrec
                # close entry
                xml += indent + ' \n'
            return xml, max_lvl, num

        body, max_lvl, num = recursINDX()
        header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title)))
        ncx = header + body + ncx_footer
        if not len(indx_data) == num:
            print("Warning: different number of entries in NCX", len(indx_data), num)
        return ncx

    def writeK8NCX(self, ncx_data, metadata):
        """Build the K8 ncx from the supplied entry list and write toc.ncx."""
        # build the xml
        self.isNCX = True
        print("Write K8 ncx")
        xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
        bname = 'toc.ncx'
        ncxname = os.path.join(self.files.k8oebps,bname)
        with open(pathof(ncxname), 'wb') as f:
            f.write(xml.encode('utf-8'))
276 |
--------------------------------------------------------------------------------
/lib/mobi_pagemap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, unicode_str
8 |
# Py2 compatibility: substitute the lazy xrange so range behaves as on Py3.
if PY2:
    range = xrange
11 |
12 | import struct
13 | # note: struct pack, unpack, unpack_from all require bytestring format
14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
15 |
16 | import re
17 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
18 | # but u"" is not allowed for the pattern itself only b""
19 |
20 |
# Conversion table shared by both converters, ordered largest-to-smallest so
# a greedy scan always consumes the biggest numeral (including the
# subtractive pairs cm, cd, xc, xl, ix, iv) first.
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]

def int_to_roman(i):
    """Render an integer as a lowercase roman numeral by greedy subtraction."""
    numeral = ''
    left = i
    for glyph, worth in _TABLE:
        while worth <= left:
            left -= worth
            numeral += glyph
    return numeral

def roman_to_int(s):
    """Parse a lowercase roman numeral string back into an integer."""
    total = 0
    rest = s
    for glyph, worth in _TABLE:
        while rest.startswith(glyph):
            total += worth
            rest = rest[len(glyph):]
    return total
40 |
# Matches one parenthesized tuple such as "(12,r,3)" inside the page-name
# map specification string.
_pattern = r'''\(([^\)]*)\)'''
_tup_pattern = re.compile(_pattern,re.IGNORECASE)
43 |
44 |
def _parseNames(numpages, data):
    """Expand a page-map naming spec into one display name per page.

    data holds parenthesized tuples "(startpos,type,value)" where type is
    'r' (roman numerals counting up from value), 'a' (arabic numbers
    counting up) or 'c' (explicit '|'-separated names).  Each tuple
    (re)numbers pages from startpos to the last page, so later tuples
    override earlier ones.  Returns (pagenames, pageMap) where pageMap is
    the normalized "(..),(..)" spec string.
    """
    data = unicode_str(data)
    pagenames = [None] * numpages
    pageMap = ''
    for m in re.finditer(_tup_pattern, data):
        tup = m.group(1)
        if pageMap != '':
            pageMap += ','
        pageMap += '(' + tup + ')'
        spos, nametype, svalue = tup.split(",")
        # print(spos, nametype, svalue)
        if nametype not in ('a', 'r', 'c'):
            # previously an unknown type printed an error once per page and
            # then read an unbound/stale pname; report once and skip the tuple
            print("Error: unknown page numbering type", nametype)
            continue
        if nametype == 'a' or nametype == 'r':
            svalue = int(svalue)
        spos = int(spos)
        for i in range(spos - 1, numpages):
            if nametype == 'r':
                pname = int_to_roman(svalue)
                svalue += 1
            elif nametype == 'a':
                pname = "%s" % svalue
                svalue += 1
            else:
                # 'c': take names from the '|'-separated list; the last
                # name repeats for any remaining pages
                sp = svalue.find('|')
                if sp == -1:
                    pname = svalue
                else:
                    pname = svalue[0:sp]
                    svalue = svalue[sp+1:]
            pagenames[i] = pname
    return pagenames, pageMap
79 |
80 |
class PageMapProcessor:
    """Parses a mobi page-map record into per-page names and offsets, and
    can emit the KF8 page-map.xml and Amazon .apnx representations."""

    def __init__(self, mh, data):
        """Decode the raw page-map record *data*.

        Layout (as read below): a u32 revision-string length at 0x10, the
        revision string, then four u16 fields (unknown, name-spec length,
        page count, offset width in bits), the name spec, and finally one
        offset per page (u32, or u16 when pm_bits == 16).
        """
        self.mh = mh
        self.data = data
        self.pagenames = []
        self.pageoffsets = []
        self.pageMap = ''
        self.pm_len = 0
        self.pm_nn = 0
        # NOTE(review): initialized as pn_bits but assigned below as
        # pm_bits -- looks like a typo; harmless but confirm no caller
        # reads pn_bits
        self.pn_bits = 0
        self.pmoff = None
        self.pmstr = ''
        print("Extracting Page Map Information")
        rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
        # skip over header, revision string length data, and revision string
        ptr = 0x14 + rev_len
        pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr)
        # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
        self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
        self.pmoff = self.data[ptr+8+self.pm_len:]
        offsize = b">L"
        offwidth = 4
        if self.pm_bits == 16:
            offsize = b">H"
            offwidth = 2
        ptr = 0
        for i in range(self.pm_nn):
            od, = struct.unpack_from(offsize, self.pmoff, ptr)
            ptr += offwidth
            self.pageoffsets.append(od)
        self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)

    def getPageMap(self):
        """Return the normalized page-name spec string."""
        return self.pageMap

    def getNames(self):
        """Return the per-page display names (entries may be None)."""
        return self.pagenames

    def getOffsets(self):
        """Return the per-page text offsets."""
        return self.pageoffsets

    # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
    def generateKF8PageMapXML(self, k8proc):
        """Build the page-map.xml document for a KF8 (mobi8) book.

        NOTE(review): the template literals here look tag-stripped (a '%'
        format receives three arguments but shows no placeholders); verify
        against the upstream source.
        """
        pagemapxml = '\n'
        for i in range(len(self.pagenames)):
            pos = self.pageoffsets[i]
            name = self.pagenames[i]
            if name is not None and name != "":
                # resolve the raw offset to a skeleton file and an id anchor
                [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
                idtext = unicode_str(k8proc.getPageIDTag(pos))
                linktgt = unicode_str(filename)
                if idtext != '':
                    linktgt += '#' + idtext
                pagemapxml += ' \n' % (name, dir, linktgt)
        pagemapxml += " \n"
        return pagemapxml

    def generateAPNX(self, apnx_meta):
        """Build an Amazon .apnx page-number sidecar from apnx_meta and the
        parsed page offsets; returns the complete file as bytes."""
        if apnx_meta['format'] == 'MOBI_8':
            content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
        else:
            # MOBI_7 sidecars omit format/acr fields
            content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
        content_header = content_header.encode('utf-8')
        page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
        page_header = page_header.encode('utf-8')
        # container: version fields, the two json headers with explicit
        # lengths, then one big-endian u32 offset per page
        apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
        apnx += struct.pack(b'>I', 12 + len(content_header))
        apnx += struct.pack(b'>I', len(content_header))
        apnx += content_header
        apnx += struct.pack(b'>H', 1)
        apnx += struct.pack(b'>H', len(page_header))
        apnx += struct.pack(b'>H', self.pm_nn)
        apnx += struct.pack(b'>H', 32)
        apnx += page_header
        for page in self.pageoffsets:
            apnx += struct.pack(b'>L', page)
        return apnx
159 |
--------------------------------------------------------------------------------
/lib/mobi_sectioner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
8 |
9 | import datetime
10 |
# Py2 compatibility: substitute the lazy xrange so range behaves as on Py3.
if PY2:
    range = xrange
13 |
14 | # note: struct pack, unpack, unpack_from all require bytestring format
15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
16 | import struct
17 |
18 | from .unipath import pathof
19 |
# Debug switch for verbose dumping.
DUMP = False
""" Set to True to dump all possible information. """
22 |
class unpackException(Exception):
    """Error type raised by the unpack code when a file cannot be processed."""
    pass
25 |
26 |
def describe(data):
    """Summarize a byte string as '"printable-text" 0xhex'.

    Bytes outside the printable ascii range 32..127 are shown as '?'.
    """
    hexdump = hexlify(data)
    chars = []
    for byte in data:
        code = bord(byte)
        if 32 <= code <= 127:
            chars.append(bchar(byte).decode('latin-1'))
        else:
            chars.append('?')
    return '"' + ''.join(chars) + '"' + ' 0x' + hexdump
36 |
def datetimefrompalmtime(palmtime):
    """Convert a PalmOS timestamp to a naive datetime.

    Values above 0x7FFFFFFF are seconds since the Palm epoch (1904-01-01);
    smaller values are seconds since the Unix epoch (1970-01-01).
    """
    epoch_year = 1904 if palmtime > 0x7FFFFFFF else 1970
    return datetime.datetime(year=epoch_year, month=1, day=1) + datetime.timedelta(seconds=palmtime)
43 |
44 |
class Sectionizer:
    """Loads a PalmDB container (.mobi/.prc) and exposes its 78-byte palm
    header plus the individual sections (records) it contains."""

    def __init__(self, filename):
        """Read the entire file and parse the palm header and record table."""
        self.data = b''
        with open(pathof(filename), 'rb') as f:
            self.data = f.read()
        self.palmheader = self.data[:78]
        self.palmname = self.data[:32]
        # 8 bytes at 0x3C cover the type/creator pair (e.g. b'BOOKMOBI')
        self.ident = self.palmheader[0x3C:0x3C+8]
        self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
        self.filelength = len(self.data)
        # record table holds (offset, attributes) pairs; append a sentinel
        # (file length, 0) so section i spans offsets[i]:offsets[i+1]
        sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
        self.sectionoffsets = sectionsdata[::2]
        self.sectionattributes = sectionsdata[1::2]
        self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
        self.sectiondescriptions[-1] = "File Length Only"
        return

    def dumpsectionsinfo(self):
        """Print a table of every section: offset, length, uid, attributes."""
        print("Section Offset Length UID Attribs Description")
        for i in range(self.num_sections):
            # low 24 bits of the attribute word are the uid, high 8 the flags
            print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
                i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
        print("%3d %3X 0x%07X %s" %
              (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))

    def setsectiondescription(self, section, description):
        """Attach a human-readable description to a section for dumps."""
        if section < len(self.sectiondescriptions):
            self.sectiondescriptions[section] = description
        else:
            print("Section out of range: %d, description %s" % (section,description))

    def dumppalmheader(self):
        """Print every field of the 78-byte Palm Database header."""
        print("Palm Database Header")
        print("Database name: " + repr(self.palmheader[:32]))
        dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
        print("Bitfield attributes: 0x%0X" % dbattributes,)
        if dbattributes != 0:
            # decode the individual attribute flag bits
            print(" (",)
            if (dbattributes & 2):
                print("Read-only; ",)
            if (dbattributes & 4):
                print("Dirty AppInfoArea; ",)
            if (dbattributes & 8):
                print("Needs to be backed up; ",)
            if (dbattributes & 16):
                print("OK to install over newer; ",)
            if (dbattributes & 32):
                print("Reset after installation; ",)
            if (dbattributes & 64):
                print("No copying by PalmPilot beaming; ",)
            print(")")
        else:
            print("")
        print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
        dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
        print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
        dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
        print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
        dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
        if dbbackup != 0:
            print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
        print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
        print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
        print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
        print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
        print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
        expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
        if expectedzero != 0:
            print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
        print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
        return

    def loadSection(self, section):
        """Return the raw bytes of one section (uses the sentinel for the last)."""
        before, after = self.sectionoffsets[section:section+2]
        return self.data[before:after]
121 |
--------------------------------------------------------------------------------
/lib/mobi_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | import struct
8 | # note: struct pack, unpack, unpack_from all require bytestring format
9 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
10 |
11 | from .unipath import pathof
12 |
13 |
# Byte offsets into the PalmDB container and MOBI record 0 used by the
# section-editing helpers below.

# important pdb header offsets
unique_id_seed = 68
number_of_pdb_records = 76

# important palmdoc header offsets
book_length = 4
book_record_count = 8
first_pdb_record = 78

# important rec0 offsets
length_of_book = 4
mobi_header_base = 16
mobi_header_length = 20
mobi_type = 24
mobi_version = 36
first_non_text = 80
title_offset = 84
first_resc_record = 108
first_content_index = 192
last_content_index = 194
kf8_fdst_index = 192  # for KF8 mobi headers
fcis_index = 200
flis_index = 208
srcs_index = 224
srcs_count = 228
primary_index = 244
datp_index = 256
huffoff = 112
hufftbloff = 120
43 |
def getint(datain, ofs, sz=b'L'):
    """Read one big-endian unsigned integer (struct code *sz*) at offset *ofs*."""
    value = struct.unpack_from(b'>' + sz, datain, ofs)[0]
    return value
47 |
def writeint(datain, ofs, n, len=b'L'):
    """Return a copy of datain with n stored big-endian at ofs.

    b'L' (the default) writes 4 bytes; any other code writes 2 bytes.
    (Parameter name 'len' kept for caller compatibility.)
    """
    width, code = (4, b'>L') if len == b'L' else (2, b'>H')
    return datain[:ofs] + struct.pack(code, n) + datain[ofs + width:]
53 |
54 | def getsecaddr(datain,secno):
55 | nsec = getint(datain,number_of_pdb_records,b'H')
56 | assert secno>=0 & secnoL',2*nsec+1))
78 | datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
79 | datalst.append(struct.pack(b'>H',nsec))
80 | newstart = zerosecstart
81 | for i in range(0,secno):
82 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
83 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
84 | datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
85 | for i in range(secno+1,nsec):
86 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
87 | ofs = ofs + dif
88 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
89 | lpad = newstart - (first_pdb_record + 8*nsec)
90 | if lpad > 0:
91 | datalst.append(b'\0' * lpad)
92 | datalst.append(datain[zerosecstart:secstart])
93 | datalst.append(secdata)
94 | datalst.append(datain[secend:])
95 | dataout = b''.join(datalst)
96 | return dataout
97 |
def nullsection(datain,secno):   # make it zero-length without deleting it
    """Return a copy of the PalmDB with section *secno* emptied.

    The record count is unchanged; the section keeps its slot but becomes
    zero-length, and every later section's offset shifts down by its size.
    """
    datalst = []
    nsec = getint(datain,number_of_pdb_records,b'H')
    secstart, secend = getsecaddr(datain,secno)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    dif = secend-secstart
    datalst.append(datain[:first_pdb_record])
    # records up to and including secno keep their offsets
    for i in range(0,secno+1):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # later records shift back by the removed payload size
    for i in range(secno+1, nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs - dif
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    # preserve any gap between the record table and the first section
    lpad = zerosecstart - (first_pdb_record + 8*nsec)
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart: secstart])
    datalst.append(datain[secend:])
    dataout = b''.join(datalst)
    return dataout
119 |
def deletesectionrange(datain,firstsec,lastsec):  # delete a range of sections
    """Return a copy of the PalmDB with sections firstsec..lastsec removed.

    Both the section payloads and their 8-byte record-table entries are
    deleted, so all surviving offsets and uids are renumbered.
    """
    datalst = []
    firstsecstart,firstsecend = getsecaddr(datain,firstsec)
    lastsecstart,lastsecend = getsecaddr(datain,lastsec)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    # total shrinkage: deleted payload plus the deleted record entries
    dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
    nsec = getint(datain,number_of_pdb_records,b'H')
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
    newstart = zerosecstart - 8*(lastsec-firstsec+1)
    # sections before the range only shift by the shrunken record table
    for i in range(0,firstsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs-8*(lastsec-firstsec+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # sections after the range shift by the full difference and are renumbered
    for i in range(lastsec+1,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs - dif
        flgval = 2*(i-(lastsec-firstsec+1))
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:firstsecstart])
    datalst.append(datain[lastsecend:])
    dataout = b''.join(datalst)
    return dataout
148 |
def insertsection(datain,secno,secdata):  # insert a new section
    """Return a copy of the PalmDB with *secdata* inserted as section *secno*.

    Adds one 8-byte record-table entry (shifting every payload by 8) and
    splices the new payload in before the old section secno.
    """
    datalst = []
    nsec = getint(datain,number_of_pdb_records,b'H')
    # print("inserting secno" , secno, "into" ,nsec, "sections")
    secstart,secend = getsecaddr(datain,secno)
    zerosecstart,zerosecend = getsecaddr(datain,0)
    dif = len(secdata)
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+1))
    newstart = zerosecstart + 8
    # earlier sections only shift by the extra record-table entry
    for i in range(0,secno):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs += 8
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # the new record entry for the inserted section
    datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
    # later sections shift by the new payload plus the table entry, renumbered
    for i in range(secno,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs + dif + 8
        flgval = 2*(i+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec + 1))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:secstart])
    datalst.append(secdata)
    datalst.append(datain[secstart:])
    dataout = b''.join(datalst)
    return dataout
179 |
180 |
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec):  # insert a range of sections
    """Copy sections firstsec..lastsec of *sectionsource* into *sectiontarget*
    starting at slot *targetsec*, returning the rebuilt target bytes.

    Equivalent to repeated insertsection calls (see the commented-out
    reference implementation) but rebuilds the record table in one pass.
    """
    # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
    # dataout = sectiontarget
    # for idx in range(lastsec,firstsec-1,-1):
    #     dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
    # return dataout
    datalst = []
    nsec = getint(sectiontarget,number_of_pdb_records,b'H')
    zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
    insstart, nul = getsecaddr(sectiontarget,targetsec)
    nins = lastsec - firstsec + 1
    srcstart, nul = getsecaddr(sectionsource,firstsec)
    nul, srcend = getsecaddr(sectionsource,lastsec)
    newstart = zerosecstart + 8*nins

    datalst.append(sectiontarget[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
    datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+nins))
    # target sections before the insertion point shift by the grown table
    for i in range(0,targetsec):
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + 8*nins
        flgvalnew = flgval
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    # record entries for the copied sections, offsets relative to insstart
    srcstart0, nul = getsecaddr(sectionsource,firstsec)
    for i in range(nins):
        isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
        ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
        flgvalnew = 2*(targetsec+i)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew)
    dif = srcend - srcstart
    # remaining target sections shift by the copied payload plus the table
    for i in range(targetsec,nsec):
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + dif + 8*nins
        flgvalnew = 2*(i+nins)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    lpad = newstart - (first_pdb_record + 8*(nsec + nins))
    if lpad > 0:
        datalst.append(b'\0' * lpad)
    datalst.append(sectiontarget[zerosecstart:insstart])
    datalst.append(sectionsource[srcstart:srcend])
    datalst.append(sectiontarget[insstart:])
    dataout = b''.join(datalst)
    return dataout
228 |
def get_exth_params(rec0):
    """Locate the EXTH block inside MOBI header record 0.

    Returns a tuple (base offset of the EXTH header, total EXTH block
    length, number of EXTH records).
    """
    exth_base = mobi_header_base + getint(rec0, mobi_header_length)
    exth_length = getint(rec0, exth_base + 4)
    exth_count = getint(rec0, exth_base + 8)
    return exth_base, exth_length, exth_count
234 |
def add_exth(rec0,exth_num,exth_bytes):
    """Append a new EXTH record (id *exth_num*, payload *exth_bytes*).

    The EXTH length and record-count fields grow accordingly, and the
    stored full-title offset is shifted by the added size so it still
    points at the title text.  Returns the new record-0 bytestring.
    """
    ebase, elen, enum = get_exth_params(rec0)
    added = 8 + len(exth_bytes)  # 4-byte id + 4-byte size + payload
    pieces = [
        rec0[0:ebase+4],
        struct.pack(b'>L', elen + added),   # new EXTH block length
        struct.pack(b'>L', enum + 1),       # new EXTH record count
        struct.pack(b'>L', exth_num),
        struct.pack(b'>L', added),
        exth_bytes,
        rec0[ebase+12:],
    ]
    updated = b''.join(pieces)
    return writeint(updated, title_offset, getint(updated, title_offset) + added)
242 |
def read_exth(rec0,exth_num):
    """Collect the payloads of every EXTH record whose id is *exth_num*.

    Returns a (possibly empty) list of payload bytestrings — duplicates of
    the same EXTH id are legal, so all matches are gathered.
    """
    ebase, _elen, enum = get_exth_params(rec0)
    matches = []
    pos = ebase + 12  # first EXTH record follows the 12-byte EXTH header
    for _ in range(enum):
        rec_id = getint(rec0, pos)
        rec_size = getint(rec0, pos + 4)
        if rec_id == exth_num:
            matches.append(rec0[pos+8:pos+rec_size])
        pos += rec_size
    return matches
255 |
def write_exth(rec0,exth_num,exth_bytes):
    """Replace the payload of the first EXTH record whose id is *exth_num*.

    Returns a new record-0 bytestring; if no record matches, *rec0* is
    returned unchanged.  The stored full-title offset is shifted by the
    size difference so it still points at the title text.
    """
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = enum
    while enum_idx>0:
        exth_id = getint(rec0,ebase_idx)
        if exth_id == exth_num:
            # size delta between the new payload and the existing record
            dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
            newrec0 = rec0
            if dif != 0:
                # the title text sits after the EXTH block; move its offset
                newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
            # stitch together: header with updated EXTH length, unchanged
            # record count, records before the match (slice ends after the
            # matched record's id word), the new size+payload, then
            # everything after the old record
            return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
                struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
                struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
                rec0[ebase_idx+getint(rec0,ebase_idx+4):]
        enum_idx = enum_idx-1
        ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
    return rec0
274 |
def del_exth(rec0,exth_num):
    """Delete the first EXTH record whose id is *exth_num*.

    Returns a new record-0 bytestring with the EXTH length/count header
    fields and the stored full-title offset adjusted; returns *rec0*
    unchanged when no record matches.
    """
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = 0
    while enum_idx < enum:
        exth_id = getint(rec0,ebase_idx)
        exth_size = getint(rec0,ebase_idx+4)
        if exth_id == exth_num:
            newrec0 = rec0
            # the title text follows the EXTH block, so its offset shrinks
            newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
            # cut the record out, then rewrite the EXTH length and count
            newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
            newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
            return newrec0
        enum_idx += 1
        ebase_idx = ebase_idx+exth_size
    return rec0
291 |
292 |
class mobi_split:
    """Split a combined MOBI6+KF8 ("combo") ebook into standalone parts.

    On construction the input file is read; when it is a combo file two new
    PDB images are built: result_file7 (MOBI6/MOBI7-only) and result_file8
    (standalone KF8/MOBI8).  self.combo is False when the input is not a
    combo file, in which case neither result is produced.
    """

    def __init__(self, infile):
        # infile: path to the .mobi/.azw file to split
        datain = b''
        with open(pathof(infile), 'rb') as f:
            datain = f.read()
        datain_rec0 = readsection(datain,0)
        ver = getint(datain_rec0,mobi_version)
        # a bare mobi8 file (version 8) needs no splitting
        self.combo = (ver!=8)
        if not self.combo:
            return
        # EXTH 121 holds the KF8 boundary record number
        exth121 = read_exth(datain_rec0,121)
        if len(exth121) == 0:
            self.combo = False
            return
        else:
            # only pay attention to first exth121
            # (there should only be one)
            datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
            if datain_kf8 == 0xffffffff:
                self.combo = False
                return
        datain_kfrec0 =readsection(datain,datain_kf8)

        # create the standalone mobi7
        num_sec = getint(datain,number_of_pdb_records,b'H')
        # remove BOUNDARY up to but not including ELF record
        self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
        # check if there are SRCS records and delete them
        srcs = getint(datain_rec0,srcs_index)
        num_srcs = getint(datain_rec0,srcs_count)
        if srcs != 0xffffffff and num_srcs > 0:
            self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
            datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
            datain_rec0 = writeint(datain_rec0,srcs_count,0)
        # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
        datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
        # datain_rec0 = del_exth(datain_rec0,121)
        # datain_rec0 = del_exth(datain_rec0,534)
        # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
        # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
        datain_rec0 = write_exth(datain_rec0,129, b'')
        # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
        # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
        fval = fval & 0x07FF
        datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]

        self.result_file7 = writesection(self.result_file7,0,datain_rec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(datain, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)

        firstimage = getint(datain_rec0,first_resc_record)
        lastimage = getint(datain_rec0,last_content_index,b'H')
        # print("Old First Image, last Image", firstimage,lastimage)
        if lastimage == 0xffff:
            # find the lowest of the next sections and copy up to that.
            ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
            for ofs,sz in ofs_list:
                n = getint(datain_rec0,ofs,sz)
                # print("n",n)
                if n > 0 and n < lastimage:
                    lastimage = n-1
        print("First Image, last Image", firstimage,lastimage)

        # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
        # NOTE(review): range(firstimage,lastimage) never inspects section
        # `lastimage` itself — confirm against upstream whether the upper
        # bound should be lastimage+1
        for i in range(firstimage,lastimage):
            imgsec = readsection(self.result_file7,i)
            if imgsec[0:4] in [b'RESC',b'FONT']:
                self.result_file7 = nullsection(self.result_file7,i)

        # mobi7 finished

        # create standalone mobi8
        self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
        target = getint(datain_kfrec0,first_resc_record)
        self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
        datain_kfrec0 =readsection(self.result_file8,0)

        # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
        kf8starts = read_exth(datain_kfrec0,116)
        # If we have multiple StartOffset, keep only the last one
        kf8start_count = len(kf8starts)
        while kf8start_count > 1:
            kf8start_count -= 1
            datain_kfrec0 = del_exth(datain_kfrec0,116)

        # update the EXTH 125 KF8 Count of Images/Fonts/Resources
        datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # standalone mobi8 with exth: 0x0050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        # NOTE(review): '>L' below is the only non-bytes struct format in
        # this file (everywhere else uses b'>L') — harmless, but inconsistent
        fval, = struct.unpack_from('>L',datain_kfrec0, 0x80)
        fval = fval & 0x1FFF
        fval |= 0x0800
        datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]

        # properly update other index pointers that have been shifted by the insertion of images
        ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
        for ofs,sz in ofs_list:
            n = getint(datain_kfrec0,ofs,sz)
            if n != 0xffffffff:
                datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
        self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(self.result_file8, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)

        # mobi8 finished

    def getResult8(self):
        # standalone KF8 image (only meaningful when self.combo was True)
        return self.result_file8

    def getResult7(self):
        # standalone MOBI7 image (only meaningful when self.combo was True)
        return self.result_file7
439 |
--------------------------------------------------------------------------------
/lib/mobi_uncompress.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 |
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 |
7 | from .compatibility_utils import PY2, bchr, lmap, bstr
8 |
9 | if PY2:
10 | range = xrange
11 |
12 | import struct
13 | # note: struct pack, unpack, unpack_from all require bytestring format
14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
15 |
16 |
class unpackException(Exception):
    """Raised when compressed MOBI data or its HUFF/CDIC tables are malformed."""
19 |
class UncompressedReader:
    """Reader for compression type 1: section data is stored verbatim."""

    def unpack(self, data):
        """Return *data* unchanged (no compression to undo)."""
        return data
24 |
class PalmdocReader:
    """Decompressor for PalmDoc (compression type 2) LZ77-style sections."""

    def unpack(self, i):
        """Decompress bytestring *i* and return the expanded bytestring."""
        o, p = b'', 0
        while p < len(i):
            # for python 3 must use slice since i[p] returns int while slice returns character
            c = ord(i[p:p+1])
            p += 1
            if (c >= 1 and c <= 8):
                # 0x01-0x08: copy the next c bytes through literally
                o += i[p:p+c]
                p += c
            elif (c < 128):
                # 0x00, 0x09-0x7f: single literal byte
                o += bchr(c)
            elif (c >= 192):
                # 0xc0-0xff: a space followed by the byte with bit 7 cleared
                o += b' ' + bchr(c ^ 128)
            else:
                # 0x80-0xbf: two-byte pair encoding an 11-bit back-distance
                # m and a 3-bit length n (plus 3)
                if p < len(i):
                    c = (c << 8) | ord(i[p:p+1])
                    p += 1
                    m = (c >> 3) & 0x07ff
                    n = (c & 7) + 3
                    if (m > n):
                        # no overlap with the bytes being produced: copy the
                        # whole run in one slice
                        o += o[-m:n-m]
                    else:
                        # overlapping copy: the run reads bytes it has just
                        # written, so emit one byte at a time
                        for _ in range(n):
                            # because of completely ass-backwards decision by python mainters for python 3
                            # we must use slice for bytes as i[p] returns int while slice returns character
                            if m == 1:
                                o += o[-m:]
                            else:
                                o += o[-m:-m+1]
        return o
57 |
class HuffcdicReader:
    """Decompressor for HUFF/CDIC (compression type 17480) sections.

    loadHuff() must be called once with the HUFF record and loadCdic() once
    per CDIC record before unpack() can decode text sections.
    """

    # cached big-endian 64-bit reader used by unpack()
    q = struct.Struct(b'>Q').unpack_from

    def loadHuff(self, huff):
        """Parse a HUFF record: code-length table and min/max code tables.

        Raises unpackException when the magic header does not match.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise unpackException('invalid huff header')
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # each entry packs: code length (5 bits), terminal flag (bit 7),
            # and the max code value (upper 24 bits)
            codelen, term, maxcode = v&0x1f, v&0x80, v>>8
            assert codelen != 0
            if codelen <= 8:
                assert term
            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)
        self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))

        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode, self.maxcode = (), ()
        for codelen, mincode in enumerate((0,) + dict2[0::2]):
            self.mincode += (mincode << (32 - codelen), )
        for codelen, maxcode in enumerate((0,) + dict2[1::2]):
            self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )

        self.dictionary = []

    def loadCdic(self, cdic):
        """Parse a CDIC record and append its phrase slices to the dictionary.

        Raises unpackException when the magic header does not match.
        """
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise unpackException('invalid cdic header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        # the two lines below were garbled in this copy of the file (the
        # span "<<bits ... b'>" was stripped as markup); restored from the
        # standard HUFF/CDIC layout: each CDIC holds at most 1<<bits phrases
        n = min(1<<bits, phrases-len(self.dictionary))
        h = struct.Struct(b'>H').unpack_from
        def getslice(off):
            # high bit of the length word flags a fully-expanded phrase;
            # low 15 bits are the phrase length
            blen, = h(cdic, 16+off)
            chunk = cdic[18+off:18+off+(blen&0x7fff)]
            return (chunk, blen&0x8000)
        self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))

    def unpack(self, data):
        """Huffman-decode *data* using the loaded HUFF/CDIC tables."""
        q = HuffcdicReader.q

        bitsleft = len(data) * 8
        # pad so the 64-bit reads below never run off the end
        data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
        pos = 0
        x, = q(data, pos)
        n = 32

        s = b''
        while True:
            if n <= 0:
                pos += 4
                x, = q(data, pos)
                n += 32
            code = (x >> n) & ((1 << 32) - 1)

            # first 8 bits index dict1; non-terminal entries fall through to
            # the min/max code tables to find the true code length
            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            n -= codelen
            bitsleft -= codelen
            if bitsleft < 0:
                break

            r = (maxcode - code) >> (32 - codelen)
            phrase, flag = self.dictionary[r]
            if not flag:
                # phrase is itself compressed: expand once and cache the
                # result (the temporary None guards against self-reference)
                self.dictionary[r] = None
                phrase = self.unpack(phrase)
                self.dictionary[r] = (phrase, 1)
            s += phrase
        return s
132 |
--------------------------------------------------------------------------------
/lib/mobi_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 | # flake8: noqa
5 |
6 | from __future__ import unicode_literals, division, absolute_import, print_function
7 |
8 | from .compatibility_utils import PY2, text_type, bchr, bord
9 |
10 | import binascii
11 |
12 | if PY2:
13 | range = xrange
14 |
15 | from itertools import cycle
16 |
def getLanguage(langID, sublangID):
    """Map a MOBI language id / sub-language id pair to an IANA-style tag.

    Falls back to the language's generic tag (sub-id 0) when the
    sub-language is unknown, and to "en" when the language id itself is
    unknown.  (The English entry originally contained a duplicate key
    ``1``; the dead first binding has been removed — behaviour unchanged.)
    """
    mobilangdict = {
        54 : {0 : 'af'}, # Afrikaans
        28 : {0 : 'sq'}, # Albanian
        1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
             6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
        # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic
        # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic
        # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic
        # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab
        # Emirates), Arabic (Yemen)
        43 : {0 : 'hy'}, # Armenian
        77 : {0 : 'as'}, # Assamese
        44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani)
        45 : {0 : 'eu'}, # Basque
        35 : {0 : 'be'}, # Belarusian
        69 : {0 : 'bn'}, # Bengali
        2 : {0 : 'bg'}, # Bulgarian
        3 : {0 : 'ca'}, # Catalan
        4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
        # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan)
        26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian
        5 : {0 : 'cs'}, # Czech
        6 : {0 : 'da'}, # Danish
        19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium)
        9 : {0: 'en', 3 : 'en-au' , 10 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
             7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
        # English, English (Australia), English (Belize), English (Canada),
        # English (Ireland), English (Jamaica), English (New Zealand), English
        # (Philippines), English (South Africa), English (Trinidad), English
        # (United Kingdom), English (United States), English (Zimbabwe)
        37 : {0 : 'et'}, # Estonian
        56 : {0 : 'fo'}, # Faroese
        41 : {0 : 'fa'}, # Farsi / Persian
        11 : {0 : 'fi'}, # Finnish
        12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
        # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland)
        55 : {0 : 'ka'}, # Georgian
        7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
        # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland)
        8 : {0 : 'el'}, # Greek, Modern (1453-)
        71 : {0 : 'gu'}, # Gujarati
        13 : {0 : 'he'}, # Hebrew (also code 'iw'?)
        57 : {0 : 'hi'}, # Hindi
        14 : {0 : 'hu'}, # Hungarian
        15 : {0 : 'is'}, # Icelandic
        33 : {0 : 'id'}, # Indonesian
        16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland)
        17 : {0 : 'ja'}, # Japanese
        75 : {0 : 'kn'}, # Kannada
        63 : {0 : 'kk'}, # Kazakh
        87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?)
        18 : {0 : 'ko'}, # Korean
        38 : {0 : 'lv'}, # Latvian
        39 : {0 : 'lt'}, # Lithuanian
        47 : {0 : 'mk'}, # Macedonian
        62 : {0 : 'ms'}, # Malay
        76 : {0 : 'ml'}, # Malayalam
        58 : {0 : 'mt'}, # Maltese
        78 : {0 : 'mr'}, # Marathi
        97 : {0 : 'ne'}, # Nepali
        20 : {0 : 'no'}, # Norwegian
        72 : {0 : 'or'}, # Oriya
        21 : {0 : 'pl'}, # Polish
        22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil)
        70 : {0 : 'pa'}, # Punjabi
        23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh)
        24 : {0 : 'ro'}, # Romanian
        25 : {0 : 'ru'}, # Russian
        59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code)
        # IANA code for "Northern Sami" is 'se'
        # 'SZ' is the IANA region code for Swaziland
        79 : {0 : 'sa'}, # Sanskrit
        27 : {0 : 'sk'}, # Slovak
        36 : {0 : 'sl'}, # Slovenian
        46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code)
        # 'SB' is IANA region code for 'Solomon Islands'
        # Lower Sorbian = 'dsb'
        # Upper Sorbian = 'hsb'
        # Sorbian Languages = 'wen'
        10 : {0 : 'es' , 1 : 'es' , 11 : 'es-ar' , 16 : 'es-bo' , 13 : 'es-cl' , 9 : 'es-co' , 5 : 'es-cr' , 7 : 'es-do' ,
              12 : 'es-ec' , 17 : 'es-sv' , 4 : 'es-gt' , 18 : 'es-hn' , 2 : 'es-mx' , 19 : 'es-ni' , 6 : 'es-pa' ,
              15 : 'es-py' , 10 : 'es-pe' , 20 : 'es-pr' , 14 : 'es-uy' , 8 : 'es-ve'},
        # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish
        # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica),
        # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El
        # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico),
        # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish
        # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela)
        48 : {0 : 'sx'}, # "Sutu" (not an IANA language code)
        # "Sutu" is another name for "Southern Sotho"?
        # IANA code for "Southern Sotho" is 'st'
        65 : {0 : 'sw'}, # Swahili
        29 : {0 : 'sv' , 1 : 'sv' , 2 : 'sv-fi'}, # Swedish, Swedish (Finland)
        73 : {0 : 'ta'}, # Tamil
        68 : {0 : 'tt'}, # Tatar
        74 : {0 : 'te'}, # Telugu
        30 : {0 : 'th'}, # Thai
        49 : {0 : 'ts'}, # Tsonga
        50 : {0 : 'tn'}, # Tswana
        31 : {0 : 'tr'}, # Turkish
        34 : {0 : 'uk'}, # Ukrainian
        32 : {0 : 'ur'}, # Urdu
        67 : {0 : 'uz', 1 : 'uz'}, # Uzbek
        42 : {0 : 'vi'}, # Vietnamese
        52 : {0 : 'xh'}, # Xhosa
        53 : {0 : 'zu'}, # Zulu
    }
    if langID not in mobilangdict:
        # unknown language id: default to English
        return "en"
    subdict = mobilangdict[langID]
    # every entry carries a generic tag under key 0 to fall back on
    return subdict.get(sublangID, subdict[0])
132 |
133 |
def toHex(byteList):
    """Return the hex-encoded form of bytestring *byteList* (via binascii)."""
    return binascii.hexlify(byteList)
136 |
# returns base32 bytestring
def toBase32(value, npad=4):
    """Encode non-negative integer *value* as a base-32 bytestring.

    Digits are 0-9 followed by A-V; the result is left-padded with b'0'
    up to *npad* characters (longer values are returned unpadded).
    The original local named ``next`` shadowed the builtin; renamed.
    """
    digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
    encoded = b''
    remaining = value
    while remaining != 0:
        remaining, remainder = divmod(remaining, 32)
        # slice (not index) so this stays a bytestring on python 3
        encoded = digits[remainder:remainder+1] + encoded
    if encoded == b'':
        encoded = b'0'
    # pad on the left with b'0' out to npad characters
    pad = npad - len(encoded)
    if pad > 0:
        encoded = b'0' * pad + encoded
    return encoded
153 |
154 |
# converts base32 string to value
def fromBase32(str_num):
    """Decode a base-32 string or bytestring (digits 0-9, A-V) to an int.

    Empty input decodes to 0, matching the historical behaviour of the
    hand-rolled accumulator loop this replaces.  Python's int() with base
    32 uses exactly the same digit alphabet (0-9 then A-V), so the result
    is identical for all valid inputs.
    """
    if not isinstance(str_num, bytes):
        # text input is byte-mapped exactly as before
        str_num = str_num.encode('latin-1')
    if not str_num:
        return 0
    return int(str_num, 32)
178 |
179 |
180 | # note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
181 | # in place of ascii you will get a byte to half-word or integer
182 | # one to one mapping of values from 0 - 255
183 |
def mangle_fonts(encryption_key, data):
    """XOR-obfuscate the first 1024 bytes of *data* with *encryption_key*.

    The key is cycled over the first kilobyte; the remainder of the data
    is passed through untouched.  This is the standard (symmetric) MOBI
    font de/obfuscation step.
    """
    if isinstance(encryption_key, text_type):
        encryption_key = encryption_key.encode('latin-1')
    keystream = cycle(map(bord, encryption_key))
    head = data[:1024]
    scrambled = b''.join(bchr(bord(ch) ^ next(keystream)) for ch in head)
    return scrambled + data[1024:]
192 |
--------------------------------------------------------------------------------
/lib/mobiml2xhtml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
3 |
4 |
5 | # this program works in concert with the output from KindleUnpack
6 |
7 | '''
8 | Convert from Mobi ML to XHTML
9 | '''
10 |
11 | from __future__ import division, absolute_import, print_function
12 |
13 | import os
14 | import sys
15 | import re
16 |
# tags needing special parsing: maps tag key -> (type label, negative index
# used to trim the tag's closing characters when capturing its raw content)
SPECIAL_HANDLING_TAGS = {
    '?xml' : ('xmlheader', -1),
    '!--' : ('comment', -3),
    '!DOCTYPE' : ('doctype', -1),
}

# the type labels produced by SPECIAL_HANDLING_TAGS
SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment']

# tags that never receive a closing tag and so are emitted self-closed
SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference']
26 |
27 | class MobiMLConverter(object):
28 |
29 | PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
30 | IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
31 |
32 | def __init__(self, filename):
33 | self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n'
34 | self.base_css_rules += 'p { margin: 0em }\n'
35 | self.base_css_rules += '.bold { font-weight: bold }\n'
36 | self.base_css_rules += '.italic { font-style: italic }\n'
37 | self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
38 | self.tag_css_rules = {}
39 | self.tag_css_rule_cnt = 0
40 | self.path = []
41 | self.filename = filename
42 | self.wipml = open(self.filename, 'r').read()
43 | self.pos = 0
44 | self.opfname = self.filename.rsplit('.',1)[0] + '.opf'
45 | self.opos = 0
46 | self.meta = ''
47 | self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css')
48 | self.current_font_size = 3
49 | self.font_history = []
50 |
    def cleanup_html(self):
        # NOTE(review): the body of this method is corrupted in this copy of
        # the file — the pattern strings (which contained literal '<...>'
        # markup) were stripped by whatever produced this dump, leaving
        # unterminated string literals below.  Restore the patterns from the
        # upstream KindleUnpack source before using this module.
        self.wipml = re.sub(r'
', '', self.wipml)
        self.wipml = self.wipml.replace('\r\n', '\n')
        self.wipml = self.wipml.replace('> <', '>\n<')
        self.wipml = self.wipml.replace(']*>', '', self.wipml)
        self.wipml = self.wipml.replace(' ',' ')
58 |
59 | def replace_page_breaks(self):
60 | self.wipml = self.PAGE_BREAK_PAT.sub(
61 | '
',
62 | self.wipml)
63 |
64 | # parse leading text of ml and tag
65 | def parseml(self):
66 | p = self.pos
67 | if p >= len(self.wipml):
68 | return None
69 | if self.wipml[p] != '<':
70 | res = self.wipml.find('<',p)
71 | if res == -1 :
72 | res = len(self.wipml)
73 | self.pos = res
74 | return self.wipml[p:res], None
75 | # handle comment as a special case to deal with multi-line comments
76 | if self.wipml[p:p+4] == '',p+1)
78 | if te != -1:
79 | te = te+2
80 | else :
81 | te = self.wipml.find('>',p+1)
82 | ntb = self.wipml.find('<',p+1)
83 | if ntb != -1 and ntb < te:
84 | self.pos = ntb
85 | return self.wipml[p:ntb], None
86 | self.pos = te + 1
87 | return None, self.wipml[p:te+1]
88 |
    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possiblity of very poor formating
    def parsetag(self, s):
        """Parse one raw tag string *s* into (ttype, tname, tattr).

        ttype is 'begin', 'end', 'single', 'single_ext', or one of the
        special-handling types; tname is the lower-cased tag name; tattr
        maps attribute names to their (unquoted) values.
        """
        p = 1
        # get the tag name
        tname = None
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ' :
            p += 1
        if s[p:p+1] == '/':
            # leading slash marks a closing tag
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ' :
                p += 1
        b = p
        while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") :
            p += 1
        tname=s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        if tname in SPECIAL_HANDLING_TAGS:
            # capture the raw interior, trimming the closer via backstep
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes
            while s.find('=',p) != -1 :
                while s[p:p+1] == ' ' :
                    p += 1
                b = p
                while s[p:p+1] != '=' :
                    p += 1
                aname = s[b:p].lower()
                aname = aname.rstrip(' ')
                p += 1
                while s[p:p+1] == ' ' :
                    p += 1
                if s[p:p+1] in ('"', "'") :
                    # quoted value: scan to the matching quote
                    p = p + 1
                    b = p
                    while s[p:p+1] not in ('"', "'") :
                        p += 1
                    val = s[b:p]
                    p += 1
                else :
                    # unquoted value: scan to the next delimiter
                    b = p
                    while s[p:p+1] not in ('>', '/', ' ') :
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /',p) >= 0:
                ttype = 'single_ext'
            elif s.find('/',p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr
150 |
151 | # main routine to convert from mobi markup language to html
152 | def processml(self):
153 |
154 | # are these really needed
155 | html_done = False
156 | head_done = False
157 | body_done = False
158 |
159 | skip = False
160 |
161 | htmlstr = ''
162 | self.replace_page_breaks()
163 | self.cleanup_html()
164 |
165 | # now parse the cleaned up ml into standard xhtml
166 | while True:
167 |
168 | r = self.parseml()
169 | if not r:
170 | break
171 |
172 | text, tag = r
173 |
174 | if text:
175 | if not skip:
176 | htmlstr += text
177 |
178 | if tag:
179 | ttype, tname, tattr = self.parsetag(tag)
180 |
181 | # If we run into a DTD or xml declarations inside the body ... bail.
182 | if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done:
183 | htmlstr += '\n