├── .gitattributes ├── .gitignore ├── COPYING.txt ├── DumpMobiHeader_v023.py ├── KindleUnpack.pyw ├── KindleUnpack_ReadMe.htm ├── README.md ├── lib ├── __init__.py ├── compatibility_utils.py ├── imghdr.py ├── kindleunpack.py ├── mobi_cover.py ├── mobi_dict.py ├── mobi_header.py ├── mobi_html.py ├── mobi_index.py ├── mobi_k8proc.py ├── mobi_k8resc.py ├── mobi_nav.py ├── mobi_ncx.py ├── mobi_opf.py ├── mobi_pagemap.py ├── mobi_sectioner.py ├── mobi_split.py ├── mobi_uncompress.py ├── mobi_utils.py ├── mobiml2xhtml.py ├── unipath.py └── unpack_structure.py └── libgui ├── __init__.py ├── askfolder_ed.py ├── prefs.py └── scrolltextwidget.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.txt text 2 | *.py text 3 | *.cfg text 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | *.DS_Store 6 | 7 | # Vim temp/swap files 8 | *~ 9 | *.orig 10 | *.keep 11 | *.swp 12 | *.swo 13 | 14 | # PyInstaller 15 | # Usually these files are written by a python script from a template 16 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 17 | *.manifest 18 | *.spec 19 | 20 | 21 | # Translations 22 | *.mo 23 | *.pot 24 | 25 | *.log 26 | 27 | # PyBuilder 28 | target/ 29 | 30 | # Files/folders used/produced when testing Kindleunpack 31 | HDImages/ 32 | mobi7/ 33 | mobi8/ 34 | 35 | *.bak 36 | *.dat 37 | *.data 38 | *.pdf 39 | *.ini 40 | *.json 41 | *.mobi 42 | *.prc 43 | *.azw 44 | *.azw[34] 45 | 46 | # Folder to direct output to when testing command-line (will be ignored by git) 47 | testout/ 48 | -------------------------------------------------------------------------------- /KindleUnpack.pyw: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | import sys 8 | 9 | from lib.compatibility_utils import PY2, text_type, unicode_str 10 | from lib.compatibility_utils import unicode_argv, add_cp65001_codec 11 | 12 | import lib.unipath as unipath 13 | from lib.unipath import pathof 14 | 15 | import os 16 | import traceback 17 | 18 | import codecs 19 | add_cp65001_codec() 20 | 21 | try: 22 | from queue import Full 23 | from queue import Empty 24 | except ImportError: 25 | from Queue import Full 26 | from Queue import Empty 27 | 28 | if PY2 and sys.platform.startswith("win"): 29 | from libgui.askfolder_ed import AskFolder 30 | 31 | from multiprocessing import Process, Queue 32 | 33 | if PY2: 34 | import Tkinter as tkinter 35 | import Tkconstants as tkinter_constants 36 | import tkFileDialog as tkinter_filedialog 37 | import ttk as tkinter_ttk 38 | else: 39 | import tkinter 40 | import tkinter.constants as tkinter_constants 41 | import tkinter.filedialog as tkinter_filedialog 42 | import tkinter.ttk as tkinter_ttk 43 | 44 | from libgui.scrolltextwidget import ScrolledText 45 | 46 | import lib.kindleunpack as kindleunpack 47 | 48 | # Set to false to NOT save preferences to an ini file. 49 | # Starting directories for file dialogs will still persist 50 | # for the current KindleUnpack session. 51 | # 52 | # Need to delete the ini file after setting to false, of course. 
53 | PERSISTENT_PREFS = True 54 | 55 | from inspect import getfile, currentframe 56 | from libgui.prefs import getprefs, saveprefs 57 | 58 | # Probably overkill, but to ensure cross-platform success no matter how the script is called/run... 59 | SCRIPT_NAME = unicode_str(getfile(currentframe())) 60 | SCRIPT_DIR = unicode_str(os.path.dirname(unipath.abspath(getfile(currentframe())))) 61 | PROGNAME = unicode_str(os.path.splitext(SCRIPT_NAME)[0]) 62 | 63 | # Include platform in the ini file name. That way, settings can still persist 64 | # in the event that different OSs access the same script via a network share/flash-drive. 65 | CONFIGFILE = unicode_str(os.path.join(SCRIPT_DIR, '{0}_{1}.json'.format(PROGNAME, sys.platform[:3]))) 66 | 67 | # Wrap a stream so that output gets appended to shared queue 68 | # using utf-8 encoding 69 | class QueuedStream: 70 | def __init__(self, stream, q): 71 | self.stream = stream 72 | self.encoding = stream.encoding 73 | self.q = q 74 | if self.encoding == None: 75 | self.encoding = 'utf-8' 76 | def write(self, data): 77 | if isinstance(data,text_type): 78 | data = data.encode('utf-8') 79 | elif self.encoding not in ['utf-8','UTF-8','cp65001','CP65001']: 80 | udata = data.decode(self.encoding) 81 | data = udata.encode('utf-8') 82 | self.q.put(data) 83 | def __getattr__(self, attr): 84 | if attr == 'mode': 85 | return 'wb' 86 | if attr == 'encoding': 87 | return 'utf-8' 88 | return getattr(self.stream, attr) 89 | 90 | 91 | class MainDialog(tkinter.Frame): 92 | 93 | def __init__(self, root): 94 | tkinter.Frame.__init__(self, root, border=5) 95 | self.root = root 96 | self.interval = 50 97 | self.p2 = None 98 | self.q = Queue() 99 | # To keep things simple for possible future preference additions/deletions: 100 | # Try to stick to - TK Widget name = prefs dictionary key = ini.get|set name. 101 | # EX: mobipath = prefs['mobipath'] = config.get('Defaults', mobipath). 102 | self.prefs = getprefs(CONFIGFILE, self.root, PERSISTENT_PREFS) 103 | 104 | self.status = tkinter.StringVar() 105 | tkinter.Label(self, textvariable=self.status, justify='center').grid(row=0, columnspan=3, sticky=tkinter_constants.N) 106 | self.status.set('Unpack a non-DRM Kindle eBook') 107 | sticky = tkinter_constants.E + tkinter_constants.W 108 | ALL = tkinter_constants.E+tkinter_constants.W+tkinter_constants.N+tkinter_constants.S 109 | # Set to the column the textentry boxes are in. 110 | self.grid_columnconfigure(1, weight=1) 111 | # Set to the row the debug log widget is in. 
112 | self.grid_rowconfigure(10, weight=1) 113 | 114 | tkinter.Label(self, text='').grid(row=1, sticky=tkinter_constants.E) 115 | tkinter.Label(self, text='Unencrypted Kindle eBook input file', wraplength=200).grid(row=2, sticky=tkinter_constants.E) 116 | self.mobipath = tkinter.Entry(self, width=50) 117 | self.mobipath.grid(row=2, column=1, sticky=sticky) 118 | self.mobipath.insert(0, '') 119 | button = tkinter.Button(self, text="Browse...", command=self.get_mobipath) 120 | button.grid(row=2, column=2, sticky=sticky) 121 | 122 | tkinter.Label(self, text='Output Directory', wraplength=200).grid(row=3, sticky=tkinter_constants.E) 123 | self.outpath = tkinter.Entry(self, width=50) 124 | self.outpath.grid(row=3, column=1, sticky=sticky) 125 | if self.prefs['outpath'] and PERSISTENT_PREFS and unipath.exists(CONFIGFILE): 126 | outpath = pathof(os.path.normpath(self.prefs['outpath'])) 127 | self.outpath.insert(0, outpath) 128 | else: 129 | self.outpath.insert(0, '') 130 | button = tkinter.Button(self, text="Browse...", command=self.get_outpath) 131 | button.grid(row=3, column=2, sticky=sticky) 132 | 133 | tkinter.Label(self, text='OPTIONAL: APNX file Associated with AZW3', wraplength=200).grid(row=4, sticky=tkinter_constants.E) 134 | self.apnxpath = tkinter.Entry(self, width=50) 135 | self.apnxpath.grid(row=4, column=1, sticky=sticky) 136 | self.apnxpath.insert(0, '') 137 | button = tkinter.Button(self, text="Browse...", command=self.get_apnxpath) 138 | button.grid(row=4, column=2, sticky=sticky) 139 | 140 | self.splitvar = tkinter.IntVar() 141 | checkbox = tkinter.Checkbutton(self, text="Split Combination Kindlegen eBooks", variable=self.splitvar) 142 | if self.prefs['splitvar'] and PERSISTENT_PREFS: 143 | checkbox.select() 144 | checkbox.grid(row=5, column=1, columnspan=2, sticky=tkinter_constants.W) 145 | 146 | self.rawvar = tkinter.IntVar() 147 | checkbox = tkinter.Checkbutton(self, text="Write Raw Data", variable=self.rawvar) 148 | if self.prefs['rawvar'] and PERSISTENT_PREFS: 149 | checkbox.select() 150 | checkbox.grid(row=6, column=1, columnspan=2, sticky=tkinter_constants.W) 151 | 152 | self.dbgvar = tkinter.IntVar() 153 | checkbox = tkinter.Checkbutton(self, text="Dump Mode", variable=self.dbgvar) 154 | if self.prefs['dbgvar'] and PERSISTENT_PREFS: 155 | checkbox.select() 156 | checkbox.grid(row=7, column=1, columnspan=2, sticky=tkinter_constants.W) 157 | 158 | self.hdvar = tkinter.IntVar() 159 | checkbox = tkinter.Checkbutton(self, text="Use HD Images If Present", variable=self.hdvar) 160 | if self.prefs['hdvar'] and PERSISTENT_PREFS: 161 | checkbox.select() 162 | checkbox.grid(row=8, column=1, columnspan=2, sticky=tkinter_constants.W) 163 | 164 | tkinter.Label(self, text='ePub Output Type:').grid(row=9, sticky=tkinter_constants.E) 165 | self.epubver_val = tkinter.StringVar() 166 | self.epubver = tkinter_ttk.Combobox(self, textvariable=self.epubver_val, state='readonly') 167 | self.epubver['values'] = ('ePub 2', 'ePub 3', 'Auto-detect', 'Force ePub 2') 168 | self.epubver.current(0) 169 | if self.prefs['epubver'] and PERSISTENT_PREFS: 170 | self.epubver.current(self.prefs['epubver']) 171 | self.epubver.grid(row=9, column=1, columnspan=2, pady=(3,5), sticky=tkinter_constants.W) 172 | 173 | msg1 = 'Conversion Log \n\n' 174 | self.stext = ScrolledText(self, bd=5, relief=tkinter_constants.RIDGE, wrap=tkinter_constants.WORD) 175 | self.stext.grid(row=10, column=0, columnspan=3, sticky=ALL) 176 | self.stext.insert(tkinter_constants.END,msg1) 177 | 178 | self.sbotton = tkinter.Button( 179 | 
self, text="Start", width=10, command=self.convertit) 180 | self.sbotton.grid(row=11, column=1, sticky=tkinter_constants.S+tkinter_constants.E) 181 | self.qbutton = tkinter.Button( 182 | self, text="Quit", width=10, command=self.quitting) 183 | self.qbutton.grid(row=11, column=2, sticky=tkinter_constants.S+tkinter_constants.W) 184 | if self.prefs['windowgeometry'] and PERSISTENT_PREFS: 185 | self.root.geometry(self.prefs['windowgeometry']) 186 | else: 187 | self.root.update_idletasks() 188 | w = self.root.winfo_screenwidth() 189 | h = self.root.winfo_screenheight() 190 | rootsize = (605, 575) 191 | x = w//2 - rootsize[0]//2 192 | y = h//2 - rootsize[1]//2 193 | self.root.geometry('%dx%d+%d+%d' % (rootsize + (x, y))) 194 | self.root.protocol('WM_DELETE_WINDOW', self.quitting) 195 | 196 | # read queue shared between this main process and spawned child processes 197 | def readQueueUntilEmpty(self): 198 | done = False 199 | text = '' 200 | while not done: 201 | try: 202 | data = self.q.get_nowait() 203 | text += unicode_str(data, 'utf-8') 204 | except Empty: 205 | done = True 206 | pass 207 | return text 208 | 209 | # read from subprocess pipe without blocking 210 | # invoked every interval via the widget "after" 211 | # option being used, so need to reset it for the next time 212 | def processQueue(self): 213 | poll = self.p2.exitcode 214 | if poll != None: 215 | text = self.readQueueUntilEmpty() 216 | msg = text + '\n\n' + 'eBook successfully unpacked\n' 217 | if poll != 0: 218 | msg = text + '\n\n' + 'Error: Unpacking Failed\n' 219 | self.p2.join() 220 | self.showCmdOutput(msg) 221 | self.p2 = None 222 | self.sbotton.configure(state='normal') 223 | return 224 | text = self.readQueueUntilEmpty() 225 | self.showCmdOutput(text) 226 | # make sure we get invoked again by event loop after interval 227 | self.stext.after(self.interval,self.processQueue) 228 | return 229 | 230 | # post output from subprocess in scrolled text widget 231 | def showCmdOutput(self, msg): 232 | if msg and msg !='': 233 | if sys.platform.startswith('win'): 234 | msg = msg.replace('\r\n','\n') 235 | self.stext.insert(tkinter_constants.END,msg) 236 | self.stext.yview_pickplace(tkinter_constants.END) 237 | return 238 | 239 | def get_mobipath(self): 240 | cwd = unipath.getcwd() 241 | mobipath = tkinter_filedialog.askopenfilename( 242 | parent=None, title='Select Unencrypted Kindle eBook File', 243 | initialdir=self.prefs['mobipath'] or cwd, 244 | initialfile=None, 245 | defaultextension=('.mobi', '.prc', '.azw', '.azw4', '.azw3'), 246 | filetypes=[('All Kindle formats', ('.mobi', '.prc', '.azw', '.azw4', '.azw3')), 247 | ('Kindle Mobi eBook File', '.mobi'), ('Kindle PRC eBook File', '.prc'), 248 | ('Kindle AZW eBook File', '.azw'), ('Kindle AZW4 Print Replica', '.azw4'), 249 | ('Kindle Version 8', '.azw3'),('All Files', '.*')]) 250 | if mobipath: 251 | self.prefs['mobipath'] = pathof(os.path.dirname(mobipath)) 252 | mobipath = pathof(os.path.normpath(mobipath)) 253 | self.mobipath.delete(0, tkinter_constants.END) 254 | self.mobipath.insert(0, mobipath) 255 | return 256 | 257 | def get_apnxpath(self): 258 | cwd = unipath.getcwd() 259 | apnxpath = tkinter_filedialog.askopenfilename( 260 | parent=None, title='Optional APNX file associated with AZW3', 261 | initialdir=self.prefs['apnxpath'] or cwd, 262 | initialfile=None, 263 | defaultextension='.apnx', filetypes=[('Kindle APNX Page Information File', '.apnx'), ('All Files', '.*')]) 264 | if apnxpath: 265 | self.prefs['apnxpath'] = pathof(os.path.dirname(apnxpath)) 266 | 
apnxpath = pathof(os.path.normpath(apnxpath)) 267 | self.apnxpath.delete(0, tkinter_constants.END) 268 | self.apnxpath.insert(0, apnxpath) 269 | return 270 | 271 | def get_outpath(self): 272 | cwd = unipath.getcwd() 273 | if sys.platform.startswith("win") and PY2: 274 | # tk_chooseDirectory is horribly broken for unicode paths 275 | # on windows - bug has been reported but not fixed for years 276 | # workaround by using our own unicode aware version 277 | outpath = AskFolder(message="Folder to Store Output into", 278 | defaultLocation=self.prefs['outpath'] or unipath.getcwd()) 279 | else: 280 | outpath = tkinter_filedialog.askdirectory( 281 | parent=None, title='Folder to Store Output into', 282 | initialdir=self.prefs['outpath'] or cwd, initialfile=None) 283 | if outpath: 284 | self.prefs['outpath'] = outpath 285 | outpath = pathof(os.path.normpath(outpath)) 286 | self.outpath.delete(0, tkinter_constants.END) 287 | self.outpath.insert(0, outpath) 288 | return 289 | 290 | def quitting(self): 291 | # kill any still running subprocess 292 | if self.p2 != None: 293 | if (self.p2.exitcode == None): 294 | self.p2.terminate() 295 | if PERSISTENT_PREFS: 296 | if not saveprefs(CONFIGFILE, self.prefs, self): 297 | print("Couldn't save INI file.") 298 | self.root.destroy() 299 | self.quit() 300 | 301 | # run in a child process and collect its output 302 | def convertit(self): 303 | # now disable the button to prevent multiple launches 304 | self.sbotton.configure(state='disabled') 305 | mobipath = unicode_str(self.mobipath.get()) 306 | apnxpath = unicode_str(self.apnxpath.get()) 307 | outdir = unicode_str(self.outpath.get()) 308 | if not mobipath or not unipath.exists(mobipath): 309 | self.status.set('Specified eBook file does not exist') 310 | self.sbotton.configure(state='normal') 311 | return 312 | apnxfile = None 313 | if apnxpath != "" and unipath.exists(apnxpath): 314 | apnxfile = apnxpath 315 | if not outdir: 316 | self.status.set('No output directory specified') 317 | self.sbotton.configure(state='normal') 318 | return 319 | q = self.q 320 | log = 'Input Path = "'+ mobipath + '"\n' 321 | log += 'Output Path = "' + outdir + '"\n' 322 | if apnxfile != None: 323 | log += 'APNX Path = "' + apnxfile + '"\n' 324 | dump = False 325 | writeraw = False 326 | splitcombos = False 327 | use_hd = False 328 | if self.dbgvar.get() == 1: 329 | dump = True 330 | log += 'Debug = True\n' 331 | if self.rawvar.get() == 1: 332 | writeraw = True 333 | log += 'WriteRawML = True\n' 334 | if self.splitvar.get() == 1: 335 | splitcombos = True 336 | log += 'Split Combo KF8 Kindle eBooks = True\n' 337 | if self.epubver.current() == 0: 338 | epubversion = '2' 339 | elif self.epubver.current() == 1: 340 | epubversion = '3' 341 | elif self.epubver.current() == 2: 342 | epubversion = 'A' 343 | else: 344 | epubversion = 'F' 345 | log += 'Epub Output Type Set To: {0}\n'.format(self.epubver_val.get()) 346 | if self.hdvar.get(): 347 | use_hd = True 348 | # stub for processing the Use HD Images setting 349 | log += 'Use HD Images If Present = True\n' 350 | log += '\n\n' 351 | log += 'Please Wait ...\n\n' 352 | self.stext.insert(tkinter_constants.END,log) 353 | self.p2 = Process(target=unpackEbook, args=(q, mobipath, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos)) 354 | self.p2.start() 355 | 356 | # python does not seem to allow you to create 357 | # your own eventloop which every other gui does - strange 358 | # so need to use the widget "after" command to force 359 | # event loop to run non-gui events every 
interval 360 | self.stext.after(self.interval,self.processQueue) 361 | return 362 | 363 | 364 | # child process / multiprocessing thread starts here 365 | def unpackEbook(q, infile, outdir, apnxfile, epubversion, use_hd, dump, writeraw, splitcombos): 366 | sys.stdout = QueuedStream(sys.stdout, q) 367 | sys.stderr = QueuedStream(sys.stderr, q) 368 | rv = 0 369 | try: 370 | kindleunpack.unpackBook(infile, outdir, apnxfile, epubversion, use_hd, dodump=dump, dowriteraw=writeraw, dosplitcombos=splitcombos) 371 | except Exception as e: 372 | print("Error: %s" % e) 373 | print(traceback.format_exc()) 374 | rv = 1 375 | sys.exit(rv) 376 | 377 | 378 | def main(argv=unicode_argv()): 379 | root = tkinter.Tk() 380 | root.title('Kindle eBook Unpack Tool') 381 | root.minsize(440, 350) 382 | root.resizable(True, True) 383 | MainDialog(root).pack(fill=tkinter_constants.BOTH, expand=tkinter_constants.YES) 384 | root.mainloop() 385 | return 0 386 | 387 | if __name__ == "__main__": 388 | sys.exit(main()) 389 | -------------------------------------------------------------------------------- /KindleUnpack_ReadMe.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | KindleUnpack ReadMe 4 | 5 | 6 |

KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts which change depending on the type of Kindle/Mobi ebook being processed.

7 |

8 |

    9 |
  1. MobiPocket and early Kindle version 7 or less ebooks are unpacked to the original html 3.2 and images folder that can then be edited and reprocessed by MobiPocketCreator.

  10 |
  2. Kindle Print Replica ebooks are unpacked to the original PDF and any associated images.

  11 |
  3. Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or may not be a fully valid epub, depending on whether a fully valid epub was originally provided to kindlegen as input.

    NOTE: The generated epub should be validated using an epub validator; should changes be needed, it can be loaded into Sigil or Calibre, either of which can be used to edit the result to create a fully valid epub.

  12 |
  4. Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into two different parts: the first being the older MobiPocket format ebook parts (see #1 above) and the second being an epub-like structure that can be edited using Sigil (see #3 above).

  13 |
14 |

15 |

The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly.

16 |

On Windows machines we strongly recommend you install the free version of ActiveState's Active Python, 2.7.X or 3.4.X or later, as it properly installs all of the required parts, including the tk widget kit, and updates the system path. The official installer from python.org sometimes does not properly handle this for Windows machines.

17 |

On Mac OS X 10.6.X and later and almost all recent Linux versions the required version of Python is already installed as part of the official OS installation so Mac OS X and Linux users need install nothing extra.

18 | 19 |

To install KindleUnpack, simply find a nice location on your machine and fully unzip it. Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you have a proper Python 2.7 or later installation on your machine, you should be able to simply double-click the KindleUnpack.pyw icon and the gui interface should start.
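For example, on Linux or Mac OS X the steps might look like the following (the archive name shown is only a placeholder for whatever release zip you downloaded):

   unzip KindleUnpack.zip -d ~/KindleUnpack
   cd ~/KindleUnpack
   python KindleUnpack.pyw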

20 | 21 |

If you would prefer a command-line interface, simply look inside KindleUnpack's "lib" folder for the kindleunpack.py python program and its support modules. You should then be able to run kindleunpack.py with the following command:

22 | 23 |
24 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER
25 | 
26 | 27 |

where you replace:

28 |
29 | 
30 |    INPUT_FILE      - path to the desired Kindle/MobiPocket ebook
31 | 
32 |    OUTPUT_FOLDER   - path to folder where the ebook will be unpacked
33 | 
34 | Options:
35 |     -h               print this help message
36 |     -i               use HDImages to overwrite lower resolution versions, if present
37 |     -s               split combination mobis into older mobi and mobi KF8 ebooks
38 |     -p APNX_FILE     path to a .apnx file that contains real page numbers associated with an azw3 ebook (optional)
39 |                      Note: many apnx files have arbitrarily assigned page offsets that will confuse KindleUnpack if used
40 |    --epub_version=   specify epub version to unpack to: 2, 3 or A (for automatic) or 
41 |                         F for Force to epub2, default is 2
42 |     -r               write raw data to the output folder
43 |     -d               dump headers and other debug info to output and extra files
44 | 
45 | 
46 | 47 |
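For example, a typical invocation that uses HD images, splits a combination ebook, and auto-detects the epub version would look like the following (book.azw3 and ./unpacked are placeholder paths; substitute your own input file and output folder):

   python kindleunpack.py -i -s --epub_version=A book.azw3 ./unpacked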

Please report any bugs or comments/requests to our sticky forum on the Mobileread website. It can be found at http://www.mobileread.com/forums. Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack). 48 |

49 | 50 |

License Information

51 |
52 | KindleUnpack
53 |     Based on initial mobipocket version Copyright © 2009 Charles M. Hannum 
54 |     Extensive Extensions and Improvements Copyright © 2009-2014 
55 |          By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo.
56 |     This program is free software: you can redistribute it and/or modify
57 |     it under the terms of the GNU General Public License as published by
58 |     the Free Software Foundation, version 3.
59 | 
60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | KindleUnpack 2 | ============ 3 | 4 | python based software to unpack Amazon / Kindlegen generated ebooks 5 | 6 | KindleUnpack unpacks a non-DRM Kindle/MobiPocket ebook into its component parts, 7 | which change depending on the type of Kindle/Mobi ebook being processed. 8 | 9 | - MobiPocket and early Kindle version 7 or less ebooks are unpacked to the 10 | original html 3.2 and images folder that can then be edited and reprocessed by 11 | MobiPocketCreator. 12 | 13 | - Kindle Print Replica ebooks are unpacked to the original PDF and any associated images. 14 | 15 | - Kindle KF8 only ebooks (.azw3) are unpacked into an epub-like structure that may or 16 | may not be a fully valid epub depending on whether a fully valid epub was 17 | originally provided to kindlegen as input. NOTE: The generated epub should be 18 | validated using an epub validator and, should changes be needed, it should load 19 | properly into Sigil and Calibre, either of which can be used to edit the result 20 | to create a fully valid epub. 21 | 22 | - Newer Kindle ebooks which have both KF8 and older versions inside are unpacked into 23 | two different parts: the first being the older MobiPocket format ebook parts 24 | and the second being an epub-like structure that can be edited using Sigil. 25 | 26 | The KindleUnpack program requires Python 2.7.X or Python 3.4 or later to function properly. 27 | 28 | On Windows machines we strongly recommend you install the free version of ActiveState's 29 | Active Python 2.7.X or 3.4.X or later as it properly installs all of the required parts 30 | including the tk widget kit and updates the system path on Windows machines. The official 31 | installer from python.org sometimes does not properly handle this for Windows machines. 32 | 33 | On Mac OS X 10.6.X and later and almost all recent Linux versions, the required version 34 | of Python is already installed as part of the official OS installation so Mac OS X and 35 | Linux users need install nothing extra. 36 | 37 | To install KindleUnpack, simply find a nice location on your machine and fully unzip it. 38 | Do not move the KindleUnpack.pyw program away from its associated "lib" folder. If you 39 | have a proper Python 2.7 or later installation on your machine, you should be able to 40 | simply double-click the KindleUnpack.pyw icon and the gui interface should start. 41 | 42 | If you would prefer a command-line interface, simply look inside KindleUnpack's "lib" 43 | folder for the kindleunpack.py python program and its support modules. You should 44 | then be able to run kindleunpack.py with the following command: 45 | 46 | ```sh 47 | python kindleunpack.py [-r -s -d -h -i] [-p APNX_FILE] INPUT_FILE OUTPUT_FOLDER 48 | ``` 49 | 50 | where you replace: 51 | 52 | `INPUT_FILE` - path to the desired Kindle/MobiPocket ebook 53 | 54 | `OUTPUT_FOLDER` - path to folder where the ebook will be unpacked 55 | 56 | ### Options 57 | 58 | `-h` print this help message 59 | 60 | `-i` use HDImages to overwrite lower resolution versions, if present 61 | 62 | `-s` split combination mobis into older mobi and mobi KF8 ebooks 63 | 64 | `-p APNX_FILE` path to a .apnx file that contains real page numbers associated 65 | with an azw3 ebook (optional). 
Note: many apnx files have 66 | arbitrarily assigned page offsets that will confuse KindleUnpack 67 | if used 68 | 69 | `--epub_version=` specify EPUB version to unpack to: 2, 3 or A (for automatic) or 70 | F for Force to EPUB2, default is 2 71 | 72 | `-r` write raw data to the output folder 73 | 74 | `-d` dump headers and other debug info to output and extra files 75 | 76 | Please report any bugs or comments/requests to our sticky forum on the Mobileread website. 77 | It can be found at http://www.mobileread.com/forums. 78 | 79 | Look under E-Book Formats > Kindle Formats > KindleUnpack (MobiUnpack). 80 | 81 | License Information 82 | 83 | KindleUnpack 84 | Based on initial mobipocket version Copyright © 2009 Charles M. Hannum 85 | Extensive Extensions and Improvements Copyright © 2009-2014 86 | By P. Durrant, K. Hendricks, S. Siebert, fandrieu, DiapDealer, nickredding, tkeo. 87 | This program is free software: you can redistribute it and/or modify 88 | it under the terms of the GNU General Public License as published by 89 | the Free Software Foundation, version 3. 90 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai 3 | -------------------------------------------------------------------------------- /lib/compatibility_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay 6 | # All rights reserved. 7 | # 8 | # Redistribution and use in source and binary forms, with or without modification, 9 | # are permitted provided that the following conditions are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright notice, this list of 12 | # conditions and the following disclaimer. 13 | # 14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list 15 | # of conditions and the following disclaimer in the documentation and/or other materials 16 | # provided with the distribution. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | from __future__ import unicode_literals, division, absolute_import, print_function 29 | 30 | import sys 31 | import codecs 32 | 33 | PY2 = sys.version_info[0] == 2 34 | PY3 = sys.version_info[0] == 3 35 | 36 | iswindows = sys.platform.startswith('win') 37 | 38 | try: 39 | from urllib.parse import unquote 40 | except ImportError: 41 | from urllib import unquote 42 | 43 | if PY2: 44 | from HTMLParser import HTMLParser 45 | _h = HTMLParser() 46 | elif sys.version_info[1] < 4: 47 | import html.parser 48 | _h = html.parser.HTMLParser() 49 | else: 50 | import html as _h 51 | 52 | if PY3: 53 | text_type = str 54 | binary_type = bytes 55 | # if you will be printing arbitrary binary data to stdout on python 3 56 | # sys.stdin = sys.stdin.detach() 57 | # sys.stdout = sys.stdout.detach() 58 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) 59 | else: 60 | range = xrange 61 | text_type = unicode 62 | binary_type = str 63 | # if you will be printing unicode under python 2 need to protect 64 | # against sys.stdout.encoding being None stupidly forcing ascii encoding of unicode 65 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout) 66 | # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8 67 | 68 | # NOTE: Python 3 is completely broken when accessing single bytes in bytes strings 69 | # (and they amazingly claim it is by design and not a bug!) 70 | 71 | # To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode 72 | # >>> o = '123456789' 73 | # >>> o[-3] 74 | # '7' 75 | # >>> type(o[-3]) 76 | # <class 'str'> 77 | # >>> type(o) 78 | # <class 'str'> 79 | 80 | # Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings 81 | # >>> o = b'123456789' 82 | # >>> o[-3] 83 | # 55 84 | # >>> type(o[-3]) 85 | # <class 'int'> 86 | # >>> type(o) 87 | # <class 'bytes'> 88 | 89 | # This mind boggling behaviour also happens when indexing a bytestring and/or 90 | # iterating over a bytestring. In other words it will return an int but not 91 | # the byte itself!!!!!!! 92 | 93 | # The only way to access a single byte as a byte in a bytestring and get the byte in both 94 | # Python 2 and Python 3 is to use a slice 95 | 96 | # This problem is so common there are horrible hacks floating around the net to **try** 97 | # to work around it, so that code that works on both Python 2 and Python 3 is possible. 98 | 99 | # So in order to write code that works on both Python 2 and Python 3, 100 | # if you index or access a single byte and want its ord() then use the bord() function. 101 | # If instead you want it as a single character byte use the bchar() function, 102 | # both of which are defined below. 
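# Quick illustration of the portability issue and the helpers defined just below
# (illustrative example, not part of the original module; behavior follows
# directly from the bord()/bchar() definitions that follow):
#
#   data = b'ABC'
#   data[0]         # 'A' (a str) on Python 2, but 65 (an int) on Python 3
#   data[0:1]       # b'A' on both - slicing is the portable raw-byte access
#   bord(data[0])   # 65 on both Python 2 and Python 3
#   bchar(data[0])  # b'A' on both Python 2 and Python 3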
103 | 104 | if PY3: 105 | # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding) 106 | # in place of ascii you will get a byte value to half-word or integer value 107 | # one-to-one mapping (in the 0 - 255 range) 108 | 109 | def bchr(s): 110 | return bytes([s]) 111 | 112 | def bstr(s): 113 | if isinstance(s, str): 114 | return bytes(s, 'latin-1') 115 | else: 116 | return bytes(s) 117 | 118 | def bord(s): 119 | return s 120 | 121 | def bchar(s): 122 | return bytes([s]) 123 | 124 | else: 125 | def bchr(s): 126 | return chr(s) 127 | 128 | def bstr(s): 129 | return str(s) 130 | 131 | def bord(s): 132 | return ord(s) 133 | 134 | def bchar(s): 135 | return s 136 | 137 | if PY3: 138 | # list-producing versions of the major Python iterating functions 139 | def lrange(*args, **kwargs): 140 | return list(range(*args, **kwargs)) 141 | 142 | def lzip(*args, **kwargs): 143 | return list(zip(*args, **kwargs)) 144 | 145 | def lmap(*args, **kwargs): 146 | return list(map(*args, **kwargs)) 147 | 148 | def lfilter(*args, **kwargs): 149 | return list(filter(*args, **kwargs)) 150 | else: 151 | import __builtin__ 152 | # Python 2-builtin ranges produce lists 153 | lrange = __builtin__.range 154 | lzip = __builtin__.zip 155 | lmap = __builtin__.map 156 | lfilter = __builtin__.filter 157 | 158 | # In Python 3 you can no longer use .encode('hex') on a bytestring 159 | # instead use the following on both platforms 160 | import binascii 161 | def hexlify(bdata): 162 | return (binascii.hexlify(bdata)).decode('ascii') 163 | 164 | # If you: import struct 165 | # Note: struct pack, unpack, unpack_from all *require* bytestring format 166 | # data all the way up to at least Python 2.7.5, Python 3 is okay with either 167 | 168 | # If you: import re 169 | # note: Python 3 "re" requires the pattern to be the exact same type as the data to be 170 | # searched ... but u"" is not allowed for the pattern itself only b"" 171 | # Python 2.X allows the pattern to be any type and converts it to match the data 172 | # and returns the same type as the data 173 | 174 | # convert string to be utf-8 encoded 175 | def utf8_str(p, enc='utf-8'): 176 | if p is None: 177 | return None 178 | if isinstance(p, text_type): 179 | return p.encode('utf-8') 180 | if enc != 'utf-8': 181 | return p.decode(enc).encode('utf-8') 182 | return p 183 | 184 | # convert string to be unicode encoded 185 | def unicode_str(p, enc='utf-8'): 186 | if p is None: 187 | return None 188 | if isinstance(p, text_type): 189 | return p 190 | return p.decode(enc) 191 | 192 | ASCII_CHARS = set(chr(x) for x in range(128)) 193 | URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 194 | 'abcdefghijklmnopqrstuvwxyz' 195 | '0123456789' '#' '_.-/~') 196 | IRI_UNSAFE = ASCII_CHARS - URL_SAFE 197 | 198 | # returns a quoted IRI (not a URI) 199 | def quoteurl(href): 200 | if isinstance(href,binary_type): 201 | href = href.decode('utf-8') 202 | result = [] 203 | for char in href: 204 | if char in IRI_UNSAFE: 205 | char = "%%%02x" % ord(char) 206 | result.append(char) 207 | return ''.join(result) 208 | 209 | # unquotes url/iri 210 | def unquoteurl(href): 211 | if isinstance(href,binary_type): 212 | href = href.decode('utf-8') 213 | href = unquote(href) 214 | return href 215 | 216 | # unescape html 217 | def unescapeit(sval): 218 | return _h.unescape(sval) 219 | 220 | # Python 2.X commandline parsing under Windows has been horribly broken for years! 221 | # Use the following code to emulate full unicode commandline parsing on Python 2 222 | # ie. 
To get sys.argv arguments and properly encode them as unicode 223 | 224 | def unicode_argv(): 225 | global iswindows 226 | global PY3 227 | if PY3: 228 | return sys.argv 229 | if iswindows: 230 | # Versions 2.x of Python don't support Unicode in sys.argv on 231 | # Windows, with the underlying Windows API instead replacing multi-byte 232 | # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv 233 | # as a list of Unicode strings 234 | from ctypes import POINTER, byref, cdll, c_int, windll 235 | from ctypes.wintypes import LPCWSTR, LPWSTR 236 | 237 | GetCommandLineW = cdll.kernel32.GetCommandLineW 238 | GetCommandLineW.argtypes = [] 239 | GetCommandLineW.restype = LPCWSTR 240 | 241 | CommandLineToArgvW = windll.shell32.CommandLineToArgvW 242 | CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] 243 | CommandLineToArgvW.restype = POINTER(LPWSTR) 244 | 245 | cmd = GetCommandLineW() 246 | argc = c_int(0) 247 | argv = CommandLineToArgvW(cmd, byref(argc)) 248 | if argc.value > 0: 249 | # Remove Python executable and commands if present 250 | start = argc.value - len(sys.argv) 251 | return [argv[i] for i in 252 | range(start, argc.value)] 253 | # this should never happen 254 | return None 255 | else: 256 | argv = [] 257 | argvencoding = sys.stdin.encoding 258 | if argvencoding is None: 259 | argvencoding = sys.getfilesystemencoding() 260 | if argvencoding is None: 261 | argvencoding = 'utf-8' 262 | for arg in sys.argv: 263 | if isinstance(arg, text_type): 264 | argv.append(arg) 265 | else: 266 | argv.append(arg.decode(argvencoding)) 267 | return argv 268 | 269 | 270 | # Python 2.X is broken in that it does not recognize CP65001 as UTF-8 271 | def add_cp65001_codec(): 272 | if PY2: 273 | try: 274 | codecs.lookup('cp65001') 275 | except LookupError: 276 | codecs.register( 277 | lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) 278 | return 279 | -------------------------------------------------------------------------------- /lib/imghdr.py: -------------------------------------------------------------------------------- 1 | """Recognize image file formats based on their first few bytes.""" 2 | 3 | # Python software and documentation are licensed under the 4 | # Python Software Foundation License Version 2. 5 | 6 | # Starting with Python 3.8.6, examples, recipes, and other code in 7 | # the documentation are dual licensed under the PSF License Version 2 8 | # and the Zero-Clause BSD license. 9 | 10 | # Some software incorporated into Python is under different licenses. 11 | # The licenses are listed with code falling under that license. 12 | 13 | # PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 14 | # -------------------------------------------- 15 | 16 | # 1. This LICENSE AGREEMENT is between the Python Software Foundation 17 | # ("PSF"), and the Individual or Organization ("Licensee") accessing and 18 | # otherwise using this software ("Python") in source or binary form and 19 | # its associated documentation. 20 | 21 | # 2. 
Subject to the terms and conditions of this License Agreement, PSF hereby 22 | # grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, 23 | # analyze, test, perform and/or display publicly, prepare derivative works, 24 | # distribute, and otherwise use Python alone or in any derivative version, 25 | # provided, however, that PSF's License Agreement and PSF's notice of copyright, 26 | # i.e., "Copyright (c) 2001 Python Software Foundation; All Rights Reserved" 27 | # are retained in Python alone or in any derivative version prepared by Licensee. 28 | 29 | # 3. In the event Licensee prepares a derivative work that is based on 30 | # or incorporates Python or any part thereof, and wants to make 31 | # the derivative work available to others as provided herein, then 32 | # Licensee hereby agrees to include in any such work a brief summary of 33 | # the changes made to Python. 34 | 35 | # 4. PSF is making Python available to Licensee on an "AS IS" 36 | # basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR 37 | # IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND 38 | # DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS 39 | # FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT 40 | # INFRINGE ANY THIRD PARTY RIGHTS. 41 | 42 | # 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 43 | # FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS 44 | # A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, 45 | # OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. 46 | 47 | # 6. This License Agreement will automatically terminate upon a material 48 | # breach of its terms and conditions. 49 | 50 | # 7. Nothing in this License Agreement shall be deemed to create any 51 | # relationship of agency, partnership, or joint venture between PSF and 52 | # Licensee. This License Agreement does not grant permission to use PSF 53 | # trademarks or trade name in a trademark sense to endorse or promote 54 | # products or services of Licensee, or any third party. 55 | 56 | # 8. By copying, installing or otherwise using Python, Licensee 57 | # agrees to be bound by the terms and conditions of this License 58 | # Agreement. 
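# Usage sketch for the probe-style API defined below (the file name is a made-up
# example): what() reads the first 32 bytes of the file (or the supplied header
# bytes) and runs them through each registered test function in `tests` until
# one recognizes the format, returning a short type string or None.
#
#   from lib import imghdr
#   imghdr.what('cover.jpg')                                  # 'jpeg' for JFIF/Exif data
#   imghdr.what(None, h=b'\x89PNG\r\n\x1a\n' + b'\x00' * 24)  # 'png'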
59 | 60 | from os import PathLike 61 | 62 | __all__ = ["what"] 63 | 64 | #-------------------------# 65 | # Recognize image headers # 66 | #-------------------------# 67 | 68 | def what(file, h=None): 69 | f = None 70 | try: 71 | if h is None: 72 | if isinstance(file, (str, PathLike)): 73 | f = open(file, 'rb') 74 | h = f.read(32) 75 | else: 76 | location = file.tell() 77 | h = file.read(32) 78 | file.seek(location) 79 | for tf in tests: 80 | res = tf(h, f) 81 | if res: 82 | return res 83 | finally: 84 | if f: f.close() 85 | return None 86 | 87 | 88 | #---------------------------------# 89 | # Subroutines per image file type # 90 | #---------------------------------# 91 | 92 | tests = [] 93 | 94 | def test_jpeg(h, f): 95 | """JPEG data in JFIF or Exif format""" 96 | if h[6:10] in (b'JFIF', b'Exif'): 97 | return 'jpeg' 98 | 99 | tests.append(test_jpeg) 100 | 101 | def test_png(h, f): 102 | if h.startswith(b'\211PNG\r\n\032\n'): 103 | return 'png' 104 | 105 | tests.append(test_png) 106 | 107 | def test_gif(h, f): 108 | """GIF ('87 and '89 variants)""" 109 | if h[:6] in (b'GIF87a', b'GIF89a'): 110 | return 'gif' 111 | 112 | tests.append(test_gif) 113 | 114 | def test_tiff(h, f): 115 | """TIFF (can be in Motorola or Intel byte order)""" 116 | if h[:2] in (b'MM', b'II'): 117 | return 'tiff' 118 | 119 | tests.append(test_tiff) 120 | 121 | def test_rgb(h, f): 122 | """SGI image library""" 123 | if h.startswith(b'\001\332'): 124 | return 'rgb' 125 | 126 | tests.append(test_rgb) 127 | 128 | def test_pbm(h, f): 129 | """PBM (portable bitmap)""" 130 | if len(h) >= 3 and \ 131 | h[0] == ord(b'P') and h[1] in b'14' and h[2] in b' \t\n\r': 132 | return 'pbm' 133 | 134 | tests.append(test_pbm) 135 | 136 | def test_pgm(h, f): 137 | """PGM (portable graymap)""" 138 | if len(h) >= 3 and \ 139 | h[0] == ord(b'P') and h[1] in b'25' and h[2] in b' \t\n\r': 140 | return 'pgm' 141 | 142 | tests.append(test_pgm) 143 | 144 | def test_ppm(h, f): 145 | """PPM (portable pixmap)""" 146 | if len(h) >= 3 and \ 147 | h[0] == ord(b'P') and h[1] in b'36' and h[2] in b' \t\n\r': 148 | return 'ppm' 149 | 150 | tests.append(test_ppm) 151 | 152 | def test_rast(h, f): 153 | """Sun raster file""" 154 | if h.startswith(b'\x59\xA6\x6A\x95'): 155 | return 'rast' 156 | 157 | tests.append(test_rast) 158 | 159 | def test_xbm(h, f): 160 | """X bitmap (X10 or X11)""" 161 | if h.startswith(b'#define '): 162 | return 'xbm' 163 | 164 | tests.append(test_xbm) 165 | 166 | def test_bmp(h, f): 167 | if h.startswith(b'BM'): 168 | return 'bmp' 169 | 170 | tests.append(test_bmp) 171 | 172 | def test_webp(h, f): 173 | if h.startswith(b'RIFF') and h[8:12] == b'WEBP': 174 | return 'webp' 175 | 176 | tests.append(test_webp) 177 | 178 | def test_exr(h, f): 179 | if h.startswith(b'\x76\x2f\x31\x01'): 180 | return 'exr' 181 | 182 | tests.append(test_exr) 183 | 184 | #--------------------# 185 | # Small test program # 186 | #--------------------# 187 | 188 | def test(): 189 | import sys 190 | recursive = 0 191 | if sys.argv[1:] and sys.argv[1] == '-r': 192 | del sys.argv[1:2] 193 | recursive = 1 194 | try: 195 | if sys.argv[1:]: 196 | testall(sys.argv[1:], recursive, 1) 197 | else: 198 | testall(['.'], recursive, 1) 199 | except KeyboardInterrupt: 200 | sys.stderr.write('\n[Interrupted]\n') 201 | sys.exit(1) 202 | 203 | def testall(list, recursive, toplevel): 204 | import sys 205 | import os 206 | for filename in list: 207 | if os.path.isdir(filename): 208 | print(filename + '/:', end=' ') 209 | if recursive or toplevel: 210 | print('recursing down:') 
211 | import glob 212 | names = glob.glob(os.path.join(glob.escape(filename), '*')) 213 | testall(names, recursive, 0) 214 | else: 215 | print('*** directory (use -r) ***') 216 | else: 217 | print(filename + ':', end=' ') 218 | sys.stdout.flush() 219 | try: 220 | print(what(filename)) 221 | except OSError: 222 | print('*** not found ***') 223 | 224 | if __name__ == '__main__': 225 | test() 226 | -------------------------------------------------------------------------------- /lib/mobi_cover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import unicode_str 8 | 9 | from .unipath import pathof 10 | import os 11 | from . import imghdr 12 | 13 | import struct 14 | # note: struct pack, unpack, unpack_from all require bytestring format 15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 16 | 17 | USE_SVG_WRAPPER = True 18 | """ Set to True to use svg wrapper for default. """ 19 | 20 | FORCE_DEFAULT_TITLE = False 21 | """ Set to True to force to use the default title. """ 22 | 23 | COVER_PAGE_FINENAME = 'cover_page.xhtml' 24 | """ The name for the cover page. """ 25 | 26 | DEFAULT_TITLE = 'Cover' 27 | """ The default title for the cover page. """ 28 | 29 | MAX_WIDTH = 4096 30 | """ The max width for the svg cover page. """ 31 | 32 | MAX_HEIGHT = 4096 33 | """ The max height for the svg cover page. """ 34 | 35 | 36 | def get_image_type(imgname, imgdata=None): 37 | imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata)) 38 | 39 | # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some 40 | # with only the magic JPEG bytes out there... 41 | # ImageMagick handles those, so, do it too. 42 | if imgtype is None: 43 | if imgdata is None: 44 | with open(pathof(imgname), 'rb') as f: 45 | imgdata = f.read() 46 | if imgdata[0:2] == b'\xFF\xD8': 47 | # Get last non-null bytes 48 | last = len(imgdata) 49 | while (imgdata[last-1:last] == b'\x00'): 50 | last-=1 51 | # Be extra safe, check the trailing bytes, too. 52 | if imgdata[last-2:last] == b'\xFF\xD9': 53 | imgtype = "jpeg" 54 | return imgtype 55 | 56 | 57 | def get_image_size(imgname, imgdata=None): 58 | '''Determine the image type of imgname (or imgdata) and return its size. 59 | 60 | Originally, 61 | Determine the image type of fhandle and return its size. 62 | from draco''' 63 | if imgdata is None: 64 | fhandle = open(pathof(imgname), 'rb') 65 | head = fhandle.read(24) 66 | else: 67 | head = imgdata[0:24] 68 | if len(head) != 24: 69 | return 70 | 71 | imgtype = get_image_type(imgname, imgdata) 72 | if imgtype == 'png': 73 | check = struct.unpack(b'>i', head[4:8])[0] 74 | if check != 0x0d0a1a0a: 75 | return 76 | width, height = struct.unpack(b'>ii', head[16:24]) 77 | elif imgtype == 'gif': 78 | width, height = struct.unpack(b'<HH', head[6:10]) 79 | elif imgtype == 'jpeg' and imgdata is None: 80 | try: 81 | fhandle.seek(0) # Read 0xff next 82 | size = 2 83 | ftype = 0 84 | while not 0xc0 <= ftype <= 0xcf: 85 | fhandle.seek(size, 1) 86 | byte = fhandle.read(1) 87 | while ord(byte) == 0xff: 88 | byte = fhandle.read(1) 89 | ftype = ord(byte) 90 | size = struct.unpack(b'>H', fhandle.read(2))[0] - 2 91 | # We are at a SOFn block 92 | fhandle.seek(1, 1) # Skip `precision' byte. 
93 | height, width = struct.unpack(b'>HH', fhandle.read(4)) 94 | except Exception: # IGNORE:W0703 95 | return 96 | elif imgtype == 'jpeg' and imgdata is not None: 97 | try: 98 | pos = 0 99 | size = 2 100 | ftype = 0 101 | while not 0xc0 <= ftype <= 0xcf: 102 | pos += size 103 | byte = imgdata[pos:pos+1] 104 | pos += 1 105 | while ord(byte) == 0xff: 106 | byte = imgdata[pos:pos+1] 107 | pos += 1 108 | ftype = ord(byte) 109 | size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2 110 | pos += 2 111 | # We are at a SOFn block 112 | pos += 1 # Skip `precision' byte. 113 | height, width = struct.unpack(b'>HH', imgdata[pos:pos+4]) 114 | pos += 4 115 | except Exception: # IGNORE:W0703 116 | return 117 | else: 118 | return 119 | return width, height 120 | 121 | # XXX experimental 122 | class CoverProcessor(object): 123 | 124 | """Create a cover page. 125 | 126 | """ 127 | def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None): 128 | self.files = files 129 | self.metadata = metadata 130 | self.rscnames = rscnames 131 | self.cover_page = COVER_PAGE_FINENAME 132 | self.use_svg = USE_SVG_WRAPPER # Use svg wrapper. 133 | self.lang = metadata.get('Language', ['en'])[0] 134 | # This should ensure that if the methods to find the cover image's 135 | # dimensions should fail for any reason, the SVG routine will not be used. 136 | [self.width, self.height] = (-1,-1) 137 | if FORCE_DEFAULT_TITLE: 138 | self.title = DEFAULT_TITLE 139 | else: 140 | self.title = metadata.get('Title', [DEFAULT_TITLE])[0] 141 | 142 | self.cover_image = None 143 | if imgname is not None: 144 | self.cover_image = imgname 145 | elif 'CoverOffset' in metadata: 146 | imageNumber = int(metadata['CoverOffset'][0]) 147 | cover_image = self.rscnames[imageNumber] 148 | if cover_image is not None: 149 | self.cover_image = cover_image 150 | else: 151 | print('Warning: Cannot identify the cover image.') 152 | if self.use_svg: 153 | try: 154 | if imgdata is None: 155 | fname = os.path.join(files.imgdir, self.cover_image) 156 | [self.width, self.height] = get_image_size(fname) 157 | else: 158 | [self.width, self.height] = get_image_size(None, imgdata) 159 | except: 160 | self.use_svg = False 161 | width = self.width 162 | height = self.height 163 | if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: 164 | self.use_svg = False 165 | return 166 | 167 | def getImageName(self): 168 | return self.cover_image 169 | 170 | def getXHTMLName(self): 171 | return self.cover_page 172 | 173 | def buildXHTML(self): 174 | print('Building a cover page.') 175 | files = self.files 176 | cover_image = self.cover_image 177 | title = self.title 178 | lang = self.lang 179 | 180 | image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) 181 | image_path = os.path.join(image_dir, cover_image).replace('\\', '/') 182 | 183 | if not self.use_svg: 184 | data = '' 185 | data += '' 186 | data += 'L', idata, 0x14) 41 | count, = struct.unpack_from(b'>L', idata, 0x18) 42 | self.starts.append(start) 43 | self.counts.append(count) 44 | 45 | def lookup(self, lookupvalue): 46 | i = 0 47 | rvalue = lookupvalue 48 | while rvalue >= self.counts[i]: 49 | rvalue = rvalue - self.counts[i] 50 | i += 1 51 | if i == len(self.counts): 52 | print("Error: Problem with multiple inflections data sections") 53 | return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0] 54 | return rvalue, self.starts[i], self.counts[i], self.infldatas[i] 55 | 56 | def offsets(self, value): 57 | rvalue, start, count, data = self.lookup(value) 58 
| offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) 59 | if rvalue + 1 < count: 60 | nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1))) 61 | else: 62 | nextOffset = None 63 | return offset, nextOffset, data 64 | 65 | 66 | class dictSupport(object): 67 | 68 | def __init__(self, mh, sect): 69 | self.mh = mh 70 | self.header = mh.header 71 | self.sect = sect 72 | self.metaOrthIndex = mh.metaOrthIndex 73 | self.metaInflIndex = mh.metaInflIndex 74 | 75 | def parseHeader(self, data): 76 | "read INDX header" 77 | if not data[:4] == b'INDX': 78 | print("Warning: index section is not INDX") 79 | return False 80 | words = ( 81 | 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 82 | 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' 83 | ) 84 | num = len(words) 85 | values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) 86 | header = {} 87 | for n in range(num): 88 | header[words[n]] = values[n] 89 | 90 | ordt1 = None 91 | ordt2 = None 92 | 93 | otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) 94 | header['otype'] = otype 95 | header['oentries'] = oentries 96 | 97 | if DEBUG_DICT: 98 | print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx)) 99 | 100 | if header['code'] == 0xfdea or oentries > 0: 101 | # some dictionaries seem to be codepage 65002 (0xFDEA) which seems 102 | # to be some sort of strange EBCDIC utf-8 or 16 encoded strings 103 | # So we need to look for them and store them away to process leading text 104 | # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries 105 | # we only ever seem to use the second but ... 106 | # 107 | # if otype = 0, ORDT table uses 16 bit values as offsets into the table 108 | # if otype = 1, ORDT table uses 8 bit values as offsets into the table 109 | 110 | assert(data[op1:op1+4] == b'ORDT') 111 | assert(data[op2:op2+4] == b'ORDT') 112 | ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) 113 | ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) 114 | 115 | if DEBUG_DICT: 116 | print("parsed INDX header:") 117 | for key in header: 118 | print(key, "%x" % header[key],) 119 | print("\n") 120 | return header, ordt1, ordt2 121 | 122 | def getPositionMap(self): 123 | sect = self.sect 124 | 125 | positionMap = {} 126 | 127 | metaOrthIndex = self.metaOrthIndex 128 | metaInflIndex = self.metaInflIndex 129 | 130 | decodeInflection = True 131 | if metaOrthIndex != 0xFFFFFFFF: 132 | print("Info: Document contains orthographic index, handle as dictionary") 133 | if metaInflIndex == 0xFFFFFFFF: 134 | decodeInflection = False 135 | else: 136 | metaInflIndexData = sect.loadSection(metaInflIndex) 137 | 138 | print("\nParsing metaInflIndexData") 139 | midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData) 140 | 141 | metaIndexCount = midxhdr['count'] 142 | idatas = [] 143 | for j in range(metaIndexCount): 144 | idatas.append(sect.loadSection(metaInflIndex + 1 + j)) 145 | dinfl = InflectionData(idatas) 146 | 147 | inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount) 148 | tagSectionStart = midxhdr['len'] 149 | inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData) 150 | if DEBUG_DICT: 151 | print("inflectionTagTable: %s" % inflectionTagTable) 152 | if self.hasTag(inflectionTagTable, 0x07): 153 | print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported") 154 | decodeInflection = False 155 | 156 | data = 
sect.loadSection(metaOrthIndex) 157 | 158 | print("\nParsing metaOrthIndex") 159 | idxhdr, hordt1, hordt2 = self.parseHeader(data) 160 | 161 | tagSectionStart = idxhdr['len'] 162 | controlByteCount, tagTable = readTagSection(tagSectionStart, data) 163 | orthIndexCount = idxhdr['count'] 164 | print("orthIndexCount is", orthIndexCount) 165 | if DEBUG_DICT: 166 | print("orthTagTable: %s" % tagTable) 167 | if hordt2 is not None: 168 | print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) 169 | hasEntryLength = self.hasTag(tagTable, 0x02) 170 | if not hasEntryLength: 171 | print("Info: Index doesn't contain entry length tags") 172 | 173 | print("Read dictionary index data") 174 | for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): 175 | data = sect.loadSection(i) 176 | hdrinfo, ordt1, ordt2 = self.parseHeader(data) 177 | idxtPos = hdrinfo['start'] 178 | entryCount = hdrinfo['count'] 179 | idxPositions = [] 180 | for j in range(entryCount): 181 | pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) 182 | idxPositions.append(pos) 183 | # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 184 | idxPositions.append(idxtPos) 185 | for j in range(entryCount): 186 | startPos = idxPositions[j] 187 | endPos = idxPositions[j+1] 188 | textLength = ord(data[startPos:startPos+1]) 189 | text = data[startPos+1:startPos+1+textLength] 190 | if hordt2 is not None: 191 | utext = u"" 192 | if idxhdr['otype'] == 0: 193 | pattern = b'>H' 194 | inc = 2 195 | else: 196 | pattern = b'>B' 197 | inc = 1 198 | pos = 0 199 | while pos < textLength: 200 | off, = struct.unpack_from(pattern, text, pos) 201 | if off < len(hordt2): 202 | utext += unichr(hordt2[off]) 203 | else: 204 | utext += unichr(off) 205 | pos += inc 206 | text = utext.encode('utf-8') 207 | 208 | tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) 209 | if 0x01 in tagMap: 210 | if decodeInflection and 0x2a in tagMap: 211 | inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, 212 | dinfl, inflNameData, tagMap[0x2a]) 213 | else: 214 | inflectionGroups = b'' 215 | assert len(tagMap[0x01]) == 1 216 | entryStartPosition = tagMap[0x01][0] 217 | if hasEntryLength: 218 | # The idx:entry attribute "scriptable" must be present to create entry length tags. 219 | ml = b'' + inflectionGroups + b'' 220 | if entryStartPosition in positionMap: 221 | positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml 222 | else: 223 | positionMap[entryStartPosition] = ml 224 | assert len(tagMap[0x02]) == 1 225 | entryEndPosition = entryStartPosition + tagMap[0x02][0] 226 | if entryEndPosition in positionMap: 227 | positionMap[entryEndPosition] = b"" + positionMap[entryEndPosition] 228 | else: 229 | positionMap[entryEndPosition] = b"" 230 | 231 | else: 232 | indexTags = b'\n\n' + inflectionGroups + b'\n' 233 | if entryStartPosition in positionMap: 234 | positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags 235 | else: 236 | positionMap[entryStartPosition] = indexTags 237 | return positionMap 238 | 239 | def hasTag(self, tagTable, tag): 240 | ''' 241 | Test if tag table contains given tag. 242 | 243 | @param tagTable: The tag table. 244 | @param tag: The tag to search. 245 | @return: True if tag table contains given tag; False otherwise. 
246 | ''' 247 | for currentTag, _, _, _ in tagTable: 248 | if currentTag == tag: 249 | return True 250 | return False 251 | 252 | def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): 253 | ''' 254 | Create string which contains the inflection groups with inflection rules as mobipocket tags. 255 | 256 | @param mainEntry: The word to inflect. 257 | @param controlByteCount: The number of control bytes. 258 | @param tagTable: The tag table. 259 | @param data: The Inflection data object to properly select the right inflection data section to use 260 | @param inflectionNames: The inflection rule name data. 261 | @param groupList: The list of inflection groups to process. 262 | @return: String with inflection groups and rules or empty string if required tags are not available. 263 | ''' 264 | result = b"" 265 | for value in groupList: 266 | offset, nextOffset, data = dinfl.offsets(value) 267 | 268 | # First byte seems to be always 0x00 and must be skipped. 269 | assert ord(data[offset:offset+1]) == 0x00 270 | tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) 271 | 272 | # Make sure that the required tags are available. 273 | if 0x05 not in tagMap: 274 | print("Error: Required tag 0x05 not found in tagMap") 275 | return "" 276 | if 0x1a not in tagMap: 277 | print("Error: Required tag 0x1a not found in tagMap") 278 | return b'' 279 | 280 | result += b'' 281 | 282 | for i in range(len(tagMap[0x05])): 283 | 284 | # Get name of inflection rule. 285 | value = tagMap[0x05][i] 286 | consumed, textLength = getVariableWidthValue(inflectionNames, value) 287 | inflectionName = inflectionNames[value+consumed:value+consumed+textLength] 288 | 289 | # Get and apply inflection rule across possibly multiple inflection data sections 290 | value = tagMap[0x1a][i] 291 | rvalue, start, count, data = dinfl.lookup(value) 292 | offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) 293 | textLength = ord(data[offset:offset+1]) 294 | inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) 295 | if inflection is not None: 296 | result += b' ' 297 | 298 | result += b'' 299 | return result 300 | 301 | def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): 302 | ''' 303 | Apply inflection rule. 304 | 305 | @param mainEntry: The word to inflect. 306 | @param inflectionRuleData: The inflection rules. 307 | @param start: The start position of the inflection rule to use. 308 | @param end: The end position of the inflection rule to use. 309 | @return: The string with the inflected word or None if an error occurs. 
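Worked example (illustrative, inferred from the opcode handling below):
with mainEntry = b'running' and rule bytes b'\x03gnin', opcode 0x03
selects delete-at-word-end mode and each following literal byte pops one
matching character from the end of the word, so the rule yields b'run'.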
310 | ''' 311 | mode = -1 312 | byteArray = array.array(array_format, mainEntry) 313 | position = len(byteArray) 314 | for charOffset in range(start, end): 315 | char = inflectionRuleData[charOffset:charOffset+1] 316 | abyte = ord(char) 317 | if abyte >= 0x0a and abyte <= 0x13: 318 | # Move cursor backwards 319 | offset = abyte - 0x0a 320 | if mode not in [0x02, 0x03]: 321 | mode = 0x02 322 | position = len(byteArray) 323 | position -= offset 324 | elif abyte > 0x13: 325 | if mode == -1: 326 | print("Error: Unexpected first byte %i of inflection rule" % abyte) 327 | return None 328 | elif position == -1: 329 | print("Error: Unexpected first byte %i of inflection rule" % abyte) 330 | return None 331 | else: 332 | if mode == 0x01: 333 | # Insert at word start 334 | byteArray.insert(position, abyte) 335 | position += 1 336 | elif mode == 0x02: 337 | # Insert at word end 338 | byteArray.insert(position, abyte) 339 | elif mode == 0x03: 340 | # Delete at word end 341 | position -= 1 342 | deleted = byteArray.pop(position) 343 | if bchr(deleted) != char: 344 | if DEBUG_DICT: 345 | print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) 346 | print("Error: Delete operation of inflection rule failed") 347 | return None 348 | elif mode == 0x04: 349 | # Delete at word start 350 | deleted = byteArray.pop(position) 351 | if bchr(deleted) != char: 352 | if DEBUG_DICT: 353 | print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) 354 | print("Error: Delete operation of inflection rule failed") 355 | return None 356 | else: 357 | print("Error: Inflection rule mode %x is not implemented" % mode) 358 | return None 359 | elif abyte == 0x01: 360 | # Insert at word start 361 | if mode not in [0x01, 0x04]: 362 | position = 0 363 | mode = abyte 364 | elif abyte == 0x02: 365 | # Insert at word end 366 | if mode not in [0x02, 0x03]: 367 | position = len(byteArray) 368 | mode = abyte 369 | elif abyte == 0x03: 370 | # Delete at word end 371 | if mode not in [0x02, 0x03]: 372 | position = len(byteArray) 373 | mode = abyte 374 | elif abyte == 0x04: 375 | # Delete at word start 376 | if mode not in [0x01, 0x04]: 377 | position = 0 378 | # Delete at word start 379 | mode = abyte 380 | else: 381 | print("Error: Inflection rule mode %x is not implemented" % abyte) 382 | return None 383 | return utf8_str(convert_to_bytes(byteArray)) 384 | -------------------------------------------------------------------------------- /lib/mobi_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, utf8_str 8 | 9 | if PY2: 10 | range = xrange 11 | 12 | import re 13 | # note: re requites the pattern to be the exact same type as the data to be searched in python3 14 | # but u"" is not allowed for the pattern itself only b"" 15 | 16 | from .mobi_utils import fromBase32 17 | 18 | class HTMLProcessor: 19 | 20 | def __init__(self, files, metadata, rscnames): 21 | self.files = files 22 | self.metadata = metadata 23 | self.rscnames = rscnames 24 | # for original style mobis, default to including all image files in the opf manifest 25 | self.used = {} 26 | for name in rscnames: 27 | self.used[name] = 'used' 28 | 29 | def findAnchors(self, rawtext, indx_data, positionMap): 30 | # process 
the raw text 31 | # find anchors... 32 | print("Find link anchors") 33 | link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', re.IGNORECASE) 34 | # TEST NCX: merge in filepos from indx 35 | pos_links = [int(m.group(1)) for m in link_pattern.finditer(rawtext)] 36 | if indx_data: 37 | pos_indx = [e['pos'] for e in indx_data if e['pos']>0] 38 | pos_links = list(set(pos_links + pos_indx)) 39 | 40 | for position in pos_links: 41 | if position in positionMap: 42 | positionMap[position] = positionMap[position] + utf8_str('' % position) 43 | else: 44 | positionMap[position] = utf8_str('' % position) 45 | 46 | # apply dictionary metadata and anchors 47 | print("Insert data into html") 48 | pos = 0 49 | lastPos = len(rawtext) 50 | dataList = [] 51 | for end in sorted(positionMap.keys()): 52 | if end == 0 or end > lastPos: 53 | continue # something's up - can't put a tag in outside ... 54 | dataList.append(rawtext[pos:end]) 55 | dataList.append(positionMap[end]) 56 | pos = end 57 | dataList.append(rawtext[pos:]) 58 | srctext = b"".join(dataList) 59 | rawtext = None 60 | dataList = None 61 | self.srctext = srctext 62 | self.indx_data = indx_data 63 | return srctext 64 | 65 | def insertHREFS(self): 66 | srctext = self.srctext 67 | rscnames = self.rscnames 68 | metadata = self.metadata 69 | 70 | # put in the hrefs 71 | print("Insert hrefs into html") 72 | # There doesn't seem to be a standard, so search as best as we can 73 | 74 | link_pattern = re.compile(br''']*?)filepos=['"]{0,1}0*(\d+)['"]{0,1}([^>]*?)>''', re.IGNORECASE) 75 | srctext = link_pattern.sub(br'''''', srctext) 76 | 77 | # remove empty anchors 78 | print("Remove empty anchors from html") 79 | srctext = re.sub(br"",br"", srctext) 80 | srctext = re.sub(br"\s*",br"", srctext) 81 | 82 | # convert image references 83 | print("Insert image references into html") 84 | # split string into image tag pieces and other pieces 85 | image_pattern = re.compile(br'''()''', re.IGNORECASE) 86 | image_index_pattern = re.compile(br'''recindex=['"]{0,1}([0-9]+)['"]{0,1}''', re.IGNORECASE) 87 | srcpieces = image_pattern.split(srctext) 88 | srctext = self.srctext = None 89 | 90 | # all odd pieces are image tags (nulls string on even pieces if no space between them in srctext) 91 | for i in range(1, len(srcpieces), 2): 92 | tag = srcpieces[i] 93 | for m in image_index_pattern.finditer(tag): 94 | imageNumber = int(m.group(1)) 95 | imageName = rscnames[imageNumber-1] 96 | if imageName is None: 97 | print("Error: Referenced image %s was not recognized as a valid image" % imageNumber) 98 | else: 99 | replacement = b'src="Images/' + utf8_str(imageName) + b'"' 100 | tag = image_index_pattern.sub(replacement, tag, 1) 101 | srcpieces[i] = tag 102 | srctext = b"".join(srcpieces) 103 | 104 | # add in character set meta into the html header if needed 105 | if 'Codec' in metadata: 106 | srctext = srctext[0:12]+b''+srctext[12:] 107 | return srctext, self.used 108 | 109 | 110 | class XHTMLK8Processor: 111 | 112 | def __init__(self, rscnames, k8proc, viewport=None): 113 | self.rscnames = rscnames 114 | self.k8proc = k8proc 115 | self.viewport = viewport 116 | self.used = {} 117 | 118 | def buildXHTML(self): 119 | 120 | # first need to update all links that are internal which 121 | # are based on positions within the xhtml files **BEFORE** 122 | # cutting and pasting any pieces into the xhtml text files 123 | 124 | # kindle:pos:fid:XXXX:off:YYYYYYYYYY (used for internal link within xhtml) 125 | # XXXX is the offset in records into divtbl 126 | # YYYYYYYYYYYY 
is a base32 number you add to the divtbl insertpos to get final position 127 | 128 | # pos:fid pattern 129 | posfid_pattern = re.compile(br'''()''', re.IGNORECASE) 130 | posfid_index_pattern = re.compile(br'''['"]kindle:pos:fid:([0-9|A-V]+):off:([0-9|A-V]+).*?["']''') 131 | 132 | parts = [] 133 | print("Building proper xhtml for each file") 134 | for i in range(self.k8proc.getNumberOfParts()): 135 | part = self.k8proc.getPart(i) 136 | [partnum, dir, filename, beg, end, aidtext] = self.k8proc.getPartInfo(i) 137 | 138 | # internal links 139 | srcpieces = posfid_pattern.split(part) 140 | for j in range(1, len(srcpieces),2): 141 | tag = srcpieces[j] 142 | if tag.startswith(b'<'): 143 | for m in posfid_index_pattern.finditer(tag): 144 | posfid = m.group(1) 145 | offset = m.group(2) 146 | filename, idtag = self.k8proc.getIDTagByPosFid(posfid, offset) 147 | if idtag == b'': 148 | replacement= b'"' + utf8_str(filename) + b'"' 149 | else: 150 | replacement = b'"' + utf8_str(filename) + b'#' + idtag + b'"' 151 | tag = posfid_index_pattern.sub(replacement, tag, 1) 152 | srcpieces[j] = tag 153 | part = b"".join(srcpieces) 154 | parts.append(part) 155 | 156 | # we are free to cut and paste as we see fit 157 | # we can safely remove all of the Kindlegen generated aid tags 158 | # change aid ids that are in k8proc.linked_aids to xhtml ids 159 | find_tag_with_aid_pattern = re.compile(br'''(<[^>]*\said\s*=[^>]*>)''', re.IGNORECASE) 160 | within_tag_aid_position_pattern = re.compile(br'''\said\s*=['"]([^'"]*)['"]''') 161 | for i in range(len(parts)): 162 | part = parts[i] 163 | srcpieces = find_tag_with_aid_pattern.split(part) 164 | for j in range(len(srcpieces)): 165 | tag = srcpieces[j] 166 | if tag.startswith(b'<'): 167 | for m in within_tag_aid_position_pattern.finditer(tag): 168 | try: 169 | aid = m.group(1) 170 | except IndexError: 171 | aid = None 172 | replacement = b'' 173 | if aid in self.k8proc.linked_aids: 174 | replacement = b' id="aid-' + aid + b'"' 175 | tag = within_tag_aid_position_pattern.sub(replacement, tag, 1) 176 | srcpieces[j] = tag 177 | part = b"".join(srcpieces) 178 | parts[i] = part 179 | 180 | # we can safely replace all of the Kindlegen generated data-AmznPageBreak tags 181 | # with page-break-after style patterns 182 | find_tag_with_AmznPageBreak_pattern = re.compile(br'''(<[^>]*\sdata-AmznPageBreak=[^>]*>)''', re.IGNORECASE) 183 | within_tag_AmznPageBreak_position_pattern = re.compile(br'''\sdata-AmznPageBreak=['"]([^'"]*)['"]''') 184 | for i in range(len(parts)): 185 | part = parts[i] 186 | srcpieces = find_tag_with_AmznPageBreak_pattern.split(part) 187 | for j in range(len(srcpieces)): 188 | tag = srcpieces[j] 189 | if tag.startswith(b'<'): 190 | srcpieces[j] = within_tag_AmznPageBreak_position_pattern.sub( 191 | lambda m:b' style="page-break-after:' + m.group(1) + b'"', tag) 192 | part = b"".join(srcpieces) 193 | parts[i] = part 194 | 195 | # we have to handle substitutions for the flows pieces first as they may 196 | # be inlined into the xhtml text 197 | # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) 198 | # kindle:flow:XXXX?mime=YYYY/ZZZ (used for style sheets, svg images, etc) 199 | # kindle:embed:XXXX (used for fonts) 200 | 201 | flows = [] 202 | flows.append(None) 203 | flowinfo = [] 204 | flowinfo.append([None, None, None, None]) 205 | 206 | # regular expression search patterns 207 | img_pattern = re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) 208 | img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', 
re.IGNORECASE) 209 | 210 | tag_pattern = re.compile(br'''(<[^>]*>)''') 211 | flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) 212 | 213 | url_pattern = re.compile(br'''(url\(.*?\))''', re.IGNORECASE) 214 | url_img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)\?mime=image/[^\)]*["')]''', re.IGNORECASE) 215 | font_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)["')]''', re.IGNORECASE) 216 | url_css_index_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=text/css[^\)]*''', re.IGNORECASE) 217 | url_svg_image_pattern = re.compile(br'''kindle:flow:([0-9|A-V]+)\?mime=image/svg\+xml[^\)]*''', re.IGNORECASE) 218 | 219 | for i in range(1, self.k8proc.getNumberOfFlows()): 220 | [ftype, format, dir, filename] = self.k8proc.getFlowInfo(i) 221 | flowpart = self.k8proc.getFlow(i) 222 | 223 | # links to raster image files from image tags 224 | # image_pattern 225 | srcpieces = img_pattern.split(flowpart) 226 | for j in range(1, len(srcpieces),2): 227 | tag = srcpieces[j] 228 | if tag.startswith(b']*>)''') 323 | flow_pattern = re.compile(br'''['"]kindle:flow:([0-9|A-V]+)\?mime=([^'"]+)['"]''', re.IGNORECASE) 324 | for i in range(len(parts)): 325 | part = parts[i] 326 | [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] 327 | # flow pattern 328 | srcpieces = tag_pattern.split(part) 329 | for j in range(1, len(srcpieces),2): 330 | tag = srcpieces[j] 331 | if tag.startswith(b'<'): 332 | for m in flow_pattern.finditer(tag): 333 | num = fromBase32(m.group(1)) 334 | if num > 0 and num < len(self.k8proc.flowinfo): 335 | [typ, fmt, pdir, fnm] = self.k8proc.getFlowInfo(num) 336 | flowpart = flows[num] 337 | if fmt == b'inline': 338 | tag = flowpart 339 | else: 340 | replacement = b'"../' + utf8_str(pdir) + b'/' + utf8_str(fnm) + b'"' 341 | tag = flow_pattern.sub(replacement, tag, 1) 342 | self.used[fnm] = 'used' 343 | else: 344 | print("warning: ignoring non-existent flow link", tag, " value 0x%x" % num) 345 | srcpieces[j] = tag 346 | part = b''.join(srcpieces) 347 | 348 | # store away modified version 349 | parts[i] = part 350 | 351 | # Handle any embedded raster images links in style= attributes urls 352 | style_pattern = re.compile(br'''(<[a-zA-Z0-9]+\s[^>]*style\s*=\s*[^>]*>)''', re.IGNORECASE) 353 | img_index_pattern = re.compile(br'''[('"]kindle:embed:([0-9|A-V]+)[^'"]*['")]''', re.IGNORECASE) 354 | 355 | for i in range(len(parts)): 356 | part = parts[i] 357 | [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] 358 | 359 | # replace urls in style attributes 360 | srcpieces = style_pattern.split(part) 361 | for j in range(1, len(srcpieces),2): 362 | tag = srcpieces[j] 363 | if b'kindle:embed' in tag: 364 | for m in img_index_pattern.finditer(tag): 365 | imageNumber = fromBase32(m.group(1)) 366 | imageName = self.rscnames[imageNumber-1] 367 | osep = m.group()[0:1] 368 | csep = m.group()[-1:] 369 | if imageName is not None: 370 | replacement = osep + b'../Images/'+ utf8_str(imageName) + csep 371 | self.used[imageName] = 'used' 372 | tag = img_index_pattern.sub(replacement, tag, 1) 373 | else: 374 | print("Error: Referenced image %s in style url was not recognized in %s" % (imageNumber, tag)) 375 | srcpieces[j] = tag 376 | part = b"".join(srcpieces) 377 | 378 | # store away modified version 379 | parts[i] = part 380 | 381 | # Handle any embedded raster images links in the xhtml text 382 | # kindle:embed:XXXX?mime=image/gif (png, jpeg, etc) (used for images) 383 | img_pattern = 
re.compile(br'''(<[img\s|image\s][^>]*>)''', re.IGNORECASE) 384 | img_index_pattern = re.compile(br'''['"]kindle:embed:([0-9|A-V]+)[^'"]*['"]''') 385 | 386 | for i in range(len(parts)): 387 | part = parts[i] 388 | [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] 389 | 390 | # links to raster image files 391 | # image_pattern 392 | srcpieces = img_pattern.split(part) 393 | for j in range(1, len(srcpieces),2): 394 | tag = srcpieces[j] 395 | if tag.startswith(b' remove value="XX" attributes since these are illegal 415 | tag_pattern = re.compile(br'''(<[^>]*>)''') 416 | li_value_pattern = re.compile(br'''\svalue\s*=\s*['"][^'"]*['"]''', re.IGNORECASE) 417 | 418 | for i in range(len(parts)): 419 | part = parts[i] 420 | [partnum, dir, filename, beg, end, aidtext] = self.k8proc.partinfo[i] 421 | 422 | # tag pattern 423 | srcpieces = tag_pattern.split(part) 424 | for j in range(1, len(srcpieces),2): 425 | tag = srcpieces[j] 426 | if tag.startswith(b'\n' 440 | viewport_pattern = re.compile(br''']*name\s*=\s*["'][^"'>]*viewport["'][^>]*>''', re.IGNORECASE) 441 | for i in range(len(parts)): 442 | part = parts[i] 443 | # only inject if a viewport meta item does not already exist in that part 444 | if not viewport_pattern.search(part): 445 | endheadpos = part.find(b'') 446 | if endheadpos >= 0: 447 | part = part[0:endheadpos] + injected_meta + part[endheadpos:] 448 | parts[i] = part 449 | 450 | self.k8proc.setFlows(flows) 451 | self.k8proc.setParts(parts) 452 | 453 | return self.used 454 | -------------------------------------------------------------------------------- /lib/mobi_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, bchr, bstr, bord 8 | if PY2: 9 | range = xrange 10 | 11 | import struct 12 | # note: struct pack, unpack, unpack_from all require bytestring format 13 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 14 | 15 | from .mobi_utils import toHex 16 | 17 | class MobiIndex: 18 | 19 | def __init__(self, sect, DEBUG=False): 20 | self.sect = sect 21 | self.DEBUG = DEBUG 22 | 23 | def getIndexData(self, idx, label="Unknown"): 24 | sect = self.sect 25 | outtbl = [] 26 | ctoc_text = {} 27 | if idx != 0xffffffff: 28 | sect.setsectiondescription(idx,"{0} Main INDX section".format(label)) 29 | data = sect.loadSection(idx) 30 | idxhdr, hordt1, hordt2 = self.parseINDXHeader(data) 31 | IndexCount = idxhdr['count'] 32 | # handle the case of multiple sections used for CTOC 33 | rec_off = 0 34 | off = idx + IndexCount + 1 35 | for j in range(idxhdr['nctoc']): 36 | cdata = sect.loadSection(off + j) 37 | sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j)) 38 | ctocdict = self.readCTOC(cdata) 39 | for k in ctocdict: 40 | ctoc_text[k + rec_off] = ctocdict[k] 41 | rec_off += 0x10000 42 | tagSectionStart = idxhdr['len'] 43 | controlByteCount, tagTable = readTagSection(tagSectionStart, data) 44 | if self.DEBUG: 45 | print("ControlByteCount is", controlByteCount) 46 | print("IndexCount is", IndexCount) 47 | print("TagTable: %s" % tagTable) 48 | for i in range(idx + 1, idx + 1 + IndexCount): 49 | sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx)) 50 | data = sect.loadSection(i) 51 | hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data) 52 
| idxtPos = hdrinfo['start'] 53 | entryCount = hdrinfo['count'] 54 | if self.DEBUG: 55 | print(idxtPos, entryCount) 56 | # loop through to build up the IDXT position starts 57 | idxPositions = [] 58 | for j in range(entryCount): 59 | pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) 60 | idxPositions.append(pos) 61 | # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 62 | idxPositions.append(idxtPos) 63 | # for each entry in the IDXT build up the tagMap and any associated text 64 | for j in range(entryCount): 65 | startPos = idxPositions[j] 66 | endPos = idxPositions[j+1] 67 | textLength = ord(data[startPos:startPos+1]) 68 | text = data[startPos+1:startPos+1+textLength] 69 | if hordt2 is not None: 70 | text = b''.join(bchr(hordt2[bord(x)]) for x in text) 71 | tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) 72 | outtbl.append([text, tagMap]) 73 | if self.DEBUG: 74 | print(tagMap) 75 | print(text) 76 | return outtbl, ctoc_text 77 | 78 | def parseINDXHeader(self, data): 79 | "read INDX header" 80 | if not data[:4] == b'INDX': 81 | print("Warning: index section is not INDX") 82 | return False 83 | words = ( 84 | 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 85 | 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' 86 | ) 87 | num = len(words) 88 | values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) 89 | header = {} 90 | for n in range(num): 91 | header[words[n]] = values[n] 92 | 93 | ordt1 = None 94 | ordt2 = None 95 | 96 | ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) 97 | if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: 98 | # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify 99 | # them in the proper place in the header. They seem to be codepage 65002 which seems 100 | # to be some sort of strange EBCDIC utf-8 or 16 encoded strings 101 | 102 | # so we need to look for them and store them away to process leading text 103 | # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries 104 | # we only ever seem to use the seocnd but ... 105 | assert(ocnt == 1) 106 | assert(data[op1:op1+4] == b'ORDT') 107 | assert(data[op2:op2+4] == b'ORDT') 108 | ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) 109 | ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) 110 | 111 | if self.DEBUG: 112 | print("parsed INDX header:") 113 | for n in words: 114 | print(n, "%X" % header[n],) 115 | print("") 116 | return header, ordt1, ordt2 117 | 118 | def readCTOC(self, txtdata): 119 | # read all blocks from CTOC 120 | ctoc_data = {} 121 | offset = 0 122 | while offset next bytes: name 134 | name = txtdata[offset:offset+ilen] 135 | offset += ilen 136 | if self.DEBUG: 137 | print("name length is ", ilen) 138 | print(idx_offs, name) 139 | ctoc_data[idx_offs] = name 140 | return ctoc_data 141 | 142 | 143 | def getVariableWidthValue(data, offset): 144 | ''' 145 | Decode variable width value from given bytes. 146 | 147 | @param data: The bytes to decode. 148 | @param offset: The start offset into data. 149 | @return: Tuple of consumed bytes count and decoded value. 
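Worked example (illustrative): each byte contributes its low 7 bits and
a set high bit marks the final byte, so data b'\x0b\x8f' at offset 0
decodes to (2, 0x58F), i.e. consumed = 2 and value = (0x0B << 7) | 0x0F.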
150 | ''' 151 | value = 0 152 | consumed = 0 153 | finished = False 154 | while not finished: 155 | v = data[offset + consumed: offset + consumed + 1] 156 | consumed += 1 157 | if ord(v) & 0x80: 158 | finished = True 159 | value = (value << 7) | (ord(v) & 0x7f) 160 | return consumed, value 161 | 162 | 163 | def readTagSection(start, data): 164 | ''' 165 | Read tag section from given data. 166 | 167 | @param start: The start position in the data. 168 | @param data: The data to process. 169 | @return: Tuple of control byte count and list of tag tuples. 170 | ''' 171 | controlByteCount = 0 172 | tags = [] 173 | if data[start:start+4] == b"TAGX": 174 | firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) 175 | controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) 176 | 177 | # Skip the first 12 bytes already read above. 178 | for i in range(12, firstEntryOffset, 4): 179 | pos = start + i 180 | tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4]))) 181 | return controlByteCount, tags 182 | 183 | 184 | def countSetBits(value, bits=8): 185 | ''' 186 | Count the set bits in the given value. 187 | 188 | @param value: Integer value. 189 | @param bits: The number of bits of the input value (defaults to 8). 190 | @return: Number of set bits. 191 | ''' 192 | count = 0 193 | for _ in range(bits): 194 | if value & 0x01 == 0x01: 195 | count += 1 196 | value = value >> 1 197 | return count 198 | 199 | 200 | def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos): 201 | ''' 202 | Create a map of tags and values from the given byte section. 203 | 204 | @param controlByteCount: The number of control bytes. 205 | @param tagTable: The tag table. 206 | @param entryData: The data to process. 207 | @param startPos: The starting position in entryData. 208 | @param endPos: The end position in entryData or None if it is unknown. 209 | @return: Hashmap of tag and list of values. 210 | ''' 211 | tags = [] 212 | tagHashMap = {} 213 | controlByteIndex = 0 214 | dataStart = startPos + controlByteCount 215 | 216 | for tag, valuesPerEntry, mask, endFlag in tagTable: 217 | if endFlag == 0x01: 218 | controlByteIndex += 1 219 | continue 220 | cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) 221 | if 0: 222 | print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte)) 223 | 224 | value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask 225 | if value != 0: 226 | if value == mask: 227 | if countSetBits(mask) > 1: 228 | # If all bits of masked value are set and the mask has more than one bit, a variable width value 229 | # will follow after the control bytes which defines the length of bytes (NOT the value count!) 230 | # which will contain the corresponding variable width values. 231 | consumed, value = getVariableWidthValue(entryData, dataStart) 232 | dataStart += consumed 233 | tags.append((tag, None, value, valuesPerEntry)) 234 | else: 235 | tags.append((tag, 1, None, valuesPerEntry)) 236 | else: 237 | # Shift bits to get the masked value. 238 | while mask & 0x01 == 0: 239 | mask = mask >> 1 240 | value = value >> 1 241 | tags.append((tag, value, None, valuesPerEntry)) 242 | for tag, valueCount, valueBytes, valuesPerEntry in tags: 243 | values = [] 244 | if valueCount is not None: 245 | # Read valueCount * valuesPerEntry variable width values. 
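# (When valueCount is None, the else branch below instead treats
# valueBytes as a total byte budget and keeps decoding variable width
# values until exactly that many bytes have been consumed.)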
246 | for _ in range(valueCount): 247 | for _ in range(valuesPerEntry): 248 | consumed, data = getVariableWidthValue(entryData, dataStart) 249 | dataStart += consumed 250 | values.append(data) 251 | else: 252 | # Convert valueBytes to variable width values. 253 | totalConsumed = 0 254 | while totalConsumed < valueBytes: 255 | # Does this work for valuesPerEntry != 1? 256 | consumed, data = getVariableWidthValue(entryData, dataStart) 257 | dataStart += consumed 258 | totalConsumed += consumed 259 | values.append(data) 260 | if totalConsumed != valueBytes: 261 | print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed)) 262 | tagHashMap[tag] = values 263 | # Test that all bytes have been processed if endPos is given. 264 | if endPos is not None and dataStart != endPos: 265 | # The last entry might have some zero padding bytes, so complain only if non zero bytes are left. 266 | for char in entryData[dataStart:endPos]: 267 | if bord(char) != 0: 268 | print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos])) 269 | if 0: 270 | print("controlByteCount: %s" % controlByteCount) 271 | print("tagTable: %s" % tagTable) 272 | print("data: %s" % toHex(entryData[startPos:endPos])) 273 | print("tagHashMap: %s" % tagHashMap) 274 | break 275 | 276 | return tagHashMap 277 | -------------------------------------------------------------------------------- /lib/mobi_k8resc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. 8 | """ set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" 9 | 10 | if DEBUG_USE_ORDERED_DICTIONARY: 11 | from collections import OrderedDict as dict_ 12 | else: 13 | dict_ = dict 14 | 15 | from .compatibility_utils import unicode_str 16 | 17 | from .mobi_utils import fromBase32 18 | 19 | _OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', 20 | 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] 21 | 22 | class K8RESCProcessor(object): 23 | 24 | def __init__(self, data, debug=False): 25 | self._debug = debug 26 | self.resc = None 27 | self.opos = 0 28 | self.extrameta = [] 29 | self.cover_name = None 30 | self.spine_idrefs = {} 31 | self.spine_order = [] 32 | self.spine_pageattributes = {} 33 | self.spine_ppd = None 34 | # need3 indicate the book has fields which require epub3. 35 | # but the estimation of the source epub version from the fields is difficult. 36 | self.need3 = False 37 | self.package_ver = None 38 | self.extra_metadata = [] 39 | self.refines_metadata = [] 40 | self.extra_attributes = [] 41 | # get header 42 | start_pos = data.find(b'<') 43 | self.resc_header = data[:start_pos] 44 | # get resc data length 45 | start = self.resc_header.find(b'=') + 1 46 | end = self.resc_header.find(b'&', start) 47 | resc_size = 0 48 | if end > 0: 49 | resc_size = fromBase32(self.resc_header[start:end]) 50 | resc_rawbytes = len(data) - start_pos 51 | if resc_rawbytes == resc_size: 52 | self.resc_length = resc_size 53 | else: 54 | # Most RESC has a nul string at its tail but some do not. 
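# Fall back to the raw byte count when no nul terminator is found;
# otherwise measure up to the nul and warn if that length disagrees
# with the size recorded in the RESC header.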
55 | end_pos = data.find(b'\x00', start_pos) 56 | if end_pos < 0: 57 | self.resc_length = resc_rawbytes 58 | else: 59 | self.resc_length = end_pos - start_pos 60 | if self.resc_length != resc_size: 61 | print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size)) 62 | # now parse RESC after converting it to unicode from utf-8 63 | try: 64 | self.resc = unicode_str(data[start_pos:start_pos+self.resc_length]) 65 | except UnicodeDecodeError: 66 | self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1') 67 | self.parseData() 68 | 69 | def prepend_to_spine(self, key, idref, linear, properties): 70 | self.spine_order = [key] + self.spine_order 71 | self.spine_idrefs[key] = idref 72 | attributes = {} 73 | if linear is not None: 74 | attributes['linear'] = linear 75 | if properties is not None: 76 | attributes['properties'] = properties 77 | self.spine_pageattributes[key] = attributes 78 | 79 | # RESC tag iterator 80 | def resc_tag_iter(self): 81 | tcontent = last_tattr = None 82 | prefix = [''] 83 | while True: 84 | text, tag = self.parseresc() 85 | if text is None and tag is None: 86 | break 87 | if text is not None: 88 | tcontent = text.rstrip(' \r\n') 89 | else: # we have a tag 90 | ttype, tname, tattr = self.parsetag(tag) 91 | if ttype == 'begin': 92 | tcontent = None 93 | prefix.append(tname + '.') 94 | if tname in _OPF_PARENT_TAGS: 95 | yield ''.join(prefix), tname, tattr, tcontent 96 | else: 97 | last_tattr = tattr 98 | else: # single or end 99 | if ttype == 'end': 100 | prefix.pop() 101 | tattr = last_tattr 102 | last_tattr = None 103 | if tname in _OPF_PARENT_TAGS: 104 | tname += '-end' 105 | yield ''.join(prefix), tname, tattr, tcontent 106 | tcontent = None 107 | 108 | # now parse the RESC to extract spine and extra metadata info 109 | def parseData(self): 110 | for prefix, tname, tattr, tcontent in self.resc_tag_iter(): 111 | if self._debug: 112 | print(" Parsing RESC: ", prefix, tname, tattr, tcontent) 113 | if tname == 'package': 114 | self.package_ver = tattr.get('version', '2.0') 115 | package_prefix = tattr.get('prefix','') 116 | if self.package_ver.startswith('3') or package_prefix.startswith('rendition'): 117 | self.need3 = True 118 | if tname == 'spine': 119 | self.spine_ppd = tattr.get('page-progession-direction', None) 120 | if self.spine_ppd is not None and self.spine_ppd == 'rtl': 121 | self.need3 = True 122 | if tname == 'itemref': 123 | skelid = tattr.pop('skelid', None) 124 | if skelid is None and len(self.spine_order) == 0: 125 | # assume it was removed initial coverpage 126 | skelid = 'coverpage' 127 | tattr['linear'] = 'no' 128 | self.spine_order.append(skelid) 129 | idref = tattr.pop('idref', None) 130 | if idref is not None: 131 | idref = 'x_' + idref 132 | self.spine_idrefs[skelid] = idref 133 | if 'id' in tattr: 134 | del tattr['id'] 135 | # tattr["id"] = 'x_' + tattr["id"] 136 | if 'properties' in tattr: 137 | self.need3 = True 138 | self.spine_pageattributes[skelid] = tattr 139 | if tname == 'meta' or tname.startswith('dc:'): 140 | if 'refines' in tattr or 'property' in tattr: 141 | self.need3 = True 142 | if tattr.get('name','') == 'cover': 143 | cover_name = tattr.get('content',None) 144 | if cover_name is not None: 145 | cover_name = 'x_' + cover_name 146 | self.cover_name = cover_name 147 | else: 148 | self.extrameta.append([tname, tattr, tcontent]) 149 | 150 | # parse and return either leading text or the next tag 151 | def parseresc(self): 152 | p = self.opos 153 | if p 
>= len(self.resc): 154 | return None, None 155 | if self.resc[p] != '<': 156 | res = self.resc.find('<',p) 157 | if res == -1 : 158 | res = len(self.resc) 159 | self.opos = res 160 | return self.resc[p:res], None 161 | # handle comment as a special case 162 | if self.resc[p:p+4] == '',p+1) 164 | if te != -1: 165 | te = te+2 166 | else: 167 | te = self.resc.find('>',p+1) 168 | ntb = self.resc.find('<',p+1) 169 | if ntb != -1 and ntb < te: 170 | self.opos = ntb 171 | return self.resc[p:ntb], None 172 | self.opos = te + 1 173 | return None, self.resc[p:te+1] 174 | 175 | # parses tag to identify: [tname, ttype, tattr] 176 | # tname: tag name 177 | # ttype: tag type ('begin', 'end' or 'single'); 178 | # tattr: dictionary of tag atributes 179 | def parsetag(self, s): 180 | p = 1 181 | tname = None 182 | ttype = None 183 | tattr = dict_() 184 | while s[p:p+1] == ' ' : 185 | p += 1 186 | if s[p:p+1] == '/': 187 | ttype = 'end' 188 | p += 1 189 | while s[p:p+1] == ' ' : 190 | p += 1 191 | b = p 192 | while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : 193 | p += 1 194 | tname=s[b:p].lower() 195 | # some special cases 196 | if tname == '?xml': 197 | tname = 'xml' 198 | if tname == '!--': 199 | ttype = 'single' 200 | comment = s[p:-3].strip() 201 | tattr['comment'] = comment 202 | if ttype is None: 203 | # parse any attributes of begin or single tags 204 | while s.find('=',p) != -1 : 205 | while s[p:p+1] == ' ' : 206 | p += 1 207 | b = p 208 | while s[p:p+1] != '=' : 209 | p += 1 210 | aname = s[b:p].lower() 211 | aname = aname.rstrip(' ') 212 | p += 1 213 | while s[p:p+1] == ' ' : 214 | p += 1 215 | if s[p:p+1] in ('"', "'") : 216 | p = p + 1 217 | b = p 218 | while s[p:p+1] not in ('"', "'"): 219 | p += 1 220 | val = s[b:p] 221 | p += 1 222 | else : 223 | b = p 224 | while s[p:p+1] not in ('>', '/', ' ') : 225 | p += 1 226 | val = s[b:p] 227 | tattr[aname] = val 228 | if ttype is None: 229 | ttype = 'begin' 230 | if s.find('/',p) >= 0: 231 | ttype = 'single' 232 | return ttype, tname, tattr 233 | 234 | def taginfo_toxml(self, taginfo): 235 | res = [] 236 | tname, tattr, tcontent = taginfo 237 | res.append('<' + tname) 238 | if tattr is not None: 239 | for key in tattr: 240 | res.append(' ' + key + '="'+tattr[key]+'"') 241 | if tcontent is not None: 242 | res.append('>' + tcontent + '\n') 243 | else: 244 | res.append('/>\n') 245 | return "".join(res) 246 | 247 | def hasSpine(self): 248 | return len(self.spine_order) > 0 249 | 250 | def needEPUB3(self): 251 | return self.need3 252 | 253 | def hasRefines(self): 254 | for [tname, tattr, tcontent] in self.extrameta: 255 | if 'refines' in tattr: 256 | return True 257 | return False 258 | 259 | def createMetadata(self, epubver): 260 | for taginfo in self.extrameta: 261 | tname, tattr, tcontent = taginfo 262 | if 'refines' in tattr: 263 | if epubver == 'F' and 'property' in tattr: 264 | attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) 265 | self.extra_attributes.append(attr) 266 | else: 267 | tag = self.taginfo_toxml(taginfo) 268 | self.refines_metadata.append(tag) 269 | else: 270 | tag = self.taginfo_toxml(taginfo) 271 | self.extra_metadata.append(tag) 272 | -------------------------------------------------------------------------------- /lib/mobi_nav.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, 
print_function 6 | 7 | from .compatibility_utils import unicode_str 8 | import os 9 | from .unipath import pathof 10 | 11 | import re 12 | # note: re requites the pattern to be the exact same type as the data to be searched in python3 13 | # but u"" is not allowed for the pattern itself only b"" 14 | 15 | DEBUG_NAV = False 16 | 17 | FORCE_DEFAULT_TITLE = False 18 | """ Set to True to force to use the default title. """ 19 | 20 | NAVIGATION_FINENAME = 'nav.xhtml' 21 | """ The name for the navigation document. """ 22 | 23 | DEFAULT_TITLE = 'Navigation' 24 | """ The default title for the navigation document. """ 25 | 26 | class NAVProcessor(object): 27 | 28 | def __init__(self, files): 29 | self.files = files 30 | self.navname = NAVIGATION_FINENAME 31 | 32 | def buildLandmarks(self, guidetext): 33 | header = '' 34 | header += ' \n' 41 | 42 | type_map = { 43 | 'cover' : 'cover', 44 | 'title-page' : 'title-page', 45 | # ?: 'frontmatter', 46 | 'text' : 'bodymatter', 47 | # ?: 'backmatter', 48 | 'toc' : 'toc', 49 | 'loi' : 'loi', 50 | 'lot' : 'lot', 51 | 'preface' : 'preface', 52 | 'bibliography' : 'bibliography', 53 | 'index' : 'index', 54 | 'glossary' : 'glossary', 55 | 'acknowledgements' : 'acknowledgements', 56 | 'colophon' : None, 57 | 'copyright-page' : None, 58 | 'dedication' : None, 59 | 'epigraph' : None, 60 | 'foreword' : None, 61 | 'notes' : None 62 | } 63 | 64 | re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) 65 | re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) 66 | re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) 67 | dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') 68 | 69 | data = '' 70 | references = re.findall(r'', unicode_str(guidetext), re.I) 71 | for reference in references: 72 | mo_type = re_type.search(reference) 73 | mo_title = re_title.search(reference) 74 | mo_link = re_link.search(reference) 75 | if mo_type is not None: 76 | type_ = type_map.get(mo_type.group(1), None) 77 | else: 78 | type_ = None 79 | if mo_title is not None: 80 | title = mo_title.group(1) 81 | else: 82 | title = None 83 | if mo_link is not None: 84 | link = mo_link.group(1) 85 | else: 86 | link = None 87 | 88 | if type_ is not None and title is not None and link is not None: 89 | link = os.path.relpath(link, dir_).replace('\\', '/') 90 | data += element.format(type_, link, title) 91 | if len(data) > 0: 92 | return header + data + footer 93 | else: 94 | return '' 95 | 96 | def buildTOC(self, indx_data): 97 | header = '' 98 | header += ' \n' 101 | 102 | # recursive part 103 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 104 | if start>len(indx_data) or end>len(indx_data): 105 | print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) 106 | return '' 107 | if DEBUG_NAV: 108 | print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) 109 | xhtml = '' 110 | if start <= 0: 111 | start = 0 112 | if end <= 0: 113 | end = len(indx_data) 114 | if lvl > max_lvl: 115 | max_lvl = lvl 116 | 117 | indent1 = ' ' * (2 + lvl * 2) 118 | indent2 = ' ' * (3 + lvl * 2) 119 | xhtml += indent1 + '
<ol>\n' 120 | for i in range(start, end): 121 | e = indx_data[i] 122 | htmlfile = e['filename'] 123 | desttag = e['idtag'] 124 | text = e['text'] 125 | if not e['hlvl'] == lvl: 126 | continue 127 | num += 1 128 | if desttag == '': 129 | link = htmlfile 130 | else: 131 | link = '{:s}#{:s}'.format(htmlfile, desttag) 132 | xhtml += indent2 + '<li>' 133 | entry = '<a href="{:s}">{:s}</a>'.format(link, text) 134 | xhtml += entry 135 | # recurs 136 | if e['child1'] >= 0: 137 | xhtml += '\n' 138 | xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 139 | e['child1'], e['childn'] + 1) 140 | xhtml += xhtmlrec 141 | xhtml += indent2 142 | # close entry 143 | xhtml += '</li>\n' 144 | xhtml += indent1 + '</ol>
\n' 145 | return xhtml, max_lvl, num 146 | 147 | data, max_lvl, num = recursINDX() 148 | if not len(indx_data) == num: 149 | print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) 150 | return header + data + footer 151 | 152 | def buildNAV(self, ncx_data, guidetext, title, lang): 153 | print("Building Navigation Document.") 154 | if FORCE_DEFAULT_TITLE: 155 | title = DEFAULT_TITLE 156 | nav_header = '' 157 | nav_header += '\n' 158 | nav_header += ' 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | %s 117 | 118 | 119 | ''' 120 | 121 | ncx_footer = \ 122 | ''' 123 | 124 | ''' 125 | 126 | ncx_entry = \ 127 | ''' 128 | 129 | %s 130 | 131 | ''' 132 | 133 | # recursive part 134 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 135 | if start>len(indx_data) or end>len(indx_data): 136 | print("Warning: missing INDX child entries", start, end, len(indx_data)) 137 | return '' 138 | if DEBUG_NCX: 139 | print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) 140 | xml = '' 141 | if start <= 0: 142 | start = 0 143 | if end <= 0: 144 | end = len(indx_data) 145 | if lvl > max_lvl: 146 | max_lvl = lvl 147 | indent = ' ' * (2 + lvl) 148 | 149 | for i in range(start, end): 150 | e = indx_data[i] 151 | if not e['hlvl'] == lvl: 152 | continue 153 | # open entry 154 | num += 1 155 | link = '%s#filepos%d' % (htmlfile, e['pos']) 156 | tagid = 'np_%d' % num 157 | entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) 158 | entry = re.sub(re.compile('^', re.M), indent, entry, 0) 159 | xml += entry + '\n' 160 | # recurs 161 | if e['child1']>=0: 162 | xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 163 | e['child1'], e['childn'] + 1) 164 | xml += xmlrec 165 | # close entry 166 | xml += indent + '\n' 167 | return xml, max_lvl, num 168 | 169 | body, max_lvl, num = recursINDX() 170 | header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title))) 171 | ncx = header + body + ncx_footer 172 | if not len(indx_data) == num: 173 | print("Warning: different number of entries in NCX", len(indx_data), num) 174 | return ncx 175 | 176 | def writeNCX(self, metadata): 177 | # build the xml 178 | self.isNCX = True 179 | print("Write ncx") 180 | # htmlname = os.path.basename(self.files.outbase) 181 | # htmlname += '.html' 182 | htmlname = 'book.html' 183 | xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) 184 | # write the ncx file 185 | # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') 186 | ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') 187 | with open(pathof(ncxname), 'wb') as f: 188 | f.write(xml.encode('utf-8')) 189 | 190 | def buildK8NCX(self, indx_data, title, ident, lang): 191 | ncx_header = \ 192 | ''' 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | %s 203 | 204 | 205 | ''' 206 | 207 | ncx_footer = \ 208 | ''' 209 | 210 | ''' 211 | 212 | ncx_entry = \ 213 | ''' 214 | 215 | %s 216 | 217 | ''' 218 | 219 | # recursive part 220 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 221 | if start>len(indx_data) or end>len(indx_data): 222 | print("Warning: missing INDX child entries", start, end, len(indx_data)) 223 | return '' 224 | if DEBUG_NCX: 225 | print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) 226 | xml = '' 227 | if start <= 0: 228 | start = 0 229 | if end <= 0: 230 | end = len(indx_data) 231 | if lvl > max_lvl: 232 | max_lvl = lvl 233 | indent = ' ' * (2 + lvl) 234 | 235 | for i in 
range(start, end): 236 | e = indx_data[i] 237 | htmlfile = e['filename'] 238 | desttag = e['idtag'] 239 | if not e['hlvl'] == lvl: 240 | continue 241 | # open entry 242 | num += 1 243 | if desttag == '': 244 | link = 'Text/%s' % htmlfile 245 | else: 246 | link = 'Text/%s#%s' % (htmlfile, desttag) 247 | tagid = 'np_%d' % num 248 | entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) 249 | entry = re.sub(re.compile('^', re.M), indent, entry, 0) 250 | xml += entry + '\n' 251 | # recurs 252 | if e['child1']>=0: 253 | xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 254 | e['child1'], e['childn'] + 1) 255 | xml += xmlrec 256 | # close entry 257 | xml += indent + '\n' 258 | return xml, max_lvl, num 259 | 260 | body, max_lvl, num = recursINDX() 261 | header = ncx_header % (lang, ident, max_lvl + 1, xmlescape(unescapeit(title))) 262 | ncx = header + body + ncx_footer 263 | if not len(indx_data) == num: 264 | print("Warning: different number of entries in NCX", len(indx_data), num) 265 | return ncx 266 | 267 | def writeK8NCX(self, ncx_data, metadata): 268 | # build the xml 269 | self.isNCX = True 270 | print("Write K8 ncx") 271 | xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) 272 | bname = 'toc.ncx' 273 | ncxname = os.path.join(self.files.k8oebps,bname) 274 | with open(pathof(ncxname), 'wb') as f: 275 | f.write(xml.encode('utf-8')) 276 | -------------------------------------------------------------------------------- /lib/mobi_pagemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, unicode_str 8 | 9 | if PY2: 10 | range = xrange 11 | 12 | import struct 13 | # note: struct pack, unpack, unpack_from all require bytestring format 14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 15 | 16 | import re 17 | # note: re requites the pattern to be the exact same type as the data to be searched in python3 18 | # but u"" is not allowed for the pattern itself only b"" 19 | 20 | 21 | _TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)] 22 | 23 | def int_to_roman(i): 24 | parts = [] 25 | num = i 26 | for letter, value in _TABLE: 27 | while value <= num: 28 | num -= value 29 | parts.append(letter) 30 | return ''.join(parts) 31 | 32 | def roman_to_int(s): 33 | result = 0 34 | rnstr = s 35 | for letter, value in _TABLE: 36 | while rnstr.startswith(letter): 37 | result += value 38 | rnstr = rnstr[len(letter):] 39 | return result 40 | 41 | _pattern = r'''\(([^\)]*)\)''' 42 | _tup_pattern = re.compile(_pattern,re.IGNORECASE) 43 | 44 | 45 | def _parseNames(numpages, data): 46 | data = unicode_str(data) 47 | pagenames = [] 48 | pageMap = '' 49 | for i in range(numpages): 50 | pagenames.append(None) 51 | for m in re.finditer(_tup_pattern, data): 52 | tup = m.group(1) 53 | if pageMap != '': 54 | pageMap += ',' 55 | pageMap += '(' + tup + ')' 56 | spos, nametype, svalue = tup.split(",") 57 | # print(spos, nametype, svalue) 58 | if nametype == 'a' or nametype == 'r': 59 | svalue = int(svalue) 60 | spos = int(spos) 61 | for i in range(spos - 1, numpages): 62 | if nametype == 'r': 63 | pname = int_to_roman(svalue) 64 | svalue 
+= 1 65 | elif nametype == 'a': 66 | pname = "%s" % svalue 67 | svalue += 1 68 | elif nametype == 'c': 69 | sp = svalue.find('|') 70 | if sp == -1: 71 | pname = svalue 72 | else: 73 | pname = svalue[0:sp] 74 | svalue = svalue[sp+1:] 75 | else: 76 | print("Error: unknown page numbering type", nametype) 77 | pagenames[i] = pname 78 | return pagenames, pageMap 79 | 80 | 81 | class PageMapProcessor: 82 | 83 | def __init__(self, mh, data): 84 | self.mh = mh 85 | self.data = data 86 | self.pagenames = [] 87 | self.pageoffsets = [] 88 | self.pageMap = '' 89 | self.pm_len = 0 90 | self.pm_nn = 0 91 | self.pn_bits = 0 92 | self.pmoff = None 93 | self.pmstr = '' 94 | print("Extracting Page Map Information") 95 | rev_len, = struct.unpack_from(b'>L', self.data, 0x10) 96 | # skip over header, revision string length data, and revision string 97 | ptr = 0x14 + rev_len 98 | pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) 99 | # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) 100 | self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] 101 | self.pmoff = self.data[ptr+8+self.pm_len:] 102 | offsize = b">L" 103 | offwidth = 4 104 | if self.pm_bits == 16: 105 | offsize = b">H" 106 | offwidth = 2 107 | ptr = 0 108 | for i in range(self.pm_nn): 109 | od, = struct.unpack_from(offsize, self.pmoff, ptr) 110 | ptr += offwidth 111 | self.pageoffsets.append(od) 112 | self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) 113 | 114 | def getPageMap(self): 115 | return self.pageMap 116 | 117 | def getNames(self): 118 | return self.pagenames 119 | 120 | def getOffsets(self): 121 | return self.pageoffsets 122 | 123 | # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file 124 | def generateKF8PageMapXML(self, k8proc): 125 | pagemapxml = '\n' 126 | for i in range(len(self.pagenames)): 127 | pos = self.pageoffsets[i] 128 | name = self.pagenames[i] 129 | if name is not None and name != "": 130 | [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) 131 | idtext = unicode_str(k8proc.getPageIDTag(pos)) 132 | linktgt = unicode_str(filename) 133 | if idtext != '': 134 | linktgt += '#' + idtext 135 | pagemapxml += '\n' % (name, dir, linktgt) 136 | pagemapxml += "\n" 137 | return pagemapxml 138 | 139 | def generateAPNX(self, apnx_meta): 140 | if apnx_meta['format'] == 'MOBI_8': 141 | content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta 142 | else: 143 | content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta 144 | content_header = content_header.encode('utf-8') 145 | page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta 146 | page_header = page_header.encode('utf-8') 147 | apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) 148 | apnx += struct.pack(b'>I', 12 + len(content_header)) 149 | apnx += struct.pack(b'>I', len(content_header)) 150 | apnx += content_header 151 | apnx += struct.pack(b'>H', 1) 152 | apnx += struct.pack(b'>H', len(page_header)) 153 | apnx += struct.pack(b'>H', self.pm_nn) 154 | apnx += struct.pack(b'>H', 32) 155 | apnx += page_header 156 | for page in self.pageoffsets: 157 | apnx += struct.pack(b'>L', page) 158 | return apnx 159 | -------------------------------------------------------------------------------- /lib/mobi_sectioner.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, hexlify, bstr, bord, bchar 8 | 9 | import datetime 10 | 11 | if PY2: 12 | range = xrange 13 | 14 | # note: struct pack, unpack, unpack_from all require bytestring format 15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 16 | import struct 17 | 18 | from .unipath import pathof 19 | 20 | DUMP = False 21 | """ Set to True to dump all possible information. """ 22 | 23 | class unpackException(Exception): 24 | pass 25 | 26 | 27 | def describe(data): 28 | txtans = '' 29 | hexans = hexlify(data) 30 | for i in data: 31 | if bord(i) < 32 or bord(i) > 127: 32 | txtans += '?' 33 | else: 34 | txtans += bchar(i).decode('latin-1') 35 | return '"' + txtans + '"' + ' 0x'+ hexans 36 | 37 | def datetimefrompalmtime(palmtime): 38 | if palmtime > 0x7FFFFFFF: 39 | pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime) 40 | else: 41 | pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime) 42 | return pythondatetime 43 | 44 | 45 | class Sectionizer: 46 | 47 | def __init__(self, filename): 48 | self.data = b'' 49 | with open(pathof(filename), 'rb') as f: 50 | self.data = f.read() 51 | self.palmheader = self.data[:78] 52 | self.palmname = self.data[:32] 53 | self.ident = self.palmheader[0x3C:0x3C+8] 54 | self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76) 55 | self.filelength = len(self.data) 56 | sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0) 57 | self.sectionoffsets = sectionsdata[::2] 58 | self.sectionattributes = sectionsdata[1::2] 59 | self.sectiondescriptions = ["" for x in range(self.num_sections+1)] 60 | self.sectiondescriptions[-1] = "File Length Only" 61 | return 62 | 63 | def dumpsectionsinfo(self): 64 | print("Section Offset Length UID Attribs Description") 65 | for i in range(self.num_sections): 66 | print("%3d %3X 0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[ 67 | i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i])) 68 | print("%3d %3X 0x%07X %s" % 69 | (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections])) 70 | 71 | def setsectiondescription(self, section, description): 72 | if section < len(self.sectiondescriptions): 73 | self.sectiondescriptions[section] = description 74 | else: 75 | print("Section out of range: %d, description %s" % (section,description)) 76 | 77 | def dumppalmheader(self): 78 | print("Palm Database Header") 79 | print("Database name: " + repr(self.palmheader[:32])) 80 | dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) 81 | print("Bitfield attributes: 0x%0X" % dbattributes,) 82 | if dbattributes != 0: 83 | print(" (",) 84 | if (dbattributes & 2): 85 | print("Read-only; ",) 86 | if (dbattributes & 4): 87 | print("Dirty AppInfoArea; ",) 88 | if (dbattributes & 8): 89 | print("Needs to be backed up; ",) 90 | if (dbattributes & 16): 91 | print("OK to install over newer; ",) 92 | if (dbattributes & 32): 93 | print("Reset after installation; ",) 94 | if (dbattributes & 64): 95 | 
print("No copying by PalmPilot beaming; ",) 96 | print(")") 97 | else: 98 | print("") 99 | print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) 100 | dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) 101 | print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) 102 | dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) 103 | print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) 104 | dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) 105 | if dbbackup != 0: 106 | print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) 107 | print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) 108 | print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) 109 | print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0]) 110 | print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) 111 | print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) 112 | expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) 113 | if expectedzero != 0: 114 | print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) 115 | print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) 116 | return 117 | 118 | def loadSection(self, section): 119 | before, after = self.sectionoffsets[section:section+2] 120 | return self.data[before:after] 121 | -------------------------------------------------------------------------------- /lib/mobi_split.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | import struct 8 | # note: struct pack, unpack, unpack_from all require bytestring format 9 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 10 | 11 | from .unipath import pathof 12 | 13 | 14 | # important pdb header offsets 15 | unique_id_seed = 68 16 | number_of_pdb_records = 76 17 | 18 | # important palmdoc header offsets 19 | book_length = 4 20 | book_record_count = 8 21 | first_pdb_record = 78 22 | 23 | # important rec0 offsets 24 | length_of_book = 4 25 | mobi_header_base = 16 26 | mobi_header_length = 20 27 | mobi_type = 24 28 | mobi_version = 36 29 | first_non_text = 80 30 | title_offset = 84 31 | first_resc_record = 108 32 | first_content_index = 192 33 | last_content_index = 194 34 | kf8_fdst_index = 192 # for KF8 mobi headers 35 | fcis_index = 200 36 | flis_index = 208 37 | srcs_index = 224 38 | srcs_count = 228 39 | primary_index = 244 40 | datp_index = 256 41 | huffoff = 112 42 | hufftbloff = 120 43 | 44 | def getint(datain,ofs,sz=b'L'): 45 | i, = struct.unpack_from(b'>'+sz,datain,ofs) 46 | return i 47 | 48 | def writeint(datain,ofs,n,len=b'L'): 49 | if len==b'L': 50 | return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:] 51 | else: 52 | return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:] 53 | 54 | def getsecaddr(datain,secno): 55 | nsec = getint(datain,number_of_pdb_records,b'H') 56 | assert secno>=0 & secnoL',2*nsec+1)) 78 | datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) 79 | datalst.append(struct.pack(b'>H',nsec)) 80 | newstart = zerosecstart 81 | for i in range(0,secno): 82 | 
ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 83 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) 84 | datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno))) 85 | for i in range(secno+1,nsec): 86 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 87 | ofs = ofs + dif 88 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) 89 | lpad = newstart - (first_pdb_record + 8*nsec) 90 | if lpad > 0: 91 | datalst.append(b'\0' * lpad) 92 | datalst.append(datain[zerosecstart:secstart]) 93 | datalst.append(secdata) 94 | datalst.append(datain[secend:]) 95 | dataout = b''.join(datalst) 96 | return dataout 97 | 98 | def nullsection(datain,secno): # make it zero-length without deleting it 99 | datalst = [] 100 | nsec = getint(datain,number_of_pdb_records,b'H') 101 | secstart, secend = getsecaddr(datain,secno) 102 | zerosecstart, zerosecend = getsecaddr(datain, 0) 103 | dif = secend-secstart 104 | datalst.append(datain[:first_pdb_record]) 105 | for i in range(0,secno+1): 106 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 107 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) 108 | for i in range(secno+1, nsec): 109 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 110 | ofs = ofs - dif 111 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) 112 | lpad = zerosecstart - (first_pdb_record + 8*nsec) 113 | if lpad > 0: 114 | datalst.append(b'\0' * lpad) 115 | datalst.append(datain[zerosecstart: secstart]) 116 | datalst.append(datain[secend:]) 117 | dataout = b''.join(datalst) 118 | return dataout 119 | 120 | def deletesectionrange(datain,firstsec,lastsec): # delete a range of sections 121 | datalst = [] 122 | firstsecstart,firstsecend = getsecaddr(datain,firstsec) 123 | lastsecstart,lastsecend = getsecaddr(datain,lastsec) 124 | zerosecstart, zerosecend = getsecaddr(datain, 0) 125 | dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1) 126 | nsec = getint(datain,number_of_pdb_records,b'H') 127 | datalst.append(datain[:unique_id_seed]) 128 | datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1)) 129 | datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) 130 | datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1))) 131 | newstart = zerosecstart - 8*(lastsec-firstsec+1) 132 | for i in range(0,firstsec): 133 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 134 | ofs = ofs-8*(lastsec-firstsec+1) 135 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) 136 | for i in range(lastsec+1,nsec): 137 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 138 | ofs = ofs - dif 139 | flgval = 2*(i-(lastsec-firstsec+1)) 140 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) 141 | lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1))) 142 | if lpad > 0: 143 | datalst.append(b'\0' * lpad) 144 | datalst.append(datain[zerosecstart:firstsecstart]) 145 | datalst.append(datain[lastsecend:]) 146 | dataout = b''.join(datalst) 147 | return dataout 148 | 149 | def insertsection(datain,secno,secdata): # insert a new section 150 | datalst = [] 151 | nsec = getint(datain,number_of_pdb_records,b'H') 152 | # print("inserting secno" , secno, "into" ,nsec, "sections") 153 | secstart,secend = getsecaddr(datain,secno) 154 | zerosecstart,zerosecend = getsecaddr(datain,0) 155 | dif = len(secdata) 156 | datalst.append(datain[:unique_id_seed]) 157 | 
datalst.append(struct.pack(b'>L',2*(nsec+1)+1)) 158 | datalst.append(datain[unique_id_seed+4:number_of_pdb_records]) 159 | datalst.append(struct.pack(b'>H',nsec+1)) 160 | newstart = zerosecstart + 8 161 | for i in range(0,secno): 162 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 163 | ofs += 8 164 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval)) 165 | datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno))) 166 | for i in range(secno,nsec): 167 | ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8) 168 | ofs = ofs + dif + 8 169 | flgval = 2*(i+1) 170 | datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval)) 171 | lpad = newstart - (first_pdb_record + 8*(nsec + 1)) 172 | if lpad > 0: 173 | datalst.append(b'\0' * lpad) 174 | datalst.append(datain[zerosecstart:secstart]) 175 | datalst.append(secdata) 176 | datalst.append(datain[secstart:]) 177 | dataout = b''.join(datalst) 178 | return dataout 179 | 180 | 181 | def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec): # insert a range of sections 182 | # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections") 183 | # dataout = sectiontarget 184 | # for idx in range(lastsec,firstsec-1,-1): 185 | # dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx)) 186 | # return dataout 187 | datalst = [] 188 | nsec = getint(sectiontarget,number_of_pdb_records,b'H') 189 | zerosecstart, zerosecend = getsecaddr(sectiontarget,0) 190 | insstart, nul = getsecaddr(sectiontarget,targetsec) 191 | nins = lastsec - firstsec + 1 192 | srcstart, nul = getsecaddr(sectionsource,firstsec) 193 | nul, srcend = getsecaddr(sectionsource,lastsec) 194 | newstart = zerosecstart + 8*nins 195 | 196 | datalst.append(sectiontarget[:unique_id_seed]) 197 | datalst.append(struct.pack(b'>L',2*(nsec+nins)+1)) 198 | datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records]) 199 | datalst.append(struct.pack(b'>H',nsec+nins)) 200 | for i in range(0,targetsec): 201 | ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) 202 | ofsnew = ofs + 8*nins 203 | flgvalnew = flgval 204 | datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) 205 | # print(ofsnew, flgvalnew, ofs, flgval) 206 | srcstart0, nul = getsecaddr(sectionsource,firstsec) 207 | for i in range(nins): 208 | isrcstart, nul = getsecaddr(sectionsource,firstsec+i) 209 | ofsnew = insstart + (isrcstart-srcstart0) + 8*nins 210 | flgvalnew = 2*(targetsec+i) 211 | datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew)) 212 | # print(ofsnew, flgvalnew) 213 | dif = srcend - srcstart 214 | for i in range(targetsec,nsec): 215 | ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8) 216 | ofsnew = ofs + dif + 8*nins 217 | flgvalnew = 2*(i+nins) 218 | datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew)) 219 | # print(ofsnew, flgvalnew, ofs, flgval) 220 | lpad = newstart - (first_pdb_record + 8*(nsec + nins)) 221 | if lpad > 0: 222 | datalst.append(b'\0' * lpad) 223 | datalst.append(sectiontarget[zerosecstart:insstart]) 224 | datalst.append(sectionsource[srcstart:srcend]) 225 | datalst.append(sectiontarget[insstart:]) 226 | dataout = b''.join(datalst) 227 | return dataout 228 | 229 | def get_exth_params(rec0): 230 | ebase = mobi_header_base + getint(rec0,mobi_header_length) 231 | elen = getint(rec0,ebase+4) 232 | enum = getint(rec0,ebase+8) 233 | return ebase,elen,enum 234 | 
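# --- Editor's illustrative sketch; not part of the original KindleUnpack source. ---
# Each EXTH record that follows the 12-byte EXTH header ('EXTH', length, count)
# is laid out as: 4-byte record id, 4-byte total record size (including this
# 8-byte record header), then (size - 8) bytes of payload.  A minimal dump of
# every record, walking the table exactly the way read_exth below does:
#
#   def dump_exth(rec0):
#       ebase, elen, enum = get_exth_params(rec0)
#       pos = ebase + 12
#       for _ in range(enum):
#           exth_id = getint(rec0, pos)
#           size = getint(rec0, pos + 4)
#           print(exth_id, repr(rec0[pos + 8:pos + size]))
#           pos += size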
235 | def add_exth(rec0,exth_num,exth_bytes): 236 | ebase,elen,enum = get_exth_params(rec0) 237 | newrecsize = 8+len(exth_bytes) 238 | newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\ 239 | struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:] 240 | newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize) 241 | return newrec0 242 | 243 | def read_exth(rec0,exth_num): 244 | exth_values = [] 245 | ebase,elen,enum = get_exth_params(rec0) 246 | ebase = ebase+12 247 | while enum>0: 248 | exth_id = getint(rec0,ebase) 249 | if exth_id == exth_num: 250 | # We might have multiple exths, so build a list. 251 | exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)]) 252 | enum = enum-1 253 | ebase = ebase+getint(rec0,ebase+4) 254 | return exth_values 255 | 256 | def write_exth(rec0,exth_num,exth_bytes): 257 | ebase,elen,enum = get_exth_params(rec0) 258 | ebase_idx = ebase+12 259 | enum_idx = enum 260 | while enum_idx>0: 261 | exth_id = getint(rec0,ebase_idx) 262 | if exth_id == exth_num: 263 | dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4) 264 | newrec0 = rec0 265 | if dif != 0: 266 | newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif) 267 | return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\ 268 | struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\ 269 | struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\ 270 | rec0[ebase_idx+getint(rec0,ebase_idx+4):] 271 | enum_idx = enum_idx-1 272 | ebase_idx = ebase_idx+getint(rec0,ebase_idx+4) 273 | return rec0 274 | 275 | def del_exth(rec0,exth_num): 276 | ebase,elen,enum = get_exth_params(rec0) 277 | ebase_idx = ebase+12 278 | enum_idx = 0 279 | while enum_idx < enum: 280 | exth_id = getint(rec0,ebase_idx) 281 | exth_size = getint(rec0,ebase_idx+4) 282 | if exth_id == exth_num: 283 | newrec0 = rec0 284 | newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size) 285 | newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:] 286 | newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:] 287 | return newrec0 288 | enum_idx += 1 289 | ebase_idx = ebase_idx+exth_size 290 | return rec0 291 | 292 | 293 | class mobi_split: 294 | 295 | def __init__(self, infile): 296 | datain = b'' 297 | with open(pathof(infile), 'rb') as f: 298 | datain = f.read() 299 | datain_rec0 = readsection(datain,0) 300 | ver = getint(datain_rec0,mobi_version) 301 | self.combo = (ver!=8) 302 | if not self.combo: 303 | return 304 | exth121 = read_exth(datain_rec0,121) 305 | if len(exth121) == 0: 306 | self.combo = False 307 | return 308 | else: 309 | # only pay attention to first exth121 310 | # (there should only be one) 311 | datain_kf8, = struct.unpack_from(b'>L',exth121[0],0) 312 | if datain_kf8 == 0xffffffff: 313 | self.combo = False 314 | return 315 | datain_kfrec0 =readsection(datain,datain_kf8) 316 | 317 | # create the standalone mobi7 318 | num_sec = getint(datain,number_of_pdb_records,b'H') 319 | # remove BOUNDARY up to but not including ELF record 320 | self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2) 321 | # check if there are SRCS records and delete them 322 | srcs = getint(datain_rec0,srcs_index) 323 | num_srcs = getint(datain_rec0,srcs_count) 324 | if srcs != 0xffffffff and num_srcs > 0: 325 | self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1) 326 | datain_rec0 = 
writeint(datain_rec0,srcs_index,0xffffffff) 327 | datain_rec0 = writeint(datain_rec0,srcs_count,0) 328 | # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff 329 | datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff)) 330 | # datain_rec0 = del_exth(datain_rec0,121) 331 | # datain_rec0 = del_exth(datain_rec0,534) 332 | # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well 333 | # set the EXTH 129 KF8 Masthead / Cover Image string to the null string 334 | datain_rec0 = write_exth(datain_rec0,129, b'') 335 | # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well 336 | 337 | # need to reset flags stored in 0x80-0x83 338 | # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 339 | # Bit Flags 340 | # 0x1000 = Bit 12 indicates if embedded fonts are used or not 341 | # 0x0800 = means this Header points to *shared* images/resource/fonts ?? 342 | # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 343 | # 0x0040 = exth exists 344 | # 0x0010 = Not sure but this is always set so far 345 | fval, = struct.unpack_from(b'>L',datain_rec0, 0x80) 346 | # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts 347 | fval = fval & 0x07FF 348 | datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:] 349 | 350 | self.result_file7 = writesection(self.result_file7,0,datain_rec0) 351 | 352 | # no need to replace kf8 style fcis with mobi 7 one 353 | # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8) 354 | # if fcis_secnum != 0xffffffff: 355 | # fcis_info = readsection(datain, fcis_secnum) 356 | # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) 357 | # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' 358 | # new_fcis += struct.pack(b'>L',text_len) 359 | # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' 360 | # self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis) 361 | 362 | firstimage = getint(datain_rec0,first_resc_record) 363 | lastimage = getint(datain_rec0,last_content_index,b'H') 364 | # print("Old First Image, last Image", firstimage,lastimage) 365 | if lastimage == 0xffff: 366 | # find the lowest of the next sections and copy up to that. 
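# (editor note) a lastimage of 0xffff means the MOBI header did not record a
# last content index; the loop below therefore bounds the image range by the
# nearest following special section instead -- the FCIS, FLIS, DATP and HUFF
# table offsets are scanned, and the section just before the lowest one found
# becomes the last image.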
367 | ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] 368 | for ofs,sz in ofs_list: 369 | n = getint(datain_rec0,ofs,sz) 370 | # print("n",n) 371 | if n > 0 and n < lastimage: 372 | lastimage = n-1 373 | print("First Image, last Image", firstimage,lastimage) 374 | 375 | # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid 376 | for i in range(firstimage,lastimage): 377 | imgsec = readsection(self.result_file7,i) 378 | if imgsec[0:4] in [b'RESC',b'FONT']: 379 | self.result_file7 = nullsection(self.result_file7,i) 380 | 381 | # mobi7 finished 382 | 383 | # create standalone mobi8 384 | self.result_file8 = deletesectionrange(datain,0,datain_kf8-1) 385 | target = getint(datain_kfrec0,first_resc_record) 386 | self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target) 387 | datain_kfrec0 =readsection(self.result_file8,0) 388 | 389 | # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4 390 | kf8starts = read_exth(datain_kfrec0,116) 391 | # If we have multiple StartOffset, keep only the last one 392 | kf8start_count = len(kf8starts) 393 | while kf8start_count > 1: 394 | kf8start_count -= 1 395 | datain_kfrec0 = del_exth(datain_kfrec0,116) 396 | 397 | # update the EXTH 125 KF8 Count of Images/Fonts/Resources 398 | datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1)) 399 | 400 | # need to reset flags stored in 0x80-0x83 401 | # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050 402 | # standalone mobi8 with exth: 0x0050 403 | # Bit Flags 404 | # 0x1000 = Bit 12 indicates if embedded fonts are used or not 405 | # 0x0800 = means this Header points to *shared* images/resource/fonts ?? 406 | # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8? 
407 | # 0x0040 = exth exists 408 | # 0x0010 = Not sure but this is always set so far 409 | fval, = struct.unpack_from('>L',datain_kfrec0, 0x80) 410 | fval = fval & 0x1FFF 411 | fval |= 0x0800 412 | datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:] 413 | 414 | # properly update other index pointers that have been shifted by the insertion of images 415 | ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')] 416 | for ofs,sz in ofs_list: 417 | n = getint(datain_kfrec0,ofs,sz) 418 | if n != 0xffffffff: 419 | datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz) 420 | self.result_file8 = writesection(self.result_file8,0,datain_kfrec0) 421 | 422 | # no need to replace kf8 style fcis with mobi 7 one 423 | # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8) 424 | # if fcis_secnum != 0xffffffff: 425 | # fcis_info = readsection(self.result_file8, fcis_secnum) 426 | # text_len, = struct.unpack_from(b'>L', fcis_info, 0x14) 427 | # new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00' 428 | # new_fcis += struct.pack(b'>L',text_len) 429 | # new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00' 430 | # self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis) 431 | 432 | # mobi8 finished 433 | 434 | def getResult8(self): 435 | return self.result_file8 436 | 437 | def getResult7(self): 438 | return self.result_file7 439 | -------------------------------------------------------------------------------- /lib/mobi_uncompress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, bchr, lmap, bstr 8 | 9 | if PY2: 10 | range = xrange 11 | 12 | import struct 13 | # note: struct pack, unpack, unpack_from all require bytestring format 14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 15 | 16 | 17 | class unpackException(Exception): 18 | pass 19 | 20 | class UncompressedReader: 21 | 22 | def unpack(self, data): 23 | return data 24 | 25 | class PalmdocReader: 26 | 27 | def unpack(self, i): 28 | o, p = b'', 0 29 | while p < len(i): 30 | # for python 3 must use slice since i[p] returns int while slice returns character 31 | c = ord(i[p:p+1]) 32 | p += 1 33 | if (c >= 1 and c <= 8): 34 | o += i[p:p+c] 35 | p += c 36 | elif (c < 128): 37 | o += bchr(c) 38 | elif (c >= 192): 39 | o += b' ' + bchr(c ^ 128) 40 | else: 41 | if p < len(i): 42 | c = (c << 8) | ord(i[p:p+1]) 43 | p += 1 44 | m = (c >> 3) & 0x07ff 45 | n = (c & 7) + 3 46 | if (m > n): 47 | o += o[-m:n-m] 48 | else: 49 | for _ in range(n): 50 | # because of completely ass-backwards decision by python mainters for python 3 51 | # we must use slice for bytes as i[p] returns int while slice returns character 52 | if m == 1: 53 | o += o[-m:] 54 | else: 55 | o += o[-m:-m+1] 56 | return o 57 | 58 | class HuffcdicReader: 59 | q = struct.Struct(b'>Q').unpack_from 60 | 61 | def loadHuff(self, huff): 62 | if huff[0:8] != b'HUFF\x00\x00\x00\x18': 63 | raise unpackException('invalid huff header') 64 | off1, off2 = struct.unpack_from(b'>LL', huff, 8) 65 | 66 | def dict1_unpack(v): 67 | codelen, term, maxcode = v&0x1f, v&0x80, v>>8 68 | assert codelen != 0 69 | if codelen 
<= 8: 70 | assert term 71 | maxcode = ((maxcode + 1) << (32 - codelen)) - 1 72 | return (codelen, term, maxcode) 73 | self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) 74 | 75 | dict2 = struct.unpack_from(b'>64L', huff, off2) 76 | self.mincode, self.maxcode = (), () 77 | for codelen, mincode in enumerate((0,) + dict2[0::2]): 78 | self.mincode += (mincode << (32 - codelen), ) 79 | for codelen, maxcode in enumerate((0,) + dict2[1::2]): 80 | self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) 81 | 82 | self.dictionary = [] 83 | 84 | def loadCdic(self, cdic): 85 | if cdic[0:8] != b'CDIC\x00\x00\x00\x10': 86 | raise unpackException('invalid cdic header') 87 | phrases, bits = struct.unpack_from(b'>LL', cdic, 8) 88 | n = min(1<<bits, phrases-len(self.dictionary)) 89 | h = struct.Struct(b'>H').unpack_from 90 | def getslice(off): 91 | blen, = h(cdic, 16+off) 92 | slice = cdic[18+off:18+off+(blen&0x7fff)] 93 | return (slice, blen&0x8000) 94 | self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) 95 | 96 | def unpack(self, data): 97 | q = HuffcdicReader.q 98 | 99 | bitsleft = len(data) * 8 100 | data += b"\x00\x00\x00\x00\x00\x00\x00\x00" 101 | pos = 0 102 | x, = q(data, pos) 103 | n = 32 104 | 105 | s = b'' 106 | while True: 107 | if n <= 0: 108 | pos += 4 109 | x, = q(data, pos) 110 | n += 32 111 | code = (x >> n) & ((1 << 32) - 1) 112 | 113 | codelen, term, maxcode = self.dict1[code >> 24] 114 | if not term: 115 | while code < self.mincode[codelen]: 116 | codelen += 1 117 | maxcode = self.maxcode[codelen] 118 | 119 | n -= codelen 120 | bitsleft -= codelen 121 | if bitsleft < 0: 122 | break 123 | 124 | r = (maxcode - code) >> (32 - codelen) 125 | slice, flag = self.dictionary[r] 126 | if not flag: 127 | self.dictionary[r] = None 128 | slice = self.unpack(slice) 129 | self.dictionary[r] = (slice, 1) 130 | s += slice 131 | return s 132 | -------------------------------------------------------------------------------- /lib/mobi_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | # flake8: noqa 5 | 6 | from __future__ import unicode_literals, division, absolute_import, print_function 7 | 8 | from .compatibility_utils import PY2, text_type, bchr, bord 9 | 10 | import binascii 11 | 12 | if PY2: 13 | range = xrange 14 | 15 | from itertools import cycle 16 | 17 | def getLanguage(langID, sublangID): 18 | mobilangdict = { 19 | 54 : {0 : 'af'}, # Afrikaans 20 | 28 : {0 : 'sq'}, # Albanian 21 | 1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq', 11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly', 22 | 6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'}, 23 | # Arabic, Arabic (Algeria), Arabic (Bahrain), Arabic (Egypt), Arabic 24 | # (Iraq), Arabic (Jordan), Arabic (Kuwait), Arabic (Lebanon), Arabic 25 | # (Libya), Arabic (Morocco), Arabic (Oman), Arabic (Qatar), Arabic 26 | # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab 27 | # Emirates), Arabic (Yemen) 28 | 43 : {0 : 'hy'}, # Armenian 29 | 77 : {0 : 'as'}, # Assamese 30 | 44 : {0 : 'az'}, # "Azeri" (IANA: Azerbaijani) 31 | 45 : {0 : 'eu'}, # Basque 32 | 35 : {0 : 'be'}, # Belarusian 33 | 69 : {0 : 'bn'}, # Bengali 34 | 2 : {0 : 'bg'}, # Bulgarian 35 | 3 : {0 : 'ca'}, # Catalan 36 | 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, 37 | # Chinese, Chinese (Hong Kong), Chinese (PRC), 
Chinese (Singapore), Chinese (Taiwan) 38 | 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian 39 | 5 : {0 : 'cs'}, # Czech 40 | 6 : {0 : 'da'}, # Danish 41 | 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) 42 | 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 10 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , 43 | 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, 44 | # English, English (Australia), English (Belize), English (Canada), 45 | # English (Ireland), English (Jamaica), English (New Zealand), English 46 | # (Philippines), English (South Africa), English (Trinidad), English 47 | # (United Kingdom), English (United States), English (Zimbabwe) 48 | 37 : {0 : 'et'}, # Estonian 49 | 56 : {0 : 'fo'}, # Faroese 50 | 41 : {0 : 'fa'}, # Farsi / Persian 51 | 11 : {0 : 'fi'}, # Finnish 52 | 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, 53 | # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) 54 | 55 : {0 : 'ka'}, # Georgian 55 | 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, 56 | # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) 57 | 8 : {0 : 'el'}, # Greek, Modern (1453-) 58 | 71 : {0 : 'gu'}, # Gujarati 59 | 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) 60 | 57 : {0 : 'hi'}, # Hindi 61 | 14 : {0 : 'hu'}, # Hungarian 62 | 15 : {0 : 'is'}, # Icelandic 63 | 33 : {0 : 'id'}, # Indonesian 64 | 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) 65 | 17 : {0 : 'ja'}, # Japanese 66 | 75 : {0 : 'kn'}, # Kannada 67 | 63 : {0 : 'kk'}, # Kazakh 68 | 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) 
69 | 18 : {0 : 'ko'}, # Korean 70 | 38 : {0 : 'lv'}, # Latvian 71 | 39 : {0 : 'lt'}, # Lithuanian 72 | 47 : {0 : 'mk'}, # Macedonian 73 | 62 : {0 : 'ms'}, # Malay 74 | 76 : {0 : 'ml'}, # Malayalam 75 | 58 : {0 : 'mt'}, # Maltese 76 | 78 : {0 : 'mr'}, # Marathi 77 | 97 : {0 : 'ne'}, # Nepali 78 | 20 : {0 : 'no'}, # Norwegian 79 | 72 : {0 : 'or'}, # Oriya 80 | 21 : {0 : 'pl'}, # Polish 81 | 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) 82 | 70 : {0 : 'pa'}, # Punjabi 83 | 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) 84 | 24 : {0 : 'ro'}, # Romanian 85 | 25 : {0 : 'ru'}, # Russian 86 | 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) 87 | # IANA code for "Northern Sami" is 'se' 88 | # 'SZ' is the IANA region code for Swaziland 89 | 79 : {0 : 'sa'}, # Sanskrit 90 | 27 : {0 : 'sk'}, # Slovak 91 | 36 : {0 : 'sl'}, # Slovenian 92 | 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) 93 | # 'SB' is IANA region code for 'Solomon Islands' 94 | # Lower Sorbian = 'dsb' 95 | # Upper Sorbian = 'hsb' 96 | # Sorbian Languages = 'wen' 97 | 10 : {0 : 'es' , 1 : 'es' , 11 : 'es-ar' , 16 : 'es-bo' , 13 : 'es-cl' , 9 : 'es-co' , 5 : 'es-cr' , 7 : 'es-do' , 98 | 12 : 'es-ec' , 17 : 'es-sv' , 4 : 'es-gt' , 18 : 'es-hn' , 2 : 'es-mx' , 19 : 'es-ni' , 6 : 'es-pa' , 99 | 15 : 'es-py' , 10 : 'es-pe' , 20 : 'es-pr' , 14 : 'es-uy' , 8 : 'es-ve'}, 100 | # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish 101 | # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), 102 | # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El 103 | # Salvador), Spanish (Guatemala), Spanish (Honduras), Spanish (Mexico), 104 | # Spanish (Nicaragua), Spanish (Panama), Spanish (Paraguay), Spanish 105 | # (Peru), Spanish (Puerto Rico), Spanish (Uruguay), Spanish (Venezuela) 106 | 48 : {0 : 'sx'}, # "Sutu" (not an IANA language code) 107 | # "Sutu" is another name for "Southern Sotho"? 
108 | # IANA code for "Southern Sotho" is 'st' 109 | 65 : {0 : 'sw'}, # Swahili 110 | 29 : {0 : 'sv' , 1 : 'sv' , 2 : 'sv-fi'}, # Swedish, Swedish (Finland) 111 | 73 : {0 : 'ta'}, # Tamil 112 | 68 : {0 : 'tt'}, # Tatar 113 | 74 : {0 : 'te'}, # Telugu 114 | 30 : {0 : 'th'}, # Thai 115 | 49 : {0 : 'ts'}, # Tsonga 116 | 50 : {0 : 'tn'}, # Tswana 117 | 31 : {0 : 'tr'}, # Turkish 118 | 34 : {0 : 'uk'}, # Ukrainian 119 | 32 : {0 : 'ur'}, # Urdu 120 | 67 : {0 : 'uz', 1 : 'uz'}, # Uzbek 121 | 42 : {0 : 'vi'}, # Vietnamese 122 | 52 : {0 : 'xh'}, # Xhosa 123 | 53 : {0 : 'zu'}, # Zulu 124 | } 125 | lang = "en" 126 | if langID in mobilangdict: 127 | subdict = mobilangdict[langID] 128 | lang = subdict[0] 129 | if sublangID in subdict: 130 | lang = subdict[sublangID] 131 | return lang 132 | 133 | 134 | def toHex(byteList): 135 | return binascii.hexlify(byteList) 136 | 137 | # returns base32 bytestring 138 | def toBase32(value, npad=4): 139 | digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' 140 | num_string=b'' 141 | current = value 142 | while current != 0: 143 | next, remainder = divmod(current, 32) 144 | rem_string = digits[remainder:remainder+1] 145 | num_string = rem_string + num_string 146 | current=next 147 | if num_string == b'': 148 | num_string = b'0' 149 | pad = npad - len(num_string) 150 | if pad > 0: 151 | num_string = b'0' * pad + num_string 152 | return num_string 153 | 154 | 155 | # converts base32 string to value 156 | def fromBase32(str_num): 157 | if isinstance(str_num, text_type): 158 | str_num = str_num.encode('latin-1') 159 | scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368] 160 | value = 0 161 | j = 0 162 | n = len(str_num) 163 | scale = 0 164 | for i in range(n): 165 | c = str_num[n-i-1:n-i] 166 | if c in b'0123456789': 167 | v = ord(c) - ord(b'0') 168 | else: 169 | v = ord(c) - ord(b'A') + 10 170 | if j < len(scalelst): 171 | scale = scalelst[j] 172 | else: 173 | scale = scale * 32 174 | j += 1 175 | if v != 0: 176 | value = value + (v * scale) 177 | return value 178 | 179 | 180 | # note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding) 181 | # in place of ascii you will get a byte to half-word or integer 182 | # one to one mapping of values from 0 - 255 183 | 184 | def mangle_fonts(encryption_key, data): 185 | if isinstance(encryption_key, text_type): 186 | encryption_key = encryption_key.encode('latin-1') 187 | crypt = data[:1024] 188 | key = cycle(iter(map(bord, encryption_key))) 189 | # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt]) 190 | encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt]) 191 | return encrypt + data[1024:] 192 | -------------------------------------------------------------------------------- /lib/mobiml2xhtml.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 3 | 4 | 5 | # this program works in concert with the output from KindleUnpack 6 | 7 | ''' 8 | Convert from Mobi ML to XHTML 9 | ''' 10 | 11 | from __future__ import division, absolute_import, print_function 12 | 13 | import os 14 | import sys 15 | import re 16 | 17 | SPECIAL_HANDLING_TAGS = { 18 | '?xml' : ('xmlheader', -1), 19 | '!--' : ('comment', -3), 20 | '!DOCTYPE' : ('doctype', -1), 21 | } 22 | 23 | SPECIAL_HANDLING_TYPES = ['xmlheader', 'doctype', 'comment'] 24 | 25 | SELF_CLOSING_TAGS = ['br' , 'hr', 'input', 'img', 'image', 'meta', 'spacer', 'link', 'frame', 'base', 'col', 'reference'] 26 | 27 | class MobiMLConverter(object): 28 | 29 | PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE) 30 | IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') 31 | 32 | def __init__(self, filename): 33 | self.base_css_rules = 'blockquote { margin: 0em 0em 0em 1.25em }\n' 34 | self.base_css_rules += 'p { margin: 0em }\n' 35 | self.base_css_rules += '.bold { font-weight: bold }\n' 36 | self.base_css_rules += '.italic { font-style: italic }\n' 37 | self.base_css_rules += '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n' 38 | self.tag_css_rules = {} 39 | self.tag_css_rule_cnt = 0 40 | self.path = [] 41 | self.filename = filename 42 | self.wipml = open(self.filename, 'r').read() 43 | self.pos = 0 44 | self.opfname = self.filename.rsplit('.',1)[0] + '.opf' 45 | self.opos = 0 46 | self.meta = '' 47 | self.cssname = os.path.join(os.path.dirname(self.filename),'styles.css') 48 | self.current_font_size = 3 49 | self.font_history = [] 50 | 51 | def cleanup_html(self): 52 | self.wipml = re.sub(r'
<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml) 53 | self.wipml = self.wipml.replace('\r\n', '\n') 54 | self.wipml = self.wipml.replace('> <', '>\n<') 55 | self.wipml = self.wipml.replace('<mbp: ', '<mbp:') 56 | self.wipml = re.sub(r'<?xml[^>]*>', '', self.wipml) 57 | self.wipml = self.wipml.replace('<br></br>','<br/>') 58 | 59 | def replace_page_breaks(self): 60 | self.wipml = self.PAGE_BREAK_PAT.sub( 61 | '<div class="mbp_pagebreak" />
', 62 | self.wipml) 63 | 64 | # parse leading text of ml and tag 65 | def parseml(self): 66 | p = self.pos 67 | if p >= len(self.wipml): 68 | return None 69 | if self.wipml[p] != '<': 70 | res = self.wipml.find('<',p) 71 | if res == -1 : 72 | res = len(self.wipml) 73 | self.pos = res 74 | return self.wipml[p:res], None 75 | # handle comment as a special case to deal with multi-line comments 76 | if self.wipml[p:p+4] == '<!--': 77 | te = self.wipml.find('-->',p+1) 78 | if te != -1: 79 | te = te+2 80 | else : 81 | te = self.wipml.find('>',p+1) 82 | ntb = self.wipml.find('<',p+1) 83 | if ntb != -1 and ntb < te: 84 | self.pos = ntb 85 | return self.wipml[p:ntb], None 86 | self.pos = te + 1 87 | return None, self.wipml[p:te+1] 88 | 89 | # parses string version of tag to identify its name, 90 | # its type 'begin', 'end' or 'single', 91 | # plus build a hashtable of its attributes 92 | # code is written to handle the possibility of very poor formatting 93 | def parsetag(self, s): 94 | p = 1 95 | # get the tag name 96 | tname = None 97 | ttype = None 98 | tattr = {} 99 | while s[p:p+1] == ' ' : 100 | p += 1 101 | if s[p:p+1] == '/': 102 | ttype = 'end' 103 | p += 1 104 | while s[p:p+1] == ' ' : 105 | p += 1 106 | b = p 107 | while s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n") : 108 | p += 1 109 | tname=s[b:p].lower() 110 | if tname == '!doctype': 111 | tname = '!DOCTYPE' 112 | # special cases 113 | if tname in SPECIAL_HANDLING_TAGS: 114 | ttype, backstep = SPECIAL_HANDLING_TAGS[tname] 115 | tattr['special'] = s[p:backstep] 116 | if ttype is None: 117 | # parse any attributes 118 | while s.find('=',p) != -1 : 119 | while s[p:p+1] == ' ' : 120 | p += 1 121 | b = p 122 | while s[p:p+1] != '=' : 123 | p += 1 124 | aname = s[b:p].lower() 125 | aname = aname.rstrip(' ') 126 | p += 1 127 | while s[p:p+1] == ' ' : 128 | p += 1 129 | if s[p:p+1] in ('"', "'") : 130 | p = p + 1 131 | b = p 132 | while s[p:p+1] not in ('"', "'") : 133 | p += 1 134 | val = s[b:p] 135 | p += 1 136 | else : 137 | b = p 138 | while s[p:p+1] not in ('>', '/', ' ') : 139 | p += 1 140 | val = s[b:p] 141 | tattr[aname] = val 142 | # label beginning and single tags 143 | if ttype is None: 144 | ttype = 'begin' 145 | if s.find(' /',p) >= 0: 146 | ttype = 'single_ext' 147 | elif s.find('/',p) >= 0: 148 | ttype = 'single' 149 | return ttype, tname, tattr 150 | 151 | # main routine to convert from mobi markup language to html 152 | def processml(self): 153 | 154 | # are these really needed 155 | html_done = False 156 | head_done = False 157 | body_done = False 158 | 159 | skip = False 160 | 161 | htmlstr = '' 162 | self.replace_page_breaks() 163 | self.cleanup_html() 164 | 165 | # now parse the cleaned up ml into standard xhtml 166 | while True: 167 | 168 | r = self.parseml() 169 | if not r: 170 | break 171 | 172 | text, tag = r 173 | 174 | if text: 175 | if not skip: 176 | htmlstr += text 177 | 178 | if tag: 179 | ttype, tname, tattr = self.parsetag(tag) 180 | 181 | # If we run into a DTD or xml declarations inside the body ... bail. 
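# (editor note) the loop above relies on this protocol: parseml() returns a
# (text, tag) pair with exactly one member non-None, and parsetag() maps the
# raw tag string to (ttype, tname, tattr) where ttype is 'begin', 'end',
# 'single', 'single_ext' or a special type ('xmlheader', 'doctype',
# 'comment').  For example, parsetag('<p align="center">') yields
# ('begin', 'p', {'align': 'center'}).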
182 | if tname in SPECIAL_HANDLING_TAGS and tname != 'comment' and body_done: 183 | htmlstr += '\n' 184 | break 185 | 186 | # make sure self-closing tags actually self-close 187 | if ttype == 'begin' and tname in SELF_CLOSING_TAGS: 188 | ttype = 'single' 189 | 190 | # make sure any end tags of self-closing tags are discarded 191 | if ttype == 'end' and tname in SELF_CLOSING_TAGS: 192 | continue 193 | 194 | # remove embedded guide and refernces from old mobis 195 | if tname in ('guide', 'ncx', 'reference') and ttype in ('begin', 'single', 'single_ext'): 196 | tname = 'removeme:{0}'.format(tname) 197 | tattr = None 198 | if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end': 199 | if self.path[-1] == 'removeme:{0}'.format(tname): 200 | tname = 'removeme:{0}'.format(tname) 201 | tattr = None 202 | 203 | # Get rid of font tags that only have a color attribute. 204 | if tname == 'font' and ttype in ('begin', 'single', 'single_ext'): 205 | if 'color' in tattr and len(tattr) == 1: 206 | tname = 'removeme:{0}'.format(tname) 207 | tattr = None 208 | 209 | # Get rid of empty spans in the markup. 210 | if tname == 'span' and ttype in ('begin', 'single', 'single_ext') and not len(tattr): 211 | tname = 'removeme:{0}'.format(tname) 212 | 213 | # need to handle fonts outside of the normal methods 214 | # so fonts tags won't be added to the self.path since we keep track 215 | # of font tags separately with self.font_history 216 | if tname == 'font' and ttype == 'begin': 217 | # check for nested font start tags 218 | if len(self.font_history) > 0 : 219 | # inject a font end tag 220 | taginfo = ('end', 'font', None) 221 | htmlstr += self.processtag(taginfo) 222 | self.font_history.append((ttype, tname, tattr)) 223 | # handle the current font start tag 224 | taginfo = (ttype, tname, tattr) 225 | htmlstr += self.processtag(taginfo) 226 | continue 227 | 228 | # check for nested font tags and unnest them 229 | if tname == 'font' and ttype == 'end': 230 | self.font_history.pop() 231 | # handle this font end tag 232 | taginfo = ('end', 'font', None) 233 | htmlstr += self.processtag(taginfo) 234 | # check if we were nested 235 | if len(self.font_history) > 0: 236 | # inject a copy of the most recent font start tag from history 237 | taginfo = self.font_history[-1] 238 | htmlstr += self.processtag(taginfo) 239 | continue 240 | 241 | # keep track of nesting path 242 | if ttype == 'begin': 243 | self.path.append(tname) 244 | elif ttype == 'end': 245 | if tname != self.path[-1]: 246 | print('improper nesting: ', self.path, tname, ttype) 247 | if tname not in self.path: 248 | # handle case of end tag with no beginning by injecting empty begin tag 249 | taginfo = ('begin', tname, None) 250 | htmlstr += self.processtag(taginfo) 251 | print(" - fixed by injecting empty start tag ", tname) 252 | self.path.append(tname) 253 | elif len(self.path) > 1 and tname == self.path[-2]: 254 | # handle case of dangling missing end 255 | taginfo = ('end', self.path[-1], None) 256 | htmlstr += self.processtag(taginfo) 257 | print(" - fixed by injecting end tag ", self.path[-1]) 258 | self.path.pop() 259 | self.path.pop() 260 | 261 | if tname == 'removeme:{0}'.format(tname): 262 | if ttype in ('begin', 'single', 'single_ext'): 263 | skip = True 264 | else: 265 | skip = False 266 | else: 267 | taginfo = (ttype, tname, tattr) 268 | htmlstr += self.processtag(taginfo) 269 | 270 | # handle potential issue of multiple html, head, and body sections 271 | if tname == 'html' and ttype == 'begin' and not html_done: 272 | 
htmlstr += '\n' 273 | html_done = True 274 | 275 | if tname == 'head' and ttype == 'begin' and not head_done: 276 | htmlstr += '\n' 277 | # also add in metadata and style link tags 278 | htmlstr += self.meta 279 | htmlstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' 280 | head_done = True 281 | 282 | if tname == 'body' and ttype == 'begin' and not body_done: 283 | htmlstr += '\n' 284 | body_done = True 285 | 286 | # handle issue of possibly missing html, head, and body tags 287 | # I have not seen this but the original did something like this so ... 288 | if not body_done: 289 | htmlstr = '<body>\n' + htmlstr + '</body>\n' 290 | if not head_done: 291 | headstr = '<head>\n' 292 | headstr += self.meta 293 | headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n' 294 | headstr += '</head>\n' 295 | htmlstr = headstr + htmlstr 296 | if not html_done: 297 | htmlstr = '<html>\n' + htmlstr + '</html>\n' 298 | 299 | # finally add DOCTYPE info 300 | htmlstr = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n' + htmlstr 301 | 302 | css = self.base_css_rules 303 | for cls, rule in self.tag_css_rules.items(): 304 | css += '.%s { %s }\n' % (cls, rule) 305 | 306 | return (htmlstr, css, self.cssname) 307 | 308 | def ensure_unit(self, raw, unit='px'): 309 | if re.search(r'\d+$', raw) is not None: 310 | raw += unit 311 | return raw 312 | 313 | # flatten possibly modified tag back to string 314 | def taginfo_tostring(self, taginfo): 315 | (ttype, tname, tattr) = taginfo 316 | if ttype is None or tname is None: 317 | return '' 318 | if ttype == 'end': 319 | return '</%s>' % tname 320 | if ttype in SPECIAL_HANDLING_TYPES and tattr is not None and 'special' in tattr: 321 | info = tattr['special'] 322 | if ttype == 'comment': 323 | return '<%s %s-->' % (tname, info) 324 | else: 325 | return '<%s %s>' % (tname, info) 326 | res = [] 327 | res.append('<%s' % tname) 328 | if tattr is not None: 329 | for key in tattr: 330 | res.append(' %s="%s"' % (key, tattr[key])) 331 | if ttype == 'single': 332 | res.append('/>') 333 | elif ttype == 'single_ext': 334 | res.append(' />') 335 | else : 336 | res.append('>') 337 | return "".join(res) 338 | 339 | # routines to convert from mobi ml tag attributes to xhtml attributes and styles 340 | def processtag(self, taginfo): 341 | # Converting mobi font sizes to numerics 342 | size_map = { 343 | 'xx-small': '1', 344 | 'x-small': '2', 345 | 'small': '3', 346 | 'medium': '4', 347 | 'large': '5', 348 | 'x-large': '6', 349 | 'xx-large': '7', 350 | } 351 | 352 | size_to_em_map = { 353 | '1': '.65em', 354 | '2': '.75em', 355 | '3': '1em', 356 | '4': '1.125em', 357 | '5': '1.25em', 358 | '6': '1.5em', 359 | '7': '2em', 360 | } 361 | 362 | # current tag to work on 363 | (ttype, tname, tattr) = taginfo 364 | if not tattr: 365 | tattr = {} 366 | 367 | styles = [] 368 | 369 | if tname is None or tname.startswith('removeme'): 370 | return '' 371 | 372 | # have not seen an example of this yet so keep it here to be safe 373 | # until this is better understood 374 | if tname in ('country-region', 'place', 'placetype', 'placename', 375 | 'state', 'city', 'street', 'address', 'content'): 376 | tname = 'div' if tname == 'content' else 'span' 377 | for key in list(tattr): 378 | tattr.pop(key) 379 | 380 | # handle general case of style, height, width, bgcolor in any tag 381 | if 'style' in tattr: 382 | style = tattr.pop('style').strip() 383 | if style: 384 | styles.append(style) 385 | 386 | if 'align' in tattr: 387 | align = tattr.pop('align').strip() 388 | if align: 389 | if tname in ('table', 'td', 'tr'): 390 | pass 391 | else: 392 | styles.append('text-align: %s' % align) 393 | 394 | if 'height' in tattr: 395 | height = tattr.pop('height').strip() 396 | if height 
and '<' not in height and '>' not in height and re.search(r'\d+', height): 397 | if tname in ('table', 'td', 'tr'): 398 | pass 399 | elif tname == 'img': 400 | tattr['height'] = height 401 | else: 402 | styles.append('margin-top: %s' % self.ensure_unit(height)) 403 | 404 | if 'width' in tattr: 405 | width = tattr.pop('width').strip() 406 | if width and re.search(r'\d+', width): 407 | if tname in ('table', 'td', 'tr'): 408 | pass 409 | elif tname == 'img': 410 | tattr['width'] = width 411 | else: 412 | styles.append('text-indent: %s' % self.ensure_unit(width)) 413 | if width.startswith('-'): 414 | styles.append('margin-left: %s' % self.ensure_unit(width[1:])) 415 | 416 | if 'bgcolor' in tattr: 417 | # no proprietary html allowed 418 | if tname == 'div': 419 | del tattr['bgcolor'] 420 | 421 | elif tname == 'font': 422 | # Change font tags to span tags 423 | tname = 'span' 424 | if ttype in ('begin', 'single', 'single_ext'): 425 | # move the face attribute to css font-family 426 | if 'face' in tattr: 427 | face = tattr.pop('face').strip() 428 | styles.append('font-family: "%s"' % face) 429 | 430 | # Monitor the constantly changing font sizes, change them to ems and move 431 | # them to css. The following will work for 'flat' font tags, but nested font tags 432 | # will cause things to go wonky. Need to revert to the parent font tag's size 433 | # when a closing tag is encountered. 434 | if 'size' in tattr: 435 | sz = tattr.pop('size').strip().lower() 436 | try: 437 | float(sz) 438 | except ValueError: 439 | if sz in size_map: 440 | sz = size_map[sz] 441 | else: 442 | if sz.startswith('-') or sz.startswith('+'): 443 | sz = self.current_font_size + float(sz) 444 | if sz > 7: 445 | sz = 7 446 | elif sz < 1: 447 | sz = 1 448 | sz = str(int(sz)) 449 | styles.append('font-size: %s' % size_to_em_map[sz]) 450 | self.current_font_size = int(sz) 451 | 452 | elif tname == 'img': 453 | for attr in ('width', 'height'): 454 | if attr in tattr: 455 | val = tattr[attr] 456 | if val.lower().endswith('em'): 457 | try: 458 | nval = float(val[:-2]) 459 | nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile 460 | tattr[attr] = "%dpx"%int(nval) 461 | except: 462 | del tattr[attr] 463 | elif val.lower().endswith('%'): 464 | del tattr[attr] 465 | 466 | # convert the anchor tags 467 | if 'filepos-id' in tattr: 468 | tattr['id'] = tattr.pop('filepos-id') 469 | if 'name' in tattr and tattr['name'] != tattr['id']: 470 | tattr['name'] = tattr['id'] 471 | 472 | if 'filepos' in tattr: 473 | filepos = tattr.pop('filepos') 474 | try: 475 | tattr['href'] = "#filepos%d" % int(filepos) 476 | except ValueError: 477 | pass 478 | 479 | if styles: 480 | ncls = None 481 | rule = '; '.join(styles) 482 | for sel, srule in self.tag_css_rules.items(): 483 | if srule == rule: 484 | ncls = sel 485 | break 486 | if ncls is None: 487 | self.tag_css_rule_cnt += 1 488 | ncls = 'rule_%d' % self.tag_css_rule_cnt 489 | self.tag_css_rules[ncls] = rule 490 | cls = tattr.get('class', '') 491 | cls = cls + (' ' if cls else '') + ncls 492 | tattr['class'] = cls 493 | 494 | # convert updated tag back to string representation 495 | if len(tattr) == 0: 496 | tattr = None 497 | taginfo = (ttype, tname, tattr) 498 | return self.taginfo_tostring(taginfo) 499 | 500 | ''' main only left in for testing outside of plugin ''' 501 | 502 | def main(argv=sys.argv): 503 | if len(argv) != 2: 504 | return 1 505 | else: 506 | infile = argv[1] 507 | 508 | try: 509 | print('Converting Mobi Markup Language to XHTML') 510 | mlc = 
MobiMLConverter(infile) 511 | print('Processing ...') 512 | htmlstr, css, cssname = mlc.processml() 513 | outname = infile.rsplit('.',1)[0] + '_converted.html' 514 | open(outname, 'w').write(htmlstr) 515 | open(cssname, 'w').write(css) 516 | print('Completed') 517 | print('XHTML version of book can be found at: ' + outname) 518 | 519 | except ValueError as e: 520 | print("Error: %s" % e) 521 | return 1 522 | 523 | return 0 524 | 525 | 526 | if __name__ == "__main__": 527 | sys.exit(main()) 528 | -------------------------------------------------------------------------------- /lib/unipath.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay 6 | # All rights reserved. 7 | # 8 | # Redistribution and use in source and binary forms, with or without modification, 9 | # are permitted provided that the following conditions are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright notice, this list of 12 | # conditions and the following disclaimer. 13 | # 14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list 15 | # of conditions and the following disclaimer in the documentation and/or other materials 16 | # provided with the distribution. 17 | # 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
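# Editor's illustrative usage sketch; not part of the original source.  The
# helpers below let callers mix byte and unicode paths freely, e.g. (assuming
# a hypothetical directory /some/dir):
#
#   from lib.unipath import pathof, walk
#   for relname in walk(b'/some/dir'):   # byte path in, unicode relpaths out
#       print(pathof(relname))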
27 | 28 | from __future__ import unicode_literals, division, absolute_import, print_function 29 | from .compatibility_utils import PY2, text_type, binary_type 30 | 31 | import sys 32 | import os 33 | 34 | # utility routines to convert all paths to be full unicode 35 | 36 | # Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding 37 | # Under Python 3, if bytes, try to convert it to unicode using os.fsencode() to decode it 38 | 39 | # Mac OS X and Windows will happily support full unicode paths 40 | # Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode 41 | 42 | fsencoding = sys.getfilesystemencoding() 43 | 44 | def pathof(s, enc=fsencoding): 45 | if s is None: 46 | return None 47 | if isinstance(s, text_type): 48 | return s 49 | if isinstance(s, binary_type): 50 | try: 51 | return s.decode(enc) 52 | except: 53 | pass 54 | return s 55 | 56 | def exists(s): 57 | return os.path.exists(pathof(s)) 58 | 59 | def isfile(s): 60 | return os.path.isfile(pathof(s)) 61 | 62 | def isdir(s): 63 | return os.path.isdir(pathof(s)) 64 | 65 | def mkdir(s): 66 | return os.mkdir(pathof(s)) 67 | 68 | def listdir(s): 69 | rv = [] 70 | for file in os.listdir(pathof(s)): 71 | rv.append(pathof(file)) 72 | return rv 73 | 74 | def getcwd(): 75 | if PY2: 76 | return os.getcwdu() 77 | return os.getcwd() 78 | 79 | def walk(top): 80 | top = pathof(top) 81 | rv = [] 82 | for base, dnames, names in os.walk(top): 83 | base = pathof(base) 84 | for name in names: 85 | name = pathof(name) 86 | rv.append(relpath(os.path.join(base, name), top)) 87 | return rv 88 | 89 | def relpath(path, start=None): 90 | return os.path.relpath(pathof(path) , pathof(start)) 91 | 92 | def abspath(path): 93 | return os.path.abspath(pathof(path)) 94 | -------------------------------------------------------------------------------- /lib/unpack_structure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import text_type 8 | 9 | from . import unipath 10 | from .unipath import pathof 11 | 12 | DUMP = False 13 | """ Set to True to dump all possible information. 
""" 14 | 15 | import os 16 | 17 | import re 18 | # note: re requites the pattern to be the exact same type as the data to be searched in python3 19 | # but u"" is not allowed for the pattern itself only b"" 20 | 21 | import zipfile 22 | import binascii 23 | from .mobi_utils import mangle_fonts 24 | 25 | class unpackException(Exception): 26 | pass 27 | 28 | class ZipInfo(zipfile.ZipInfo): 29 | 30 | def __init__(self, *args, **kwargs): 31 | if 'compress_type' in kwargs: 32 | compress_type = kwargs.pop('compress_type') 33 | super(ZipInfo, self).__init__(*args, **kwargs) 34 | self.compress_type = compress_type 35 | 36 | class fileNames: 37 | 38 | def __init__(self, infile, outdir): 39 | self.infile = infile 40 | self.outdir = outdir 41 | if not unipath.exists(self.outdir): 42 | unipath.mkdir(self.outdir) 43 | self.mobi7dir = os.path.join(self.outdir,'mobi7') 44 | if not unipath.exists(self.mobi7dir): 45 | unipath.mkdir(self.mobi7dir) 46 | self.imgdir = os.path.join(self.mobi7dir, 'Images') 47 | if not unipath.exists(self.imgdir): 48 | unipath.mkdir(self.imgdir) 49 | self.hdimgdir = os.path.join(self.outdir,'HDImages') 50 | if not unipath.exists(self.hdimgdir): 51 | unipath.mkdir(self.hdimgdir) 52 | self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0]) 53 | 54 | def getInputFileBasename(self): 55 | return os.path.splitext(os.path.basename(self.infile))[0] 56 | 57 | def makeK8Struct(self): 58 | self.k8dir = os.path.join(self.outdir,'mobi8') 59 | if not unipath.exists(self.k8dir): 60 | unipath.mkdir(self.k8dir) 61 | self.k8metainf = os.path.join(self.k8dir,'META-INF') 62 | if not unipath.exists(self.k8metainf): 63 | unipath.mkdir(self.k8metainf) 64 | self.k8oebps = os.path.join(self.k8dir,'OEBPS') 65 | if not unipath.exists(self.k8oebps): 66 | unipath.mkdir(self.k8oebps) 67 | self.k8images = os.path.join(self.k8oebps,'Images') 68 | if not unipath.exists(self.k8images): 69 | unipath.mkdir(self.k8images) 70 | self.k8fonts = os.path.join(self.k8oebps,'Fonts') 71 | if not unipath.exists(self.k8fonts): 72 | unipath.mkdir(self.k8fonts) 73 | self.k8styles = os.path.join(self.k8oebps,'Styles') 74 | if not unipath.exists(self.k8styles): 75 | unipath.mkdir(self.k8styles) 76 | self.k8text = os.path.join(self.k8oebps,'Text') 77 | if not unipath.exists(self.k8text): 78 | unipath.mkdir(self.k8text) 79 | 80 | # recursive zip creation support routine 81 | def zipUpDir(self, myzip, tdir, localname): 82 | currentdir = tdir 83 | if localname != "": 84 | currentdir = os.path.join(currentdir,localname) 85 | list = unipath.listdir(currentdir) 86 | for file in list: 87 | afilename = file 88 | localfilePath = os.path.join(localname, afilename) 89 | realfilePath = os.path.join(currentdir,file) 90 | if unipath.isfile(realfilePath): 91 | myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED) 92 | elif unipath.isdir(realfilePath): 93 | self.zipUpDir(myzip, tdir, localfilePath) 94 | 95 | def makeEPUB(self, usedmap, obfuscate_data, uid): 96 | bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub') 97 | # Create an encryption key for Adobe font obfuscation 98 | # based on the epub's uid 99 | if isinstance(uid,text_type): 100 | uid = uid.encode('ascii') 101 | if obfuscate_data: 102 | key = re.sub(br'[^a-fA-F0-9]', b'', uid) 103 | key = binascii.unhexlify((key + key)[:32]) 104 | 105 | # copy over all images and fonts that are actually used in the ebook 106 | # and remove all font files from mobi7 since not supported 107 | imgnames = 
unipath.listdir(self.imgdir) 108 | for name in imgnames: 109 | if usedmap.get(name,'not used') == 'used': 110 | filein = os.path.join(self.imgdir,name) 111 | if name.endswith(".ttf"): 112 | fileout = os.path.join(self.k8fonts,name) 113 | elif name.endswith(".otf"): 114 | fileout = os.path.join(self.k8fonts,name) 115 | elif name.endswith(".failed"): 116 | fileout = os.path.join(self.k8fonts,name) 117 | else: 118 | fileout = os.path.join(self.k8images,name) 119 | data = b'' 120 | with open(pathof(filein),'rb') as f: 121 | data = f.read() 122 | if obfuscate_data: 123 | if name in obfuscate_data: 124 | data = mangle_fonts(key, data) 125 | open(pathof(fileout),'wb').write(data) 126 | if name.endswith(".ttf") or name.endswith(".otf"): 127 | os.remove(pathof(filein)) 128 | 129 | # opf file name hard coded to "content.opf" 130 | container = '<?xml version="1.0" encoding="UTF-8"?>\n' 131 | container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n' 132 | container += '    <rootfiles>\n' 133 | container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>' 134 | container += '    </rootfiles>\n</container>\n' 135 | fileout = os.path.join(self.k8metainf,'container.xml') 136 | with open(pathof(fileout),'wb') as f: 137 | f.write(container.encode('utf-8')) 138 | 139 | if obfuscate_data: 140 | encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \ 141 | xmlns:enc="http://www.w3.org/2001/04/xmlenc#">\n' 142 | for font in obfuscate_data: 143 | encryption += '  <enc:EncryptedData>\n' 144 | encryption += '    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n' 145 | encryption += '    <enc:CipherData>\n' 146 | encryption += '      <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n' 147 | encryption += '    </enc:CipherData>\n' 148 | encryption += '  </enc:EncryptedData>\n' 149 | encryption += '</encryption>\n' 150 | fileout = os.path.join(self.k8metainf,'encryption.xml') 151 | with open(pathof(fileout),'wb') as f: 152 | f.write(encryption.encode('utf-8')) 153 | 154 | # ready to build epub 155 | self.outzip = zipfile.ZipFile(pathof(bname), 'w') 156 | 157 | # add the mimetype file uncompressed 158 | mimetype = b'application/epub+zip' 159 | fileout = os.path.join(self.k8dir,'mimetype') 160 | with open(pathof(fileout),'wb') as f: 161 | f.write(mimetype) 162 | nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED) 163 | nzinfo.external_attr = 0o600 << 16 # make this a normal file 164 | self.outzip.writestr(nzinfo, mimetype) 165 | self.zipUpDir(self.outzip,self.k8dir,'META-INF') 166 | self.zipUpDir(self.outzip,self.k8dir,'OEBPS') 167 | self.outzip.close() 168 | -------------------------------------------------------------------------------- /libgui/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai 3 | -------------------------------------------------------------------------------- /libgui/askfolder_ed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | # to work around tk_chooseDirectory not properly returning unicode paths on Windows 8 | # need to use a dialog that can be hacked up to actually return full unicode paths 9 | # originally based on AskFolder from EasyDialogs for Windows but modified to fix it 10 | # to actually use unicode for path 11 | 12 | # The original license for EasyDialogs is as follows 13 | # 14 | # Copyright (c) 2003-2005 Jimmy Retzlaff 15 | # 16 | # Permission is hereby granted, free of charge, to any person obtaining a 17 | # copy of this software and associated documentation files (the "Software"), 18 | # to deal in the Software without restriction, including without limitation 19 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 20 | # and/or 
sell copies of the Software, and to permit persons to whom the 21 | # Software is furnished to do so, subject to the following conditions: 22 | # 23 | # The above copyright notice and this permission notice shall be included in 24 | # all copies or substantial portions of the Software. 25 | # 26 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 27 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 28 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 29 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 30 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 31 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 32 | # DEALINGS IN THE SOFTWARE. 33 | 34 | """ 35 | AskFolder(...) -- Ask the user to select a folder Windows specific 36 | """ 37 | 38 | import ctypes 39 | from ctypes.wintypes import LPCWSTR 40 | import ctypes.wintypes as wintypes 41 | 42 | __all__ = ['AskFolder'] 43 | 44 | # Load required Windows DLLs 45 | ole32 = ctypes.windll.ole32 46 | shell32 = ctypes.windll.shell32 47 | user32 = ctypes.windll.user32 48 | 49 | # Windows Constants 50 | BFFM_INITIALIZED = 1 51 | BFFM_SETOKTEXT = 1129 52 | BFFM_SETSELECTIONA = 1126 53 | BFFM_SETSELECTIONW = 1127 54 | BIF_EDITBOX = 16 55 | BS_DEFPUSHBUTTON = 1 56 | CB_ADDSTRING = 323 57 | CB_GETCURSEL = 327 58 | CB_SETCURSEL = 334 59 | CDM_SETCONTROLTEXT = 1128 60 | EM_GETLINECOUNT = 186 61 | EM_GETMARGINS = 212 62 | EM_POSFROMCHAR = 214 63 | EM_SETSEL = 177 64 | GWL_STYLE = -16 65 | IDC_STATIC = -1 66 | IDCANCEL = 2 67 | IDNO = 7 68 | IDOK = 1 69 | IDYES = 6 70 | MAX_PATH = 260 71 | OFN_ALLOWMULTISELECT = 512 72 | OFN_ENABLEHOOK = 32 73 | OFN_ENABLESIZING = 8388608 74 | OFN_ENABLETEMPLATEHANDLE = 128 75 | OFN_EXPLORER = 524288 76 | OFN_OVERWRITEPROMPT = 2 77 | OPENFILENAME_SIZE_VERSION_400 = 76 78 | PBM_GETPOS = 1032 79 | PBM_SETMARQUEE = 1034 80 | PBM_SETPOS = 1026 81 | PBM_SETRANGE = 1025 82 | PBM_SETRANGE32 = 1030 83 | PBS_MARQUEE = 8 84 | PM_REMOVE = 1 85 | SW_HIDE = 0 86 | SW_SHOW = 5 87 | SW_SHOWNORMAL = 1 88 | SWP_NOACTIVATE = 16 89 | SWP_NOMOVE = 2 90 | SWP_NOSIZE = 1 91 | SWP_NOZORDER = 4 92 | VER_PLATFORM_WIN32_NT = 2 93 | WM_COMMAND = 273 94 | WM_GETTEXT = 13 95 | WM_GETTEXTLENGTH = 14 96 | WM_INITDIALOG = 272 97 | WM_NOTIFY = 78 98 | 99 | # Windows function prototypes 100 | BrowseCallbackProc = ctypes.WINFUNCTYPE(ctypes.c_int, wintypes.HWND, ctypes.c_uint, wintypes.LPARAM, wintypes.LPARAM) 101 | 102 | # Windows types 103 | LPCTSTR = ctypes.c_char_p 104 | LPTSTR = ctypes.c_char_p 105 | LPVOID = ctypes.c_voidp 106 | TCHAR = ctypes.c_char 107 | 108 | class BROWSEINFO(ctypes.Structure): 109 | _fields_ = [ 110 | ("hwndOwner", wintypes.HWND), 111 | ("pidlRoot", LPVOID), 112 | ("pszDisplayName", LPTSTR), 113 | ("lpszTitle", LPCTSTR), 114 | ("ulFlags", ctypes.c_uint), 115 | ("lpfn", BrowseCallbackProc), 116 | ("lParam", wintypes.LPARAM), 117 | ("iImage", ctypes.c_int) 118 | ] 119 | 120 | 121 | # Utilities 122 | def CenterWindow(hwnd): 123 | desktopRect = GetWindowRect(user32.GetDesktopWindow()) 124 | myRect = GetWindowRect(hwnd) 125 | x = width(desktopRect) // 2 - width(myRect) // 2 126 | y = height(desktopRect) // 2 - height(myRect) // 2 127 | user32.SetWindowPos(hwnd, 0, 128 | desktopRect.left + x, 129 | desktopRect.top + y, 130 | 0, 0, 131 | SWP_NOACTIVATE | SWP_NOSIZE | SWP_NOZORDER 132 | ) 133 | 134 | 135 | def GetWindowRect(hwnd): 136 | rect = wintypes.RECT() 137 | 
137 |     user32.GetWindowRect(hwnd, ctypes.byref(rect))
138 |     return rect
139 | 
140 | def width(rect):
141 |     return rect.right-rect.left
142 | 
143 | def height(rect):
144 |     return rect.bottom-rect.top
145 | 
146 | 
147 | def AskFolder(
148 |         message=None,
149 |         version=None,
150 |         defaultLocation=None,
151 |         location=None,
152 |         windowTitle=None,
153 |         actionButtonLabel=None,
154 |         cancelButtonLabel=None,
155 |         multiple=None):
156 |     """Display a dialog asking the user to select a folder.
157 |     Modified to use unicode strings as much as possible;
158 |     returns a unicode path.
159 |     """
160 | 
161 |     def BrowseCallback(hwnd, uMsg, lParam, lpData):
162 |         if uMsg == BFFM_INITIALIZED:
163 |             if actionButtonLabel:
164 |                 label = unicode(actionButtonLabel, errors='replace')
165 |                 user32.SendMessageW(hwnd, BFFM_SETOKTEXT, 0, label)
166 |             if cancelButtonLabel:
167 |                 label = unicode(cancelButtonLabel, errors='replace')
168 |                 cancelButton = user32.GetDlgItem(hwnd, IDCANCEL)
169 |                 if cancelButton:
170 |                     user32.SetWindowTextW(cancelButton, label)
171 |             if windowTitle:
172 |                 title = unicode(windowTitle, errors='replace')
173 |                 user32.SetWindowTextW(hwnd, title)
174 |             if defaultLocation:
175 |                 user32.SendMessageW(hwnd, BFFM_SETSELECTIONW, 1, defaultLocation.replace('/', '\\'))
176 |             if location:
177 |                 x, y = location
178 |                 desktopRect = wintypes.RECT()
179 |                 user32.GetWindowRect(0, ctypes.byref(desktopRect))
180 |                 user32.SetWindowPos(hwnd, 0,
181 |                                     desktopRect.left + x,
182 |                                     desktopRect.top + y, 0, 0,
183 |                                     SWP_NOACTIVATE | SWP_NOSIZE | SWP_NOZORDER)
184 |             else:
185 |                 CenterWindow(hwnd)
186 |         return 0
187 | 
188 |     # This next line is needed to prevent gc of the callback
189 |     callback = BrowseCallbackProc(BrowseCallback)
190 | 
191 |     browseInfo = BROWSEINFO()
192 |     browseInfo.pszDisplayName = ctypes.c_char_p('\0' * (MAX_PATH+1))
193 |     browseInfo.lpszTitle = message
194 |     browseInfo.lpfn = callback
195 | 
196 |     pidl = shell32.SHBrowseForFolder(ctypes.byref(browseInfo))
197 |     if not pidl:
198 |         result = None
199 |     else:
200 |         path = LPCWSTR(u" " * (MAX_PATH+1))
201 |         shell32.SHGetPathFromIDListW(pidl, path)
202 |         ole32.CoTaskMemFree(pidl)
203 |         result = path.value
204 |     return result
205 | 
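A hypothetical call (not in the repo) showing how the function above is used. KindleUnpack only imports this module on Windows under Python 2, so the snippet assumes that environment; the path and labels are made-up examples.

from libgui.askfolder_ed import AskFolder

folder = AskFolder(message='Choose where to unpack the eBook',
                   defaultLocation=u'C:\\Books',
                   windowTitle='KindleUnpack')
if folder is None:
    print('User cancelled')   # SHBrowseForFolder returned no pidl
else:
    print(folder)             # full unicode path to the chosen folder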
--------------------------------------------------------------------------------
/libgui/prefs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
4 | 
5 | from __future__ import unicode_literals, division, absolute_import, print_function
6 | 
7 | from lib.compatibility_utils import PY2, PY3, unicode_str
8 | from lib import unipath
9 | from lib.unipath import pathof
10 | 
11 | import os
12 | import json
13 | if PY2:
14 |     import codecs
15 | 
16 | 
17 | def getprefs(configfile, tkobj, PERSIST):
18 |     # To keep things simple for possible future preference additions/deletions:
19 |     # Try to stick to - TK Widget name = prefs dictionary key.
20 |     # EX: tkobj.outpath = prefs['outpath']
21 |     prefs = {}
22 | 
23 |     # Sane defaults
24 |     prefs['mobipath'] = unipath.getcwd()
25 |     prefs['outpath'] = unipath.getcwd()
26 |     prefs['apnxpath'] = unipath.getcwd()
27 |     prefs['splitvar'] = 0
28 |     prefs['rawvar'] = 0
29 |     prefs['dbgvar'] = 0
30 |     prefs['hdvar'] = 0
31 |     prefs['epubver'] = 0
32 |     tkobj.update_idletasks()
33 |     w = tkobj.winfo_screenwidth()
34 |     h = tkobj.winfo_screenheight()
35 |     rootsize = (605, 575)
36 |     x = w//2 - rootsize[0]//2
37 |     y = h//2 - rootsize[1]//2
38 |     prefs['windowgeometry'] = ('%dx%d+%d+%d' % (rootsize + (x, y)))
39 | 
40 |     if unipath.exists(configfile) and PERSIST:
41 |         try:
42 |             if PY3:
43 |                 with open(configfile, 'r', encoding='utf-8') as f:
44 |                     tmpprefs = json.load(f)
45 |             else:
46 |                 with codecs.open(configfile, 'r', encoding='utf-8') as f:
47 |                     tmpprefs = json.load(f)
48 |         except Exception:
49 |             return prefs
50 | 
51 |         if 'mobipath' in tmpprefs.keys():
52 |             prefs['mobipath'] = unicode_str(tmpprefs['mobipath'], 'utf-8')
53 |         if 'outpath' in tmpprefs.keys():
54 |             prefs['outpath'] = unicode_str(tmpprefs['outpath'], 'utf-8')
55 |         if 'apnxpath' in tmpprefs.keys():
56 |             prefs['apnxpath'] = unicode_str(tmpprefs['apnxpath'], 'utf-8')
57 |         if 'splitvar' in tmpprefs.keys():
58 |             prefs['splitvar'] = tmpprefs['splitvar']
59 |         if 'rawvar' in tmpprefs.keys():
60 |             prefs['rawvar'] = tmpprefs['rawvar']
61 |         if 'dbgvar' in tmpprefs.keys():
62 |             prefs['dbgvar'] = tmpprefs['dbgvar']
63 |         if 'hdvar' in tmpprefs.keys():
64 |             prefs['hdvar'] = tmpprefs['hdvar']
65 |         if 'epubver' in tmpprefs.keys():
66 |             prefs['epubver'] = tmpprefs['epubver']
67 |         if 'windowgeometry' in tmpprefs.keys():
68 |             prefs['windowgeometry'] = tmpprefs['windowgeometry']
69 | 
70 |     return prefs
71 | 
72 | 
73 | def saveprefs(configfile, prefs, tkobj):
74 |     # tkobj name = prefs dictionary key
75 | 
76 |     # mobipath
77 |     apath = pathof(tkobj.mobipath.get())
78 |     if apath is not None and unipath.isfile(apath):
79 |         prefs['mobipath'] = os.path.dirname(apath)
80 | 
81 |     # outpath
82 |     apath = pathof(tkobj.outpath.get())
83 |     if apath is not None and unipath.isdir(apath):
84 |         prefs['outpath'] = apath
85 | 
86 |     # apnxpath
87 |     apath = pathof(tkobj.apnxpath.get())
88 |     if apath is not None and unipath.isfile(apath):
89 |         prefs['apnxpath'] = os.path.dirname(apath)
90 | 
91 |     prefs['splitvar'] = tkobj.splitvar.get()
92 |     prefs['rawvar'] = tkobj.rawvar.get()
93 |     prefs['dbgvar'] = tkobj.dbgvar.get()
94 |     prefs['hdvar'] = tkobj.hdvar.get()
95 |     prefs['epubver'] = tkobj.epubver.current()
96 |     prefs['windowgeometry'] = tkobj.root.geometry()
97 |     try:
98 |         if PY3:
99 |             with open(configfile, 'w', encoding='utf-8') as f:
100 |                 json.dump(prefs, f, ensure_ascii=False, indent=4)
101 |         else:
102 |             with codecs.open(configfile, 'w', encoding='utf-8') as f:
103 |                 json.dump(prefs, f, ensure_ascii=False, indent=4)
104 |         return 1
105 |     except Exception:
106 |         pass
107 |     return 0
108 | 
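A worked example (not from the repo) of the centering arithmetic getprefs uses for the default 'windowgeometry' string; the 1920x1080 screen size is an assumed value.

rootsize = (605, 575)       # same default window size as above
w, h = 1920, 1080           # assumed screen dimensions
x = w//2 - rootsize[0]//2   # 960 - 302 = 658
y = h//2 - rootsize[1]//2   # 540 - 287 = 253
geometry = '%dx%d+%d+%d' % (rootsize + (x, y))
assert geometry == '605x575+658+253'   # Tk format: WIDTHxHEIGHT+XOFFSET+YOFFSET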
--------------------------------------------------------------------------------
/libgui/scrolltextwidget.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from __future__ import unicode_literals, division, absolute_import, print_function
5 | from lib.compatibility_utils import PY2
6 | 
7 | if PY2:
8 |     import Tkinter as tkinter
9 |     import Tkconstants as tkinter_constants
10 | else:
11 |     import tkinter
12 |     import tkinter.constants as tkinter_constants
13 | 
14 | # basic scrolled text widget
15 | class ScrolledText(tkinter.Text):
16 | 
17 |     def __init__(self, master=None, **kw):
18 |         self.frame = tkinter.Frame(master)
19 |         self.vbar = tkinter.Scrollbar(self.frame)
20 |         self.vbar.pack(side=tkinter_constants.RIGHT, fill=tkinter_constants.Y)
21 |         kw.update({'yscrollcommand': self.vbar.set})
22 |         tkinter.Text.__init__(self, self.frame, **kw)
23 |         self.pack(side=tkinter_constants.LEFT, fill=tkinter_constants.BOTH, expand=True)
24 |         self.vbar['command'] = self.yview
25 |         # Copy geometry methods of self.frame without overriding Text
26 |         # methods -- hack!
27 |         text_meths = vars(tkinter.Text).keys()
28 |         methods = list(vars(tkinter.Pack).keys()) + list(vars(tkinter.Grid).keys()) + list(vars(tkinter.Place).keys())
29 |         methods = set(methods).difference(text_meths)
30 |         for m in methods:
31 |             if m[0] != '_' and m != 'config' and m != 'configure':
32 |                 setattr(self, m, getattr(self.frame, m))
33 | 
34 |     def __str__(self):
35 |         return str(self.frame)
36 | 
--------------------------------------------------------------------------------
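A brief usage sketch (not a file in the repo), run under Python 3 for simplicity: because the loop above re-points the frame's geometry methods onto the Text widget, callers pack or grid the ScrolledText directly, and the frame holding the scrollbar is what actually gets managed.

import tkinter
from libgui.scrolltextwidget import ScrolledText

root = tkinter.Tk()
log = ScrolledText(root, wrap='word', height=10)
log.pack(fill='both', expand=True)   # actually self.frame.pack, copied in __init__
log.insert('end', 'Log output appears here...\n')
root.mainloop()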