├── .gitignore
├── CHANGES
├── LICENSE
├── PKG-INFO
├── README
├── epubmaker
    ├── CommonOptions.py
    ├── EpubMaker.py
    ├── HTMLChunker.py
    ├── ParserFactory.py
    ├── Spider.py
    ├── Unitame.py
    ├── UnitameData.py
    ├── Version.py
    ├── WriterFactory.py
    ├── __init__.py
    ├── lib
    │   ├── DublinCore.py
    │   ├── GutenbergGlobals.py
    │   ├── Logger.py
    │   ├── MediaTypes.py
    │   └── __init__.py
    ├── mydocutils
    │   ├── __init__.py
    │   ├── gutenberg
    │   │   ├── __init__.py
    │   │   ├── parsers
    │   │   │   ├── __init__.py
    │   │   │   ├── pg-footer.rst
    │   │   │   └── pg-header.rst
    │   │   ├── transforms
    │   │   │   └── __init__.py
    │   │   └── writers
    │   │       ├── __init__.py
    │   │       └── nroff.py
    │   ├── nodes.py
    │   ├── parsers
    │   │   ├── __init__.py
    │   │   └── default_style.rst
    │   ├── transforms
    │   │   ├── __init__.py
    │   │   └── parts.py
    │   └── writers
    │       ├── __init__.py
    │       ├── epub2.py
    │       ├── nroff.py
    │       ├── rst2all.css
    │       ├── rst2epub.css
    │       ├── rst2html.css
    │       ├── xetex.py
    │       └── xhtml1.py
    ├── packagers
    │   ├── GzipPackager.py
    │   ├── HTMLPackager.py
    │   ├── PDFPackager.py
    │   ├── PushPackager.py
    │   ├── RSTPackager.py
    │   ├── TxtPackager.py
    │   └── __init__.py
    ├── parsers
    │   ├── AuxParser.py
    │   ├── CSSParser.py
    │   ├── GutenbergTextParser.py
    │   ├── HTMLParser.py
    │   ├── ImageParser.py
    │   ├── RSTParser.py
    │   ├── __init__.py
    │   └── broken.png
    └── writers
        ├── EpubWriter.py
        ├── HTMLWriter.py
        ├── KindleWriter.py
        ├── PDFWriter.py
        ├── PicsDirWriter.py
        ├── RSTWriter.py
        ├── TxtWriter.py
        ├── __init__.py
        └── cover.jpg
├── scripts
    ├── epubmaker
    └── rhyme_compiler
├── setup.cfg
├── setup.py
├── setup_inc.py
└── test
    └── test.py

/.gitignore:
--------------------------------------------------------------------------------
 1 | # ignore the local copy of any logs
 2 | logs/*
 3 | 
 4 | # python ignores
 5 | *.pyc
 6 | *.db
 7 | *.coverage
 8 | */.ipynb_checkpoints/*
 9 | *.ipynb_checkpoints/*
10 | 
11 | # Python packaging
12 | .eggs*
13 | .env
14 | .tox*
15 | build*
16 | dist*
17 | epubmaker.egg-info
18 | log/log.txt
19 | *.log
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
  1 | 0.3.26 October 8, 2018
  2 | 
  3 | Don't fail on audio links
  4 | Don't unescape external hrefs
  5 | 
  6 | 0.3.25 September 20, 2018
  7 | 
  8 | Don't fail on with bad src
  9 | Use a borg class to effect an options global instead of patching builtins
 10 | Don't disable translations
 11 | Add --local-only option so that using depth>1 can be used for multi-file books
 12 | Running the code from source didn't work with the out of date practice of not using package name for directory name.
 13 | fix bug for no stylesheet
 14 | utf-8 is the encoding, not unicode
 15 | update contact info
 16 | 
 17 | 0.3.21 February 24, 2017
 18 | 
 19 | Add parameter to add and set the cover image.
 20 | Switch setup to setuptools to better manage dependencies, because docutils 0.13 breaks epubmaker.
 21 | No longer strip hyperlinks to external resources.
 22 | 
 23 | 
 24 | 0.3.20
 25 | 
 26 | Do not make special kindlegen epub anymore. Requires kindlegen 2.7+.
 27 | Better coverpage handling.
 28 | Works with docutils 0.11+.
 29 | 
 30 | 0.3.19
 31 | 
 32 | 0.3.19b6
 33 | 
 34 | Floats now support 'here'.
 35 | 
 36 | 0.3.19b5
 37 | 
 38 | Fix typo in license text.
 39 | Fix "strip_links" debug message crash.
 40 | Extend styles directive.
 41 | - Add display option to hide the element.
 42 | - Allow for negative matches.
 43 | Don't use \marginpar for page numbers in TeX.
 44 | 
 45 | 0.3.19b4
 46 | 
 47 | Style directive extended.
 48 | Now preserves all trailing whitespace except U+0020.
 49 | Added "table de matières" to auto toc detection.
 50 | Convert U+2015 to single hyphen in plain text.
 51 | 
 52 | 0.3.19b3
 53 | 
 54 | Fix KeyError on hrules and vrules.
 55 | Fix unescaped characters in html meta attribute values.
 56 | Fix default block image alignment.
 57 | Fix use numeric entities in xhtml writer.
 58 | 
 59 | 0.3.19b2
 60 | 
 61 | Fixed text-indent in page nos (made pagenos disappear in line blocks).
 62 | Fixed whitespace collapsing in <pre> nodes.
 63 | Fixed: honors newlines in metadata fields.
 64 | Internal fix: correct format name is: "txt.utf-8".
 65 | Can use docinfo in addition to meta directive.
 66 | 
 67 | 0.3.19b1
 68 | 
 69 | New formats: html.noimages and pdf.noimages.
 70 | No-image builds use a placeholder 'broken' image instead of nothing.
 71 | Figure directives without a filename create a placeholder 'broken' image.
 72 | New option :selector: in lof and lot directives for filtering.
 73 | Turn off italics with class no-italics (and bold with no-bold).
 74 | nbsp now works in ascii txt, soft hyphens now removed from ascii txt.
 75 | Insert line numbers with [ln 42] and [ln!42].
 76 | Works with kindlegen 2.0.
 77 | 
 78 | 0.3.18
 79 | 
 80 | Allow unicode line separator U+2028 as line feed.
 81 | Fix XetexWriter bug with tables without explicit width.
 82 | Add language support in XetexWriter.
 83 | Works with docutils 0.8
 84 | Support docutils-0.8-style :class: language-.
 85 | 
 86 | 0.3.17
 87 | 
 88 | Fix line height of large text.
 89 | Fix images with spaces in src attribute.
 90 | 
 91 | 0.3.16
 92 | 
 93 | Add image_dir to Xetex writer.
 94 | Use quotation environment instead of quote.
 95 | Don't automatically insert \frontmatter.
 96 | Page nos. for kindlegen 1.2.
 97 | Call kindlegen.
 98 | Integrate changes into PG environment.
 99 | 
100 | 0.3.15
101 | 
102 | Reduce vertical margin of images to 1 in TXT.
103 | Fixed link targets in NROFF, PDF.
104 | Report error on xetex errors.
105 | Escape characters in PDF info.
106 | 
107 | 0.3.14
108 | 
109 | Fixed crash on HTML comments in Kindle writer.
110 | 
111 | 0.3.13
112 | 
113 | Start on Kindle writer.
114 | Fix spurious space in PDF literal blocks with classes.
115 | Fix `flat´ TOC.
116 | Thin spaces between quotes made optional.
117 | 
118 | 0.3.12
119 | 
120 | Add more front- and backmatter classes.
121 | Insert thin space between quotes.
122 | Generated List of Tables.
123 | Generated List of Figures.
124 | Emit warning instead of error on groff warnings.
125 | Fix crash when last cell in row spans rows.
126 | Add option vertical-aligns for tables.
127 | Default width of image calculated assuming 980px window.
128 | Fix docutils indentation bug in poetry.
129 | 
130 | 0.3.11
131 | 
132 | Add option widths to tables.
133 | Add option aligns to tables.
134 | Add class norules for tables.
135 | Generate typographically correct tables.
136 | Don't overwrite images if src dir == working dir.
137 | 
138 | 0.3.10
139 | 
140 | Bug fixes.
141 | 
142 | 0.3.9
143 | 
144 | A different fix for figure and image centering on ADE.
145 |   (Calculate explicit left margin).
146 | More work on PDF (Xetex) writer.
147 | Added directives for pagination control.
148 | 
149 | 0.3.8
150 | 
151 | Fix empty poetry lines on ADE.
152 | Fix figure and image centering on ADE.
153 | Fix thoughtbreak centering on ADE.
154 | For push, zip RST into subdir with images.
155 | Start implementing PDF (Xetex) writer.
156 | 
157 | 0.3.7
158 | 
159 | Integrate changes into PG environment.
160 | Fix more CR/LF issues on windows.
161 | Fix cover image format conversion.
162 | Zips a pushable file for the WWers.
163 | 
164 | 0.3.6
165 | 
166 | Code cleanup.
167 | Different CSS templates for RST -> HTML and RST -> EPUB.
168 | 
169 | 0.3.5
170 | 
171 | Zips files up for PG.
172 | 
173 | 0.3.4
174 | 
175 | Tell Tidy not to merge divs and spans.
176 | More fixes to plain text encoding.
177 | 
178 | 0.3.3
179 | 
180 | Implemented coverpages for Adobe ADE.
181 | CSS changes because Adobe ADE chokes on !important.
182 | RST dropcap directive: don't use image in EPUB.
183 | 
184 | 0.3.2
185 | 
186 | Packaging changes.
187 | 


--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: epubmaker
 3 | Version: 0.3.25
 4 | Summary: The Project Gutenberg tool to generate EPUBs and other ebook formats.
 5 | Home-page: https://github.com/gitenberg-dev/pg-epubmaker
 6 | Author: Marcello Perathoner
 7 | Author-email: webmaster@gutenberg.org
 8 | License: GPL v3
 9 | Description: =========
10 |         EpubMaker
11 |         =========
12 |         
13 |         EpubMaker is the tool used for format conversion at Project Gutenberg.
14 |         It builds EPUB2 and Kindle files from HTML.
15 |         It also builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
16 |         
17 |         
18 |         Prerequisites
19 |         =============
20 |         
21 |         * Python >= 2.6,
22 |         
23 |         * HTMLTidy, 
24 |         
25 |         * Kindlegen, 
26 |         
27 |         * TexLive, and
28 |         
29 |         * groff.
30 |         
31 | Keywords: ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion
32 | Platform: OS-independent
33 | Classifier: Topic :: Text Processing
34 | Classifier: License :: OSI Approved :: GNU General Public License (GPL)
35 | Classifier: Environment :: Console
36 | Classifier: Operating System :: OS Independent
37 | Classifier: Intended Audience :: Other Audience
38 | Classifier: Development Status :: 4 - Beta
39 | Requires: setuptools
40 | Requires: roman
41 | Requires: docutils (>= 0.8.1, < 0.13)
42 | Requires: lxml (>= 2.3)
43 | Requires: cssutils (>= 0.9.8a1)
44 | Requires: PIL (>= 1.1.7)
45 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | =========
 2 | EpubMaker
 3 | =========
 4 | 
 5 | EpubMaker is the tool used for format conversion at Project Gutenberg.
 6 | It builds EPUB2 and Kindle files from HTML.
 7 | It also builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
 8 | 
 9 | 
10 | Prerequisites
11 | =============
12 | 
13 | * Python >= 2.6,
14 | 
15 | * HTMLTidy, 
16 | 
17 | * Kindlegen, 
18 | 
19 | * TexLive, and
20 | 
21 | * groff.
22 | 


--------------------------------------------------------------------------------
/epubmaker/CommonOptions.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | CommonOptions.py
 7 | 
 8 | Copyright 2010 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Common options for programs.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import optparse
19 | import ConfigParser
20 | import os
21 | 
22 | class Struct (object):
23 |     pass
24 | 
25 | # options is a "Borg" set by optparse (note that it's not thread-safe)
26 | class Options:
27 |     __shared_state = {}
28 |     def __init__(self):
29 |         self.__dict__ = self.__shared_state
30 |         
31 |     def update(self, _dict):
32 |         self.__dict__.update(_dict)
33 | 
34 | options = Options()
35 | 
36 | 
37 | def add_common_options (op):
38 |     """ Add options common to all programs. """
39 |     
40 |     op.add_option (
41 |         "-c", "--config",
42 |         metavar  = "FILE",
43 |         dest     = "config_name", 
44 |         action   = "store",
45 |         default  = "config",
46 |         help     = "use config file (default: config)")
47 | 
48 |     op.add_option (
49 |         "-v", "--verbose",
50 |         dest     = "verbose", 
51 |         action   = "count",
52 |         help     = "be verbose (-v -v be more verbose)")
53 | 
54 |     op.add_option (
55 |         "--validate",
56 |         dest     = "validate", 
57 |         action   = "count",
58 |         help     = "validate epub through epubcheck")
59 | 
60 |     op.add_option (
61 |         "--section",
62 |         metavar  = "TAG.CLASS",
63 |         dest     = "section_tags", 
64 |         default  = [],
65 |         action   = "append",
66 |         help     = "split epub on TAG.CLASS")
67 | 
68 | 
69 | def get_parser (**kwargs):
70 |     op = optparse.OptionParser (**kwargs)
71 |     add_common_options (op)
72 |     return op
73 |     
74 | 
75 | def parse_args (op, params = {}, defaults = {}):
76 |     (parsed_options, args) = op.parse_args ()
77 |     options.update(vars(parsed_options))
78 |     
79 |     cp = ConfigParser.SafeConfigParser (params)
80 |     cp.read ( [options.config_name,
81 |                os.path.expanduser ('~/.epubmaker.conf'),
82 |                '/etc/epubmaker.conf' ] )
83 | 
84 |     options.config = Struct ()
85 | 
86 |     for name, value in defaults.iteritems ():
87 |         setattr (options.config, name.upper (), value)
88 |         
89 |     for section in cp.sections ():
90 |         for name, value in cp.items (section):
91 |             #if value == 'None':
92 |             #    value = None
93 |             # print section, name, value
94 |             setattr (options.config, name.upper (), value)
95 | 
96 |     return options, args
97 | 
98 | 
99 | 
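
A minimal sketch (not part of the source tree) of how the Borg-style Options class above behaves: every Options () instance shares one __dict__, so whatever parse_args () stores is visible to any module that later constructs its own Options (). The option values below are made up for illustration.

    from epubmaker.CommonOptions import Options

    a = Options ()
    a.update ({'verbose': 2})      # e.g. what parse_args () would store

    b = Options ()                 # a different instance ...
    assert b.verbose == 2          # ... sees the same shared state

    b.verbose = 0                  # attribute writes go to the shared dict too
    assert a.verbose == 0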


--------------------------------------------------------------------------------
/epubmaker/EpubMaker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | EpubMaker.py
  7 | 
  8 | Copyright 2009-2011 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Stand-alone application to build epub out of html or rst.
 13 | 
 14 | """
 15 | 
 16 | 
 17 | from __future__ import with_statement
 18 | 
 19 | import sys
 20 | import os.path
 21 | import re
 22 | import optparse
 23 | import hashlib
 24 | import mimetypes
 25 | 
 26 | from epubmaker.lib.GutenbergGlobals import Struct, DCIMT, SkipOutputFormat
 27 | import epubmaker.lib.GutenbergGlobals as gg
 28 | from epubmaker.lib.Logger import debug, exception
 29 | from epubmaker.lib import Logger, DublinCore
 30 | 
 31 | from epubmaker import ParserFactory
 32 | from epubmaker import WriterFactory
 33 | from epubmaker.packagers import PackagerFactory
 34 | from epubmaker import CommonOptions
 35 | 
 36 | from epubmaker.Version import VERSION
 37 | 
 38 | options = CommonOptions.Options()
 39 | 
 40 | def null_translation (s):
 41 |     """ Translate into same language. :-) """
 42 |     return s
 43 | 
 44 | TXT_FORMATS    = 'txt.utf-8 txt.iso-8859-1 txt.us-ascii'.split ()
 45 | HTML_FORMATS   = 'html.noimages html.images'.split ()
 46 | EPUB_FORMATS   = 'epub.noimages epub.images'.split ()
 47 | KINDLE_FORMATS = 'kindle.noimages kindle.images'.split ()
 48 | PDF_FORMATS    = 'pdf.noimages pdf.images'.split ()
 49 | RST_FORMATS    = 'rst.gen'.split ()
 50 | ALL_FORMATS    = HTML_FORMATS + EPUB_FORMATS + KINDLE_FORMATS + PDF_FORMATS + TXT_FORMATS + RST_FORMATS
 51 | 
 52 | DEPENDENCIES = (
 53 |     ('all',    ALL_FORMATS),
 54 |     ('html',   HTML_FORMATS), 
 55 |     ('epub',   EPUB_FORMATS),
 56 |     ('kindle', KINDLE_FORMATS), 
 57 |     ('pdf',    PDF_FORMATS),
 58 |     ('txt',    TXT_FORMATS), 
 59 |     ('rst',    RST_FORMATS), 
 60 |     )
 61 | 
 62 | FILENAMES = {
 63 |     'html.noimages':    '{id}-noimages-h.html',
 64 |     'html.images':      '{id}-h.html',
 65 | 
 66 |     'epub.noimages':    '{id}-epub.epub',
 67 |     'epub.images':      '{id}-images-epub.epub',
 68 | 
 69 |     'kindle.noimages':  '{id}-kindle.mobi',
 70 |     'kindle.images':    '{id}-images-kindle.mobi',
 71 | 
 72 |     'pdf.noimages':     '{id}-pdf.pdf',
 73 |     'pdf.images':       '{id}-images-pdf.pdf',
 74 | 
 75 |     'txt.utf-8':        '{id}-0.txt',
 76 |     'txt.iso-8859-1':   '{id}-8.txt',
 77 |     'txt.us-ascii':     '{id}.txt',
 78 | 
 79 |     'rst.gen':          '{id}-rst.rst',
 80 | 
 81 |     'picsdir.noimages': '{id}-noimages.picsdir',   # do we need this ?
 82 |     'picsdir.images':   '{id}-images.picsdir',     # do we need this ?
 83 |     }
 84 | 
 85 | def make_output_filename (dc, type_):
 86 |     if dc.project_gutenberg_id:
 87 |         # PG book: use PG naming convention
 88 |         return FILENAMES[type_].format (id = dc.project_gutenberg_id)
 89 |     else:
 90 |         # not a PG ebook
 91 |         return FILENAMES[type_].format (id = gg.string_to_filename (dc.title)[:65])
 92 | 
 93 | def main ():
 94 |     """ Main program. """
 95 | 
 96 |     op = optparse.OptionParser (usage = "usage: %prog [options] url", 
 97 |                                 version = "EpubMaker version %s" % VERSION)
 98 | 
 99 |     CommonOptions.add_common_options (op)
100 | 
101 |     op.add_option (
102 |         "--make",
103 |         dest    = "types",
104 |         choices = [x for x, y in DEPENDENCIES] + ALL_FORMATS,
105 |         default = [],
106 |         action  = 'append',
107 |         help    = ("output type [%s] (default: all)"
108 |                    % ' | '.join ([x for x, y in DEPENDENCIES] + ALL_FORMATS)))
109 | 
110 |     op.add_option (
111 |         "--max-depth",
112 |         metavar = "LEVELS",
113 |         dest    = "max_depth",
114 |         type    = "int",
115 |         default = 1,
116 |         help    = "how many levels to descend while recursively retrieving pages (0 == infinite)")
117 | 
118 |     op.add_option (
119 |         "--local-only",
120 |         dest    = "local_files_only",
121 |         action  = "store_true",
122 |         default = False,
123 |         help    = "restrict recursive search to local files")
124 | 
125 |     op.add_option (
126 |         "--include",
127 |         metavar = "GLOB",
128 |         dest    = "include_argument", 
129 |         default = [],
130 |         action  = "append",
131 |         help    = "include this url (use globs, repeat for more urls)")
132 | 
133 |     op.add_option (
134 |         "--exclude",
135 |         metavar = "GLOB",
136 |         dest    = "exclude", 
137 |         default = [],
138 |         action  = "append",
139 |         help    = "exclude this url (use globs, repeat for more urls)")
140 | 
141 |     op.add_option (
142 |         "--include-mediatype",
143 |         metavar = "GLOB/GLOB",
144 |         dest    = "include_mediatypes_argument", 
145 |         default = ['text/*', 'application/xhtml+xml'],
146 |         action  = "append",
147 |         help    = "include this mediatype (use globs, repeat for more mediatypes, eg. 'image/*')")
148 | 
149 |     op.add_option (
150 |         "--exclude-mediatype",
151 |         metavar = "GLOB/GLOB",
152 |         dest    = "exclude_mediatypes", 
153 |         default = [],
154 |         action  = "append",
155 |         help    = "exclude this mediatype (use globs, repeat for more mediatypes)")
156 | 
157 |     op.add_option (
158 |         "--rewrite",
159 |         metavar = "from>to",
160 |         dest    = "rewrite", 
161 |         default = [],
162 |         action  = "append",
163 |         help    = "rewrite url eg. 'http://www.example.org/>http://www.example.org/index.html'")
164 | 
165 |     op.add_option (
166 |         "--title",
167 |         dest    = "title", 
168 |         default = None,
169 |         help    = "ebook title (default: from meta)")
170 | 
171 |     op.add_option (
172 |         "--author",
173 |         dest    = "author", 
174 |         default = None,
175 |         help    = "author (default: from meta)")
176 | 
177 |     op.add_option (
178 |         "--ebook",
179 |         dest    = "ebook", 
180 |         type    = "int",
181 |         default = 0,
182 |         help    = "ebook no. (default: from meta)")
183 | 
184 |     op.add_option (
185 |         "--input-encoding",
186 |         dest    = "inputencoding", 
187 |         default = None,
188 |         help    = "input encoding (default: from meta)")
189 | 
190 |     op.add_option (
191 |         "--output-dir",
192 |         dest    = "outputdir", 
193 |         default = "./",
194 |         help    = "output directory (default: ./)")
195 | 
196 |     op.add_option (
197 |         "--output-file",
198 |         dest    = "outputfile", 
199 |         default = None,
200 |         help    = "output file (default: .epub)")
201 | 
202 |     op.add_option (
203 |         "--packager",
204 |         dest    = "packager",
205 |         choices = ['none', 'ww'],
206 |         default = "none",
207 |         help    = "packager type [none | ww] (default: none)")
208 | 
209 |     op.add_option (
210 |         "--mediatype-from-extension",
211 |         dest    = "mediatype_from_extension",
212 |         action  = "store_true",
213 |         default = False,
214 |         help    = "get mediatype from url extension instead of http response")
215 | 
216 |     op.add_option (
217 |         "--cover",
218 |         dest    = "coverpage_url",
219 |         default = None,
220 |         help    = "add the specified cover to the epub")
221 | 
222 |     options, args = CommonOptions.parse_args (op, {}, {
223 |         'proxies': None,
224 |         'bibrec': 'http://www.gutenberg.org/ebooks/',
225 |         'xelatex': 'xelatex',
226 |         'mobigen': 'kindlegen',
227 |         'groff': 'groff',
228 |         'rhyming_dict': None,
229 |         } )
230 | 
231 |     if not args:
232 |         op.error ("please specify which file to convert")
233 | 
234 |     Logger.set_log_level (options.verbose)        
235 | 
236 |     options.types = options.types or ['all']
237 |     for opt, formats in DEPENDENCIES:
238 |         if opt in options.types:
239 |             options.types.remove (opt)
240 |             options.types += formats
241 | 
242 |     if set (options.types).intersection (('html.images', 'pdf.images', 'rst.gen')):
243 |         options.types.insert (0, 'picsdir.images')
244 |     if set (options.types).intersection (('html.noimages', 'pdf.noimages')):
245 |         options.types.insert (0, 'picsdir.noimages')
246 |     if set (options.types).intersection (('kindle.images', )):
247 |         options.types.insert (0, 'epub.images')
248 |     if set (options.types).intersection (('kindle.noimages', )):
249 |         options.types.insert (0, 'epub.noimages')
250 |         
251 |         
252 |     debug ("Building types: %s" % ' '.join (options.types))
253 | 
254 |     ParserFactory.load_parsers ()
255 |     WriterFactory.load_writers ()
256 | 
257 |     packager_factory = None
258 |     if options.packager != 'none':
259 |         packager_factory = PackagerFactory (options.packager)
260 |         packager_factory.load ()
261 | 
262 |     for url in args:
263 | 
264 |         if options.include_argument:
265 |             options.include = options.include_argument[:]
266 |         else:
267 |             include_patt = os.path.dirname (url) + '/*'
268 |             options.include = [ include_patt ]
269 |             if include_patt.startswith ('/'):
270 |                 options.include.append('file://' + include_patt)
271 |             
272 |         # try to get metadata
273 | 
274 |         options.candidate = Struct ()
275 |         options.candidate.filename = url
276 |         options.candidate.mediatype = str (DCIMT (
277 |             mimetypes.types_map[os.path.splitext (url)[1]], options.inputencoding))
278 | 
279 |         options.include_mediatypes = options.include_mediatypes_argument[:]
280 |         options.want_images = False
281 |         #options.coverpage_url = None
282 | 
283 |         parser = ParserFactory.ParserFactory.create (options.candidate.filename, {})
284 | 
285 |         dc = None
286 | 
287 |         try:
288 |             dc = DublinCore.GutenbergDublinCore ()
289 | 
290 |             # try for rst header
291 |             dc.load_from_rstheader (parser.unicode_content ())
292 | 
293 |             if dc.project_gutenberg_id == 0:
294 |                 # try for Project Gutenberg header
295 |                 dc.load_from_parser (parser)
296 | 
297 |         except (ValueError, TypeError):
298 |             # use standard HTML header
299 |             dc = DublinCore.DublinCore ()
300 |             dc.load_from_parser (parser)
301 |             dc.source = url
302 | 
303 |         dc.source = url
304 | 
305 |         if options.title:
306 |             dc.title = options.title
307 |         if not dc.title:
308 |             dc.title = 'NA'
309 | 
310 |         if options.author:
311 |             dc.add_author (options.author, 'cre')
312 |         if not dc.authors:
313 |             dc.add_author ('NA', 'cre')
314 | 
315 |         if options.ebook:
316 |             dc.project_gutenberg_id = options.ebook
317 | 
318 |         if dc.project_gutenberg_id:
319 |             dc.opf_identifier = ('http://www.gutenberg.org/ebooks/%d' % dc.project_gutenberg_id)
320 |         else:
321 |             dc.opf_identifier = ('urn:mybooks:%s' %
322 |                                  hashlib.md5 (url.encode ('utf-8')).hexdigest ())
323 | 
324 |         if not dc.languages:
325 |             # we *need* a language to build a valid epub, so just make one up
326 |             dc.add_lang_id ('en')
327 | 
328 |         aux_file_list = []
329 |         
330 |         for type_ in options.types:
331 |             debug ('=== Building %s ===' % type_)
332 |             maintype, subtype = os.path.splitext (type_)
333 | 
334 |             try:
335 |                 writer = WriterFactory.create (maintype)
336 |                 writer.setup (options)
337 |                 options.type = type_
338 |                 options.maintype = maintype
339 |                 options.subtype = subtype
340 |                 options.want_images = False
341 | 
342 |                 options.include_mediatypes = options.include_mediatypes_argument[:]
343 |                 if subtype == '.images':
344 |                     options.include_mediatypes.append ('image/*')
345 |                     options.want_images = True
346 |                 else:
347 |                     # This is the mediatype of the 'broken' image.
348 |                     options.include_mediatypes.append ('image/png;type=resource')
349 | 
350 |                 writer.parse (options)
351 | 
352 |                 if maintype in ('html', ):
353 |                     # list of images for packager
354 |                     aux_file_list[:] = writer.get_aux_file_list ()
355 | 
356 |                 options.dc = dc
357 |                 options.outputfile = make_output_filename (dc, type_)
358 | 
359 |                 if maintype == 'kindle':
360 |                     options.epub_filename = make_output_filename (dc, 'epub' + subtype)
361 | 
362 |                 writer.build ()
363 | 
364 |                 if options.validate:
365 |                     writer.validate ()
366 | 
367 |                 if packager_factory:
368 |                     try:
369 |                         packager = packager_factory.create (type_)
370 |                         packager.setup (options)
371 |                         packager.package (aux_file_list)
372 |                     except KeyError:
373 |                         # no such packager
374 |                         pass
375 | 
376 |                 options.outputfile = None
377 | 
378 |             except SkipOutputFormat:
379 |                 continue
380 |             
381 |             except StandardError, what:
382 |                 exception ("%s" % what)
383 | 
384 |         if options.packager == 'ww':
385 |             try:
386 |                 packager = packager_factory.create ('push')
387 |                 options.outputfile = '%d-final.zip' % (dc.project_gutenberg_id)
388 |                 packager.setup (options)
389 |                 packager.package (aux_file_list)
390 |             except KeyError:
391 |                 # no such packager
392 |                 pass
393 | 
394 |     sys.exit (0)
395 | 
396 | if __name__ == "__main__":
397 |     main ()
398 | 
399 | 
400 | 
401 | 
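
A short sketch (not part of the source tree, and assuming the package and its dependencies are importable) of how the FILENAMES table above is applied by make_output_filename (): {id} is replaced by the Project Gutenberg ebook number, or by a filename-safe slice of the title for non-PG books. The ebook number 12345 is an arbitrary example.

    from epubmaker.EpubMaker import FILENAMES

    print (FILENAMES['epub.images'].format (id = 12345))    # 12345-images-epub.epub
    print (FILENAMES['kindle.images'].format (id = 12345))  # 12345-images-kindle.mobi
    print (FILENAMES['txt.utf-8'].format (id = 12345))      # 12345-0.txt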


--------------------------------------------------------------------------------
/epubmaker/HTMLChunker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLChunker.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Splits a HTML file into chunks.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import urlparse
 19 | import urllib
 20 | import os
 21 | import re
 22 | import copy
 23 | 
 24 | from lxml import etree
 25 | 
 26 | import epubmaker.lib.GutenbergGlobals as gg
 27 | from epubmaker.lib.GutenbergGlobals import NS
 28 | from epubmaker.lib.Logger import debug, error
 29 | from epubmaker.CommonOptions import Options
 30 | 
 31 | options = Options()
 32 | # MAX_CHUNK_SIZE  = 300 * 1024  # bytes
 33 | MAX_CHUNK_SIZE  = 100 * 1024  # bytes
 34 | 
 35 | SECTIONS = [
 36 |     ('div.section', 0.0), 
 37 |     ('div.chapter', 0.0), 
 38 |     ('h1',          0.5),
 39 |     ('div',         0.5),
 40 |     ('h2',          0.7),
 41 |     ('h3',          0.75),
 42 |     ('p',           0.8)
 43 |     ]
 44 | 
 45 | def xpath (node, path):
 46 |     """ xpath helper """
 47 |     return node.xpath (path, namespaces = gg.NSMAP)
 48 | 
 49 | def unicode_uri (uri):
 50 |     """ Normalize URI for idmap. """
 51 |     return urllib.unquote (uri).decode ('utf-8')
 52 | 
 53 | 
 54 | class HTMLChunker (object):
 55 |     """ Splits HTML tree into smaller chunks.
 56 | 
 57 |     Some epub viewers are limited in that they cannot display files
 58 |     larger than 300K.  If our HTML happens to be longer, we have to
 59 |     split it up.  Also smaller chunks do improve page flip times.
 60 | 
 61 | 
 62 |     """
 63 | 
 64 |     def __init__ (self):
 65 |         self.chunks = []
 66 |         self.idmap = {}
 67 |         self.chunk = None
 68 |         self.chunk_body = None
 69 |         self.chunk_size = 0
 70 |         self.next_id = 0
 71 | 
 72 |         self.tags = {}
 73 |         for tag, size in SECTIONS:
 74 |             self.tags[NS.xhtml[tag]] = int (size * MAX_CHUNK_SIZE)
 75 |         for tag in options.section_tags:
 76 |             self.tags[NS.xhtml[tag]] = 0
 77 |         
 78 | 
 79 |     def _make_name (self, url):
 80 |         """ Generate a name for the chunk. """
 81 |         u = list (urlparse.urlparse (url))
 82 |         root, ext = os.path.splitext (u[2])
 83 |         # FIXME: brain-dead kindlegen only finds links in files with
 84 |         # .html extension. so we just add .html to everything
 85 |         u[2] = "%s-%d%s.html" % (root, self.next_id, ext)
 86 |         self.next_id += 1
 87 |         return urlparse.urlunparse (u)
 88 |     
 89 |         
 90 |     @staticmethod
 91 |     def make_template (tree):
 92 |         """ Make a copy with an empty html:body.
 93 | 
 94 |         This makes a template into which we can paste our chunks.
 95 | 
 96 |         """
 97 |         
 98 |         template = copy.deepcopy (tree)
 99 | 
100 |         for c in xpath (template, '//xhtml:body'):
101 | 
102 |             # descend while elem has only one child
103 |             while len (c) == 1:
104 |                 c = c[0]
105 | 
106 |             # clear children but save attributes
107 |             attributes = c.attrib.items ()
108 |             c.clear ()
109 |         # was a tentative fix for the pathological one-element-html case
110 |             # for child in c:
111 |             #     c.remove (child)
112 |             for a in attributes:
113 |                 c.set (a[0], a[1])
114 | 
115 |         # debug (etree.tostring (template))
116 | 
117 |         return template
118 | 
119 | 
120 |     def reset_chunk (self, template):
121 |         """ start a new chunk """
122 | 
123 |         self.chunk = copy.deepcopy (template)
124 |         self.chunk_size = len (etree.tostring (self.chunk))
125 |         self.chunk_body = xpath (self.chunk, "//xhtml:body")[0]
126 |         while len (self.chunk_body) == 1:
127 |             self.chunk_body = self.chunk_body[0]
128 | 
129 | 
130 |     def shipout_chunk (self, url, chunk_id = None, comment = None):
131 |         """ ready chunk to be shipped """
132 | 
133 |         if (self.chunk_size > MAX_CHUNK_SIZE):
134 |             self.split (self.chunk, url)
135 |             return
136 | 
137 |         url = unicode_uri (url)
138 |         chunk_name = self._make_name (url)
139 | 
140 |         # the url of the whole page
141 |         if not url in self.idmap:
142 |             self.idmap[url] = chunk_name
143 | 
144 |         # fragments of the page
145 |         for e in xpath (self.chunk, '//xhtml:*[@id]'):
146 |             id_ = e.attrib['id']
147 |             old_id = "%s#%s" % (url, id_)
148 |             # key is unicode string,
149 |             # value is uri-escaped byte string
150 |             # if ids get cloned while chunking, map to the first one only
151 |             if old_id not in self.idmap:
152 |                 self.idmap[old_id] = "%s#%s" % (
153 |                     chunk_name,  urllib.quote (id_.encode ('utf-8')))
154 | 
155 |         self.chunks.append ( { 'name'     : chunk_name,
156 |                                'id'       : chunk_id,
157 |                                'comment'  : comment,
158 |                                'chunk'    : self.chunk,  } )
159 |             
160 |         debug ("Adding chunk %s (%d bytes) %s" % (chunk_name, self.chunk_size, chunk_id))
161 | 
162 | 
163 |     def split (self, tree, url):
164 |         """ Split whole html or split chunk.
165 | 
166 |         Find some arbitrary points to do it.
167 |     
168 |         """
169 | 
170 |         for body in xpath (tree, "//xhtml:body"):
171 |             # we can't split a node that has only one child
172 |             # descend while elem has only one child
173 |             while len (body) == 1:
174 |                 body = body[0]
175 | 
176 |             debug ("body tag is %s" % body.tag)
177 | 
178 |             template = self.make_template (tree)
179 |             self.reset_chunk (template)
180 | 
181 |             # FIXME: is this ok ???
182 |             # fixes the pathological one-element-body case
183 |             self.chunk_body.text = body.text
184 | 
185 |             for child in body:
186 |                 if not isinstance (child, etree.ElementBase):
187 |                     # comments, processing instructions etc. 
188 |                     continue
189 |                 child_size = len (etree.tostring (child))
190 | 
191 |                 try:
192 |                     tags = [child.tag + '.' + c for c in child.attrib['class'].split ()]
193 |                     tags.append (child.tag)
194 |                 except KeyError:
195 |                     tags = [child.tag]
196 | 
197 |                 for tag in tags:
198 |                     if ((self.chunk_size + child_size > MAX_CHUNK_SIZE) or
199 |                               (tag in self.tags and
200 |                                self.chunk_size > self.tags[tag])):
201 |                         
202 |                         comment = ("Chunk: size=%d Split on %s" 
203 |                                    % (self.chunk_size, re.sub ('^{.*}', '', tag)))
204 |                         debug (comment)
205 | 
206 |                         # find a suitable id
207 |                         chunk_id = None
208 |                         for c in self.chunk_body:
209 |                             if 'id' in c.attrib:
210 |                                 chunk_id = c.get ('id')
211 |                                 break
212 |                         debug ("chunk id is: %s" % (chunk_id or ''))
213 |                         
214 |                         self.shipout_chunk (url, chunk_id, comment)
215 |                         self.reset_chunk (template)
216 |                         break
217 | 
218 |                 self.chunk_body.append (child)
219 |                 self.chunk_size = self.chunk_size + child_size
220 | 
221 |             # fixes the pathological one-element-body case
222 |             self.chunk_body.tail = body.tail
223 |             
224 |             chunk_id = None
225 |             if len (self.chunk_body):
226 |                 chunk_id = self.chunk_body[0].get ('id')
227 |             comment = "Chunk: size=%d" % self.chunk_size
228 |             self.shipout_chunk (url, chunk_id, comment)
229 |             self.reset_chunk (template)
230 | 
231 | 
232 |     def rewrite_links (self, f):
233 |         """ Rewrite all href and src using f (). """
234 |         
235 |         for chunk in self.chunks:
236 |             # chunk['name'] = f (chunk['name'])
237 |             
238 |             for link in xpath (chunk['chunk'], '//xhtml:*[@href]'):
239 |                 url = link.get ('href')
240 |                 if not url.startswith('http://') and not url.startswith('https://'):
241 |                     link.set ('href', f (url))
242 | 
243 |             for image in xpath (chunk['chunk'], '//xhtml:*[@src]'):
244 |                 image.set ('src', f (image.get ('src')))
245 | 
246 |         for k, v in self.idmap.items ():
247 |             self.idmap[k] = f (v)
248 | 
249 | 
250 |     def rewrite_internal_links (self):
251 |         """ Rewrite links to point into right chunks.
252 | 
253 |         Because we split the HTML into chunks, all internal links need
254 |         to be rewritten to become links into the right chunk.
255 |         Rewrite all internal links in all chunks.
256 | 
257 |         """
258 |         for chunk in self.chunks:
259 |             for a in xpath (chunk['chunk'], "//xhtml:*[@href]"):
260 |                 try:
261 |                     uri = unicode_uri (a.get ('href'))
262 |                     a.set ('href', self.idmap[uri])
263 |                 except KeyError:
264 |                     ur, dummy_frag = urlparse.urldefrag (uri)
265 |                     if ur in self.idmap:
266 |                         error ("HTMLChunker: Cannot rewrite internal link '%s'" % uri)
267 |         
268 | 
269 |     def rewrite_internal_links_toc (self, toc):
270 |         """ Rewrite links to point into right chunks.
271 | 
272 |         Because we split the HTML into chunks, all internal links need
273 |         to be rewritten to become links into the right chunk.
274 |         Rewrite all links in the passed toc.
275 | 
276 |         """
277 | 
278 |         for entry in toc [:]:
279 |             try:
280 |                 entry[0] = self.idmap [unicode_uri (entry[0])]
281 |             except KeyError:
282 |                 error ("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
283 |                 toc.remove (entry)
284 | 
285 | 
286 | 
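
A hedged sketch (not part of the source tree) of how HTMLChunker is typically driven: give it a namespaced XHTML tree, let split () fill chunks and idmap, then rewrite the internal links. The shared Options borg must already provide section_tags (normally CommonOptions.parse_args () takes care of that); the document and URL below are made up.

    from lxml import etree
    from epubmaker.CommonOptions import Options
    from epubmaker.HTMLChunker import HTMLChunker

    Options ().update ({'section_tags': []})   # normally set by CommonOptions.parse_args ()

    XHTML = 'http://www.w3.org/1999/xhtml'
    tree = etree.fromstring (
        '<html xmlns="%s"><body>'
        '<div class="chapter" id="c1"><p>Hello.</p></div>'
        '<div class="chapter" id="c2"><p>World.</p></div>'
        '</body></html>' % XHTML)

    chunker = HTMLChunker ()
    chunker.split (tree, 'file:///tmp/book.html')   # fills chunker.chunks and chunker.idmap
    chunker.rewrite_internal_links ()               # make hrefs point into the right chunks

    for chunk in chunker.chunks:
        print ("%s: %d bytes" % (chunk['name'], len (etree.tostring (chunk['chunk']))))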


--------------------------------------------------------------------------------
/epubmaker/ParserFactory.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | ParserFactory.py
  7 | 
  8 | Copyright 2009-10 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | from __future__ import with_statement
 15 | 
 16 | import os.path
 17 | import urllib
 18 | 
 19 | from pkg_resources import resource_listdir # pylint: disable=E0611
 20 | 
 21 | from epubmaker.mydocutils import broken
 22 | from epubmaker.lib.Logger import debug, error
 23 | from epubmaker.lib.MediaTypes import mediatypes
 24 | from epubmaker.Version import VERSION
 25 | from epubmaker.CommonOptions import Options
 26 | 
 27 | options = Options()
 28 | 
 29 | class AppURLopener (urllib.FancyURLopener):
 30 |     version = "ebookmaker/%s" % VERSION
 31 | 
 32 | urllib._urlopener = AppURLopener ()
 33 | 
 34 | parser_modules = {}
 35 | 
 36 | def load_parsers ():
 37 |     """ See what types we can parse. """
 38 | 
 39 |     for fn in resource_listdir ('epubmaker.parsers', ''):
 40 |         modulename, ext = os.path.splitext (fn)
 41 |         if ext == '.py':
 42 |             if (modulename.endswith ('Parser')):
 43 |                 module = __import__ ('epubmaker.parsers.' + modulename, fromlist = [modulename])
 44 |                 debug ("Loading parser from module: %s for mediatypes: %s" % (
 45 |                     modulename, ', '.join (module.mediatypes)))
 46 |                 for mediatype in module.mediatypes:
 47 |                     parser_modules[mediatype] = module
 48 | 
 49 |     return parser_modules.keys ()
 50 | 
 51 | 
 52 | def unload_parsers ():
 53 |     """ Unload parser modules. """
 54 |     for k in parser_modules.keys ():
 55 |         del parser_modules[k]
 56 |     
 57 | 
 58 | class ParserFactory (object):
 59 |     """ A factory and a cache for parsers.
 60 | 
 61 |     So we don't reparse the same file twice.
 62 | 
 63 |     """
 64 | 
 65 |     parsers = {} # cache: parsers[url] = parser
 66 |     
 67 |     @staticmethod
 68 |     def get (mediatype):
 69 |         """ Get the right kind of parser. """
 70 |         try:
 71 |             return parser_modules[mediatype].Parser ()
 72 |         except KeyError:
 73 |             return parser_modules['*/*'].Parser ()
 74 |             
 75 | 
 76 |     @classmethod
 77 |     def create (cls, url, attribs):
 78 |         """ Create an appropriate parser. """
 79 | 
 80 |         # debug ("Need parser for %s" % url)
 81 | 
 82 |         if url in cls.parsers:
 83 |             # debug ("... reusing parser for %s" % url)
 84 |             # reuse same parser, maybe already filled with data
 85 |             return cls.parsers[url]
 86 | 
 87 |         orig_url = url
 88 |         mediatype = attribs.get ('mediatype')
 89 | 
 90 |         if url.endswith (broken):
 91 |             # hack! broken.png doesn't exist at the source location.
 92 |             # We take it from our resources and fake its provenience.
 93 |             parser = parser_modules['image/png'].Parser ()
 94 |             parser.orig_url = url
 95 |             parser.url = url
 96 |             parser.broken_image ()
 97 |         else:
 98 |             fp = urllib.urlopen (url, proxies = options.config.PROXIES)
 99 |             url = fp.geturl ()
100 | 
101 |             if url != orig_url:
102 |                 debug ("... %s redirected to %s" % (orig_url, url))
103 |                 if url in cls.parsers:
104 |                     # debug ("... reusing parser for %s" % url)
105 |                     # reuse same parser, maybe already filled with data
106 |                     return cls.parsers[url]
107 | 
108 |             # ok. so we have to create a new parser
109 |             debug ("... creating new parser for %s" % url)
110 | 
111 |             if mediatype is not None:
112 |                 debug ("... got mediatype %s from link attributes" % mediatype)
113 |             else:
114 |                 if options.mediatype_from_extension or not hasattr (fp, 'info'):
115 |                     name, ext = os.path.splitext (url)
116 |                     mediatype = mediatypes[ext[1:]]
117 |                 else:
118 |                     msg = fp.info ()
119 |                     mediatype = msg.get ('Content-Type')
120 |                     if mediatype:
121 |                         mediatype = mediatype.partition (';')[0]
122 |                         debug ("... got mediatype %s from server" % mediatype)
123 |                     else:
124 |                         mediatype = 'application/octet-stream'
125 |                         error ("... cannot determine mediatype for %s" % url)
126 | 
127 |             # get the right kind of parser
128 |             try:
129 |                 mt = mediatype.split (';')[0]
130 |                 parser = parser_modules[mt].Parser ()
131 |             except KeyError:
132 |                 parser = parser_modules['*/*'].Parser ()
133 | 
134 |             parser.setup (orig_url, mediatype, attribs, fp)
135 | 
136 |         cls.parsers[parser.url] = parser
137 |         cls.parsers[orig_url] = parser
138 | 
139 |         return parser
140 |     
141 | 
142 |     @classmethod
143 |     def clear (cls):
144 |         """ Clear parser cache to free memory. """
145 | 
146 |         # debug: kill refs
147 |         for dummy_url, parser in cls.parsers.items ():
148 |             del parser
149 |             
150 |         cls.parsers = {}
151 | 
152 | 
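
A hedged sketch (not part of the source tree) of the factory's intended call order: populate the shared Options borg first (create () reads options.config.PROXIES and options.mediatype_from_extension), register the parser modules, then ask for parsers by URL. The file URL is a made-up example and has to exist for urlopen () to succeed.

    from epubmaker import CommonOptions, ParserFactory

    op = CommonOptions.get_parser ()
    options, args = CommonOptions.parse_args (op, {}, {'proxies': None})
    options.update ({'mediatype_from_extension': False})

    ParserFactory.load_parsers ()      # scans epubmaker.parsers for *Parser modules

    parser = ParserFactory.ParserFactory.create (
        'file:///tmp/book.html', {'mediatype': 'text/html'})

    # asking again for the same URL returns the cached parser instance
    assert parser is ParserFactory.ParserFactory.create ('file:///tmp/book.html', {})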


--------------------------------------------------------------------------------
/epubmaker/Spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Spider.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Rudimentary Web Spider
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import urlparse
 19 | import fnmatch
 20 | 
 21 | from epubmaker.lib import MediaTypes
 22 | import epubmaker.lib.GutenbergGlobals as gg
 23 | from epubmaker.lib.GutenbergGlobals import NS
 24 | from epubmaker.lib.Logger import debug, error
 25 | 
 26 | from epubmaker import ParserFactory
 27 | 
 28 | COVERPAGE_MIN_AREA = 200 * 200
 29 | 
 30 | class Spider (object):
 31 |     """ A very rudimentary web spider. """
 32 | 
 33 |     def __init__ (self):
 34 |         self.options = None
 35 |         self.parsed_urls = set ()
 36 |         self.enqueued_urls = set ()
 37 |         self.included_mediatypes = set ()
 38 |         self.excluded_mediatypes = set ()
 39 |         self.queue = []
 40 |         self.parsers = []
 41 |         self.next = [] # for a topological sort
 42 |         self.redirection_map = {}
 43 | 
 44 | 
 45 |     def parse (self, url, mediatype_hint, options):
 46 |         """ Do a recursive parse starting from url.
 47 |         
 48 |         Do a breadth-first traversal. Assuming the first page contains
 49 |         a linked TOC, this will get us a more natural ordering of the
 50 |         pages than a depth-first traversal.
 51 | 
 52 |         """
 53 | 
 54 |         self.options = options
 55 | 
 56 |         for rewrite in self.options.rewrite:
 57 |             from_, to = rewrite.split ('>')
 58 |             self.redirection_map[from_] = to
 59 | 
 60 |         debug ("Start of retrieval")
 61 | 
 62 |         # enqueue root url
 63 |         
 64 |         attribs = { 'mediatype' : mediatype_hint, 'id': 'start' }
 65 |         self.enqueue (url, 0, attribs)
 66 | 
 67 |         while self.queue:
 68 |             (url, depth, attribs) = self.queue.pop (0)
 69 | 
 70 |             url = self.redirect (url)
 71 |             if url in self.parsed_urls:
 72 |                 continue
 73 |             
 74 |             parser = ParserFactory.ParserFactory.create (url, attribs)
 75 |             self.add_redirection (parser)
 76 |             
 77 |             # if the url was redirected to something we already have
 78 |             url = self.redirect (parser.url)
 79 |             if url in self.parsed_urls:
 80 |                 continue
 81 |             
 82 |             self.parsed_urls.add (url)
 83 |             parser.options = self.options
 84 |             parser.pre_parse ()
 85 |             self.parsers.append (parser)
 86 | 
 87 |             # check potential coverpage for sufficient size
 88 |             if options.coverpage_url is None:
 89 |                 if attribs.get ('rel', '') == 'coverpage':
 90 |                     if hasattr (parser, 'get_image_dimen'):
 91 |                         dimen = parser.get_image_dimen ()
 92 |                         if (dimen[0] * dimen[1]) > COVERPAGE_MIN_AREA:
 93 |                             options.coverpage_url = parser.url
 94 |                             debug ("Setting coverpage: %s ..." % parser.url)
 95 | 
 96 |             depth += 1
 97 | 
 98 |             # look for links in just parsed document
 99 |             debug ("Requesting iterlinks for: %s ..." % url)
100 | 
101 |             for (url, attr) in parser.iterlinks ():
102 |                 # debug ("*** link: %s ..." % url)
103 | 
104 |                 url = urlparse.urldefrag (url)[0]
105 |                 tag = attr.get ('tag', '')
106 | 
107 |                 if tag == NS.xhtml.link:
108 |                     if attr.get ('rel', '').lower () == 'next':
109 |                         self.next.append ((parser.url, url))
110 |                 
111 |                 url = self.redirect (url)
112 | 
113 |                 attribs = { 'mediatype' : attr.get ('type', None) }
114 | 
115 |                 for k in ('id', 'rel'):
116 |                     if k in attr:
117 |                         attribs[k] = attr[k]
118 |                 
119 |                 if tag == NS.xhtml.a:
120 |                     self.enqueue_doc (url, depth, attribs)
121 |                     continue
122 |                 if tag == NS.xhtml.img:
123 |                     self.enqueue_aux (url, depth, attribs)
124 |                     continue
125 |                 if tag == NS.xhtml.object:
126 |                     if ('type' in attr and
127 |                         not self.is_included_mediatype (attr['type'])):
128 |                         continue
129 |                     self.enqueue_aux (url, depth, attribs)
130 |                     continue
131 |                 if tag == NS.xhtml.link:
132 |                     rel = attribs.get ('rel', '').lower ()
133 |                     if 'stylesheet' in rel:
134 |                         self.enqueue_aux (url, depth, attribs)
135 |                     elif rel == 'coverpage':
136 |                         # We may also find the coverpage in <link rel='coverpage' href='url' />
137 |                         self.enqueue_aux (url, depth, attribs)
138 |                     else:
139 |                         self.enqueue_doc (url, depth, attribs)
140 |                     continue
141 |                     
142 |         debug ("End of retrieval")
143 |         
144 |         # rewrite redirected urls
145 |         if self.redirection_map:
146 |             for parser in self.parsers:
147 |                 parser.remap_links (self.redirection_map)
148 | 
149 |         # try a topological sort of documents using <link rel='next'>
150 |         if self.next:
151 |             self.next = map (lambda x: (self.redirect(x[0]), self.redirect(x[1])), self.next)
152 | 
153 |             try:
154 |                 d = {}
155 |                 for order, url in enumerate (gg.topological_sort (self.next)):
156 |                     d[url] = order
157 |                     debug ("%s order %d" % (url, order))
158 |                 for parser in self.parsers:
159 |                     parser.order = d.get (parser.url, 999999)
160 |                 self.parsers.sort (key = lambda p: p.order)
161 |                 
162 |             except StandardError:
163 |                 pass
164 | 
165 | 
166 |     def add_redirection (self, parser):
167 |         """ Remember this redirection. """
168 |         if parser.orig_url != parser.url:
169 |             self.redirection_map[parser.orig_url] = parser.url
170 |             debug ("Adding redirection from %s to %s" % (parser.orig_url, parser.url))
171 | 
172 |         
173 |     def redirect (self, url):
174 |         """ Redirect url if we know the target. """
175 |         return self.redirection_map.get (url, url)
176 | 
177 |         
178 |     def enqueue (self, url, depth, attribs):
179 |         """ Enqueue url for parsing. """
180 |         
181 |         url = self.redirect (url)
182 |         if url in self.enqueued_urls:
183 |             return
184 |         
185 |         debug ("Enqueuing %s ..." % url)
186 |         self.queue.append ((url, depth, attribs))
187 |         self.enqueued_urls.add (url)
188 |         
189 |             
190 |     def enqueue_aux (self, url, depth, attribs):
191 |         """ Enqueue an auxiliary file.
192 | 
193 |         We get auxiliary files even if they are too deep or not in
194 |         'included' directories.
195 | 
196 |         """
197 |         try:
198 |             parser = ParserFactory.ParserFactory.create (url, attribs)
199 |             self.add_redirection (parser)
200 |             if self.is_wanted_aux (parser):
201 |                 self.enqueue (parser.url, depth, attribs)
202 |         except IOError:
203 |             error ("bad aux url: %s" % url)
204 | 
205 |     def enqueue_doc (self, url, depth, attribs):
206 |         """ Enqueue a document file.
207 | 
208 |         We get document files only if they pass document-selection
209 |         rules.
210 | 
211 |         """
212 |         
213 |         if not self.options.max_depth or depth < self.options.max_depth:
214 |             if self.is_included (url):
215 |                 try:
216 |                     parser = ParserFactory.ParserFactory.create (url, attribs)
217 |                     self.add_redirection (parser)
218 |                     if self.is_wanted_doc (parser):
219 |                         self.enqueue (parser.url, depth, attribs)
220 |                 except IOError:
221 |                     error ("bad url: %s" % url)
222 | 
223 | 
224 |     def is_included (self, url):
225 |         """ Return True if this document is eligible. """
226 | 
227 |         included = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.include))
228 |         excluded = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.exclude))
229 | 
230 |         if included and not excluded:
231 |             if self.options.local_files_only:
232 |                 if url.startswith('http:') or url.startswith('https:'):
233 |                     return 0
234 |                 else:
235 |                     return 1 
236 |             return 1
237 | 
238 |         if excluded:
239 |             debug ("Dropping excluded %s" % url)
240 |         if not included:
241 |             debug ("Dropping not included %s" % url)
242 |         return 0
243 |             
244 | 
245 |     def is_included_mediatype (self, mediatype):
246 |         """ Return True if this document is eligible. """
247 | 
248 |         included = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
249 |                              self.options.include_mediatypes))
250 |         excluded = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
251 |                              self.options.exclude_mediatypes))
252 | 
253 |         if included and not excluded:
254 |             self.included_mediatypes.add (mediatype)
255 |             return 1
256 | 
257 |         if excluded:
258 |             debug ("Dropping excluded mediatype %s" % mediatype)
259 |         if not included:
260 |             debug ("Dropping not included mediatype %s" % mediatype)
261 |             
262 |         self.excluded_mediatypes.add (mediatype)
263 |         return 0
264 |             
265 | 
266 |     def has_seen_images (self):
267 |         """ Return True if the spider has encountered images. """
268 | 
269 |         return bool (MediaTypes.IMAGE_MEDIATYPES &
270 |                        (self.included_mediatypes | self.excluded_mediatypes))
271 | 
272 |         
273 |     def dict_urls_mediatypes (self):
274 |         """ Return a dict of all parsed urls and mediatypes. """
275 |         return dict (map (lambda p: (p.url, p.mediatype), self.parsers))
276 |     
277 | 
278 |     def is_wanted_doc (self, parser):
279 |         """ Return True if we ought to parse this content document.
280 | 
281 |         Override this in custom spiders.
282 | 
283 |         """
284 |         return self.is_included_mediatype (parser.mediatype)
285 | 
286 | 
287 |     def is_wanted_aux (self, parser):
288 |         """ Return True if we ought to parse this image or aux file.
289 | 
290 |         Override this in custom spiders.
291 | 
292 |         """
293 |         return self.is_included_mediatype (parser.mediatype)
294 | 
295 | 
296 | 
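
A hedged sketch (not part of the source tree) showing which option attributes Spider.parse () expects; EpubMaker.main () normally fills these in before the writers drive the spider. The file URLs are made-up examples.

    from epubmaker import CommonOptions, ParserFactory
    from epubmaker.Spider import Spider

    op = CommonOptions.get_parser ()
    options, args = CommonOptions.parse_args (op, {}, {'proxies': None})

    options.update ({                   # attributes read by Spider.parse () and its helpers
        'rewrite':              [],
        'include':              ['file:///tmp/book/*'],
        'exclude':              [],
        'include_mediatypes':   ['text/*', 'application/xhtml+xml'],
        'exclude_mediatypes':   [],
        'max_depth':            1,
        'local_files_only':     False,
        'coverpage_url':        None,
        'mediatype_from_extension': False,
        })

    ParserFactory.load_parsers ()

    spider = Spider ()
    spider.parse ('file:///tmp/book/index.html', 'text/html', options)

    for p in spider.parsers:
        print ("%s %s" % (p.mediatype, p.url))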


--------------------------------------------------------------------------------
/epubmaker/Unitame.py:
--------------------------------------------------------------------------------
  1 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Unitame.py
  5 | 
  6 | Copyright 2010 by Marcello Perathoner
  7 | 
  8 | Distributable under the GNU General Public License Version 3 or newer.
  9 | 
 10 | Module to implement the totally superfluous PG plain text conversion
 11 | into long extinct encodings.
 12 | 
 13 | We have to unitame-translate before feeding to nroff because nroff
 14 | does some irreversible (and wrong) translations of its own, like ä ->
 15 | a. Also, some unitame-translations change the number of characters,
 16 | thus throwing already-justified text off.
 17 | 
 18 | We cannot do the translations before feeding the source to docutils
 19 | because if we change the length of titles, we get the warning: Title
 20 | underline too short.
 21 | 
 22 | Translation does some dangerous things, like converting quotes to
 23 | apostrophes, which are command escapes in nroff. We have to escape
 24 | apostrophes in the source text but not the apostrophe commands inserted
 25 | by the converter.
 26 | 
 27 | We also have to translate some important non-ascii characters, like
 28 | nbsp and shy, into command sequences before they reach unitame because
 29 | unitame would convert them into the semantically different space and
 30 | hyphen.
 31 | 
 32 | All this makes translation inside the docutils converter the best
 33 | choice. Implemented as a docutils translator that visits all text
 34 | nodes.
 35 | 
 36 | Smart quote translation should also go into a docutils
 37 | translator. Likewise a translator for text-transform: upper.
 38 | 
 39 | """
 40 | 
 41 | import codecs
 42 | import unicodedata as ud
 43 | 
 44 | # UnitameData is generated from unitame.dat
 45 | from epubmaker.UnitameData import unicode_to_iso_8859_1, iso_8859_1_to_ascii
 46 | 
 47 | # tweak dicts for translate ()
 48 | u2i = dict ( [ (ord (o), s) for o, s in unicode_to_iso_8859_1.iteritems () ] )
 49 | i2a = dict ( [ (ord (o), s) for o, s in iso_8859_1_to_ascii.iteritems () ] )
 50 | 
 51 | u2i.update ( {
 52 |     0x2000:     u' ',    # en quad
 53 |     0x2001:     u'  ',   # em quad
 54 |     0x2002:     u' ',    # en space
 55 |     0x2003:     u'  ',   # em space
 56 |     0x2004:     u' ',    # 3/em space
 57 |     0x2005:     u'',     # 4/em
 58 |     0x2006:     u'',     # 6/em
 59 |     0x2007:     u' ',    # figure space
 60 |     0x2008:     u'',     # punctuation space
 61 |     0x2009:     u'',     # thin space
 62 |     0x200a:     u'',     # hair space
 63 |     0x200b:     u'',     # zero space
 64 |     0x200c:     u'',     # zwnj
 65 |     0x200d:     u'',     # zwj
 66 |     0x2010:     u'-',    # hyphen
 67 |     0x2011:     u'-',    # non-breaking hyphen
 68 |     0x2012:     u'-',    # figure-dash
 69 |     0x2013:     u'-',    # en dash
 70 |     0x2014:     u'--',   # em dash
 71 |     0x2015:     u'-',    # horizontal bar
 72 |     0x2026:     u'...',  # horizontal ellipsis
 73 |     ord (u'™'): u'(tm)',
 74 |     ord (u'‹'): u'<',
 75 |     ord (u'›'): u'>',
 76 |     ord (u'†'): u'+',
 77 |     ord (u'‡'): u'++',
 78 |     ord (u'⁑'): u'**',
 79 |     ord (u'⁂'): u'***',
 80 |     ord (u'•'): u'-',
 81 |     ord (u'′'): u'´',
 82 |     ord (u'″'): u'´´',
 83 |     ord (u'‴'): u'´´´',
 84 |     ord (u'⁗'): u'´´´´',
 85 |     ord (u'⁓'): u'~',
 86 |     ord (u'‰'): u'%o',
 87 |     ord (u'‱'): u'%oo',
 88 |     ord (u'⚹'): u'*',    # U+26b9 sextile
 89 |     ord (u'⁰'): u'^0',
 90 |     ord (u'⁴'): u'^4',
 91 |     ord (u'⁵'): u'^5',
 92 |     ord (u'⁶'): u'^6',
 93 |     ord (u'⁷'): u'^7',
 94 |     ord (u'⁸'): u'^8',
 95 |     ord (u'⁹'): u'^9',
 96 |     } )
 97 | 
 98 | # somehow cram these into ascii, so the ppers stop whining about not
 99 | # having nbsp in ascii, then fix it later by replacing them with nroff
100 | # commands.
101 | 
102 | i2a.update ( {
103 |     ord (u'¹'): u'^1',
104 |     ord (u'²'): u'^2',
105 |     ord (u'³'): u'^3',
106 |     0x00a0:     u'\u0011',       # nbsp => DC1
107 |     0x00ad:     u'\u0012',       # shy  => DC2
108 | } )
109 | 
110 | unhandled_chars = []
111 | 
112 | def strip_accents (text):
113 |     """ Strip accents from string. 
114 | 
115 |     If the accented character doesn't fit into the encoding, 
116 |     remove the accent and try again.
117 | 
118 |     """
119 |     return ud.normalize ('NFKC', 
120 |                          filter (lambda c: ud.category (c) != 'Mn', 
121 |                                  ud.normalize ('NFKD', text)))
122 | 
123 | 
124 | def unitame (exc):
125 |     """
126 |     Encoding error handler.
127 | 
128 |     The encoder handles all compatible characters itself.  It calls
129 |     this function whenever it encounters a character it cannot encode.
130 |     This function searches the unitame database for a replacement.
131 | 
132 | 
133 |     """
134 | 
135 |     l = []
136 |     for cc in exc.object[exc.start:exc.end]:
137 |         c = cc
138 |         if exc.encoding == 'latin-1': # python name for iso-8859-1
139 |             c = c.translate (u2i)
140 |             c = strip_accents (c)
141 |             if c and ord (max (c)) < 256:
142 |                 l.append (c)
143 |                 c = None
144 |         elif exc.encoding == 'ascii': # python name for us-ascii
145 |             # "1¼" -> "1 1/4"
146 |             if cc in u'¼½¾':
147 |                 if exc.start > 0 and exc.object[exc.start - 1] in u'0123456789':
148 |                     l.append (' ')
149 |             c = c.translate (u2i)
150 |             c = c.translate (i2a)
151 |             c = strip_accents (c)
152 |             if c and ord (max (c)) < 128:
153 |                 l.append (c)
154 |                 c = None
155 | 
156 |         if c:
157 |             l.append ('{~%s U+%04x~}' % (ud.name (cc), ord (cc)))
158 |             unhandled_chars.append (cc)  # record only the character we could not handle
159 | 
160 |     return (u"".join (l), exc.end)
161 | 
162 | 
163 | codecs.register_error ('unitame', unitame)
164 | 
165 | 
166 | 
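A minimal usage sketch for the 'unitame' error handler registered above (illustrative only; assumes Python 2, like the module itself, and uses a made-up sample string):

    # -*- coding: utf-8 -*-
    import epubmaker.Unitame       # importing the module registers the 'unitame' error handler

    text = u'Œuvres — 1½ «texte»'
    text.encode ('ascii', 'unitame')
    # -> 'OEuvres -- 1 1/2 "texte"'
    text.encode ('latin-1', 'unitame')
    # 'Œ' and the em dash are replaced; '½', '«' and '»' are valid latin-1 and pass through

Characters the handler cannot map are collected in Unitame.unhandled_chars, which the nroff writer reports after translating a document.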


--------------------------------------------------------------------------------
/epubmaker/UnitameData.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | unicode_to_iso_8859_1 = {
  5 |     u'Đ': u'D', # LATIN CAPITAL LETTER D WITH STROKE
  6 |     u'đ': u'd', # LATIN SMALL LETTER D WITH STROKE
  7 |     u'Ħ': u'H', # LATIN CAPITAL LETTER H WITH STROKE
  8 |     u'ħ': u'h', # LATIN SMALL LETTER H WITH STROKE
  9 |     u'Ŀ': u'L', # LATIN CAPITAL LETTER L WITH MIDDLE DOT
 10 |     u'ŀ': u'l', # LATIN SMALL LETTER L WITH MIDDLE DOT
 11 |     u'Ł': u'L', # LATIN CAPITAL LETTER L WITH STROKE
 12 |     u'ł': u'l', # LATIN SMALL LETTER L WITH STROKE
 13 |     u'ʼn': u'n', # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
 14 |     u'Œ': u'OE', # LATIN CAPITAL LIGATURE OE
 15 |     u'œ': u'oe', # LATIN SMALL LIGATURE OE
 16 |     u'Ŧ': u'T', # LATIN CAPITAL LETTER T WITH STROKE
 17 |     u'ŧ': u't', # LATIN SMALL LETTER T WITH STROKE
 18 |     u'ƀ': u'b', # LATIN SMALL LETTER B WITH STROKE
 19 |     u'Ɓ': u'B', # LATIN CAPITAL LETTER B WITH HOOK
 20 |     u'Ƃ': u'B', # LATIN CAPITAL LETTER B WITH TOPBAR
 21 |     u'ƃ': u'b', # LATIN SMALL LETTER B WITH TOPBAR
 22 |     u'Ɔ': u'O', # LATIN CAPITAL LETTER OPEN O
 23 |     u'Ƈ': u'C', # LATIN CAPITAL LETTER C WITH HOOK
 24 |     u'ƈ': u'c', # LATIN SMALL LETTER C WITH HOOK
 25 |     u'Ɗ': u'D', # LATIN CAPITAL LETTER D WITH HOOK
 26 |     u'Ƌ': u'D', # LATIN CAPITAL LETTER D WITH TOPBAR
 27 |     u'ƌ': u'd', # LATIN SMALL LETTER D WITH TOPBAR
 28 |     u'Ƒ': u'F', # LATIN CAPITAL LETTER F WITH HOOK
 29 |     u'ƒ': u'f', # LATIN SMALL LETTER F WITH HOOK
 30 |     u'Ɠ': u'G', # LATIN CAPITAL LETTER G WITH HOOK
 31 |     u'Ɨ': u'I', # LATIN CAPITAL LETTER I WITH STROKE
 32 |     u'Ƙ': u'K', # LATIN CAPITAL LETTER K WITH HOOK
 33 |     u'ƙ': u'k', # LATIN SMALL LETTER K WITH HOOK
 34 |     u'ƚ': u'l', # LATIN SMALL LETTER L WITH BAR
 35 |     u'Ɲ': u'N', # LATIN CAPITAL LETTER N WITH LEFT HOOK
 36 |     u'ƞ': u'n', # LATIN SMALL LETTER N WITH LONG RIGHT LEG
 37 |     u'Ɵ': u'O', # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
 38 |     u'Ƥ': u'P', # LATIN CAPITAL LETTER P WITH HOOK
 39 |     u'ƥ': u'p', # LATIN SMALL LETTER P WITH HOOK
 40 |     u'ƫ': u't', # LATIN SMALL LETTER T WITH PALATAL HOOK
 41 |     u'Ƭ': u'T', # LATIN CAPITAL LETTER T WITH HOOK
 42 |     u'ƭ': u't', # LATIN SMALL LETTER T WITH HOOK
 43 |     u'Ʈ': u'T', # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
 44 |     u'Ʋ': u'V', # LATIN CAPITAL LETTER V WITH HOOK
 45 |     u'Ƴ': u'Y', # LATIN CAPITAL LETTER Y WITH HOOK
 46 |     u'ƴ': u'y', # LATIN SMALL LETTER Y WITH HOOK
 47 |     u'Ƶ': u'Z', # LATIN CAPITAL LETTER Z WITH STROKE
 48 |     u'ƶ': u'z', # LATIN SMALL LETTER Z WITH STROKE
 49 |     u'Lj': u'L', # LATIN CAPITAL LETTER L WITH SMALL LETTER J
 50 |     u'Nj': u'N', # LATIN CAPITAL LETTER N WITH SMALL LETTER J
 51 |     u'Ǣ': u'AE', # LATIN CAPITAL LETTER AE WITH MACRON
 52 |     u'ǣ': u'ae', # LATIN SMALL LETTER AE WITH MACRON
 53 |     u'Ǥ': u'G', # LATIN CAPITAL LETTER G WITH STROKE
 54 |     u'ǥ': u'g', # LATIN SMALL LETTER G WITH STROKE
 55 |     u'Dz': u'D', # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
 56 |     u'Ǽ': u'AE', # LATIN CAPITAL LETTER AE WITH ACUTE
 57 |     u'ǽ': u'ae', # LATIN SMALL LETTER AE WITH ACUTE
 58 |     u'Ǿ': u'O', # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
 59 |     u'ǿ': u'o', # LATIN SMALL LETTER O WITH STROKE AND ACUTE
 60 |     u'Ƞ': u'N', # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
 61 |     u'ȡ': u'd', # LATIN SMALL LETTER D WITH CURL
 62 |     u'Ȥ': u'Z', # LATIN CAPITAL LETTER Z WITH HOOK
 63 |     u'ȥ': u'z', # LATIN SMALL LETTER Z WITH HOOK
 64 |     u'ȴ': u'l', # LATIN SMALL LETTER L WITH CURL
 65 |     u'ȵ': u'n', # LATIN SMALL LETTER N WITH CURL
 66 |     u'ȶ': u't', # LATIN SMALL LETTER T WITH CURL
 67 |     u'ɓ': u'b', # LATIN SMALL LETTER B WITH HOOK
 68 |     u'ɕ': u'c', # LATIN SMALL LETTER C WITH CURL
 69 |     u'ɖ': u'd', # LATIN SMALL LETTER D WITH TAIL
 70 |     u'ɗ': u'd', # LATIN SMALL LETTER D WITH HOOK
 71 |     u'ɠ': u'g', # LATIN SMALL LETTER G WITH HOOK
 72 |     u'ɦ': u'h', # LATIN SMALL LETTER H WITH HOOK
 73 |     u'ɨ': u'i', # LATIN SMALL LETTER I WITH STROKE
 74 |     u'ɫ': u'l', # LATIN SMALL LETTER L WITH MIDDLE TILDE
 75 |     u'ɬ': u'l', # LATIN SMALL LETTER L WITH BELT
 76 |     u'ɭ': u'l', # LATIN SMALL LETTER L WITH RETROFLEX HOOK
 77 |     u'ɱ': u'm', # LATIN SMALL LETTER M WITH HOOK
 78 |     u'ɲ': u'n', # LATIN SMALL LETTER N WITH LEFT HOOK
 79 |     u'ɳ': u'n', # LATIN SMALL LETTER N WITH RETROFLEX HOOK
 80 |     u'ɼ': u'r', # LATIN SMALL LETTER R WITH LONG LEG
 81 |     u'ɽ': u'r', # LATIN SMALL LETTER R WITH TAIL
 82 |     u'ɾ': u'r', # LATIN SMALL LETTER R WITH FISHHOOK
 83 |     u'ʂ': u's', # LATIN SMALL LETTER S WITH HOOK
 84 |     u'ʈ': u't', # LATIN SMALL LETTER T WITH RETROFLEX HOOK
 85 |     u'ʉ': u'u', # LATIN SMALL LETTER U BAR
 86 |     u'ʋ': u'v', # LATIN SMALL LETTER V WITH HOOK
 87 |     u'ʐ': u'z', # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
 88 |     u'ʑ': u'z', # LATIN SMALL LETTER Z WITH CURL
 89 |     u'ʜ': u'H', # LATIN LETTER SMALL CAPITAL H
 90 |     u'ʝ': u'j', # LATIN SMALL LETTER J WITH CROSSED-TAIL
 91 |     u'ʠ': u'q', # LATIN SMALL LETTER Q WITH HOOK
 92 |     u'ʮ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK
 93 |     u'ʯ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
 94 |     u'Ѝ': u'I', # CYRILLIC CAPITAL LETTER I WITH GRAVE
 95 |     u'ѝ': u'i', # CYRILLIC SMALL LETTER I WITH GRAVE
 96 |     u'Ӑ': u'A', # CYRILLIC CAPITAL LETTER A WITH BREVE
 97 |     u'ӑ': u'a', # CYRILLIC SMALL LETTER A WITH BREVE
 98 |     u'Ӓ': u'A', # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
 99 |     u'ӓ': u'a', # CYRILLIC SMALL LETTER A WITH DIAERESIS
100 |     u'Ӣ': u'I', # CYRILLIC CAPITAL LETTER I WITH MACRON
101 |     u'ӣ': u'i', # CYRILLIC SMALL LETTER I WITH MACRON
102 |     u'Ӥ': u'I', # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
103 |     u'ӥ': u'i', # CYRILLIC SMALL LETTER I WITH DIAERESIS
104 |     u'Ӧ': u'O', # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
105 |     u'ӧ': u'o', # CYRILLIC SMALL LETTER O WITH DIAERESIS
106 |     u'Ӭ': u'E', # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
107 |     u'ӭ': u'e', # CYRILLIC SMALL LETTER E WITH DIAERESIS
108 |     u'Ӯ': u'U', # CYRILLIC CAPITAL LETTER U WITH MACRON
109 |     u'ӯ': u'u', # CYRILLIC SMALL LETTER U WITH MACRON
110 |     u'Ӱ': u'U', # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
111 |     u'ӱ': u'u', # CYRILLIC SMALL LETTER U WITH DIAERESIS
112 |     u'Ӳ': u'U', # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
113 |     u'ӳ': u'u', # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
114 |     u'ẚ': u'a', # LATIN SMALL LETTER A WITH RIGHT HALF RING
115 |     u'‐': u'-', # HYPHEN
116 |     u'–': u'-', # EN DASH
117 |     u'—': u'--', # EM DASH
118 |     u'‖': u'||', # DOUBLE VERTICAL LINE
119 |     u'‗': u'_', # DOUBLE LOW LINE
120 |     u'‘': u'\'', # LEFT SINGLE QUOTATION MARK
121 |     u'’': u'\'', # RIGHT SINGLE QUOTATION MARK
122 |     u'‚': u'\'', # SINGLE LOW-9 QUOTATION MARK
123 |     u'‛': u'\'', # SINGLE HIGH-REVERSED-9 QUOTATION MARK
124 |     u'“': u'"', # LEFT DOUBLE QUOTATION MARK
125 |     u'”': u'"', # RIGHT DOUBLE QUOTATION MARK
126 |     u'„': u'"', # DOUBLE LOW-9 QUOTATION MARK
127 |     u'‟': u'"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
128 |     u'⁅': u'[', # LEFT SQUARE BRACKET WITH QUILL
129 |     u'⁆': u']', # RIGHT SQUARE BRACKET WITH QUILL
130 | }
131 | 
132 | 
133 | iso_8859_1_to_ascii = {
134 |     u'¡': u'i', # INVERTED EXCLAMATION MARK
135 |     u'¢': u'c', # CENT SIGN
136 |     u'£': u'L', # POUND SIGN
137 |     u'¥': u'Y', # YEN SIGN
138 |     u'¦': u'|', # BROKEN BAR
139 |     u'§': u'Sec.', # SECTION SIGN
140 |     u'¨': u'"', # DIAERESIS
141 |     u'©': u'(C)', # COPYRIGHT SIGN
142 |     u'«': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
143 |     u'­': u'-', # SOFT HYPHEN
144 |     u'®': u'(R)', # REGISTERED SIGN
145 |     u'¯': u'-', # MACRON
146 |     u'°': u' deg.', # DEGREE SIGN
147 |     u'±': u'+-', # PLUS-MINUS SIGN
148 |     u'²': u'^2', # SUPERSCRIPT TWO
149 |     u'³': u'^3', # SUPERSCRIPT THREE
150 |     u'´': u'\'', # ACUTE ACCENT
151 |     u'µ': u' mu', # MICRO SIGN
152 |     u'·': u'.', # MIDDLE DOT
153 |     u'»': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
154 |     u'¼': u'1/4', # VULGAR FRACTION ONE QUARTER
155 |     u'½': u'1/2', # VULGAR FRACTION ONE HALF
156 |     u'¾': u'3/4', # VULGAR FRACTION THREE QUARTERS
157 |     u'¿': u'?', # INVERTED QUESTION MARK
158 |     u'Ä': u'Ae', # LATIN CAPITAL LETTER A WITH DIAERESIS
159 |     u'Æ': u'AE', # LATIN CAPITAL LETTER AE
160 |     u'Ð': u'Eth', # LATIN CAPITAL LETTER ETH
161 |     u'Ö': u'Oe', # LATIN CAPITAL LETTER O WITH DIAERESIS
162 |     u'×': u'x', # MULTIPLICATION SIGN
163 |     u'Ø': u'O', # LATIN CAPITAL LETTER O WITH STROKE
164 |     u'Ü': u'Ue', # LATIN CAPITAL LETTER U WITH DIAERESIS
165 |     u'ß': u'ss', # LATIN SMALL LETTER SHARP S
166 |     u'ä': u'ae', # LATIN SMALL LETTER A WITH DIAERESIS
167 |     u'æ': u'ae', # LATIN SMALL LETTER AE
168 |     u'ð': u'eth', # LATIN SMALL LETTER ETH
169 |     # u'ñ': u'ny', # LATIN SMALL LETTER N WITH TILDE
170 |     u'ö': u'oe', # LATIN SMALL LETTER O WITH DIAERESIS
171 |     u'÷': u'/', # DIVISION SIGN
172 |     u'ø': u'o', # LATIN SMALL LETTER O WITH STROKE
173 |     u'ü': u'ue', # LATIN SMALL LETTER U WITH DIAERESIS
174 | }
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------
/epubmaker/Version.py:
--------------------------------------------------------------------------------
1 | VERSION = '0.3.26'
2 | GENERATOR = 'EpubMaker %s <https://github.com/gitenberg-dev/pg-epubmaker>'
3 | 


--------------------------------------------------------------------------------
/epubmaker/WriterFactory.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | WriterFactory.py
 7 | 
 8 | Copyright 2009-14 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Writer factory. Dynamically loads writers from directories.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import os.path
19 | 
20 | from pkg_resources import resource_isdir, resource_listdir # pylint: disable=E0611
21 | 
22 | from epubmaker.lib.Logger import debug
23 | 
24 | writers = {}
25 | 
26 | def __load_writers_from (package_name):
27 |     """ See what types we can write. """
28 | 
29 |     try:
30 |         for fn in resource_listdir (package_name, ''):
31 |             modulename, ext = os.path.splitext (fn)
32 |             if ext == '.py':
33 |                 if modulename.endswith ('Writer'):
34 |                     type_ = modulename.lower ().replace ('writer', '')
35 |                     debug ("Loading writer type %s from module %s" % (type_, modulename))
36 |                     module = __import__ (package_name + '.' + modulename, fromlist = [modulename])
37 |                     writers[type_] = module
38 | 
39 |     except ImportError:
40 |         pass
41 | 
42 | 
43 | def load_writers ():
44 |     """ See what types we can write. """
45 | 
46 |     __load_writers_from ('epubmaker.writers')
47 |     __load_writers_from ('epubmaker.writers.ibiblio')
48 | 
49 |     return writers.keys ()
50 | 
51 | 
52 | def unload_writers ():
53 |     """ Unload writer modules. """
54 |     for k in writers.keys ():
55 |         del writers[k]
56 | 
57 | 
58 | def create (type_):
59 |     """ Load writer module for type. """
60 | 
61 |     try:
62 |         return writers[type_].Writer ()
63 |     except KeyError:
64 |         raise KeyError ('No writer for type %s' % type_)
65 | 
66 | 
67 | 
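A minimal sketch of driving the factory above (illustrative; assumes the epubmaker.writers package contains at least one module whose name ends in 'Writer', e.g. one providing an 'epub' type):

    from epubmaker import WriterFactory

    types = WriterFactory.load_writers ()        # e.g. ['epub', 'html', 'txt', ...]
    if 'epub' in types:
        writer = WriterFactory.create ('epub')   # instantiates that module's Writer class

create () raises KeyError for a type that no loaded module provides.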


--------------------------------------------------------------------------------
/epubmaker/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/lib/GutenbergGlobals.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | GutenbergGlobals.py
  6 | 
  7 | Copyright 2009 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | This module has sadly decayed into a repository for all sorts of cruft.
 12 | 
 13 | FIXME: refactor this module
 14 | 
 15 | """
 16 | 
 17 | import os
 18 | import re
 19 | import datetime
 20 | 
 21 | class Struct (object):
 22 |     """ handy class to pin attributes on
 23 | 
 24 |     usage: c = Struct ()
 25 |            c.something = 1
 26 | 
 27 |     """
 28 |     pass
 29 | 
 30 | 
 31 | NSMAP = {
 32 |     'atom':       'http://www.w3.org/2005/Atom',
 33 |     'bio':        'http://purl.org/vocab/bio/0.1/',
 34 |     'cc':         'http://web.resource.org/cc/',
 35 |     'dc':         'http://purl.org/dc/elements/1.1/',
 36 |     'dcam':       'http://purl.org/dc/dcam/',
 37 |     'dcmitype':   'http://purl.org/dc/dcmitype/',
 38 |     'dcterms':    'http://purl.org/dc/terms/',
 39 |     'ebook':      'http://www.gutenberg.org/ebooks/',             # URL
 40 |     'foaf':       'http://xmlns.com/foaf/0.1/',
 41 |     'marcrel':    'http://id.loc.gov/vocabulary/relators',
 42 |     'mathml':     'http://www.w3.org/1998/Math/MathML',
 43 |     'mbp':        'http://mobipocket.com/mbp',
 44 |     'ncx':        'http://www.daisy.org/z3986/2005/ncx/',
 45 |     'opds':       'http://opds-spec.org/2010/Catalog',
 46 |     'opf':        'http://www.idpf.org/2007/opf',
 47 |     'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
 48 |     'pg':         'http://www.gutenberg.org/',                    # URL
 49 |     'pgagents':   'http://www.gutenberg.org/2009/agents/',
 50 |     'pgtei':      'http://www.gutenberg.org/tei/marcello/0.5/ns',
 51 |     'pgterms':    'http://www.gutenberg.org/2009/pgterms/',
 52 |     'py':         'http://genshi.edgewall.org/',
 53 |     'rdf':        'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 54 |     'rdfs':       'http://www.w3.org/2000/01/rdf-schema#',
 55 |     'svg':        'http://www.w3.org/2000/svg',
 56 |     'tei':        'http://www.tei-c.org/ns/1.0',
 57 |     'xhtml':      'http://www.w3.org/1999/xhtml',
 58 |     'xinclude':   'http://www.w3.org/2001/XInclude',
 59 |     'xml':        'http://www.w3.org/XML/1998/namespace',
 60 |     'xmlns':      'http://www.w3.org/2000/xmlns/',
 61 |     'xsd':        'http://www.w3.org/2001/XMLSchema#',
 62 |     'xsi':        'http://www.w3.org/2001/XMLSchema-instance',
 63 |     'xslfo':      'http://www.w3.org/1999/XSL/Format',
 64 | }
 65 | 
 66 | 
 67 | class NameSpaceClark (object):
 68 |     """ Build a tag name in Clark notation.
 69 | 
 70 |     >>> ns = NameSpaceClark ("http://example.com/")
 71 |     >>> ns.foo
 72 |     '{http://example.com/}foo'
 73 |     >>> ns['bar']
 74 |     '{http://example.com/}bar'
 75 | 
 76 |     """
 77 | 
 78 |     def __init__ (self, root):
 79 |         self.root = root
 80 | 
 81 |     def __getitem__ (self, local):
 82 |         return "{%s}%s" % (self.root, local)
 83 | 
 84 |     def __getattr__ (self, local):
 85 |         return "{%s}%s" % (self.root, local)
 86 | 
 87 |     def __str__ (self):
 88 |         return self.root
 89 | 
 90 | 
 91 | class NameSpaceURI (object):
 92 |     """ Build a URI.
 93 | 
 94 |     >>> ns = NameSpaceURI ("http://example.com/")
 95 |     >>> ns.foo
 96 |     'http://example.com/foo'
 97 |     >>> ns['bar']
 98 |     'http://example.com/bar'
 99 | 
100 |     """
101 | 
102 |     def __init__ (self, root):
103 |         self.root = root
104 | 
105 |     def __getitem__ (self, local):
106 |         return "%s%s" % (self.root, local)
107 | 
108 |     def __getattr__ (self, local):
109 |         return "%s%s" % (self.root, local)
110 | 
111 |     def __str__ (self):
112 |         return self.root
113 | 
114 | 
115 | def build_nsmap (prefixes = None):
116 |     """ build a nsmap containing all namespaces for prefixes """
117 | 
118 |     if prefixes is None:
119 |         prefixes = NSMAP.keys ()
120 |     if isinstance (prefixes, str):
121 |         prefixes = prefixes.split ()
122 | 
123 |     ns = {}
124 |     for prefix in prefixes:
125 |         ns[prefix] = NSMAP[prefix]
126 | 
127 |     return ns
128 | 
129 | 
130 | NS = Struct ()
131 | NSURI = Struct ()
132 | 
133 | for prefix, uri in NSMAP.items ():
134 |     setattr (NS, prefix, NameSpaceClark (uri))
135 |     setattr (NSURI, prefix, NameSpaceURI (uri))
136 | 
137 | XML_DECLARATION = """<?xml version='1.0' encoding='UTF-8'?>"""
138 | 
139 | XHTML_DOCTYPE   = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' " +  
140 |                    "'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>")
141 | 
142 | XHTML1_DOCTYPE   = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' " +  
143 |                    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>")
144 | 
145 | XHTML_RDFa_DOCTYPE = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML+RDFa 1.0//EN' " +
146 |                       "'http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd'>")
147 | 
148 | NCX_DOCTYPE = ("<!DOCTYPE ncx PUBLIC '-//NISO//DTD ncx 2005-1//EN' " +
149 |                "'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd'>")
150 | 
151 | GENERATOR = 'EpubMaker by Marcello Perathoner <https://github.com/gitenberg-dev/pg-epubmaker>'
152 | 
153 | 
154 | def xmlspecialchars (s):
155 |     return (s.replace (u'&',  u'&amp;')
156 |              .replace (u'<',  u'&lt;')
157 |              .replace (u'>',  u'&gt;'))
158 | 
159 | def insert_breaks (s):
160 |     return s.replace (u'\n',  u'<br />')
161 | 
162 | RE_NORMALIZE    = re.compile (r"\s+")
163 | 
164 | def normalize (s):
165 |     s = RE_NORMALIZE.sub (' ', s)
166 |     return s.strip ()
167 | 
168 | 
169 | def cut_at_newline (text):
170 |     """ Cut the text at the first newline. """
171 |     i = text.find ('\n')
172 |     if i > -1:
173 |         return text[:i]
174 |     return text
175 | 
176 | def archive_dir (ebook):
177 |     """ build 1/2/3/4/12345 for 12345 """
178 |     ebook = str (ebook)
179 |     a = []
180 |     for c in ebook:
181 |         a.append (c)
182 |     a[-1] = ebook
183 |     return "/".join (a)
184 | 
185 | def archive2files (ebook, path):
186 |     adir = archive_dir (ebook)
187 |     return path.replace ('dirs/' + adir, 'files/%d' % ebook)
188 | 
189 | 
190 | def xpath (node, path, **kwargs):
191 |     """ xpath helper """
192 |     return node.xpath (path, namespaces = NSMAP, **kwargs)
193 | 
194 | 
195 | def mkdir_for_filename (fn):
196 |     """ Make sure the directory for this file is present. """
197 | 
198 |     try:
199 |         os.makedirs (os.path.dirname (fn))
200 |     except os.error:
201 |         pass
202 | 
203 | 
204 | def make_url_relative (base_url, url):
205 |     """ Make absolute url relative to base_url if possible. """
206 | 
207 |     if (url.startswith (base_url)):
208 |         return url[len (base_url):]
209 | 
210 |     base_url = os.path.dirname (base_url) + '/'
211 | 
212 |     if (url.startswith (base_url)):
213 |         return url[len (base_url):]
214 | 
215 |     return url
216 | 
217 | 
218 | def normalize_path (path):
219 |     """ Normalize a file path. """
220 |     if path.startswith ('file://'):
221 |         path = path[7:]
222 |     return path
223 |         
224 | def is_same_path (path1, path2):
225 |     """ Does path1 point to the same file as path2? """
226 |     return os.path.realpath (normalize_path (path1)) == os.path.realpath (normalize_path (path2))
227 | 
228 | 
229 | def string_to_filename (fn):
230 |     """ Sanitize string so it can do as filename. """
231 | 
232 |     def escape (matchobj):
233 |         """ Escape a char. """
234 |         return '@%x' % ord (matchobj.group (0))
235 | 
236 |     fn = os.path.normpath (fn)
237 |     fn = normalize (fn)
238 |     fn = fn.replace (os.sep, '@')
239 |     if os.altsep:
240 |         fn = fn.replace (os.altsep, '@')
241 |     fn = re.sub (u'[\|/:?"*<>\u0000-\u001F]', escape, fn)
242 | 
243 |     return fn
244 |     
245 | 
246 | class DCIMT (object):
247 |     """ encapsulates one dcterms internet mimetype 
248 | 
249 |     """
250 | 
251 |     def __init__ (self, mime, enc = None):
252 |         if mime is None:
253 |             self.mimetype = 'application/octet-stream'
254 |         elif enc is not None and mime.startswith ('text/'):
255 |             self.mimetype = "%s; charset=%s" % (mime, enc)
256 |         else:
257 |             self.mimetype = mime
258 |     
259 |     def __str__ (self):
260 |         return self.mimetype
261 |     
262 | 
263 | class UTC (datetime.tzinfo):
264 |     """ UTC helper for datetime.datetime """
265 | 
266 |     def utcoffset (self, dummy_dt):
267 |         return datetime.timedelta (0)
268 | 
269 |     def tzname (self, dummy_dt):
270 |         return "UTC"
271 | 
272 |     def dst (self, dummy_dt):
273 |         return datetime.timedelta (0)
274 | 
275 | # exceptions
276 | 
277 | class SkipOutputFormat (Exception):
278 |     pass
279 | 
280 | # Spider.py tries a topological sort on link rel=next
281 | def topological_sort (pairlist):
282 |     """Topologically sort a list of (parent, child) pairs.
283 | 
284 |     Return a list of the elements in dependency order (parent to child order).
285 | 
286 |     >>> print topological_sort( [(1,2), (3,4), (5,6), (1,3), (1,5), (1,6), (2,5)] )
287 |     [1, 2, 3, 5, 4, 6]
288 | 
289 |     >>> print topological_sort( [(1,2), (1,3), (2,4), (3,4), (5,6), (4,5)] )
290 |     [1, 2, 3, 4, 5, 6]
291 | 
292 |     >>> print topological_sort( [(1,2), (2,3), (3,2)] )
293 |     Traceback (most recent call last):
294 |     Exception: ([1], {2: 1, 3: 1}, {2: [3], 3: [2]})
295 |  
296 |     """
297 |     num_parents = {}  # element -> # of predecessors 
298 |     children = {}  # element -> list of successors 
299 |     for parent, child in pairlist: 
300 |         # Make sure every element is a key in num_parents.
301 |         if not num_parents.has_key( parent ): 
302 |             num_parents[parent] = 0 
303 |         if not num_parents.has_key( child ): 
304 |             num_parents[child] = 0 
305 | 
306 |         # Since child has a parent, increment child's num_parents count.
307 |         num_parents[child] += 1
308 | 
309 |         # ... and parent gains a child.
310 |         children.setdefault(parent, []).append(child)
311 | 
312 |     # Suck up everything without a parent.
313 |     answer = [x for x in num_parents.keys() if num_parents[x] == 0]
314 | 
315 |     # For everything in answer, knock down the parent count on its children.
316 |     # Note that answer grows *in* the loop.
317 |     for parent in answer: 
318 |         del num_parents[parent]
319 |         if children.has_key( parent ): 
320 |             for child in children[parent]: 
321 |                 num_parents[child] -= 1
322 |                 if num_parents[child] == 0: 
323 |                     answer.append( child ) 
324 |             # Following "del" isn't needed; just makes 
325 |             # CycleError details easier to grasp.
326 |             del children[parent]
327 | 
328 |     if num_parents: 
329 |         # Everything in num_parents has at least one child -> 
330 |         # there's a cycle.
331 |         raise Exception (answer, num_parents, children)
332 |     return answer 
333 | 
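A few usage sketches for the helpers above (illustrative; the URLs and ebook number are made up):

    from epubmaker.lib import GutenbergGlobals as gg

    gg.NS.xhtml.div          # -> '{http://www.w3.org/1999/xhtml}div'   (Clark notation)
    gg.NSURI.dc['title']     # -> 'http://purl.org/dc/elements/1.1/title'
    gg.archive_dir (12345)   # -> '1/2/3/4/12345'
    gg.make_url_relative ('http://example.com/dir/book.html',
                          'http://example.com/dir/images/p1.png')
                             # -> 'images/p1.png'
    gg.string_to_filename ('a/b:c')   # separators and unsafe chars become '@...' escapes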


--------------------------------------------------------------------------------
/epubmaker/lib/Logger.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | Logger.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Logging support.
12 | 
13 | 
14 | """
15 | 
16 | import logging
17 | from logging import debug, info, warn, error, critical, exception
18 | 
19 | LOGFORMAT = '%(asctime)s %(levelname)-8s  #%(ebook)-5d %(message)s'
20 | 
21 | ebook = 0 # global
22 | 
23 | class CustomFormatter (logging.Formatter):
24 |     """ A custom formatter that adds ebook no. """
25 |     
26 |     def format (self, record):
27 |         """ Add ebook no. to string format params. """
28 |         record.ebook = ebook
29 |         return logging.Formatter.format (self, record)
30 |         
31 |     
32 | def setup (logformat, logfile = None):
33 |     """ Setup logger. """
34 | 
35 |     # StreamHandler defaults to sys.stderr
36 |     handler = logging.FileHandler (logfile) if logfile else logging.StreamHandler ()
37 |     handler.setFormatter (CustomFormatter (logformat))
38 |     logging.getLogger ().addHandler (handler)
39 |     logging.getLogger ().setLevel (logging.INFO)
40 |     
41 | 
42 | def set_log_level (level):
43 |     """ Set log level. """
44 |     if level >= 1:
45 |         logging.getLogger ().setLevel (logging.INFO)
46 |     if level >= 2:
47 |         logging.getLogger ().setLevel (logging.DEBUG)
48 | 
49 | 
50 | __all__ = 'debug info warn error critical exception'.split ()
51 | 
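A minimal setup sketch for this logger (illustrative; the ebook number and message are made up):

    from epubmaker.lib import Logger
    from epubmaker.lib.Logger import info

    Logger.setup (Logger.LOGFORMAT)   # no logfile argument -> log to stderr
    Logger.set_log_level (1)          # 1 = INFO, 2 = DEBUG
    Logger.ebook = 12345              # ebook no. that CustomFormatter adds to every line
    info ("starting conversion of %s", "book.rst")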


--------------------------------------------------------------------------------
/epubmaker/lib/MediaTypes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | MediaTypes.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Media Types Lists
12 | 
13 | """
14 | 
15 | import mimetypes
16 | 
17 | mimetypes.init ()
18 | 
19 | # overrides
20 | 
21 | mimetypes.types_map['.htm']     = 'application/xhtml+xml'
22 | mimetypes.types_map['.html']    = 'application/xhtml+xml'
23 | mimetypes.types_map['.xhtml']   = 'application/xhtml+xml'
24 | mimetypes.types_map['.mobile']  = 'application/xhtml+xml'
25 | mimetypes.types_map['.ncx']     = 'application/x-dtbncx+xml'
26 | mimetypes.types_map['.pt']      = 'application/vnd.adobe-page-template+xml'
27 | mimetypes.types_map['.epub']    = 'application/epub+zip'
28 | mimetypes.types_map['.mobi']    = 'application/x-mobipocket-ebook'
29 | mimetypes.types_map['.pdf']     = 'application/pdf'
30 | mimetypes.types_map['.plucker'] = 'application/prs.plucker'
31 | mimetypes.types_map['.qioo']    = 'application/x-qioo-ebook'
32 | mimetypes.types_map['.jar']     = 'application/java-archive'
33 | mimetypes.types_map['.rss']     = 'application/rss+xml'
34 | mimetypes.types_map['.atom']    = 'application/atom+xml'
35 | mimetypes.types_map['.opds']    = 'application/atom+xml'
36 | mimetypes.types_map['.stanza']  = 'application/atom+xml'
37 | mimetypes.types_map['.wap']     = 'application/vnd.wap.xhtml+xml'
38 | mimetypes.types_map['.json']    = 'application/x-suggestions+json'
39 | mimetypes.types_map['.rst']     = 'text/x-rst'
40 | mimetypes.types_map['.png']     = 'image/png'  # Windows XP thinks this is image/x-png
41 | mimetypes.types_map['.jpg']     = 'image/jpeg' # Windows XP thinks this is image/pjpeg
42 | mimetypes.types_map['.jpeg']    = 'image/jpeg' # Windows XP thinks this is image/pjpeg
43 | mimetypes.types_map['.jfif']    = 'image/jpeg' 
44 | mimetypes.types_map['.mscz']    = 'application/x-musescore+xml'
45 | mimetypes.types_map['.mid']     = 'audio/midi'
46 | mimetypes.types_map['.midi']    = 'audio/midi'
47 | mimetypes.types_map['.mus']     = 'application/x-myriad-music'
48 | mimetypes.types_map['.sib']     = 'application/x-sibelius-score'
49 | mimetypes.types_map['.mxl']     = 'application/vnd.recordare.musicxml'
50 | mimetypes.types_map['.mp3']     = 'audio/mpeg'
51 | 
52 | 
53 | TEXT_MEDIATYPES = set ( (
54 |     'application/xhtml+xml',
55 |     'application/xml',
56 |     'text/html',
57 |     'text/plain',
58 | ) )
59 | 
60 | IMAGE_MEDIATYPES = set ( (
61 |     'image/gif',
62 |     'image/jpeg',
63 |     'image/png',
64 | ) )
65 | 
66 | AUX_MEDIATYPES = set ( (
67 |     'text/css',
68 | ) )
69 | 
70 | class MediatypesLookup (object):
71 |     """ Quick mediatype lookup
72 | 
73 |     >>> ns = MediatypesLookup ()
74 |     >>> ns.epub
75 |     'application/epub+zip'
76 |     >>> ns['mobi']
77 |     'application/x-mobipocket-ebook'
78 | 
79 |     """
80 | 
81 |     def __getitem__ (self, local):
82 |         return mimetypes.types_map['.' + local]
83 | 
84 |     def __getattr__ (self, local):
85 |         return mimetypes.types_map['.' + local]
86 | 
87 | mediatypes = MediatypesLookup ()
88 | 
89 | 
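A small sketch of the lookups this module provides (illustrative):

    import mimetypes
    from epubmaker.lib import MediaTypes

    MediaTypes.mediatypes['epub']   # -> 'application/epub+zip'
    MediaTypes.mediatypes.rst       # -> 'text/x-rst'

    mediatype = mimetypes.types_map['.jpg']    # 'image/jpeg', per the override above
    mediatype in MediaTypes.IMAGE_MEDIATYPES   # -> True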


--------------------------------------------------------------------------------
/epubmaker/lib/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 
3 | __all__ = ['DublinCore', 'DummyConnectionPool',
4 |            'GutenbergDatabaseDublinCore', 'GutenbergDatabase',
5 |            'GutenbergGlobals', 'Logger', 'MediaTypes']
6 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/__init__.py:
--------------------------------------------------------------------------------
1 | broken = 'images/broken.png'
2 | 
3 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | Module parsers
 7 | 
 8 | Copyright 2010-2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Customized Project Gutenberg directives for RST parser.
13 | 
14 | """
15 | 
16 | from docutils import statemachine
17 | from docutils.parsers.rst import Directive, directives
18 | 
19 | from epubmaker.mydocutils import parsers
20 | 
21 | from epubmaker.mydocutils.gutenberg import transforms as gutenberg_transforms
22 | 
23 | from epubmaker.lib.Logger import error, info, debug, warn
24 | 
25 | # pylint: disable=W0142, W0102
26 | 
27 | 
28 | class PGHeaderFooter (Directive):
29 |     """ Inserts PG header or footer. """
30 | 
31 |     required_arguments = 0
32 |     optional_arguments = 0
33 | 
34 |     def run (self):
35 |         settings = self.state.document.settings
36 |         include_lines = statemachine.string2lines (
37 |             settings.get_resource ('mydocutils.gutenberg.parsers', self.resource).decode ('utf-8'), 
38 |             settings.tab_width,
39 |             convert_whitespace = 1)
40 |         self.state_machine.insert_input (include_lines, '')
41 |         return []
42 | 
43 | 
44 | class PGHeader (PGHeaderFooter):
45 |     """ Inserts PG header. """
46 |     resource = 'pg-header.rst'
47 | 
48 | 
49 | class PGFooter (PGHeaderFooter):
50 |     """ Inserts PG footer. """
51 |     resource = 'pg-footer.rst'
52 | 
53 | 
54 | class Parser (parsers.Parser):
55 |     """ Parser with PG custom directives. """
56 | 
57 |     def __init__ (self):
58 |         parsers.Parser.__init__ (self)
59 | 
60 |         directives.register_directive ('pgheader',        PGHeader)
61 |         directives.register_directive ('pgfooter',        PGFooter)
62 | 
63 | 
64 |     def get_transforms (self):
65 |         return parsers.Parser.get_transforms (self) + [
66 |             gutenberg_transforms.VariablesTransform,
67 |             gutenberg_transforms.SubRefToVarTransform]
68 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/pg-header.rst:
--------------------------------------------------------------------------------
 1 | .. -*- encoding: utf-8 -*-
 2 | 
 3 | .. |pg.copyrighted-header| replace::
 4 | 
 5 |    This is a *copyrighted* Project Gutenberg eBook, details
 6 |    below. Please follow the copyright guidelines in this file.
 7 | 
 8 | .. _pg-header:
 9 | 
10 | .. container:: noindent pgheader language-en
11 | 
12 |    This eBook is for the use of anyone anywhere at no cost and with
13 |    almost no restrictions whatsoever. You may copy it, give it away or
14 |    re-use it under the terms of the `Project Gutenberg License`_
15 |    included with this eBook or online at
16 |    http://www.gutenberg.org/license.
17 | 
18 |    |pg.copyrighted-header|
19 | 
20 |    .. vspace:: 2
21 | 
22 |    .. _pg-machine-header:
23 | 
24 |    .. container:: noindent white-space-pre-line
25 | 
26 |       |pg.machine-header|
27 | 
28 |    .. vspace:: 2
29 | 
30 |    .. _pg-start-line:
31 | 
32 |    \*\*\* START OF THIS PROJECT GUTENBERG EBOOK |pg.upcase-title| \*\*\*
33 | 
34 |    .. vspace:: 4
35 | 
36 |    .. _pg-produced-by:
37 | 
38 |    |pg.produced-by|
39 | 
40 |    .. vspace:: 1
41 | 
42 |    |pg.credits|
43 | 
44 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/transforms/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | gutenberg.py
  7 | 
  8 | Copyright 2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Transforms for the Project Gutenberg flavor.
 13 | 
 14 | """
 15 | 
 16 | import datetime
 17 | import textwrap
 18 | 
 19 | from docutils import nodes
 20 | import docutils.transforms
 21 | import docutils.transforms.parts
 22 | 
 23 | from epubmaker.lib.Logger import error, info, debug, warn
 24 | from epubmaker.lib.DublinCore import DublinCore
 25 | from epubmaker.mydocutils import nodes as mynodes
 26 | 
 27 | # pylint: disable=W0142
 28 | 
 29 | class SubRefToVarTransform (docutils.transforms.Transform):
 30 |     """
 31 |     Transforms subref nodes in 'pg' namespace into var nodes.
 32 | 
 33 |     We need to save some subrefs for later processing. The standard
 34 |     subref processing happens too early (ie. before docinfo is
 35 |     collected). So we transform subrefs into variables, await docinfo
 36 |     to be processed, and then process the variables.
 37 | 
 38 |     """
 39 | 
 40 |     default_priority = 219
 41 |     """ Before substitition def variables """
 42 | 
 43 | 
 44 |     def apply (self):
 45 |         for ref in self.document.traverse (nodes.substitution_reference):
 46 |             refname = ref['refname']
 47 |             if refname.startswith ('pg.'):
 48 |                 var = mynodes.variable ()
 49 |                 var['name'] = refname
 50 |                 ref.replace_self (var)
 51 | 
 52 | 
 53 | class VariablesTransform (docutils.transforms.Transform):
 54 |     """ Replaces mynodes.var with parameters from metadata. """
 55 | 
 56 |     default_priority = 342
 57 |     """ After DocInfoCollector. """
 58 | 
 59 |     def apply(self):
 60 |         doc = self.document
 61 |         meta = doc.meta_block
 62 |         defs = doc.substitution_defs
 63 | 
 64 |         def getone (name, default = None):
 65 |             """ Get first value. """
 66 |             if name in meta:
 67 |                 return meta[name][0]
 68 |             return default
 69 | 
 70 |         def getmany (name, default = []):
 71 |             """ Get list of all values. """
 72 |             return meta.get (name, default)
 73 | 
 74 |         def sub (var, nodes):
 75 |             var.replace_self (nodes)
 76 | 
 77 |         title = getone ('DC.Title', 'No Title')
 78 |         short_title = getone ('PG.Title', title)
 79 |         short_title = short_title.split ('\n', 1)[0]
 80 | 
 81 |         language = getmany ('DC.Language', ['en'])
 82 |         language = map (lambda x: DublinCore.language_map.get (
 83 |             x, 'Unknown').title (), language)
 84 |         language = DublinCore.strunk (language)
 85 | 
 86 |         copyrighted = getone ('PG.Rights', '').lower () == 'copyrighted'
 87 | 
 88 |         for variable in doc.traverse (mynodes.variable):
 89 |             name = variable['name']
 90 | 
 91 |             if name == 'pg.upcase-title':
 92 |                 sub (variable, [ nodes.inline ('', short_title.upper ()) ])
 93 | 
 94 |             elif name == 'pg.produced-by':
 95 |                 producers = getmany ('PG.Producer')
 96 |                 if producers:
 97 |                      sub (variable, [ nodes.inline ('', u'Produced by %s.' %
 98 |                                                     DublinCore.strunk (producers)) ])
 99 |                 else:
100 |                     sub (variable, [])
101 | 
102 |             elif name == 'pg.credits':
103 |                 sub (variable, [ nodes.inline ('', getone ('PG.Credits', '')) ])
104 | 
105 |             elif name == 'pg.bibrec-url':
106 |                 url = 'http://www.gutenberg.org/ebooks/%s' % getone ('PG.Id', '999999')
107 |                 sub (variable, [ nodes.reference ('', '', nodes.inline ('', url), refuri = url) ])
108 | 
109 |             elif name in ('pg.copyrighted-header', 'pg.copyrighted-footer'):
110 |                 if copyrighted:
111 |                     subdef_copy = defs[name].deepcopy ()
112 |                     sub (variable, subdef_copy.children)
113 |                 else:
114 |                     sub (variable, [])
115 | 
116 |             elif name == 'pg.machine-header':
117 |                 tw = textwrap.TextWrapper (
118 |                     width = 72,
119 |                     initial_indent = u'Title: ',
120 |                     subsequent_indent = u' ' * 7)
121 | 
122 |                 if '\n' in title:
123 |                     maintitle, subtitle = title.split ('\n', 1)
124 |                     s = tw.fill (maintitle)
125 |                     s += '\n'
126 |                     tw.initial_indent = tw.subsequent_indent
127 |                     s += tw.fill (subtitle)
128 |                 else:
129 |                     s = tw.fill (title)
130 |                 s += '\n\n'
131 | 
132 |                 tw.initial_indent = u'Author: '
133 |                 tw.subsequent_indent = u' ' * 8
134 |                 s += tw.fill (DublinCore.strunk (getmany ('DC.Creator', ['Unknown'])))
135 |                 s += '\n\n'
136 | 
137 |                 date = getone ('PG.Released', '')
138 |                 try:
139 |                     date = datetime.datetime.strptime (date, '%Y-%m-%d')
140 |                     date = datetime.datetime.strftime (date, '%B %d, %Y')
141 |                 except ValueError:
142 |                     date = 'unknown date'
143 |                 s += u'Release Date: %s [EBook #%s]\n' % (date, getone ('PG.Id', '999999'))
144 | 
145 |                 for item in getmany ('PG.Reposted', []):
146 |                     try:
147 |                         date, comment = item.split (None, 1)
148 |                     except ValueError:
149 |                         date = item
150 |                         comment = None
151 |                     try:
152 |                         date = datetime.datetime.strptime (date, '%Y-%m-%d')
153 |                         date = datetime.datetime.strftime (date, '%B %d, %Y')
154 |                     except ValueError:
155 |                         date = 'unknown date'
156 | 
157 |                     s += u'Reposted: %s' % date
158 |                     if comment:
159 |                         s += u' [%s]' % comment
160 |                     s += '\n'
161 | 
162 |                 s += u'\nLanguage: %s\n\n' % language
163 |                 s += u'Character set encoding: %s' % doc.settings.encoding.upper ()
164 | 
165 |                 sub (variable, [ nodes.inline ('', nodes.Text (s)) ])
166 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/nroff.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # $Id: manpage.py 6270 2010-03-18 22:32:09Z milde $
  3 | # Author: Engelbert Gruber <grubert@users.sourceforge.net>
  4 | # Copyright: This module is put into the public domain.
  5 | # Rewritten almost completely
  6 | # by Marcello Perathoner <marcello@perathoner.de>
  7 | 
  8 | """
  9 | 
 10 | Nroff writer for reStructuredText. Tweaked for Project Gutenberg usage.
 11 | 
 12 | """
 13 | 
 14 | __docformat__ = 'reStructuredText'
 15 | 
 16 | from epubmaker.mydocutils.writers import nroff
 17 | from epubmaker import Unitame
 18 | 
 19 | from epubmaker.lib.Logger import info, debug, warn, error
 20 | 
 21 | GUTENBERG_NROFF_PREAMBLE = r""".\" -*- mode: nroff -*- coding: {encoding} -*-
 22 | .\" This file produces Project Gutenberg plain text. Usage:
 23 | .\"   $ groff -t -K {device} -T {device} this_file > output.txt
 24 | .
 25 | .pl 100000       \" very tall page: disable pagebreaks
 26 | .ll 72m
 27 | .po 0
 28 | .ad l           \" text-align: left
 29 | .nh             \" hyphenation: off
 30 | .cflags 0 .?!   \" single sentence space
 31 | .cflags 0 -\[hy]\[em]   \" don't break on -
 32 | .
 33 | .de nop
 34 | ..
 35 | .blm nop        \" do nothing on empty line
 36 | .
 37 | .nr [env_cnt] 0
 38 | .ev 0           \" start in a defined environment
 39 | .
 40 | .de push_env
 41 | .br
 42 | .nr last_env \\n[.ev]            \" save current environment name
 43 | .nr env_cnt +1   \" generate new environment name
 44 | .ev \\n[env_cnt]
 45 | .evc \\n[last_env]
 46 | ..
 47 | .de pop_env
 48 | .br
 49 | .ev
 50 | .nr env_cnt -1
 51 | ..
 52 | .
 53 | """
 54 | 
 55 | GUTENBERG_NROFF_POSTAMBLE = r""".
 56 | .pl 0    \" ends very long page here
 57 | .\" End of File
 58 | """
 59 | 
 60 | class Writer (nroff.Writer):
 61 |     """ A plaintext writer thru nroff. """
 62 | 
 63 |     supported = ('pg-nroff',)
 64 |     """Formats this writer supports."""
 65 | 
 66 |     def __init__ (self):
 67 |         nroff.Writer.__init__ (self)
 68 |         self.translator_class = Translator
 69 | 
 70 |     def translate (self):
 71 |         visitor = self.translator_class (self.document)
 72 |         del Unitame.unhandled_chars[:]
 73 |         self.document.walkabout (visitor)
 74 |         self.output = visitor.astext ()
 75 |         if Unitame.unhandled_chars:
 76 |             error ("unitame: unhandled chars: %s" % u", ".join (set (Unitame.unhandled_chars)))
 77 | 
 78 |     #def get_transforms (self):
 79 |     #    tfs = writers.Writer.get_transforms (self)
 80 |     #    return tfs + [parts.CharsetTransform]
 81 | 
 82 |         
 83 |         
 84 | class Translator (nroff.Translator):
 85 |     """ nroff translator """
 86 | 
 87 |     def preamble (self):
 88 |         """ Inserts nroff preamble. """
 89 |         return GUTENBERG_NROFF_PREAMBLE.format (
 90 |             encoding = self.encoding, device = self.device)
 91 | 
 92 | 
 93 |     def postamble (self):
 94 |         """ Inserts nroff postamble. """
 95 |         return GUTENBERG_NROFF_POSTAMBLE.format (
 96 |             encoding = self.encoding, device = self.device)
 97 | 
 98 | 
 99 |     def init_translate_maps (self):
100 |         nroff.Translator.init_translate_maps (self)
101 | 
102 |         update = {
103 |             0x0011: ur"\~",       # nbsp, see: Unitame.py
104 |             0x0012: ur"\%",       # shy,  see: Unitame.py
105 |             }
106 | 
107 |         self.translate_map.update (update)
108 |         self.translate_map_literal.update (update)
109 | 
110 | 
111 |     def register_classes (self):
112 |         """ Register classes.
113 |         
114 |         Use the idiosyncratic PG convention of marking up italics etc.
115 | 
116 |         """
117 | 
118 |         #
119 |         # This does not call the base class !!!
120 |         #
121 | 
122 |         self.register_class ('simple', 'left',         '.ad l', '')
123 |         self.register_class ('simple', 'right',        '.ad r', '')
124 |         self.register_class ('simple', 'center',       '.ad c', '')
125 |                                                     
126 |         self.register_class ('inline', 'italics',      '_',    '_')
127 |         self.register_class ('inline', 'bold',         '*',    '*')
128 | 
129 |         self.register_class ('inline', 'monospaced',   '',     '')
130 |         self.register_class ('inline', 'superscript',  '',     '')
131 |         self.register_class ('inline', 'subscript',    '',     '')
132 | 
133 |         self.register_class ('inline', 'small-caps',   '_',    '_')
134 |         self.register_class ('inline', 'gesperrt',     '_',    '_')
135 |         self.register_class ('inline', 'antiqua',      '_',    '_')
136 |         self.register_class ('inline', 'larger',       '',     '')
137 |         self.register_class ('inline', 'smaller',      '',     '')
138 | 
139 | 
140 |     def translate (self, text):
141 |         """ Reduce the charset while keeping text a unicode string. """
142 | 
143 |         # NOTE: there's an alternate approach in
144 |         # transforms.parts.CharsetTransform
145 | 
146 |         if self.encoding != 'utf-8':
147 |             text = text.encode (self.encoding, 'unitame')
148 |             text = text.decode (self.encoding)
149 | 
150 |         if self.in_literal:
151 |             text = text.translate (self.translate_map_literal)
152 |         else:
153 |             text = text.translate (self.translate_map)
154 | 
155 |         return text
156 | 
157 |         
158 |     def visit_inner (self, node):
159 |         """ Try to remove duplicated PG highlight markers. """
160 |         if node.type == 'inline':
161 |             prefixes = self.get_prefix (node.type, node['classes'])
162 |             for prefix in prefixes:
163 |                 if prefix == self.last_output_char:
164 |                     self.backspace ()
165 |                 else:
166 |                     self.text (prefix)
167 |         else:
168 |             nroff.Translator.visit_inner (self, node)
169 | 
170 | 
171 |     def visit_inline (self, node):
172 |         if 'toc-pageref' in node['classes']:
173 |             maxlen = 3 # sensible default
174 |             while node.parent:
175 |                 node = node.parent
176 |                 if 'pageno_maxlen' in node:
177 |                     maxlen = node['pageno_maxlen']
178 |                     break
179 |             self.cmd (('linetabs 1',
180 |                        r'ta (\n[.l]u - \n[.i]u - %dm) +%dmR' % (maxlen + 1, maxlen + 1),
181 |                        r'lc .'))
182 |             self.text (chr (1) + '\t')
183 |         nroff.Translator.visit_inline (self, node)
184 | 
185 |     def visit_section_title (self, node):
186 |         """ Implements PG-standard spacing before headers. """
187 |         self.sp (max (2, 5 - self.section_level))
188 | 
189 |     def visit_figure (self, node):
190 |         self.sp (1)
191 |         self.push ()
192 | 
193 |     def depart_figure (self, node):
194 |         self.pop ()
195 |         self.sp (1)
196 | 
197 |     def visit_image (self, node):
198 |         # ignore alt attribute except for dropcaps
199 |         if 'dropcap' in node['classes']:
200 |             self.text (node.attributes.get ('alt', ''))
201 | 
202 |     def visit_page (self, node):
203 |         if 'clearpage' in node['classes']:
204 |             self.sp (4)
205 |         elif 'cleardoublepage' in node['classes']:
206 |             self.sp (4)
207 |         else:
208 |             nroff.Translator.visit_page (self, node)
209 | 
210 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/nodes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | nodes.py
 7 | 
 8 | Copyright 2011 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Added nodes for PG.
13 | 
14 | """
15 | 
16 | from docutils import nodes
17 | 
18 | class page (nodes.Element, nodes.Special):
19 |     """ Hold pagination commands.
20 | 
21 |     Like clearpage, vspace etc.
22 |     
23 |     """
24 | 
25 | class newline (nodes.Element):
26 |     """ A line break.
27 | 
28 |     Outputs a hard line break if the node or one of its parents belong
29 |     to the class 'white-space-pre-line'.  Else a space.
30 | 
31 |     """
32 | 
33 | class footnote_group (nodes.container):
34 |     """ Hold a group of footnotes. """
35 | 
36 | 
37 | class variable (nodes.Inline, nodes.TextElement):
38 |     """ A placeholder that gets substituted with actual text before output. 
39 | 
40 |     We do not use substitution refs because they are resolved way too
41 |     early in the transformation stage to be of much use to us.
42 | 
43 |     """
44 | 
45 | 
46 | class node_selector (object):
47 |     """ Allows CSS-like selectors as condition function for nodes.traverse (). """
48 |     
49 |     def __init__ (self, selector):
50 | 
51 |         # allow selectors like [element][.class[.class[...]]][, selector[, selector]]
52 | 
53 |         self.matches = [] # list of 2-tuples
54 |         
55 |         for sel in selector.split (','):
56 |             sel = sel.strip ()
57 |             if '.' not in sel:
58 |                 sel += '.'
59 |             element, classes = sel.split ('.', 1)
60 |             classes = set (classes.split ('.')) if classes else set ()
61 |             self.matches.append ( (getattr (nodes, element, nodes.Element), classes) )
62 |         
63 | 
64 |     def __call__ (self, node):
65 |         """ returns True if the node matches the selector. """
66 |         
67 |         for match in self.matches:
68 |             if isinstance (node, match[0]) and match[1].issubset (node['classes']):
69 |                 return True
70 | 
71 |         return False
72 | 
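A usage sketch for node_selector (illustrative; 'document' is assumed to be an already parsed docutils document tree):

    from epubmaker.mydocutils.nodes import node_selector

    # every image node classed 'dropcap', plus every figure node
    select = node_selector ('image.dropcap, figure')

    for node in document.traverse (select):
        node['classes'].append ('visited')   # any per-node processing goes here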


--------------------------------------------------------------------------------
/epubmaker/mydocutils/parsers/default_style.rst:
--------------------------------------------------------------------------------
 1 | .. this is the default PG-RST stylesheet
 2 | 
 3 | .. style:: emphasis
 4 |    :class: italics
 5 | 
 6 | .. style:: strong
 7 |    :class: bold
 8 | 
 9 | .. style:: title_reference
10 |    :class: italics
11 | 
12 | .. style:: option_argument
13 |    :class: italics
14 | 
15 | .. style:: literal
16 |    :class: monospaced
17 | 
18 | .. style:: subscript
19 |    :class: subscript
20 | 
21 | .. style:: superscript
22 |    :class: superscript
23 | 
24 | .. style:: title.document-title
25 |    :class: x-large center
26 |    :titlehack:
27 | 
28 | .. style:: title.topic-title
29 |    :class: centerleft
30 | 
31 | .. style:: title.table-title
32 |    :class: centerleft larger
33 | 
34 | .. figure and image styles for non-image formats
35 | 
36 | .. style:: figure
37 |    :class: margin
38 | 
39 | .. style:: figure
40 |    :formats: txt.* *.noimages
41 |    :align: center
42 |    :width: 80%
43 | 
44 | .. style:: image
45 |    :formats: *.noimages
46 |    
47 |    .. container:: center image margin
48 |    
49 |       [image]
50 | 
51 | 
52 | .. style:: image
53 |    :formats: txt.*
54 |    :display: none   
55 | 
56 | .. style:: caption.figure-caption
57 |    :formats: -txt.*
58 |    :class: centerleft italics margin
59 | 
60 | .. style:: caption.figure-caption
61 |    :formats: txt.*
62 |    :class: margin
63 |    :before:  '[Illustration: '
64 |    :after:   ']'
65 | 
66 | .. style:: legend
67 |    :class: margin
68 | 
69 | 
70 | .. default transition
71 | 
72 | .. style:: transition
73 | 
74 |    .. container:: center transition margin
75 | 
76 |       ――――
77 | 
78 | .. default attribution
79 | 
80 | .. style:: attribution
81 |    :class: margin
82 |    :before: '―― '
83 | 
84 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | Mydocutils writer package.
  7 | 
  8 | Copyright 2010-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | 
 15 | __docformat__ = 'reStructuredText'
 16 | 
 17 | import collections
 18 | import operator
 19 | 
 20 | from docutils import nodes, writers
 21 | import roman
 22 | 
 23 | 
 24 | class Writer (writers.Writer):
 25 |     """ A base class for writers. """
 26 | 
 27 |     output = None
 28 |     """Final translated form of `document`."""
 29 | 
 30 |     config_section_dependencies = ('writers', )
 31 | 
 32 |     def translate (self):
 33 |         visitor = self.translator_class (self.document)
 34 |         self.document.walkabout (visitor)
 35 |         self.output = visitor.astext ()
 36 | 
 37 |         
 38 | class TablePass1 (nodes.SparseNodeVisitor):
 39 | 
 40 |     """
 41 |     Make a first pass over a table to get a reliable row and column
 42 |     count.  Insert placeholder cells for spanned cells.
 43 |     """
 44 |     
 45 |     def __init__ (self, document):
 46 |         nodes.SparseNodeVisitor.__init__ (self, document)
 47 |         
 48 |         self.row = -1     # 0-based
 49 |         self.column = 0   # 0-based
 50 |         self.cells = 0
 51 |         self.colspecs = None
 52 | 
 53 |     def visit_table (self, table):
 54 |         self.colspecs = table.traverse (nodes.colspec)
 55 |         width = sum (map (operator.itemgetter ('colwidth'), self.colspecs))
 56 |         for colspec in self.colspecs:
 57 |             colspec['relative_width'] = float (colspec['colwidth']) / width
 58 |             
 59 |     def depart_table (self, table):
 60 |         table['rows'] = self.rows ()
 61 |         table['columns'] = self.cols ()
 62 | 
 63 |     def visit_row (self, dummy_node):
 64 |         self.row += 1
 65 |         self.column = 0
 66 |         for colspec in self.colspecs:
 67 |             colspec['spanned'] = max (0, colspec.get ('spanned', 0) - 1)
 68 |         
 69 |     def visit_entry (self, node):
 70 |         """ Table cell. """
 71 | 
 72 |         morerows = node.get ('morerows', 0)
 73 |         morecols = node.get ('morecols', 0)
 74 | 
 75 |         self.cells += (morecols + 1) * (morerows + 1)
 76 | 
 77 |         # skip columns that are row-spanned by preceding entries
 78 |         while True:
 79 |             colspec = self.colspecs [self.column]
 80 |             if colspec.get ('spanned', 0) > 0:
 81 |                 placeholder = nodes.entry ()
 82 |                 placeholder.type = 'compound'
 83 |                 placeholder['column'] = self.column
 84 |                 placeholder.colspecs = self.colspecs[self.column:self.column + 1]
 85 |                 placeholder['vspan'] = True
 86 |                 node.replace_self ([placeholder, node])
 87 |                 self.column += 1
 88 |             else:
 89 |                 break
 90 | 
 91 |         # mark columns we row-span
 92 |         if morerows:
 93 |             for colspec in self.colspecs [self.column : self.column + 1 + morecols]:
 94 |                 colspec['spanned'] = morerows + 1
 95 | 
 96 |         node['row'] = self.row
 97 |         node['column'] = self.column
 98 |         
 99 |         node.colspecs = self.colspecs[self.column:self.column + morecols + 1]
100 | 
101 |         self.column += 1 + morecols
102 |         
103 |         raise nodes.SkipNode
104 | 
105 |     def rows (self):
106 |         """ Return the no. of rows. """
107 |         return self.row + 1
108 | 
109 |     def cols (self):
110 |         """ Return the no. of columns. """
111 |         return self.cells / self.rows ()
112 | 
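# Usage sketch (added; assumes `table` is a docutils table node and
# `document` the document it belongs to): run the pass with walkabout so
# that depart_table () gets called, then read back the computed counts.
#
#   pass1 = TablePass1 (document)
#   table.walkabout (pass1)
#   rows, cols = table['rows'], table['columns']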
113 | 
114 | class ListEnumerator:
115 |     """ Enumerate labels according to list type. """
116 | 
117 |     def __init__ (self, node, encoding):
118 |         self.type  = node.get ('enumtype') or node.get ('bullet') or '*'
119 |         self.start = node['start'] if 'start' in node else 1
120 |         self.prefix = node.get ('prefix', '')
121 |         self.suffix = node.get ('suffix', '')
122 |         self.encoding = encoding
123 | 
124 |         self.indent = len (self.prefix + self.suffix) + 1
125 |         if self.type == 'arabic':
126 |             # indentation depends on end value
127 |             self.indent += len (str (self.start + len (node.children)))
128 |         elif self.type.endswith ('alpha'):
129 |             self.indent += 1
130 |         elif self.type.endswith ('roman'):
131 |             self.indent += 5 # FIXME: calculate real length
132 |         else:
133 |             self.indent += 1 # none, bullets, etc.
134 | 
135 |     def get_next (self):
136 |         """ Get the next label. """
137 | 
138 |         if self.type == 'none':
139 |             res = ''
140 |         elif self.type == '*':
141 |             res = u'•' if self.encoding == 'utf-8' else '-'
142 |         elif self.type == '-':
143 |             res = u'-'
144 |         elif self.type == '+':
145 |             res = u'+'
146 |         elif self.type == 'arabic':
147 |             res = "%d" % self.start
148 |         elif self.type == 'loweralpha':
149 |             res = "%c" % (self.start + ord ('a') - 1)
150 |         elif self.type == 'upperalpha':
151 |             res = "%c" % (self.start + ord ('A') - 1)
152 |         elif self.type == 'upperroman':
153 |             res = roman.toRoman (self.start).upper ()
154 |         elif self.type == 'lowerroman':
155 |             res = roman.toRoman (self.start).lower ()
156 |         else:
157 |             res = "%d" % self.start
158 | 
159 |         self.start += 1
160 | 
161 |         return self.prefix + res + self.suffix
162 | 
163 |     def get_width (self):
164 |         """ Get indent width for this list. """
165 | 
166 |         return self.indent
167 | 
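# Usage sketch (added; `node` is assumed to be a docutils enumerated_list
# node with enumtype 'lowerroman' and no prefix/suffix):
#
#   enum = ListEnumerator (node, 'utf-8')
#   enum.get_next ()   # 'i'
#   enum.get_next ()   # 'ii'
#   enum.get_width ()  # indent width to use for the list items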
168 | 
169 | class Translator (nodes.NodeVisitor):
170 |     """ A base translator """
171 | 
172 |     admonitions = """
173 |     attention caution danger error hint important note tip warning
174 |     """.split ()
175 | 
176 |     docinfo_elements = """
177 |     address author contact copyright date organization revision status
178 |     version
179 |     """.split ()
180 | 
181 |     # see http://docutils.sourceforge.net/docs/ref/doctree.html#simple-body-elements
182 | 
183 |     # simple_structural_subelements = tuple ((getattr (nodes, n) for n in """
184 |     # title subtitle
185 |     # """.split ()))
186 | 
187 |     # simple_body_elements = tuple ((getattr (nodes, n) for n in """
188 |     # comment doctest_block image literal_block math_block paragraph 
189 |     # pending raw rubric substitution_definition target
190 |     # """.split ()))
191 | 
192 |     # simple_body_subelements = tuple ((getattr (nodes, n) for n in """
193 |     # attribution caption classifier colspec field_name 
194 |     # label line option_argument option_string term
195 |     # """.split ()))
196 | 
197 |     # simple_elements = (simple_structural_subelements + 
198 |     #                    simple_body_elements + simple_body_subelements)
199 | 
200 |     def __init__ (self, document):
201 |         nodes.NodeVisitor.__init__ (self, document)
202 |         self.settings = document.settings
203 |         
204 |         self.body = []
205 |         self.context = self.body # start with context == body
206 |         self.docinfo = collections.defaultdict (list)
207 |         self.list_enumerator_stack = []
208 |         self.section_level = 0
209 |         self.vspace = 0 # pending space (need this for collapsing)
210 |         self.src_vspace = 0 # pending space for source pretty printing
211 | 
212 |         self.field_name = None
213 |         self.compacting = 0 # > 0 if we are inside a compacting list
214 |         self.in_literal = 0 # > 0 if we are inside one or more literal blocks
215 |         
216 |         self.prefixes = collections.defaultdict (list) # dict of lists of prefixes, keyed by node type,
217 |                                                        # in the order in which to apply classes
218 |         self.suffixes = collections.defaultdict (list) # reverse order of above
219 |         
220 |         self.environments = [] # stack of \begin'ed environments
221 | 
222 |         self.register_classes ()
223 |         
224 |         for name in self.docinfo_elements:
225 |             setattr (self, 'visit_' + name,
226 |                      lambda node: self.visit_field_body (node, name))
227 |             setattr (self, 'depart_' + name, self.depart_field_body)
228 |             
229 |         for adm in self.admonitions:
230 |             setattr (self, 'visit_' + adm,
231 |                      lambda node: self.visit_admonition (node, adm))
232 |             setattr (self, 'depart_' + adm, self.depart_admonition)
233 |             
234 | 
235 |     def register_classes (self):
236 |         pass
237 | 
238 | 
239 |     def dispatch_visit (self, node):
240 |         """
241 |         Call self."``visit_`` + node class name" with `node` as
242 |         parameter.  If the ``visit_...`` method does not exist, call
243 |         self.unknown_visit.
244 | 
245 |         There are 3 hooks for every visit:
246 |         
247 |         visit_outer
248 |         visit_<classname>
249 |         visit_inner
250 | 
251 |         """
252 | 
253 |         self.visit_outer (node)
254 | 
255 |         node_name = node.__class__.__name__
256 |         method = getattr (self, 'visit_' + node_name, self.unknown_visit)
257 |         self.document.reporter.debug (
258 |             'docutils.nodes.NodeVisitor.dispatch_visit calling %s for %s'
259 |             % (method.__name__, node_name))
260 |         res = method (node)
261 | 
262 |         if node.type in ('compound', 'simple', 'inline'):
263 |             self.visit_inner (node)
264 | 
265 |         return res
266 | 
267 |     def dispatch_departure (self, node):
268 |         """
269 |         Call self."``depart_`` + node class name" with `node` as
270 |         parameter.  If the ``depart_...`` method does not exist, call
271 |         self.unknown_departure.
272 | 
273 |         There are 3 hooks for every departure:
274 |         
275 |         depart_inner
276 |         depart_<classname>
277 |         depart_outer
278 | 
279 |         """
280 | 
281 |         if node.type in ('compound', 'simple', 'inline'):
282 |             self.depart_inner (node)
283 | 
284 |         node_name = node.__class__.__name__
285 |         method = getattr (self, 'depart_' + node_name, self.unknown_departure)
286 |         self.document.reporter.debug (
287 |             'docutils.nodes.NodeVisitor.dispatch_departure calling %s for %s'
288 |             % (method.__name__, node_name))
289 |         res = method (node)
290 | 
291 |         self.depart_outer (node)
292 | 
293 |         return res
294 | 
295 | 
296 |     def unknown_visit (self, node):
297 |         """ Called if we have no handler for this element. """
298 |         pass
299 | 
300 |     def unknown_departure (self, node):
301 |         """ Called if we have no handler for this element. """
302 |         pass
303 | 
304 | 
305 |     def visit_outer (self, node):
306 |         """ The very first hook called on a node, before
307 |         ``visit_<classname>``. """
308 |         pass
309 | 
310 |     def visit_inner (self, node):
311 |         """ Called after ``visit_<classname>``. """
312 |         pass
313 | 
314 |     def depart_inner (self, node):
315 |         """ Called on a block before ``depart_<classname>``. """
316 |         pass
317 | 
318 |     def depart_outer (self, node):
319 |         """ The very last hook called on a node, after
320 |         ``depart_<classname>``."""
321 |         pass
322 | 
323 | 
324 |     def register_class (self, types, class_, prefix, suffix):
325 |         """ Register classes. 
326 | 
327 |         A mechanism to automatically output strings before and after
328 |         elements with specific classes.  For most use cases this is
329 |         easier than writing a handler for the element.
330 | 
331 |         types: types of node this class will apply to: 
332 |                tuple of one or more of (text, inline, simple, compound)
333 |         class_: class that triggers the strings
334 |         prefix: string output before element
335 |         suffix: string output after element
336 | 
337 |         """
338 | 
339 |         if isinstance (types, basestring):
340 |             types = types.split ()
341 | 
342 |         for t in types:
343 |             self.prefixes[t].append (   (class_, prefix))
344 |             self.suffixes[t].insert (0, (class_, suffix))
345 | 
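    # Usage sketch (added; the markup strings are illustrative, not taken
    # from this package): a subclass would typically call this from its
    # register_classes () override, e.g.
    #
    #   def register_classes (self):
    #       self.register_class ('inline', 'italics', '<i>', '</i>')
    #
    # get_prefix ('inline', node['classes']) then yields ['<i>'] for any
    # inline node that carries the 'italics' class.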
346 |     def get_prefix (self, type_, classes):
347 |         return self._get_prefix (type_, classes, self.prefixes)
348 | 
349 |     def get_suffix (self, type_, classes):
350 |         return self._get_prefix (type_, classes, self.suffixes)
351 | 
352 |     def _get_prefix (self, type_, classes, array):
353 |         """ Helper for inline handlers. """
354 |         if isinstance (classes, basestring):
355 |             classes = classes.split ()
356 | 
357 |         res = []
358 |         for s in array[type_]:
359 |             if s[0] in classes:
360 |                 res.append (s[1])
361 |         return res
362 |     
363 | 
364 |     def set_class_on_child (self, node, class_, index = 0):
365 |         """
366 |         Set class `class_` on the visible child no. index of `node`.
367 |         Do nothing if `node` has no visible child at that index.
368 |         """
369 |         children = [n for n in node if not isinstance (n, nodes.Invisible)]
370 |         try:
371 |             child = children[index]
372 |         except IndexError:
373 |             return
374 |         child['classes'].append (class_)
375 | 
376 |     def set_first_last (self, node):
377 |         """ Set class 'first' on first child, 'last' on last child. """
378 |         self.set_class_on_child (node, 'first', 0)
379 |         self.set_class_on_child (node, 'last', -1)
380 | 
381 |     def astext (self):
382 |         """ Return the final formatted document as a string. """
383 |         return self.preamble () + ''.join (self.context) + self.postamble ()
384 | 
385 |     def comment (self, text):
386 |         """ Output a comment. """
387 |         pass
388 |     
389 |     def text (self, text):
390 |         """ Output text. """
391 |         pass
392 | 
393 |     def sp (self, n = 1):
394 |         """ Adds vertical space before the next simple element. 
395 | 
396 |         All spaces added collapse into the largest one. """
397 | 
398 |         if n == 0:
399 |             self.vspace = 1999
400 |         else:
401 |             self.vspace = max (n, self.vspace)
402 | 
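    # Example (added): sp (2) followed by sp (1) leaves self.vspace == 2;
    # sp (0) sets the 1999 sentinel, which wins over any later, smaller
    # request.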
403 |     def src_sp (self, n = 1):
404 |         """ Add vertical space to the source. """
405 | 
406 |         if n == 0:
407 |             self.src_vspace = 1999
408 |         else:
409 |             self.src_vspace = max (n, self.src_vspace)
410 | 
411 |     def output_sp (self):
412 |         pass
413 |     
414 |     def output_src_sp (self):
415 |         pass
416 |     
417 |     def push (self):
418 |         """ Push environment. """
419 |         pass
420 |        
421 |     def pop (self):
422 |         """ Pop environment. """
423 |         pass
424 |         
425 |     def br_if_line_longer_than (self, length):
426 |         """ Go one line up if the last line was shorter than length.
427 | 
428 |         Use this to compact lists etc. """
429 |         pass
430 |         
431 |     def indent (self, by = 2):
432 |         """ Indent text. """
433 |         pass
434 | 
435 |     def rindent (self, by = 2):
436 |         """ Indent text on the right side. """
437 |         pass
438 | 
439 |     def preamble (self):
440 |         return ''
441 | 
442 |     def postamble (self):
443 |         return ''
444 | 
445 |     def visit_title (self, node):
446 |         """ Switch on the various incarnations the title element can have. """
447 | 
448 |         if isinstance (node.parent, nodes.section):
449 |             self.visit_section_title (node)
450 |         elif isinstance (node.parent, nodes.document):
451 |             self.visit_document_title (node)
452 |         elif isinstance (node.parent, nodes.table):
453 |             self.visit_table_title (node)
454 |         elif isinstance (node.parent, nodes.topic):
455 |             self.visit_topic_title (node)
456 |         elif isinstance (node.parent, nodes.sidebar):
457 |             self.visit_sidebar_title (node)
458 |         elif isinstance (node.parent, nodes.admonition):
459 |             self.visit_admonition_title (node)
460 |         else:
461 |             assert False, "Can't happen."
462 | 
463 |     def depart_title (self, node):
464 |         """ Switch on the various incarnations the title element can have. """
465 | 
466 |         if isinstance (node.parent, nodes.section):
467 |             self.depart_section_title (node)
468 |         elif isinstance (node.parent, nodes.document):
469 |             self.depart_document_title (node)
470 |         elif isinstance (node.parent, nodes.table):
471 |             self.depart_table_title (node)
472 |         elif isinstance (node.parent, nodes.topic):
473 |             self.depart_topic_title (node)
474 |         elif isinstance (node.parent, nodes.sidebar):
475 |             self.depart_sidebar_title (node)
476 |         elif isinstance (node.parent, nodes.admonition):
477 |             self.depart_admonition_title (node)
478 |         else:
479 |             assert False, "Can't happen."
480 | 
481 |     def visit_subtitle (self, node):
482 |         """ Switch on the various incarnations the subtitle element can have. """
483 | 
484 |         if isinstance (node.parent, nodes.document):
485 |             self.visit_document_subtitle (node)
486 |         else:
487 |             self.visit_section_subtitle (node)
488 |         
489 |     def depart_subtitle (self, node):
490 |         """ Switch on the various incarnations the subtitle element can have. """
491 | 
492 |         if isinstance (node.parent, nodes.document):
493 |             self.depart_document_subtitle (node)
494 |         else:
495 |             self.depart_section_subtitle (node)
496 |         
497 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/epub2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | epub2.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | A writer that writes XHTML 1 files suited for conversion into EPUB2.
13 | 
14 | """
15 | 
16 | import re
17 | 
18 | from docutils import nodes
19 | 
20 | # from epubmaker.lib.Logger import info, debug, warn, error
21 | 
22 | from epubmaker.mydocutils.writers.xhtml1 import Writer as WriterBase
23 | from epubmaker.mydocutils.writers.xhtml1 import Translator as TranslatorBase
24 | 
25 | 
26 | class Writer (WriterBase):
27 |     """ EPUB2 writer. """
28 |     
29 |     def __init__ (self):
30 |         WriterBase.__init__ (self)
31 |         self.translator_class = Translator
32 | 
33 | 
34 | class Translator (TranslatorBase):
35 |     """ HTML Translator with EPUB2 tweaks. """
36 |     
37 |     def init_css (self):
38 |         for css_file in ('rst2all.css', 'rst2epub.css'):
39 |             self.head.append ('<style type="text/css">\n%s</style>\n' % 
40 |                               self.encode (self.read_css (css_file)))
41 | 
42 | 
43 |     def calc_centering_style (self, node):
44 |         """
45 |         Rationale: The EPUB standard allows user agents to replace
46 |         `margin: auto` with `margin: 0`. Thus we cannot use `margin: auto`
47 |         to center images; we have to calculate the left margin value.
48 | 
49 |         Also we must use 'width' on the html element, not css style,
50 |         or Adobe ADE will not scale the image properly (i.e. only
51 |         horizontally).
52 | 
53 |         :align: is supposed to work on blocks. It floats or centers
54 |         a block.
55 | 
56 |         :align: center does not have the same semantics as :class: center.
57 |         The former centers the block, e.g. the whole table; the latter centers
58 |         the text, e.g. the text in every table cell.
59 | 
60 |             `:align: center`
61 |                 Used on image: centers image
62 |                 Used on figure: centers image and caption
63 |                 Used on table: centers table and caption
64 | 
65 |         """
66 | 
67 |         width = node.get ('width')
68 |         if width is None:
69 |             return []
70 |         
71 |         style = ['width: %s' % width]
72 | 
73 |         m = re.match ('(\d+)\s*%', width)
74 |         if (m):
75 |             width = max (min (int (m.group (1)), 100), 0)
76 |             margin = 100 - width
77 | 
78 |             align = node.get ('align', 'center')
79 |             if align == 'center':
80 |                 style.append ('margin-left: %d%%' % (margin / 2))
81 |             if align == 'right':
82 |                 style.append ('margin-left: %d%%' % margin)
83 |                 
84 |         node['styles'].extend (style)
85 | 
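    # Worked example (added): for a node with width '60%' and the default
    # align 'center', margin = 100 - 60 = 40, so the node's styles are
    # extended with ['width: 60%', 'margin-left: 20%'], which centers the
    # block without relying on 'margin: auto'.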
86 |     
87 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2all.css:
--------------------------------------------------------------------------------
  1 | /* 
  2 | Project Gutenberg common docutils stylesheet.
  3 | 
  4 | This stylesheet contains styles common to HTML and EPUB.  Put styles
  5 | that are specific to HTML and EPUB into their relative stylesheets.
  6 | 
  7 | :Author: Marcello Perathoner (webmaster@gutenberg.org)
  8 | :Copyright: This stylesheet has been placed in the public domain.
  9 | 
 10 | This stylesheet is based on:
 11 | 
 12 |   :Author: David Goodger (goodger@python.org)
 13 |   :Copyright: This stylesheet has been placed in the public domain.
 14 | 
 15 |   Default cascading style sheet for the HTML output of Docutils.
 16 | 
 17 | */
 18 | 
 19 | /* ADE 1.7.2 chokes on !important and throws all css out. */
 20 | 
 21 | /* FONTS */
 22 | 
 23 | .italics    { font-style: italic }
 24 | .no-italics { font-style: normal }
 25 | 
 26 | .bold       { font-weight: bold }
 27 | .no-bold    { font-weight: normal }
 28 | 
 29 | .small-caps { } /* Epub needs italics */
 30 | .gesperrt   { } /* Epub needs italics */
 31 | .antiqua    { font-style: italic } /* what else can we do ? */
 32 | .monospaced { font-family: monospace }
 33 | 
 34 | .smaller    { font-size: smaller }
 35 | .larger     { font-size: larger }
 36 | 
 37 | .xx-small   { font-size: xx-small }
 38 | .x-small    { font-size: x-small }
 39 | .small      { font-size: small }
 40 | .medium     { font-size: medium }
 41 | .large      { font-size: large }
 42 | .x-large    { font-size: x-large }
 43 | .xx-large   { font-size: xx-large }
 44 | 
 45 | .text-transform-uppercase { text-transform: uppercase }
 46 | .text-transform-lowercase { text-transform: lowercase }
 47 | .text-transform-none      { text-transform: none }
 48 | 
 49 | .red        { color: red }
 50 | .green      { color: green }
 51 | .blue       { color: blue }
 52 | .yellow     { color: yellow }
 53 | .white      { color: white }
 54 | .gray       { color: gray }
 55 | .black      { color: black }
 56 | 
 57 | /* ALIGN */
 58 | 
 59 | .left       { text-align: left }
 60 | .justify    { text-align: justify }
 61 | .center     { text-align: center; text-indent: 0 }
 62 | .centerleft { text-align: center; text-indent: 0 }
 63 | .right      { text-align: right;  text-indent: 0 }
 64 | 
 65 | /* LINE HEIGHT */
 66 | 
 67 | body        { line-height: 1.5 }
 68 | p           { margin: 0; 
 69 | 	      text-indent: 2em }
 70 | 
 71 | /* PAGINATION */
 72 | 
 73 | .title, .subtitle     { page-break-after:  avoid }
 74 | 
 75 | .container, .title, .subtitle, #pg-header            
 76 |                       { page-break-inside: avoid }
 77 | 
 78 | /* SECTIONS */
 79 | 
 80 | body         { text-align: justify }
 81 | 
 82 | p.pfirst, p.noindent { 
 83 |     text-indent: 0 
 84 | }
 85 | 
 86 | .boxed         { border: 1px solid black; padding: 1em }
 87 | .topic, .note  { margin: 5% 0; border: 1px solid black; padding: 1em }
 88 | div.section    { clear: both }
 89 | 
 90 | div.line-block       { margin: 1.5em 0 }  /* same leading as p */
 91 | div.line-block.inner { margin: 0 0 0 10% }
 92 | div.line             { margin-left: 20%; text-indent: -20%; }
 93 | .line-block.noindent div.line { margin-left: 0; text-indent: 0; }
 94 | 
 95 | hr.docutils          { margin: 1.5em 40%; border: none; border-bottom: 1px solid black; }
 96 | div.transition       { margin: 1.5em 0 }
 97 | 
 98 | .vfill, .vspace      { border: 0px solid white }
 99 | 
100 | .title               { margin: 1.5em 0 }
101 | .title.with-subtitle { margin-bottom: 0 }
102 | .subtitle            { margin: 1.5em 0 }
103 | 
104 | /* header font style */
105 | /* http://dev.w3.org/csswg/css3-fonts/#propdef-font-size */
106 | 
107 | h1.title                        { font-size: 200%; }  /* for book title only */
108 | h2.title, p.subtitle.level-1    { font-size: 150%; margin-top: 4.5em;  margin-bottom: 2em }
109 | h3.title, p.subtitle.level-2    { font-size: 120%; margin-top: 2.25em; margin-bottom: 1.25em }
110 | h4.title, p.subtitle.level-3    { font-size: 100%; margin-top: 1.5em;  margin-bottom: 1.5em;  font-weight: bold; }
111 | h5.title, p.subtitle.level-4    { font-size:  89%; margin-top: 1.87em; margin-bottom: 1.69em; font-style: italic; }
112 | h6.title, p.subtitle.level-5    { font-size:  60%; margin-top: 3.5em;  margin-bottom: 2.5em }
113 | 
114 | /* title page */
115 | 
116 | h1.title, p.subtitle.level-1,
117 | h2.title, p.subtitle.level-2    { text-align: center }
118 | 
119 | #pg-header,
120 | h1.document-title               { margin: 10% 0 5% 0 }
121 | p.document-subtitle             { margin:  0  0 5% 0 }
122 | 
123 | /* PG header and footer */
124 | #pg-machine-header { }
125 | #pg-produced-by { }
126 | 
127 | li.toc-entry            { list-style-type: none }
128 | ul.open li, ol.open li  { margin-bottom: 1.5em }
129 | 
130 | .attribution            { margin-top: 1.5em }
131 | 
132 | .example-rendered { 
133 |     margin: 1em 5%; border: 1px dotted red;  padding: 1em; background-color: #ffd }
134 | .literal-block.example-source   { 
135 |     margin: 1em 5%; border: 1px dotted blue; padding: 1em; background-color: #eef }
136 | 
137 | /* DROPCAPS */
138 | 
139 | /* BLOCKQUOTES */
140 | 
141 | blockquote { margin: 1.5em 10% }
142 | 
143 | blockquote.epigraph { }
144 | 
145 | blockquote.highlights { }
146 | 
147 | div.local-contents { margin: 1.5em 10% }
148 | 
149 | div.abstract { margin: 3em   10% }
150 | div.image    { margin: 1.5em  0  }
151 | div.caption  { margin: 1.5em  0 }
152 | div.legend   { margin: 1.5em  0 }
153 | 
154 | .hidden { display: none }
155 | 
156 | .invisible { visibility: hidden; color: white } /* white: mozilla print bug */
157 | 
158 | a.toc-backref {
159 |   text-decoration: none ;
160 |   color: black }
161 | 
162 | dl.docutils dd {
163 |   margin-bottom: 0.5em }
164 | 
165 | div.figure { margin-top: 3em; margin-bottom: 3em }
166 | 
167 | img { max-width: 100% }
168 | 
169 | div.footer, div.header {
170 |   clear: both;
171 |   font-size: smaller }
172 | 
173 | div.sidebar {
174 |   margin: 0 0 0.5em 1em ;
175 |   border: medium outset ;
176 |   padding: 1em ;
177 |   background-color: #ffffee ;
178 |   width: 40% ;
179 |   float: right ;
180 |   clear: right }
181 | 
182 | div.sidebar p.rubric {
183 |   font-family: sans-serif ;
184 |   font-size: medium }
185 | 
186 | ol.simple, ul.simple { margin: 1.5em 0 }
187 | 
188 | ol.toc-list,    ul.toc-list    { padding-left:  0  }
189 | ol ol.toc-list, ul ul.toc-list { padding-left:  5% }
190 | 
191 | ol.arabic {
192 |   list-style: decimal }
193 | 
194 | ol.loweralpha {
195 |   list-style: lower-alpha }
196 | 
197 | ol.upperalpha {
198 |   list-style: upper-alpha }
199 | 
200 | ol.lowerroman {
201 |   list-style: lower-roman }
202 | 
203 | ol.upperroman {
204 |   list-style: upper-roman }
205 | 
206 | p.credits {
207 |   font-style: italic ;
208 |   font-size: smaller }
209 | 
210 | p.label {
211 |   white-space: nowrap }
212 | 
213 | p.rubric {
214 |   font-weight: bold ;
215 |   font-size: larger ;
216 |   color: maroon ;
217 |   text-align: center }
218 | 
219 | p.sidebar-title {
220 |   font-family: sans-serif ;
221 |   font-weight: bold ;
222 |   font-size: larger }
223 | 
224 | p.sidebar-subtitle {
225 |   font-family: sans-serif ;
226 |   font-weight: bold }
227 | 
228 | p.topic-title, p.admonition-title {
229 |   font-weight: bold }
230 | 
231 | pre.address {
232 |   margin-bottom: 0 ;
233 |   margin-top: 0 ;
234 |   font: inherit }
235 | 
236 | .literal-block, .doctest-block {
237 |   margin-left: 2em ;
238 |   margin-right: 2em; }
239 | 
240 | span.classifier {
241 |   font-family: sans-serif ;
242 |   font-style: oblique }
243 | 
244 | span.classifier-delimiter {
245 |   font-family: sans-serif ;
246 |   font-weight: bold }
247 | 
248 | span.interpreted {
249 |   font-family: sans-serif }
250 | 
251 | span.option {
252 |   white-space: nowrap }
253 | 
254 | span.pre {
255 |   white-space: pre }
256 | 
257 | span.problematic {
258 |   color: red }
259 | 
260 | span.section-subtitle {
261 |   /* font-size relative to parent (h1..h6 element) */
262 |   font-size: 100% }
263 | 
264 | table { margin-top: 1.5em; margin-bottom: 1.5em; border-spacing: 0 }
265 | table.align-left, table.align-right { margin-top: 0 }
266 | 
267 | table.table                { border-collapse: collapse; }
268 | 
269 | table.table.hrules-table thead          { border: 1px solid black; border-width: 2px 0 0 }
270 | table.table.hrules-table tbody          { border: 1px solid black; border-width: 2px 0 }
271 | table.table.hrules-rows  tr             { border: 1px solid black; border-width: 0 0 1px }
272 | table.table.hrules-rows  tr.last        { border-width: 0 }
273 | table.table.hrules-rows  td, 
274 | table.table.hrules-rows  th             { padding: 1ex 1em; vertical-align: middle }
275 | 
276 | table.table tr             { border-width: 0 }
277 | table.table td, 
278 | table.table th             { padding: 0.5ex 1em }
279 | table.table tr.first td    { padding-top: 1ex }
280 | table.table tr.last td     { padding-bottom: 1ex }
281 | table.table tr.first th    { padding-top: 1ex }
282 | table.table tr.last th     { padding-bottom: 1ex }
283 | 
284 | 
285 | table.citation {
286 |   border-left: solid 1px gray;
287 |   margin-left: 1px }
288 | 
289 | table.docinfo {
290 |   margin: 3em 4em }
291 | 
292 | table.docutils { }
293 | 
294 | div.footnote-group          { margin: 1em 0 }
295 | table.footnote td.label     { width: 2em; text-align: right; padding-left: 0 }
296 | 
297 | table.docutils td, table.docutils th,
298 | table.docinfo td, table.docinfo th {
299 |   padding: 0 0.5em;
300 |   vertical-align: top }
301 | 
302 | table.docutils th.field-name, table.docinfo th.docinfo-name {
303 |   font-weight: bold ;
304 |   text-align: left ;
305 |   white-space: nowrap ;
306 |   padding-left: 0 }
307 | 
308 | /* used to remove borders from tables and images */
309 | .borderless, table.borderless td, table.borderless th {
310 |   border: 0 }
311 | 
312 | table.borderless td, table.borderless th {
313 |   /* Override padding for "table.docutils td" with "!important".
314 |      The right padding separates the table cells. */
315 |   padding: 0 0.5em 0 0 } /* FIXME: was !important */
316 | 
317 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
318 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
319 |   font-size: 100% }
320 | 
321 | ul.auto-toc {
322 |   list-style-type: none }
323 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2epub.css:
--------------------------------------------------------------------------------
 1 | /* 
 2 | Project Gutenberg EPUB docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to EPUB.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* mostly unsupported */
10 | .small-caps        { font-style: italic }
11 | .gesperrt          { font-style: italic }
12 | 
13 | /* ALIGN */
14 | 
15 | /* SECTIONS */
16 | 
17 | /* reduce screen real estate waste */
18 | body               { margin: 1% }
19 | 
20 | /* ugly hack to give more specificity, because ADE chucks out the whole
21 |    stylesheet when it sees an !important */
22 | 
23 | .first.first        { margin-top: 0; text-indent: 0 } 
24 | .last.last          { margin-bottom: 0 }
25 | 
26 | .no-page-break.no-page-break 
27 |                     { page-break-before: avoid }
28 | 
29 | /* PAGINATION */
30 | 
31 | div.clearpage       { page-break-before: always; padding-top: 10% }
32 | div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
33 | 
34 | .vfill              { margin-top: 10% }
35 | h2.title            { margin-top: 10% }
36 | 
37 | /* DIV */
38 | 
39 | a                   { text-decoration: none }
40 | .toc-pageref        { display: none }
41 | 
42 | /* DROPCAPS */
43 | 
44 | span.dropcap        { line-height: 0 }
45 | img.dropcap         { vertical-align: bottom }
46 | 
47 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2html.css:
--------------------------------------------------------------------------------
 1 | /* 
 2 | Project Gutenberg HTML docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to HTML.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* em                { font-style: normal }
10 | strong            { font-weight: normal } */
11 | 
12 | .small-caps       { font-variant: small-caps }
13 | .gesperrt         { letter-spacing: 0.1em }
14 | 
15 | /* ALIGN */
16 | 
17 | .align-left       { clear: left;
18 | 		    float: left;
19 | 		    margin-right: 1em }
20 | 
21 | .align-right      { clear: right;
22 | 		    float: right;
23 | 		    margin-left: 1em }
24 | 
25 | .align-center     { margin-left: auto;
26 | 		    margin-right: auto }
27 | 
28 | div.shrinkwrap    { display: table; }
29 | 
30 | /* SECTIONS */
31 | 
32 | body              { margin: 5% 10% 5% 10% }
33 | 
34 | /* compact list items containing just one p */
35 | li p.pfirst       { margin-top: 0; margin-bottom: 0 } 
36 | 
37 | .first            { margin-top: 0 !important; 
38 | 		    text-indent: 0 !important } 
39 | .last             { margin-bottom: 0 !important }
40 | 
41 | span.dropcap      { float: left; margin: 0 0.1em 0 0; line-height: 1 }
42 | img.dropcap       { float: left; margin: 0 0.5em 0 0; max-width: 25% }
43 | span.dropspan     { font-variant: small-caps }
44 | 
45 | .no-page-break    { page-break-before: avoid !important }
46 | 
47 | /* PAGINATION */
48 | 
49 | .pageno           { position: absolute; right: 95%; font: medium sans-serif; text-indent: 0 }
50 | .pageno:after     { color: gray; content: '[' attr(title) ']' }
51 | .lineno           { position: absolute; left:  95%; font: medium sans-serif; text-indent: 0 }
52 | .lineno:after     { color: gray; content: '[' attr(title) ']' }
53 | .toc-pageref      { float: right }
54 | 
55 | @media screen {
56 |    .coverpage, .frontispiece, .titlepage, .verso, .dedication, .plainpage
57 |                        { margin: 10% 0; }
58 | 
59 |    div.clearpage, div.cleardoublepage
60 |                        { margin: 10% 0; border: none; border-top: 1px solid gray; }
61 | 
62 |    .vfill              { margin:  5% 10% }
63 | }
64 | 
65 | @media print {
66 |    div.clearpage       { page-break-before: always; padding-top: 10% }
67 |    div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
68 | 
69 |    .vfill              { margin-top: 20% }
70 |    h2.title            { margin-top: 20% }
71 | }
72 | 
73 | /* DIV */
74 | pre               { font-family: monospace; font-size: 0.9em; white-space: pre-wrap }
75 | 
76 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/GzipPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | GzipPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Gzip a file.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileGzipPackager
16 | 
17 | TYPE = 'gzip'
18 | FORMATS = 'rst html.noimages html.images txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileGzipPackager):
21 |     """ Gzip packager. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/HTMLPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | HTMLPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package an HTML file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'html.images'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package an HTML file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/PDFPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PDFPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a PDF file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = ''.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for PDF files. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/PushPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PushPackager.py
 6 | 
 7 | Copyright 2011 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a zip containing everything that can be pushed to ibiblio.
12 | 
13 | """
14 | 
15 | from __future__ import with_statement
16 | 
17 | import os
18 | import zipfile
19 | import re
20 | 
21 | from epubmaker.lib.Logger import info, warn, error
22 | import epubmaker.lib.GutenbergGlobals as gg
23 | 
24 | from epubmaker.packagers import BasePackager
25 | 
26 | TYPE = 'ww'
27 | FORMATS = ['push']
28 | 
29 | class Packager (BasePackager):
30 |     """ Package one big zip for push.
31 | 
32 |     Zip contains one directory named after ebook_no.
33 |     This dir mirrors structure on ibiblio::
34 | 
35 |       12345/12345.txt
36 |       12345/12345.zip
37 |       12345/12345-h/12345-h.html
38 |       12345/12345-h/images/cover.jpg
39 |       12345/12345-h.zip
40 |     
41 |     """
42 | 
43 |     @staticmethod
44 |     def add (zip_, filename, memberfilename):
45 |         """ Add one file to the zip. """
46 |         
47 |         try:
48 |             os.stat (filename)
49 |             dummy_name, ext = os.path.splitext (filename)
50 |             info ('  Adding file: %s as %s' % (filename, memberfilename))
51 |             zip_.write (filename, memberfilename,
52 |                         zipfile.ZIP_STORED if ext in ['.zip', '.png']
53 |                         else zipfile.ZIP_DEFLATED)
54 |         except OSError:
55 |             # warn ('PushPackager: Cannot find file %s', filename)
56 |             return
57 | 
58 | 
59 |     def package (self, aux_file_list = []):
60 |         zipfilename = self.options.outputfile # filename is zipfile
61 | 
62 |         m = re.match (r'\d+', zipfilename)
63 |         if m:
64 |             ebook_no = m.group (0)
65 |         else:
66 |             error ('Invalid filename %s for push packager.' % zipfilename)
67 |             return
68 | 
69 |         info ('Creating Zip file: %s' % zipfilename)
70 | 
71 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
72 | 
73 |         for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split ():
74 |             filename = '%s%s' % (ebook_no, suffix)
75 |             memberfilename = '%s/%s' % (ebook_no, filename)
76 |             self.add (zip_, filename, memberfilename)
77 | 
78 |         for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
79 |             filename = '%s%s.%s' % (ebook_no, suffix, ext)
80 |             memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
81 |             self.add (zip_, filename, memberfilename)
82 | 
83 |             # image files
84 |             for url in aux_file_list:
85 |                 rel_url = gg.make_url_relative (self.options.base_url, url)
86 |                 filename = os.path.join (self.path, rel_url)
87 |                 memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
88 |                 self.add (zip_, filename, memberfilename)
89 | 
90 |         zip_.close ()
91 | 
92 |         info ('Done Zip file: %s' % zipfilename)
93 | 
94 |     
95 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/RSTPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | RSTPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package an RST file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'rst.gen'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package an RST file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/TxtPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | TxtPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a Txt file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for plain text files. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Packager package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Base class for Packager modules.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os.path
 19 | import gzip
 20 | import zipfile
 21 | 
 22 | from pkg_resources import resource_listdir  # pylint: disable=E0611
 23 | 
 24 | from epubmaker.lib.Logger import debug, info, warn, error
 25 | import epubmaker.lib.GutenbergGlobals as gg
 26 | 
 27 | GZIP_EXTENSION = '.gzip'
 28 | 
 29 | class BasePackager (object):
 30 |     """
 31 |     Base class for Packagers.
 32 | 
 33 |     """
 34 | 
 35 |     def __init__ (self):
 36 |         self.options = None
 37 |         self.path_name_ext = None
 38 |         self.path = None
 39 |         self.name = None
 40 |         self.ext = None
 41 | 
 42 | 
 43 |     def setup (self, options):
 44 |         """ Setup """
 45 |         
 46 |         self.options = options
 47 |         self.path_name_ext = os.path.join (self.options.outputdir, self.options.outputfile)
 48 |         self.path, name = os.path.split (self.path_name_ext)
 49 |         self.name, self.ext = os.path.splitext (name)
 50 | 
 51 | 
 52 |     def package (self, aux_file_list = []):
 53 |         """ Package files. """
 54 |         pass
 55 | 
 56 | 
 57 | class OneFileGzipPackager (BasePackager):
 58 |     """ Gzips one file. """
 59 | 
 60 |     def package (self, aux_file_list = []):
 61 |         filename = self.path_name_ext
 62 |         gzfilename = filename + GZIP_EXTENSION
 63 | 
 64 |         try:
 65 |             info ('Creating Gzip file: %s' % gzfilename)
 66 |             with open (filename, 'r') as fp:
 67 |                 fpgz = gzip.open (gzfilename, 'w')
 68 |                 info ('  Adding file: %s' % filename)
 69 |                 fpgz.write (fp.read ())
 70 |                 fpgz.close ()
 71 |                 info ('Done Zip file: %s' % gzfilename)
 72 |         except IOError, what:
 73 |             error (what)
 74 |             
 75 | 
 76 | class OneFileZipPackager (BasePackager):
 77 |     """ Packages one file in zip of the same name. """
 78 | 
 79 |     def package (self, aux_file_list = []):
 80 |         filename = self.path_name_ext
 81 |         zipfilename = os.path.join (self.path, self.name) + '.zip'
 82 |         memberfilename = self.name + self.ext
 83 | 
 84 |         info ('Creating Zip file: %s' % zipfilename)
 85 | 
 86 |         try:
 87 |             os.stat (filename)
 88 |         except OSError:
 89 |             # warn ('Packager: Cannot find file %s', filename)
 90 |             return
 91 |         
 92 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
 93 |         info ('  Adding file: %s as %s' % (filename, memberfilename))
 94 |         zip_.write (filename, memberfilename)
 95 |         zip_.close ()
 96 | 
 97 |         info ('Done Zip file: %s' % zipfilename)
 98 | 
 99 | 
100 | class HTMLishPackager (BasePackager):
101 |     """ Package a file with images. """
102 | 
103 |     def package (self, aux_file_list = []):
104 |         
105 |         filename = self.options.outputfile
106 |         zipfilename = os.path.join (self.path, self.name) + '.zip'
107 |         memberfilename = os.path.join (self.name, self.name) + self.ext
108 | 
109 |         info ('Creating Zip file: %s' % zipfilename)
110 | 
111 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
112 |         info ('  Adding file: %s as %s' % (filename, memberfilename))
113 |         zip_.write (filename, memberfilename)
114 | 
115 |         # now images
116 |         for url in aux_file_list:
117 |             rel_url = gg.make_url_relative (self.options.base_url, url)
118 |             filename = os.path.join (self.path, rel_url)
119 |             memberfilename = os.path.join (self.name, rel_url)
120 |             info ('  Adding file: %s as %s' % (filename, memberfilename))
121 |             zip_.write (filename, memberfilename)
122 |         
123 |         zip_.close ()
124 | 
125 |         info ('Done Zip file: %s' % zipfilename)
126 | 
127 |     
128 | class PackagerFactory (object):
129 |     """ Implements Factory pattern for packagers. """
130 | 
131 |     packagers = {}
132 | 
133 |     def __init__ (self, type_):
134 |         self.type = type_
135 |         
136 | 
137 |     def load (self):
138 |         """ Load the packagers in the packagers directory. """
139 | 
140 |         for fn in resource_listdir ('epubmaker.packagers', ''):
141 |             modulename, ext = os.path.splitext (fn)
142 |             if ext == '.py':
143 |                 if modulename.endswith ('Packager'):
144 |                     module = __import__ ('epubmaker.packagers.' + modulename,
145 |                                          fromlist = [modulename])
146 |                     if self.type == module.TYPE:
147 |                         debug ("Loading packager type: %s from module: %s for formats: %s" % (
148 |                             self.type, modulename, ', '.join (module.FORMATS)))
149 |                         for format_ in module.FORMATS:
150 |                             self.packagers[format_] = module
151 | 
152 |         return self.packagers.keys ()
153 | 
154 | 
155 |     def unload (self):
156 |         """ Unload packager modules. """
157 | 
158 |         for k in self.packagers.keys ():
159 |             del self.packagers[k]
160 | 
161 | 
162 |     def create (self, format_):
163 |         """ Create a packager for format. """
164 | 
165 |         try:
166 |             return self.packagers[format_].Packager ()
167 |         except KeyError:
168 |             raise KeyError ('No packager for type %s' % format_)
169 |     
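# Usage sketch (added; `options` and `aux_file_list` are assumed to come
# from the surrounding epubmaker run):
#
#   factory = PackagerFactory ('ww')
#   factory.load ()
#   packager = factory.create ('html.images')
#   packager.setup (options)
#   packager.package (aux_file_list)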
170 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/AuxParser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | AuxParser.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Open a URL and return raw data.
13 | 
14 | """
15 | 
16 | 
17 | from epubmaker.parsers import ParserBase
18 | 
19 | mediatypes = ('*/*', )
20 | 
21 | class Parser (ParserBase):
22 |     """ Parse an auxiliary file. """
23 | 
24 |     def __init__ (self):
25 |         ParserBase.__init__ (self)
26 |         self.data = None
27 | 
28 | 
29 |     def parse (self):
30 |         """ Parse the file. """
31 |         self.data = self.bytes_content ()
32 | 
33 | 
34 |     def serialize (self):
35 |         """ Serialize file to string. """
36 |         return self.data
37 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/CSSParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | CSSParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Open a URL and parse the CSS.
 13 | 
 14 | """
 15 | 
 16 | import re
 17 | import urlparse
 18 | import logging
 19 | 
 20 | import cssutils
 21 | 
 22 | from epubmaker.lib.Logger import debug
 23 | from epubmaker.lib.MediaTypes import mediatypes as mt
 24 | 
 25 | from epubmaker.parsers import ParserBase
 26 | 
 27 | RE_ELEMENT = re.compile (r'((?:^|\s)[a-z0-9]+)', re.I)
 28 | 
 29 | mediatypes = (mt.css, )
 30 | 
 31 | class Parser (ParserBase):
 32 |     """ Parse an external CSS file. """
 33 | 
 34 |     def __init__ (self):
 35 |         cssutils.log.setLog (logging.getLogger ('cssutils'))
 36 |         # logging.DEBUG is way too verbose
 37 |         cssutils.log.setLevel (max (cssutils.log.getEffectiveLevel (), logging.INFO))
 38 |         ParserBase.__init__ (self)
 39 |         self.sheet = None
 40 | 
 41 | 
 42 |     def parse (self):
 43 |         """ Parse the CSS file. """
 44 | 
 45 |         if self.sheet is not None:
 46 |             return
 47 |         
 48 |         parser = cssutils.CSSParser ()
 49 |         if self.fp:
 50 |             self.sheet = parser.parseString (self.bytes_content (), encoding = self.encoding)
 51 |         else:
 52 |             self.sheet = parser.parseUrl (self.url)
 53 | 
 54 |         self.mediatype = 'text/css'
 55 |         self.unpack_media_handheld (self.sheet)
 56 |         self.lowercase_selectors (self.sheet)
 57 | 
 58 | 
 59 |     def parse_string (self, s):
 60 |         """ Parse the CSS in string. """
 61 | 
 62 |         if self.sheet is not None:
 63 |             return
 64 |         
 65 |         parser = cssutils.CSSParser ()
 66 |         self.sheet = parser.parseString (s, encoding = 'utf-8')
 67 | 
 68 |         self.mediatype = 'text/css'
 69 |         self.unpack_media_handheld (self.sheet)
 70 |         self.lowercase_selectors (self.sheet)
 71 | 
 72 | 
 73 |     @staticmethod
 74 |     def iter_properties (sheet):
 75 |         """ Iterate on properties in css. """
 76 |         for rule in sheet:
 77 |             if rule.type == rule.STYLE_RULE:
 78 |                 for prop in rule.style:
 79 |                     yield prop
 80 | 
 81 | 
 82 |     @staticmethod
 83 |     def unpack_media_handheld (sheet):
 84 |         """ unpack a @media handheld rule """
 85 |         for rule in sheet:
 86 |             if rule.type == rule.MEDIA_RULE:
 87 |                 if rule.media.mediaText.find ('handheld') > -1:
 88 |                     debug ("Unpacking CSS @media handheld rule.")
 89 |                     rule.media.mediaText = 'all'
 90 |                     rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)
 91 | 
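    # Example (added): a rule like '@media handheld { p { margin: 0 } }' is
    # rewritten to target 'all', with the '/* was @media handheld */'
    # comment inserted as its first rule.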
 92 | 
 93 |     @staticmethod
 94 |     def lowercase_selectors (sheet):
 95 |         """ make selectors lowercase to match xhtml tags """
 96 |         for rule in sheet:
 97 |             if rule.type == rule.STYLE_RULE:
 98 |                 for sel in rule.selectorList:
 99 |                     sel.selectorText = RE_ELEMENT.sub (lambda m: m.group(1).lower (),
100 |                                                        sel.selectorText)
101 | 
102 | 
103 |     def rewrite_links (self, f):
104 |         """ Rewrite all links using the function f. """
105 |         cssutils.replaceUrls (self.sheet, f)
106 | 
107 | 
108 |     def drop_floats (self):
109 |         """ Drop all floats in stylesheet.
110 | 
111 |         """
112 | 
113 |         for prop in self.iter_properties (self.sheet):
114 |             if prop and prop.name == 'float': # test for existence because we remove
115 |                 prop.parent.removeProperty ('float')
116 |                 prop.parent.removeProperty ('width')
117 |                 prop.parent.removeProperty ('height')
118 |             elif prop and prop.name in ('position', 'left', 'right', 'top', 'bottom'):
119 |                 prop.parent.removeProperty (prop.name)
120 |                 
121 |         for prop in self.iter_properties (self.sheet):
122 |             #print prop.name
123 |             #print prop.value
124 |             if prop and prop.value.endswith ('px'): # test for existence because we remove
125 |                 prop.parent.removeProperty (prop.name)
126 | 
127 | 
128 |     def get_image_urls (self):
129 |         """ Return the urls of all images in document.
130 | 
131 |         Images are graphic files. The user may choose if he wants
132 |         images included or not.
133 | 
134 |         """
135 | 
136 |         images = []
137 |         
138 |         for prop in self.iter_properties (self.sheet):
139 |             if (prop.value.cssValueType == prop.value.CSS_PRIMITIVE_VALUE and
140 |                 prop.value.primitiveType == prop.value.CSS_URI):
141 |                 url = urlparse.urljoin (self.url, prop.value.cssText)
142 |                 images.append (url)
143 |             
144 |         return  images
145 | 
146 | 
147 |     def get_aux_urls (self):
148 |         """ Return the urls of all auxiliary files in document.
149 | 
150 |         Auxiliary files are non-document files you need to correctly
151 |         display the document file, eg. CSS files.
152 | 
153 |         """
154 | 
155 |         aux = []
156 |         
157 |         for rule in self.sheet:
158 |             if rule.type == rule.IMPORT_RULE:
159 |                 aux.append (urlparse.urljoin (self.url, rule.href))
160 | 
161 |         return  aux
162 | 
163 | 
164 |     def serialize (self):
165 |         """ Serialize CSS. """
166 | 
167 |         return self.sheet.cssText
168 | 
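
A minimal usage sketch (not part of the source) of the @media handheld
unpacking performed by Parser.unpack_media_handheld above, driven through
cssutils directly; the sample stylesheet is invented for illustration:

    import cssutils

    sheet = cssutils.parseString ('@media handheld { body { font-size: 90% } }')
    for rule in sheet:
        if rule.type == rule.MEDIA_RULE and 'handheld' in rule.media.mediaText:
            # same transformation as unpack_media_handheld ()
            rule.media.mediaText = 'all'
            rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)

    print sheet.cssText   # the rule body now applies to all media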


--------------------------------------------------------------------------------
/epubmaker/parsers/HTMLParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | import re
 15 | import subprocess
 16 | import urllib
 17 | import urlparse
 18 | 
 19 | import lxml.html
 20 | from lxml import etree
 21 | # import tidy
 22 | 
 23 | from epubmaker.lib.GutenbergGlobals import NS, xpath
 24 | from epubmaker.lib.Logger import info, debug, warn, error
 25 | from epubmaker.lib.MediaTypes import mediatypes as mt
 26 | 
 27 | from epubmaker import parsers
 28 | from epubmaker.parsers import HTMLParserBase
 29 | 
 30 | mediatypes = ('text/html', mt.xhtml)
 31 | 
 32 | RE_XMLDECL = re.compile ('<\?xml[^?]+\?>\s*')
 33 | 
 34 | DEPRECATED = { 'align':      """caption applet iframe img input object legend
 35 |                              table hr div h1 h2 h3 h4 h5 h6 p""",
 36 |                'alink':      'body',
 37 |                'alt':        'applet',
 38 |                'archive':    'applet',
 39 |                'background': 'body',
 40 |                'bgcolor':    '*',
 41 |                'border':     'img object',
 42 |                'clear':      'br',
 43 |                'code':       'applet',
 44 |                'codebase':   'applet',
 45 |                'color':      '*',
 46 |                'compact':    '*',
 47 |                'face':       '*',
 48 |                'height':     'td th applet',
 49 |                'hspace':     '*',
 50 |                'language':   'script',
 51 |                'link':       'body',
 52 |                'name':       'applet',
 53 |                'noshade':    'hr',
 54 |                'nowrap':     '*',
 55 |                'object':     'applet',
 56 |                'prompt':     'isindex',
 57 |                'size':       'hr font basefont',
 58 |                'start':      'ol',
 59 |                'text':       'body',
 60 |                'type':       'li ol ul',
 61 |                'value':      'li',
 62 |                'version':    'html',
 63 |                'vlink':      'body',
 64 |                'vspace':     '*',
 65 |                'width':      'hr td th applet pre',
 66 |                }
 67 | 
 68 | 
 69 | class Parser (HTMLParserBase):
  70 |     """ Parse an HTML text
 71 | 
 72 |     and convert it to xhtml suitable for ePub packaging.
 73 | 
 74 |     """
 75 | 
 76 |     @staticmethod
 77 |     def _fix_id (id_):
 78 |         """ Fix more common mistakes in ids.
 79 | 
 80 |         xml:id cannot start with digit, very common in pg.
 81 | 
 82 |         """
 83 | 
 84 |         if not parsers.RE_XML_NAME.match (id_):
 85 |             id_ = 'id_' + id_
 86 | 
 87 |         # debug ("_fix_id: id = %s" % id_)
 88 |         return id_
 89 | 
 90 | 
 91 |     def _fix_internal_frag (self, id_):
 92 |         """ Fix more common mistakes in ids. """
 93 | 
 94 |         # This is a big mess because href attributes must be quoted,
 95 |         # but id attributes must not be quoted.  Some HTML in PG
 96 |         # quotes ids in a misguided attempt to make id and href look
 97 |         # the same.  But '%' is invalid in xml ids.
 98 |         #
 99 |         # See HTML 4.01 spec section B.2.
100 | 
101 |         if '%' in id_:
102 |             id_ = urllib.unquote (id_)
103 |             try:
104 |                 id_ = id_.decode ('utf-8')
105 |             except UnicodeError:
106 |                 try:
107 |                     id_ = id_.decode (self.encoding)
108 |                 except UnicodeError:
109 |                     pass # we tried
110 | 
111 |         # xml:id cannot start with digit
112 |         # very common in pg
113 | 
114 |         if not parsers.RE_XML_NAME.match (id_):
115 |             id_ = 'id_' + id_
116 | 
117 |         if not parsers.RE_XML_NAME.match (id_):
118 |             # still invalid ... we tried
119 |             return None
120 | 
121 |         # debug ("_fix_internal_frag: frag = %s" % id_)
122 |         return id_
123 | 
124 | 
125 |     # @staticmethod
126 |     # def tidylib (html):
127 |     #     """ Pipe html thru w3c tidylib. """
128 | 
129 |     #     html = parsers.RE_RESTRICTED.sub ('', html)
130 |     #     html = RE_XMLDECL.sub ('', html)
131 |     #     html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
132 | 
133 |     #     options = {
134 |     #         "clean": 1,
135 |     #         "wrap":  0,
136 |     #         "output_xhtml":     1,
137 |     #         "numeric_entities": 1,
138 |     #         "merge_divs":       0, # keep poetry indentation
139 |     #         "merge_spans":      0,
140 |     #         "add_xml_decl":     0,
141 |     #         "doctype":          "strict",
142 |     #         "anchor_as_name":   0,
143 |     #         "enclose_text":     1,
144 |     #         }
145 | 
146 |     #     try:
147 |     #         html = tidy.parseString (html.encode ('utf-8'))
148 |     #     except TidyLibError, what:
149 |     #         error ("Tidy: %s" % what)
150 |     #         raise
151 | 
152 |     #     return html
153 | 
154 | 
155 |     @staticmethod
156 |     def tidy (html):
157 |         """ Pipe html thru w3c tidy. """
158 | 
159 |         html = parsers.RE_RESTRICTED.sub ('', html)
160 |         html = RE_XMLDECL.sub ('', html)
161 |         html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
162 | 
163 |         # convert to xhtml
164 |         tidy = subprocess.Popen (
165 |             ["tidy",
166 |              "-utf8",
167 |              "-clean",
168 |              "--wrap",             "0",
169 |              # "--drop-font-tags",   "y",
170 |              # "--drop-proprietary-attributes", "y",
171 |              # "--add-xml-space",    "y",
172 |              "--output-xhtml",     "y",
173 |              "--numeric-entities", "y",
174 |              "--merge-divs",       "n", # keep poetry indentation
175 |              "--merge-spans",      "n",
176 |              "--add-xml-decl",     "n",
177 |              "--doctype",          "strict",
178 |              "--anchor-as-name",   "n",
179 |              "--enclose-text",     "y" ],
180 | 
181 |             stdin = subprocess.PIPE,
182 |             stdout = subprocess.PIPE,
183 |             stderr = subprocess.PIPE)
184 | 
185 |         # print (html.encode ('utf-8'))
186 |         # sys.exit ()
187 | 
188 |         (html, stderr) = tidy.communicate (html.encode ('utf-8'))
189 | 
190 |         regex = re.compile ('(Info:|Warning:|Error:)\s*', re.I)
191 | 
192 |         # pylint: disable=E1103
193 |         msg = stderr.rstrip ()
194 |         for line in msg.splitlines ():
195 |             match = regex.search (line)
196 |             if match:
197 |                 sline = regex.sub ("", line)
198 |                 g = match.group (1).lower ()
199 |                 if g == 'info:':
200 |                     info ("tidy: %s" % sline)
201 |                 elif g == 'warning:':
202 |                     warn ("tidy: %s" % sline)
203 |                 elif g == 'error:':
204 |                     error ("tidy: %s" % sline)
205 |                 else:
206 |                     error (line)
207 | 
208 |         if tidy.returncode == 2:
209 |             raise ValueError, stderr
210 | 
211 |         return html.decode ('utf-8')
212 | 
213 | 
214 |     def find_coverpage (self):
215 |         """ Search coverpage and put url into <link rel="coverpage" >.
216 | 
217 |         First look for an image with id of 'coverpage', then for an
218 |         image with 'cover' in the url, then with 'title' in the url.
219 | 
220 |         """
221 |         for head in xpath (self.xhtml, 'xhtml:head'):
222 |             for dummy_link in xpath (head, 'xhtml:link[@rel = "coverpage"]'):
223 |                 # already there
224 |                 return
225 | 
226 |             covers = (xpath (self.xhtml, '//xhtml:img[@id = "coverpage"]') or
227 |                       xpath (self.xhtml, '//xhtml:img[contains (@src, "cover")]') or
228 |                       xpath (self.xhtml, '//xhtml:img[contains (@src, "title")]'))
229 |             if not covers:
230 |                 return
231 | 
232 |             href = covers[0].get ('src')
233 |             # FIXME: enforce minimum size
234 |             head.append (etree.Element (NS.xhtml.link, rel = 'coverpage', href = href))
235 |             return href
236 | 
237 | 
238 |     def _fix_anchors (self):
239 |         """ Move name to id and fix hrefs and ids. """
240 | 
241 |         # move anchor name to id
242 |         # 'id' values are more strict than 'name' values
243 |         # try to fix ill-formed ids
244 | 
245 |         seen_ids = set ()
246 | 
247 |         for anchor in (xpath (self.xhtml, "//xhtml:a[@name]") +
248 |                        xpath (self.xhtml, "//xhtml:*[@id]")):
249 |             id_ = anchor.get ('id') or anchor.get ('name')
250 | 
251 |             if 'name' in anchor.attrib:
252 |                 del anchor.attrib['name']
253 |             if 'id' in anchor.attrib:
254 |                 del anchor.attrib['id']
255 |             if NS.xml.id in anchor.attrib:
256 |                 del anchor.attrib[NS.xml.id]
257 | 
258 |             id_ = self._fix_id (id_)
259 | 
260 |             if not parsers.RE_XML_NAME.match (id_):
261 |                 error ("Dropping ill-formed id '%s' in %s" % (id_, self.url))
262 |                 continue
263 | 
264 |             # well-formed id
265 |             if id_ in seen_ids:
266 |                 error ("Dropping duplicate id '%s' in %s" % (id_, self.url))
267 |                 continue
268 | 
269 |             seen_ids.add (id_)
270 |             anchor.set ('id', id_)
271 | 
272 | 
273 |         # try to fix bogus fragment ids
274 |         # 1. fragments point to xml:id, so must be well-formed ids
275 |         # 2. the ids they point to must exist
276 | 
277 |         for link in xpath (self.xhtml, "//xhtml:*[@href]"):
278 |             href = link.get ('href')
279 |             hre, frag = urlparse.urldefrag (href)
280 |             if frag:
281 |                 frag = self._fix_internal_frag (frag)
282 | 
283 |                 if not frag:
284 |                     # non-recoverable ill-formed frag
285 |                     del link.attrib['href']
286 |                     self.add_class (link, 'pgkilled')
287 |                     error ('Dropping ill-formed frag in %s' % href)
288 |                     continue
289 | 
290 |                 # well-formed frag
291 |                 if hre:
292 |                     # we have url + frag
293 |                     link.set ('href', "%s#%s" % (hre, urllib.quote (frag.encode ('utf-8'))))
294 |                     self.add_class (link, 'pgexternal')
295 |                 elif frag in seen_ids:
296 |                     # we have only frag
297 |                     link.set ('href', "#%s" % urllib.quote (frag.encode ('utf-8')))
298 |                     self.add_class (link, 'pginternal')
299 |                 else:
300 |                     del link.attrib['href']
301 |                     self.add_class (link, 'pgkilled')
302 |                     error ("Dropping frag to non-existing id in %s" % href)
303 | 
304 | 
305 |     def _to_xhtml11 (self):
 306 |         """ Make vanilla xhtml conform more closely to xhtml 1.1. """
307 | 
308 |         # Change content-type meta to application/xhtml+xml.
309 |         for meta in xpath (self.xhtml, "/xhtml:html/xhtml:head/xhtml:meta[@http-equiv]"):
310 |             if meta.get ('http-equiv').lower () == 'content-type':
311 |                 meta.set ('content', mt.xhtml + '; charset=utf-8')
312 | 
313 |         # drop javascript
314 | 
315 |         for script in xpath (self.xhtml, "//xhtml:script"):
316 |             script.drop_tree ()
317 | 
318 |         # drop form
319 | 
320 |         for form in xpath (self.xhtml, "//xhtml:form"):
321 |             form.drop_tree ()
322 | 
323 |         # blockquotes
324 | 
325 |         for bq in xpath (self.xhtml, "//xhtml:blockquote"):
326 |             # no naked text allowed in <blockquote>
327 |             div = etree.Element (NS.xhtml.div)
328 |             for child in bq:
329 |                 div.append (child)
330 |             div.text = bq.text
331 |             bq.text = None
332 |             bq.append (div)
333 |             # lxml.html.defs.block_tags
334 | 
335 |         # insert tbody
336 | 
337 |         for table in xpath (self.xhtml, "//xhtml:table[xhtml:tr]"):
338 |             # no naked <tr> allowed in <table>
339 |             tbody = etree.Element (NS.xhtml.tbody)
340 |             for tr in table:
341 |                 if tr.tag == NS.xhtml.tr:
342 |                     tbody.append (tr)
343 |             table.append (tbody)
344 | 
345 |         # move lang to xml:lang
346 | 
347 |         for elem in xpath (self.xhtml, "//xhtml:*[@lang]"):
348 |             # bug in lxml 2.2.2: sometimes deletes wrong element
349 |             # so we delete both and reset the right one
350 |             lang = elem.get ('lang')
351 |             try:
352 |                 del elem.attrib[NS.xml.lang]
353 |             except KeyError:
354 |                 pass
355 |             del elem.attrib['lang']
356 |             elem.set (NS.xml.lang, lang)
357 | 
358 |         # strip deprecated attributes
359 | 
360 |         for a, t in DEPRECATED.items ():
361 |             for tag in t.split ():
362 |                 for elem in xpath (self.xhtml, "//xhtml:%s[@%s]" % (tag, a)):
363 |                     del elem.attrib[a]
364 | 
365 |         # strip empty class attributes
366 | 
367 |         for elem in xpath (self.xhtml,
368 |             "//xhtml:*[@class and normalize-space (@class) = '']"):
369 |             del elem.attrib['class']
370 | 
371 |         # strip bogus header markup by Joe L.
372 |         for elem in xpath (self.xhtml, "//xhtml:h1"):
373 |             if elem.text and elem.text.startswith ("The Project Gutenberg eBook"):
374 |                 elem.tag = NS.xhtml.p
375 |         for elem in xpath (self.xhtml, "//xhtml:h3"):
376 |             if elem.text and elem.text.startswith ("E-text prepared by"):
377 |                 elem.tag = NS.xhtml.p
378 | 
379 | 
380 |     def __parse (self, html):
381 |         # remove xml decl and doctype, we will add the correct one before serializing
382 |         # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
383 |         # FIXME: do not remove doctype because we need it to load the dtd
384 | 
385 |         # remove xml declaration because of parser error: "Unicode
386 |         # strings with encoding declaration are not supported. Please
387 |         # use bytes input or XML fragments without declaration."
388 |         re_xml_decl = re.compile (r'^<\?xml.*?\?>', re.S)
389 |         html = re_xml_decl.sub ('', html)
390 |         try:
391 |             return etree.fromstring (
392 |                 html,
393 |                 lxml.html.XHTMLParser (),
394 |                 base_url = self.url)
395 |         except etree.ParseError, what:
396 |             # cannot try HTML parser because we depend on correct xhtml namespace
397 |             error ("etree.fromstring says: %s" % what)
398 |             m = re.search (r'line\s(\d+),', str (what))
399 |             if m:
400 |                 lineno = int (m.group (1))
401 |                 error ("Line %d: %s" % (lineno, html.splitlines ()[lineno - 1]))
402 |             raise
403 | 
404 | 
405 |     def pre_parse (self):
406 |         """ Pre-parse a html ebook. Does a full parse because a
407 |         lightweight parse would be almost as much work. """
408 | 
409 |         # cache
410 |         if self.xhtml is not None:
411 |             return
412 | 
413 |         debug ("HTMLParser.pre_parse () ...")
414 | 
415 |         html = self.unicode_content ()
416 | 
417 |         if html.startswith ('<?xml'):
418 |             # Try a naive parse. This might fail because of errors in
419 |             # the html or because we have no dtd loaded.  We do not
420 |             # load dtds because that makes us dependent on network and
421 |             # the w3c site being up.  Having all users of epubmaker
422 |             # install local dtds is unrealistic.
423 |             try:
424 |                 self.xhtml = self.__parse (html)
425 |             except etree.ParseError:
426 |                 pass
427 | 
428 |         if self.xhtml is None:
429 |             # previous parse failed, try tidy
430 |             info ("Running html thru tidy.")
431 |             html = self.tidy (html)
432 |             self.xhtml = self.__parse (html)     # let exception bubble up
433 | 
434 |         self._fix_anchors () # needs relative paths
435 |         self.xhtml.make_links_absolute (base_url = self.url)
436 |         self.find_coverpage ()
437 | 
438 |         self._to_xhtml11 ()
439 | 
440 |         debug ("Done parsing %s" % self.url)
441 | 
442 | 
443 |     def parse (self):
444 |         """ Fully parse a html ebook. """
445 | 
446 |         debug ("HTMLParser.parse () ...")
447 | 
448 |         self.pre_parse ()
449 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/ImageParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | ImageParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
  12 | Parse a URL of type image/*.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import StringIO
 19 | 
 20 | from PIL import Image
 21 | 
 22 | from pkg_resources import resource_string # pylint: disable=E0611
 23 | 
 24 | from epubmaker.lib.Logger import debug, error
 25 | from epubmaker.lib.MediaTypes import mediatypes as mt
 26 | from epubmaker.parsers import ParserBase
 27 | 
 28 | mediatypes = (mt.jpeg, mt.png, mt.gif)
 29 | 
 30 | class Parser (ParserBase):
 31 |     """Parse an image.
 32 | 
 33 |     And maybe resize it for ePub packaging.
 34 | 
 35 |     """
 36 | 
 37 |     def __init__ (self):
 38 |         ParserBase.__init__ (self)
 39 |         self.image_data = None
 40 |         self.dimen = None
 41 |         self.comment = None
 42 | 
 43 | 
 44 |     def resize_image (self, max_size, max_dimen, output_format = None):
 45 |         """ Create a new parser with a resized image. """
 46 | 
 47 |         new_parser = Parser ()
 48 | 
 49 |         try:
 50 |             image = Image.open (StringIO.StringIO (self.image_data))
 51 | 
 52 |             format_ = image.format
 53 |             if output_format:
 54 |                 format_ = output_format
 55 |             if format_ == 'gif':
 56 |                 format_ = 'png'
 57 |             if format_ == 'jpeg' and image.mode.lower () != 'rgb':
 58 |                 image = image.convert ('RGB')
 59 | 
 60 |             if 'dpi' in image.info:
 61 |                 del image.info['dpi']
 62 | 
 63 |             # maybe resize image
 64 | 
 65 |             # find scaling factor
 66 |             scale = 1.0
 67 |             scale = min (scale, max_dimen[0] / float (image.size[0]))
 68 |             scale = min (scale, max_dimen[1] / float (image.size[1]))
 69 | 
 70 |             was = ''
 71 |             if scale < 1.0:
 72 |                 dimen = (int (image.size[0] * scale), int (image.size[1] * scale))
 73 |                 was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
 74 |                 image = image.resize (dimen, Image.ANTIALIAS)
 75 | 
 76 |             # find best quality that fits into max_size
 77 |             data = self.image_data
 78 |             if (scale < 1.0) or (len (self.image_data) > max_size):
 79 |                 for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
 80 |                     buf = StringIO.StringIO ()
 81 |                     image.save (buf, format_, quality = quality)
 82 |                     data = buf.getvalue ()
 83 |                     if (len (data) <= max_size):
 84 |                         was += 'q=%d' % quality
 85 |                         break
 86 | 
 87 |             comment = "Image: %d x %d size=%d %s" % (
 88 |                         image.size[0], image.size[1], len (data), was)
 89 |             debug (comment)
 90 | 
 91 |             new_parser.mediatype = self.mediatype
 92 |             new_parser.image_data = data
 93 |             new_parser.dimen = tuple (image.size)
 94 |             new_parser.comment = comment
 95 |             new_parser.url = self.url
 96 |             new_parser.orig_url = self.orig_url
 97 |             new_parser.attribs = self.attribs
 98 |             new_parser.fp = self.fp
 99 | 
100 |         except IOError, what:
101 |             error ("Could not resize image: %s" % what)
102 |             new_parser.broken_image ()
103 | 
104 |         return new_parser
105 | 
106 | 
107 |     def get_image_dimen (self):
108 |         if self.dimen is None:
109 |             image = Image.open (StringIO.StringIO (self.image_data))
110 |             self.dimen = image.size
111 |         return self.dimen
112 | 
113 | 
114 |     def broken_image (self):
115 |         """ Insert broken image placeholder. """
116 | 
117 |         self.image_data = resource_string ('epubmaker.parsers', 'broken.png')
118 |         # We need a way to distinguish between pngs to drop and pngs
119 |         # to keep in a non-images build.
120 |         self.mediatype = 'image/png;type=resource'
121 | 
122 | 
123 |     def pre_parse (self):
124 |         if self.image_data is None:
125 |             self.image_data = self.bytes_content ()
126 |         if self.image_data is None:
127 |             self.broken_image ()
128 | 
129 | 
130 |     def parse (self):
131 |         """ Parse the image. """
132 | 
133 |         pass
134 | 
135 | 
136 |     def serialize (self):
137 |         """ Serialize the image. """
138 |         return self.image_data
139 | 
140 | 
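
A minimal sketch (not part of the source) of the scaling rule used in
Parser.resize_image above: an image is shrunk by the largest factor <= 1.0
that fits both dimensions into max_dimen, and is never upscaled:

    def scale_factor (size, max_dimen):
        """ Same arithmetic as resize_image (). """
        scale = 1.0
        scale = min (scale, max_dimen[0] / float (size[0]))
        scale = min (scale, max_dimen[1] / float (size[1]))
        return scale

    print scale_factor ((1600, 1200), (800, 600))   # -> 0.5
    print scale_factor ((400, 300), (800, 600))     # -> 1.0  (never upscaled)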


--------------------------------------------------------------------------------
/epubmaker/parsers/RSTParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | RSTParser.py
  7 | 
  8 | Copyright 2010-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | # FIXME:
 15 | # use docinfo instead of meta for pg header
 16 | 
 17 | import copy
 18 | import re
 19 | import os
 20 | import collections
 21 | import urlparse
 22 | from functools import partial
 23 | 
 24 | from lxml import etree
 25 | import lxml.html
 26 | 
 27 | import docutils.readers.standalone
 28 | from docutils import nodes, frontend, io
 29 | 
 30 | from pkg_resources import resource_string # pylint: disable=E0611
 31 | 
 32 | from epubmaker.lib.GutenbergGlobals import NS, xpath
 33 | from epubmaker.lib.Logger import info, debug, warn, error
 34 | from epubmaker.lib.MediaTypes import mediatypes as mt
 35 | 
 36 | from epubmaker import ParserFactory
 37 | from epubmaker.parsers import HTMLParser
 38 | 
 39 | from epubmaker.mydocutils import broken
 40 | from epubmaker.mydocutils import nodes as mynodes
 41 | from epubmaker.mydocutils.writers import xhtml1, epub2, xetex
 42 | 
 43 | from epubmaker.mydocutils.gutenberg import parsers as gutenberg_parsers
 44 | from epubmaker.mydocutils.gutenberg.writers import nroff as gutenberg_nroff
 45 | from epubmaker.CommonOptions import Options
 46 | 
 47 | options = Options()
 48 | 
 49 | mediatypes = (mt.rst, )
 50 | 
 51 | RE_EMACS_CHARSET = re.compile (r'-\*-.*coding:\s*(\S+)',  re.I)
 52 | 
 53 | class Parser (HTMLParser.Parser):
 54 |     """ Parse a ReStructured Text 
 55 | 
 56 |     and convert it to different xhtml flavours.
 57 | 
 58 |     """
 59 | 
 60 |     def __init__ (self):
 61 |         HTMLParser.Parser.__init__ (self)
 62 |         self.document1 = None
 63 | 
 64 | 
 65 |     def preprocess (self, charset):
 66 |         """ Insert pg header and footer. """
 67 |         
 68 |         return self.unicode_content ()
 69 | 
 70 | 
 71 |     def to_xhtml (self, html, base_url):
  72 |         html = html.replace (u'&nbsp;', u'\u00a0')
  73 |         html = html.replace (u'&mdash;', u'\u2014')
 74 | 
 75 |         outputfilename = os.path.join (options.outputdir, options.outputfile)
 76 |         debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'
 77 | 
 78 |         try:
 79 |             os.remove (debugfilename)
 80 |         except OSError:
 81 |             pass
 82 |         
 83 |         if options.verbose > 1:
 84 |             with open (debugfilename, 'w') as fp:
 85 |                 fp.write (html.encode ('utf-8'))
 86 | 
 87 |         try:
 88 |             xhtml = etree.fromstring (
 89 |                 html, 
 90 |                 lxml.html.XHTMLParser (),
 91 |                 base_url = base_url)                                           
 92 |         except etree.ParseError, what:
 93 |             error ("etree.fromstring says %s" % what)
 94 |             raise
 95 | 
 96 |         xhtml.make_links_absolute (base_url = base_url)
 97 | 
 98 |         return xhtml
 99 | 
100 | 
101 |     def rewrite_links (self, f):
102 |         """ Rewrite all links using the function f. """
103 | 
104 |         doc = self.document1
105 | 
106 |         if 'coverpage' in doc.meta_block:
107 |             coverpage = doc.meta_block['coverpage']
108 |             coverpage[0] = f (coverpage[0])
109 |         else:
110 |             for field in doc.traverse (nodes.field):
111 |                 field_name, field_body = field.children
112 |                 if field_name.astext () == 'coverpage':
113 |                     field_body[:] = nodes.paragraph ('', f (field_body.astext ()))
114 |                     break
115 | 
116 |         for node in doc.traverse (nodes.reference):
117 |             if 'uri' in node:
118 |                 node['uri'] = f (node['uri'])
119 | 
120 |         for node in doc.traverse (nodes.image):
121 |             if 'uri' in node:
122 |                 node['uri'] = f (node['uri'])
123 | 
124 |         for node in doc.traverse (nodes.pending):
125 |             # dropcap images
126 |             if 'image' in node.details:
127 |                 node.details['image'] = f (node.details['image'])
128 | 
129 | 
130 |     def iterlinks (self):
131 |         """ Grab links and images in RST. """
132 | 
133 |         debug ("RSTParser iterlinks want_images = %d" % self.options.want_images)
134 | 
135 |         doc = self.document1
136 | 
137 |         # return coverpage even in noimages build
138 |         if 'coverpage' in doc.meta_block:
139 |             coverpage = doc.meta_block['coverpage']
140 |             yield coverpage[0], {'tag': NS.xhtml.link, 
141 |                                  'type': 'image/jpeg;type=resource', 'rel': 'coverpage'}
142 |         else:
143 |             for field in doc.traverse (nodes.field):
144 |                 field_name, field_body = field.children
145 |                 if field_name.astext () == 'coverpage':
146 |                     yield field_body.astext (), {
147 |                         'tag': NS.xhtml.link, 
148 |                         'type': 'image/jpeg;type=resource', 
149 |                         'rel': 'coverpage'}
150 |                     break
151 | 
152 |         # need broken.png for no-images build
153 |         if not self.options.want_images:
154 |             yield (urlparse.urljoin (self.url, broken), 
155 |                    {'tag': NS.xhtml.img, 'type': 'image/png;type=resource', 'rel': 'broken'})
156 | 
157 |         for node in doc.traverse (nodes.reference):
158 |             if 'uri' in node:
159 |                 yield node['uri'], {'tag': NS.xhtml.a}
160 | 
161 |         if self.options.want_images:
162 |             for node in doc.traverse (nodes.image):
163 |                 if 'uri' in node:
164 |                     yield node['uri'], {'tag': NS.xhtml.img}
165 | 
166 |         if self.options.want_images:
167 |             for node in doc.traverse (nodes.pending):
168 |                 # dropcap images
169 |                 if 'image' in node.details:
170 |                     yield node.details['image'], {'tag': NS.xhtml.img}
171 | 
172 | 
173 |     def get_settings (self, components, defaults):
174 |         option_parser = frontend.OptionParser (
175 |             components = components,
176 |             defaults = defaults, 
177 |             read_config_files = 1)
178 |         return option_parser.get_default_values ()
179 | 
180 | 
181 |     def pre_parse (self):
182 |         """ Parse a RST file as link list. """
183 | 
184 |         debug ("RSTParser: Pre-parsing %s" % self.url)
185 | 
186 |         default_style = self.get_resource (
187 |             'mydocutils.parsers', 'default_style.rst').decode ('utf-8')
188 | 
189 |         source = io.StringInput (default_style + self.unicode_content ())
190 |         reader = docutils.readers.standalone.Reader ()
191 |         parser = gutenberg_parsers.Parser ()
192 | 
193 |         overrides = {
194 |             'get_resource': self.get_resource,
195 |             'get_image_size': self.get_image_size_from_parser,
196 |             'no_images': not self.options.want_images,
197 |             'base_url': self.url,
198 |             }
199 | 
200 |         doc = reader.read (
201 |             source, parser, self.get_settings ((reader, parser), overrides))
202 |         self.document1 = doc
203 | 
204 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
205 | 
206 |         debug ("RSTParser: Done pre-parsing %s" % self.url)
207 | 
208 | 
209 |     def _full_parse (self, writer, overrides):
210 |         """ Full parse from scratch. """
211 | 
212 |         debug ("RSTParser: Full-parsing %s" % self.url)
213 | 
214 |         default_style = self.get_resource (
215 |             'mydocutils.parsers', 'default_style.rst').decode ('utf-8')
216 | 
217 |         source = io.StringInput (default_style + self.unicode_content (), 
218 |                                  self.url, 'unicode')
219 |         reader = docutils.readers.standalone.Reader ()
220 |         parser = gutenberg_parsers.Parser ()
221 | 
222 |         doc = reader.read (
223 |             source, parser, 
224 |             self.get_settings ((reader, parser, writer), overrides))
225 |         self.document1 = doc
226 | 
227 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
228 | 
229 |         doc.transformer.populate_from_components ((source, reader, parser, writer))
230 |         doc.transformer.apply_transforms ()
231 |         debug ("RSTParser: Done full-parsing %s" % self.url)
232 | 
233 |         return doc
234 | 
235 | 
236 |     def _full_parse_2 (self, writer, destination, overrides):
237 |         """ Full parser from pickled doctree. 
238 | 
239 |         Doesn't work yet. It turned out pickling a doctree is much
240 |         harder than I thought. """
241 | 
242 |         debug ("Full-parsing %s" % self.url)
243 | 
244 |         source = io.StringInput (self.unicode_content ())
245 |         reader = docutils.readers.standalone.Reader ()
246 |         parser = gutenberg_parsers.Parser ()
247 | 
248 |         doc = reader.read (
249 |             source, parser, 
250 |             self.get_settings ((reader, parser, writer), overrides))
251 |         self.document1 = doc
252 | 
253 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
254 | 
255 |         # make it picklable
256 |         reporter = doc.reporter #  = None
257 |         # doc.reporter = None
258 |         transformer = doc.transformer
259 |         doc.settings = None
260 |         from docutils.parsers.rst.directives.html import MetaBody
261 | 
262 |         #for metanode in doc.traverse (MetaBody.meta):
263 |         for pending in doc.traverse (nodes.pending):
264 |             # pending.transform = None
265 |             # docutils' meta nodes aren't picklable because the class is nested
266 |             # in pending['nodes']
267 |             if 'nodes' in pending.details: 
268 |                 if isinstance (pending.details['nodes'][0], MetaBody.meta):
269 |                     pending.details['nodes'][0].__class__ = mynodes.meta
270 |         import cPickle as pickle
271 |         pickled = pickle.dumps (doc)
272 | 
273 |         doc = pickle.loads (pickled)
274 | 
275 |         #doc.transformer.populate_from_components (
276 |         #    (source, reader, parser, writer))
277 | 
278 |         doc.transformer = transformer
279 |         doc.reporter = reporter
280 |         doc.settings = self.get_settings ((reader, parser, writer), overrides)
281 | 
282 |         doc.transformer.apply_transforms ()
283 | 
284 |         return writer.write (doc, destination)
285 | 
286 | 
287 |     def rst2nroff (self, charset = 'utf-8'):
288 |         """ Convert RST to nroff. """
289 | 
290 |         writer = gutenberg_nroff.Writer ()
291 |         destination = io.StringOutput (encoding = 'unicode')
292 | 
293 |         overrides = {
294 |             'doctitle_xform': 1,
295 |             'sectsubtitle_xform': 1,
296 |             'footnote_references': 'superscript',
297 |             'compact_lists': 1,
298 |             'compact_simple': 1,
299 |             'page_numbers': 1,
300 |             'no_images': True,
301 |             'get_resource': self.get_resource,
302 |             'format': options.type,
303 |             'encoding': charset,
304 |             'base_url': self.url,
305 |             }
306 |    
307 |         doc = self._full_parse (writer, overrides)
308 |         return writer.write (doc, destination)
309 | 
310 | 
311 |     def rst2xetex (self):
312 |         """ Convert RST to xetex. """
313 | 
314 |         writer = xetex.Writer ()
315 |         destination = io.StringOutput (encoding = 'unicode')
316 | 
317 |         overrides = {
318 |             'doctitle_xform': 1,
319 |             'sectsubtitle_xform': 1,
320 |             'footnote_references': 'superscript',
321 |             'compact_lists': 1,
322 |             'compact_simple': 1,
323 |             'page_numbers': 1,
324 |             'format': options.type,
325 |             'encoding': 'utf-8',
326 |             'get_resource': self.get_resource,
327 |             'get_image_size': self.get_image_size_from_parser,
328 |             'no_images': not self.options.want_images,
329 |             'base_url': self.url,
330 |             }
331 | 
332 |         doc = self._full_parse (writer, overrides)
333 |         return writer.write (doc, destination)
334 | 
335 | 
336 |     def rst2htmlish (self, writer, more_overrides = {}):
337 | 
338 |         destination = io.StringOutput (encoding = 'unicode')
339 | 
340 |         overrides = {
341 |             'stylesheet': None,
342 |             'stylesheet_path': None,
343 |             'xml_declaration': 0,
344 |             'doctitle_xform': 1,
345 |             'initial_header_level': 2,
346 |             'sectsubtitle_xform': 1,
347 |             'footnote_references': 'superscript',
348 |             'page_numbers': 1,
349 |             'format': options.type,
350 |             'encoding': 'utf-8',
351 |             'get_resource': self.get_resource,
352 |             'get_image_size': self.get_image_size_from_parser,
353 |             'no_images': not self.options.want_images,
354 |             'base_url': self.url,
355 |             }
356 |         overrides.update (more_overrides)
357 | 
358 |         doc = self._full_parse (writer, overrides)
359 |         return writer.fixup_xhtml (self.to_xhtml (writer.write (doc, destination), self.url))
360 | 
361 | 
362 |     def rst2html (self):
363 |         """ Convert RST input to HTML output. """
364 |         return self.rst2htmlish (xhtml1.Writer ())
365 | 
366 | 
367 |     def rst2epub2 (self):
368 |         """ Convert RST input to HTML output with Epub2 tweaks. """
369 |         return self.rst2htmlish (epub2.Writer (), 
370 |                                  { 'toc_backlinks': 'none' })
371 | 
372 | 
373 |     def get_resource (self, package, resource):
374 |         return (resource_string ('epubmaker.' + package, resource))
375 | 
376 | 
377 |     def get_image_size_from_parser (self, uri):
378 |         # debug ("Getting image dimen for %s" % uri)
379 |         parser = ParserFactory.ParserFactory.create (uri, {})
380 |         parser.pre_parse ()
381 |         if hasattr (parser, 'get_image_dimen'):
382 |             return parser.get_image_dimen ()
383 |         return None
384 | 
385 | 
386 |     def get_charset_from_rstheader (self):
387 |         """ Parse text for hints about charset. """
388 |         # .. -*- coding: utf-8 -*-
389 |         
390 |         charset = None
391 |         rst = self.bytes_content ()
392 |         
393 |         match = RE_EMACS_CHARSET.search (rst)
394 |         if (match):
395 |             charset = match.group (1)
396 |             debug ('Got charset %s from emacs comment' % charset)
397 | 
398 |         return charset
399 | 
400 | 
401 |     def parse (self):
402 |         """ Dummy. Use rst2* instead. """
403 | 
404 |         debug ("Done parsing %s" % self.url)
405 | 
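
A small sketch (not from the source) of the emacs-style charset sniffing done
by get_charset_from_rstheader above, reusing the RE_EMACS_CHARSET pattern
defined at the top of this module:

    import re

    RE_EMACS_CHARSET = re.compile (r'-\*-.*coding:\s*(\S+)', re.I)

    match = RE_EMACS_CHARSET.search ('.. -*- coding: utf-8 -*-')
    if match:
        print match.group (1)   # -> 'utf-8'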


--------------------------------------------------------------------------------
/epubmaker/parsers/broken.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/parsers/broken.png


--------------------------------------------------------------------------------
/epubmaker/writers/HTMLWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLWriter.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Writes an HTML file
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os
 19 | import copy
 20 | 
 21 | from lxml import etree
 22 | from pkg_resources import resource_string # pylint: disable=E0611
 23 | 
 24 | import epubmaker.lib.GutenbergGlobals as gg
 25 | from epubmaker.lib.GutenbergGlobals import xpath
 26 | from epubmaker.lib.Logger import info, debug, error, exception
 27 | 
 28 | from epubmaker import writers
 29 | from epubmaker.CommonOptions import Options
 30 | 
 31 | options = Options()
 32 | 
 33 | 
 34 | class Writer (writers.HTMLishWriter):
 35 |     """ Class for writing HTML files. """
 36 | 
 37 | 
 38 |     def add_dublincore (self, tree):
 39 |         """ Add dublin core metadata to <head>. """
 40 |         source = gg.archive2files (
 41 |             self.options.ebook, self.options.candidate.filename)
 42 | 
 43 |         if hasattr (options.config, 'FILESDIR'):
 44 |             self.options.dc.source = source.replace (options.config.FILESDIR, options.config.PGURL)
 45 |         
 46 |         for head in xpath (tree, '//xhtml:head'):
 47 |             for e in self.options.dc.to_html ():
 48 |                 e.tail = '\n'
 49 |                 head.append (e)
 50 | 
 51 | 
 52 |     def build (self):
 53 |         """ Build HTML file. """
 54 | 
 55 |         htmlfilename = os.path.join (self.options.outputdir, 
 56 |                                      self.options.outputfile)
 57 |         try:
 58 |             os.remove (htmlfilename)
 59 |         except OSError:
 60 |             pass
 61 |                                      
 62 |         try:
 63 |             info ("Creating HTML file: %s" % htmlfilename)
 64 | 
 65 |             for p in self.spider.parsers:
 66 |                 # Do html only. The images were copied earlier by PicsDirWriter.
 67 | 
 68 |                 xhtml = None
 69 |                 if hasattr (p, 'rst2html'):
 70 |                     xhtml = p.rst2html ()
 71 |                 elif hasattr (p, 'xhtml'):
 72 |                     p.parse ()
 73 |                     xhtml = copy.deepcopy (p.xhtml)
 74 | 
 75 |                 if xhtml is not None:
 76 |                     self.make_links_relative (xhtml, p.url)
 77 | 
 78 |                     self.add_dublincore (xhtml)
 79 | 
 80 |                     # makes iphones zoom in
 81 |                     self.add_meta (xhtml, 'viewport', 'width=device-width')
 82 |                     self.add_meta_generator (xhtml)
 83 | 
 84 |                     # This writer has currently to deal only with RST
 85 |                     # input.  The RST writer has a workaround that
 86 |                     # avoids writing empty elements.  So we don't need
 87 |                     # the same ugly workaround as the EPUB writer,
 88 |                     # that has to deal with HTML input too.
 89 |                     html = etree.tostring (xhtml, 
 90 |                                            method = 'xml',
 91 |                                            doctype = gg.XHTML_DOCTYPE,
 92 |                                            encoding = 'utf-8', 
 93 |                                            pretty_print = True,
 94 |                                            xml_declaration = True)
 95 |                     
 96 |                     self.write_with_crlf (htmlfilename, html)
 97 | 
 98 |             # self.copy_aux_files (self.options.outputdir)
 99 |         
100 |             info ("Done HTML file: %s" % htmlfilename)
101 | 
102 |         except StandardError, what:
103 |             exception ("Error building HTML %s: %s" % (htmlfilename, what))
104 |             if os.access (htmlfilename, os.W_OK):
105 |                 os.remove (htmlfilename)
106 |             raise what
107 | 
108 | 
109 | 
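
A minimal sketch (not part of the source) of the lxml serialization call used
in Writer.build () above; the doctype string is a literal stand-in for
gg.XHTML_DOCTYPE, which is defined in GutenbergGlobals:

    from lxml import etree

    # stand-in for gg.XHTML_DOCTYPE
    XHTML_DOCTYPE = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                     '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

    root = etree.fromstring (
        '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Hello</p></body></html>')

    print etree.tostring (root,
                          method = 'xml',
                          doctype = XHTML_DOCTYPE,
                          encoding = 'utf-8',
                          pretty_print = True,
                          xml_declaration = True)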


--------------------------------------------------------------------------------
/epubmaker/writers/KindleWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | KindleWriter.py
  7 | 
  8 | Copyright 2009-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | import re
 15 | import os
 16 | import subprocess
 17 | 
 18 | from epubmaker.lib.Logger import info, debug, warn, error
 19 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 20 | from epubmaker.writers import EpubWriter
 21 | from epubmaker.CommonOptions import Options
 22 | 
 23 | options = Options()
 24 | 
 25 | 
 26 | class Writer (EpubWriter.Writer):
 27 |     """ Class for writing kindle files. """
 28 | 
 29 | 
 30 |     def parse (self, options):
 31 |         """ Standard parse. """
 32 |         self.setup (options)
 33 | 
 34 | 
 35 |     def build (self):
 36 |         """ Build kindle file. """
 37 | 
  38 |         # Feed the already-built epub file to kindlegen.  (The old approach
  39 |         # of building a simplified temporary epub is commented out below.)
 40 |         
 41 |         # Much unnecessary juggling of files here because
 42 |         # brain-dead kindlegen doesn't understand unix pipes
 43 |         # and can only output in current directory.
 44 |         # Furthermore we must not conflict with the filenames
 45 |         # of the other generated epub files.
 46 | 
 47 |         kindle_filename = self.options.outputfile
 48 |         epub_filename   = self.options.epub_filename
 49 | 
 50 |         # tmp_epub_filename = os.path.splitext (kindle_filename)[0] + '-kindlegen.epub'
 51 |         # 
 52 |         # debug ("Creating temp Epub file: %s" % os.path.join (
 53 |         #     self.options.outputdir, tmp_epub_filename))
 54 |         # 
 55 |         # # call EpubWriter to build temporary epub file
 56 |         # self.options.outputfile = tmp_epub_filename
 57 |         # EpubWriter.Writer.build (self)
 58 |         # self.options.outputfile = kindle_filename
 59 |         
 60 |         info ("Creating Kindle file: %s" % os.path.join (
 61 |             self.options.outputdir, kindle_filename))
 62 |         info ("            ... from: %s" % os.path.join (
 63 |             self.options.outputdir, epub_filename))
 64 | 
 65 |         try:
 66 |             cwd = os.getcwd ()
 67 |             os.chdir (self.options.outputdir)
 68 | 
 69 |             kindlegen = subprocess.Popen (
 70 |                 [options.config.MOBIGEN, '-o', os.path.basename (kindle_filename), epub_filename],
 71 |                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 72 | 
 73 |         except OSError, what:
 74 |             os.chdir (cwd)
 75 |             error ("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
 76 |             raise SkipOutputFormat
 77 |         
 78 |         (stdout, stderr) = kindlegen.communicate ('')
 79 | 
 80 |         # try:
 81 |         #     # if self.options.verbose < 2:
 82 |         #     #     os.remove (tmp_epub_filename)
 83 |         #     os.remove (kindle_filename)
 84 |         # except OSError:
 85 |         #     pass
 86 |         #
 87 |         # tmp_mobi_filename = os.path.splitext (tmp_epub_filename)[0] + '.mobi'
 88 |         # os.rename (tmp_mobi_filename, kindle_filename)
 89 | 
 90 |         os.chdir (cwd)
 91 | 
 92 |         regex = re.compile ('^(\w+)\(prcgen\):')
 93 | 
 94 |         if kindlegen.returncode > 0:
 95 |             # pylint: disable=E1103
 96 |             info (stderr.rstrip ())
 97 |             msg = stdout.rstrip ()
 98 |             for line in msg.splitlines ():
 99 |                 match = regex.match (line)
100 |                 if match:
101 |                     sline = regex.sub ("", line)
102 |                     g = match.group (1).lower ()
103 |                     if g == 'info':
104 |                         if sline == 'MOBI File generated with WARNINGS!':
105 |                             # we knew that already
106 |                             continue
107 |                         # info ("kindlegen: %s" % sline)
108 |                     elif g == 'warning':
109 |                         if sline.startswith ('Cover is too small'):
110 |                             continue
111 |                         if sline == 'Cover not specified':
112 |                             continue
113 |                         warn ("kindlegen: %s" % sline)
114 |                     elif g == 'error':
115 |                         error ("kindlegen: %s" % sline)
116 |                     else:
117 |                         error (line)
118 | 
119 |         info ("Done Kindle file: %s" % os.path.join (
120 |             self.options.outputdir, kindle_filename))
121 | 
122 | 
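
A small sketch (not from the source) of how the prcgen regex above classifies
kindlegen output lines; the sample line is invented for illustration:

    import re

    regex = re.compile (r'^(\w+)\(prcgen\):')

    line = 'Warning(prcgen):Cover not specified'   # invented sample line
    match = regex.match (line)
    if match:
        print match.group (1).lower ()   # -> 'warning'
        print regex.sub ('', line)       # -> 'Cover not specified'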


--------------------------------------------------------------------------------
/epubmaker/writers/PDFWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | PDFWriter.py
  6 | 
  7 | Copyright 2011 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Convert RST to PDF.
 12 | 
 13 | """
 14 | 
 15 | from __future__ import with_statement
 16 | 
 17 | import os
 18 | import subprocess
 19 | 
 20 | from epubmaker.lib.Logger import debug, info, warn, error
 21 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 22 | 
 23 | from epubmaker import ParserFactory
 24 | from epubmaker import writers
 25 | from epubmaker.CommonOptions import Options
 26 | 
 27 | options = Options()
 28 | 
 29 | class Writer (writers.BaseWriter):
 30 |     """ Class to write PDF. """
 31 | 
 32 |     def build (self):
 33 |         """ Build PDF file. """
 34 | 
 35 |         inputfilename  = self.options.candidate.filename
 36 |         outputfilename = os.path.join (self.options.outputdir, self.options.outputfile)
 37 | 
 38 |         debug ("Inputfile: %s" % inputfilename)
 39 |         info ("Creating PDF file: %s" % outputfilename)
 40 | 
 41 |         parser = ParserFactory.ParserFactory.create (inputfilename,
 42 |                                                      self.options.candidate.mediatype)
 43 |         parser.options = self.options
 44 | 
 45 |         if not hasattr (parser, 'rst2xetex'):
 46 |             error ('PDFWriter can only work on a RSTParser.')
 47 |             raise SkipOutputFormat
 48 |         
 49 |         # Brain-dead xetex doesn't understand unix pipes
 50 |         # so we have to write a temp file
 51 |         
 52 |         texfilename = os.path.splitext (outputfilename)[0] + '.tex'
 53 |         auxfilename = os.path.splitext (outputfilename)[0] + '.aux'
 54 |         logfilename = os.path.splitext (outputfilename)[0] + '.log'
 55 | 
 56 |         try:
 57 |             os.remove (auxfilename)
 58 |         except OSError:
 59 |             pass
 60 |         
 61 |         tex = parser.rst2xetex ()
 62 |         with open (texfilename, 'w') as fp:
 63 |             fp.write (tex.encode ('utf-8'))
 64 | 
 65 |         try:
 66 |             cwd = os.getcwd ()
 67 |             os.chdir (self.options.outputdir)
 68 | 
 69 |             _xetex = subprocess.Popen ([options.config.XELATEX,
 70 |                                         "-output-directory", self.options.outputdir,
 71 |                                         "-interaction", "nonstopmode",
 72 |                                         texfilename],
 73 |                                        stdin = subprocess.PIPE, 
 74 |                                        stdout = subprocess.PIPE, 
 75 |                                        stderr = subprocess.PIPE)
 76 |         except OSError, what:
 77 |             os.chdir (cwd)
 78 |             error ("PDFWriter: %s %s" % (options.config.XELATEX, what))
 79 |             raise SkipOutputFormat
 80 | 
 81 |         (dummy_stdout, dummy_stderr) = _xetex.communicate ()
 82 |         
 83 |         with open (logfilename) as fp:
 84 |             for line in fp:
 85 |                 line = line.strip ()
 86 |                 if 'Error:' in line:
 87 |                     error ("xetex: %s" % line)
 88 |                 if options.verbose >= 1:
 89 |                     if 'Warning:' in line:
 90 |                         warn ("xetex: %s" % line)
 91 | 
 92 |         if options.verbose < 2:
 93 |             try:
 94 |                 os.remove (texfilename)
 95 |                 os.remove (logfilename)
 96 |                 os.remove (auxfilename)
 97 |             except OSError:
 98 |                 pass
 99 | 
100 |         os.chdir (cwd)
101 | 
102 |         info ("Done PDF file: %s" % outputfilename)
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/epubmaker/writers/PicsDirWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | PicsDirWriter.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Copies pics into local directory. Needed for HTML and Xetex.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import os
19 | import copy
20 | 
21 | from lxml import etree
22 | from pkg_resources import resource_string # pylint: disable=E0611
23 | 
24 | import epubmaker.lib.GutenbergGlobals as gg
25 | from epubmaker.lib.GutenbergGlobals import xpath
26 | from epubmaker.lib.Logger import info, debug, error, exception
27 | 
28 | from epubmaker import writers
29 | 
30 | 
31 | class Writer (writers.BaseWriter):
32 |     """ Writes Pics directory. """
33 | 
34 | 
35 |     # def copy_aux_files_lowlevel (self, dest_dir):
36 |     #     """ Copy image files to dest_dir. """
37 |         
38 |     #     for src_uri in self.get_aux_file_list ():
39 |     #         fn_dest = gg.make_url_relative (self.options.base_url, src_uri)
40 |     #         fn_dest = os.path.join (dest_dir, fn_dest)
41 |             
42 |     #         if gg.is_same_path (src_uri, fn_dest):
43 |     #             debug ('Not copying %s to %s: same file' % (src_uri, fn_dest))
44 |     #             continue
45 |     #         debug ('Copying %s to %s' % (src_uri, fn_dest))
46 | 
47 |     #         fn_dest = gg.normalize_path (fn_dest)
48 |     #         gg.mkdir_for_filename (fn_dest)
49 |     #         try:
50 |     #             fp_src = urllib.urlopen (src_uri)
51 |     #             if fp_src:
52 |     #                 with open (fn_dest, 'wb') as fp_dest:
53 |     #                     fp_dest.write (fp_src.read ())
54 |     #         except IOError, what:
55 |     #             error ('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))
56 | 
57 | 
58 |     def copy_aux_files (self, dest_dir):
59 |         """ Copy image files to dest_dir. Use image data cached in parsers. """
60 | 
61 |         for p in self.spider.parsers:
62 |             if hasattr (p, 'resize_image'):
63 |                 src_uri = p.url
64 |                 fn_dest = gg.make_url_relative (self.options.base_url, src_uri)
65 |                 fn_dest = os.path.join (dest_dir, fn_dest)
66 | 
67 |                 if gg.is_same_path (src_uri, fn_dest):
68 |                     debug ('Not copying %s to %s: same file' % (src_uri, fn_dest))
69 |                     continue
70 |                 debug ('Copying %s to %s' % (src_uri, fn_dest))
71 | 
72 |                 fn_dest = gg.normalize_path (fn_dest)
73 |                 gg.mkdir_for_filename (fn_dest)
74 |                 try:
75 |                     with open (fn_dest, 'wb') as fp_dest:
76 |                         fp_dest.write (p.serialize ())
77 |                 except IOError as what:
78 |                     error ('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))
79 | 
80 | 
81 |                     
82 |     def build (self):
83 |         """ Build Pics file. """
84 | 
85 |         dest_dir = self.options.outputdir
86 | 
87 |         info ("Creating Pics directory in: %s" % dest_dir)
88 | 
89 |         self.copy_aux_files (dest_dir)
90 | 
91 |         info ("Done Pics directory in: %s" % dest_dir)
92 | 
93 | 
94 | 
95 | 
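The copy_aux_files () method above duck-types image parsers: anything in self.spider.parsers that has a resize_image attribute is treated as an image whose cached bytes come from serialize (). A minimal, self-contained sketch of that pattern, using a hypothetical stand-in object instead of the real Spider and parser classes:

    import os

    class FakeImageParser (object):
        """ Hypothetical stand-in for a parser holding cached image data. """

        def __init__ (self, url, data):
            self.url = url
            self.data = data
            self.resize_image = True   # its mere presence marks an image parser

        def serialize (self):
            return self.data

    def copy_cached_images (parsers, dest_dir):
        """ Write every image parser's cached bytes below dest_dir. """
        for p in parsers:
            if hasattr (p, 'resize_image'):
                fn_dest = os.path.join (dest_dir, os.path.basename (p.url))
                with open (fn_dest, 'wb') as fp:
                    fp.write (p.serialize ())

    copy_cached_images (
        [FakeImageParser ('http://example.com/img/cover.png', b'\x89PNG...')], '.')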


--------------------------------------------------------------------------------
/epubmaker/writers/RSTWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | RSTWriter.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Build an RST file. This is just the master RST with the PG license mixed in.
12 | 
13 | """
14 | 
15 | from __future__ import with_statement
16 | 
17 | import os
18 | 
19 | from epubmaker.lib.Logger import debug, info, error
20 | from epubmaker import ParserFactory
21 | from epubmaker import writers
22 | 
23 | class Writer (writers.BaseWriter):
24 |     """ Class to write a reStructuredText. """
25 | 
26 |     def build (self):
27 |         """ Build RST file. """
28 | 
29 |         filename = os.path.join (self.options.outputdir, self.options.outputfile)
30 | 
31 |         info ("Creating RST file: %s" % filename)
32 | 
33 |         parser = ParserFactory.ParserFactory.create (self.options.candidate.filename,
34 |                                                      self.options.candidate.mediatype)
35 |         parser.options = self.options
36 | 
37 |         if not hasattr (parser, 'rst2nroff'):
38 |             error ('RSTWriter can only work on an RSTParser.')
39 |             return
40 |         
41 |         data = parser.preprocess ('utf-8').encode ('utf-8')
42 | 
43 |         self.write_with_crlf (filename, data)
44 |         
45 |         info ("Done RST file: %s" % filename)
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/epubmaker/writers/TxtWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | TxtWriter.py
  6 | 
  7 | Copyright 2009 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Build a UTF-8-encoded PG plain text file. This is just the plain text
 12 | version recoded into UTF-8.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os
 19 | import subprocess
 20 | 
 21 | from epubmaker.lib.Logger import debug, info, warn, error
 22 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 23 | 
 24 | from epubmaker import ParserFactory
 25 | from epubmaker import writers
 26 | from epubmaker.CommonOptions import Options
 27 | 
 28 | options = Options()
 29 | 
 30 | # map some not-widely-supported characters to more common ones
 31 | u2u = {
 32 |     0x2010: u'-',  # unicode HYPHEN to HYPHEN-MINUS. Many Windows fonts lack this.
 33 |     }
 34 | 
 35 | class Writer (writers.BaseWriter):
 36 |     """ Class to write PG plain text. """
 37 | 
 38 |     def groff (self, nroff, encoding = 'utf-8'):
 39 |         """ Process thru groff.
 40 | 
 41 |         Takes and returns unicode strings!
 42 | 
 43 |         """
 44 | 
 45 |         device = { 'utf-8': 'utf8',
 46 |                    'iso-8859-1': 'latin1',
 47 |                    'us-ascii': 'ascii' }[encoding]
 48 |         
 49 |         nroff = nroff.encode (encoding)
 50 |         nrofffilename = os.path.join (
 51 |             self.options.outputdir,
 52 |             os.path.splitext (self.options.outputfile)[0] + '.nroff')
 53 | 
 54 |         # write nroff file for debugging
 55 |         if options.verbose >= 2:
 56 |             with open (nrofffilename, 'w') as fp:
 57 |                 fp.write (nroff)
 58 |         else:
 59 |             try:
 60 |                 # remove debug files from previous runs
 61 |                 os.remove (nrofffilename)
 62 |             except OSError:
 63 |                 pass
 64 | 
 65 |         # call groff
 66 |         try:
 67 |             _groff = subprocess.Popen ([options.config.GROFF, 
 68 |                                        "-t",             # preprocess with tbl
 69 |                                        "-K", device,     # input encoding
 70 |                                        "-T", device],    # output device
 71 |                                       stdin = subprocess.PIPE, 
 72 |                                       stdout = subprocess.PIPE, 
 73 |                                       stderr = subprocess.PIPE)
 74 |         except OSError:
 75 |             error ("TxtWriter: executable not found: %s" % options.config.GROFF)
 76 |             raise SkipOutputFormat
 77 | 
 78 |         (txt, stderr) = _groff.communicate (nroff)
 79 |         
 80 |         # pylint: disable=E1103
 81 |         for line in stderr.splitlines ():
 82 |             line = line.strip ()
 83 |             if 'error' in line:
 84 |                 error ("groff: %s" % line)
 85 |             elif 'warn' in line:
 86 |                 if options.verbose >= 1:
 87 |                     warn ("groff: %s" % line)
 88 | 
 89 |         txt = txt.decode (encoding)
 90 |         return txt.translate (u2u) # fix nroff idiosyncrasies
 91 | 
 92 | 
 93 |     def build (self):
 94 |         """ Build TXT file. """
 95 | 
 96 |         filename = os.path.join (self.options.outputdir, self.options.outputfile)
 97 | 
 98 |         encoding = options.subtype.strip ('.')
 99 | 
100 |         info ("Creating plain text file: %s" % filename)
101 | 
102 |         parser = ParserFactory.ParserFactory.create (self.options.candidate.filename,
103 |                                                      self.options.candidate.mediatype)
104 |         parser.options = self.options
105 | 
106 |         if hasattr (parser, 'rst2nroff'):
107 |             data = self.groff (parser.rst2nroff (encoding), encoding)
108 |         else:
109 |             data = parser.unicode_content ()
110 | 
111 |         data = data.encode ('utf_8_sig' if encoding == 'utf-8' else encoding, 'unitame')
112 | 
113 |         self.write_with_crlf (filename, data)
114 |             
115 |         info ("Done plain text file: %s" % filename)
116 | 
117 | 
118 | 
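The u2u table above is fed to unicode's translate (): keys are code points (integers), values are replacement strings, so further problem characters can be mapped the same way. A small sketch of how such a table behaves; the U+2011 entry is only an illustrative addition, not part of TxtWriter:

    # same shape as TxtWriter's u2u: code point -> replacement string
    u2u = {
        0x2010: u'-',   # HYPHEN -> HYPHEN-MINUS
        0x2011: u'-',   # NON-BREAKING HYPHEN -> HYPHEN-MINUS (illustrative only)
        }

    text = u'non\u2011breaking and plain\u2010hyphen'
    print (text.translate (u2u))   # prints: non-breaking and plain-hyphen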


--------------------------------------------------------------------------------
/epubmaker/writers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Writer package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Base classes for *Writer modules. (EpubWriter, PluckerWriter, ...)
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | from functools import partial
 19 | import os.path
 20 | import urllib
 21 | 
 22 | from lxml import etree
 23 | from lxml.builder import ElementMaker
 24 | 
 25 | from epubmaker.lib.Logger import debug, error
 26 | import epubmaker.lib.GutenbergGlobals as gg
 27 | from epubmaker.lib import MediaTypes
 28 | 
 29 | from epubmaker import ParserFactory
 30 | from epubmaker import Spider
 31 | from epubmaker.Version import VERSION, GENERATOR
 32 | 
 33 | 
 34 | class BaseWriter (object):
 35 |     """
 36 |     Base class for EpubWriter, PluckerWriter, ... 
 37 | 
 38 |     Also used as a /dev/null writer for debugging.
 39 | 
 40 |     """
 41 | 
 42 |     def __init__ (self):
 43 |         self.options = None
 44 |         self.spider = None
 45 | 
 46 | 
 47 |     def setup (self, options):
 48 |         """ override this in a real writer
 49 | 
 50 |         put computationally cheap setup stuff in here,
 51 |         
 52 |         """
 53 | 
 54 |         if not options.include_mediatypes:
 55 |             options.include_mediatypes = (
 56 |                 MediaTypes.TEXT_MEDIATYPES |
 57 |                 MediaTypes.AUX_MEDIATYPES |
 58 |                 MediaTypes.IMAGE_MEDIATYPES
 59 |                 )
 60 | 
 61 |         self.options = options
 62 | 
 63 | 
 64 |     def parse (self, options):
 65 |         """ Standard parse. """
 66 |         self.setup (options)
 67 | 
 68 |         if self.spider is None:
 69 |             self.spider = Spider.Spider ()
 70 | 
 71 |         self.spider.parse (options.candidate.filename, 
 72 |                            options.candidate.mediatype,
 73 |                            options)
 74 | 
 75 |         options.candidate.filename = self.spider.redirect (options.candidate.filename)
 76 |         options.base_url = options.candidate.filename
 77 | 
 78 | 
 79 |     def build (self):
 80 |         """ override this in a real writer """
 81 |         pass
 82 | 
 83 | 
 84 |     @staticmethod
 85 |     def write_with_crlf (filename, data):
 86 |         """ Write data to filename, converting line ends to CRLF (the PG standard). """
 87 |         data = '\r\n'.join (data.splitlines ()) + '\r\n'
 88 |         
 89 |         # open binary so windows doesn't add another \r
 90 |         with open (filename, 'wb') as fp:
 91 |             fp.write (data)
 92 |             
 93 | 
 94 |     def validate (self): # pylint: disable=R0201
 95 |         """ Validate the output with some (external) tool.
 96 | 
 97 |         Override this in a real writer.
 98 | 
 99 |         """
100 |         return 0
101 | 
102 | 
103 |     def sync (self):
104 |         """  Override this if you need to sync before program exit. """
105 |         pass
106 | 
107 | 
108 |     def make_links_relative (self, xhtml, base_url):
109 |         """ Make absolute links in xhtml relative to base_url. """
110 | 
111 |         debug ("Making links relative to: %s" % base_url)
112 |         xhtml.rewrite_links (partial (gg.make_url_relative, base_url))
113 | 
114 | 
115 |     def get_aux_file_list (self):
116 |         """ Iterate over image files. Return absolute urls. """
117 | 
118 |         for p in self.spider.parsers:
119 |             if hasattr (p, 'resize_image'):
120 |                 yield p.url
121 | 
122 | 
123 | em = ElementMaker (namespace = str (gg.NS.xhtml),
124 |                    nsmap = { None: str (gg.NS.xhtml) })
125 | 
126 | 
127 | class HTMLishWriter (BaseWriter):
128 |     """ Base class for writers with HTMLish contents. """
129 | 
130 |     @staticmethod
131 |     def add_class (elem, class_):
132 |         """ Add a class to html element. """
133 | 
134 |         classes = elem.get ('class', '').split ()
135 |         classes.append (class_)
136 |         elem.set ('class', ' '.join (classes))
137 | 
138 | 
139 |     @staticmethod
140 |     def add_meta (xhtml, name, content):
141 |         """ Add a meta tag. """
142 |         
143 |         for head in gg.xpath (xhtml, '//xhtml:head'):
144 |             meta = em.meta (name = name, content = content)
145 |             meta.tail = '\n'
146 |             head.append (meta)
147 |         
148 | 
149 |     @staticmethod
150 |     def add_meta_generator (xhtml):
151 |         """ Add our piss mark. """
152 | 
153 |         HTMLishWriter.add_meta (xhtml, 'generator', GENERATOR % VERSION)
154 | 
155 | 
156 |     @staticmethod
157 |     def add_internal_css (xhtml, css_as_string):
158 |         """ Add internal stylesheet to html. """
159 |         
160 |         if css_as_string and xhtml is not None:
161 |             css_as_string = '\n' + css_as_string.strip (' \n') + '\n'
162 |             for head in gg.xpath (xhtml, '//xhtml:head'):
163 |                 style = em.style (css_as_string, type = 'text/css')
164 |                 style.tail = '\n'
165 |                 head.append (style)
166 | 
167 | 
168 |     def add_external_css (self, xhtml, css_as_string, url):
169 |         """ Add external stylesheet to html. """
170 |         
171 |         if css_as_string:
172 |             p = ParserFactory.ParserFactory.get ('text/css')
173 |             p.parse_string (css_as_string)
174 |             p.url = url
175 |             self.spider.parsers.append (p)
176 |             
177 |         if xhtml is not None:
178 |             for head in gg.xpath (xhtml, '//xhtml:head'):
179 |                 link = em.link (href = url, rel = 'stylesheet', type = 'text/css')
180 |                 link.tail = '\n'
181 |                 head.append (link)
182 | 
183 | 
184 | 
185 | 
186 | 
187 |     
188 | 
189 | 
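BaseWriter spells out the contract the concrete writers follow: parse () runs the Spider over the candidate file, build () produces the output, and write_with_crlf () applies the PG CRLF convention. A minimal sketch of a new writer against that contract; the output format (a plain list of spidered URLs) is made up for illustration:

    import os

    from epubmaker import writers
    from epubmaker.lib.Logger import info

    class Writer (writers.BaseWriter):
        """ Hypothetical writer that dumps the list of spidered URLs. """

        def build (self):
            """ Build URL list file. """

            filename = os.path.join (self.options.outputdir, self.options.outputfile)
            info ("Creating URL list: %s" % filename)

            data = '\n'.join (p.url for p in self.spider.parsers)
            self.write_with_crlf (filename, data)

            info ("Done URL list: %s" % filename)

As with the real writers, the caller is expected to run parse (options) before build (), so that self.spider and self.options are populated.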


--------------------------------------------------------------------------------
/epubmaker/writers/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/writers/cover.jpg


--------------------------------------------------------------------------------
/scripts/epubmaker:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | epubmaker script
 7 | 
 8 | Copyright 2014 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This script starts epubmaker.
13 | 
14 | """
15 | 
16 | from epubmaker import EpubMaker
17 | 
18 | EpubMaker.main ()
19 | 
20 | 


--------------------------------------------------------------------------------
/scripts/rhyme_compiler:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | rhyme_compiler.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This module produces a dbm file of rhyme stems.
13 | 
14 | We use a very naive concept of rhyme: we preprocess the 'CMU
15 | Pronouncing Dictionary' (found at
16 | http://www.speech.cs.cmu.edu/cgi-bin/cmudict) and extract the phonemes
17 | for each word from the last stressed one to the end of the word.
18 | 
19 | The result is stored in cmudict.db hashed by word.
20 | 
21 | To compile:
22 | 
23 | $ ./rhyme_compiler.py cmudict.0.7a
24 | 
25 | 
26 | """
27 | 
28 | import fileinput
29 | import re
30 | import gdbm
31 | 
32 | dbm = gdbm.open ('cmudict.db', 'nf')
33 | 
34 | RE_STRESSED = re.compile ('[a-z]+[12][^12]*$')
35 | 
36 | # two example lines from cmudict
37 | #
38 | # PRONUNCIATION  P R OW0 N AH2 N S IY0 EY1 SH AH0 N
39 | # PRONUNCIATION(1)  P R AH0 N AH2 N S IY0 EY1 SH AH0 N
40 | 
41 | for line in fileinput.input (openhook = fileinput.hook_encoded ("iso-8859-1")):
42 |     if line.startswith (';'):
43 |         continue
44 | 
45 |     word, dummy_sep, phonemes = line.lower ().partition ('  ')
46 | 
47 |     m = RE_STRESSED.search (phonemes)
48 |     if m:
49 |         phoneme = re.sub (r'[ 012]+', '-', m.group (0)) # replace stress digits and spaces with '-'
50 |         dbm[word.encode ('utf-8')] = phoneme.encode ('utf-8')
51 | 
52 |         # print "%s %s\n" % (word, dbm[word])
53 | 
54 | dbm.sync ()
55 | dbm.reorganize ()
56 | dbm.close ()
57 | 
58 | 
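Since the script stores one rhyme stem per word, two words rhyme (in this naive sense) exactly when their stored stems compare equal. A sketch of a lookup against the cmudict.db produced above, assuming it sits in the current directory:

    import gdbm

    dbm = gdbm.open ('cmudict.db', 'r')

    def rhyme_stem (word):
        """ Return the stored stem for word, or None if the word is unknown. """
        try:
            return dbm[word.lower ().encode ('utf-8')]
        except KeyError:
            return None

    def rhymes (word1, word2):
        """ True if both words are known and share the same rhyme stem. """
        s1, s2 = rhyme_stem (word1), rhyme_stem (word2)
        return s1 is not None and s1 == s2

    print (rhymes ('nation', 'station'))

    dbm.close ()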


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | 
3 | [bdist_wininst]
4 | plat-name: win32
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # pypi epubmaker setup
 3 | #
 4 | 
 5 | from setuptools import setup
 6 | from setup_inc import *
 7 | 
 8 | setup (
 9 |     name = 'epubmaker',
10 |     version = VERSION,
11 |     install_requires = install_requires,
12 |     package_dir  = package_dir,
13 |     packages     = pypi_packages,
14 |     py_modules   = pypi_py_modules,
15 |     package_data = pypi_package_data,
16 |     scripts      = pypi_scripts,
17 |     data_files   = pypi_data_files,
18 | 
19 |     # metadata for upload to PyPI
20 | 
21 |     author = author,
22 |     author_email = author_email,
23 |     description = description,
24 |     long_description = long_description,
25 |     license = license,
26 |     keywords = keywords,
27 |     url = url,
28 |     classifiers = classifiers,
29 |     platforms = platforms,
30 | )
31 | 


--------------------------------------------------------------------------------
/setup_inc.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # epubmaker common setup all flavors
  3 | #
  4 | 
  5 | VERSION = '0.3.26'
  6 | 
  7 | package_dir = {
  8 |     'epubmaker': 'epubmaker',
  9 |     }
 10 | 
 11 | install_requires = [
 12 |     'roman',
 13 |     'docutils >= 0.8.1, <0.13',
 14 |     'lxml >= 2.3',
 15 |     'cssutils >= 0.9.8a1',
 16 |     'pillow',
 17 |     ]
 18 | 
 19 | 
 20 | pypi_packages = [
 21 |     'epubmaker.parsers',
 22 |     'epubmaker.packagers',
 23 |     'epubmaker.writers',
 24 |     'epubmaker.mydocutils',
 25 |     'epubmaker.mydocutils.parsers',
 26 |     'epubmaker.mydocutils.transforms',
 27 |     'epubmaker.mydocutils.writers',
 28 |     'epubmaker.mydocutils.gutenberg',
 29 |     'epubmaker.mydocutils.gutenberg.parsers',
 30 |     'epubmaker.mydocutils.gutenberg.transforms',
 31 |     'epubmaker.mydocutils.gutenberg.writers',
 32 |     ]
 33 | 
 34 | ibiblio_packages = pypi_packages + [
 35 |     'epubmaker',
 36 |     'epubmaker.lib',
 37 |     'epubmaker.writers.ibiblio',
 38 |     ]
 39 | 
 40 | pypi_py_modules = [
 41 |     'epubmaker.CommonOptions',
 42 |     'epubmaker.EpubMaker',
 43 |     'epubmaker.HTMLChunker',
 44 |     'epubmaker.ParserFactory',
 45 |     'epubmaker.Spider',
 46 |     'epubmaker.Unitame',
 47 |     'epubmaker.UnitameData',
 48 |     'epubmaker.Version',
 49 | 
 50 |     'epubmaker.lib.DublinCore',
 51 |     'epubmaker.lib.GutenbergGlobals',
 52 |     'epubmaker.lib.Logger',
 53 |     'epubmaker.lib.MediaTypes',
 54 | 
 55 |     'epubmaker.WriterFactory',
 56 |     ]
 57 | 
 58 | pypi_package_data = {
 59 |     'epubmaker.parsers': ['broken.png'],
 60 |     'epubmaker.writers': ['cover.jpg'],
 61 |     'epubmaker.mydocutils.parsers': ['*.rst'],
 62 |     'epubmaker.mydocutils.writers': ['*.css'],
 63 |     'epubmaker.mydocutils.gutenberg.parsers': ['*.rst'],
 64 |     }
 65 | 
 66 | ibiblio_package_data = pypi_package_data
 67 | ibiblio_package_data.update ({
 68 |     'epubmaker.writers.ibiblio': ['qioo-skeleton.zip'],
 69 |     })
 70 | 
 71 | pypi_data_files = [
 72 |     ('', ['CHANGES', 'setup_inc.py']),
 73 |     ]
 74 | 
 75 | ibiblio_data_files = [
 76 |     ('epubmaker', ['CHANGES', 'setup_inc.py']),
 77 |     ]
 78 | 
 79 | pypi_scripts = [
 80 |     'scripts/epubmaker',
 81 |     'scripts/rhyme_compiler',
 82 |     ]
 83 | 
 84 | ibiblio_scripts = pypi_scripts + [
 85 |     'scripts/makepub',
 86 |     'scripts/convert_unitame',
 87 |     'scripts/update_facebook_auth',
 88 |     ]
 89 | 
 90 | # metadata for upload to PyPI
 91 | 
 92 | author = "Marcello Perathoner"
 93 | author_email = "webmaster@gutenberg.org"
 94 | description = "The Project Gutenberg tool to generate EPUBs and other ebook formats."
 95 | long_description = open ('README').read ()
 96 | license = "GPL v3"
 97 | keywords = "ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion"
 98 | url = "https://github.com/gitenberg-dev/pg-epubmaker"
 99 | 
100 | classifiers = [
101 |     "Topic :: Text Processing",
102 |     "License :: OSI Approved :: GNU General Public License (GPL)",
103 |     "Environment :: Console",
104 |     "Operating System :: OS Independent",
105 |     "Intended Audience :: Other Audience",
106 |     "Development Status :: 4 - Beta"
107 |     ]
108 | 
109 | platforms = 'OS-independent'
110 | 
111 | 
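setup_inc.py carries two flavors of the package lists: the pypi_* names that setup.py above plugs into setuptools, and ibiblio_* counterparts for the in-house ibiblio build, whose setup script (and the extra scripts it lists) is not part of this tree. A sketch of how such a script would presumably wire the ibiblio_* variables in, mirroring setup.py; this is an assumption, not the actual ibiblio setup:

    #
    # hypothetical ibiblio epubmaker setup (sketch only)
    #

    from setuptools import setup
    from setup_inc import *

    setup (
        name = 'epubmaker',
        version = VERSION,
        install_requires = install_requires,
        package_dir  = package_dir,
        packages     = ibiblio_packages,
        package_data = ibiblio_package_data,
        scripts      = ibiblio_scripts,
        data_files   = ibiblio_data_files,

        author = author,
        author_email = author_email,
        description = description,
        license = license,
        url = url,
    )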


--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
 1 | from lxml import etree
 2 | 
 3 | root = etree.fromstring ("""
 4 | <html xml:lang="en" lang="en" xmlns="http://www.w3.org/1999/xhtml">
 5 |   <body>
 6 |      <p>
 7 |        <span style="color: red"></span>black
 8 |      </p>
 9 |   </body>
10 | </html>
11 | """)
12 | 
13 | XHTML11_DOCTYPE = "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' \
14 | 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>"
15 | 
16 | print (etree.tostring (
17 |         root,
18 |         method = 'xml',
19 |         xml_declaration = True,
20 |         doctype = XHTML11_DOCTYPE,
21 |         encoding = 'utf-8', 
22 |         pretty_print = True))
23 | 


--------------------------------------------------------------------------------