├── .gitignore
├── CHANGES
├── LICENSE
├── PKG-INFO
├── README
├── epubmaker
    ├── CommonOptions.py
    ├── EpubMaker.py
    ├── HTMLChunker.py
    ├── ParserFactory.py
    ├── Spider.py
    ├── Unitame.py
    ├── UnitameData.py
    ├── Version.py
    ├── WriterFactory.py
    ├── __init__.py
    ├── lib
    │   ├── DublinCore.py
    │   ├── GutenbergGlobals.py
    │   ├── Logger.py
    │   ├── MediaTypes.py
    │   └── __init__.py
    ├── mydocutils
    │   ├── __init__.py
    │   ├── gutenberg
    │   │   ├── __init__.py
    │   │   ├── parsers
    │   │   │   ├── __init__.py
    │   │   │   ├── pg-footer.rst
    │   │   │   └── pg-header.rst
    │   │   ├── transforms
    │   │   │   └── __init__.py
    │   │   └── writers
    │   │       ├── __init__.py
    │   │       └── nroff.py
    │   ├── nodes.py
    │   ├── parsers
    │   │   ├── __init__.py
    │   │   └── default_style.rst
    │   ├── transforms
    │   │   ├── __init__.py
    │   │   └── parts.py
    │   └── writers
    │       ├── __init__.py
    │       ├── epub2.py
    │       ├── nroff.py
    │       ├── rst2all.css
    │       ├── rst2epub.css
    │       ├── rst2html.css
    │       ├── xetex.py
    │       └── xhtml1.py
    ├── packagers
    │   ├── GzipPackager.py
    │   ├── HTMLPackager.py
    │   ├── PDFPackager.py
    │   ├── PushPackager.py
    │   ├── RSTPackager.py
    │   ├── TxtPackager.py
    │   └── __init__.py
    ├── parsers
    │   ├── AuxParser.py
    │   ├── CSSParser.py
    │   ├── GutenbergTextParser.py
    │   ├── HTMLParser.py
    │   ├── ImageParser.py
    │   ├── RSTParser.py
    │   ├── __init__.py
    │   └── broken.png
    └── writers
        ├── EpubWriter.py
        ├── HTMLWriter.py
        ├── KindleWriter.py
        ├── PDFWriter.py
        ├── PicsDirWriter.py
        ├── RSTWriter.py
        ├── TxtWriter.py
        ├── __init__.py
        └── cover.jpg
├── scripts
    ├── epubmaker
    └── rhyme_compiler
├── setup.cfg
├── setup.py
├── setup_inc.py
└── test
    └── test.py

/.gitignore:
--------------------------------------------------------------------------------
 1 | # ignore the local copy of any logs
 2 | logs/*
 3 | 
 4 | # python ignores
 5 | *.pyc
 6 | *.db
 7 | *.coverage
 8 | */.ipynb_checkpoints/*
 9 | *.ipynb_checkpoints/*
10 | 
11 | # Python packaging
12 | .eggs*
13 | .env
14 | .tox*
15 | build*
16 | dist*
17 | epubmaker.egg-info
18 | log/log.txt
19 | *.log
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
  1 | 0.3.26 October 8, 2018
  2 | 
  3 | Don't fail on audio links
  4 | Don't unescape external hrefs
  5 | 
  6 | 0.3.25 September 20, 2018
  7 | 
  8 | Don't fail on with bad src
  9 | Use a borg class to effect an options global instead of patching builtins
 10 | Don't disable translations
 11 | Add --local-only option so that using depth>1 can be used for multi-file books
 12 | Running the code from source didn't work with the out of date practice of not using package name for directory name.
 13 | fix bug for no stylesheet
 14 | utf-8 is the encoding, not unicode
 15 | update contact info
 16 | 
 17 | 0.3.21 February 24, 2017
 18 | 
 19 | Add parameter to add and set the cover image.
 20 | Switch setup to setuptools to better manage dependencies, because docutils 0.13 breaks epubmaker.
 21 | No longer strip hyperlinks to external resources.
 22 | 
 23 | 
 24 | 0.3.20
 25 | 
 26 | Do not make special kindlegen epub anymore. Requires kindlegen 2.7+.
 27 | Better coverpage handling.
 28 | Works with docutils 0.11+.
 29 | 
 30 | 0.3.19
 31 | 
 32 | 0.3.19b6
 33 | 
 34 | Floats now support 'here'.
 35 | 
 36 | 0.3.19b5
 37 | 
 38 | Fix typo in license text.
 39 | Fix "strip_links" debug message crash.
 40 | Extend styles directive.
 41 | - Add display option to hide the element.
 42 | - Allow for negative matches.
 43 | Don't use \marginpar for page numbers in TeX.
 44 | 
 45 | 0.3.19b4
 46 | 
 47 | Style directive extended.
 48 | Now preserves all trailing whitespace except U+0020.
 49 | Added "table de matières" to auto toc detection.
 50 | Convert U+2015 to single hyphen in plain text.
 51 | 
 52 | 0.3.19b3
 53 | 
 54 | Fix KeyError on hrules and vrules.
 55 | Fix unescaped characters in html meta attribute values.
 56 | Fix default block image alignment.
 57 | Fix use numeric entities in xhtml writer.
 58 | 
 59 | 0.3.19b2
 60 | 
 61 | Fixed text-indent in page nos (made pagenos disappear in line blocks).
 62 | Fixed whitespace collapsing in <pre> nodes.
 63 | Fixed: honors newlines in metadata fields.
 64 | Internal fix: correct format name is: "txt.utf-8".
 65 | Can use docinfo in addition to meta directive.
 66 | 
 67 | 0.3.19b1
 68 | 
 69 | New formats: html.noimages and pdf.noimages.
 70 | No-image builds use a placeholder 'broken' image instead of nothing.
 71 | Figure directives without a filename create a placeholder 'broken' image.
 72 | New option :selector: in lof and lot directives for filtering.
 73 | Turn off italics with class no-italics (and bold with no-bold).
 74 | nbsp now works in ascii txt, soft hyphens now removed from ascii txt.
 75 | Insert line numbers with [ln 42] and [ln!42].
 76 | Works with kindlegen 2.0.
 77 | 
 78 | 0.3.18
 79 | 
 80 | Allow unicode line separator U+2028 as line feed.
 81 | Fix XetexWriter bug with tables without explicit width.
 82 | Add language support in XetexWriter.
 83 | Works with docutils 0.8
 84 | Support docutils-0.8-style :class: language-.
 85 | 
 86 | 0.3.17
 87 | 
 88 | Fix line height of large text.
 89 | Fix images with spaces in src attribute.
 90 | 
 91 | 0.3.16
 92 | 
 93 | Add image_dir to Xetex writer.
 94 | Use quotation environment instead of quote.
 95 | Don't automatically insert \frontmatter.
 96 | Page nos. for kindlegen 1.2.
 97 | Call kindlegen.
 98 | Integrate changes into PG environment.
 99 | 
100 | 0.3.15
101 | 
102 | Reduce vertical margin of images to 1 in TXT.
103 | Fixed link targets in NROFF, PDF.
104 | Report error on xetex errors.
105 | Escape characters in PDF info.
106 | 
107 | 0.3.14
108 | 
109 | Fixed crash on HTML comments in Kindle writer.
110 | 
111 | 0.3.13
112 | 
113 | Start on Kindle writer.
114 | Fix spurious space in PDF literal blocks with classes.
115 | Fix `flat´ TOC.
116 | Thin spaces between quotes made optional.
117 | 
118 | 0.3.12
119 | 
120 | Add more front- and backmatter classes.
121 | Insert thin space between quotes.
122 | Generated List of Tables.
123 | Generated List of Figures.
124 | Emit warning instead of error on groff warnings.
125 | Fix crash when last cell in row spans rows.
126 | Add option vertical-aligns for tables.
127 | Default width of image calculated assuming 980px window.
128 | Fix docutils indentation bug in poetry.
129 | 
130 | 0.3.11
131 | 
132 | Add option widths to tables.
133 | Add option aligns to tables.
134 | Add class norules for tables.
135 | Generate typographically correct tables.
136 | Don't overwrite images if src dir == working dir.
137 | 
138 | 0.3.10
139 | 
140 | Bug fixes.
141 | 
142 | 0.3.9
143 | 
144 | A different fix for figure and image centering on ADE.
145 |   (Calculate explicit left margin).
146 | More work on PDF (Xetex) writer.
147 | Added directives for pagination control.
148 | 
149 | 0.3.8
150 | 
151 | Fix empty poetry lines on ADE.
152 | Fix figure and image centering on ADE.
153 | Fix thoughtbreak centering on ADE.
154 | For push, zip RST into subdir with images.
155 | Start implementing PDF (Xetex) writer.
156 | 
157 | 0.3.7
158 | 
159 | Integrate changes into PG environment.
160 | Fix more CR/LF issues on windows.
161 | Fix cover image format conversion.
162 | Zips a pushable file for the WWers.
163 | 
164 | 0.3.6
165 | 
166 | Code cleanup.
167 | Different CSS templates for RST -> HTML and RST -> EPUB.
168 | 
169 | 0.3.5
170 | 
171 | Zips files up for PG.
172 | 
173 | 0.3.4
174 | 
175 | Tell Tidy not to merge divs and spans.
176 | More fixes to plain text encoding.
177 | 
178 | 0.3.3
179 | 
180 | Implemented coverpages for Adobe ADE.
181 | CSS changes because Adobe ADE chokes on !important.
182 | RST dropcap directive: don't use image in EPUB.
183 | 
184 | 0.3.2
185 | 
186 | Packaging changes.
187 | 


--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
 1 | Metadata-Version: 1.1
 2 | Name: epubmaker
 3 | Version: 0.3.25
 4 | Summary: The Project Gutenberg tool to generate EPUBs and other ebook formats.
 5 | Home-page: https://github.com/gitenberg-dev/pg-epubmaker
 6 | Author: Marcello Perathoner
 7 | Author-email: webmaster@gutenberg.org
 8 | License: GPL v3
 9 | Description: =========
10 |         EpubMaker
11 |         =========
12 |         
13 |         EpubMaker is the tool used for format conversion at Project Gutenberg.
14 |         It builds EPUB2 and Kindle files from HTML.
15 |         It also builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
16 |         
17 |         
18 |         Prerequisites
19 |         =============
20 |         
21 |         * Python >= 2.6,
22 |         
23 |         * HTMLTidy, 
24 |         
25 |         * Kindlegen, 
26 |         
27 |         * TexLive, and
28 |         
29 |         * groff.
30 |         
31 | Keywords: ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion
32 | Platform: OS-independent
33 | Classifier: Topic :: Text Processing
34 | Classifier: License :: OSI Approved :: GNU General Public License (GPL)
35 | Classifier: Environment :: Console
36 | Classifier: Operating System :: OS Independent
37 | Classifier: Intended Audience :: Other Audience
38 | Classifier: Development Status :: 4 - Beta
39 | Requires: setuptools
40 | Requires: roman
41 | Requires: docutils (>= 0.8.1, < 0.13)
42 | Requires: lxml (>= 2.3)
43 | Requires: cssutils (>= 0.9.8a1)
44 | Requires: PIL (>= 1.1.7)
45 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | =========
 2 | EpubMaker
 3 | =========
 4 | 
 5 | EpubMaker is the tool used for format conversion at Project Gutenberg.
 6 | It builds EPUB2 and Kindle files from HTML.
 7 | It also builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
 8 | 
 9 | 
10 | Prerequisites
11 | =============
12 | 
13 | * Python >= 2.6,
14 | 
15 | * HTMLTidy, 
16 | 
17 | * Kindlegen, 
18 | 
19 | * TexLive, and
20 | 
21 | * groff.
22 | 


--------------------------------------------------------------------------------
/epubmaker/CommonOptions.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | CommonOptions.py
 7 | 
 8 | Copyright 2010 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Common options for programs.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import optparse
19 | import ConfigParser
20 | import os
21 | 
22 | class Struct (object):
23 |     pass
24 | 
25 | # options is a "Borg" set by optparse (note that it's not thread-safe)
26 | class Options:
27 |     __shared_state = {}
28 |     def __init__(self):
29 |         self.__dict__ = self.__shared_state
30 |         
31 |     def update(self, _dict):
32 |         self.__dict__.update(_dict)
33 | 
34 | options = Options()
35 | 
36 | 
37 | def add_common_options (op):
38 |     """ Add options common to all programs. """
39 |     
40 |     op.add_option (
41 |         "-c", "--config",
42 |         metavar  = "FILE",
43 |         dest     = "config_name", 
44 |         action   = "store",
45 |         default  = "config",
46 |         help     = "use config file (default: config)")
47 | 
48 |     op.add_option (
49 |         "-v", "--verbose",
50 |         dest     = "verbose", 
51 |         action   = "count",
52 |         help     = "be verbose (-v -v be more verbose)")
53 | 
54 |     op.add_option (
55 |         "--validate",
56 |         dest     = "validate", 
57 |         action   = "count",
58 |         help     = "validate epub through epubcheck")
59 | 
60 |     op.add_option (
61 |         "--section",
62 |         metavar  = "TAG.CLASS",
63 |         dest     = "section_tags", 
64 |         default  = [],
65 |         action   = "append",
66 |         help     = "split epub on TAG.CLASS")
67 | 
68 | 
69 | def get_parser (**kwargs):
70 |     op = optparse.OptionParser (**kwargs)
71 |     add_common_options (op)
72 |     return op
73 |     
74 | 
75 | def parse_args (op, params = {}, defaults = {}):
76 |     (parsed_options, args) = op.parse_args ()
77 |     options.update(vars(parsed_options))
78 |     
79 |     cp = ConfigParser.SafeConfigParser (params)
80 |     cp.read ( [options.config_name,
81 |                os.path.expanduser ('~/.epubmaker.conf'),
82 |                '/etc/epubmaker.conf' ] )
83 | 
84 |     options.config = Struct ()
85 | 
86 |     for name, value in defaults.iteritems ():
87 |         setattr (options.config, name.upper (), value)
88 |         
89 |     for section in cp.sections ():
90 |         for name, value in cp.items (section):
91 |             #if value == 'None':
92 |             #    value = None
93 |             # print section, name, value
94 |             setattr (options.config, name.upper (), value)
95 | 
96 |     return options, args
97 | 
98 | 
99 | 
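
A minimal sketch (not part of the source tree) of how the Borg-style Options class above behaves: every Options () instance shares one __dict__, so whatever parse_args () stores is visible to any module that later constructs its own Options (). The option values below are made up for illustration.

    from epubmaker.CommonOptions import Options

    a = Options ()
    a.update ({'verbose': 2})      # e.g. what parse_args () would store

    b = Options ()                 # a different instance ...
    assert b.verbose == 2          # ... sees the same shared state

    b.verbose = 0                  # attribute writes go to the shared dict too
    assert a.verbose == 0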


--------------------------------------------------------------------------------
/epubmaker/EpubMaker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | EpubMaker.py
  7 | 
  8 | Copyright 2009-2011 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Stand-alone application to build epub out of html or rst.
 13 | 
 14 | """
 15 | 
 16 | 
 17 | from __future__ import with_statement
 18 | 
 19 | import sys
 20 | import os.path
 21 | import re
 22 | import optparse
 23 | import hashlib
 24 | import mimetypes
 25 | 
 26 | from epubmaker.lib.GutenbergGlobals import Struct, DCIMT, SkipOutputFormat
 27 | import epubmaker.lib.GutenbergGlobals as gg
 28 | from epubmaker.lib.Logger import debug, exception
 29 | from epubmaker.lib import Logger, DublinCore
 30 | 
 31 | from epubmaker import ParserFactory
 32 | from epubmaker import WriterFactory
 33 | from epubmaker.packagers import PackagerFactory
 34 | from epubmaker import CommonOptions
 35 | 
 36 | from epubmaker.Version import VERSION
 37 | 
 38 | options = CommonOptions.Options()
 39 | 
 40 | def null_translation (s):
 41 |     """ Translate into same language. :-) """
 42 |     return s
 43 | 
 44 | TXT_FORMATS    = 'txt.utf-8 txt.iso-8859-1 txt.us-ascii'.split ()
 45 | HTML_FORMATS   = 'html.noimages html.images'.split ()
 46 | EPUB_FORMATS   = 'epub.noimages epub.images'.split ()
 47 | KINDLE_FORMATS = 'kindle.noimages kindle.images'.split ()
 48 | PDF_FORMATS    = 'pdf.noimages pdf.images'.split ()
 49 | RST_FORMATS    = 'rst.gen'.split ()
 50 | ALL_FORMATS    = HTML_FORMATS + EPUB_FORMATS + KINDLE_FORMATS + PDF_FORMATS + TXT_FORMATS + RST_FORMATS
 51 | 
 52 | DEPENDENCIES = (
 53 |     ('all',    ALL_FORMATS),
 54 |     ('html',   HTML_FORMATS), 
 55 |     ('epub',   EPUB_FORMATS),
 56 |     ('kindle', KINDLE_FORMATS), 
 57 |     ('pdf',    PDF_FORMATS),
 58 |     ('txt',    TXT_FORMATS), 
 59 |     ('rst',    RST_FORMATS), 
 60 |     )
 61 | 
 62 | FILENAMES = {
 63 |     'html.noimages':    '{id}-noimages-h.html',
 64 |     'html.images':      '{id}-h.html',
 65 | 
 66 |     'epub.noimages':    '{id}-epub.epub',
 67 |     'epub.images':      '{id}-images-epub.epub',
 68 | 
 69 |     'kindle.noimages':  '{id}-kindle.mobi',
 70 |     'kindle.images':    '{id}-images-kindle.mobi',
 71 | 
 72 |     'pdf.noimages':     '{id}-pdf.pdf',
 73 |     'pdf.images':       '{id}-images-pdf.pdf',
 74 | 
 75 |     'txt.utf-8':        '{id}-0.txt',
 76 |     'txt.iso-8859-1':   '{id}-8.txt',
 77 |     'txt.us-ascii':     '{id}.txt',
 78 | 
 79 |     'rst.gen':          '{id}-rst.rst',
 80 | 
 81 |     'picsdir.noimages': '{id}-noimages.picsdir',   # do we need this ?
 82 |     'picsdir.images':   '{id}-images.picsdir',     # do we need this ?
 83 |     }
 84 | 
 85 | def make_output_filename (dc, type_):
 86 |     if dc.project_gutenberg_id:
 87 |         # PG book: use PG naming convention
 88 |         return FILENAMES[type_].format (id = dc.project_gutenberg_id)
 89 |     else:
 90 |         # not a PG ebook
 91 |         return FILENAMES[type_].format (id = gg.string_to_filename (dc.title)[:65])
 92 | 
 93 | def main ():
 94 |     """ Main program. """
 95 | 
 96 |     op = optparse.OptionParser (usage = "usage: %prog [options] url", 
 97 |                                 version = "EpubMaker version %s" % VERSION)
 98 | 
 99 |     CommonOptions.add_common_options (op)
100 | 
101 |     op.add_option (
102 |         "--make",
103 |         dest    = "types",
104 |         choices = [x for x, y in DEPENDENCIES] + ALL_FORMATS,
105 |         default = [],
106 |         action  = 'append',
107 |         help    = ("output type [%s] (default: all)"
108 |                    % ' | '.join ([x for x, y in DEPENDENCIES] + ALL_FORMATS)))
109 | 
110 |     op.add_option (
111 |         "--max-depth",
112 |         metavar = "LEVELS",
113 |         dest    = "max_depth",
114 |         type    = "int",
115 |         default = 1,
116 |         help    = "how many levels to descend while recursively retrieving pages (0 == infinite)")
117 | 
118 |     op.add_option (
119 |         "--local-only",
120 |         dest    = "local_files_only",
121 |         action  = "store_true",
122 |         default = False,
123 |         help    = "restrict recursive search to local files")
124 | 
125 |     op.add_option (
126 |         "--include",
127 |         metavar = "GLOB",
128 |         dest    = "include_argument", 
129 |         default = [],
130 |         action  = "append",
131 |         help    = "include this url (use globs, repeat for more urls)")
132 | 
133 |     op.add_option (
134 |         "--exclude",
135 |         metavar = "GLOB",
136 |         dest    = "exclude", 
137 |         default = [],
138 |         action  = "append",
139 |         help    = "exclude this url (use globs, repeat for more urls)")
140 | 
141 |     op.add_option (
142 |         "--include-mediatype",
143 |         metavar = "GLOB/GLOB",
144 |         dest    = "include_mediatypes_argument", 
145 |         default = ['text/*', 'application/xhtml+xml'],
146 |         action  = "append",
147 |         help    = "include this mediatype (use globs, repeat for more mediatypes, eg. 'image/*')")
148 | 
149 |     op.add_option (
150 |         "--exclude-mediatype",
151 |         metavar = "GLOB/GLOB",
152 |         dest    = "exclude_mediatypes", 
153 |         default = [],
154 |         action  = "append",
155 |         help    = "exclude this mediatype (use globs, repeat for more mediatypes)")
156 | 
157 |     op.add_option (
158 |         "--rewrite",
159 |         metavar = "from>to",
160 |         dest    = "rewrite", 
161 |         default = [],
162 |         action  = "append",
163 |         help    = "rewrite url eg. 'http://www.example.org/>http://www.example.org/index.html'")
164 | 
165 |     op.add_option (
166 |         "--title",
167 |         dest    = "title", 
168 |         default = None,
169 |         help    = "ebook title (default: from meta)")
170 | 
171 |     op.add_option (
172 |         "--author",
173 |         dest    = "author", 
174 |         default = None,
175 |         help    = "author (default: from meta)")
176 | 
177 |     op.add_option (
178 |         "--ebook",
179 |         dest    = "ebook", 
180 |         type    = "int",
181 |         default = 0,
182 |         help    = "ebook no. (default: from meta)")
183 | 
184 |     op.add_option (
185 |         "--input-encoding",
186 |         dest    = "inputencoding", 
187 |         default = None,
188 |         help    = "input encoding (default: from meta)")
189 | 
190 |     op.add_option (
191 |         "--output-dir",
192 |         dest    = "outputdir", 
193 |         default = "./",
194 |         help    = "output directory (default: ./)")
195 | 
196 |     op.add_option (
197 |         "--output-file",
198 |         dest    = "outputfile", 
199 |         default = None,
200 |         help    = "output file (default: .epub)")
201 | 
202 |     op.add_option (
203 |         "--packager",
204 |         dest    = "packager",
205 |         choices = ['none', 'ww'],
206 |         default = "none",
207 |         help    = "packager type [none | ww] (default: none)")
208 | 
209 |     op.add_option (
210 |         "--mediatype-from-extension",
211 |         dest    = "mediatype_from_extension",
212 |         action  = "store_true",
213 |         default = False,
214 |         help    = "get mediatype from url extension instead of http response")
215 | 
216 |     op.add_option (
217 |         "--cover",
218 |         dest    = "coverpage_url",
219 |         default = None,
220 |         help    = "add the specified cover to the epub")
221 | 
222 |     options, args = CommonOptions.parse_args (op, {}, {
223 |         'proxies': None,
224 |         'bibrec': 'http://www.gutenberg.org/ebooks/',
225 |         'xelatex': 'xelatex',
226 |         'mobigen': 'kindlegen',
227 |         'groff': 'groff',
228 |         'rhyming_dict': None,
229 |         } )
230 | 
231 |     if not args:
232 |         op.error ("please specify which file to convert")
233 | 
234 |     Logger.set_log_level (options.verbose)        
235 | 
236 |     options.types = options.types or ['all']
237 |     for opt, formats in DEPENDENCIES:
238 |         if opt in options.types:
239 |             options.types.remove (opt)
240 |             options.types += formats
241 | 
242 |     if set (options.types).intersection (('html.images', 'pdf.images', 'rst.gen')):
243 |         options.types.insert (0, 'picsdir.images')
244 |     if set (options.types).intersection (('html.noimages', 'pdf.noimages')):
245 |         options.types.insert (0, 'picsdir.noimages')
246 |     if set (options.types).intersection (('kindle.images', )):
247 |         options.types.insert (0, 'epub.images')
248 |     if set (options.types).intersection (('kindle.noimages', )):
249 |         options.types.insert (0, 'epub.noimages')
250 |         
251 |         
252 |     debug ("Building types: %s" % ' '.join (options.types))
253 | 
254 |     ParserFactory.load_parsers ()
255 |     WriterFactory.load_writers ()
256 | 
257 |     packager_factory = None
258 |     if options.packager != 'none':
259 |         packager_factory = PackagerFactory (options.packager)
260 |         packager_factory.load ()
261 | 
262 |     for url in args:
263 | 
264 |         if options.include_argument:
265 |             options.include = options.include_argument[:]
266 |         else:
267 |             include_patt = os.path.dirname (url) + '/*'
268 |             options.include = [ include_patt ]
269 |             if include_patt.startswith ('/'):
270 |                 options.include.append('file://' + include_patt)
271 |             
272 |         # try to get metadata
273 | 
274 |         options.candidate = Struct ()
275 |         options.candidate.filename = url
276 |         options.candidate.mediatype = str (DCIMT (
277 |             mimetypes.types_map[os.path.splitext (url)[1]], options.inputencoding))
278 | 
279 |         options.include_mediatypes = options.include_mediatypes_argument[:]
280 |         options.want_images = False
281 |         #options.coverpage_url = None
282 | 
283 |         parser = ParserFactory.ParserFactory.create (options.candidate.filename, {})
284 | 
285 |         dc = None
286 | 
287 |         try:
288 |             dc = DublinCore.GutenbergDublinCore ()
289 | 
290 |             # try for rst header
291 |             dc.load_from_rstheader (parser.unicode_content ())
292 | 
293 |             if dc.project_gutenberg_id == 0:
294 |                 # try for Project Gutenberg header
295 |                 dc.load_from_parser (parser)
296 | 
297 |         except (ValueError, TypeError):
298 |             # use standard HTML header
299 |             dc = DublinCore.DublinCore ()
300 |             dc.load_from_parser (parser)
301 |             dc.source = url
302 | 
303 |         dc.source = url
304 | 
305 |         if options.title:
306 |             dc.title = options.title
307 |         if not dc.title:
308 |             dc.title = 'NA'
309 | 
310 |         if options.author:
311 |             dc.add_author (options.author, 'cre')
312 |         if not dc.authors:
313 |             dc.add_author ('NA', 'cre')
314 | 
315 |         if options.ebook:
316 |             dc.project_gutenberg_id = options.ebook
317 | 
318 |         if dc.project_gutenberg_id:
319 |             dc.opf_identifier = ('http://www.gutenberg.org/ebooks/%d' % dc.project_gutenberg_id)
320 |         else:
321 |             dc.opf_identifier = ('urn:mybooks:%s' %
322 |                                  hashlib.md5 (url.encode ('utf-8')).hexdigest ())
323 | 
324 |         if not dc.languages:
325 |             # we *need* a language to build a valid epub, so just make one up
326 |             dc.add_lang_id ('en')
327 | 
328 |         aux_file_list = []
329 |         
330 |         for type_ in options.types:
331 |             debug ('=== Building %s ===' % type_)
332 |             maintype, subtype = os.path.splitext (type_)
333 | 
334 |             try:
335 |                 writer = WriterFactory.create (maintype)
336 |                 writer.setup (options)
337 |                 options.type = type_
338 |                 options.maintype = maintype
339 |                 options.subtype = subtype
340 |                 options.want_images = False
341 | 
342 |                 options.include_mediatypes = options.include_mediatypes_argument[:]
343 |                 if subtype == '.images':
344 |                     options.include_mediatypes.append ('image/*')
345 |                     options.want_images = True
346 |                 else:
347 |                     # This is the mediatype of the 'broken' image.
348 |                     options.include_mediatypes.append ('image/png;type=resource')
349 | 
350 |                 writer.parse (options)
351 | 
352 |                 if maintype in ('html', ):
353 |                     # list of images for packager
354 |                     aux_file_list[:] = writer.get_aux_file_list ()
355 | 
356 |                 options.dc = dc
357 |                 options.outputfile = make_output_filename (dc, type_)
358 | 
359 |                 if maintype == 'kindle':
360 |                     options.epub_filename = make_output_filename (dc, 'epub' + subtype)
361 | 
362 |                 writer.build ()
363 | 
364 |                 if options.validate:
365 |                     writer.validate ()
366 | 
367 |                 if packager_factory:
368 |                     try:
369 |                         packager = packager_factory.create (type_)
370 |                         packager.setup (options)
371 |                         packager.package (aux_file_list)
372 |                     except KeyError:
373 |                         # no such packager
374 |                         pass
375 | 
376 |                 options.outputfile = None
377 | 
378 |             except SkipOutputFormat:
379 |                 continue
380 |             
381 |             except StandardError, what:
382 |                 exception ("%s" % what)
383 | 
384 |         if options.packager == 'ww':
385 |             try:
386 |                 packager = packager_factory.create ('push')
387 |                 options.outputfile = '%d-final.zip' % (dc.project_gutenberg_id)
388 |                 packager.setup (options)
389 |                 packager.package (aux_file_list)
390 |             except KeyError:
391 |                 # no such packager
392 |                 pass
393 | 
394 |     sys.exit (0)
395 | 
396 | if __name__ == "__main__":
397 |     main ()
398 | 
399 | 
400 | 
401 | 
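
A short sketch (not part of the source tree, and assuming the package and its dependencies are importable) of how the FILENAMES table above is applied by make_output_filename (): {id} is replaced by the Project Gutenberg ebook number, or by a filename-safe slice of the title for non-PG books. The ebook number 12345 is an arbitrary example.

    from epubmaker.EpubMaker import FILENAMES

    print (FILENAMES['epub.images'].format (id = 12345))    # 12345-images-epub.epub
    print (FILENAMES['kindle.images'].format (id = 12345))  # 12345-images-kindle.mobi
    print (FILENAMES['txt.utf-8'].format (id = 12345))      # 12345-0.txt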


--------------------------------------------------------------------------------
/epubmaker/HTMLChunker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLChunker.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Splits a HTML file into chunks.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import urlparse
 19 | import urllib
 20 | import os
 21 | import re
 22 | import copy
 23 | 
 24 | from lxml import etree
 25 | 
 26 | import epubmaker.lib.GutenbergGlobals as gg
 27 | from epubmaker.lib.GutenbergGlobals import NS
 28 | from epubmaker.lib.Logger import debug, error
 29 | from epubmaker.CommonOptions import Options
 30 | 
 31 | options = Options()
 32 | # MAX_CHUNK_SIZE  = 300 * 1024  # bytes
 33 | MAX_CHUNK_SIZE  = 100 * 1024  # bytes
 34 | 
 35 | SECTIONS = [
 36 |     ('div.section', 0.0), 
 37 |     ('div.chapter', 0.0), 
 38 |     ('h1',          0.5),
 39 |     ('div',         0.5),
 40 |     ('h2',          0.7),
 41 |     ('h3',          0.75),
 42 |     ('p',           0.8)
 43 |     ]
 44 | 
 45 | def xpath (node, path):
 46 |     """ xpath helper """
 47 |     return node.xpath (path, namespaces = gg.NSMAP)
 48 | 
 49 | def unicode_uri (uri):
 50 |     """ Normalize URI for idmap. """
 51 |     return urllib.unquote (uri).decode ('utf-8')
 52 | 
 53 | 
 54 | class HTMLChunker (object):
 55 |     """ Splits HTML tree into smaller chunks.
 56 | 
 57 |     Some epub viewers are limited in that they cannot display files
 58 |     larger than 300K.  If our HTML happens to be longer, we have to
 59 |     split it up.  Also smaller chunks do improve page flip times.
 60 | 
 61 | 
 62 |     """
 63 | 
 64 |     def __init__ (self):
 65 |         self.chunks = []
 66 |         self.idmap = {}
 67 |         self.chunk = None
 68 |         self.chunk_body = None
 69 |         self.chunk_size = 0
 70 |         self.next_id = 0
 71 | 
 72 |         self.tags = {}
 73 |         for tag, size in SECTIONS:
 74 |             self.tags[NS.xhtml[tag]] = int (size * MAX_CHUNK_SIZE)
 75 |         for tag in options.section_tags:
 76 |             self.tags[NS.xhtml[tag]] = 0
 77 |         
 78 | 
 79 |     def _make_name (self, url):
 80 |         """ Generate a name for the chunk. """
 81 |         u = list (urlparse.urlparse (url))
 82 |         root, ext = os.path.splitext (u[2])
 83 |         # FIXME: brain-dead kindlegen only finds links in files with
 84 |         # .html extension. so we just add .html to everything
 85 |         u[2] = "%s-%d%s.html" % (root, self.next_id, ext)
 86 |         self.next_id += 1
 87 |         return urlparse.urlunparse (u)
 88 |     
 89 |         
 90 |     @staticmethod
 91 |     def make_template (tree):
 92 |         """ Make a copy with an empty html:body.
 93 | 
 94 |         This makes a template into which we can paste our chunks.
 95 | 
 96 |         """
 97 |         
 98 |         template = copy.deepcopy (tree)
 99 | 
100 |         for c in xpath (template, '//xhtml:body'):
101 | 
102 |             # descend while elem has only one child
103 |             while len (c) == 1:
104 |                 c = c[0]
105 | 
106 |             # clear children but save attributes
107 |             attributes = c.attrib.items ()
108 |             c.clear ()
109 |         # was a tentative fix for the pathological one-element-html case
110 |             # for child in c:
111 |             #     c.remove (child)
112 |             for a in attributes:
113 |                 c.set (a[0], a[1])
114 | 
115 |         # debug (etree.tostring (template))
116 | 
117 |         return template
118 | 
119 | 
120 |     def reset_chunk (self, template):
121 |         """ start a new chunk """
122 | 
123 |         self.chunk = copy.deepcopy (template)
124 |         self.chunk_size = len (etree.tostring (self.chunk))
125 |         self.chunk_body = xpath (self.chunk, "//xhtml:body")[0]
126 |         while len (self.chunk_body) == 1:
127 |             self.chunk_body = self.chunk_body[0]
128 | 
129 | 
130 |     def shipout_chunk (self, url, chunk_id = None, comment = None):
131 |         """ ready chunk to be shipped """
132 | 
133 |         if (self.chunk_size > MAX_CHUNK_SIZE):
134 |             self.split (self.chunk, url)
135 |             return
136 | 
137 |         url = unicode_uri (url)
138 |         chunk_name = self._make_name (url)
139 | 
140 |         # the url of the whole page
141 |         if not url in self.idmap:
142 |             self.idmap[url] = chunk_name
143 | 
144 |         # fragments of the page
145 |         for e in xpath (self.chunk, '//xhtml:*[@id]'):
146 |             id_ = e.attrib['id']
147 |             old_id = "%s#%s" % (url, id_)
148 |             # key is unicode string,
149 |             # value is uri-escaped byte string
150 |             # if ids get cloned while chunking, map to the first one only
151 |             if old_id not in self.idmap:
152 |                 self.idmap[old_id] = "%s#%s" % (
153 |                     chunk_name,  urllib.quote (id_.encode ('utf-8')))
154 | 
155 |         self.chunks.append ( { 'name'     : chunk_name,
156 |                                'id'       : chunk_id,
157 |                                'comment'  : comment,
158 |                                'chunk'    : self.chunk,  } )
159 |             
160 |         debug ("Adding chunk %s (%d bytes) %s" % (chunk_name, self.chunk_size, chunk_id))
161 | 
162 | 
163 |     def split (self, tree, url):
164 |         """ Split whole html or split chunk.
165 | 
166 |         Find some arbitrary points to do it.
167 |     
168 |         """
169 | 
170 |         for body in xpath (tree, "//xhtml:body"):
171 |             # we can't split a node that has only one child
172 |             # descend while elem has only one child
173 |             while len (body) == 1:
174 |                 body = body[0]
175 | 
176 |             debug ("body tag is %s" % body.tag)
177 | 
178 |             template = self.make_template (tree)
179 |             self.reset_chunk (template)
180 | 
181 |             # FIXME: is this ok ???
182 |             # fixes the pathological one-element-body case
183 |             self.chunk_body.text = body.text
184 | 
185 |             for child in body:
186 |                 if not isinstance (child, etree.ElementBase):
187 |                     # comments, processing instructions etc. 
188 |                     continue
189 |                 child_size = len (etree.tostring (child))
190 | 
191 |                 try:
192 |                     tags = [child.tag + '.' + c for c in child.attrib['class'].split ()]
193 |                     tags.append (child.tag)
194 |                 except KeyError:
195 |                     tags = [child.tag]
196 | 
197 |                 for tag in tags:
198 |                     if ((self.chunk_size + child_size > MAX_CHUNK_SIZE) or
199 |                               (tag in self.tags and
200 |                                self.chunk_size > self.tags[tag])):
201 |                         
202 |                         comment = ("Chunk: size=%d Split on %s" 
203 |                                    % (self.chunk_size, re.sub ('^{.*}', '', tag)))
204 |                         debug (comment)
205 | 
206 |                         # find a suitable id
207 |                         chunk_id = None
208 |                         for c in self.chunk_body:
209 |                             if 'id' in c.attrib:
210 |                                 chunk_id = c.get ('id')
211 |                                 break
212 |                         debug ("chunk id is: %s" % (chunk_id or ''))
213 |                         
214 |                         self.shipout_chunk (url, chunk_id, comment)
215 |                         self.reset_chunk (template)
216 |                         break
217 | 
218 |                 self.chunk_body.append (child)
219 |                 self.chunk_size = self.chunk_size + child_size
220 | 
221 |             # fixes the pathological one-element-body case
222 |             self.chunk_body.tail = body.tail
223 |             
224 |             chunk_id = None
225 |             if len (self.chunk_body):
226 |                 chunk_id = self.chunk_body[0].get ('id')
227 |             comment = "Chunk: size=%d" % self.chunk_size
228 |             self.shipout_chunk (url, chunk_id, comment)
229 |             self.reset_chunk (template)
230 | 
231 | 
232 |     def rewrite_links (self, f):
233 |         """ Rewrite all href and src using f (). """
234 |         
235 |         for chunk in self.chunks:
236 |             # chunk['name'] = f (chunk['name'])
237 |             
238 |             for link in xpath (chunk['chunk'], '//xhtml:*[@href]'):
239 |                 url = link.get ('href')
240 |                 if not url.startswith('http://') and not url.startswith('https://'):
241 |                     link.set ('href', f (url))
242 | 
243 |             for image in xpath (chunk['chunk'], '//xhtml:*[@src]'):
244 |                 image.set ('src', f (image.get ('src')))
245 | 
246 |         for k, v in self.idmap.items ():
247 |             self.idmap[k] = f (v)
248 | 
249 | 
250 |     def rewrite_internal_links (self):
251 |         """ Rewrite links to point into right chunks.
252 | 
253 |         Because we split the HTML into chunks, all internal links need
254 |         to be rewritten to become links into the right chunk.
255 |         Rewrite all internal links in all chunks.
256 | 
257 |         """
258 |         for chunk in self.chunks:
259 |             for a in xpath (chunk['chunk'], "//xhtml:*[@href]"):
260 |                 try:
261 |                     uri = unicode_uri (a.get ('href'))
262 |                     a.set ('href', self.idmap[uri])
263 |                 except KeyError:
264 |                     ur, dummy_frag = urlparse.urldefrag (uri)
265 |                     if ur in self.idmap:
266 |                         error ("HTMLChunker: Cannot rewrite internal link '%s'" % uri)
267 |         
268 | 
269 |     def rewrite_internal_links_toc (self, toc):
270 |         """ Rewrite links to point into right chunks.
271 | 
272 |         Because we split the HTML into chunks, all internal links need
273 |         to be rewritten to become links into the right chunk.
274 |         Rewrite all links in the passed toc.
275 | 
276 |         """
277 | 
278 |         for entry in toc [:]:
279 |             try:
280 |                 entry[0] = self.idmap [unicode_uri (entry[0])]
281 |             except KeyError:
282 |                 error ("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
283 |                 toc.remove (entry)
284 | 
285 | 
286 | 
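
A hedged sketch (not part of the source tree) of how HTMLChunker is typically driven: give it a namespaced XHTML tree, let split () fill chunks and idmap, then rewrite the internal links. The shared Options borg must already provide section_tags (normally CommonOptions.parse_args () takes care of that); the document and URL below are made up.

    from lxml import etree
    from epubmaker.CommonOptions import Options
    from epubmaker.HTMLChunker import HTMLChunker

    Options ().update ({'section_tags': []})   # normally set by CommonOptions.parse_args ()

    XHTML = 'http://www.w3.org/1999/xhtml'
    tree = etree.fromstring (
        '<html xmlns="%s"><body>'
        '<div class="chapter" id="c1"><p>Hello.</p></div>'
        '<div class="chapter" id="c2"><p>World.</p></div>'
        '</body></html>' % XHTML)

    chunker = HTMLChunker ()
    chunker.split (tree, 'file:///tmp/book.html')   # fills chunker.chunks and chunker.idmap
    chunker.rewrite_internal_links ()               # make hrefs point into the right chunks

    for chunk in chunker.chunks:
        print ("%s: %d bytes" % (chunk['name'], len (etree.tostring (chunk['chunk']))))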


--------------------------------------------------------------------------------
/epubmaker/ParserFactory.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | ParserFactory.py
  7 | 
  8 | Copyright 2009-10 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | from __future__ import with_statement
 15 | 
 16 | import os.path
 17 | import urllib
 18 | 
 19 | from pkg_resources import resource_listdir # pylint: disable=E0611
 20 | 
 21 | from epubmaker.mydocutils import broken
 22 | from epubmaker.lib.Logger import debug, error
 23 | from epubmaker.lib.MediaTypes import mediatypes
 24 | from epubmaker.Version import VERSION
 25 | from epubmaker.CommonOptions import Options
 26 | 
 27 | options = Options()
 28 | 
 29 | class AppURLopener (urllib.FancyURLopener):
 30 |     version = "ebookmaker/%s" % VERSION
 31 | 
 32 | urllib._urlopener = AppURLopener ()
 33 | 
 34 | parser_modules = {}
 35 | 
 36 | def load_parsers ():
 37 |     """ See what types we can parse. """
 38 | 
 39 |     for fn in resource_listdir ('epubmaker.parsers', ''):
 40 |         modulename, ext = os.path.splitext (fn)
 41 |         if ext == '.py':
 42 |             if (modulename.endswith ('Parser')):
 43 |                 module = __import__ ('epubmaker.parsers.' + modulename, fromlist = [modulename])
 44 |                 debug ("Loading parser from module: %s for mediatypes: %s" % (
 45 |                     modulename, ', '.join (module.mediatypes)))
 46 |                 for mediatype in module.mediatypes:
 47 |                     parser_modules[mediatype] = module
 48 | 
 49 |     return parser_modules.keys ()
 50 | 
 51 | 
 52 | def unload_parsers ():
 53 |     """ Unload parser modules. """
 54 |     for k in parser_modules.keys ():
 55 |         del parser_modules[k]
 56 |     
 57 | 
 58 | class ParserFactory (object):
 59 |     """ A factory and a cache for parsers.
 60 | 
 61 |     So we don't reparse the same file twice.
 62 | 
 63 |     """
 64 | 
 65 |     parsers = {} # cache: parsers[url] = parser
 66 |     
 67 |     @staticmethod
 68 |     def get (mediatype):
 69 |         """ Get the right kind of parser. """
 70 |         try:
 71 |             return parser_modules[mediatype].Parser ()
 72 |         except KeyError:
 73 |             return parser_modules['*/*'].Parser ()
 74 |             
 75 | 
 76 |     @classmethod
 77 |     def create (cls, url, attribs):
 78 |         """ Create an appropriate parser. """
 79 | 
 80 |         # debug ("Need parser for %s" % url)
 81 | 
 82 |         if url in cls.parsers:
 83 |             # debug ("... reusing parser for %s" % url)
 84 |             # reuse same parser, maybe already filled with data
 85 |             return cls.parsers[url]
 86 | 
 87 |         orig_url = url
 88 |         mediatype = attribs.get ('mediatype')
 89 | 
 90 |         if url.endswith (broken):
 91 |             # hack! broken.png doesn't exist at the source location.
 92 |             # We take it from our resources and fake its provenience.
 93 |             parser = parser_modules['image/png'].Parser ()
 94 |             parser.orig_url = url
 95 |             parser.url = url
 96 |             parser.broken_image ()
 97 |         else:
 98 |             fp = urllib.urlopen (url, proxies = options.config.PROXIES)
 99 |             url = fp.geturl ()
100 | 
101 |             if url != orig_url:
102 |                 debug ("... %s redirected to %s" % (orig_url, url))
103 |                 if url in cls.parsers:
104 |                     # debug ("... reusing parser for %s" % url)
105 |                     # reuse same parser, maybe already filled with data
106 |                     return cls.parsers[url]
107 | 
108 |             # ok. so we have to create a new parser
109 |             debug ("... creating new parser for %s" % url)
110 | 
111 |             if mediatype is not None:
112 |                 debug ("... got mediatype %s from link attributes" % mediatype)
113 |             else:
114 |                 if options.mediatype_from_extension or not hasattr (fp, 'info'):
115 |                     name, ext = os.path.splitext (url)
116 |                     mediatype = mediatypes[ext[1:]]
117 |                 else:
118 |                     msg = fp.info ()
119 |                     mediatype = msg.get ('Content-Type')
120 |                     if mediatype:
121 |                         mediatype = mediatype.partition (';')[0]
122 |                         debug ("... got mediatype %s from server" % mediatype)
123 |                     else:
124 |                         mediatype = 'application/octet-stream'
125 |                         error ("... cannot determine mediatype for %s" % url)
126 | 
127 |             # get the right kind of parser
128 |             try:
129 |                 mt = mediatype.split (';')[0]
130 |                 parser = parser_modules[mt].Parser ()
131 |             except KeyError:
132 |                 parser = parser_modules['*/*'].Parser ()
133 | 
134 |             parser.setup (orig_url, mediatype, attribs, fp)
135 | 
136 |         cls.parsers[parser.url] = parser
137 |         cls.parsers[orig_url] = parser
138 | 
139 |         return parser
140 |     
141 | 
142 |     @classmethod
143 |     def clear (cls):
144 |         """ Clear parser cache to free memory. """
145 | 
146 |         # debug: kill refs
147 |         for dummy_url, parser in cls.parsers.items ():
148 |             del parser
149 |             
150 |         cls.parsers = {}
151 | 
152 | 
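
A hedged sketch (not part of the source tree) of the factory's intended call order: populate the shared Options borg first (create () reads options.config.PROXIES and options.mediatype_from_extension), register the parser modules, then ask for parsers by URL. The file URL is a made-up example and has to exist for urlopen () to succeed.

    from epubmaker import CommonOptions, ParserFactory

    op = CommonOptions.get_parser ()
    options, args = CommonOptions.parse_args (op, {}, {'proxies': None})
    options.update ({'mediatype_from_extension': False})

    ParserFactory.load_parsers ()      # scans epubmaker.parsers for *Parser modules

    parser = ParserFactory.ParserFactory.create (
        'file:///tmp/book.html', {'mediatype': 'text/html'})

    # asking again for the same URL returns the cached parser instance
    assert parser is ParserFactory.ParserFactory.create ('file:///tmp/book.html', {})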


--------------------------------------------------------------------------------
/epubmaker/Spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Spider.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Rudimentary Web Spider
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import urlparse
 19 | import fnmatch
 20 | 
 21 | from epubmaker.lib import MediaTypes
 22 | import epubmaker.lib.GutenbergGlobals as gg
 23 | from epubmaker.lib.GutenbergGlobals import NS
 24 | from epubmaker.lib.Logger import debug, error
 25 | 
 26 | from epubmaker import ParserFactory
 27 | 
 28 | COVERPAGE_MIN_AREA = 200 * 200
 29 | 
 30 | class Spider (object):
 31 |     """ A very rudimentary web spider. """
 32 | 
 33 |     def __init__ (self):
 34 |         self.options = None
 35 |         self.parsed_urls = set ()
 36 |         self.enqueued_urls = set ()
 37 |         self.included_mediatypes = set ()
 38 |         self.excluded_mediatypes = set ()
 39 |         self.queue = []
 40 |         self.parsers = []
 41 |         self.next = [] # for a topological sort
 42 |         self.redirection_map = {}
 43 | 
 44 | 
 45 |     def parse (self, url, mediatype_hint, options):
 46 |         """ Do a recursive parse starting from url.
 47 |         
 48 |         Do a breadth-first traversal. Assuming the first page contains
 49 |         a linked TOC, this will get us a more natural ordering of the
 50 |         pages than a depth-first traversal.
 51 | 
 52 |         """
 53 | 
 54 |         self.options = options
 55 | 
 56 |         for rewrite in self.options.rewrite:
 57 |             from_, to = rewrite.split ('>')
 58 |             self.redirection_map[from_] = to
 59 | 
 60 |         debug ("Start of retrieval")
 61 | 
 62 |         # enqueue root url
 63 |         
 64 |         attribs = { 'mediatype' : mediatype_hint, 'id': 'start' }
 65 |         self.enqueue (url, 0, attribs)
 66 | 
 67 |         while self.queue:
 68 |             (url, depth, attribs) = self.queue.pop (0)
 69 | 
 70 |             url = self.redirect (url)
 71 |             if url in self.parsed_urls:
 72 |                 continue
 73 |             
 74 |             parser = ParserFactory.ParserFactory.create (url, attribs)
 75 |             self.add_redirection (parser)
 76 |             
 77 |             # if the url was redirected to something we already have
 78 |             url = self.redirect (parser.url)
 79 |             if url in self.parsed_urls:
 80 |                 continue
 81 |             
 82 |             self.parsed_urls.add (url)
 83 |             parser.options = self.options
 84 |             parser.pre_parse ()
 85 |             self.parsers.append (parser)
 86 | 
 87 |             # check potential coverpage for sufficient size
 88 |             if options.coverpage_url is None:
 89 |                 if attribs.get ('rel', '') == 'coverpage':
 90 |                     if hasattr (parser, 'get_image_dimen'):
 91 |                         dimen = parser.get_image_dimen ()
 92 |                         if (dimen[0] * dimen[1]) > COVERPAGE_MIN_AREA:
 93 |                             options.coverpage_url = parser.url
 94 |                             debug ("Setting coverpage: %s ..." % parser.url)
 95 | 
 96 |             depth += 1
 97 | 
 98 |             # look for links in just parsed document
 99 |             debug ("Requesting iterlinks for: %s ..." % url)
100 | 
101 |             for (url, attr) in parser.iterlinks ():
102 |                 # debug ("*** link: %s ..." % url)
103 | 
104 |                 url = urlparse.urldefrag (url)[0]
105 |                 tag = attr.get ('tag', '')
106 | 
107 |                 if tag == NS.xhtml.link:
108 |                     if attr.get ('rel', '').lower () == 'next':
109 |                         self.next.append ((parser.url, url))
110 |                 
111 |                 url = self.redirect (url)
112 | 
113 |                 attribs = { 'mediatype' : attr.get ('type', None) }
114 | 
115 |                 for k in ('id', 'rel'):
116 |                     if k in attr:
117 |                         attribs[k] = attr[k]
118 |                 
119 |                 if tag == NS.xhtml.a:
120 |                     self.enqueue_doc (url, depth, attribs)
121 |                     continue
122 |                 if tag == NS.xhtml.img:
123 |                     self.enqueue_aux (url, depth, attribs)
124 |                     continue
125 |                 if tag == NS.xhtml.object:
126 |                     if ('type' in attr and
127 |                         not self.is_included_mediatype (attr['type'])):
128 |                         continue
129 |                     self.enqueue_aux (url, depth, attribs)
130 |                     continue
131 |                 if tag == NS.xhtml.link:
132 |                     rel = attribs.get ('rel', '').lower ()
133 |                     if 'stylesheet' in rel:
134 |                         self.enqueue_aux (url, depth, attribs)
135 |                     elif rel == 'coverpage':
136 |                         # We may also find the coverpage in <link rel='coverpage' href='url' />
137 |                         self.enqueue_aux (url, depth, attribs)
138 |                     else:
139 |                         self.enqueue_doc (url, depth, attribs)
140 |                     continue
141 |                     
142 |         debug ("End of retrieval")
143 |         
144 |         # rewrite redirected urls
145 |         if self.redirection_map:
146 |             for parser in self.parsers:
147 |                 parser.remap_links (self.redirection_map)
148 | 
149 |         # try a topological sort of documents using <link rel='next'>
150 |         if self.next:
151 |             self.next = map (lambda x: (self.redirect(x[0]), self.redirect(x[1])), self.next)
152 | 
153 |             try:
154 |                 d = {}
155 |                 for order, url in enumerate (gg.topological_sort (self.next)):
156 |                     d[url] = order
157 |                     debug ("%s order %d" % (url, order))
158 |                 for parser in self.parsers:
159 |                     parser.order = d.get (parser.url, 999999)
160 |                 self.parsers.sort (key = lambda p: p.order)
161 |                 
162 |             except StandardError:
163 |                 pass
164 | 
165 | 
166 |     def add_redirection (self, parser):
167 |         """ Remember this redirection. """
168 |         if parser.orig_url != parser.url:
169 |             self.redirection_map[parser.orig_url] = parser.url
170 |             debug ("Adding redirection from %s to %s" % (parser.orig_url, parser.url))
171 | 
172 |         
173 |     def redirect (self, url):
174 |         """ Redirect url if we know the target. """
175 |         return self.redirection_map.get (url, url)
176 | 
177 |         
178 |     def enqueue (self, url, depth, attribs):
179 |         """ Enqueue url for parsing. """
180 |         
181 |         url = self.redirect (url)
182 |         if url in self.enqueued_urls:
183 |             return
184 |         
185 |         debug ("Enqueuing %s ..." % url)
186 |         self.queue.append ((url, depth, attribs))
187 |         self.enqueued_urls.add (url)
188 |         
189 |             
190 |     def enqueue_aux (self, url, depth, attribs):
191 |         """ Enqueue an auxiliary file.
192 | 
193 |         We get auxiliary files even if they are too deep or not in
194 |         'included' directories.
195 | 
196 |         """
197 |         try:
198 |             parser = ParserFactory.ParserFactory.create (url, attribs)
199 |             self.add_redirection (parser)
200 |             if self.is_wanted_aux (parser):
201 |                 self.enqueue (parser.url, depth, attribs)
202 |         except IOError:
203 |             error ("bad aux url: %s" % url)
204 | 
205 |     def enqueue_doc (self, url, depth, attribs):
206 |         """ Enqueue a document file.
207 | 
208 |         We get document files only if they pass document-selection
209 |         rules.
210 | 
211 |         """
212 |         
213 |         if not self.options.max_depth or depth < self.options.max_depth:
214 |             if self.is_included (url):
215 |                 try:
216 |                     parser = ParserFactory.ParserFactory.create (url, attribs)
217 |                     self.add_redirection (parser)
218 |                     if self.is_wanted_doc (parser):
219 |                         self.enqueue (parser.url, depth, attribs)
220 |                 except IOError:
221 |                     error ("bad url: %s" % url)
222 | 
223 | 
224 |     def is_included (self, url):
225 |         """ Return True if this document is eligible. """
226 | 
227 |         included = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.include))
228 |         excluded = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.exclude))
229 | 
230 |         if included and not excluded:
231 |             if self.options.local_files_only:
232 |                 if url.startswith('http:') or url.startswith('https:'):
233 |                     return 0
234 |                 else:
235 |                     return 1 
236 |             return 1
237 | 
238 |         if excluded:
239 |             debug ("Dropping excluded %s" % url)
240 |         if not included:
241 |             debug ("Dropping not included %s" % url)
242 |         return 0
243 |             
244 | 
245 |     def is_included_mediatype (self, mediatype):
246 |         """ Return True if this document is eligible. """
247 | 
248 |         included = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
249 |                              self.options.include_mediatypes))
250 |         excluded = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
251 |                              self.options.exclude_mediatypes))
252 | 
253 |         if included and not excluded:
254 |             self.included_mediatypes.add (mediatype)
255 |             return 1
256 | 
257 |         if excluded:
258 |             debug ("Dropping excluded mediatype %s" % mediatype)
259 |         if not included:
260 |             debug ("Dropping not included mediatype %s" % mediatype)
261 |             
262 |         self.excluded_mediatypes.add (mediatype)
263 |         return 0
264 |             
265 | 
266 |     def has_seen_images (self):
267 |         """ Return True if the spider has encountered images. """
268 | 
269 |         return bool (MediaTypes.IMAGE_MEDIATYPES &
270 |                        (self.included_mediatypes | self.excluded_mediatypes))
271 | 
272 |         
273 |     def dict_urls_mediatypes (self):
274 |         """ Return a dict of all parsed urls and mediatypes. """
275 |         return dict (map (lambda p: (p.url, p.mediatype), self.parsers))
276 |     
277 | 
278 |     def is_wanted_doc (self, parser):
279 |         """ Return True if we ought to parse this content document.
280 | 
281 |         Override this in custom spiders.
282 | 
283 |         """
284 |         return self.is_included_mediatype (parser.mediatype)
285 | 
286 | 
287 |     def is_wanted_aux (self, parser):
288 |         """ Return True if we ought to parse this image or aux file.
289 | 
290 |         Override this in custom spiders.
291 | 
292 |         """
293 |         return self.is_included_mediatype (parser.mediatype)
294 | 
295 | 
296 | 
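
A hedged sketch (not part of the source tree) showing which option attributes Spider.parse () expects; EpubMaker.main () normally fills these in before the writers drive the spider. The file URLs are made-up examples.

    from epubmaker import CommonOptions, ParserFactory
    from epubmaker.Spider import Spider

    op = CommonOptions.get_parser ()
    options, args = CommonOptions.parse_args (op, {}, {'proxies': None})

    options.update ({                   # attributes read by Spider.parse () and its helpers
        'rewrite':              [],
        'include':              ['file:///tmp/book/*'],
        'exclude':              [],
        'include_mediatypes':   ['text/*', 'application/xhtml+xml'],
        'exclude_mediatypes':   [],
        'max_depth':            1,
        'local_files_only':     False,
        'coverpage_url':        None,
        'mediatype_from_extension': False,
        })

    ParserFactory.load_parsers ()

    spider = Spider ()
    spider.parse ('file:///tmp/book/index.html', 'text/html', options)

    for p in spider.parsers:
        print ("%s %s" % (p.mediatype, p.url))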


--------------------------------------------------------------------------------
/epubmaker/Unitame.py:
--------------------------------------------------------------------------------
  1 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Unitame.py
  5 | 
  6 | Copyright 2010 by Marcello Perathoner
  7 | 
  8 | Distributable under the GNU General Public License Version 3 or newer.
  9 | 
 10 | Module to implement the totally superfluous PG plain text conversion
 11 | into long extinct encodings.
 12 | 
 13 | We have to unitame-translate before feeding to nroff because nroff
 14 | does some irreversible (and wrong) translations of its own, like ä ->
 15 | a. Also, some unitame-translations change the number of characters,
 16 | thus throwing already-justified text off.
 17 | 
 18 | We cannot do the translations before feeding the source to docutils
 19 | because if we change the length of titles, we get the warning: Title
 20 | underline too short.
 21 | 
 22 | Translation does some dangerous things, like converting quotes to
 23 | apostrophes, which are command escapes in nroff. We have to escape
 24 | apostrophes in the source text but not the apostrophe commands inserted
 25 | by the converter.
 26 | 
 27 | We also have to translate some important non-ascii characters, like
 28 | nbsp and shy, into command sequences before they reach unitame because
 29 | unitame would convert them into the semantically different space and
 30 | hyphen.
 31 | 
 32 | All this makes translation inside the docutils converter the best
 33 | choice. Implemented as a docutils translator that visits all text
 34 | nodes.
 35 | 
 36 | Smart quote translation should also go into a docutils
 37 | translator. Likewise a translator for text-transform: upper.
 38 | 
 39 | """
 40 | 
 41 | import codecs
 42 | import unicodedata as ud
 43 | 
 44 | # UnitameData is generated from unitame.dat
 45 | from epubmaker.UnitameData import unicode_to_iso_8859_1, iso_8859_1_to_ascii
 46 | 
 47 | # tweak dicts for translate ()
 48 | u2i = dict ( [ (ord (o), s) for o, s in unicode_to_iso_8859_1.iteritems () ] )
 49 | i2a = dict ( [ (ord (o), s) for o, s in iso_8859_1_to_ascii.iteritems () ] )
 50 | 
 51 | u2i.update ( {
 52 |     0x2000:     u' ',    # en quad
 53 |     0x2001:     u'  ',   # em quad
 54 |     0x2002:     u' ',    # en space
 55 |     0x2003:     u'  ',   # em space
 56 |     0x2004:     u' ',    # 3/em space
 57 |     0x2005:     u'',     # 4/em
 58 |     0x2006:     u'',     # 6/em
 59 |     0x2007:     u' ',    # figure space
 60 |     0x2008:     u'',     # punctuation space
 61 |     0x2009:     u'',     # thin space
 62 |     0x200a:     u'',     # hair space
 63 |     0x200b:     u'',     # zero space
 64 |     0x200c:     u'',     # zwnj
 65 |     0x200d:     u'',     # zwj
 66 |     0x2010:     u'-',    # hyphen
 67 |     0x2011:     u'-',    # non-breaking hyphen
 68 |     0x2012:     u'-',    # figure-dash
 69 |     0x2013:     u'-',    # en dash
 70 |     0x2014:     u'--',   # em dash
 71 |     0x2015:     u'-',    # horizontal bar
 72 |     0x2026:     u'...',  # horizontal ellipsis
 73 |     ord (u'™'): u'(tm)',
 74 |     ord (u'‹'): u'<',
 75 |     ord (u'›'): u'>',
 76 |     ord (u'†'): u'+',
 77 |     ord (u'‡'): u'++',
 78 |     ord (u'⁑'): u'**',
 79 |     ord (u'⁂'): u'***',
 80 |     ord (u'•'): u'-',
 81 |     ord (u'′'): u'´',
 82 |     ord (u'″'): u'´´',
 83 |     ord (u'‴'): u'´´´',
 84 |     ord (u'⁗'): u'´´´´',
 85 |     ord (u'⁓'): u'~',
 86 |     ord (u'‰'): u'%o',
 87 |     ord (u'‱'): u'%oo',
 88 |     ord (u'⚹'): u'*',    # U+26b9 sextile
 89 |     ord (u'⁰'): u'^0',
 90 |     ord (u'⁴'): u'^4',
 91 |     ord (u'⁵'): u'^5',
 92 |     ord (u'⁶'): u'^6',
 93 |     ord (u'⁷'): u'^7',
 94 |     ord (u'⁸'): u'^8',
 95 |     ord (u'⁹'): u'^9',
 96 |     } )
 97 | 
 98 | # somehow cram these into ascii, so the ppers stop whining about not
 99 | # having nbsp in ascii, then fix it later by replacing them with nroff
100 | # commands.
101 | 
102 | i2a.update ( {
103 |     ord (u'¹'): u'^1',
104 |     ord (u'²'): u'^2',
105 |     ord (u'³'): u'^3',
106 |     0x00a0:     u'\u0011',       # nbsp => DC1
107 |     0x00ad:     u'\u0012',       # shy  => DC2
108 | } )
109 | 
110 | unhandled_chars = []
111 | 
112 | def strip_accents (text):
113 |     """ Strip accents from string. 
114 | 
115 |     If the accented character doesn't fit into the encoding, 
116 |     remove the accent and try again.
117 | 
118 |     """
119 |     return ud.normalize ('NFKC', 
120 |                          filter (lambda c: ud.category (c) != 'Mn', 
121 |                                  ud.normalize ('NFKD', text)))
122 | 
123 | 
124 | def unitame (exc):
125 |     """
126 |     Encoding error handler.
127 | 
128 |     The encoder handles all compatible characters itself.  It calls
129 |     this function whenever it encounters a character it cannot encode.
130 |     This function searches the unitame database for a replacement.
131 | 
132 | 
133 |     """
134 | 
135 |     l = []
136 |     for cc in exc.object[exc.start:exc.end]:
137 |         c = cc
138 |         if exc.encoding == 'latin-1': # python name for iso-8859-1
139 |             c = c.translate (u2i)
140 |             c = strip_accents (c)
141 |             if c and ord (max (c)) < 256:
142 |                 l.append (c)
143 |                 c = None
144 |         elif exc.encoding == 'ascii': # python name for us-ascii
145 |             # "1¼" -> "1 1/4"
146 |             if cc in u'¼½¾':
147 |                 if exc.start > 0 and exc.object[exc.start - 1] in u'0123456789':
148 |                     l.append (' ')
149 |             c = c.translate (u2i)
150 |             c = c.translate (i2a)
151 |             c = strip_accents (c)
152 |             if c and ord (max (c)) < 128:
153 |                 l.append (c)
154 |                 c = None
155 | 
156 |         if c:
157 |             l.append ('{~%s U+%04x~}' % (ud.name (cc), ord (cc)))
158 |             unhandled_chars.append (cc)  # record only the character we could not handle
159 | 
160 |     return (u"".join (l), exc.end)
161 | 
162 | 
163 | codecs.register_error ('unitame', unitame)
164 | 
165 | 
166 | 
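A minimal usage sketch for the 'unitame' error handler registered above (illustrative only; assumes Python 2, like the module itself, and uses a made-up sample string):

    # -*- coding: utf-8 -*-
    import epubmaker.Unitame       # importing the module registers the 'unitame' error handler

    text = u'Œuvres — 1½ «texte»'
    text.encode ('ascii', 'unitame')
    # -> 'OEuvres -- 1 1/2 "texte"'
    text.encode ('latin-1', 'unitame')
    # 'Œ' and the em dash are replaced; '½', '«' and '»' are valid latin-1 and pass through

Characters the handler cannot map are collected in Unitame.unhandled_chars, which the nroff writer reports after translating a document.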


--------------------------------------------------------------------------------
/epubmaker/UnitameData.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | unicode_to_iso_8859_1 = {
  5 |     u'Đ': u'D', # LATIN CAPITAL LETTER D WITH STROKE
  6 |     u'đ': u'd', # LATIN SMALL LETTER D WITH STROKE
  7 |     u'Ħ': u'H', # LATIN CAPITAL LETTER H WITH STROKE
  8 |     u'ħ': u'h', # LATIN SMALL LETTER H WITH STROKE
  9 |     u'Ŀ': u'L', # LATIN CAPITAL LETTER L WITH MIDDLE DOT
 10 |     u'ŀ': u'l', # LATIN SMALL LETTER L WITH MIDDLE DOT
 11 |     u'Ł': u'L', # LATIN CAPITAL LETTER L WITH STROKE
 12 |     u'ł': u'l', # LATIN SMALL LETTER L WITH STROKE
 13 |     u'ʼn': u'n', # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
 14 |     u'Œ': u'OE', # LATIN CAPITAL LIGATURE OE
 15 |     u'œ': u'oe', # LATIN SMALL LIGATURE OE
 16 |     u'Ŧ': u'T', # LATIN CAPITAL LETTER T WITH STROKE
 17 |     u'ŧ': u't', # LATIN SMALL LETTER T WITH STROKE
 18 |     u'ƀ': u'b', # LATIN SMALL LETTER B WITH STROKE
 19 |     u'Ɓ': u'B', # LATIN CAPITAL LETTER B WITH HOOK
 20 |     u'Ƃ': u'B', # LATIN CAPITAL LETTER B WITH TOPBAR
 21 |     u'ƃ': u'b', # LATIN SMALL LETTER B WITH TOPBAR
 22 |     u'Ɔ': u'O', # LATIN CAPITAL LETTER OPEN O
 23 |     u'Ƈ': u'C', # LATIN CAPITAL LETTER C WITH HOOK
 24 |     u'ƈ': u'c', # LATIN SMALL LETTER C WITH HOOK
 25 |     u'Ɗ': u'D', # LATIN CAPITAL LETTER D WITH HOOK
 26 |     u'Ƌ': u'D', # LATIN CAPITAL LETTER D WITH TOPBAR
 27 |     u'ƌ': u'd', # LATIN SMALL LETTER D WITH TOPBAR
 28 |     u'Ƒ': u'F', # LATIN CAPITAL LETTER F WITH HOOK
 29 |     u'ƒ': u'f', # LATIN SMALL LETTER F WITH HOOK
 30 |     u'Ɠ': u'G', # LATIN CAPITAL LETTER G WITH HOOK
 31 |     u'Ɨ': u'I', # LATIN CAPITAL LETTER I WITH STROKE
 32 |     u'Ƙ': u'K', # LATIN CAPITAL LETTER K WITH HOOK
 33 |     u'ƙ': u'k', # LATIN SMALL LETTER K WITH HOOK
 34 |     u'ƚ': u'l', # LATIN SMALL LETTER L WITH BAR
 35 |     u'Ɲ': u'N', # LATIN CAPITAL LETTER N WITH LEFT HOOK
 36 |     u'ƞ': u'n', # LATIN SMALL LETTER N WITH LONG RIGHT LEG
 37 |     u'Ɵ': u'O', # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
 38 |     u'Ƥ': u'P', # LATIN CAPITAL LETTER P WITH HOOK
 39 |     u'ƥ': u'p', # LATIN SMALL LETTER P WITH HOOK
 40 |     u'ƫ': u't', # LATIN SMALL LETTER T WITH PALATAL HOOK
 41 |     u'Ƭ': u'T', # LATIN CAPITAL LETTER T WITH HOOK
 42 |     u'ƭ': u't', # LATIN SMALL LETTER T WITH HOOK
 43 |     u'Ʈ': u'T', # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
 44 |     u'Ʋ': u'V', # LATIN CAPITAL LETTER V WITH HOOK
 45 |     u'Ƴ': u'Y', # LATIN CAPITAL LETTER Y WITH HOOK
 46 |     u'ƴ': u'y', # LATIN SMALL LETTER Y WITH HOOK
 47 |     u'Ƶ': u'Z', # LATIN CAPITAL LETTER Z WITH STROKE
 48 |     u'ƶ': u'z', # LATIN SMALL LETTER Z WITH STROKE
 49 |     u'Lj': u'L', # LATIN CAPITAL LETTER L WITH SMALL LETTER J
 50 |     u'Nj': u'N', # LATIN CAPITAL LETTER N WITH SMALL LETTER J
 51 |     u'Ǣ': u'AE', # LATIN CAPITAL LETTER AE WITH MACRON
 52 |     u'ǣ': u'ae', # LATIN SMALL LETTER AE WITH MACRON
 53 |     u'Ǥ': u'G', # LATIN CAPITAL LETTER G WITH STROKE
 54 |     u'ǥ': u'g', # LATIN SMALL LETTER G WITH STROKE
 55 |     u'Dz': u'D', # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
 56 |     u'Ǽ': u'AE', # LATIN CAPITAL LETTER AE WITH ACUTE
 57 |     u'ǽ': u'ae', # LATIN SMALL LETTER AE WITH ACUTE
 58 |     u'Ǿ': u'O', # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
 59 |     u'ǿ': u'o', # LATIN SMALL LETTER O WITH STROKE AND ACUTE
 60 |     u'Ƞ': u'N', # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
 61 |     u'ȡ': u'd', # LATIN SMALL LETTER D WITH CURL
 62 |     u'Ȥ': u'Z', # LATIN CAPITAL LETTER Z WITH HOOK
 63 |     u'ȥ': u'z', # LATIN SMALL LETTER Z WITH HOOK
 64 |     u'ȴ': u'l', # LATIN SMALL LETTER L WITH CURL
 65 |     u'ȵ': u'n', # LATIN SMALL LETTER N WITH CURL
 66 |     u'ȶ': u't', # LATIN SMALL LETTER T WITH CURL
 67 |     u'ɓ': u'b', # LATIN SMALL LETTER B WITH HOOK
 68 |     u'ɕ': u'c', # LATIN SMALL LETTER C WITH CURL
 69 |     u'ɖ': u'd', # LATIN SMALL LETTER D WITH TAIL
 70 |     u'ɗ': u'd', # LATIN SMALL LETTER D WITH HOOK
 71 |     u'ɠ': u'g', # LATIN SMALL LETTER G WITH HOOK
 72 |     u'ɦ': u'h', # LATIN SMALL LETTER H WITH HOOK
 73 |     u'ɨ': u'i', # LATIN SMALL LETTER I WITH STROKE
 74 |     u'ɫ': u'l', # LATIN SMALL LETTER L WITH MIDDLE TILDE
 75 |     u'ɬ': u'l', # LATIN SMALL LETTER L WITH BELT
 76 |     u'ɭ': u'l', # LATIN SMALL LETTER L WITH RETROFLEX HOOK
 77 |     u'ɱ': u'm', # LATIN SMALL LETTER M WITH HOOK
 78 |     u'ɲ': u'n', # LATIN SMALL LETTER N WITH LEFT HOOK
 79 |     u'ɳ': u'n', # LATIN SMALL LETTER N WITH RETROFLEX HOOK
 80 |     u'ɼ': u'r', # LATIN SMALL LETTER R WITH LONG LEG
 81 |     u'ɽ': u'r', # LATIN SMALL LETTER R WITH TAIL
 82 |     u'ɾ': u'r', # LATIN SMALL LETTER R WITH FISHHOOK
 83 |     u'ʂ': u's', # LATIN SMALL LETTER S WITH HOOK
 84 |     u'ʈ': u't', # LATIN SMALL LETTER T WITH RETROFLEX HOOK
 85 |     u'ʉ': u'u', # LATIN SMALL LETTER U BAR
 86 |     u'ʋ': u'v', # LATIN SMALL LETTER V WITH HOOK
 87 |     u'ʐ': u'z', # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
 88 |     u'ʑ': u'z', # LATIN SMALL LETTER Z WITH CURL
 89 |     u'ʜ': u'H', # LATIN LETTER SMALL CAPITAL H
 90 |     u'ʝ': u'j', # LATIN SMALL LETTER J WITH CROSSED-TAIL
 91 |     u'ʠ': u'q', # LATIN SMALL LETTER Q WITH HOOK
 92 |     u'ʮ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK
 93 |     u'ʯ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
 94 |     u'Ѝ': u'I', # CYRILLIC CAPITAL LETTER I WITH GRAVE
 95 |     u'ѝ': u'i', # CYRILLIC SMALL LETTER I WITH GRAVE
 96 |     u'Ӑ': u'A', # CYRILLIC CAPITAL LETTER A WITH BREVE
 97 |     u'ӑ': u'a', # CYRILLIC SMALL LETTER A WITH BREVE
 98 |     u'Ӓ': u'A', # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
 99 |     u'ӓ': u'a', # CYRILLIC SMALL LETTER A WITH DIAERESIS
100 |     u'Ӣ': u'I', # CYRILLIC CAPITAL LETTER I WITH MACRON
101 |     u'ӣ': u'i', # CYRILLIC SMALL LETTER I WITH MACRON
102 |     u'Ӥ': u'I', # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
103 |     u'ӥ': u'i', # CYRILLIC SMALL LETTER I WITH DIAERESIS
104 |     u'Ӧ': u'O', # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
105 |     u'ӧ': u'o', # CYRILLIC SMALL LETTER O WITH DIAERESIS
106 |     u'Ӭ': u'E', # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
107 |     u'ӭ': u'e', # CYRILLIC SMALL LETTER E WITH DIAERESIS
108 |     u'Ӯ': u'U', # CYRILLIC CAPITAL LETTER U WITH MACRON
109 |     u'ӯ': u'u', # CYRILLIC SMALL LETTER U WITH MACRON
110 |     u'Ӱ': u'U', # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
111 |     u'ӱ': u'u', # CYRILLIC SMALL LETTER U WITH DIAERESIS
112 |     u'Ӳ': u'U', # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
113 |     u'ӳ': u'u', # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
114 |     u'ẚ': u'a', # LATIN SMALL LETTER A WITH RIGHT HALF RING
115 |     u'‐': u'-', # HYPHEN
116 |     u'–': u'-', # EN DASH
117 |     u'—': u'--', # EM DASH
118 |     u'‖': u'||', # DOUBLE VERTICAL LINE
119 |     u'‗': u'_', # DOUBLE LOW LINE
120 |     u'‘': u'\'', # LEFT SINGLE QUOTATION MARK
121 |     u'’': u'\'', # RIGHT SINGLE QUOTATION MARK
122 |     u'‚': u'\'', # SINGLE LOW-9 QUOTATION MARK
123 |     u'‛': u'\'', # SINGLE HIGH-REVERSED-9 QUOTATION MARK
124 |     u'“': u'"', # LEFT DOUBLE QUOTATION MARK
125 |     u'”': u'"', # RIGHT DOUBLE QUOTATION MARK
126 |     u'„': u'"', # DOUBLE LOW-9 QUOTATION MARK
127 |     u'‟': u'"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
128 |     u'⁅': u'[', # LEFT SQUARE BRACKET WITH QUILL
129 |     u'⁆': u']', # RIGHT SQUARE BRACKET WITH QUILL
130 | }
131 | 
132 | 
133 | iso_8859_1_to_ascii = {
134 |     u'¡': u'i', # INVERTED EXCLAMATION MARK
135 |     u'¢': u'c', # CENT SIGN
136 |     u'£': u'L', # POUND SIGN
137 |     u'¥': u'Y', # YEN SIGN
138 |     u'¦': u'|', # BROKEN BAR
139 |     u'§': u'Sec.', # SECTION SIGN
140 |     u'¨': u'"', # DIAERESIS
141 |     u'©': u'(C)', # COPYRIGHT SIGN
142 |     u'«': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
143 |     u'­': u'-', # SOFT HYPHEN
144 |     u'®': u'(R)', # REGISTERED SIGN
145 |     u'¯': u'-', # MACRON
146 |     u'°': u' deg.', # DEGREE SIGN
147 |     u'±': u'+-', # PLUS-MINUS SIGN
148 |     u'²': u'^2', # SUPERSCRIPT TWO
149 |     u'³': u'^3', # SUPERSCRIPT THREE
150 |     u'´': u'\'', # ACUTE ACCENT
151 |     u'µ': u' mu', # MICRO SIGN
152 |     u'·': u'.', # MIDDLE DOT
153 |     u'»': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
154 |     u'¼': u'1/4', # VULGAR FRACTION ONE QUARTER
155 |     u'½': u'1/2', # VULGAR FRACTION ONE HALF
156 |     u'¾': u'3/4', # VULGAR FRACTION THREE QUARTERS
157 |     u'¿': u'?', # INVERTED QUESTION MARK
158 |     u'Ä': u'Ae', # LATIN CAPITAL LETTER A WITH DIAERESIS
159 |     u'Æ': u'AE', # LATIN CAPITAL LETTER AE
160 |     u'Ð': u'Eth', # LATIN CAPITAL LETTER ETH
161 |     u'Ö': u'Oe', # LATIN CAPITAL LETTER O WITH DIAERESIS
162 |     u'×': u'x', # MULTIPLICATION SIGN
163 |     u'Ø': u'O', # LATIN CAPITAL LETTER O WITH STROKE
164 |     u'Ü': u'Ue', # LATIN CAPITAL LETTER U WITH DIAERESIS
165 |     u'ß': u'ss', # LATIN SMALL LETTER SHARP S
166 |     u'ä': u'ae', # LATIN SMALL LETTER A WITH DIAERESIS
167 |     u'æ': u'ae', # LATIN SMALL LETTER AE
168 |     u'ð': u'eth', # LATIN SMALL LETTER ETH
169 |     # u'ñ': u'ny', # LATIN SMALL LETTER N WITH TILDE
170 |     u'ö': u'oe', # LATIN SMALL LETTER O WITH DIAERESIS
171 |     u'÷': u'/', # DIVISION SIGN
172 |     u'ø': u'o', # LATIN SMALL LETTER O WITH STROKE
173 |     u'ü': u'ue', # LATIN SMALL LETTER U WITH DIAERESIS
174 | }
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------
/epubmaker/Version.py:
--------------------------------------------------------------------------------
1 | VERSION = '0.3.26'
2 | GENERATOR = 'EpubMaker %s <https://github.com/gitenberg-dev/pg-epubmaker>'
3 | 


--------------------------------------------------------------------------------
/epubmaker/WriterFactory.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | WriterFactory.py
 7 | 
 8 | Copyright 2009-14 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Writer factory. Dynamically loads writers from directories.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import os.path
19 | 
20 | from pkg_resources import resource_isdir, resource_listdir # pylint: disable=E0611
21 | 
22 | from epubmaker.lib.Logger import debug
23 | 
24 | writers = {}
25 | 
26 | def __load_writers_from (package_name):
27 |     """ See what types we can write. """
28 | 
29 |     try:
30 |         for fn in resource_listdir (package_name, ''):
31 |             modulename, ext = os.path.splitext (fn)
32 |             if ext == '.py':
33 |                 if modulename.endswith ('Writer'):
34 |                     type_ = modulename.lower ().replace ('writer', '')
35 |                     debug ("Loading writer type %s from module %s" % (type_, modulename))
36 |                     module = __import__ (package_name + '.' + modulename, fromlist = [modulename])
37 |                     writers[type_] = module
38 | 
39 |     except ImportError:
40 |         pass
41 | 
42 | 
43 | def load_writers ():
44 |     """ See what types we can write. """
45 | 
46 |     __load_writers_from ('epubmaker.writers')
47 |     __load_writers_from ('epubmaker.writers.ibiblio')
48 | 
49 |     return writers.keys ()
50 | 
51 | 
52 | def unload_writers ():
53 |     """ Unload writer modules. """
54 |     for k in writers.keys ():
55 |         del writers[k]
56 | 
57 | 
58 | def create (type_):
59 |     """ Load writer module for type. """
60 | 
61 |     try:
62 |         return writers[type_].Writer ()
63 |     except KeyError:
64 |         raise KeyError ('No writer for type %s' % type_)
65 | 
66 | 
67 | 
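A minimal sketch of driving the factory above (illustrative; assumes the epubmaker.writers package contains at least one module whose name ends in 'Writer', e.g. one providing an 'epub' type):

    from epubmaker import WriterFactory

    types = WriterFactory.load_writers ()        # e.g. ['epub', 'html', 'txt', ...]
    if 'epub' in types:
        writer = WriterFactory.create ('epub')   # instantiates that module's Writer class

create () raises KeyError for a type that no loaded module provides.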


--------------------------------------------------------------------------------
/epubmaker/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/lib/GutenbergGlobals.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | GutenbergGlobals.py
  6 | 
  7 | Copyright 2009 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | This module has sadly decayed into a repository for all sorts of cruft.
 12 | 
 13 | FIXME: refactor this module
 14 | 
 15 | """
 16 | 
 17 | import os
 18 | import re
 19 | import datetime
 20 | 
 21 | class Struct (object):
 22 |     """ handy class to pin attributes on
 23 | 
 24 |     usage: c = Struct ()
 25 |            c.something = 1
 26 | 
 27 |     """
 28 |     pass
 29 | 
 30 | 
 31 | NSMAP = {
 32 |     'atom':       'http://www.w3.org/2005/Atom',
 33 |     'bio':        'http://purl.org/vocab/bio/0.1/',
 34 |     'cc':         'http://web.resource.org/cc/',
 35 |     'dc':         'http://purl.org/dc/elements/1.1/',
 36 |     'dcam':       'http://purl.org/dc/dcam/',
 37 |     'dcmitype':   'http://purl.org/dc/dcmitype/',
 38 |     'dcterms':    'http://purl.org/dc/terms/',
 39 |     'ebook':      'http://www.gutenberg.org/ebooks/',             # URL
 40 |     'foaf':       'http://xmlns.com/foaf/0.1/',
 41 |     'marcrel':    'http://id.loc.gov/vocabulary/relators',
 42 |     'mathml':     'http://www.w3.org/1998/Math/MathML',
 43 |     'mbp':        'http://mobipocket.com/mbp',
 44 |     'ncx':        'http://www.daisy.org/z3986/2005/ncx/',
 45 |     'opds':       'http://opds-spec.org/2010/Catalog',
 46 |     'opf':        'http://www.idpf.org/2007/opf',
 47 |     'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
 48 |     'pg':         'http://www.gutenberg.org/',                    # URL
 49 |     'pgagents':   'http://www.gutenberg.org/2009/agents/',
 50 |     'pgtei':      'http://www.gutenberg.org/tei/marcello/0.5/ns',
 51 |     'pgterms':    'http://www.gutenberg.org/2009/pgterms/',
 52 |     'py':         'http://genshi.edgewall.org/',
 53 |     'rdf':        'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
 54 |     'rdfs':       'http://www.w3.org/2000/01/rdf-schema#',
 55 |     'svg':        'http://www.w3.org/2000/svg',
 56 |     'tei':        'http://www.tei-c.org/ns/1.0',
 57 |     'xhtml':      'http://www.w3.org/1999/xhtml',
 58 |     'xinclude':   'http://www.w3.org/2001/XInclude',
 59 |     'xml':        'http://www.w3.org/XML/1998/namespace',
 60 |     'xmlns':      'http://www.w3.org/2000/xmlns/',
 61 |     'xsd':        'http://www.w3.org/2001/XMLSchema#',
 62 |     'xsi':        'http://www.w3.org/2001/XMLSchema-instance',
 63 |     'xslfo':      'http://www.w3.org/1999/XSL/Format',
 64 | }
 65 | 
 66 | 
 67 | class NameSpaceClark (object):
 68 |     """ Build a tag name in Clark notation.
 69 | 
 70 |     >>> ns = NameSpaceClark ("http://example.com/")
 71 |     >>> ns.foo
 72 |     '{http://example.com/}foo'
 73 |     >>> ns['bar']
 74 |     '{http://example.com/}bar'
 75 | 
 76 |     """
 77 | 
 78 |     def __init__ (self, root):
 79 |         self.root = root
 80 | 
 81 |     def __getitem__ (self, local):
 82 |         return "{%s}%s" % (self.root, local)
 83 | 
 84 |     def __getattr__ (self, local):
 85 |         return "{%s}%s" % (self.root, local)
 86 | 
 87 |     def __str__ (self):
 88 |         return self.root
 89 | 
 90 | 
 91 | class NameSpaceURI (object):
 92 |     """ Build a URI.
 93 | 
 94 |     >>> ns = NameSpaceURI ("http://example.com/")
 95 |     >>> ns.foo
 96 |     'http://example.com/foo'
 97 |     >>> ns['bar']
 98 |     'http://example.com/bar'
 99 | 
100 |     """
101 | 
102 |     def __init__ (self, root):
103 |         self.root = root
104 | 
105 |     def __getitem__ (self, local):
106 |         return "%s%s" % (self.root, local)
107 | 
108 |     def __getattr__ (self, local):
109 |         return "%s%s" % (self.root, local)
110 | 
111 |     def __str__ (self):
112 |         return self.root
113 | 
114 | 
115 | def build_nsmap (prefixes = None):
116 |     """ build a nsmap containing all namespaces for prefixes """
117 | 
118 |     if prefixes is None:
119 |         prefixes = NSMAP.keys ()
120 |     if isinstance (prefixes, str):
121 |         prefixes = prefixes.split ()
122 | 
123 |     ns = {}
124 |     for prefix in prefixes:
125 |         ns[prefix] = NSMAP[prefix]
126 | 
127 |     return ns
128 | 
129 | 
130 | NS = Struct ()
131 | NSURI = Struct ()
132 | 
133 | for prefix, uri in NSMAP.items ():
134 |     setattr (NS, prefix, NameSpaceClark (uri))
135 |     setattr (NSURI, prefix, NameSpaceURI (uri))
136 | 
137 | XML_DECLARATION = """<?xml version='1.0' encoding='UTF-8'?>"""
138 | 
139 | XHTML_DOCTYPE   = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' " +  
140 |                    "'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>")
141 | 
142 | XHTML1_DOCTYPE   = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' " +  
143 |                    "'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>")
144 | 
145 | XHTML_RDFa_DOCTYPE = ("<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML+RDFa 1.0//EN' " +
146 |                       "'http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd'>")
147 | 
148 | NCX_DOCTYPE = ("<!DOCTYPE ncx PUBLIC '-//NISO//DTD ncx 2005-1//EN' " +
149 |                "'http://www.daisy.org/z3986/2005/ncx-2005-1.dtd'>")
150 | 
151 | GENERATOR = 'EpubMaker by Marcello Perathoner <https://github.com/gitenberg-dev/pg-epubmaker>'
152 | 
153 | 
154 | def xmlspecialchars (s):
155 |     return (s.replace (u'&',  u'&amp;')
156 |              .replace (u'<',  u'&lt;')
157 |              .replace (u'>',  u'&gt;'))
158 | 
159 | def insert_breaks (s):
160 |     return s.replace (u'\n',  u'<br />')
161 | 
162 | RE_NORMALIZE    = re.compile (r"\s+")
163 | 
164 | def normalize (s):
165 |     s = RE_NORMALIZE.sub (' ', s)
166 |     return s.strip ()
167 | 
168 | 
169 | def cut_at_newline (text):
170 |     """ Cut the text at the first newline. """
171 |     i = text.find ('\n')
172 |     if i > -1:
173 |         return text[:i]
174 |     return text
175 | 
176 | def archive_dir (ebook):
177 |     """ build 1/2/3/4/12345 for 12345 """
178 |     ebook = str (ebook)
179 |     a = []
180 |     for c in ebook:
181 |         a.append (c)
182 |     a[-1] = ebook
183 |     return "/".join (a)
184 | 
185 | def archive2files (ebook, path):
186 |     adir = archive_dir (ebook)
187 |     return path.replace ('dirs/' + adir, 'files/%d' % ebook)
188 | 
189 | 
190 | def xpath (node, path, **kwargs):
191 |     """ xpath helper """
192 |     return node.xpath (path, namespaces = NSMAP, **kwargs)
193 | 
194 | 
195 | def mkdir_for_filename (fn):
196 |     """ Make sure the directory for this file is present. """
197 | 
198 |     try:
199 |         os.makedirs (os.path.dirname (fn))
200 |     except os.error:
201 |         pass
202 | 
203 | 
204 | def make_url_relative (base_url, url):
205 |     """ Make absolute url relative to base_url if possible. """
206 | 
207 |     if (url.startswith (base_url)):
208 |         return url[len (base_url):]
209 | 
210 |     base_url = os.path.dirname (base_url) + '/'
211 | 
212 |     if (url.startswith (base_url)):
213 |         return url[len (base_url):]
214 | 
215 |     return url
216 | 
217 | 
218 | def normalize_path (path):
219 |     """ Normalize a file path. """
220 |     if path.startswith ('file://'):
221 |         path = path[7:]
222 |     return path
223 |         
224 | def is_same_path (path1, path2):
225 |     """ Does path1 point to the same file as path2? """
226 |     return os.path.realpath (normalize_path (path1)) == os.path.realpath (normalize_path (path2))
227 | 
228 | 
229 | def string_to_filename (fn):
230 |     """ Sanitize string so it can do as filename. """
231 | 
232 |     def escape (matchobj):
233 |         """ Escape a char. """
234 |         return '@%x' % ord (matchobj.group (0))
235 | 
236 |     fn = os.path.normpath (fn)
237 |     fn = normalize (fn)
238 |     fn = fn.replace (os.sep, '@')
239 |     if os.altsep:
240 |         fn = fn.replace (os.altsep, '@')
241 |     fn = re.sub (u'[\|/:?"*<>\u0000-\u001F]', escape, fn)
242 | 
243 |     return fn
244 |     
245 | 
246 | class DCIMT (object):
247 |     """ encapsulates one dcterms internet mimetype 
248 | 
249 |     """
250 | 
251 |     def __init__ (self, mime, enc = None):
252 |         if mime is None:
253 |             self.mimetype = 'application/octet-stream'
254 |         elif enc is not None and mime.startswith ('text/'):
255 |             self.mimetype = "%s; charset=%s" % (mime, enc)
256 |         else:
257 |             self.mimetype = mime
258 |     
259 |     def __str__ (self):
260 |         return self.mimetype
261 |     
262 | 
263 | class UTC (datetime.tzinfo):
264 |     """ UTC helper for datetime.datetime """
265 | 
266 |     def utcoffset (self, dummy_dt):
267 |         return datetime.timedelta (0)
268 | 
269 |     def tzname (self, dummy_dt):
270 |         return "UTC"
271 | 
272 |     def dst (self, dummy_dt):
273 |         return datetime.timedelta (0)
274 | 
275 | # exceptions
276 | 
277 | class SkipOutputFormat (Exception):
278 |     pass
279 | 
280 | # Spider.py tries a topological sort on link rel=next
281 | def topological_sort (pairlist):
282 |     """Topologically sort a list of (parent, child) pairs.
283 | 
284 |     Return a list of the elements in dependency order (parent to child order).
285 | 
286 |     >>> print topological_sort( [(1,2), (3,4), (5,6), (1,3), (1,5), (1,6), (2,5)] )
287 |     [1, 2, 3, 5, 4, 6]
288 | 
289 |     >>> print topological_sort( [(1,2), (1,3), (2,4), (3,4), (5,6), (4,5)] )
290 |     [1, 2, 3, 4, 5, 6]
291 | 
292 |     >>> print topological_sort( [(1,2), (2,3), (3,2)] )
293 |     Traceback (most recent call last):
294 |     Exception: ([1], {2: 1, 3: 1}, {2: [3], 3: [2]})
295 |  
296 |     """
297 |     num_parents = {}  # element -> # of predecessors 
298 |     children = {}  # element -> list of successors 
299 |     for parent, child in pairlist: 
300 |         # Make sure every element is a key in num_parents.
301 |         if not num_parents.has_key( parent ): 
302 |             num_parents[parent] = 0 
303 |         if not num_parents.has_key( child ): 
304 |             num_parents[child] = 0 
305 | 
306 |         # Since child has a parent, increment child's num_parents count.
307 |         num_parents[child] += 1
308 | 
309 |         # ... and parent gains a child.
310 |         children.setdefault(parent, []).append(child)
311 | 
312 |     # Suck up everything without a parent.
313 |     answer = [x for x in num_parents.keys() if num_parents[x] == 0]
314 | 
315 |     # For everything in answer, knock down the parent count on its children.
316 |     # Note that answer grows *in* the loop.
317 |     for parent in answer: 
318 |         del num_parents[parent]
319 |         if children.has_key( parent ): 
320 |             for child in children[parent]: 
321 |                 num_parents[child] -= 1
322 |                 if num_parents[child] == 0: 
323 |                     answer.append( child ) 
324 |             # Following "del" isn't needed; just makes 
325 |             # CycleError details easier to grasp.
326 |             del children[parent]
327 | 
328 |     if num_parents: 
329 |         # Everything in num_parents has at least one child -> 
330 |         # there's a cycle.
331 |         raise Exception (answer, num_parents, children)
332 |     return answer 
333 | 
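A few usage sketches for the helpers above (illustrative; the URLs and ebook number are made up):

    from epubmaker.lib import GutenbergGlobals as gg

    gg.NS.xhtml.div          # -> '{http://www.w3.org/1999/xhtml}div'   (Clark notation)
    gg.NSURI.dc['title']     # -> 'http://purl.org/dc/elements/1.1/title'
    gg.archive_dir (12345)   # -> '1/2/3/4/12345'
    gg.make_url_relative ('http://example.com/dir/book.html',
                          'http://example.com/dir/images/p1.png')
                             # -> 'images/p1.png'
    gg.string_to_filename ('a/b:c')   # separators and unsafe chars become '@...' escapes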


--------------------------------------------------------------------------------
/epubmaker/lib/Logger.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | Logger.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Logging support.
12 | 
13 | 
14 | """
15 | 
16 | import logging
17 | from logging import debug, info, warn, error, critical, exception
18 | 
19 | LOGFORMAT = '%(asctime)s %(levelname)-8s  #%(ebook)-5d %(message)s'
20 | 
21 | ebook = 0 # global
22 | 
23 | class CustomFormatter (logging.Formatter):
24 |     """ A custom formatter that adds ebook no. """
25 |     
26 |     def format (self, record):
27 |         """ Add ebook no. to string format params. """
28 |         record.ebook = ebook
29 |         return logging.Formatter.format (self, record)
30 |         
31 |     
32 | def setup (logformat, logfile = None):
33 |     """ Setup logger. """
34 | 
35 |     # StreamHandler defaults to sys.stderr
36 |     handler = logging.FileHandler (logfile) if logfile else logging.StreamHandler ()
37 |     handler.setFormatter (CustomFormatter (logformat))
38 |     logging.getLogger ().addHandler (handler)
39 |     logging.getLogger ().setLevel (logging.INFO)
40 |     
41 | 
42 | def set_log_level (level):
43 |     """ Set log level. """
44 |     if level >= 1:
45 |         logging.getLogger ().setLevel (logging.INFO)
46 |     if level >= 2:
47 |         logging.getLogger ().setLevel (logging.DEBUG)
48 | 
49 | 
50 | __all__ = 'debug info warn error critical exception'.split ()
51 | 
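A minimal setup sketch for this logger (illustrative; the ebook number and message are made up):

    from epubmaker.lib import Logger
    from epubmaker.lib.Logger import info

    Logger.setup (Logger.LOGFORMAT)   # no logfile argument -> log to stderr
    Logger.set_log_level (1)          # 1 = INFO, 2 = DEBUG
    Logger.ebook = 12345              # ebook no. that CustomFormatter adds to every line
    info ("starting conversion of %s", "book.rst")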


--------------------------------------------------------------------------------
/epubmaker/lib/MediaTypes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | MediaTypes.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Media Types Lists
12 | 
13 | """
14 | 
15 | import mimetypes
16 | 
17 | mimetypes.init ()
18 | 
19 | # overrides
20 | 
21 | mimetypes.types_map['.htm']     = 'application/xhtml+xml'
22 | mimetypes.types_map['.html']    = 'application/xhtml+xml'
23 | mimetypes.types_map['.xhtml']   = 'application/xhtml+xml'
24 | mimetypes.types_map['.mobile']  = 'application/xhtml+xml'
25 | mimetypes.types_map['.ncx']     = 'application/x-dtbncx+xml'
26 | mimetypes.types_map['.pt']      = 'application/vnd.adobe-page-template+xml'
27 | mimetypes.types_map['.epub']    = 'application/epub+zip'
28 | mimetypes.types_map['.mobi']    = 'application/x-mobipocket-ebook'
29 | mimetypes.types_map['.pdf']     = 'application/pdf'
30 | mimetypes.types_map['.plucker'] = 'application/prs.plucker'
31 | mimetypes.types_map['.qioo']    = 'application/x-qioo-ebook'
32 | mimetypes.types_map['.jar']     = 'application/java-archive'
33 | mimetypes.types_map['.rss']     = 'application/rss+xml'
34 | mimetypes.types_map['.atom']    = 'application/atom+xml'
35 | mimetypes.types_map['.opds']    = 'application/atom+xml'
36 | mimetypes.types_map['.stanza']  = 'application/atom+xml'
37 | mimetypes.types_map['.wap']     = 'application/vnd.wap.xhtml+xml'
38 | mimetypes.types_map['.json']    = 'application/x-suggestions+json'
39 | mimetypes.types_map['.rst']     = 'text/x-rst'
40 | mimetypes.types_map['.png']     = 'image/png'  # Windows XP thinks this is image/x-png
41 | mimetypes.types_map['.jpg']     = 'image/jpeg' # Windows XP thinks this is image/pjpeg
42 | mimetypes.types_map['.jpeg']    = 'image/jpeg' # Windows XP thinks this is image/pjpeg
43 | mimetypes.types_map['.jfif']    = 'image/jpeg' 
44 | mimetypes.types_map['.mscz']    = 'application/x-musescore+xml'
45 | mimetypes.types_map['.mid']     = 'audio/midi'
46 | mimetypes.types_map['.midi']    = 'audio/midi'
47 | mimetypes.types_map['.mus']     = 'application/x-myriad-music'
48 | mimetypes.types_map['.sib']     = 'application/x-sibelius-score'
49 | mimetypes.types_map['.mxl']     = 'application/vnd.recordare.musicxml'
50 | mimetypes.types_map['.mp3']     = 'audio/mpeg'
51 | 
52 | 
53 | TEXT_MEDIATYPES = set ( (
54 |     'application/xhtml+xml',
55 |     'application/xml',
56 |     'text/html',
57 |     'text/plain',
58 | ) )
59 | 
60 | IMAGE_MEDIATYPES = set ( (
61 |     'image/gif',
62 |     'image/jpeg',
63 |     'image/png',
64 | ) )
65 | 
66 | AUX_MEDIATYPES = set ( (
67 |     'text/css',
68 | ) )
69 | 
70 | class MediatypesLookup (object):
71 |     """ Quick mediatype lookup
72 | 
73 |     >>> ns = MediatypesLookup ()
74 |     >>> ns.epub
75 |     'application/epub+zip'
76 |     >>> ns['mobi']
77 |     'application/x-mobipocket-ebook'
78 | 
79 |     """
80 | 
81 |     def __getitem__ (self, local):
82 |         return mimetypes.types_map['.' + local]
83 | 
84 |     def __getattr__ (self, local):
85 |         return mimetypes.types_map['.' + local]
86 | 
87 | mediatypes = MediatypesLookup ()
88 | 
89 | 
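A small sketch of the lookups this module provides (illustrative):

    import mimetypes
    from epubmaker.lib import MediaTypes

    MediaTypes.mediatypes['epub']   # -> 'application/epub+zip'
    MediaTypes.mediatypes.rst       # -> 'text/x-rst'

    mediatype = mimetypes.types_map['.jpg']    # 'image/jpeg', per the override above
    mediatype in MediaTypes.IMAGE_MEDIATYPES   # -> True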


--------------------------------------------------------------------------------
/epubmaker/lib/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 
3 | __all__ = ['DublinCore', 'DummyConnectionPool',
4 |            'GutenbergDatabaseDublinCore', 'GutenbergDatabase',
5 |            'GutenbergGlobals', 'Logger', 'MediaTypes']
6 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/__init__.py:
--------------------------------------------------------------------------------
1 | broken = 'images/broken.png'
2 | 
3 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | Module parsers
 7 | 
 8 | Copyright 2010-2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Customized Project Gutenberg directives for RST parser.
13 | 
14 | """
15 | 
16 | from docutils import statemachine
17 | from docutils.parsers.rst import Directive, directives
18 | 
19 | from epubmaker.mydocutils import parsers
20 | 
21 | from epubmaker.mydocutils.gutenberg import transforms as gutenberg_transforms
22 | 
23 | from epubmaker.lib.Logger import error, info, debug, warn
24 | 
25 | # pylint: disable=W0142, W0102
26 | 
27 | 
28 | class PGHeaderFooter (Directive):
29 |     """ Inserts PG header or footer. """
30 | 
31 |     required_arguments = 0
32 |     optional_arguments = 0
33 | 
34 |     def run (self):
35 |         settings = self.state.document.settings
36 |         include_lines = statemachine.string2lines (
37 |             settings.get_resource ('mydocutils.gutenberg.parsers', self.resource).decode ('utf-8'), 
38 |             settings.tab_width,
39 |             convert_whitespace = 1)
40 |         self.state_machine.insert_input (include_lines, '')
41 |         return []
42 | 
43 | 
44 | class PGHeader (PGHeaderFooter):
45 |     """ Inserts PG header. """
46 |     resource = 'pg-header.rst'
47 | 
48 | 
49 | class PGFooter (PGHeaderFooter):
50 |     """ Inserts PG footer. """
51 |     resource = 'pg-footer.rst'
52 | 
53 | 
54 | class Parser (parsers.Parser):
55 |     """ Parser with PG custom directives. """
56 | 
57 |     def __init__ (self):
58 |         parsers.Parser.__init__ (self)
59 | 
60 |         directives.register_directive ('pgheader',        PGHeader)
61 |         directives.register_directive ('pgfooter',        PGFooter)
62 | 
63 | 
64 |     def get_transforms (self):
65 |         return parsers.Parser.get_transforms (self) + [
66 |             gutenberg_transforms.VariablesTransform,
67 |             gutenberg_transforms.SubRefToVarTransform]
68 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/pg-header.rst:
--------------------------------------------------------------------------------
 1 | .. -*- encoding: utf-8 -*-
 2 | 
 3 | .. |pg.copyrighted-header| replace::
 4 | 
 5 |    This is a *copyrighted* Project Gutenberg eBook, details
 6 |    below. Please follow the copyright guidelines in this file.
 7 | 
 8 | .. _pg-header:
 9 | 
10 | .. container:: noindent pgheader language-en
11 | 
12 |    This eBook is for the use of anyone anywhere at no cost and with
13 |    almost no restrictions whatsoever. You may copy it, give it away or
14 |    re-use it under the terms of the `Project Gutenberg License`_
15 |    included with this eBook or online at
16 |    http://www.gutenberg.org/license.
17 | 
18 |    |pg.copyrighted-header|
19 | 
20 |    .. vspace:: 2
21 | 
22 |    .. _pg-machine-header:
23 | 
24 |    .. container:: noindent white-space-pre-line
25 | 
26 |       |pg.machine-header|
27 | 
28 |    .. vspace:: 2
29 | 
30 |    .. _pg-start-line:
31 | 
32 |    \*\*\* START OF THIS PROJECT GUTENBERG EBOOK |pg.upcase-title| \*\*\*
33 | 
34 |    .. vspace:: 4
35 | 
36 |    .. _pg-produced-by:
37 | 
38 |    |pg.produced-by|
39 | 
40 |    .. vspace:: 1
41 | 
42 |    |pg.credits|
43 | 
44 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/transforms/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | gutenberg.py
  7 | 
  8 | Copyright 2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Transforms for the Project Gutenberg flavor.
 13 | 
 14 | """
 15 | 
 16 | import datetime
 17 | import textwrap
 18 | 
 19 | from docutils import nodes
 20 | import docutils.transforms
 21 | import docutils.transforms.parts
 22 | 
 23 | from epubmaker.lib.Logger import error, info, debug, warn
 24 | from epubmaker.lib.DublinCore import DublinCore
 25 | from epubmaker.mydocutils import nodes as mynodes
 26 | 
 27 | # pylint: disable=W0142
 28 | 
 29 | class SubRefToVarTransform (docutils.transforms.Transform):
 30 |     """
 31 |     Transforms subref nodes in 'pg' namespace into var nodes.
 32 | 
 33 |     We need to save some subrefs for later processing. The standard
 34 |     subref processing happens too early (ie. before docinfo is
 35 |     collected). So we transform subrefs into variables, await docinfo
 36 |     to be processed, and then process the variables.
 37 | 
 38 |     """
 39 | 
 40 |     default_priority = 219
 41 |     """ Before substitition def variables """
 42 | 
 43 | 
 44 |     def apply (self):
 45 |         for ref in self.document.traverse (nodes.substitution_reference):
 46 |             refname = ref['refname']
 47 |             if refname.startswith ('pg.'):
 48 |                 var = mynodes.variable ()
 49 |                 var['name'] = refname
 50 |                 ref.replace_self (var)
 51 | 
 52 | 
 53 | class VariablesTransform (docutils.transforms.Transform):
 54 |     """ Replaces mynodes.var with parameters from metadata. """
 55 | 
 56 |     default_priority = 342
 57 |     """ After DocInfoCollector. """
 58 | 
 59 |     def apply(self):
 60 |         doc = self.document
 61 |         meta = doc.meta_block
 62 |         defs = doc.substitution_defs
 63 | 
 64 |         def getone (name, default = None):
 65 |             """ Get first value. """
 66 |             if name in meta:
 67 |                 return meta[name][0]
 68 |             return default
 69 | 
 70 |         def getmany (name, default = []):
 71 |             """ Get list of all values. """
 72 |             return meta.get (name, default)
 73 | 
 74 |         def sub (var, nodes):
 75 |             var.replace_self (nodes)
 76 | 
 77 |         title = getone ('DC.Title', 'No Title')
 78 |         short_title = getone ('PG.Title', title)
 79 |         short_title = short_title.split ('\n', 1)[0]
 80 | 
 81 |         language = getmany ('DC.Language', ['en'])
 82 |         language = map (lambda x: DublinCore.language_map.get (
 83 |             x, 'Unknown').title (), language)
 84 |         language = DublinCore.strunk (language)
 85 | 
 86 |         copyrighted = getone ('PG.Rights', '').lower () == 'copyrighted'
 87 | 
 88 |         for variable in doc.traverse (mynodes.variable):
 89 |             name = variable['name']
 90 | 
 91 |             if name == 'pg.upcase-title':
 92 |                 sub (variable, [ nodes.inline ('', short_title.upper ()) ])
 93 | 
 94 |             elif name == 'pg.produced-by':
 95 |                 producers = getmany ('PG.Producer')
 96 |                 if producers:
 97 |                      sub (variable, [ nodes.inline ('', u'Produced by %s.' %
 98 |                                                     DublinCore.strunk (producers)) ])
 99 |                 else:
100 |                     sub (variable, [])
101 | 
102 |             elif name == 'pg.credits':
103 |                 sub (variable, [ nodes.inline ('', getone ('PG.Credits', '')) ])
104 | 
105 |             elif name == 'pg.bibrec-url':
106 |                 url = 'http://www.gutenberg.org/ebooks/%s' % getone ('PG.Id', '999999')
107 |                 sub (variable, [ nodes.reference ('', '', nodes.inline ('', url), refuri = url) ])
108 | 
109 |             elif name in ('pg.copyrighted-header', 'pg.copyrighted-footer'):
110 |                 if copyrighted:
111 |                     subdef_copy = defs[name].deepcopy ()
112 |                     sub (variable, subdef_copy.children)
113 |                 else:
114 |                     sub (variable, [])
115 | 
116 |             elif name == 'pg.machine-header':
117 |                 tw = textwrap.TextWrapper (
118 |                     width = 72,
119 |                     initial_indent = u'Title: ',
120 |                     subsequent_indent = u' ' * 7)
121 | 
122 |                 if '\n' in title:
123 |                     maintitle, subtitle = title.split ('\n', 1)
124 |                     s = tw.fill (maintitle)
125 |                     s += '\n'
126 |                     tw.initial_indent = tw.subsequent_indent
127 |                     s += tw.fill (subtitle)
128 |                 else:
129 |                     s = tw.fill (title)
130 |                 s += '\n\n'
131 | 
132 |                 tw.initial_indent = u'Author: '
133 |                 tw.subsequent_indent = u' ' * 8
134 |                 s += tw.fill (DublinCore.strunk (getmany ('DC.Creator', ['Unknown'])))
135 |                 s += '\n\n'
136 | 
137 |                 date = getone ('PG.Released', '')
138 |                 try:
139 |                     date = datetime.datetime.strptime (date, '%Y-%m-%d')
140 |                     date = datetime.datetime.strftime (date, '%B %d, %Y')
141 |                 except ValueError:
142 |                     date = 'unknown date'
143 |                 s += u'Release Date: %s [EBook #%s]\n' % (date, getone ('PG.Id', '999999'))
144 | 
145 |                 for item in getmany ('PG.Reposted', []):
146 |                     try:
147 |                         date, comment = item.split (None, 1)
148 |                     except ValueError:
149 |                         date = item
150 |                         comment = None
151 |                     try:
152 |                         date = datetime.datetime.strptime (date, '%Y-%m-%d')
153 |                         date = datetime.datetime.strftime (date, '%B %d, %Y')
154 |                     except ValueError:
155 |                         date = 'unknown date'
156 | 
157 |                     s += u'Reposted: %s' % date
158 |                     if comment:
159 |                         s += u' [%s]' % comment
160 |                     s += '\n'
161 | 
162 |                 s += u'\nLanguage: %s\n\n' % language
163 |                 s += u'Character set encoding: %s' % doc.settings.encoding.upper ()
164 | 
165 |                 sub (variable, [ nodes.inline ('', nodes.Text (s)) ])
166 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/nroff.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # $Id: manpage.py 6270 2010-03-18 22:32:09Z milde $
  3 | # Author: Engelbert Gruber <grubert@users.sourceforge.net>
  4 | # Copyright: This module is put into the public domain.
  5 | # Rewritten almost completely
  6 | # by Marcello Perathoner <marcello@perathoner.de>
  7 | 
  8 | """
  9 | 
 10 | Nroff writer for reStructuredText. Tweaked for Project Gutenberg usage.
 11 | 
 12 | """
 13 | 
 14 | __docformat__ = 'reStructuredText'
 15 | 
 16 | from epubmaker.mydocutils.writers import nroff
 17 | from epubmaker import Unitame
 18 | 
 19 | from epubmaker.lib.Logger import info, debug, warn, error
 20 | 
 21 | GUTENBERG_NROFF_PREAMBLE = r""".\" -*- mode: nroff -*- coding: {encoding} -*-
 22 | .\" This file produces Project Gutenberg plain text. Usage:
 23 | .\"   $ groff -t -K {device} -T {device} this_file > output.txt
 24 | .
 25 | .pl 100000       \" very tall page: disable pagebreaks
 26 | .ll 72m
 27 | .po 0
 28 | .ad l           \" text-align: left
 29 | .nh             \" hyphenation: off
 30 | .cflags 0 .?!   \" single sentence space
 31 | .cflags 0 -\[hy]\[em]   \" don't break on -
 32 | .
 33 | .de nop
 34 | ..
 35 | .blm nop        \" do nothing on empty line
 36 | .
 37 | .nr [env_cnt] 0
 38 | .ev 0           \" start in a defined environment
 39 | .
 40 | .de push_env
 41 | .br
 42 | .nr last_env \\n[.ev]            \" save current environment name
 43 | .nr env_cnt +1   \" generate new environment name
 44 | .ev \\n[env_cnt]
 45 | .evc \\n[last_env]
 46 | ..
 47 | .de pop_env
 48 | .br
 49 | .ev
 50 | .nr env_cnt -1
 51 | ..
 52 | .
 53 | """
 54 | 
 55 | GUTENBERG_NROFF_POSTAMBLE = r""".
 56 | .pl 0    \" ends very long page here
 57 | .\" End of File
 58 | """
 59 | 
 60 | class Writer (nroff.Writer):
 61 |     """ A plaintext writer thru nroff. """
 62 | 
 63 |     supported = ('pg-nroff',)
 64 |     """Formats this writer supports."""
 65 | 
 66 |     def __init__ (self):
 67 |         nroff.Writer.__init__ (self)
 68 |         self.translator_class = Translator
 69 | 
 70 |     def translate (self):
 71 |         visitor = self.translator_class (self.document)
 72 |         del Unitame.unhandled_chars[:]
 73 |         self.document.walkabout (visitor)
 74 |         self.output = visitor.astext ()
 75 |         if Unitame.unhandled_chars:
 76 |             error ("unitame: unhandled chars: %s" % u", ".join (set (Unitame.unhandled_chars)))
 77 | 
 78 |     #def get_transforms (self):
 79 |     #    tfs = writers.Writer.get_transforms (self)
 80 |     #    return tfs + [parts.CharsetTransform]
 81 | 
 82 |         
 83 |         
 84 | class Translator (nroff.Translator):
 85 |     """ nroff translator """
 86 | 
 87 |     def preamble (self):
 88 |         """ Inserts nroff preamble. """
 89 |         return GUTENBERG_NROFF_PREAMBLE.format (
 90 |             encoding = self.encoding, device = self.device)
 91 | 
 92 | 
 93 |     def postamble (self):
 94 |         """ Inserts nroff postamble. """
 95 |         return GUTENBERG_NROFF_POSTAMBLE.format (
 96 |             encoding = self.encoding, device = self.device)
 97 | 
 98 | 
 99 |     def init_translate_maps (self):
100 |         nroff.Translator.init_translate_maps (self)
101 | 
102 |         update = {
103 |             0x0011: ur"\~",       # nbsp, see: Unitame.py
104 |             0x0012: ur"\%",       # shy,  see: Unitame.py
105 |             }
106 | 
107 |         self.translate_map.update (update)
108 |         self.translate_map_literal.update (update)
109 | 
110 | 
111 |     def register_classes (self):
112 |         """ Register classes.
113 |         
114 |         Use the idiosyncratic PG convention of marking up italics etc.
115 | 
116 |         """
117 | 
118 |         #
119 |         # This does not call the base class !!!
120 |         #
121 | 
122 |         self.register_class ('simple', 'left',         '.ad l', '')
123 |         self.register_class ('simple', 'right',        '.ad r', '')
124 |         self.register_class ('simple', 'center',       '.ad c', '')
125 |                                                     
126 |         self.register_class ('inline', 'italics',      '_',    '_')
127 |         self.register_class ('inline', 'bold',         '*',    '*')
128 | 
129 |         self.register_class ('inline', 'monospaced',   '',     '')
130 |         self.register_class ('inline', 'superscript',  '',     '')
131 |         self.register_class ('inline', 'subscript',    '',     '')
132 | 
133 |         self.register_class ('inline', 'small-caps',   '_',    '_')
134 |         self.register_class ('inline', 'gesperrt',     '_',    '_')
135 |         self.register_class ('inline', 'antiqua',      '_',    '_')
136 |         self.register_class ('inline', 'larger',       '',     '')
137 |         self.register_class ('inline', 'smaller',      '',     '')
138 | 
139 | 
140 |     def translate (self, text):
141 |         """ Reduce the charset while keeping text a unicode string. """
142 | 
143 |         # NOTE: there's an alternate approach in
144 |         # transforms.parts.CharsetTransform
145 | 
146 |         if self.encoding != 'utf-8':
147 |             text = text.encode (self.encoding, 'unitame')
148 |             text = text.decode (self.encoding)
149 | 
150 |         if self.in_literal:
151 |             text = text.translate (self.translate_map_literal)
152 |         else:
153 |             text = text.translate (self.translate_map)
154 | 
155 |         return text
156 | 
157 |         
158 |     def visit_inner (self, node):
159 |         """ Try to remove duplicated PG highlight markers. """
160 |         if node.type == 'inline':
161 |             prefixes = self.get_prefix (node.type, node['classes'])
162 |             for prefix in prefixes:
163 |                 if prefix == self.last_output_char:
164 |                     self.backspace ()
165 |                 else:
166 |                     self.text (prefix)
167 |         else:
168 |             nroff.Translator.visit_inner (self, node)
169 | 
170 | 
171 |     def visit_inline (self, node):
172 |         if 'toc-pageref' in node['classes']:
173 |             maxlen = 3 # sensible default
174 |             while node.parent:
175 |                 node = node.parent
176 |                 if 'pageno_maxlen' in node:
177 |                     maxlen = node['pageno_maxlen']
178 |                     break
179 |             self.cmd (('linetabs 1',
180 |                        r'ta (\n[.l]u - \n[.i]u - %dm) +%dmR' % (maxlen + 1, maxlen + 1),
181 |                        r'lc .'))
182 |             self.text (chr (1) + '\t')
183 |         nroff.Translator.visit_inline (self, node)
184 | 
185 |     def visit_section_title (self, node):
186 |         """ Implements PG-standard spacing before headers. """
187 |         self.sp (max (2, 5 - self.section_level))
188 | 
189 |     def visit_figure (self, node):
190 |         self.sp (1)
191 |         self.push ()
192 | 
193 |     def depart_figure (self, node):
194 |         self.pop ()
195 |         self.sp (1)
196 | 
197 |     def visit_image (self, node):
198 |         # ignore alt attribute except for dropcaps
199 |         if 'dropcap' in node['classes']:
200 |             self.text (node.attributes.get ('alt', ''))
201 | 
202 |     def visit_page (self, node):
203 |         if 'clearpage' in node['classes']:
204 |             self.sp (4)
205 |         elif 'cleardoublepage' in node['classes']:
206 |             self.sp (4)
207 |         else:
208 |             nroff.Translator.visit_page (self, node)
209 | 
210 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/nodes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | nodes.py
 7 | 
 8 | Copyright 2011 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Added nodes for PG.
13 | 
14 | """
15 | 
16 | from docutils import nodes
17 | 
18 | class page (nodes.Element, nodes.Special):
19 |     """ Hold pagination commands.
20 | 
21 |     Like clearpage, vspace etc.
22 |     
23 |     """
24 | 
25 | class newline (nodes.Element):
26 |     """ A line break.
27 | 
28 |     Outputs a hard line break if the node or one of its parents belong
29 |     to the class 'white-space-pre-line'.  Else a space.
30 | 
31 |     """
32 | 
33 | class footnote_group (nodes.container):
34 |     """ Hold a group of footnotes. """
35 | 
36 | 
37 | class variable (nodes.Inline, nodes.TextElement):
38 |     """ A placeholder that gets substituted with actual text before output. 
39 | 
40 |     We do not use substitution refs because they are resolved way too
41 |     early in the transformation stage to be of much use to us.
42 | 
43 |     """
44 | 
45 | 
46 | class node_selector (object):
47 |     """ Allows CSS-like selectors as condition function for nodes.traverse (). """
48 |     
49 |     def __init__ (self, selector):
50 | 
51 |         # allow selectors like [element][.class[.class[...]]][, selector[, selector]]
52 | 
53 |         self.matches = [] # list of 2-tuples
54 |         
55 |         for sel in selector.split (','):
56 |             sel = sel.strip ()
57 |             if '.' not in sel:
58 |                 sel += '.'
59 |             element, classes = sel.split ('.', 1)
60 |             classes = set (classes.split ('.')) if classes else set ()
61 |             self.matches.append ( (getattr (nodes, element, nodes.Element), classes) )
62 |         
63 | 
64 |     def __call__ (self, node):
65 |         """ returns True if the node matches the selector. """
66 |         
67 |         for match in self.matches:
68 |             if isinstance (node, match[0]) and match[1].issubset (node['classes']):
69 |                 return True
70 | 
71 |         return False
72 | 
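A usage sketch for node_selector (illustrative; 'document' is assumed to be an already parsed docutils document tree):

    from epubmaker.mydocutils.nodes import node_selector

    # every image node classed 'dropcap', plus every figure node
    select = node_selector ('image.dropcap, figure')

    for node in document.traverse (select):
        node['classes'].append ('visited')   # any per-node processing goes here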


--------------------------------------------------------------------------------
/epubmaker/mydocutils/parsers/default_style.rst:
--------------------------------------------------------------------------------
 1 | .. this is the default PG-RST stylesheet
 2 | 
 3 | .. style:: emphasis
 4 |    :class: italics
 5 | 
 6 | .. style:: strong
 7 |    :class: bold
 8 | 
 9 | .. style:: title_reference
10 |    :class: italics
11 | 
12 | .. style:: option_argument
13 |    :class: italics
14 | 
15 | .. style:: literal
16 |    :class: monospaced
17 | 
18 | .. style:: subscript
19 |    :class: subscript
20 | 
21 | .. style:: superscript
22 |    :class: superscript
23 | 
24 | .. style:: title.document-title
25 |    :class: x-large center
26 |    :titlehack:
27 | 
28 | .. style:: title.topic-title
29 |    :class: centerleft
30 | 
31 | .. style:: title.table-title
32 |    :class: centerleft larger
33 | 
34 | .. figure and image styles for non-image formats
35 | 
36 | .. style:: figure
37 |    :class: margin
38 | 
39 | .. style:: figure
40 |    :formats: txt.* *.noimages
41 |    :align: center
42 |    :width: 80%
43 | 
44 | .. style:: image
45 |    :formats: *.noimages
46 |    
47 |    .. container:: center image margin
48 |    
49 |       [image]
50 | 
51 | 
52 | .. style:: image
53 |    :formats: txt.*
54 |    :display: none   
55 | 
56 | .. style:: caption.figure-caption
57 |    :formats: -txt.*
58 |    :class: centerleft italics margin
59 | 
60 | .. style:: caption.figure-caption
61 |    :formats: txt.*
62 |    :class: margin
63 |    :before:  '[Illustration: '
64 |    :after:   ']'
65 | 
66 | .. style:: legend
67 |    :class: margin
68 | 
69 | 
70 | .. default transition
71 | 
72 | .. style:: transition
73 | 
74 |    .. container:: center transition margin
75 | 
76 |       ――――
77 | 
78 | .. default attribution
79 | 
80 | .. style:: attribution
81 |    :class: margin
82 |    :before: '―― '
83 | 
84 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | Mydocutils writer package.
  7 | 
  8 | Copyright 2010-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | 
 15 | __docformat__ = 'reStructuredText'
 16 | 
 17 | import collections
 18 | import operator
 19 | 
 20 | from docutils import nodes, writers
 21 | import roman
 22 | 
 23 | 
 24 | class Writer (writers.Writer):
 25 |     """ A base class for writers. """
 26 | 
 27 |     output = None
 28 |     """Final translated form of `document`."""
 29 | 
 30 |     config_section_dependencies = ('writers', )
 31 | 
 32 |     def translate (self):
 33 |         visitor = self.translator_class (self.document)
 34 |         self.document.walkabout (visitor)
 35 |         self.output = visitor.astext ()
 36 | 
 37 |         
 38 | class TablePass1 (nodes.SparseNodeVisitor):
 39 | 
 40 |     """
 41 |     Make a first pass over a table to get a reliable row and column
 42 |     count.  Insert placeholder cells for spanned cells.
 43 |     """
 44 |     
 45 |     def __init__ (self, document):
 46 |         nodes.SparseNodeVisitor.__init__ (self, document)
 47 |         
 48 |         self.row = -1     # 0-based
 49 |         self.column = 0   # 0-based
 50 |         self.cells = 0
 51 |         self.colspecs = None
 52 | 
 53 |     def visit_table (self, table):
 54 |         self.colspecs = table.traverse (nodes.colspec)
 55 |         width = sum (map (operator.itemgetter ('colwidth'), self.colspecs))
 56 |         for colspec in self.colspecs:
 57 |             colspec['relative_width'] = float (colspec['colwidth']) / width
 58 |             
 59 |     def depart_table (self, table):
 60 |         table['rows'] = self.rows ()
 61 |         table['columns'] = self.cols ()
 62 | 
 63 |     def visit_row (self, dummy_node):
 64 |         self.row += 1
 65 |         self.column = 0
 66 |         for colspec in self.colspecs:
 67 |             colspec['spanned'] = max (0, colspec.get ('spanned', 0) - 1)
 68 |         
 69 |     def visit_entry (self, node):
 70 |         """ Table cell. """
 71 | 
 72 |         morerows = node.get ('morerows', 0)
 73 |         morecols = node.get ('morecols', 0)
 74 | 
 75 |         self.cells += (morecols + 1) * (morerows + 1)
 76 | 
 77 |         # skip columns that are row-spanned by preceding entries
 78 |         while True:
 79 |             colspec = self.colspecs [self.column]
 80 |             if colspec.get ('spanned', 0) > 0:
 81 |                 placeholder = nodes.entry ()
 82 |                 placeholder.type = 'compound'
 83 |                 placeholder['column'] = self.column
 84 |                 placeholder.colspecs = self.colspecs[self.column:self.column + 1]
 85 |                 placeholder['vspan'] = True
 86 |                 node.replace_self ([placeholder, node])
 87 |                 self.column += 1
 88 |             else:
 89 |                 break
 90 | 
 91 |         # mark columns we row-span
 92 |         if morerows:
 93 |             for colspec in self.colspecs [self.column : self.column + 1 + morecols]:
 94 |                 colspec['spanned'] = morerows + 1
 95 | 
 96 |         node['row'] = self.row
 97 |         node['column'] = self.column
 98 |         
 99 |         node.colspecs = self.colspecs[self.column:self.column + morecols + 1]
100 | 
101 |         self.column += 1 + morecols
102 |         
103 |         raise nodes.SkipNode
104 | 
105 |     def rows (self):
106 |         """ Return the no. of rows. """
107 |         return self.row + 1
108 | 
109 |     def cols (self):
110 |         """ Return the no. of columns. """
111 |         return self.cells / self.rows ()
112 | 
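# Usage sketch (added; assumes `table` is a docutils table node and
# `document` the document it belongs to): run the pass with walkabout so
# that depart_table () gets called, then read back the computed counts.
#
#   pass1 = TablePass1 (document)
#   table.walkabout (pass1)
#   rows, cols = table['rows'], table['columns']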
113 | 
114 | class ListEnumerator:
115 |     """ Enumerate labels according to list type. """
116 | 
117 |     def __init__ (self, node, encoding):
118 |         self.type  = node.get ('enumtype') or node.get ('bullet') or '*'
119 |         self.start = node['start'] if 'start' in node else 1
120 |         self.prefix = node.get ('prefix', '')
121 |         self.suffix = node.get ('suffix', '')
122 |         self.encoding = encoding
123 | 
124 |         self.indent = len (self.prefix + self.suffix) + 1
125 |         if self.type == 'arabic':
126 |             # indentation depends on end value
127 |             self.indent += len (str (self.start + len (node.children)))
128 |         elif self.type.endswith ('alpha'):
129 |             self.indent += 1
130 |         elif self.type.endswith ('roman'):
131 |             self.indent += 5 # FIXME: calculate real length
132 |         else:
133 |             self.indent += 1 # none, bullets, etc.
134 | 
135 |     def get_next (self):
136 |         """ Get the next label. """
137 | 
138 |         if self.type == 'none':
139 |             res = ''
140 |         elif self.type == '*':
141 |             res = u'•' if self.encoding == 'utf-8' else '-'
142 |         elif self.type == '-':
143 |             res = u'-'
144 |         elif self.type == '+':
145 |             res = u'+'
146 |         elif self.type == 'arabic':
147 |             res = "%d" % self.start
148 |         elif self.type == 'loweralpha':
149 |             res = "%c" % (self.start + ord ('a') - 1)
150 |         elif self.type == 'upperalpha':
151 |             res = "%c" % (self.start + ord ('A') - 1)
152 |         elif self.type == 'upperroman':
153 |             res = roman.toRoman (self.start).upper ()
154 |         elif self.type == 'lowerroman':
155 |             res = roman.toRoman (self.start).lower ()
156 |         else:
157 |             res = "%d" % self.start
158 | 
159 |         self.start += 1
160 | 
161 |         return self.prefix + res + self.suffix
162 | 
163 |     def get_width (self):
164 |         """ Get indent width for this list. """
165 | 
166 |         return self.indent
167 | 
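# Usage sketch (added; `node` is assumed to be a docutils enumerated_list
# node with enumtype 'lowerroman' and no prefix/suffix):
#
#   enum = ListEnumerator (node, 'utf-8')
#   enum.get_next ()   # 'i'
#   enum.get_next ()   # 'ii'
#   enum.get_width ()  # indent width to use for the list items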
168 | 
169 | class Translator (nodes.NodeVisitor):
170 |     """ A base translator """
171 | 
172 |     admonitions = """
173 |     attention caution danger error hint important note tip warning
174 |     """.split ()
175 | 
176 |     docinfo_elements = """
177 |     address author contact copyright date organization revision status
178 |     version
179 |     """.split ()
180 | 
181 |     # see http://docutils.sourceforge.net/docs/ref/doctree.html#simple-body-elements
182 | 
183 |     # simple_structural_subelements = tuple ((getattr (nodes, n) for n in """
184 |     # title subtitle
185 |     # """.split ()))
186 | 
187 |     # simple_body_elements = tuple ((getattr (nodes, n) for n in """
188 |     # comment doctest_block image literal_block math_block paragraph 
189 |     # pending raw rubric substitution_definition target
190 |     # """.split ()))
191 | 
192 |     # simple_body_subelements = tuple ((getattr (nodes, n) for n in """
193 |     # attribution caption classifier colspec field_name 
194 |     # label line option_argument option_string term
195 |     # """.split ()))
196 | 
197 |     # simple_elements = (simple_structural_subelements + 
198 |     #                    simple_body_elements + simple_body_subelements)
199 | 
200 |     def __init__ (self, document):
201 |         nodes.NodeVisitor.__init__ (self, document)
202 |         self.settings = document.settings
203 |         
204 |         self.body = []
205 |         self.context = self.body # start with context == body
206 |         self.docinfo = collections.defaultdict (list)
207 |         self.list_enumerator_stack = []
208 |         self.section_level = 0
209 |         self.vspace = 0 # pending space (need this for collapsing)
210 |         self.src_vspace = 0 # pending space for source pretty printing
211 | 
212 |         self.field_name = None
213 |         self.compacting = 0 # > 0 if we are inside a compacting list
214 |         self.in_literal = 0 # > 0 if we are inside one or more literal blocks
215 |         
216 |         self.prefixes = collections.defaultdict (list) # dict of lists of prefixes, keyed by node type,
217 |                                                        # in the order in which to apply classes
218 |         self.suffixes = collections.defaultdict (list) # reverse order of above
219 |         
220 |         self.environments = [] # stack of \begin'ed environments
221 | 
222 |         self.register_classes ()
223 |         
224 |         for name in self.docinfo_elements:
225 |             setattr (self, 'visit_' + name,
226 |                      lambda node: self.visit_field_body (node, name))
227 |             setattr (self, 'depart_' + name, self.depart_field_body)
228 |             
229 |         for adm in self.admonitions:
230 |             setattr (self, 'visit_' + adm,
231 |                      lambda node: self.visit_admonition (node, adm))
232 |             setattr (self, 'depart_' + adm, self.depart_admonition)
233 |             
234 | 
235 |     def register_classes (self):
236 |         pass
237 | 
238 | 
239 |     def dispatch_visit (self, node):
240 |         """
241 |         Call self."``visit_`` + node class name" with `node` as
242 |         parameter.  If the ``visit_...`` method does not exist, call
243 |         self.unknown_visit.
244 | 
245 |         There are 3 hooks for every visit:
246 |         
247 |         visit_outer
248 |         visit_<classname>
249 |         visit_inner
250 | 
251 |         """
252 | 
253 |         self.visit_outer (node)
254 | 
255 |         node_name = node.__class__.__name__
256 |         method = getattr (self, 'visit_' + node_name, self.unknown_visit)
257 |         self.document.reporter.debug (
258 |             'docutils.nodes.NodeVisitor.dispatch_visit calling %s for %s'
259 |             % (method.__name__, node_name))
260 |         res = method (node)
261 | 
262 |         if node.type in ('compound', 'simple', 'inline'):
263 |             self.visit_inner (node)
264 | 
265 |         return res
266 | 
267 |     def dispatch_departure (self, node):
268 |         """
269 |         Call self."``depart_`` + node class name" with `node` as
270 |         parameter.  If the ``depart_...`` method does not exist, call
271 |         self.unknown_departure.
272 | 
273 |         There are 3 hooks for every departure:
274 |         
275 |         depart_inner
276 |         depart_<classname>
277 |         depart_outer
278 | 
279 |         """
280 | 
281 |         if node.type in ('compound', 'simple', 'inline'):
282 |             self.depart_inner (node)
283 | 
284 |         node_name = node.__class__.__name__
285 |         method = getattr (self, 'depart_' + node_name, self.unknown_departure)
286 |         self.document.reporter.debug (
287 |             'docutils.nodes.NodeVisitor.dispatch_departure calling %s for %s'
288 |             % (method.__name__, node_name))
289 |         res = method (node)
290 | 
291 |         self.depart_outer (node)
292 | 
293 |         return res
294 | 
295 | 
296 |     def unknown_visit (self, node):
297 |         """ Called if we have no handler for this element. """
298 |         pass
299 | 
300 |     def unknown_departure (self, node):
301 |         """ Called if we have no handler for this element. """
302 |         pass
303 | 
304 | 
305 |     def visit_outer (self, node):
306 |         """ The very first hook called on a node, before
307 |         ``visit_<classname>``. """
308 |         pass
309 | 
310 |     def visit_inner (self, node):
311 |         """ Called after ``visit_<classname>``. """
312 |         pass
313 | 
314 |     def depart_inner (self, node):
315 |         """ Called on a block before ``depart_<classname>``. """
316 |         pass
317 | 
318 |     def depart_outer (self, node):
319 |         """ The very last hook called on a node, after
320 |         ``depart_<classname>``."""
321 |         pass
322 | 
323 | 
324 |     def register_class (self, types, class_, prefix, suffix):
325 |         """ Register classes. 
326 | 
327 |         A mechanism to automatically output strings before and after
328 |         elements with specific classes.  For most use cases this is
329 |         easier than writing a handler for the element.
330 | 
331 |         types: types of node this class will apply to: 
332 |                tuple of one or more of (text, inline, simple, compound)
333 |         class_: class that triggers the strings
334 |         prefix: string output before element
335 |         suffix: string output after element
336 | 
337 |         """
338 | 
339 |         if isinstance (types, basestring):
340 |             types = types.split ()
341 | 
342 |         for t in types:
343 |             self.prefixes[t].append (   (class_, prefix))
344 |             self.suffixes[t].insert (0, (class_, suffix))
345 | 
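    # Usage sketch (added; the markup strings are illustrative, not taken
    # from this package): a subclass would typically call this from its
    # register_classes () override, e.g.
    #
    #   def register_classes (self):
    #       self.register_class ('inline', 'italics', '<i>', '</i>')
    #
    # get_prefix ('inline', node['classes']) then yields ['<i>'] for any
    # inline node that carries the 'italics' class.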
346 |     def get_prefix (self, type_, classes):
347 |         return self._get_prefix (type_, classes, self.prefixes)
348 | 
349 |     def get_suffix (self, type_, classes):
350 |         return self._get_prefix (type_, classes, self.suffixes)
351 | 
352 |     def _get_prefix (self, type_, classes, array):
353 |         """ Helper for inline handlers. """
354 |         if isinstance (classes, basestring):
355 |             classes = classes.split ()
356 | 
357 |         res = []
358 |         for s in array[type_]:
359 |             if s[0] in classes:
360 |                 res.append (s[1])
361 |         return res
362 |     
363 | 
364 |     def set_class_on_child (self, node, class_, index = 0):
365 |         """
366 |         Set class `class_` on the visible child no. index of `node`.
367 |         Do nothing if `node` has no visible child at that index.
368 |         """
369 |         children = [n for n in node if not isinstance (n, nodes.Invisible)]
370 |         try:
371 |             child = children[index]
372 |         except IndexError:
373 |             return
374 |         child['classes'].append (class_)
375 | 
376 |     def set_first_last (self, node):
377 |         """ Set class 'first' on first child, 'last' on last child. """
378 |         self.set_class_on_child (node, 'first', 0)
379 |         self.set_class_on_child (node, 'last', -1)
380 | 
381 |     def astext (self):
382 |         """ Return the final formatted document as a string. """
383 |         return self.preamble () + ''.join (self.context) + self.postamble ()
384 | 
385 |     def comment (self, text):
386 |         """ Output a comment. """
387 |         pass
388 |     
389 |     def text (self, text):
390 |         """ Output text. """
391 |         pass
392 | 
393 |     def sp (self, n = 1):
394 |         """ Adds vertical space before the next simple element. 
395 | 
396 |         All spaces added collapse into the largest one. """
397 | 
398 |         if n == 0:
399 |             self.vspace = 1999
400 |         else:
401 |             self.vspace = max (n, self.vspace)
402 | 
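    # Example (added): sp (2) followed by sp (1) leaves self.vspace == 2;
    # sp (0) sets the 1999 sentinel, which wins over any later, smaller
    # request.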
403 |     def src_sp (self, n = 1):
404 |         """ Add vertical space to the source. """
405 | 
406 |         if n == 0:
407 |             self.src_vspace = 1999
408 |         else:
409 |             self.src_vspace = max (n, self.src_vspace)
410 | 
411 |     def output_sp (self):
412 |         pass
413 |     
414 |     def output_src_sp (self):
415 |         pass
416 |     
417 |     def push (self):
418 |         """ Push environment. """
419 |         pass
420 |        
421 |     def pop (self):
422 |         """ Pop environment. """
423 |         pass
424 |         
425 |     def br_if_line_longer_than (self, length):
426 |         """ Go one line up if the last line was shorter than length.
427 | 
428 |         Use this to compact lists etc. """
429 |         pass
430 |         
431 |     def indent (self, by = 2):
432 |         """ Indent text. """
433 |         pass
434 | 
435 |     def rindent (self, by = 2):
436 |         """ Indent text on the right side. """
437 |         pass
438 | 
439 |     def preamble (self):
440 |         return ''
441 | 
442 |     def postamble (self):
443 |         return ''
444 | 
445 |     def visit_title (self, node):
446 |         """ Switch on the various incarnations the title element can have. """
447 | 
448 |         if isinstance (node.parent, nodes.section):
449 |             self.visit_section_title (node)
450 |         elif isinstance (node.parent, nodes.document):
451 |             self.visit_document_title (node)
452 |         elif isinstance (node.parent, nodes.table):
453 |             self.visit_table_title (node)
454 |         elif isinstance (node.parent, nodes.topic):
455 |             self.visit_topic_title (node)
456 |         elif isinstance (node.parent, nodes.sidebar):
457 |             self.visit_sidebar_title (node)
458 |         elif isinstance (node.parent, nodes.admonition):
459 |             self.visit_admonition_title (node)
460 |         else:
461 |             assert False, "Can't happen."
462 | 
463 |     def depart_title (self, node):
464 |         """ Switch on the various incarnations the title element can have. """
465 | 
466 |         if isinstance (node.parent, nodes.section):
467 |             self.depart_section_title (node)
468 |         elif isinstance (node.parent, nodes.document):
469 |             self.depart_document_title (node)
470 |         elif isinstance (node.parent, nodes.table):
471 |             self.depart_table_title (node)
472 |         elif isinstance (node.parent, nodes.topic):
473 |             self.depart_topic_title (node)
474 |         elif isinstance (node.parent, nodes.sidebar):
475 |             self.depart_sidebar_title (node)
476 |         elif isinstance (node.parent, nodes.admonition):
477 |             self.depart_admonition_title (node)
478 |         else:
479 |             assert False, "Can't happen."
480 | 
481 |     def visit_subtitle (self, node):
482 |         """ Switch on the various incarnations the subtitle element can have. """
483 | 
484 |         if isinstance (node.parent, nodes.document):
485 |             self.visit_document_subtitle (node)
486 |         else:
487 |             self.visit_section_subtitle (node)
488 |         
489 |     def depart_subtitle (self, node):
490 |         """ Switch on the various incarnations the subtitle element can have. """
491 | 
492 |         if isinstance (node.parent, nodes.document):
493 |             self.depart_document_subtitle (node)
494 |         else:
495 |             self.depart_section_subtitle (node)
496 |         
497 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/epub2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | epub2.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | A writer that writes XHTML 1 files suited for conversion into EPUB2.
13 | 
14 | """
15 | 
16 | import re
17 | 
18 | from docutils import nodes
19 | 
20 | # from epubmaker.lib.Logger import info, debug, warn, error
21 | 
22 | from epubmaker.mydocutils.writers.xhtml1 import Writer as WriterBase
23 | from epubmaker.mydocutils.writers.xhtml1 import Translator as TranslatorBase
24 | 
25 | 
26 | class Writer (WriterBase):
27 |     """ EPUB2 writer. """
28 |     
29 |     def __init__ (self):
30 |         WriterBase.__init__ (self)
31 |         self.translator_class = Translator
32 | 
33 | 
34 | class Translator (TranslatorBase):
35 |     """ HTML Translator with EPUB2 tweaks. """
36 |     
37 |     def init_css (self):
38 |         for css_file in ('rst2all.css', 'rst2epub.css'):
39 |             self.head.append ('<style type="text/css">\n%s</style>\n' % 
40 |                               self.encode (self.read_css (css_file)))
41 | 
42 | 
43 |     def calc_centering_style (self, node):
44 |         """
45 |         Rationale: The EPUB standard allows user agents to replace
46 |         `margin: auto` with `margin: 0`. Thus we cannot use `margin: auto`
47 |         to center images; we have to calculate the left margin value.
48 | 
49 |         Also we must use 'width' on the html element, not css style,
50 |         or Adobe ADE will not scale the image properly (i.e. only
51 |         horizontally).
52 | 
53 |         :align: is supposed to work on blocks. It floats or centers
54 |         a block.
55 | 
56 |         :align: center does not have the same semantics as :class: center.
57 |         The former centers the block, e.g. the whole table; the latter centers
58 |         the text, e.g. the text in every table cell.
59 | 
60 |             `:align: center`
61 |                 Used on image: centers image
62 |                 Used on figure: centers image and caption
63 |                 Used on table: centers table and caption
64 | 
65 |         """
66 | 
67 |         width = node.get ('width')
68 |         if width is None:
69 |             return []
70 |         
71 |         style = ['width: %s' % width]
72 | 
73 |         m = re.match ('(\d+)\s*%', width)
74 |         if (m):
75 |             width = max (min (int (m.group (1)), 100), 0)
76 |             margin = 100 - width
77 | 
78 |             align = node.get ('align', 'center')
79 |             if align == 'center':
80 |                 style.append ('margin-left: %d%%' % (margin / 2))
81 |             if align == 'right':
82 |                 style.append ('margin-left: %d%%' % margin)
83 |                 
84 |         node['styles'].extend (style)
85 | 
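    # Worked example (added): for a node with width '60%' and the default
    # align 'center', margin = 100 - 60 = 40, so the node's styles are
    # extended with ['width: 60%', 'margin-left: 20%'], which centers the
    # block without relying on 'margin: auto'.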
86 |     
87 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2all.css:
--------------------------------------------------------------------------------
  1 | /* 
  2 | Project Gutenberg common docutils stylesheet.
  3 | 
  4 | This stylesheet contains styles common to HTML and EPUB.  Put styles
  5 | that are specific to HTML and EPUB into their relative stylesheets.
  6 | 
  7 | :Author: Marcello Perathoner (webmaster@gutenberg.org)
  8 | :Copyright: This stylesheet has been placed in the public domain.
  9 | 
 10 | This stylesheet is based on:
 11 | 
 12 |   :Author: David Goodger (goodger@python.org)
 13 |   :Copyright: This stylesheet has been placed in the public domain.
 14 | 
 15 |   Default cascading style sheet for the HTML output of Docutils.
 16 | 
 17 | */
 18 | 
 19 | /* ADE 1.7.2 chokes on !important and throws all css out. */
 20 | 
 21 | /* FONTS */
 22 | 
 23 | .italics    { font-style: italic }
 24 | .no-italics { font-style: normal }
 25 | 
 26 | .bold       { font-weight: bold }
 27 | .no-bold    { font-weight: normal }
 28 | 
 29 | .small-caps { } /* Epub needs italics */
 30 | .gesperrt   { } /* Epub needs italics */
 31 | .antiqua    { font-style: italic } /* what else can we do ? */
 32 | .monospaced { font-family: monospace }
 33 | 
 34 | .smaller    { font-size: smaller }
 35 | .larger     { font-size: larger }
 36 | 
 37 | .xx-small   { font-size: xx-small }
 38 | .x-small    { font-size: x-small }
 39 | .small      { font-size: small }
 40 | .medium     { font-size: medium }
 41 | .large      { font-size: large }
 42 | .x-large    { font-size: x-large }
 43 | .xx-large   { font-size: xx-large }
 44 | 
 45 | .text-transform-uppercase { text-transform: uppercase }
 46 | .text-transform-lowercase { text-transform: lowercase }
 47 | .text-transform-none      { text-transform: none }
 48 | 
 49 | .red        { color: red }
 50 | .green      { color: green }
 51 | .blue       { color: blue }
 52 | .yellow     { color: yellow }
 53 | .white      { color: white }
 54 | .gray       { color: gray }
 55 | .black      { color: black }
 56 | 
 57 | /* ALIGN */
 58 | 
 59 | .left       { text-align: left }
 60 | .justify    { text-align: justify }
 61 | .center     { text-align: center; text-indent: 0 }
 62 | .centerleft { text-align: center; text-indent: 0 }
 63 | .right      { text-align: right;  text-indent: 0 }
 64 | 
 65 | /* LINE HEIGHT */
 66 | 
 67 | body        { line-height: 1.5 }
 68 | p           { margin: 0; 
 69 | 	      text-indent: 2em }
 70 | 
 71 | /* PAGINATION */
 72 | 
 73 | .title, .subtitle     { page-break-after:  avoid }
 74 | 
 75 | .container, .title, .subtitle, #pg-header            
 76 |                       { page-break-inside: avoid }
 77 | 
 78 | /* SECTIONS */
 79 | 
 80 | body         { text-align: justify }
 81 | 
 82 | p.pfirst, p.noindent { 
 83 |     text-indent: 0 
 84 | }
 85 | 
 86 | .boxed         { border: 1px solid black; padding: 1em }
 87 | .topic, .note  { margin: 5% 0; border: 1px solid black; padding: 1em }
 88 | div.section    { clear: both }
 89 | 
 90 | div.line-block       { margin: 1.5em 0 }  /* same leading as p */
 91 | div.line-block.inner { margin: 0 0 0 10% }
 92 | div.line             { margin-left: 20%; text-indent: -20%; }
 93 | .line-block.noindent div.line { margin-left: 0; text-indent: 0; }
 94 | 
 95 | hr.docutils          { margin: 1.5em 40%; border: none; border-bottom: 1px solid black; }
 96 | div.transition       { margin: 1.5em 0 }
 97 | 
 98 | .vfill, .vspace      { border: 0px solid white }
 99 | 
100 | .title               { margin: 1.5em 0 }
101 | .title.with-subtitle { margin-bottom: 0 }
102 | .subtitle            { margin: 1.5em 0 }
103 | 
104 | /* header font style */
105 | /* http://dev.w3.org/csswg/css3-fonts/#propdef-font-size */
106 | 
107 | h1.title                        { font-size: 200%; }  /* for book title only */
108 | h2.title, p.subtitle.level-1    { font-size: 150%; margin-top: 4.5em;  margin-bottom: 2em }
109 | h3.title, p.subtitle.level-2    { font-size: 120%; margin-top: 2.25em; margin-bottom: 1.25em }
110 | h4.title, p.subtitle.level-3    { font-size: 100%; margin-top: 1.5em;  margin-bottom: 1.5em;  font-weight: bold; }
111 | h5.title, p.subtitle.level-4    { font-size:  89%; margin-top: 1.87em; margin-bottom: 1.69em; font-style: italic; }
112 | h6.title, p.subtitle.level-5    { font-size:  60%; margin-top: 3.5em;  margin-bottom: 2.5em }
113 | 
114 | /* title page */
115 | 
116 | h1.title, p.subtitle.level-1,
117 | h2.title, p.subtitle.level-2    { text-align: center }
118 | 
119 | #pg-header,
120 | h1.document-title               { margin: 10% 0 5% 0 }
121 | p.document-subtitle             { margin:  0  0 5% 0 }
122 | 
123 | /* PG header and footer */
124 | #pg-machine-header { }
125 | #pg-produced-by { }
126 | 
127 | li.toc-entry            { list-style-type: none }
128 | ul.open li, ol.open li  { margin-bottom: 1.5em }
129 | 
130 | .attribution            { margin-top: 1.5em }
131 | 
132 | .example-rendered { 
133 |     margin: 1em 5%; border: 1px dotted red;  padding: 1em; background-color: #ffd }
134 | .literal-block.example-source   { 
135 |     margin: 1em 5%; border: 1px dotted blue; padding: 1em; background-color: #eef }
136 | 
137 | /* DROPCAPS */
138 | 
139 | /* BLOCKQUOTES */
140 | 
141 | blockquote { margin: 1.5em 10% }
142 | 
143 | blockquote.epigraph { }
144 | 
145 | blockquote.highlights { }
146 | 
147 | div.local-contents { margin: 1.5em 10% }
148 | 
149 | div.abstract { margin: 3em   10% }
150 | div.image    { margin: 1.5em  0  }
151 | div.caption  { margin: 1.5em  0 }
152 | div.legend   { margin: 1.5em  0 }
153 | 
154 | .hidden { display: none }
155 | 
156 | .invisible { visibility: hidden; color: white } /* white: mozilla print bug */
157 | 
158 | a.toc-backref {
159 |   text-decoration: none ;
160 |   color: black }
161 | 
162 | dl.docutils dd {
163 |   margin-bottom: 0.5em }
164 | 
165 | div.figure { margin-top: 3em; margin-bottom: 3em }
166 | 
167 | img { max-width: 100% }
168 | 
169 | div.footer, div.header {
170 |   clear: both;
171 |   font-size: smaller }
172 | 
173 | div.sidebar {
174 |   margin: 0 0 0.5em 1em ;
175 |   border: medium outset ;
176 |   padding: 1em ;
177 |   background-color: #ffffee ;
178 |   width: 40% ;
179 |   float: right ;
180 |   clear: right }
181 | 
182 | div.sidebar p.rubric {
183 |   font-family: sans-serif ;
184 |   font-size: medium }
185 | 
186 | ol.simple, ul.simple { margin: 1.5em 0 }
187 | 
188 | ol.toc-list,    ul.toc-list    { padding-left:  0  }
189 | ol ol.toc-list, ul ul.toc-list { padding-left:  5% }
190 | 
191 | ol.arabic {
192 |   list-style: decimal }
193 | 
194 | ol.loweralpha {
195 |   list-style: lower-alpha }
196 | 
197 | ol.upperalpha {
198 |   list-style: upper-alpha }
199 | 
200 | ol.lowerroman {
201 |   list-style: lower-roman }
202 | 
203 | ol.upperroman {
204 |   list-style: upper-roman }
205 | 
206 | p.credits {
207 |   font-style: italic ;
208 |   font-size: smaller }
209 | 
210 | p.label {
211 |   white-space: nowrap }
212 | 
213 | p.rubric {
214 |   font-weight: bold ;
215 |   font-size: larger ;
216 |   color: maroon ;
217 |   text-align: center }
218 | 
219 | p.sidebar-title {
220 |   font-family: sans-serif ;
221 |   font-weight: bold ;
222 |   font-size: larger }
223 | 
224 | p.sidebar-subtitle {
225 |   font-family: sans-serif ;
226 |   font-weight: bold }
227 | 
228 | p.topic-title, p.admonition-title {
229 |   font-weight: bold }
230 | 
231 | pre.address {
232 |   margin-bottom: 0 ;
233 |   margin-top: 0 ;
234 |   font: inherit }
235 | 
236 | .literal-block, .doctest-block {
237 |   margin-left: 2em ;
238 |   margin-right: 2em; }
239 | 
240 | span.classifier {
241 |   font-family: sans-serif ;
242 |   font-style: oblique }
243 | 
244 | span.classifier-delimiter {
245 |   font-family: sans-serif ;
246 |   font-weight: bold }
247 | 
248 | span.interpreted {
249 |   font-family: sans-serif }
250 | 
251 | span.option {
252 |   white-space: nowrap }
253 | 
254 | span.pre {
255 |   white-space: pre }
256 | 
257 | span.problematic {
258 |   color: red }
259 | 
260 | span.section-subtitle {
261 |   /* font-size relative to parent (h1..h6 element) */
262 |   font-size: 100% }
263 | 
264 | table { margin-top: 1.5em; margin-bottom: 1.5em; border-spacing: 0 }
265 | table.align-left, table.align-right { margin-top: 0 }
266 | 
267 | table.table                { border-collapse: collapse; }
268 | 
269 | table.table.hrules-table thead          { border: 1px solid black; border-width: 2px 0 0 }
270 | table.table.hrules-table tbody          { border: 1px solid black; border-width: 2px 0 }
271 | table.table.hrules-rows  tr             { border: 1px solid black; border-width: 0 0 1px }
272 | table.table.hrules-rows  tr.last        { border-width: 0 }
273 | table.table.hrules-rows  td, 
274 | table.table.hrules-rows  th             { padding: 1ex 1em; vertical-align: middle }
275 | 
276 | table.table tr             { border-width: 0 }
277 | table.table td, 
278 | table.table th             { padding: 0.5ex 1em }
279 | table.table tr.first td    { padding-top: 1ex }
280 | table.table tr.last td     { padding-bottom: 1ex }
281 | table.table tr.first th    { padding-top: 1ex }
282 | table.table tr.last th     { padding-bottom: 1ex }
283 | 
284 | 
285 | table.citation {
286 |   border-left: solid 1px gray;
287 |   margin-left: 1px }
288 | 
289 | table.docinfo {
290 |   margin: 3em 4em }
291 | 
292 | table.docutils { }
293 | 
294 | div.footnote-group          { margin: 1em 0 }
295 | table.footnote td.label     { width: 2em; text-align: right; padding-left: 0 }
296 | 
297 | table.docutils td, table.docutils th,
298 | table.docinfo td, table.docinfo th {
299 |   padding: 0 0.5em;
300 |   vertical-align: top }
301 | 
302 | table.docutils th.field-name, table.docinfo th.docinfo-name {
303 |   font-weight: bold ;
304 |   text-align: left ;
305 |   white-space: nowrap ;
306 |   padding-left: 0 }
307 | 
308 | /* used to remove borders from tables and images */
309 | .borderless, table.borderless td, table.borderless th {
310 |   border: 0 }
311 | 
312 | table.borderless td, table.borderless th {
313 |   /* Override padding for "table.docutils td" with "!important".
314 |      The right padding separates the table cells. */
315 |   padding: 0 0.5em 0 0 } /* FIXME: was !important */
316 | 
317 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
318 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
319 |   font-size: 100% }
320 | 
321 | ul.auto-toc {
322 |   list-style-type: none }
323 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2epub.css:
--------------------------------------------------------------------------------
 1 | /* 
 2 | Project Gutenberg EPUB docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to EPUB.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* mostly unsupported */
10 | .small-caps        { font-style: italic }
11 | .gesperrt          { font-style: italic }
12 | 
13 | /* ALIGN */
14 | 
15 | /* SECTIONS */
16 | 
17 | /* reduce screen real estate waste */
18 | body               { margin: 1% }
19 | 
20 | /* ugly hack to give more specificity, because ADE chucks out the whole
21 |    stylesheet when it sees an !important */
22 | 
23 | .first.first        { margin-top: 0; text-indent: 0 } 
24 | .last.last          { margin-bottom: 0 }
25 | 
26 | .no-page-break.no-page-break 
27 |                     { page-break-before: avoid }
28 | 
29 | /* PAGINATION */
30 | 
31 | div.clearpage       { page-break-before: always; padding-top: 10% }
32 | div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
33 | 
34 | .vfill              { margin-top: 10% }
35 | h2.title            { margin-top: 10% }
36 | 
37 | /* DIV */
38 | 
39 | a                   { text-decoration: none }
40 | .toc-pageref        { display: none }
41 | 
42 | /* DROPCAPS */
43 | 
44 | span.dropcap        { line-height: 0 }
45 | img.dropcap         { vertical-align: bottom }
46 | 
47 | 


--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2html.css:
--------------------------------------------------------------------------------
 1 | /* 
 2 | Project Gutenberg HTML docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to HTML.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* em                { font-style: normal }
10 | strong            { font-weight: normal } */
11 | 
12 | .small-caps       { font-variant: small-caps }
13 | .gesperrt         { letter-spacing: 0.1em }
14 | 
15 | /* ALIGN */
16 | 
17 | .align-left       { clear: left;
18 | 		    float: left;
19 | 		    margin-right: 1em }
20 | 
21 | .align-right      { clear: right;
22 | 		    float: right;
23 | 		    margin-left: 1em }
24 | 
25 | .align-center     { margin-left: auto;
26 | 		    margin-right: auto }
27 | 
28 | div.shrinkwrap    { display: table; }
29 | 
30 | /* SECTIONS */
31 | 
32 | body              { margin: 5% 10% 5% 10% }
33 | 
34 | /* compact list items containing just one p */
35 | li p.pfirst       { margin-top: 0; margin-bottom: 0 } 
36 | 
37 | .first            { margin-top: 0 !important; 
38 | 		    text-indent: 0 !important } 
39 | .last             { margin-bottom: 0 !important }
40 | 
41 | span.dropcap      { float: left; margin: 0 0.1em 0 0; line-height: 1 }
42 | img.dropcap       { float: left; margin: 0 0.5em 0 0; max-width: 25% }
43 | span.dropspan     { font-variant: small-caps }
44 | 
45 | .no-page-break    { page-break-before: avoid !important }
46 | 
47 | /* PAGINATION */
48 | 
49 | .pageno           { position: absolute; right: 95%; font: medium sans-serif; text-indent: 0 }
50 | .pageno:after     { color: gray; content: '[' attr(title) ']' }
51 | .lineno           { position: absolute; left:  95%; font: medium sans-serif; text-indent: 0 }
52 | .lineno:after     { color: gray; content: '[' attr(title) ']' }
53 | .toc-pageref      { float: right }
54 | 
55 | @media screen {
56 |    .coverpage, .frontispiece, .titlepage, .verso, .dedication, .plainpage
57 |                        { margin: 10% 0; }
58 | 
59 |    div.clearpage, div.cleardoublepage
60 |                        { margin: 10% 0; border: none; border-top: 1px solid gray; }
61 | 
62 |    .vfill              { margin:  5% 10% }
63 | }
64 | 
65 | @media print {
66 |    div.clearpage       { page-break-before: always; padding-top: 10% }
67 |    div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
68 | 
69 |    .vfill              { margin-top: 20% }
70 |    h2.title            { margin-top: 20% }
71 | }
72 | 
73 | /* DIV */
74 | pre               { font-family: monospace; font-size: 0.9em; white-space: pre-wrap }
75 | 
76 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/GzipPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | GzipPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Gzip a file.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileGzipPackager
16 | 
17 | TYPE = 'gzip'
18 | FORMATS = 'rst html.noimages html.images txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileGzipPackager):
21 |     """ Gzip packager. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/HTMLPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | HTMLPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package an HTML file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'html.images'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package an HTML file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/PDFPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PDFPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a PDF file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = ''.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for PDF files. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/PushPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PushPackager.py
 6 | 
 7 | Copyright 2011 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a zip containing everything that can be pushed to ibiblio.
12 | 
13 | """
14 | 
15 | from __future__ import with_statement
16 | 
17 | import os
18 | import zipfile
19 | import re
20 | 
21 | from epubmaker.lib.Logger import info, warn, error
22 | import epubmaker.lib.GutenbergGlobals as gg
23 | 
24 | from epubmaker.packagers import BasePackager
25 | 
26 | TYPE = 'ww'
27 | FORMATS = ['push']
28 | 
29 | class Packager (BasePackager):
30 |     """ Package one big zip for push.
31 | 
32 |     Zip contains one directory named after ebook_no.
33 |     This dir mirrors structure on ibiblio::
34 | 
35 |       12345/12345.txt
36 |       12345/12345.zip
37 |       12345/12345-h/12345-h.html
38 |       12345/12345-h/images/cover.jpg
39 |       12345/12345-h.zip
40 |     
41 |     """
42 | 
43 |     @staticmethod
44 |     def add (zip_, filename, memberfilename):
45 |         """ Add one file to the zip. """
46 |         
47 |         try:
48 |             os.stat (filename)
49 |             dummy_name, ext = os.path.splitext (filename)
50 |             info ('  Adding file: %s as %s' % (filename, memberfilename))
51 |             zip_.write (filename, memberfilename,
52 |                         zipfile.ZIP_STORED if ext in ['.zip', '.png']
53 |                         else zipfile.ZIP_DEFLATED)
54 |         except OSError:
55 |             # warn ('PushPackager: Cannot find file %s', filename)
56 |             return
57 | 
58 | 
59 |     def package (self, aux_file_list = []):
60 |         zipfilename = self.options.outputfile # filename is zipfile
61 | 
62 |         m = re.match (r'\d+', zipfilename)
63 |         if m:
64 |             ebook_no = m.group (0)
65 |         else:
66 |             error ('Invalid filename %s for push packager.' % zipfilename)
67 |             return
68 | 
69 |         info ('Creating Zip file: %s' % zipfilename)
70 | 
71 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
72 | 
73 |         for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split ():
74 |             filename = '%s%s' % (ebook_no, suffix)
75 |             memberfilename = '%s/%s' % (ebook_no, filename)
76 |             self.add (zip_, filename, memberfilename)
77 | 
78 |         for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
79 |             filename = '%s%s.%s' % (ebook_no, suffix, ext)
80 |             memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
81 |             self.add (zip_, filename, memberfilename)
82 | 
83 |             # image files
84 |             for url in aux_file_list:
85 |                 rel_url = gg.make_url_relative (self.options.base_url, url)
86 |                 filename = os.path.join (self.path, rel_url)
87 |                 memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
88 |                 self.add (zip_, filename, memberfilename)
89 | 
90 |         zip_.close ()
91 | 
92 |         info ('Done Zip file: %s' % zipfilename)
93 | 
94 |     
95 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/RSTPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | RSTPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package an RST file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'rst.gen'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package an RST file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/TxtPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | TxtPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a Txt file for PG.
12 | 
13 | """
14 | 
15 | from epubmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for plain text files. """
22 |     pass
23 | 
24 | 


--------------------------------------------------------------------------------
/epubmaker/packagers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Packager package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Base class for Packager modules.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os.path
 19 | import gzip
 20 | import zipfile
 21 | 
 22 | from pkg_resources import resource_listdir  # pylint: disable=E0611
 23 | 
 24 | from epubmaker.lib.Logger import debug, info, warn, error
 25 | import epubmaker.lib.GutenbergGlobals as gg
 26 | 
 27 | GZIP_EXTENSION = '.gzip'
 28 | 
 29 | class BasePackager (object):
 30 |     """
 31 |     Base class for Packagers.
 32 | 
 33 |     """
 34 | 
 35 |     def __init__ (self):
 36 |         self.options = None
 37 |         self.path_name_ext = None
 38 |         self.path = None
 39 |         self.name = None
 40 |         self.ext = None
 41 | 
 42 | 
 43 |     def setup (self, options):
 44 |         """ Setup """
 45 |         
 46 |         self.options = options
 47 |         self.path_name_ext = os.path.join (self.options.outputdir, self.options.outputfile)
 48 |         self.path, name = os.path.split (self.path_name_ext)
 49 |         self.name, self.ext = os.path.splitext (name)
 50 | 
 51 | 
 52 |     def package (self, aux_file_list = []):
 53 |         """ Package files. """
 54 |         pass
 55 | 
 56 | 
 57 | class OneFileGzipPackager (BasePackager):
 58 |     """ Gzips one file. """
 59 | 
 60 |     def package (self, aux_file_list = []):
 61 |         filename = self.path_name_ext
 62 |         gzfilename = filename + GZIP_EXTENSION
 63 | 
 64 |         try:
 65 |             info ('Creating Gzip file: %s' % gzfilename)
 66 |             with open (filename, 'r') as fp:
 67 |                 fpgz = gzip.open (gzfilename, 'w')
 68 |                 info ('  Adding file: %s' % filename)
 69 |                 fpgz.write (fp.read ())
 70 |                 fpgz.close ()
 71 |                 info ('Done Zip file: %s' % gzfilename)
 72 |         except IOError, what:
 73 |             error (what)
 74 |             
 75 | 
 76 | class OneFileZipPackager (BasePackager):
 77 |     """ Packages one file in zip of the same name. """
 78 | 
 79 |     def package (self, aux_file_list = []):
 80 |         filename = self.path_name_ext
 81 |         zipfilename = os.path.join (self.path, self.name) + '.zip'
 82 |         memberfilename = self.name + self.ext
 83 | 
 84 |         info ('Creating Zip file: %s' % zipfilename)
 85 | 
 86 |         try:
 87 |             os.stat (filename)
 88 |         except OSError:
 89 |             # warn ('Packager: Cannot find file %s', filename)
 90 |             return
 91 |         
 92 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
 93 |         info ('  Adding file: %s as %s' % (filename, memberfilename))
 94 |         zip_.write (filename, memberfilename)
 95 |         zip_.close ()
 96 | 
 97 |         info ('Done Zip file: %s' % zipfilename)
 98 | 
 99 | 
100 | class HTMLishPackager (BasePackager):
101 |     """ Package a file with images. """
102 | 
103 |     def package (self, aux_file_list = []):
104 |         
105 |         filename = self.options.outputfile
106 |         zipfilename = os.path.join (self.path, self.name) + '.zip'
107 |         memberfilename = os.path.join (self.name, self.name) + self.ext
108 | 
109 |         info ('Creating Zip file: %s' % zipfilename)
110 | 
111 |         zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
112 |         info ('  Adding file: %s as %s' % (filename, memberfilename))
113 |         zip_.write (filename, memberfilename)
114 | 
115 |         # now images
116 |         for url in aux_file_list:
117 |             rel_url = gg.make_url_relative (self.options.base_url, url)
118 |             filename = os.path.join (self.path, rel_url)
119 |             memberfilename = os.path.join (self.name, rel_url)
120 |             info ('  Adding file: %s as %s' % (filename, memberfilename))
121 |             zip_.write (filename, memberfilename)
122 |         
123 |         zip_.close ()
124 | 
125 |         info ('Done Zip file: %s' % zipfilename)
126 | 
127 |     
128 | class PackagerFactory (object):
129 |     """ Implements Factory pattern for packagers. """
130 | 
131 |     packagers = {}
132 | 
133 |     def __init__ (self, type_):
134 |         self.type = type_
135 |         
136 | 
137 |     def load (self):
138 |         """ Load the packagers in the packagers directory. """
139 | 
140 |         for fn in resource_listdir ('epubmaker.packagers', ''):
141 |             modulename, ext = os.path.splitext (fn)
142 |             if ext == '.py':
143 |                 if modulename.endswith ('Packager'):
144 |                     module = __import__ ('epubmaker.packagers.' + modulename,
145 |                                          fromlist = [modulename])
146 |                     if self.type == module.TYPE:
147 |                         debug ("Loading packager type: %s from module: %s for formats: %s" % (
148 |                             self.type, modulename, ', '.join (module.FORMATS)))
149 |                         for format_ in module.FORMATS:
150 |                             self.packagers[format_] = module
151 | 
152 |         return self.packagers.keys ()
153 | 
154 | 
155 |     def unload (self):
156 |         """ Unload packager modules. """
157 | 
158 |         for k in self.packagers.keys ():
159 |             del self.packagers[k]
160 | 
161 | 
162 |     def create (self, format_):
163 |         """ Create a packager for format. """
164 | 
165 |         try:
166 |             return self.packagers[format_].Packager ()
167 |         except KeyError:
168 |             raise KeyError ('No packager for type %s' % format_)
169 |     
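# Usage sketch (added; `options` and `aux_file_list` are assumed to come
# from the surrounding epubmaker run):
#
#   factory = PackagerFactory ('ww')
#   factory.load ()
#   packager = factory.create ('html.images')
#   packager.setup (options)
#   packager.package (aux_file_list)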
170 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/AuxParser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | AuxParser.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Open a URL and return raw data.
13 | 
14 | """
15 | 
16 | 
17 | from epubmaker.parsers import ParserBase
18 | 
19 | mediatypes = ('*/*', )
20 | 
21 | class Parser (ParserBase):
22 |     """ Parse an auxiliary file. """
23 | 
24 |     def __init__ (self):
25 |         ParserBase.__init__ (self)
26 |         self.data = None
27 | 
28 | 
29 |     def parse (self):
30 |         """ Parse the file. """
31 |         self.data = self.bytes_content ()
32 | 
33 | 
34 |     def serialize (self):
35 |         """ Serialize file to string. """
36 |         return self.data
37 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/CSSParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | CSSParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Open a URL and parse the CSS.
 13 | 
 14 | """
 15 | 
 16 | import re
 17 | import urlparse
 18 | import logging
 19 | 
 20 | import cssutils
 21 | 
 22 | from epubmaker.lib.Logger import debug
 23 | from epubmaker.lib.MediaTypes import mediatypes as mt
 24 | 
 25 | from epubmaker.parsers import ParserBase
 26 | 
 27 | RE_ELEMENT = re.compile (r'((?:^|\s)[a-z0-9]+)', re.I)
 28 | 
 29 | mediatypes = (mt.css, )
 30 | 
 31 | class Parser (ParserBase):
 32 |     """ Parse an external CSS file. """
 33 | 
 34 |     def __init__ (self):
 35 |         cssutils.log.setLog (logging.getLogger ('cssutils'))
 36 |         # logging.DEBUG is way too verbose
 37 |         cssutils.log.setLevel (max (cssutils.log.getEffectiveLevel (), logging.INFO))
 38 |         ParserBase.__init__ (self)
 39 |         self.sheet = None
 40 | 
 41 | 
 42 |     def parse (self):
 43 |         """ Parse the CSS file. """
 44 | 
 45 |         if self.sheet is not None:
 46 |             return
 47 |         
 48 |         parser = cssutils.CSSParser ()
 49 |         if self.fp:
 50 |             self.sheet = parser.parseString (self.bytes_content (), encoding = self.encoding)
 51 |         else:
 52 |             self.sheet = parser.parseUrl (self.url)
 53 | 
 54 |         self.mediatype = 'text/css'
 55 |         self.unpack_media_handheld (self.sheet)
 56 |         self.lowercase_selectors (self.sheet)
 57 | 
 58 | 
 59 |     def parse_string (self, s):
 60 |         """ Parse the CSS in string. """
 61 | 
 62 |         if self.sheet is not None:
 63 |             return
 64 |         
 65 |         parser = cssutils.CSSParser ()
 66 |         self.sheet = parser.parseString (s, encoding = 'utf-8')
 67 | 
 68 |         self.mediatype = 'text/css'
 69 |         self.unpack_media_handheld (self.sheet)
 70 |         self.lowercase_selectors (self.sheet)
 71 | 
 72 | 
 73 |     @staticmethod
 74 |     def iter_properties (sheet):
 75 |         """ Iterate on properties in css. """
 76 |         for rule in sheet:
 77 |             if rule.type == rule.STYLE_RULE:
 78 |                 for prop in rule.style:
 79 |                     yield prop
 80 | 
 81 | 
 82 |     @staticmethod
 83 |     def unpack_media_handheld (sheet):
 84 |         """ unpack a @media handheld rule """
 85 |         for rule in sheet:
 86 |             if rule.type == rule.MEDIA_RULE:
 87 |                 if rule.media.mediaText.find ('handheld') > -1:
 88 |                     debug ("Unpacking CSS @media handheld rule.")
 89 |                     rule.media.mediaText = 'all'
 90 |                     rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)
 91 | 
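    # Example (added): a rule like '@media handheld { p { margin: 0 } }' is
    # rewritten to target 'all', with the '/* was @media handheld */'
    # comment inserted as its first rule.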
 92 | 
 93 |     @staticmethod
 94 |     def lowercase_selectors (sheet):
 95 |         """ make selectors lowercase to match xhtml tags """
 96 |         for rule in sheet:
 97 |             if rule.type == rule.STYLE_RULE:
 98 |                 for sel in rule.selectorList:
 99 |                     sel.selectorText = RE_ELEMENT.sub (lambda m: m.group(1).lower (),
100 |                                                        sel.selectorText)
101 | 
102 | 
103 |     def rewrite_links (self, f):
104 |         """ Rewrite all links using the function f. """
105 |         cssutils.replaceUrls (self.sheet, f)
106 | 
107 | 
108 |     def drop_floats (self):
109 |         """ Drop all floats in stylesheet.
110 | 
111 |         """
112 | 
113 |         for prop in self.iter_properties (self.sheet):
114 |             if prop and prop.name == 'float': # test for existence because we remove
115 |                 prop.parent.removeProperty ('float')
116 |                 prop.parent.removeProperty ('width')
117 |                 prop.parent.removeProperty ('height')
118 |             elif prop and prop.name in ('position', 'left', 'right', 'top', 'bottom'):
119 |                 prop.parent.removeProperty (prop.name)
120 |                 
121 |         for prop in self.iter_properties (self.sheet):
122 |             #print prop.name
123 |             #print prop.value
124 |             if prop and prop.value.endswith ('px'): # test for existence because we remove
125 |                 prop.parent.removeProperty (prop.name)
126 | 
127 | 
128 |     def get_image_urls (self):
129 |         """ Return the urls of all images in document.
130 | 
131 |         Images are graphic files. The user may choose if he wants
132 |         images included or not.
133 | 
134 |         """
135 | 
136 |         images = []
137 |         
138 |         for prop in self.iter_properties (self.sheet):
139 |             if (prop.value.cssValueType == prop.value.CSS_PRIMITIVE_VALUE and
140 |                 prop.value.primitiveType == prop.value.CSS_URI):
141 |                 url = urlparse.urljoin (self.url, prop.value.cssText)
142 |                 images.append (url)
143 |             
144 |         return  images
145 | 
146 | 
147 |     def get_aux_urls (self):
148 |         """ Return the urls of all auxiliary files in document.
149 | 
150 |         Auxiliary files are non-document files you need to correctly
151 |         display the document file, eg. CSS files.
152 | 
153 |         """
154 | 
155 |         aux = []
156 |         
157 |         for rule in self.sheet:
158 |             if rule.type == rule.IMPORT_RULE:
159 |                 aux.append (urlparse.urljoin (self.url, rule.href))
160 | 
161 |         return  aux
162 | 
163 | 
164 |     def serialize (self):
165 |         """ Serialize CSS. """
166 | 
167 |         return self.sheet.cssText
168 | 
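
A minimal usage sketch (not part of the source) of the @media handheld
unpacking performed by Parser.unpack_media_handheld above, driven through
cssutils directly; the sample stylesheet is invented for illustration:

    import cssutils

    sheet = cssutils.parseString ('@media handheld { body { font-size: 90% } }')
    for rule in sheet:
        if rule.type == rule.MEDIA_RULE and 'handheld' in rule.media.mediaText:
            # same transformation as unpack_media_handheld ()
            rule.media.mediaText = 'all'
            rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)

    print sheet.cssText   # the rule body now applies to all media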


--------------------------------------------------------------------------------
/epubmaker/parsers/HTMLParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | import re
 15 | import subprocess
 16 | import urllib
 17 | import urlparse
 18 | 
 19 | import lxml.html
 20 | from lxml import etree
 21 | # import tidy
 22 | 
 23 | from epubmaker.lib.GutenbergGlobals import NS, xpath
 24 | from epubmaker.lib.Logger import info, debug, warn, error
 25 | from epubmaker.lib.MediaTypes import mediatypes as mt
 26 | 
 27 | from epubmaker import parsers
 28 | from epubmaker.parsers import HTMLParserBase
 29 | 
 30 | mediatypes = ('text/html', mt.xhtml)
 31 | 
 32 | RE_XMLDECL = re.compile ('<\?xml[^?]+\?>\s*')
 33 | 
 34 | DEPRECATED = { 'align':      """caption applet iframe img input object legend
 35 |                              table hr div h1 h2 h3 h4 h5 h6 p""",
 36 |                'alink':      'body',
 37 |                'alt':        'applet',
 38 |                'archive':    'applet',
 39 |                'background': 'body',
 40 |                'bgcolor':    '*',
 41 |                'border':     'img object',
 42 |                'clear':      'br',
 43 |                'code':       'applet',
 44 |                'codebase':   'applet',
 45 |                'color':      '*',
 46 |                'compact':    '*',
 47 |                'face':       '*',
 48 |                'height':     'td th applet',
 49 |                'hspace':     '*',
 50 |                'language':   'script',
 51 |                'link':       'body',
 52 |                'name':       'applet',
 53 |                'noshade':    'hr',
 54 |                'nowrap':     '*',
 55 |                'object':     'applet',
 56 |                'prompt':     'isindex',
 57 |                'size':       'hr font basefont',
 58 |                'start':      'ol',
 59 |                'text':       'body',
 60 |                'type':       'li ol ul',
 61 |                'value':      'li',
 62 |                'version':    'html',
 63 |                'vlink':      'body',
 64 |                'vspace':     '*',
 65 |                'width':      'hr td th applet pre',
 66 |                }
 67 | 
 68 | 
 69 | class Parser (HTMLParserBase):
  70 |     """ Parse an HTML text
 71 | 
 72 |     and convert it to xhtml suitable for ePub packaging.
 73 | 
 74 |     """
 75 | 
 76 |     @staticmethod
 77 |     def _fix_id (id_):
 78 |         """ Fix more common mistakes in ids.
 79 | 
 80 |         xml:id cannot start with digit, very common in pg.
 81 | 
 82 |         """
 83 | 
 84 |         if not parsers.RE_XML_NAME.match (id_):
 85 |             id_ = 'id_' + id_
 86 | 
 87 |         # debug ("_fix_id: id = %s" % id_)
 88 |         return id_
 89 | 
 90 | 
 91 |     def _fix_internal_frag (self, id_):
 92 |         """ Fix more common mistakes in ids. """
 93 | 
 94 |         # This is a big mess because href attributes must be quoted,
 95 |         # but id attributes must not be quoted.  Some HTML in PG
 96 |         # quotes ids in a misguided attempt to make id and href look
 97 |         # the same.  But '%' is invalid in xml ids.
 98 |         #
 99 |         # See HTML 4.01 spec section B.2.
100 | 
101 |         if '%' in id_:
102 |             id_ = urllib.unquote (id_)
103 |             try:
104 |                 id_ = id_.decode ('utf-8')
105 |             except UnicodeError:
106 |                 try:
107 |                     id_ = id_.decode (self.encoding)
108 |                 except UnicodeError:
109 |                     pass # we tried
110 | 
111 |         # xml:id cannot start with digit
112 |         # very common in pg
113 | 
114 |         if not parsers.RE_XML_NAME.match (id_):
115 |             id_ = 'id_' + id_
116 | 
117 |         if not parsers.RE_XML_NAME.match (id_):
118 |             # still invalid ... we tried
119 |             return None
120 | 
121 |         # debug ("_fix_internal_frag: frag = %s" % id_)
122 |         return id_
123 | 
124 | 
125 |     # @staticmethod
126 |     # def tidylib (html):
127 |     #     """ Pipe html thru w3c tidylib. """
128 | 
129 |     #     html = parsers.RE_RESTRICTED.sub ('', html)
130 |     #     html = RE_XMLDECL.sub ('', html)
131 |     #     html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
132 | 
133 |     #     options = {
134 |     #         "clean": 1,
135 |     #         "wrap":  0,
136 |     #         "output_xhtml":     1,
137 |     #         "numeric_entities": 1,
138 |     #         "merge_divs":       0, # keep poetry indentation
139 |     #         "merge_spans":      0,
140 |     #         "add_xml_decl":     0,
141 |     #         "doctype":          "strict",
142 |     #         "anchor_as_name":   0,
143 |     #         "enclose_text":     1,
144 |     #         }
145 | 
146 |     #     try:
147 |     #         html = tidy.parseString (html.encode ('utf-8'))
148 |     #     except TidyLibError, what:
149 |     #         error ("Tidy: %s" % what)
150 |     #         raise
151 | 
152 |     #     return html
153 | 
154 | 
155 |     @staticmethod
156 |     def tidy (html):
157 |         """ Pipe html thru w3c tidy. """
158 | 
159 |         html = parsers.RE_RESTRICTED.sub ('', html)
160 |         html = RE_XMLDECL.sub ('', html)
161 |         html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
162 | 
163 |         # convert to xhtml
164 |         tidy = subprocess.Popen (
165 |             ["tidy",
166 |              "-utf8",
167 |              "-clean",
168 |              "--wrap",             "0",
169 |              # "--drop-font-tags",   "y",
170 |              # "--drop-proprietary-attributes", "y",
171 |              # "--add-xml-space",    "y",
172 |              "--output-xhtml",     "y",
173 |              "--numeric-entities", "y",
174 |              "--merge-divs",       "n", # keep poetry indentation
175 |              "--merge-spans",      "n",
176 |              "--add-xml-decl",     "n",
177 |              "--doctype",          "strict",
178 |              "--anchor-as-name",   "n",
179 |              "--enclose-text",     "y" ],
180 | 
181 |             stdin = subprocess.PIPE,
182 |             stdout = subprocess.PIPE,
183 |             stderr = subprocess.PIPE)
184 | 
185 |         # print (html.encode ('utf-8'))
186 |         # sys.exit ()
187 | 
188 |         (html, stderr) = tidy.communicate (html.encode ('utf-8'))
189 | 
190 |         regex = re.compile ('(Info:|Warning:|Error:)\s*', re.I)
191 | 
192 |         # pylint: disable=E1103
193 |         msg = stderr.rstrip ()
194 |         for line in msg.splitlines ():
195 |             match = regex.search (line)
196 |             if match:
197 |                 sline = regex.sub ("", line)
198 |                 g = match.group (1).lower ()
199 |                 if g == 'info:':
200 |                     info ("tidy: %s" % sline)
201 |                 elif g == 'warning:':
202 |                     warn ("tidy: %s" % sline)
203 |                 elif g == 'error:':
204 |                     error ("tidy: %s" % sline)
205 |                 else:
206 |                     error (line)
207 | 
208 |         if tidy.returncode == 2:
209 |             raise ValueError, stderr
210 | 
211 |         return html.decode ('utf-8')
212 | 
213 | 
214 |     def find_coverpage (self):
215 |         """ Search coverpage and put url into <link rel="coverpage" >.
216 | 
217 |         First look for an image with id of 'coverpage', then for an
218 |         image with 'cover' in the url, then with 'title' in the url.
219 | 
220 |         """
221 |         for head in xpath (self.xhtml, 'xhtml:head'):
222 |             for dummy_link in xpath (head, 'xhtml:link[@rel = "coverpage"]'):
223 |                 # already there
224 |                 return
225 | 
226 |             covers = (xpath (self.xhtml, '//xhtml:img[@id = "coverpage"]') or
227 |                       xpath (self.xhtml, '//xhtml:img[contains (@src, "cover")]') or
228 |                       xpath (self.xhtml, '//xhtml:img[contains (@src, "title")]'))
229 |             if not covers:
230 |                 return
231 | 
232 |             href = covers[0].get ('src')
233 |             # FIXME: enforce minimum size
234 |             head.append (etree.Element (NS.xhtml.link, rel = 'coverpage', href = href))
235 |             return href
236 | 
237 | 
238 |     def _fix_anchors (self):
239 |         """ Move name to id and fix hrefs and ids. """
240 | 
241 |         # move anchor name to id
242 |         # 'id' values are more strict than 'name' values
243 |         # try to fix ill-formed ids
244 | 
245 |         seen_ids = set ()
246 | 
247 |         for anchor in (xpath (self.xhtml, "//xhtml:a[@name]") +
248 |                        xpath (self.xhtml, "//xhtml:*[@id]")):
249 |             id_ = anchor.get ('id') or anchor.get ('name')
250 | 
251 |             if 'name' in anchor.attrib:
252 |                 del anchor.attrib['name']
253 |             if 'id' in anchor.attrib:
254 |                 del anchor.attrib['id']
255 |             if NS.xml.id in anchor.attrib:
256 |                 del anchor.attrib[NS.xml.id]
257 | 
258 |             id_ = self._fix_id (id_)
259 | 
260 |             if not parsers.RE_XML_NAME.match (id_):
261 |                 error ("Dropping ill-formed id '%s' in %s" % (id_, self.url))
262 |                 continue
263 | 
264 |             # well-formed id
265 |             if id_ in seen_ids:
266 |                 error ("Dropping duplicate id '%s' in %s" % (id_, self.url))
267 |                 continue
268 | 
269 |             seen_ids.add (id_)
270 |             anchor.set ('id', id_)
271 | 
272 | 
273 |         # try to fix bogus fragment ids
274 |         # 1. fragments point to xml:id, so must be well-formed ids
275 |         # 2. the ids they point to must exist
276 | 
277 |         for link in xpath (self.xhtml, "//xhtml:*[@href]"):
278 |             href = link.get ('href')
279 |             hre, frag = urlparse.urldefrag (href)
280 |             if frag:
281 |                 frag = self._fix_internal_frag (frag)
282 | 
283 |                 if not frag:
284 |                     # non-recoverable ill-formed frag
285 |                     del link.attrib['href']
286 |                     self.add_class (link, 'pgkilled')
287 |                     error ('Dropping ill-formed frag in %s' % href)
288 |                     continue
289 | 
290 |                 # well-formed frag
291 |                 if hre:
292 |                     # we have url + frag
293 |                     link.set ('href', "%s#%s" % (hre, urllib.quote (frag.encode ('utf-8'))))
294 |                     self.add_class (link, 'pgexternal')
295 |                 elif frag in seen_ids:
296 |                     # we have only frag
297 |                     link.set ('href', "#%s" % urllib.quote (frag.encode ('utf-8')))
298 |                     self.add_class (link, 'pginternal')
299 |                 else:
300 |                     del link.attrib['href']
301 |                     self.add_class (link, 'pgkilled')
302 |                     error ("Dropping frag to non-existing id in %s" % href)
303 | 
304 | 
305 |     def _to_xhtml11 (self):
 306 |         """ Make vanilla xhtml conform more closely to xhtml 1.1. """
307 | 
308 |         # Change content-type meta to application/xhtml+xml.
309 |         for meta in xpath (self.xhtml, "/xhtml:html/xhtml:head/xhtml:meta[@http-equiv]"):
310 |             if meta.get ('http-equiv').lower () == 'content-type':
311 |                 meta.set ('content', mt.xhtml + '; charset=utf-8')
312 | 
313 |         # drop javascript
314 | 
315 |         for script in xpath (self.xhtml, "//xhtml:script"):
316 |             script.drop_tree ()
317 | 
318 |         # drop form
319 | 
320 |         for form in xpath (self.xhtml, "//xhtml:form"):
321 |             form.drop_tree ()
322 | 
323 |         # blockquotes
324 | 
325 |         for bq in xpath (self.xhtml, "//xhtml:blockquote"):
326 |             # no naked text allowed in <blockquote>
327 |             div = etree.Element (NS.xhtml.div)
328 |             for child in bq:
329 |                 div.append (child)
330 |             div.text = bq.text
331 |             bq.text = None
332 |             bq.append (div)
333 |             # lxml.html.defs.block_tags
334 | 
335 |         # insert tbody
336 | 
337 |         for table in xpath (self.xhtml, "//xhtml:table[xhtml:tr]"):
338 |             # no naked <tr> allowed in <table>
339 |             tbody = etree.Element (NS.xhtml.tbody)
340 |             for tr in table:
341 |                 if tr.tag == NS.xhtml.tr:
342 |                     tbody.append (tr)
343 |             table.append (tbody)
344 | 
345 |         # move lang to xml:lang
346 | 
347 |         for elem in xpath (self.xhtml, "//xhtml:*[@lang]"):
348 |             # bug in lxml 2.2.2: sometimes deletes wrong element
349 |             # so we delete both and reset the right one
350 |             lang = elem.get ('lang')
351 |             try:
352 |                 del elem.attrib[NS.xml.lang]
353 |             except KeyError:
354 |                 pass
355 |             del elem.attrib['lang']
356 |             elem.set (NS.xml.lang, lang)
357 | 
358 |         # strip deprecated attributes
359 | 
360 |         for a, t in DEPRECATED.items ():
361 |             for tag in t.split ():
362 |                 for elem in xpath (self.xhtml, "//xhtml:%s[@%s]" % (tag, a)):
363 |                     del elem.attrib[a]
364 | 
365 |         # strip empty class attributes
366 | 
367 |         for elem in xpath (self.xhtml,
368 |             "//xhtml:*[@class and normalize-space (@class) = '']"):
369 |             del elem.attrib['class']
370 | 
371 |         # strip bogus header markup by Joe L.
372 |         for elem in xpath (self.xhtml, "//xhtml:h1"):
373 |             if elem.text and elem.text.startswith ("The Project Gutenberg eBook"):
374 |                 elem.tag = NS.xhtml.p
375 |         for elem in xpath (self.xhtml, "//xhtml:h3"):
376 |             if elem.text and elem.text.startswith ("E-text prepared by"):
377 |                 elem.tag = NS.xhtml.p
378 | 
379 | 
380 |     def __parse (self, html):
381 |         # remove xml decl and doctype, we will add the correct one before serializing
382 |         # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
383 |         # FIXME: do not remove doctype because we need it to load the dtd
384 | 
385 |         # remove xml declaration because of parser error: "Unicode
386 |         # strings with encoding declaration are not supported. Please
387 |         # use bytes input or XML fragments without declaration."
388 |         re_xml_decl = re.compile (r'^<\?xml.*?\?>', re.S)
389 |         html = re_xml_decl.sub ('', html)
390 |         try:
391 |             return etree.fromstring (
392 |                 html,
393 |                 lxml.html.XHTMLParser (),
394 |                 base_url = self.url)
395 |         except etree.ParseError, what:
396 |             # cannot try HTML parser because we depend on correct xhtml namespace
397 |             error ("etree.fromstring says: %s" % what)
398 |             m = re.search (r'line\s(\d+),', str (what))
399 |             if m:
400 |                 lineno = int (m.group (1))
401 |                 error ("Line %d: %s" % (lineno, html.splitlines ()[lineno - 1]))
402 |             raise
403 | 
404 | 
405 |     def pre_parse (self):
406 |         """ Pre-parse a html ebook. Does a full parse because a
407 |         lightweight parse would be almost as much work. """
408 | 
409 |         # cache
410 |         if self.xhtml is not None:
411 |             return
412 | 
413 |         debug ("HTMLParser.pre_parse () ...")
414 | 
415 |         html = self.unicode_content ()
416 | 
417 |         if html.startswith ('<?xml'):
418 |             # Try a naive parse. This might fail because of errors in
419 |             # the html or because we have no dtd loaded.  We do not
420 |             # load dtds because that makes us dependent on network and
421 |             # the w3c site being up.  Having all users of epubmaker
422 |             # install local dtds is unrealistic.
423 |             try:
424 |                 self.xhtml = self.__parse (html)
425 |             except etree.ParseError:
426 |                 pass
427 | 
428 |         if self.xhtml is None:
429 |             # previous parse failed, try tidy
430 |             info ("Running html thru tidy.")
431 |             html = self.tidy (html)
432 |             self.xhtml = self.__parse (html)     # let exception bubble up
433 | 
434 |         self._fix_anchors () # needs relative paths
435 |         self.xhtml.make_links_absolute (base_url = self.url)
436 |         self.find_coverpage ()
437 | 
438 |         self._to_xhtml11 ()
439 | 
440 |         debug ("Done parsing %s" % self.url)
441 | 
442 | 
443 |     def parse (self):
444 |         """ Fully parse a html ebook. """
445 | 
446 |         debug ("HTMLParser.parse () ...")
447 | 
448 |         self.pre_parse ()
449 | 


--------------------------------------------------------------------------------
/epubmaker/parsers/ImageParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | ImageParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
  12 | Parse a URL of type image/*.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import StringIO
 19 | 
 20 | from PIL import Image
 21 | 
 22 | from pkg_resources import resource_string # pylint: disable=E0611
 23 | 
 24 | from epubmaker.lib.Logger import debug, error
 25 | from epubmaker.lib.MediaTypes import mediatypes as mt
 26 | from epubmaker.parsers import ParserBase
 27 | 
 28 | mediatypes = (mt.jpeg, mt.png, mt.gif)
 29 | 
 30 | class Parser (ParserBase):
 31 |     """Parse an image.
 32 | 
 33 |     And maybe resize it for ePub packaging.
 34 | 
 35 |     """
 36 | 
 37 |     def __init__ (self):
 38 |         ParserBase.__init__ (self)
 39 |         self.image_data = None
 40 |         self.dimen = None
 41 |         self.comment = None
 42 | 
 43 | 
 44 |     def resize_image (self, max_size, max_dimen, output_format = None):
 45 |         """ Create a new parser with a resized image. """
 46 | 
 47 |         new_parser = Parser ()
 48 | 
 49 |         try:
 50 |             image = Image.open (StringIO.StringIO (self.image_data))
 51 | 
 52 |             format_ = image.format
 53 |             if output_format:
 54 |                 format_ = output_format
 55 |             if format_ == 'gif':
 56 |                 format_ = 'png'
 57 |             if format_ == 'jpeg' and image.mode.lower () != 'rgb':
 58 |                 image = image.convert ('RGB')
 59 | 
 60 |             if 'dpi' in image.info:
 61 |                 del image.info['dpi']
 62 | 
 63 |             # maybe resize image
 64 | 
 65 |             # find scaling factor
 66 |             scale = 1.0
 67 |             scale = min (scale, max_dimen[0] / float (image.size[0]))
 68 |             scale = min (scale, max_dimen[1] / float (image.size[1]))
 69 | 
 70 |             was = ''
 71 |             if scale < 1.0:
 72 |                 dimen = (int (image.size[0] * scale), int (image.size[1] * scale))
 73 |                 was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
 74 |                 image = image.resize (dimen, Image.ANTIALIAS)
 75 | 
 76 |             # find best quality that fits into max_size
 77 |             data = self.image_data
 78 |             if (scale < 1.0) or (len (self.image_data) > max_size):
 79 |                 for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
 80 |                     buf = StringIO.StringIO ()
 81 |                     image.save (buf, format_, quality = quality)
 82 |                     data = buf.getvalue ()
 83 |                     if (len (data) <= max_size):
 84 |                         was += 'q=%d' % quality
 85 |                         break
 86 | 
 87 |             comment = "Image: %d x %d size=%d %s" % (
 88 |                         image.size[0], image.size[1], len (data), was)
 89 |             debug (comment)
 90 | 
 91 |             new_parser.mediatype = self.mediatype
 92 |             new_parser.image_data = data
 93 |             new_parser.dimen = tuple (image.size)
 94 |             new_parser.comment = comment
 95 |             new_parser.url = self.url
 96 |             new_parser.orig_url = self.orig_url
 97 |             new_parser.attribs = self.attribs
 98 |             new_parser.fp = self.fp
 99 | 
100 |         except IOError, what:
101 |             error ("Could not resize image: %s" % what)
102 |             new_parser.broken_image ()
103 | 
104 |         return new_parser
105 | 
106 | 
107 |     def get_image_dimen (self):
108 |         if self.dimen is None:
109 |             image = Image.open (StringIO.StringIO (self.image_data))
110 |             self.dimen = image.size
111 |         return self.dimen
112 | 
113 | 
114 |     def broken_image (self):
115 |         """ Insert broken image placeholder. """
116 | 
117 |         self.image_data = resource_string ('epubmaker.parsers', 'broken.png')
118 |         # We need a way to distinguish between pngs to drop and pngs
119 |         # to keep in a non-images build.
120 |         self.mediatype = 'image/png;type=resource'
121 | 
122 | 
123 |     def pre_parse (self):
124 |         if self.image_data is None:
125 |             self.image_data = self.bytes_content ()
126 |         if self.image_data is None:
127 |             self.broken_image ()
128 | 
129 | 
130 |     def parse (self):
131 |         """ Parse the image. """
132 | 
133 |         pass
134 | 
135 | 
136 |     def serialize (self):
137 |         """ Serialize the image. """
138 |         return self.image_data
139 | 
140 | 
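
A minimal sketch (not part of the source) of the scaling rule used in
Parser.resize_image above: an image is shrunk by the largest factor <= 1.0
that fits both dimensions into max_dimen, and is never upscaled:

    def scale_factor (size, max_dimen):
        """ Same arithmetic as resize_image (). """
        scale = 1.0
        scale = min (scale, max_dimen[0] / float (size[0]))
        scale = min (scale, max_dimen[1] / float (size[1]))
        return scale

    print scale_factor ((1600, 1200), (800, 600))   # -> 0.5
    print scale_factor ((400, 300), (800, 600))     # -> 1.0  (never upscaled)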


--------------------------------------------------------------------------------
/epubmaker/parsers/RSTParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | RSTParser.py
  7 | 
  8 | Copyright 2010-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | # FIXME:
 15 | # use docinfo instead of meta for pg header
 16 | 
 17 | import copy
 18 | import re
 19 | import os
 20 | import collections
 21 | import urlparse
 22 | from functools import partial
 23 | 
 24 | from lxml import etree
 25 | import lxml.html
 26 | 
 27 | import docutils.readers.standalone
 28 | from docutils import nodes, frontend, io
 29 | 
 30 | from pkg_resources import resource_string # pylint: disable=E0611
 31 | 
 32 | from epubmaker.lib.GutenbergGlobals import NS, xpath
 33 | from epubmaker.lib.Logger import info, debug, warn, error
 34 | from epubmaker.lib.MediaTypes import mediatypes as mt
 35 | 
 36 | from epubmaker import ParserFactory
 37 | from epubmaker.parsers import HTMLParser
 38 | 
 39 | from epubmaker.mydocutils import broken
 40 | from epubmaker.mydocutils import nodes as mynodes
 41 | from epubmaker.mydocutils.writers import xhtml1, epub2, xetex
 42 | 
 43 | from epubmaker.mydocutils.gutenberg import parsers as gutenberg_parsers
 44 | from epubmaker.mydocutils.gutenberg.writers import nroff as gutenberg_nroff
 45 | from epubmaker.CommonOptions import Options
 46 | 
 47 | options = Options()
 48 | 
 49 | mediatypes = (mt.rst, )
 50 | 
 51 | RE_EMACS_CHARSET = re.compile (r'-\*-.*coding:\s*(\S+)',  re.I)
 52 | 
 53 | class Parser (HTMLParser.Parser):
 54 |     """ Parse a ReStructured Text 
 55 | 
 56 |     and convert it to different xhtml flavours.
 57 | 
 58 |     """
 59 | 
 60 |     def __init__ (self):
 61 |         HTMLParser.Parser.__init__ (self)
 62 |         self.document1 = None
 63 | 
 64 | 
 65 |     def preprocess (self, charset):
 66 |         """ Insert pg header and footer. """
 67 |         
 68 |         return self.unicode_content ()
 69 | 
 70 | 
 71 |     def to_xhtml (self, html, base_url):
  72 |         html = html.replace (u'&nbsp;', u'\u00a0')
  73 |         html = html.replace (u'&mdash;', u'\u2014')
 74 | 
 75 |         outputfilename = os.path.join (options.outputdir, options.outputfile)
 76 |         debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'
 77 | 
 78 |         try:
 79 |             os.remove (debugfilename)
 80 |         except OSError:
 81 |             pass
 82 |         
 83 |         if options.verbose > 1:
 84 |             with open (debugfilename, 'w') as fp:
 85 |                 fp.write (html.encode ('utf-8'))
 86 | 
 87 |         try:
 88 |             xhtml = etree.fromstring (
 89 |                 html, 
 90 |                 lxml.html.XHTMLParser (),
 91 |                 base_url = base_url)                                           
 92 |         except etree.ParseError, what:
 93 |             error ("etree.fromstring says %s" % what)
 94 |             raise
 95 | 
 96 |         xhtml.make_links_absolute (base_url = base_url)
 97 | 
 98 |         return xhtml
 99 | 
100 | 
101 |     def rewrite_links (self, f):
102 |         """ Rewrite all links using the function f. """
103 | 
104 |         doc = self.document1
105 | 
106 |         if 'coverpage' in doc.meta_block:
107 |             coverpage = doc.meta_block['coverpage']
108 |             coverpage[0] = f (coverpage[0])
109 |         else:
110 |             for field in doc.traverse (nodes.field):
111 |                 field_name, field_body = field.children
112 |                 if field_name.astext () == 'coverpage':
113 |                     field_body[:] = nodes.paragraph ('', f (field_body.astext ()))
114 |                     break
115 | 
116 |         for node in doc.traverse (nodes.reference):
117 |             if 'uri' in node:
118 |                 node['uri'] = f (node['uri'])
119 | 
120 |         for node in doc.traverse (nodes.image):
121 |             if 'uri' in node:
122 |                 node['uri'] = f (node['uri'])
123 | 
124 |         for node in doc.traverse (nodes.pending):
125 |             # dropcap images
126 |             if 'image' in node.details:
127 |                 node.details['image'] = f (node.details['image'])
128 | 
129 | 
130 |     def iterlinks (self):
131 |         """ Grab links and images in RST. """
132 | 
133 |         debug ("RSTParser iterlinks want_images = %d" % self.options.want_images)
134 | 
135 |         doc = self.document1
136 | 
137 |         # return coverpage even in noimages build
138 |         if 'coverpage' in doc.meta_block:
139 |             coverpage = doc.meta_block['coverpage']
140 |             yield coverpage[0], {'tag': NS.xhtml.link, 
141 |                                  'type': 'image/jpeg;type=resource', 'rel': 'coverpage'}
142 |         else:
143 |             for field in doc.traverse (nodes.field):
144 |                 field_name, field_body = field.children
145 |                 if field_name.astext () == 'coverpage':
146 |                     yield field_body.astext (), {
147 |                         'tag': NS.xhtml.link, 
148 |                         'type': 'image/jpeg;type=resource', 
149 |                         'rel': 'coverpage'}
150 |                     break
151 | 
152 |         # need broken.png for no-images build
153 |         if not self.options.want_images:
154 |             yield (urlparse.urljoin (self.url, broken), 
155 |                    {'tag': NS.xhtml.img, 'type': 'image/png;type=resource', 'rel': 'broken'})
156 | 
157 |         for node in doc.traverse (nodes.reference):
158 |             if 'uri' in node:
159 |                 yield node['uri'], {'tag': NS.xhtml.a}
160 | 
161 |         if self.options.want_images:
162 |             for node in doc.traverse (nodes.image):
163 |                 if 'uri' in node:
164 |                     yield node['uri'], {'tag': NS.xhtml.img}
165 | 
166 |         if self.options.want_images:
167 |             for node in doc.traverse (nodes.pending):
168 |                 # dropcap images
169 |                 if 'image' in node.details:
170 |                     yield node.details['image'], {'tag': NS.xhtml.img}
171 | 
172 | 
173 |     def get_settings (self, components, defaults):
174 |         option_parser = frontend.OptionParser (
175 |             components = components,
176 |             defaults = defaults, 
177 |             read_config_files = 1)
178 |         return option_parser.get_default_values ()
179 | 
180 | 
181 |     def pre_parse (self):
182 |         """ Parse a RST file as link list. """
183 | 
184 |         debug ("RSTParser: Pre-parsing %s" % self.url)
185 | 
186 |         default_style = self.get_resource (
187 |             'mydocutils.parsers', 'default_style.rst').decode ('utf-8')
188 | 
189 |         source = io.StringInput (default_style + self.unicode_content ())
190 |         reader = docutils.readers.standalone.Reader ()
191 |         parser = gutenberg_parsers.Parser ()
192 | 
193 |         overrides = {
194 |             'get_resource': self.get_resource,
195 |             'get_image_size': self.get_image_size_from_parser,
196 |             'no_images': not self.options.want_images,
197 |             'base_url': self.url,
198 |             }
199 | 
200 |         doc = reader.read (
201 |             source, parser, self.get_settings ((reader, parser), overrides))
202 |         self.document1 = doc
203 | 
204 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
205 | 
206 |         debug ("RSTParser: Done pre-parsing %s" % self.url)
207 | 
208 | 
209 |     def _full_parse (self, writer, overrides):
210 |         """ Full parse from scratch. """
211 | 
212 |         debug ("RSTParser: Full-parsing %s" % self.url)
213 | 
214 |         default_style = self.get_resource (
215 |             'mydocutils.parsers', 'default_style.rst').decode ('utf-8')
216 | 
217 |         source = io.StringInput (default_style + self.unicode_content (), 
218 |                                  self.url, 'unicode')
219 |         reader = docutils.readers.standalone.Reader ()
220 |         parser = gutenberg_parsers.Parser ()
221 | 
222 |         doc = reader.read (
223 |             source, parser, 
224 |             self.get_settings ((reader, parser, writer), overrides))
225 |         self.document1 = doc
226 | 
227 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
228 | 
229 |         doc.transformer.populate_from_components ((source, reader, parser, writer))
230 |         doc.transformer.apply_transforms ()
231 |         debug ("RSTParser: Done full-parsing %s" % self.url)
232 | 
233 |         return doc
234 | 
235 | 
236 |     def _full_parse_2 (self, writer, destination, overrides):
237 |         """ Full parser from pickled doctree. 
238 | 
239 |         Doesn't work yet. It turned out pickling a doctree is much
240 |         harder than I thought. """
241 | 
242 |         debug ("Full-parsing %s" % self.url)
243 | 
244 |         source = io.StringInput (self.unicode_content ())
245 |         reader = docutils.readers.standalone.Reader ()
246 |         parser = gutenberg_parsers.Parser ()
247 | 
248 |         doc = reader.read (
249 |             source, parser, 
250 |             self.get_settings ((reader, parser, writer), overrides))
251 |         self.document1 = doc
252 | 
253 |         self.rewrite_links (partial (urlparse.urljoin, self.url))
254 | 
255 |         # make it picklable
256 |         reporter = doc.reporter #  = None
257 |         # doc.reporter = None
258 |         transformer = doc.transformer
259 |         doc.settings = None
260 |         from docutils.parsers.rst.directives.html import MetaBody
261 | 
262 |         #for metanode in doc.traverse (MetaBody.meta):
263 |         for pending in doc.traverse (nodes.pending):
264 |             # pending.transform = None
265 |             # docutils' meta nodes aren't picklable because the class is nested
266 |             # in pending['nodes']
267 |             if 'nodes' in pending.details: 
268 |                 if isinstance (pending.details['nodes'][0], MetaBody.meta):
269 |                     pending.details['nodes'][0].__class__ = mynodes.meta
270 |         import cPickle as pickle
271 |         pickled = pickle.dumps (doc)
272 | 
273 |         doc = pickle.loads (pickled)
274 | 
275 |         #doc.transformer.populate_from_components (
276 |         #    (source, reader, parser, writer))
277 | 
278 |         doc.transformer = transformer
279 |         doc.reporter = reporter
280 |         doc.settings = self.get_settings ((reader, parser, writer), overrides)
281 | 
282 |         doc.transformer.apply_transforms ()
283 | 
284 |         return writer.write (doc, destination)
285 | 
286 | 
287 |     def rst2nroff (self, charset = 'utf-8'):
288 |         """ Convert RST to nroff. """
289 | 
290 |         writer = gutenberg_nroff.Writer ()
291 |         destination = io.StringOutput (encoding = 'unicode')
292 | 
293 |         overrides = {
294 |             'doctitle_xform': 1,
295 |             'sectsubtitle_xform': 1,
296 |             'footnote_references': 'superscript',
297 |             'compact_lists': 1,
298 |             'compact_simple': 1,
299 |             'page_numbers': 1,
300 |             'no_images': True,
301 |             'get_resource': self.get_resource,
302 |             'format': options.type,
303 |             'encoding': charset,
304 |             'base_url': self.url,
305 |             }
306 |    
307 |         doc = self._full_parse (writer, overrides)
308 |         return writer.write (doc, destination)
309 | 
310 | 
311 |     def rst2xetex (self):
312 |         """ Convert RST to xetex. """
313 | 
314 |         writer = xetex.Writer ()
315 |         destination = io.StringOutput (encoding = 'unicode')
316 | 
317 |         overrides = {
318 |             'doctitle_xform': 1,
319 |             'sectsubtitle_xform': 1,
320 |             'footnote_references': 'superscript',
321 |             'compact_lists': 1,
322 |             'compact_simple': 1,
323 |             'page_numbers': 1,
324 |             'format': options.type,
325 |             'encoding': 'utf-8',
326 |             'get_resource': self.get_resource,
327 |             'get_image_size': self.get_image_size_from_parser,
328 |             'no_images': not self.options.want_images,
329 |             'base_url': self.url,
330 |             }
331 | 
332 |         doc = self._full_parse (writer, overrides)
333 |         return writer.write (doc, destination)
334 | 
335 | 
336 |     def rst2htmlish (self, writer, more_overrides = {}):
337 | 
338 |         destination = io.StringOutput (encoding = 'unicode')
339 | 
340 |         overrides = {
341 |             'stylesheet': None,
342 |             'stylesheet_path': None,
343 |             'xml_declaration': 0,
344 |             'doctitle_xform': 1,
345 |             'initial_header_level': 2,
346 |             'sectsubtitle_xform': 1,
347 |             'footnote_references': 'superscript',
348 |             'page_numbers': 1,
349 |             'format': options.type,
350 |             'encoding': 'utf-8',
351 |             'get_resource': self.get_resource,
352 |             'get_image_size': self.get_image_size_from_parser,
353 |             'no_images': not self.options.want_images,
354 |             'base_url': self.url,
355 |             }
356 |         overrides.update (more_overrides)
357 | 
358 |         doc = self._full_parse (writer, overrides)
359 |         return writer.fixup_xhtml (self.to_xhtml (writer.write (doc, destination), self.url))
360 | 
361 | 
362 |     def rst2html (self):
363 |         """ Convert RST input to HTML output. """
364 |         return self.rst2htmlish (xhtml1.Writer ())
365 | 
366 | 
367 |     def rst2epub2 (self):
368 |         """ Convert RST input to HTML output with Epub2 tweaks. """
369 |         return self.rst2htmlish (epub2.Writer (), 
370 |                                  { 'toc_backlinks': 'none' })
371 | 
372 | 
373 |     def get_resource (self, package, resource):
374 |         return (resource_string ('epubmaker.' + package, resource))
375 | 
376 | 
377 |     def get_image_size_from_parser (self, uri):
378 |         # debug ("Getting image dimen for %s" % uri)
379 |         parser = ParserFactory.ParserFactory.create (uri, {})
380 |         parser.pre_parse ()
381 |         if hasattr (parser, 'get_image_dimen'):
382 |             return parser.get_image_dimen ()
383 |         return None
384 | 
385 | 
386 |     def get_charset_from_rstheader (self):
387 |         """ Parse text for hints about charset. """
388 |         # .. -*- coding: utf-8 -*-
389 |         
390 |         charset = None
391 |         rst = self.bytes_content ()
392 |         
393 |         match = RE_EMACS_CHARSET.search (rst)
394 |         if (match):
395 |             charset = match.group (1)
396 |             debug ('Got charset %s from emacs comment' % charset)
397 | 
398 |         return charset
399 | 
400 | 
401 |     def parse (self):
402 |         """ Dummy. Use rst2* instead. """
403 | 
404 |         debug ("Done parsing %s" % self.url)
405 | 
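
A small sketch (not from the source) of the emacs-style charset sniffing done
by get_charset_from_rstheader above, reusing the RE_EMACS_CHARSET pattern
defined at the top of this module:

    import re

    RE_EMACS_CHARSET = re.compile (r'-\*-.*coding:\s*(\S+)', re.I)

    match = RE_EMACS_CHARSET.search ('.. -*- coding: utf-8 -*-')
    if match:
        print match.group (1)   # -> 'utf-8'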


--------------------------------------------------------------------------------
/epubmaker/parsers/broken.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/parsers/broken.png


--------------------------------------------------------------------------------
/epubmaker/writers/HTMLWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLWriter.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Writes an HTML file
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os
 19 | import copy
 20 | 
 21 | from lxml import etree
 22 | from pkg_resources import resource_string # pylint: disable=E0611
 23 | 
 24 | import epubmaker.lib.GutenbergGlobals as gg
 25 | from epubmaker.lib.GutenbergGlobals import xpath
 26 | from epubmaker.lib.Logger import info, debug, error, exception
 27 | 
 28 | from epubmaker import writers
 29 | from epubmaker.CommonOptions import Options
 30 | 
 31 | options = Options()
 32 | 
 33 | 
 34 | class Writer (writers.HTMLishWriter):
 35 |     """ Class for writing HTML files. """
 36 | 
 37 | 
 38 |     def add_dublincore (self, tree):
 39 |         """ Add dublin core metadata to <head>. """
 40 |         source = gg.archive2files (
 41 |             self.options.ebook, self.options.candidate.filename)
 42 | 
 43 |         if hasattr (options.config, 'FILESDIR'):
 44 |             self.options.dc.source = source.replace (options.config.FILESDIR, options.config.PGURL)
 45 |         
 46 |         for head in xpath (tree, '//xhtml:head'):
 47 |             for e in self.options.dc.to_html ():
 48 |                 e.tail = '\n'
 49 |                 head.append (e)
 50 | 
 51 | 
 52 |     def build (self):
 53 |         """ Build HTML file. """
 54 | 
 55 |         htmlfilename = os.path.join (self.options.outputdir, 
 56 |                                      self.options.outputfile)
 57 |         try:
 58 |             os.remove (htmlfilename)
 59 |         except OSError:
 60 |             pass
 61 |                                      
 62 |         try:
 63 |             info ("Creating HTML file: %s" % htmlfilename)
 64 | 
 65 |             for p in self.spider.parsers:
 66 |                 # Do html only. The images were copied earlier by PicsDirWriter.
 67 | 
 68 |                 xhtml = None
 69 |                 if hasattr (p, 'rst2html'):
 70 |                     xhtml = p.rst2html ()
 71 |                 elif hasattr (p, 'xhtml'):
 72 |                     p.parse ()
 73 |                     xhtml = copy.deepcopy (p.xhtml)
 74 | 
 75 |                 if xhtml is not None:
 76 |                     self.make_links_relative (xhtml, p.url)
 77 | 
 78 |                     self.add_dublincore (xhtml)
 79 | 
 80 |                     # makes iphones zoom in
 81 |                     self.add_meta (xhtml, 'viewport', 'width=device-width')
 82 |                     self.add_meta_generator (xhtml)
 83 | 
 84 |                     # This writer has currently to deal only with RST
 85 |                     # input.  The RST writer has a workaround that
 86 |                     # avoids writing empty elements.  So we don't need
 87 |                     # the same ugly workaround as the EPUB writer,
 88 |                     # that has to deal with HTML input too.
 89 |                     html = etree.tostring (xhtml, 
 90 |                                            method = 'xml',
 91 |                                            doctype = gg.XHTML_DOCTYPE,
 92 |                                            encoding = 'utf-8', 
 93 |                                            pretty_print = True,
 94 |                                            xml_declaration = True)
 95 |                     
 96 |                     self.write_with_crlf (htmlfilename, html)
 97 | 
 98 |             # self.copy_aux_files (self.options.outputdir)
 99 |         
100 |             info ("Done HTML file: %s" % htmlfilename)
101 | 
102 |         except StandardError, what:
103 |             exception ("Error building HTML %s: %s" % (htmlfilename, what))
104 |             if os.access (htmlfilename, os.W_OK):
105 |                 os.remove (htmlfilename)
106 |             raise what
107 | 
108 | 
109 | 
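
A minimal sketch (not part of the source) of the lxml serialization call used
in Writer.build () above; the doctype string is a literal stand-in for
gg.XHTML_DOCTYPE, which is defined in GutenbergGlobals:

    from lxml import etree

    # stand-in for gg.XHTML_DOCTYPE
    XHTML_DOCTYPE = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                     '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

    root = etree.fromstring (
        '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Hello</p></body></html>')

    print etree.tostring (root,
                          method = 'xml',
                          doctype = XHTML_DOCTYPE,
                          encoding = 'utf-8',
                          pretty_print = True,
                          xml_declaration = True)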


--------------------------------------------------------------------------------
/epubmaker/writers/KindleWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | KindleWriter.py
  7 | 
  8 | Copyright 2009-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | import re
 15 | import os
 16 | import subprocess
 17 | 
 18 | from epubmaker.lib.Logger import info, debug, warn, error
 19 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 20 | from epubmaker.writers import EpubWriter
 21 | from epubmaker.CommonOptions import Options
 22 | 
 23 | options = Options()
 24 | 
 25 | 
 26 | class Writer (EpubWriter.Writer):
 27 |     """ Class for writing kindle files. """
 28 | 
 29 | 
 30 |     def parse (self, options):
 31 |         """ Standard parse. """
 32 |         self.setup (options)
 33 | 
 34 | 
 35 |     def build (self):
 36 |         """ Build kindle file. """
 37 | 
  38 |         # Feed the already-built epub file to kindlegen.  (The old approach
  39 |         # of building a simplified temporary epub is commented out below.)
 40 |         
 41 |         # Much unnecessary juggling of files here because
 42 |         # brain-dead kindlegen doesn't understand unix pipes
 43 |         # and can only output in current directory.
 44 |         # Furthermore we must not conflict with the filenames
 45 |         # of the other generated epub files.
 46 | 
 47 |         kindle_filename = self.options.outputfile
 48 |         epub_filename   = self.options.epub_filename
 49 | 
 50 |         # tmp_epub_filename = os.path.splitext (kindle_filename)[0] + '-kindlegen.epub'
 51 |         # 
 52 |         # debug ("Creating temp Epub file: %s" % os.path.join (
 53 |         #     self.options.outputdir, tmp_epub_filename))
 54 |         # 
 55 |         # # call EpubWriter to build temporary epub file
 56 |         # self.options.outputfile = tmp_epub_filename
 57 |         # EpubWriter.Writer.build (self)
 58 |         # self.options.outputfile = kindle_filename
 59 |         
 60 |         info ("Creating Kindle file: %s" % os.path.join (
 61 |             self.options.outputdir, kindle_filename))
 62 |         info ("            ... from: %s" % os.path.join (
 63 |             self.options.outputdir, epub_filename))
 64 | 
 65 |         try:
 66 |             cwd = os.getcwd ()
 67 |             os.chdir (self.options.outputdir)
 68 | 
 69 |             kindlegen = subprocess.Popen (
 70 |                 [options.config.MOBIGEN, '-o', os.path.basename (kindle_filename), epub_filename],
 71 |                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 72 | 
 73 |         except OSError, what:
 74 |             os.chdir (cwd)
 75 |             error ("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
 76 |             raise SkipOutputFormat
 77 |         
 78 |         (stdout, stderr) = kindlegen.communicate ('')
 79 | 
 80 |         # try:
 81 |         #     # if self.options.verbose < 2:
 82 |         #     #     os.remove (tmp_epub_filename)
 83 |         #     os.remove (kindle_filename)
 84 |         # except OSError:
 85 |         #     pass
 86 |         #
 87 |         # tmp_mobi_filename = os.path.splitext (tmp_epub_filename)[0] + '.mobi'
 88 |         # os.rename (tmp_mobi_filename, kindle_filename)
 89 | 
 90 |         os.chdir (cwd)
 91 | 
 92 |         regex = re.compile ('^(\w+)\(prcgen\):')
 93 | 
 94 |         if kindlegen.returncode > 0:
 95 |             # pylint: disable=E1103
 96 |             info (stderr.rstrip ())
 97 |             msg = stdout.rstrip ()
 98 |             for line in msg.splitlines ():
 99 |                 match = regex.match (line)
100 |                 if match:
101 |                     sline = regex.sub ("", line)
102 |                     g = match.group (1).lower ()
103 |                     if g == 'info':
104 |                         if sline == 'MOBI File generated with WARNINGS!':
105 |                             # we knew that already
106 |                             continue
107 |                         # info ("kindlegen: %s" % sline)
108 |                     elif g == 'warning':
109 |                         if sline.startswith ('Cover is too small'):
110 |                             continue
111 |                         if sline == 'Cover not specified':
112 |                             continue
113 |                         warn ("kindlegen: %s" % sline)
114 |                     elif g == 'error':
115 |                         error ("kindlegen: %s" % sline)
116 |                     else:
117 |                         error (line)
118 | 
119 |         info ("Done Kindle file: %s" % os.path.join (
120 |             self.options.outputdir, kindle_filename))
121 | 
122 | 
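
A small sketch (not from the source) of how the prcgen regex above classifies
kindlegen output lines; the sample line is invented for illustration:

    import re

    regex = re.compile (r'^(\w+)\(prcgen\):')

    line = 'Warning(prcgen):Cover not specified'   # invented sample line
    match = regex.match (line)
    if match:
        print match.group (1).lower ()   # -> 'warning'
        print regex.sub ('', line)       # -> 'Cover not specified'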


--------------------------------------------------------------------------------
/epubmaker/writers/PDFWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | PDFWriter.py
  6 | 
  7 | Copyright 2011 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Convert RST to PDF.
 12 | 
 13 | """
 14 | 
 15 | from __future__ import with_statement
 16 | 
 17 | import os
 18 | import subprocess
 19 | 
 20 | from epubmaker.lib.Logger import debug, info, warn, error
 21 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 22 | 
 23 | from epubmaker import ParserFactory
 24 | from epubmaker import writers
 25 | from epubmaker.CommonOptions import Options
 26 | 
 27 | options = Options()
 28 | 
 29 | class Writer (writers.BaseWriter):
 30 |     """ Class to write PDF. """
 31 | 
 32 |     def build (self):
 33 |         """ Build PDF file. """
 34 | 
 35 |         inputfilename  = self.options.candidate.filename
 36 |         outputfilename = os.path.join (self.options.outputdir, self.options.outputfile)
 37 | 
 38 |         debug ("Inputfile: %s" % inputfilename)
 39 |         info ("Creating PDF file: %s" % outputfilename)
 40 | 
 41 |         parser = ParserFactory.ParserFactory.create (inputfilename,
 42 |                                                      self.options.candidate.mediatype)
 43 |         parser.options = self.options
 44 | 
 45 |         if not hasattr (parser, 'rst2xetex'):
 46 |             error ('PDFWriter can only work on a RSTParser.')
 47 |             raise SkipOutputFormat
 48 |         
 49 |         # Brain-dead xetex doesn't understand unix pipes
 50 |         # so we have to write a temp file
 51 |         
 52 |         texfilename = os.path.splitext (outputfilename)[0] + '.tex'
 53 |         auxfilename = os.path.splitext (outputfilename)[0] + '.aux'
 54 |         logfilename = os.path.splitext (outputfilename)[0] + '.log'
 55 | 
 56 |         try:
 57 |             os.remove (auxfilename)
 58 |         except OSError:
 59 |             pass
 60 |         
 61 |         tex = parser.rst2xetex ()
 62 |         with open (texfilename, 'w') as fp:
 63 |             fp.write (tex.encode ('utf-8'))
 64 | 
 65 |         try:
 66 |             cwd = os.getcwd ()
 67 |             os.chdir (self.options.outputdir)
 68 | 
 69 |             _xetex = subprocess.Popen ([options.config.XELATEX,
 70 |                                         "-output-directory", self.options.outputdir,
 71 |                                         "-interaction", "nonstopmode",
 72 |                                         texfilename],
 73 |                                        stdin = subprocess.PIPE, 
 74 |                                        stdout = subprocess.PIPE, 
 75 |                                        stderr = subprocess.PIPE)
 76 |         except OSError, what:
 77 |             os.chdir (cwd)
 78 |             error ("PDFWriter: %s %s" % (options.config.XELATEX, what))
 79 |             raise SkipOutputFormat
 80 | 
 81 |         (dummy_stdout, dummy_stderr) = _xetex.communicate ()
 82 |         
 83 |         with open (logfilename) as fp:
 84 |             for line in fp:
 85 |                 line = line.strip ()
 86 |                 if 'Error:' in line:
 87 |                     error ("xetex: %s" % line)
 88 |                 if options.verbose >= 1:
 89 |                     if 'Warning:' in line:
 90 |                         warn ("xetex: %s" % line)
 91 | 
 92 |         if options.verbose < 2:
 93 |             try:
 94 |                 os.remove (texfilename)
 95 |                 os.remove (logfilename)
 96 |                 os.remove (auxfilename)
 97 |             except OSError:
 98 |                 pass
 99 | 
100 |         os.chdir (cwd)
101 | 
102 |         info ("Done PDF file: %s" % outputfilename)
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/epubmaker/writers/PicsDirWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | PicsDirWriter.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Copies pics into local directory. Needed for HTML and Xetex.
13 | 
14 | """
15 | 
16 | from __future__ import with_statement
17 | 
18 | import os
19 | import copy
20 | 
21 | from lxml import etree
22 | from pkg_resources import resource_string # pylint: disable=E0611
23 | 
24 | import epubmaker.lib.GutenbergGlobals as gg
25 | from epubmaker.lib.GutenbergGlobals import xpath
26 | from epubmaker.lib.Logger import info, debug, error, exception
27 | 
28 | from epubmaker import writers
29 | 
30 | 
31 | class Writer (writers.BaseWriter):
32 |     """ Writes Pics directory. """
33 | 
34 | 
35 |     # def copy_aux_files_lowlevel (self, dest_dir):
36 |     #     """ Copy image files to dest_dir. """
37 |         
38 |     #     for src_uri in self.get_aux_file_list ():
39 |     #         fn_dest = gg.make_url_relative (self.options.base_url, src_uri)
40 |     #         fn_dest = os.path.join (dest_dir, fn_dest)
41 |             
42 |     #         if gg.is_same_path (src_uri, fn_dest):
43 |     #             debug ('Not copying %s to %s: same file' % (src_uri, fn_dest))
44 |     #             continue
45 |     #         debug ('Copying %s to %s' % (src_uri, fn_dest))
46 | 
47 |     #         fn_dest = gg.normalize_path (fn_dest)
48 |     #         gg.mkdir_for_filename (fn_dest)
49 |     #         try:
50 |     #             fp_src = urllib.urlopen (src_uri)
51 |     #             if fp_src:
52 |     #                 with open (fn_dest, 'wb') as fp_dest:
53 |     #                     fp_dest.write (fp_src.read ())
54 |     #         except IOError, what:
55 |     #             error ('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))
56 | 
57 | 
58 |     def copy_aux_files (self, dest_dir):
59 |         """ Copy image files to dest_dir. Use image data cached in parsers. """
60 | 
61 |         for p in self.spider.parsers:
62 |             if hasattr (p, 'resize_image'):
63 |                 src_uri = p.url
64 |                 fn_dest = gg.make_url_relative (self.options.base_url, src_uri)
65 |                 fn_dest = os.path.join (dest_dir, fn_dest)
66 | 
67 |                 if gg.is_same_path (src_uri, fn_dest):
68 |                     debug ('Not copying %s to %s: same file' % (src_uri, fn_dest))
69 |                     continue
70 |                 debug ('Copying %s to %s' % (src_uri, fn_dest))
71 | 
72 |                 fn_dest = gg.normalize_path (fn_dest)
73 |                 gg.mkdir_for_filename (fn_dest)
74 |                 try:
75 |                     with open (fn_dest, 'wb') as fp_dest:
76 |                         fp_dest.write (p.serialize ())
77 |                 except IOError as what:
78 |                     error ('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))
79 | 
80 | 
81 |                     
82 |     def build (self):
83 |         """ Build Pics file. """
84 | 
85 |         dest_dir = self.options.outputdir
86 | 
87 |         info ("Creating Pics directory in: %s" % dest_dir)
88 | 
89 |         self.copy_aux_files (dest_dir)
90 | 
91 |         info ("Done Pics directory in: %s" % dest_dir)
92 | 
93 | 
94 | 
95 | 
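The copy_aux_files () method above duck-types image parsers: anything in self.spider.parsers that has a resize_image attribute is treated as an image whose cached bytes come from serialize (). A minimal, self-contained sketch of that pattern, using a hypothetical stand-in object instead of the real Spider and parser classes:

    import os

    class FakeImageParser (object):
        """ Hypothetical stand-in for a parser holding cached image data. """

        def __init__ (self, url, data):
            self.url = url
            self.data = data
            self.resize_image = True   # its mere presence marks an image parser

        def serialize (self):
            return self.data

    def copy_cached_images (parsers, dest_dir):
        """ Write every image parser's cached bytes below dest_dir. """
        for p in parsers:
            if hasattr (p, 'resize_image'):
                fn_dest = os.path.join (dest_dir, os.path.basename (p.url))
                with open (fn_dest, 'wb') as fp:
                    fp.write (p.serialize ())

    copy_cached_images (
        [FakeImageParser ('http://example.com/img/cover.png', b'\x89PNG...')], '.')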


--------------------------------------------------------------------------------
/epubmaker/writers/RSTWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | RSTWriter.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Build an RST file. This is just the master RST with the PG license mixed in.
12 | 
13 | """
14 | 
15 | from __future__ import with_statement
16 | 
17 | import os
18 | 
19 | from epubmaker.lib.Logger import debug, info, error
20 | from epubmaker import ParserFactory
21 | from epubmaker import writers
22 | 
23 | class Writer (writers.BaseWriter):
24 |     """ Class to write a reStructuredText. """
25 | 
26 |     def build (self):
27 |         """ Build RST file. """
28 | 
29 |         filename = os.path.join (self.options.outputdir, self.options.outputfile)
30 | 
31 |         info ("Creating RST file: %s" % filename)
32 | 
33 |         parser = ParserFactory.ParserFactory.create (self.options.candidate.filename,
34 |                                                      self.options.candidate.mediatype)
35 |         parser.options = self.options
36 | 
37 |         if not hasattr (parser, 'rst2nroff'):
38 |             error ('RSTWriter can only work on an RSTParser.')
39 |             return
40 |         
41 |         data = parser.preprocess ('utf-8').encode ('utf-8')
42 | 
43 |         self.write_with_crlf (filename, data)
44 |         
45 |         info ("Done RST file: %s" % filename)
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/epubmaker/writers/TxtWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | TxtWriter.py
  6 | 
  7 | Copyright 2009 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Build a UTF-8-encoded PG plain text file. This is just the plain text
 12 | version recoded into UTF-8.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | import os
 19 | import subprocess
 20 | 
 21 | from epubmaker.lib.Logger import debug, info, warn, error
 22 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
 23 | 
 24 | from epubmaker import ParserFactory
 25 | from epubmaker import writers
 26 | from epubmaker.CommonOptions import Options
 27 | 
 28 | options = Options()
 29 | 
 30 | # map some not-widely-supported characters to more common ones
 31 | u2u = {
 32 |     0x2010: u'-',  # unicode HYPHEN to HYPHEN-MINUS. Many Windows fonts lack this.
 33 |     }
 34 | 
 35 | class Writer (writers.BaseWriter):
 36 |     """ Class to write PG plain text. """
 37 | 
 38 |     def groff (self, nroff, encoding = 'utf-8'):
 39 |         """ Process thru groff.
 40 | 
 41 |         Takes and returns unicode strings!
 42 | 
 43 |         """
 44 | 
 45 |         device = { 'utf-8': 'utf8',
 46 |                    'iso-8859-1': 'latin1',
 47 |                    'us-ascii': 'ascii' }[encoding]
 48 |         
 49 |         nroff = nroff.encode (encoding)
 50 |         nrofffilename = os.path.join (
 51 |             self.options.outputdir,
 52 |             os.path.splitext (self.options.outputfile)[0] + '.nroff')
 53 | 
 54 |         # write nroff file for debugging
 55 |         if options.verbose >= 2:
 56 |             with open (nrofffilename, 'w') as fp:
 57 |                 fp.write (nroff)
 58 |         else:
 59 |             try:
 60 |                 # remove debug files from previous runs
 61 |                 os.remove (nrofffilename)
 62 |             except OSError:
 63 |                 pass
 64 | 
 65 |         # call groff
 66 |         try:
 67 |             _groff = subprocess.Popen ([options.config.GROFF, 
 68 |                                        "-t",             # preprocess with tbl
 69 |                                        "-K", device,     # input encoding
 70 |                                        "-T", device],    # output device
 71 |                                       stdin = subprocess.PIPE, 
 72 |                                       stdout = subprocess.PIPE, 
 73 |                                       stderr = subprocess.PIPE)
 74 |         except OSError:
 75 |             error ("TxtWriter: executable not found: %s" % options.config.GROFF)
 76 |             raise SkipOutputFormat
 77 | 
 78 |         (txt, stderr) = _groff.communicate (nroff)
 79 |         
 80 |         # pylint: disable=E1103
 81 |         for line in stderr.splitlines ():
 82 |             line = line.strip ()
 83 |             if 'error' in line:
 84 |                 error ("groff: %s" % line)
 85 |             elif 'warn' in line:
 86 |                 if options.verbose >= 1:
 87 |                     warn ("groff: %s" % line)
 88 | 
 89 |         txt = txt.decode (encoding)
 90 |         return txt.translate (u2u) # fix nroff idiosyncrasies
 91 | 
 92 | 
 93 |     def build (self):
 94 |         """ Build TXT file. """
 95 | 
 96 |         filename = os.path.join (self.options.outputdir, self.options.outputfile)
 97 | 
 98 |         encoding = options.subtype.strip ('.')
 99 | 
100 |         info ("Creating plain text file: %s" % filename)
101 | 
102 |         parser = ParserFactory.ParserFactory.create (self.options.candidate.filename,
103 |                                                      self.options.candidate.mediatype)
104 |         parser.options = self.options
105 | 
106 |         if hasattr (parser, 'rst2nroff'):
107 |             data = self.groff (parser.rst2nroff (encoding), encoding)
108 |         else:
109 |             data = parser.unicode_content ()
110 | 
111 |         data = data.encode ('utf_8_sig' if encoding == 'utf-8' else encoding, 'unitame')
112 | 
113 |         self.write_with_crlf (filename, data)
114 |             
115 |         info ("Done plain text file: %s" % filename)
116 | 
117 | 
118 | 
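The u2u table above is fed to unicode's translate (): keys are code points (integers), values are replacement strings, so further problem characters can be mapped the same way. A small sketch of how such a table behaves; the U+2011 entry is only an illustrative addition, not part of TxtWriter:

    # same shape as TxtWriter's u2u: code point -> replacement string
    u2u = {
        0x2010: u'-',   # HYPHEN -> HYPHEN-MINUS
        0x2011: u'-',   # NON-BREAKING HYPHEN -> HYPHEN-MINUS (illustrative only)
        }

    text = u'non\u2011breaking and plain\u2010hyphen'
    print (text.translate (u2u))   # prints: non-breaking and plain-hyphen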


--------------------------------------------------------------------------------
/epubmaker/writers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
  3 | 
  4 | """
  5 | 
  6 | Writer package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Base classes for *Writer modules. (EpubWriter, PluckerWriter, ...)
 13 | 
 14 | """
 15 | 
 16 | from __future__ import with_statement
 17 | 
 18 | from functools import partial
 19 | import os.path
 20 | import urllib
 21 | 
 22 | from lxml import etree
 23 | from lxml.builder import ElementMaker
 24 | 
 25 | from epubmaker.lib.Logger import debug, error
 26 | import epubmaker.lib.GutenbergGlobals as gg
 27 | from epubmaker.lib import MediaTypes
 28 | 
 29 | from epubmaker import ParserFactory
 30 | from epubmaker import Spider
 31 | from epubmaker.Version import VERSION, GENERATOR
 32 | 
 33 | 
 34 | class BaseWriter (object):
 35 |     """
 36 |     Base class for EpubWriter, PluckerWriter, ... 
 37 | 
 38 |     Also used as a /dev/null writer for debugging.
 39 | 
 40 |     """
 41 | 
 42 |     def __init__ (self):
 43 |         self.options = None
 44 |         self.spider = None
 45 | 
 46 | 
 47 |     def setup (self, options):
 48 |         """ override this in a real writer
 49 | 
 50 |         put computationally cheap setup stuff in here,
 51 |         
 52 |         """
 53 | 
 54 |         if not options.include_mediatypes:
 55 |             options.include_mediatypes = (
 56 |                 MediaTypes.TEXT_MEDIATYPES |
 57 |                 MediaTypes.AUX_MEDIATYPES |
 58 |                 MediaTypes.IMAGE_MEDIATYPES
 59 |                 )
 60 | 
 61 |         self.options = options
 62 | 
 63 | 
 64 |     def parse (self, options):
 65 |         """ Standard parse. """
 66 |         self.setup (options)
 67 | 
 68 |         if self.spider is None:
 69 |             self.spider = Spider.Spider ()
 70 | 
 71 |         self.spider.parse (options.candidate.filename, 
 72 |                            options.candidate.mediatype,
 73 |                            options)
 74 | 
 75 |         options.candidate.filename = self.spider.redirect (options.candidate.filename)
 76 |         options.base_url = options.candidate.filename
 77 | 
 78 | 
 79 |     def build (self):
 80 |         """ override this in a real writer """
 81 |         pass
 82 | 
 83 | 
 84 |     @staticmethod
 85 |     def write_with_crlf (filename, data):
 86 |         """ Write data to filename, converting line ends to CRLF (the PG standard). """
 87 |         data = '\r\n'.join (data.splitlines ()) + '\r\n'
 88 |         
 89 |         # open binary so windows doesn't add another \r
 90 |         with open (filename, 'wb') as fp:
 91 |             fp.write (data)
 92 |             
 93 | 
 94 |     def validate (self): # pylint: disable=R0201
 95 |         """ Validate the output with some (external) tool.
 96 | 
 97 |         Override this in a real writer.
 98 | 
 99 |         """
100 |         return 0
101 | 
102 | 
103 |     def sync (self):
104 |         """  Override this if you need to sync before program exit. """
105 |         pass
106 | 
107 | 
108 |     def make_links_relative (self, xhtml, base_url):
109 |         """ Make absolute links in xhtml relative to base_url. """
110 | 
111 |         debug ("Making links relative to: %s" % base_url)
112 |         xhtml.rewrite_links (partial (gg.make_url_relative, base_url))
113 | 
114 | 
115 |     def get_aux_file_list (self):
116 |         """ Iterate over image files. Return absolute urls. """
117 | 
118 |         for p in self.spider.parsers:
119 |             if hasattr (p, 'resize_image'):
120 |                 yield p.url
121 | 
122 | 
123 | em = ElementMaker (namespace = str (gg.NS.xhtml),
124 |                    nsmap = { None: str (gg.NS.xhtml) })
125 | 
126 | 
127 | class HTMLishWriter (BaseWriter):
128 |     """ Base class for writers with HTMLish contents. """
129 | 
130 |     @staticmethod
131 |     def add_class (elem, class_):
132 |         """ Add a class to html element. """
133 | 
134 |         classes = elem.get ('class', '').split ()
135 |         classes.append (class_)
136 |         elem.set ('class', ' '.join (classes))
137 | 
138 | 
139 |     @staticmethod
140 |     def add_meta (xhtml, name, content):
141 |         """ Add a meta tag. """
142 |         
143 |         for head in gg.xpath (xhtml, '//xhtml:head'):
144 |             meta = em.meta (name = name, content = content)
145 |             meta.tail = '\n'
146 |             head.append (meta)
147 |         
148 | 
149 |     @staticmethod
150 |     def add_meta_generator (xhtml):
151 |         """ Add our piss mark. """
152 | 
153 |         HTMLishWriter.add_meta (xhtml, 'generator', GENERATOR % VERSION)
154 | 
155 | 
156 |     @staticmethod
157 |     def add_internal_css (xhtml, css_as_string):
158 |         """ Add internal stylesheet to html. """
159 |         
160 |         if css_as_string and xhtml is not None:
161 |             css_as_string = '\n' + css_as_string.strip (' \n') + '\n'
162 |             for head in gg.xpath (xhtml, '//xhtml:head'):
163 |                 style = em.style (css_as_string, type = 'text/css')
164 |                 style.tail = '\n'
165 |                 head.append (style)
166 | 
167 | 
168 |     def add_external_css (self, xhtml, css_as_string, url):
169 |         """ Add external stylesheet to html. """
170 |         
171 |         if css_as_string:
172 |             p = ParserFactory.ParserFactory.get ('text/css')
173 |             p.parse_string (css_as_string)
174 |             p.url = url
175 |             self.spider.parsers.append (p)
176 |             
177 |         if xhtml is not None:
178 |             for head in gg.xpath (xhtml, '//xhtml:head'):
179 |                 link = em.link (href = url, rel = 'stylesheet', type = 'text/css')
180 |                 link.tail = '\n'
181 |                 head.append (link)
182 | 
183 | 
184 | 
185 | 
186 | 
187 |     
188 | 
189 | 
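BaseWriter spells out the contract the concrete writers follow: parse () runs the Spider over the candidate file, build () produces the output, and write_with_crlf () applies the PG CRLF convention. A minimal sketch of a new writer against that contract; the output format (a plain list of spidered URLs) is made up for illustration:

    import os

    from epubmaker import writers
    from epubmaker.lib.Logger import info

    class Writer (writers.BaseWriter):
        """ Hypothetical writer that dumps the list of spidered URLs. """

        def build (self):
            """ Build URL list file. """

            filename = os.path.join (self.options.outputdir, self.options.outputfile)
            info ("Creating URL list: %s" % filename)

            data = '\n'.join (p.url for p in self.spider.parsers)
            self.write_with_crlf (filename, data)

            info ("Done URL list: %s" % filename)

As with the real writers, the caller is expected to run parse (options) before build (), so that self.spider and self.options are populated.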


--------------------------------------------------------------------------------
/epubmaker/writers/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/writers/cover.jpg


--------------------------------------------------------------------------------
/scripts/epubmaker:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | epubmaker script
 7 | 
 8 | Copyright 2014 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This script starts epubmaker.
13 | 
14 | """
15 | 
16 | from epubmaker import EpubMaker
17 | 
18 | EpubMaker.main ()
19 | 
20 | 


--------------------------------------------------------------------------------
/scripts/rhyme_compiler:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
 3 | 
 4 | """
 5 | 
 6 | rhyme_compiler.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This module produces a dbm file of rhyme stems.
13 | 
14 | We use a very naive concept of rhyme: we preprocess the 'CMU
15 | Pronouncing Dictionary' (found at
16 | http://www.speech.cs.cmu.edu/cgi-bin/cmudict) and extract the phonemes
17 | for each word from the last stressed one to the end of the word.
18 | 
19 | The result is stored in cmudict.db hashed by word.
20 | 
21 | To compile:
22 | 
23 | $ ./rhyme_compiler.py cmudict.0.7a
24 | 
25 | 
26 | """
27 | 
28 | import fileinput
29 | import re
30 | import gdbm
31 | 
32 | dbm = gdbm.open ('cmudict.db', 'nf')
33 | 
34 | RE_STRESSED = re.compile ('[a-z]+[12][^12]*$')
35 | 
36 | # two example lines from cmudict
37 | #
38 | # PRONUNCIATION  P R OW0 N AH2 N S IY0 EY1 SH AH0 N
39 | # PRONUNCIATION(1)  P R AH0 N AH2 N S IY0 EY1 SH AH0 N
40 | 
41 | for line in fileinput.input (openhook = fileinput.hook_encoded ("iso-8859-1")):
42 |     if line.startswith (';'):
43 |         continue
44 | 
45 |     word, dummy_sep, phonemes = line.lower ().partition ('  ')
46 | 
47 |     m = RE_STRESSED.search (phonemes)
48 |     if m:
49 |         phoneme = re.sub (r'[ 012]+', '-', m.group (0)) # replace stress digits and spaces with '-'
50 |         dbm[word.encode ('utf-8')] = phoneme.encode ('utf-8')
51 | 
52 |         # print "%s %s\n" % (word, dbm[word])
53 | 
54 | dbm.sync ()
55 | dbm.reorganize ()
56 | dbm.close ()
57 | 
58 | 
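Since the script stores one rhyme stem per word, two words rhyme (in this naive sense) exactly when their stored stems compare equal. A sketch of a lookup against the cmudict.db produced above, assuming it sits in the current directory:

    import gdbm

    dbm = gdbm.open ('cmudict.db', 'r')

    def rhyme_stem (word):
        """ Return the stored stem for word, or None if the word is unknown. """
        try:
            return dbm[word.lower ().encode ('utf-8')]
        except KeyError:
            return None

    def rhymes (word1, word2):
        """ True if both words are known and share the same rhyme stem. """
        s1, s2 = rhyme_stem (word1), rhyme_stem (word2)
        return s1 is not None and s1 == s2

    print (rhymes ('nation', 'station'))

    dbm.close ()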


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | 
3 | [bdist_wininst]
4 | plat-name: win32
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # pypi epubmaker setup
 3 | #
 4 | 
 5 | from setuptools import setup
 6 | from setup_inc import *
 7 | 
 8 | setup (
 9 |     name = 'epubmaker',
10 |     version = VERSION,
11 |     install_requires = install_requires,
12 |     package_dir  = package_dir,
13 |     packages     = pypi_packages,
14 |     py_modules   = pypi_py_modules,
15 |     package_data = pypi_package_data,
16 |     scripts      = pypi_scripts,
17 |     data_files   = pypi_data_files,
18 | 
19 |     # metadata for upload to PyPI
20 | 
21 |     author = author,
22 |     author_email = author_email,
23 |     description = description,
24 |     long_description = long_description,
25 |     license = license,
26 |     keywords = keywords,
27 |     url = url,
28 |     classifiers = classifiers,
29 |     platforms = platforms,
30 | )
31 | 


--------------------------------------------------------------------------------
/setup_inc.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # epubmaker common setup all flavors
  3 | #
  4 | 
  5 | VERSION = '0.3.26'
  6 | 
  7 | package_dir = {
  8 |     'epubmaker': 'epubmaker',
  9 |     }
 10 | 
 11 | install_requires = [
 12 |     'roman',
 13 |     'docutils >= 0.8.1, <0.13',
 14 |     'lxml >= 2.3',
 15 |     'cssutils >= 0.9.8a1',
 16 |     'pillow',
 17 |     ]
 18 | 
 19 | 
 20 | pypi_packages = [
 21 |     'epubmaker.parsers',
 22 |     'epubmaker.packagers',
 23 |     'epubmaker.writers',
 24 |     'epubmaker.mydocutils',
 25 |     'epubmaker.mydocutils.parsers',
 26 |     'epubmaker.mydocutils.transforms',
 27 |     'epubmaker.mydocutils.writers',
 28 |     'epubmaker.mydocutils.gutenberg',
 29 |     'epubmaker.mydocutils.gutenberg.parsers',
 30 |     'epubmaker.mydocutils.gutenberg.transforms',
 31 |     'epubmaker.mydocutils.gutenberg.writers',
 32 |     ]
 33 | 
 34 | ibiblio_packages = pypi_packages + [
 35 |     'epubmaker',
 36 |     'epubmaker.lib',
 37 |     'epubmaker.writers.ibiblio',
 38 |     ]
 39 | 
 40 | pypi_py_modules = [
 41 |     'epubmaker.CommonOptions',
 42 |     'epubmaker.EpubMaker',
 43 |     'epubmaker.HTMLChunker',
 44 |     'epubmaker.ParserFactory',
 45 |     'epubmaker.Spider',
 46 |     'epubmaker.Unitame',
 47 |     'epubmaker.UnitameData',
 48 |     'epubmaker.Version',
 49 | 
 50 |     'epubmaker.lib.DublinCore',
 51 |     'epubmaker.lib.GutenbergGlobals',
 52 |     'epubmaker.lib.Logger',
 53 |     'epubmaker.lib.MediaTypes',
 54 | 
 55 |     'epubmaker.WriterFactory',
 56 |     ]
 57 | 
 58 | pypi_package_data = {
 59 |     'epubmaker.parsers': ['broken.png'],
 60 |     'epubmaker.writers': ['cover.jpg'],
 61 |     'epubmaker.mydocutils.parsers': ['*.rst'],
 62 |     'epubmaker.mydocutils.writers': ['*.css'],
 63 |     'epubmaker.mydocutils.gutenberg.parsers': ['*.rst'],
 64 |     }
 65 | 
 66 | ibiblio_package_data = pypi_package_data
 67 | ibiblio_package_data.update ({
 68 |     'epubmaker.writers.ibiblio': ['qioo-skeleton.zip'],
 69 |     })
 70 | 
 71 | pypi_data_files = [
 72 |     ('', ['CHANGES', 'setup_inc.py']),
 73 |     ]
 74 | 
 75 | ibiblio_data_files = [
 76 |     ('epubmaker', ['CHANGES', 'setup_inc.py']),
 77 |     ]
 78 | 
 79 | pypi_scripts = [
 80 |     'scripts/epubmaker',
 81 |     'scripts/rhyme_compiler',
 82 |     ]
 83 | 
 84 | ibiblio_scripts = pypi_scripts + [
 85 |     'scripts/makepub',
 86 |     'scripts/convert_unitame',
 87 |     'scripts/update_facebook_auth',
 88 |     ]
 89 | 
 90 | # metadata for upload to PyPI
 91 | 
 92 | author = "Marcello Perathoner"
 93 | author_email = "webmaster@gutenberg.org"
 94 | description = "The Project Gutenberg tool to generate EPUBs and other ebook formats."
 95 | long_description = open ('README').read ()
 96 | license = "GPL v3"
 97 | keywords = "ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion"
 98 | url = "https://github.com/gitenberg-dev/pg-epubmaker"
 99 | 
100 | classifiers = [
101 |     "Topic :: Text Processing",
102 |     "License :: OSI Approved :: GNU General Public License (GPL)",
103 |     "Environment :: Console",
104 |     "Operating System :: OS Independent",
105 |     "Intended Audience :: Other Audience",
106 |     "Development Status :: 4 - Beta"
107 |     ]
108 | 
109 | platforms = 'OS-independent'
110 | 
111 | 
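setup_inc.py carries two flavors of the package lists: the pypi_* names that setup.py above plugs into setuptools, and ibiblio_* counterparts for the in-house ibiblio build, whose setup script (and the extra scripts it lists) is not part of this tree. A sketch of how such a script would presumably wire the ibiblio_* variables in, mirroring setup.py; this is an assumption, not the actual ibiblio setup:

    #
    # hypothetical ibiblio epubmaker setup (sketch only)
    #

    from setuptools import setup
    from setup_inc import *

    setup (
        name = 'epubmaker',
        version = VERSION,
        install_requires = install_requires,
        package_dir  = package_dir,
        packages     = ibiblio_packages,
        package_data = ibiblio_package_data,
        scripts      = ibiblio_scripts,
        data_files   = ibiblio_data_files,

        author = author,
        author_email = author_email,
        description = description,
        license = license,
        url = url,
    )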


--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
 1 | from lxml import etree
 2 | 
 3 | root = etree.fromstring ("""
 4 | <html xml:lang="en" lang="en" xmlns="http://www.w3.org/1999/xhtml">
 5 |   <body>
 6 |      <p>
 7 |        <span style="color: red"></span>black
 8 |      </p>
 9 |   </body>
10 | </html>
11 | """)
12 | 
13 | XHTML11_DOCTYPE = "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' \
14 | 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>"
15 | 
16 | print (etree.tostring (
17 |         root,
18 |         method = 'xml',
19 |         xml_declaration = True,
20 |         doctype = XHTML11_DOCTYPE,
21 |         encoding = 'utf-8', 
22 |         pretty_print = True))
23 | 


--------------------------------------------------------------------------------