├── .gitignore
├── CHANGES
├── LICENSE
├── PKG-INFO
├── README
├── epubmaker
├── CommonOptions.py
├── EpubMaker.py
├── HTMLChunker.py
├── ParserFactory.py
├── Spider.py
├── Unitame.py
├── UnitameData.py
├── Version.py
├── WriterFactory.py
├── __init__.py
├── lib
│ ├── DublinCore.py
│ ├── GutenbergGlobals.py
│ ├── Logger.py
│ ├── MediaTypes.py
│ └── __init__.py
├── mydocutils
│ ├── __init__.py
│ ├── gutenberg
│ │ ├── __init__.py
│ │ ├── parsers
│ │ │ ├── __init__.py
│ │ │ ├── pg-footer.rst
│ │ │ └── pg-header.rst
│ │ ├── transforms
│ │ │ └── __init__.py
│ │ └── writers
│ │ │ ├── __init__.py
│ │ │ └── nroff.py
│ ├── nodes.py
│ ├── parsers
│ │ ├── __init__.py
│ │ └── default_style.rst
│ ├── transforms
│ │ ├── __init__.py
│ │ └── parts.py
│ └── writers
│ │ ├── __init__.py
│ │ ├── epub2.py
│ │ ├── nroff.py
│ │ ├── rst2all.css
│ │ ├── rst2epub.css
│ │ ├── rst2html.css
│ │ ├── xetex.py
│ │ └── xhtml1.py
├── packagers
│ ├── GzipPackager.py
│ ├── HTMLPackager.py
│ ├── PDFPackager.py
│ ├── PushPackager.py
│ ├── RSTPackager.py
│ ├── TxtPackager.py
│ └── __init__.py
├── parsers
│ ├── AuxParser.py
│ ├── CSSParser.py
│ ├── GutenbergTextParser.py
│ ├── HTMLParser.py
│ ├── ImageParser.py
│ ├── RSTParser.py
│ ├── __init__.py
│ └── broken.png
└── writers
│ ├── EpubWriter.py
│ ├── HTMLWriter.py
│ ├── KindleWriter.py
│ ├── PDFWriter.py
│ ├── PicsDirWriter.py
│ ├── RSTWriter.py
│ ├── TxtWriter.py
│ ├── __init__.py
│ └── cover.jpg
├── scripts
├── epubmaker
└── rhyme_compiler
├── setup.cfg
├── setup.py
├── setup_inc.py
└── test
└── test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore the local copy of any logs
2 | logs/*
3 |
4 | # python ignores
5 | *.pyc
6 | *.db
7 | *.coverage
8 | */.ipynb_checkpoints/*
9 | *.ipynb_checkpoints/*
10 |
11 | # Python packaging
12 | .eggs*
13 | .env
14 | .tox*
15 | build*
16 | dist*
17 | epubmaker.egg-info
18 | log/log.txt
19 | *.log
20 |
21 |
22 |
--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
1 | 0.3.26 October 8, 2018
2 |
3 | Don't fail on audio links
4 | Don't unescape external hrefs
5 |
6 | 0.3.25 September 20, 2018
7 |
8 | Don't fail with bad src
9 | Use a borg class to effect an options global instead of patching builtins
10 | Don't disable translations
11 | Add --local-only option so that using depth>1 can be used for multi-file books
12 | Running the code from source didn't work with the out of date practice of not using package name for directory name.
13 | fix bug for no stylesheet
14 | utf-8 is the encoding, not unicode
15 | update contact info
16 |
17 | 0.3.21 February 24, 2017
18 |
19 | Add parameter to add and set the cover image.
20 | Switch setup to setuptools to better manage dependencies, because docutils 0.13 breaks epubmaker.
21 | No longer strip hyperlinks to external resources.
22 |
23 |
24 | 0.3.20
25 |
26 | Do not make special kindlegen epub anymore. Requires kindlegen 2.7+.
27 | Better coverpage handling.
28 | Works with docutils 0.11+.
29 |
30 | 0.3.19
31 |
32 | 0.3.19b6
33 |
34 | Floats now support 'here'.
35 |
36 | 0.3.19b5
37 |
38 | Fix typo in license text.
39 | Fix "strip_links" debug message crash.
40 | Extend styles directive.
41 | - Add display option to hide the element.
42 | - Allow for negative matches.
43 | Don't use \marginpar for page numbers in TeX.
44 |
45 | 0.3.19b4
46 |
47 | Style directive extended.
48 | Now preserves all trailing whitespace except U+0020.
49 | Added "table de matières" to auto toc detection.
50 | Convert U+2015 to single hyphen in plain text.
51 |
52 | 0.3.19b3
53 |
54 | Fix keyerror hrules and vrules.
55 | Fix unescaped characters in html meta attribute values.
56 | Fix default block image alignment.
57 | Fix use numeric entities in xhtml writer.
58 |
59 | 0.3.19b2
60 |
61 | Fixed text-indent in page nos (made pagenos disappear in line blocks).
62 | Fixed whitespace collapsing in nodes.
63 | Fixed: honors newlines in metadata fields.
64 | Internal fix: correct format name is: "txt.utf-8".
65 | Can use docinfo in addition to meta directive.
66 |
67 | 0.3.19b1
68 |
69 | New formats: html.noimages and pdf.noimages.
70 | No-image builds use a placeholder 'broken' image instead of nothing.
71 | Figure directives without a filename create a placeholder 'broken' image.
72 | New option :selector: in lof and lot directives for filtering.
73 | Turn off italics with class no-italics (and bold with no-bold).
74 | nbsp now works in ascii txt, soft hyphens now removed from ascii txt.
75 | Insert line numbers with [ln 42] and [ln!42].
76 | Works with kindlegen 2.0.
77 |
78 | 0.3.18
79 |
80 | Allow unicode line separator U+2028 as line feed.
81 | Fix XetexWriter bug with tables without explicit width.
82 | Add language support in XetexWriter.
83 | Works with docutils 0.8
84 | Support docutils-0.8-style :class: language-.
85 |
86 | 0.3.17
87 |
88 | Fix line height of large text.
89 | Fix images with spaces in src attribute.
90 |
91 | 0.3.16
92 |
93 | Add image_dir to Xetex writer.
94 | Use quotation environment instead of quote.
95 | Don't automatically insert \frontmatter.
96 | Page nos. for kindlegen 1.2.
97 | Call kindlegen.
98 | Integrate changes into PG environment.
99 |
100 | 0.3.15
101 |
102 | Reduce vertical margin of images to 1 in TXT.
103 | Fixed link targets in NROFF, PDF.
104 | Report error on xetex errors.
105 | Escape characters in PDF info.
106 |
107 | 0.3.14
108 |
109 | Fixed crash on HTML comments in Kindle writer.
110 |
111 | 0.3.13
112 |
113 | Start on Kindle writer.
114 | Fix spurious space in PDF literal blocks with classes.
115 | Fix 'flat' TOC.
116 | Thin spaces between quotes made optional.
117 |
118 | 0.3.12
119 |
120 | Add more front- and backmatter classes.
121 | Insert thin space between quotes.
122 | Generated List of Tables.
123 | Generated List of Figures.
124 | Emit warning instead of error on groff warnings.
125 | Fix crash when last cell in row spans rows.
126 | Add option vertical-aligns for tables.
127 | Default width of image calculated assuming 980px window.
128 | Fix docutils indentation bug in poetry.
129 |
130 | 0.3.11
131 |
132 | Add option widths to tables.
133 | Add option aligns to tables.
134 | Add class norules for tables.
135 | Generate typographically correct tables.
136 | Don't overwrite images if src dir == working dir.
137 |
138 | 0.3.10
139 |
140 | Bug fixes.
141 |
142 | 0.3.9
143 |
144 | A different fix for figure and image centering on ADE.
145 | (Calculate explicit left margin).
146 | More work on PDF (Xetex) writer.
147 | Added directives for pagination control.
148 |
149 | 0.3.8
150 |
151 | Fix empty poetry lines on ADE.
152 | Fix figure and image centering on ADE.
153 | Fix thoughtbreak centering on ADE.
154 | For push, zip RST into subdir with images.
155 | Start implementing PDF (Xetex) writer.
156 |
157 | 0.3.7
158 |
159 | Integrate changes into PG environment.
160 | Fix more CR/LF issues on windows.
161 | Fix cover image format conversion.
162 | Zips a pushable file for the WWers.
163 |
164 | 0.3.6
165 |
166 | Code cleanup.
167 | Different CSS templates for RST -> HTML and RST -> EPUB.
168 |
169 | 0.3.5
170 |
171 | Zips files up for PG.
172 |
173 | 0.3.4
174 |
175 | Tell Tidy not to merge divs and spans.
176 | More fixes to plain text encoding.
177 |
178 | 0.3.3
179 |
180 | Implemented coverpages for Adobe ADE.
181 | CSS changes because Adobe ADE chokes on !important.
182 | RST dropcap directive: don't use image in EPUB.
183 |
184 | 0.3.2
185 |
186 | Packaging changes.
187 |
--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.1
2 | Name: epubmaker
3 | Version: 0.3.25
4 | Summary: The Project Gutenberg tool to generate EPUBs and other ebook formats.
5 | Home-page: https://github.com/gitenberg-dev/pg-epubmaker
6 | Author: Marcello Perathoner
7 | Author-email: webmaster@gutenberg.org
8 | License: GPL v3
9 | Description: =========
10 | EpubMaker
11 | =========
12 |
13 | EpubMaker is the tool used for format conversion at Project Gutenberg.
14 | It builds EPUB2 and Kindle files from HTML.
15 | Also it builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
16 |
17 |
18 | Prerequisites
19 | =============
20 |
21 | * Python >= 2.6,
22 |
23 | * HTMLTidy,
24 |
25 | * Kindlegen,
26 |
27 | * TexLive, and
28 |
29 | * groff.
30 |
31 | Keywords: ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion
32 | Platform: OS-independent
33 | Classifier: Topic :: Text Processing
34 | Classifier: License :: OSI Approved :: GNU General Public License (GPL)
35 | Classifier: Environment :: Console
36 | Classifier: Operating System :: OS Independent
37 | Classifier: Intended Audience :: Other Audience
38 | Classifier: Development Status :: 4 - Beta
39 | Requires: setuptools
40 | Requires: roman
41 | Requires: docutils (>= 0.8.1, < 0.13)
42 | Requires: lxml (>= 2.3)
43 | Requires: cssutils (>= 0.9.8a1)
44 | Requires: PIL (>= 1.1.7)
45 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | =========
2 | EpubMaker
3 | =========
4 |
5 | EpubMaker is the tool used for format conversion at Project Gutenberg.
6 | It builds EPUB2 and Kindle files from HTML.
7 | Also it builds HTML4, EPUB2, Kindle, and PDF files from reST sources.
8 |
9 |
10 | Prerequisites
11 | =============
12 |
13 | * Python >= 2.6,
14 |
15 | * HTMLTidy,
16 |
17 | * Kindlegen,
18 |
19 | * TexLive, and
20 |
21 | * groff.
22 |
--------------------------------------------------------------------------------
/epubmaker/CommonOptions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | CommonOptions.py
7 |
8 | Copyright 2010 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Common options for programs.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import optparse
19 | import ConfigParser
20 | import os
21 |
class Struct (object):
    """ An empty class used as a mutable attribute container (config values are set on it). """
    pass
24 |
# options is a "Borg" set by optparse (note that it's not thread-safe)
class Options:
    """ Borg-style options holder.

    Every instance shares one attribute dictionary, so setting an
    attribute on any instance makes it visible on all of them.
    """

    __shared_state = {}

    def __init__(self):
        # alias this instance's attribute dict to the shared one
        self.__dict__ = self.__shared_state

    def update(self, _dict):
        """ Merge the mapping *_dict* into the shared state. """
        self.__dict__.update(_dict)

options = Options()
35 |
36 |
def add_common_options (op):
    """ Add options common to all programs. """

    # (option strings, keyword arguments) pairs, registered in order
    common = (
        (("-c", "--config"),
         dict (metavar = "FILE",
               dest = "config_name",
               action = "store",
               default = "config",
               help = "use config file (default: config)")),

        (("-v", "--verbose"),
         dict (dest = "verbose",
               action = "count",
               help = "be verbose (-v -v be more verbose)")),

        (("--validate",),
         dict (dest = "validate",
               action = "count",
               help = "validate epub through epubcheck")),

        (("--section",),
         dict (metavar = "TAG.CLASS",
               dest = "section_tags",
               default = [],
               action = "append",
               help = "split epub on TAG.CLASS")),
    )

    for opt_strings, kwargs in common:
        op.add_option (*opt_strings, **kwargs)
67 |
68 |
def get_parser (**kwargs):
    """ Build an OptionParser with the common options pre-registered. """
    parser = optparse.OptionParser (**kwargs)
    add_common_options (parser)
    return parser
73 |
74 |
def parse_args (op, params = {}, defaults = {}):
    """ Parse the command line, then layer config files into the borg options.

    *params* are interpolation defaults for the config parser,
    *defaults* are fallback config values applied before the files.
    Returns the shared options borg and the positional arguments.
    NOTE(review): the {} defaults are mutable but only read here, never
    mutated — safe, though unconventional.
    """
    (parsed_options, args) = op.parse_args ()
    options.update(vars(parsed_options))

    # later files in the list override earlier ones for duplicate keys
    cp = ConfigParser.SafeConfigParser (params)
    cp.read ( [options.config_name,
               os.path.expanduser ('~/.epubmaker.conf'),
               '/etc/epubmaker.conf' ] )

    # config values live on a separate attribute bag, keys uppercased
    options.config = Struct ()

    for name, value in defaults.iteritems ():
        setattr (options.config, name.upper (), value)

    # values from config files override the passed-in defaults
    for section in cp.sections ():
        for name, value in cp.items (section):
            #if value == 'None':
            #    value = None
            # print section, name, value
            setattr (options.config, name.upper (), value)

    return options, args
97 |
98 |
99 |
--------------------------------------------------------------------------------
/epubmaker/EpubMaker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | EpubMaker.py
7 |
8 | Copyright 2009-2011 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Stand-alone application to build epub out of html or rst.
13 |
14 | """
15 |
16 |
17 | from __future__ import with_statement
18 |
19 | import sys
20 | import os.path
21 | import re
22 | import optparse
23 | import hashlib
24 | import mimetypes
25 |
26 | from epubmaker.lib.GutenbergGlobals import Struct, DCIMT, SkipOutputFormat
27 | import epubmaker.lib.GutenbergGlobals as gg
28 | from epubmaker.lib.Logger import debug, exception
29 | from epubmaker.lib import Logger, DublinCore
30 |
31 | from epubmaker import ParserFactory
32 | from epubmaker import WriterFactory
33 | from epubmaker.packagers import PackagerFactory
34 | from epubmaker import CommonOptions
35 |
36 | from epubmaker.Version import VERSION
37 |
# module-wide borg options instance; shares state with CommonOptions.options
options = CommonOptions.Options()
39 |
def null_translation (s):
    """ Identity translation: return *s* unchanged. :-) """
    return s
43 |
# output format identifiers, grouped by main type ('maintype.subtype')
TXT_FORMATS = 'txt.utf-8 txt.iso-8859-1 txt.us-ascii'.split ()
HTML_FORMATS = 'html.noimages html.images'.split ()
EPUB_FORMATS = 'epub.noimages epub.images'.split ()
KINDLE_FORMATS = 'kindle.noimages kindle.images'.split ()
PDF_FORMATS = 'pdf.noimages pdf.images'.split ()
RST_FORMATS = 'rst.gen'.split ()
ALL_FORMATS = HTML_FORMATS + EPUB_FORMATS + KINDLE_FORMATS + PDF_FORMATS + TXT_FORMATS + RST_FORMATS

# maps a --make shorthand to the list of formats it expands to
DEPENDENCIES = (
    ('all', ALL_FORMATS),
    ('html', HTML_FORMATS),
    ('epub', EPUB_FORMATS),
    ('kindle', KINDLE_FORMATS),
    ('pdf', PDF_FORMATS),
    ('txt', TXT_FORMATS),
    ('rst', RST_FORMATS),
)

# output filename template per format; {id} is the PG ebook number
# or a name derived from the title (see make_output_filename)
FILENAMES = {
    'html.noimages': '{id}-noimages-h.html',
    'html.images': '{id}-h.html',

    'epub.noimages': '{id}-epub.epub',
    'epub.images': '{id}-images-epub.epub',

    'kindle.noimages': '{id}-kindle.mobi',
    'kindle.images': '{id}-images-kindle.mobi',

    'pdf.noimages': '{id}-pdf.pdf',
    'pdf.images': '{id}-images-pdf.pdf',

    'txt.utf-8': '{id}-0.txt',
    'txt.iso-8859-1': '{id}-8.txt',
    'txt.us-ascii': '{id}.txt',

    'rst.gen': '{id}-rst.rst',

    'picsdir.noimages': '{id}-noimages.picsdir',   # do we need this ?
    'picsdir.images': '{id}-images.picsdir',     # do we need this ?
}
84 |
def make_output_filename (dc, type_):
    """ Build the output filename for format *type_* from the metadata in *dc*. """
    if dc.project_gutenberg_id:
        # PG book: use PG naming convention
        return FILENAMES[type_].format (id = dc.project_gutenberg_id)
    # not a PG ebook: derive a (length-capped) name from the title
    return FILENAMES[type_].format (id = gg.string_to_filename (dc.title)[:65])
92 |
def main ():
    """ Main program.

    Parses the command line, expands the requested output formats,
    then converts every url given on the command line into each
    requested format, optionally packaging the results.
    """

    op = optparse.OptionParser (usage = "usage: %prog [options] url",
                                version = "EpubMaker version %s" % VERSION)

    CommonOptions.add_common_options (op)

    op.add_option (
        "--make",
        dest = "types",
        choices = [x for x, y in DEPENDENCIES] + ALL_FORMATS,
        default = [],
        action = 'append',
        help = ("output type [%s] (default: all)"
                % ' | '.join ([x for x, y in DEPENDENCIES] + ALL_FORMATS)))

    op.add_option (
        "--max-depth",
        metavar = "LEVELS",
        dest = "max_depth",
        type = "int",
        default = 1,
        help = "go how many levels deep while recursively retrieving pages. (0 == infinite)")

    op.add_option (
        "--local-only",
        dest = "local_files_only",
        action = "store_true",
        default = False,
        help = "restrict recursive search to local files")

    op.add_option (
        "--include",
        metavar = "GLOB",
        dest = "include_argument",
        default = [],
        action = "append",
        help = "include this url (use globs, repeat for more urls)")

    op.add_option (
        "--exclude",
        metavar = "GLOB",
        dest = "exclude",
        default = [],
        action = "append",
        help = "exclude this url (use globs, repeat for more urls)")

    op.add_option (
        "--include-mediatype",
        metavar = "GLOB/GLOB",
        dest = "include_mediatypes_argument",
        default = ['text/*', 'application/xhtml+xml'],
        action = "append",
        help = "include this mediatype (use globs, repeat for more mediatypes, eg. 'image/*')")

    op.add_option (
        "--exclude-mediatype",
        metavar = "GLOB/GLOB",
        dest = "exclude_mediatypes",
        default = [],
        action = "append",
        help = "exclude this mediatype (use globs, repeat for more mediatypes)")

    op.add_option (
        "--rewrite",
        metavar = "from>to",
        dest = "rewrite",
        default = [],
        action = "append",
        help = "rewrite url eg. 'http://www.example.org/>http://www.example.org/index.html'")

    op.add_option (
        "--title",
        dest = "title",
        default = None,
        help = "ebook title (default: from meta)")

    op.add_option (
        "--author",
        dest = "author",
        default = None,
        help = "author (default: from meta)")

    op.add_option (
        "--ebook",
        dest = "ebook",
        type = "int",
        default = 0,
        help = "ebook no. (default: from meta)")

    op.add_option (
        "--input-encoding",
        dest = "inputencoding",
        default = None,
        help = "input encoding (default: from meta)")

    op.add_option (
        "--output-dir",
        dest = "outputdir",
        default = "./",
        help = "output directory (default: ./)")

    op.add_option (
        "--output-file",
        dest = "outputfile",
        default = None,
        help = "output file (default: .epub)")

    op.add_option (
        "--packager",
        dest = "packager",
        choices = ['none', 'ww'],
        default = "none",
        help = "packager type [none | ww] (default: none)")

    op.add_option (
        "--mediatype-from-extension",
        dest = "mediatype_from_extension",
        action = "store_true",
        default = False,
        help = "get mediatype from url extension instead of http response")

    op.add_option (
        "--cover",
        dest = "coverpage_url",
        default = None,
        help = "add the specified cover to the epub")

    # second dict holds config-file fallback values (external tool names etc.)
    options, args = CommonOptions.parse_args (op, {}, {
        'proxies': None,
        'bibrec': 'http://www.gutenberg.org/ebooks/',
        'xelatex': 'xelatex',
        'mobigen': 'kindlegen',
        'groff': 'groff',
        'rhyming_dict': None,
        } )

    if not args:
        op.error ("please specify which file to convert")

    Logger.set_log_level (options.verbose)

    # expand shorthands like 'all' or 'html' into concrete format lists
    options.types = options.types or ['all']
    for opt, formats in DEPENDENCIES:
        if opt in options.types:
            options.types.remove (opt)
            options.types += formats

    # prepend builds that other builds depend on (picture dirs, epubs for kindle)
    if set (options.types).intersection (('html.images', 'pdf.images', 'rst.gen')):
        options.types.insert (0, 'picsdir.images')
    if set (options.types).intersection (('html.noimages', 'pdf.noimages')):
        options.types.insert (0, 'picsdir.noimages')
    if set (options.types).intersection (('kindle.images', )):
        options.types.insert (0, 'epub.images')
    if set (options.types).intersection (('kindle.noimages', )):
        options.types.insert (0, 'epub.noimages')


    debug ("Building types: %s" % ' '.join (options.types))

    ParserFactory.load_parsers ()
    WriterFactory.load_writers ()

    packager_factory = None
    if options.packager != 'none':
        packager_factory = PackagerFactory (options.packager)
        packager_factory.load ()

    for url in args:

        # default include pattern: everything in the source url's directory
        if options.include_argument:
            options.include = options.include_argument[:]
        else:
            exclude_patt = os.path.dirname (url) + '/*'
            options.include = [ exclude_patt ]
            if exclude_patt.startswith ('/'):
                options.include.append('file://' + exclude_patt)

        # try to get metadata

        options.candidate = Struct ()
        options.candidate.filename = url
        options.candidate.mediatype = str (DCIMT (
            mimetypes.types_map[os.path.splitext (url)[1]], options.inputencoding))

        options.include_mediatypes = options.include_mediatypes_argument[:]
        options.want_images = False
        #options.coverpage_url = None

        parser = ParserFactory.ParserFactory.create (options.candidate.filename, {})

        dc = None

        # metadata lookup order: rst header, then PG header, then plain HTML
        try:
            dc = DublinCore.GutenbergDublinCore ()

            # try for rst header
            dc.load_from_rstheader (parser.unicode_content ())

            if dc.project_gutenberg_id == 0:
                # try for Project Gutenberg header
                dc.load_from_parser (parser)

        except (ValueError, TypeError):
            # use standard HTML header
            dc = DublinCore.DublinCore ()
            dc.load_from_parser (parser)
            dc.source = url

        dc.source = url

        # command-line overrides beat parsed metadata; fall back to 'NA'
        if options.title:
            dc.title = options.title
        if not dc.title:
            dc.title = 'NA'

        if options.author:
            dc.add_author (options.author, 'cre')
        if not dc.authors:
            dc.add_author ('NA', 'cre')

        if options.ebook:
            dc.project_gutenberg_id = options.ebook

        # OPF identifier: PG ebook url, or a url-derived urn for other books
        if dc.project_gutenberg_id:
            dc.opf_identifier = ('http://www.gutenberg.org/ebooks/%d' % dc.project_gutenberg_id)
        else:
            dc.opf_identifier = ('urn:mybooks:%s' %
                                 hashlib.md5 (url.encode ('utf-8')).hexdigest ())

        if not dc.languages:
            # we *need* a language to build a valid epub, so just make one up
            dc.add_lang_id ('en')

        aux_file_list = []

        # build every requested format for this url
        for type_ in options.types:
            debug ('=== Building %s ===' % type_)
            # eg. 'epub.images' -> maintype 'epub', subtype '.images'
            maintype, subtype = os.path.splitext (type_)

            try:
                writer = WriterFactory.create (maintype)
                writer.setup (options)
                options.type = type_
                options.maintype = maintype
                options.subtype = subtype
                options.want_images = False

                options.include_mediatypes = options.include_mediatypes_argument[:]
                if subtype == '.images':
                    options.include_mediatypes.append ('image/*')
                    options.want_images = True
                else:
                    # This is the mediatype of the 'broken' image.
                    options.include_mediatypes.append ('image/png;type=resource')

                writer.parse (options)

                if maintype in ('html', ):
                    # list of images for packager
                    aux_file_list[:] = writer.get_aux_file_list ()

                options.dc = dc
                options.outputfile = make_output_filename (dc, type_)

                if maintype == 'kindle':
                    # kindlegen converts from a previously built epub
                    options.epub_filename = make_output_filename (dc, 'epub' + subtype)

                writer.build ()

                if options.validate:
                    writer.validate ()

                if packager_factory:
                    try:
                        packager = packager_factory.create (type_)
                        packager.setup (options)
                        packager.package (aux_file_list)
                    except KeyError:
                        # no such packager
                        pass

                options.outputfile = None

            except SkipOutputFormat:
                continue

            except StandardError, what:
                # log and carry on with the next format
                exception ("%s" % what)

        # 'ww' packaging: zip everything up for the whitewashers
        if options.packager == 'ww':
            try:
                packager = packager_factory.create ('push')
                options.outputfile = '%d-final.zip' % (dc.project_gutenberg_id)
                packager.setup (options)
                packager.package (aux_file_list)
            except KeyError:
                # no such packager
                pass

    sys.exit (0)
# script entry point
if __name__ == "__main__":
    main ()
398 |
399 |
400 |
401 |
--------------------------------------------------------------------------------
/epubmaker/HTMLChunker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | HTMLChunker.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Splits a HTML file into chunks.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import urlparse
19 | import urllib
20 | import os
21 | import re
22 | import copy
23 |
24 | from lxml import etree
25 |
26 | import epubmaker.lib.GutenbergGlobals as gg
27 | from epubmaker.lib.GutenbergGlobals import NS
28 | from epubmaker.lib.Logger import debug, error
29 | from epubmaker.CommonOptions import Options
30 |
# shared borg options instance (see CommonOptions)
options = Options()
# MAX_CHUNK_SIZE = 300 * 1024 # bytes
MAX_CHUNK_SIZE = 100 * 1024 # bytes

# (tag or tag.class selector, fraction of MAX_CHUNK_SIZE a chunk must
# already exceed before we split in front of this element -- see
# HTMLChunker.__init__ / split)
SECTIONS = [
    ('div.section', 0.0),
    ('div.chapter', 0.0),
    ('h1', 0.5),
    ('div', 0.5),
    ('h2', 0.7),
    ('h3', 0.75),
    ('p', 0.8)
]
44 |
def xpath (node, path):
    """ xpath helper: evaluate *path* on *node* with the project namespace map. """
    return node.xpath (path, namespaces = gg.NSMAP)
48 |
def unicode_uri (uri):
    """ Normalize URI for idmap.

    Unquote %xx escapes, then decode utf-8 so idmap keys are
    unicode strings (idmap values stay uri-escaped byte strings).
    """
    return urllib.unquote (uri).decode ('utf-8')
52 |
53 |
class HTMLChunker (object):
    """ Splits HTML tree into smaller chunks.

    Some epub viewers are limited in that they cannot display files
    larger than 300K. If our HTML happens to be longer, we have to
    split it up. Also smaller chunks do improve page flip times.


    """

    def __init__ (self):
        # list of dicts: { 'name', 'id', 'comment', 'chunk' }
        self.chunks = []
        # maps original url (or url#fragment) -> chunk url (or url#fragment)
        self.idmap = {}
        # chunk currently being filled (a deep copy of the template tree)
        self.chunk = None
        # the element inside self.chunk that receives the copied children
        self.chunk_body = None
        # serialized size in bytes of the current chunk
        self.chunk_size = 0
        # counter used to generate unique chunk filenames
        self.next_id = 0

        # tag -> minimum chunk size (bytes) before we split on that tag
        self.tags = {}
        for tag, size in SECTIONS:
            self.tags[NS.xhtml[tag]] = int (size * MAX_CHUNK_SIZE)
        # user-specified --section tags always force a split (threshold 0)
        for tag in options.section_tags:
            self.tags[NS.xhtml[tag]] = 0


    def _make_name (self, url):
        """ Generate a name for the chunk. """
        u = list (urlparse.urlparse (url))
        root, ext = os.path.splitext (u[2])
        # FIXME: brain-dead kindlegen only finds links in files with
        # .html extension. so we just add .html to everything
        u[2] = "%s-%d%s.html" % (root, self.next_id, ext)
        self.next_id += 1
        return urlparse.urlunparse (u)


    @staticmethod
    def make_template (tree):
        """ Make a copy with an empty html:body.

        This makes a template into which we can paste our chunks.

        """

        template = copy.deepcopy (tree)

        for c in xpath (template, '//xhtml:body'):

            # descend while elem has only one child
            while len (c) == 1:
                c = c[0]

            # clear children but save attributes
            attributes = c.attrib.items ()
            c.clear ()
            # was tentative fix for pathological one-element-html case
            # for child in c:
            #    c.remove (child)
            for a in attributes:
                c.set (a[0], a[1])

        # debug (etree.tostring (template))

        return template


    def reset_chunk (self, template):
        """ start a new chunk """

        self.chunk = copy.deepcopy (template)
        # start size at the template's serialized length, not zero
        self.chunk_size = len (etree.tostring (self.chunk))
        self.chunk_body = xpath (self.chunk, "//xhtml:body")[0]
        # descend to the same single-child element make_template cleared
        while len (self.chunk_body) == 1:
            self.chunk_body = self.chunk_body[0]


    def shipout_chunk (self, url, chunk_id = None, comment = None):
        """ ready chunk to be shipped """

        # an oversized chunk gets split recursively instead of shipped
        if (self.chunk_size > MAX_CHUNK_SIZE):
            self.split (self.chunk, url)
            return

        url = unicode_uri (url)
        chunk_name = self._make_name (url)

        # the url of the whole page
        if not url in self.idmap:
            self.idmap[url] = chunk_name

        # fragments of the page
        for e in xpath (self.chunk, '//xhtml:*[@id]'):
            id_ = e.attrib['id']
            old_id = "%s#%s" % (url, id_)
            # key is unicode string,
            # value is uri-escaped byte string
            # if ids get cloned while chunking, map to the first one only
            if old_id not in self.idmap:
                self.idmap[old_id] = "%s#%s" % (
                    chunk_name, urllib.quote (id_.encode ('utf-8')))

        self.chunks.append ( { 'name'    : chunk_name,
                               'id'      : chunk_id,
                               'comment' : comment,
                               'chunk'   : self.chunk, } )

        debug ("Adding chunk %s (%d bytes) %s" % (chunk_name, self.chunk_size, chunk_id))


    def split (self, tree, url):
        """ Split whole html or split chunk.

        Find some arbitrary points to do it.

        """

        for body in xpath (tree, "//xhtml:body"):
            # we can't split a node that has only one child
            # descend while elem has only one child
            while len (body) == 1:
                body = body[0]

            debug ("body tag is %s" % body.tag)

            template = self.make_template (tree)
            self.reset_chunk (template)

            # FIXME: is this ok ???
            # fixes pathological one-element-body case
            self.chunk_body.text = body.text

            for child in body:
                if not isinstance (child, etree.ElementBase):
                    # comments, processing instructions etc.
                    continue
                child_size = len (etree.tostring (child))

                # candidate split tags: 'tag.class' for each class, plus bare tag
                try:
                    tags = [child.tag + '.' + c for c in child.attrib['class'].split ()]
                    tags.append (child.tag)
                except KeyError:
                    tags = [child.tag]

                for tag in tags:
                    # split before this child if adding it would overflow, or if
                    # it is a section tag and the chunk passed its threshold
                    if ((self.chunk_size + child_size > MAX_CHUNK_SIZE) or
                          (tag in self.tags and
                           self.chunk_size > self.tags[tag])):

                        comment = ("Chunk: size=%d Split on %s"
                                   % (self.chunk_size, re.sub ('^{.*}', '', tag)))
                        debug (comment)

                        # find a suitable id
                        chunk_id = None
                        for c in self.chunk_body:
                            if 'id' in c.attrib:
                                chunk_id = c.get ('id')
                                break
                        debug ("chunk id is: %s" % (chunk_id or ''))

                        self.shipout_chunk (url, chunk_id, comment)
                        self.reset_chunk (template)
                        break

                # the child always lands in the (possibly fresh) current chunk
                self.chunk_body.append (child)
                self.chunk_size = self.chunk_size + child_size

            # fixes pathological one-element-body case
            self.chunk_body.tail = body.tail

            # ship whatever remains in the last chunk
            chunk_id = None
            if len (self.chunk_body):
                chunk_id = self.chunk_body[0].get ('id')
            comment = "Chunk: size=%d" % self.chunk_size
            self.shipout_chunk (url, chunk_id, comment)
            self.reset_chunk (template)


    def rewrite_links (self, f):
        """ Rewrite all href and src using f (). """

        for chunk in self.chunks:
            # chunk['name'] = f (chunk['name'])

            # external http(s) links are left untouched
            for link in xpath (chunk['chunk'], '//xhtml:*[@href]'):
                url = link.get ('href')
                if not url.startswith('http://') and not url.startswith('https://'):
                    link.set ('href', f (url))

            for image in xpath (chunk['chunk'], '//xhtml:*[@src]'):
                image.set ('src', f (image.get ('src')))

        # keep the idmap consistent with the rewritten targets
        for k, v in self.idmap.items ():
            self.idmap[k] = f (v)


    def rewrite_internal_links (self):
        """ Rewrite links to point into right chunks.

        Because we split the HTML into chunks, all internal links need
        to be rewritten to become links into the right chunk.
        Rewrite all internal links in all chunks.

        """
        for chunk in self.chunks:
            for a in xpath (chunk['chunk'], "//xhtml:*[@href]"):
                try:
                    uri = unicode_uri (a.get ('href'))
                    a.set ('href', self.idmap[uri])
                except KeyError:
                    # only complain if the base document is one we chunked;
                    # anything else is an external link and stays as-is
                    ur, dummy_frag = urlparse.urldefrag (uri)
                    if ur in self.idmap:
                        error ("HTMLChunker: Cannot rewrite internal link '%s'" % uri)


    def rewrite_internal_links_toc (self, toc):
        """ Rewrite links to point into right chunks.

        Because we split the HTML into chunks, all internal links need
        to be rewritten to become links into the right chunk.
        Rewrite all links in the passed toc.

        """

        for entry in toc:
            try:
                entry[0] = self.idmap [unicode_uri (entry[0])]
            except KeyError:
                error ("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
                del entry
284 |
285 |
286 |
--------------------------------------------------------------------------------
/epubmaker/ParserFactory.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | ParserFactory.py
7 |
8 | Copyright 2009-10 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | """
13 |
14 | from __future__ import with_statement
15 |
16 | import os.path
17 | import urllib
18 |
19 | from pkg_resources import resource_listdir # pylint: disable=E0611
20 |
21 | from epubmaker.mydocutils import broken
22 | from epubmaker.lib.Logger import debug, error
23 | from epubmaker.lib.MediaTypes import mediatypes
24 | from epubmaker.Version import VERSION
25 | from epubmaker.CommonOptions import Options
26 |
# module-level options shared across this module; attributes such as
# config.PROXIES are expected to be set elsewhere -- see CommonOptions
options = Options()

class AppURLopener (urllib.FancyURLopener):
    # custom User-Agent string sent with every HTTP request
    version = "ebookmaker/%s" % VERSION

# install our opener so plain urllib.urlopen () uses the custom User-Agent
urllib._urlopener = AppURLopener ()

# registry mapping mediatype -> parser module, filled by load_parsers ()
parser_modules = {}
35 |
def load_parsers ():
    """ See what types we can parse.

    Scans the epubmaker.parsers package for *Parser.py modules and
    registers each under the mediatypes it declares.  Returns the
    list of registered mediatypes.

    """

    for filename in resource_listdir ('epubmaker.parsers', ''):
        modulename, ext = os.path.splitext (filename)
        if ext != '.py' or not modulename.endswith ('Parser'):
            continue
        module = __import__ ('epubmaker.parsers.' + modulename, fromlist = [modulename])
        debug ("Loading parser from module: %s for mediatypes: %s" % (
            modulename, ', '.join (module.mediatypes)))
        for mediatype in module.mediatypes:
            parser_modules[mediatype] = module

    return parser_modules.keys ()
50 |
51 |
def unload_parsers ():
    """ Unload parser modules.

    Empties the mediatype registry so the parser modules can be
    garbage collected.

    """
    # dict.clear () replaces deleting keys while iterating over keys (),
    # which would raise RuntimeError under Python 3
    parser_modules.clear ()
56 |
57 |
class ParserFactory (object):
    """ A factory and a cache for parsers.

    So we don't reparse the same file twice.

    """

    parsers = {} # cache: parsers[url] = parser

    @staticmethod
    def get (mediatype):
        """ Get the right kind of parser.

        Falls back to the catch-all '*/*' parser for unknown
        mediatypes.

        """
        try:
            return parser_modules[mediatype].Parser ()
        except KeyError:
            return parser_modules['*/*'].Parser ()


    @classmethod
    def create (cls, url, attribs):
        """ Create an appropriate parser.

        Returns a cached parser when the url (or the url it redirects
        to) was seen before; otherwise opens the url, determines the
        mediatype, instantiates the matching parser and caches it
        under both the original and the final (redirected) url.

        """

        # debug ("Need parser for %s" % url)

        if url in cls.parsers:
            # debug ("... reusing parser for %s" % url)
            # reuse same parser, maybe already filled with data
            return cls.parsers[url]

        orig_url = url
        mediatype = attribs.get ('mediatype')

        if url.endswith (broken):
            # hack! broken.png doesn't exist at the source location.
            # We take it from our resources and fake its provenience.
            parser = parser_modules['image/png'].Parser ()
            parser.orig_url = url
            parser.url = url
            parser.broken_image ()
        else:
            fp = urllib.urlopen (url, proxies = options.config.PROXIES)
            url = fp.geturl ()

            if url != orig_url:
                debug ("... %s redirected to %s" % (orig_url, url))
                if url in cls.parsers:
                    # debug ("... reusing parser for %s" % url)
                    # reuse same parser, maybe already filled with data
                    return cls.parsers[url]

            # ok. so we have to create a new parser
            debug ("... creating new parser for %s" % url)

            if mediatype is not None:
                debug ("... got mediatype %s from link attributes" % mediatype)
            else:
                if options.mediatype_from_extension or not hasattr (fp, 'info'):
                    # NOTE(review): an extension missing from the mediatypes
                    # table raises an uncaught KeyError here -- confirm intended
                    name, ext = os.path.splitext (url)
                    mediatype = mediatypes[ext[1:]]
                else:
                    msg = fp.info ()
                    mediatype = msg.get ('Content-Type')
                    if mediatype:
                        # strip charset parameters like '; charset=utf-8'
                        mediatype = mediatype.partition (';')[0]
                        debug ("... got mediatype %s from server" % mediatype)
                    else:
                        mediatype = 'application/octet-stream'
                        error ("... cannot determine mediatype for %s" % url)

            # get the right kind of parser
            try:
                mt = mediatype.split (';')[0]
                parser = parser_modules[mt].Parser ()
            except KeyError:
                parser = parser_modules['*/*'].Parser ()

            parser.setup (orig_url, mediatype, attribs, fp)

        # cache under both names so future requests for either hit the cache
        cls.parsers[parser.url] = parser
        cls.parsers[orig_url] = parser

        return parser


    @classmethod
    def clear (cls):
        """ Clear parser cache to free memory. """

        # debug: kill refs
        # NOTE(review): `del parser` only unbinds the local name; the actual
        # release happens when cls.parsers is rebound below
        for dummy_url, parser in cls.parsers.items ():
            del parser

        cls.parsers = {}
151 |
152 |
--------------------------------------------------------------------------------
/epubmaker/Spider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | Spider.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Rudimentary Web Spider
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import urlparse
19 | import fnmatch
20 |
21 | from epubmaker.lib import MediaTypes
22 | import epubmaker.lib.GutenbergGlobals as gg
23 | from epubmaker.lib.GutenbergGlobals import NS
24 | from epubmaker.lib.Logger import debug, error
25 |
26 | from epubmaker import ParserFactory
27 |
# minimum pixel area (width * height) an image must exceed to qualify as coverpage
COVERPAGE_MIN_AREA = 200 * 200
29 |
class Spider (object):
    """ A very rudimentary web spider. """

    def __init__ (self):
        self.options = None             # set by parse ()
        self.parsed_urls = set ()       # urls already parsed
        self.enqueued_urls = set ()     # urls ever put on the queue
        self.included_mediatypes = set ()
        self.excluded_mediatypes = set ()
        self.queue = []                 # FIFO of (url, depth, attribs)
        self.parsers = []               # parsers in retrieval order
        self.next = [] # for a topological sort
        self.redirection_map = {}       # url -> url rewrites/redirections


    def parse (self, url, mediatype_hint, options):
        """ Do a recursive parse starting from url.

        Do a breadth-first traversal. Assuming the first page contains
        a linked TOC, this will get us a more natural ordering of the
        pages than a depth-first traversal.

        `mediatype_hint` seeds the mediatype of the root document;
        `options` supplies rewrite rules, depth limits and
        include/exclude patterns.

        """

        self.options = options

        # user-supplied rewrites come as 'from>to' strings
        for rewrite in self.options.rewrite:
            from_, to = rewrite.split ('>')
            self.redirection_map[from_] = to

        debug ("Start of retrieval")

        # enqueue root url

        attribs = { 'mediatype' : mediatype_hint, 'id': 'start' }
        self.enqueue (url, 0, attribs)

        while self.queue:
            (url, depth, attribs) = self.queue.pop (0)

            url = self.redirect (url)
            if url in self.parsed_urls:
                continue

            parser = ParserFactory.ParserFactory.create (url, attribs)
            self.add_redirection (parser)

            # if the url was redirected to something we already have
            url = self.redirect (parser.url)
            if url in self.parsed_urls:
                continue

            self.parsed_urls.add (url)
            parser.options = self.options
            parser.pre_parse ()
            self.parsers.append (parser)

            # check potential coverpage for sufficient size
            if options.coverpage_url is None:
                if attribs.get ('rel', '') == 'coverpage':
                    if hasattr (parser, 'get_image_dimen'):
                        dimen = parser.get_image_dimen ()
                        if (dimen[0] * dimen[1]) > COVERPAGE_MIN_AREA:
                            options.coverpage_url = parser.url
                            debug ("Setting coverpage: %s ..." % parser.url)

            depth += 1

            # look for links in just parsed document
            debug ("Requesting iterlinks for: %s ..." % url)

            for (url, attr) in parser.iterlinks ():
                # debug ("*** link: %s ..." % url)

                # fragments don't matter for retrieval
                url = urlparse.urldefrag (url)[0]
                tag = attr.get ('tag', '')

                # remember rel='next' edges for the topological sort below
                if tag == NS.xhtml.link:
                    if attr.get ('rel', '').lower () == 'next':
                        self.next.append ((parser.url, url))

                url = self.redirect (url)

                attribs = { 'mediatype' : attr.get ('type', None) }

                for k in ('id', 'rel'):
                    if k in attr:
                        attribs[k] = attr[k]

                if tag == NS.xhtml.a:
                    self.enqueue_doc (url, depth, attribs)
                    continue
                if tag == NS.xhtml.img:
                    self.enqueue_aux (url, depth, attribs)
                    continue
                if tag == NS.xhtml.object:
                    if ('type' in attr and
                        not self.is_included_mediatype (attr['type'])):
                        continue
                    self.enqueue_aux (url, depth, attribs)
                    continue
                if tag == NS.xhtml.link:
                    rel = attribs.get ('rel', '').lower ()
                    if 'stylesheet' in rel:
                        self.enqueue_aux (url, depth, attribs)
                    elif rel == 'coverpage':
                        # We may also find the coverpage in a <link rel='coverpage'>
                        self.enqueue_aux (url, depth, attribs)
                    else:
                        self.enqueue_doc (url, depth, attribs)
                    continue

        debug ("End of retrieval")

        # rewrite redirected urls
        if self.redirection_map:
            for parser in self.parsers:
                parser.remap_links (self.redirection_map)

        # try a topological sort of documents using the rel='next' edges
        if self.next:
            self.next = map (lambda x: (self.redirect(x[0]), self.redirect(x[1])), self.next)

            try:
                d = {}
                for order, url in enumerate (gg.topological_sort (self.next)):
                    d[url] = order
                    debug ("%s order %d" % (url, order))
                # unordered documents sort last
                for parser in self.parsers:
                    parser.order = d.get (parser.url, 999999)
                self.parsers.sort (key = lambda p: p.order)

            # NOTE(review): Python 2 only; silently keeps retrieval order
            # when the sort fails (e.g. on cyclic 'next' links) -- confirm
            except StandardError:
                pass


    def add_redirection (self, parser):
        """ Remember this redirection. """
        if parser.orig_url != parser.url:
            self.redirection_map[parser.orig_url] = parser.url
            debug ("Adding redirection from %s to %s" % (parser.orig_url, parser.url))


    def redirect (self, url):
        """ Redirect url if we know the target. """
        return self.redirection_map.get (url, url)


    def enqueue (self, url, depth, attribs):
        """ Enque url for parsing.

        A url is enqueued at most once.

        """

        url = self.redirect (url)
        if url in self.enqueued_urls:
            return

        debug ("Enqueing %s ..." % url)
        self.queue.append ((url, depth, attribs))
        self.enqueued_urls.add (url)


    def enqueue_aux (self, url, depth, attribs):
        """ Enqueue an auxiliary file.

        We get auxiliary files even if they are too deep or not in
        'included' directories.

        """
        try:
            parser = ParserFactory.ParserFactory.create (url, attribs)
            self.add_redirection (parser)
            if self.is_wanted_aux (parser):
                self.enqueue (parser.url, depth, attribs)
        except IOError:
            error ("bad aux url: %s" % url)

    def enqueue_doc (self, url, depth, attribs):
        """ Enqueue a document file.

        We get document files only if they pass document-selection
        rules.

        """

        # max_depth == 0 / None means unlimited depth
        if not self.options.max_depth or depth < self.options.max_depth:
            if self.is_included (url):
                try:
                    parser = ParserFactory.ParserFactory.create (url, attribs)
                    self.add_redirection (parser)
                    if self.is_wanted_doc (parser):
                        self.enqueue (parser.url, depth, attribs)
                except IOError:
                    error ("bad url: %s" % url)


    def is_included (self, url):
        """ Return True if this document is eligible.

        Checked against the include/exclude glob patterns and the
        local_files_only option.

        """

        included = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.include))
        excluded = any (map (lambda x: fnmatch.fnmatchcase (url, x), self.options.exclude))

        if included and not excluded:
            if self.options.local_files_only:
                # drop anything fetched over the web
                if url.startswith('http:') or url.startswith('https:'):
                    return 0
                else:
                    return 1
            return 1

        if excluded:
            debug ("Dropping excluded %s" % url)
        if not included:
            debug ("Dropping not included %s" % url)
        return 0


    def is_included_mediatype (self, mediatype):
        """ Return True if this document is eligible.

        Checked against the include/exclude mediatype patterns; the
        decision is recorded in included_/excluded_mediatypes.

        """

        included = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
                             self.options.include_mediatypes))
        excluded = any (map (lambda pattern: fnmatch.fnmatch (mediatype, pattern),
                             self.options.exclude_mediatypes))

        if included and not excluded:
            self.included_mediatypes.add (mediatype)
            return 1

        if excluded:
            debug ("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug ("Dropping not included mediatype %s" % mediatype)

        self.excluded_mediatypes.add (mediatype)
        return 0


    def has_seen_images (self):
        """ Return True if the spider has encountered images. """

        return bool (MediaTypes.IMAGE_MEDIATYPES &
                     (self.included_mediatypes | self.excluded_mediatypes))


    def dict_urls_mediatypes (self):
        """ Return a dict of all parsed urls and mediatypes. """
        return dict (map (lambda p: (p.url, p.mediatype), self.parsers))


    def is_wanted_doc (self, parser):
        """ Return True if we ought to parse this content document.

        Override this in custom spiders.

        """
        return self.is_included_mediatype (parser.mediatype)


    def is_wanted_aux (self, parser):
        """ Return True if we ought to parse this image or aux file.

        Override this in custom spiders.

        """
        return self.is_included_mediatype (parser.mediatype)
294 |
295 |
296 |
--------------------------------------------------------------------------------
/epubmaker/Unitame.py:
--------------------------------------------------------------------------------
1 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
2 |
3 | """
4 | Unitame.py
5 |
6 | Copyright 2010 by Marcello Perathoner
7 |
8 | Distributable under the GNU General Public License Version 3 or newer.
9 |
10 | Module to implement the totally superfluous PG plain text conversion
11 | into long extinct encodings.
12 |
13 | We have to unitame-translate before feeding to nroff because nroff
14 | does some irreversible (and wrong) translations of its own, like ä ->
15 | a. Also, some unitame-translations change the number of characters,
16 | thus throwing already-justified text off.
17 |
18 | We cannot do the translations before feeding the source to docutils
19 | because if we change the length of titles, we get the warning: Title
20 | underline too short.
21 |
22 | Translation does some dangerous things, like converting quotes to
23 | apostrophes, which are command escapes in nroff. We have to escape
24 | apostrophes in the source text but not apostroph-commands inserted by
25 | the converter.
26 |
27 | We also have to translate some important non-ascii characters, like
28 | nbsp and shy, into command sequences before they reach unitame because
29 | unitame would convert them into the semantically different space and
30 | hyhpen.
31 |
32 | All this makes translation inside the docutils converter the best
33 | choice. Implemented as a docutils translator that visits all text
34 | nodes.
35 |
36 | Smart quote translation should also go into a docutils
37 | translator. Likewise a translator for text-transform: upper.
38 |
39 | """
40 |
41 | import codecs
42 | import unicodedata as ud
43 |
44 | # UnitameData is generated from unitame.dat
45 | from epubmaker.UnitameData import unicode_to_iso_8859_1, iso_8859_1_to_ascii
46 |
# tweak dicts for translate ()
# unicode.translate () wants {codepoint: replacement string} mappings
u2i = dict ( [ (ord (o), s) for o, s in unicode_to_iso_8859_1.iteritems () ] )
i2a = dict ( [ (ord (o), s) for o, s in iso_8859_1_to_ascii.iteritems () ] )

# additional fallbacks applied on the way down to iso-8859-1
u2i.update ( {
    0x2000: u' ', # en quad
    0x2001: u' ', # em quad
    0x2002: u' ', # en space
    0x2003: u' ', # em space
    0x2004: u' ', # 3/em space
    0x2005: u'', # 4/em
    0x2006: u'', # 6/em
    0x2007: u' ', # figure space
    0x2008: u'', # punctuation space
    0x2009: u'', # thin space
    0x200a: u'', # hair space
    0x200b: u'', # zero space
    0x200c: u'', # zwnj
    0x200d: u'', # zwj
    0x2010: u'-', # hyphen
    0x2011: u'-', # non-breaking hyphen
    0x2012: u'-', # figure-dash
    0x2013: u'-', # en dash
    0x2014: u'--', # em dash
    0x2015: u'-', # horizontal bar
    0x2026: u'...', # horizontal ellipsis
    ord (u'™'): u'(tm)',
    ord (u'‹'): u'<',
    ord (u'›'): u'>',
    ord (u'†'): u'+',
    ord (u'‡'): u'++',
    ord (u'⁑'): u'**',
    ord (u'⁂'): u'***',
    ord (u'•'): u'-',
    ord (u'′'): u'´',
    ord (u'″'): u'´´',
    ord (u'‴'): u'´´´',
    ord (u'⁗'): u'´´´´',
    ord (u'⁓'): u'~',
    ord (u'‰'): u'%o',
    ord (u'‱'): u'%oo',
    ord (u'⚹'): u'*', # U+26b9 sextile
    ord (u'⁰'): u'^0',
    ord (u'⁴'): u'^4',
    ord (u'⁵'): u'^5',
    ord (u'⁶'): u'^6',
    ord (u'⁷'): u'^7',
    ord (u'⁸'): u'^8',
    ord (u'⁹'): u'^9',
    } )

# somehow cram these into ascii, so the ppers stop whining about not
# having nbsp in ascii, then fix it later by replacing them with nroff
# commands.

i2a.update ( {
    ord (u'¹'): u'^1',
    ord (u'²'): u'^2',
    ord (u'³'): u'^3',
    0x00a0: u'\u0011', # nbsp => DC1
    0x00ad: u'\u0012', # shy => DC2
    } )

# diagnostic log of characters the unitame () handler could not translate
unhandled_chars = []
111 |
def strip_accents (text):
    """ Strip accents from string.

    If the accented character doesn't fit into the encoding,
    remove the accent and try again.

    Decomposes *text* (NFKD), drops all combining marks (category
    'Mn') and recomposes (NFKC).  Built with a generator + join
    instead of filter () so the argument to ud.normalize () is a
    string on Python 3 too, where filter () returns an iterator.

    """
    return ud.normalize ('NFKC',
                         u''.join (c for c in ud.normalize ('NFKD', text)
                                   if ud.category (c) != 'Mn'))
122 |
123 |
def unitame (exc):
    """
    Encoding error handler.

    The encoder handles all compatible characters itself. It calls
    this function whenever it encounters a character it cannot encode.
    This function searches the unitame database for a replacement.

    Follows the codecs error-handler protocol: *exc* is the
    UnicodeEncodeError; returns (replacement string, position to
    resume encoding at).

    """

    l = []
    for cc in exc.object[exc.start:exc.end]:
        c = cc
        if exc.encoding == 'latin-1': # python name for iso-8859-1
            c = c.translate (u2i)
            c = strip_accents (c)
            # accept only if every char now fits into latin-1
            if c and ord (max (c)) < 256:
                l.append (c)
                c = None
        elif exc.encoding == 'ascii': # python name for us-ascii
            # "1¼" -> "1 1/4"
            if cc in u'¼½¾':
                if exc.start > 0 and exc.object[exc.start - 1] in u'0123456789':
                    l.append (' ')
            c = c.translate (u2i)
            c = c.translate (i2a)
            c = strip_accents (c)
            # accept only if every char now fits into us-ascii
            if c and ord (max (c)) < 128:
                l.append (c)
                c = None

        if c:
            # no replacement found: emit a visible placeholder
            l.append ('{~%s U+%04x~}' % (ud.name (cc), ord (cc)))
            # NOTE(review): extends with everything collected so far, not
            # just the placeholder -- looks accidental; verify before changing
            unhandled_chars.extend (l)

    return (u"".join (l), exc.end)


# make the handler available as errors = 'unitame' in encode () calls
codecs.register_error ('unitame', unitame)
164 |
165 |
166 |
--------------------------------------------------------------------------------
/epubmaker/UnitameData.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
# Replacement table for characters outside iso-8859-1: maps each unicode
# character to an iso-8859-1 compatible approximation.  Generated from
# unitame.dat; consumed (via u2i) by the 'unitame' codec error handler.
unicode_to_iso_8859_1 = {
    u'Đ': u'D', # LATIN CAPITAL LETTER D WITH STROKE
    u'đ': u'd', # LATIN SMALL LETTER D WITH STROKE
    u'Ħ': u'H', # LATIN CAPITAL LETTER H WITH STROKE
    u'ħ': u'h', # LATIN SMALL LETTER H WITH STROKE
    u'Ŀ': u'L', # LATIN CAPITAL LETTER L WITH MIDDLE DOT
    u'ŀ': u'l', # LATIN SMALL LETTER L WITH MIDDLE DOT
    u'Ł': u'L', # LATIN CAPITAL LETTER L WITH STROKE
    u'ł': u'l', # LATIN SMALL LETTER L WITH STROKE
    u'ʼn': u'n', # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    u'Œ': u'OE', # LATIN CAPITAL LIGATURE OE
    u'œ': u'oe', # LATIN SMALL LIGATURE OE
    u'Ŧ': u'T', # LATIN CAPITAL LETTER T WITH STROKE
    u'ŧ': u't', # LATIN SMALL LETTER T WITH STROKE
    u'ƀ': u'b', # LATIN SMALL LETTER B WITH STROKE
    u'Ɓ': u'B', # LATIN CAPITAL LETTER B WITH HOOK
    u'Ƃ': u'B', # LATIN CAPITAL LETTER B WITH TOPBAR
    u'ƃ': u'b', # LATIN SMALL LETTER B WITH TOPBAR
    u'Ɔ': u'O', # LATIN CAPITAL LETTER OPEN O
    u'Ƈ': u'C', # LATIN CAPITAL LETTER C WITH HOOK
    u'ƈ': u'c', # LATIN SMALL LETTER C WITH HOOK
    u'Ɗ': u'D', # LATIN CAPITAL LETTER D WITH HOOK
    u'Ƌ': u'D', # LATIN CAPITAL LETTER D WITH TOPBAR
    u'ƌ': u'd', # LATIN SMALL LETTER D WITH TOPBAR
    u'Ƒ': u'F', # LATIN CAPITAL LETTER F WITH HOOK
    u'ƒ': u'f', # LATIN SMALL LETTER F WITH HOOK
    u'Ɠ': u'G', # LATIN CAPITAL LETTER G WITH HOOK
    u'Ɨ': u'I', # LATIN CAPITAL LETTER I WITH STROKE
    u'Ƙ': u'K', # LATIN CAPITAL LETTER K WITH HOOK
    u'ƙ': u'k', # LATIN SMALL LETTER K WITH HOOK
    u'ƚ': u'l', # LATIN SMALL LETTER L WITH BAR
    u'Ɲ': u'N', # LATIN CAPITAL LETTER N WITH LEFT HOOK
    u'ƞ': u'n', # LATIN SMALL LETTER N WITH LONG RIGHT LEG
    u'Ɵ': u'O', # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
    u'Ƥ': u'P', # LATIN CAPITAL LETTER P WITH HOOK
    u'ƥ': u'p', # LATIN SMALL LETTER P WITH HOOK
    u'ƫ': u't', # LATIN SMALL LETTER T WITH PALATAL HOOK
    u'Ƭ': u'T', # LATIN CAPITAL LETTER T WITH HOOK
    u'ƭ': u't', # LATIN SMALL LETTER T WITH HOOK
    u'Ʈ': u'T', # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
    u'Ʋ': u'V', # LATIN CAPITAL LETTER V WITH HOOK
    u'Ƴ': u'Y', # LATIN CAPITAL LETTER Y WITH HOOK
    u'ƴ': u'y', # LATIN SMALL LETTER Y WITH HOOK
    u'Ƶ': u'Z', # LATIN CAPITAL LETTER Z WITH STROKE
    u'ƶ': u'z', # LATIN SMALL LETTER Z WITH STROKE
    u'Lj': u'L', # LATIN CAPITAL LETTER L WITH SMALL LETTER J
    u'Nj': u'N', # LATIN CAPITAL LETTER N WITH SMALL LETTER J
    u'Ǣ': u'AE', # LATIN CAPITAL LETTER AE WITH MACRON
    u'ǣ': u'ae', # LATIN SMALL LETTER AE WITH MACRON
    u'Ǥ': u'G', # LATIN CAPITAL LETTER G WITH STROKE
    u'ǥ': u'g', # LATIN SMALL LETTER G WITH STROKE
    u'Dz': u'D', # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
    u'Ǽ': u'AE', # LATIN CAPITAL LETTER AE WITH ACUTE
    u'ǽ': u'ae', # LATIN SMALL LETTER AE WITH ACUTE
    u'Ǿ': u'O', # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
    u'ǿ': u'o', # LATIN SMALL LETTER O WITH STROKE AND ACUTE
    u'Ƞ': u'N', # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
    u'ȡ': u'd', # LATIN SMALL LETTER D WITH CURL
    u'Ȥ': u'Z', # LATIN CAPITAL LETTER Z WITH HOOK
    u'ȥ': u'z', # LATIN SMALL LETTER Z WITH HOOK
    u'ȴ': u'l', # LATIN SMALL LETTER L WITH CURL
    u'ȵ': u'n', # LATIN SMALL LETTER N WITH CURL
    u'ȶ': u't', # LATIN SMALL LETTER T WITH CURL
    u'ɓ': u'b', # LATIN SMALL LETTER B WITH HOOK
    u'ɕ': u'c', # LATIN SMALL LETTER C WITH CURL
    u'ɖ': u'd', # LATIN SMALL LETTER D WITH TAIL
    u'ɗ': u'd', # LATIN SMALL LETTER D WITH HOOK
    u'ɠ': u'g', # LATIN SMALL LETTER G WITH HOOK
    u'ɦ': u'h', # LATIN SMALL LETTER H WITH HOOK
    u'ɨ': u'i', # LATIN SMALL LETTER I WITH STROKE
    u'ɫ': u'l', # LATIN SMALL LETTER L WITH MIDDLE TILDE
    u'ɬ': u'l', # LATIN SMALL LETTER L WITH BELT
    u'ɭ': u'l', # LATIN SMALL LETTER L WITH RETROFLEX HOOK
    u'ɱ': u'm', # LATIN SMALL LETTER M WITH HOOK
    u'ɲ': u'n', # LATIN SMALL LETTER N WITH LEFT HOOK
    u'ɳ': u'n', # LATIN SMALL LETTER N WITH RETROFLEX HOOK
    u'ɼ': u'r', # LATIN SMALL LETTER R WITH LONG LEG
    u'ɽ': u'r', # LATIN SMALL LETTER R WITH TAIL
    u'ɾ': u'r', # LATIN SMALL LETTER R WITH FISHHOOK
    u'ʂ': u's', # LATIN SMALL LETTER S WITH HOOK
    u'ʈ': u't', # LATIN SMALL LETTER T WITH RETROFLEX HOOK
    u'ʉ': u'u', # LATIN SMALL LETTER U BAR
    u'ʋ': u'v', # LATIN SMALL LETTER V WITH HOOK
    u'ʐ': u'z', # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
    u'ʑ': u'z', # LATIN SMALL LETTER Z WITH CURL
    u'ʜ': u'H', # LATIN LETTER SMALL CAPITAL H
    u'ʝ': u'j', # LATIN SMALL LETTER J WITH CROSSED-TAIL
    u'ʠ': u'q', # LATIN SMALL LETTER Q WITH HOOK
    u'ʮ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK
    u'ʯ': u'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
    u'Ѝ': u'I', # CYRILLIC CAPITAL LETTER I WITH GRAVE
    u'ѝ': u'i', # CYRILLIC SMALL LETTER I WITH GRAVE
    u'Ӑ': u'A', # CYRILLIC CAPITAL LETTER A WITH BREVE
    u'ӑ': u'a', # CYRILLIC SMALL LETTER A WITH BREVE
    u'Ӓ': u'A', # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
    u'ӓ': u'a', # CYRILLIC SMALL LETTER A WITH DIAERESIS
    u'Ӣ': u'I', # CYRILLIC CAPITAL LETTER I WITH MACRON
    u'ӣ': u'i', # CYRILLIC SMALL LETTER I WITH MACRON
    u'Ӥ': u'I', # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
    u'ӥ': u'i', # CYRILLIC SMALL LETTER I WITH DIAERESIS
    u'Ӧ': u'O', # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
    u'ӧ': u'o', # CYRILLIC SMALL LETTER O WITH DIAERESIS
    u'Ӭ': u'E', # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
    u'ӭ': u'e', # CYRILLIC SMALL LETTER E WITH DIAERESIS
    u'Ӯ': u'U', # CYRILLIC CAPITAL LETTER U WITH MACRON
    u'ӯ': u'u', # CYRILLIC SMALL LETTER U WITH MACRON
    u'Ӱ': u'U', # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
    u'ӱ': u'u', # CYRILLIC SMALL LETTER U WITH DIAERESIS
    u'Ӳ': u'U', # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
    u'ӳ': u'u', # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
    u'ẚ': u'a', # LATIN SMALL LETTER A WITH RIGHT HALF RING
    u'‐': u'-', # HYPHEN
    u'–': u'-', # EN DASH
    u'—': u'--', # EM DASH
    u'‖': u'||', # DOUBLE VERTICAL LINE
    u'‗': u'_', # DOUBLE LOW LINE
    u'‘': u'\'', # LEFT SINGLE QUOTATION MARK
    u'’': u'\'', # RIGHT SINGLE QUOTATION MARK
    u'‚': u'\'', # SINGLE LOW-9 QUOTATION MARK
    u'‛': u'\'', # SINGLE HIGH-REVERSED-9 QUOTATION MARK
    u'“': u'"', # LEFT DOUBLE QUOTATION MARK
    u'”': u'"', # RIGHT DOUBLE QUOTATION MARK
    u'„': u'"', # DOUBLE LOW-9 QUOTATION MARK
    u'‟': u'"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    u'⁅': u'[', # LEFT SQUARE BRACKET WITH QUILL
    u'⁆': u']', # RIGHT SQUARE BRACKET WITH QUILL
}
131 |
132 |
# Replacement table for iso-8859-1 characters outside us-ascii: maps each
# character to an ascii approximation.  Generated from unitame.dat; consumed
# (via i2a) by the 'unitame' codec error handler.
iso_8859_1_to_ascii = {
    u'¡': u'i', # INVERTED EXCLAMATION MARK
    u'¢': u'c', # CENT SIGN
    u'£': u'L', # POUND SIGN
    u'¥': u'Y', # YEN SIGN
    u'¦': u'|', # BROKEN BAR
    u'§': u'Sec.', # SECTION SIGN
    u'¨': u'"', # DIAERESIS
    u'©': u'(C)', # COPYRIGHT SIGN
    u'«': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    # written as an escape: U+00AD is invisible as a bare literal and
    # silently corruptible by editors and copy/paste
    u'\u00ad': u'-', # SOFT HYPHEN
    u'®': u'(R)', # REGISTERED SIGN
    u'¯': u'-', # MACRON
    u'°': u' deg.', # DEGREE SIGN
    u'±': u'+-', # PLUS-MINUS SIGN
    u'²': u'^2', # SUPERSCRIPT TWO
    u'³': u'^3', # SUPERSCRIPT THREE
    u'´': u'\'', # ACUTE ACCENT
    u'µ': u' mu', # MICRO SIGN
    u'·': u'.', # MIDDLE DOT
    u'»': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    u'¼': u'1/4', # VULGAR FRACTION ONE QUARTER
    u'½': u'1/2', # VULGAR FRACTION ONE HALF
    u'¾': u'3/4', # VULGAR FRACTION THREE QUARTERS
    u'¿': u'?', # INVERTED QUESTION MARK
    u'Ä': u'Ae', # LATIN CAPITAL LETTER A WITH DIAERESIS
    u'Æ': u'AE', # LATIN CAPITAL LETTER AE
    u'Ð': u'Eth', # LATIN CAPITAL LETTER ETH
    u'Ö': u'Oe', # LATIN CAPITAL LETTER O WITH DIAERESIS
    u'×': u'x', # MULTIPLICATION SIGN
    u'Ø': u'O', # LATIN CAPITAL LETTER O WITH STROKE
    u'Ü': u'Ue', # LATIN CAPITAL LETTER U WITH DIAERESIS
    u'ß': u'ss', # LATIN SMALL LETTER SHARP S
    u'ä': u'ae', # LATIN SMALL LETTER A WITH DIAERESIS
    u'æ': u'ae', # LATIN SMALL LETTER AE
    u'ð': u'eth', # LATIN SMALL LETTER ETH
    # u'ñ': u'ny', # LATIN SMALL LETTER N WITH TILDE
    u'ö': u'oe', # LATIN SMALL LETTER O WITH DIAERESIS
    u'÷': u'/', # DIVISION SIGN
    u'ø': u'o', # LATIN SMALL LETTER O WITH STROKE
    u'ü': u'ue', # LATIN SMALL LETTER U WITH DIAERESIS
}
175 |
176 |
177 |
--------------------------------------------------------------------------------
/epubmaker/Version.py:
--------------------------------------------------------------------------------
# Version of the epubmaker distribution; also embedded in the HTTP User-Agent.
VERSION = '0.3.26'
# Generator tag template; callers substitute VERSION for the %s.
# NOTE(review): the trailing space looks deliberate -- confirm before removing.
GENERATOR = 'EpubMaker %s '
3 |
--------------------------------------------------------------------------------
/epubmaker/WriterFactory.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | WriterFactory.py
7 |
8 | Copyright 2009-14 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Writer factory. Dynamically loads writers from directories.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import os.path
19 |
20 | from pkg_resources import resource_isdir, resource_listdir # pylint: disable=E0611
21 |
22 | from epubmaker.lib.Logger import debug
23 |
24 | writers = {}
25 |
def __load_writers_from (package_name):
    """ See what types we can write.

    Scans *package_name* for *Writer.py modules and registers each
    under its lowercased name minus the 'Writer' suffix.

    """

    try:
        for filename in resource_listdir (package_name, ''):
            modulename, ext = os.path.splitext (filename)
            if ext != '.py' or not modulename.endswith ('Writer'):
                continue
            type_ = modulename.lower ().replace ('writer', '')
            debug ("Loading writer type %s from module %s" % (type_, modulename))
            module = __import__ (package_name + '.' + modulename, fromlist = [modulename])
            writers[type_] = module

    except ImportError:
        # optional writer packages may simply be absent
        pass
41 |
42 |
def load_writers ():
    """ See what types we can write.

    Returns the registered writer type names.

    """

    for package in ('epubmaker.writers', 'epubmaker.writers.ibiblio'):
        __load_writers_from (package)

    return writers.keys ()
50 |
51 |
def unload_writers ():
    """ Unload writer modules.

    Empties the writer registry so the modules can be garbage
    collected.

    """
    # dict.clear () replaces deleting keys while iterating over keys (),
    # which would raise RuntimeError under Python 3
    writers.clear ()
56 |
57 |
def create (type_):
    """ Load writer module for type.

    Returns a new Writer instance for *type_*.

    Raises KeyError if no writer is registered for *type_*.

    """

    try:
        module = writers[type_]
    except KeyError:
        # narrowed try: only the registry lookup is translated; a KeyError
        # raised inside Writer () itself must propagate unmangled instead of
        # being mislabeled as 'No writer for type'
        raise KeyError ('No writer for type %s' % type_)
    return module.Writer ()
65 |
66 |
67 |
--------------------------------------------------------------------------------
/epubmaker/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 |
--------------------------------------------------------------------------------
/epubmaker/lib/GutenbergGlobals.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 | GutenbergGlobals.py
6 |
7 | Copyright 2009 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | This module has sadly decayed into a repository for all sorts of cruft.
12 |
13 | FIXME: refactor this module
14 |
15 | """
16 |
17 | import os
18 | import re
19 | import datetime
20 |
class Struct (object):
    """ handy class to pin attributes on

    usage: c = Struct ()
           c.something = 1

    """
    # intentionally empty: instances serve as mutable attribute bags
    pass
29 |
30 |
# XML namespace prefix -> URI map for all vocabularies used by epubmaker.
# Entries marked 'URL' double as plain project URLs.
NSMAP = {
    'atom': 'http://www.w3.org/2005/Atom',
    'bio': 'http://purl.org/vocab/bio/0.1/',
    'cc': 'http://web.resource.org/cc/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcam': 'http://purl.org/dc/dcam/',
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/',
    'ebook': 'http://www.gutenberg.org/ebooks/', # URL
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'marcrel': 'http://id.loc.gov/vocabulary/relators',
    'mathml': 'http://www.w3.org/1998/Math/MathML',
    'mbp': 'http://mobipocket.com/mbp',
    'ncx': 'http://www.daisy.org/z3986/2005/ncx/',
    'opds': 'http://opds-spec.org/2010/Catalog',
    'opf': 'http://www.idpf.org/2007/opf',
    'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
    'pg': 'http://www.gutenberg.org/', # URL
    'pgagents': 'http://www.gutenberg.org/2009/agents/',
    'pgtei': 'http://www.gutenberg.org/tei/marcello/0.5/ns',
    'pgterms': 'http://www.gutenberg.org/2009/pgterms/',
    'py': 'http://genshi.edgewall.org/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'svg': 'http://www.w3.org/2000/svg',
    'tei': 'http://www.tei-c.org/ns/1.0',
    'xhtml': 'http://www.w3.org/1999/xhtml',
    'xinclude': 'http://www.w3.org/2001/XInclude',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'xmlns': 'http://www.w3.org/2000/xmlns/',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    'xslfo': 'http://www.w3.org/1999/XSL/Format',
}
65 |
66 |
class NameSpaceClark (object):
    """ Build a tag name in Clark notation.

    ns = NameSpaceClark ("http://example.com/")
    >>> ns.foo
    '{http://example.com/}foo'
    >>> ns['bar']
    '{http://example.com/}bar'

    """

    def __init__ (self, root):
        self.root = root

    def _clark (self, local):
        """ Wrap the namespace around the local name. """
        return "{%s}%s" % (self.root, local)

    # item and attribute access both yield the Clark-notation tag
    __getitem__ = _clark
    __getattr__ = _clark

    def __str__ (self):
        return self.root
89 |
90 |
class NameSpaceURI (object):
    """ Build a URI.

    ns = NameSpaceURI ("http://example.com/")
    >>> ns.foo
    'http://example.com/foo'
    >>> ns['bar']
    'http://example.com/bar'

    """

    def __init__ (self, root):
        self.root = root

    def _expand (self, local):
        """ Append the local name to the namespace root. """
        return "%s%s" % (self.root, local)

    # item and attribute access both yield the full URI
    __getitem__ = _expand
    __getattr__ = _expand

    def __str__ (self):
        return self.root
113 |
114 |
def build_nsmap (prefixes = None):
    """ Build a nsmap dict containing the namespaces for prefixes.

    prefixes may be None (use all known prefixes), a whitespace-
    separated string of prefixes, or a sequence of prefixes.
    Raises KeyError for a prefix not in NSMAP.

    """

    if prefixes is None:
        prefixes = NSMAP.keys ()
    if isinstance (prefixes, str):
        prefixes = prefixes.split ()

    # dict comprehension instead of the manual build-up loop
    return {prefix: NSMAP[prefix] for prefix in prefixes}
128 |
129 |
# Convenience registries: NS.xhtml['p'] gives the Clark-notation tag,
# NSURI.xhtml['p'] gives the plain URI, for every prefix in NSMAP.
NS = Struct ()
NSURI = Struct ()

for prefix, uri in NSMAP.items ():
    setattr (NS, prefix, NameSpaceClark (uri))
    setattr (NSURI, prefix, NameSpaceURI (uri))
136 |
# NOTE(review): the constants below are empty strings; the angle-
# bracketed markup they once contained (XML declaration, XHTML/NCX
# DOCTYPEs, and probably an email address in GENERATOR) appears to
# have been stripped in transit.  Restore from upstream before
# relying on them.
XML_DECLARATION = """"""

XHTML_DOCTYPE = ("")

XHTML1_DOCTYPE = ("")

XHTML_RDFa_DOCTYPE = ("")

NCX_DOCTYPE = ("")

GENERATOR = 'EpubMaker by Marcello Perathoner '
153 |
def xmlspecialchars (s):
    """ Escape the XML special characters &, < and > in s. """
    # Bug fix: the replacement targets had lost their entity text
    # (each char was being replaced by itself, a no-op).
    # '&' must be escaped first, or it would re-escape the others.
    return (s.replace (u'&', u'&amp;')
            .replace (u'<', u'&lt;')
            .replace (u'>', u'&gt;'))
158 |
def insert_breaks (s):
    """ Replace newlines with XHTML line-break elements (newline kept). """
    # NOTE(review): the replacement literal was mangled in transit,
    # leaving a bare newline inside the string (a syntax error);
    # restored as the obvious XHTML '<br />'.
    return s.replace (u'\n', u'<br />\n')
161 |
# any run of whitespace (incl. newlines and tabs)
RE_NORMALIZE = re.compile (r"\s+")

def normalize (s):
    """ Collapse each whitespace run in s to a single space and trim. """
    return RE_NORMALIZE.sub (' ', s).strip ()
167 |
168 |
def cut_at_newline (text):
    """ Cut the text at the first newline. """
    # partition returns the whole string unchanged when '\n' is absent
    head, _sep, _tail = text.partition ('\n')
    return head
175 |
def archive_dir (ebook):
    """ build 1/2/3/4/12345 for 12345 """
    ebook = str (ebook)
    # one path component per digit, with the last one replaced
    # by the full ebook number
    parts = list (ebook)
    parts[-1] = ebook
    return "/".join (parts)
184 |
def archive2files (ebook, path):
    """ Rewrite a 'dirs/<archive dir>' path into the 'files/<ebook>' layout. """
    return path.replace ('dirs/' + archive_dir (ebook), 'files/%d' % ebook)
188 |
189 |
def xpath (node, path, **kwargs):
    """ xpath helper: evaluate path on node with all NSMAP prefixes bound. """
    return node.xpath (path, namespaces = NSMAP, **kwargs)
193 |
194 |
def mkdir_for_filename (fn):
    """ Make sure the directory for this file is present. """

    dirname = os.path.dirname (fn)
    try:
        os.makedirs (dirname)
    except os.error:
        # best effort: directory may already exist
        pass
202 |
203 |
def make_url_relative (base_url, url):
    """ Make absolute url relative to base_url if possible. """

    # try the full base url first, then the base url's directory
    for base in (base_url, os.path.dirname (base_url) + '/'):
        if url.startswith (base):
            return url[len (base):]

    return url
216 |
217 |
def normalize_path (path):
    """ Normalize a file path by stripping a leading 'file://' scheme. """
    return path[7:] if path.startswith ('file://') else path
223 |
def is_same_path (path1, path2):
    """ Does path1 point to the same file as path2? """
    # Bug fix: this called normalize () -- the whitespace collapser --
    # instead of normalize_path (), which strips the 'file://' scheme.
    return (os.path.realpath (normalize_path (path1)) ==
            os.path.realpath (normalize_path (path2)))
227 |
228 |
def string_to_filename (fn):
    """ Sanitize string so it can do as filename. """

    def esc (match):
        """ Escape one forbidden char as '@' + hex codepoint. """
        return '@%x' % ord (match.group (0))

    # normalize path form and whitespace, then neuter separators
    fn = normalize (os.path.normpath (fn))
    fn = fn.replace (os.sep, '@')
    if os.altsep:
        fn = fn.replace (os.altsep, '@')
    return re.sub (u'[\|/:?"*<>\u0000-\u001F]', esc, fn)
244 |
245 |
class DCIMT (object):
    """ Encapsulates one dcterms internet mimetype. """

    def __init__ (self, mime, enc = None):
        # default to octet-stream; tack the charset onto text types
        if mime is None:
            mime = 'application/octet-stream'
        elif enc is not None and mime.startswith ('text/'):
            mime = "%s; charset=%s" % (mime, enc)
        self.mimetype = mime

    def __str__ (self):
        return self.mimetype
261 |
262 |
class UTC (datetime.tzinfo):
    """ UTC helper for datetime.datetime """

    # zero offset, shared by utcoffset () and dst ()
    _ZERO = datetime.timedelta (0)

    def utcoffset (self, dummy_dt):
        return self._ZERO

    def tzname (self, dummy_dt):
        return "UTC"

    def dst (self, dummy_dt):
        return self._ZERO
274 |
# exceptions

class SkipOutputFormat (Exception):
    """ Signals that generation of one output format should be skipped. """
    # NOTE(review): raisers/catchers are not visible in this file
    pass
279 |
# Spider.py tries a topological sort on link rel=next
def topological_sort (pairlist):
    """Topologically sort a list of (parent, child) pairs.

    Return a list of the elements in dependency order (parent to
    child order).

    Raises Exception (answer, num_parents, children) if the graph
    contains a cycle.

    """
    # (The old doctests called a nonexistent `topsort` and relied on
    # Python 2 dict ordering; replaced by the prose above.)
    num_parents = {} # element -> # of predecessors
    children = {}    # element -> list of successors
    for parent, child in pairlist:
        # Make sure every element is a key in num_parents.
        # (setdefault replaces the Python-2-only dict.has_key.)
        num_parents.setdefault (parent, 0)
        num_parents.setdefault (child, 0)

        # Since child has a parent, increment child's num_parents count.
        num_parents[child] += 1

        # ... and parent gains a child.
        children.setdefault (parent, []).append (child)

    # Suck up everything without a parent.
    answer = [x for x in num_parents.keys () if num_parents[x] == 0]

    # For everything in answer, knock down the parent count on its
    # children.  Note that answer grows *in* the loop.
    for parent in answer:
        del num_parents[parent]
        for child in children.get (parent, ()):
            num_parents[child] -= 1
            if num_parents[child] == 0:
                answer.append (child)
        # Following removal isn't needed; just makes
        # cycle error details easier to grasp.
        children.pop (parent, None)

    if num_parents:
        # Everything in num_parents has at least one child ->
        # there's a cycle.
        raise Exception (answer, num_parents, children)
    return answer
333 |
--------------------------------------------------------------------------------
/epubmaker/lib/Logger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 | Logger.py
6 |
7 | Copyright 2009 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Logging support.
12 |
13 |
14 | """
15 |
16 | import logging
17 | from logging import debug, info, warn, error, critical, exception
18 |
# Log line layout: timestamp, level, ebook number, message.
LOGFORMAT = '%(asctime)s %(levelname)-8s #%(ebook)-5d %(message)s'

ebook = 0 # module-global ebook number, stamped into records by CustomFormatter
22 |
class CustomFormatter (logging.Formatter):
    """ A custom formatter that adds ebook no. """

    def format (self, record):
        """ Add ebook no. to string format params. """
        # reads the module-global `ebook` so every record carries the
        # number of the book currently being processed
        record.ebook = ebook
        return logging.Formatter.format (self, record)
30 |
31 |
def setup (logformat, logfile = None):
    """ Attach a handler with our formatter to the root logger. """

    # log to the given file, else to stderr (StreamHandler default)
    if logfile:
        handler = logging.FileHandler (logfile)
    else:
        handler = logging.StreamHandler ()
    handler.setFormatter (CustomFormatter (logformat))

    root = logging.getLogger ()
    root.addHandler (handler)
    root.setLevel (logging.INFO)
40 |
41 |
def set_log_level (level):
    """ Set log level: 1 -> INFO, 2+ -> DEBUG, 0 -> leave unchanged. """
    if level >= 2:
        logging.getLogger ().setLevel (logging.DEBUG)
    elif level >= 1:
        logging.getLogger ().setLevel (logging.INFO)
48 |
49 |
50 | __all__ = 'debug info warn error critical exception'.split ()
51 |
--------------------------------------------------------------------------------
/epubmaker/lib/MediaTypes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 | MediaTypes.py
6 |
7 | Copyright 2009 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Media Types Lists
12 |
13 | """
14 |
15 | import mimetypes
16 |
mimetypes.init ()

# overrides
# Force the mappings this project depends on, overriding whatever the
# platform's mime database registered for these extensions.

mimetypes.types_map['.htm'] = 'application/xhtml+xml'
mimetypes.types_map['.html'] = 'application/xhtml+xml'
mimetypes.types_map['.xhtml'] = 'application/xhtml+xml'
mimetypes.types_map['.mobile'] = 'application/xhtml+xml'
mimetypes.types_map['.ncx'] = 'application/x-dtbncx+xml'
mimetypes.types_map['.pt'] = 'application/vnd.adobe-page-template+xml'
mimetypes.types_map['.epub'] = 'application/epub+zip'
mimetypes.types_map['.mobi'] = 'application/x-mobipocket-ebook'
mimetypes.types_map['.pdf'] = 'application/pdf'
mimetypes.types_map['.plucker'] = 'application/prs.plucker'
mimetypes.types_map['.qioo'] = 'application/x-qioo-ebook'
mimetypes.types_map['.jar'] = 'application/java-archive'
mimetypes.types_map['.rss'] = 'application/rss+xml'
mimetypes.types_map['.atom'] = 'application/atom+xml'
mimetypes.types_map['.opds'] = 'application/atom+xml'
mimetypes.types_map['.stanza'] = 'application/atom+xml'
mimetypes.types_map['.wap'] = 'application/vnd.wap.xhtml+xml'
mimetypes.types_map['.json'] = 'application/x-suggestions+json'
mimetypes.types_map['.rst'] = 'text/x-rst'
mimetypes.types_map['.png'] = 'image/png' # Windows XP thinks this is image/x-png
mimetypes.types_map['.jpg'] = 'image/jpeg' # Windows XP thinks this is image/pjpeg
mimetypes.types_map['.jpeg'] = 'image/jpeg' # Windows XP thinks this is image/pjpeg
mimetypes.types_map['.jfif'] = 'image/jpeg'
mimetypes.types_map['.mscz'] = 'application/x-musescore+xml'
mimetypes.types_map['.mid'] = 'audio/midi'
mimetypes.types_map['.midi'] = 'audio/midi'
mimetypes.types_map['.mus'] = 'application/x-myriad-music'
mimetypes.types_map['.sib'] = 'application/x-sibelius-score'
mimetypes.types_map['.mxl'] = 'application/vnd.recordare.musicxml'
mimetypes.types_map['.mp3'] = 'audio/mpeg'
51 |
52 |
# Media types the spider/parsers treat as text documents.
# (Set literals instead of set ((...)) -- supported since Python 2.7.)
TEXT_MEDIATYPES = {
    'application/xhtml+xml',
    'application/xml',
    'text/html',
    'text/plain',
}

# Media types treated as images.
IMAGE_MEDIATYPES = {
    'image/gif',
    'image/jpeg',
    'image/png',
}

# Auxiliary resources (pulled in but neither text nor image).
AUX_MEDIATYPES = {
    'text/css',
}
69 |
class MediatypesLookup (object):
    """ Quick mediatype lookup by file extension.

    Raises KeyError for an unknown extension.

    ns = MediatypesLookup ()
    >>> ns.epub
    'application/epub+zip'
    >>> ns['mobi']
    'application/x-mobipocket-ebook'

    """
    # (the old doctest claimed ns.epub == 'application/atom+xml',
    # contradicting the '.epub' override registered above)

    def __getitem__ (self, local):
        return mimetypes.types_map['.' + local]

    def __getattr__ (self, local):
        return mimetypes.types_map['.' + local]

# shared lookup instance
mediatypes = MediatypesLookup ()
88 |
89 |
--------------------------------------------------------------------------------
/epubmaker/lib/__init__.py:
--------------------------------------------------------------------------------
""" This is a package. """

# NOTE(review): DummyConnectionPool and the GutenbergDatabase* modules
# are listed here but not present in this source tree -- presumably
# provided separately; verify before importing them.
__all__ = ['DublinCore', 'DummyConnectionPool',
           'GutenbergDatabaseDublinCore', 'GutenbergDatabase',
           'GutenbergGlobals', 'Logger', 'MediaTypes']
6 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/__init__.py:
--------------------------------------------------------------------------------
1 | broken = 'images/broken.png'
2 |
3 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | Module parsers
7 |
8 | Copyright 2010-2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Customized Project Gutenberg directives for RST parser.
13 |
14 | """
15 |
16 | from docutils import statemachine
17 | from docutils.parsers.rst import Directive, directives
18 |
19 | from epubmaker.mydocutils import parsers
20 |
21 | from epubmaker.mydocutils.gutenberg import transforms as gutenberg_transforms
22 |
23 | from epubmaker.lib.Logger import error, info, debug, warn
24 |
25 | # pylint: disable=W0142, W0102
26 |
27 |
class PGHeaderFooter (Directive):
    """ Inserts PG header or footer.

    Subclasses set `resource` to the RST file to splice in.

    """

    required_arguments = 0
    optional_arguments = 0

    def run (self):
        settings = self.state.document.settings
        # load the boilerplate resource and feed its lines back into
        # the parser's input stream
        text = settings.get_resource (
            'mydocutils.gutenberg.parsers', self.resource).decode ('utf-8')
        lines = statemachine.string2lines (
            text, settings.tab_width, convert_whitespace = 1)
        self.state_machine.insert_input (lines, '')
        return []
42 |
43 |
class PGHeader (PGHeaderFooter):
    """ Inserts PG header. """
    # resource file spliced in by PGHeaderFooter.run ()
    resource = 'pg-header.rst'
47 |
48 |
class PGFooter (PGHeaderFooter):
    """ Inserts PG footer. """
    # resource file spliced in by PGHeaderFooter.run ()
    resource = 'pg-footer.rst'
52 |
53 |
class Parser (parsers.Parser):
    """ Parser with PG custom directives. """

    def __init__ (self):
        parsers.Parser.__init__ (self)

        # make the PG boilerplate directives available to documents
        for name, directive in (('pgheader', PGHeader),
                                ('pgfooter', PGFooter)):
            directives.register_directive (name, directive)


    def get_transforms (self):
        """ Add the PG variable-substitution transforms. """
        return parsers.Parser.get_transforms (self) + [
            gutenberg_transforms.VariablesTransform,
            gutenberg_transforms.SubRefToVarTransform]
68 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/parsers/pg-header.rst:
--------------------------------------------------------------------------------
1 | .. -*- encoding: utf-8 -*-
2 |
3 | .. |pg.copyrighted-header| replace::
4 |
5 | This is a *copyrighted* Project Gutenberg eBook, details
6 | below. Please follow the copyright guidelines in this file.
7 |
8 | .. _pg-header:
9 |
10 | .. container:: noindent pgheader language-en
11 |
12 | This eBook is for the use of anyone anywhere at no cost and with
13 | almost no restrictions whatsoever. You may copy it, give it away or
14 | re-use it under the terms of the `Project Gutenberg License`_
15 | included with this eBook or online at
16 | http://www.gutenberg.org/license.
17 |
18 | |pg.copyrighted-header|
19 |
20 | .. vspace:: 2
21 |
22 | .. _pg-machine-header:
23 |
24 | .. container:: noindent white-space-pre-line
25 |
26 | |pg.machine-header|
27 |
28 | .. vspace:: 2
29 |
30 | .. _pg-start-line:
31 |
32 | \*\*\* START OF THIS PROJECT GUTENBERG EBOOK |pg.upcase-title| \*\*\*
33 |
34 | .. vspace:: 4
35 |
36 | .. _pg-produced-by:
37 |
38 | |pg.produced-by|
39 |
40 | .. vspace:: 1
41 |
42 | |pg.credits|
43 |
44 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | gutenberg.py
7 |
8 | Copyright 2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Transforms for the Project Gutenberg flavor.
13 |
14 | """
15 |
16 | import datetime
17 | import textwrap
18 |
19 | from docutils import nodes
20 | import docutils.transforms
21 | import docutils.transforms.parts
22 |
23 | from epubmaker.lib.Logger import error, info, debug, warn
24 | from epubmaker.lib.DublinCore import DublinCore
25 | from epubmaker.mydocutils import nodes as mynodes
26 |
27 | # pylint: disable=W0142
28 |
class SubRefToVarTransform (docutils.transforms.Transform):
    """
    Transforms subref nodes in 'pg' namespace into var nodes.

    We need to save some subrefs for later processing. The standard
    subref processing happens too early (ie. before docinfo is
    collected). So we transform subrefs into variables, await docinfo
    to be processed, and then process the variables.

    """

    default_priority = 219
    """ Before substitution def variables. """


    def apply (self):
        # turn every |pg.*| substitution reference into a variable node
        for subref in self.document.traverse (nodes.substitution_reference):
            name = subref['refname']
            if name.startswith ('pg.'):
                node = mynodes.variable ()
                node['name'] = name
                subref.replace_self (node)
51 |
52 |
class VariablesTransform (docutils.transforms.Transform):
    """ Replaces mynodes.variable nodes with parameters from metadata. """

    default_priority = 342
    """ After DocInfoCollector. """

    def apply(self):
        doc = self.document
        meta = doc.meta_block
        defs = doc.substitution_defs

        def getone (name, default = None):
            """ Get first value. """
            if name in meta:
                return meta[name][0]
            return default

        def getmany (name, default = None):
            """ Get list of all values. """
            # None sentinel instead of a mutable [] default argument
            return meta.get (name, default if default is not None else [])

        def sub (var, nodes):
            """ Replace the variable node by a list of nodes. """
            var.replace_self (nodes)

        title = getone ('DC.Title', 'No Title')
        short_title = getone ('PG.Title', title)
        short_title = short_title.split ('\n', 1)[0]

        language = getmany ('DC.Language', ['en'])
        # list comprehension instead of map (): same on Python 2,
        # and still a real list (not a lazy iterator) on Python 3
        language = [DublinCore.language_map.get (x, 'Unknown').title ()
                    for x in language]
        language = DublinCore.strunk (language)

        copyrighted = getone ('PG.Rights', '').lower () == 'copyrighted'

        for variable in doc.traverse (mynodes.variable):
            name = variable['name']

            if name == 'pg.upcase-title':
                sub (variable, [ nodes.inline ('', short_title.upper ()) ])

            elif name == 'pg.produced-by':
                producers = getmany ('PG.Producer')
                if producers:
                    sub (variable, [ nodes.inline ('', u'Produced by %s.' %
                                                   DublinCore.strunk (producers)) ])
                else:
                    sub (variable, [])

            elif name == 'pg.credits':
                sub (variable, [ nodes.inline ('', getone ('PG.Credits', '')) ])

            elif name == 'pg.bibrec-url':
                url = 'http://www.gutenberg.org/ebooks/%s' % getone ('PG.Id', '999999')
                sub (variable, [ nodes.reference ('', '', nodes.inline ('', url), refuri = url) ])

            elif name in ('pg.copyrighted-header', 'pg.copyrighted-footer'):
                # insert the copyright blurb only for copyrighted books
                if copyrighted:
                    subdef_copy = defs[name].deepcopy ()
                    sub (variable, subdef_copy.children)
                else:
                    sub (variable, [])

            elif name == 'pg.machine-header':
                # build the plain-text PG "machine header" block
                tw = textwrap.TextWrapper (
                    width = 72,
                    initial_indent = u'Title: ',
                    subsequent_indent = u' ' * 7)

                if '\n' in title:
                    maintitle, subtitle = title.split ('\n', 1)
                    s = tw.fill (maintitle)
                    s += '\n'
                    tw.initial_indent = tw.subsequent_indent
                    s += tw.fill (subtitle)
                else:
                    s = tw.fill (title)
                s += '\n\n'

                tw.initial_indent = u'Author: '
                tw.subsequent_indent = u' ' * 8
                s += tw.fill (DublinCore.strunk (getmany ('DC.Creator', ['Unknown'])))
                s += '\n\n'

                date = getone ('PG.Released', '')
                try:
                    date = datetime.datetime.strptime (date, '%Y-%m-%d')
                    date = datetime.datetime.strftime (date, '%B %d, %Y')
                except ValueError:
                    date = 'unknown date'
                s += u'Release Date: %s [EBook #%s]\n' % (date, getone ('PG.Id', '999999'))

                for item in getmany ('PG.Reposted', []):
                    # each entry is 'YYYY-MM-DD [optional comment]'
                    try:
                        date, comment = item.split (None, 1)
                    except ValueError:
                        date = item
                        comment = None
                    try:
                        date = datetime.datetime.strptime (date, '%Y-%m-%d')
                        date = datetime.datetime.strftime (date, '%B %d, %Y')
                    except ValueError:
                        date = 'unknown date'

                    s += u'Reposted: %s' % date
                    if comment:
                        s += u' [%s]' % comment
                    s += '\n'

                s += u'\nLanguage: %s\n\n' % language
                s += u'Character set encoding: %s' % doc.settings.encoding.upper ()

                sub (variable, [ nodes.inline ('', nodes.Text (s)) ])
166 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/gutenberg/writers/nroff.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # $Id: manpage.py 6270 2010-03-18 22:32:09Z milde $
3 | # Author: Engelbert Gruber
4 | # Copyright: This module is put into the public domain.
5 | # Rewritten almost completely
6 | # by Marcello Perathoner
7 |
8 | """
9 |
10 | Nroff writer for reStructuredText. Tweaked for Project Gutenberg usage.
11 |
12 | """
13 |
14 | __docformat__ = 'reStructuredText'
15 |
16 | from epubmaker.mydocutils.writers import nroff
17 | from epubmaker import Unitame
18 |
19 | from epubmaker.lib.Logger import info, debug, warn, error
20 |
# nroff source emitted before the document body; {encoding} and
# {device} are filled in by Translator.preamble ().
GUTENBERG_NROFF_PREAMBLE = r""".\" -*- mode: nroff -*- coding: {encoding} -*-
.\" This file produces Project Gutenberg plain text. Usage:
.\" $ groff -t -K {device} -T {device} this_file > output.txt
.
.pl 100000 \" very tall page: disable pagebreaks
.ll 72m
.po 0
.ad l \" text-align: left
.nh \" hyphenation: off
.cflags 0 .?! \" single sentence space
.cflags 0 -\[hy]\[em] \" don't break on -
.
.de nop
..
.blm nop \" do nothing on empty line
.
.nr [env_cnt] 0
.ev 0 \" start in a defined environment
.
.de push_env
.br
.nr last_env \\n[.ev] \" save current environment name
.nr env_cnt +1 \" generate new environment name
.ev \\n[env_cnt]
.evc \\n[last_env]
..
.de pop_env
.br
.ev
.nr env_cnt -1
..
.
"""

# nroff source emitted after the document body.
GUTENBERG_NROFF_POSTAMBLE = r""".
.pl 0 \" ends very long page here
.\" End of File
"""
59 |
class Writer (nroff.Writer):
    """ A plaintext writer thru nroff. """

    supported = ('pg-nroff',)
    """Formats this writer supports."""

    def __init__ (self):
        nroff.Writer.__init__ (self)
        self.translator_class = Translator

    def translate (self):
        """ Walk the document and collect the nroff output. """
        translator = self.translator_class (self.document)
        # reset the module-global list of chars unitame could not map
        del Unitame.unhandled_chars[:]
        self.document.walkabout (translator)
        self.output = translator.astext ()
        unhandled = set (Unitame.unhandled_chars)
        if unhandled:
            error ("unitame: unhandled chars: %s" % u", ".join (unhandled))

    #def get_transforms (self):
    #    tfs = writers.Writer.get_transforms (self)
    #    return tfs + [parts.CharsetTransform]
81 |
82 |
83 |
class Translator (nroff.Translator):
    """ nroff translator, specialized for PG plain text conventions. """

    def preamble (self):
        """ Inserts nroff preamble. """
        return GUTENBERG_NROFF_PREAMBLE.format (
            encoding = self.encoding, device = self.device)


    def postamble (self):
        """ Inserts nroff postamble. """
        return GUTENBERG_NROFF_POSTAMBLE.format (
            encoding = self.encoding, device = self.device)


    def init_translate_maps (self):
        """ Add PG-specific character mappings to the base translate maps. """
        nroff.Translator.init_translate_maps (self)

        # 0x0011/0x0012 are placeholder codepoints produced by the
        # unitame encoding step; map them to groff nbsp and soft hyphen
        update = {
            0x0011: ur"\~", # nbsp, see: Unitame.py
            0x0012: ur"\%", # shy, see: Unitame.py
        }

        self.translate_map.update (update)
        self.translate_map_literal.update (update)


    def register_classes (self):
        """ Register classes.

        Use the idiosyncratic PG convention of marking up italics etc.

        """

        #
        # This does not call the base class !!!
        #

        self.register_class ('simple', 'left', '.ad l', '')
        self.register_class ('simple', 'right', '.ad r', '')
        self.register_class ('simple', 'center', '.ad c', '')

        # PG plain text marks italics as _..._ and bold as *...*
        self.register_class ('inline', 'italics', '_', '_')
        self.register_class ('inline', 'bold', '*', '*')

        self.register_class ('inline', 'monospaced', '', '')
        self.register_class ('inline', 'superscript', '', '')
        self.register_class ('inline', 'subscript', '', '')

        self.register_class ('inline', 'small-caps', '_', '_')
        self.register_class ('inline', 'gesperrt', '_', '_')
        self.register_class ('inline', 'antiqua', '_', '_')
        self.register_class ('inline', 'larger', '', '')
        self.register_class ('inline', 'smaller', '', '')


    def translate (self, text):
        """ Reduce the charset while keeping text a unicode string. """

        # NOTE: there's an alternate approach in
        # transforms.parts.CharsetTransform

        if self.encoding != 'utf-8':
            # round-trip through the target charset; the 'unitame'
            # error handler substitutes unmappable characters
            text = text.encode (self.encoding, 'unitame')
            text = text.decode (self.encoding)

        if self.in_literal:
            text = text.translate (self.translate_map_literal)
        else:
            text = text.translate (self.translate_map)

        return text


    def visit_inner (self, node):
        """ Try to remove duplicated PG highlight markers. """
        # adjacent highlighted spans would otherwise render e.g. '__'
        if node.type == 'inline':
            prefixes = self.get_prefix (node.type, node['classes'])
            for prefix in prefixes:
                if prefix == self.last_output_char:
                    self.backspace ()
                else:
                    self.text (prefix)
        else:
            nroff.Translator.visit_inner (self, node)


    def visit_inline (self, node):
        """ Emit a right-aligned tab stop for TOC page references. """
        if 'toc-pageref' in node['classes']:
            maxlen = 3 # sensible default
            # walk up the tree for a pageno_maxlen setting
            while node.parent:
                node = node.parent
                if 'pageno_maxlen' in node:
                    maxlen = node['pageno_maxlen']
                    break
            # NOTE(review): `node` is rebound by the walk above, so the
            # base-class call below receives an ancestor instead of the
            # original inline node -- looks unintended; verify.
            self.cmd (('linetabs 1',
                       r'ta (\n[.l]u - \n[.i]u - %dm) +%dmR' % (maxlen + 1, maxlen + 1),
                       r'lc .'))
            self.text (chr (1) + '\t')
        nroff.Translator.visit_inline (self, node)

    def visit_section_title (self, node):
        """ Implements PG-standard spacing before headers. """
        # deeper sections get less space, but never fewer than 2 lines
        self.sp (max (2, 5 - self.section_level))

    def visit_figure (self, node):
        """ Blank line and new environment around figures. """
        self.sp (1)
        self.push ()

    def depart_figure (self, node):
        self.pop ()
        self.sp (1)

    def visit_image (self, node):
        # ignore alt attribute except for dropcaps
        if 'dropcap' in node['classes']:
            self.text (node.attributes.get ('alt', ''))

    def visit_page (self, node):
        """ Approximate page breaks with vertical space in plain text. """
        if 'clearpage' in node['classes']:
            self.sp (4)
        elif 'cleardoublepage' in node['classes']:
            self.sp (4)
        else:
            nroff.Translator.visit_page (self, node)
209 |
210 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/nodes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | nodes.py
7 |
8 | Copyright 2011 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Added nodes for PG.
13 |
14 | """
15 |
16 | from docutils import nodes
17 |
class page (nodes.Element, nodes.Special):
    """ Hold pagination commands.

    Like clearpage, vspace etc.

    """
    # rendered by the writers' visit_page methods
24 |
class newline (nodes.Element):
    """ A line break.

    Outputs a hard line break if the node or one of its parents belong
    to the class 'white-space-pre-line'. Else a space.

    """
    # marker node only; the behavior above is implemented in the writers
32 |
class footnote_group (nodes.container):
    """ Hold a group of footnotes (container marker node). """
35 |
36 |
class variable (nodes.Inline, nodes.TextElement):
    """ A placeholder that gets substituted with actual text before output.

    We do not use substitution refs because they are resolved way too
    early in the transformation stage to be of much use to us.

    """
    # carries the substitution name in node['name']
44 |
45 |
class node_selector (object):
    """ Allows CSS-like selectors as condition function for nodes.traverse ().

    Selector grammar:
        [element][.class[.class[...]]][, selector[, selector]]

    """

    def __init__ (self, selector):
        self.matches = [] # list of (node class, set of class names)

        for part in selector.split (','):
            part = part.strip ()
            # split off the element name; everything after the first
            # '.' is a dot-separated list of class names
            element, dot, classnames = part.partition ('.')
            klass = getattr (nodes, element, nodes.Element)
            classset = set (classnames.split ('.')) if classnames else set ()
            self.matches.append ((klass, classset))


    def __call__ (self, node):
        """ returns True if the node matches the selector. """
        return any (isinstance (node, klass) and classset.issubset (node['classes'])
                    for klass, classset in self.matches)
72 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/parsers/default_style.rst:
--------------------------------------------------------------------------------
1 | .. this is the default PG-RST stylesheet
2 |
3 | .. style:: emphasis
4 | :class: italics
5 |
6 | .. style:: strong
7 | :class: bold
8 |
9 | .. style:: title_reference
10 | :class: italics
11 |
12 | .. style:: option_argument
13 | :class: italics
14 |
15 | .. style:: literal
16 | :class: monospaced
17 |
18 | .. style:: subscript
19 | :class: subscript
20 |
21 | .. style:: superscript
22 | :class: superscript
23 |
24 | .. style:: title.document-title
25 | :class: x-large center
26 | :titlehack:
27 |
28 | .. style:: title.topic-title
29 | :class: centerleft
30 |
31 | .. style:: title.table-title
32 | :class: centerleft larger
33 |
34 | .. figure and image styles for non-image formats
35 |
36 | .. style:: figure
37 | :class: margin
38 |
39 | .. style:: figure
40 | :formats: txt.* *.noimages
41 | :align: center
42 | :width: 80%
43 |
44 | .. style:: image
45 | :formats: *.noimages
46 |
47 | .. container:: center image margin
48 |
49 | [image]
50 |
51 |
52 | .. style:: image
53 | :formats: txt.*
54 | :display: none
55 |
56 | .. style:: caption.figure-caption
57 | :formats: -txt.*
58 | :class: centerleft italics margin
59 |
60 | .. style:: caption.figure-caption
61 | :formats: txt.*
62 | :class: margin
63 | :before: '[Illustration: '
64 | :after: ']'
65 |
66 | .. style:: legend
67 | :class: margin
68 |
69 |
70 | .. default transition
71 |
72 | .. style:: transition
73 |
74 | .. container:: center transition margin
75 |
76 | ――――
77 |
78 | .. default attribution
79 |
80 | .. style:: attribution
81 | :class: margin
82 | :before: '―― '
83 |
84 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | Mydocutils writer package.
7 |
8 | Copyright 2010-2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | """
13 |
14 |
# Docstrings in this module are written in reStructuredText markup.
__docformat__ = 'reStructuredText'
16 |
17 | import collections
18 | import operator
19 |
20 | from docutils import nodes, writers
21 | import roman
22 |
23 |
class Writer (writers.Writer):
    """ A base class for writers. """

    output = None
    """Final translated form of `document`."""

    config_section_dependencies = ('writers', )

    def translate (self):
        """ Walk the document tree with the translator and keep its text. """
        translator = self.translator_class (self.document)
        self.document.walkabout (translator)
        self.output = translator.astext ()
36 |
37 |
class TablePass1 (nodes.SparseNodeVisitor):

    """
    Make a first pass over a table to get a reliable row and column
    count. Insert placeholder cells for spanned cells.
    """

    def __init__ (self, document):
        nodes.SparseNodeVisitor.__init__ (self, document)

        self.row = -1 # 0-based
        self.column = 0 # 0-based
        self.cells = 0 # total grid positions seen, spans counted per position
        self.colspecs = None # colspec nodes of the table being visited

    def visit_table (self, table):
        # Compute each column's width as a fraction of the total width.
        self.colspecs = table.traverse (nodes.colspec)
        width = sum (map (operator.itemgetter ('colwidth'), self.colspecs))
        for colspec in self.colspecs:
            colspec['relative_width'] = float (colspec['colwidth']) / width

    def depart_table (self, table):
        # Annotate the table node with the counts gathered in this pass.
        table['rows'] = self.rows ()
        table['columns'] = self.cols ()

    def visit_row (self, dummy_node):
        self.row += 1
        self.column = 0
        # A new row begins: age the pending row-span counters by one row.
        for colspec in self.colspecs:
            colspec['spanned'] = max (0, colspec.get ('spanned', 0) - 1)

    def visit_entry (self, node):
        """ Table cell. """

        morerows = node.get ('morerows', 0)
        morecols = node.get ('morecols', 0)

        # Count every grid position this cell occupies.
        self.cells += (morecols + 1) * (morerows + 1)

        # skip columns that are row-spanned by preceding entries
        while True:
            colspec = self.colspecs [self.column]
            if colspec.get ('spanned', 0) > 0:
                # Insert an empty placeholder entry before this one so that
                # later passes see a cell at every grid position.
                placeholder = nodes.entry ()
                placeholder.type = 'compound'
                placeholder['column'] = self.column
                placeholder.colspecs = self.colspecs[self.column:self.column + 1]
                placeholder['vspan'] = True
                node.replace_self ([placeholder, node])
                self.column += 1
            else:
                break

        # mark columns we row-span
        # 'spanned' gets decremented once per row in visit_row, hence + 1.
        if morerows:
            for colspec in self.colspecs [self.column : self.column + 1 + morecols]:
                colspec['spanned'] = morerows + 1

        node['row'] = self.row
        node['column'] = self.column

        # Attach the colspecs this cell spans horizontally.
        node.colspecs = self.colspecs[self.column:self.column + morecols + 1]

        self.column += 1 + morecols

        # The entry's contents are irrelevant for counting.
        raise nodes.SkipNode

    def rows (self):
        """ Return the no. of rows. """
        return self.row + 1

    def cols (self):
        """ Return the no. of columns. """
        # NOTE(review): integer division under Python 2; this would yield a
        # float under Python 3 -- confirm before porting.
        return self.cells / self.rows ()
112 |
113 |
class ListEnumerator:
    """ Enumerate labels according to list type. """

    def __init__ (self, node, encoding):
        self.type = node.get ('enumtype') or node.get ('bullet') or '*'
        self.start = node['start'] if 'start' in node else 1
        self.prefix = node.get ('prefix', '')
        self.suffix = node.get ('suffix', '')
        self.encoding = encoding

        # label width: prefix + suffix + one separating space + counter
        if self.type == 'arabic':
            # indentation depends on the largest label in the list
            counter_width = len (str (self.start + len (node.children)))
        elif self.type.endswith ('alpha'):
            counter_width = 1
        elif self.type.endswith ('roman'):
            counter_width = 5 # FIXME: calculate real length
        else:
            counter_width = 1 # none, bullets, etc.
        self.indent = len (self.prefix + self.suffix) + 1 + counter_width

    def get_next (self):
        """ Get the next label and advance the counter. """

        t = self.type
        n = self.start

        if t == 'none':
            label = ''
        elif t == '*':
            label = u'•' if self.encoding == 'utf-8' else '-'
        elif t == '-':
            label = u'-'
        elif t == '+':
            label = u'+'
        elif t == 'loweralpha':
            label = "%c" % (n + ord ('a') - 1)
        elif t == 'upperalpha':
            label = "%c" % (n + ord ('A') - 1)
        elif t == 'upperroman':
            label = roman.toRoman (n).upper ()
        elif t == 'lowerroman':
            label = roman.toRoman (n).lower ()
        else:
            label = "%d" % n # 'arabic' and unknown types

        self.start = n + 1

        return self.prefix + label + self.suffix

    def get_width (self):
        """ Get indent width for this list. """

        return self.indent
167 |
168 |
class Translator (nodes.NodeVisitor):
    """ A base translator for the mydocutils writers.

    Provides the machinery shared by the concrete translators:
    class-triggered prefix/suffix output, extra visit/depart hooks,
    vertical-space collapsing, and dispatching of the ambiguous
    `title` and `subtitle` elements to parent-specific handlers.

    """

    admonitions = """
    attention caution danger error hint important note tip warning
    """.split ()

    docinfo_elements = """
    address author contact copyright date organization revision status
    version
    """.split ()

    # see http://docutils.sourceforge.net/docs/ref/doctree.html#simple-body-elements

    def __init__ (self, document):
        nodes.NodeVisitor.__init__ (self, document)
        self.settings = document.settings

        self.body = []
        self.context = self.body # start with context == body
        self.docinfo = collections.defaultdict (list)
        self.list_enumerator_stack = []
        self.section_level = 0
        self.vspace = 0 # pending space (need this for collapsing)
        self.src_vspace = 0 # pending space for source pretty printing

        self.field_name = None
        self.compacting = 0 # > 0 if we are inside a compacting list
        self.in_literal = 0 # > 0 if we are inside one or more literal blocks

        self.prefixes = collections.defaultdict (list) # dict of arrays of prefixes in order in
                                                       # which to apply classes
        self.suffixes = collections.defaultdict (list) # reverse order of above

        self.environments = [] # stack of \begin'ed environments

        self.register_classes ()

        # Generate visit_/depart_ methods for docinfo elements and
        # admonitions.  Bug fix: the loop variable must be bound as a
        # lambda default argument -- a plain closure late-binds it, so
        # every generated method would have used the *last* loop value
        # ('version' resp. 'warning').
        for name in self.docinfo_elements:
            setattr (self, 'visit_' + name,
                     lambda node, name = name: self.visit_field_body (node, name))
            setattr (self, 'depart_' + name, self.depart_field_body)

        for adm in self.admonitions:
            setattr (self, 'visit_' + adm,
                     lambda node, adm = adm: self.visit_admonition (node, adm))
            setattr (self, 'depart_' + adm, self.depart_admonition)


    def register_classes (self):
        """ Hook for subclasses to register class prefixes/suffixes. """
        pass


    def dispatch_visit (self, node):
        """
        Call self."``visit_`` + node class name" with `node` as
        parameter. If the ``visit_...`` method does not exist, call
        self.unknown_visit.

        There are 3 hooks for every visit:

        visit_outer
        visit_
        visit_inner

        """

        self.visit_outer (node)

        node_name = node.__class__.__name__
        method = getattr (self, 'visit_' + node_name, self.unknown_visit)
        self.document.reporter.debug (
            'docutils.nodes.NodeVisitor.dispatch_visit calling %s for %s'
            % (method.__name__, node_name))
        res = method (node)

        # node.type is an extension attribute -- presumably added by the
        # mydocutils node classes; TODO confirm.
        if node.type in ('compound', 'simple', 'inline'):
            self.visit_inner (node)

        return res

    def dispatch_departure (self, node):
        """
        Call self."``depart_`` + node class name" with `node` as
        parameter. If the ``depart_...`` method does not exist, call
        self.unknown_departure.

        There are 3 hooks for every departure:

        depart_inner
        depart_
        depart_outer

        """

        if node.type in ('compound', 'simple', 'inline'):
            self.depart_inner (node)

        node_name = node.__class__.__name__
        method = getattr (self, 'depart_' + node_name, self.unknown_departure)
        self.document.reporter.debug (
            'docutils.nodes.NodeVisitor.dispatch_departure calling %s for %s'
            % (method.__name__, node_name))
        res = method (node)

        self.depart_outer (node)

        return res


    def unknown_visit (self, node):
        """ Called if we have no handler for this element. """
        pass

    def unknown_departure (self, node):
        """ Called if we have no handler for this element. """
        pass


    def visit_outer (self, node):
        """ The very first hook called on a node, before
            ``visit_``. """
        pass

    def visit_inner (self, node):
        """ Called after ``visit_``. """
        pass

    def depart_inner (self, node):
        """ Called on a block before ``depart_``. """
        pass

    def depart_outer (self, node):
        """ The very last hook called on a node, after
            ``depart_``."""
        pass


    def register_class (self, types, class_, prefix, suffix):
        """ Register classes.

        A mechanism to automatically output strings before and after
        elements with specific classes. For most use cases this is
        easier than to write a handler for the element.

        types: types of node this class will apply to:
               tuple of one or more of (text, inline, simple, compound)
        class_: class that triggers the strings
        prefix: string output before element
        suffix: string output after element

        """

        if isinstance (types, basestring):
            types = types.split ()

        for t in types:
            self.prefixes[t].append ( (class_, prefix))
            self.suffixes[t].insert (0, (class_, suffix))

    def get_prefix (self, type_, classes):
        """ Get the prefixes registered for any of `classes`, in order. """
        return self._get_prefix (type_, classes, self.prefixes)

    def get_suffix (self, type_, classes):
        """ Get the suffixes registered for any of `classes`, in reverse order. """
        return self._get_prefix (type_, classes, self.suffixes)

    def _get_prefix (self, type_, classes, array):
        """ Helper for inline handlers. """
        if isinstance (classes, basestring):
            classes = classes.split ()

        res = []
        for s in array[type_]:
            if s[0] in classes:
                res.append (s[1])
        return res


    def set_class_on_child (self, node, class_, index = 0):
        """
        Set class `class_` on the visible child no. index of `node`.
        Do nothing if node has fewer children than `index`.
        """
        children = [n for n in node if not isinstance (n, nodes.Invisible)]
        try:
            child = children[index]
        except IndexError:
            return
        child['classes'].append (class_)

    def set_first_last (self, node):
        """ Set class 'first' on first child, 'last' on last child. """
        self.set_class_on_child (node, 'first', 0)
        self.set_class_on_child (node, 'last', -1)

    def astext (self):
        """ Return the final formatted document as a string. """
        return self.preamble () + ''.join (self.context) + self.postamble ()

    def comment (self, text):
        """ Output a comment. """
        pass

    def text (self, text):
        """ Output text. """
        pass

    def sp (self, n = 1):
        """ Adds vertical space before the next simple element.

        All spaces added collapse into the largest one.
        n == 0 requests an effectively unlimited space. """

        if n == 0:
            self.vspace = 1999
        else:
            self.vspace = max (n, self.vspace)

    def src_sp (self, n = 1):
        """ Add vertical space to the source.

        Same collapsing rules as `sp`. """

        if n == 0:
            self.src_vspace = 1999
        else:
            self.src_vspace = max (n, self.src_vspace)

    def output_sp (self):
        """ Hook: emit the collapsed vertical space. """
        pass

    def output_src_sp (self):
        """ Hook: emit the collapsed source vertical space. """
        pass

    def push (self):
        """ Push environment. """
        pass

    def pop (self):
        """ Pop environment. """
        pass

    def br_if_line_longer_than (self, length):
        """ Go one line up if the last line was shorter than length.

        Use this to compact lists etc. """
        pass

    def indent (self, by = 2):
        """ Indent text. """
        pass

    def rindent (self, by = 2):
        """ Indent text on the right side. """
        pass

    def preamble (self):
        """ Text to output before the document body. """
        return ''

    def postamble (self):
        """ Text to output after the document body. """
        return ''

    def visit_title (self, node):
        """ Switch on the various incarnations the title element can have. """

        if isinstance (node.parent, nodes.section):
            self.visit_section_title (node)
        elif isinstance (node.parent, nodes.document):
            self.visit_document_title (node)
        elif isinstance (node.parent, nodes.table):
            self.visit_table_title (node)
        elif isinstance (node.parent, nodes.topic):
            self.visit_topic_title (node)
        elif isinstance (node.parent, nodes.sidebar):
            self.visit_sidebar_title (node)
        elif isinstance (node.parent, nodes.admonition):
            self.visit_admonition_title (node)
        else:
            # Bug fix: was `assert ("Can't happen.")`, which asserts a
            # truthy string and never fires.
            assert False, "Can't happen."

    def depart_title (self, node):
        """ Switch on the various incarnations the title element can have. """

        if isinstance (node.parent, nodes.section):
            self.depart_section_title (node)
        elif isinstance (node.parent, nodes.document):
            self.depart_document_title (node)
        elif isinstance (node.parent, nodes.table):
            self.depart_table_title (node)
        elif isinstance (node.parent, nodes.topic):
            self.depart_topic_title (node)
        elif isinstance (node.parent, nodes.sidebar):
            self.depart_sidebar_title (node)
        elif isinstance (node.parent, nodes.admonition):
            self.depart_admonition_title (node)
        else:
            # Bug fix: was `assert ("Can't happen.")` (always true).
            assert False, "Can't happen."

    def visit_subtitle (self, node):
        """ Switch on the various incarnations the subtitle element can have. """

        if isinstance (node.parent, nodes.document):
            self.visit_document_subtitle (node)
        else:
            self.visit_section_subtitle (node)

    def depart_subtitle (self, node):
        """ Switch on the various incarnations the subtitle element can have. """

        if isinstance (node.parent, nodes.document):
            self.depart_document_subtitle (node)
        else:
            self.depart_section_subtitle (node)
497 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/epub2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | epub2.py
7 |
8 | Copyright 2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | A writer that writes XHTML 1 files suited for conversion into EPUB2.
13 |
14 | """
15 |
16 | import re
17 |
18 | from docutils import nodes
19 |
20 | # from epubmaker.lib.Logger import info, debug, warn, error
21 |
22 | from epubmaker.mydocutils.writers.xhtml1 import Writer as WriterBase
23 | from epubmaker.mydocutils.writers.xhtml1 import Translator as TranslatorBase
24 |
25 |
class Writer (WriterBase):
    """ EPUB2 writer. """

    def __init__ (self):
        """ Set up the base XHTML writer, then install the EPUB2 translator. """
        WriterBase.__init__ (self)
        self.translator_class = Translator
32 |
33 |
class Translator (TranslatorBase):
    """ HTML Translator with EPUB2 tweaks. """

    def init_css (self):
        """ Inline the common and EPUB-specific stylesheets into the head. """
        for css_file in ('rst2all.css', 'rst2epub.css'):
            # Bug fix: the format string had no %s placeholder (the
            # surrounding <style> element was evidently lost), so the
            # % operator raised TypeError.
            self.head.append ('<style type="text/css">\n%s</style>\n' %
                              self.encode (self.read_css (css_file)))


    def calc_centering_style (self, node):
        """
        Compute centering styles for `node` and add them to node['styles'].

        Rationale: The EPUB standard allows user agents to replace
        `margin: auto` with `margin: 0`. Thus we cannot use `margin: auto`
        to center images, we have to calculate the left margin value.

        Also we must use 'width' on the html element, not css style,
        or Adobe ADE will not scale the image properly (ie. only
        horizontally).

        :align: is supposed to work on blocks. It floats or centers
        a block.

        :align: center has not the same semantics as :class: center.
        Former centers the block, eg. the whole table, latter centers
        the text, eg, the text in every table cell.

        `:align: center`
           Used on image: centers image
           Used on figure: centers image and caption
           Used on table: centers table and caption

        """

        width = node.get ('width')
        if width is None:
            # nothing to do without an explicit width
            return []

        style = ['width: %s' % width]

        # a left margin can only be computed for percentual widths
        m = re.match (r'(\d+)\s*%', width)
        if m:
            width = max (min (int (m.group (1)), 100), 0)
            margin = 100 - width

            align = node.get ('align', 'center')
            if align == 'center':
                # split the leftover evenly between both sides
                style.append ('margin-left: %d%%' % (margin / 2))
            if align == 'right':
                style.append ('margin-left: %d%%' % margin)

        node['styles'].extend (style)
86 |
87 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2all.css:
--------------------------------------------------------------------------------
1 | /*
2 | Project Gutenberg common docutils stylesheet.
3 |
4 | This stylesheet contains styles common to HTML and EPUB. Put styles
5 | that are specific to HTML and EPUB into their relative stylesheets.
6 |
7 | :Author: Marcello Perathoner (webmaster@gutenberg.org)
8 | :Copyright: This stylesheet has been placed in the public domain.
9 |
10 | This stylesheet is based on:
11 |
12 | :Author: David Goodger (goodger@python.org)
13 | :Copyright: This stylesheet has been placed in the public domain.
14 |
15 | Default cascading style sheet for the HTML output of Docutils.
16 |
17 | */
18 |
19 | /* ADE 1.7.2 chokes on !important and throws all css out. */
20 |
21 | /* FONTS */
22 |
23 | .italics { font-style: italic }
24 | .no-italics { font-style: normal }
25 |
26 | .bold { font-weight: bold }
27 | .no-bold { font-weight: normal }
28 |
29 | .small-caps { } /* Epub needs italics */
30 | .gesperrt { } /* Epub needs italics */
31 | .antiqua { font-style: italic } /* what else can we do ? */
32 | .monospaced { font-family: monospace }
33 |
34 | .smaller { font-size: smaller }
35 | .larger { font-size: larger }
36 |
37 | .xx-small { font-size: xx-small }
38 | .x-small { font-size: x-small }
39 | .small { font-size: small }
40 | .medium { font-size: medium }
41 | .large { font-size: large }
42 | .x-large { font-size: x-large }
43 | .xx-large { font-size: xx-large }
44 |
45 | .text-transform-uppercase { text-transform: uppercase }
46 | .text-transform-lowercase { text-transform: lowercase }
47 | .text-transform-none { text-transform: none }
48 |
49 | .red { color: red }
50 | .green { color: green }
51 | .blue { color: blue }
52 | .yellow { color: yellow }
53 | .white { color: white }
54 | .gray { color: gray }
55 | .black { color: black }
56 |
57 | /* ALIGN */
58 |
59 | .left { text-align: left }
60 | .justify { text-align: justify }
61 | .center { text-align: center; text-indent: 0 }
62 | .centerleft { text-align: center; text-indent: 0 }
63 | .right { text-align: right; text-indent: 0 }
64 |
65 | /* LINE HEIGHT */
66 |
67 | body { line-height: 1.5 }
68 | p { margin: 0;
69 | text-indent: 2em }
70 |
71 | /* PAGINATION */
72 |
73 | .title, .subtitle { page-break-after: avoid }
74 |
75 | .container, .title, .subtitle, #pg-header
76 | { page-break-inside: avoid }
77 |
78 | /* SECTIONS */
79 |
80 | body { text-align: justify }
81 |
82 | p.pfirst, p.noindent {
83 | text-indent: 0
84 | }
85 |
86 | .boxed { border: 1px solid black; padding: 1em }
87 | .topic, .note { margin: 5% 0; border: 1px solid black; padding: 1em }
88 | div.section { clear: both }
89 |
90 | div.line-block { margin: 1.5em 0 } /* same leading as p */
91 | div.line-block.inner { margin: 0 0 0 10% }
92 | div.line { margin-left: 20%; text-indent: -20%; }
93 | .line-block.noindent div.line { margin-left: 0; text-indent: 0; }
94 |
95 | hr.docutils { margin: 1.5em 40%; border: none; border-bottom: 1px solid black; }
96 | div.transition { margin: 1.5em 0 }
97 |
98 | .vfill, .vspace { border: 0px solid white }
99 |
100 | .title { margin: 1.5em 0 }
101 | .title.with-subtitle { margin-bottom: 0 }
102 | .subtitle { margin: 1.5em 0 }
103 |
104 | /* header font style */
105 | /* http://dev.w3.org/csswg/css3-fonts/#propdef-font-size */
106 |
107 | h1.title { font-size: 200%; } /* for book title only */
108 | h2.title, p.subtitle.level-1 { font-size: 150%; margin-top: 4.5em; margin-bottom: 2em }
109 | h3.title, p.subtitle.level-2 { font-size: 120%; margin-top: 2.25em; margin-bottom: 1.25em }
110 | h4.title, p.subtitle.level-3 { font-size: 100%; margin-top: 1.5em; margin-bottom: 1.5em; font-weight: bold; }
111 | h5.title, p.subtitle.level-4 { font-size: 89%; margin-top: 1.87em; margin-bottom: 1.69em; font-style: italic; }
112 | h6.title, p.subtitle.level-5 { font-size: 60%; margin-top: 3.5em; margin-bottom: 2.5em }
113 |
114 | /* title page */
115 |
116 | h1.title, p.subtitle.level-1,
117 | h2.title, p.subtitle.level-2 { text-align: center }
118 |
119 | #pg-header,
120 | h1.document-title { margin: 10% 0 5% 0 }
121 | p.document-subtitle { margin: 0 0 5% 0 }
122 |
123 | /* PG header and footer */
124 | #pg-machine-header { }
125 | #pg-produced-by { }
126 |
127 | li.toc-entry { list-style-type: none }
128 | ul.open li, ol.open li { margin-bottom: 1.5em }
129 |
130 | .attribution { margin-top: 1.5em }
131 |
132 | .example-rendered {
133 | margin: 1em 5%; border: 1px dotted red; padding: 1em; background-color: #ffd }
134 | .literal-block.example-source {
135 | margin: 1em 5%; border: 1px dotted blue; padding: 1em; background-color: #eef }
136 |
137 | /* DROPCAPS */
138 |
139 | /* BLOCKQUOTES */
140 |
141 | blockquote { margin: 1.5em 10% }
142 |
143 | blockquote.epigraph { }
144 |
145 | blockquote.highlights { }
146 |
147 | div.local-contents { margin: 1.5em 10% }
148 |
149 | div.abstract { margin: 3em 10% }
150 | div.image { margin: 1.5em 0 }
151 | div.caption { margin: 1.5em 0 }
152 | div.legend { margin: 1.5em 0 }
153 |
154 | .hidden { display: none }
155 |
156 | .invisible { visibility: hidden; color: white } /* white: mozilla print bug */
157 |
158 | a.toc-backref {
159 | text-decoration: none ;
160 | color: black }
161 |
162 | dl.docutils dd {
163 | margin-bottom: 0.5em }
164 |
165 | div.figure { margin-top: 3em; margin-bottom: 3em }
166 |
167 | img { max-width: 100% }
168 |
169 | div.footer, div.header {
170 | clear: both;
171 | font-size: smaller }
172 |
173 | div.sidebar {
174 | margin: 0 0 0.5em 1em ;
175 | border: medium outset ;
176 | padding: 1em ;
177 | background-color: #ffffee ;
178 | width: 40% ;
179 | float: right ;
180 | clear: right }
181 |
182 | div.sidebar p.rubric {
183 | font-family: sans-serif ;
184 | font-size: medium }
185 |
186 | ol.simple, ul.simple { margin: 1.5em 0 }
187 |
188 | ol.toc-list, ul.toc-list { padding-left: 0 }
189 | ol ol.toc-list, ul ul.toc-list { padding-left: 5% }
190 |
191 | ol.arabic {
192 | list-style: decimal }
193 |
194 | ol.loweralpha {
195 | list-style: lower-alpha }
196 |
197 | ol.upperalpha {
198 | list-style: upper-alpha }
199 |
200 | ol.lowerroman {
201 | list-style: lower-roman }
202 |
203 | ol.upperroman {
204 | list-style: upper-roman }
205 |
206 | p.credits {
207 | font-style: italic ;
208 | font-size: smaller }
209 |
210 | p.label {
211 | white-space: nowrap }
212 |
213 | p.rubric {
214 | font-weight: bold ;
215 | font-size: larger ;
216 | color: maroon ;
217 | text-align: center }
218 |
219 | p.sidebar-title {
220 | font-family: sans-serif ;
221 | font-weight: bold ;
222 | font-size: larger }
223 |
224 | p.sidebar-subtitle {
225 | font-family: sans-serif ;
226 | font-weight: bold }
227 |
228 | p.topic-title, p.admonition-title {
229 | font-weight: bold }
230 |
231 | pre.address {
232 | margin-bottom: 0 ;
233 | margin-top: 0 ;
234 | font: inherit }
235 |
236 | .literal-block, .doctest-block {
237 | margin-left: 2em ;
238 | margin-right: 2em; }
239 |
240 | span.classifier {
241 | font-family: sans-serif ;
242 | font-style: oblique }
243 |
244 | span.classifier-delimiter {
245 | font-family: sans-serif ;
246 | font-weight: bold }
247 |
248 | span.interpreted {
249 | font-family: sans-serif }
250 |
251 | span.option {
252 | white-space: nowrap }
253 |
254 | span.pre {
255 | white-space: pre }
256 |
257 | span.problematic {
258 | color: red }
259 |
260 | span.section-subtitle {
261 | /* font-size relative to parent (h1..h6 element) */
262 | font-size: 100% }
263 |
264 | table { margin-top: 1.5em; margin-bottom: 1.5em; border-spacing: 0 }
265 | table.align-left, table.align-right { margin-top: 0 }
266 |
267 | table.table { border-collapse: collapse; }
268 |
269 | table.table.hrules-table thead { border: 1px solid black; border-width: 2px 0 0 }
270 | table.table.hrules-table tbody { border: 1px solid black; border-width: 2px 0 }
271 | table.table.hrules-rows tr { border: 1px solid black; border-width: 0 0 1px }
272 | table.table.hrules-rows tr.last { border-width: 0 }
273 | table.table.hrules-rows td,
274 | table.table.hrules-rows th { padding: 1ex 1em; vertical-align: middle }
275 |
276 | table.table tr { border-width: 0 }
277 | table.table td,
278 | table.table th { padding: 0.5ex 1em }
279 | table.table tr.first td { padding-top: 1ex }
280 | table.table tr.last td { padding-bottom: 1ex }
281 | table.table tr.first th { padding-top: 1ex }
282 | table.table tr.last th { padding-bottom: 1ex }
283 |
284 |
285 | table.citation {
286 | border-left: solid 1px gray;
287 | margin-left: 1px }
288 |
289 | table.docinfo {
290 | margin: 3em 4em }
291 |
292 | table.docutils { }
293 |
294 | div.footnote-group { margin: 1em 0 }
295 | table.footnote td.label { width: 2em; text-align: right; padding-left: 0 }
296 |
297 | table.docutils td, table.docutils th,
298 | table.docinfo td, table.docinfo th {
299 | padding: 0 0.5em;
300 | vertical-align: top }
301 |
302 | table.docutils th.field-name, table.docinfo th.docinfo-name {
303 | font-weight: bold ;
304 | text-align: left ;
305 | white-space: nowrap ;
306 | padding-left: 0 }
307 |
308 | /* used to remove borders from tables and images */
309 | .borderless, table.borderless td, table.borderless th {
310 | border: 0 }
311 |
312 | table.borderless td, table.borderless th {
313 | /* Override padding for "table.docutils td" with "!important".
314 | The right padding separates the table cells. */
315 | padding: 0 0.5em 0 0 } /* FIXME: was !important */
316 |
317 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
318 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
319 | font-size: 100% }
320 |
321 | ul.auto-toc {
322 | list-style-type: none }
323 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2epub.css:
--------------------------------------------------------------------------------
1 | /*
2 | Project Gutenberg EPUB docutils stylesheet.
3 |
4 | This stylesheet contains styles specific to EPUB.
5 | */
6 |
7 | /* FONTS */
8 |
9 | /* mostly unsupported */
10 | .small-caps { font-style: italic }
11 | .gesperrt { font-style: italic }
12 |
13 | /* ALIGN */
14 |
15 | /* SECTIONS */
16 |
17 | /* reduce screen real estate waste */
18 | body { margin: 1% }
19 |
/* ugly hack to give more specificity, because ADE chucks out the whole
   stylesheet when it sees an !important */
22 |
23 | .first.first { margin-top: 0; text-indent: 0 }
24 | .last.last { margin-bottom: 0 }
25 |
26 | .no-page-break.no-page-break
27 | { page-break-before: avoid }
28 |
29 | /* PAGINATION */
30 |
31 | div.clearpage { page-break-before: always; padding-top: 10% }
32 | div.cleardoublepage { page-break-before: right; padding-top: 10% }
33 |
34 | .vfill { margin-top: 10% }
35 | h2.title { margin-top: 10% }
36 |
37 | /* DIV */
38 |
39 | a { text-decoration: none }
40 | .toc-pageref { display: none }
41 |
42 | /* DROPCAPS */
43 |
44 | span.dropcap { line-height: 0 }
45 | img.dropcap { vertical-align: bottom }
46 |
47 |
--------------------------------------------------------------------------------
/epubmaker/mydocutils/writers/rst2html.css:
--------------------------------------------------------------------------------
/*
Project Gutenberg HTML docutils stylesheet.

This stylesheet contains styles specific to HTML.
*/

/* FONTS */

/* em { font-style: normal }
strong { font-weight: normal } */

.small-caps { font-variant: small-caps }
/* letter-spaced text ("gesperrt" is the German term for this kind of emphasis) */
.gesperrt { letter-spacing: 0.1em }

/* ALIGN */

/* floated blocks with text flowing around them */
.align-left { clear: left;
float: left;
margin-right: 1em }

.align-right { clear: right;
float: right;
margin-left: 1em }

.align-center { margin-left: auto;
margin-right: auto }

/* shrink a block to the width of its contents */
div.shrinkwrap { display: table; }

/* SECTIONS */

body { margin: 5% 10% 5% 10% }

/* compact list items containing just one p */
li p.pfirst { margin-top: 0; margin-bottom: 0 }

.first { margin-top: 0 !important;
text-indent: 0 !important }
.last { margin-bottom: 0 !important }

span.dropcap { float: left; margin: 0 0.1em 0 0; line-height: 1 }
img.dropcap { float: left; margin: 0 0.5em 0 0; max-width: 25% }
span.dropspan { font-variant: small-caps }

.no-page-break { page-break-before: avoid !important }

/* PAGINATION */

/* page and line numbers sit in the margin, rendered as [title] via :after */
.pageno { position: absolute; right: 95%; font: medium sans-serif; text-indent: 0 }
.pageno:after { color: gray; content: '[' attr(title) ']' }
.lineno { position: absolute; left: 95%; font: medium sans-serif; text-indent: 0 }
.lineno:after { color: gray; content: '[' attr(title) ']' }
.toc-pageref { float: right }

/* on screen, page breaks become vertical gaps (or a rule for clearpage) */
@media screen {
.coverpage, .frontispiece, .titlepage, .verso, .dedication, .plainpage
{ margin: 10% 0; }

div.clearpage, div.cleardoublepage
{ margin: 10% 0; border: none; border-top: 1px solid gray; }

.vfill { margin: 5% 10% }
}

/* in print, page breaks are real page breaks */
@media print {
div.clearpage { page-break-before: always; padding-top: 10% }
div.cleardoublepage { page-break-before: right; padding-top: 10% }

.vfill { margin-top: 20% }
h2.title { margin-top: 20% }
}

/* DIV */
pre { font-family: monospace; font-size: 0.9em; white-space: pre-wrap }
--------------------------------------------------------------------------------
/epubmaker/packagers/GzipPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | GzipPackager.py
6 |
7 | Copyright 2010 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Gzip a file.
12 |
13 | """
14 |
15 | from epubmaker.packagers import OneFileGzipPackager
16 |
17 | TYPE = 'gzip'
18 | FORMATS = 'rst html.noimages html.images txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 |
class Packager (OneFileGzipPackager):
    """ Gzip packager.

    All behaviour is inherited from OneFileGzipPackager; this class
    only gives the factory (which scans *Packager.py modules and
    reads their TYPE/FORMATS constants) something to instantiate.

    """
    pass
23 |
24 |
--------------------------------------------------------------------------------
/epubmaker/packagers/HTMLPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | HTMLPackager.py
6 |
7 | Copyright 2010 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Package a HTML file for PG.
12 |
13 | """
14 |
15 | from epubmaker.packagers import HTMLishPackager
16 |
17 | TYPE = 'ww'
18 | FORMATS = 'html.images'.split ()
19 |
class Packager (HTMLishPackager):
    """ Package a HTML file with its images.

    All behaviour is inherited from HTMLishPackager; this module
    registers it with the factory for the 'html.images' format.

    """
    pass
23 |
--------------------------------------------------------------------------------
/epubmaker/packagers/PDFPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | PDFPackager.py
6 |
7 | Copyright 2010 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Package a PDF file for PG.
12 |
13 | """
14 |
15 | from epubmaker.packagers import OneFileZipPackager
16 |
17 | TYPE = 'ww'
18 | FORMATS = ''.split ()
19 |
class Packager (OneFileZipPackager):
    """ WW packager for PDF files.

    All behaviour is inherited from OneFileZipPackager.
    NOTE(review): this module's FORMATS is ''.split () == [], so it
    currently registers for no formats at all -- confirm intended.

    """
    pass
23 |
24 |
--------------------------------------------------------------------------------
/epubmaker/packagers/PushPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | PushPackager.py
6 |
7 | Copyright 2011 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Package a zip containing everything that can be pushed to ibiblio.
12 |
13 | """
14 |
15 | from __future__ import with_statement
16 |
17 | import os
18 | import zipfile
19 | import re
20 |
21 | from epubmaker.lib.Logger import info, warn, error
22 | import epubmaker.lib.GutenbergGlobals as gg
23 |
24 | from epubmaker.packagers import BasePackager
25 |
26 | TYPE = 'ww'
27 | FORMATS = ['push']
28 |
class Packager (BasePackager):
    """ Package one big zip for push.

    Zip contains one directory named after ebook_no.
    This dir mirrors structure on ibiblio::

      12345/12345.txt
      12345/12345.zip
      12345/12345-h/12345-h.html
      12345/12345-h/images/cover.jpg
      12345/12345-h.zip

    """

    @staticmethod
    def add (zip_, filename, memberfilename):
        """ Add one file to the zip.

        Missing files are skipped silently: the push zip is a
        best-effort collection of whatever formats were actually
        built.  Already-compressed members (.zip, .png) are stored
        rather than deflated a second time.

        """

        try:
            os.stat (filename)
            dummy_name, ext = os.path.splitext (filename)
            info ('  Adding file: %s as %s' % (filename, memberfilename))
            zip_.write (filename, memberfilename,
                        zipfile.ZIP_STORED if ext in ['.zip', '.png']
                        else zipfile.ZIP_DEFLATED)
        except OSError:
            # warn ('PushPackager: Cannot find file %s', filename)
            return


    def package (self, aux_file_list = []):
        """ Build the push zip for one ebook.

        The output filename must start with the ebook number, which
        becomes the name of the toplevel directory inside the zip.

        """

        zipfilename = self.options.outputfile # filename is zipfile

        m = re.match (r'\d+', zipfilename)
        if m:
            ebook_no = m.group (0)
        else:
            error ('Invalid filename %s for push packager.' % zipfilename)
            return

        info ('Creating Zip file: %s' % zipfilename)

        zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)

        # plain files and one-file zips live directly in the ebook dir
        for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split ():
            filename = '%s%s' % (ebook_no, suffix)
            memberfilename = '%s/%s' % (ebook_no, filename)
            self.add (zip_, filename, memberfilename)

        # the html and rst versions live in their own subdirectories
        for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
            filename = '%s%s.%s' % (ebook_no, suffix, ext)
            memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
            self.add (zip_, filename, memberfilename)

        # image files belong to the html version, so file them under
        # the -h directory.  (BUGFIX: this formerly reused the leaked
        # loop variable 'suffix', which was always '-rst' at this
        # point, filing the images into the wrong directory.)
        for url in aux_file_list:
            rel_url = gg.make_url_relative (self.options.base_url, url)
            filename = os.path.join (self.path, rel_url)
            memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, '-h', rel_url)
            self.add (zip_, filename, memberfilename)

        zip_.close ()

        info ('Done Zip file: %s' % zipfilename)
95 |
--------------------------------------------------------------------------------
/epubmaker/packagers/RSTPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | RSTPackager.py
6 |
7 | Copyright 2010 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Package a RST file for PG.
12 |
13 | """
14 |
15 | from epubmaker.packagers import HTMLishPackager
16 |
17 | TYPE = 'ww'
18 | FORMATS = 'rst.gen'.split ()
19 |
class Packager (HTMLishPackager):
    """ Package a RST file with its images.

    All behaviour is inherited from HTMLishPackager; this module
    registers it with the factory for the 'rst.gen' format.

    """
    pass
23 |
--------------------------------------------------------------------------------
/epubmaker/packagers/TxtPackager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | TxtPackager.py
6 |
7 | Copyright 2010 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Package a Txt file for PG.
12 |
13 | """
14 |
15 | from epubmaker.packagers import OneFileZipPackager
16 |
17 | TYPE = 'ww'
18 | FORMATS = 'txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 |
class Packager (OneFileZipPackager):
    """ WW packager for plain text files.

    All behaviour is inherited from OneFileZipPackager; this module
    registers it for the plain-text formats listed in FORMATS.

    """
    pass
23 |
24 |
--------------------------------------------------------------------------------
/epubmaker/packagers/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | Packager package
7 |
8 | Copyright 2009-2010 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Base class for Packager modules.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import os.path
19 | import gzip
20 | import zipfile
21 |
22 | from pkg_resources import resource_listdir # pylint: disable=E0611
23 |
24 | from epubmaker.lib.Logger import debug, info, warn, error
25 | import epubmaker.lib.GutenbergGlobals as gg
26 |
27 | GZIP_EXTENSION = '.gzip'
28 |
class BasePackager (object):
    """ Common scaffolding for all packagers.

    Holds the options object and the output filename split into its
    directory / basename / extension components.  Concrete packagers
    override package ().

    """

    def __init__ (self):
        # all members stay None until setup () is called
        self.options = None
        self.path_name_ext = None
        self.path = None
        self.name = None
        self.ext = None


    def setup (self, options):
        """ Remember options and pre-split the output filename. """

        self.options = options
        self.path_name_ext = os.path.join (options.outputdir, options.outputfile)
        self.path, basename = os.path.split (self.path_name_ext)
        self.name, self.ext = os.path.splitext (basename)


    def package (self, aux_file_list = []):
        """ Package files.  The base implementation does nothing. """
        return None
55 |
56 |
class OneFileGzipPackager (BasePackager):
    """ Gzips one file. """

    def package (self, aux_file_list = []):
        """ Gzip self.path_name_ext into a sibling file with
        GZIP_EXTENSION appended.

        IOErrors are logged, not raised.

        """

        filename = self.path_name_ext
        gzfilename = filename + GZIP_EXTENSION

        try:
            info ('Creating Gzip file: %s' % gzfilename)
            # BUGFIX: read and write in binary mode.  Text mode would
            # mangle binary payloads (and newlines on some platforms);
            # a gzip stream is always binary.
            with open (filename, 'rb') as fp:
                fpgz = gzip.open (gzfilename, 'wb')
                try:
                    info ('  Adding file: %s' % filename)
                    fpgz.write (fp.read ())
                finally:
                    # close even if the write fails, so the handle
                    # is not leaked
                    fpgz.close ()
            info ('Done Zip file: %s' % gzfilename)
        except IOError as what:
            error (what)
75 |
class OneFileZipPackager (BasePackager):
    """ Packages one file in zip of the same name. """

    def package (self, aux_file_list = []):
        """ Zip self.path_name_ext into <name>.zip beside it.

        Does nothing if the input file does not exist.

        """

        filename = self.path_name_ext
        zipfilename = os.path.join (self.path, self.name) + '.zip'
        member = self.name + self.ext

        info ('Creating Zip file: %s' % zipfilename)

        try:
            os.stat (filename)
        except OSError:
            # input file was never produced for this ebook: skip silently
            # warn ('Packager: Cannot find file %s', filename)
            return

        archive = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
        info ('  Adding file: %s as %s' % (filename, member))
        archive.write (filename, member)
        archive.close ()

        info ('Done Zip file: %s' % zipfilename)
99 |
class HTMLishPackager (BasePackager):
    """ Package a file with images.

    Builds <name>.zip containing <name>/<name><ext> plus every
    auxiliary file (images, css) under the same <name>/ prefix.

    """

    def package (self, aux_file_list = []):

        # NOTE(review): unlike OneFileZipPackager this uses the bare
        # outputfile, not self.path_name_ext -- presumably it relies
        # on the cwd being the output directory; confirm with callers.
        filename = self.options.outputfile
        zipfilename = os.path.join (self.path, self.name) + '.zip'
        memberfilename = os.path.join (self.name, self.name) + self.ext

        info ('Creating Zip file: %s' % zipfilename)

        zip_ = zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
        info ('  Adding file: %s as %s' % (filename, memberfilename))
        zip_.write (filename, memberfilename)

        # now images
        for url in aux_file_list:
            # place each aux file at the same path relative to the
            # main file, but inside the <name>/ member directory
            rel_url = gg.make_url_relative (self.options.base_url, url)
            filename = os.path.join (self.path, rel_url)
            memberfilename = os.path.join (self.name, rel_url)
            info ('  Adding file: %s as %s' % (filename, memberfilename))
            zip_.write (filename, memberfilename)

        zip_.close ()

        info ('Done Zip file: %s' % zipfilename)
127 |
class PackagerFactory (object):
    """ Implements Factory pattern for packagers.

    Scans the epubmaker.packagers package for *Packager.py modules
    whose TYPE matches, and maps each of the module's FORMATS keys
    to that module.

    """

    # NOTE: class-level, hence shared by all factory instances.
    packagers = {}

    def __init__ (self, type_):
        """ type_ matches the TYPE constant of the packager modules
        to load (eg. 'ww', 'gzip'). """
        self.type = type_


    def load (self):
        """ Load the packagers in the packagers directory.

        Returns the supported format keys.

        """

        for fn in resource_listdir ('epubmaker.packagers', ''):
            modulename, ext = os.path.splitext (fn)
            if ext == '.py':
                if modulename.endswith ('Packager'):
                    module = __import__ ('epubmaker.packagers.' + modulename,
                                         fromlist = [modulename])
                    if self.type == module.TYPE:
                        debug ("Loading packager type: %s from module: %s for formats: %s" % (
                            self.type, modulename, ', '.join (module.FORMATS)))
                        for format_ in module.FORMATS:
                            self.packagers[format_] = module

        return self.packagers.keys ()


    def unload (self):
        """ Unload packager modules.

        BUGFIX: deleting keys while iterating over them is undefined
        behaviour (RuntimeError on Python 3); clear () is equivalent
        and safe.

        """

        self.packagers.clear ()


    def create (self, format_):
        """ Create a packager for format.

        Raises KeyError if no loaded packager handles format_.

        """

        try:
            return self.packagers[format_].Packager ()
        except KeyError:
            raise KeyError ('No packager for type %s' % format_)
170 |
--------------------------------------------------------------------------------
/epubmaker/parsers/AuxParser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | AuxParser.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Open an url and return raw data.
13 |
14 | """
15 |
16 |
17 | from epubmaker.parsers import ParserBase
18 |
19 | mediatypes = ('*/*', )
20 |
class Parser (ParserBase):
    """ Parse an auxiliary file.

    Catch-all parser (mediatype '*/*'): keeps the raw bytes verbatim
    and returns them unchanged on serialize ().

    """

    def __init__ (self):
        ParserBase.__init__ (self)
        # raw file contents; filled in by parse ()
        self.data = None


    def parse (self):
        """ Parse the file.  Just slurps the raw bytes. """
        self.data = self.bytes_content ()


    def serialize (self):
        """ Serialize file to string.  Returns the bytes unchanged. """
        return self.data
37 |
--------------------------------------------------------------------------------
/epubmaker/parsers/CSSParser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | CSSParser.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Parse a CSS file and provide access to its rules and resource urls.
13 |
14 | """
15 |
16 | import re
17 | import urlparse
18 | import logging
19 |
20 | import cssutils
21 |
22 | from epubmaker.lib.Logger import debug
23 | from epubmaker.lib.MediaTypes import mediatypes as mt
24 |
25 | from epubmaker.parsers import ParserBase
26 |
27 | RE_ELEMENT = re.compile (r'((?:^|\s)[a-z0-9]+)', re.I)
28 |
29 | mediatypes = (mt.css, )
30 |
class Parser (ParserBase):
    """ Parse an external CSS file.

    Wraps a cssutils stylesheet and offers helpers to rewrite urls,
    strip layout styling unsuited to ereaders, and enumerate the
    resources (images, @imports) the sheet references.

    """

    def __init__ (self):
        # route cssutils messages through our logging setup
        cssutils.log.setLog (logging.getLogger ('cssutils'))
        # logging.DEBUG is way too verbose
        cssutils.log.setLevel (max (cssutils.log.getEffectiveLevel (), logging.INFO))
        ParserBase.__init__ (self)
        # parsed cssutils stylesheet; None until parse*() has run
        self.sheet = None


    def parse (self):
        """ Parse the CSS file. """

        # cached: already parsed
        if self.sheet is not None:
            return

        parser = cssutils.CSSParser ()
        if self.fp:
            self.sheet = parser.parseString (self.bytes_content (), encoding = self.encoding)
        else:
            self.sheet = parser.parseUrl (self.url)

        self.mediatype = 'text/css'
        self.unpack_media_handheld (self.sheet)
        self.lowercase_selectors (self.sheet)


    def parse_string (self, s):
        """ Parse the CSS in string.  Caches like parse (). """

        if self.sheet is not None:
            return

        parser = cssutils.CSSParser ()
        self.sheet = parser.parseString (s, encoding = 'utf-8')

        self.mediatype = 'text/css'
        self.unpack_media_handheld (self.sheet)
        self.lowercase_selectors (self.sheet)


    @staticmethod
    def iter_properties (sheet):
        """ Iterate on properties in css.  Yields every property of
        every style rule in the sheet. """
        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                for prop in rule.style:
                    yield prop


    @staticmethod
    def unpack_media_handheld (sheet):
        """ unpack a @media handheld rule

        Widens 'handheld' media queries to 'all' so the styles apply
        to our output; a CSS comment marks the rewritten rule.

        """
        for rule in sheet:
            if rule.type == rule.MEDIA_RULE:
                if rule.media.mediaText.find ('handheld') > -1:
                    debug ("Unpacking CSS @media handheld rule.")
                    rule.media.mediaText = 'all'
                    rule.insertRule (cssutils.css.CSSComment ('/* was @media handheld */'), 0)


    @staticmethod
    def lowercase_selectors (sheet):
        """ make selectors lowercase to match xhtml tags """
        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                for sel in rule.selectorList:
                    # lowercase only bare element names, not classes/ids
                    sel.selectorText = RE_ELEMENT.sub (lambda m: m.group(1).lower (),
                                                       sel.selectorText)


    def rewrite_links (self, f):
        """ Rewrite all links using the function f. """
        cssutils.replaceUrls (self.sheet, f)


    def drop_floats (self):
        """ Drop all floats in stylesheet.

        Also drops absolute positioning and all pixel-based values
        (presumably because they misbehave on reflowable ereader
        screens -- NOTE(review): confirm rationale).

        """

        for prop in self.iter_properties (self.sheet):
            if prop and prop.name == 'float': # test for existence because we remove
                prop.parent.removeProperty ('float')
                prop.parent.removeProperty ('width')
                prop.parent.removeProperty ('height')
            elif prop and prop.name in ('position', 'left', 'right', 'top', 'bottom'):
                prop.parent.removeProperty (prop.name)

        # second pass: drop any property whose value is in pixels
        for prop in self.iter_properties (self.sheet):
            #print prop.name
            #print prop.value
            if prop and prop.value.endswith ('px'): # test for existence because we remove
                prop.parent.removeProperty (prop.name)


    def get_image_urls (self):
        """ Return the urls of all images in document.

        Images are graphic files. The user may choose if he wants
        images included or not.

        """

        images = []

        # every url()-valued property is treated as an image reference
        for prop in self.iter_properties (self.sheet):
            if (prop.value.cssValueType == prop.value.CSS_PRIMITIVE_VALUE and
                prop.value.primitiveType == prop.value.CSS_URI):
                url = urlparse.urljoin (self.url, prop.value.cssText)
                images.append (url)

        return images


    def get_aux_urls (self):
        """ Return the urls of all auxiliary files in document.

        Auxiliary files are non-document files you need to correctly
        display the document file, eg. CSS files.

        """

        aux = []

        # only @import targets count as aux files here
        for rule in self.sheet:
            if rule.type == rule.IMPORT_RULE:
                aux.append (urlparse.urljoin (self.url, rule.href))

        return aux


    def serialize (self):
        """ Serialize CSS.  Returns the sheet's cssText. """

        return self.sheet.cssText
168 |
--------------------------------------------------------------------------------
/epubmaker/parsers/HTMLParser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | HTMLParser.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | """
13 |
14 | import re
15 | import subprocess
16 | import urllib
17 | import urlparse
18 |
19 | import lxml.html
20 | from lxml import etree
21 | # import tidy
22 |
23 | from epubmaker.lib.GutenbergGlobals import NS, xpath
24 | from epubmaker.lib.Logger import info, debug, warn, error
25 | from epubmaker.lib.MediaTypes import mediatypes as mt
26 |
27 | from epubmaker import parsers
28 | from epubmaker.parsers import HTMLParserBase
29 |
30 | mediatypes = ('text/html', mt.xhtml)
31 |
32 | RE_XMLDECL = re.compile ('<\?xml[^?]+\?>\s*')
33 |
34 | DEPRECATED = { 'align': """caption applet iframe img input object legend
35 | table hr div h1 h2 h3 h4 h5 h6 p""",
36 | 'alink': 'body',
37 | 'alt': 'applet',
38 | 'archive': 'applet',
39 | 'background': 'body',
40 | 'bgcolor': '*',
41 | 'border': 'img object',
42 | 'clear': 'br',
43 | 'code': 'applet',
44 | 'codebase': 'applet',
45 | 'color': '*',
46 | 'compact': '*',
47 | 'face': '*',
48 | 'height': 'td th applet',
49 | 'hspace': '*',
50 | 'language': 'script',
51 | 'link': 'body',
52 | 'name': 'applet',
53 | 'noshade': 'hr',
54 | 'nowrap': '*',
55 | 'object': 'applet',
56 | 'prompt': 'isindex',
57 | 'size': 'hr font basefont',
58 | 'start': 'ol',
59 | 'text': 'body',
60 | 'type': 'li ol ul',
61 | 'value': 'li',
62 | 'version': 'html',
63 | 'vlink': 'body',
64 | 'vspace': '*',
65 | 'width': 'hr td th applet pre',
66 | }
67 |
68 |
69 | class Parser (HTMLParserBase):
70 | """ Parse a HTML Text
71 |
72 | and convert it to xhtml suitable for ePub packaging.
73 |
74 | """
75 |
76 | @staticmethod
77 | def _fix_id (id_):
78 | """ Fix more common mistakes in ids.
79 |
80 | xml:id cannot start with digit, very common in pg.
81 |
82 | """
83 |
84 | if not parsers.RE_XML_NAME.match (id_):
85 | id_ = 'id_' + id_
86 |
87 | # debug ("_fix_id: id = %s" % id_)
88 | return id_
89 |
90 |
    def _fix_internal_frag (self, id_):
        """ Fix more common mistakes in ids.

        Returns the repaired fragment, or None if it is beyond
        repair.

        """

        # This is a big mess because href attributes must be quoted,
        # but id attributes must not be quoted. Some HTML in PG
        # quotes ids in a misguided attempt to make id and href look
        # the same. But '%' is invalid in xml ids.
        #
        # See HTML 4.01 spec section B.2.

        if '%' in id_:
            id_ = urllib.unquote (id_)
            # unquoting yields a byte string; try utf-8 first, then
            # the document's own encoding (Python 2 str.decode)
            try:
                id_ = id_.decode ('utf-8')
            except UnicodeError:
                try:
                    id_ = id_.decode (self.encoding)
                except UnicodeError:
                    pass # we tried

        # xml:id cannot start with digit
        # very common in pg

        if not parsers.RE_XML_NAME.match (id_):
            id_ = 'id_' + id_

        if not parsers.RE_XML_NAME.match (id_):
            # still invalid ... we tried
            return None

        # debug ("_fix_internal_frag: frag = %s" % id_)
        return id_
123 |
124 |
125 | # @staticmethod
126 | # def tidylib (html):
127 | # """ Pipe html thru w3c tidylib. """
128 |
129 | # html = parsers.RE_RESTRICTED.sub ('', html)
130 | # html = RE_XMLDECL.sub ('', html)
131 | # html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
132 |
133 | # options = {
134 | # "clean": 1,
135 | # "wrap": 0,
136 | # "output_xhtml": 1,
137 | # "numeric_entities": 1,
138 | # "merge_divs": 0, # keep poetry indentation
139 | # "merge_spans": 0,
140 | # "add_xml_decl": 0,
141 | # "doctype": "strict",
142 | # "anchor_as_name": 0,
143 | # "enclose_text": 1,
144 | # }
145 |
146 | # try:
147 | # html = tidy.parseString (html.encode ('utf-8'))
148 | # except TidyLibError, what:
149 | # error ("Tidy: %s" % what)
150 | # raise
151 |
152 | # return html
153 |
154 |
155 | @staticmethod
156 | def tidy (html):
157 | """ Pipe html thru w3c tidy. """
158 |
159 | html = parsers.RE_RESTRICTED.sub ('', html)
160 | html = RE_XMLDECL.sub ('', html)
161 | html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)
162 |
163 | # convert to xhtml
164 | tidy = subprocess.Popen (
165 | ["tidy",
166 | "-utf8",
167 | "-clean",
168 | "--wrap", "0",
169 | # "--drop-font-tags", "y",
170 | # "--drop-proprietary-attributes", "y",
171 | # "--add-xml-space", "y",
172 | "--output-xhtml", "y",
173 | "--numeric-entities", "y",
174 | "--merge-divs", "n", # keep poetry indentation
175 | "--merge-spans", "n",
176 | "--add-xml-decl", "n",
177 | "--doctype", "strict",
178 | "--anchor-as-name", "n",
179 | "--enclose-text", "y" ],
180 |
181 | stdin = subprocess.PIPE,
182 | stdout = subprocess.PIPE,
183 | stderr = subprocess.PIPE)
184 |
185 | # print (html.encode ('utf-8'))
186 | # sys.exit ()
187 |
188 | (html, stderr) = tidy.communicate (html.encode ('utf-8'))
189 |
190 | regex = re.compile ('(Info:|Warning:|Error:)\s*', re.I)
191 |
192 | # pylint: disable=E1103
193 | msg = stderr.rstrip ()
194 | for line in msg.splitlines ():
195 | match = regex.search (line)
196 | if match:
197 | sline = regex.sub ("", line)
198 | g = match.group (1).lower ()
199 | if g == 'info:':
200 | info ("tidy: %s" % sline)
201 | elif g == 'warning:':
202 | warn ("tidy: %s" % sline)
203 | elif g == 'error:':
204 | error ("tidy: %s" % sline)
205 | else:
206 | error (line)
207 |
208 | if tidy.returncode == 2:
209 | raise ValueError, stderr
210 |
211 | return html.decode ('utf-8')
212 |
213 |
214 | def find_coverpage (self):
215 | """ Search coverpage and put url into .
216 |
217 | First look for an image with id of 'coverpage', then for an
218 | image with 'cover' in the url, then with 'title' in the url.
219 |
220 | """
221 | for head in xpath (self.xhtml, 'xhtml:head'):
222 | for dummy_link in xpath (head, 'xhtml:link[@rel = "coverpage"]'):
223 | # already there
224 | return
225 |
226 | covers = (xpath (self.xhtml, '//xhtml:img[@id = "coverpage"]') or
227 | xpath (self.xhtml, '//xhtml:img[contains (@src, "cover")]') or
228 | xpath (self.xhtml, '//xhtml:img[contains (@src, "title")]'))
229 | if not covers:
230 | return
231 |
232 | href = covers[0].get ('src')
233 | # FIXME: enforce minimum size
234 | head.append (etree.Element (NS.xhtml.link, rel = 'coverpage', href = href))
235 | return href
236 |
237 |
    def _fix_anchors (self):
        """ Move name to id and fix hrefs and ids.

        Pass 1 normalizes every id (and legacy <a name=...>) and
        records the surviving set; pass 2 repairs fragment hrefs
        against that set, dropping unfixable links and classing the
        rest pginternal / pgexternal / pgkilled.

        """

        # move anchor name to id
        # 'id' values are more strict than 'name' values
        # try to fix ill-formed ids

        seen_ids = set ()

        for anchor in (xpath (self.xhtml, "//xhtml:a[@name]") +
                       xpath (self.xhtml, "//xhtml:*[@id]")):
            id_ = anchor.get ('id') or anchor.get ('name')

            # remove every id/name variant, then set the cleaned-up
            # id once below
            if 'name' in anchor.attrib:
                del anchor.attrib['name']
            if 'id' in anchor.attrib:
                del anchor.attrib['id']
            if NS.xml.id in anchor.attrib:
                del anchor.attrib[NS.xml.id]

            id_ = self._fix_id (id_)

            if not parsers.RE_XML_NAME.match (id_):
                error ("Dropping ill-formed id '%s' in %s" % (id_, self.url))
                continue

            # well-formed id
            if id_ in seen_ids:
                error ("Dropping duplicate id '%s' in %s" % (id_, self.url))
                continue

            seen_ids.add (id_)
            anchor.set ('id', id_)


        # try to fix bogus fragment ids
        # 1. fragments point to xml:id, so must be well-formed ids
        # 2. the ids they point to must exist

        for link in xpath (self.xhtml, "//xhtml:*[@href]"):
            href = link.get ('href')
            hre, frag = urlparse.urldefrag (href)
            if frag:
                frag = self._fix_internal_frag (frag)

                if not frag:
                    # non-recoverable ill-formed frag
                    del link.attrib['href']
                    self.add_class (link, 'pgkilled')
                    error ('Dropping ill-formed frag in %s' % href)
                    continue

                # well-formed frag
                if hre:
                    # we have url + frag
                    link.set ('href', "%s#%s" % (hre, urllib.quote (frag.encode ('utf-8'))))
                    self.add_class (link, 'pgexternal')
                elif frag in seen_ids:
                    # we have only frag
                    link.set ('href', "#%s" % urllib.quote (frag.encode ('utf-8')))
                    self.add_class (link, 'pginternal')
                else:
                    del link.attrib['href']
                    self.add_class (link, 'pgkilled')
                    error ("Dropping frag to non-existing id in %s" % href)
303 |
304 |
    def _to_xhtml11 (self):
        """ Make vanilla xhtml more conform to xhtml 1.1 """

        # Change content-type meta to application/xhtml+xml.
        for meta in xpath (self.xhtml, "/xhtml:html/xhtml:head/xhtml:meta[@http-equiv]"):
            if meta.get ('http-equiv').lower () == 'content-type':
                meta.set ('content', mt.xhtml + '; charset=utf-8')

        # drop javascript

        for script in xpath (self.xhtml, "//xhtml:script"):
            script.drop_tree ()

        # drop form

        for form in xpath (self.xhtml, "//xhtml:form"):
            form.drop_tree ()

        # blockquotes

        for bq in xpath (self.xhtml, "//xhtml:blockquote"):
            # no naked text allowed in blockquote: wrap children and
            # text in a div
            div = etree.Element (NS.xhtml.div)
            for child in bq:
                div.append (child)
            div.text = bq.text
            bq.text = None
            bq.append (div)
            # lxml.html.defs.block_tags

        # insert tbody

        for table in xpath (self.xhtml, "//xhtml:table[xhtml:tr]"):
            # no naked tr allowed in table: move them into a tbody
            tbody = etree.Element (NS.xhtml.tbody)
            for tr in table:
                if tr.tag == NS.xhtml.tr:
                    tbody.append (tr)
            table.append (tbody)

        # move lang to xml:lang

        for elem in xpath (self.xhtml, "//xhtml:*[@lang]"):
            # bug in lxml 2.2.2: sometimes deletes wrong element
            # so we delete both and reset the right one
            lang = elem.get ('lang')
            try:
                del elem.attrib[NS.xml.lang]
            except KeyError:
                pass
            del elem.attrib['lang']
            elem.set (NS.xml.lang, lang)

        # strip deprecated attributes

        for a, t in DEPRECATED.items ():
            for tag in t.split ():
                for elem in xpath (self.xhtml, "//xhtml:%s[@%s]" % (tag, a)):
                    del elem.attrib[a]

        # strip empty class attributes

        for elem in xpath (self.xhtml,
                           "//xhtml:*[@class and normalize-space (@class) = '']"):
            del elem.attrib['class']

        # strip bogus header markup by Joe L.
        # (demote boilerplate headers to plain paragraphs)
        for elem in xpath (self.xhtml, "//xhtml:h1"):
            if elem.text and elem.text.startswith ("The Project Gutenberg eBook"):
                elem.tag = NS.xhtml.p
        for elem in xpath (self.xhtml, "//xhtml:h3"):
            if elem.text and elem.text.startswith ("E-text prepared by"):
                elem.tag = NS.xhtml.p
378 |
379 |
380 | def __parse (self, html):
381 | # remove xml decl and doctype, we will add the correct one before serializing
382 | # html = re.compile ('^.*', re.S)
389 | html = re_xml_decl.sub ('', html)
390 | try:
391 | return etree.fromstring (
392 | html,
393 | lxml.html.XHTMLParser (),
394 | base_url = self.url)
395 | except etree.ParseError, what:
396 | # cannot try HTML parser because we depend on correct xhtml namespace
397 | error ("etree.fromstring says: %s" % what)
398 | m = re.search (r'line\s(\d+),', str (what))
399 | if m:
400 | lineno = int (m.group (1))
401 | error ("Line %d: %s" % (lineno, html.splitlines ()[lineno - 1]))
402 | raise
403 |
404 |
405 | def pre_parse (self):
406 | """ Pre-parse a html ebook. Does a full parse because a
407 | lightweight parse would be almost as much work. """
408 |
409 | # cache
410 | if self.xhtml is not None:
411 | return
412 |
413 | debug ("HTMLParser.pre_parse () ...")
414 |
415 | html = self.unicode_content ()
416 |
417 | if html.startswith (' max_size):
79 | for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
80 | buf = StringIO.StringIO ()
81 | image.save (buf, format_, quality = quality)
82 | data = buf.getvalue ()
83 | if (len (data) <= max_size):
84 | was += 'q=%d' % quality
85 | break
86 |
87 | comment = "Image: %d x %d size=%d %s" % (
88 | image.size[0], image.size[1], len (data), was)
89 | debug (comment)
90 |
91 | new_parser.mediatype = self.mediatype
92 | new_parser.image_data = data
93 | new_parser.dimen = tuple (image.size)
94 | new_parser.comment = comment
95 | new_parser.url = self.url
96 | new_parser.orig_url = self.orig_url
97 | new_parser.attribs = self.attribs
98 | new_parser.fp = self.fp
99 |
100 | except IOError, what:
101 | error ("Could not resize image: %s" % what)
102 | new_parser.broken_image ()
103 |
104 | return new_parser
105 |
106 |
    def get_image_dimen (self):
        """ Return the image dimensions, decoding the image once and
        caching the result in self.dimen. """
        if self.dimen is None:
            image = Image.open (StringIO.StringIO (self.image_data))
            self.dimen = image.size
        return self.dimen
112 |
113 |
    def broken_image (self):
        """ Insert broken image placeholder.

        Replaces the image data with the bundled broken.png. """

        self.image_data = resource_string ('epubmaker.parsers', 'broken.png')
        # We need a way to distinguish between pngs to drop and pngs
        # to keep in a non-images build.
        self.mediatype = 'image/png;type=resource'
121 |
122 |
123 | def pre_parse (self):
124 | if self.image_data is None:
125 | self.image_data = self.bytes_content ()
126 | if self.image_data is None:
127 | self.broken_image ()
128 |
129 |
    def parse (self):
        """ Parse the image.

        Nothing to do here: the raw bytes were already fetched by
        pre_parse (). """

        pass
134 |
135 |
    def serialize (self):
        """ Serialize the image.  Returns the raw image bytes. """
        return self.image_data
139 |
140 |
--------------------------------------------------------------------------------
/epubmaker/parsers/RSTParser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 |
6 | RSTParser.py
7 |
8 | Copyright 2010-2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | """
13 |
14 | # FIXME:
15 | # use docinfo instead of meta for pg header
16 |
17 | import copy
18 | import re
19 | import os
20 | import collections
21 | import urlparse
22 | from functools import partial
23 |
24 | from lxml import etree
25 | import lxml.html
26 |
27 | import docutils.readers.standalone
28 | from docutils import nodes, frontend, io
29 |
30 | from pkg_resources import resource_string # pylint: disable=E0611
31 |
32 | from epubmaker.lib.GutenbergGlobals import NS, xpath
33 | from epubmaker.lib.Logger import info, debug, warn, error
34 | from epubmaker.lib.MediaTypes import mediatypes as mt
35 |
36 | from epubmaker import ParserFactory
37 | from epubmaker.parsers import HTMLParser
38 |
39 | from epubmaker.mydocutils import broken
40 | from epubmaker.mydocutils import nodes as mynodes
41 | from epubmaker.mydocutils.writers import xhtml1, epub2, xetex
42 |
43 | from epubmaker.mydocutils.gutenberg import parsers as gutenberg_parsers
44 | from epubmaker.mydocutils.gutenberg.writers import nroff as gutenberg_nroff
45 | from epubmaker.CommonOptions import Options
46 |
options = Options()  # shared singleton holding the command-line options

# media types this parser module handles
mediatypes = (mt.rst, )

# emacs-style file-variables line, e.g. ".. -*- coding: utf-8 -*-"
RE_EMACS_CHARSET = re.compile (r'-\*-.*coding:\s*(\S+)', re.I)
52 |
class Parser (HTMLParser.Parser):
    """ Parse a ReStructured Text

    and convert it to different xhtml flavours.

    """

    def __init__ (self):
        HTMLParser.Parser.__init__ (self)
        # parsed docutils document tree, set by pre_parse () / _full_parse ()
        self.document1 = None


    def preprocess (self, charset):
        """ Insert pg header and footer.

        Header and footer are mixed in during the docutils run itself,
        so this only returns the raw unicode content. """

        return self.unicode_content ()


    def to_xhtml (self, html, base_url):
        """ Parse a serialized html string into an lxml element tree.

        All links in the result are made absolute against base_url.
        Raises on parse errors after logging them. """

        html = html.replace (u' ', u' ')
        html = html.replace (u'—', u'—')

        outputfilename = os.path.join (options.outputdir, options.outputfile)
        debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'

        # drop stale debug output from a previous run
        try:
            os.remove (debugfilename)
        except OSError:
            pass

        # keep the intermediate html around for debugging
        if options.verbose > 1:
            with open (debugfilename, 'w') as fp:
                fp.write (html.encode ('utf-8'))

        try:
            xhtml = etree.fromstring (
                html,
                lxml.html.XHTMLParser (),
                base_url = base_url)
        except etree.ParseError as what:  # 'as' form works on Python 2.6+ and 3.x
            error ("etree.fromstring says %s" % what)
            raise

        xhtml.make_links_absolute (base_url = base_url)

        return xhtml


    def rewrite_links (self, f):
        """ Rewrite all links using the function f. """

        doc = self.document1

        # the coverpage lives either in the meta block or in a docinfo field
        if 'coverpage' in doc.meta_block:
            coverpage = doc.meta_block['coverpage']
            coverpage[0] = f (coverpage[0])
        else:
            for field in doc.traverse (nodes.field):
                field_name, field_body = field.children
                if field_name.astext () == 'coverpage':
                    field_body[:] = nodes.paragraph ('', f (field_body.astext ()))
                    break

        for node in doc.traverse (nodes.reference):
            if 'uri' in node:
                node['uri'] = f (node['uri'])

        for node in doc.traverse (nodes.image):
            if 'uri' in node:
                node['uri'] = f (node['uri'])

        for node in doc.traverse (nodes.pending):
            # dropcap images
            if 'image' in node.details:
                node.details['image'] = f (node.details['image'])


    def iterlinks (self):
        """ Grab links and images in RST.

        Generator yielding (url, attribute-dict) tuples for the spider. """

        debug ("RSTParser iterlinks want_images = %d" % self.options.want_images)

        doc = self.document1

        # return coverpage even in noimages build
        if 'coverpage' in doc.meta_block:
            coverpage = doc.meta_block['coverpage']
            yield coverpage[0], {'tag': NS.xhtml.link,
                                 'type': 'image/jpeg;type=resource', 'rel': 'coverpage'}
        else:
            for field in doc.traverse (nodes.field):
                field_name, field_body = field.children
                if field_name.astext () == 'coverpage':
                    yield field_body.astext (), {
                        'tag': NS.xhtml.link,
                        'type': 'image/jpeg;type=resource',
                        'rel': 'coverpage'}
                    break

        # need broken.png for no-images build
        if not self.options.want_images:
            yield (urlparse.urljoin (self.url, broken),
                   {'tag': NS.xhtml.img, 'type': 'image/png;type=resource', 'rel': 'broken'})

        for node in doc.traverse (nodes.reference):
            if 'uri' in node:
                yield node['uri'], {'tag': NS.xhtml.a}

        if self.options.want_images:
            for node in doc.traverse (nodes.image):
                if 'uri' in node:
                    yield node['uri'], {'tag': NS.xhtml.img}

        if self.options.want_images:
            for node in doc.traverse (nodes.pending):
                # dropcap images
                if 'image' in node.details:
                    yield node.details['image'], {'tag': NS.xhtml.img}


    def get_settings (self, components, defaults):
        """ Build a docutils settings object for the given components. """
        option_parser = frontend.OptionParser (
            components = components,
            defaults = defaults,
            read_config_files = 1)
        return option_parser.get_default_values ()


    def pre_parse (self):
        """ Parse a RST file as link list.

        Runs a reader/parser pass (no writer) so that iterlinks () and
        rewrite_links () can walk the doctree. """

        debug ("RSTParser: Pre-parsing %s" % self.url)

        default_style = self.get_resource (
            'mydocutils.parsers', 'default_style.rst').decode ('utf-8')

        source = io.StringInput (default_style + self.unicode_content ())
        reader = docutils.readers.standalone.Reader ()
        parser = gutenberg_parsers.Parser ()

        overrides = {
            'get_resource': self.get_resource,
            'get_image_size': self.get_image_size_from_parser,
            'no_images': not self.options.want_images,
            'base_url': self.url,
            }

        doc = reader.read (
            source, parser, self.get_settings ((reader, parser), overrides))
        self.document1 = doc

        # make all doctree links absolute
        self.rewrite_links (partial (urlparse.urljoin, self.url))

        debug ("RSTParser: Done pre-parsing %s" % self.url)


    def _full_parse (self, writer, overrides):
        """ Full parse from scratch.

        Re-reads the source, applies all transforms and returns the
        transformed doctree ready to be handed to the writer. """

        debug ("RSTParser: Full-parsing %s" % self.url)

        default_style = self.get_resource (
            'mydocutils.parsers', 'default_style.rst').decode ('utf-8')

        source = io.StringInput (default_style + self.unicode_content (),
                                 self.url, 'unicode')
        reader = docutils.readers.standalone.Reader ()
        parser = gutenberg_parsers.Parser ()

        doc = reader.read (
            source, parser,
            self.get_settings ((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links (partial (urlparse.urljoin, self.url))

        doc.transformer.populate_from_components ((source, reader, parser, writer))
        doc.transformer.apply_transforms ()
        debug ("RSTParser: Done full-parsing %s" % self.url)

        return doc


    def _full_parse_2 (self, writer, destination, overrides):
        """ Full parser from pickled doctree.

        Doesn't work yet. It turned out pickling a doctree is much
        harder than I thought. """

        debug ("Full-parsing %s" % self.url)

        source = io.StringInput (self.unicode_content ())
        reader = docutils.readers.standalone.Reader ()
        parser = gutenberg_parsers.Parser ()

        doc = reader.read (
            source, parser,
            self.get_settings ((reader, parser, writer), overrides))
        self.document1 = doc

        self.rewrite_links (partial (urlparse.urljoin, self.url))

        # make it picklable: reporter, transformer and settings hold
        # unpicklable state and are stashed aside / restored below
        reporter = doc.reporter # = None
        # doc.reporter = None
        transformer = doc.transformer
        doc.settings = None
        from docutils.parsers.rst.directives.html import MetaBody

        #for metanode in doc.traverse (MetaBody.meta):
        for pending in doc.traverse (nodes.pending):
            # pending.transform = None
            # docutils' meta nodes aren't picklable because the class is nested
            # in pending['nodes']
            if 'nodes' in pending.details:
                if isinstance (pending.details['nodes'][0], MetaBody.meta):
                    pending.details['nodes'][0].__class__ = mynodes.meta
        import cPickle as pickle
        pickled = pickle.dumps (doc)

        doc = pickle.loads (pickled)

        #doc.transformer.populate_from_components (
        #    (source, reader, parser, writer))

        doc.transformer = transformer
        doc.reporter = reporter
        doc.settings = self.get_settings ((reader, parser, writer), overrides)

        doc.transformer.apply_transforms ()

        return writer.write (doc, destination)


    def rst2nroff (self, charset = 'utf-8'):
        """ Convert RST to nroff. """

        writer = gutenberg_nroff.Writer ()
        destination = io.StringOutput (encoding = 'unicode')

        overrides = {
            'doctitle_xform': 1,
            'sectsubtitle_xform': 1,
            'footnote_references': 'superscript',
            'compact_lists': 1,
            'compact_simple': 1,
            'page_numbers': 1,
            'no_images': True,
            'get_resource': self.get_resource,
            'format': options.type,
            'encoding': charset,
            'base_url': self.url,
            }

        doc = self._full_parse (writer, overrides)
        return writer.write (doc, destination)


    def rst2xetex (self):
        """ Convert RST to xetex. """

        writer = xetex.Writer ()
        destination = io.StringOutput (encoding = 'unicode')

        overrides = {
            'doctitle_xform': 1,
            'sectsubtitle_xform': 1,
            'footnote_references': 'superscript',
            'compact_lists': 1,
            'compact_simple': 1,
            'page_numbers': 1,
            'format': options.type,
            'encoding': 'utf-8',
            'get_resource': self.get_resource,
            'get_image_size': self.get_image_size_from_parser,
            'no_images': not self.options.want_images,
            'base_url': self.url,
            }

        doc = self._full_parse (writer, overrides)
        return writer.write (doc, destination)


    def rst2htmlish (self, writer, more_overrides = None):
        """ Convert RST to an html-ish format using the given writer.

        more_overrides defaults to None (not a mutable {}) to avoid the
        shared-default-argument pitfall; passing a dict still works as
        before. """

        destination = io.StringOutput (encoding = 'unicode')

        overrides = {
            'stylesheet': None,
            'stylesheet_path': None,
            'xml_declaration': 0,
            'doctitle_xform': 1,
            'initial_header_level': 2,
            'sectsubtitle_xform': 1,
            'footnote_references': 'superscript',
            'page_numbers': 1,
            'format': options.type,
            'encoding': 'utf-8',
            'get_resource': self.get_resource,
            'get_image_size': self.get_image_size_from_parser,
            'no_images': not self.options.want_images,
            'base_url': self.url,
            }
        if more_overrides:
            overrides.update (more_overrides)

        doc = self._full_parse (writer, overrides)
        return writer.fixup_xhtml (self.to_xhtml (writer.write (doc, destination), self.url))


    def rst2html (self):
        """ Convert RST input to HTML output. """
        return self.rst2htmlish (xhtml1.Writer ())


    def rst2epub2 (self):
        """ Convert RST input to HTML output with Epub2 tweaks. """
        return self.rst2htmlish (epub2.Writer (),
                                 { 'toc_backlinks': 'none' })


    def get_resource (self, package, resource):
        """ Load a data file shipped inside the epubmaker package. """
        return (resource_string ('epubmaker.' + package, resource))


    def get_image_size_from_parser (self, uri):
        """ Return (width, height) of the image at uri, or None. """
        # debug ("Getting image dimen for %s" % uri)
        parser = ParserFactory.ParserFactory.create (uri, {})
        parser.pre_parse ()
        if hasattr (parser, 'get_image_dimen'):
            return parser.get_image_dimen ()
        return None


    def get_charset_from_rstheader (self):
        """ Parse text for hints about charset.

        Looks for an emacs file-variables comment, e.g.:
        .. -*- coding: utf-8 -*- """

        charset = None
        rst = self.bytes_content ()

        match = RE_EMACS_CHARSET.search (rst)
        if match:
            charset = match.group (1)
            debug ('Got charset %s from emacs comment' % charset)

        return charset


    def parse (self):
        """ Dummy. Use rst2* instead. """

        debug ("Done parsing %s" % self.url)
405 |
--------------------------------------------------------------------------------
/epubmaker/parsers/broken.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/parsers/broken.png
--------------------------------------------------------------------------------
/epubmaker/writers/HTMLWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | HTMLWriter.py
7 |
8 | Copyright 2009 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Writes an HTML file
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import os
19 | import copy
20 |
21 | from lxml import etree
22 | from pkg_resources import resource_string # pylint: disable=E0611
23 |
24 | import epubmaker.lib.GutenbergGlobals as gg
25 | from epubmaker.lib.GutenbergGlobals import xpath
26 | from epubmaker.lib.Logger import info, debug, error, exception
27 |
28 | from epubmaker import writers
29 | from epubmaker.CommonOptions import Options
30 |
31 | options = Options()
32 |
33 |
class Writer (writers.HTMLishWriter):
    """ Class for writing HTML files. """


    def add_dublincore (self, tree):
        """ Add dublin core metadata to the html <head>. """
        source = gg.archive2files (
            self.options.ebook, self.options.candidate.filename)

        # rewrite the local file path into the public PG url
        if hasattr (options.config, 'FILESDIR'):
            self.options.dc.source = source.replace (options.config.FILESDIR, options.config.PGURL)

        for head in xpath (tree, '//xhtml:head'):
            for e in self.options.dc.to_html ():
                e.tail = '\n'
                head.append (e)


    def build (self):
        """ Build HTML file. """

        htmlfilename = os.path.join (self.options.outputdir,
                                     self.options.outputfile)
        # remove output of a previous run
        try:
            os.remove (htmlfilename)
        except OSError:
            pass

        try:
            info ("Creating HTML file: %s" % htmlfilename)

            for p in self.spider.parsers:
                # Do html only. The images were copied earlier by PicsDirWriter.

                xhtml = None
                if hasattr (p, 'rst2html'):
                    xhtml = p.rst2html ()
                elif hasattr (p, 'xhtml'):
                    p.parse ()
                    xhtml = copy.deepcopy (p.xhtml)

                if xhtml is not None:
                    self.make_links_relative (xhtml, p.url)

                    self.add_dublincore (xhtml)

                    # makes iphones zoom in
                    self.add_meta (xhtml, 'viewport', 'width=device-width')
                    self.add_meta_generator (xhtml)

                    # This writer has currently to deal only with RST
                    # input. The RST writer has a workaround that
                    # avoids writing empty elements. So we don't need
                    # the same ugly workaround as the EPUB writer,
                    # that has to deal with HTML input too.
                    html = etree.tostring (xhtml,
                                           method = 'xml',
                                           doctype = gg.XHTML_DOCTYPE,
                                           encoding = 'utf-8',
                                           pretty_print = True,
                                           xml_declaration = True)

                    self.write_with_crlf (htmlfilename, html)

            # self.copy_aux_files (self.options.outputdir)

            info ("Done HTML file: %s" % htmlfilename)

        except StandardError as what:
            exception ("Error building HTML %s: %s" % (htmlfilename, what))
            if os.access (htmlfilename, os.W_OK):
                os.remove (htmlfilename)
            # bare raise keeps the original traceback ('raise what' would
            # re-raise from here and lose it)
            raise
107 |
108 |
109 |
--------------------------------------------------------------------------------
/epubmaker/writers/KindleWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | KindleWriter.py
7 |
8 | Copyright 2009-2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | """
13 |
14 | import re
15 | import os
16 | import subprocess
17 |
18 | from epubmaker.lib.Logger import info, debug, warn, error
19 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
20 | from epubmaker.writers import EpubWriter
21 | from epubmaker.CommonOptions import Options
22 |
23 | options = Options()
24 |
25 |
class Writer (EpubWriter.Writer):
    """ Class for writing kindle files. """


    def parse (self, options):
        """ Standard parse.

        Only setup is needed: the kindle is generated from the already
        built epub file, so no spidering is done here. """
        self.setup (options)


    def build (self):
        """ Build kindle file by running kindlegen on the epub. """

        # Build a special temporary epub file for kindlegen input.
        # This file is a valid epub but contains strongly simplified HTML.

        # Much unnecessary juggling of files here because
        # brain-dead kindlegen doesn't understand unix pipes
        # and can only output in current directory.
        # Furthermore we must not conflict with the filenames
        # of the other generated epub files.

        kindle_filename = self.options.outputfile
        epub_filename = self.options.epub_filename

        info ("Creating Kindle file: %s" % os.path.join (
            self.options.outputdir, kindle_filename))
        info ("            ... from: %s" % os.path.join (
            self.options.outputdir, epub_filename))

        try:
            cwd = os.getcwd ()
            os.chdir (self.options.outputdir)

            kindlegen = subprocess.Popen (
                [options.config.MOBIGEN, '-o', os.path.basename (kindle_filename), epub_filename],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        except OSError as what:
            # kindlegen binary not found / not executable
            os.chdir (cwd)
            error ("KindleWriter: %s %s" % (options.config.MOBIGEN, what))
            raise SkipOutputFormat

        (stdout, stderr) = kindlegen.communicate ('')

        os.chdir (cwd)

        # kindlegen prefixes each message with its severity,
        # e.g. "Warning(prcgen): ..." -- raw string avoids the
        # non-raw '\w' escape the original pattern relied on
        regex = re.compile (r'^(\w+)\(prcgen\):')

        if kindlegen.returncode > 0:
            # pylint: disable=E1103
            info (stderr.rstrip ())
            msg = stdout.rstrip ()
            for line in msg.splitlines ():
                match = regex.match (line)
                if match:
                    sline = regex.sub ("", line)
                    g = match.group (1).lower ()
                    if g == 'info':
                        if sline == 'MOBI File generated with WARNINGS!':
                            # we knew that already
                            continue
                        # info ("kindlegen: %s" % sline)
                    elif g == 'warning':
                        # suppress cover warnings: covers are handled elsewhere
                        if sline.startswith ('Cover is too small'):
                            continue
                        if sline == 'Cover not specified':
                            continue
                        warn ("kindlegen: %s" % sline)
                    elif g == 'error':
                        error ("kindlegen: %s" % sline)
                    else:
                        error (line)

        info ("Done Kindle file: %s" % os.path.join (
            self.options.outputdir, kindle_filename))
121 |
122 |
--------------------------------------------------------------------------------
/epubmaker/writers/PDFWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | PDFWriter.py
6 |
7 | Copyright 2011 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Convert RST to PDF.
12 |
13 | """
14 |
15 | from __future__ import with_statement
16 |
17 | import os
18 | import subprocess
19 |
20 | from epubmaker.lib.Logger import debug, info, warn, error
21 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
22 |
23 | from epubmaker import ParserFactory
24 | from epubmaker import writers
25 | from epubmaker.CommonOptions import Options
26 |
27 | options = Options()
28 |
class Writer (writers.BaseWriter):
    """ Class to write PDF.

    Converts the RST master to xetex source, then shells out to
    xelatex to produce the PDF. """

    def build (self):
        """ Build PDF file.

        Raises SkipOutputFormat when the input is not RST or the
        xelatex binary cannot be run. """

        inputfilename  = self.options.candidate.filename
        outputfilename = os.path.join (self.options.outputdir, self.options.outputfile)

        debug ("Inputfile: %s" % inputfilename)
        info ("Creating PDF file: %s" % outputfilename)

        parser = ParserFactory.ParserFactory.create (inputfilename,
                                                     self.options.candidate.mediatype)
        parser.options = self.options

        # only the RST parser can produce xetex
        if not hasattr (parser, 'rst2xetex'):
            error ('PDFWriter can only work on a RSTParser.')
            raise SkipOutputFormat

        # Brain-dead xetex doesn't understand unix pipes
        # so we have to write a temp file

        texfilename = os.path.splitext (outputfilename)[0] + '.tex'
        auxfilename = os.path.splitext (outputfilename)[0] + '.aux'
        logfilename = os.path.splitext (outputfilename)[0] + '.log'

        # a stale .aux file would confuse xelatex
        try:
            os.remove (auxfilename)
        except OSError:
            pass

        tex = parser.rst2xetex ()
        with open (texfilename, 'w') as fp:
            fp.write (tex.encode ('utf-8'))

        try:
            cwd = os.getcwd ()
            os.chdir (self.options.outputdir)

            _xetex = subprocess.Popen ([options.config.XELATEX,
                                        "-output-directory", self.options.outputdir,
                                        "-interaction", "nonstopmode",
                                        texfilename],
                                       stdin = subprocess.PIPE,
                                       stdout = subprocess.PIPE,
                                       stderr = subprocess.PIPE)
        except OSError, what:
            # xelatex binary not found / not executable
            os.chdir (cwd)
            error ("PDFWriter: %s %s" % (options.config.XELATEX, what))
            raise SkipOutputFormat

        (dummy_stdout, dummy_stderr) = _xetex.communicate ()

        # scan the xetex log for errors and warnings
        with open (logfilename) as fp:
            for line in fp:
                line = line.strip ()
                if 'Error:' in line:
                    error ("xetex: %s" % line)
                if options.verbose >= 1:
                    if 'Warning:' in line:
                        warn ("xetex: %s" % line)

        # keep the intermediate files only in very verbose (debug) runs
        if options.verbose < 2:
            try:
                os.remove (texfilename)
                os.remove (logfilename)
                os.remove (auxfilename)
            except OSError:
                pass

        os.chdir (cwd)

        info ("Done PDF file: %s" % outputfilename)
103 |
104 |
105 |
--------------------------------------------------------------------------------
/epubmaker/writers/PicsDirWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | PicsDirWriter.py
7 |
8 | Copyright 2012 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Copies pics into local directory. Needed for HTML and Xetex.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import os
19 | import copy
20 |
21 | from lxml import etree
22 | from pkg_resources import resource_string # pylint: disable=E0611
23 |
24 | import epubmaker.lib.GutenbergGlobals as gg
25 | from epubmaker.lib.GutenbergGlobals import xpath
26 | from epubmaker.lib.Logger import info, debug, error, exception
27 |
28 | from epubmaker import writers
29 |
30 |
class Writer (writers.BaseWriter):
    """ Writes Pics directory. """


    def copy_aux_files (self, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers. """

        for p in self.spider.parsers:
            # only image parsers have a resize_image method
            if hasattr (p, 'resize_image'):
                src_uri = p.url
                fn_dest = gg.make_url_relative (self.options.base_url, src_uri)
                fn_dest = os.path.join (dest_dir, fn_dest)

                # don't copy a file onto itself
                if gg.is_same_path (src_uri, fn_dest):
                    debug ('Not copying %s to %s: same file' % (src_uri, fn_dest))
                    continue
                debug ('Copying %s to %s' % (src_uri, fn_dest))

                fn_dest = gg.normalize_path (fn_dest)
                gg.mkdir_for_filename (fn_dest)
                try:
                    with open (fn_dest, 'wb') as fp_dest:
                        fp_dest.write (p.serialize ())
                except IOError as what:
                    # best-effort: log and continue with the next image
                    error ('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))


    def build (self):
        """ Build Pics directory. """

        # renamed from 'dir' to avoid shadowing the builtin
        outdir = self.options.outputdir

        info ("Creating Pics directory in: %s" % outdir)

        self.copy_aux_files (outdir)

        info ("Done Pics directory in: %s" % outdir)
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/epubmaker/writers/RSTWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 | RSTWriter.py
6 |
7 | Copyright 2009 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Build an RST file. This is just the master RST with the PG license mixed in.
12 |
13 | """
14 |
15 | from __future__ import with_statement
16 |
17 | import os
18 |
19 | from epubmaker.lib.Logger import debug, info, error
20 | from epubmaker import ParserFactory
21 | from epubmaker import writers
22 |
class Writer (writers.BaseWriter):
    """ Class to write a reStructuredText. """

    def build (self):
        """ Build RST file.

        Writes the preprocessed master RST, recoded to UTF-8. """

        outfile = os.path.join (self.options.outputdir, self.options.outputfile)

        info ("Creating RST file: %s" % outfile)

        parser = ParserFactory.ParserFactory.create (
            self.options.candidate.filename,
            self.options.candidate.mediatype)
        parser.options = self.options

        # anything but a RSTParser cannot produce this format
        if not hasattr (parser, 'rst2nroff'):
            error ('RSTWriter can only work on a RSTParser.')
            return

        self.write_with_crlf (outfile, parser.preprocess ('utf-8').encode ('utf-8'))

        info ("Done RST file: %s" % outfile)
46 |
47 |
48 |
--------------------------------------------------------------------------------
/epubmaker/writers/TxtWriter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
3 |
4 | """
5 | TxtWriter.py
6 |
7 | Copyright 2009 by Marcello Perathoner
8 |
9 | Distributable under the GNU General Public License Version 3 or newer.
10 |
11 | Build an UTF-8-encoded PG plain text file. This is just the plain text
12 | version recoded into UTF-8.
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | import os
19 | import subprocess
20 |
21 | from epubmaker.lib.Logger import debug, info, warn, error
22 | from epubmaker.lib.GutenbergGlobals import SkipOutputFormat
23 |
24 | from epubmaker import ParserFactory
25 | from epubmaker import writers
26 | from epubmaker.CommonOptions import Options
27 |
options = Options()  # shared singleton holding the command-line options

# map some not-widely-supported characters to more common ones
# (ordinal -> replacement, as used by unicode.translate ())
u2u = {
    0x2010: u'-', # unicode HYPHEN to HYPHEN-MINUS. Many Windows fonts lack this.
    }
34 |
class Writer (writers.BaseWriter):
    """ Class to write PG plain text. """

    def groff (self, nroff, encoding = 'utf-8'):
        """ Process thru groff.

        Takes and returns unicode strings!

        """

        # map MIME charset names onto groff device names;
        # raises KeyError for any other encoding
        device = { 'utf-8': 'utf8',
                   'iso-8859-1': 'latin1',
                   'us-ascii': 'ascii' }[encoding]

        # groff is fed bytes on stdin
        nroff = nroff.encode (encoding)
        nrofffilename = os.path.join (
            self.options.outputdir,
            os.path.splitext (self.options.outputfile)[0] + '.nroff')

        # write nroff file for debugging
        if options.verbose >= 2:
            with open (nrofffilename, 'w') as fp:
                fp.write (nroff)
        else:
            try:
                # remove debug files from previous runs
                os.remove (nrofffilename)
            except OSError:
                pass

        # call groff
        try:
            _groff = subprocess.Popen ([options.config.GROFF,
                                        "-t",            # preprocess with tbl
                                        "-K", device,    # input encoding
                                        "-T", device],   # output device
                                       stdin = subprocess.PIPE,
                                       stdout = subprocess.PIPE,
                                       stderr = subprocess.PIPE)
        except OSError:
            # groff binary not found / not executable
            error ("TxtWriter: executable not found: %s" % options.config.GROFF)
            raise SkipOutputFormat

        (txt, stderr) = _groff.communicate (nroff)

        # relay groff diagnostics into our log
        # pylint: disable=E1103
        for line in stderr.splitlines ():
            line = line.strip ()
            if 'error' in line:
                error ("groff: %s" % line)
            elif 'warn' in line:
                if options.verbose >= 1:
                    warn ("groff: %s" % line)

        txt = txt.decode (encoding)
        return txt.translate (u2u) # fix nroff idiosyncracies


    def build (self):
        """ Build TXT file.

        Output charset is taken from the format subtype; the file is
        encoded with the 'unitame' error handler so untranslatable
        characters degrade gracefully. """

        filename = os.path.join (self.options.outputdir, self.options.outputfile)

        # e.g. subtype '.utf-8' -> encoding 'utf-8'
        encoding = options.subtype.strip ('.')

        info ("Creating plain text file: %s" % filename)

        parser = ParserFactory.ParserFactory.create (self.options.candidate.filename,
                                                     self.options.candidate.mediatype)
        parser.options = self.options

        # RST input goes thru nroff/groff, everything else is recoded as-is
        if hasattr (parser, 'rst2nroff'):
            data = self.groff (parser.rst2nroff (encoding), encoding)
        else:
            data = parser.unicode_content ()

        # utf-8 output gets a BOM ('utf_8_sig') as PG convention
        data = data.encode ('utf_8_sig' if encoding == 'utf-8' else encoding, 'unitame')

        self.write_with_crlf (filename, data)

        info ("Done plain text file: %s" % filename)
116 |
117 |
118 |
--------------------------------------------------------------------------------
/epubmaker/writers/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-
3 |
4 | """
5 |
6 | Writer package
7 |
8 | Copyright 2009-2010 by Marcello Perathoner
9 |
10 | Distributable under the GNU General Public License Version 3 or newer.
11 |
12 | Base classes for *Writer modules. (EpubWriter, PluckerWriter, ...)
13 |
14 | """
15 |
16 | from __future__ import with_statement
17 |
18 | from functools import partial
19 | import os.path
20 | import urllib
21 |
22 | from lxml import etree
23 | from lxml.builder import ElementMaker
24 |
25 | from epubmaker.lib.Logger import debug, error
26 | import epubmaker.lib.GutenbergGlobals as gg
27 | from epubmaker.lib import MediaTypes
28 |
29 | from epubmaker import ParserFactory
30 | from epubmaker import Spider
31 | from epubmaker.Version import VERSION, GENERATOR
32 |
33 |
class BaseWriter (object):
    """
    Base class for EpubWriter, PluckerWriter, ...

    also used as /dev/null writer for debugging

    """

    def __init__ (self):
        # command-line options; set by setup ()
        self.options = None
        # Spider instance holding the parsed file set; set by parse ()
        self.spider = None


    def setup (self, options):
        """ override this in a real writer

        put computationally cheap setup stuff in here,

        """

        # default to all text, aux and image media types
        if not options.include_mediatypes:
            options.include_mediatypes = (
                MediaTypes.TEXT_MEDIATYPES |
                MediaTypes.AUX_MEDIATYPES |
                MediaTypes.IMAGE_MEDIATYPES
            )

        self.options = options


    def parse (self, options):
        """ Standard parse.

        Spiders the candidate file and rewrites the candidate filename
        and base_url to wherever the spider was redirected. """
        self.setup (options)

        if self.spider is None:
            self.spider = Spider.Spider ()

        self.spider.parse (options.candidate.filename,
                           options.candidate.mediatype,
                           options)

        # the spider may have followed redirects; keep options in sync
        options.candidate.filename = self.spider.redirect (options.candidate.filename)
        options.base_url = options.candidate.filename


    def build (self):
        """ override this in a real writer """
        pass


    @staticmethod
    def write_with_crlf (filename, data):
        """ Write data to filename, normalizing line ends to CRLF. """
        # \r\n is PG standard
        data = '\r\n'.join (data.splitlines ()) + '\r\n'

        # open binary so windows doesn't add another \r
        with open (filename, 'wb') as fp:
            fp.write (data)


    def validate (self): # pylint: disable=R0201
        """ Validate the output with some (external) tool.

        Override this in a real writer.

        """
        return 0


    def sync (self):
        """ Override this if you need to sync before program exit. """
        pass


    def make_links_relative (self, xhtml, base_url):
        """ Make absolute links in xhtml relative to base_url. """

        debug ("Making links relative to: %s" % base_url)
        xhtml.rewrite_links (partial (gg.make_url_relative, base_url))


    def get_aux_file_list (self):
        """ Iterate over image files. Return absolute urls. """

        # only image parsers have a resize_image method
        for p in self.spider.parsers:
            if hasattr (p, 'resize_image'):
                yield p.url
121 |
122 |
# element factory bound to the XHTML namespace, used below to build
# <meta>, <style> and <link> tags
_XHTML_NS = str (gg.NS.xhtml)
em = ElementMaker (namespace = _XHTML_NS, nsmap = { None: _XHTML_NS })
125 |
126 |
class HTMLishWriter (BaseWriter):
    """ Common base for writers that emit HTML-like markup. """

    @staticmethod
    def add_class (elem, class_):
        """ Append a class token to the element's class attribute. """

        tokens = elem.get ('class', '').split ()
        tokens.append (class_)
        elem.set ('class', ' '.join (tokens))


    @staticmethod
    def add_meta (xhtml, name, content):
        """ Append a <meta> tag to every <head> in the document. """

        for head_elem in gg.xpath (xhtml, '//xhtml:head'):
            meta = em.meta (name = name, content = content)
            meta.tail = '\n'
            head_elem.append (meta)


    @staticmethod
    def add_meta_generator (xhtml):
        """ Stamp the document with our generator meta tag. """

        HTMLishWriter.add_meta (xhtml, 'generator', GENERATOR % VERSION)


    @staticmethod
    def add_internal_css (xhtml, css_as_string):
        """ Embed a stylesheet into every <head> of the document. """

        if not css_as_string or xhtml is None:
            return

        css_as_string = '\n' + css_as_string.strip (' \n') + '\n'
        for head_elem in gg.xpath (xhtml, '//xhtml:head'):
            style = em.style (css_as_string, type = 'text/css')
            style.tail = '\n'
            head_elem.append (style)


    def add_external_css (self, xhtml, css_as_string, url):
        """ Register an external stylesheet and link it from the document. """

        if css_as_string:
            # make the stylesheet known to the spider so it gets packaged
            parser = ParserFactory.ParserFactory.get ('text/css')
            parser.parse_string (css_as_string)
            parser.url = url
            self.spider.parsers.append (parser)

        if xhtml is not None:
            for head_elem in gg.xpath (xhtml, '//xhtml:head'):
                link = em.link (href = url, rel = 'stylesheet', type = 'text/css')
                link.tail = '\n'
                head_elem.append (link)
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
--------------------------------------------------------------------------------
/epubmaker/writers/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gitenberg-dev/pg-epubmaker/9a982bab100518aea7582e3e570f5edc74a5fa0d/epubmaker/writers/cover.jpg
--------------------------------------------------------------------------------
/scripts/epubmaker:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-

"""

epubmaker script

Copyright 2014 by Marcello Perathoner

Distributable under the GNU General Public License Version 3 or newer.

This script starts epubmaker.

"""

from epubmaker import EpubMaker

# Guard the entry point so that merely importing this file (e.g. by a
# tool scanning installed scripts) does not start a conversion run.
if __name__ == '__main__':
    EpubMaker.main ()
19 |
20 |
--------------------------------------------------------------------------------
/scripts/rhyme_compiler:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- mode: python; indent-tabs-mode: nil; -*- coding: iso-8859-1 -*-

"""

rhyme_compiler.py

Copyright 2009 by Marcello Perathoner

Distributable under the GNU General Public License Version 3 or newer.

This module produces a dbm file of rhyme stems.

We use a very naive concept of rhyme: we preprocess the 'CMU
Pronouncing Dictionary' (found at
http://www.speech.cs.cmu.edu/cgi-bin/cmudict) and extract the phonemes
for each word from the last stressed one to the end of the word.

The result is stored in cmudict.db hashed by word.

To compile:

$ ./rhyme_compiler.py cmudict.0.7a


"""

import fileinput
import re
import gdbm   # NOTE(review): Python 2 only -- on Python 3 this is dbm.gnu

# 'n' = always create a new database, 'f' = fast mode (writes are not
# synced until dbm.sync () below)
dbm = gdbm.open ('cmudict.db', 'nf')

# matches from the last phoneme carrying primary (1) or secondary (2)
# stress through the end of the line
RE_STRESSED = re.compile ('[a-z]+[12][^12]*$')

# two example lines from cmudict
#
# PRONUNCIATION P R OW0 N AH2 N S IY0 EY1 SH AH0 N
# PRONUNCIATION(1) P R AH0 N AH2 N S IY0 EY1 SH AH0 N

for line in fileinput.input (openhook = fileinput.hook_encoded ("iso-8859-1")):
    if line.startswith (';'):
        # comment line in cmudict
        continue

    # split "WORD  PHONEME PHONEME ..." at the first space
    word, dummy_sep, phonemes = line.lower ().partition (' ')

    m = RE_STRESSED.search (phonemes)
    if m:
        # collapse runs of spaces and stress digits (0/1/2) into single
        # '-' separators, e.g. "ey1 sh ah0 n" -> "ey-sh-ah-n"
        phoneme = re.sub (r'[ 012]+', '-', m.group (0))
        dbm[word.encode ('utf-8')] = phoneme.encode ('utf-8')

        # print "%s %s\n" % (word, dbm[word])

dbm.sync ()
dbm.reorganize ()
dbm.close ()
57 |
58 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 |
3 | [bdist_wininst]
4 | plat-name: win32
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#
# pypi epubmaker setup
#

from setuptools import setup

# Import the shared definitions explicitly rather than with ``import *``
# so every name this script relies on is spelled out here and can be
# checked by linters.  All of these are defined in setup_inc.py.
from setup_inc import (
    VERSION,
    install_requires,
    package_dir,
    pypi_packages,
    pypi_py_modules,
    pypi_package_data,
    pypi_scripts,
    pypi_data_files,
    author,
    author_email,
    description,
    long_description,
    license,
    keywords,
    url,
    classifiers,
    platforms,
)

setup (
    name = 'epubmaker',
    version = VERSION,
    install_requires = install_requires,
    package_dir = package_dir,
    packages = pypi_packages,
    py_modules = pypi_py_modules,
    package_data = pypi_package_data,
    scripts = pypi_scripts,
    data_files = pypi_data_files,

    # metadata for upload to PyPI

    author = author,
    author_email = author_email,
    description = description,
    long_description = long_description,
    license = license,
    keywords = keywords,
    url = url,
    classifiers = classifiers,
    platforms = platforms,
    )
--------------------------------------------------------------------------------
/setup_inc.py:
--------------------------------------------------------------------------------
#
# epubmaker common setup all flavors
#

VERSION = '0.3.26'

package_dir = {
    'epubmaker': 'epubmaker',
    }

install_requires = [
    'roman',
    'docutils >= 0.8.1, <0.13',
    'lxml >= 2.3',
    'cssutils >= 0.9.8a1',
    'pillow',
    ]


pypi_packages = [
    'epubmaker.parsers',
    'epubmaker.packagers',
    'epubmaker.writers',
    'epubmaker.mydocutils',
    'epubmaker.mydocutils.parsers',
    'epubmaker.mydocutils.transforms',
    'epubmaker.mydocutils.writers',
    'epubmaker.mydocutils.gutenberg',
    'epubmaker.mydocutils.gutenberg.parsers',
    'epubmaker.mydocutils.gutenberg.transforms',
    'epubmaker.mydocutils.gutenberg.writers',
    ]

ibiblio_packages = pypi_packages + [
    'epubmaker',
    'epubmaker.lib',
    'epubmaker.writers.ibiblio',
    ]

pypi_py_modules = [
    'epubmaker.CommonOptions',
    'epubmaker.EpubMaker',
    'epubmaker.HTMLChunker',
    'epubmaker.ParserFactory',
    'epubmaker.Spider',
    'epubmaker.Unitame',
    'epubmaker.UnitameData',
    'epubmaker.Version',

    'epubmaker.lib.DublinCore',
    'epubmaker.lib.GutenbergGlobals',
    'epubmaker.lib.Logger',
    'epubmaker.lib.MediaTypes',

    'epubmaker.WriterFactory',
    ]

pypi_package_data = {
    'epubmaker.parsers': ['broken.png'],
    'epubmaker.writers': ['cover.jpg'],
    'epubmaker.mydocutils.parsers': ['*.rst'],
    'epubmaker.mydocutils.writers': ['*.css'],
    'epubmaker.mydocutils.gutenberg.parsers': ['*.rst'],
    }

# BUG FIX: start from a copy.  The previous code aliased
# pypi_package_data and then update()d it in place, so the pypi
# distribution silently picked up the ibiblio-only skeleton entry too.
ibiblio_package_data = dict (pypi_package_data)
ibiblio_package_data.update ({
    'epubmaker.writers.ibiblio': ['qioo-skeleton.zip'],
    })

pypi_data_files = [
    ('', ['CHANGES', 'setup_inc.py']),
    ]

ibiblio_data_files = [
    ('epubmaker', ['CHANGES', 'setup_inc.py']),
    ]

pypi_scripts = [
    'scripts/epubmaker',
    'scripts/rhyme_compiler',
    ]

ibiblio_scripts = pypi_scripts + [
    'scripts/makepub',
    'scripts/convert_unitame',
    'scripts/update_facebook_auth',
    ]

# metadata for upload to PyPI

author = "Marcello Perathoner"
author_email = "webmaster@gutenberg.org"
description = "The Project Gutenberg tool to generate EPUBs and other ebook formats."

# read README eagerly and close the handle instead of leaking it
with open ('README') as readme:
    long_description = readme.read ()

license = "GPL v3"
keywords = "ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion"
url = "https://github.com/gitenberg-dev/pg-epubmaker"

classifiers = [
    "Topic :: Text Processing",
    "License :: OSI Approved :: GNU General Public License (GPL)",
    "Environment :: Console",
    "Operating System :: OS Independent",
    "Intended Audience :: Other Audience",
    "Development Status :: 4 - Beta"
    ]

platforms = 'OS-independent'
110 |
111 |
--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
# NOTE(review): this sample appears mangled in this copy -- the XML markup
# inside the triple-quoted string and the XHTML 1.1 DOCTYPE string look
# stripped of their angle-bracket content.  Restore from the upstream
# repository before relying on this test.
from lxml import etree

root = etree.fromstring ("""



black



""")

XHTML11_DOCTYPE = ""

# serialize with an xml declaration and an explicit doctype
print (etree.tostring (
    root,
    method = 'xml',
    xml_declaration = True,
    doctype = XHTML11_DOCTYPE,
    encoding = 'utf-8',
    pretty_print = True))
23 |
--------------------------------------------------------------------------------