├── tests
    ├── __init__.py
    ├── cache
    │   └── epub
    │   │   └── test
    ├── files
    │   └── 43172
    │   │   ├── images
    │   │       └── image.jpg
    │   │   └── 43172-h
    │   │       ├── music
    │   │           └── test.mp3
    │   │       └── images
    │   │           ├── image.jpg
    │   │           └── mathex.jpg
    ├── test_rst.py
    ├── test_txt.py
    ├── test_html.py
    ├── test_templates.py
    ├── test_job.py
    ├── test_setup.py
    └── test_htm.py
├── src
    └── ebookmaker
    │   ├── __init__.py
    │   ├── mydocutils
    │       ├── __init__.py
    │       ├── gutenberg
    │       │   ├── __init__.py
    │       │   ├── writers
    │       │   │   ├── __init__.py
    │       │   │   └── nroff.py
    │       │   ├── parsers
    │       │   │   ├── pg-header.rst
    │       │   │   └── __init__.py
    │       │   └── transforms
    │       │   │   └── __init__.py
    │       ├── transforms
    │       │   └── __init__.py
    │       ├── writers
    │       │   ├── rst2epub.css
    │       │   ├── rst2html.css
    │       │   ├── epub2.py
    │       │   └── rst2all.css
    │       ├── parsers
    │       │   └── default_style.rst
    │       └── nodes.py
    │   ├── Version.py
    │   ├── parsers
    │       ├── broken.png
    │       ├── AuxParser.py
    │       ├── WrapperParser.py
    │       ├── txt2all.css
    │       ├── ImageParser.py
    │       ├── CSSParser.py
    │       └── boilerplate.py
    │   ├── writers
    │       ├── cover.jpg
    │       ├── RSTWriter.py
    │       ├── PicsDirWriter.py
    │       ├── PDFWriter.py
    │       ├── HtmlTemplates.py
    │       ├── KindleWriter.py
    │       ├── TxtWriter.py
    │       └── __init__.py
    │   ├── packagers
    │       ├── PDFPackager.py
    │       ├── RSTPackager.py
    │       ├── HTMLPackager.py
    │       ├── GzipPackager.py
    │       ├── TxtPackager.py
    │       ├── PushPackager.py
    │       └── __init__.py
    │   ├── WriterFactory.py
    │   ├── utils.py
    │   ├── Unitame.py
    │   ├── ParserFactory.py
    │   ├── UnitameData.py
    │   ├── CommonCode.py
    │   ├── HTMLChunker.py
    │   └── Spider.py
├── pyproject.toml
├── setup.cfg
├── .travis.yml
├── Pipfile
├── scripts
    ├── ebookmaker
    ├── rhyme_compiler
    └── convert_unitame
├── .gitignore
├── ebookmaker.conf
├── docs
    ├── alt-text.md
    ├── images.md
    └── ebookmaker_v0_11.md
├── MANIFEST
├── setup.py
├── USAGE.md
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/cache/epub/test:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/ebookmaker/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package """
2 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/writers/__init__.py:
--------------------------------------------------------------------------------
1 | """ This is a package. """
2 | 


--------------------------------------------------------------------------------
/src/ebookmaker/Version.py:
--------------------------------------------------------------------------------
1 | VERSION = '0.13.8'
2 | GENERATOR = 'Ebookmaker %s by Project Gutenberg'
3 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/broken.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/src/ebookmaker/parsers/broken.png


--------------------------------------------------------------------------------
/src/ebookmaker/writers/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/src/ebookmaker/writers/cover.jpg


--------------------------------------------------------------------------------
/tests/files/43172/images/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/images/image.jpg


--------------------------------------------------------------------------------
/tests/files/43172/43172-h/music/test.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/music/test.mp3


--------------------------------------------------------------------------------
/tests/files/43172/43172-h/images/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/images/image.jpg


--------------------------------------------------------------------------------
/tests/files/43172/43172-h/images/mathex.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/images/mathex.jpg


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # As per https://github.com/pypa/setuptools/blob/main/docs/userguide/quickstart.rst
2 | [build-system]
3 | requires = ["setuptools"]
4 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = ebookmaker
 3 | 
 4 | version = 0.13.8
 5 | 
 6 | [options]
 7 | package_dir=
 8 |     =src
 9 | packages=find:
10 | 
11 | [options.packages.find]
12 | where=src


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python:
 4 |   - '3.6'
 5 | 
 6 | before_install:
 7 |   - sudo apt-get update
 8 | 
 9 | install:
10 |   - 'pip install pipenv'
11 |   - 'pipenv install'
12 | 
13 | script: python setup.py test
14 | 
15 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.python.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [dev-packages]
 7 | pylint = "*"
 8 | 
 9 | [packages]
10 | e1839a8 = {path = ".",editable = true}
11 | libgutenberg = ">=0.10.31"
12 | psycopg2 = "*"
13 | docutils = ">=0.18.1"
14 | html5lib = "*"
15 | cchardet = "==2.2.0a2"
16 | ebookmaker = {file = ".", editable = true}
17 | 


--------------------------------------------------------------------------------
/scripts/ebookmaker:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | ebookmaker script
 7 | 
 8 | Copyright 2014 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This script starts ebookmaker.
13 | 
14 | """
15 | 
16 | from ebookmaker import EbookMaker
17 | 
18 | EbookMaker.main ()
19 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/PDFPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PDFPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a PDF file for PG.
12 | 
13 | """
14 | 
15 | from ebookmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = ''.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for PDF files. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/RSTPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | RSTPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a RST file for PG.
12 | 
13 | """
14 | 
15 | from ebookmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'rst.gen'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package a RST file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/HTMLPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | HTMLPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a HTML file for PG.
12 | 
13 | """
14 | 
15 | from ebookmaker.packagers import HTMLishPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'html.images'.split ()
19 | 
20 | class Packager (HTMLishPackager):
21 |     """ Package a HTML file with its images. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/GzipPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | GzipPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Gzip a file.
12 | 
13 | """
14 | 
15 | from ebookmaker.packagers import OneFileGzipPackager
16 | 
17 | TYPE = 'gzip'
18 | FORMATS = 'rst html.noimages html.images txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileGzipPackager):
21 |     """ Gzip packager. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/TxtPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | TxtPackager.py
 6 | 
 7 | Copyright 2010 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a Txt file for PG.
12 | 
13 | """
14 | 
15 | from ebookmaker.packagers import OneFileZipPackager
16 | 
17 | TYPE = 'ww'
18 | FORMATS = 'txt.us-ascii txt.iso-8859-1 txt.utf-8'.split ()
19 | 
20 | class Packager (OneFileZipPackager):
21 |     """ WW packager for plain text files. """
22 |     pass
23 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/AuxParser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | AuxParser.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Open an url and return raw data.
13 | 
14 | """
15 | 
16 | 
17 | from ebookmaker.parsers import ParserBase
18 | 
19 | mediatypes = ('*/*', )
20 | 
21 | class Parser (ParserBase):
22 |     """ Parser for auxiliary files: keeps the fetched content as-is, without interpreting it. """
23 |     auxparser = True # class-level marker; presumably lets callers recognize aux parsers -- TODO confirm
24 |     def __init__ (self, attribs = None):
25 |         ParserBase.__init__ (self, attribs)
26 |         self.data = None # raw content; filled in by pre_parse ()
27 | 
28 | 
29 |     def pre_parse (self):
30 |         """ Fetch the content with self.bytes_content () and keep it in self.data. """
31 |         self.data = self.bytes_content ()
32 | 
33 | 
34 |     def serialize (self):
35 |         """ Return the stored content unchanged (still None if pre_parse () was never called). """
36 |         return self.data
37 | 


--------------------------------------------------------------------------------
/tests/test_rst.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | import unittest
 5 | import subprocess
 6 | 
 7 | 
 8 | import ebookmaker
 9 | 
10 | class TestFromRst(unittest.TestCase):
11 |     def setUp(self):
12 |         self.sample_dir = os.path.join(os.path.dirname(__file__), 'files')
13 | 
14 |     def test_33968(self):
15 |         book_id = '33968'
16 |         dir = os.path.join(self.sample_dir, book_id)
17 |         rstfile = os.path.join(dir, '%s-rst' % book_id, '%s-rst.rst' % book_id)
18 |         cmd = 'ebookmaker --make=pdf --output-dir={dir} {rstfile}'.format(
19 |             dir=dir,
20 |             rstfile=rstfile,
21 |         )
22 | 
23 |         output = subprocess.check_output(cmd, shell=True)
24 | 
25 |         self.assertFalse(output)
26 |         outs = [
27 |             "%s-cover.png",
28 |             "%s-images-pdf.pdf",
29 |         ]
30 |         for out in outs:
31 |             self.assertTrue(os.path.exists(os.path.join(dir, out % book_id)))
32 |             os.remove(os.path.join(dir, out % book_id))
33 |         


--------------------------------------------------------------------------------
/tests/test_txt.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | import unittest
 5 | import subprocess
 6 | 
 7 | 
 8 | import ebookmaker
 9 | 
10 | class TestFromTxt(unittest.TestCase):
11 |     def setUp(self):
12 |         self.sample_dir = os.path.join(os.path.dirname(__file__), 'files')
13 |         self.out_dir = os.path.join(os.path.dirname(__file__), 'out')
14 | 
15 |     def test_69030(self):
16 |         book_id = '69030'
17 |         dir = os.path.join(self.sample_dir, book_id)
18 |         srcfile = os.path.join(dir, '%s-0.txt' % book_id)
19 |         cmd = 'ebookmaker '
20 |         cmd += f'--ebook={book_id} --make=txt --make=html --output-dir={self.out_dir} '
21 |         cmd += f'--validate {srcfile}'
22 | 
23 |         output = subprocess.check_output(cmd, shell=True)
24 | 
25 |         self.assertFalse(output)
26 |         outs = [
27 |             "%s.txt",
28 |             "%s-0.txt",
29 |             "%s-8.txt",
30 |             "%s-h.html",
31 |             "%s-cover.png",
32 |         ]
33 |         for out in outs:
34 |             self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id)))
35 |             os.remove(os.path.join(self.out_dir, out % book_id))
36 |         


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/writers/rst2epub.css:
--------------------------------------------------------------------------------
 1 | /*
 2 | Project Gutenberg EPUB docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to EPUB.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* mostly unsupported */
10 | .small-caps        { font-style: italic }
11 | .gesperrt          { font-style: italic }
12 | 
13 | /* ALIGN */
14 | 
15 | /* SECTIONS */
16 | 
17 | /* reduce screen real estate waste */
18 | body               { margin: 1% }
19 | 
20 | /* ugly hack to give more specificity, because ADE chucks out the whole
21 |    stylesheet when it sees an !important */
22 | 
23 | .first.first        { margin-top: 0; text-indent: 0 }
24 | .last.last          { margin-bottom: 0 }
25 | 
26 | .no-page-break.no-page-break
27 |                     { page-break-before: avoid }
28 | 
29 | /* PAGINATION */
30 | 
31 | div.clearpage       { page-break-before: always; padding-top: 10% }
32 | div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
33 | 
34 | .vfill              { margin-top: 10% }
35 | h2.title            { margin-top: 10% }
36 | 
37 | /* DIV */
38 | 
39 | a                   { text-decoration: none }
40 | .toc-pageref        { display: none }
41 | 
42 | /* DROPCAPS */
43 | 
44 | span.dropcap        { line-height: 0 }
45 | img.dropcap         { vertical-align: bottom }
46 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/parsers/pg-header.rst:
--------------------------------------------------------------------------------
 1 | .. -*- encoding: utf-8 -*-
 2 | 
 3 | .. |pg.copyrighted-header| replace::
 4 | 
 5 |    This is a *copyrighted* Project Gutenberg eBook, details
 6 |    below.
 7 | 
 8 | .. _pg-header:
 9 | 
10 | .. container:: noindent pgheader language-en pg_boilerplate
11 | 
12 |    This ebook is for the use of anyone anywhere in the United States
13 |    and most other parts of the world at no cost and with almost no
14 |    restrictions whatsoever. You may copy it, give it away or re-use it
15 |    under the terms of the `Project Gutenberg License`_ included with
16 |    this ebook or online at https://www.gutenberg.org/license. If you
17 |    are not located in the United States, you'll have to check the laws
18 |    of the country where you are located before using this ebook.
19 | 
20 |    |pg.copyrighted-header|
21 | 
22 |    .. vspace:: 2
23 | 
24 |    .. _pg-machine-header:
25 | 
26 |    .. container:: noindent white-space-pre-line
27 | 
28 |       |pg.machine-header|
29 | 
30 |    .. vspace:: 2
31 | 
32 |    .. _pg-start-line:
33 | 
34 |    \*\*\* START OF THIS PROJECT GUTENBERG EBOOK |pg.upcase-title| \*\*\*
35 | 
36 |    .. vspace:: 4
37 | 
38 |    .. _pg-produced-by:
39 | 
40 |    |pg.produced-by|
41 | 
42 |    .. vspace:: 1
43 | 
44 |    |pg.credits|
45 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/RSTWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | RSTWriter.py
 6 | 
 7 | Copyright 2009 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Build an RST file. This is just the master RST with the PG license mixed in.
12 | 
13 | """
14 | 
15 | 
16 | import os
17 | 
18 | from libgutenberg.Logger import debug, info, error
19 | from libgutenberg.GutenbergGlobals import SkipOutputFormat
20 | from ebookmaker import ParserFactory
21 | from ebookmaker import writers
22 | 
23 | class Writer (writers.BaseWriter):
24 |     """ Class to write a reStructuredText. """
25 | 
26 |     def build (self, job):
27 |         """ Build RST file. """
28 | 
29 |         filename = os.path.join (os.path.abspath(job.outputdir), job.outputfile)
30 | 
31 |         debug ("Creating RST file: %s" % filename)
32 | 
33 |         parser = ParserFactory.ParserFactory.create (job.url)
34 | 
35 |         if not hasattr (parser, 'rst2nroff'):
36 |             debug ('RSTWriter can only work on a RSTParser.')
37 |             raise SkipOutputFormat
38 | 
39 |         data = parser.preprocess ('utf-8').encode ('utf-8')
40 | 
41 |         self.write_with_crlf (filename, data)
42 | 
43 |         debug ("Done RST file: %s" % filename)
44 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | Pipfile.lock
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # dotenv
80 | .env
81 | 
82 | # virtualenv
83 | venv/
84 | ENV/
85 | 
86 | # Spyder project settings
87 | .spyderproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | 


--------------------------------------------------------------------------------
/ebookmaker.conf:
--------------------------------------------------------------------------------
 1 | # copy this file to /etc/ebookmaker.conf to set config paths 
 2 | #
 3 | # copy this file to ~/.ebookmaker to set defaults for command line arguments 
 4 | # or to override config paths in /etc/ebookmaker.conf
 5 | 
 6 | [DEFAULT_ARGS]
 7 | #### this section is inactive in /etc/ebookmaker.conf ####
 8 | # types: all [list of output types]
 9 | # max_depth: 1
10 | # strip_links: False
11 | # include_urls: [list of urls]
12 | # exclude_urls: [list]
13 | # include_mediatypes: [list of mediatypes]
14 | # exclude_mediatypes: [list of mediatypes]
15 | # mediatype_from_extension: False
16 | # rewrite: [url]>[rewritten url]
17 | # title: None
18 | # author: None
19 | # ebook: 0
20 | # outputdir: ./
21 | # outputfile: [title].epub
22 | # section_tags: [list of classes]
23 | # packager: None ['ww', 'gzip']
24 | # cover: None [path]
25 | # generate_cover: False
26 | # epub_validator: java -jar epubcheck-4.2.6/epubcheck.jar
27 | # html_validator: vnu-runtime-image/bin/vnu
28 | # production: False
29 | 
30 | [PATHS]
31 | # proxies: None
32 | # xelatex: xelatex
33 | # mobigen: ebook-convert  # can also be a path to kindlegen
34 | # mobilang: ebook-convert # converter to use for languages not supported by Kindlegen
35 | # mobikf8:  ebook-convert # converter for kf8
36 | # groff: groff
37 | # rhyming_dict: None
38 | 
39 | # default is '~'
40 | # FILESDIR = file:///Users/Shared/Documents/pg/dev/html/files
41 | 
42 | # default is "~/cache/epub/"
43 | # CACHEDIR = /Users/Shared/Documents/gitenberg/cache1/epub


--------------------------------------------------------------------------------
/tests/test_html.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | import unittest
 5 | import subprocess
 6 | 
 7 | 
 8 | import ebookmaker
 9 | 
10 | class TestFromHtml(unittest.TestCase):
11 |     def setUp(self):
12 |         self.sample_dir = os.path.join(os.path.dirname(__file__), 'files')
13 |         self.out_dir = os.path.join(os.path.dirname(__file__), 'out')
14 | 
15 |     def test_43172(self):
16 |         book_id = '43172'
17 |         dir = os.path.join(self.sample_dir, book_id)
18 |         htmfile = os.path.join(dir, '%s-h' % book_id, '%s-h.html' % book_id)
19 |         cmd = 'ebookmaker --make=test --output-dir={dir} --generate_cover {htmfile}'.format(
20 |             dir=self.out_dir,
21 |             htmfile=htmfile,
22 |         )
23 | 
24 |         output = subprocess.check_output(cmd, shell=True)
25 | 
26 |         self.assertFalse(output)
27 |         outs = [
28 |             "%s-epub.epub",
29 |             "%s-images-epub3.epub",
30 |             "%s-images-epub.epub",
31 |             "%s-h.html",
32 |         ]
33 |         for out in outs:
34 |             self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id)))
35 |             os.remove(os.path.join(self.out_dir, out % book_id))
36 |         os.remove(os.path.join(self.out_dir, 'images/image.jpg'))
37 |         os.remove(os.path.join(self.out_dir, 'images/mathex.jpg'))
38 |         os.remove(os.path.join(self.out_dir, 'music/test.mp3'))
39 |         os.rmdir(os.path.join(self.out_dir, 'images'))
40 |         os.rmdir(os.path.join(self.out_dir, 'music'))
41 | 


--------------------------------------------------------------------------------
/tests/test_templates.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | run this with
 5 | python -m unittest -v tests.test_templates
 6 | '''
 7 | 
 8 | import os
 9 | import unittest
10 | 
11 | from libgutenberg.DublinCore import GutenbergDublinCore
12 | 
13 | from ebookmaker.writers import HtmlTemplates, TemplateStrings
14 | 
15 | 
16 | class TestHeaders(unittest.TestCase):
17 | 
18 |     def setUp(self):
19 |         self.dc = GutenbergDublinCore()
20 |         book_id = '69030'
21 |         self.sample_dir = os.path.join(os.path.dirname(__file__), 'files')
22 |         dir = os.path.join(self.sample_dir, book_id)
23 |         srcfile = os.path.join(dir, '%s-0.txt' % book_id)
24 |         with open(srcfile, 'r') as f:
25 |             sampledata = f.read()
26 |         self.dc.load_from_pgheader(sampledata)
27 |  
28 |     def test_templates(self):
29 |         self.assertTrue('in the United States' in TemplateStrings.headera)
30 |         self.assertTrue('FULL PROJECT GUTENBERG LICENSE' in TemplateStrings.headerb)
31 |         self.assertTrue('COPYRIGHTED' in TemplateStrings.headera_copy)
32 |         self.assertTrue('This particular' in TemplateStrings.headerb_copy)
33 |         self.assertTrue('<div>' not in TemplateStrings.headera_txt)
34 |         self.assertTrue('<div>' not in TemplateStrings.headerb_txt)
35 |         self.assertTrue('Gutenberg License' in TemplateStrings.headera_copy_txt)
36 |         self.assertTrue('where you are located' in TemplateStrings.headerb_copy_txt)
37 | 
38 |     def test_headdata(self):
39 |         self.assertTrue('The girl in the crowd' in HtmlTemplates.pgheader(self.dc).text_content())
40 | 


--------------------------------------------------------------------------------
/scripts/rhyme_compiler:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | rhyme_compiler.py
 7 | 
 8 | Copyright 2009 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | This module produces a dbm file of rhyme stems.
13 | 
14 | We use a very naive concept of rhyme: we preprocess the 'CMU
15 | Pronouncing Dictionary' (found at
16 | http://www.speech.cs.cmu.edu/cgi-bin/cmudict) and extract the phonemes
17 | for each word from the last stressed one to the end of the word.
18 | 
19 | The result is stored in cmudict.db hashed by word.
20 | 
21 | To compile:
22 | 
23 | $ ./rhyme_compiler.py cmudict.0.7a
24 | 
25 | 
26 | """
27 | 
28 | import fileinput
29 | import re
30 | from six.moves import dbm_gnu as gdbm
31 | 
32 | dbm = gdbm.open ('cmudict.db', 'nf') # 'n': always start from a new, empty db; 'f': fast mode, writes are not synced
33 | 
34 | RE_STRESSED = re.compile ('[a-z]+[12][^12]*$') # last phoneme marked with stress 1 or 2, through the end of the text
35 | 
36 | # two example lines from cmudict
37 | #
38 | # PRONUNCIATION  P R OW0 N AH2 N S IY0 EY1 SH AH0 N
39 | # PRONUNCIATION(1)  P R AH0 N AH2 N S IY0 EY1 SH AH0 N
40 | # input is whatever fileinput picks up by default (files named on the command line, else stdin), decoded as iso-8859-1
41 | for line in fileinput.input (openhook = fileinput.hook_encoded ("iso-8859-1")):
42 |     if line.startswith (';'): # skip lines starting with ';' (presumably cmudict comment lines)
43 |         continue
44 | 
45 |     word, dummy_sep, phonemes = line.lower ().partition ('  ') # word and phonemes are separated by two spaces
46 |     # NOTE(review): phonemes still ends with the line's newline, so the matched tail appears to keep it -- confirm
47 |     m = RE_STRESSED.search (phonemes)
48 |     if m:
49 |         phoneme = re.sub (r'[ 012]+', '-', m.group (0)) # collapse each run of spaces and stress digits into one '-'
50 |         dbm[word.encode ('utf-8')] = phoneme.encode ('utf-8')
51 | 
52 |         # debug aid (Python 2 syntax): print "%s %s\n" % (word, dbm[word])
53 | 
54 | dbm.sync () # flush the unsynced writes to disk
55 | dbm.reorganize () # let gdbm compact the database file
56 | dbm.close ()
57 | 
58 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/parsers/default_style.rst:
--------------------------------------------------------------------------------
 1 | .. this is the default PG-RST stylesheet
 2 | 
 3 | .. style:: emphasis
 4 |    :class: italics
 5 | 
 6 | .. style:: strong
 7 |    :class: bold
 8 | 
 9 | .. style:: title_reference
10 |    :class: italics
11 | 
12 | .. style:: option_argument
13 |    :class: italics
14 | 
15 | .. style:: literal
16 |    :class: monospaced
17 | 
18 | .. style:: subscript
19 |    :class: subscript
20 | 
21 | .. style:: superscript
22 |    :class: superscript
23 | 
24 | .. style:: title.document-title
25 |    :class: x-large center
26 |    :titlehack:
27 | 
28 | .. style:: title.topic-title
29 |    :class: centerleft
30 | 
31 | .. style:: title.table-title
32 |    :class: centerleft larger
33 | 
34 | .. figure and image styles for non-image formats
35 | 
36 | .. style:: figure
37 |    :class: margin
38 | 
39 | .. style:: figure
40 |    :formats: txt.* *.noimages
41 |    :align: center
42 |    :width: 80%
43 | 
44 | .. style:: image
45 |    :formats: *.noimages
46 |    
47 |    .. container:: center image margin
48 |    
49 |       [image]
50 | 
51 | 
52 | .. style:: image
53 |    :formats: txt.*
54 |    :display: none   
55 | 
56 | .. style:: caption.figure-caption
57 |    :formats: -txt.*
58 |    :class: centerleft italics margin
59 | 
60 | .. style:: caption.figure-caption
61 |    :formats: txt.*
62 |    :class: margin
63 |    :before:  '[Illustration: '
64 |    :after:   ']'
65 | 
66 | .. style:: legend
67 |    :class: margin
68 | 
69 | 
70 | .. default transition
71 | 
72 | .. style:: transition
73 | 
74 |    .. container:: center transition margin
75 | 
76 |       ――――
77 | 
78 | .. default attribution
79 | 
80 | .. style:: attribution
81 |    :class: margin
82 |    :before: '―― '
83 | 
84 | 


--------------------------------------------------------------------------------
/docs/alt-text.md:
--------------------------------------------------------------------------------
 1 | Ebookmaker encourages proper use of the alt attribute to make books with images more accessible to the reading disabled. Ebookmaker ensures that every `img` element has an `alt` attribute and issues warnings if the alt attribute is empty.
 2 | 
 3 | Often the `alt` attribute should be left empty:
 4 | 
 5 | 1. when the image is purely decorative or used to help with the visual presentation of text. It would be disruptive to a person using text-to-speech or a braille reader to have the image described. In such a case, add a `role` attribute with value `presentation`: `<img src="image.png" alt="" role="presentation">` and the warning message will be suppressed. Because of a bug in the W3C HTML validator, you can also use `data-role="presentation"` so that the validator won't complain - ebookmaker will use this to produce valid html5 and epub files.
 6 | 
 7 | 2. when the image is well described by associated text. Often an image from a book will appear above a descriptive caption. For this reason, Ebookmaker will not emit a warning message if it appears inside a `<figure>` element containing a `<figcaption>`, or if the img has an `aria-labelledby` attribute: `<img src="image.png" alt="" aria-labelledby="id_for_label">`. But when relying on a caption text, make sure it describes what a sighted reader sees. Some captions comment on the image without describing it.
 8 | 
 9 | 
10 | Accessibility Tutorial:
11 | https://www.w3.org/WAI/tutorials/images/
12 | 
13 | Using `aria-labelledby`:
14 | https://www.w3.org/WAI/WCAG21/Techniques/aria/ARIA16
15 | 
16 | Other helpful guides:
17 | https://publishers.asn.au/BooksWithoutBarriers
18 | https://axesslab.com/alt-texts/
19 | https://accessibility.huit.harvard.edu/describe-content-images
20 | 
21 | w3c validator bug: https://github.com/validator/validator/issues/1599


--------------------------------------------------------------------------------
/tests/test_job.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | run this with
 5 | python -m unittest -v tests.test_job
 6 | '''
 7 | import datetime
 8 | import os
 9 | import subprocess
10 | import sys
11 | import unittest
12 | 
13 | from libgutenberg import Logger
14 | from libgutenberg.Logger import debug
15 | from libgutenberg.DublinCore import PGDCObject
16 | 
17 | import ebookmaker
18 | from ebookmaker import CommonCode, ParserFactory
19 | from ebookmaker.CommonCode import Options
20 | from ebookmaker.EbookMaker import config, get_dc
21 | from ebookmaker.parsers import webify_url
22 | 
options = Options()
Logger.set_log_level(10) # DEBUG

class TestJob(unittest.TestCase):
    """ Check the update date computed by Job.last_updated().

    One test uses a plain file path, the other a ``file://`` url with
    ``options.is_job_queue`` switched on.
    """

    def setUp(self):
        """ Configure ebookmaker and point it at the sample book 43172. """
        config()
        ParserFactory.load_parsers()
        here = os.path.dirname(__file__)
        self.sample_dir = os.path.join(here, 'files')
        self.out_dir = os.path.join(here, 'out')
        self.testfile = os.path.join(self.sample_dir, '43172/43172-h/43172-h.htm')
        # Set the fixture's modification time to "now".  os.utime replaces the
        # external `touch` command, which is not available on every platform;
        # it raises FileNotFoundError if the fixture is missing instead of
        # silently creating an empty file.
        os.utime(self.testfile, None)
        self.testdbfile = "file://" + self.testfile
        options.config.CACHEDIR = os.path.join(here, 'cache/epub')
        options.config.FILESDIR = webify_url(os.path.join(here, 'files/'))

    def test_update(self):
        """ A job on the freshly touched file is expected to be dated today. """
        job = CommonCode.Job('html.images')
        job.ebook = 43172
        job.url = self.testfile
        job.dc = get_dc(job)
        job.last_updated()
        self.assertEqual(job.dc.update_date, datetime.date.today())

    def test_update_db(self):
        """ A job-queue job on the file:// url is expected to be dated 2013-07-09. """
        job = CommonCode.Job('html.images')
        job.ebook = 43172
        # `options` is shared with the code under test and with other test
        # modules, so put the flag back once this test is done.
        previous = getattr(options, 'is_job_queue', False)
        self.addCleanup(setattr, options, 'is_job_queue', previous)
        options.is_job_queue = True
        job.url = self.testdbfile
        job.dc = get_dc(job)
        self.assertTrue(len(job.dc.files) > 0)
        job.last_updated()
        self.assertEqual(job.dc.update_date, datetime.date(2013,7,9))
56 |         
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/parsers/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | Module parsers
 7 | 
 8 | Copyright 2010-2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Customized Project Gutenberg directives for RST parser.
13 | 
14 | """
15 | 
16 | from docutils import statemachine
17 | from docutils.parsers.rst import Directive, directives
18 | 
19 | from ebookmaker.mydocutils import parsers
20 | 
21 | from ebookmaker.mydocutils.gutenberg import transforms as gutenberg_transforms
22 | 
23 | from libgutenberg.Logger import error, warning, info, debug
24 | 
25 | # pylint: disable=W0142, W0102
26 | 
27 | 
class PGHeaderFooter(Directive):
    """ Common base of the PG header and footer directives.

    Subclasses provide ``resource``, the name handed to
    ``settings.get_resource`` to fetch the text to insert.
    """

    required_arguments = 0
    optional_arguments = 0

    def run(self):
        """ Push the resource text into the parser input; produce no nodes. """
        doc_settings = self.state.document.settings
        text = doc_settings.get_resource('mydocutils.gutenberg.parsers', self.resource)
        lines = statemachine.string2lines(
            text, doc_settings.tab_width, convert_whitespace=1)
        # the inserted lines are parsed as if they were part of the document
        self.state_machine.insert_input(lines, '')
        return []
42 | 
43 | 
class PGHeader(PGHeaderFooter):
    """ Directive that inserts the text of 'pg-header.rst'. """

    resource = 'pg-header.rst'
47 | 
48 | 
class PGFooter(PGHeaderFooter):
    """ Directive that inserts the text of 'pg-footer.rst'. """

    resource = 'pg-footer.rst'
52 | 
53 | 
class Parser(parsers.Parser):
    """ RST parser that also knows the PG custom directives. """

    def __init__(self):
        """ Initialize the base parser, then register 'pgheader' and 'pgfooter'. """
        parsers.Parser.__init__(self)

        for name, directive in (('pgheader', PGHeader), ('pgfooter', PGFooter)):
            directives.register_directive(name, directive)

    def get_transforms(self):
        """ Return the base parser's transforms followed by the PG variable transforms. """
        inherited = parsers.Parser.get_transforms(self)
        return inherited + [
            gutenberg_transforms.VariablesTransform,
            gutenberg_transforms.SubRefToVarTransform,
        ]
68 | 


--------------------------------------------------------------------------------
/src/ebookmaker/WriterFactory.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | WriterFactory.py
 7 | 
 8 | Copyright 2009-14 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Writer factory. Dynamically loads writers from directories.
13 | 
14 | """
15 | 
16 | 
17 | import os.path
18 | 
19 | from pkg_resources import resource_listdir # pylint: disable=E0611
20 | 
21 | from libgutenberg.Logger import error, debug
22 | from ebookmaker.CommonCode import Options
23 | 
options = Options()

writers = {}

def __load_writers_from(package_name):
    """ Register every ``*Writer.py`` module of a package in ``writers``.

    The key is the lowercased module name with 'writer' removed.  Modules
    that fail to import are reported with error () and skipped.
    """

    for filename in resource_listdir(package_name, ''):
        stem, suffix = os.path.splitext(filename)
        if suffix != '.py' or not stem.endswith('Writer'):
            continue

        type_ = stem.lower().replace('writer', '')
        try:
            debug("Loading writer type %s from module %s" % (type_, stem))
            # a non-empty fromlist makes __import__ return the submodule itself
            module = __import__(package_name + '.' + stem, fromlist=[stem])
            writers[type_] = module
        except ImportError as what:
            error(
                "Could not load writer type %s from module %s. %s" %
                (type_, stem, what)
            )
44 | 
45 | 
def load_writers():
    """ Load the built-in writers, then those of every extension package.

    Returns the keys of the ``writers`` registry.
    """

    def packages():
        # lazy, so options.extension_packages is only read after the
        # built-in package has been loaded
        yield 'ebookmaker.writers'
        yield from options.extension_packages

    for package in packages():
        __load_writers_from(package)

    return writers.keys()
55 | 
56 | 
def unload_writers():
    """ Unload writer modules.

    Empties the ``writers`` registry in place, so every reference to the
    dict sees the change.
    """
    # Deleting entries while iterating over writers.keys () raises
    # RuntimeError on Python 3; clear () does the same job safely.
    writers.clear()
61 | 
62 | 
def create(type_):
    """ Create a writer for type.

    'kf8' is served by the 'kindle' writer.

    Raises KeyError ('No writer for type ...') if no writer module is
    registered for the type.  Exceptions raised by the writer's own
    constructor propagate unchanged.
    """
    if type_ == 'kf8':
        type_ = 'kindle'

    # Only the registry lookup is guarded: a KeyError raised inside
    # Writer () must not be reported as a missing writer.
    try:
        module = writers[type_]
    except KeyError:
        raise KeyError('No writer for type %s' % type_) from None

    return module.Writer()
71 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/nodes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | nodes.py
 7 | 
 8 | Copyright 2011 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Added nodes for PG.
13 | 
14 | """
15 | 
16 | from docutils import nodes
17 | 
class page(nodes.Element, nodes.Special):
    """ Node that carries pagination commands.

    Examples: clearpage, vspace.

    """
24 | 
class newline(nodes.Element):
    """ Node for a line break.

    Meant to be written as a hard line break when the node, or one of
    its parents, has the class 'white-space-pre-line', and as a space
    otherwise.

    """
32 | 
class footnote_group(nodes.container):
    """ Container node that groups footnotes. """
35 | 
36 | 
class variable(nodes.Inline, nodes.TextElement):
    """ Placeholder node, replaced by the actual text before output.

    Substitution references are not used for this because they get
    resolved too early in the transformation stage.

    """
44 | 
45 | 
class node_selector(object):
    """ CSS-like selector usable as condition function for nodes.traverse (). """

    def __init__(self, selector):
        """ Parse ``[element][.class[.class[...]]][, selector[, selector]]``.

        An element name that is not an attribute of docutils ``nodes``
        (including the empty name) selects ``nodes.Element``.
        """

        self.matches = [] # list of 2-tuples: (node class, required classes)

        for part in selector.split(','):
            element, _dot, class_spec = part.strip().partition('.')
            required = set(class_spec.split('.')) if class_spec else set()
            node_type = getattr(nodes, element, nodes.Element)
            self.matches.append((node_type, required))


    def __call__(self, node):
        """ Return True if the node satisfies any of the parsed selectors. """

        return any(
            isinstance(node, node_type) and required.issubset(node['classes'])
            for node_type, required in self.matches
        )
72 | 


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
 1 | # file GENERATED by distutils, do NOT edit
 2 | CHANGES
 3 | README
 4 | setup.cfg
 5 | setup.py
 6 | ebookmaker/CommonCode.py
 7 | ebookmaker/EbookMaker.py
 8 | ebookmaker/HTMLChunker.py
 9 | ebookmaker/ParserFactory.py
10 | ebookmaker/Spider.py
11 | ebookmaker/Unitame.py
12 | ebookmaker/UnitameData.py
13 | ebookmaker/Version.py
14 | ebookmaker/WriterFactory.py
15 | ebookmaker/__init__.py
16 | ebookmaker/mydocutils/__init__.py
17 | ebookmaker/mydocutils/nodes.py
18 | ebookmaker/mydocutils/gutenberg/__init__.py
19 | ebookmaker/mydocutils/gutenberg/parsers/__init__.py
20 | ebookmaker/mydocutils/gutenberg/parsers/pg-footer.rst
21 | ebookmaker/mydocutils/gutenberg/parsers/pg-header.rst
22 | ebookmaker/mydocutils/gutenberg/transforms/__init__.py
23 | ebookmaker/mydocutils/gutenberg/writers/__init__.py
24 | ebookmaker/mydocutils/gutenberg/writers/nroff.py
25 | ebookmaker/mydocutils/parsers/__init__.py
26 | ebookmaker/mydocutils/parsers/default_style.rst
27 | ebookmaker/mydocutils/transforms/__init__.py
28 | ebookmaker/mydocutils/transforms/parts.py
29 | ebookmaker/mydocutils/writers/__init__.py
30 | ebookmaker/mydocutils/writers/epub2.py
31 | ebookmaker/mydocutils/writers/nroff.py
32 | ebookmaker/mydocutils/writers/rst2all.css
33 | ebookmaker/mydocutils/writers/rst2epub.css
34 | ebookmaker/mydocutils/writers/rst2html.css
35 | ebookmaker/mydocutils/writers/xetex.py
36 | ebookmaker/mydocutils/writers/xhtml1.py
37 | ebookmaker/packagers/GzipPackager.py
38 | ebookmaker/packagers/HTMLPackager.py
39 | ebookmaker/packagers/PDFPackager.py
40 | ebookmaker/packagers/PushPackager.py
41 | ebookmaker/packagers/RSTPackager.py
42 | ebookmaker/packagers/TxtPackager.py
43 | ebookmaker/packagers/__init__.py
44 | ebookmaker/parsers/AuxParser.py
45 | ebookmaker/parsers/CSSParser.py
46 | ebookmaker/parsers/GutenbergTextParser.py
47 | ebookmaker/parsers/HTMLParser.py
48 | ebookmaker/parsers/ImageParser.py
49 | ebookmaker/parsers/RSTParser.py
50 | ebookmaker/parsers/__init__.py
51 | ebookmaker/parsers/broken.png
52 | ebookmaker/writers/EpubWriter.py
53 | ebookmaker/writers/HTMLWriter.py
54 | ebookmaker/writers/KindleWriter.py
55 | ebookmaker/writers/PDFWriter.py
56 | ebookmaker/writers/PicsDirWriter.py
57 | ebookmaker/writers/RSTWriter.py
58 | ebookmaker/writers/TxtWriter.py
59 | ebookmaker/writers/__init__.py
60 | ebookmaker/writers/cover.jpg
61 | scripts/convert_unitame
62 | scripts/ebookmaker
63 | scripts/rhyme_compiler
64 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/PicsDirWriter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | PicsDirWriter.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Copies pics into local directory. Needed for HTML and Xetex.
13 | 
14 | """
15 | 
16 | 
17 | import os.path
18 | 
19 | import libgutenberg.GutenbergGlobals as gg
20 | from libgutenberg.Logger import info, debug, error
21 | 
22 | from ebookmaker.parsers import webify_url
23 | from ebookmaker import writers
24 | 
25 | 
class Writer(writers.BaseWriter):
    """ Writer for the Pics directory. """

    def copy_aux_files(self, job, dest_dir):
        """ Write the serialized image and aux parsers of the job below dest_dir.

        Parsers whose url already lies in dest_dir, or that resolve to the
        destination file itself, are skipped.
        """

        for parser in job.spider.parsers:
            is_image = hasattr(parser, 'resize_image')
            if not (is_image or hasattr(parser, 'auxparser')):
                continue

            src_uri = parser.attribs.url
            if src_uri.startswith(webify_url(dest_dir)):
                debug('Not copying %s to %s: already there' % (src_uri, dest_dir))
                continue

            # keep the file's position relative to the job's base url
            relative = gg.make_url_relative(webify_url(job.base_url), src_uri)
            fn_dest = os.path.join(dest_dir, relative)

            if gg.is_same_path(src_uri, fn_dest):
                debug('Not copying %s to %s: same file' % (src_uri, fn_dest))
                continue

            fn_dest = gg.normalize_path(fn_dest)
            debug('Copying %s to %s' % (src_uri, fn_dest))
            gg.mkdir_for_filename(fn_dest)
            try:
                with open(fn_dest, 'wb') as fp_dest:
                    fp_dest.write(parser.serialize())
            except IOError as what:
                error('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))


    def build(self, job):
        """ Copy the job's pics into its (absolute) output directory. """

        dest_dir = os.path.abspath(job.outputdir)

        debug("Creating Pics directory in: %s" % dest_dir)
        self.copy_aux_files(job, dest_dir)
        debug("Done Pics directory in: %s" % dest_dir)
69 | 


--------------------------------------------------------------------------------
/scripts/convert_unitame:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | convert_unitame.py
 7 | 
 8 | Copyright 2010,2014 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | Converts unitame.dat into UnitameData module.
13 | 
14 | """
15 | 
16 | import codecs
17 | import unicodedata as ud
18 | 
19 | # from addhd
20 | 
# ASCII replacements for the code points 0x80-0xFF, 16 entries per row:
# entry k belongs to code point 0x80 + k.  The script below only emits the
# entries above 0xA0 and skips the empty strings.
i2a = (
"Euro","",",","f","\"","...","","","^","%","S","<","OE","","Z","",
"","'","'","\"","\"","","-","--","~","(TM)","s",">","oe","","z","Y",
" ","i","c","L","","Y","|","Sec.","\"","(C)","","\"","","-","(R)","-",
" deg.","+-"," squared"," cubed","'"," mu","",".","","","","\"","1/4","1/2","3/4","?",
"A","A","A","A","Ae","A","AE","C","E","E","E","E","I","I","I","I",
"Eth","N","O","O","O","O","Oe","x","O","U","U","U","Ue","Y","","ss",
"a","a","a","a","ae","a","ae","c","e","e","e","e","i","i","i","i",
"eth","n","o","o","o","o","oe","/","o","u","u","u","ue","y","","y"
)
31 | 
32 | 
def strip_accents(s):
    """ Strip accents from string.

    Decomposes the string (NFKD), drops every combining mark (category
    'Mn') and recomposes what is left (NFKC).
    """
    decomposed = ud.normalize('NFKD', s)
    # build a real string: on Python 3 filter () returns an iterator,
    # which ud.normalize () does not accept
    base = ''.join(c for c in decomposed if ud.category(c) != 'Mn')
    return ud.normalize('NFKC', base)


def main():
    """ Read unitame.dat and write the UnitameData module source to stdout.

    Each input line is expected as ``hex codepoint;ignored;replacement``.
    The output is always written as utf-8, matching the coding line of the
    generated module.
    """
    import sys

    out = codecs.getwriter('utf-8')(sys.stdout.buffer)

    def emit(text):
        """ Write one chunk of generated source followed by a newline. """
        out.write(text + '\n')

    def entry(c, sub):
        """ Format one dictionary entry, commented with the character name. """
        comment = ud.name(c)
        if sub == "'":
            sub = r"\'"
        return "    '%s': '%s', # %s" % (c, sub, comment)

    emit('''#!/usr/bin/env python
#  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-

""" Converted from unitame.dat """

from __future__ import unicode_literals

unicode_to_iso_8859_1 = {''')

    # the with block closes the data file, which the old script leaked
    with codecs.open('unitame.dat', 'r', 'iso-8859-1') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue # tolerate blank lines in the data file
            c, dummy, sub = line.split(';', 2)
            c = chr(int(c, 16))
            # skip identity mappings and plain accent stripping
            if sub and c != sub and strip_accents(c) != sub:
                emit(entry(c, sub))

    emit("}\n\n")

    emit("iso_8859_1_to_ascii = {")

    for n, sub in enumerate(i2a, 0x80):
        if n > 0xa0:
            c = chr(n)
            if sub and strip_accents(c) != sub:
                emit(entry(c, sub))

    emit("}\n\n")


if __name__ == '__main__':
    main()
75 | 


--------------------------------------------------------------------------------
/tests/test_setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | '''
 4 | run this with
 5 | python -m unittest -v tests.test_setup
 6 | '''
 7 | import os
 8 | import unittest
 9 | import subprocess
10 | 
11 | from libgutenberg import Logger
12 | from libgutenberg.Logger import debug
13 | 
14 | import ebookmaker
15 | from ebookmaker import CommonCode
16 | from ebookmaker import ParserFactory
17 | from ebookmaker import WriterFactory
18 | from ebookmaker.CommonCode import Options, path_from_file
19 | from ebookmaker.EbookMaker import config
20 | from ebookmaker.EbookMaker import DEPENDENCIES, BUILD_ORDER
21 | from ebookmaker.packagers import PackagerFactory
22 | from ebookmaker.parsers import BROKEN, webify_url
23 | 
options = Options()

class TestLoad(unittest.TestCase):
    """ Smoke tests for loading parsers, writers and packagers, and for path mapping. """

    def setUp(self):
        """ Configure ebookmaker and direct cache and files dirs into the tests tree. """
        config()
        Logger.set_log_level(options.verbose)
        options.types = options.types or ['all']
        options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
        here = os.path.dirname(__file__)
        options.config.CACHEDIR = os.path.join(here, 'cache/epub')
        options.config.FILESDIR = webify_url(os.path.join(here, 'files/'))
        debug("Building types: %s" % ' '.join(options.types))

    def test_parsers(self):
        """ The parser created for BROKEN holds image data and can produce a jpeg. """
        ParserFactory.load_parsers()
        factory = ParserFactory.ParserFactory()

        # check parser created from resource
        image_parser = factory.create(BROKEN)
        self.assertTrue(hasattr(image_parser, 'resize_image'))
        image_parser.pre_parse()
        self.assertTrue(len(image_parser.image_data) > 0)
        self.assertTrue(image_parser.get_image_dimen()[0] > 0)

        # check conversion to jpeg
        image_parser.resize_image(16 * 1024, (66, 100), output_format='jpeg')

    def test_writers(self):
        """ Loading the writers must not raise. """
        WriterFactory.load_writers()

    def test_packagers(self):
        """ Loading the packagers must not raise. """
        PackagerFactory.load_packagers()

    def test_dirs(self):
        """ path_from_file () keeps cache paths and maps other paths below files/. """
        cases = (
            ('cache/epub/1234/test', 'cache/epub/1234/test'),
            ('1/2/3/1234/test', 'files/1234/test'),
        )
        for given, expected_tail in cases:
            print(path_from_file(given))
            self.assertTrue(path_from_file(given).endswith(expected_tail))
63 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/writers/rst2html.css:
--------------------------------------------------------------------------------
 1 | /*
 2 | Project Gutenberg HTML docutils stylesheet.
 3 | 
 4 | This stylesheet contains styles specific to HTML.
 5 | */
 6 | 
 7 | /* FONTS */
 8 | 
 9 | /* em                { font-style: normal }
10 | strong            { font-weight: normal } */
11 | 
12 | .small-caps       { font-variant: small-caps }
13 | .gesperrt         { letter-spacing: 0.1em }
14 | 
15 | /* ALIGN */
16 | 
17 | .align-left       { clear: left;
18 | 		    float: left;
19 | 		    margin-right: 1em }
20 | 
21 | .align-right      { clear: right;
22 | 		    float: right;
23 | 		    margin-left: 1em }
24 | 
25 | .align-center     { margin-left: auto;
26 | 		    margin-right: auto }
27 | 
28 | div.shrinkwrap    { display: table; }
29 | 
30 | /* SECTIONS */
31 | 
32 | body              { margin: 5% 10% 5% 10% }
33 | 
34 | /* compact list items containing just one p */
35 | li p.pfirst       { margin-top: 0; margin-bottom: 0 }
36 | 
37 | .first            { margin-top: 0 !important;
38 | 		    text-indent: 0 !important }
39 | .last             { margin-bottom: 0 !important }
40 | 
41 | span.dropcap      { float: left; margin: 0 0.1em 0 0; line-height: 1 }
42 | img.dropcap       { float: left; margin: 0 0.5em 0 0; max-width: 25% }
43 | span.dropspan     { font-variant: small-caps }
44 | 
45 | .no-page-break    { page-break-before: avoid !important }
46 | 
47 | /* PAGINATION */
48 | 
49 | .pageno           { position: absolute; right: 95%; font: medium sans-serif; text-indent: 0 }
50 | .pageno:after     { color: gray; content: '[' attr(title) ']' }
51 | .lineno           { position: absolute; left:  95%; font: medium sans-serif; text-indent: 0 }
52 | .lineno:after     { color: gray; content: '[' attr(title) ']' }
53 | .toc-pageref      { float: right }
54 | 
55 | @media screen {
56 |    .coverpage, .frontispiece, .titlepage, .verso, .dedication, .plainpage
57 |                        { margin: 10% 0; }
58 | 
59 |    div.clearpage, div.cleardoublepage
60 |                        { margin: 10% 0; border: none; border-top: 1px solid gray; }
61 | 
62 |    .vfill              { margin:  5% 10% }
63 | }
64 | 
65 | @media print {
66 |    div.clearpage       { page-break-before: always; padding-top: 10% }
67 |    div.cleardoublepage { page-break-before: right;  padding-top: 10%  }
68 | 
69 |    .vfill              { margin-top: 20% }
70 |    h2.title            { margin-top: 20% }
71 | }
72 | 
73 | /* DIV */
74 | pre               { font-family: monospace; font-size: 0.9em; white-space: pre-wrap }
75 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/PushPackager.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | PushPackager.py
 6 | 
 7 | Copyright 2011 by Marcello Perathoner
 8 | 
 9 | Distributable under the GNU General Public License Version 3 or newer.
10 | 
11 | Package a zip containing everything, that can be pushed to ibiblio.
12 | 
13 | """
14 | 
15 | 
16 | import os.path
17 | import re
18 | 
19 | from libgutenberg.Logger import info, error
20 | import libgutenberg.GutenbergGlobals as gg
21 | 
22 | from ebookmaker.CommonCode import Options
23 | from ebookmaker.packagers import ZipPackager
24 | 
options = Options()
TYPE = 'ww'
FORMATS = ['push']

class Packager(ZipPackager):
    """ Packager for the one big push zip.

    Everything lives in one directory named after ebook_no, laid out
    like the files on ibiblio::

      12345/12345.txt
      12345/12345.zip
      12345/12345-h/12345-h.html
      12345/12345-h/images/cover.jpg
      12345/12345-h.zip

    """

    def package(self, job):
        """ Build the zip named by job.outputfile.

        The ebook number is the run of digits the filename starts with;
        without one an error is logged and nothing is written.
        """
        self.setup(job)
        zipfilename = job.outputfile # filename is zipfile

        number = re.match(r'\d+', zipfilename)
        if not number:
            error('Invalid filename %s for push packager.' % zipfilename)
            return
        ebook_no = number.group(0)

        zip_ = self.create(zipfilename)

        # files stored directly in the ebook directory
        for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
            source = '%s%s' % (ebook_no, suffix)
            member = '%s/%s' % (ebook_no, source)
            self.add(zip_, source, member)

        # files stored in their own subdirectory
        for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
            source = '%s%s.%s' % (ebook_no, suffix, ext)
            member = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, source)
            self.add(zip_, source, member)

            # image files
            for url in options.html_images_list:
                rel_url = gg.make_url_relative(job.base_url, url)
                source = os.path.join(self.path, rel_url)
                member = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
                self.add(zip_, source, member)

        zip_.close()
        info('Done Zip file: %s' % zipfilename)
75 | 


--------------------------------------------------------------------------------
/tests/test_htm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | import unittest
 5 | import subprocess
 6 | 
 7 | 
 8 | import ebookmaker
 9 | 
class TestFromHtm(unittest.TestCase):
    """ End-to-end runs of the ``ebookmaker`` command on the sample book 43172. """

    def setUp(self):
        """ Locate the sample files and the output directory next to this module. """
        here = os.path.dirname(__file__)
        self.sample_dir = os.path.join(here, 'files')
        self.out_dir = os.path.join(here, 'out')

    def _run_ebookmaker(self, *args):
        """ Run ``ebookmaker`` with the given arguments and return its stdout.

        Raises subprocess.CalledProcessError on a non-zero exit status.
        """
        # An argument list (no shell) passes paths through unchanged, even
        # when they contain spaces or shell metacharacters.
        return subprocess.check_output(['ebookmaker'] + list(args))

    def _check_and_remove(self, book_id, patterns):
        """ Assert that every expected output file exists, then delete it. """
        for pattern in patterns:
            path = os.path.join(self.out_dir, pattern % book_id)
            self.assertTrue(os.path.exists(path))
            os.remove(path)

    def test_43172(self):
        """ Build and validate all test formats from the HTML source. """
        book_id = '43172'
        book_dir = os.path.join(self.sample_dir, book_id)
        htmfile = os.path.join(book_dir, '%s-h' % book_id, '%s-h.htm' % book_id)

        output = self._run_ebookmaker(
            '-v',
            '--ebook=%s' % book_id,
            '--make=test',
            '--output-dir=%s' % self.out_dir,
            '--validate',
            htmfile,
        )

        self.assertFalse(output)
        self._check_and_remove(book_id, [
            "%s-epub.epub",
            "%s-images-epub3.epub",
            "%s-images-epub.epub",
            "%s-h.html",
        ])
        os.remove(os.path.join(self.out_dir, 'images/image.jpg'))
        os.remove(os.path.join(self.out_dir, 'images/mathex.jpg'))
        os.remove(os.path.join(self.out_dir, 'music/test.mp3'))
        os.rmdir(os.path.join(self.out_dir, 'images'))
        os.rmdir(os.path.join(self.out_dir, 'music'))

    def test_43172_nocover(self):
        """ Build from the cover-less source and expect a generated cover. """
        book_id = '43172'
        book_dir = os.path.join(self.sample_dir, book_id)
        htmfile = os.path.join(book_dir, '%s-h' % book_id, '%s-nocover.htm' % book_id)

        output = self._run_ebookmaker(
            '--make=test',
            '--output-dir=%s' % self.out_dir,
            '--generate_cover',
            htmfile,
        )

        self.assertFalse(output)
        self._check_and_remove(book_id, [
            "%s-epub.epub",
            "%s-images-epub3.epub",
            "%s-images-epub.epub",
            "%s-h.html",
            "%s-cover.png",
        ])
        os.remove(os.path.join(self.out_dir, 'images/image.jpg'))
        os.rmdir(os.path.join(self.out_dir, 'images'))
64 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/WrapperParser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | 
 4 | """
 5 | 
 6 | WrapperParser.py
 7 | 
 8 | Copyright 2020 by Eric Hellman
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | """
13 | from xml.sax.saxutils import escape, quoteattr
14 | 
15 | import lxml
16 | 
17 | from copy import copy
18 | from libgutenberg.Logger import info
19 | from libgutenberg import GutenbergGlobals as gg
20 | from ebookmaker.parsers import HTMLParserBase, IMAGE_WRAPPER
21 | 
mediatypes = ()

class Parser(HTMLParserBase):
    """ Parser for a generated XHTML page that wraps a linked image. """

    def __init__(self, attribs):
        """ Build the wrapper page for the image described by attribs.

        The parser works on a copy of attribs; the attribs passed in are
        marked as a linked image and get the wrapper as referrer.
        """
        HTMLParserBase.__init__(self, copy(attribs))
        self.attribs.orig_mediatype = self.attribs.mediatype
        self.src = attribs.url
        self.attribs.url = self.wrapper_url(attribs.url)
        self.attribs.orig_url = self.attribs.url
        self.attribs.nonlinear = True
        if not self.attribs.title:
            self.attribs.title = 'linked image'

        # the page is rendered before the image's referrer is redirected below
        markup = self.unicode_content()
        self.xhtml = lxml.etree.fromstring(
            markup,
            lxml.html.XHTMLParser(),
            base_url=self.attribs.url
        )
        self.fp = True  # so writers won't skip it

        # mark the image for treatment as a linked image
        attribs.rel.add('linked_image')
        # set the referrer for the image to this wrapper
        attribs.referrer = self.attribs.url


    def unicode_content(self):
        """ Return the wrapper page source: the image plus a link back to the referrer. """
        if self.attribs.id:
            frag = '#%s' % self.attribs.id
        else:
            frag = ''
        backlink = '<br /><a href="%s%s" title="back" >back</a>' % (
            escape(self.attribs.referrer), frag)
        return IMAGE_WRAPPER.format(
            src=escape(self.src),
            title=quoteattr(self.attribs.title),
            backlink=backlink,
            wrapper_class='x-ebookmaker-wrapper',
            doctype=gg.XHTML_DOCTYPE,
            style='')


    def wrapper_url(self, img_url):
        """ Derive the wrapper url from the image url (and the id, if there is one). """
        if not self.attribs.id:
            return img_url + '.wrap.html'
        return '%s.%s.wrap.html' % (img_url, self.attribs.id)


    def make_toc(self, xhtml):
        """ A wrapper page has no toc entries. """
        return []


    def iterlinks(self):
        """ Yield only the links whose element is an xhtml img. """
        yield from (
            link for link in super(Parser, self).iterlinks()
            if link[1].tag == gg.NS.xhtml.img
        )


--------------------------------------------------------------------------------
/src/ebookmaker/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
 3 | """
 4 | 
 5 | utils.py
 6 | 
 7 | tools for manipulating xhtml
 8 | Copyright 2009 by Project Gutenberg
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | """
12 | 
13 | import libgutenberg.GutenbergGlobals as gg
14 | from libgutenberg.GutenbergGlobals import xpath, NS
15 | from libgutenberg.Logger import critical, debug, error, info, warning
16 | 
def css_len(len_str):
    """ Turn a bare integer length into a css pixel length.

    Values that `int()` accepts come back as '<n>px'.  Values that make
    `int()` raise ValueError (e.g. '50%', '1.5em') are returned unchanged.
    """
    try:
        pixels = int(len_str)
    except ValueError:
        return len_str
    return f'{pixels}px'
23 | 
def add_class(elem, classname):
    """ Append `classname` to the class attribute of `elem`.

    Existing class names are kept (whitespace-normalized); a missing or
    empty class attribute is treated as an empty list of classes.
    """
    classes = (elem.attrib.get('class') or '').split()
    classes.append(classname)
    elem.set('class', ' '.join(classes))
31 | 
def add_style(elem, style=''):
    """ Put `style` in front of the inline style of `elem`.

    Nothing happens when `style` is empty.  When the element already has
    a non-empty style attribute the result is '<new>;<old>;', so the
    declarations that were already present come last.
    """
    if not style:
        return
    existing = elem.attrib.get('style')
    if existing:
        style = '{};{};'.format(style.strip(' ;'), existing.strip(' ;'))
    elem.set('style', style)
38 | 
def check_lang(elem, lang_att):
    """ Validate and normalize the language attribute `lang_att` of `elem`.

    `lang_att` is the name of an attribute that must be present on `elem`
    (a KeyError is raised otherwise).  Its value is first mapped from a
    known three-letter code to the two-letter equivalent, then looked up
    in `gg.language_map`:

    - value is known to `gg.language_map.get`: the value is stored in the
      plain `lang` attribute, any `xml:lang` attribute is removed, and
      True is returned.
    - value is resolved by `gg.language_map.inverse` (presumably a
      language name -> code lookup; confirm against libgutenberg): the
      resolved code is stored in `lang` and `xml:lang` is removed.
    - otherwise: a warning is logged, the attribute is deleted and its
      value is preserved in `data-invalid-lang`.

    Returns True only in the first case; in all other cases the function
    falls off the end and returns None.
    """
    three2two = {'ita': 'it', 'lat': 'la', 'heb': 'he', 'fra': 'fr', 'spa': 'es', 'deu': 'de',
                 'gla': 'gd', 'oji': 'oj', 'nav': 'nv',}
    lang = elem.attrib[lang_att]
    # The table translates language *values*.  It used to be applied to
    # the attribute name, where it could never match, so three-letter
    # codes were always reported as invalid.
    lang = three2two.get(lang, lang)
    lang_name = gg.language_map.get(lang, default=None)
    if lang_name:
        if NS.xml.lang in elem.attrib:
            del elem.attrib[NS.xml.lang]
        elem.attrib['lang'] = lang
        return True
    clean_lang = gg.language_map.inverse(lang, default=None)
    if not clean_lang:
        warning("invalid lang attribute %s", lang)
        del elem.attrib[lang_att]
        # keep the rejected value around for later inspection
        elem.attrib['data-invalid-lang'] = lang
    elif lang != clean_lang:
        elem.attrib['lang'] = clean_lang
        if NS.xml.lang in elem.attrib:
            del elem.attrib[NS.xml.lang]
59 | 
def replace_elements(xhtml, deprecated):
    ''' Replace or delete deprecated elements in the tree `xhtml`.

        `deprecated` is a dictionary mapping a deprecated xhtml tag name
        to the tag name that replaces it, or to a false value when the
        element should simply be deleted.

        A replaced element keeps its content and attributes and gets the
        additional class 'xhtml_<old tag>'.  A deleted element is removed
        together with its content, but the text that follows it (its
        tail) is kept.

        Returns the set of deprecated tag names that were found.
    '''
    deprecated_used = set()
    for tag, replacement in deprecated.items():
        for elem in xpath(xhtml, "//xhtml:" + tag):
            if replacement:
                add_class(elem, 'xhtml_' + tag)
                elem.tag = getattr(NS.xhtml, replacement)
            else:
                parent = elem.getparent()
                # lxml stores the text following an element in elem.tail
                # and remove() drops it along with the element.  That text
                # belongs to the parent's content, so re-attach it first.
                if elem.tail:
                    previous = elem.getprevious()
                    if previous is not None:
                        previous.tail = (previous.tail or '') + elem.tail
                    else:
                        parent.text = (parent.text or '') + elem.tail
                parent.remove(elem)
            deprecated_used.add(tag)
    return deprecated_used
74 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/writers/epub2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | 
 6 | epub2.py
 7 | 
 8 | Copyright 2012 by Marcello Perathoner
 9 | 
10 | Distributable under the GNU General Public License Version 3 or newer.
11 | 
12 | A writer that writes XHTML 1 files suited for conversion into EPUB2.
13 | 
14 | """
15 | 
16 | import re
17 | 
18 | # from libgutenberg.Logger import info, debug, warning, error
19 | 
20 | from ebookmaker.mydocutils.writers.xhtml1 import Writer as WriterBase
21 | from ebookmaker.mydocutils.writers.xhtml1 import Translator as TranslatorBase
22 | 
23 | 
class Writer (WriterBase):
    """ XHTML 1 writer that uses the EPUB2 translator. """

    def __init__ (self):
        super ().__init__ ()
        # hand the EPUB2-specific translator to the base writer
        self.translator_class = Translator
30 | 
31 | 
class Translator (TranslatorBase):
    """ XHTML translator adjusted for the needs of EPUB2. """

    def init_css (self):
        """ Embed the common and the EPUB stylesheets into the head. """
        for name in ('rst2all.css', 'rst2epub.css'):
            css = self.encode (self.read_css (name))
            self.head.append ('<style type="text/css">\n%s</style>\n' % css)


    def calc_centering_style (self, node):
        """
        Add width and left-margin styles to `node`.

        Why not `margin: auto`: the EPUB standard lets user agents
        replace `margin: auto` with `margin: 0`, so centering has to be
        done with an explicitly calculated left margin.

        Why 'width' on the html element: with a css-only width Adobe ADE
        scales the image in the horizontal direction only.

        :align: acts on blocks: it floats or centers the block.  Note
        that `:align: center` and `:class: center` mean different
        things.  The former centers the block itself (eg. the whole
        table), the latter centers the text inside it (eg. the text in
        every table cell).

            `:align: center`
                on an image: centers the image
                on a figure: centers image and caption
                on a table: centers table and caption

        A node without width gets no styles and an empty list is
        returned.  A margin is only calculated for a percentage width.

        """

        width = node.get ('width')
        if width is None:
            return []

        styles = ['width: %s' % width]

        percent = re.match (r'(\d+)\s*%', width)
        if percent is not None:
            # clamp the percentage into 0..100 before calculating the rest
            clamped = max (min (int (percent.group (1)), 100), 0)
            margin = 100 - clamped

            left_margins = {'center': margin / 2, 'right': margin}
            left = left_margins.get (node.get ('align', 'center'))
            if left is not None:
                styles.append ('margin-left: %d%%' % left)

        node['styles'].extend (styles)
83 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # ebookmaker distribution
 3 | #
 4 | #!/usr/bin/env python
 5 | 
 6 | from setuptools import setup
 7 | 
 8 | VERSION = '0.13.8'
 9 | 
10 | if __name__ == "__main__":
11 |  
12 |     setup (
13 |         name = 'ebookmaker',
14 |         version = VERSION,
15 | 
16 |         packages = [
17 |             'ebookmaker',
18 |             'ebookmaker.parsers',
19 |             'ebookmaker.writers',
20 |             'ebookmaker.packagers',
21 | 
22 |             'ebookmaker.mydocutils',
23 |             'ebookmaker.mydocutils.parsers',
24 |             'ebookmaker.mydocutils.transforms',
25 |             'ebookmaker.mydocutils.writers',
26 | 
27 |             'ebookmaker.mydocutils.gutenberg',
28 |             'ebookmaker.mydocutils.gutenberg.parsers',
29 |             'ebookmaker.mydocutils.gutenberg.transforms',
30 |             'ebookmaker.mydocutils.gutenberg.writers',
31 |         ],
32 | 
33 |         scripts = [
34 |             'scripts/ebookmaker',
35 |             'scripts/convert_unitame',
36 |             'scripts/rhyme_compiler',
37 |         ],
38 | 
39 |         install_requires = [
40 |             'pillow>=8.3.2',
41 |             'cssutils',
42 |             'docutils>=0.18.1',
43 |             'lxml',
44 |             'roman',
45 |             'requests',
46 |             'six>=1.4.1',
47 |             'libgutenberg[covers]>=0.10.22',
48 |             'cchardet==2.2.0a2',
49 |             'beautifulsoup4',
50 |             'html5lib',
51 |         ],
52 |     
53 |         package_data = {
54 |             'ebookmaker.parsers': ['broken.png', 'txt2all.css'],
55 |             'ebookmaker.writers': ['cover.jpg'],
56 |             'ebookmaker.mydocutils.parsers': ['*.rst'],
57 |             'ebookmaker.mydocutils.writers': ['*.css'],
58 |             'ebookmaker.mydocutils.gutenberg.parsers': ['*.rst'],
59 |         },
60 | 
61 |         data_files = [
62 |             ('', ['CHANGES', 'README.md']),
63 |         ],
64 | 
65 |         # metadata for upload to PyPI
66 | 
67 |         author = "Marcello Perathoner",
68 |         maintainer = "Eric Hellman",
69 |         maintainer_email = "eric@hellman.net",
70 |         description = "The Project Gutenberg tool to generate EPUBs and other ebook formats.",
71 |         long_description = open ('README.md', encoding='utf-8').read (),
72 |         long_description_content_type = 'text/markdown',
73 |         license = "GPL v3",
74 |         keywords = "ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion",
75 |         url = "https://github.com/gutenbergtools/ebookmaker/",
76 | 
77 |         classifiers = [
78 |             "Topic :: Text Processing",
79 |             "License :: OSI Approved :: GNU General Public License (GPL)",
80 |             "Environment :: Console",
81 |             "Operating System :: OS Independent",
82 |             "Intended Audience :: Other Audience",
83 |             "Programming Language :: Python",
84 |             "Programming Language :: Python :: 3.9",
85 |             "Programming Language :: Python :: 3.10",
86 |             "Programming Language :: Python :: 3.11",
87 |         ],
88 | 
89 |         platforms = 'OS-independent'
90 |     )
91 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/txt2all.css:
--------------------------------------------------------------------------------
/* ************************************************************************
 * classless css copied from https://www.pgdp.net/wiki/CSS_Cookbook/Styles
 * ********************************************************************** */
/* ************************************************************************
 * set the body margins to allow whitespace along sides of window
 * ********************************************************************** */
	body { margin-left:8%; width:85%; /* leaves 7% == margin-right:7% */ }
/* ************************************************************************
 * set the indention, spacing, and leading for body paragraphs.
 * ********************************************************************** */
	p { /* all paragraphs unless overridden */
		margin-top: 1em;	/* inter-paragraph space */
		margin-bottom: 0;	/* use only top-margin for spacing */
		line-height: 1.4em;	/* interline spacing ("leading") */
	}
	body > p { /* paras at <body> level - not in <div> or <table>  */
		text-align: justify; /* or left?? */
		text-indent: 1em;	/* first-line indent */
	}
	/* suppress indentation on paragraphs following heads  */
	h2+p, h3+p, h4+p { text-indent: 0; }
	/* tighter spacing for list item paragraphs */
	dd, li {
		margin-top: 0.25em; margin-bottom:0;
		line-height: 1.2em; /* a bit closer than p's */
	}
/* ************************************************************************
 * Head 2 is for chapter heads.
 * ********************************************************************** */
	h2 {
		/* text-align:center;  left-aligned by default. */
		margin-top:3em;		/* extra space above.. */
		margin-bottom: 2em;	/* ..and below */
		clear: both;		/* don't let sidebars overlap */
	}
/* ************************************************************************
 * Head 3 is for main-topic heads.
 * ********************************************************************** */
	h3 {
			/* text-align:center;  left-aligned by default. */
			margin-top: 2em;	/* extra space above but not below */
			font-weight: normal; /* override default of bold */
			clear: both; /* don't let sidebars overlap */
	}
/* ************************************************************************
 * Styling the default HR and some special-purpose ones.
 * Default rule centered and clear of floats; sized for thought-breaks
 * ********************************************************************** */
	hr {
		width:45%;			/* adjust to ape original work */
		margin-top: 1em;	/* space above & below */
		margin-bottom: 1em;
		margin-left: auto;  /* these two ensure a.. */
		margin-right: auto; /* ..centered rule */
		clear: both;		/* don't let sidebars & floats overlap rule */
	}
/* ************************************************************************
 * Images and captions
 * ********************************************************************** */
	img { /* the default inline image has */
		border: 1px solid black; /* a thin black line border.. */
		padding: 6px; /* ..spaced a bit out from the graphic */
		}


--------------------------------------------------------------------------------
/docs/images.md:
--------------------------------------------------------------------------------
 1 | IMAGES AND COVERS
 2 | 
 3 | As of EbookMaker 0.9, image filesize and dimension limits are being set differently.
 4 | 
 5 | EbookMaker now considers three types of images it finds in HTML, and handles each of them differently:
 6 | 
 7 | 1. inline images
 8 |     `<img src="unicorn.png" alt="Image of a Unicorn" />`
 9 | 2. linked images
10 |     `<a href="bigunicorn.png" title="Expanded Image of a Unicorn">Click for larger Unicorn</a>`
11 | 3. cover images
12 |    These can come in 4 flavors (in priority order):
13 |     1. coverpage relation
14 |         `<link href="unicorn_image.jpg" rel="coverpage" />`  or `<link href="unicorn_image.jpg" rel="icon" type="image/x-cover" />` (preferred)
15 |     2. coverpage id
16 |         `<img src="unicorn_image.jpg" id="coverpage" alt="front jacket" />`
17 |     3. image with 'cover' in the url
18 |         `<img src="unicorn_cover.jpg" alt="front jacket" />`
19 |     4. image with 'title' in the url
20 |         `<img src="unicorn_titlepage.jpg" alt="front jacket" />`
21 | 
22 | Ebookmaker doesn't like to have duplicate covers, so it takes the first cover of sufficient size (>200x200), creates a cover wrapper for it, and tries to remove duplicates.
23 | 
24 | Ebookmaker doesn't touch HTML or image files submitted to Project Gutenberg and displayed as HTML books. However, it transforms both HTML and image files for inclusion in EPUB and Kindle. Cover images displayed on Project Gutenberg are sized and processed by EbookMaker, and when no cover is present, an abstract cover is generated for the book.
25 | 
26 | For compatibility, Ebookmaker > 0.9 creates "wrapper" files for linked images. Submitters do not need to create wrapper files.
27 | 
28 | Images submitted for use in HTML should be sized and compressed so that load times are reasonably short and they look good on screens.
29 | 
30 | Ebookmaker 0.9 has relaxed some limits on image sizes used inside EPUB and Kindle, considering advances in device power and network speed. Before version 0.9, any image or cover larger than 128KB was compressed to  fit under 128KB. Similarly, images and covers wider than 800 px or taller than 1280 px were proportionately scaled down to fit. In version 0.9, the limits depend on the type of the image. 
31 | 
32 | - inline images are compressed if they are larger than 256KB and scaled if they are larger than 5000 x 5000.
33 | - linked images and cover images are compressed if they are larger than 1MB and scaled if they are larger than 5000x5000
34 | 
35 | Industry specifications for book cover images have changed in the last few years. Amazon now requires that commercial ebook covers have _minimum_ dimensions of "at least 1200 pixels in width or 1800 pixels in height." They're more relaxed for self-published covers; KDP suggests minimum dimensions of 625 x 1000 px and ideal dimensions of 1600 x 2560. New Project Gutenberg books should have covers of quality commensurate with industry practice.
36 | 
37 | Since cover images specified by the coverpage/icon relation are not displayed in HTML, there is no need to limit their size (within reason!!!)
38 | 
39 | Suggested Guidelines for cover and image submissions to Project Gutenberg:
40 | 
41 | 1. Submitted cover images should be at least 625 x 1000 px and ideally larger. They should not exceed 1MB in size unless specified by a coverpage relation.
42 | 
43 | 2. Submitted images should be less than 256KB for inline images and less than 1MB for linked images.
44 | 
45 | 3. Display sizes for images should be set using relative units i.e. `ems`, and Project Gutenberg does not need to restrict pixel sizes for submitted images.
46 | 
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/PDFWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | PDFWriter.py
  6 | 
  7 | Copyright 2011 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Convert RST to PDF.
 12 | 
 13 | """
 14 | 
 15 | 
 16 | import os
 17 | import subprocess
 18 | 
 19 | from libgutenberg.Logger import debug, info, warning, error
 20 | from libgutenberg.GutenbergGlobals import SkipOutputFormat, mkdir_for_filename
 21 | 
 22 | from ebookmaker import ParserFactory
 23 | from ebookmaker import writers
 24 | from ebookmaker.CommonCode import Options
 25 | 
 26 | options = Options()
 27 | 
 28 | class Writer (writers.BaseWriter):
 29 |     """ Class to write PDF. """
 30 | 
 31 |     def build (self, job):
 32 |         """ Build PDF file. """
 33 | 
 34 |         inputfilename  = job.url
 35 |         outputfilename = os.path.join (os.path.abspath(job.outputdir), job.outputfile)
 36 | 
 37 |         debug ("Inputfile: %s" % inputfilename)
 38 |         debug ("Creating PDF file: %s" % outputfilename)
 39 | 
 40 |         mkdir_for_filename(outputfilename)
 41 |         debug(f'parser input is {inputfilename}')
 42 |         parser = ParserFactory.ParserFactory.create (inputfilename)
 43 | 
 44 |         if not hasattr (parser, 'rst2xetex'):
 45 |             debug ('Skipping PDF Output because input mediatype is %s' % parser.mediatype())
 46 |             raise SkipOutputFormat
 47 | 
 48 |         # Brain-dead xetex doesn't understand unix pipes
 49 |         # so we have to write a temp file
 50 | 
 51 |         texfilename = os.path.splitext (outputfilename)[0] + '.tex'
 52 |         auxfilename = os.path.splitext (outputfilename)[0] + '.aux'
 53 |         logfilename = os.path.splitext (outputfilename)[0] + '.log'
 54 | 
 55 |         try:
 56 |             os.remove (auxfilename)
 57 |         except OSError:
 58 |             pass
 59 | 
 60 |         tex = parser.rst2xetex (job)
 61 |         with open (texfilename, 'wb') as fp:
 62 |             fp.write (tex)
 63 | 
 64 |         try:
 65 |             cwd = os.getcwd ()
 66 |             os.chdir (os.path.abspath(job.outputdir))
 67 | 
 68 |             _xetex = subprocess.Popen ([options.config.XELATEX,
 69 |                                         "-output-directory", job.outputdir,
 70 |                                         "-interaction", "nonstopmode",
 71 |                                         texfilename],
 72 |                                        stdin = subprocess.PIPE,
 73 |                                        stdout = subprocess.PIPE,
 74 |                                        stderr = subprocess.PIPE)
 75 |         except OSError as what:
 76 |             os.chdir (cwd)
 77 |             error ("PDFWriter: %s %s" % (options.config.XELATEX, what))
 78 |             raise SkipOutputFormat
 79 | 
 80 |         (dummy_stdout, dummy_stderr) = _xetex.communicate ()
 81 | 
 82 |         with open (logfilename, encoding='utf-8') as fp:
 83 |             for line in fp:
 84 |                 line = line.strip ()
 85 |                 if 'Error:' in line:
 86 |                     error ("xetex: %s" % line)
 87 |                 if options.verbose >= 1:
 88 |                     if 'Warning:' in line:
 89 |                         warning ("xetex: %s" % line)
 90 | 
 91 |         if options.verbose < 2:
 92 |             try:
 93 |                 os.remove (texfilename)
 94 |                 os.remove (logfilename)
 95 |                 os.remove (auxfilename)
 96 |             except OSError:
 97 |                 pass
 98 | 
 99 |         os.chdir (cwd)
100 | 
101 |         debug ("Done PDF file: %s" % outputfilename)
102 | 


--------------------------------------------------------------------------------
/USAGE.md:
--------------------------------------------------------------------------------
 1 | # Usage Notes
 2 | 
 3 | Ebookmaker has to reliably make EPUB and MOBI for over 60,000 different titles every month, so it includes a number of adaptations that may not be intuitive for HTML authors.
 4 | 
 5 | ## Crawling
 6 | 
 7 | Ebookmaker starts with a document file path or URL, and then follows links and images to a depth determined by the `--max_depth` setting. It only follows links that are in the same directory or below; anything in the same directory linked by the starting page will be included in the ebook it tries to build. The `*.noimages` filetype builds (for example, `--make=epub.noimages`) exclude images. If you don't want the ebook to include a resource that your HTML links to, use the `rel='nofollow'` attribute of the `a` tag.
 8 | 
 9 | The crawl from the starting page determines the reading order for the ebook. If the starting page links to another html page, the content from that page will be placed after the starting page in the reading order. For this reason, it's simpler to put all the content on a single page. Multi-page HTML books should convert well if attention is paid to the reading order implied by the starting page.
10 | 
11 | ## Floats and absolute positioning
12 | 
13 | Ebookmaker removes elements that float, because a large part of the PG backfile was produced before any ebook readers could handle floats. It also removes elements with absolute positioning, as it is not supported by EPUB2. HTML authors can prevent floating elements from being stripped by using a css selector that contains the `x-ebookmaker` class. Ebookmaker assumes that if the HTML designer uses the `x-ebookmaker` class, they've considered the impact of the float on the generated EPUB.
14 | 
15 | ## Page numbers
16 | 
17 | Ebookmaker strips content from elements that it thinks are page numbers. HTML produced for PG often implements the original page numbers either with float or with absolute positioning. If these elements were left in, they would show up as numbers in the middle of the text.
18 | 
19 | To still keep links working, all page number contraptions are replaced with empty `a` tags with class `x-ebookmaker-pageno`.
20 | 
21 | The classes that make Ebookmaker think the element is a page number are: `pagenum pageno page pb folionum foliono`.
22 | 
23 | ## Tables of Contents
24 | 
25 | Ebookmaker uses HTML heading elements to generate a table of contents. To play nicely with this process, HTML should not use heading elements for things that don't belong in the table of contents, and _should_ use heading elements for things that do!
26 | 
27 | ## Hidden content
28 | 
29 | Content hidden by the `display:none` css directive can create havoc with ebook generation. For example, MOBI generation _will_ fail if the target of a link is hidden. Authors of HTML for Ebookmaker should refrain from using `display:none` and should check that all ebook formats convert as expected.
30 | 
31 | ## Images and Covers
32 | 
33 | HTML authors can control the image that Ebookmaker uses as a cover for ebook files. If there is no suitable cover image, Ebookmaker will generate one. Images are scaled if they are "too big". It's a bit complicated, so there's [a separate page](docs/images.md) that tries to explain it all.
34 | 
35 | 
36 | ## Special classes
37 | 
38 | Ebookmaker recognizes a number of special classes that can be used to modify its HTML conversion. There are 4 "`x-ebookmaker`" classes:
39 | 
40 |  - Ebookmaker adds the class `x-ebookmaker` to the `body` element inside the EPUBs it builds. This can then be used by CSS to make styles that are only active inside an ebook file. This class replaces a deprecated 'handheld' @media query.
41 |  - The `x-ebookmaker-important` class on an image element tells ebookmaker not to remove the image, even in `*.noimages` builds.
42 |  - The `x-ebookmaker-drop` class tells ebookmaker to remove an element and its descendants from ebook builds. Don't use this class to prevent a file from being crawled - use `rel='nofollow'` instead.
43 |  - As described above, Ebookmaker adds the `x-ebookmaker-pageno` class to  elements whose content has been stripped because they use a class that indicates they represent page numbers.
44 | 
45 | 


--------------------------------------------------------------------------------
/docs/ebookmaker_v0_11.md:
--------------------------------------------------------------------------------
 1 | # New features in Ebookmaker v0.11
 2 | 
 3 | In addition to some small tweaks in its generated EPUBs, Ebookmaker version 0.11 also emits regularized HTML files for all types of input, including HTML source files. These "derived" files are now the preferred HTML presentation on the PG website.
 4 | 
 5 | The source HTML files are not modified, and are available (at the URLs they've always been at) via the "More files..." link on the website. Errata should be addressed in the source files, not the derived files, as whitespace and link structure are changed by ebookmaker in ways that may preclude reprocessing. Files are re-derived for the entire catalog monthly.
 6 | 
 7 | A major impetus for this change is to improve compatibility with browser plugins, mobile apps, proxy servers, accessibility tools and PG's own file processors. Much of our back file uses old versions of HTML that are poorly supported in modern browsers and other tools, and while there is ongoing work to update the back file, we are thousands of books away from being able to present uniformly coded HTML. This change is also a first step towards being able to use HTML5 for both source files and for presentation; for many PG books, the derived files will validate as HTML5. 
 8 | 
 9 | Submitters should be aware that our current process first converts submitted files to XHTML and HTML5ish files are derived from the XHTML; our process does not yet support features introduced in HTML5.
10 | 
11 | Here are the differences between HTML source files and the HTML files derived from them:
12 | 
13 | 1. all HTML files are cleaned by HTML Tidy. Tidy does the following:
14 |     i. HTML Tidy emits well-formed UTF8-encoded XHTML-compatible files. This will allow the PG web server to add the encoding to MIME headers, improving browser compatibility and accessibility.
15 |     ii. LF is used as the newline character for all files (unix standard)
16 |     iii. HTML entities such as `&rsquo;` `&Aacute;` etc. are converted to unicode characters. Together with webserver configuration changes, this will improve web browser compatibility.
17 |     iv. Tidy corrects badly formed HTML, improving browser compatibility and standards conformance.
18 |     v. A doctype declaration, `<!DOCTYPE html>`, is used for all files. This is compatible with the included metadata.
19 |     vi. Tags are now uniformly lower case
20 |     vii. Some legacy presentational tags (`<i>`, `<b>`, `<center>` when enclosed within appropriate inline tags, and `<font>`) are replaced with CSS `<style>` tags and structural markup as appropriate.
21 |     viii. Empty paragraphs are discarded.
22 |     ix. Any text directly in the `<body>` element is wrapped in a `<p>` element.
23 |     x. Empty tags in HTML are now closed with an end tag. So... `<a id="x" />` is changed to `<a id="x" ></a>`. This is needed because Chrome and Safari no longer support self-closing tags.
24 |     xi. Inline style attributes are moved to a generated inline stylesheet for better rendering performance. The same mechanism is used to separate CSS from text in our EPUB files.
25 |     
26 | 2. Metadata is added to the `<head>` element. We include Facebook OpenGraph, Dublin Core, and schema.org metadata for better SEO and Facebook/Twitter unfurls. Changes in the metadata entered by cataloguers are now reflected in the metadata of the HTML presentation.
27 | 
28 | 3. Because the derived HTML is moved to a new directory, linked files also needed to be moved. Because the derived file has a different name, back-links needed to be changed.
29 | 
30 | There is one minor change to the EPUB generation process. `data-*` attributes are now removed because they were preventing EPUB2 validation.
31 | 
32 | 
33 | Some versions of ebookmaker since our last production release did not run without access to the live PG database. Don't use them. 
34 | 
35 | This version of ebookmaker has not been tested on Windows, as I don't currently have access to a Windows box for development. If you run ebookmaker on Windows, please let me know how it goes, and if there are problems, please comment here or create an issue on the GitHub repo: https://github.com/gutenbergtools/ebookmaker/issues
36 | 
37 | In the next major version of Ebookmaker, the boilerplate headers and footers will be inserted/replaced as part of the presentation HTML derivation process.


--------------------------------------------------------------------------------
/src/ebookmaker/writers/HtmlTemplates.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | HtmlTemplates.py
  6 | 
  7 | Copyright 2022 by Project Gutenberg
  8 | 
  9 | Use f-strings to render boilerplate trees
 10 | """
 11 | import datetime
 12 | import html
 13 | import lxml
 14 | from lxml import etree
 15 | 
 16 | from .TemplateStrings import COPYRIGHT_ADDITION, COPYRIGHTED, CSS_FOR_HEADER, HEADERA, HEADERB
 17 | 
 18 | pg_date = datetime.date(1971, 12, 1)
 19 | try:
 20 |     hr_format = "%B %-d, %Y"
 21 |     f'{pg_date.strftime(hr_format)}'
 22 | except ValueError:
 23 |     # https://strftime.org/
 24 |     hr_format = "%B %#d, %Y"
 25 | 
 26 | 
def pgheader(dc):
    """ Render the Project Gutenberg header boilerplate for a book.

    Reads title, subtitle, authors, languages, rights, release/update
    dates, ebook number, publication info and credits from `dc`, builds
    an XHTML <section id="pg-header"> string from them and returns the
    element produced by parsing that string.

    """
    def pstyle(key, val):
        """ Return one '<p><strong>Key</strong>: value</p>' line.

        Returns '' when key or val is empty. key goes through
        str.capitalize(), so e.g. 'Release Date' is rendered as
        'Release date'. Every line of val is HTML-escaped and the lines
        are re-joined with '<br/>'. The special key 'Previous' produces
        an unlabelled, indented paragraph that continues the preceding
        entry.
        """
        key = key.capitalize()
        if not key or not val:
            return ''
        val = f'<br/>\n{padding}'.join([html.escape(v) for v in val.split('\n')])
        if key == 'Previous':
            # note: having the padding inside the span was causing kobo readers to crash
            return f'''<p style='margin-top:0'>{padding}<span style='padding-left: 7.5ex'></span>{val}</p>{nl}'''
        else:
            return f'''<p><strong>{key}</strong>: {val}</p>{nl}'''

    def dcauthlist(dc):
        """ Return the markup for dc.authors.

        A creator whose role differs from the previous creator's starts
        a new paragraph labelled with that role; a creator with the same
        role as the previous one is added as a 'Previous' continuation.
        """
        cre_list = ''
        block_role = ''
        for creator in dc.authors:
            if block_role != creator.role:
                cre_list +=  nl + pstyle(creator.role, dc.make_pretty_name(creator.name))
                block_role = creator.role
            else:
                # roughly line up additional vals under previous
                cre_list += pstyle('Previous', dc.make_pretty_name(creator.name))
        return cre_list

    language_list = []
    lang = ''
    padding = "        "
    nl = '\n'
    # lang keeps the id of the first language only; language_list
    # collects the display names of all of them
    for language in dc.languages:
        lang = lang if lang else language.id
        language_list.append(language.language)

    # the copyright notice is inserted only when the rights statement
    # mentions 'copyright'
    if 'copyright' in dc.rights.lower():
        rights = HEADERA.format(copyrighted=COPYRIGHTED)
    else:
        rights = HEADERA.format(copyrighted='')

    # an update less than 14 days after the release is not mentioned
    if dc.update_date - dc.release_date < datetime.timedelta(days=14):
        updated = ''
    else:
        updated = f'{nl}{padding}Most recently updated: {dc.update_date.strftime(hr_format)}'
    # datetime.date.min is treated as "no release date"
    if dc.release_date == datetime.date.min:
        release_date = 'No release date'
    else:
        release_date = dc.release_date.strftime(hr_format)
    pg_header = '<section class="pg-boilerplate pgheader" id="pg-header" xml:lang="en" lang="en" xmlns="http://www.w3.org/1999/xhtml">'
    pg_header += "<h2 id='pg-header-heading' title=''>"
    pg_header += 'The Project Gutenberg eBook of '
    pg_header += f'''<span lang='{lang}' xml:lang='{lang}' id='pg-title-no-subtitle'>{
        html.escape(dc.title_no_subtitle)
    }</span></h2>
    {rights}<div class='container' id='pg-machine-header'>{
        pstyle('Title', dc.title_no_subtitle)
    }{
        pstyle('Previous', dc.subtitle) if dc.subtitle  else ''
    }<div id='pg-header-authlist'>{
        dcauthlist(dc)
    }</div>
{pstyle('Release Date', 
            f'{release_date} [eBook #{dc.project_gutenberg_id}]' + updated)}
{pstyle('Language', ', '.join(language_list))}
{pstyle('Original Publication', str(dc.pubinfo))}
{pstyle('Credits', dc.credit)}
</div><div id='pg-start-separator'>
<span>*** START OF THE PROJECT GUTENBERG EBOOK {html.escape(dc.title_no_subtitle.upper())} ***</span>
</div></section>
'''
    # pstyle() returns '' for empty fields, which leaves runs of blank
    # lines in the template; triple newlines are squeezed to double ones.
    # NOTE(review): this module imports only `lxml` and `lxml.etree`, so
    # `lxml.html` must already have been imported elsewhere - confirm.
    return etree.fromstring(pg_header.replace('\n\n\n', '\n\n'), lxml.html.XHTMLParser())
 95 |     
 96 | 
 97 | def pgfooter(dc):
 98 |     copyright_addition = COPYRIGHT_ADDITION if 'copyright' in dc.rights.lower() else ''
 99 | 
100 |     pg_footer = f'''
101 | <section class="pg-boilerplate pgheader" id="pg-footer" lang='en' xml:lang='en' xmlns="http://www.w3.org/1999/xhtml">
102 | <div id='pg-end-separator'>
103 | <span>*** END OF THE PROJECT GUTENBERG EBOOK {html.escape(dc.title_no_subtitle.upper())} ***</span>
104 | </div>
105 | 
106 |     {HEADERB.format(copyright_addition=copyright_addition)}
107 | </section>
108 | '''
109 |     return etree.fromstring(pg_footer, lxml.html.XHTMLParser())
110 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/KindleWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | KindleWriter.py
  7 | 
  8 | Copyright 2009-2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | """
 13 | 
 14 | import re
 15 | import os
 16 | import subprocess
 17 | import sys
 18 | 
 19 | from libgutenberg.Logger import info, debug, warning, error
 20 | from libgutenberg.GutenbergGlobals import SkipOutputFormat
 21 | from ebookmaker.writers import BaseWriter
 22 | from ebookmaker.CommonCode import Options
 23 | 
 24 | options = Options()
 25 | no_kindlegen_langs = ['ceb', 'eo', 'fur', 'ia', 'ilo', 'iu', 'mi',
 26 |                       'myn', 'nah', 'nap', 'oc', 'oji', 'tl', 'bo']
 27 | 
 28 | class Writer(BaseWriter):
 29 |     """ Class for writing kindle files. """
 30 | 
 31 | 
 32 |     def build(self, job):
 33 |         """ Build kindle file from epub using amazon kindlegen or calibre. """
 34 | 
 35 |         if job.maintype == 'kindle':
 36 |             if job.dc.languages:
 37 |                 if job.dc.languages[0].id in no_kindlegen_langs:
 38 |                     mobimaker = options.config.MOBILANG
 39 |                 else:
 40 |                     mobimaker = options.config.MOBIGEN
 41 |         else:
 42 |             mobimaker = options.config.MOBIKF8
 43 |         if not mobimaker:
 44 |             info('no mobimaker available')
 45 |             return
 46 | 
 47 |         # kindlegen needs localized paths
 48 |         outputdir = os.path.abspath(job.outputdir)
 49 | 
 50 |         debug("Creating Kindle file: %s" % os.path.join(outputdir, job.outputfile))
 51 |         debug("            ... from: %s" % job.url)
 52 | 
 53 |         try:
 54 |             cwd = os.getcwd()
 55 |             os.chdir(outputdir)
 56 |             if 'ebook-convert' in mobimaker:
 57 |                 kindlegen = subprocess.run(
 58 |                     [
 59 |                         mobimaker,
 60 |                         job.url,
 61 |                         os.path.basename(job.outputfile),
 62 |                         '--personal-doc="[EBOK]"',
 63 |                         '--mobi-file-type=' + ('new' if job.maintype == 'kf8' else 'old')
 64 |                     ],
 65 |                     stdout=subprocess.PIPE,
 66 |                     stderr=subprocess.PIPE
 67 |                 )
 68 |             else:
 69 |                 kindlegen = subprocess.run(
 70 |                     [
 71 |                         mobimaker,
 72 |                         '-o', os.path.basename(job.outputfile),
 73 |                         job.url
 74 |                     ],
 75 |                     stdout=subprocess.PIPE,
 76 |                     stderr=subprocess.PIPE
 77 |                 )
 78 | 
 79 |         except OSError as what:
 80 |             os.chdir(cwd)
 81 |             error("KindleWriter: %s %s" % (mobimaker, what))
 82 |             raise SkipOutputFormat
 83 | 
 84 |         os.chdir(cwd)
 85 | 
 86 |         if kindlegen.returncode > 0:
 87 |             regex = re.compile(r'^(\w+)\(prcgen\):')
 88 | 
 89 |             # pylint: disable=E1103
 90 |             msg = kindlegen.stderr.rstrip()
 91 |             if msg:
 92 |                 msg = msg.decode(sys.stderr.encoding)
 93 |                 error(msg)
 94 |             msg = kindlegen.stdout.rstrip()
 95 |             msg = msg.decode(sys.stdout.encoding)
 96 |             for line in msg.splitlines():
 97 |                 match = regex.match(line)
 98 |                 if match:
 99 |                     sline = regex.sub("", line)
100 |                     g = match.group(1).lower()
101 |                     if g == 'info':
102 |                         if sline == 'MOBI File generated with WARNINGS!':
103 |                             # we knew that already
104 |                             continue
105 |                     elif g == 'warning':
106 |                         if sline.startswith('Cover is too small'):
107 |                             continue
108 |                         if sline == 'Cover not specified':
109 |                             continue
110 |                         warning("kindlegen: %s" % sline)
111 |                     elif g == 'error':
112 |                         error("kindlegen: %s" % sline)
113 |                     else:
114 |                         error(line)
115 | 
116 |         debug("Done Kindle file: %s" % os.path.join(outputdir, job.outputfile))
117 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/TxtWriter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | TxtWriter.py
  6 | 
  7 | Copyright 2009 by Marcello Perathoner
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Build an UTF-8-encoded PG plain text file. This is just the plain text
 12 | version recoded into UTF-8.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import unicode_literals
 17 | 
 18 | import os
 19 | import subprocess
 20 | import sys
 21 | 
 22 | from libgutenberg.Logger import debug, info, warning, error
 23 | from libgutenberg.GutenbergGlobals import SkipOutputFormat, mkdir_for_filename
 24 | 
 25 | from ebookmaker import ParserFactory
 26 | from ebookmaker import writers
 27 | from ebookmaker.CommonCode import Options
 28 | from ebookmaker.parsers.boilerplate import strip_headers_from_txt
 29 | 
 30 | from .HtmlTemplates import pgheader, pgfooter
 31 | 
# ebookmaker options object; this module reads options.verbose and
# options.config.GROFF from it
options = Options()
 33 | 
 34 | # map some not-widely-supported characters to more common ones
 35 | u2u = {
 36 |     0x2010: '-',  # unicode HYPHEN to HYPHEN-MINUS. Many Windows fonts lack this.
 37 |     }
 38 | 
 39 | 
 40 | def insert_boilerplate(job, text):
 41 |     text, header, footer = strip_headers_from_txt(text)
 42 |     pg_header = pgheader(job.dc).text_content()
 43 |     pg_footer = pgfooter(job.dc).text_content()
 44 |     return pg_header + text + pg_footer
 45 | 
 46 | 
 47 | class Writer(writers.BaseWriter):
 48 |     """ Class to write PG plain text. """
 49 | 
 50 |     def groff(self, job, nroff, encoding='utf-8'):
 51 |         """ Process thru groff.
 52 | 
 53 |         Takes and returns unicode strings!
 54 | 
 55 |         """
 56 | 
 57 |         device = {'utf-8': 'utf8',
 58 |                   'iso-8859-1': 'latin1',
 59 |                   'us-ascii': 'ascii'}[encoding]
 60 | 
 61 |         nroff = nroff.encode(encoding)
 62 |         nrofffilename = os.path.join(
 63 |             os.path.abspath(job.outputdir),
 64 |             os.path.splitext(job.outputfile)[0] + '.nroff')
 65 | 
 66 |         # write nroff file for debugging
 67 |         if options.verbose >= 2:
 68 |             with open(nrofffilename, 'wb') as fp:
 69 |                 fp.write(nroff)
 70 |         else:
 71 |             try:
 72 |                 # remove debug files from previous runs
 73 |                 os.remove(nrofffilename)
 74 |             except OSError:
 75 |                 pass
 76 | 
 77 |         # call groff
 78 |         try:
 79 |             _groff = subprocess.Popen([options.config.GROFF,
 80 |                                        "-t",             # preprocess with tbl
 81 |                                        "-K", device,     # input encoding
 82 |                                        "-T", device],    # output device
 83 |                                       stdin=subprocess.PIPE,
 84 |                                       stdout=subprocess.PIPE,
 85 |                                       stderr=subprocess.PIPE)
 86 |         except OSError:
 87 |             error("TxtWriter: executable not found: %s" % options.config.GROFF)
 88 |             raise SkipOutputFormat
 89 | 
 90 |         (txt, stderr) = _groff.communicate(nroff)
 91 | 
 92 |         # pylint: disable=E1103
 93 |         for line in stderr.splitlines():
 94 |             line = line.decode(sys.stderr.encoding)
 95 |             line = line.strip()
 96 |             if 'error' in line:
 97 |                 error("groff: %s" % line)
 98 |             elif 'warn' in line:
 99 |                 if options.verbose >= 1:
100 |                     warning("groff: %s" % line)
101 | 
102 |         txt = txt.decode(encoding)
103 |         return txt.translate(u2u) # fix nroff idiosyncracies
104 | 
105 | 
106 |     def build(self, job):
107 |         """ Build TXT file. """
108 | 
109 |         filename = os.path.join(job.outputdir, job.outputfile)
110 | 
111 |         encoding = job.subtype.strip('.')
112 | 
113 |         mkdir_for_filename(filename)
114 | 
115 |         debug("Creating plain text file: %s from %s", filename, job.url)
116 | 
117 |         parser = ParserFactory.ParserFactory.create(job.url)
118 | 
119 |         # don't make txt file unless the source is txt of some encoding
120 |         has_txt_source = 'text/plain' in str(parser.attribs.orig_mediatype)
121 |         is_html_source = not has_txt_source and \
122 |                          hasattr(parser, 'xhtml') and \
123 |                          parser.xhtml is not None
124 | 
125 |         if hasattr(parser, 'rst2nroff'):
126 |             data = self.groff(job, parser.rst2nroff(job, encoding), encoding)
127 |         elif is_html_source:
128 |             info("Plain text file %s aborted due to html input" % filename)
129 |             return
130 |         else:
131 |             data = parser.unicode_content()
132 | 
133 |         data = insert_boilerplate(job, data)
134 | 
135 |         data = data.encode('utf_8_sig' if encoding == 'utf-8' else encoding, 'unitame')
136 | 
137 |         self.write_with_crlf(filename, data)
138 | 
139 |         debug("Done plain text file: %s" % filename)
140 | 


--------------------------------------------------------------------------------
/src/ebookmaker/Unitame.py:
--------------------------------------------------------------------------------
  1 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | Unitame.py
  5 | 
  6 | Copyright 2010 by Marcello Perathoner
  7 | 
  8 | Distributable under the GNU General Public License Version 3 or newer.
  9 | 
 10 | Module to implement the totally superfluous PG plain text conversion
 11 | into long extinct encodings.
 12 | 
 13 | We have to unitame-translate before feeding to nroff because nroff
 14 | does some irreversible (and wrong) translations of its own, like ä ->
 15 | a. Also, some unitame-translations change the number of characters,
 16 | thus throwing already-justified text off.
 17 | 
 18 | We cannot do the translations before feeding the source to docutils
 19 | because if we change the length of titles, we get the warning: Title
 20 | underline too short.
 21 | 
 22 | Translation does some dangerous things, like converting quotes to
 23 | apostrophes, which are command escapes in nroff. We have to escape
 24 | apostrophes in the source text but not apostroph-commands inserted by
 25 | the converter.
 26 | 
 27 | We also have to translate some important non-ascii characters, like
 28 | nbsp and shy, into command sequences before they reach unitame because
 29 | unitame would convert them into the semantically different space and
 30 | hyhpen.
 31 | 
 32 | All this makes translation inside the docutils converter the best
 33 | choice. Implemented as a docutils translator that visits all text
 34 | nodes.
 35 | 
 36 | Smart quote translation should also go into a docutils
 37 | translator. Likewise a translator for text-transform: upper.
 38 | 
 39 | """
 40 | 
 41 | from __future__ import unicode_literals
 42 | 
 43 | import codecs
 44 | import unicodedata as ud
 45 | 
 46 | # UnitameData is generated from unitame.dat
 47 | from ebookmaker.UnitameData import unicode_to_iso_8859_1, iso_8859_1_to_ascii
 48 | 
# tweak dicts for translate (): str.translate () looks characters up by
# code point, so the single-character keys of the generated tables are
# converted with ord ()
u2i = dict ( [ (ord (o), s) for o, s in unicode_to_iso_8859_1.items () ] )
i2a = dict ( [ (ord (o), s) for o, s in iso_8859_1_to_ascii.items () ] )

# hand-made additions to (and overrides of) the generated table;
# an empty replacement drops the character
u2i.update ( {
    0x2000:     ' ',    # en quad
    0x2001:     '  ',   # em quad
    0x2002:     ' ',    # en space
    0x2003:     '  ',   # em space
    0x2004:     ' ',    # 3/em space
    0x2005:     '',     # 4/em
    0x2006:     '',     # 6/em
    0x2007:     ' ',    # figure space
    0x2008:     '',     # punctuation space
    0x2009:     '',     # thin space
    0x200a:     '',     # hair space
    0x200b:     '',     # zero space
    0x200c:     '',     # zwnj
    0x200d:     '',     # zwj
    0x2010:     '-',    # hyphen
    0x2011:     '-',    # non-breaking hyphen
    0x2012:     '-',    # figure-dash
    0x2013:     '-',    # en dash
    0x2014:     '--',   # em dash
    0x2015:     '-',    # horizontal bar
    0x2026:     '...',  # horizontal ellipsis
    ord ('™'): '(tm)',
    ord ('‹'): '<',
    ord ('›'): '>',
    ord ('†'): '+',
    ord ('‡'): '++',
    ord ('⁑'): '**',
    ord ('⁂'): '***',
    ord ('•'): '-',
    ord ('′'): '´',
    ord ('″'): '´´',
    ord ('‴'): '´´´',
    ord ('⁗'): '´´´´',
    ord ('⁓'): '~',
    ord ('‰'): '%o',
    ord ('‱'): '%oo',
    ord ('⚹'): '*',    # U+26b9 sextile
    ord ('⁰'): '^0',
    ord ('⁴'): '^4',
    ord ('⁵'): '^5',
    ord ('⁶'): '^6',
    ord ('⁷'): '^7',
    ord ('⁸'): '^8',
    ord ('⁹'): '^9',
    } )

# somehow cram these into ascii, so the ppers stop whining about not
# having nbsp in ascii, then fix it later by replacing them with nroff
# commands.
# (nbsp and shy are mapped to the control characters DC1 and DC2)

i2a.update ( {
    ord ('¹'): '^1',
    ord ('²'): '^2',
    ord ('³'): '^3',
    0x00a0:     '\u0011',       # nbsp => DC1
    0x00ad:     '\u0012',       # shy  => DC2
} )

# extended by unitame () whenever it has to emit a '{~NAME U+xxxx~}'
# placeholder for a character without a replacement
unhandled_chars = []
113 | 
def strip_accents (text):
    """ Return text without its combining marks.

    The text is decomposed (NFKD), every character of category 'Mn'
    (nonspacing mark) is dropped and the rest is recomposed (NFKC).
    Used to retry a character that does not fit into the target
    encoding without its accent.

    """
    decomposed = ud.normalize ('NFKD', text)
    base_chars = [ch for ch in decomposed if ud.category (ch) != 'Mn']
    return ud.normalize ('NFKC', ''.join (base_chars))
125 | 
126 | 
def unitame (exc):
    """
    Encoding error handler.

    The encoder handles all compatible characters itself.  It calls
    this function whenever it encounters a character it cannot encode.
    This function searches the unitame database for a replacement.

    For the 'latin-1' and 'ascii' encodings every offending character
    is run through the translation tables and, if needed, stripped of
    its accents.  A character that still does not fit (or any
    character for another encoding) is replaced by a
    '{~NAME U+xxxx~}' placeholder and recorded in unhandled_chars.

    Returns the (replacement, resume position) tuple required of
    codec error handlers.  Errors other than UnicodeEncodeError are
    re-raised unchanged.

    """

    # decode and translate errors carry data this handler cannot process
    if not isinstance (exc, UnicodeEncodeError):
        raise exc

    l = []
    for cc in exc.object[exc.start:exc.end]:
        c = cc
        if exc.encoding == 'latin-1': # python name for iso-8859-1
            c = c.translate (u2i)
            c = strip_accents (c)
            if c and ord (max (c)) < 256:
                l.append (c)
                c = None
        elif exc.encoding == 'ascii': # python name for us-ascii
            # "1¼" -> "1 1/4"
            if cc in '¼½¾':
                if exc.start > 0 and exc.object[exc.start - 1] in '0123456789':
                    l.append (' ')
            c = c.translate (u2i)
            c = c.translate (i2a)
            c = strip_accents (c)
            if c and ord (max (c)) < 128:
                l.append (c)
                c = None

        # c is None when a replacement was appended and '' when the
        # character is to be dropped; anything else is unhandled
        if c:
            # ud.name () without a default raises ValueError for code
            # points that have no name (controls, private use, ...)
            name = ud.name (cc, 'UNNAMED CHARACTER')
            l.append ('{~%s U+%04x~}' % (name, ord (cc)))
            # NOTE(review): this records every replacement collected so
            # far, not only the unhandled character - confirm intended
            unhandled_chars.extend (l)

    return (''.join (l), exc.end)
164 | 
165 | 
166 | codecs.register_error ('unitame', unitame)
167 | 


--------------------------------------------------------------------------------
/src/ebookmaker/writers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | Writer package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | Copyright 2025 by Project Gutenberg
 10 | 
 11 | Distributable under the GNU General Public License Version 3 or newer.
 12 | 
 13 | Base classes for *Writer modules. (EpubWriter, PluckerWriter, ...)
 14 | 
 15 | """
 16 | import re
 17 | import subprocess
 18 | 
 19 | from functools import partial
 20 | import os.path
 21 | 
 22 | from lxml import etree
 23 | from lxml.builder import ElementMaker
 24 | 
 25 | from libgutenberg.Logger import critical, debug, info, error
 26 | import libgutenberg.GutenbergGlobals as gg
 27 | from libgutenberg import MediaTypes
 28 | 
 29 | from ebookmaker import parsers
 30 | from ebookmaker import ParserFactory
 31 | from ebookmaker.CommonCode import Options
 32 | from ebookmaker.Version import VERSION, GENERATOR
 33 | 
 34 | 
# ebookmaker options object; BaseWriter.validate() looks up the validator
# command line in options.config
options = Options()
 36 | 
 37 | def remove_cr(content):
 38 |     content = re.sub(r'\s*[\r\n]+\s*', '&#10;', content)
 39 |     return content
 40 | 
 41 | class BaseWriter(object):
 42 |     """
 43 |     Base class for EpubWriter, PluckerWriter, ...
 44 | 
 45 |     also used as /dev/null writer for debugging
 46 | 
 47 |     """
 48 | 
 49 |     VALIDATOR = None
 50 | 
 51 |     def build(self, job):
 52 |         """ override this in a real writer """
 53 |         pass
 54 | 
 55 | 
 56 |     @staticmethod
 57 |     def write_with_crlf(filename, bytes_):
 58 |         # \r\n is PG standard
 59 |         bytes_ = b'\r\n'.join(bytes_.splitlines()) + b'\r\n'
 60 | 
 61 |         # open binary so windows doesn't add another \r
 62 |         with open(filename, 'wb') as fp:
 63 |             fp.write(bytes_)
 64 | 
 65 | 
 66 |     def validate(self, job):
 67 |         """ Validate generated file using external tools. """
 68 | 
 69 |         if not self.VALIDATOR:
 70 |             return 0
 71 | 
 72 |         debug("Validating %s ..." % job.outputfile)
 73 | 
 74 |         filename = os.path.join(os.path.abspath(job.outputdir), job.outputfile)
 75 | 
 76 |         if hasattr(options.config, self.VALIDATOR):
 77 |             validator = getattr(options.config, self.VALIDATOR)
 78 |             info('validating...')
 79 |             params = validator.split() + [filename]
 80 |             checker = subprocess.run(params,
 81 |                                        stdout=subprocess.PIPE,
 82 |                                        stderr=subprocess.PIPE)
 83 | 
 84 |             if checker.stderr:
 85 |                 critical('validation error reported by %s:\r', self.VALIDATOR)
 86 |                 critical(checker.stderr.decode("utf-8"))
 87 |                 return 1
 88 | 
 89 |         info("%s validates ok." % job.outputfile)
 90 |         return 0
 91 | 
 92 | 
 93 |     def sync(self):
 94 |         """  Override this if you need to sync before program exit. """
 95 |         pass
 96 | 
 97 | 
 98 |     def make_links_relative(self, xhtml, base_url):
 99 |         """ Make absolute links in xhtml relative to base_url. """
100 | 
101 |         debug("Making links relative to: %s" % base_url)
102 |         xhtml.rewrite_links(partial(gg.make_url_relative, base_url))
103 | 
104 | 
105 | 
# element factory used by HTMLishWriter to build meta, style and link
# elements
em = ElementMaker()
107 | 
class HTMLishWriter(BaseWriter):
    """ Common helpers for writers whose output is an (X)HTML tree. """

    @staticmethod
    def add_class(elem, class_):
        """ Append class_ to the class attribute of elem. """

        current = elem.get('class', '').split()
        elem.set('class', ' '.join(current + [class_]))


    @staticmethod
    def add_meta(xhtml, name, content):
        """ Append a <meta name content> element to every head. """

        for head in gg.xpath(xhtml, '//xhtml:head'):
            # line breaks in content become '&#10;'
            tag = em.meta(name=name, content=remove_cr(content))
            tag.tail = '\n'
            head.append(tag)

    @staticmethod
    def add_prop(xhtml, prop, content):
        """ Append a <meta property content> element to every head. """

        for head in gg.xpath(xhtml, '//xhtml:head'):
            tag = em.meta(property=prop, content=remove_cr(content))
            tag.tail = '\n'
            head.append(tag)


    @staticmethod
    def add_meta_generator(xhtml):
        """ Add a 'generator' meta tag made from GENERATOR and VERSION. """

        generator = GENERATOR % VERSION
        HTMLishWriter.add_meta(xhtml, 'generator', generator)


    @staticmethod
    def add_internal_css(xhtml, css_as_string):
        """ Insert css_as_string as first child of every head. """

        if not css_as_string or xhtml is None:
            return

        # surround the stylesheet text with exactly one newline per side
        css_text = ''.join(('\n', css_as_string.strip(' \n'), '\n'))
        for head in gg.xpath(xhtml, '//xhtml:head'):
            sheet = em.style(css_text, type='text/css')
            sheet.tail = '\n'
            head.insert(0, sheet)

    @staticmethod
    def add_body_class(xhtml, classname):
        """ Add classname to the class attribute of every body. """

        if not classname or xhtml is None:
            return

        for body in gg.xpath(xhtml, '//xhtml:body'):
            HTMLishWriter.add_class(body, classname)


    def add_external_css(self, spider, xhtml, css_as_string, url):
        """ Register a stylesheet with the spider and link to it.

        If css_as_string is given, it is parsed by a text/css parser
        created for url and that parser is appended to spider.parsers.
        If xhtml is given, every head gets a stylesheet link to url.

        """

        if css_as_string:
            css_attribs = parsers.ParserAttributes()
            css_attribs.orig_mediatype = 'text/css'
            css_attribs.url = css_attribs.orig_url = url
            css_parser = ParserFactory.ParserFactory.get(css_attribs)
            css_parser.parse_string(css_as_string)
            css_parser.make_links_absolute()
            spider.parsers.append(css_parser)

        if xhtml is None:
            return

        for head in gg.xpath(xhtml, '//xhtml:head'):
            stylesheet = em.link(href=url, rel='stylesheet', type='text/css')
            stylesheet.tail = '\n'
            head.append(stylesheet)
183 | 
184 | 
185 | 


--------------------------------------------------------------------------------
/src/ebookmaker/packagers/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | Packager package
  7 | 
  8 | Copyright 2009-2010 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Base class for Packager modules.
 13 | 
 14 | """
 15 | 
 16 | 
 17 | import os.path
 18 | import gzip
 19 | import zipfile
 20 | 
 21 | from pkg_resources import resource_listdir  # pylint: disable=E0611
 22 | 
 23 | from libgutenberg.Logger import debug, info, warning, error
 24 | import libgutenberg.GutenbergGlobals as gg
 25 | 
# suffix OneFileGzipPackager appends to a file name to name its gzipped copy
GZIP_EXTENSION = '.gzip'
 27 | 
 28 | class BasePackager (object):
 29 |     """
 30 |     Base class for Packagers.
 31 | 
 32 |     """
 33 | 
 34 |     def __init__ (self):
 35 |         self.path_name_ext = None
 36 |         self.path = None
 37 |         self.name = None
 38 |         self.ext = None
 39 | 
 40 | 
 41 |     def setup (self, job):
 42 |         """ Setup """
 43 | 
 44 |         self.path_name_ext = os.path.join(os.path.abspath(job.outputdir), job.outputfile)
 45 |         self.path, name = os.path.split (self.path_name_ext)
 46 |         self.name, self.ext = os.path.splitext (name)
 47 | 
 48 | 
 49 |     def package (self, job):
 50 |         """ Package files. """
 51 |         pass
 52 | 
 53 | 
 54 | class OneFileGzipPackager (BasePackager):
 55 |     """ Gzips one file. """
 56 | 
 57 |     def package (self, job):
 58 |         self.setup (job)
 59 |         filename = self.path_name_ext
 60 |         gzfilename = filename + GZIP_EXTENSION
 61 | 
 62 |         try:
 63 |             info ('Creating Gzip file: %s' % gzfilename)
 64 |             info ('  Adding file: %s' % filename)
 65 |             with open (filename, 'rb') as fp:
 66 |                 with gzip.open (gzfilename, 'wb') as fpgz:
 67 |                     fpgz.writelines (fp)
 68 |             info ('Done Zip file: %s' % gzfilename)
 69 |         except IOError as what:
 70 |             error (what)
 71 | 
 72 | 
 73 | class ZipPackager (BasePackager):
 74 |     """ Packages a zip file. """
 75 | 
 76 |     @staticmethod
 77 |     def create (zipfilename):
 78 |         """ Create a zip file. """
 79 | 
 80 |         info ('Creating Zip file: %s' % zipfilename)
 81 |         return  zipfile.ZipFile (zipfilename, 'w', zipfile.ZIP_DEFLATED)
 82 | 
 83 | 
 84 |     @staticmethod
 85 |     def add (zip_, filename, memberfilename):
 86 |         """ Add one file to the zip. """
 87 | 
 88 |         try:
 89 |             os.stat (filename)
 90 |             dummy_name, ext = os.path.splitext (filename)
 91 |             debug ('  Adding file: %s as %s' % (filename, memberfilename))
 92 |             zip_.write (filename, memberfilename,
 93 |                         zipfile.ZIP_STORED if ext in ['.zip', '.png']
 94 |                         else zipfile.ZIP_DEFLATED)
 95 |         except OSError:
 96 |             warning ('ZipPackager: Cannot add file %s', filename)
 97 | 
 98 | 
 99 | class OneFileZipPackager (ZipPackager):
100 |     """ Packages one file in zip of the same name. """
101 | 
102 |     def package (self, job):
103 |         self.setup (job)
104 |         filename = self.path_name_ext
105 |         zipfilename = os.path.join (self.path, self.name) + '.zip'
106 |         memberfilename = self.name + self.ext
107 | 
108 |         zip_ = self.create (zipfilename)
109 |         self.add (zip_, filename, memberfilename)
110 |         zip_.close ()
111 | 
112 |         info ('Done Zip file: %s' % zipfilename)
113 | 
114 | 
class HTMLishPackager (ZipPackager):
    """ Package a file with images. """

    def package (self, job):
        """ Zip the output file and its auxiliary files into '<name>.zip'.

        All members are placed in a folder named after the output file's
        base name.  The auxiliary files are those yielded by
        job.spider.aux_file_iter (); their URLs are made relative to
        job.base_url and looked up below the output directory.

        """
        self.setup (job)

        try:
            aux_file_list = list (job.spider.aux_file_iter ())
        except AttributeError:
            # no spider, or one that does not track auxiliary files
            aux_file_list = []

        # take the main file from the output directory, like the
        # auxiliary files below; the bare job.outputfile is only found
        # when the working directory happens to be the output directory
        filename = self.path_name_ext
        zipfilename = os.path.join (self.path, self.name) + '.zip'
        memberfilename = os.path.join (self.name, self.name) + self.ext

        # the with block closes the archive even if something below raises
        with self.create (zipfilename) as zip_:
            self.add (zip_, filename, memberfilename)

            # now images
            for url in aux_file_list:
                rel_url = gg.make_url_relative (job.base_url, url)
                filename = os.path.join (self.path, rel_url)
                memberfilename = os.path.join (self.name, rel_url)
                self.add (zip_, filename, memberfilename)

        info ('Done Zip file: %s' % zipfilename)
142 | 
143 | 
class PackagerFactory (object):
    """ Factory that maps a type/format pair to a packager module. """

    # key made by mk_key () -> module providing a Packager class
    packagers = {}

    @staticmethod
    def mk_key (type_, format_):
        """ Return the registry key '<type>/<format>'; a missing type
        counts as the empty string. """

        prefix = type_ if type_ else ''
        return prefix + '/' + format_


    @classmethod
    def load_packagers (cls):
        """ Import every '*Packager.py' module of ebookmaker.packagers
        and register it under each of its FORMATS.

        Returns the keys of the registry.

        """

        for fn in resource_listdir ('ebookmaker.packagers', ''):
            modulename, ext = os.path.splitext (fn)
            if ext != '.py' or not modulename.endswith ('Packager'):
                continue

            module = __import__ ('ebookmaker.packagers.' + modulename,
                                 fromlist = [modulename])
            debug ("Loading packager type: %s from module: %s for formats: %s" % (
                module.TYPE, modulename, ', '.join (module.FORMATS)))
            for format_ in module.FORMATS:
                key = cls.mk_key (module.TYPE, format_)
                cls.packagers[key] = module

        return cls.packagers.keys ()


    @classmethod
    def unload_packagers (cls):
        """ Empty the registry (in place). """

        cls.packagers.clear ()


    @classmethod
    def create (cls, type_, format_):
        """ Return a new Packager for type_ / format_, or None if no
        module is registered for that pair. """

        module = cls.packagers.get (cls.mk_key (type_, format_))
        return module.Packager () if module else None
190 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/ImageParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | ImageParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Parse an url of type image/*.
 13 | 
 14 | """
 15 | 
 16 | import copy
 17 | 
 18 | import six
 19 | from PIL import Image, ImageFile
 20 | 
 21 | 
 22 | from pkg_resources import resource_stream # pylint: disable=E0611
 23 | 
 24 | from libgutenberg.Logger import debug, error
 25 | from libgutenberg.MediaTypes import mediatypes as mt
 26 | from ebookmaker.parsers import ParserBase
 27 | from ebookmaker.ParserFactory import ParserFactory
 28 | from . import ParserAttributes
 29 | 
 30 | # works around problems with bad checksums in a small number of png files
 31 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 32 | 
 33 | mediatypes = (mt.jpeg, mt.png, mt.gif, mt.svg)
 34 | 
 35 | 
 36 | 
 37 | class Parser(ParserBase):
 38 |     """Parse an image.
 39 | 
 40 |     And maybe resize it for ePub packaging.
 41 | 
 42 |     """
 43 | 
 44 |     def __init__(self, attribs=None):
 45 |         ParserBase.__init__(self, attribs)
 46 |         self.image_data = None
 47 |         self.dimen = None
 48 | 
 49 | 
 50 |     def resize_image(self, max_size, max_dimen, output_format=None):
 51 |         """ Create a new parser with a resized image. """
 52 | 
 53 |         def scale_image(image, scale):
 54 |             was = ''
 55 |             if scale < 1.0:
 56 |                 dimen = (int(image.size[0] * scale), int(image.size[1] * scale))
 57 |                 was = "(was %d x %d scale=%.2f) " % (image.size[0], image.size[1], scale)
 58 |                 image = image.resize(dimen, Image.LANCZOS)
 59 |             return was, image
 60 | 
 61 |         def get_image_data(image, format_, quality='keep'):
 62 |             """ Format is the output format, not necessarily the input format """
 63 |             buf = six.BytesIO()
 64 |             if image.format != 'JPEG' and quality == 'keep':
 65 |                 quality = 90
 66 |             if format_ == 'png':
 67 |                 image.save(buf, 'png', optimize=True)
 68 |             else:
 69 |                 try:
 70 |                     image.save(buf, 'jpeg', quality=quality)
 71 |                 except ValueError as e:
 72 |                     if quality == 'keep' and 'quantization' in str(e):
 73 |                         image.save(buf, 'jpeg', quality=90)
 74 |                     else:
 75 |                         raise e
 76 |             return buf.getvalue()
 77 |         
 78 |         # can't do anything with SVG files
 79 |         if self.attribs.url.endswith('.svg'):
 80 |             return self
 81 | 
 82 |         new_parser = Parser()
 83 | 
 84 |         try:
 85 |             unsized_image = Image.open(six.BytesIO(self.image_data))
 86 | 
 87 |             format_ = unsized_image.format.lower()
 88 |             if output_format:
 89 |                 format_ = output_format
 90 |             if format_ == 'gif':
 91 |                 format_ = 'png'
 92 |                 self.attribs.url +=  '.png'
 93 |                 self.attribs.orig_mediatype = self.attribs.mediatype
 94 |                 self.attribs.mediatype = mt.png
 95 |             if format_ == 'jpeg' and unsized_image.mode.lower() not in ('rgb', 'l'):
 96 |                 unsized_image = unsized_image.convert('RGB')
 97 | 
 98 |             if 'dpi' in unsized_image.info:
 99 |                 del unsized_image.info['dpi']
100 | 
101 |             # maybe resize image
102 | 
103 |             # find scaling factor
104 |             scale = 1.0
105 |             scale = min(scale, max_dimen[0] / float(unsized_image.size[0]))
106 |             scale = min(scale, max_dimen[1] / float(unsized_image.size[1]))
107 | 
108 |             was, image = scale_image(unsized_image, scale)
109 |             data = get_image_data(image, format_)
110 | 
111 |             if format_ == 'png':
112 |                 # scale it till it fits into max_size
113 |                 while len(data) > max_size and scale > 0.01:
114 |                     scale = scale * 0.8
115 |                     was, image = scale_image(unsized_image, scale)
116 |                     data = get_image_data(image, format_)
117 |             else:
118 |                 # find best quality that fits into max_size
119 |                 if len(data) > max_size:
120 |                     for quality in (90, 85, 80, 70, 60, 50, 40, 30, 20, 10):
121 |                         data = get_image_data(image, format_, quality=quality)
122 |                         if len(data) <= max_size:
123 |                             break
124 | 
125 |                     was += 'q=%d' % quality
126 |             comment = "Image: %d x %d size=%d %s" % (
127 |                 image.size[0], image.size[1], len(data), was
128 |             )
129 |             #debug(comment)
130 | 
131 |             new_parser.image_data = data
132 |             new_parser.dimen = tuple(image.size)
133 | 
134 |             new_parser.attribs = copy.copy(self.attribs)
135 |             new_parser.attribs.comment = comment
136 |             new_parser.fp = self.fp
137 | 
138 |         except IOError as what:
139 |             error("Could not resize image: %s; message %s", self.attribs.url, what)
140 |             new_parser.attribs = copy.copy(self.attribs)
141 |             fp = resource_stream('ebookmaker.parsers', 'broken.png')
142 |             new_parser.image_data = fp.read()
143 |             fp.close()
144 | 
145 |         return new_parser
146 | 
147 | 
148 |     def get_image_dimen(self):
149 |         if self.dimen is None:
150 |             if self.image_data:
151 |                 try:
152 |                     image = Image.open(six.BytesIO(self.image_data))
153 |                     self.dimen = image.size
154 |                 except IOError as what:
155 |                     error("Could not resize image (probably broken): %s", self.attribs.url)
156 |                     self.dimen = (0, 0)  # broken image
157 |             else:
158 |                 self.dimen = (0, 0)  # broken image
159 |         return self.dimen
160 | 
161 | 
162 |     def pre_parse(self):
163 |         if self.image_data is None:
164 |             self.image_data = self.bytes_content()
165 | 
166 |     def parse(self):
167 |         pass
168 | 
169 |     def serialize(self):
170 |         """ Serialize the image. """
171 |         return self.image_data
172 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/transforms/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | 
  6 | gutenberg.py
  7 | 
  8 | Copyright 2012 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Transforms for the Project Gutenberg flavor.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import unicode_literals
 17 | 
 18 | import datetime
 19 | import textwrap
 20 | 
 21 | from docutils import nodes
 22 | import docutils.transforms
 23 | import docutils.transforms.parts
 24 | 
 25 | from libgutenberg.Logger import error, warning, info, debug
 26 | from libgutenberg.DublinCore import DublinCore
 27 | from libgutenberg.GutenbergGlobals import PG_URL
 28 | from ebookmaker.mydocutils import nodes as mynodes
 29 | 
 30 | # pylint: disable=W0142
 31 | 
 32 | class SubRefToVarTransform (docutils.transforms.Transform):
 33 |     """
 34 |     Transforms subref nodes in 'pg' namespace into var nodes.
 35 | 
 36 |     We need to save some subrefs for later processing. The standard
 37 |     subref processing happens too early (ie. before docinfo is
 38 |     collected). So we transform subrefs into variables, await docinfo
 39 |     to be processed, and then process the variables.
 40 | 
 41 |     """
 42 | 
 43 |     default_priority = 219
 44 |     """ Before substitition def variables """
 45 | 
 46 | 
 47 |     def apply (self):
 48 |         for ref in self.document.traverse (nodes.substitution_reference):
 49 |             refname = ref['refname']
 50 |             if refname.startswith ('pg.'):
 51 |                 var = mynodes.variable ()
 52 |                 var['name'] = refname
 53 |                 ref.replace_self (var)
 54 | 
 55 | 
 56 | class VariablesTransform (docutils.transforms.Transform):
 57 |     """ Replaces mynodes.var with parameters from metadata. """
 58 | 
 59 |     default_priority = 342
 60 |     """ After DocInfoCollector. """
 61 | 
 62 |     def apply(self):
 63 |         doc = self.document
 64 |         meta = doc.meta_block
 65 |         defs = doc.substitution_defs
 66 | 
 67 |         def getone (name, default = None):
 68 |             """ Get first value. """
 69 |             if name in meta:
 70 |                 return meta[name][0]
 71 |             return default
 72 | 
 73 |         def getmany (name, default = []):
 74 |             """ Get list of all values. """
 75 |             return meta.get (name, default)
 76 | 
 77 |         def sub (var, nodes):
 78 |             var.replace_self (nodes)
 79 | 
 80 |         title = getone ('DC.Title', 'No Title')
 81 |         short_title = getone ('PG.Title', title)
 82 |         short_title = short_title.split ('\n', 1)[0]
 83 | 
 84 |         language = getmany ('DC.Language', ['en'])
 85 |         language = [DublinCore.language_map.get (
 86 |             x, default='Unknown').title () for x in language]
 87 |         language = DublinCore.strunk (language)
 88 | 
 89 |         copyrighted = getone ('PG.Rights', '').lower () == 'copyrighted'
 90 | 
 91 |         for variable in doc.traverse (mynodes.variable):
 92 |             name = variable['name']
 93 | 
 94 |             if name == 'pg.upcase-title':
 95 |                 sub (variable, [ nodes.inline ('', short_title.upper ()) ])
 96 | 
 97 |             elif name == 'pg.produced-by':
 98 |                 producers = getmany ('PG.Producer')
 99 |                 if producers:
100 |                     sub (variable, [ nodes.inline ('', 'Produced by %s.' %
101 |                                                    DublinCore.strunk (producers)) ])
102 |                 else:
103 |                     sub (variable, [])
104 | 
105 |             elif name == 'pg.credits':
106 |                 sub (variable, [ nodes.inline ('', getone ('PG.Credits', '')) ])
107 | 
108 |             elif name == 'pg.bibrec-url':
109 |                 url =  '%sebooks/%s' % (PG_URL, getone ('PG.Id', '999999'))
110 |                 sub (variable, [ nodes.reference ('', '', nodes.inline ('', url), refuri = url) ])
111 | 
112 |             elif name in ('pg.copyrighted-header', 'pg.copyrighted-footer'):
113 |                 if copyrighted:
114 |                     subdef_copy = defs[name].deepcopy ()
115 |                     sub (variable, subdef_copy.children)
116 |                 else:
117 |                     sub (variable, [])
118 | 
119 |             elif name == 'pg.machine-header':
120 |                 tw = textwrap.TextWrapper (
121 |                     width = 72,
122 |                     initial_indent = 'Title: ',
123 |                     subsequent_indent = ' ' * 7)
124 | 
125 |                 if '\n' in title:
126 |                     maintitle, subtitle = title.split ('\n', 1)
127 |                     s = tw.fill (maintitle)
128 |                     s += '\n'
129 |                     tw.initial_indent = tw.subsequent_indent
130 |                     s += tw.fill (subtitle)
131 |                 else:
132 |                     s = tw.fill (title)
133 |                 s += '\n\n'
134 | 
135 |                 tw.initial_indent = 'Author: '
136 |                 tw.subsequent_indent = ' ' * 8
137 |                 s += tw.fill (DublinCore.strunk (getmany ('DC.Creator', ['Unknown'])))
138 |                 s += '\n\n'
139 | 
140 |                 date = getone ('PG.Released', '')
141 |                 try:
142 |                     date = datetime.datetime.strptime (date, '%Y-%m-%d')
143 |                     date = datetime.datetime.strftime (date, '%B %d, %Y')
144 |                 except ValueError:
145 |                     date = 'unknown date'
146 |                 s += 'Release Date: %s [eBook #%s]\n' % (date, getone ('PG.Id', '999999'))
147 | 
148 |                 for item in getmany ('PG.Reposted', []):
149 |                     try:
150 |                         date, comment = item.split (None, 1)
151 |                     except ValueError:
152 |                         date = item
153 |                         comment = None
154 |                     try:
155 |                         date = datetime.datetime.strptime (date, '%Y-%m-%d')
156 |                         date = datetime.datetime.strftime (date, '%B %d, %Y')
157 |                     except ValueError:
158 |                         date = 'unknown date'
159 | 
160 |                     s += 'Reposted: %s' % date
161 |                     if comment:
162 |                         s += ' [%s]' % comment
163 |                     s += '\n'
164 | 
165 |                 s += '\nLanguage: %s\n\n' % language
166 | 
167 |                 sub (variable, [ nodes.inline ('', nodes.Text (s)) ])
168 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/CSSParser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | CSSParser.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | Open an url and return raw data.
 13 | 
 14 | """
 15 | 
 16 | import logging
 17 | import re
 18 | from six.moves import urllib
 19 | 
 20 | import cssutils
 21 | 
 22 | from libgutenberg.Logger import debug
 23 | from libgutenberg.MediaTypes import mediatypes as mt
 24 | 
 25 | from ebookmaker import parsers
 26 | from ebookmaker.parsers import ParserBase
 27 | 
 28 | RE_ELEMENT = re.compile(r'\[[^\]]*\]|((?:^|\s|\+|>|~|,)[a-z0-9]+)', re.I)
 29 | 
 30 | mediatypes = (mt.css, )
 31 | PG_CSS_PROFILE = (
 32 |     'Added Properties for Project Gutenberg',
 33 |     {
 34 |         'display': 'flex|initial',
 35 |         'justify-content': 'center',
 36 |         'speak': r'auto|never|always',
 37 |         'speak-as': 'normal|spell-out|digits|literal-punctuation|no-punctuation',
 38 |         'all': 'initial|inherit|unset',
 39 | 
 40 |         # added to update css fonts level 3
 41 |         'font-variant-numeric': r'normal|{font-variant-attrs}(\s+{font-variant-attrs})*',
 42 | 
 43 |         # (partial) update to CSS Cascading and Inheritance Level 3
 44 |         'font-family': 'initial',
 45 |         'font-size': 'initial',
 46 |         'font-style': 'initial',
 47 |         'font-variant': 'initial',
 48 |         'font-weight': 'initial',
 49 |         'font': 'initial',
 50 |         'margin-right': 'initial',
 51 |         'margin-left': 'initial',
 52 |         'margin-top': 'initial',
 53 |         'margin-bottom': 'initial',
 54 |         'margin': 'initial',
 55 |         'padding-top': 'initial',
 56 |         'padding-right': 'initial',
 57 |         'padding-bottom': 'initial',
 58 |         'padding-left': 'initial',
 59 |         'padding': 'initial',
 60 |         'text-align': 'initial',
 61 |         'text-decoration': 'initial',
 62 |         'text-indent': 'initial',
 63 |         'text-transform': 'initial',
 64 |         
 65 |         # updated for  https://www.w3.org/TR/css-writing-modes-3/
 66 |         # direction and unicode-bidi  are not supported based on the standard's recommendation
 67 |         'writing-mode': 'vertical-lr|vertical-rl|horizontal-tb',
 68 |         'text-orientation': 'mixed|upright|sideways',
 69 |         'text-combine-upright': 'none | all',      
 70 |     },
 71 |     {
 72 |         'numeric-figure-values': 'lining-nums|oldstyle-nums',
 73 |         'numeric-spacing-values': 'proportional-nums|tabular-nums',
 74 |         'numeric-fraction-values': 'diagonal-fractions|stacked-fractions',
 75 |         'font-variant-attrs': '{numeric-figure-values}|{numeric-spacing-values}|{numeric-fraction-values}|ordinal|slashed-zero',
 76 |     }
 77 | )
 78 | 
 79 | cssutils.profile.addProfiles([PG_CSS_PROFILE])
 80 | 
 81 | class Parser(ParserBase):
 82 |     """ Parse an external CSS file. """
 83 | 
 84 |     def __init__(self, attribs=None):
 85 |         cssutils.log.setLog(logging.getLogger('cssutils'))
 86 |         # logging.DEBUG is way too verbose
 87 |         cssutils.log.setLevel(max(cssutils.log.getEffectiveLevel(), logging.INFO))
 88 |         ParserBase.__init__(self, attribs)
 89 |         self.sheet = None
 90 | 
 91 |     def pre_parse(self):
 92 |         """ Parse the CSS file. """
 93 | 
 94 |         if self.sheet is not None:
 95 |             return
 96 | 
 97 |         parser = cssutils.CSSParser()
 98 |         if self.fp:
 99 |             self.sheet = parser.parseString(self.unicode_content())
100 |         else:
101 |             try:
102 |                 self.sheet = parser.parseUrl(self.attribs.url)
103 |             except ValueError:
104 |                 logging.error('Missing file: %s', self.attribs.url)
105 |                 return
106 | 
107 |         self.attribs.mediatype = 'text/css'
108 |         self.lowercase_selectors(self.sheet)
109 |         self.make_links_absolute()
110 | 
111 | 
112 |     def parse_string(self, s):
113 |         """ Parse the CSS in string. """
114 | 
115 |         if self.sheet is not None:
116 |             return
117 | 
118 |         parser = cssutils.CSSParser()
119 |         self.sheet = parser.parseString(s)
120 | 
121 |         self.attribs.mediatype = 'text/css'
122 |         self.lowercase_selectors(self.sheet)
123 | 
124 | 
125 |     @staticmethod
126 |     def iter_properties(sheet):
127 |         """ Iterate on properties in css. """
128 |         for rule in sheet:
129 |             if rule.type == rule.STYLE_RULE:
130 |                 for prop in rule.style:
131 |                     yield prop
132 | 
133 | 
134 |     @staticmethod
135 |     def lowercase_selectors(sheet):
136 |         """ make element names in selectors lowercase to match xhtml tags """
137 |         for rule in sheet:
138 |             if rule.type == rule.STYLE_RULE:
139 |                 for sel in rule.selectorList:
140 |                     sel.selectorText = RE_ELEMENT.sub(
141 |                         lambda m: m.group(1).lower() if m.group(1) else m.group(0),
142 |                         sel.selectorText)
143 | 
144 |     def make_links_absolute(self):
145 |         """ make links absolute """
146 |         def abs_url(url):
147 |             return urllib.parse.urljoin(self.attribs.url, url)
148 |         cssutils.replaceUrls(self.sheet, abs_url)
149 | 
150 | 
151 |     def rewrite_links(self, f):
152 |         """ Rewrite all links using the function f. """
153 |         cssutils.replaceUrls(self.sheet, f)
154 | 
155 | 
156 |     def iterlinks(self):
157 |         """ Return the urls of all images in document."""
158 | 
159 |         for url in cssutils.getUrls(self.sheet):
160 |             yield urllib.parse.urljoin(self.attribs.url, url), parsers.em.style()
161 | 
162 |     def strip_images(self):
163 |         """ remove all rules with url() in them """
164 |         to_delete = []
165 |         for rule in self.sheet:
166 |             if rule.type == rule.STYLE_RULE and rule.cssText and 'url(' in rule.cssText:
167 |                 to_delete.append(rule)
168 |         for rule in to_delete:
169 |             self.sheet.deleteRule(rule)
170 | 
171 | 
172 |     def get_aux_urls(self):
173 |         """ Return the urls of all auxiliary files in document.
174 | 
175 |         Auxiliary files are non-document files you need to correctly
176 |         display the document file, eg. CSS files.
177 | 
178 |         """
179 | 
180 |         aux = []
181 | 
182 |         for rule in self.sheet:
183 |             if rule.type == rule.IMPORT_RULE:
184 |                 aux.append(urllib.parse.urljoin(self.attribs.url, rule.href))
185 | 
186 |         return  aux
187 | 
188 | 
189 |     def serialize(self):
190 |         """ Serialize CSS. """
191 | 
192 |         return self.sheet.cssText
193 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/gutenberg/writers/nroff.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # $Id: manpage.py 6270 2010-03-18 22:32:09Z milde $
  4 | # Author: Engelbert Gruber <grubert@users.sourceforge.net>
  5 | # Copyright: This module is put into the public domain.
  6 | #
  7 | # Rewritten almost completely
  8 | # by Marcello Perathoner <marcello@perathoner.de>
  9 | 
 10 | """
 11 | 
 12 | Nroff writer for reStructuredText. Tweaked for Project Gutenberg usage.
 13 | 
 14 | """
 15 | 
 16 | from __future__ import unicode_literals
 17 | 
 18 | __docformat__ = 'reStructuredText'
 19 | 
 20 | from ebookmaker.mydocutils.writers import nroff
 21 | from ebookmaker import Unitame
 22 | 
 23 | from libgutenberg.Logger import info, debug, warning, error
 24 | 
# Template put in front of the nroff output.  It is filled in with
# str.format (), so the only braces it may contain are the {encoding}
# and {device} placeholders.
# NOTE(review): the register is initialised as `[env_cnt]` but the
# push_env / pop_env macros use `env_cnt` -- confirm this is intended.
GUTENBERG_NROFF_PREAMBLE = r""".\" -*- mode: nroff -*- coding: {encoding} -*-
.\" This file produces Project Gutenberg plain text. Usage:
.\"   $ groff -t -K {device} -T {device} this_file > output.txt
.
.pl 100000       \" very tall page: disable pagebreaks
.ll 72m
.po 0
.ad l           \" text-align: left
.nh             \" hyphenation: off
.cflags 0 .?!   \" single sentence space
.cflags 0 -\[hy]\[em]   \" don't break on -
.
.de nop
..
.blm nop        \" do nothing on empty line
.
.nr [env_cnt] 0
.ev 0           \" start in a defined environment
.
.de push_env
.br
.nr last_env \\n[.ev]            \" save current environment name
.nr env_cnt +1   \" generate new environment name
.ev \\n[env_cnt]
.evc \\n[last_env]
..
.de pop_env
.br
.ev
.nr env_cnt -1
..
.
"""

# Template put after the nroff output.  It is passed through
# str.format () too, although it contains no placeholders.
GUTENBERG_NROFF_POSTAMBLE = r""".
.pl 0    \" ends very long page here
.\" End of File
"""
 63 | 
 64 | class Writer (nroff.Writer):
 65 |     """ A plaintext writer thru nroff. """
 66 | 
 67 |     supported = ('pg-nroff',)
 68 |     """Formats this writer supports."""
 69 | 
 70 |     def __init__ (self):
 71 |         nroff.Writer.__init__ (self)
 72 |         self.translator_class = Translator
 73 | 
 74 |     def translate (self):
 75 |         visitor = self.translator_class (self.document)
 76 |         del Unitame.unhandled_chars[:]
 77 |         self.document.walkabout (visitor)
 78 |         self.output = visitor.astext ()
 79 |         if Unitame.unhandled_chars:
 80 |             error ("unitame: unhandled chars: %s" % ", ".join (set (Unitame.unhandled_chars)))
 81 | 
 82 |     #def get_transforms (self):
 83 |     #    tfs = writers.Writer.get_transforms (self)
 84 |     #    return tfs + [parts.CharsetTransform]
 85 | 
 86 | 
 87 | 
 88 | class Translator (nroff.Translator):
 89 |     """ nroff translator """
 90 | 
 91 |     def preamble (self):
 92 |         """ Inserts nroff preamble. """
 93 |         return GUTENBERG_NROFF_PREAMBLE.format (
 94 |             encoding = self.encoding, device = self.device)
 95 | 
 96 | 
 97 |     def postamble (self):
 98 |         """ Inserts nroff postamble. """
 99 |         return GUTENBERG_NROFF_POSTAMBLE.format (
100 |             encoding = self.encoding, device = self.device)
101 | 
102 | 
103 |     def init_translate_maps (self):
104 |         nroff.Translator.init_translate_maps (self)
105 | 
106 |         update = {
107 |             0x0011: r"\~",       # nbsp, see: Unitame.py
108 |             0x0012: r"\%",       # shy,  see: Unitame.py
109 |             }
110 | 
111 |         self.translate_map.update (update)
112 |         self.translate_map_literal.update (update)
113 | 
114 | 
115 |     def register_classes (self):
116 |         """ Register classes.
117 | 
118 |         Use the idiosyncratic PG convention of marking up italics etc.
119 | 
120 |         """
121 | 
122 |         #
123 |         # This does not call the base class !!!
124 |         #
125 | 
126 |         self.register_class ('simple', 'left',         '.ad l', '')
127 |         self.register_class ('simple', 'right',        '.ad r', '')
128 |         self.register_class ('simple', 'center',       '.ad c', '')
129 | 
130 |         self.register_class ('inline', 'italics',      '_',    '_')
131 |         self.register_class ('inline', 'bold',         '*',    '*')
132 | 
133 |         self.register_class ('inline', 'monospaced',   '',     '')
134 |         self.register_class ('inline', 'superscript',  '',     '')
135 |         self.register_class ('inline', 'subscript',    '',     '')
136 | 
137 |         self.register_class ('inline', 'small-caps',   '_',    '_')
138 |         self.register_class ('inline', 'gesperrt',     '_',    '_')
139 |         self.register_class ('inline', 'antiqua',      '_',    '_')
140 |         self.register_class ('inline', 'larger',       '',     '')
141 |         self.register_class ('inline', 'smaller',      '',     '')
142 | 
143 | 
144 |     def translate (self, text):
145 |         """ Reduce the charset while keeping text a unicode string. """
146 | 
147 |         # NOTE: there's an alternate approach in
148 |         # transforms.parts.CharsetTransform
149 | 
150 |         if self.encoding != 'utf-8':
151 |             text = text.encode (self.encoding, 'unitame')
152 |             text = text.decode (self.encoding)
153 | 
154 |         if self.in_literal:
155 |             text = text.translate (self.translate_map_literal)
156 |         else:
157 |             text = text.translate (self.translate_map)
158 | 
159 |         return text
160 | 
161 | 
162 |     def visit_inner (self, node):
163 |         """ Try to remove duplicated PG highlight markers. """
164 |         if node.type == 'inline':
165 |             prefixes = self.get_prefix (node.type, node['classes'])
166 |             for prefix in prefixes:
167 |                 if prefix == self.last_output_char:
168 |                     self.backspace ()
169 |                 else:
170 |                     self.text (prefix)
171 |         else:
172 |             nroff.Translator.visit_inner (self, node)
173 | 
174 | 
175 |     def visit_inline (self, node):
176 |         if 'toc-pageref' in node['classes']:
177 |             maxlen = 3 # sensible default
178 |             while node.parent:
179 |                 node = node.parent
180 |                 if 'pageno_maxlen' in node:
181 |                     maxlen = node['pageno_maxlen']
182 |                     break
183 |             self.cmd (('linetabs 1',
184 |                        r'ta (\n[.l]u - \n[.i]u - %dm) +%dmR' % (maxlen + 1, maxlen + 1),
185 |                        r'lc .'))
186 |             self.text (chr (1) + '\t')
187 |         nroff.Translator.visit_inline (self, node)
188 | 
189 |     def visit_section_title (self, node):
190 |         """ Implements PG-standard spacing before headers. """
191 |         self.sp (max (2, 5 - self.section_level))
192 | 
193 |     def visit_figure (self, node):
194 |         self.sp (1)
195 |         self.push ()
196 | 
197 |     def depart_figure (self, node):
198 |         self.pop ()
199 |         self.sp (1)
200 | 
201 |     def visit_image (self, node):
202 |         # ignore alt attribute except for dropcaps
203 |         if 'dropcap' in node['classes']:
204 |             self.text (node.attributes.get ('alt', ''))
205 | 
206 |     def visit_page (self, node):
207 |         if 'clearpage' in node['classes']:
208 |             self.sp (4)
209 |         elif 'cleardoublepage' in node['classes']:
210 |             self.sp (4)
211 |         else:
212 |             nroff.Translator.visit_page (self, node)
213 | 


--------------------------------------------------------------------------------
/src/ebookmaker/parsers/boilerplate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | boilerplate.py
  7 | 
  8 | Copyright 2022 by Project Gutenberg
  9 | 
 10 | Distributable under the GNU General Public License Version 3 or newer.
 11 | 
 12 | This module finds Project Gutenberg boilerplate and, if found puts it into 3 section elements:
 13 | 
 14 | pg_header
 15 |     usually a title and license declaration
 16 |     sometimes, title, book number, release date, authors, language, encoding, credits
 17 |     when detected, metadata will be parsed and enclosed in a pg_metadata_raw sub-section
 18 | 
 19 | pg_footer
 20 |     usually the license
 21 | 
 22 | pg_smallprint
 23 |     on older books, this will contain license-ish language and other material. it's usually found
 24 |     at the top of the text, and is often comically dated.
 25 | 
 26 | 
 27 | BeautifulSoup is used for its superior "scraping" tools.
 28 | 
 29 | """
 30 | 
 31 | import copy
 32 | import re
 33 | 
 34 | import soupsieve as sv
 35 | 
 36 | from libgutenberg.GutenbergGlobals import xmlspecialchars
 37 | from libgutenberg.Logger import critical, info, debug, warning, error
 38 | 
 39 | TOP_MARKERS = [
 40 |     re.compile(r"\*+ ?START\s+OF\s+TH(E|IS)\s+PROJECT\s+GUTENBERG", re.I),
 41 | ]
 42 | BOTTOM_MARKERS = [
 43 |     re.compile(r"\** ?END\s+OF\s+TH(E|IS)\s+PROJECT\s+GUTENBERG", re.I),
 44 |     re.compile(r"\** ?Ende\w*dieses\w*Projekt\w*Gutenberg", re.I),
 45 |     re.compile(r"\** ?END\s+OF\s+PROJECT\s+GUTENBERG", re.I),
 46 |     re.compile(r"\** ?End\s+of\s+the\s+Project\s+Gutenberg", re.I),
 47 | ]
 48 | SMALLPRINT_MARKERS = [
 49 |     re.compile(r"\** ?END\*? ?THE\s+SMALL\s+PRINT", re.I),
 50 |     re.compile(r"\**END\s+THE\s+SMALL\s+PRINT", re.I),
 51 |     re.compile(r"\** ?These\s+\w+\s+Were\s+Prepared\s+By\s+Thousands", re.I),
 52 | ]
 53 | MARKER_END = re.compile(r"\*+")
 54 | 
 55 | def prune(root, divider, after=True):
 56 |     ''' prune parts of the root element before or after a divider  '''
 57 |     def next_or_prev(el, after=True):
 58 |         return el.next_sibling if after else el.previous_sibling
 59 | 
 60 |     def after_or_before(el, after=True):
 61 |         return list(el.next_siblings) if after else list(el.previous_siblings)
 62 | 
 63 |     dividers = [divider] + list(divider.parents)
 64 |     keep = False
 65 |     for elem in dividers:
 66 |         if elem is root:
 67 |             break
 68 |         has_sibling = bool(next_or_prev(elem, after=not after))
 69 |         for sibling in after_or_before(elem, after=after):
 70 |             sibling.extract()
 71 |         keep = has_sibling or keep
 72 | 
 73 | def check_patterns(node, patterns):
 74 |     ''' finds the element containing the marker pattern '''
 75 |     for pattern in patterns:
 76 |         found = node.find(string=pattern)
 77 |         if found:
 78 |             in_bp = sv.filter('.pg_boilerplate', found.parents)
 79 |             if not in_bp:
 80 |                 return found
 81 | 
 82 | def mark_soup(soup):
 83 |     def mark_bp(node, mark, markers, top=True):
 84 |         marked = node.find(id=mark)
 85 |         if marked:
 86 |             marked.name = 'section'
 87 |             return True
 88 |         divider = check_patterns(node, markers)
 89 |         if divider:
 90 | 
 91 |             # the following mess deals with the case where the marker includes 
 92 |             # '<span>something</span) end of marker **' (as in titles with language tags)
 93 |             if divider.next_sibling and divider.next_sibling.name == 'span':
 94 |                 if divider.next_sibling.next_sibling and not divider.next_sibling.next_sibling.name:
 95 |                     new_divider_string = str(divider.string + divider.next_sibling.string + 
 96 |                                           divider.next_sibling.next_sibling.string)
 97 |                     divider.insert_before(new_divider_string)
 98 |                     divider = divider.previous_sibling
 99 |                     divider.next_sibling.extract()
100 |                     divider.next_sibling.extract()
101 |                     divider.next_sibling.extract()
102 | 
103 |             # first, copy the Node - it contains the divider
104 |             node_for_divider = copy.copy(node)
105 |             divider_copy = check_patterns(node_for_divider, markers)
106 | 
107 |             # prune all content after (before) the divider
108 |             prune(node_for_divider, divider_copy, after=top)
109 | 
110 |             #put that into a new section tag
111 |             bp_section = soup.new_tag('section', id=mark)
112 |             bp_section['class'] = 'pg_boilerplate'
113 |             for child in node_for_divider.contents:
114 |                 bp_section.append(copy.copy(child))
115 |             
116 |             # now prune all content before (after) the divider 
117 |             # this should be mostly the divider and old boilerplate
118 |             prune(node, divider, after=not top)
119 | 
120 |             # remove the divider
121 |             divider.extract()
122 | 
123 |             # re-insert the boilerplate
124 |             if top:
125 |                 node.insert(0, bp_section)
126 |             else:
127 |                 node.append(bp_section)
128 |             return True
129 |         return False
130 | 
131 |     try:
132 |         body = soup.html.body
133 |     except:
134 |         return
135 | 
136 |     found_top = mark_bp(body, 'pg-header', TOP_MARKERS, top=True)
137 |     if not found_top:
138 |         info('No PG header marker found.')
139 | 
140 |     found_bottom = mark_bp(body, 'pg-footer', BOTTOM_MARKERS, top=False)
141 |     if not found_bottom:
142 |         info('No PG footer marker found.')
143 | 
144 |     return found_top or found_bottom
145 | 
146 | 
def strip_headers_from_txt(text):
    '''
    When input is plain text, strip the headers and return
    (stripped_text, pg_header, pg_footer).

    pg_header and pg_footer are <pre> elements (as strings) holding the
    XML-escaped boilerplate. When a part is not found, an empty <pre>
    flagged with x-header="0" / x-footer="0" is returned for it and the
    text is left untouched on that side.
    '''
    def markers_split(text, markers):
        ''' Split text at the first marker that matches.

        Returns (before, divider, after). When no marker matches, divider is
        None and both other items are the whole text.
        '''
        for marker in markers:
            divider = marker.search(text)
            if divider:
                before, after = text.split(divider.group(0), maxsplit=1)
                # presumably the divider line goes on ("... EBOOK TITLE ***"):
                # skip up to and including the next run of asterisks, unless
                # that run is 500 or more characters away
                after_sections = MARKER_END.split(after, maxsplit=1)
                if len(after_sections) == 2 and len(after_sections[0]) < 500:
                    after = after_sections[1]

                return before, divider.group(0), after
        return text, None, text

    header_text, divider, text = markers_split(text, TOP_MARKERS + SMALLPRINT_MARKERS)
    if divider is None:
        pg_header = '<pre id="pg-header" x-header="0"></pre>'
        info('No PG header found in txt file.')

    else:
        # whatever is left on the divider line stays with the header
        divider_tail = ''
        if '\n' in text:
            divider_tail, text = text.split('\n', maxsplit=1)
        pg_header = '\n'.join([
            '<pre id="pg-header">',
            xmlspecialchars(header_text),
            xmlspecialchars(divider),
            xmlspecialchars(divider_tail),
            '</pre>'])

    text, divider, footer_text = markers_split(text, BOTTOM_MARKERS)
    if divider is None:
        pg_footer = '<pre id="pg-footer" x-footer="0"></pre>'
        info('No PG footer found in txt file.')
    else:
        # escape the divider too, exactly as is done for the header
        pg_footer = '\n'.join(['<pre id="pg-footer">',
                               xmlspecialchars(divider),
                               xmlspecialchars(footer_text),
                               '</pre>'])
    return text, pg_header, pg_footer
188 | 


--------------------------------------------------------------------------------
/src/ebookmaker/ParserFactory.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | ParserFactory.py
  7 | 
  8 | Copyright 2009-14 by Marcello Perathoner
  9 | Copyright 2025 by Project Gutenberg
 10 | 
 11 | Distributable under the GNU General Public License Version 3 or newer.
 12 | 
 13 | """
 14 | 
 15 | 
 16 | import os.path
 17 | import re
 18 | 
 19 | from six.moves import urllib
 20 | import six
 21 | 
 22 | from pkg_resources import resource_listdir, resource_stream # pylint: disable=E0611
 23 | import requests
 24 | 
 25 | from libgutenberg.Logger import critical, debug, error, info
 26 | from libgutenberg import MediaTypes
 27 | import libgutenberg.GutenbergGlobals as gg
 28 | from ebookmaker.CommonCode import Options
 29 | from ebookmaker.Version import VERSION
 30 | from ebookmaker import parsers
 31 | 
# Options object; this module reads outputdir, mediatype_from_extension
# and config.PROXIES from it.
options = Options()
# Maps a mediatype string to the parser module that handles it;
# filled by load_parsers() and emptied by unload_parsers().
parser_modules = {}
 34 | 
 35 | def load_parsers():
 36 |     """ See what types we can parse. """
 37 | 
 38 |     for fn in resource_listdir('ebookmaker.parsers', ''):
 39 |         modulename, ext = os.path.splitext(fn)
 40 |         if ext == '.py':
 41 |             if modulename.endswith('Parser'):
 42 |                 module = __import__('ebookmaker.parsers.' + modulename, fromlist=[modulename])
 43 |                 debug("Loading parser from module: %s for mediatypes: %s" % (
 44 |                     modulename, ', '.join(module.mediatypes)))
 45 |                 for mediatype in module.mediatypes:
 46 |                     parser_modules[mediatype] = module
 47 | 
 48 |     return parser_modules.keys()
 49 | 
 50 | 
 51 | def unload_parsers():
 52 |     """ Unload parser modules. """
 53 |     for k in parser_modules.keys():
 54 |         del parser_modules[k]
 55 | 
 56 | 
 57 | class ParserFactory(object):
 58 |     """ A factory and a cache for parsers.
 59 | 
 60 |     So we don't reparse the same file twice.
 61 | 
 62 |     """
 63 | 
 64 |     parsers = {} # cache: parsers[url] = parser
 65 |     sources = {} # sources[outfile] = source
 66 | 
 67 |     @staticmethod
 68 |     def get(attribs):
 69 |         """ Get the right kind of parser. """
 70 | 
 71 |         try:
 72 |             mediatype = attribs.orig_mediatype
 73 |             if mediatype == 'text/plain' and attribs.referrer:
 74 |                 # don't use GutenbergTextParser, it's a linked text file
 75 |                 return parsers.TxtParser(attribs)
 76 |             return parser_modules[mediatype].Parser(attribs)
 77 |         except (AttributeError, KeyError):
 78 |             return parser_modules['*/*'].Parser(attribs)
 79 | 
 80 | 
 81 |     @classmethod
 82 |     def create(cls, url, attribs=None):
 83 |         """ Create an appropriate parser. """
 84 |         url = parsers.webify_url(url)
 85 |         if attribs is None:
 86 |             attribs = parsers.ParserAttributes()
 87 | 
 88 |         # debug("Need parser for %s" % url)
 89 |         
 90 |         # first check if input url is in output directory (we've already made it!)
 91 |         if gg.is_same_path(os.path.abspath(options.outputdir), os.path.dirname(url)):
 92 |             # find the file (and the parser) used to make the file
 93 |             if url in cls.sources: 
 94 |                 if cls.sources[url] in cls.parsers:
 95 |                     parser = cls.parsers[cls.sources[url]]
 96 |                     parser.reset()
 97 |                     parser.attribs.update(attribs)
 98 |                     return parser
 99 |                 
100 |         
101 | 
102 |         if url in cls.parsers:
103 |             # debug("... reusing parser for %s" % url)
104 |             # reuse same parser, maybe already filled with data
105 |             parser = cls.parsers[url]
106 |             parser.reset()
107 |             parser.attribs.update(attribs)
108 |             # debug(str(parser.attribs))
109 |             return parser
110 | 
111 |         scheme = urllib.parse.urlsplit(url).scheme
112 |         if scheme == 'resource':
113 |             fp = cls.open_resource(url, attribs)
114 |         elif scheme in ('http', 'https'):
115 |             fp = cls.open_url(url, attribs)
116 |         else:
117 |             fp = cls.open_file(url, attribs)
118 |         if fp is None:
119 |             return
120 |         if attribs.url in cls.parsers:
121 |             # reuse parser because parsing may be expensive, eg. reST docs
122 |             # debug("... reusing parser for %s" % attribs.url)
123 |             parser = cls.parsers[attribs.url]
124 |             parser.attribs.update(attribs)
125 |             return parser
126 | 
127 |         # ok. so we have to create a new parser
128 |         debug("... creating new parser for %s" % url)
129 | 
130 |         if hasattr(options, 'mediatype_from_extension') and options.mediatype_from_extension:
131 |             attribs.orig_mediatype = MediaTypes.guess_type(url)
132 | 
133 |         attribs.orig_url = url
134 |         parser = cls.get(attribs)
135 |         parser.fp = fp
136 | 
137 |         cls.parsers[url] = parser
138 | 
139 |         return parser
140 | 
141 | 
142 |     @classmethod
143 |     def open_url(cls, url, attribs):
144 |         """ Open url for parsing. """
145 | 
146 |         fp = requests.get(
147 |             url,
148 |             stream=True,
149 |             headers={
150 |                 'User-Agent': "EbookMaker/%s (+http://pypi.python.org/ebookmaker)" % VERSION
151 |             },
152 |             proxies=options.config.PROXIES
153 |         )
154 |         attribs.orig_mediatype = fp.headers.get('Content-Type', 'application/octet-stream')
155 |         debug("... got mediatype %s from server" % str(attribs.orig_mediatype))
156 |         attribs.orig_url = url
157 |         attribs.url = fp.url
158 |         return six.BytesIO(fp.content)
159 | 
160 | 
161 |     @classmethod
162 |     def open_file(cls, url, attribs):
163 |         """ Open a local file for parsing. """
164 |         def open_file_from_path(path):
165 |             try:
166 |                 return open(url, 'rb')
167 |             except FileNotFoundError:
168 |                 critical('Missing file: %s' % url)
169 |             except IsADirectoryError:
170 |                 critical('Missing file is a directory: %s' % url)
171 |             return None
172 |             
173 |         if re.search(r'^([a-zA-z]:|/)', url):
174 |             fp = open_file_from_path(url)
175 |         else:
176 |             try:
177 |                 # handles all the flavors of file: urls, including on windows
178 |                 fp = urllib.request.urlopen(url)
179 |             except urllib.error.URLError as what:
180 |                 fp = None
181 |                 critical('Missing file: %s' % what.reason)
182 |                 return None
183 |             except ValueError:  # just a relative path?
184 |                 fp = open_file_from_path(url)
185 |             
186 |         attribs.orig_mediatype = MediaTypes.guess_type(url)
187 | 
188 |         debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
189 |         attribs.orig_url = attribs.url = url
190 |         return fp
191 | 
192 | 
193 |     @classmethod
194 |     def open_resource(cls, orig_url, attribs):
195 |         """ Open a python package resource file for parsing. """
196 | 
197 |         # resource://python.package/filename.ext
198 | 
199 |         o = urllib.parse.urlsplit(orig_url)
200 |         package = o.hostname
201 |         filename = o.path[1:]
202 |         fp = resource_stream(package, filename)
203 |         attribs.orig_mediatype = MediaTypes.guess_type(filename)
204 | 
205 |         debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))
206 |         attribs.orig_url = orig_url
207 |         attribs.url = orig_url
208 |         return fp
209 | 
210 | 
211 |     @classmethod
212 |     def clear_parser_cache(cls):
213 |         """ Clear parser cache to free memory. """
214 | 
215 |         # debug: kill refs
216 |         for dummy_url, parser in cls.parsers.items():
217 |             del parser
218 | 
219 |         cls.parsers = {}
220 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Ebookmaker
  2 | 
  3 | 
  4 | Ebookmaker is the tool used for format conversion at Project Gutenberg.
  5 | It builds HTML5, EPUB2, EPUB3 and Kindle files (including KF8) from HTML.
  6 | It also builds these formats and PDF files from reST sources.
  7 | 
  8 | If you are preparing HTML for use with Ebookmaker, the [Usage Notes](USAGE.md) may be of interest.
  9 | 
 10 | A web interface for Ebookmaker, used for [Online Ebookmaker](https://ebookmaker.pglaf.org), is maintained at https://github.com/gutenbergtools/ebookmaker-web/
 11 | 
 12 | 
 13 | ## Prerequisites
 14 | 
 15 | * Python3 >= 3.7
 16 | 
 17 | ### Needed only for Kindle generation
 18 | 
 19 | * Calibre (https://calibre-ebook.com/) (needed to make  Kindle files)
 20 |     * may need to install Calibre's ebook-convert command line tool https://manual.calibre-ebook.com/generated/en/cli-index.html
 21 | 
 22 | ### Needed only for validation
 23 | 
 24 | * EpubCheck (for EPUB validation) To use EPUBCheck validation, first download and install EPUBCheck from https://www.w3.org/publishing/epubcheck/. If the command to invoke it is  `java -jar /Applications/epubcheck-4.2.6/epubcheck.jar`, then add this line to ~/.ebookmaker or /etc/ebookmaker.conf: `epub_validator: java -jar /Applications/epubcheck-4.2.6/epubcheck.jar` then turn on validation by adding `--validate` to Ebookmaker's command line invocation or by setting validate to true in ~/.ebookmaker
 25 | * the W3C "Nu" validator (for HTML5 validation) https://validator.github.io/validator/ add this line to ~/.ebookmaker or /etc/ebookmaker.conf: `html_validator: [something for your install]/vnu-runtime-image/bin/vnu` then turn on validation by adding `--validate` to Ebookmaker's command line invocation or by setting validate to true in ~/.ebookmaker
 26 | * on MacOS, you may need to create security exceptions. On my system, I had to do `sudo xattr -r -d com.apple.quarantine /Users/eric/vnu-runtime-image/lib`
 27 | 
 28 | ### Needed only for cover generation
 29 | 
 30 | * Cairo https://www.cairographics.org/download/
 31 | * Noto Sans and Noto Sans CJK:
 32 |     * CentOS or RedHat: `yum install google-noto-sans-cjk-fonts; yum install google-noto-sans-fonts`
 33 |     * Ubuntu: `apt-get install fonts-noto-cjk fonts-noto`
 34 | 
 35 | ### Needed only for conversion from RST
 36 | 
 37 | * Libertinus Serif and Libertinus Sans https://github.com/alerque/libertinus
 38 |     * For Linux, 
 39 |         * Download the latest release https://github.com/alerque/libertinus/releases/latest
 40 |         * unzip, put .otf files into ~/.fonts 
 41 |         * update font catalog `fc-cache -f -v`
 42 | * DejaVu Sans Mono https://dejavu-fonts.github.io/
 43 | * TexLive (to build PDF from TeX and rst)
 44 | 
 45 | Tested with Python 3.8
 46 | 
 47 | ## Install
 48 | 
 49 | (master branch, editable install)
 50 | `pipenv install ebookmaker`
 51 | 
 52 | Use the ebookmaker.conf file to pass a path to your kindlegen, tex, and groff programs 
 53 | if they're not in your PATH. Edit the ebookmaker.conf and copy it to /etc/ebookmaker.conf to 
 54 | reset the paths.
 55 | Copy ebookmaker.conf to ~/.ebookmaker to override settings in /etc/ebookmaker.conf or to set default 
 56 | command line options.
 57 | 
 58 | ## Sample invocation
 59 | 
 60 | (From the directory where you ran `pipenv install`)
 61 | 
 62 | `pipenv shell`
 63 | `ebookmaker -v -v --make=epub.images --output-dir=/Documents/pg /Documents/library/58669/58669-h/58669-h.htm`
 64 | 
 65 | or
 66 | 
 67 | `pipenv run ebookmaker -v -v --make=epub.images --output-dir=/Documents/pg /Documents/library/58669/58669-h/58669-h.htm`
 68 | 
 69 | 
 70 | ## new to pipenv?
 71 | 
 72 | Install pipenv  (might be `pip install --user pipenv`, depending on your default python)
 73 | 
 74 | `$ pip3 install --user pipenv`
 75 | 
 76 | (Debian/Ubuntu Linux users will instead need to use `apt install pipenv`.)
 77 | 
 78 | The default install location is `${HOME}/.local/bin`, so add this to your login shell's ${PATH} if needed.
 79 | 
 80 | Change directories to where you want to have your ebookmaker environment. Then, to initialize a python 3 virtual environment, do
 81 | 
 82 | `$ pipenv --three`
 83 | 
 84 | Whenever you want to enter this environment, move to this directory and do:
 85 | 
 86 | `$ pipenv shell`
 87 |  
 88 | Install the gutenberg modules:
 89 | 
 90 | `$ pipenv install ebookmaker`
 91 | 
 92 | Check your install:
 93 | 
 94 | `$ ebookmaker --version`
 95 | `EbookMaker 0.12.0`
 96 | 
 97 | > **If you get an error similar to this one:**
 98 | >
 99 | > `ModuleNotFoundError: No module named 'pkg_resources'`
100 | >
101 | > You can fix it by installing the `setuptools` module:
102 | >
103 | > `$ pipenv install setuptools`
104 | 
105 | Since you're in the shell, you can navigate to a book's directory and convert it:
106 | 
107 | `$ ebookmaker -v -v --make=epub.images --ebook 10001 --title "The Luck of the Kid" --author "Ridgwell Cullum" luck-kid.html`
108 | 
109 | ## Update
110 | 
111 | `$ cd ebookmaker` to wherever you ran `$ pipenv install ebookmaker`
112 | 
113 | then:
114 | 
115 | `$ pipenv update ebookmaker`
116 | 
117 | ## Test
118 | 
119 | Install, as above.
120 | 
121 | `$ cd ebookmaker` to wherever you ran `$ pip install ebookmaker`
122 | 
123 | then:
124 | 
125 | `$ git checkout master`
126 | 
127 | `$ pipenv install -e .`
128 | 
129 | `$ python -m unittest discover`
130 | 
131 | 
132 | ## Notes running Ebookmaker on Windows Machine (adapted from @windymilla)
133 | 
134 | 1. Install Python 3.7+ from python.org. Install Kindlegen. Add it to the path. 
135 | 2. Add system environment variable: Right-click "My Computer", then Properties, then Advanced, then Environment variables, then New. Call the variable PYTHON_HOME, and set it to the Python folder.
136 | 3. Edit the Path variable and add to the end of it `;%PYTHON_HOME%\;%PYTHON_HOME%\Scripts\`
137 | 4. Check by starting a new command window and typing `python`. It should run your version of Python. Quit python with `^Z` & Enter.
138 | 5. In command window, type `pip3 install --user pipenv`. Script may warn it has put scripts into a folder such as `C:\Users\myname\AppData\Roaming\Python\Python37\Scripts`, and to add this to the Path environment variable. Do this – don't forget the semicolon before the new folder name! (Possibly might work instead to just copy the newly installed files from where they were installed into your main python scripts folder, i.e. `%PYTHON_HOME%\Scripts` ?)
139 | 6. Close old command window and start a new (to get the new path)
140 | 7. Create a folder for ebookmaker, e.g. `C:\DP\ebookmaker`
141 | 8. In command window, go to the new folder
142 | 9. Type `pipenv install ebookmaker` – takes a while to install. It will also create a "virtual environment", with a new folder, something like `C:\Users\myname\.virtualenvs\ebookmaker-cgaQuYhi`
143 | 10. (Optional - only if you need ebookmaker to create a book cover for you because you are not providing one)
144 |     Download GTK+ to get Cairo. Precompiled Win32 binaries are here: http://ftp.gnome.org/pub/gnome/binaries ... _win32.zip
145 |     Unzip this to a folder, e.g. `C:\DP\gtk` and add `C:\DP\gtk\bin` to the Path environment variable.
146 |     Exit command window and start a new one to get new path
147 |     Go to the ebookmaker folder, `C:\DP\ebookmaker`
148 | 11. Type `pipenv run python ebookmaker --version` to check ebookmaker version. If this doesn't work (it should, but didn't work for us) try:
149 |     - Look in `C:\Users\myname\.virtualenvs\` and find the name of your virtualenv - it should be something like `ebookmaker-cgaQuYhi`
150 |     - Type `pipenv run python C:\Users\myname\.virtualenvs\<name of virtualenv>\Scripts\ebookmaker --version` to check ebookmaker version. 
151 | 12. (Should not happen now that Cairo is optional) If there's an error like no "cairo" or "cairo-2" found, check whether your libcairo and libcairo-2 paths exist. If they do, edit dlopen in `__init__.py` in the cairocffi package. Return the path found by ctypes.util.find_library directly instead of calling ffi.dlopen(path).
152 | 13. If a folder/file name contains a space, the pathname must be enclosed in `"`, like `--output-dir="C:\your foldername"`. If a pathname is quoted, it MUST NOT end with a trailing `\` or an error will be raised. If running a bat file from within Guiguts, this means you should use `$d.` rather than `$d` (i.e. a dot after $d so the quoted pathname will end in `\."` rather than `\"`) when passing it as a value for the output-dir argument.
153 | 14. Example run_ebookmaker.bat file for use with Guiguts:
154 |       cd C:\DP\ebookmaker
155 |       pipenv run python C:\Users\myname\.virtualenvs\ebookmaker-cgaQuYhi\Scripts\ebookmaker -v --make=epub.images --make=kindle.images --output-dir=%1 --title=%2 %3
156 | 15. Corresponding "external program" setup within Guiguts:
157 |       `c:\dp\ebookmaker\run_ebookmaker.bat $d. $f $d$f$e`
158 | 


--------------------------------------------------------------------------------
/src/ebookmaker/UnitameData.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-
  3 | 
  4 | from __future__ import unicode_literals
  5 | 
  6 | unicode_to_iso_8859_1 = {
  7 |     'Đ': 'D', # LATIN CAPITAL LETTER D WITH STROKE
  8 |     'đ': 'd', # LATIN SMALL LETTER D WITH STROKE
  9 |     'Ħ': 'H', # LATIN CAPITAL LETTER H WITH STROKE
 10 |     'ħ': 'h', # LATIN SMALL LETTER H WITH STROKE
 11 |     'Ŀ': 'L', # LATIN CAPITAL LETTER L WITH MIDDLE DOT
 12 |     'ŀ': 'l', # LATIN SMALL LETTER L WITH MIDDLE DOT
 13 |     'Ł': 'L', # LATIN CAPITAL LETTER L WITH STROKE
 14 |     'ł': 'l', # LATIN SMALL LETTER L WITH STROKE
 15 |     'ŉ': 'n', # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
 16 |     'Œ': 'OE', # LATIN CAPITAL LIGATURE OE
 17 |     'œ': 'oe', # LATIN SMALL LIGATURE OE
 18 |     'Ŧ': 'T', # LATIN CAPITAL LETTER T WITH STROKE
 19 |     'ŧ': 't', # LATIN SMALL LETTER T WITH STROKE
 20 |     'ƀ': 'b', # LATIN SMALL LETTER B WITH STROKE
 21 |     'Ɓ': 'B', # LATIN CAPITAL LETTER B WITH HOOK
 22 |     'Ƃ': 'B', # LATIN CAPITAL LETTER B WITH TOPBAR
 23 |     'ƃ': 'b', # LATIN SMALL LETTER B WITH TOPBAR
 24 |     'Ɔ': 'O', # LATIN CAPITAL LETTER OPEN O
 25 |     'Ƈ': 'C', # LATIN CAPITAL LETTER C WITH HOOK
 26 |     'ƈ': 'c', # LATIN SMALL LETTER C WITH HOOK
 27 |     'Ɗ': 'D', # LATIN CAPITAL LETTER D WITH HOOK
 28 |     'Ƌ': 'D', # LATIN CAPITAL LETTER D WITH TOPBAR
 29 |     'ƌ': 'd', # LATIN SMALL LETTER D WITH TOPBAR
 30 |     'Ƒ': 'F', # LATIN CAPITAL LETTER F WITH HOOK
 31 |     'ƒ': 'f', # LATIN SMALL LETTER F WITH HOOK
 32 |     'Ɠ': 'G', # LATIN CAPITAL LETTER G WITH HOOK
 33 |     'Ɨ': 'I', # LATIN CAPITAL LETTER I WITH STROKE
 34 |     'Ƙ': 'K', # LATIN CAPITAL LETTER K WITH HOOK
 35 |     'ƙ': 'k', # LATIN SMALL LETTER K WITH HOOK
 36 |     'ƚ': 'l', # LATIN SMALL LETTER L WITH BAR
 37 |     'Ɲ': 'N', # LATIN CAPITAL LETTER N WITH LEFT HOOK
 38 |     'ƞ': 'n', # LATIN SMALL LETTER N WITH LONG RIGHT LEG
 39 |     'Ɵ': 'O', # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
 40 |     'Ƥ': 'P', # LATIN CAPITAL LETTER P WITH HOOK
 41 |     'ƥ': 'p', # LATIN SMALL LETTER P WITH HOOK
 42 |     'ƫ': 't', # LATIN SMALL LETTER T WITH PALATAL HOOK
 43 |     'Ƭ': 'T', # LATIN CAPITAL LETTER T WITH HOOK
 44 |     'ƭ': 't', # LATIN SMALL LETTER T WITH HOOK
 45 |     'Ʈ': 'T', # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
 46 |     'Ʋ': 'V', # LATIN CAPITAL LETTER V WITH HOOK
 47 |     'Ƴ': 'Y', # LATIN CAPITAL LETTER Y WITH HOOK
 48 |     'ƴ': 'y', # LATIN SMALL LETTER Y WITH HOOK
 49 |     'Ƶ': 'Z', # LATIN CAPITAL LETTER Z WITH STROKE
 50 |     'ƶ': 'z', # LATIN SMALL LETTER Z WITH STROKE
 51 |     'ǈ': 'L', # LATIN CAPITAL LETTER L WITH SMALL LETTER J
 52 |     'ǋ': 'N', # LATIN CAPITAL LETTER N WITH SMALL LETTER J
 53 |     'Ǣ': 'AE', # LATIN CAPITAL LETTER AE WITH MACRON
 54 |     'ǣ': 'ae', # LATIN SMALL LETTER AE WITH MACRON
 55 |     'Ǥ': 'G', # LATIN CAPITAL LETTER G WITH STROKE
 56 |     'ǥ': 'g', # LATIN SMALL LETTER G WITH STROKE
 57 |     'ǲ': 'D', # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
 58 |     'Ǽ': 'AE', # LATIN CAPITAL LETTER AE WITH ACUTE
 59 |     'ǽ': 'ae', # LATIN SMALL LETTER AE WITH ACUTE
 60 |     'Ǿ': 'O', # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
 61 |     'ǿ': 'o', # LATIN SMALL LETTER O WITH STROKE AND ACUTE
 62 |     'Ƞ': 'N', # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
 63 |     'ȡ': 'd', # LATIN SMALL LETTER D WITH CURL
 64 |     'Ȥ': 'Z', # LATIN CAPITAL LETTER Z WITH HOOK
 65 |     'ȥ': 'z', # LATIN SMALL LETTER Z WITH HOOK
 66 |     'ȴ': 'l', # LATIN SMALL LETTER L WITH CURL
 67 |     'ȵ': 'n', # LATIN SMALL LETTER N WITH CURL
 68 |     'ȶ': 't', # LATIN SMALL LETTER T WITH CURL
 69 |     'ɓ': 'b', # LATIN SMALL LETTER B WITH HOOK
 70 |     'ɕ': 'c', # LATIN SMALL LETTER C WITH CURL
 71 |     'ɖ': 'd', # LATIN SMALL LETTER D WITH TAIL
 72 |     'ɗ': 'd', # LATIN SMALL LETTER D WITH HOOK
 73 |     'ɠ': 'g', # LATIN SMALL LETTER G WITH HOOK
 74 |     'ɦ': 'h', # LATIN SMALL LETTER H WITH HOOK
 75 |     'ɨ': 'i', # LATIN SMALL LETTER I WITH STROKE
 76 |     'ɫ': 'l', # LATIN SMALL LETTER L WITH MIDDLE TILDE
 77 |     'ɬ': 'l', # LATIN SMALL LETTER L WITH BELT
 78 |     'ɭ': 'l', # LATIN SMALL LETTER L WITH RETROFLEX HOOK
 79 |     'ɱ': 'm', # LATIN SMALL LETTER M WITH HOOK
 80 |     'ɲ': 'n', # LATIN SMALL LETTER N WITH LEFT HOOK
 81 |     'ɳ': 'n', # LATIN SMALL LETTER N WITH RETROFLEX HOOK
 82 |     'ɼ': 'r', # LATIN SMALL LETTER R WITH LONG LEG
 83 |     'ɽ': 'r', # LATIN SMALL LETTER R WITH TAIL
 84 |     'ɾ': 'r', # LATIN SMALL LETTER R WITH FISHHOOK
 85 |     'ʂ': 's', # LATIN SMALL LETTER S WITH HOOK
 86 |     'ʈ': 't', # LATIN SMALL LETTER T WITH RETROFLEX HOOK
 87 |     'ʉ': '', # LATIN SMALL LETTER U BAR
 88 |     'ʋ': 'v', # LATIN SMALL LETTER V WITH HOOK
 89 |     'ʐ': 'z', # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
 90 |     'ʑ': 'z', # LATIN SMALL LETTER Z WITH CURL
 91 |     'ʜ': 'H', # LATIN LETTER SMALL CAPITAL H
 92 |     'ʝ': 'j', # LATIN SMALL LETTER J WITH CROSSED-TAIL
 93 |     'ʠ': 'q', # LATIN SMALL LETTER Q WITH HOOK
 94 |     'ʮ': 'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK
 95 |     'ʯ': 'h', # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
 96 |     'Ѝ': 'I', # CYRILLIC CAPITAL LETTER I WITH GRAVE
 97 |     'ѝ': 'i', # CYRILLIC SMALL LETTER I WITH GRAVE
 98 |     'Ӑ': 'A', # CYRILLIC CAPITAL LETTER A WITH BREVE
 99 |     'ӑ': 'a', # CYRILLIC SMALL LETTER A WITH BREVE
100 |     'Ӓ': 'A', # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
101 |     'ӓ': 'a', # CYRILLIC SMALL LETTER A WITH DIAERESIS
102 |     'Ӣ': 'I', # CYRILLIC CAPITAL LETTER I WITH MACRON
103 |     'ӣ': 'i', # CYRILLIC SMALL LETTER I WITH MACRON
104 |     'Ӥ': 'I', # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
105 |     'ӥ': 'i', # CYRILLIC SMALL LETTER I WITH DIAERESIS
106 |     'Ӧ': 'O', # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
107 |     'ӧ': 'o', # CYRILLIC SMALL LETTER O WITH DIAERESIS
108 |     'Ӭ': 'E', # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
109 |     'ӭ': 'e', # CYRILLIC SMALL LETTER E WITH DIAERESIS
110 |     'Ӯ': '', # CYRILLIC CAPITAL LETTER U WITH MACRON
111 |     'ӯ': '', # CYRILLIC SMALL LETTER U WITH MACRON
112 |     'Ӱ': '', # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
113 |     'ӱ': '', # CYRILLIC SMALL LETTER U WITH DIAERESIS
114 |     'Ӳ': '', # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
115 |     'ӳ': '', # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
116 |     'ẚ': 'a', # LATIN SMALL LETTER A WITH RIGHT HALF RING
117 |     '‐': '-', # HYPHEN
118 |     '–': '-', # EN DASH
119 |     '—': '--', # EM DASH
120 |     '‖': '||', # DOUBLE VERTICAL LINE
121 |     '‗': '_', # DOUBLE LOW LINE
122 |     '‘': '\'', # LEFT SINGLE QUOTATION MARK
123 |     '’': '\'', # RIGHT SINGLE QUOTATION MARK
124 |     '‚': '\'', # SINGLE LOW-9 QUOTATION MARK
125 |     '‛': '\'', # SINGLE HIGH-REVERSED-9 QUOTATION MARK
126 |     '“': '"', # LEFT DOUBLE QUOTATION MARK
127 |     '”': '"', # RIGHT DOUBLE QUOTATION MARK
128 |     '„': '"', # DOUBLE LOW-9 QUOTATION MARK
129 |     '‟': '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
130 |     '⁅': '[', # LEFT SQUARE BRACKET WITH QUILL
131 |     '⁆': ']', # RIGHT SQUARE BRACKET WITH QUILL
132 | }
133 | 
134 | 
# ASCII replacements for ISO 8859-1 characters that should not simply be
# reduced to a base letter (symbols, fractions, ligatures, umlauts).
iso_8859_1_to_ascii = {
    '¡': 'i', # INVERTED EXCLAMATION MARK
    '¢': 'c', # CENT SIGN
    '£': 'L', # POUND SIGN
    '¥': 'Y', # YEN SIGN
    '¦': '|', # BROKEN BAR
    '§': 'Sec.', # SECTION SIGN
    '¨': '"', # DIAERESIS
    '©': '(C)', # COPYRIGHT SIGN
    '«': '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    # written as an escape because the character itself is invisible
    '\xad': '-', # SOFT HYPHEN
    '®': '(R)', # REGISTERED SIGN
    '¯': '-', # MACRON
    '°': ' deg.', # DEGREE SIGN
    '±': '+-', # PLUS-MINUS SIGN
    '²': '^2', # SUPERSCRIPT TWO
    '³': '^3', # SUPERSCRIPT THREE
    '´': '\'', # ACUTE ACCENT
    'µ': ' m', # MICRO SIGN
    '·': '.', # MIDDLE DOT
    '»': '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '¼': '1/4', # VULGAR FRACTION ONE QUARTER
    '½': '1/2', # VULGAR FRACTION ONE HALF
    '¾': '3/4', # VULGAR FRACTION THREE QUARTERS
    '¿': '?', # INVERTED QUESTION MARK
    'Ä': 'Ae', # LATIN CAPITAL LETTER A WITH DIAERESIS
    'Æ': 'AE', # LATIN CAPITAL LETTER AE
    'Ð': 'Eth', # LATIN CAPITAL LETTER ETH
    'Ö': 'Oe', # LATIN CAPITAL LETTER O WITH DIAERESIS
    '×': 'x', # MULTIPLICATION SIGN
    'Ø': 'O', # LATIN CAPITAL LETTER O WITH STROKE
    'Ü': 'Ue', # LATIN CAPITAL LETTER U WITH DIAERESIS
    'ß': 'ss', # LATIN SMALL LETTER SHARP S
    'ä': 'ae', # LATIN SMALL LETTER A WITH DIAERESIS
    'æ': 'ae', # LATIN SMALL LETTER AE
    'ð': 'eth', # LATIN SMALL LETTER ETH
    # 'ñ': 'ny', # LATIN SMALL LETTER N WITH TILDE
    'ö': 'oe', # LATIN SMALL LETTER O WITH DIAERESIS
    '÷': '/', # DIVISION SIGN
    'ø': 'o', # LATIN SMALL LETTER O WITH STROKE
    'ü': 'ue', # LATIN SMALL LETTER U WITH DIAERESIS
}
177 | 


--------------------------------------------------------------------------------
/src/ebookmaker/mydocutils/writers/rst2all.css:
--------------------------------------------------------------------------------
  1 | /*
  2 | Project Gutenberg common docutils stylesheet.
  3 | 
  4 | This stylesheet contains styles common to HTML and EPUB.  Put styles
  5 | that are specific to HTML or EPUB into their respective stylesheets.
  6 | 
  7 | :Author: Marcello Perathoner (webmaster@gutenberg.org)
  8 | :Copyright: This stylesheet has been placed in the public domain.
  9 | 
 10 | This stylesheet is based on:
 11 | 
 12 |   :Author: David Goodger (goodger@python.org)
 13 |   :Copyright: This stylesheet has been placed in the public domain.
 14 | 
 15 |   Default cascading style sheet for the HTML output of Docutils.
 16 | 
 17 | */
 18 | 
 19 | /* ADE 1.7.2 chokes on !important and throws all css out. */
 20 | 
 21 | /* FONTS */
 22 | 
 23 | .italics    { font-style: italic }
 24 | .no-italics { font-style: normal }
 25 | 
 26 | .bold       { font-weight: bold }
 27 | .no-bold    { font-weight: normal }
 28 | 
 29 | .small-caps { } /* Epub needs italics */
 30 | .gesperrt   { } /* Epub needs italics */
 31 | .antiqua    { font-style: italic } /* what else can we do ? */
 32 | .monospaced { font-family: monospace }
 33 | 
 34 | .smaller    { font-size: smaller }
 35 | .larger     { font-size: larger }
 36 | 
 37 | .xx-small   { font-size: xx-small }
 38 | .x-small    { font-size: x-small }
 39 | .small      { font-size: small }
 40 | .medium     { font-size: medium }
 41 | .large      { font-size: large }
 42 | .x-large    { font-size: x-large }
 43 | .xx-large   { font-size: xx-large }
 44 | 
 45 | .text-transform-uppercase { text-transform: uppercase }
 46 | .text-transform-lowercase { text-transform: lowercase }
 47 | .text-transform-none      { text-transform: none }
 48 | 
 49 | .red        { color: red }
 50 | .green      { color: green }
 51 | .blue       { color: blue }
 52 | .yellow     { color: yellow }
 53 | .white      { color: white }
 54 | .gray       { color: gray }
 55 | .black      { color: black }
 56 | 
 57 | /* ALIGN */
 58 | 
 59 | .left       { text-align: left }
 60 | .justify    { text-align: justify }
 61 | .center     { text-align: center; text-indent: 0 }
 62 | .centerleft { text-align: center; text-indent: 0 }
 63 | .right      { text-align: right;  text-indent: 0 }
 64 | 
 65 | /* LINE HEIGHT */
 66 | 
 67 | body        { line-height: 1.5 }
 68 | p           { margin: 0;
 69 | 	      text-indent: 2em }
 70 | 
 71 | /* PAGINATION */
 72 | 
 73 | .title, .subtitle     { page-break-after:  avoid }
 74 | 
 75 | .container, .title, .subtitle, #pg-header
 76 |                       { page-break-inside: avoid }
 77 | 
 78 | /* SECTIONS */
 79 | 
 80 | body         { text-align: justify }
 81 | 
 82 | p.pfirst, p.noindent {
 83 |     text-indent: 0
 84 | }
 85 | 
 86 | .boxed         { border: 1px solid black; padding: 1em }
 87 | .topic, .note  { margin: 5% 0; border: 1px solid black; padding: 1em }
 88 | div.section    { clear: both }
 89 | 
 90 | div.line-block       { margin: 1.5em 0 }  /* same leading as p */
 91 | div.line-block.inner { margin: 0 0 0 10% }
 92 | div.line             { margin-left: 20%; text-indent: -20%; }
 93 | .line-block.noindent div.line { margin-left: 0; text-indent: 0; }
 94 | 
 95 | hr.docutils          { margin: 1.5em 40%; border: none; border-bottom: 1px solid black; }
 96 | div.transition       { margin: 1.5em 0 }
 97 | 
 98 | .vfill, .vspace      { border: 0px solid white }
 99 | 
100 | .title               { margin: 1.5em 0 }
101 | .title.with-subtitle { margin-bottom: 0 }
102 | .subtitle            { margin: 1.5em 0 }
103 | 
104 | /* header font style */
105 | /* http://dev.w3.org/csswg/css3-fonts/#propdef-font-size */
106 | 
107 | h1.title                        { font-size: 200%; }  /* for book title only */
108 | h2.title, p.subtitle.level-1    { font-size: 150%; margin-top: 4.5em;  margin-bottom: 2em }
109 | h3.title, p.subtitle.level-2    { font-size: 120%; margin-top: 2.25em; margin-bottom: 1.25em }
110 | h4.title, p.subtitle.level-3    { font-size: 100%; margin-top: 1.5em;  margin-bottom: 1.5em;  font-weight: bold; }
111 | h5.title, p.subtitle.level-4    { font-size:  89%; margin-top: 1.87em; margin-bottom: 1.69em; font-style: italic; }
112 | h6.title, p.subtitle.level-5    { font-size:  60%; margin-top: 3.5em;  margin-bottom: 2.5em }
113 | 
114 | /* title page */
115 | 
116 | h1.title, p.subtitle.level-1,
117 | h2.title, p.subtitle.level-2    { text-align: center }
118 | 
119 | #pg-header,
120 | h1.document-title               { margin: 10% 0 5% 0 }
121 | p.document-subtitle             { margin:  0  0 5% 0 }
122 | 
123 | /* PG header and footer */
124 | #pg-machine-header { }
125 | #pg-produced-by { }
126 | 
127 | li.toc-entry            { list-style-type: none }
128 | ul.open li, ol.open li  { margin-bottom: 1.5em }
129 | 
130 | .attribution            { margin-top: 1.5em }
131 | 
132 | .example-rendered {
133 |     margin: 1em 5%; border: 1px dotted red;  padding: 1em; background-color: #ffd }
134 | .literal-block.example-source   {
135 |     margin: 1em 5%; border: 1px dotted blue; padding: 1em; background-color: #eef }
136 | 
137 | /* DROPCAPS */
138 | 
139 | /* BLOCKQUOTES */
140 | 
141 | blockquote { margin: 1.5em 10% }
142 | 
143 | blockquote.epigraph { }
144 | 
145 | blockquote.highlights { }
146 | 
147 | div.local-contents { margin: 1.5em 10% }
148 | 
149 | div.abstract { margin: 3em   10% }
150 | div.image    { margin: 1.5em  0  }
151 | div.caption  { margin: 1.5em  0 }
152 | div.legend   { margin: 1.5em  0 }
153 | 
154 | .hidden { display: none }
155 | 
156 | .invisible { visibility: hidden; color: white } /* white: mozilla print bug */
157 | 
158 | a.toc-backref {
159 |   text-decoration: none ;
160 |   color: black }
161 | 
162 | dl.docutils dd {
163 |   margin-bottom: 0.5em }
164 | 
165 | div.figure { margin-top: 3em; margin-bottom: 3em }
166 | 
167 | img { max-width: 100% }
168 | 
169 | div.footer, div.header {
170 |   clear: both;
171 |   font-size: smaller }
172 | 
173 | div.sidebar {
174 |   margin: 0 0 0.5em 1em ;
175 |   border: medium outset ;
176 |   padding: 1em ;
177 |   background-color: #ffffee ;
178 |   width: 40% ;
179 |   float: right ;
180 |   clear: right }
181 | 
182 | div.sidebar p.rubric {
183 |   font-family: sans-serif ;
184 |   font-size: medium }
185 | 
186 | ol.simple, ul.simple { margin: 1.5em 0 }
187 | 
188 | ol.toc-list,    ul.toc-list    { padding-left:  0  }
189 | ol ol.toc-list, ul ul.toc-list { padding-left:  5% }
190 | 
191 | ol.arabic {
192 |   list-style: decimal }
193 | 
194 | ol.loweralpha {
195 |   list-style: lower-alpha }
196 | 
197 | ol.upperalpha {
198 |   list-style: upper-alpha }
199 | 
200 | ol.lowerroman {
201 |   list-style: lower-roman }
202 | 
203 | ol.upperroman {
204 |   list-style: upper-roman }
205 | 
206 | p.credits {
207 |   font-style: italic ;
208 |   font-size: smaller }
209 | 
210 | p.label {
211 |   white-space: nowrap }
212 | 
213 | p.rubric {
214 |   font-weight: bold ;
215 |   font-size: larger ;
216 |   color: maroon ;
217 |   text-align: center }
218 | 
219 | p.sidebar-title {
220 |   font-family: sans-serif ;
221 |   font-weight: bold ;
222 |   font-size: larger }
223 | 
224 | p.sidebar-subtitle {
225 |   font-family: sans-serif ;
226 |   font-weight: bold }
227 | 
228 | p.topic-title, p.admonition-title {
229 |   font-weight: bold }
230 | 
231 | pre.address {
232 |   margin-bottom: 0 ;
233 |   margin-top: 0 ;
234 |   font: inherit }
235 | 
236 | .literal-block, .doctest-block {
237 |   margin-left: 2em ;
238 |   margin-right: 2em; }
239 | 
240 | span.classifier {
241 |   font-family: sans-serif ;
242 |   font-style: oblique }
243 | 
244 | span.classifier-delimiter {
245 |   font-family: sans-serif ;
246 |   font-weight: bold }
247 | 
248 | span.interpreted {
249 |   font-family: sans-serif }
250 | 
251 | span.option {
252 |   white-space: nowrap }
253 | 
254 | span.pre {
255 |   white-space: pre }
256 | 
257 | span.problematic {
258 |   color: red }
259 | 
260 | span.section-subtitle {
261 |   /* font-size relative to parent (h1..h6 element) */
262 |   font-size: 100% }
263 | 
264 | table { margin-top: 1.5em; margin-bottom: 1.5em; border-spacing: 0 }
265 | table.align-left, table.align-right { margin-top: 0 }
266 | 
267 | table.table                { border-collapse: collapse; }
268 | 
269 | table.table.hrules-table thead          { border: 1px solid black; border-width: 2px 0 0 }
270 | table.table.hrules-table tbody          { border: 1px solid black; border-width: 2px 0 }
271 | table.table.hrules-rows  tr             { border: 1px solid black; border-width: 0 0 1px }
272 | table.table.hrules-rows  tr.last        { border-width: 0 }
273 | table.table.hrules-rows  td,
274 | table.table.hrules-rows  th             { padding: 1ex 1em; vertical-align: middle }
275 | 
276 | table.table tr             { border-width: 0 }
277 | table.table td,
278 | table.table th             { padding: 0.5ex 1em }
279 | table.table tr.first td    { padding-top: 1ex }
280 | table.table tr.last td     { padding-bottom: 1ex }
281 | table.table tr.first th    { padding-top: 1ex }
282 | table.table tr.last th     { padding-bottom: 1ex }
283 | 
284 | 
285 | table.citation {
286 |   border-left: solid 1px gray;
287 |   margin-left: 1px }
288 | 
289 | table.docinfo {
290 |   margin: 3em 4em }
291 | 
292 | table.docutils { }
293 | 
294 | div.footnote-group          { margin: 1em 0 }
295 | table.footnote td.label     { width: 2em; text-align: right; padding-left: 0 }
296 | 
297 | table.docutils td, table.docutils th,
298 | table.docinfo td, table.docinfo th {
299 |   padding: 0 0.5em;
300 |   vertical-align: top }
301 | 
302 | table.docutils th.field-name, table.docinfo th.docinfo-name {
303 |   font-weight: bold ;
304 |   text-align: left ;
305 |   white-space: nowrap ;
306 |   padding-left: 0 }
307 | 
308 | /* used to remove borders from tables and images */
309 | .borderless, table.borderless td, table.borderless th {
310 |   border: 0 }
311 | 
312 | table.borderless td, table.borderless th {
313 |   /* Override padding for "table.docutils td".  This used "!important",
314 |      which ADE chokes on.  The right padding separates the table cells. */
315 |   padding: 0 0.5em 0 0 } /* FIXME: was !important */
316 | 
317 | h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
318 | h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
319 |   font-size: 100% }
320 | 
321 | ul.auto-toc {
322 |   list-style-type: none }
323 | 


--------------------------------------------------------------------------------
/src/ebookmaker/CommonCode.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | """
  5 | CommonCode.py
  6 | 
  7 | Copyright 2014-2025 by Marcello Perathoner and Project Gutenberg
  8 | 
  9 | Distributable under the GNU General Public License Version 3 or newer.
 10 | 
 11 | Common code for EbookMaker and EbookConverter.
 12 | 
 13 | """
 14 | import csv
 15 | import datetime
 16 | import json
 17 | import os
 18 | import re
 19 | 
 20 | from io import StringIO
 21 | from six.moves import configparser
 22 | 
 23 | from libgutenberg.CommonOptions import Options
 24 | from libgutenberg.GutenbergGlobals import archive2files
 25 | from libgutenberg.Logger import debug, info, error, warning
 26 | from libgutenberg.Models import File
 27 | from . import parsers
 28 | 
 29 | class Struct(object):
 30 |     pass
 31 | 
 32 | options = Options()
 33 | 
 34 | class EbookmakerBadFileException(Exception):
 35 |     pass
 36 | 
 37 | class Job(object):
 38 |     """Hold 'globals' for a job.
 39 | 
 40 |     A job is defined as one unit of work, acting on one input url.
 41 | 
 42 |     """
 43 | 
 44 |     def __init__(self, type_):
 45 |         self.type = type_
 46 |         self.maintype, self.subtype = os.path.splitext(self.type)
 47 | 
 48 |         self.url = None
 49 |         self.outputdir = None
 50 |         self.outputfile = None
 51 |         self.logfile = None
 52 |         self.dc = None
 53 |         self.source = None
 54 |         self.opf_identifier = None
 55 |         self.main = None
 56 |         self.link_map = {}
 57 |         self.job = 0
 58 | 
 59 | 
 60 |     def __str__(self):
 61 |         l = []
 62 |         for k, v in self.__dict__.items():
 63 |             l.append("%s: %s" % (k, v))
 64 |         return '\n'.join(l)
 65 | 
 66 | 
 67 |     def last_updated(self):
 68 |         if not self.url:
 69 |             return None
 70 |         if hasattr(self.dc, 'files'):
 71 |             for file in self.dc.files:
 72 |                 file_url = parsers.webify_url(path_from_file(file))
 73 |                 if self.url == file_url:
 74 |                     self.dc.update_date = file.modified.date()
 75 |                     return file.modified
 76 | 
 77 |         path = self.url[7:] if self.url.startswith('file:///') else self.url
 78 |         try:
 79 |             statinfo = os.stat(path)
 80 |             modified = datetime.datetime.fromtimestamp(statinfo.st_mtime)
 81 |             if self.dc:
 82 |                 self.dc.update_date = modified.date()
 83 |             return modified
 84 |         except FileNotFoundError as e:
 85 |             error(e)
 86 |             return
 87 | 
 88 | 
 89 | def add_dependencies(targets, deps, order=None):
 90 |     """ Add dependent formats and optionally put into right build order. """
 91 | 
 92 |     for target, deps in deps.items():
 93 |         if target in targets:
 94 |             targets = list(set(targets).union(deps))
 95 |     if order:
 96 |         return list(filter(targets.__contains__, order))
 97 |     return targets
 98 | 
 99 | 
def add_common_options(ap, user_config_file):
    """ Add options common to all programs.

    ap               -- the argument parser to extend
    user_config_file -- default value for the --config option

    """

    # (flags, add_argument keywords), in the order they appear in --help
    common_options = (
        (("--verbose", "-v"), {
            "action": "count",
            "default": 0,
            "help": "be verbose (-v -v be more verbose)",
        }),
        (("--production",), {
            "dest": "production",
            "action": "store_true",
            "help": "use messages appropriate for a production context",
        }),
        (("--config",), {
            "metavar": "CONFIG_FILE",
            "dest": "config_file",
            "action": "store",
            "default": user_config_file,
            "help": "read config file (default: %(default)s)",
        }),
        (("--validate",), {
            "dest": "validate",
            "action": "store_true",
            "help": "validate epub and html through epubcheck/nu",
        }),
        (("--notify",), {
            "dest": "notify",
            "action": "store_true",
            "help": "write CRITICAL messages to notifier logs",
        }),
    )

    for flags, keywords in common_options:
        ap.add_argument(*flags, **keywords)
135 | 
136 | 
def set_arg_defaults(ap, config_file):
    """ Use the [DEFAULT_ARGS] section of config_file as argument defaults.

    Does nothing if the file cannot be read or has no such section.

    """
    parser = configparser.ConfigParser()
    parser.read(config_file)
    if not parser.has_section('DEFAULT_ARGS'):
        return
    arg_defaults = {name: value for name, value in parser.items('DEFAULT_ARGS')}
    ap.set_defaults(**arg_defaults)
143 | 
def parse_config_and_args(ap, sys_config, defaults=None):
    """ Parse the command line and the config files into `options`.

    ap         -- fully configured argument parser
    sys_config -- path of the system-wide config file
    defaults   -- optional dict of config defaults; may be None

    The command-line args are merged into the module-level `options`.
    The system config file and the user config file (options.config_file)
    are read in that order.  `options.config` becomes a fresh Struct that
    holds the defaults, then the values of every config section; names are
    upper-cased and sections are flattened, so later values win.

    Returns `options`.

    """

    # put command-line args into options
    options.update(vars(ap.parse_args()))

    cp = configparser.ConfigParser()
    cp.read((sys_config, options.config_file))

    options.config = Struct()

    # defaults is optional: the signature advertises None
    for name, value in (defaults or {}).items():
        setattr(options.config, name.upper(), value)

    for section in cp.sections():
        for name, value in cp.items(section):
            setattr(options.config, name.upper(), value)

    return options
162 | 
163 | 
# base directory for private data; empty (relative paths) if PRIVATE is unset
PRIVATE = os.environ.get('PRIVATE') or ''
NOTIFICATION_DIR = os.path.join(PRIVATE, 'logs', 'notifications')

def queue_notifications(ebook, message, subject='EbookMaker Notification'):
    """ Append a notification to the queue file of an ebook.

    The queue file is NOTIFICATION_DIR/<ebook>.txt; a 'Subject:' line is
    written first, then the message exactly as given.

    """
    queue_path = os.path.join(NOTIFICATION_DIR, '%s.txt' % ebook)
    with open(queue_path, 'a+') as queue:
        queue.write('Subject: %s\n' % subject)
        queue.write(message)
173 | 
174 | def dir_from_url(url):
175 |     if os.path.isdir(url):
176 |         return url
177 |     if url.startswith('file://'):
178 |         dir = os.path.dirname(os.path.abspath(url[7:]))
179 |     elif url.startswith('file:'):
180 |         dir = os.path.dirname(os.path.abspath(url[5:]))
181 |     else:
182 |         dir = os.path.dirname(os.path.abspath(url))
183 |     return dir
184 | 
185 | 
# path starting with a single digit directory, e.g. '1/2/3/1234/...'
RE_SIMPATH = re.compile(r'^\d/')
# everything from the first directory made of two or more digits
RE_PGNUM = re.compile(r'/(\d\d+/.*)')
# a directory made of two or more digits
RE_FATNUM = re.compile(r'/(\d\d+)/')

def pgnum_from_url(url):
    """ Return the longest all-digit directory name (2+ digits) in url.

    On a tie the first one wins; '0' is returned if there is none.

    """
    return max(RE_FATNUM.findall(url), key=len, default='0')
195 | 
def filesdir():
    """ Return the directory configured as FILESDIR.

    options.config.FILESDIR is resolved with dir_from_url().  A trailing
    slash is added first because dir_from_url() drops the last path
    component of anything that is not an existing directory.

    Falls back to the user's home directory (and logs that) when FILESDIR
    is missing or empty; an empty value used to raise IndexError.

    """
    configured = getattr(options.config, 'FILESDIR', None)
    if configured:
        if not configured.endswith('/'):
            configured += '/'
        return dir_from_url(configured)

    # use home dir
    _filesdir = os.path.expanduser("~")
    info('Not configured, using %s for FILESDIR', _filesdir)
    return _filesdir
207 |     
def path_from_file(f):
    """
    Return the file system path for a database file object or archive path.

    In some places, we need to get a file system path from a database file
    object. These objects have `archive_path` properties, which are meant to
    be resolved with respect to a home directory with the assistance of some
    symlinks. There are 2 types, one that starts with 'cache/epub/NNNN'
    (where NNNN is the ebook number) and another that starts with
    'N/N/N/NNNN', which gets symlinked to 'files/NNNN'. (A third type,
    starting with 'etext', is obsolete and should no longer be encountered.)

    This function needs to deal with three cases:
    1. the production environment
    2. a development environment
    3. a test environment

    These environments are characterized by configuration variables
    (set by parse_config_and_args):
    FILESDIR  - should be a 'file:' URL
        on prod: file:///public/vhost/g/gutenberg/html/
    CACHEDIR - should be a file system path
        on prod: /public/vhost/g/gutenberg/html/cache/epub

    For good measure, the paths might include Windows partitions ('c:').

    f -- a libgutenberg.Models.File or an archive path string

    Returns the path as a string, or None (after logging an error) if f is
    neither of the accepted types.

    """
    if isinstance(f, File):
        archive_path = f.archive_path
    elif isinstance(f, str):
        archive_path = f
    else:
        error('%s is not a string or a libgutenberg.Models.File object', f)
        return None

    if hasattr(options.config, 'CACHEDIR'):
        cachedir = dir_from_url(options.config.CACHEDIR)
    else:
        # use home dir
        cachedir = os.path.expanduser("~/cache/epub/")
        info('Not configured, using %s for CACHE', cachedir)

    if archive_path.startswith('cache/epub/'):
        # generated file; 11 == len('cache/epub/')
        return os.path.join(cachedir, archive_path[11:])

    if RE_SIMPATH.search(archive_path):
        # files directory, replace 1/2/3/1234 with files/1234
        if archive_path[0] == '0':
            # special case for single digits: '0/N/...' -> 'files/N/...'
            return os.path.join(filesdir(), 'files', archive_path[2:])
        pgnum = RE_PGNUM.search(archive_path)
        if pgnum:
            return os.path.join(filesdir(), 'files', pgnum.group(1))

    # legacy pattern, shouldn't be there, but give it a try
    warning('%s is an obsolete or incomplete archive path', archive_path)
    # resolve against the configured files directory; this used to join the
    # literal string 'filesdir', which produced a meaningless relative path
    return os.path.join(filesdir(), 'dirs', archive_path)
260 |             
261 | 
def find_candidates(path, file_filter=lambda x: True):
    """ Walk the directory containing path, yield files satisfying file_filter.

    Directories whose path contains a '/.' component or starts with '.'
    are passed over.

    """
    top = dir_from_url(path)
    for current, _subdirs, filenames in os.walk(top):
        is_hidden = current.startswith('.') or '/.' in current
        if is_hidden:
            continue
        full_paths = (os.path.join(current, name) for name in filenames)
        yield from filter(file_filter, full_paths)
273 | 
274 | ALTTEXT_DIR = os.path.join(PRIVATE, 'logs', 'alt')
275 | 
class EbookAltText:
    """ Alt texts for the images of one ebook.

    The texts are read from ALTTEXT_DIR/alt<ebook>.json, which must hold a
    json object mapping image ids to alt texts.  If the file is missing,
    cannot be decoded or is not a json object, no map is loaded (the latter
    two cases are logged as errors).

    """
    # stays None unless a valid alt text file was loaded
    _alt_map = None

    def __init__(self, ebook):
        alt_text_file = os.path.join(ALTTEXT_DIR, f'alt{ebook}.json')
        if not os.path.exists(alt_text_file):
            return
        try:
            # read bytes and let json detect the encoding, so the result
            # does not depend on the locale of the running process
            with open(alt_text_file, 'rb') as data:
                alt_map = json.loads(data.read())
        except (json.decoder.JSONDecodeError, UnicodeDecodeError) as jde:
            error(f'{alt_text_file} is not valid json. {jde}')
            return
        if isinstance(alt_map, dict):
            self._alt_map = alt_map
        else:
            # a list or scalar would make get() fail later on
            error(f'{alt_text_file} is not valid json. expected an object')


    # note that this returns None if there is no alt text file for the ebook
    def get(self, img_id):
        """ Return the alt text for img_id, '' if the map has none.

        Returns None if no alt text map was loaded for the ebook.

        """
        if self._alt_map is not None:
            return self._alt_map.get(img_id, '')
        return None
294 | 
# one or more consecutive line break characters
ONELINE = re.compile(r'[\r\n]+')
def csv_escape(items):
    """ Format items as one line of excel-dialect csv.

    Each item is converted to str and every run of line breaks in it is
    replaced by a single space.  The returned string includes the line
    terminator written by the csv writer.

    """
    cells = []
    for item in items:
        cells.append(ONELINE.sub(' ', str(item)))

    out = StringIO()
    csv.writer(out, dialect="excel").writerow(cells)
    return out.getvalue()
301 | 


--------------------------------------------------------------------------------
/src/ebookmaker/HTMLChunker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | HTMLChunker.py
  7 | 
  8 | Copyright 2009, 2014 by Marcello Perathoner
  9 | Copyright 2025 by Project Gutenberg
 10 | 
 11 | Distributable under the GNU General Public License Version 3 or newer.
 12 | 
 13 | Splits a HTML file into chunks.
 14 | 
 15 | """
 16 | 
 17 | from __future__ import unicode_literals
 18 | 
 19 | 
 20 | import os
 21 | import re
 22 | import copy
 23 | 
 24 | from lxml import etree
 25 | from six.moves import urllib
 26 | 
 27 | import libgutenberg.GutenbergGlobals as gg
 28 | from libgutenberg.GutenbergGlobals import NS
 29 | from libgutenberg.Logger import debug, error, info
 30 | 
 31 | from ebookmaker.CommonCode import Options
 32 | options = Options()
 33 | 
# Base limit for the size of one chunk.  HTMLChunker uses it as is for
# epub2 and three times as much for any other version.
MAX_CHUNK_SIZE = 100 * 1024  # bytes
# Lower bound for the per-tag split thresholds derived from SECTIONS.
MIN_CHUNK_SIZE = 256  # bytes

# (tag or tag.class, fraction of the maximum chunk size).  HTMLChunker
# starts a new chunk in front of such an element once the current chunk
# is larger than that fraction of the maximum (but at least
# MIN_CHUNK_SIZE); so the smaller the fraction, the more eagerly the
# element is used as a split point.
SECTIONS = [
    ('div.section', 0.0),
    ('div.chapter', 0.0),
    ('section', 0.0),
    ('h1', 0.5),
    ('div', 0.5),
    ('h2', 0.7),
    ('h3', 0.75),
    ('p', 0.8),
    ('figure', 0.8),
]

# Elements the chunker does not descend into when looking for the node to
# split; a chunk that received one of them is not split any further.
NEVER_SPLIT = [NS.xhtml[tag] for tag in ['table', 'figure', 'dl', 'ol', 'ul']]
 50 | 
 51 | def xpath(node, path):
 52 |     """ xpath helper """
 53 |     return node.xpath(path, namespaces = gg.NSMAP)
 54 | 
 55 | def normalize_uri(uri):
 56 |     """ Normalize URI for idmap. """
 57 |     return urllib.parse.unquote(uri) # .decode('utf-8')
 58 | 
 59 | 
 60 | class HTMLChunker(object):
 61 |     """ Splits HTML tree into smaller chunks.
 62 | 
 63 |     Some epub viewers are limited in that they cannot display files
 64 |     larger than 300K.  If our HTML happens to be longer, we have to
 65 |     split it up.  Also smaller chunks do improve page flip times.
 66 | 
 67 | 
 68 |     """
 69 | 
 70 |     def __init__(self, version='epub2'):
 71 |         self.chunks = []
 72 |         self.idmap = {}
 73 |         self.chunk = None
 74 |         self.chunk_body = None
 75 |         self.chunk_size = 0
 76 |         self.next_id = 0
 77 |         self.version = version
 78 |         self.max_chunk_size = MAX_CHUNK_SIZE * (1 if version == 'epub2' else 3)
 79 |         self.nosplit = False
 80 | 
 81 |         self.tags = {}
 82 |         for tag, size in SECTIONS:
 83 |             self.tags[NS.xhtml[tag]] = max(MIN_CHUNK_SIZE, int(size * self.max_chunk_size))
 84 |         for tag in options.section_tags:
 85 |             self.tags[NS.xhtml[tag]] = MIN_CHUNK_SIZE
 86 | 
 87 | 
 88 |     def _make_name(self, url):
 89 |         """ Generate a name for the chunk. """
 90 |         u = list(urllib.parse.urlparse(url))
 91 |         root, ext = os.path.splitext(u[2])
 92 |         html_ext = 'html' if self.version == 'epub2' else 'xhtml'
 93 |         u[2] = f'{root}-{self.next_id}{ext}.{html_ext}'
 94 |         self.next_id += 1
 95 |         return urllib.parse.urlunparse(u)
 96 | 
 97 | 
 98 |     @staticmethod
 99 |     def make_template(tree):
100 |         """ Make a copy with an empty html:body.
101 | 
102 |         This makes a template into which we can paste our chunks.
103 | 
104 |         """
105 | 
106 |         template = copy.deepcopy(tree)
107 | 
108 |         for c in xpath(template, '//xhtml:body'):
109 | 
110 |             # descend while elem has only one child
111 |             while len(c) == 1 and c[0].tag not in NEVER_SPLIT:
112 |                 c = c[0]
113 | 
114 |             # clear children but save attributes
115 |             attributes = c.attrib.items()
116 |             c.clear()
117 |             for a in attributes:
118 |                 c.set(a[0], a[1])
119 | 
120 |         return template
121 | 
122 | 
123 |     def reset_chunk(self, template):
124 |         """ start a new chunk """
125 | 
126 |         self.chunk = copy.deepcopy(template)
127 |         self.chunk_size = 0
128 |         self.chunk_body = xpath(self.chunk, "//xhtml:body")[0]
129 |         while len(self.chunk_body) == 1 and self.chunk_body[0].tag not in NEVER_SPLIT:
130 |             self.chunk_body = self.chunk_body[0]
131 |         self.nosplit = False
132 | 
133 | 
134 |     def shipout_chunk(self, attribs, chunk_id = None, comment = None):
135 |         """ ready chunk to be shipped """
136 | 
137 |         attribs = copy.deepcopy(attribs)
138 |         attribs.rel.discard('mathml')
139 |         if self.chunk_size > self.max_chunk_size and not self.nosplit:
140 |             self.split(self.chunk, attribs)
141 |             return
142 | 
143 |         url = normalize_uri(attribs.url)
144 |         chunk_name = self._make_name(url)
145 | 
146 |         # the url of the whole page
147 |         if url not in self.idmap:
148 |             self.idmap[url] = chunk_name
149 | 
150 |         # fragments of the page
151 |         for e in xpath(self.chunk, '//xhtml:*[@id]'):
152 |             id_ = e.attrib['id']
153 |             old_id = "%s#%s" % (url, id_)
154 |             # key is unicode string,
155 |             # value is uri-escaped byte string
156 |             # if ids get cloned while chunking, map to the first one only
157 |             if old_id not in self.idmap:
158 |                 self.idmap[old_id] = "%s#%s" % (
159 |                     chunk_name,  urllib.parse.quote(id_))
160 |         for e in xpath(self.chunk, '//xhtml:math'):
161 |             attribs.rel.add('mathml')
162 |             break
163 |         for e in xpath(self.chunk, '//svg:svg'):
164 |             attribs.rel.add('svg')
165 |             break
166 |         attribs.url = chunk_name
167 |         attribs.id = chunk_id
168 |         attribs.comment = comment
169 |         if self.chunk_size > 0:
170 |             self.chunks.append((self.chunk, attribs) )
171 | 
172 |             debug("Adding chunk %s (%d bytes) %s" % (chunk_name, self.chunk_size, chunk_id))
173 | 
174 | 
175 |     def split(self, tree, attribs):
176 |         """ Split whole html or split chunk.
177 | 
178 |         Find some arbitrary points to do it.
179 | 
180 |         """
181 | 
182 |         for body in xpath(tree, "//xhtml:body"):
183 |             # we can't split a node that has only one child
184 |             # descend while elem has only one child
185 |             while len(body) == 1 and body[0].tag not in NEVER_SPLIT:
186 |                 body = body[0]
187 | 
188 |             debug("body tag is %s" % body.tag)
189 | 
190 |             template = self.make_template(tree)
191 |             self.reset_chunk(template)
192 | 
193 |             # FIXME: is this ok ???
194 |             # fixes patological one-element-body case
195 |             self.chunk_body.text = body.text
196 | 
197 |             for child in body:
198 |                 if not isinstance(child, etree.ElementBase):
199 |                     # comments, processing instructions etc.
200 |                     continue
201 | 
202 |                 # size measurement doesn't need to be exact
203 |                 try:
204 |                     child_size = len(etree.tostring(child, encoding='utf-8'))
205 |                 except etree.SerialisationError:
206 |                     child_size = len(etree.tostring(child, encoding='latin_1'))
207 | 
208 |                 try:
209 |                     tags = [child.tag + '.' + c for c in child.attrib['class'].split()]
210 |                     tags.append(child.tag)
211 |                 except KeyError:
212 |                     tags = [child.tag]
213 | 
214 |                 for tag in tags:
215 |                     if child.tag in NEVER_SPLIT:
216 |                         self.nosplit = True
217 |                         break
218 |                     if ((self.chunk_size + child_size > self.max_chunk_size) or
219 |                               (tag in self.tags and self.chunk_size > self.tags[tag])):
220 | 
221 |                         comment = ("Chunk: size=%d Split on %s"
222 |                                    % (self.chunk_size, re.sub('^{.*}', '', tag)))
223 |                         debug(comment)
224 | 
225 |                         # find a suitable id
226 |                         chunk_id = None
227 |                         for c in self.chunk_body:
228 |                             if 'id' in c.attrib:
229 |                                 chunk_id = c.get('id')
230 |                                 break
231 |                         debug("chunk id is: %s" % (chunk_id or ''))
232 | 
233 |                         self.shipout_chunk(attribs, chunk_id, comment)
234 |                         self.reset_chunk(template)
235 |                         break
236 | 
237 |                 self.chunk_body.append(child)
238 |                 self.chunk_size = self.chunk_size + child_size
239 | 
240 |             # fixes patological one-element-body case
241 |             self.chunk_body.tail = body.tail
242 | 
243 |             chunk_id = None
244 |             if len(self.chunk_body):
245 |                 chunk_id = self.chunk_body[0].get('id')
246 |             comment = "Chunk: size=%d" % self.chunk_size
247 |             
248 |             self.shipout_chunk(attribs, chunk_id, comment)
249 |             self.reset_chunk(template)
250 | 
251 | 
252 |     def rewrite_links(self, f):
253 |         """ Rewrite all href and src using f(). """
254 | 
255 |         for chunk in self.chunks:
256 |             # chunk['name'] = f(chunk['name'])
257 | 
258 |             for link in xpath(chunk[0], '//xhtml:*[@href]'):
259 |                 link.set('href', f(link.get('href')))
260 | 
261 |             for image in xpath(chunk[0], '//xhtml:*[@src]'):
262 |                 image.set('src', f(image.get('src')))
263 | 
264 |         for k, v in self.idmap.items():
265 |             self.idmap[k] = f(v)
266 | 
267 | 
268 |     def rewrite_internal_links(self):
269 |         """ Rewrite links to point into right chunks.
270 | 
271 |         Because we split the HTML into chunks, all internal links need
272 |         to be rewritten to become links into the right chunk.
273 |         Rewrite all internal links in all chunks.
274 | 
275 |         """
276 |         for chunk in self.chunks:
277 |             for a in xpath(chunk[0], "//xhtml:*[@href]"):
278 |                 try:
279 |                     uri = normalize_uri(a.get('href'))
280 |                     a.set('href', self.idmap[uri])
281 |                 except KeyError:
282 |                     ur, dummy_frag = urllib.parse.urldefrag(uri)
283 |                     if ur in self.idmap:
284 |                         error("HTMLChunker: Cannot rewrite internal link '%s'", uri)
285 | 
286 | 
287 |     def rewrite_internal_links_toc(self, toc):
288 |         """ Rewrite links to point into right chunks.
289 | 
290 |         Because we split the HTML into chunks, all internal links need
291 |         to be rewritten to become links into the right chunk.
292 |         Rewrite all links in the passed toc.
293 | 
294 |         """
295 | 
296 |         for entry in toc:
297 |             try:
298 |                 entry[0] = self.idmap [normalize_uri(entry[0])]
299 |             except KeyError:
300 |                 error("HTMLChunker: Cannot rewrite toc entry '%s'" % entry[0])
301 |                 error(repr(self.idmap))
302 |                 del entry
303 | 


--------------------------------------------------------------------------------
/src/ebookmaker/Spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #  -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*-
  3 | 
  4 | """
  5 | 
  6 | Spider.py
  7 | 
  8 | Copyright 2009 by Marcello Perathoner
  9 | Copyright 2025 by Project Gutenberg
 10 | 
 11 | Distributable under the GNU General Public License Version 3 or newer.
 12 | 
 13 | Rudimentary Web Spider
 14 | 
 15 | """
 16 | import copy
 17 | import fnmatch
 18 | import os.path
 19 | import re
 20 | 
 21 | from six.moves import urllib
 22 | 
 23 | import libgutenberg.GutenbergGlobals as gg
 24 | from libgutenberg.GutenbergGlobals import NS
 25 | from libgutenberg.Logger import critical, debug, error, info, warning
 26 | from libgutenberg import MediaTypes
 27 | 
 28 | from ebookmaker import parsers
 29 | from ebookmaker.CommonCode import Options
 30 | from ebookmaker.ParserFactory import ParserFactory
 31 | 
 32 | 
 33 | OPS_AUDIO_MEDIATYPES = set((
 34 |     'audio/mpeg',
 35 |     'audio/ogg; codecs=opus',
 36 |     'audio/ogg',  # need both because spider believes type attribute
 37 | ))
 38 | 
# option values read by Spider.__init__ (include / exclude lists, max_depth)
options = Options()
 40 | 
 41 | RE_PGLINK = re.compile(r'^https?://(www.|)(gutenberg|pglaf|pgdp).org(\W|$)', re.I)
 42 | 
 43 | class Spider(object):
 44 |     """ A very rudimentary web spider. """
 45 | 
 46 |     def __init__(self, job):
 47 |         self.parsed_urls = set()
 48 |         self.parsers = []
 49 |         self.redirection_map = {}
 50 | 
 51 |         dirpath = os.path.dirname(job.url)  # platform native path
 52 |         # use for parser only
 53 |         self.include_urls = []
 54 |         self.include_urls += options.include_urls or [parsers.webify_url(dirpath) + '/*']
 55 | 
 56 |         self.include_mediatypes = []
 57 |         self.include_mediatypes += options.include_mediatypes
 58 |         if job.subtype == '.images' or job.type == 'rst.gen':
 59 |             self.include_mediatypes.append('image/*')
 60 |         if job.type == 'epub3.images':
 61 |             self.include_mediatypes.extend([str(mt) for mt in OPS_AUDIO_MEDIATYPES])
 62 |         if job.type == 'html.images':
 63 |             self.include_mediatypes.append('*/*')
 64 | 
 65 |         self.exclude_urls = []
 66 |         self.exclude_urls += options.exclude_urls
 67 | 
 68 |         self.exclude_mediatypes = []
 69 |         self.exclude_mediatypes += options.exclude_mediatypes
 70 |         if job.type in {'epub.images', 'epub.noimages', 'epub3.images'}:
 71 |             self.exclude_mediatypes.append('application/xml')
 72 |             self.include_unknown = False
 73 |         else:
 74 |             self.include_unknown = True
 75 |         self.max_depth = options.max_depth or six.MAXSIZE
 76 |         self.jobtype = job.type
 77 |         self.job_dc = job.dc
 78 | 
 79 | 
 80 |     def recursive_parse(self, root_attribs):
 81 |         """ Do a recursive parse starting from url.
 82 | 
 83 |         Do a breadth-first traversal. Assuming the first page contains
 84 |         a linked TOC, this will get us a more natural ordering of the
 85 |         pages than a depth-first traversal.
 86 | 
 87 |         """
 88 | 
 89 |         queue = []
 90 | 
 91 |         debug("Start of retrieval")
 92 | 
 93 |         # enqueue root url
 94 | 
 95 |         counter = 0
 96 |         self.enqueue(queue, 0, root_attribs, True)
 97 | 
 98 |         while queue:
 99 |             depth, attribs = queue.pop(0)
100 | 
101 |             url = self.redirect(attribs.url)
102 |             if url in self.parsed_urls:
103 |                 continue
104 | 
105 |             parser = ParserFactory.create(url, attribs)
106 |             if parser is None:
107 |                 continue
108 |             # Maybe the url was redirected to something we already have?
109 |             url = parser.attribs.url
110 |             if url in self.parsed_urls:
111 |                 critical('no content in %s', url)
112 |                 continue
113 |             self.parsed_urls.add(url)
114 |             if hasattr(parser, 'add_title'):
115 |                 parser.add_title(self.job_dc)
116 | 
117 |             self.add_redirection(parser.attribs.orig_url, url)
118 |             parser.pre_parse()
119 |             self.parsers.append(parser)
120 |             
121 |             # the following code alters the the dom tree, so make a copy of the tree
122 |             if hasattr(parser, 'xhtml') and parser.xhtml is not None:
123 |                 parser._xhtml = copy.deepcopy(parser.xhtml)
124 | 
125 |             # look for more documents to add to the queue
126 |             # debug("Requesting iterlinks for: %s ..." % url)
127 |             for url, elem in parser.iterlinks():
128 |                 counter += 1
129 |                 url = urllib.parse.urldefrag(url)[0]
130 |                 if url == parser.attribs.url or url in self.parsed_urls:
131 |                     continue
132 |                 if elem.get('rel') == 'nofollow' and self.jobtype in ('epub.images', 'epub3.images'):
133 |                     # remove link to content not followed
134 |                     elem.tag = 'span'
135 |                     elem.set('data-nofollow-href', elem.get('href'))
136 |                     del elem.attrib['href']
137 |                     del elem.attrib['rel']
138 |                     warning('not followed: %s' % url)
139 |                     continue
140 | 
141 |                 new_attribs = parsers.ParserAttributes()
142 |                 new_attribs.url = url
143 |                 new_attribs.referrer = parser.attribs.url
144 | 
145 |                 for k, v in elem.items():
146 |                     if k in ('id', 'title'):
147 |                         setattr(new_attribs, k, v)
148 |                     elif k == 'type':
149 |                         new_attribs.orig_mediatype = v
150 |                     elif k == 'rel':
151 |                         new_attribs.rel.update(v.lower().split())
152 | 
153 |                 if not new_attribs.id:
154 |                     # synthesize and set an id for backlink
155 |                     seed = url + ' ' + str(counter)
156 |                     new_attribs.id = f'id-{abs(hash(seed))}'
157 |                     elem.attrib['id'] = new_attribs.id
158 |    
159 |                 tag = elem.tag
160 |                 if tag == NS.xhtml.a:
161 |                     if self.is_image(new_attribs) and self.is_included_url(new_attribs) and \
162 |                             self.is_included_mediatype(new_attribs) and \
163 |                             self.jobtype in ('epub.images', 'epub3.images'):
164 |                         # need to wrap an image
165 |                         wrapper_parser = parsers.WrapperParser.Parser(new_attribs)
166 |                         if wrapper_parser.attribs.url not in self.parsed_urls:
167 |                             ParserFactory.parsers[wrapper_parser.attribs.url] = wrapper_parser
168 |                             self.parsers.append(wrapper_parser)
169 |                             self.parsed_urls.add(wrapper_parser.attribs.url)
170 |                         
171 |                         elem.set('href', wrapper_parser.attribs.url)
172 |                         new_attribs.referrer = wrapper_parser.attribs.url
173 |                         elem.set('title', wrapper_parser.attribs.title)
174 |                         self.enqueue(queue, depth + 1, new_attribs, False)
175 |                     else:
176 |                         self.enqueue(queue, depth + 1, new_attribs, True)
177 |                         
178 |                 elif tag in (NS.xhtml.img, NS.xhtml.style, NS.xhtml.math):
179 |                     if tag == NS.xhtml.style or self.is_image(new_attribs):
180 |                         self.enqueue(queue, depth, new_attribs, False)
181 |                     else:
182 |                         error(f"{url} is not a supported image type")
183 |                 elif tag == NS.xhtml.link:
184 |                     if new_attribs.rel.intersection(('stylesheet', 'icon')):
185 |                         self.enqueue(queue, depth, new_attribs, False)
186 |                     else:
187 |                         self.enqueue(queue, depth + 1, new_attribs, True)
188 |                 elif tag in (NS.xhtml.object, NS.xhtml.source):
189 |                     self.enqueue(queue, depth, new_attribs, False)
190 | 
191 |         debug("End of retrieval")
192 | 
193 |         # rewrite redirected urls
194 |         if self.redirection_map:
195 |             for parser in self.parsers:
196 |                 parser.remap_links(self.redirection_map)
197 |         # remove parsers with missing content
198 |         self.parsers = [parser for parser in self.parsers if parser.fp != None]
199 | 
200 |         self.topological_sort()
201 | 
202 | 
203 |     def enqueue(self, queue, depth, attribs, is_doc):
204 |         """ Enqueue url for parsing."""
205 | 
206 |         if is_doc:
207 |             if not self.is_included_url(attribs):
208 |                 if attribs.url and RE_PGLINK.search(attribs.url):
209 |                     info('PG link in %s: %s', attribs.referrer, attribs.url)
210 |                 else: 
211 |                     warning('External link in %s: %s', attribs.referrer, attribs.url)
212 |                 return
213 |             if depth >= self.max_depth:
214 |                 critical('Omitted file %s due to depth > max_depth' % attribs.url)
215 |                 return
216 |         
217 |         if not self.is_included_mediatype(attribs) and not self.is_included_relation(attribs):
218 |             return
219 |         elif not self.is_included_url(attribs) and not self.is_included_relation(attribs):
220 |             critical('Failed for embedded media in %s from disallowed location: %s'
221 |                   % (attribs.referrer, attribs.url))
222 |             return
223 | 
224 |         queue.append((depth, attribs))
225 | 
226 | 
227 |     def is_image(self, attribs):
228 |         """ Return True if png, gif, svg or jpg. """
229 |         return self.get_mediatype(attribs) in parsers.ImageParser.mediatypes
230 | 
231 | 
232 |     def is_included_url(self, attribs):
233 |         """ Return True if this document is eligible. """
234 | 
235 |         url = attribs.url
236 | 
237 |         included = any([fnmatch.fnmatchcase(url, x) for x in self.include_urls])
238 |         excluded = any([fnmatch.fnmatchcase(url, x) for x in self.exclude_urls])
239 | 
240 |         if included and not excluded:
241 |             return True
242 | 
243 |         if excluded:
244 |             debug("Dropping excluded %s" % url)
245 |         return False
246 | 
247 | 
248 |     def get_mediatype(self, attribs):
249 |         """ Get mediatype out of attribs, guessing if needed. """
250 |         if attribs.orig_mediatype is None:
251 |             mediatype = MediaTypes.guess_type(attribs.url)
252 |             if mediatype:               
253 |                 attribs.orig_mediatype = mediatype
254 |             else:
255 |                 return None
256 |         return attribs.orig_mediatype
257 | 
258 | 
259 |     def is_included_mediatype(self, attribs):
260 |         """ Return True if this document is eligible. """
261 | 
262 |         mediatype = self.get_mediatype(attribs)
263 |         if not mediatype:
264 |             warning('Mediatype could not be determined from url %s' % attribs.url)
265 |             return self.include_unknown # don't include in epubs if mediatype unknown
266 | 
267 |         included = any([fnmatch.fnmatch(mediatype, pattern)
268 |                         for pattern in self.include_mediatypes])
269 |         excluded = any([fnmatch.fnmatch(mediatype, pattern)
270 |                         for pattern in self.exclude_mediatypes])
271 | 
272 |         if included and not excluded:
273 |             return True
274 | 
275 |         if excluded:
276 |             debug("Dropping excluded mediatype %s" % mediatype)
277 | 
278 |         return False
279 | 
280 | 
281 |     def is_included_relation(self, attribs):
282 |         """ Return True if this document is eligible. """
283 | 
284 |         keep = attribs.rel.intersection(('icon', 'important', 'linked_image'))
285 |         if keep:
286 |             debug("Not dropping after all because of rel.")
287 | 
288 |         return keep
289 | 
290 | 
291 |     def topological_sort(self):
292 |         """ Do a topological sort of documents using <link rel='next'> """
293 | 
294 |         relnext = [(p.attribs.referrer, p.attribs.url)
295 |                    for p in self.parsers if 'next' in p.attribs.rel]
296 |         if relnext:
297 |             try:
298 |                 d = {}
299 |                 for order, url in enumerate(gg.topological_sort(relnext)):
300 |                     d[url] = order
301 |                     debug("%s order %d" % (url, order))
302 |                 for parser in self.parsers:
303 |                     parser.order = d.get(parser.attribs.url, 999999)
304 |                 self.parsers.sort(key=lambda p: p.order)
305 | 
306 |             except Exception:
307 |                 pass
308 | 
309 | 
310 |     def add_redirection(self, from_url, to_url):
311 |         """ Remember this redirection. """
312 | 
313 |         if from_url != to_url:
314 |             self.redirection_map[from_url] = to_url
315 |             debug("Adding redirection from %s to %s" % (from_url, to_url))
316 | 
317 | 
318 |     def redirect(self, url):
319 |         """
320 |         Redirect url.
321 | 
322 |         Parsers are cached under the 200 url. This is an offline redirect
323 |         to find the 200 url.
324 | 
325 |         """
326 |         return self.redirection_map.get(url, url)
327 | 
328 | 
329 |     def dict_urls_mediatypes(self):
330 |         """ Return a dict of all parsed urls and mediatypes. """
331 |         return dict([(p.attribs.url, p.mediatype()) for p in self.parsers])
332 | 
333 | 
334 |     def aux_file_iter(self):
335 |         """ Iterate over image files. Return absolute urls. """
336 | 
337 |         for p in self.parsers:
338 |             if hasattr(p, 'resize_image'):
339 |                 yield p.attribs.url
340 | 


--------------------------------------------------------------------------------