├── tests ├── __init__.py ├── cache │ └── epub │ │ └── test ├── files │ └── 43172 │ │ ├── images │ │ └── image.jpg │ │ └── 43172-h │ │ ├── music │ │ └── test.mp3 │ │ └── images │ │ ├── image.jpg │ │ └── mathex.jpg ├── test_rst.py ├── test_txt.py ├── test_html.py ├── test_templates.py ├── test_job.py ├── test_setup.py └── test_htm.py ├── src └── ebookmaker │ ├── __init__.py │ ├── mydocutils │ ├── __init__.py │ ├── gutenberg │ │ ├── __init__.py │ │ ├── writers │ │ │ ├── __init__.py │ │ │ └── nroff.py │ │ ├── parsers │ │ │ ├── pg-header.rst │ │ │ └── __init__.py │ │ └── transforms │ │ │ └── __init__.py │ ├── transforms │ │ └── __init__.py │ ├── writers │ │ ├── rst2epub.css │ │ ├── rst2html.css │ │ ├── epub2.py │ │ └── rst2all.css │ ├── parsers │ │ └── default_style.rst │ └── nodes.py │ ├── Version.py │ ├── parsers │ ├── broken.png │ ├── AuxParser.py │ ├── WrapperParser.py │ ├── txt2all.css │ ├── ImageParser.py │ ├── CSSParser.py │ └── boilerplate.py │ ├── writers │ ├── cover.jpg │ ├── RSTWriter.py │ ├── PicsDirWriter.py │ ├── PDFWriter.py │ ├── HtmlTemplates.py │ ├── KindleWriter.py │ ├── TxtWriter.py │ └── __init__.py │ ├── packagers │ ├── PDFPackager.py │ ├── RSTPackager.py │ ├── HTMLPackager.py │ ├── GzipPackager.py │ ├── TxtPackager.py │ ├── PushPackager.py │ └── __init__.py │ ├── WriterFactory.py │ ├── utils.py │ ├── Unitame.py │ ├── ParserFactory.py │ ├── UnitameData.py │ ├── CommonCode.py │ ├── HTMLChunker.py │ └── Spider.py ├── pyproject.toml ├── setup.cfg ├── .travis.yml ├── Pipfile ├── scripts ├── ebookmaker ├── rhyme_compiler └── convert_unitame ├── .gitignore ├── ebookmaker.conf ├── docs ├── alt-text.md ├── images.md └── ebookmaker_v0_11.md ├── MANIFEST ├── setup.py ├── USAGE.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/cache/epub/test: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/ebookmaker/__init__.py: -------------------------------------------------------------------------------- 1 | """ This is a package. """ 2 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/__init__.py: -------------------------------------------------------------------------------- 1 | """ This is a package """ 2 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/gutenberg/__init__.py: -------------------------------------------------------------------------------- 1 | """ This is a package. """ 2 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | """ This is a package. """ 2 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/gutenberg/writers/__init__.py: -------------------------------------------------------------------------------- 1 | """ This is a package. 
""" 2 | -------------------------------------------------------------------------------- /src/ebookmaker/Version.py: -------------------------------------------------------------------------------- 1 | VERSION = '0.13.8' 2 | GENERATOR = 'Ebookmaker %s by Project Gutenberg' 3 | -------------------------------------------------------------------------------- /src/ebookmaker/parsers/broken.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/src/ebookmaker/parsers/broken.png -------------------------------------------------------------------------------- /src/ebookmaker/writers/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/src/ebookmaker/writers/cover.jpg -------------------------------------------------------------------------------- /tests/files/43172/images/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/images/image.jpg -------------------------------------------------------------------------------- /tests/files/43172/43172-h/music/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/music/test.mp3 -------------------------------------------------------------------------------- /tests/files/43172/43172-h/images/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/images/image.jpg -------------------------------------------------------------------------------- /tests/files/43172/43172-h/images/mathex.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutenbergtools/ebookmaker/HEAD/tests/files/43172/43172-h/images/mathex.jpg -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # As per https://github.com/pypa/setuptools/blob/main/docs/userguide/quickstart.rst 2 | [build-system] 3 | requires = ["setuptools"] 4 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = ebookmaker 3 | 4 | version = 0.13.8 5 | 6 | [options] 7 | package_dir= 8 | =src 9 | packages=find: 10 | 11 | [options.packages.find] 12 | where=src -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - '3.6' 5 | 6 | before_install: 7 | - sudo apt-get update 8 | 9 | install: 10 | - 'pip install pipenv' 11 | - 'pipenv install' 12 | 13 | script: python setup.py test 14 | 15 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | pylint = "*" 8 | 9 | [packages] 10 | e1839a8 = {path = ".",editable = true} 11 | libgutenberg = ">=0.10.31" 12 | psycopg2 = "*" 13 | docutils = ">=0.18.1" 14 | html5lib = "*" 15 | cchardet = "==2.2.0a2" 16 | ebookmaker = {file = ".", editable = true} 17 | -------------------------------------------------------------------------------- /scripts/ebookmaker: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*- 3 | 4 | """ 5 | 6 | ebookmaker script 7 | 8 | Copyright 2014 by Marcello Perathoner 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 11 | 12 | This script starts epubmaker. 13 | 14 | """ 15 | 16 | from ebookmaker import EbookMaker 17 | 18 | EbookMaker.main () 19 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/PDFPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | PDFPackager.py 6 | 7 | Copyright 2010 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Package a PDF file for PG. 12 | 13 | """ 14 | 15 | from ebookmaker.packagers import OneFileZipPackager 16 | 17 | TYPE = 'ww' 18 | FORMATS = ''.split () 19 | 20 | class Packager (OneFileZipPackager): 21 | """ WW packager for PDF files. """ 22 | pass 23 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/RSTPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | RSTPackager.py 6 | 7 | Copyright 2010 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Package a RST file for PG. 12 | 13 | """ 14 | 15 | from ebookmaker.packagers import HTMLishPackager 16 | 17 | TYPE = 'ww' 18 | FORMATS = 'rst.gen'.split () 19 | 20 | class Packager (HTMLishPackager): 21 | """ Package a RST file with its images. 
""" 22 | pass 23 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/HTMLPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | HTMLPackager.py 6 | 7 | Copyright 2010 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Package a HTML file for PG. 12 | 13 | """ 14 | 15 | from ebookmaker.packagers import HTMLishPackager 16 | 17 | TYPE = 'ww' 18 | FORMATS = 'html.images'.split () 19 | 20 | class Packager (HTMLishPackager): 21 | """ Package a HTML file with its images. """ 22 | pass 23 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/GzipPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | GzipPackager.py 6 | 7 | Copyright 2010 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Gzip a file. 12 | 13 | """ 14 | 15 | from ebookmaker.packagers import OneFileGzipPackager 16 | 17 | TYPE = 'gzip' 18 | FORMATS = 'rst html.noimages html.images txt.us-ascii txt.iso-8859-1 txt.utf-8'.split () 19 | 20 | class Packager (OneFileGzipPackager): 21 | """ Gzip packager. """ 22 | pass 23 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/TxtPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | TxtPackager.py 6 | 7 | Copyright 2010 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 
10 | 11 | Package a Txt file for PG. 12 | 13 | """ 14 | 15 | from ebookmaker.packagers import OneFileZipPackager 16 | 17 | TYPE = 'ww' 18 | FORMATS = 'txt.us-ascii txt.iso-8859-1 txt.utf-8'.split () 19 | 20 | class Packager (OneFileZipPackager): 21 | """ WW packager for plain text files. """ 22 | pass 23 | -------------------------------------------------------------------------------- /src/ebookmaker/parsers/AuxParser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*- 3 | 4 | """ 5 | 6 | AuxParser.py 7 | 8 | Copyright 2009 by Marcello Perathoner 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 11 | 12 | Open an url and return raw data. 13 | 14 | """ 15 | 16 | 17 | from ebookmaker.parsers import ParserBase 18 | 19 | mediatypes = ('*/*', ) 20 | 21 | class Parser (ParserBase): 22 | """ Parse an auxiliary file. """ 23 | auxparser = True 24 | def __init__ (self, attribs = None): 25 | ParserBase.__init__ (self, attribs) 26 | self.data = None 27 | 28 | 29 | def pre_parse (self): 30 | """ Parse the file. """ 31 | self.data = self.bytes_content () 32 | 33 | 34 | def serialize (self): 35 | """ Serialize file to string. 
""" 36 | return self.data 37 | -------------------------------------------------------------------------------- /tests/test_rst.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import unittest 5 | import subprocess 6 | 7 | 8 | import ebookmaker 9 | 10 | class TestFromRst(unittest.TestCase): 11 | def setUp(self): 12 | self.sample_dir = os.path.join(os.path.dirname(__file__), 'files') 13 | 14 | def test_33968(self): 15 | book_id = '33968' 16 | dir = os.path.join(self.sample_dir, book_id) 17 | rstfile = os.path.join(dir, '%s-rst' % book_id, '%s-rst.rst' % book_id) 18 | cmd = 'ebookmaker --make=pdf --output-dir={dir} {rstfile}'.format( 19 | dir=dir, 20 | rstfile=rstfile, 21 | ) 22 | 23 | output = subprocess.check_output(cmd, shell=True) 24 | 25 | self.assertFalse(output) 26 | outs = [ 27 | "%s-cover.png", 28 | "%s-images-pdf.pdf", 29 | ] 30 | for out in outs: 31 | self.assertTrue(os.path.exists(os.path.join(dir, out % book_id))) 32 | os.remove(os.path.join(dir, out % book_id)) 33 | -------------------------------------------------------------------------------- /tests/test_txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import unittest 5 | import subprocess 6 | 7 | 8 | import ebookmaker 9 | 10 | class TestFromTxt(unittest.TestCase): 11 | def setUp(self): 12 | self.sample_dir = os.path.join(os.path.dirname(__file__), 'files') 13 | self.out_dir = os.path.join(os.path.dirname(__file__), 'out') 14 | 15 | def test_69030(self): 16 | book_id = '69030' 17 | dir = os.path.join(self.sample_dir, book_id) 18 | srcfile = os.path.join(dir, '%s-0.txt' % book_id) 19 | cmd = 'ebookmaker ' 20 | cmd += f'--ebook={book_id} --make=txt --make=html --output-dir={self.out_dir} ' 21 | cmd += f'--validate {srcfile}' 22 | 23 | output = subprocess.check_output(cmd, shell=True) 24 
| 25 | self.assertFalse(output) 26 | outs = [ 27 | "%s.txt", 28 | "%s-0.txt", 29 | "%s-8.txt", 30 | "%s-h.html", 31 | "%s-cover.png", 32 | ] 33 | for out in outs: 34 | self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id))) 35 | os.remove(os.path.join(self.out_dir, out % book_id)) 36 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/writers/rst2epub.css: -------------------------------------------------------------------------------- 1 | /* 2 | Project Gutenberg EPUB docutils stylesheet. 3 | 4 | This stylesheet contains styles specific to EPUB. 5 | */ 6 | 7 | /* FONTS */ 8 | 9 | /* mostly unsupported */ 10 | .small-caps { font-style: italic } 11 | .gesperrt { font-style: italic } 12 | 13 | /* ALIGN */ 14 | 15 | /* SECTIONS */ 16 | 17 | /* reduce screen real estate waste */ 18 | body { margin: 1% } 19 | 20 | /* ugly hack to give more specifity. because ADE chucks out the whole 21 | stylesheet when it sees an !important */ 22 | 23 | .first.first { margin-top: 0; text-indent: 0 } 24 | .last.last { margin-bottom: 0 } 25 | 26 | .no-page-break.no-page-break 27 | { page-break-before: avoid } 28 | 29 | /* PAGINATION */ 30 | 31 | div.clearpage { page-break-before: always; padding-top: 10% } 32 | div.cleardoublepage { page-break-before: right; padding-top: 10% } 33 | 34 | .vfill { margin-top: 10% } 35 | h2.title { margin-top: 10% } 36 | 37 | /* DIV */ 38 | 39 | a { text-decoration: none } 40 | .toc-pageref { display: none } 41 | 42 | /* DROPCAPS */ 43 | 44 | span.dropcap { line-height: 0 } 45 | img.dropcap { vertical-align: bottom } 46 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/gutenberg/parsers/pg-header.rst: -------------------------------------------------------------------------------- 1 | .. -*- encoding: utf-8 -*- 2 | 3 | .. 
|pg.copyrighted-header| replace:: 4 | 5 | This is a *copyrighted* Project Gutenberg eBook, details 6 | below. 7 | 8 | .. _pg-header: 9 | 10 | .. container:: noindent pgheader language-en pg_boilerplate 11 | 12 | This ebook is for the use of anyone anywhere in the United States 13 | and most other parts of the world at no cost and with almost no 14 | restrictions whatsoever. You may copy it, give it away or re-use it 15 | under the terms of the `Project Gutenberg License`_ included with 16 | this ebook or online at https://www.gutenberg.org/license. If you 17 | are not located in the United States, you'll have to check the laws 18 | of the country where you are located before using this ebook. 19 | 20 | |pg.copyrighted-header| 21 | 22 | .. vspace:: 2 23 | 24 | .. _pg-machine-header: 25 | 26 | .. container:: noindent white-space-pre-line 27 | 28 | |pg.machine-header| 29 | 30 | .. vspace:: 2 31 | 32 | .. _pg-start-line: 33 | 34 | \*\*\* START OF THIS PROJECT GUTENBERG EBOOK |pg.upcase-title| \*\*\* 35 | 36 | .. vspace:: 4 37 | 38 | .. _pg-produced-by: 39 | 40 | |pg.produced-by| 41 | 42 | .. vspace:: 1 43 | 44 | |pg.credits| 45 | -------------------------------------------------------------------------------- /src/ebookmaker/writers/RSTWriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*- 3 | 4 | """ 5 | RSTWriter.py 6 | 7 | Copyright 2009 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Build an RST file. This is just the master RST with the PG license mixed in. 
12 | 13 | """ 14 | 15 | 16 | import os 17 | 18 | from libgutenberg.Logger import debug, info, error 19 | from libgutenberg.GutenbergGlobals import SkipOutputFormat 20 | from ebookmaker import ParserFactory 21 | from ebookmaker import writers 22 | 23 | class Writer (writers.BaseWriter): 24 | """ Class to write a reStructuredText. """ 25 | 26 | def build (self, job): 27 | """ Build RST file. """ 28 | 29 | filename = os.path.join (os.path.abspath(job.outputdir), job.outputfile) 30 | 31 | debug ("Creating RST file: %s" % filename) 32 | 33 | parser = ParserFactory.ParserFactory.create (job.url) 34 | 35 | if not hasattr (parser, 'rst2nroff'): 36 | debug ('RSTWriter can only work on a RSTParser.') 37 | raise SkipOutputFormat 38 | 39 | data = parser.preprocess ('utf-8').encode ('utf-8') 40 | 41 | self.write_with_crlf (filename, data) 42 | 43 | debug ("Done RST file: %s" % filename) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | Pipfile.lock 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | -------------------------------------------------------------------------------- /ebookmaker.conf: -------------------------------------------------------------------------------- 1 | # copy this file to /etc/ebookmaker.conf to set config paths 2 | # 3 | # copy this file to ~/.ebookmaker to set defaults for command line arguments 4 | # or to override config paths in /etc/ebookmaker.conf 5 | 6 | [DEFAULT_ARGS] 7 | #### this section is inactive in /etc/ebookmaker.conf #### 8 | # types: all [list of output types] 9 | # max_depth: 1 10 | # strip_links: False 11 | # include_urls: [list of urls] 12 | # exclude_urls: [list] 13 | # include_mediatypes: [list of mediatypes] 14 | # exclude_mediatypes: [list of mediatypes] 15 | # mediatype_from_extension: False 16 | # rewrite: [url]>[rewritten url] 17 | # title: None 18 | # author: None 19 | # ebook: 0 20 | # outputdir: ./ 21 | # outputfile: [title].epub 22 | # section_tags: [list of classes] 23 | # packager: None ['ww', 'gzip'] 24 | # cover: None [path] 25 | # generate_cover: False 26 | # epub_validator: java -jar 
epubcheck-4.2.6/epubcheck.jar 27 | # html_validator: vnu-runtime-image/bin/vnu 28 | # production: False 29 | 30 | [PATHS] 31 | # proxies: None 32 | # xelatex: xelatex 33 | # mobigen: ebook-convert # can also be a path to kindlegen 34 | # mobilang: ebook-convert # converter to use for languages not supported by Kindlegen 35 | # mobikf8: ebook-convert # converter for kf8 36 | # groff: groff 37 | # rhyming_dict: None 38 | 39 | # default is '~' 40 | # FILESDIR = file:///Users/Shared/Documents/pg/dev/html/files 41 | 42 | # default is "~/cache/epub/" 43 | # CACHEDIR = /Users/Shared/Documents/gitenberg/cache1/epub -------------------------------------------------------------------------------- /tests/test_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import unittest 5 | import subprocess 6 | 7 | 8 | import ebookmaker 9 | 10 | class TestFromHtml(unittest.TestCase): 11 | def setUp(self): 12 | self.sample_dir = os.path.join(os.path.dirname(__file__), 'files') 13 | self.out_dir = os.path.join(os.path.dirname(__file__), 'out') 14 | 15 | def test_43172(self): 16 | book_id = '43172' 17 | dir = os.path.join(self.sample_dir, book_id) 18 | htmfile = os.path.join(dir, '%s-h' % book_id, '%s-h.html' % book_id) 19 | cmd = 'ebookmaker --make=test --output-dir={dir} --generate_cover {htmfile}'.format( 20 | dir=self.out_dir, 21 | htmfile=htmfile, 22 | ) 23 | 24 | output = subprocess.check_output(cmd, shell=True) 25 | 26 | self.assertFalse(output) 27 | outs = [ 28 | "%s-epub.epub", 29 | "%s-images-epub3.epub", 30 | "%s-images-epub.epub", 31 | "%s-h.html", 32 | ] 33 | for out in outs: 34 | self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id))) 35 | os.remove(os.path.join(self.out_dir, out % book_id)) 36 | os.remove(os.path.join(self.out_dir, 'images/image.jpg')) 37 | os.remove(os.path.join(self.out_dir, 'images/mathex.jpg')) 38 | 
os.remove(os.path.join(self.out_dir, 'music/test.mp3')) 39 | os.rmdir(os.path.join(self.out_dir, 'images')) 40 | os.rmdir(os.path.join(self.out_dir, 'music')) 41 | -------------------------------------------------------------------------------- /tests/test_templates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | run this with 5 | python -m unittest -v tests.test_templates 6 | ''' 7 | 8 | import os 9 | import unittest 10 | 11 | from libgutenberg.DublinCore import GutenbergDublinCore 12 | 13 | from ebookmaker.writers import HtmlTemplates, TemplateStrings 14 | 15 | 16 | class TestHeaders(unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.dc = GutenbergDublinCore() 20 | book_id = '69030' 21 | self.sample_dir = os.path.join(os.path.dirname(__file__), 'files') 22 | dir = os.path.join(self.sample_dir, book_id) 23 | srcfile = os.path.join(dir, '%s-0.txt' % book_id) 24 | with open(srcfile, 'r') as f: 25 | sampledata = f.read() 26 | self.dc.load_from_pgheader(sampledata) 27 | 28 | def test_templates(self): 29 | self.assertTrue('in the United States' in TemplateStrings.headera) 30 | self.assertTrue('FULL PROJECT GUTENBERG LICENSE' in TemplateStrings.headerb) 31 | self.assertTrue('COPYRIGHTED' in TemplateStrings.headera_copy) 32 | self.assertTrue('This particular' in TemplateStrings.headerb_copy) 33 | self.assertTrue('
' not in TemplateStrings.headera_txt) 34 | self.assertTrue('
' not in TemplateStrings.headerb_txt) 35 | self.assertTrue('Gutenberg License' in TemplateStrings.headera_copy_txt) 36 | self.assertTrue('where you are located' in TemplateStrings.headerb_copy_txt) 37 | 38 | def test_headdata(self): 39 | self.assertTrue('The girl in the crowd' in HtmlTemplates.pgheader(self.dc).text_content()) 40 | -------------------------------------------------------------------------------- /scripts/rhyme_compiler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*- 3 | 4 | """ 5 | 6 | ryhme_compiler.py 7 | 8 | Copyright 2009 by Marcello Perathoner 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 11 | 12 | This module produces a dbm file of rhyme stems. 13 | 14 | We use a very naive concept of rhyme: we preprocess the 'CMU 15 | Pronouncing Dictionary' (found at 16 | http://www.speech.cs.cmu.edu/cgi-bin/cmudict) and extract the phonemes 17 | for each word from the last stressed one to the end of the word. 18 | 19 | The result is stored in cmudict.db hashed by word. 
20 | 21 | To compile: 22 | 23 | $ ./rhyme_compiler.py cmudict.0.7a 24 | 25 | 26 | """ 27 | 28 | import fileinput 29 | import re 30 | from six.moves import dbm_gnu as gdbm 31 | 32 | dbm = gdbm.open ('cmudict.db', 'nf') 33 | 34 | RE_STRESSED = re.compile ('[a-z]+[12][^12]*$') 35 | 36 | # two example lines from cmudict 37 | # 38 | # PRONUNCIATION P R OW0 N AH2 N S IY0 EY1 SH AH0 N 39 | # PRONUNCIATION(1) P R AH0 N AH2 N S IY0 EY1 SH AH0 N 40 | 41 | for line in fileinput.input (openhook = fileinput.hook_encoded ("iso-8859-1")): 42 | if line.startswith (';'): 43 | continue 44 | 45 | word, dummy_sep, phonemes = line.lower ().partition (' ') 46 | 47 | m = RE_STRESSED.search (phonemes) 48 | if m: 49 | phoneme = re.sub (r'[ 012]+', '-', m.group (0)) # remove stress marks 50 | dbm[word.encode ('utf-8')] = phoneme.encode ('utf-8') 51 | 52 | # print "%s %s\n" % (word, dbm[word]) 53 | 54 | dbm.sync () 55 | dbm.reorganize () 56 | dbm.close () 57 | 58 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/parsers/default_style.rst: -------------------------------------------------------------------------------- 1 | .. this is the default PG-RST stylesheet 2 | 3 | .. style:: emphasis 4 | :class: italics 5 | 6 | .. style:: strong 7 | :class: bold 8 | 9 | .. style:: title_reference 10 | :class: italics 11 | 12 | .. style:: option_argument 13 | :class: italics 14 | 15 | .. style:: literal 16 | :class: monospaced 17 | 18 | .. style:: subscript 19 | :class: subscript 20 | 21 | .. style:: superscript 22 | :class: superscript 23 | 24 | .. style:: title.document-title 25 | :class: x-large center 26 | :titlehack: 27 | 28 | .. style:: title.topic-title 29 | :class: centerleft 30 | 31 | .. style:: title.table-title 32 | :class: centerleft larger 33 | 34 | .. figure and image styles for non-image formats 35 | 36 | .. style:: figure 37 | :class: margin 38 | 39 | .. 
style:: figure 40 | :formats: txt.* *.noimages 41 | :align: center 42 | :width: 80% 43 | 44 | .. style:: image 45 | :formats: *.noimages 46 | 47 | .. container:: center image margin 48 | 49 | [image] 50 | 51 | 52 | .. style:: image 53 | :formats: txt.* 54 | :display: none 55 | 56 | .. style:: caption.figure-caption 57 | :formats: -txt.* 58 | :class: centerleft italics margin 59 | 60 | .. style:: caption.figure-caption 61 | :formats: txt.* 62 | :class: margin 63 | :before: '[Illustration: ' 64 | :after: ']' 65 | 66 | .. style:: legend 67 | :class: margin 68 | 69 | 70 | .. default transition 71 | 72 | .. style:: transition 73 | 74 | .. container:: center transition margin 75 | 76 | ―――― 77 | 78 | .. default attribution 79 | 80 | .. style:: attribution 81 | :class: margin 82 | :before: '―― ' 83 | 84 | -------------------------------------------------------------------------------- /docs/alt-text.md: -------------------------------------------------------------------------------- 1 | Ebookmaker encourages proper use of the alt attribute to make books with images more accessible to the reading disabled. Ebookmaker ensures that every `img` element has an `alt` attribute and issues warnings if the alt attribute is empty. 2 | 3 | Often the `alt` attribute should be left empty: 4 | 5 | 1. when the image is purely decorative or used to help with the visual presentation of text. It would be disruptive to a person using text-to-speech or a braille reader to have the image described. In such a case, add a `role` attribute with value `presentation`: `<img src="image.jpg" alt="" role="presentation">` and the warning message will be suppressed. Because of a bug in the W3C HTML validator, you can also use `data-role="presentation"` so that the validator won't complain - ebookmaker will use this to produce valid html5 and epub files. 6 | 7 | 2. when the image is well described by associated text. Often an image from a book will appear above a descriptive caption. 
For this reason, Ebookmaker will not emit a warning message if it appears inside a `<figure>` element containing a `<figcaption>`, or if the img has an `aria-labelledby` attribute: `<img src="image.jpg" alt="" aria-labelledby="caption-id">`. But when relying on a caption text, make sure it is describing what a sighted reader sees. Some captions comment on the image without describing it. 8 | 9 | 10 | Accessibility Tutorial: 11 | https://www.w3.org/WAI/tutorials/images/ 12 | 13 | Using `aria-labelledby`: 14 | https://www.w3.org/WAI/WCAG21/Techniques/aria/ARIA16 15 | 16 | Other helpful guides: 17 | https://publishers.asn.au/BooksWithoutBarriers 18 | https://axesslab.com/alt-texts/ 19 | https://accessibility.huit.harvard.edu/describe-content-images 20 | 21 | w3c validator bug: https://github.com/validator/validator/issues/1599 -------------------------------------------------------------------------------- /tests/test_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | run this with 5 | python -m unittest -v tests.test_job 6 | ''' 7 | import datetime 8 | import os 9 | import subprocess 10 | import sys 11 | import unittest 12 | 13 | from libgutenberg import Logger 14 | from libgutenberg.Logger import debug 15 | from libgutenberg.DublinCore import PGDCObject 16 | 17 | import ebookmaker 18 | from ebookmaker import CommonCode, ParserFactory 19 | from ebookmaker.CommonCode import Options 20 | from ebookmaker.EbookMaker import config, get_dc 21 | from ebookmaker.parsers import webify_url 22 | 23 | options = Options() 24 | Logger.set_log_level(10) # DEBUG 25 | 26 | class TestJob(unittest.TestCase): 27 | 28 | def setUp(self): 29 | config() 30 | ParserFactory.load_parsers() 31 | self.sample_dir = os.path.join(os.path.dirname(__file__), 'files') 32 | self.out_dir = os.path.join(os.path.dirname(__file__), 'out') 33 | self.testfile = os.path.join(self.sample_dir, '43172/43172-h/43172-h.htm') 34 | subprocess.run(["touch", self.testfile]) 35 | self.testdbfile = "file://" + self.testfile 36 | options.config.CACHEDIR = os.path.join(os.path.dirname(__file__), 
class PGHeaderFooter (Directive):
    """ Inserts PG header or footer.

    Common base for the header and footer directives.  The directive
    takes no arguments.  `run` reads `self.resource`, which this class
    does not define itself; a subclass has to provide it as the name of
    the text resource to insert.

    """

    required_arguments = 0
    optional_arguments = 0

    def run (self):
        """ Insert the resource text into the input being parsed.

        Returns an empty list: the directive creates no nodes itself,
        it only feeds extra lines to the state machine.

        """
        settings = self.state.document.settings
        # get_resource is looked up on the settings object; presumably it
        # returns the text of the packaged file -- confirm where the
        # settings are built.
        include_lines = statemachine.string2lines (
            settings.get_resource ('mydocutils.gutenberg.parsers', self.resource),
            settings.tab_width,
            convert_whitespace = 1)
        self.state_machine.insert_input (include_lines, '')
        return []
def unload_writers ():
    """ Unload writer modules.

    Empties the module-level `writers` registry in place, so every
    holder of a reference to that dict sees the registry as empty.

    """
    # clear () instead of `del writers[k]` inside a loop over
    # writers.keys (): on Python 3 deleting while iterating the key view
    # raises RuntimeError as soon as the registry holds one entry.
    writers.clear ()
class node_selector (object):
    """ Condition callable for nodes.traverse () built from a CSS-like selector.

    Accepted syntax: [element][.class[.class[...]]][, selector[, selector]]

    """

    def __init__ (self, selector):

        # one (node class, set of required classes) pair per alternative
        self.matches = []

        for alternative in selector.split (','):
            # text before the first dot names the element, the rest is a
            # dot-separated list of classes; both parts may be empty
            element, dummy_dot, class_names = alternative.strip ().partition ('.')
            if class_names:
                required = set (class_names.split ('.'))
            else:
                required = set ()
            # unknown or empty element names match any Element
            node_class = getattr (nodes, element, nodes.Element)
            self.matches.append ( (node_class, required) )


    def __call__ (self, node):
        """ returns True if the node matches one of the alternatives. """

        return any (
            isinstance (node, node_class) and required.issubset (node['classes'])
            for node_class, required in self.matches)
class Writer(writers.BaseWriter):
    """ Writes Pics directory. """

    def copy_aux_files(self, job, dest_dir):
        """ Copy image files to dest_dir. Use image data cached in parsers.

        Walks `job.spider.parsers` and writes the serialized content of
        every parser that has a `resize_image` or an `auxparser`
        attribute below `dest_dir`, at the parser's url made relative to
        `job.base_url`.  A failure to write one file is logged and does
        not stop the remaining copies.

        """

        for p in job.spider.parsers:
            # only parsers carrying one of these attributes are copied
            # (presumably the image and auxiliary-file parsers -- confirm)
            if hasattr(p, 'resize_image') or hasattr(p, 'auxparser'):
                src_uri = p.attribs.url
                # source url already points below the destination directory
                if src_uri.startswith(webify_url(dest_dir)):
                    debug('Not copying %s to %s: already there' % (src_uri, dest_dir))
                    continue

                # keep the position relative to the job's base url
                fn_dest = gg.make_url_relative(webify_url(job.base_url), src_uri)
                fn_dest = os.path.join(dest_dir, fn_dest)

                # debug('base_url = %s, src_uri = %s' % (job.base_url, src_uri))

                # never overwrite a file with itself
                if gg.is_same_path(src_uri, fn_dest):
                    debug('Not copying %s to %s: same file' % (src_uri, fn_dest))
                    continue

                fn_dest = gg.normalize_path(fn_dest)
                debug('Copying %s to %s' % (src_uri, fn_dest))
                gg.mkdir_for_filename(fn_dest)
                try:
                    # the data comes from the parser, not from re-reading src_uri
                    with open(fn_dest, 'wb') as fp_dest:
                        fp_dest.write(p.serialize())
                except IOError as what:
                    error('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))



    def build(self, job):
        """ Build Pics file.

        Copies the auxiliary files of `job` into the absolute form of
        `job.outputdir`.

        """

        dest_dir = os.path.abspath(job.outputdir)

        debug("Creating Pics directory in: %s" % dest_dir)

        self.copy_aux_files(job, dest_dir)

        debug("Done Pics directory in: %s" % dest_dir)
import codecs
import sys
import unicodedata as ud

# from addhd

# ASCII replacements for the code points 0x80 .. 0xff, in code point order
i2a = (
    "Euro","",",","f","\"","...","","","^","%","S","<","OE","","Z","",
    "","'","'","\"","\"","","-","--","~","(TM)","s",">","oe","","z","Y",
    " ","i","c","L","","Y","|","Sec.","\"","(C)","","\"","","-","(R)","-",
    " deg.","+-"," squared"," cubed","'"," mu","",".","","","","\"","1/4","1/2","3/4","?",
    "A","A","A","A","Ae","A","AE","C","E","E","E","E","I","I","I","I",
    "Eth","N","O","O","O","O","Oe","x","O","U","U","U","Ue","Y","","ss",
    "a","a","a","a","ae","a","ae","c","e","e","e","e","i","i","i","i",
    "eth","n","o","o","o","o","oe","/","o","u","u","u","ue","y","","y"
)

HEADER = '''#!/usr/bin/env python
# -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*-

""" Converted from unitame.dat """

from __future__ import unicode_literals

unicode_to_iso_8859_1 = {'''


def strip_accents (s):
    """ Strip accents from string.

    Decomposes `s` (NFKD), drops all nonspacing marks (category Mn) and
    recomposes the rest (NFKC).

    """
    # join the kept characters: on Python 3 filter () returns an
    # iterator, which unicodedata.normalize () does not accept
    return ud.normalize ('NFKC',
                         ''.join (c for c in ud.normalize ('NFKD', s)
                                  if ud.category (c) != 'Mn'))


def emit (text):
    """ Write one line of the generated module to stdout, UTF-8 encoded. """
    # bytes go straight to the buffer so the output encoding does not
    # depend on the locale
    sys.stdout.buffer.write ((text + '\n').encode ('utf-8'))


def format_entry (c, sub):
    """ Return the dict entry line mapping character `c` to `sub`. """
    comment = ud.name (c)
    if sub == "'":
        # a bare quote would end the generated string literal early
        sub = r"\'"
    return " '%s': '%s', # %s" % (c, sub, comment)


def main ():
    """ Read unitame.dat from the current directory, print UnitameData source. """

    # the 'U' flag of the old 'rU' mode is gone on current Python 3
    with codecs.open ('unitame.dat', 'r', 'iso-8859-1') as fp:
        emit (HEADER)

        for line in fp.readlines ():
            line = line.strip ()
            c, dummy, sub = line.split (';', 2)
            c = chr (int (c, 16))
            # skip entries that plain accent stripping already produces
            if sub and c != sub and strip_accents (c) != sub:
                emit (format_entry (c, sub))

    emit ("}\n\n")

    emit ("iso_8859_1_to_ascii = {")

    for n, sub in enumerate (i2a):
        n = n + 0x80
        if n > 0xa0:
            c = chr (n)
            if sub and strip_accents (c) != sub:
                emit (format_entry (c, sub))

    emit ("}\n\n")


if __name__ == '__main__':
    main ()
class TestLoad(unittest.TestCase):
    """ Smoke tests: the parser, writer and packager factories load. """

    def setUp(self):
        # configure, then point cache and files directories at the test tree
        config()
        Logger.set_log_level(options.verbose)
        options.types = options.types or ['all']
        options.types = CommonCode.add_dependencies(options.types, DEPENDENCIES, BUILD_ORDER)
        options.config.CACHEDIR = os.path.join(os.path.dirname(__file__), 'cache/epub')
        options.config.FILESDIR = webify_url(os.path.join(os.path.dirname(__file__), 'files/'))
        debug("Building types: %s" % ' '.join(options.types))

    def test_parsers(self):
        """ The parser for BROKEN loads image data and can resize to jpeg. """
        ParserFactory.load_parsers()
        pf = ParserFactory.ParserFactory()

        # check parser created from resource
        broken_parser = pf.create(BROKEN)
        self.assertTrue(hasattr(broken_parser, 'resize_image'))
        broken_parser.pre_parse()
        self.assertTrue(len(broken_parser.image_data) > 0)
        self.assertTrue(broken_parser.get_image_dimen()[0] > 0)

        # check conversion to jpeg
        # no assertion follows: this only checks that the call does not raise
        broken_parser.resize_image(16 * 1024, (66, 100), output_format='jpeg')

    def test_writers(self):
        """ Loading the writers must not raise. """
        WriterFactory.load_writers()

    def test_packagers(self):
        """ Loading the packagers must not raise. """
        PackagerFactory.load_packagers()

    def test_dirs(self):
        """ path_from_file keeps cache paths and maps other paths below files/. """
        print(path_from_file('cache/epub/1234/test'))
        self.assertTrue(path_from_file('cache/epub/1234/test').endswith(
            'cache/epub/1234/test'))
        print(path_from_file('1/2/3/1234/test'))
        self.assertTrue(path_from_file('1/2/3/1234/test').endswith('files/1234/test'))
52 | .lineno:after { color: gray; content: '[' attr(title) ']' } 53 | .toc-pageref { float: right } 54 | 55 | @media screen { 56 | .coverpage, .frontispiece, .titlepage, .verso, .dedication, .plainpage 57 | { margin: 10% 0; } 58 | 59 | div.clearpage, div.cleardoublepage 60 | { margin: 10% 0; border: none; border-top: 1px solid gray; } 61 | 62 | .vfill { margin: 5% 10% } 63 | } 64 | 65 | @media print { 66 | div.clearpage { page-break-before: always; padding-top: 10% } 67 | div.cleardoublepage { page-break-before: right; padding-top: 10% } 68 | 69 | .vfill { margin-top: 20% } 70 | h2.title { margin-top: 20% } 71 | } 72 | 73 | /* DIV */ 74 | pre { font-family: monospace; font-size: 0.9em; white-space: pre-wrap } 75 | -------------------------------------------------------------------------------- /src/ebookmaker/packagers/PushPackager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | PushPackager.py 6 | 7 | Copyright 2011 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Package a zip containing everything, that can be pushed to ibiblio. 12 | 13 | """ 14 | 15 | 16 | import os.path 17 | import re 18 | 19 | from libgutenberg.Logger import info, error 20 | import libgutenberg.GutenbergGlobals as gg 21 | 22 | from ebookmaker.CommonCode import Options 23 | from ebookmaker.packagers import ZipPackager 24 | 25 | options = Options() 26 | TYPE = 'ww' 27 | FORMATS = ['push'] 28 | 29 | class Packager (ZipPackager): 30 | """ Package one big zip for push. 31 | 32 | Zip contains one directory named after ebook_no. 
class TestFromHtm(unittest.TestCase):
    """ Run the ``ebookmaker`` command on sample book 43172 and check the output.

    Both tests expect the command to print nothing on stdout, check that
    the listed output files exist and remove them again afterwards.

    """

    def setUp(self):
        self.sample_dir = os.path.join(os.path.dirname(__file__), 'files')
        self.out_dir = os.path.join(os.path.dirname(__file__), 'out')

    def test_43172(self):
        """ Build from 43172-h.htm with --validate. """
        book_id = '43172'
        book_dir = os.path.join(self.sample_dir, book_id)
        htmfile = os.path.join(book_dir, '%s-h' % book_id, '%s-h.htm' % book_id)
        # argument list without a shell: paths containing spaces or shell
        # metacharacters reach ebookmaker unchanged
        cmd = [
            'ebookmaker', '-v', '--ebook=43172', '--make=test',
            '--output-dir=%s' % self.out_dir,
            '--validate', htmfile,
        ]

        output = subprocess.check_output(cmd)

        self.assertFalse(output)
        outs = [
            "%s-epub.epub",
            "%s-images-epub3.epub",
            "%s-images-epub.epub",
            "%s-h.html",
        ]
        for out in outs:
            self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id)))
            os.remove(os.path.join(self.out_dir, out % book_id))
        # remove the copied auxiliary files, then their (now empty) directories
        os.remove(os.path.join(self.out_dir, 'images/image.jpg'))
        os.remove(os.path.join(self.out_dir, 'images/mathex.jpg'))
        os.remove(os.path.join(self.out_dir, 'music/test.mp3'))
        os.rmdir(os.path.join(self.out_dir, 'images'))
        os.rmdir(os.path.join(self.out_dir, 'music'))

    def test_43172_nocover(self):
        """ Build from 43172-nocover.htm with --generate_cover. """
        book_id = '43172'
        book_dir = os.path.join(self.sample_dir, book_id)
        htmfile = os.path.join(book_dir, '%s-h' % book_id, '%s-nocover.htm' % book_id)
        cmd = [
            'ebookmaker', '--make=test',
            '--output-dir=%s' % self.out_dir,
            '--generate_cover', htmfile,
        ]

        output = subprocess.check_output(cmd)

        self.assertFalse(output)
        outs = [
            "%s-epub.epub",
            "%s-images-epub3.epub",
            "%s-images-epub.epub",
            "%s-h.html",
            "%s-cover.png",
        ]
        for out in outs:
            self.assertTrue(os.path.exists(os.path.join(self.out_dir, out % book_id)))
            os.remove(os.path.join(self.out_dir, out % book_id))
        os.remove(os.path.join(self.out_dir, 'images/image.jpg'))
        os.rmdir(os.path.join(self.out_dir, 'images'))
UTF8 -*- 3 | 4 | """ 5 | 6 | WrapperParser.py 7 | 8 | Copyright 2020 by Eric Hellman 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 11 | 12 | """ 13 | from xml.sax.saxutils import escape, quoteattr 14 | 15 | import lxml 16 | 17 | from copy import copy 18 | from libgutenberg.Logger import info 19 | from libgutenberg import GutenbergGlobals as gg 20 | from ebookmaker.parsers import HTMLParserBase, IMAGE_WRAPPER 21 | 22 | mediatypes = () 23 | 24 | class Parser(HTMLParserBase): 25 | 26 | def __init__(self, attribs): 27 | HTMLParserBase.__init__(self, copy(attribs)) 28 | self.attribs.orig_mediatype = self.attribs.mediatype 29 | self.src = attribs.url 30 | self.attribs.url = self.wrapper_url(attribs.url) 31 | self.attribs.orig_url = self.attribs.url 32 | self.attribs.nonlinear = True 33 | if not self.attribs.title: 34 | self.attribs.title = 'linked image' 35 | self.xhtml = lxml.etree.fromstring( 36 | self.unicode_content(), 37 | lxml.html.XHTMLParser(), 38 | base_url=self.attribs.url 39 | ) 40 | self.fp = True # so writers won't skip it 41 | 42 | # mark the image for treatment as a linked image 43 | attribs.rel.add('linked_image') 44 | # set the referrer for the image to this wrapper 45 | attribs.referrer = self.attribs.url 46 | 47 | 48 | def unicode_content(self): 49 | """ wrapper page content """ 50 | frag = ('#%s' % self.attribs.id) if self.attribs.id else '' 51 | backlink = '
back' % ( 52 | escape(self.attribs.referrer), frag) 53 | return IMAGE_WRAPPER.format( 54 | src=escape(self.src), 55 | title=quoteattr(self.attribs.title), 56 | backlink=backlink, 57 | wrapper_class='x-ebookmaker-wrapper', 58 | doctype=gg.XHTML_DOCTYPE, 59 | style='') 60 | 61 | 62 | def wrapper_url(self, img_url): 63 | """ make the wrapper url. """ 64 | if self.attribs.id: 65 | return '%s.%s.wrap.html' % (img_url, self.attribs.id) 66 | return img_url + '.wrap.html' 67 | 68 | 69 | def make_toc(self, xhtml): 70 | return [] 71 | 72 | 73 | def iterlinks(self): 74 | """ only return the image """ 75 | for iterlink in super(Parser, self).iterlinks(): 76 | if iterlink[1].tag == gg.NS.xhtml.img: 77 | yield iterlink -------------------------------------------------------------------------------- /src/ebookmaker/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: UTF8 -*- 3 | """ 4 | 5 | utils.py 6 | 7 | tools for manipulating xhtml 8 | Copyright 2009 by Project Gutenberg 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 
def check_lang(elem, lang_att):
    """ Validate and normalise the language attribute `lang_att` of `elem`.

    `elem` is an element with a dict-like `attrib`; `lang_att` is the
    name of the attribute holding the language code and must be present.

    - A code known to `gg.language_map` is stored in the plain `lang`
      attribute, a `xml:lang` attribute is removed, and True is returned.
    - Otherwise the value is looked up through `gg.language_map.inverse`;
      a hit replaces the value the same way.
    - A value that matches neither is removed from `lang_att`, kept in
      `data-invalid-lang`, and a warning is logged.

    Returns True only in the first case, None otherwise.

    """
    three2two = {'ita': 'it', 'lat': 'la', 'heb': 'he', 'fra': 'fr', 'spa': 'es', 'deu': 'de',
                 'gla': 'gd', 'oji': 'oj', 'nav': 'nv',}
    raw_lang = elem.attrib[lang_att]
    # The table maps three-letter codes to two-letter ones, so it has to be
    # applied to the attribute value.  It used to be applied to the attribute
    # name, which left codes like 'fra' untranslated.
    lang = three2two.get(raw_lang, raw_lang)
    lang_name = gg.language_map.get(lang, default=None)
    if lang_name:
        if NS.xml.lang in elem.attrib:
            del elem.attrib[NS.xml.lang]
        elem.attrib['lang'] = lang
        return True
    clean_lang = gg.language_map.inverse(lang, default=None)
    if not clean_lang:
        # report and keep what the document actually contained
        warning("invalid lang attribute %s", raw_lang)
        del elem.attrib[lang_att]
        elem.attrib['data-invalid-lang'] = raw_lang
    elif lang != clean_lang:
        elem.attrib['lang'] = clean_lang
        if NS.xml.lang in elem.attrib:
            del elem.attrib[NS.xml.lang]
62 | return a set of replaced elements 63 | ''' 64 | deprecated_used = set() 65 | for tag in deprecated: 66 | for elem in xpath(xhtml, "//xhtml:" + tag): 67 | if deprecated[tag]: 68 | add_class(elem, 'xhtml_' + tag) 69 | elem.tag = getattr(NS.xhtml, deprecated[tag]) 70 | else: 71 | elem.getparent().remove(elem) 72 | deprecated_used.add(tag) 73 | return deprecated_used 74 | -------------------------------------------------------------------------------- /src/ebookmaker/mydocutils/writers/epub2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | 6 | epub2.py 7 | 8 | Copyright 2012 by Marcello Perathoner 9 | 10 | Distributable under the GNU General Public License Version 3 or newer. 11 | 12 | A writer that writes XHTML 1 files suited for conversion into EPUB2. 13 | 14 | """ 15 | 16 | import re 17 | 18 | # from libgutenberg.Logger import info, debug, warning, error 19 | 20 | from ebookmaker.mydocutils.writers.xhtml1 import Writer as WriterBase 21 | from ebookmaker.mydocutils.writers.xhtml1 import Translator as TranslatorBase 22 | 23 | 24 | class Writer (WriterBase): 25 | """ EPUB2 writer. """ 26 | 27 | def __init__ (self): 28 | WriterBase.__init__ (self) 29 | self.translator_class = Translator 30 | 31 | 32 | class Translator (TranslatorBase): 33 | """ HTML Translator with EPUB2 tweaks. """ 34 | 35 | def init_css (self): 36 | for css_file in ('rst2all.css', 'rst2epub.css'): 37 | self.head.append ('\n' % 38 | self.encode (self.read_css (css_file))) 39 | 40 | 41 | def calc_centering_style (self, node): 42 | """ 43 | Rationale: The EPUB standard allows user agents to replace 44 | `margin: auto` with `margin: 0`. Thus we cannot use `margin: auto` 45 | to center images, we have to calculate the left margin value. 
46 | 47 | Also we must use 'width' on the html element, not css style, 48 | or Adobe ADE will not scale the image properly (ie. only 49 | horizontally). 50 | 51 | :align: is supposed to work on blocks. It floats or centers 52 | a block. 53 | 54 | :align: center has not the same semantics as :class: center. 55 | Former centers the block, eg. the whole table, latter centers 56 | the text, eg, the text in every table cell. 57 | 58 | `:align: center` 59 | Used on image: centers image 60 | Used on figure: centers image and caption 61 | Used on table: centers table and caption 62 | 63 | """ 64 | 65 | width = node.get ('width') 66 | if width is None: 67 | return [] 68 | 69 | style = ['width: %s' % width] 70 | 71 | m = re.match (r'(\d+)\s*%', width) 72 | if m: 73 | width = max (min (int (m.group (1)), 100), 0) 74 | margin = 100 - width 75 | 76 | align = node.get ('align', 'center') 77 | if align == 'center': 78 | style.append ('margin-left: %d%%' % (margin / 2)) 79 | if align == 'right': 80 | style.append ('margin-left: %d%%' % margin) 81 | 82 | node['styles'].extend (style) 83 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # ebookmaker distribution 3 | # 4 | #!/usr/bin/env python 5 | 6 | from setuptools import setup 7 | 8 | VERSION = '0.13.8' 9 | 10 | if __name__ == "__main__": 11 | 12 | setup ( 13 | name = 'ebookmaker', 14 | version = VERSION, 15 | 16 | packages = [ 17 | 'ebookmaker', 18 | 'ebookmaker.parsers', 19 | 'ebookmaker.writers', 20 | 'ebookmaker.packagers', 21 | 22 | 'ebookmaker.mydocutils', 23 | 'ebookmaker.mydocutils.parsers', 24 | 'ebookmaker.mydocutils.transforms', 25 | 'ebookmaker.mydocutils.writers', 26 | 27 | 'ebookmaker.mydocutils.gutenberg', 28 | 'ebookmaker.mydocutils.gutenberg.parsers', 29 | 'ebookmaker.mydocutils.gutenberg.transforms', 30 | 'ebookmaker.mydocutils.gutenberg.writers', 31 | ], 32 | 33 | scripts = [ 34 
| 'scripts/ebookmaker', 35 | 'scripts/convert_unitame', 36 | 'scripts/rhyme_compiler', 37 | ], 38 | 39 | install_requires = [ 40 | 'pillow>=8.3.2', 41 | 'cssutils', 42 | 'docutils>=0.18.1', 43 | 'lxml', 44 | 'roman', 45 | 'requests', 46 | 'six>=1.4.1', 47 | 'libgutenberg[covers]>=0.10.22', 48 | 'cchardet==2.2.0a2', 49 | 'beautifulsoup4', 50 | 'html5lib', 51 | ], 52 | 53 | package_data = { 54 | 'ebookmaker.parsers': ['broken.png', 'txt2all.css'], 55 | 'ebookmaker.writers': ['cover.jpg'], 56 | 'ebookmaker.mydocutils.parsers': ['*.rst'], 57 | 'ebookmaker.mydocutils.writers': ['*.css'], 58 | 'ebookmaker.mydocutils.gutenberg.parsers': ['*.rst'], 59 | }, 60 | 61 | data_files = [ 62 | ('', ['CHANGES', 'README.md']), 63 | ], 64 | 65 | # metadata for upload to PyPI 66 | 67 | author = "Marcello Perathoner", 68 | maintainer = "Eric Hellman", 69 | maintainer_email = "eric@hellman.net", 70 | description = "The Project Gutenberg tool to generate EPUBs and other ebook formats.", 71 | long_description = open ('README.md', encoding='utf-8').read (), 72 | long_description_content_type = 'text/markdown', 73 | license = "GPL v3", 74 | keywords = "ebook epub kindle pdf rst reST reStructuredText project gutenberg format conversion", 75 | url = "https://github.com/gutenbergtools/ebookmaker/", 76 | 77 | classifiers = [ 78 | "Topic :: Text Processing", 79 | "License :: OSI Approved :: GNU General Public License (GPL)", 80 | "Environment :: Console", 81 | "Operating System :: OS Independent", 82 | "Intended Audience :: Other Audience", 83 | "Programming Language :: Python", 84 | "Programming Language :: Python :: 3.9", 85 | "Programming Language :: Python :: 3.10", 86 | "Programming Language :: Python :: 3.11", 87 | ], 88 | 89 | platforms = 'OS-independent' 90 | ) 91 | -------------------------------------------------------------------------------- /src/ebookmaker/parsers/txt2all.css: -------------------------------------------------------------------------------- 1 | /* 
************************************************************************ 2 | * classless css copied from https://www.pgdp.net/wiki/CSS_Cookbook/Styles 3 | * ********************************************************************** */ 4 | /* ************************************************************************ 5 | * set the body margins to allow whitespace along sides of window 6 | * ********************************************************************** */ 7 | body { margin-left:8%; width:85%; /* == margin-left:7% */ } 8 | /* ************************************************************************ 9 | * set the indention, spacing, and leading for body paragraphs. 10 | * ********************************************************************** */ 11 | p { /* all paragraphs unless overridden */ 12 | margin-top: 1em; /* inter-paragraph space */ 13 | margin-bottom: 0; /* use only top-margin for spacing */ 14 | line-height: 1.4em; /* interline spacing ("leading") */ 15 | } 16 | body > p { /* paras at level - not in
or */ 17 | text-align: justify; /* or left?? */ 18 | text-indent: 1em; /* first-line indent */ 19 | } 20 | /* suppress indentation on paragraphs following heads */ 21 | h2+p, h3+p, h4+p { text-indent: 0; } 22 | /* tighter spacing for list item paragraphs */ 23 | dd, li { 24 | margin-top: 0.25em; margin-bottom:0; 25 | line-height: 1.2em; /* a bit closer than p's */ 26 | } 27 | /* ************************************************************************ 28 | * Head 2 is for chapter heads. 29 | * ********************************************************************** */ 30 | h2 { 31 | /* text-align:center; left-aligned by default. */ 32 | margin-top:3em; /* extra space above.. */ 33 | margin-bottom: 2em; /* ..and below */ 34 | clear: both; /* don't let sidebars overlap */ 35 | } 36 | /* ************************************************************************ 37 | * Head 3 is for main-topic heads. 38 | * ********************************************************************** */ 39 | h3 { 40 | /* text-align:center; left-aligned by default. */ 41 | margin-top: 2em; /* extra space above but not below */ 42 | font-weight: normal; /* override default of bold */ 43 | clear: both; /* don't let sidebars overlap */ 44 | } 45 | /* ************************************************************************ 46 | * Styling the default HR and some special-purpose ones. 47 | * Default rule centered and clear of floats; sized for thought-breaks 48 | * ********************************************************************** */ 49 | hr { 50 | width:45%; /* adjust to ape original work */ 51 | margin-top: 1em; /* space above & below */ 52 | margin-bottom: 1em; 53 | margin-left: auto; /* these two ensure a.. 
*/ 54 | margin-right: auto; /* ..centered rule */ 55 | clear: both; /* don't let sidebars & floats overlap rule */ 56 | } 57 | /* ************************************************************************ 58 | * Images and captions 59 | * ********************************************************************** */ 60 | img { /* the default inline image has */ 61 | border: 1px solid black; /* a thin black line border.. */ 62 | padding: 6px; /* ..spaced a bit out from the graphic */ 63 | } -------------------------------------------------------------------------------- /docs/images.md: -------------------------------------------------------------------------------- 1 | IMAGES AND COVERS 2 | 3 | As of EbookMaker 0.9, image filesize and dimension limits are being set differently. 4 | 5 | EbookMaker now considers three types of images it finds in html, and handles them each differently.: 6 | 7 | 1. inline images 8 | `Image of a Unicorn` 9 | 2. linked images 10 | `Click for larger Unicorn` 11 | 3. cover images 12 | These can come in 4 flavors (in priority order): 13 | 1. coverpage relation 14 | `` or `` (preferred) 15 | 2. coverpage id 16 | `front jacket` 17 | 3. image with 'cover' in the url 18 | `front jacket` 19 | 4. image with 'title' in the url 20 | `front jacket` 21 | 22 | Ebookmaker doesn't like to have duplicate covers, so it takes the first cover of sufficient size (>200x200), creates a cover wrapper for it, and tries to remove duplicates. 23 | 24 | Ebookmaker doesn't touch HTML or image files submitted to Project Gutenberg and displayed as HTML books. However, it transforms both HTML and image files for inclusion in EPUB and Kindle. Cover images displayed on Project Gutenberg are sized and processed by EbookMaker, and when no cover is present, an abstract cover is generated for the book. 25 | 26 | For compatibility, Ebookmaker > 0.9 creates "wrapper" files for linked images. Submitters do not need to create wrapper files. 
27 | 28 | Images submitted for use in HTML should be sized and compressed so that load times are reasonably short and they look good on screens. 29 | 30 | Ebookmaker 0.9 has relaxed some limits on image sizes used inside EPUB and Kindle, considering advances in device power and network speed. Before version 0.9, any image or cover larger than 128KB was compressed to fit under 128KB. Similarly, images and covers wider than 800 px or taller than 1280 px were proportionately scaled down to fit. In version 0.9, the limits depend on the type of the image. 31 | 32 | - inline images are compressed if they are larger than 256KB and scaled if they are larger than 5000 x 5000. 33 | - linked images and cover images are compressed if they are larger than 1MB and scaled if they are larger than 5000 x 5000. 34 | 35 | Industry specifications for book cover images have changed in the last few years. Amazon now requires that commercial ebook covers have _minimum_ dimensions of "at least 1200 pixels in width or 1800 pixels in height." They're more relaxed for self-published covers; KDP suggests minimum dimensions of 625 x 1000 px and ideal dimensions of 1600 x 2560. New Project Gutenberg books should have covers of quality commensurate with industry practice. 36 | 37 | Since cover images specified by the coverpage/icon relation are not displayed in HTML, there is no need to limit their size (within reason!!!) 38 | 39 | Suggested Guidelines for cover and image submissions to Project Gutenberg: 40 | 41 | 1. Submitted cover images should be at least 625 x 1000 px and ideally larger. They should not exceed 1MB in size unless specified by a coverpage relation. 42 | 43 | 2. Submitted images should be less than 256KB for inline images and less than 1MB for linked images. 44 | 45 | 3. Display sizes for images should be set using relative units i.e. `ems`, and Project Gutenberg does not need to restrict pixel sizes for submitted images. 
46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/ebookmaker/writers/PDFWriter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python; indent-tabs-mode: nil; -*- coding: utf-8 -*- 3 | 4 | """ 5 | PDFWriter.py 6 | 7 | Copyright 2011 by Marcello Perathoner 8 | 9 | Distributable under the GNU General Public License Version 3 or newer. 10 | 11 | Convert RST to PDF. 12 | 13 | """ 14 | 15 | 16 | import os 17 | import subprocess 18 | 19 | from libgutenberg.Logger import debug, info, warning, error 20 | from libgutenberg.GutenbergGlobals import SkipOutputFormat, mkdir_for_filename 21 | 22 | from ebookmaker import ParserFactory 23 | from ebookmaker import writers 24 | from ebookmaker.CommonCode import Options 25 | 26 | options = Options() 27 | 28 | class Writer (writers.BaseWriter): 29 | """ Class to write PDF. """ 30 | 31 | def build (self, job): 32 | """ Build PDF file. 
""" 33 | 34 | inputfilename = job.url 35 | outputfilename = os.path.join (os.path.abspath(job.outputdir), job.outputfile) 36 | 37 | debug ("Inputfile: %s" % inputfilename) 38 | debug ("Creating PDF file: %s" % outputfilename) 39 | 40 | mkdir_for_filename(outputfilename) 41 | debug(f'parser input is {inputfilename}') 42 | parser = ParserFactory.ParserFactory.create (inputfilename) 43 | 44 | if not hasattr (parser, 'rst2xetex'): 45 | debug ('Skipping PDF Output because input mediatype is %s' % parser.mediatype()) 46 | raise SkipOutputFormat 47 | 48 | # Brain-dead xetex doesn't understand unix pipes 49 | # so we have to write a temp file 50 | 51 | texfilename = os.path.splitext (outputfilename)[0] + '.tex' 52 | auxfilename = os.path.splitext (outputfilename)[0] + '.aux' 53 | logfilename = os.path.splitext (outputfilename)[0] + '.log' 54 | 55 | try: 56 | os.remove (auxfilename) 57 | except OSError: 58 | pass 59 | 60 | tex = parser.rst2xetex (job) 61 | with open (texfilename, 'wb') as fp: 62 | fp.write (tex) 63 | 64 | try: 65 | cwd = os.getcwd () 66 | os.chdir (os.path.abspath(job.outputdir)) 67 | 68 | _xetex = subprocess.Popen ([options.config.XELATEX, 69 | "-output-directory", job.outputdir, 70 | "-interaction", "nonstopmode", 71 | texfilename], 72 | stdin = subprocess.PIPE, 73 | stdout = subprocess.PIPE, 74 | stderr = subprocess.PIPE) 75 | except OSError as what: 76 | os.chdir (cwd) 77 | error ("PDFWriter: %s %s" % (options.config.XELATEX, what)) 78 | raise SkipOutputFormat 79 | 80 | (dummy_stdout, dummy_stderr) = _xetex.communicate () 81 | 82 | with open (logfilename, encoding='utf-8') as fp: 83 | for line in fp: 84 | line = line.strip () 85 | if 'Error:' in line: 86 | error ("xetex: %s" % line) 87 | if options.verbose >= 1: 88 | if 'Warning:' in line: 89 | warning ("xetex: %s" % line) 90 | 91 | if options.verbose < 2: 92 | try: 93 | os.remove (texfilename) 94 | os.remove (logfilename) 95 | os.remove (auxfilename) 96 | except OSError: 97 | pass 98 | 99 | os.chdir 
(cwd) 100 | 101 | debug ("Done PDF file: %s" % outputfilename) 102 | -------------------------------------------------------------------------------- /USAGE.md: -------------------------------------------------------------------------------- 1 | # Usage Notes 2 | 3 | Ebookmaker has to reliably make EPUB and MOBI for over 60,000 different titles every month, so it includes a number of adaptations that may not be intuitive for HTML authors. 4 | 5 | ## Crawling 6 | 7 | Ebookmaker starts with a document file path or URL, and then follows links and images to a depth determined by the `--max_depth` setting. It only follows links that are in the same directory or below; anything in the same directory linked by the starting page will be included in the ebook it tries to build. The `*.noimages` filetype builds (for example, `--make=epub.noimages`) exclude images. If you don't want the ebook to include a resource that your HTML links to, use the `rel='nofollow'` attribute of the `a` tag. 8 | 9 | The crawl from the starting page determines the reading order for the ebook. If the starting page links to another html page, the content from that page will be placed after the starting page in the reading order. For this reason, it's simpler to put all the content on a single page. Multi-page HTML books should convert well if attention is paid to the reading order implied by the starting page. 10 | 11 | ## Floats and absolute positioning 12 | 13 | Ebookmaker removes elements that float, because a large part of the PG backfile was produced before any ebook readers could handle floats. It also removes elements with absolute positioning, which is not supported by EPUB2. HTML authors can prevent floating elements from being stripped by using a css selector that contains the `x-ebookmaker` class. Ebookmaker assumes that if the HTML designer uses the `x-ebookmaker` class, they've considered the impact of the float on the generated EPUB. 
14 | 15 | ## Page numbers 16 | 17 | Ebookmaker strips content from elements that it thinks are page numbers. HTML produced for PG often implements the original page numbers either with float or with absolute positioning. If these elements were left in, they would show up as numbers in the middle of the text. 18 | 19 | To still keep links working, all page number contraptions are replaced with empty `a` tags with class `x-ebookmaker-pageno`. 20 | 21 | The classes that make Ebookmaker think the element is a page number are: `pagenum pageno page pb folionum foliono`. 22 | 23 | ## Tables of Contents 24 | 25 | Ebookmaker uses HTML heading elements to generate a table of contents. To play nicely with this process, HTML should not use heading elements for things that don't belong in the table of contents, and _should_ use heading elements for things that do! 26 | 27 | ## Hidden content 28 | 29 | Content hidden by the `display:none` css directive can create havoc with ebook generation. For example, MOBI generation _will_ fail if the target of a link is hidden. Authors of HTML for Ebookmaker should refrain from using `display:none` and should check that all ebook formats convert as expected. 30 | 31 | ## Images and Covers 32 | 33 | HTML authors can control the image that Ebookmaker uses as a cover for ebook files. If there is no suitable cover image, Ebookmaker will generate one. Images are scaled if they are "too big". It's a bit complicated, so there's [a separate page](docs/images.md) that tries to explain it all. 34 | 35 | 36 | ## Special classes 37 | 38 | Ebookmaker recognizes a number of special classes that can be used to modify its HTML conversion. There are 4 "`x-ebookmaker`" classes: 39 | 40 | - Ebookmaker adds the class `x-ebookmaker` to the `body` element inside the EPUBs it builds. This can then be used by css to make styles that are only active inside an ebook file. This class replaces a deprecated 'handheld' @media query. 
41 | - The `x-ebookmaker-important` class on an image element tells ebookmaker not to remove the image, even in `*.noimages` builds. 42 | - The `x-ebookmaker-drop` class tells ebookmaker to remove an element and its descendents from ebook builds. Don't use this class to prevent a file from being crawled - use `rel='nofollow'` instead. 43 | - As described above, Ebookmaker adds the `x-ebookmaker-pageno` class to elements whose content has been stripped because they use a class that indicates they represent page numbers. 44 | 45 | -------------------------------------------------------------------------------- /docs/ebookmaker_v0_11.md: -------------------------------------------------------------------------------- 1 | # New features in Ebookmaker v0.11 2 | 3 | In addition to some small tweaks in its generated EPUBs, Ebookmaker version 0.11 also emits regularized HTML files for all types of input, including HTML source files. These "derived" files are now the preferred HTML presentation on the PG website. 4 | 5 | The source HTML files are not modified, and are available (at the URLs they've always been at) via the "More files..." link on the website. Errata should be addressed in the source files, not the derived files, as whitespace and link structure are changed by ebookmaker in ways that may preclude reprocessing. Files are re-derived for the entire catalog monthly. 6 | 7 | A major impetus for this change is to improve compatibility with browser plugins, mobile apps, proxy servers, accessibility tools and PG's own file processors. Much of our back file uses old versions of HTML that are poorly supported in modern browsers and other tools, and while there is ongoing work to update the back file, we are thousands of books away from being able to present uniformly coded HTML. This change is also a first step towards being able to use HTML5 for both source files and for presentation; for many PG books, the derived files will validate as HTML5. 
8 | 9 | Submitters should be aware that our current process first converts submitted files to XHTML and HTML5ish files are derived from the XHTML; our process does not yet support features introduced in HTML5. 10 | 11 | Here are the differences between HTML source files and the HTML files derived from them: 12 | 13 | 1. all HTML files are cleaned by HTML Tidy. Tidy does the following: 14 | i. HTML Tidy emits well-formed UTF8-encoded XHTML-compatible files. This will allow the PG web server to add the encoding to MIME headers, improving browser compatibility and accessibility. 15 | ii. LF is used as the newline character for all files (unix standard) 16 | iii. HTML entities such as `&rsquo;` `&Aacute;` etc. are converted to unicode characters. Together with webserver configuration changes, this will improve web browser compatibility. 17 | iv. Tidy corrects badly formed HTML, improving browser compatibility and standards conformance. 18 | v. A doctype declaration: `` is used for all files. This is compatible with the included metadata. 19 | vi. Tags are now uniformly lower case 20 | vii. Some legacy presentational tags (``, ``, `
` when enclosed within appropriate inline tags, and ) are replaced with CSS `