├── test-epubs ├── .gitignore └── README ├── Makefile ├── epub_extractor ├── __init__.py ├── epub_dump_toc.py ├── epub_dump_meta.py ├── epub_extract_jpeg.py └── epub_extractor.py ├── test-dump-toc.sh ├── test-dump-meta.sh ├── test-extract-jpeg.sh ├── .editorconfig ├── README.rst ├── .gitignore ├── setup.py ├── LICENSE.txt └── .pre-commit-config.yaml /test-epubs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !README 4 | -------------------------------------------------------------------------------- /test-epubs/README: -------------------------------------------------------------------------------- 1 | Put EPUBs this directory and run ./test-extract.sh 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | release: 2 | python3 setup.py sdist 3 | twine upload dist/* 4 | -------------------------------------------------------------------------------- /epub_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | 4 | __author__ = 'torico' 5 | __version__ = '0.4.3' 6 | __license__ = 'BSD' 7 | -------------------------------------------------------------------------------- /test-dump-toc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 | if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_dump_toc.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /test-dump-meta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 
| if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_dump_meta.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /test-extract-jpeg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 | if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_extract_jpeg.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | end_of_line = lf 9 | charset = utf-8 10 | 11 | [*.py] 12 | indent_size = 4 13 | max_line_length = 79 14 | 15 | [*.json] 16 | insert_final_newline = ignore 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ~~~~~~~~~~~~~~ 2 | epub-extractor 3 | ~~~~~~~~~~~~~~ 4 | 5 | * Extract Jpeg images from EPUB file. 6 | 7 | * Dump metadata from EPUB file. 8 | 9 | 10 | Install 11 | ------- 12 | 13 | :: 14 | 15 | $ pip install epub-extractor 16 | 17 | 18 | Requirements 19 | ------------ 20 | 21 | * unzip 22 | 23 | 24 | 25 | epub-extract-jpeg 26 | ----------------- 27 | 28 | :: 29 | 30 | $ epub-extract-jpeg comic.epub 31 | 32 | Extract jpeg images. 
33 | 34 | 35 | 36 | epub-dump-meta 37 | -------------- 38 | :: 39 | 40 | 41 | $ epub-dump-meta comic.epub 42 | 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | tmp 3 | temp 4 | #* 5 | .#* 6 | 7 | # Editor 8 | *~ 9 | *.orig 10 | *.swp 11 | 12 | # Python 13 | *.pyc 14 | *.pyo 15 | *.egg-info/ 16 | dist/ 17 | 18 | # Windows 19 | Thumbs.db 20 | 21 | # Eclipse 22 | .project 23 | 24 | # Pydev 25 | .pydevproject 26 | 27 | # Mac 28 | .DS_Store 29 | 30 | # Xcode 31 | build/ 32 | *.pbxuser 33 | !default.pbxuser 34 | *.mode1v3 35 | !default.mode1v3 36 | *.mode2v3 37 | !default.mode2v3 38 | *.perspectivev3 39 | !default.perspectivev3 40 | xcuserdata 41 | *.xccheckout 42 | *.moved-aside 43 | DerivedData 44 | *.hmap 45 | *.ipa 46 | *.xcuserstate 47 | 48 | 49 | # CocoaPod 50 | Pods/* 51 | Podfile.lock 52 | 53 | # JetBrains 54 | .idea 55 | 56 | # if required 57 | # data/* 58 | # config/* 59 | ~* 60 | venv 61 | .venv 62 | 63 | .env 64 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from setuptools import setup 3 | from epub_extractor import __author__, __version__, __license__ 4 | 5 | setup( 6 | name='epub-extractor', 7 | version=__version__, 8 | description='Extract comic EPUB pages to Jpeg files, ' 9 | 'Dump meta information.', 10 | license=__license__, 11 | author=__author__, 12 | author_email='ytyng@live.jp', 13 | url='https://github.com/ytyng/epub-extractor.git', 14 | keywords='comic epub extract jpeg images and meta information.', 15 | packages=['epub_extractor'], 16 | install_requires=[], 17 | entry_points={ 18 | 'console_scripts': [ 19 | 'epub-extract-jpeg = epub_extractor.epub_extract_jpeg:main', 20 | 'epub-dump-meta = epub_extractor.epub_dump_meta:main', 21 | ] 22 | 
}, 23 | ) 24 | -------------------------------------------------------------------------------- /epub_extractor/epub_dump_toc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | EPUB ファイルの TOC を表示 5 | """ 6 | 7 | import argparse 8 | 9 | try: 10 | from .epub_extractor import EpubExtractor 11 | except (ValueError, SystemError, ImportError): 12 | try: 13 | from epub_extractor import EpubExtractor 14 | except (ValueError, SystemError, ImportError): 15 | from epub_extractor.epub_extractor import EpubExtractor 16 | 17 | 18 | def procedure(file_path): 19 | epub_extractor = EpubExtractor(file_path) 20 | toc_table = epub_extractor.get_toc_table() 21 | epub_extractor.close() 22 | return toc_table 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Dump EPUB toc data.') 27 | parser.add_argument( 28 | 'epub_files', 29 | metavar='EPUB-Files', 30 | type=str, 31 | nargs='+', 32 | help='Target Epub Files', 33 | ) 34 | 35 | args = parser.parse_args() 36 | 37 | if len(args.epub_files) > 1: 38 | out = [] 39 | for epub_file in args.epub_files: 40 | out.append(procedure(epub_file)) 41 | else: 42 | out = procedure(args.epub_files[0]) 43 | 44 | EpubExtractor.print_json(out) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /epub_extractor/epub_dump_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | """ 5 | EPUB ファイルの Meta を表示 6 | """ 7 | 8 | import argparse 9 | 10 | try: 11 | from .epub_extractor import EpubExtractor 12 | except (ValueError, SystemError, ImportError): 13 | try: 14 | from epub_extractor import EpubExtractor 15 | except (ValueError, SystemError, ImportError): 16 | from epub_extractor.epub_extractor import EpubExtractor 17 | 18 | 19 | def procedure(file_path): 20 | epub_extractor = 
EpubExtractor(file_path) 21 | meta = epub_extractor.meta 22 | metadata = meta.as_ordered_dict() 23 | epub_extractor.close() 24 | return metadata 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description='Dump EPUB Meta information.') 29 | parser.add_argument( 30 | 'epub_files', 31 | metavar='EPUB-Files', 32 | type=str, 33 | nargs='+', 34 | help='Target Epub Files', 35 | ) 36 | 37 | args = parser.parse_args() 38 | 39 | if len(args.epub_files) > 1: 40 | out = [] 41 | for epub_file in args.epub_files: 42 | out.append(procedure(epub_file)) 43 | else: 44 | out = procedure(args.epub_files[0]) 45 | 46 | EpubExtractor.print_json(out) 47 | 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, @ytyng 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright notice, 9 | this list of conditions and the following disclaimer in the documentation 10 | and/or other materials provided with the distribution. 11 | * Neither the name of the @ytyng nor the names of its contributors 12 | may be used to endorse or promote products derived from this software 13 | without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /epub_extractor/epub_extract_jpeg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | EPUB ファイルを jpeg に展開する 4 | """ 5 | 6 | import argparse 7 | 8 | try: 9 | from .epub_extractor import EpubExtractor 10 | except (ValueError, SystemError, ImportError): 11 | try: 12 | from epub_extractor import EpubExtractor 13 | except (ValueError, SystemError, ImportError): 14 | from epub_extractor.epub_extractor import EpubExtractor 15 | 16 | 17 | def procedure(file_path, convert_png=True, delete_exists_dir=False): 18 | epub_extractor = EpubExtractor(file_path) 19 | epub_extractor.extract_images( 20 | convert_png=convert_png, delete_exists_dir=delete_exists_dir 21 | ) 22 | epub_extractor.close() 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Extract Jpeg files in EPUB') 27 | parser.add_argument( 28 | 'epub_files', 29 | metavar='EPUB-Files', 30 | type=str, 31 | nargs='+', 32 | help='Target Epub Files', 33 | ) 34 | parser.add_argument( 35 | '--no-png-convert', 36 | dest='no_png_convert', 37 | action='store_true', 38 | default=False, 39 | help='No png convert to jpeg', 40 | ) 41 | parser.add_argument( 42 | '--delete-exists-dir', 43 | dest='delete_exists_dir', 44 | action='store_true', 45 | default=False, 46 | help='No png convert to jpeg', 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | for 
epub_file in args.epub_files: 52 | procedure( 53 | epub_file, 54 | convert_png=not args.no_png_convert, 55 | delete_exists_dir=args.delete_exists_dir, 56 | ) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.12.0 4 | hooks: 5 | - id: black 6 | language: python 7 | types: [python] 8 | # GitHub actions の black と同じ設定にしている。 .github/workflows/black.yml 9 | args: [ 10 | --line-length, "79", 11 | --target-version, "py310", 12 | --exclude, "migrations", 13 | --skip-string-normalization, # 文字列の正規化をスキップ 14 | --experimental-string-processing, 15 | ] 16 | 17 | - repo: https://github.com/PyCQA/flake8 18 | rev: 6.0.0 19 | hooks: 20 | - id: flake8 21 | # github actions の flake8 と同じ設定にしている。 .github/workflows/flake8.yml 22 | # B: Bugbearによって検出されるエラー 23 | # B950: Bugbearによって検出される、最大行長に対する緩いチェック 24 | # extend-ignore = E203, E501: これは、Flake8が無視するエラーコードを指定しています。 25 | # E203とE501はそれぞれ次のようなエラーコードです。 26 | # E203: コロンの前に空白があるときに発生するエラー。これは、BlackとPEP 8のコーディングスタイルが競合するため、無視されます。 27 | args: [ 28 | --count, 29 | --max-line-length, "88", 30 | --select, "E9,F63,F7,F82,B,B950", 31 | --extend-ignore, "E203", 32 | --statistics, 33 | --show-source, 34 | --exclude, "*/migrations/*", 35 | ] 36 | additional_dependencies: [ flake8-bugbear ] # flake8-bugbear を追加 37 | 38 | - repo: https://github.com/PyCQA/isort 39 | rev: 5.12.0 40 | hooks: 41 | - id: isort 42 | args: [--profile, "black"] # black とコンフリクトさせないよう同じ設定を使用 43 | 44 | - repo: https://github.com/pre-commit/pre-commit-hooks 45 | rev: v4.1.0 46 | hooks: 47 | # https://pre-commit.com/hooks.html にあるもの 48 | - id: check-json # json ファイルの構文チェック 49 | - id: check-toml # toml ファイルの構文チェック 50 | - id: check-yaml # yaml ファイルの構文チェック 51 | - id: debug-statements # デバッグ用の print 文を検出 52 | - id: 
end-of-file-fixer # ファイルの最後に改行を追加 53 | - id: fix-byte-order-marker # BOM を削除 54 | - id: trailing-whitespace # 行末の空白を削除 55 | - id: detect-aws-credentials # AWS の認証情報を検出 56 | - id: detect-private-key # 秘密鍵を検出 57 | -------------------------------------------------------------------------------- /epub_extractor/epub_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import shutil 6 | import subprocess 7 | import sys 8 | import tempfile 9 | import warnings 10 | from abc import ABCMeta, abstractmethod 11 | from collections import OrderedDict 12 | from functools import cached_property 13 | from typing import Callable, Dict, Iterator, List, Optional 14 | from xml.etree import ElementTree 15 | from xml.etree.ElementTree import Element 16 | 17 | 18 | def parse_xml_with_recover(xml_path: str) -> ElementTree: 19 | """ 20 | xmlをパース 21 | & の使い方が悪いファイルがある場合、 22 | それをパースしようとするとエラーになるので、失敗したら文字列置換してリトライする。 23 | http://stackoverflow.com/questions/13046240/parseerror-not-well-formed 24 | -invalid-token-using-celementtree 25 | ここには、lxml の場合の対応方法があるが、python3 のxml ではやり方不明のため 26 | ( ElementTree.XMLParser のコンストラクタには recover 引数が無い)、 27 | 自力で置換する 28 | """ 29 | try: 30 | etree = ElementTree.parse(xml_path) 31 | return etree 32 | except ElementTree.ParseError as e: 33 | # ParseError の場合のみ、修復を試みる 34 | print('{}, {}'.format(e.__class__.__name__, e)) 35 | 36 | xml_source = open(xml_path).read() 37 | # 修復! 38 | xml_source = xml_repair(xml_source) 39 | return ElementTree.fromstring(xml_source) 40 | 41 | 42 | def convert_to_jpeg( 43 | source_file_path: str, 44 | destination_file_path: str, 45 | *, 46 | jpeg_quality: int = 70, 47 | copy: bool = True, 48 | ) -> None: 49 | """ 50 | PNG を Jpeg に変換する 51 | """ 52 | try: 53 | from PIL import Image 54 | except ImportError: 55 | print( 56 | 'PNG image found. 
Converting png to jpeg, require PIL.', 57 | file=sys.stderr, 58 | ) 59 | print( 60 | 'Try: "pip install PIL" or "pip install pillow"', file=sys.stderr 61 | ) 62 | raise 63 | 64 | im = Image.open(source_file_path) 65 | im = im.convert("RGB") 66 | im.save(destination_file_path, 'jpeg', quality=jpeg_quality) 67 | if not copy: 68 | os.remove(source_file_path) 69 | print('{} -> {}'.format(source_file_path, destination_file_path)) 70 | 71 | 72 | re_entity = re.compile(r'(>[^<]*)(&)([^<]*<)') 73 | re_replace = re.compile(r'&(?!\w*?;)') 74 | 75 | 76 | def xml_repair(xml_source: str) -> str: 77 | """ 78 | XMLのソースコードの & を & に変換する 79 | :param self: 80 | :param xml_source: 81 | :return: 82 | """ 83 | 84 | def _replace(matcher): 85 | return re_replace.sub('&', matcher.group(0)) 86 | 87 | return re_entity.sub(_replace, xml_source) 88 | 89 | 90 | def get_etree_namespace(element: Element) -> str: 91 | m = re.match('\{.*\}', element.tag) 92 | return m.group(0) if m else '' 93 | 94 | 95 | def namespace_tag_query(element: Element) -> Callable[[str], str]: 96 | """ 97 | element のネームスペースをバインドし、ネームスペースつきのタグ名を返す関数を返す 98 | """ 99 | ns = get_etree_namespace(element) 100 | 101 | def _generate_query(tag_name): 102 | return './/{}{}'.format(ns, tag_name) 103 | 104 | return _generate_query 105 | 106 | 107 | class ImageElementBase(metaclass=ABCMeta): 108 | class ItemHrefNotFound(Exception): 109 | pass 110 | 111 | @cached_property 112 | @abstractmethod 113 | def image_path(self) -> str: 114 | raise NotImplementedError 115 | 116 | @cached_property 117 | @abstractmethod 118 | def is_png(self) -> bool: 119 | """Is image type PNG""" 120 | raise NotImplementedError 121 | 122 | 123 | class ImageElement(ImageElementBase): 124 | """ 125 | item_element が、image/xxx の場合 (ページごとの XMLが無い場合) 126 | """ 127 | 128 | def __init__( 129 | self, 130 | item_element: Element, 131 | itemref_element: Element, 132 | epub_extractor: 'EpubExtractor', 133 | ): 134 | self.item_element = item_element 135 | self.itemref_element 
= itemref_element 136 | self.epub_extractor = epub_extractor 137 | 138 | @cached_property 139 | def image_path(self) -> str: 140 | item_href = self.item_element.attrib.get('href', None) 141 | if not item_href: 142 | raise self.ItemHrefNotFound(f'{self.item_element}') 143 | return os.path.join(self.epub_extractor.content_base_dir, item_href) 144 | 145 | @cached_property 146 | def is_png(self) -> bool: 147 | return self.item_element.attrib.get('href', '').endswith( 148 | '.png' 149 | ) or self.item_element.attrib.get('media-type', '').endswith('/png') 150 | 151 | 152 | class ImagePage(ImageElementBase): 153 | """ 154 | 画像ページ のクラス 155 | 156 | item_element が、xhtml で、その中に画像が含まれる場合の処理 157 | """ 158 | 159 | class InvalidImageLength(Exception): 160 | pass 161 | 162 | class ImagePathAttrNotFound(Exception): 163 | pass 164 | 165 | def __init__( 166 | self, 167 | item_element: Element, 168 | itemref_element: Element, 169 | epub_extractor: 'EpubExtractor', 170 | ): 171 | self.item_element = item_element 172 | self.itemref_element = itemref_element 173 | self.epub_extractor = epub_extractor 174 | 175 | @cached_property 176 | def page_xhtml_path(self) -> str: 177 | """ 178 | ページのXMLのパス 179 | 例: item/xhtml/001.xhtml 180 | :return: 181 | """ 182 | item_href = self.item_element.attrib.get('href', None) 183 | if not item_href: 184 | raise self.ItemHrefNotFound(self.item_element) 185 | 186 | return os.path.join(self.epub_extractor.content_base_dir, item_href) 187 | 188 | # page_xml_path = os.path.join(self.content_base_dir, item_href) 189 | 190 | @cached_property 191 | def page_xhtml_etree(self) -> ElementTree: 192 | # ページを解析 193 | return parse_xml_with_recover(self.page_xhtml_path) 194 | 195 | @cached_property 196 | def image_element(self) -> Element: 197 | 198 | if self.item_element.attrib.get('properties') == 'svg': 199 | # SVGラッピング 日本のコミックEPUBでよくある形式 200 | svg = self.page_xhtml_etree.find( 201 | './/{http://www.w3.org/2000/svg}svg' 202 | ) 203 | if not svg: 204 | # 極稀に、svg 
タグが存在していない場合がある。 205 | # 代わりに img タグを探す 206 | images = self.page_xhtml_etree.findall( 207 | './/{http://www.w3.org/1999/xhtml}img' 208 | ) 209 | else: 210 | images = svg.findall('.//{http://www.w3.org/2000/svg}image') 211 | # 画像パスの属性は {http://www.w3.org/1999/xlink}href 212 | 213 | else: 214 | # ここ未テスト 215 | images = self.page_xhtml_etree.findall( 216 | './/{http://www.w3.org/1999/xhtml}img' 217 | ) 218 | # 画像パスの属性は src 219 | 220 | if len(images) >= 2: 221 | return self.get_largest_image_element(images) 222 | 223 | if len(images) != 1: 224 | raise self.InvalidImageLength( 225 | '{}, {}'.format(self.item_element, len(images)) 226 | ) 227 | 228 | return images[0] 229 | 230 | @cached_property 231 | def image_path(self) -> str: 232 | """ 233 | 画像のフルパス 234 | :return: 235 | """ 236 | return self.get_image_path_of_image_element(self.image_element) 237 | 238 | def get_largest_image_element(self, image_elements: List[Element]): 239 | """ 240 | 複数の image_element から一番サイズの大きな画像を取得 241 | """ 242 | L = [ 243 | (i, self.get_image_size_of_image_element(i)) 244 | for i in image_elements 245 | ] 246 | return list(sorted(L, key=lambda x: x[1], reverse=True))[0][0] 247 | 248 | # その他プロパティが必要であれば 249 | # self.image_element.attrib.get('width', None) 250 | # self.image_element.attrib.get('height', None) 251 | # self.image_element.attrib.get('width', None) 252 | def get_image_path_of_image_element(self, image_element: Element) -> str: 253 | attr_names = [ 254 | '{http://www.w3.org/1999/xlink}href', 255 | 'src', 256 | '{http://www.w3.org/1999/xlink}src', 257 | ] 258 | for attr_name in attr_names: 259 | val = image_element.attrib.get(attr_name) 260 | if val: 261 | return os.path.join(os.path.dirname(self.page_xhtml_path), val) 262 | raise self.ImagePathAttrNotFound(image_element.attrib) 263 | 264 | def get_image_size_of_image_element(self, image_element: Element) -> int: 265 | """ 266 | 画像のサイズを取得 267 | """ 268 | return os.path.getsize( 269 | self.get_image_path_of_image_element(image_element) 
270 | ) 271 | 272 | @cached_property 273 | def is_png(self) -> bool: 274 | return self.image_path.endswith('.png') 275 | 276 | @cached_property 277 | def item_href(self) -> Optional[str]: 278 | return self.item_element.attrib.get('href', None) 279 | 280 | 281 | class ImageSVGElement(ImagePage): 282 | """ 283 | item_element が、image/svg+xml の場合 284 | """ 285 | 286 | @cached_property 287 | def image_element(self) -> str: 288 | item_href = self.item_element.attrib.get('href', None) 289 | if not item_href: 290 | raise self.ItemHrefNotFound(f'{self.item_element}') 291 | if not item_href.lower().endswith('.svg'): 292 | # SVG ではない画像。普通の画像として扱う。 293 | return os.path.join( 294 | self.epub_extractor.content_base_dir, item_href 295 | ) 296 | # SVG だった。 297 | svg_path = os.path.join( 298 | self.epub_extractor.content_base_dir, item_href 299 | ) 300 | etree = parse_xml_with_recover(svg_path) 301 | # SVG から image を抽出 302 | images = etree.findall('.//{http://www.w3.org/2000/svg}image') 303 | 304 | if len(images) >= 2: 305 | return self.get_largest_image_element(images) 306 | 307 | if len(images) != 1: 308 | raise self.InvalidImageLength( 309 | '{}, {}'.format(self.item_element, len(images)) 310 | ) 311 | 312 | return images[0] 313 | 314 | 315 | class EpubExtractorError(Exception): 316 | pass 317 | 318 | 319 | class EpubExtractor: 320 | class EpubNotFound(EpubExtractorError): 321 | pass 322 | 323 | class NoEpubExtension(EpubExtractorError): 324 | pass 325 | 326 | class ContentXmlNotFound(EpubExtractorError): 327 | pass 328 | 329 | class IdRefNotFound(Exception): 330 | pass 331 | 332 | class ItemNotFound(Exception): 333 | pass 334 | 335 | class ItemHrefNotFound(Exception): 336 | pass 337 | 338 | class OutputDirectoryAlreadyExists(EpubExtractorError): 339 | pass 340 | 341 | def __init__(self, epub_file_path: str): 342 | if not os.path.exists(epub_file_path): 343 | raise self.EpubNotFound(epub_file_path) 344 | 345 | if not epub_file_path.endswith('.epub'): 346 | raise 
self.NoEpubExtension(epub_file_path) 347 | 348 | self.epub_file_path = epub_file_path 349 | self.setup() 350 | 351 | def setup(self) -> None: 352 | self.temp_dir = tempfile.mkdtemp(suffix='epub-dump-meta') 353 | # unzip 354 | subprocess.Popen( 355 | ('unzip', self.epub_file_path, "-d", self.temp_dir), 356 | stdout=subprocess.PIPE, 357 | stderr=subprocess.PIPE, 358 | ).communicate() 359 | 360 | def close(self, *, fail_silently=True) -> None: 361 | try: 362 | shutil.rmtree(self.temp_dir) 363 | except PermissionError as e: 364 | if not fail_silently: 365 | raise 366 | print('{}: {} ({})'.format(e.__class__.__name__, e, self.temp_dir)) 367 | 368 | @cached_property 369 | def content_xml_path(self) -> str: 370 | """ 371 | content.xml (standard.opf) のファイルパスを返す 372 | """ 373 | # META-INF/container.xml で固定 374 | container_xml_path = os.path.join( 375 | self.temp_dir, 'META-INF', 'container.xml' 376 | ) 377 | etree = parse_xml_with_recover(container_xml_path) 378 | # rootfile タグを探す 379 | rootfile_node = etree.find( 380 | ".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile" 381 | ) 382 | content_opf_path = rootfile_node.attrib['full-path'] 383 | 384 | content_xml_path = os.path.join(self.temp_dir, content_opf_path) 385 | if not os.path.exists(content_xml_path): 386 | raise self.ContentXmlNotFound(content_xml_path) 387 | return content_xml_path 388 | 389 | @cached_property 390 | def content_xml_text(self) -> str: 391 | return open(self.content_xml_path).read() 392 | 393 | @cached_property 394 | def content_xml_etree(self) -> ElementTree: 395 | return parse_xml_with_recover(self.content_xml_path) 396 | 397 | @cached_property 398 | def content_base_dir(self) -> str: 399 | # ファイルのパス基準となるディレクトリ 400 | return os.path.dirname(self.content_xml_path) 401 | 402 | @cached_property 403 | def items_dict(self) -> Dict[str, Element]: 404 | """ 405 | id をキーにした item の辞書 406 | """ 407 | ntq = namespace_tag_query(self.content_xml_etree._root) 408 | manifest = 
self.content_xml_etree.find(ntq('manifest')) 409 | items = manifest.findall(ntq('item')) 410 | items_dict = {} # type: Dict[str, Element] 411 | for item in items: 412 | id = item.attrib.get('id') 413 | items_dict[id] = item 414 | return items_dict 415 | 416 | @cached_property 417 | def itemrefs(self) -> Iterator[Element]: 418 | """ 419 | spine > itemref をページ順に返すジェネレータ 420 | """ 421 | ntq = namespace_tag_query(self.content_xml_etree._root) 422 | spine = self.content_xml_etree.find(ntq('spine')) 423 | itemrefs = spine.findall(ntq('itemref')) 424 | for itemref in itemrefs: 425 | yield itemref 426 | 427 | def _get_image_pages(self) -> Iterator[ImageElementBase]: 428 | items_dict = self.items_dict 429 | 430 | for itemref in self.itemrefs: # type: Element 431 | 432 | idref = itemref.attrib.get('idref', None) 433 | if not idref: 434 | raise self.IdRefNotFound(itemref) 435 | 436 | if idref not in items_dict: 437 | raise self.ItemNotFound(idref) 438 | 439 | item = items_dict[idref] # type: Element 440 | 441 | media_type = item.attrib.get('media-type', '') 442 | if media_type.startswith('image/svg'): 443 | # image/svg+xml 等 444 | image_page = ImageSVGElement(item, itemref, self) 445 | elif media_type.startswith('image/'): 446 | # image/jpeg, image/png 447 | image_page = ImageElement(item, itemref, self) 448 | else: 449 | # application/xhtml+xml 等 450 | image_page = ImagePage(item, itemref, self) 451 | yield image_page 452 | 453 | @cached_property 454 | def image_pages(self) -> List[ImageElementBase]: 455 | return list(self._get_image_pages()) 456 | 457 | def format_page_number(self, page_number: str) -> str: 458 | return '{:05d}'.format(page_number) 459 | 460 | def _move_jpeg_file( 461 | self, 462 | image_page: ImageElementBase, 463 | output_dir: str, 464 | page_index: int, 465 | convert_png: bool = True, 466 | # copy: 指定すると、ファイルの移動ではなくコピーをする。 467 | # 1つの画像ファイルが複数箇所で使われれている場合、移動するとエラーになるので 468 | # コピーをしたほうが安全に処理ができる。 469 | # ただし、ストレージ容量が余分に必要で、遅い。 470 | copy: bool = True, 
471 | ): 472 | source_image_path = image_page.image_path 473 | 474 | if image_page.is_png: 475 | if convert_png: 476 | # PNGを変換する場合 477 | destination_image_name = '{}.jpg'.format( 478 | self.format_page_number(page_index) 479 | ) 480 | destination_image_path = os.path.join( 481 | output_dir, destination_image_name 482 | ) 483 | convert_to_jpeg( 484 | source_image_path, destination_image_path, copy=copy 485 | ) 486 | return 487 | destination_image_name = '{}.png'.format( 488 | self.format_page_number(page_index) 489 | ) 490 | else: 491 | destination_image_name = '{}.jpg'.format( 492 | self.format_page_number(page_index) 493 | ) 494 | destination_image_path = os.path.join( 495 | output_dir, destination_image_name 496 | ) 497 | if copy: 498 | shutil.copy(source_image_path, destination_image_path) 499 | else: 500 | shutil.move(source_image_path, destination_image_path) 501 | print('{} -> {}'.format(source_image_path, destination_image_name)) 502 | 503 | def extract_images( 504 | self, 505 | output_dir: Optional[str] = None, 506 | convert_png: bool = True, 507 | delete_exists_dir: bool = False, 508 | copy: bool = True, 509 | fail_silently: bool = True, 510 | ): 511 | """ 512 | 画像ファイルをディレクトリに展開(移動) 513 | """ 514 | if not output_dir: 515 | output_dir, _ext = os.path.splitext(self.epub_file_path) 516 | if os.path.exists(output_dir): 517 | if delete_exists_dir: 518 | try: 519 | shutil.rmtree(output_dir) 520 | except PermissionError as e: 521 | if not fail_silently: 522 | raise 523 | print( 524 | '{}: {} ({})'.format( 525 | e.__class__.__name__, e, output_dir 526 | ) 527 | ) 528 | else: 529 | raise self.OutputDirectoryAlreadyExists(output_dir) 530 | 531 | os.mkdir(output_dir) 532 | 533 | for i, image_page in enumerate(self.image_pages, start=1): 534 | try: 535 | self._move_jpeg_file( 536 | image_page, 537 | output_dir, 538 | i, 539 | convert_png=convert_png, 540 | copy=copy, 541 | ) 542 | except ImagePage.InvalidImageLength as e: 543 | warnings.warn( 544 | '{} 
{}'.format(e.__class__.__name__, e), stacklevel=2 545 | ) 546 | 547 | @cached_property 548 | def metadata_element(self): 549 | """ 550 | コンテンツXML ( standard.opf) 内の、metadata エレメント 551 | """ 552 | ntq = namespace_tag_query(self.content_xml_etree._root) 553 | metadata = self.content_xml_etree.find(ntq('metadata')) 554 | return metadata 555 | 556 | @cached_property 557 | def last_page_number(self) -> int: 558 | return len(self.image_pages) 559 | 560 | def _get_item_href_from_image_page(self, image_page): 561 | """ 562 | ページのリンク先を取得 563 | e.g.: 'xhtml/cover.xhtml' 564 | """ 565 | path = getattr(image_page, 'item_href', None) 566 | if path: 567 | return path 568 | if hasattr(image_page, 'item_element') and hasattr( 569 | image_page.item_element, 'attrib' 570 | ): 571 | path = image_page.item_element.attrib.get('href', None) 572 | if path: 573 | return path 574 | # 未知のパターン。デバッグして調査してください。 575 | raise self.ItemHrefNotFound(image_page) 576 | 577 | @cached_property 578 | def xml_path_page_number_dict(self) -> Dict[str, Element]: 579 | """ 580 | XMLファイルとページ番号の対応表 581 | :return: dict 582 | """ 583 | return { 584 | self._get_item_href_from_image_page(image_page): i 585 | for i, image_page in enumerate(self.image_pages, start=1) 586 | } 587 | 588 | @cached_property 589 | def xml_path_page_number_dict_basename(self): 590 | """ 591 | XMLファイルとページ番号の対応表 ファイル名のみ版 592 | :return: dict 593 | """ 594 | return { 595 | os.path.basename(k): v 596 | for k, v in self.xml_path_page_number_dict.items() 597 | } 598 | 599 | def get_page_number_from_page_xml_path(self, page_xml_path, default=1): 600 | """ 601 | ページXMLパスから画像番号を取得 602 | page_xml_path は XHTMLファイルのパスか、画像のパスになる(EPUB形式による) 603 | """ 604 | if page_xml_path in self.xml_path_page_number_dict: 605 | return self.xml_path_page_number_dict[page_xml_path] 606 | else: 607 | return self.xml_path_page_number_dict_basename.get( 608 | os.path.basename(page_xml_path), default 609 | ) 610 | 611 | @cached_property 612 | def navigation_xml(self): 613 | 
""" 614 | :rtype: NavigationXml 615 | """ 616 | return NavigationXml(self) 617 | 618 | @cached_property 619 | def toc_ncx(self): 620 | """ 621 | :rtype: TocNcx 622 | """ 623 | return TocNcx(self) 624 | 625 | @cached_property 626 | def meta(self): 627 | """ 628 | :rtype: EpubMeta 629 | """ 630 | return EpubMeta(self) 631 | 632 | def get_toc_table(self): 633 | """ 634 | 目次情報を取得 635 | """ 636 | try: 637 | if self.toc_ncx.cleaned_toc_ncx_data: 638 | # toc.ncx がパースできたらそれを使う 639 | return self.toc_ncx.cleaned_toc_ncx_data 640 | except TocNcx.TocNcxNotFound: 641 | pass 642 | try: 643 | if self.navigation_xml.cleaned_navigation_xml_data: 644 | # toc.ncx がパースできなければ、navigation-xml から取得を試す 645 | return self.navigation_xml.cleaned_navigation_xml_data 646 | except NavigationXml.NavigationXmlNotFound: 647 | pass 648 | return None 649 | 650 | @staticmethod 651 | def print_json(object): 652 | import json 653 | 654 | print(json.dumps(object, ensure_ascii=False, indent=2)) 655 | 656 | def dump_meta(self): 657 | pass 658 | # self.toc_xml_path 659 | # self.navigation_xml.debug_cleaned_navigation_xml_data() 660 | 661 | # self.toc_ncx.debug_cleaned_toc_ncx_data() 662 | 663 | 664 | class EpubMeta: 665 | def __init__(self, epub_extractor): 666 | self.ee = epub_extractor 667 | self.meta_element = self.ee.metadata_element 668 | 669 | def _get_text_dc(self, tag_name): 670 | tag = self.meta_element.find( 671 | './/{}{}'.format("{http://purl.org/dc/elements/1.1/}", tag_name) 672 | ) 673 | if tag is not None: 674 | return tag.text 675 | else: 676 | return None 677 | 678 | def _get_texts_dc(self, tag_name): 679 | return [ 680 | e.text 681 | for e in self.meta_element.findall( 682 | './/{}{}'.format( 683 | "{http://purl.org/dc/elements/1.1/}", tag_name 684 | ) 685 | ) 686 | ] 687 | 688 | @cached_property 689 | def title(self): 690 | return self._get_text_dc('title') 691 | 692 | @cached_property 693 | def publisher(self): 694 | return self._get_text_dc('publisher') 695 | 696 | @cached_property 697 
| def identifier(self): 698 | return self._get_text_dc('identifier') 699 | 700 | @cached_property 701 | def language(self): 702 | return self._get_text_dc('language') 703 | 704 | @cached_property 705 | def creators(self): 706 | return self._get_texts_dc('creator') 707 | 708 | def as_ordered_dict(self): 709 | return OrderedDict( 710 | [ 711 | ('title', self.title), 712 | ('publisher', self.publisher), 713 | ('identifier', self.identifier), 714 | ('language', self.language), 715 | ('creators', self.creators), 716 | ('meta', self.meta_dict), 717 | ] 718 | ) 719 | 720 | def meta_tags(self): 721 | return self.meta_element.findall( 722 | './/{http://www.idpf.org/2007/opf}meta' 723 | ) 724 | 725 | @cached_property 726 | def meta_dict(self): 727 | od = OrderedDict() 728 | for mt in self.meta_tags(): 729 | if mt.attrib.get('refines'): 730 | # refines 今回は無視 731 | continue 732 | if mt.attrib.get('name') and mt.attrib.get('content'): 733 | od[mt.attrib.get('name')] = mt.attrib.get('content') 734 | continue 735 | if mt.attrib.get('property'): 736 | od[mt.attrib.get('property')] = mt.text 737 | continue 738 | return od 739 | 740 | 741 | class NavigationXml: 742 | """ 743 | NAVIGATION XML (Required BeautifulSoup4) 744 | """ 745 | 746 | class NavigationXmlNotFound(EpubExtractorError): 747 | pass 748 | 749 | def __init__(self, epub_extractor): 750 | self.ee = epub_extractor 751 | 752 | @cached_property 753 | def navigation_xml_path(self): 754 | ntq = namespace_tag_query(self.ee.content_xml_etree._root) 755 | manifest = self.ee.content_xml_etree.find(ntq('manifest')) 756 | items = manifest.findall(ntq('item')) 757 | for item in items: 758 | if ( 759 | item.attrib.get('id') == 'toc' 760 | or item.attrib.get('properties') == 'nav' 761 | ): 762 | return os.path.join( 763 | self.ee.content_base_dir, item.attrib.get('href') 764 | ) 765 | raise self.NavigationXmlNotFound() 766 | 767 | @cached_property 768 | def navigation_xml_etree(self): 769 | return 
parse_xml_with_recover(self.navigation_xml_path) 770 | 771 | @cached_property 772 | def navigation_xml_bs4(self): 773 | from bs4 import BeautifulSoup 774 | 775 | return BeautifulSoup(open(self.navigation_xml_path), "html.parser") 776 | 777 | @cached_property 778 | def navigation_xml_data(self): 779 | def _gen(): 780 | bs = self.navigation_xml_bs4 781 | for a in bs.find_all('a'): 782 | href = a['href'] 783 | page_number = self.ee.get_page_number_from_page_xml_path(href) 784 | yield OrderedDict( 785 | [ 786 | ('page_xml', href), 787 | ('start_page', page_number), 788 | ('section_title', a.text), 789 | ] 790 | ) 791 | 792 | return list(_gen()) 793 | 794 | @cached_property 795 | def cleaned_navigation_xml_data(self): 796 | attended = set() 797 | navs = [] 798 | for o in sorted( 799 | self.navigation_xml_data, key=lambda x: x['start_page'] 800 | ): 801 | if o['start_page'] in attended: 802 | continue 803 | attended.add(o['start_page']) 804 | if navs: 805 | navs[-1]['end_page'] = o['start_page'] - 1 806 | navs.append(o) 807 | if navs: 808 | navs[-1]['end_page'] = self.ee.last_page_number 809 | return navs 810 | 811 | def debug_cleaned_navigation_xml_data(self): 812 | for o in self.cleaned_navigation_xml_data: 813 | print( 814 | '{}-{} {}'.format( 815 | self.ee.format_page_number(o['start_page']), 816 | self.ee.format_page_number(o['end_page']), 817 | o['section_title'], 818 | ) 819 | ) 820 | 821 | 822 | class TocNcx: 823 | """ 824 | TOC NCX 825 | """ 826 | 827 | class TocNcxNotFound(EpubExtractorError): 828 | pass 829 | 830 | def __init__(self, epub_extractor): 831 | self.ee = epub_extractor 832 | 833 | @cached_property 834 | def toc_ncx_etree(self) -> ElementTree: 835 | return parse_xml_with_recover(self.toc_ncx_path) 836 | 837 | @cached_property 838 | def toc_ncx_path(self) -> str: 839 | manifest = self.ee.content_xml_etree.find( 840 | './/{http://www.idpf.org/2007/opf}manifest' 841 | ) 842 | items = manifest.findall('.//{http://www.idpf.org/2007/opf}item') 843 | for 
item in items: 844 | if ( 845 | item.attrib.get('media-type') == 'application/x-dtbncx+xml' 846 | or item.attrib.get('id') == 'ncx' 847 | ): 848 | return os.path.join( 849 | self.ee.content_base_dir, item.attrib.get('href') 850 | ) 851 | raise self.TocNcxNotFound() 852 | 853 | @cached_property 854 | def toc_ncx_data(self) -> List[OrderedDict]: 855 | """ 856 | toc.ncx を解析した辞書 857 | """ 858 | 859 | def _gen(): 860 | ntq = namespace_tag_query(self.toc_ncx_etree._root) 861 | for np in self.toc_ncx_etree.findall(ntq('navPoint')): 862 | text = np.find(ntq('text')) 863 | content = np.find(ntq('content')) 864 | src = content.attrib.get('src') 865 | page_number = self.ee.get_page_number_from_page_xml_path(src) 866 | # play_order = np.attrib.get('playOrder') 867 | yield OrderedDict( 868 | [ 869 | ('page_xml', src), 870 | ('start_page', page_number), 871 | ('section_title', text.text), 872 | ] 873 | ) 874 | 875 | return list(_gen()) 876 | 877 | @cached_property 878 | def cleaned_toc_ncx_data(self): 879 | attended = set() 880 | navs = [] 881 | for o in sorted(self.toc_ncx_data, key=lambda x: x['start_page']): 882 | if o['start_page'] in attended: 883 | continue 884 | attended.add(o['start_page']) 885 | if navs: 886 | navs[-1]['end_page'] = o['start_page'] - 1 887 | navs.append(o) 888 | if navs: 889 | navs[-1]['end_page'] = self.ee.last_page_number 890 | return navs 891 | 892 | def debug_cleaned_toc_ncx_data(self) -> None: 893 | for o in self.cleaned_toc_ncx_data: 894 | print( 895 | '{}-{} {}'.format( 896 | self.ee.format_page_number(o['start_page']), 897 | self.ee.format_page_number(o['end_page']), 898 | o['section_title'], 899 | ) 900 | ) 901 | --------------------------------------------------------------------------------