├── test-epubs ├── .gitignore └── README ├── Makefile ├── epub_extractor ├── __init__.py ├── epub_dump_toc.py ├── epub_dump_meta.py ├── epub_extract_jpeg.py └── epub_extractor.py ├── test-dump-toc.sh ├── test-dump-meta.sh ├── test-extract-jpeg.sh ├── .editorconfig ├── README.rst ├── .gitignore ├── setup.py ├── LICENSE.txt └── .pre-commit-config.yaml /test-epubs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !README 4 | -------------------------------------------------------------------------------- /test-epubs/README: -------------------------------------------------------------------------------- 1 | Put EPUBs this directory and run ./test-extract.sh 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | release: 2 | python3 setup.py sdist 3 | twine upload dist/* 4 | -------------------------------------------------------------------------------- /epub_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | 4 | __author__ = 'torico' 5 | __version__ = '0.4.3' 6 | __license__ = 'BSD' 7 | -------------------------------------------------------------------------------- /test-dump-toc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 | if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_dump_toc.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /test-dump-meta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 
| if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_dump_meta.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /test-extract-jpeg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $(dirname $0) 4 | 5 | for DIR in test-epubs/*; do 6 | if [ -d ${DIR} ]; then 7 | rm -r ${DIR} 8 | fi 9 | done 10 | 11 | for EPUB in test-epubs/*.epub; do 12 | echo ${EPUB} 13 | epub_extractor/epub_extract_jpeg.py ${EPUB} 14 | done 15 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | end_of_line = lf 9 | charset = utf-8 10 | 11 | [*.py] 12 | indent_size = 4 13 | max_line_length = 79 14 | 15 | [*.json] 16 | insert_final_newline = ignore 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ~~~~~~~~~~~~~~ 2 | epub-extractor 3 | ~~~~~~~~~~~~~~ 4 | 5 | * Extract Jpeg images from EPUB file. 6 | 7 | * Dump metadata from EPUB file. 8 | 9 | 10 | Install 11 | ------- 12 | 13 | :: 14 | 15 | $ pip install epub-extractor 16 | 17 | 18 | Requirements 19 | ------------ 20 | 21 | * unzip 22 | 23 | 24 | 25 | epub-extract-jpeg 26 | ----------------- 27 | 28 | :: 29 | 30 | $ epub-extract-jpeg comic.epub 31 | 32 | Extract jpeg images. 
33 | 34 | 35 | 36 | epub-dump-meta 37 | -------------- 38 | :: 39 | 40 | 41 | $ epub-dump-meta comic.epub 42 | 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # General 2 | tmp 3 | temp 4 | #* 5 | .#* 6 | 7 | # Editor 8 | *~ 9 | *.orig 10 | *.swp 11 | 12 | # Python 13 | *.pyc 14 | *.pyo 15 | *.egg-info/ 16 | dist/ 17 | 18 | # Windows 19 | Thumbs.db 20 | 21 | # Eclipse 22 | .project 23 | 24 | # Pydev 25 | .pydevproject 26 | 27 | # Mac 28 | .DS_Store 29 | 30 | # Xcode 31 | build/ 32 | *.pbxuser 33 | !default.pbxuser 34 | *.mode1v3 35 | !default.mode1v3 36 | *.mode2v3 37 | !default.mode2v3 38 | *.perspectivev3 39 | !default.perspectivev3 40 | xcuserdata 41 | *.xccheckout 42 | *.moved-aside 43 | DerivedData 44 | *.hmap 45 | *.ipa 46 | *.xcuserstate 47 | 48 | 49 | # CocoaPod 50 | Pods/* 51 | Podfile.lock 52 | 53 | # JetBrains 54 | .idea 55 | 56 | # if required 57 | # data/* 58 | # config/* 59 | ~* 60 | venv 61 | .venv 62 | 63 | .env 64 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from setuptools import setup 3 | from epub_extractor import __author__, __version__, __license__ 4 | 5 | setup( 6 | name='epub-extractor', 7 | version=__version__, 8 | description='Extract comic EPUB pages to Jpeg files, ' 9 | 'Dump meta information.', 10 | license=__license__, 11 | author=__author__, 12 | author_email='ytyng@live.jp', 13 | url='https://github.com/ytyng/epub-extractor.git', 14 | keywords='comic epub extract jpeg images and meta information.', 15 | packages=['epub_extractor'], 16 | install_requires=[], 17 | entry_points={ 18 | 'console_scripts': [ 19 | 'epub-extract-jpeg = epub_extractor.epub_extract_jpeg:main', 20 | 'epub-dump-meta = epub_extractor.epub_dump_meta:main', 21 | ] 22 | 
}, 23 | ) 24 | -------------------------------------------------------------------------------- /epub_extractor/epub_dump_toc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | EPUB ファイルの TOC を表示 5 | """ 6 | 7 | import argparse 8 | 9 | try: 10 | from .epub_extractor import EpubExtractor 11 | except (ValueError, SystemError, ImportError): 12 | try: 13 | from epub_extractor import EpubExtractor 14 | except (ValueError, SystemError, ImportError): 15 | from epub_extractor.epub_extractor import EpubExtractor 16 | 17 | 18 | def procedure(file_path): 19 | epub_extractor = EpubExtractor(file_path) 20 | toc_table = epub_extractor.get_toc_table() 21 | epub_extractor.close() 22 | return toc_table 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Dump EPUB toc data.') 27 | parser.add_argument( 28 | 'epub_files', 29 | metavar='EPUB-Files', 30 | type=str, 31 | nargs='+', 32 | help='Target Epub Files', 33 | ) 34 | 35 | args = parser.parse_args() 36 | 37 | if len(args.epub_files) > 1: 38 | out = [] 39 | for epub_file in args.epub_files: 40 | out.append(procedure(epub_file)) 41 | else: 42 | out = procedure(args.epub_files[0]) 43 | 44 | EpubExtractor.print_json(out) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /epub_extractor/epub_dump_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | """ 5 | EPUB ファイルの Meta を表示 6 | """ 7 | 8 | import argparse 9 | 10 | try: 11 | from .epub_extractor import EpubExtractor 12 | except (ValueError, SystemError, ImportError): 13 | try: 14 | from epub_extractor import EpubExtractor 15 | except (ValueError, SystemError, ImportError): 16 | from epub_extractor.epub_extractor import EpubExtractor 17 | 18 | 19 | def procedure(file_path): 20 | epub_extractor = 
EpubExtractor(file_path) 21 | meta = epub_extractor.meta 22 | metadata = meta.as_ordered_dict() 23 | epub_extractor.close() 24 | return metadata 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description='Dump EPUB Meta information.') 29 | parser.add_argument( 30 | 'epub_files', 31 | metavar='EPUB-Files', 32 | type=str, 33 | nargs='+', 34 | help='Target Epub Files', 35 | ) 36 | 37 | args = parser.parse_args() 38 | 39 | if len(args.epub_files) > 1: 40 | out = [] 41 | for epub_file in args.epub_files: 42 | out.append(procedure(epub_file)) 43 | else: 44 | out = procedure(args.epub_files[0]) 45 | 46 | EpubExtractor.print_json(out) 47 | 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, @ytyng 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright notice, 9 | this list of conditions and the following disclaimer in the documentation 10 | and/or other materials provided with the distribution. 11 | * Neither the name of the @ytyng nor the names of its contributors 12 | may be used to endorse or promote products derived from this software 13 | without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /epub_extractor/epub_extract_jpeg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | EPUB ファイルを jpeg に展開する 4 | """ 5 | 6 | import argparse 7 | 8 | try: 9 | from .epub_extractor import EpubExtractor 10 | except (ValueError, SystemError, ImportError): 11 | try: 12 | from epub_extractor import EpubExtractor 13 | except (ValueError, SystemError, ImportError): 14 | from epub_extractor.epub_extractor import EpubExtractor 15 | 16 | 17 | def procedure(file_path, convert_png=True, delete_exists_dir=False): 18 | epub_extractor = EpubExtractor(file_path) 19 | epub_extractor.extract_images( 20 | convert_png=convert_png, delete_exists_dir=delete_exists_dir 21 | ) 22 | epub_extractor.close() 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Extract Jpeg files in EPUB') 27 | parser.add_argument( 28 | 'epub_files', 29 | metavar='EPUB-Files', 30 | type=str, 31 | nargs='+', 32 | help='Target Epub Files', 33 | ) 34 | parser.add_argument( 35 | '--no-png-convert', 36 | dest='no_png_convert', 37 | action='store_true', 38 | default=False, 39 | help='No png convert to jpeg', 40 | ) 41 | parser.add_argument( 42 | '--delete-exists-dir', 43 | dest='delete_exists_dir', 44 | action='store_true', 45 | default=False, 46 | help='No png convert to jpeg', 47 | ) 48 | 49 | args = parser.parse_args() 50 | 51 | for 
epub_file in args.epub_files: 52 | procedure( 53 | epub_file, 54 | convert_png=not args.no_png_convert, 55 | delete_exists_dir=args.delete_exists_dir, 56 | ) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.12.0 4 | hooks: 5 | - id: black 6 | language: python 7 | types: [python] 8 | # GitHub actions の black と同じ設定にしている。 .github/workflows/black.yml 9 | args: [ 10 | --line-length, "79", 11 | --target-version, "py310", 12 | --exclude, "migrations", 13 | --skip-string-normalization, # 文字列の正規化をスキップ 14 | --experimental-string-processing, 15 | ] 16 | 17 | - repo: https://github.com/PyCQA/flake8 18 | rev: 6.0.0 19 | hooks: 20 | - id: flake8 21 | # github actions の flake8 と同じ設定にしている。 .github/workflows/flake8.yml 22 | # B: Bugbearによって検出されるエラー 23 | # B950: Bugbearによって検出される、最大行長に対する緩いチェック 24 | # extend-ignore = E203, E501: これは、Flake8が無視するエラーコードを指定しています。 25 | # E203とE501はそれぞれ次のようなエラーコードです。 26 | # E203: コロンの前に空白があるときに発生するエラー。これは、BlackとPEP 8のコーディングスタイルが競合するため、無視されます。 27 | args: [ 28 | --count, 29 | --max-line-length, "88", 30 | --select, "E9,F63,F7,F82,B,B950", 31 | --extend-ignore, "E203", 32 | --statistics, 33 | --show-source, 34 | --exclude, "*/migrations/*", 35 | ] 36 | additional_dependencies: [ flake8-bugbear ] # flake8-bugbear を追加 37 | 38 | - repo: https://github.com/PyCQA/isort 39 | rev: 5.12.0 40 | hooks: 41 | - id: isort 42 | args: [--profile, "black"] # black とコンフリクトさせないよう同じ設定を使用 43 | 44 | - repo: https://github.com/pre-commit/pre-commit-hooks 45 | rev: v4.1.0 46 | hooks: 47 | # https://pre-commit.com/hooks.html にあるもの 48 | - id: check-json # json ファイルの構文チェック 49 | - id: check-toml # toml ファイルの構文チェック 50 | - id: check-yaml # yaml ファイルの構文チェック 51 | - id: debug-statements # デバッグ用の print 文を検出 52 | - id: 
end-of-file-fixer # ファイルの最後に改行を追加 53 | - id: fix-byte-order-marker # BOM を削除 54 | - id: trailing-whitespace # 行末の空白を削除 55 | - id: detect-aws-credentials # AWS の認証情報を検出 56 | - id: detect-private-key # 秘密鍵を検出 57 | -------------------------------------------------------------------------------- /epub_extractor/epub_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import shutil 6 | import subprocess 7 | import sys 8 | import tempfile 9 | import warnings 10 | from abc import ABCMeta, abstractmethod 11 | from collections import OrderedDict 12 | from functools import cached_property 13 | from typing import Callable, Dict, Iterator, List, Optional 14 | from xml.etree import ElementTree 15 | from xml.etree.ElementTree import Element 16 | 17 | 18 | def parse_xml_with_recover(xml_path: str) -> ElementTree: 19 | """ 20 | xmlをパース 21 | & の使い方が悪いファイルがある場合、 22 | それをパースしようとするとエラーになるので、失敗したら文字列置換してリトライする。 23 | http://stackoverflow.com/questions/13046240/parseerror-not-well-formed 24 | -invalid-token-using-celementtree 25 | ここには、lxml の場合の対応方法があるが、python3 のxml ではやり方不明のため 26 | ( ElementTree.XMLParser のコンストラクタには recover 引数が無い)、 27 | 自力で置換する 28 | """ 29 | try: 30 | etree = ElementTree.parse(xml_path) 31 | return etree 32 | except ElementTree.ParseError as e: 33 | # ParseError の場合のみ、修復を試みる 34 | print('{}, {}'.format(e.__class__.__name__, e)) 35 | 36 | xml_source = open(xml_path).read() 37 | # 修復! 38 | xml_source = xml_repair(xml_source) 39 | return ElementTree.fromstring(xml_source) 40 | 41 | 42 | def convert_to_jpeg( 43 | source_file_path: str, 44 | destination_file_path: str, 45 | *, 46 | jpeg_quality: int = 70, 47 | copy: bool = True, 48 | ) -> None: 49 | """ 50 | PNG を Jpeg に変換する 51 | """ 52 | try: 53 | from PIL import Image 54 | except ImportError: 55 | print( 56 | 'PNG image found. 
Converting png to jpeg, require PIL.', 57 | file=sys.stderr, 58 | ) 59 | print( 60 | 'Try: "pip install PIL" or "pip install pillow"', file=sys.stderr 61 | ) 62 | raise 63 | 64 | im = Image.open(source_file_path) 65 | im = im.convert("RGB") 66 | im.save(destination_file_path, 'jpeg', quality=jpeg_quality) 67 | if not copy: 68 | os.remove(source_file_path) 69 | print('{} -> {}'.format(source_file_path, destination_file_path)) 70 | 71 | 72 | re_entity = re.compile(r'(>[^<]*)(&)([^<]*<)') 73 | re_replace = re.compile(r'&(?!\w*?;)') 74 | 75 | 76 | def xml_repair(xml_source: str) -> str: 77 | """ 78 | XMLのソースコードの & を & に変換する 79 | :param self: 80 | :param xml_source: 81 | :return: 82 | """ 83 | 84 | def _replace(matcher): 85 | return re_replace.sub('&', matcher.group(0)) 86 | 87 | return re_entity.sub(_replace, xml_source) 88 | 89 | 90 | def get_etree_namespace(element: Element) -> str: 91 | m = re.match('\{.*\}', element.tag) 92 | return m.group(0) if m else '' 93 | 94 | 95 | def namespace_tag_query(element: Element) -> Callable[[str], str]: 96 | """ 97 | element のネームスペースをバインドし、ネームスペースつきのタグ名を返す関数を返す 98 | """ 99 | ns = get_etree_namespace(element) 100 | 101 | def _generate_query(tag_name): 102 | return './/{}{}'.format(ns, tag_name) 103 | 104 | return _generate_query 105 | 106 | 107 | class ImageElementBase(metaclass=ABCMeta): 108 | class ItemHrefNotFound(Exception): 109 | pass 110 | 111 | @cached_property 112 | @abstractmethod 113 | def image_path(self) -> str: 114 | raise NotImplementedError 115 | 116 | @cached_property 117 | @abstractmethod 118 | def is_png(self) -> bool: 119 | """Is image type PNG""" 120 | raise NotImplementedError 121 | 122 | 123 | class ImageElement(ImageElementBase): 124 | """ 125 | item_element が、image/xxx の場合 (ページごとの XMLが無い場合) 126 | """ 127 | 128 | def __init__( 129 | self, 130 | item_element: Element, 131 | itemref_element: Element, 132 | epub_extractor: 'EpubExtractor', 133 | ): 134 | self.item_element = item_element 135 | self.itemref_element 
= itemref_element 136 | self.epub_extractor = epub_extractor 137 | 138 | @cached_property 139 | def image_path(self) -> str: 140 | item_href = self.item_element.attrib.get('href', None) 141 | if not item_href: 142 | raise self.ItemHrefNotFound(f'{self.item_element}') 143 | return os.path.join(self.epub_extractor.content_base_dir, item_href) 144 | 145 | @cached_property 146 | def is_png(self) -> bool: 147 | return self.item_element.attrib.get('href', '').endswith( 148 | '.png' 149 | ) or self.item_element.attrib.get('media-type', '').endswith('/png') 150 | 151 | 152 | class ImagePage(ImageElementBase): 153 | """ 154 | 画像ページ のクラス 155 | 156 | item_element が、xhtml で、その中に画像が含まれる場合の処理 157 | """ 158 | 159 | class InvalidImageLength(Exception): 160 | pass 161 | 162 | class ImagePathAttrNotFound(Exception): 163 | pass 164 | 165 | def __init__( 166 | self, 167 | item_element: Element, 168 | itemref_element: Element, 169 | epub_extractor: 'EpubExtractor', 170 | ): 171 | self.item_element = item_element 172 | self.itemref_element = itemref_element 173 | self.epub_extractor = epub_extractor 174 | 175 | @cached_property 176 | def page_xhtml_path(self) -> str: 177 | """ 178 | ページのXMLのパス 179 | 例: item/xhtml/001.xhtml 180 | :return: 181 | """ 182 | item_href = self.item_element.attrib.get('href', None) 183 | if not item_href: 184 | raise self.ItemHrefNotFound(self.item_element) 185 | 186 | return os.path.join(self.epub_extractor.content_base_dir, item_href) 187 | 188 | # page_xml_path = os.path.join(self.content_base_dir, item_href) 189 | 190 | @cached_property 191 | def page_xhtml_etree(self) -> ElementTree: 192 | # ページを解析 193 | return parse_xml_with_recover(self.page_xhtml_path) 194 | 195 | @cached_property 196 | def image_element(self) -> Element: 197 | 198 | if self.item_element.attrib.get('properties') == 'svg': 199 | # SVGラッピング 日本のコミックEPUBでよくある形式 200 | svg = self.page_xhtml_etree.find( 201 | './/{http://www.w3.org/2000/svg}svg' 202 | ) 203 | if not svg: 204 | # 極稀に、svg 
タグが存在していない場合がある。 205 | # 代わりに img タグを探す 206 | images = self.page_xhtml_etree.findall( 207 | './/{http://www.w3.org/1999/xhtml}img' 208 | ) 209 | else: 210 | images = svg.findall('.//{http://www.w3.org/2000/svg}image') 211 | # 画像パスの属性は {http://www.w3.org/1999/xlink}href 212 | 213 | else: 214 | # ここ未テスト 215 | images = self.page_xhtml_etree.findall( 216 | './/{http://www.w3.org/1999/xhtml}img' 217 | ) 218 | # 画像パスの属性は src 219 | 220 | if len(images) >= 2: 221 | return self.get_largest_image_element(images) 222 | 223 | if len(images) != 1: 224 | raise self.InvalidImageLength( 225 | '{}, {}'.format(self.item_element, len(images)) 226 | ) 227 | 228 | return images[0] 229 | 230 | @cached_property 231 | def image_path(self) -> str: 232 | """ 233 | 画像のフルパス 234 | :return: 235 | """ 236 | return self.get_image_path_of_image_element(self.image_element) 237 | 238 | def get_largest_image_element(self, image_elements: List[Element]): 239 | """ 240 | 複数の image_element から一番サイズの大きな画像を取得 241 | """ 242 | L = [ 243 | (i, self.get_image_size_of_image_element(i)) 244 | for i in image_elements 245 | ] 246 | return list(sorted(L, key=lambda x: x[1], reverse=True))[0][0] 247 | 248 | # その他プロパティが必要であれば 249 | # self.image_element.attrib.get('width', None) 250 | # self.image_element.attrib.get('height', None) 251 | # self.image_element.attrib.get('width', None) 252 | def get_image_path_of_image_element(self, image_element: Element) -> str: 253 | attr_names = [ 254 | '{http://www.w3.org/1999/xlink}href', 255 | 'src', 256 | '{http://www.w3.org/1999/xlink}src', 257 | ] 258 | for attr_name in attr_names: 259 | val = image_element.attrib.get(attr_name) 260 | if val: 261 | return os.path.join(os.path.dirname(self.page_xhtml_path), val) 262 | raise self.ImagePathAttrNotFound(image_element.attrib) 263 | 264 | def get_image_size_of_image_element(self, image_element: Element) -> int: 265 | """ 266 | 画像のサイズを取得 267 | """ 268 | return os.path.getsize( 269 | self.get_image_path_of_image_element(image_element) 
270 | ) 271 | 272 | @cached_property 273 | def is_png(self) -> bool: 274 | return self.image_path.endswith('.png') 275 | 276 | @cached_property 277 | def item_href(self) -> Optional[str]: 278 | return self.item_element.attrib.get('href', None) 279 | 280 | 281 | class ImageSVGElement(ImagePage): 282 | """ 283 | item_element が、image/svg+xml の場合 284 | """ 285 | 286 | @cached_property 287 | def image_element(self) -> str: 288 | item_href = self.item_element.attrib.get('href', None) 289 | if not item_href: 290 | raise self.ItemHrefNotFound(f'{self.item_element}') 291 | if not item_href.lower().endswith('.svg'): 292 | # SVG ではない画像。普通の画像として扱う。 293 | return os.path.join( 294 | self.epub_extractor.content_base_dir, item_href 295 | ) 296 | # SVG だった。 297 | svg_path = os.path.join( 298 | self.epub_extractor.content_base_dir, item_href 299 | ) 300 | etree = parse_xml_with_recover(svg_path) 301 | # SVG から image を抽出 302 | images = etree.findall('.//{http://www.w3.org/2000/svg}image') 303 | 304 | if len(images) >= 2: 305 | return self.get_largest_image_element(images) 306 | 307 | if len(images) != 1: 308 | raise self.InvalidImageLength( 309 | '{}, {}'.format(self.item_element, len(images)) 310 | ) 311 | 312 | return images[0] 313 | 314 | 315 | class EpubExtractorError(Exception): 316 | pass 317 | 318 | 319 | class EpubExtractor: 320 | class EpubNotFound(EpubExtractorError): 321 | pass 322 | 323 | class NoEpubExtension(EpubExtractorError): 324 | pass 325 | 326 | class ContentXmlNotFound(EpubExtractorError): 327 | pass 328 | 329 | class IdRefNotFound(Exception): 330 | pass 331 | 332 | class ItemNotFound(Exception): 333 | pass 334 | 335 | class ItemHrefNotFound(Exception): 336 | pass 337 | 338 | class OutputDirectoryAlreadyExists(EpubExtractorError): 339 | pass 340 | 341 | def __init__(self, epub_file_path: str): 342 | if not os.path.exists(epub_file_path): 343 | raise self.EpubNotFound(epub_file_path) 344 | 345 | if not epub_file_path.endswith('.epub'): 346 | raise 
self.NoEpubExtension(epub_file_path) 347 | 348 | self.epub_file_path = epub_file_path 349 | self.setup() 350 | 351 | def setup(self) -> None: 352 | self.temp_dir = tempfile.mkdtemp(suffix='epub-dump-meta') 353 | # unzip 354 | subprocess.Popen( 355 | ('unzip', self.epub_file_path, "-d", self.temp_dir), 356 | stdout=subprocess.PIPE, 357 | stderr=subprocess.PIPE, 358 | ).communicate() 359 | 360 | def close(self, *, fail_silently=True) -> None: 361 | try: 362 | shutil.rmtree(self.temp_dir) 363 | except PermissionError as e: 364 | if not fail_silently: 365 | raise 366 | print('{}: {} ({})'.format(e.__class__.__name__, e, self.temp_dir)) 367 | 368 | @cached_property 369 | def content_xml_path(self) -> str: 370 | """ 371 | content.xml (standard.opf) のファイルパスを返す 372 | """ 373 | # META-INF/container.xml で固定 374 | container_xml_path = os.path.join( 375 | self.temp_dir, 'META-INF', 'container.xml' 376 | ) 377 | etree = parse_xml_with_recover(container_xml_path) 378 | # rootfile タグを探す 379 | rootfile_node = etree.find( 380 | ".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile" 381 | ) 382 | content_opf_path = rootfile_node.attrib['full-path'] 383 | 384 | content_xml_path = os.path.join(self.temp_dir, content_opf_path) 385 | if not os.path.exists(content_xml_path): 386 | raise self.ContentXmlNotFound(content_xml_path) 387 | return content_xml_path 388 | 389 | @cached_property 390 | def content_xml_text(self) -> str: 391 | return open(self.content_xml_path).read() 392 | 393 | @cached_property 394 | def content_xml_etree(self) -> ElementTree: 395 | return parse_xml_with_recover(self.content_xml_path) 396 | 397 | @cached_property 398 | def content_base_dir(self) -> str: 399 | # ファイルのパス基準となるディレクトリ 400 | return os.path.dirname(self.content_xml_path) 401 | 402 | @cached_property 403 | def items_dict(self) -> Dict[str, Element]: 404 | """ 405 | id をキーにした item の辞書 406 | """ 407 | ntq = namespace_tag_query(self.content_xml_etree._root) 408 | manifest = 
self.content_xml_etree.find(ntq('manifest')) 409 | items = manifest.findall(ntq('item')) 410 | items_dict = {} # type: Dict[str, Element] 411 | for item in items: 412 | id = item.attrib.get('id') 413 | items_dict[id] = item 414 | return items_dict 415 | 416 | @cached_property 417 | def itemrefs(self) -> Iterator[Element]: 418 | """ 419 | spine > itemref をページ順に返すジェネレータ 420 | """ 421 | ntq = namespace_tag_query(self.content_xml_etree._root) 422 | spine = self.content_xml_etree.find(ntq('spine')) 423 | itemrefs = spine.findall(ntq('itemref')) 424 | for itemref in itemrefs: 425 | yield itemref 426 | 427 | def _get_image_pages(self) -> Iterator[ImageElementBase]: 428 | items_dict = self.items_dict 429 | 430 | for itemref in self.itemrefs: # type: Element 431 | 432 | idref = itemref.attrib.get('idref', None) 433 | if not idref: 434 | raise self.IdRefNotFound(itemref) 435 | 436 | if idref not in items_dict: 437 | raise self.ItemNotFound(idref) 438 | 439 | item = items_dict[idref] # type: Element 440 | 441 | media_type = item.attrib.get('media-type', '') 442 | if media_type.startswith('image/svg'): 443 | # image/svg+xml 等 444 | image_page = ImageSVGElement(item, itemref, self) 445 | elif media_type.startswith('image/'): 446 | # image/jpeg, image/png 447 | image_page = ImageElement(item, itemref, self) 448 | else: 449 | # application/xhtml+xml 等 450 | image_page = ImagePage(item, itemref, self) 451 | yield image_page 452 | 453 | @cached_property 454 | def image_pages(self) -> List[ImageElementBase]: 455 | return list(self._get_image_pages()) 456 | 457 | def format_page_number(self, page_number: str) -> str: 458 | return '{:05d}'.format(page_number) 459 | 460 | def _move_jpeg_file( 461 | self, 462 | image_page: ImageElementBase, 463 | output_dir: str, 464 | page_index: int, 465 | convert_png: bool = True, 466 | # copy: 指定すると、ファイルの移動ではなくコピーをする。 467 | # 1つの画像ファイルが複数箇所で使われれている場合、移動するとエラーになるので 468 | # コピーをしたほうが安全に処理ができる。 469 | # ただし、ストレージ容量が余分に必要で、遅い。 470 | copy: bool = True, 
471 | ): 472 | source_image_path = image_page.image_path 473 | 474 | if image_page.is_png: 475 | if convert_png: 476 | # PNGを変換する場合 477 | destination_image_name = '{}.jpg'.format( 478 | self.format_page_number(page_index) 479 | ) 480 | destination_image_path = os.path.join( 481 | output_dir, destination_image_name 482 | ) 483 | convert_to_jpeg( 484 | source_image_path, destination_image_path, copy=copy 485 | ) 486 | return 487 | destination_image_name = '{}.png'.format( 488 | self.format_page_number(page_index) 489 | ) 490 | else: 491 | destination_image_name = '{}.jpg'.format( 492 | self.format_page_number(page_index) 493 | ) 494 | destination_image_path = os.path.join( 495 | output_dir, destination_image_name 496 | ) 497 | if copy: 498 | shutil.copy(source_image_path, destination_image_path) 499 | else: 500 | shutil.move(source_image_path, destination_image_path) 501 | print('{} -> {}'.format(source_image_path, destination_image_name)) 502 | 503 | def extract_images( 504 | self, 505 | output_dir: Optional[str] = None, 506 | convert_png: bool = True, 507 | delete_exists_dir: bool = False, 508 | copy: bool = True, 509 | fail_silently: bool = True, 510 | ): 511 | """ 512 | 画像ファイルをディレクトリに展開(移動) 513 | """ 514 | if not output_dir: 515 | output_dir, _ext = os.path.splitext(self.epub_file_path) 516 | if os.path.exists(output_dir): 517 | if delete_exists_dir: 518 | try: 519 | shutil.rmtree(output_dir) 520 | except PermissionError as e: 521 | if not fail_silently: 522 | raise 523 | print( 524 | '{}: {} ({})'.format( 525 | e.__class__.__name__, e, output_dir 526 | ) 527 | ) 528 | else: 529 | raise self.OutputDirectoryAlreadyExists(output_dir) 530 | 531 | os.mkdir(output_dir) 532 | 533 | for i, image_page in enumerate(self.image_pages, start=1): 534 | try: 535 | self._move_jpeg_file( 536 | image_page, 537 | output_dir, 538 | i, 539 | convert_png=convert_png, 540 | copy=copy, 541 | ) 542 | except ImagePage.InvalidImageLength as e: 543 | warnings.warn( 544 | '{} 
{}'.format(e.__class__.__name__, e), stacklevel=2 545 | ) 546 | 547 | @cached_property 548 | def metadata_element(self): 549 | """ 550 | コンテンツXML ( standard.opf) 内の、metadata エレメント 551 | """ 552 | ntq = namespace_tag_query(self.content_xml_etree._root) 553 | metadata = self.content_xml_etree.find(ntq('metadata')) 554 | return metadata 555 | 556 | @cached_property 557 | def last_page_number(self) -> int: 558 | return len(self.image_pages) 559 | 560 | def _get_item_href_from_image_page(self, image_page): 561 | """ 562 | ページのリンク先を取得 563 | e.g.: 'xhtml/cover.xhtml' 564 | """ 565 | path = getattr(image_page, 'item_href', None) 566 | if path: 567 | return path 568 | if hasattr(image_page, 'item_element') and hasattr( 569 | image_page.item_element, 'attrib' 570 | ): 571 | path = image_page.item_element.attrib.get('href', None) 572 | if path: 573 | return path 574 | # 未知のパターン。デバッグして調査してください。 575 | raise self.ItemHrefNotFound(image_page) 576 | 577 | @cached_property 578 | def xml_path_page_number_dict(self) -> Dict[str, Element]: 579 | """ 580 | XMLファイルとページ番号の対応表 581 | :return: dict 582 | """ 583 | return { 584 | self._get_item_href_from_image_page(image_page): i 585 | for i, image_page in enumerate(self.image_pages, start=1) 586 | } 587 | 588 | @cached_property 589 | def xml_path_page_number_dict_basename(self): 590 | """ 591 | XMLファイルとページ番号の対応表 ファイル名のみ版 592 | :return: dict 593 | """ 594 | return { 595 | os.path.basename(k): v 596 | for k, v in self.xml_path_page_number_dict.items() 597 | } 598 | 599 | def get_page_number_from_page_xml_path(self, page_xml_path, default=1): 600 | """ 601 | ページXMLパスから画像番号を取得 602 | page_xml_path は XHTMLファイルのパスか、画像のパスになる(EPUB形式による) 603 | """ 604 | if page_xml_path in self.xml_path_page_number_dict: 605 | return self.xml_path_page_number_dict[page_xml_path] 606 | else: 607 | return self.xml_path_page_number_dict_basename.get( 608 | os.path.basename(page_xml_path), default 609 | ) 610 | 611 | @cached_property 612 | def navigation_xml(self): 613 | 
""" 614 | :rtype: NavigationXml 615 | """ 616 | return NavigationXml(self) 617 | 618 | @cached_property 619 | def toc_ncx(self): 620 | """ 621 | :rtype: TocNcx 622 | """ 623 | return TocNcx(self) 624 | 625 | @cached_property 626 | def meta(self): 627 | """ 628 | :rtype: EpubMeta 629 | """ 630 | return EpubMeta(self) 631 | 632 | def get_toc_table(self): 633 | """ 634 | 目次情報を取得 635 | """ 636 | try: 637 | if self.toc_ncx.cleaned_toc_ncx_data: 638 | # toc.ncx がパースできたらそれを使う 639 | return self.toc_ncx.cleaned_toc_ncx_data 640 | except TocNcx.TocNcxNotFound: 641 | pass 642 | try: 643 | if self.navigation_xml.cleaned_navigation_xml_data: 644 | # toc.ncx がパースできなければ、navigation-xml から取得を試す 645 | return self.navigation_xml.cleaned_navigation_xml_data 646 | except NavigationXml.NavigationXmlNotFound: 647 | pass 648 | return None 649 | 650 | @staticmethod 651 | def print_json(object): 652 | import json 653 | 654 | print(json.dumps(object, ensure_ascii=False, indent=2)) 655 | 656 | def dump_meta(self): 657 | pass 658 | # self.toc_xml_path 659 | # self.navigation_xml.debug_cleaned_navigation_xml_data() 660 | 661 | # self.toc_ncx.debug_cleaned_toc_ncx_data() 662 | 663 | 664 | class EpubMeta: 665 | def __init__(self, epub_extractor): 666 | self.ee = epub_extractor 667 | self.meta_element = self.ee.metadata_element 668 | 669 | def _get_text_dc(self, tag_name): 670 | tag = self.meta_element.find( 671 | './/{}{}'.format("{http://purl.org/dc/elements/1.1/}", tag_name) 672 | ) 673 | if tag is not None: 674 | return tag.text 675 | else: 676 | return None 677 | 678 | def _get_texts_dc(self, tag_name): 679 | return [ 680 | e.text 681 | for e in self.meta_element.findall( 682 | './/{}{}'.format( 683 | "{http://purl.org/dc/elements/1.1/}", tag_name 684 | ) 685 | ) 686 | ] 687 | 688 | @cached_property 689 | def title(self): 690 | return self._get_text_dc('title') 691 | 692 | @cached_property 693 | def publisher(self): 694 | return self._get_text_dc('publisher') 695 | 696 | @cached_property 697 
| def identifier(self): 698 | return self._get_text_dc('identifier') 699 | 700 | @cached_property 701 | def language(self): 702 | return self._get_text_dc('language') 703 | 704 | @cached_property 705 | def creators(self): 706 | return self._get_texts_dc('creator') 707 | 708 | def as_ordered_dict(self): 709 | return OrderedDict( 710 | [ 711 | ('title', self.title), 712 | ('publisher', self.publisher), 713 | ('identifier', self.identifier), 714 | ('language', self.language), 715 | ('creators', self.creators), 716 | ('meta', self.meta_dict), 717 | ] 718 | ) 719 | 720 | def meta_tags(self): 721 | return self.meta_element.findall( 722 | './/{http://www.idpf.org/2007/opf}meta' 723 | ) 724 | 725 | @cached_property 726 | def meta_dict(self): 727 | od = OrderedDict() 728 | for mt in self.meta_tags(): 729 | if mt.attrib.get('refines'): 730 | # refines 今回は無視 731 | continue 732 | if mt.attrib.get('name') and mt.attrib.get('content'): 733 | od[mt.attrib.get('name')] = mt.attrib.get('content') 734 | continue 735 | if mt.attrib.get('property'): 736 | od[mt.attrib.get('property')] = mt.text 737 | continue 738 | return od 739 | 740 | 741 | class NavigationXml: 742 | """ 743 | NAVIGATION XML (Required BeautifulSoup4) 744 | """ 745 | 746 | class NavigationXmlNotFound(EpubExtractorError): 747 | pass 748 | 749 | def __init__(self, epub_extractor): 750 | self.ee = epub_extractor 751 | 752 | @cached_property 753 | def navigation_xml_path(self): 754 | ntq = namespace_tag_query(self.ee.content_xml_etree._root) 755 | manifest = self.ee.content_xml_etree.find(ntq('manifest')) 756 | items = manifest.findall(ntq('item')) 757 | for item in items: 758 | if ( 759 | item.attrib.get('id') == 'toc' 760 | or item.attrib.get('properties') == 'nav' 761 | ): 762 | return os.path.join( 763 | self.ee.content_base_dir, item.attrib.get('href') 764 | ) 765 | raise self.NavigationXmlNotFound() 766 | 767 | @cached_property 768 | def navigation_xml_etree(self): 769 | return 
parse_xml_with_recover(self.navigation_xml_path) 770 | 771 | @cached_property 772 | def navigation_xml_bs4(self): 773 | from bs4 import BeautifulSoup 774 | 775 | return BeautifulSoup(open(self.navigation_xml_path), "html.parser") 776 | 777 | @cached_property 778 | def navigation_xml_data(self): 779 | def _gen(): 780 | bs = self.navigation_xml_bs4 781 | for a in bs.find_all('a'): 782 | href = a['href'] 783 | page_number = self.ee.get_page_number_from_page_xml_path(href) 784 | yield OrderedDict( 785 | [ 786 | ('page_xml', href), 787 | ('start_page', page_number), 788 | ('section_title', a.text), 789 | ] 790 | ) 791 | 792 | return list(_gen()) 793 | 794 | @cached_property 795 | def cleaned_navigation_xml_data(self): 796 | attended = set() 797 | navs = [] 798 | for o in sorted( 799 | self.navigation_xml_data, key=lambda x: x['start_page'] 800 | ): 801 | if o['start_page'] in attended: 802 | continue 803 | attended.add(o['start_page']) 804 | if navs: 805 | navs[-1]['end_page'] = o['start_page'] - 1 806 | navs.append(o) 807 | if navs: 808 | navs[-1]['end_page'] = self.ee.last_page_number 809 | return navs 810 | 811 | def debug_cleaned_navigation_xml_data(self): 812 | for o in self.cleaned_navigation_xml_data: 813 | print( 814 | '{}-{} {}'.format( 815 | self.ee.format_page_number(o['start_page']), 816 | self.ee.format_page_number(o['end_page']), 817 | o['section_title'], 818 | ) 819 | ) 820 | 821 | 822 | class TocNcx: 823 | """ 824 | TOC NCX 825 | """ 826 | 827 | class TocNcxNotFound(EpubExtractorError): 828 | pass 829 | 830 | def __init__(self, epub_extractor): 831 | self.ee = epub_extractor 832 | 833 | @cached_property 834 | def toc_ncx_etree(self) -> ElementTree: 835 | return parse_xml_with_recover(self.toc_ncx_path) 836 | 837 | @cached_property 838 | def toc_ncx_path(self) -> str: 839 | manifest = self.ee.content_xml_etree.find( 840 | './/{http://www.idpf.org/2007/opf}manifest' 841 | ) 842 | items = manifest.findall('.//{http://www.idpf.org/2007/opf}item') 843 | for 
item in items: 844 | if ( 845 | item.attrib.get('media-type') == 'application/x-dtbncx+xml' 846 | or item.attrib.get('id') == 'ncx' 847 | ): 848 | return os.path.join( 849 | self.ee.content_base_dir, item.attrib.get('href') 850 | ) 851 | raise self.TocNcxNotFound() 852 | 853 | @cached_property 854 | def toc_ncx_data(self) -> List[OrderedDict]: 855 | """ 856 | toc.ncx を解析した辞書 857 | """ 858 | 859 | def _gen(): 860 | ntq = namespace_tag_query(self.toc_ncx_etree._root) 861 | for np in self.toc_ncx_etree.findall(ntq('navPoint')): 862 | text = np.find(ntq('text')) 863 | content = np.find(ntq('content')) 864 | src = content.attrib.get('src') 865 | page_number = self.ee.get_page_number_from_page_xml_path(src) 866 | # play_order = np.attrib.get('playOrder') 867 | yield OrderedDict( 868 | [ 869 | ('page_xml', src), 870 | ('start_page', page_number), 871 | ('section_title', text.text), 872 | ] 873 | ) 874 | 875 | return list(_gen()) 876 | 877 | @cached_property 878 | def cleaned_toc_ncx_data(self): 879 | attended = set() 880 | navs = [] 881 | for o in sorted(self.toc_ncx_data, key=lambda x: x['start_page']): 882 | if o['start_page'] in attended: 883 | continue 884 | attended.add(o['start_page']) 885 | if navs: 886 | navs[-1]['end_page'] = o['start_page'] - 1 887 | navs.append(o) 888 | if navs: 889 | navs[-1]['end_page'] = self.ee.last_page_number 890 | return navs 891 | 892 | def debug_cleaned_toc_ncx_data(self) -> None: 893 | for o in self.cleaned_toc_ncx_data: 894 | print( 895 | '{}-{} {}'.format( 896 | self.ee.format_page_number(o['start_page']), 897 | self.ee.format_page_number(o['end_page']), 898 | o['section_title'], 899 | ) 900 | ) 901 | --------------------------------------------------------------------------------