├── src └── baca │ ├── utils │ ├── __init__.py │ ├── urls.py │ ├── tempdir.py │ ├── app_resources.py │ ├── keys_parser.py │ ├── user_appdirs.py │ ├── systems.py │ ├── queries.py │ ├── html_parser.py │ └── cli.py │ ├── components │ ├── __init__.py │ ├── events.py │ ├── windows.py │ └── contents.py │ ├── resources │ ├── __init__.py │ ├── config.ini │ └── style.css │ ├── tools │ ├── KindleUnpack │ │ ├── __init__.py │ │ ├── unipath.py │ │ ├── mobi_uncompress.py │ │ ├── mobi_sectioner.py │ │ ├── mobi_pagemap.py │ │ ├── mobi_nav.py │ │ ├── unpack_structure.py │ │ ├── mobi_utils.py │ │ ├── mobi_cover.py │ │ ├── compatibility_utils.py │ │ ├── mobi_ncx.py │ │ ├── mobi_k8resc.py │ │ ├── mobi_index.py │ │ ├── mobi_dict.py │ │ └── mobi_split.py │ └── __init__.py │ ├── __init__.py │ ├── ebooks │ ├── __init__.py │ ├── azw.py │ ├── base.py │ ├── mobi.py │ └── epub.py │ ├── exceptions.py │ ├── __main__.py │ ├── db.py │ ├── config.py │ ├── models.py │ └── app.py ├── .gitignore ├── poetry.toml ├── Makefile ├── tools └── debug.py ├── pyproject.toml ├── tests └── test_html_parser.py └── README.md /src/baca/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/baca/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/baca/resources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | tmp/ 3 | __pycache__/ 4 | *.pyc 5 | .envrc 6 | -------------------------------------------------------------------------------- /poetry.toml: 
def is_url(url: str) -> bool:
    """Return True when *url* carries a scheme (e.g. ``https://``), i.e. looks like a URL."""
    parsed = urlparse(url)
    return bool(parsed.scheme)
class TableDoesNotExist(Exception):
    """Raised when a required database table has not been created yet."""


class BacaException(Exception):
    """Base class for user-facing baca errors.

    Messages are prefixed with ``BacaError:`` so the CLI can print them verbatim.
    """

    def __init__(self, message: str):
        super().__init__(f"BacaError: {message}")


class EbookNotFound(BacaException):
    """Raised when the requested ebook file cannot be located."""


class FormatNotSupported(BacaException):
    """Raised when the ebook's file format is not one baca can open."""


class LaunchingFileError(Exception):
    """Raised when handing a file to an external system launcher fails."""
class Azw(Epub):
    """AZW/AZW3 (Kindle) ebook, handled by unpacking to an EPUB and delegating to Epub.

    KindleUnpack extracts the book into ``<tempdir>/mobi8/`` as an ``.epub``
    archive, which is then opened with the inherited Epub machinery.
    """

    def __init__(self, ebook_path: Path):
        self._path = ebook_path.resolve()
        self._tempdir = create_tempdir()
        # BUG FIX: the previous f"{os.path.splitext(self._path)[0]}.epub" produced an
        # *absolute* path string, and Path.__truediv__ with an absolute operand discards
        # the tempdir prefix entirely -- the epub was then looked up next to the source
        # file instead of inside the unpack directory. Use only the filename stem.
        # NOTE(review): assumes KindleUnpack names the mobi8 output after the source
        # file's stem -- confirm against unpack_structure.py.
        self._tmpepub = self._tempdir / "mobi8" / f"{self._path.stem}.epub"
        with contextlib.redirect_stdout(None):  # KindleUnpack prints progress; silence it
            unpack_kindle_book(str(self._path), str(self._tempdir), epubver="A", use_hd=True)
        self._file = zipfile.ZipFile(self._tmpepub, "r")
async def dispatch_key(maps: list[KeyMap], event: events.Key, *, propagate: bool = True) -> None:
    """Invoke the action bound to *event*'s key, if any mapping matches.

    When several mappings bind the same key, the last one in *maps* wins.
    Actions may be sync or async; an action raising SkipAction is treated
    as a no-op.
    """
    action = None
    for keymap in maps:
        if event.key in keymap.keys:
            action = keymap.action  # no break: later mappings override earlier ones

    if action is not None:
        try:
            result = action()
            if inspect.isawaitable(result):
                await result
        except SkipAction:
            pass

    if propagate:
        # NOTE(review): with propagate=True the event is consumed here
        # (prevent_default stops the base widget, stop() stops the parent);
        # the flag name reads inverted relative to this effect -- confirm intent.
        event.prevent_default()
        event.stop()
DEFAULT_CONFIG = get_resource_file("config.ini")


def retrieve_user_cache_dbfile() -> Path:
    """Return the path of the per-user cache database, creating the cache dir if needed."""
    cachedir = Path(appdirs.user_cache_dir(__appname__))
    # parents/exist_ok avoid the check-then-create race the old isdir/makedirs pair had
    cachedir.mkdir(parents=True, exist_ok=True)
    return cachedir / f"{__appname__}.db"


def retrieve_user_config_file() -> Path:
    """Return the user's config file path, seeding it from the bundled default on first run."""
    configdir = Path(appdirs.user_config_dir(appname=__appname__))
    configdir.mkdir(parents=True, exist_ok=True)

    configfile = configdir / "config.ini"
    if not configfile.is_file():
        # copy the packaged default config verbatim
        configfile.write_text(DEFAULT_CONFIG.read_text(encoding="utf-8"), encoding="utf-8")

    return configfile
async def launch_file(path: Path | str, preferred: str = LAUNCHERS[0]) -> None:
    """Open *path* with the platform's default application.

    :param path: file to open
    :param preferred: launcher binary to try first on Linux/BSD
    :raises LaunchingFileError: if no launcher is found or the launcher exits nonzero
    """
    if platform.system() == "Windows":
        # os.startfile is Windows-only; run it off the event loop thread
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, os.startfile, path)  # type: ignore
        return

    if platform.system() == "Darwin":
        launcher = "open"
    else:
        try:
            launcher = next(l for l in [preferred, *LAUNCHERS] if shutil.which(l) is not None)
        except StopIteration:
            raise LaunchingFileError("System launcher not found.") from None

    proc = await asyncio.create_subprocess_exec(launcher, str(path), stderr=asyncio.subprocess.PIPE)
    # BUG FIX: the original awaited proc.wait() before communicate(); with a PIPE
    # attached that can deadlock when the child fills the stderr buffer.
    # communicate() drains the pipe while waiting for exit.
    _, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise LaunchingFileError(stderr.decode())
class Ebook:
    """Abstract interface that every concrete ebook format (Epub, Mobi, Azw) implements."""

    def __init__(self, ebook_path: Path):
        raise NotImplementedError()

    def get_tempdir(self) -> Path:
        """Return the directory holding this book's extracted working files."""
        raise NotImplementedError()

    def get_path(self) -> Path:
        """Return the path of the source ebook file."""
        raise NotImplementedError()

    def get_raw_text(self, content: str | ET.Element) -> str:
        """Return the raw HTML/text of one content item of the book."""
        raise NotImplementedError()

    def get_img_bytestr(self, image_id: str) -> tuple[str, bytes]:
        """Return ``(filename, raw bytes)`` for the image identified by *image_id*."""
        raise NotImplementedError()

    def cleanup(self) -> None:
        """Delete the temporary working directory and everything inside it."""
        shutil.rmtree(self.get_tempdir())

    # TODO: maybe cache @lru_cache
    def get_toc(self) -> tuple[TocEntry, ...]:
        """Return the book's table of contents in reading order."""
        raise NotImplementedError()

    def iter_parsed_contents(self) -> Iterator[Segment]:
        """Yield the book's content as parsed Segment objects, in reading order."""
        raise NotImplementedError()

    def get_meta(self) -> BookMetadata:
        """Return the book's metadata (title, author, etc.)."""
        raise NotImplementedError("Ebook.get_meta() not implemented")
ToggleLightDark = c 36 | ScrollDown = down,j 37 | ScrollUp = up,k 38 | PageDown = ctrl+f,pagedown,l,space 39 | PageUp = ctrl+b,pageup,h 40 | Home = home,g 41 | End = end,G 42 | OpenToc = tab 43 | OpenMetadata = M 44 | OpenHelp = f1 45 | SearchForward = slash 46 | SearchBackward = question_mark 47 | NextMatch = n 48 | PreviousMatch = N 49 | Confirm = enter 50 | CloseOrQuit = q,escape 51 | Screenshot = f12 52 | -------------------------------------------------------------------------------- /src/baca/db.py: -------------------------------------------------------------------------------- 1 | """ 2 | NOTE: on using peewee for non-integer primary_key 3 | 4 | ```python 5 | # This works because .create() will specify `force_insert=True`. 6 | obj1 = UUIDModel.create(id=uuid.uuid4()) 7 | 8 | # This will not work, however. Peewee will attempt to do an update: 9 | obj2 = UUIDModel(id=uuid.uuid4()) 10 | obj2.save() # WRONG 11 | 12 | obj2.save(force_insert=True) # CORRECT 13 | 14 | # Once the object has been created, you can call save() normally. 
def initial_migration() -> None:
    """Migration 0: create the base tables."""
    db.create_tables([DbMetadata, ReadingHistory])


# All known migrations; applied in ascending `version` order by migrate().
MIGRATIONS: list[Migration] = [
    Migration(version=0, migrate=initial_migration),
]


def migrate() -> None:
    """Apply every migration not yet recorded in the metadata table.

    Each applied version is recorded via DbMetadata so it is skipped on
    subsequent runs. The connection is always closed, even on failure.
    """
    db.connect()
    try:
        for migration in sorted(MIGRATIONS, key=lambda x: x.version):
            try:
                # A missing metadata table means nothing has ever run;
                # get_by_id raises DoesNotExist for unapplied versions.
                if not DbMetadata.table_exists():
                    raise TableDoesNotExist()
                DbMetadata.get_by_id(migration.version)
            except (DbMetadata.DoesNotExist, TableDoesNotExist):
                migration.migrate()
                DbMetadata.create(version=migration.version)
    finally:
        db.close()
def get_nth_file_from_history(nth: int) -> Path | None:
    """Return the path of the *nth* most recently read ebook (1-based), or None if out of range."""
    try:
        return Path(list(get_all_reading_history())[nth - 1].filepath)  # type: ignore
    except IndexError:
        return None


def get_last_read_ebook() -> Path | None:
    """Return the most recently read ebook's path.

    Returns None when there is no reading history at all, or when the
    recorded file no longer exists on disk.
    """
    try:
        last_read_ebook = ReadingHistory.select().order_by(ReadingHistory.last_read.desc()).get()  # type: ignore
        last_read_ebook = Path(last_read_ebook.filepath)
        return last_read_ebook if last_read_ebook.is_file() else None
    except ReadingHistory.DoesNotExist:
        return None

The Dormouse's 8 | story

9 | 10 |

Once upon a time there were three little sisters; and their names were 11 | Elsie, 12 | Lacie and 13 | and they lived at the bottom of a well.

14 | Girl in a jacket 15 | 16 |

...

17 | """ 18 | 19 | 20 | def test_html_splitters(): 21 | segments_iterator = split_html_to_segments(HTML_TEST, "test.html") 22 | 23 | segment = next(segments_iterator) 24 | assert segment.type == SegmentType.BODY 25 | assert ( 26 | segment.content 27 | == '

The Dormouse\'s story

Once upon a time there were three little sisters; and their names were ' 28 | ) 29 | assert segment.nav_point == "test.html" 30 | 31 | segment = next(segments_iterator) 32 | assert segment.type == SegmentType.BODY 33 | assert segment.content == 'Elsie, ' 34 | assert segment.nav_point == "test.html#link1" 35 | 36 | segment = next(segments_iterator) 37 | assert segment.type == SegmentType.BODY 38 | assert ( 39 | segment.content 40 | == 'Lacie and and they lived at the bottom of a well.

' 41 | ) 42 | assert segment.nav_point == "test.html#link2" 43 | 44 | segment = next(segments_iterator) 45 | assert segment.type == SegmentType.IMAGE 46 | assert segment.content == "img_girl.jpg" 47 | assert segment.nav_point is None 48 | 49 | segment = next(segments_iterator) 50 | assert segment.type == SegmentType.BODY 51 | assert ( 52 | segment.content 53 | == 'Girl in a jacket

...

' 54 | ) 55 | assert segment.nav_point is None 56 | -------------------------------------------------------------------------------- /src/baca/utils/html_parser.py: -------------------------------------------------------------------------------- 1 | from typing import Iterator 2 | from urllib.parse import urljoin 3 | 4 | from bs4 import BeautifulSoup 5 | from markdownify import MarkdownConverter as _MarkdownConverter 6 | 7 | from baca.models import Segment, SegmentType 8 | 9 | 10 | class MarkdownConverter(_MarkdownConverter): 11 | def convert_img(self, el, text, convert_as_inline): 12 | return "" 13 | 14 | 15 | def split_html_to_segments( 16 | html_src: str, section_name: str, *, ids_to_find: list[str] | None = None 17 | ) -> Iterator[Segment]: 18 | """ 19 | :param ids_to_find: 20 | ids_to_find is url fragment (eg. https://url.com/content.html#fragment) to find inside given `html_src` 21 | 22 | - if None will find all possible id(s) 23 | - if [] then, will skip finding id(s) section in html_src 24 | """ 25 | soup = BeautifulSoup(html_src, "html.parser", store_line_numbers=True) 26 | body = soup.find("body") 27 | body_html = str(body).replace("\n", " ") 28 | body = BeautifulSoup(body_html, "html.parser") 29 | 30 | find_nav_points = ids_to_find is None or len(ids_to_find) > 0 31 | if find_nav_points: 32 | section_elems = body.find_all(id=True if ids_to_find is None else ids_to_find) 33 | else: 34 | section_elems = [] 35 | img_elems = body.find_all(["img", "image"]) 36 | all_elems = sorted(section_elems + img_elems, key=lambda x: [x.sourceline, x.sourcepos]) # type: ignore 37 | 38 | start = 0 39 | nav_point = section_name 40 | for elem in all_elems: 41 | yield Segment(type=SegmentType.BODY, content=body_html[start : elem.sourcepos], nav_point=nav_point) # type: ignore 42 | 43 | start = elem.sourcepos 44 | fragment = elem.get("id") 45 | nav_point = f"{section_name}#{fragment}" if find_nav_points and fragment is not None else None 46 | 47 | if elem.name in {"img", 
def parse_html_to_segmented_md(
    html_src: str, section_name: str, *, ids_to_find: list[str] | None = None
) -> Iterator[Segment]:
    """Like split_html_to_segments(), but with BODY segments rendered to markdown.

    IMAGE segments pass through unchanged; nav points are preserved.
    """
    for seg in split_html_to_segments(html_src, section_name, ids_to_find=ids_to_find):
        if seg.type == SegmentType.BODY:
            rendered = MarkdownConverter().convert(seg.content)
        else:
            rendered = seg.content
        yield Segment(type=seg.type, content=rendered, nav_point=seg.nav_point)
    def get_img_bytestr(self, impath: str) -> tuple[str, bytes]:
        """Return ``(impath, raw image bytes)`` for an image referenced by the book.

        *impath* is interpreted relative to the OPF root directory.
        """
        # TODO: test on windows, maybe urljoin?
        # if impath "Images/asdf.png" is problematic
        image_abspath = self._root_dirpath / impath
        image_abspath = os.path.normpath(image_abspath)  # handle crossplatform path
        with open(image_abspath, "rb") as f:
            src = f.read()
        return impath, src
accent=str(get_value("Color Dark", "Accent")), 39 | ), 40 | light=Color( 41 | bg=str(get_value("Color Light", "Background")), 42 | fg=str(get_value("Color Light", "Foreground")), 43 | accent=str(get_value("Color Light", "Accent")), 44 | ), 45 | keymaps=Keymaps( 46 | toggle_dark=parse_keymaps(str(get_value("Keymaps", "ToggleLightDark"))), 47 | scroll_down=parse_keymaps(str(get_value("Keymaps", "ScrollDown"))), 48 | scroll_up=parse_keymaps(str(get_value("Keymaps", "ScrollUp"))), 49 | page_up=parse_keymaps(str(get_value("Keymaps", "PageUp"))), 50 | page_down=parse_keymaps(str(get_value("Keymaps", "PageDown"))), 51 | home=parse_keymaps(str(get_value("Keymaps", "Home"))), 52 | end=parse_keymaps(str(get_value("Keymaps", "End"))), 53 | open_toc=parse_keymaps(str(get_value("Keymaps", "OpenToc"))), 54 | open_metadata=parse_keymaps(str(get_value("Keymaps", "OpenMetadata"))), 55 | open_help=parse_keymaps(str(get_value("Keymaps", "OpenHelp"))), 56 | search_forward=parse_keymaps(str(get_value("Keymaps", "SearchForward"))), 57 | search_backward=parse_keymaps(str(get_value("Keymaps", "SearchBackward"))), 58 | next_match=parse_keymaps(str(get_value("Keymaps", "NextMatch"))), 59 | prev_match=parse_keymaps(str(get_value("Keymaps", "PreviousMatch"))), 60 | confirm=parse_keymaps(str(get_value("Keymaps", "Confirm"))), 61 | close=parse_keymaps(str(get_value("Keymaps", "CloseOrQuit"))), 62 | screenshot=parse_keymaps(str(get_value("Keymaps", "Screenshot"))), 63 | ), 64 | ) 65 | -------------------------------------------------------------------------------- /src/baca/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from datetime import datetime 3 | from enum import Enum 4 | from typing import Callable, Literal 5 | 6 | from peewee import ( 7 | CharField, 8 | DateTimeField, 9 | FloatField, 10 | IntegerField, 11 | Model, 12 | SqliteDatabase, 13 | ) 14 | 15 | from baca.utils.user_appdirs import 
retrieve_user_cache_dbfile 16 | 17 | db = SqliteDatabase(retrieve_user_cache_dbfile()) 18 | 19 | 20 | class BaseModel(Model): 21 | class Meta: 22 | database = db 23 | 24 | 25 | class DbMetadata(BaseModel): 26 | version = IntegerField(primary_key=True) 27 | migrated_at = DateTimeField(default=datetime.now) 28 | 29 | class Meta: 30 | table_name = "metadata" 31 | 32 | 33 | class ReadingHistory(BaseModel): 34 | filepath = CharField(primary_key=True) 35 | title = CharField(null=True) 36 | author = CharField(null=True) 37 | reading_progress = FloatField(null=False) 38 | last_read = DateTimeField(default=datetime.now, null=False) 39 | 40 | class Meta: 41 | table_name = "reading_history" 42 | 43 | 44 | @dataclass(frozen=True) 45 | class Migration: 46 | version: int 47 | migrate: Callable[[], None] 48 | 49 | 50 | class SegmentType(Enum): 51 | IMAGE = "image" 52 | BODY = "body" 53 | 54 | 55 | @dataclass(frozen=True) 56 | class Color: 57 | bg: str 58 | fg: str 59 | accent: str 60 | 61 | 62 | @dataclass(frozen=True) 63 | class Keymaps: 64 | toggle_dark: list[str] 65 | scroll_down: list[str] 66 | scroll_up: list[str] 67 | home: list[str] 68 | end: list[str] 69 | page_up: list[str] 70 | page_down: list[str] 71 | open_toc: list[str] 72 | open_metadata: list[str] 73 | open_help: list[str] 74 | search_forward: list[str] 75 | search_backward: list[str] 76 | next_match: list[str] 77 | prev_match: list[str] 78 | confirm: list[str] 79 | close: list[str] 80 | screenshot: list[str] 81 | 82 | 83 | @dataclass(frozen=True) 84 | class Config: 85 | preferred_image_viewer: str 86 | max_text_width: str 87 | text_justification: Literal["default", "center", "full", "right", "left"] 88 | pretty: bool 89 | page_scroll_duration: float 90 | show_image_as_ansi: bool 91 | dark: Color 92 | light: Color 93 | keymaps: Keymaps 94 | 95 | 96 | @dataclass(frozen=True) 97 | class BookMetadata: 98 | title: str | None = None 99 | creator: str | None = None 100 | description: str | None = None 101 | publisher: 
str | None = None 102 | date: str | None = None 103 | language: str | None = None 104 | format: str | None = None 105 | identifier: str | None = None 106 | source: str | None = None 107 | 108 | 109 | @dataclass(frozen=True) 110 | class TocEntry: 111 | label: str 112 | value: str 113 | 114 | 115 | @dataclass(frozen=True) 116 | class Segment: 117 | type: SegmentType 118 | content: str 119 | nav_point: str | None 120 | 121 | 122 | @dataclass(frozen=True) 123 | class KeyMap: 124 | keys: list[str] 125 | action: Callable 126 | 127 | 128 | @dataclass(frozen=True) 129 | class Coordinate: 130 | x: int 131 | y: int 132 | 133 | 134 | @dataclass(frozen=True) 135 | class SearchMode: 136 | pattern_str: str 137 | current_coord: Coordinate 138 | forward: bool = True 139 | saved_position: float = 0.0 140 | -------------------------------------------------------------------------------- /src/baca/tools/KindleUnpack/unipath.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay 6 | # All rights reserved. 7 | # 8 | # Redistribution and use in source and binary forms, with or without modification, 9 | # are permitted provided that the following conditions are met: 10 | # 11 | # 1. Redistributions of source code must retain the above copyright notice, this list of 12 | # conditions and the following disclaimer. 13 | # 14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list 15 | # of conditions and the following disclaimer in the documentation and/or other materials 16 | # provided with the distribution. 
fsencoding = sys.getfilesystemencoding()

def pathof(s, enc=fsencoding):
    """Return *s* as a unicode (str) path.

    ``None`` passes through unchanged; ``str`` is returned as-is; ``bytes``
    is decoded with *enc* (the filesystem encoding by default).  On decode
    failure -- or for any other type -- *s* is returned unchanged, matching
    the original best-effort contract.

    Note: the package requires Python >= 3.10, so the PY2 text_type /
    binary_type indirection is dead code; str/bytes are used directly.
    """
    if s is None:
        return None
    if isinstance(s, str):
        return s
    if isinstance(s, bytes):
        try:
            return s.decode(enc)
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit.  Only decode errors and unknown-codec names are
        # legitimate "fall through and return the bytes" cases.
        except (UnicodeDecodeError, LookupError):
            pass
    return s
os.mkdir(pathof(s)) 67 | 68 | def listdir(s): 69 | rv = [] 70 | for file in os.listdir(pathof(s)): 71 | rv.append(pathof(file)) 72 | return rv 73 | 74 | def getcwd(): 75 | if PY2: 76 | return os.getcwdu() 77 | return os.getcwd() 78 | 79 | def walk(top): 80 | top = pathof(top) 81 | rv = [] 82 | for base, dnames, names in os.walk(top): 83 | base = pathof(base) 84 | for name in names: 85 | name = pathof(name) 86 | rv.append(relpath(os.path.join(base, name), top)) 87 | return rv 88 | 89 | def relpath(path, start=None): 90 | return os.path.relpath(pathof(path) , pathof(start)) 91 | 92 | def abspath(path): 93 | return os.path.abspath(pathof(path)) 94 | -------------------------------------------------------------------------------- /src/baca/resources/style.css: -------------------------------------------------------------------------------- 1 | /* global */ 2 | .-dark-mode { 3 | background: $dark-bg; 4 | color: $dark-fg; 5 | } 6 | 7 | .-light-mode { 8 | background: $light-bg; 9 | color: $light-fg; 10 | } 11 | 12 | .-dark-mode * { 13 | scrollbar-color: $dark-accent; 14 | scrollbar-background: $dark-bg; 15 | } 16 | 17 | .-light-mode * { 18 | scrollbar-color: $light-accent; 19 | scrollbar-background: $light-bg; 20 | } 21 | 22 | Screen { 23 | align: center middle; 24 | height: auto; 25 | scrollbar-size: 1 1; 26 | layers: content search windows; 27 | } 28 | 29 | .-dark-mode Screen { 30 | background: $dark-bg; 31 | } 32 | 33 | .-light-mode Screen { 34 | background: $light-bg; 35 | } 36 | 37 | LoadingIndicator { 38 | layer: windows; 39 | } 40 | 41 | 42 | /* contents */ 43 | 44 | Table { 45 | /* NOTE: height & width important so table will overflow Metadata */ 46 | /* instead of its ScrollView parent widget */ 47 | height: auto; 48 | width: auto; 49 | } 50 | 51 | SegmentWidget { 52 | height: auto; 53 | } 54 | 55 | .-dark-mode Image { 56 | border: solid $dark-fg; 57 | } 58 | 59 | .-light-mode Image { 60 | border: solid $light-fg; 61 | } 62 | 63 | .-dark-mode Image:hover { 64 | 
border: double $dark-accent; 65 | color: $dark-accent; 66 | } 67 | 68 | .-light-mode Image:hover { 69 | border: double $light-accent; 70 | color: $light-accent; 71 | } 72 | 73 | Section { 74 | /* NOTE: this works but causing wrong index in initial toc */ 75 | /* ie. when saved in the top of chapter 7, it's restored as chapter 6 in TOC */ 76 | /* height: 0; */ 77 | 78 | /* NOTE: this works but look a little bit weird */ 79 | opacity: 0%; 80 | 81 | /* NOTE: this doesn't work */ 82 | /* visibility: hidden; */ 83 | /* display: none; */ 84 | } 85 | 86 | SearchMatch { 87 | layer: search; 88 | height: 1; 89 | width: auto; 90 | text-style: bold; 91 | } 92 | 93 | .-dark-mode SearchMatch { 94 | background: $dark-accent; 95 | } 96 | 97 | .-light-mode SearchMatch { 98 | background: $light-accent; 99 | } 100 | 101 | Content * { 102 | text-align: $text-justification; 103 | } 104 | 105 | Content { 106 | layout: vertical; 107 | height: auto; 108 | layer: content; 109 | max-width: $text-max-width; 110 | margin: 0 2; 111 | } 112 | 113 | Markdown { 114 | margin: 0 0; 115 | } 116 | 117 | /* windows */ 118 | 119 | SearchInputPrompt { 120 | layer: windows; 121 | dock: bottom; 122 | border-title-align: left; 123 | } 124 | 125 | .-dark-mode SearchInputPrompt { 126 | background: $dark-bg; 127 | color: $dark-fg; 128 | border: solid $dark-accent; 129 | } 130 | 131 | .-light-mode SearchInputPrompt { 132 | background: $light-bg; 133 | color: $light-fg; 134 | border: solid $light-accent; 135 | } 136 | 137 | Window { 138 | dock: top; 139 | layer: windows; 140 | padding: 1 4; 141 | scrollbar-size: 1 1; 142 | overflow-y: auto; 143 | border-title-align: center; 144 | /* NOTE: set this in Window.on_mount() */ 145 | /* so it will be responsive to screen size */ 146 | /* margin: 3 10; */ 147 | } 148 | 149 | .-dark-mode Window { 150 | border: double $dark-accent; 151 | } 152 | 153 | .-light-mode Window { 154 | border: double $light-accent; 155 | } 156 | 157 | DictDisplay { 158 | align: center top; 159 
def format_file_size(pathstr: str) -> str:
    """Return the size of the file at *pathstr* as a human-readable string.

    Sizes below one mebibyte are reported in "kb", larger ones in "mb",
    both rounded to two decimal places.
    """
    byte_size = Path(pathstr).stat().st_size
    # Use < (not <=): a file of exactly 1 MiB should read "1.0 mb",
    # not "1024.0 kb" as the previous boundary produced.
    if byte_size < 1024**2:
        return f"{round(byte_size / 1024, 2)} kb"
    return f"{round(byte_size / (1024 ** 2), 2)} mb"
def parse_cli_args() -> argparse.Namespace:
    """Build and run the command-line argument parser.

    Recognised options:
      -r / --history   print reading history and exit (handled by caller)
      -v / --version   print version and exit (handled by argparse)
      ebook            zero or more positional tokens: a path, a history
                       number, or words of a title/author pattern
    """
    prog = __appname__
    # Shared between the usage line and the positional arg metavar.
    positional_arg_help_str = "[PATH | # | PATTERN ]"
    args_parser = argparse.ArgumentParser(
        prog=prog,
        usage=f"%(prog)s [-h] [-r] [-v] {positional_arg_help_str}",
        # Raw formatter so the hand-aligned epilog below is not reflowed.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="TUI Ebook Reader",
        epilog=textwrap.dedent(
            f"""\
            examples:
              {prog} /path/to/ebook    read /path/to/ebook file
              {prog} 3                 read #3 file from reading history
              {prog} count monte       read file matching 'count monte'
                                       from reading history
            """
        ),
    )
    args_parser.add_argument("-r", "--history", action="store_true", help="print reading history")
    args_parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"v{__version__}",
        help="print version and exit",
    )
    # nargs="*" so multi-word patterns (e.g. `baca alice wonder`) arrive
    # as a list and can be re-joined by the caller.
    args_parser.add_argument(
        "ebook",
        action="store",
        nargs="*",
        metavar=positional_arg_help_str,
        help="ebook path, history number or pattern",
    )
    return args_parser.parse_args()
def get_ebook_class(ebook_path: Path) -> Type[Ebook]:
    """Resolve which reader class handles *ebook_path*'s file extension.

    Raises:
        FormatNotSupported: when the extension is not a supported format.
    """
    suffix_to_class: dict[str, Type[Ebook]] = {
        ".epub": Epub,
        ".epub3": Epub,
        ".azw": Azw,
        ".azw3": Azw,
        ".mobi": Mobi,
    }
    suffix = ebook_path.suffix.lower()
    try:
        return suffix_to_class[suffix]
    except KeyError:
        raise FormatNotSupported("format not supported!")
elif (c >= 192): 39 | o += b' ' + bchr(c ^ 128) 40 | else: 41 | if p < len(i): 42 | c = (c << 8) | ord(i[p:p+1]) 43 | p += 1 44 | m = (c >> 3) & 0x07ff 45 | n = (c & 7) + 3 46 | if (m > n): 47 | o += o[-m:n-m] 48 | else: 49 | for _ in range(n): 50 | # because of completely ass-backwards decision by python mainters for python 3 51 | # we must use slice for bytes as i[p] returns int while slice returns character 52 | if m == 1: 53 | o += o[-m:] 54 | else: 55 | o += o[-m:-m+1] 56 | return o 57 | 58 | class HuffcdicReader: 59 | q = struct.Struct(b'>Q').unpack_from 60 | 61 | def loadHuff(self, huff): 62 | if huff[0:8] != b'HUFF\x00\x00\x00\x18': 63 | raise unpackException('invalid huff header') 64 | off1, off2 = struct.unpack_from(b'>LL', huff, 8) 65 | 66 | def dict1_unpack(v): 67 | codelen, term, maxcode = v&0x1f, v&0x80, v>>8 68 | assert codelen != 0 69 | if codelen <= 8: 70 | assert term 71 | maxcode = ((maxcode + 1) << (32 - codelen)) - 1 72 | return (codelen, term, maxcode) 73 | self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)) 74 | 75 | dict2 = struct.unpack_from(b'>64L', huff, off2) 76 | self.mincode, self.maxcode = (), () 77 | for codelen, mincode in enumerate((0,) + dict2[0::2]): 78 | self.mincode += (mincode << (32 - codelen), ) 79 | for codelen, maxcode in enumerate((0,) + dict2[1::2]): 80 | self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, ) 81 | 82 | self.dictionary = [] 83 | 84 | def loadCdic(self, cdic): 85 | if cdic[0:8] != b'CDIC\x00\x00\x00\x10': 86 | raise unpackException('invalid cdic header') 87 | phrases, bits = struct.unpack_from(b'>LL', cdic, 8) 88 | n = min(1<H').unpack_from 90 | def getslice(off): 91 | blen, = h(cdic, 16+off) 92 | slice = cdic[18+off:18+off+(blen&0x7fff)] 93 | return (slice, blen&0x8000) 94 | self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16)) 95 | 96 | def unpack(self, data): 97 | q = HuffcdicReader.q 98 | 99 | bitsleft = len(data) * 8 100 | data += 
b"\x00\x00\x00\x00\x00\x00\x00\x00" 101 | pos = 0 102 | x, = q(data, pos) 103 | n = 32 104 | 105 | s = b'' 106 | while True: 107 | if n <= 0: 108 | pos += 4 109 | x, = q(data, pos) 110 | n += 32 111 | code = (x >> n) & ((1 << 32) - 1) 112 | 113 | codelen, term, maxcode = self.dict1[code >> 24] 114 | if not term: 115 | while code < self.mincode[codelen]: 116 | codelen += 1 117 | maxcode = self.maxcode[codelen] 118 | 119 | n -= codelen 120 | bitsleft -= codelen 121 | if bitsleft < 0: 122 | break 123 | 124 | r = (maxcode - code) >> (32 - codelen) 125 | slice, flag = self.dictionary[r] 126 | if not flag: 127 | self.dictionary[r] = None 128 | slice = self.unpack(slice) 129 | self.dictionary[r] = (slice, 1) 130 | s += slice 131 | return s 132 | -------------------------------------------------------------------------------- /src/baca/tools/KindleUnpack/mobi_sectioner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, hexlify, bstr, bord, bchar 8 | 9 | import datetime 10 | 11 | if PY2: 12 | range = xrange 13 | 14 | # note: struct pack, unpack, unpack_from all require bytestring format 15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 16 | import struct 17 | 18 | from .unipath import pathof 19 | 20 | DUMP = False 21 | """ Set to True to dump all possible information. """ 22 | 23 | class unpackException(Exception): 24 | pass 25 | 26 | 27 | def describe(data): 28 | txtans = '' 29 | hexans = hexlify(data) 30 | for i in data: 31 | if bord(i) < 32 or bord(i) > 127: 32 | txtans += '?' 
def datetimefrompalmtime(palmtime):
    """Convert a PalmOS timestamp into a naive ``datetime``.

    Values with the high bit set (above 0x7FFFFFFF) are treated as seconds
    since the Palm epoch (1904-01-01); smaller values as seconds since the
    Unix epoch (1970-01-01).
    """
    epoch_year = 1904 if palmtime > 0x7FFFFFFF else 1970
    epoch = datetime.datetime(year=epoch_year, month=1, day=1)
    return epoch + datetime.timedelta(seconds=palmtime)
78 | print("Palm Database Header") 79 | print("Database name: " + repr(self.palmheader[:32])) 80 | dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32) 81 | print("Bitfield attributes: 0x%0X" % dbattributes,) 82 | if dbattributes != 0: 83 | print(" (",) 84 | if (dbattributes & 2): 85 | print("Read-only; ",) 86 | if (dbattributes & 4): 87 | print("Dirty AppInfoArea; ",) 88 | if (dbattributes & 8): 89 | print("Needs to be backed up; ",) 90 | if (dbattributes & 16): 91 | print("OK to install over newer; ",) 92 | if (dbattributes & 32): 93 | print("Reset after installation; ",) 94 | if (dbattributes & 64): 95 | print("No copying by PalmPilot beaming; ",) 96 | print(")") 97 | else: 98 | print("") 99 | print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0]) 100 | dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36) 101 | print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation)) 102 | dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40) 103 | print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification)) 104 | dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44) 105 | if dbbackup != 0: 106 | print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup)) 107 | print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0]) 108 | print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0]) 109 | print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0]) 110 | print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68]))) 111 | print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0]) 112 | expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72) 113 | if expectedzero != 0: 114 | print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0]) 115 | print("Number of sections: 
%d" % struct.unpack_from(b'>H', self.palmheader, 76)[0]) 116 | return 117 | 118 | def loadSection(self, section): 119 | before, after = self.sectionoffsets[section:section+2] 120 | return self.data[before:after] 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `baca`: TUI E-book Reader 2 | 3 | ![baca_screenshots](https://github.com/wustho/baca/assets/43810055/82d5beb0-d061-4e4c-82ed-a3bd84074d2f) 4 | 5 | Meet `baca`, [epy](https://github.com/wustho/epy)'s lovely sister who lets you indulge 6 | in your favorite e-books in the comfort of your terminal. 7 | But with a sleek and contemporary appearance that's sure to captivate you! 8 | 9 | ## Features 10 | 11 | - Formats supported: Epub, Epub3, Mobi & Azw 12 | - Remembers last reading position 13 | - Show images as ANSI image & you can click it for more detail 14 | - Scroll animations 15 | - Clean & modern looks 16 | - Text justification 17 | - Dark & light color scheme 18 | - Regex search 19 | - Hyperlinks 20 | 21 | ## Requirements 22 | 23 | - `python>=3.10` 24 | 25 | ## Installation 26 | 27 | - Via pip: `pip install baca` 28 | - Via git: `pip install git+https://github.com/wustho/baca` 29 | - Via AUR: `yay -S baca-ereader-git` 30 | 31 | ## Usage 32 | 33 | ```sh 34 | # to read an ebook 35 | baca path/to/your/ebook.epub 36 | 37 | # to read your last read ebook, just run baca without any argument 38 | baca 39 | 40 | # to see your reading history use -r as an argument 41 | baca -r 42 | 43 | # say you want to read an ebook from your reading history, 44 | # but you forgot the path to your ebook 45 | # just type any words you remember about your ebook 46 | # and baca will try to match it to path or title+author 47 | baca doc ebook.epub 48 | baca alice wonder lewis carroll 49 | ``` 50 | 51 | ## Opening an Image 52 | 53 | To open an image, when you encounter an ANSI image (when 
`ShowImageAsANSI=yes`) or something like this
# either show image as an ANSI image
# or text 'IMAGE' as a placeholder
# (showing ANSI images will affect
# performance & resource usage)
152 | ``` 153 | 154 | 155 | Additionally, `baca` may struggle to locate certain phrases due to adjustments made for text justification. 156 | See the example above, `"see_it"` may become `"see__it"` due to adjusted spacing between words. 157 | In this case, it may be more effective to use a regex search for `"see +it"` or simply search for the word `"see"` alone. 158 | 159 | Overall, `baca`'s search feature is most effective for locating individual words 160 | rather than phrases that may be split across multiple lines or impacted by text justification. 161 | 162 | - Compared to [epy](https://github.com/wustho/epy), currently `baca` has some missing features. 163 | But these are planned to be implemented to `baca` in the near future: 164 | 165 | - [ ] **TODO** Bookmarks 166 | - [ ] **TODO** FictionBook support 167 | - [ ] **TODO** URL reading support 168 | 169 | ## Credits 170 | 171 | - Thanks to awesome [Textual Project](https://github.com/Textualize/textual) 172 | - [Kindle Unpack](https://github.com/kevinhendricks/KindleUnpack) 173 | - And many others! 
# Subtractive roman-numeral alphabet, largest value first.
_TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]

def int_to_roman(i):
    """Render a non-negative integer as a lowercase roman-numeral string."""
    remainder = i
    pieces = []
    for letter, value in _TABLE:
        count, remainder = divmod(remainder, value)
        pieces.append(letter * count)
    return ''.join(pieces)

def roman_to_int(s):
    """Parse a lowercase roman-numeral string back into an integer.

    Inverse of int_to_roman for well-formed numerals; greedy, so it
    consumes the largest matching prefix symbol at each step.
    """
    total = 0
    rest = s
    for letter, value in _TABLE:
        while rest.startswith(letter):
            total += value
            rest = rest[len(letter):]
    return total
nametype == 'a' or nametype == 'r': 59 | svalue = int(svalue) 60 | spos = int(spos) 61 | for i in range(spos - 1, numpages): 62 | if nametype == 'r': 63 | pname = int_to_roman(svalue) 64 | svalue += 1 65 | elif nametype == 'a': 66 | pname = "%s" % svalue 67 | svalue += 1 68 | elif nametype == 'c': 69 | sp = svalue.find('|') 70 | if sp == -1: 71 | pname = svalue 72 | else: 73 | pname = svalue[0:sp] 74 | svalue = svalue[sp+1:] 75 | else: 76 | print("Error: unknown page numbering type", nametype) 77 | pagenames[i] = pname 78 | return pagenames, pageMap 79 | 80 | 81 | class PageMapProcessor: 82 | 83 | def __init__(self, mh, data): 84 | self.mh = mh 85 | self.data = data 86 | self.pagenames = [] 87 | self.pageoffsets = [] 88 | self.pageMap = '' 89 | self.pm_len = 0 90 | self.pm_nn = 0 91 | self.pn_bits = 0 92 | self.pmoff = None 93 | self.pmstr = '' 94 | print("Extracting Page Map Information") 95 | rev_len, = struct.unpack_from(b'>L', self.data, 0x10) 96 | # skip over header, revision string length data, and revision string 97 | ptr = 0x14 + rev_len 98 | pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from(b'>4H', self.data, ptr) 99 | # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) 100 | self.pmstr = self.data[ptr+8:ptr+8+self.pm_len] 101 | self.pmoff = self.data[ptr+8+self.pm_len:] 102 | offsize = b">L" 103 | offwidth = 4 104 | if self.pm_bits == 16: 105 | offsize = b">H" 106 | offwidth = 2 107 | ptr = 0 108 | for i in range(self.pm_nn): 109 | od, = struct.unpack_from(offsize, self.pmoff, ptr) 110 | ptr += offwidth 111 | self.pageoffsets.append(od) 112 | self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) 113 | 114 | def getPageMap(self): 115 | return self.pageMap 116 | 117 | def getNames(self): 118 | return self.pagenames 119 | 120 | def getOffsets(self): 121 | return self.pageoffsets 122 | 123 | # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file 124 | def generateKF8PageMapXML(self, k8proc): 
125 | pagemapxml = '\n' 126 | for i in range(len(self.pagenames)): 127 | pos = self.pageoffsets[i] 128 | name = self.pagenames[i] 129 | if name is not None and name != "": 130 | [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) 131 | idtext = unicode_str(k8proc.getPageIDTag(pos)) 132 | linktgt = unicode_str(filename) 133 | if idtext != '': 134 | linktgt += '#' + idtext 135 | pagemapxml += '\n' % (name, dir, linktgt) 136 | pagemapxml += "\n" 137 | return pagemapxml 138 | 139 | def generateAPNX(self, apnx_meta): 140 | if apnx_meta['format'] == 'MOBI_8': 141 | content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta 142 | else: 143 | content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta 144 | content_header = content_header.encode('utf-8') 145 | page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta 146 | page_header = page_header.encode('utf-8') 147 | apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1) 148 | apnx += struct.pack(b'>I', 12 + len(content_header)) 149 | apnx += struct.pack(b'>I', len(content_header)) 150 | apnx += content_header 151 | apnx += struct.pack(b'>H', 1) 152 | apnx += struct.pack(b'>H', len(page_header)) 153 | apnx += struct.pack(b'>H', self.pm_nn) 154 | apnx += struct.pack(b'>H', 32) 155 | apnx += page_header 156 | for page in self.pageoffsets: 157 | apnx += struct.pack(b'>L', page) 158 | return apnx 159 | -------------------------------------------------------------------------------- /src/baca/components/windows.py: -------------------------------------------------------------------------------- 1 | from textual import events 2 | from textual.app import ComposeResult 3 | from textual.message import Message 4 | from textual.reactive import reactive 5 | from textual.widget import Widget 6 | from textual.widgets import Input, 
Static 7 | 8 | from baca.components.contents import Table 9 | from baca.components.events import FollowThis, Screenshot, SearchSubmitted 10 | from baca.models import Config, KeyMap, TocEntry 11 | from baca.utils.keys_parser import dispatch_key 12 | 13 | 14 | class SearchInputPrompt(Input): 15 | can_focus = True 16 | 17 | def __init__(self, forward: bool): 18 | super().__init__() 19 | self.forward = forward 20 | self.border_title = f"Search {'Forward' if forward else 'Backward'}" 21 | 22 | def on_mount(self): 23 | self.focus() 24 | 25 | async def on_key(self, event: events.Key) -> None: 26 | keymaps = [ 27 | KeyMap(["backspace", "ctrl+h"], self.action_delete_left), 28 | KeyMap(["home", "ctrl+a"], self.action_home), 29 | KeyMap(["end", "ctrl+e"], self.action_end), 30 | KeyMap(["left"], self.action_cursor_left), 31 | KeyMap(["right"], self.action_cursor_right), 32 | KeyMap(["ctrl+w"], self.action_delete_left_word), 33 | KeyMap(["delete"], self.action_delete_right), 34 | KeyMap(["enter"], self.action_submit), 35 | KeyMap(["escape"], self.action_close), 36 | ] 37 | 38 | if event.key not in set(k for keymap in keymaps for k in keymap.keys): 39 | await super().on_key(event) 40 | event.stop() 41 | event.prevent_default() 42 | else: 43 | await dispatch_key(keymaps, event) 44 | 45 | def action_submit(self) -> None: 46 | self.post_message(SearchSubmitted(value=self.value, forward=self.forward)) 47 | self.action_close() 48 | 49 | def action_close(self) -> None: 50 | self.call_after_refresh(self.remove) 51 | 52 | 53 | class Window(Widget): 54 | can_focus = True 55 | 56 | def __init__(self, config: Config, id: str | None = None): 57 | super().__init__(**(dict() if id is None else dict(id=id))) 58 | self.config = config 59 | keymaps = self.config.keymaps 60 | self.keymaps = [ 61 | KeyMap(keymaps.close, self.action_close), 62 | KeyMap(keymaps.scroll_down, self.action_scroll_down), 63 | KeyMap(keymaps.scroll_up, self.action_scroll_up), 64 | KeyMap(keymaps.page_down, 
class Alert(Window):
    """Modal window showing a single alert message (closed via Window keymaps)."""

    border_title = "❗"

    def __init__(self, config: Config, message: str):
        super().__init__(config)
        # Plain text rendered in the window body.
        self.message = message

    def compose(self) -> ComposeResult:
        yield Static(self.message)

    # NOTE: self.render() is low level API
    # so, this won't be any auto scroll-overflow
    # use self.compose() instead
    # def render(self):
class ToC(Window):
    """Table-of-contents overlay; keys move a highlighted NavPoint entry."""

    border_title = "Table of Contents"
    index = reactive(0)  # index of the currently-highlighted entry

    def __init__(self, config: Config, entries: list[TocEntry], initial_index: int = 0):
        super().__init__(config)
        self.entries = entries
        self.entry_widgets = [NavPoint(n, entry.label) for n, entry in enumerate(self.entries)]
        keymaps = config.keymaps
        # Replaces Window's default keymaps with selection movement + confirm.
        self.keymaps = [
            KeyMap(keymaps.close + config.keymaps.open_toc, self.action_close),
            KeyMap(keymaps.scroll_down, lambda: self.action_select_next(1)),
            KeyMap(keymaps.scroll_up, lambda: self.action_select_next(-1)),
            KeyMap(keymaps.home, lambda: self.action_select_index(0)),
            KeyMap(keymaps.end, lambda: self.action_select_index(-1)),
            KeyMap(keymaps.confirm, self.follow_nav_point),
            KeyMap(keymaps.screenshot, lambda: self.post_message(Screenshot())),
        ]
        self.index = initial_index

    def on_focus(self) -> None:
        # NOTE: by default when a widget gaining focus, in this case ToC
        # it will reset the scrolling position of this widget which will hide selected NavPoint
        # So, either assign new value for selected navpoint or run watch_selected_value()
        self.watch_index(self.index, self.index)

    def action_select_next(self, n: int) -> None:
        # Move the highlight by n entries, wrapping at both ends.
        self.index = (self.index + n) % len(self.entries)

    def action_select_index(self, n: int) -> None:
        # n may be negative (-1 highlights the last entry via list indexing).
        self.index = n

    def compose(self) -> ComposeResult:
        yield from self.entry_widgets

    def watch_index(self, old: int, new: int) -> None:
        # Move the "selected" CSS class to the new entry and scroll it into view.
        [entry_widget.remove_class("selected") for entry_widget in self.entry_widgets]
        selected = self.entry_widgets[new]
        selected.add_class("selected")
        self.scroll_to_widget(selected, top=False)

    def on_nav_point_selected(self, message: NavPoint.Selected) -> None:
        # Mouse hover over an entry moves the highlight.
        self.index = message.index
        message.stop()

    def on_nav_point_clicked(self, message: NavPoint.Clicked) -> None:
        # Mouse click follows the entry immediately.
        self.follow_nav_point()
        message.stop()

    def follow_nav_point(self) -> None:
        # Ask the app to navigate to the highlighted TOC entry's target.
        self.post_message(FollowThis(self.entries[self.index].value))
""" 25 | 26 | class NAVProcessor(object): 27 | 28 | def __init__(self, files): 29 | self.files = files 30 | self.navname = NAVIGATION_FINENAME 31 | 32 | def buildLandmarks(self, guidetext): 33 | header = '' 34 | header += ' \n' 41 | 42 | type_map = { 43 | 'cover' : 'cover', 44 | 'title-page' : 'title-page', 45 | # ?: 'frontmatter', 46 | 'text' : 'bodymatter', 47 | # ?: 'backmatter', 48 | 'toc' : 'toc', 49 | 'loi' : 'loi', 50 | 'lot' : 'lot', 51 | 'preface' : 'preface', 52 | 'bibliography' : 'bibliography', 53 | 'index' : 'index', 54 | 'glossary' : 'glossary', 55 | 'acknowledgements' : 'acknowledgements', 56 | 'colophon' : None, 57 | 'copyright-page' : None, 58 | 'dedication' : None, 59 | 'epigraph' : None, 60 | 'foreword' : None, 61 | 'notes' : None 62 | } 63 | 64 | re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I) 65 | re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I) 66 | re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I) 67 | dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/') 68 | 69 | data = '' 70 | references = re.findall(r'', unicode_str(guidetext), re.I) 71 | for reference in references: 72 | mo_type = re_type.search(reference) 73 | mo_title = re_title.search(reference) 74 | mo_link = re_link.search(reference) 75 | if mo_type is not None: 76 | type_ = type_map.get(mo_type.group(1), None) 77 | else: 78 | type_ = None 79 | if mo_title is not None: 80 | title = mo_title.group(1) 81 | else: 82 | title = None 83 | if mo_link is not None: 84 | link = mo_link.group(1) 85 | else: 86 | link = None 87 | 88 | if type_ is not None and title is not None and link is not None: 89 | link = os.path.relpath(link, dir_).replace('\\', '/') 90 | data += element.format(type_, link, title) 91 | if len(data) > 0: 92 | return header + data + footer 93 | else: 94 | return '' 95 | 96 | def buildTOC(self, indx_data): 97 | header = '' 98 | header += ' \n' 101 | 102 | # recursive part 103 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 
104 | if start>len(indx_data) or end>len(indx_data): 105 | print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data)) 106 | return '' 107 | if DEBUG_NAV: 108 | print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end)) 109 | xhtml = '' 110 | if start <= 0: 111 | start = 0 112 | if end <= 0: 113 | end = len(indx_data) 114 | if lvl > max_lvl: 115 | max_lvl = lvl 116 | 117 | indent1 = ' ' * (2 + lvl * 2) 118 | indent2 = ' ' * (3 + lvl * 2) 119 | xhtml += indent1 + '
    \n' 120 | for i in range(start, end): 121 | e = indx_data[i] 122 | htmlfile = e['filename'] 123 | desttag = e['idtag'] 124 | text = e['text'] 125 | if not e['hlvl'] == lvl: 126 | continue 127 | num += 1 128 | if desttag == '': 129 | link = htmlfile 130 | else: 131 | link = '{:s}#{:s}'.format(htmlfile, desttag) 132 | xhtml += indent2 + '
  1. ' 133 | entry = '{:s}'.format(link, text) 134 | xhtml += entry 135 | # recurs 136 | if e['child1'] >= 0: 137 | xhtml += '\n' 138 | xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 139 | e['child1'], e['childn'] + 1) 140 | xhtml += xhtmlrec 141 | xhtml += indent2 142 | # close entry 143 | xhtml += '
  2. \n' 144 | xhtml += indent1 + '
\n' 145 | return xhtml, max_lvl, num 146 | 147 | data, max_lvl, num = recursINDX() 148 | if not len(indx_data) == num: 149 | print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num) 150 | return header + data + footer 151 | 152 | def buildNAV(self, ncx_data, guidetext, title, lang): 153 | print("Building Navigation Document.") 154 | if FORCE_DEFAULT_TITLE: 155 | title = DEFAULT_TITLE 156 | nav_header = '' 157 | nav_header += '\n' 158 | nav_header += '\n' 147 | encryption += ' \n' 148 | encryption += ' \n' 149 | encryption += '\n' 150 | fileout = os.path.join(self.k8metainf,'encryption.xml') 151 | with open(pathof(fileout),'wb') as f: 152 | f.write(encryption.encode('utf-8')) 153 | 154 | # ready to build epub 155 | self.outzip = zipfile.ZipFile(pathof(bname), 'w') 156 | 157 | # add the mimetype file uncompressed 158 | mimetype = b'application/epub+zip' 159 | fileout = os.path.join(self.k8dir,'mimetype') 160 | with open(pathof(fileout),'wb') as f: 161 | f.write(mimetype) 162 | nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED) 163 | nzinfo.external_attr = 0o600 << 16 # make this a normal file 164 | self.outzip.writestr(nzinfo, mimetype) 165 | self.zipUpDir(self.outzip,self.k8dir,'META-INF') 166 | self.zipUpDir(self.outzip,self.k8dir,'OEBPS') 167 | self.outzip.close() 168 | -------------------------------------------------------------------------------- /src/baca/ebooks/epub.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import zipfile 5 | import zlib 6 | from pathlib import Path 7 | from typing import Callable, Iterator 8 | from urllib.parse import unquote, urljoin, urlparse 9 | 10 | from baca.ebooks.base import Ebook 11 | from baca.models import BookMetadata, Segment, TocEntry 12 | from baca.utils.html_parser import parse_html_to_segmented_md 13 | from baca.utils.tempdir import create_tempdir 14 | 15 | 16 | 
class Epub(Ebook):
    """EPUB 2/3 implementation of the `Ebook` interface.

    The archive is opened once in the constructor; the container/OPF
    metadata, spine and TOC are parsed lazily via properties, and content
    documents are converted to markdown segments on demand.
    """

    # XML namespaces used by the container, OPF, NCX and XHTML documents.
    NAMESPACE = {
        "DAISY": "http://www.daisy.org/z3986/2005/ncx/",
        "OPF": "http://www.idpf.org/2007/opf",
        "CONT": "urn:oasis:names:tc:opendocument:xmlns:container",
        "XHTML": "http://www.w3.org/1999/xhtml",
        "EPUB": "http://www.idpf.org/2007/ops",
        # Dublin Core
        "DC": "http://purl.org/dc/elements/1.1/",
    }

    def __init__(self, ebook_path: Path):
        self._path = ebook_path.resolve()
        # NOTE(review): the ZipFile stays open for the object's lifetime and
        # is never explicitly closed — confirm callers rely on that.
        self._file: zipfile.ZipFile = zipfile.ZipFile(ebook_path, "r")
        self._tempdir = create_tempdir()

    @staticmethod
    def _parse_content_opf(
        content_opf: ET.ElementTree, root_dirpath: str, *, path_resolver: Callable = urljoin
    ) -> tuple[str, ...]:
        """Return spine-ordered content-document paths from the OPF package.

        The TOC documents themselves are excluded (EPUB2 NCX by media-type,
        EPUB3 nav document by its "nav" property); the remaining hrefs are
        URL-unquoted and resolved against *root_dirpath*.
        """
        manifests: list[tuple[str, str]] = []
        for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE):
            if (
                manifest_elem.get("media-type") != "application/x-dtbncx+xml"
                and manifest_elem.get("properties") != "nav"
            ):
                manifest_id = manifest_elem.get("id")
                manifest_href = manifest_elem.get("href")
                manifests.append((manifest_id, manifest_href))  # type: ignore

        spines: list[str] = [
            spine_elem.get("idref")  # type: ignore
            for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE)
        ]

        contents: list[str] = []
        for spine in spines:
            for manifest in manifests:
                if spine == manifest[0]:
                    contents.append(unquote(manifest[1]))
                    # manifest ids are unique, so drop the matched item and
                    # stop scanning for this spine entry
                    manifests.remove(manifest)
                    break

        return tuple(path_resolver(root_dirpath, content) for content in contents)

    @staticmethod
    def _parse_toc(
        toc: ET.Element, version: str, root_dirpath, *, path_resolver: Callable = urljoin
    ) -> tuple[TocEntry, ...]:
        """Parse the TOC document (NCX for EPUB 1/2, nav XHTML for EPUB 3)
        into TocEntry tuples whose values are resolved content paths.

        Raises NotImplementedError for unknown EPUB versions.
        """
        if version in {"1.0", "2.0"}:
            navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE)
        elif version == "3.0":
            navPoints = toc.findall("XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE)
        else:
            raise NotImplementedError(f"Unsupported Epub version: {version}")

        toc_entries: list[TocEntry] = []
        for navPoint in navPoints:
            if version in {"1.0", "2.0"}:
                # NCX: target is <content src=...>, label is <navLabel><text>.
                src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE)
                src = src_elem.get("src")  # type: ignore

                name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE)
                name = name_elem.text  # type: ignore
            elif version == "3.0":
                # nav doc: the anchor itself carries the href; its visible
                # text (possibly nested in spans) is the label.
                src_elem = navPoint
                src = src_elem.get("href")

                name = "".join(list(navPoint.itertext()))
            else:
                raise NotImplementedError(f"Unsupported Epub version: {version}")

            # Entries without a label are silently dropped.
            if name is not None:
                toc_entries.append(
                    TocEntry(
                        label=name,
                        value=path_resolver(root_dirpath, unquote(src)),  # type: ignore
                    )
                )
        return tuple(toc_entries)

    @property
    def _root_filepath(self) -> str:
        # Path of the OPF package document, per META-INF/container.xml.
        container = ET.parse(self._file.open("META-INF/container.xml"))
        rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE)
        return rootfile_elem.attrib["full-path"]  # type: ignore

    @property
    def _root_dirpath(self) -> str:
        # Directory of the OPF file with a trailing slash ("" for archive root),
        # suitable as a urljoin() base.
        dirname = os.path.dirname(self._root_filepath)
        return f"{dirname}/" if dirname != "" else ""

    @property
    def _content_opf(self) -> ET.ElementTree:
        return ET.parse(self._file.open(self._root_filepath))

    @property
    def _relative_toc_ncx_path(self) -> str:
        """Href of the TOC document, relative to the OPF directory.

        (Renamed from the misspelled `_relactive_toc_ncx_path`; the name is
        private and only referenced inside this class.)
        """
        if self._version in {"1.0", "2.0"}:
            # "OPF:manifest/*[@id='ncx']"
            relative_toc = self._content_opf.find(
                "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE
            )
        elif self._version == "3.0":
            relative_toc = self._content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE)
        else:
            raise NotImplementedError(f"Unsupported Epub version: {self._version}")

        return relative_toc.get("href")  # type: ignore

    @property
    def _toc_ncx(self) -> ET.Element:
        toc_ncx_path = urljoin(self._root_dirpath, self._relative_toc_ncx_path)
        return ET.parse(self._file.open(toc_ncx_path)).getroot()

    @property
    def _version(self) -> str:
        # "version" attribute of the <package> root element ("2.0", "3.0", ...).
        return self._content_opf.getroot().get("version")  # type: ignore

    def _get_contents(self) -> tuple[str, ...] | tuple[ET.Element, ...]:
        return Epub._parse_content_opf(self._content_opf, self._root_dirpath)

    def get_path(self) -> Path:
        return self._path

    def get_tempdir(self) -> Path:
        return self._tempdir

    def get_meta(self) -> BookMetadata:
        """Collect Dublin Core metadata fields matching BookMetadata's fields."""
        metadata: dict[str, str | None] = {}
        for field in dataclasses.fields(BookMetadata):
            element = self._content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE)
            if element is not None:
                metadata[field.name] = element.text
        return BookMetadata(**metadata)

    def get_toc(self) -> tuple[TocEntry, ...]:
        return Epub._parse_toc(self._toc_ncx, self._version, self._root_dirpath)

    def get_raw_text(self, content_path: str | ET.Element) -> str:
        """Read a content document from the archive, decoded as UTF-8.

        Retries on zlib decompression errors
        ("invalid distance too far back" — seems to be caused by
        multiprocessing); with max_tries = None it retries indefinitely.
        """
        assert isinstance(content_path, str)

        max_tries: int | None = None

        tries = 0
        while True:
            try:
                # close the member handle promptly instead of leaking it
                with self._file.open(content_path) as member:
                    content = member.read()
                break
            except zlib.error as e:
                tries += 1
                if max_tries is not None and tries >= max_tries:
                    raise e

        return content.decode("utf-8")

    def get_img_bytestr(self, impath: str) -> tuple[str, bytes]:
        """Return (basename, raw bytes) for an image stored in the archive."""
        assert isinstance(self._file, zipfile.ZipFile)
        unquoted_impath = unquote(impath)
        return os.path.basename(unquoted_impath), self._file.read(unquoted_impath)

    def iter_parsed_contents(self) -> Iterator[Segment]:
        """Yield markdown Segments for every spine document, in reading order.

        TOC fragment ids pointing into each document are forwarded to the
        HTML parser so anchors survive the conversion.
        """
        toc_entries = self.get_toc()
        for content in self._get_contents():
            ids_for_this_content = [
                urlparse(t.value).fragment
                for t in toc_entries
                if t.value.startswith(content) and urlparse(t.value).fragment != ""
            ]
            raw = self.get_raw_text(content)
            for segment in parse_html_to_segmented_md(raw, str(content), ids_to_find=ids_for_this_content):
                yield segment
(Qatar), Arabic 26 | # (Saudi Arabia), Arabic (Syria), Arabic (Tunisia), Arabic (United Arab 27 | # Emirates), Arabic (Yemen) 28 | 43 : {0 : 'hy'}, # Armenian 29 | 77 : {0 : 'as'}, # Assamese 30 | 44 : {0 : 'az'}, # "Azeri (IANA: Azerbaijani) 31 | 45 : {0 : 'eu'}, # Basque 32 | 35 : {0 : 'be'}, # Belarusian 33 | 69 : {0 : 'bn'}, # Bengali 34 | 2 : {0 : 'bg'}, # Bulgarian 35 | 3 : {0 : 'ca'}, # Catalan 36 | 4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'}, 37 | # Chinese, Chinese (Hong Kong), Chinese (PRC), Chinese (Singapore), Chinese (Taiwan) 38 | 26 : {0 : 'hr', 3 : 'sr'}, # Croatian, Serbian 39 | 5 : {0 : 'cs'}, # Czech 40 | 6 : {0 : 'da'}, # Danish 41 | 19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'}, # Dutch / Flemish, Dutch (Belgium) 42 | 9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' , 43 | 7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'}, 44 | # English, English (Australia), English (Belize), English (Canada), 45 | # English (Ireland), English (Jamaica), English (New Zealand), English 46 | # (Philippines), English (South Africa), English (Trinidad), English 47 | # (United Kingdom), English (United States), English (Zimbabwe) 48 | 37 : {0 : 'et'}, # Estonian 49 | 56 : {0 : 'fo'}, # Faroese 50 | 41 : {0 : 'fa'}, # Farsi / Persian 51 | 11 : {0 : 'fi'}, # Finnish 52 | 12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'}, 53 | # French, French (Belgium), French (Canada), French (Luxembourg), French (Monaco), French (Switzerland) 54 | 55 : {0 : 'ka'}, # Georgian 55 | 7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'}, 56 | # German, German (Austria), German (Liechtenstein), German (Luxembourg), German (Switzerland) 57 | 8 : {0 : 'el'}, # Greek, Modern (1453-) 58 | 71 : {0 : 'gu'}, # Gujarati 59 | 13 : {0 : 'he'}, # Hebrew (also code 'iw'?) 
60 | 57 : {0 : 'hi'}, # Hindi 61 | 14 : {0 : 'hu'}, # Hungarian 62 | 15 : {0 : 'is'}, # Icelandic 63 | 33 : {0 : 'id'}, # Indonesian 64 | 16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'}, # Italian, Italian (Switzerland) 65 | 17 : {0 : 'ja'}, # Japanese 66 | 75 : {0 : 'kn'}, # Kannada 67 | 63 : {0 : 'kk'}, # Kazakh 68 | 87 : {0 : 'x-kok'}, # Konkani (real language code is 'kok'?) 69 | 18 : {0 : 'ko'}, # Korean 70 | 38 : {0 : 'lv'}, # Latvian 71 | 39 : {0 : 'lt'}, # Lithuanian 72 | 47 : {0 : 'mk'}, # Macedonian 73 | 62 : {0 : 'ms'}, # Malay 74 | 76 : {0 : 'ml'}, # Malayalam 75 | 58 : {0 : 'mt'}, # Maltese 76 | 78 : {0 : 'mr'}, # Marathi 77 | 97 : {0 : 'ne'}, # Nepali 78 | 20 : {0 : 'no'}, # Norwegian 79 | 72 : {0 : 'or'}, # Oriya 80 | 21 : {0 : 'pl'}, # Polish 81 | 22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'}, # Portuguese, Portuguese (Brazil) 82 | 70 : {0 : 'pa'}, # Punjabi 83 | 23 : {0 : 'rm'}, # "Rhaeto-Romanic" (IANA: Romansh) 84 | 24 : {0 : 'ro'}, # Romanian 85 | 25 : {0 : 'ru'}, # Russian 86 | 59 : {0 : 'sz'}, # "Sami (Lappish)" (not an IANA language code) 87 | # IANA code for "Northern Sami" is 'se' 88 | # 'SZ' is the IANA region code for Swaziland 89 | 79 : {0 : 'sa'}, # Sanskrit 90 | 27 : {0 : 'sk'}, # Slovak 91 | 36 : {0 : 'sl'}, # Slovenian 92 | 46 : {0 : 'sb'}, # "Sorbian" (not an IANA language code) 93 | # 'SB' is IANA region code for 'Solomon Islands' 94 | # Lower Sorbian = 'dsb' 95 | # Upper Sorbian = 'hsb' 96 | # Sorbian Languages = 'wen' 97 | 10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' , 98 | 48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' , 99 | 60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'}, 100 | # Spanish, Spanish (Mobipocket bug?), Spanish (Argentina), Spanish 101 | # (Bolivia), Spanish (Chile), Spanish (Colombia), Spanish (Costa Rica), 102 | # Spanish (Dominican Republic), Spanish (Ecuador), Spanish (El 103 
# returns base32 bytestring
def toBase32(value, npad=4):
    """Encode *value* as a base-32 bytestring, zero-padded to >= *npad* digits."""
    alphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
    if value == 0:
        encoded = b'0'
    else:
        # collect least-significant digit first, then reverse
        pieces = []
        remaining = value
        while remaining:
            remaining, digit = divmod(remaining, 32)
            pieces.append(alphabet[digit:digit + 1])
        encoded = b''.join(reversed(pieces))
    # left-pad with b'0' only when shorter than npad (rjust is a no-op otherwise)
    return encoded.rjust(npad, b'0')
def mangle_fonts(encryption_key, data):
    """Obfuscate the first 1024 bytes of *data* by XOR-ing with the cycled key.

    XOR is its own inverse, so the same call both mangles and unmangles.
    """
    if isinstance(encryption_key, text_type):
        encryption_key = encryption_key.encode('latin-1')
    keystream = cycle(iter(map(bord, encryption_key)))
    head = data[:1024]
    # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
    scrambled = b''.join([bchr(bord(ch) ^ next(keystream)) for ch in head])
    return scrambled + data[1024:]
""" 25 | 26 | DEFAULT_TITLE = 'Cover' 27 | """ The default title for the cover page. """ 28 | 29 | MAX_WIDTH = 4096 30 | """ The max width for the svg cover page. """ 31 | 32 | MAX_HEIGHT = 4096 33 | """ The max height for the svg cover page. """ 34 | 35 | 36 | def get_image_type(imgname, imgdata=None): 37 | imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata)) 38 | 39 | # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some 40 | # with only the magic JPEG bytes out there... 41 | # ImageMagick handles those, so, do it too. 42 | if imgtype is None: 43 | if imgdata is None: 44 | with open(pathof(imgname), 'rb') as f: 45 | imgdata = f.read() 46 | if imgdata[0:2] == b'\xFF\xD8': 47 | # Get last non-null bytes 48 | last = len(imgdata) 49 | while (imgdata[last-1:last] == b'\x00'): 50 | last-=1 51 | # Be extra safe, check the trailing bytes, too. 52 | if imgdata[last-2:last] == b'\xFF\xD9': 53 | imgtype = "jpeg" 54 | return imgtype 55 | 56 | 57 | def get_image_size(imgname, imgdata=None): 58 | '''Determine the image type of imgname (or imgdata) and return its size. 59 | 60 | Originally, 61 | Determine the image type of fhandle and return its size. 62 | from draco''' 63 | if imgdata is None: 64 | fhandle = open(pathof(imgname), 'rb') 65 | head = fhandle.read(24) 66 | else: 67 | head = imgdata[0:24] 68 | if len(head) != 24: 69 | return 70 | 71 | imgtype = get_image_type(imgname, imgdata) 72 | if imgtype == 'png': 73 | check = struct.unpack(b'>i', head[4:8])[0] 74 | if check != 0x0d0a1a0a: 75 | return 76 | width, height = struct.unpack(b'>ii', head[16:24]) 77 | elif imgtype == 'gif': 78 | width, height = struct.unpack(b'H', fhandle.read(2))[0] - 2 91 | # We are at a SOFn block 92 | fhandle.seek(1, 1) # Skip `precision' byte. 
93 | height, width = struct.unpack(b'>HH', fhandle.read(4)) 94 | except Exception: # IGNORE:W0703 95 | return 96 | elif imgtype == 'jpeg' and imgdata is not None: 97 | try: 98 | pos = 0 99 | size = 2 100 | ftype = 0 101 | while not 0xc0 <= ftype <= 0xcf: 102 | pos += size 103 | byte = imgdata[pos:pos+1] 104 | pos += 1 105 | while ord(byte) == 0xff: 106 | byte = imgdata[pos:pos+1] 107 | pos += 1 108 | ftype = ord(byte) 109 | size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2 110 | pos += 2 111 | # We are at a SOFn block 112 | pos += 1 # Skip `precision' byte. 113 | height, width = struct.unpack(b'>HH', imgdata[pos:pos+4]) 114 | pos += 4 115 | except Exception: # IGNORE:W0703 116 | return 117 | else: 118 | return 119 | return width, height 120 | 121 | # XXX experimental 122 | class CoverProcessor(object): 123 | 124 | """Create a cover page. 125 | 126 | """ 127 | def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None): 128 | self.files = files 129 | self.metadata = metadata 130 | self.rscnames = rscnames 131 | self.cover_page = COVER_PAGE_FINENAME 132 | self.use_svg = USE_SVG_WRAPPER # Use svg wrapper. 133 | self.lang = metadata.get('Language', ['en'])[0] 134 | # This should ensure that if the methods to find the cover image's 135 | # dimensions should fail for any reason, the SVG routine will not be used. 
136 | [self.width, self.height] = (-1,-1) 137 | if FORCE_DEFAULT_TITLE: 138 | self.title = DEFAULT_TITLE 139 | else: 140 | self.title = metadata.get('Title', [DEFAULT_TITLE])[0] 141 | 142 | self.cover_image = None 143 | if imgname is not None: 144 | self.cover_image = imgname 145 | elif 'CoverOffset' in metadata: 146 | imageNumber = int(metadata['CoverOffset'][0]) 147 | cover_image = self.rscnames[imageNumber] 148 | if cover_image is not None: 149 | self.cover_image = cover_image 150 | else: 151 | print('Warning: Cannot identify the cover image.') 152 | if self.use_svg: 153 | try: 154 | if imgdata is None: 155 | fname = os.path.join(files.imgdir, self.cover_image) 156 | [self.width, self.height] = get_image_size(fname) 157 | else: 158 | [self.width, self.height] = get_image_size(None, imgdata) 159 | except: 160 | self.use_svg = False 161 | width = self.width 162 | height = self.height 163 | if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT: 164 | self.use_svg = False 165 | return 166 | 167 | def getImageName(self): 168 | return self.cover_image 169 | 170 | def getXHTMLName(self): 171 | return self.cover_page 172 | 173 | def buildXHTML(self): 174 | print('Building a cover page.') 175 | files = self.files 176 | cover_image = self.cover_image 177 | title = self.title 178 | lang = self.lang 179 | 180 | image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text)) 181 | image_path = os.path.join(image_dir, cover_image).replace('\\', '/') 182 | 183 | if not self.use_svg: 184 | data = '' 185 | data += '' 186 | data += ' None: 33 | self.zebra_stripes = True 34 | self.show_cursor = False 35 | 36 | 37 | class SegmentWidget(Widget): 38 | can_focus = False 39 | 40 | def __init__(self, config: Config, nav_point: str | None): 41 | super().__init__() 42 | self.config = config 43 | self.nav_point = nav_point 44 | 45 | def get_text_at(self, y: int) -> str: 46 | return self.render_lines(Region(0, y, self.virtual_region_with_margin.width, 
class Body(SegmentWidget):
    """A markdown text segment of the ebook, rendered with rich's Markdown."""

    def __init__(self, _: Ebook, config: Config, content: str, nav_point: str | None = None):
        super().__init__(config, nav_point)
        self.content = content

    def render(self):
        # NOTE: rich's Markdown is a renderable, not a widget, so its text
        # alignment cannot be set via CSS; translate styles.text_align instead.
        justify_map = {"center": "center", "left": "left", "right": "right", "justify": "full"}
        return Markdown(self.content, justify=justify_map[self.styles.text_align])  # type: ignore

    def render_line(self, y) -> Strip:
        strip = super().render_line(y)
        for segment in strip._segments:
            style = segment.style
            if style is None or style.link is None:
                continue
            # relative links are resolved against this segment's nav point
            if is_url(style.link) or self.nav_point is None:
                target = style.link
            else:
                target = urljoin(self.nav_point, style.link)
            style._meta = dumps({"@click": f"link({target!r})"})
        return strip
class PrettyBody(PrettyMarkdown):
    """A markdown text segment rendered with textual's widget-based markdown."""

    def __init__(self, _: Ebook, config: Config, value: str, nav_point: str | None = None):
        super().__init__(value)
        self.nav_point = nav_point

    def get_text_at(self, y: int) -> str | None:
        # TODO: this implementation still has issue in positioning match
        # at the end of ebook segment
        top = 0
        for child in self.children:
            region = child.virtual_region_with_margin
            if top + region.height > y:
                line = child.render_lines(Region(0, y - top, region.width, 1))[0]
                return line.text
            top += region.height
        return None
class Content(Widget):
    """Container widget holding every parsed segment (text and images) of the ebook."""

    can_focus = False

    def __init__(self, config: Config, ebook: Ebook):
        super().__init__()
        self.config = config

        self._segments: list[SegmentWidget | PrettyBody] = []
        for segment in ebook.iter_parsed_contents():
            if segment.type == SegmentType.BODY:
                component_cls = Body if not config.pretty else PrettyBody
            else:
                component_cls = Image
            self._segments.append(component_cls(ebook, self.config, segment.content, segment.nav_point))

    def get_navigables(self):
        """Return the segments that can be jumped to from the table of contents."""
        return [s for s in self._segments if s.nav_point is not None]

    def scroll_to_section(self, nav_point: str) -> None:
        """Scroll the first segment whose nav_point matches into view."""
        # TODO: add attr TocEntry.uuid so we can query("#{uuid}")
        for s in self.get_navigables():
            if s.nav_point == nav_point:
                s.scroll_visible(top=True)
                break

    def on_mouse_scroll_down(self, _: events.MouseScrollDown) -> None:
        self.screen.scroll_down()

    def on_mouse_scroll_up(self, _: events.MouseScrollUp) -> None:
        self.screen.scroll_up()

    # NOTE: override initial message
    def render(self):
        return ""

    def compose(self) -> ComposeResult:
        yield from iter(self._segments)

    def get_text_at(self, y: int) -> str | None:
        """Return the rendered text at virtual line ``y``, or None past the end."""
        accumulated_height = 0
        for segment in self._segments:
            if accumulated_height + segment.virtual_region_with_margin.height > y:
                return segment.get_text_at(y - accumulated_height)
            accumulated_height += segment.virtual_region_with_margin.height
        return None

    async def search_next(
        self, pattern_str: str, current_coord: Coordinate = Coordinate(-1, 0), forward: bool = True
    ) -> Coordinate | None:
        """Highlight and scroll to the next (or previous) match of ``pattern_str``.

        Returns the coordinate of the match, or None when no further match exists.
        """
        pattern = re.compile(pattern_str, re.IGNORECASE)
        current_x = current_coord.x
        line_range = (
            range(current_coord.y, self.virtual_size.height) if forward else reversed(range(0, current_coord.y + 1))
        )
        for linenr in line_range:
            line_text = self.get_text_at(linenr)
            if line_text is not None:
                matches = list(pattern.finditer(line_text))
                # FIX: when searching backward, visit the matches right-to-left so
                # the *nearest* previous match is returned; the old code returned
                # the leftmost match, skipping intermediate matches on the line.
                if not forward:
                    matches.reverse()
                for match in matches:
                    is_next_match = (match.start() > current_x) if forward else (match.start() < current_x)
                    if is_next_match:
                        await self.clear_search()

                        match_str = match.group()
                        match_coord = Coordinate(match.start(), linenr)
                        match_widget = SearchMatch(match_str, match_coord)
                        await self.mount(match_widget)
                        match_widget.scroll_visible()
                        return match_coord
            current_x = -1 if forward else self.size.width  # maybe virtual_size?
        return None

    async def clear_search(self) -> None:
        """Remove all mounted SearchMatch highlight widgets."""
        await self.query(SearchMatch.__name__).remove()

    def scroll_to_widget(self, *args, **kwargs) -> bool:
        return self.screen.scroll_to_widget(*args, **kwargs)

    def show_ansi_images(self):
        """Render every Image segment as ANSI art (no-op unless enabled in config)."""
        if not self.config.show_image_as_ansi:
            return

        # TODO: lazy load the images
        # 1. Need to change how reading prog saved
        #    instead of global 30%, save local by segment (ie. segment 3, 60%)
        # 2. Only load image when scrolled in view.
        #    (Checkout `scroll_visible` in Widget/Screen)
        for segment in self._segments:
            if isinstance(segment, Image):
                segment.show_ansi_image()
        self.refresh(layout=True)

    def on_resize(self):
        # re-render ANSI images at the new terminal width
        self.show_ansi_images()
IN NO EVENT 21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | from __future__ import unicode_literals, division, absolute_import, print_function 29 | 30 | import sys 31 | import codecs 32 | 33 | PY2 = sys.version_info[0] == 2 34 | PY3 = sys.version_info[0] == 3 35 | 36 | iswindows = sys.platform.startswith('win') 37 | 38 | try: 39 | from urllib.parse import unquote 40 | except ImportError: 41 | from urllib import unquote 42 | 43 | if PY2: 44 | from HTMLParser import HTMLParser 45 | _h = HTMLParser() 46 | elif sys.version_info[1] < 4: 47 | import html.parser 48 | _h = html.parser.HTMLParser() 49 | else: 50 | import html as _h 51 | 52 | if PY3: 53 | text_type = str 54 | binary_type = bytes 55 | # if will be printing arbitraty binary data to stdout on python 3 56 | # sys.stdin = sys.stdin.detach() 57 | # sys.stdout = sys.stdout.detach() 58 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) 59 | else: 60 | range = xrange 61 | text_type = unicode 62 | binary_type = str 63 | # if will be printing unicode under python 2 need to protect 64 | # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode 65 | # sys.stdout = codecs.getwriter("utf-8")(sys.stdout) 66 | # alternatively set environment variable as follows **before** launching python: export PYTHONIOENCODING=UTF-8 67 | 68 | # NOTE: Python 3 is completely broken when accessing single bytes in bytes strings 69 | # (and they amazingly claim by design and no bug!) 
# To illustrate: this works for unicode in Python 3 and for all Python 2.X
# for both bytestrings and unicode
# >>> o = '123456789'
# >>> o[-3]
# '7'
# >>> type(o[-3])
# <class 'str'>
# >>> type(o)
# <class 'str'>

# Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
# >>> o = b'123456789'
# >>> o[-3]
# 55
# >>> type(o[-3])
# <class 'int'>
# >>> type(o)
# <class 'bytes'>

# This mind-boggling behaviour also happens when indexing a bytestring and/or
# iterating over a bytestring. In other words it will return an int, not
# the byte itself!

# The only way to access a single byte as a byte in a bytestring, and get the
# byte in both Python 2 and Python 3, is to use a slice.

# This problem is so common there are horrible hacks floating around the net to **try**
# to work around it, so that code that works on both Python 2 and Python 3 is possible.

# So in order to write code that works on both Python 2 and Python 3:
# if you index or access a single byte and want its ord() then use the bord() function;
# if instead you want it as a single-character byte use the bchar() function —
# both of which are defined below.
# (sys.version_info[0] == 3 is exactly the definition of the PY3 flag above)
if sys.version_info[0] == 3:
    # Also note: decoding a bytestring with 'latin-1' (or any other full-range
    # 0-255 encoding) instead of ascii gives a one-to-one byte-value to
    # integer mapping in the 0-255 range.

    def bchr(s):
        """Return the single-byte bytes object for integer *s*."""
        return bytes([s])

    def bstr(s):
        """Coerce *s* to bytes; str input is encoded as latin-1."""
        return bytes(s, 'latin-1') if isinstance(s, str) else bytes(s)

    def bord(s):
        """Ordinal of an indexed byte (already an int on Python 3)."""
        return s

    def bchar(s):
        """Single-character bytes object for an indexed byte."""
        return bytes([s])

else:

    def bchr(s):
        return chr(s)

    def bstr(s):
        return str(s)

    def bord(s):
        return ord(s)

    def bchar(s):
        return s

if sys.version_info[0] == 3:
    # list-producing versions of the major Python iterating functions

    def lrange(*args, **kwargs):
        return list(range(*args, **kwargs))

    def lzip(*args, **kwargs):
        return list(zip(*args, **kwargs))

    def lmap(*args, **kwargs):
        return list(map(*args, **kwargs))

    def lfilter(*args, **kwargs):
        return list(filter(*args, **kwargs))

else:
    import __builtin__
    # Python 2 builtins already produce lists
    lrange = __builtin__.range
    lzip = __builtin__.zip
    lmap = __builtin__.map
    lfilter = __builtin__.filter

# In Python 3 you can no longer use .encode('hex') on a bytestring;
# use the following on both platforms instead.
import binascii

def hexlify(bdata):
    """Return the ascii hex string of a bytes-like object."""
    return binascii.hexlify(bdata).decode('ascii')

# If you: import struct
# Note: struct pack, unpack, unpack_from all *require* bytestring format
# data all the way up to at least Python 2.7.5; Python 3 is okay with either.

# If you: import re
# Note: Python 3 "re" requires the pattern to be the exact same type as the
# data to be searched, but u"" is not allowed for the pattern itself, only b"".
# Python 2.X allows the pattern to be any type, converts it to match the data,
# and returns the same type as the data.
# convert string to be utf-8 encoded
def utf8_str(p, enc='utf-8'):
    """Return *p* as utf-8 encoded bytes; None passes through unchanged."""
    if p is None:
        return None
    if isinstance(p, text_type):
        return p.encode('utf-8')
    if enc != 'utf-8':
        # re-encode bytes that are in some other encoding
        return p.decode(enc).encode('utf-8')
    return p

# convert string to be unicode encoded
def unicode_str(p, enc='utf-8'):
    """Return *p* as a unicode string, decoding bytes with *enc*; None passes through."""
    if p is None:
        return None
    return p if isinstance(p, text_type) else p.decode(enc)

ASCII_CHARS = set(map(chr, range(128)))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789'
               '#_.-/~')
IRI_UNSAFE = ASCII_CHARS - URL_SAFE

# returns a quoted IRI (not a URI)
def quoteurl(href):
    """Percent-encode the ascii characters of *href* that are not IRI-safe."""
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return ''.join(
        "%%%02x" % ord(char) if char in IRI_UNSAFE else char
        for char in href
    )

# unquotes url/iri
def unquoteurl(href):
    """Decode percent-escapes in *href* (bytes input is decoded as utf-8 first)."""
    if isinstance(href, binary_type):
        href = href.decode('utf-8')
    return unquote(href)

# unescape html
def unescapeit(sval):
    """Undo html entity escaping in *sval*."""
    return _h.unescape(sval)
def unicode_argv():
    """Return sys.argv with every element as unicode.

    On Python 3 this is sys.argv itself.  On Python 2/Windows the native
    sys.argv is broken (multi-byte characters become '?'), so the real
    command line is fetched via shell32.CommandLineToArgvW; on other
    Python 2 platforms each argument is decoded with the best available
    encoding.
    """
    global iswindows
    global PY3
    if PY3:
        return sys.argv
    if iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'. So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            # (the raw command line also contains the interpreter and its options)
            start = argc.value - len(sys.argv)
            return [argv[i] for i in
                    range(start, argc.value)]
        # this should never happen
        return None
    else:
        # non-Windows Python 2: decode each argument with the best-guess encoding
        argv = []
        argvencoding = sys.stdin.encoding
        if argvencoding is None:
            argvencoding = sys.getfilesystemencoding()
        if argvencoding is None:
            argvencoding = 'utf-8'
        for arg in sys.argv:
            if isinstance(arg, text_type):
                argv.append(arg)
            else:
                argv.append(arg.decode(argvencoding))
        return argv


# Python 2.X is broken in that it does not recognize CP65001 as UTF-8
def add_cp65001_codec():
    """Register 'cp65001' as an alias of utf-8 on Python 2 (no-op on Python 3)."""
    if PY2:
        try:
            codecs.lookup('cp65001')
        except LookupError:
            # map cp65001 lookups onto the utf-8 codec
            codecs.register(
                lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
    return
/src/baca/tools/KindleUnpack/mobi_ncx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | import os 8 | from .unipath import pathof 9 | from .compatibility_utils import unescapeit 10 | 11 | 12 | import re 13 | # note: re requites the pattern to be the exact same type as the data to be searched in python3 14 | # but u"" is not allowed for the pattern itself only b"" 15 | 16 | from xml.sax.saxutils import escape as xmlescape 17 | 18 | from .mobi_utils import toBase32 19 | from .mobi_index import MobiIndex 20 | 21 | DEBUG_NCX = False 22 | 23 | class ncxExtract: 24 | 25 | def __init__(self, mh, files): 26 | self.mh = mh 27 | self.sect = self.mh.sect 28 | self.files = files 29 | self.isNCX = False 30 | self.mi = MobiIndex(self.sect) 31 | self.ncxidx = self.mh.ncxidx 32 | self.indx_data = None 33 | 34 | def parseNCX(self): 35 | indx_data = [] 36 | tag_fieldname_map = { 37 | 1: ['pos',0], 38 | 2: ['len',0], 39 | 3: ['noffs',0], 40 | 4: ['hlvl',0], 41 | 5: ['koffs',0], 42 | 6: ['pos_fid',0], 43 | 21: ['parent',0], 44 | 22: ['child1',0], 45 | 23: ['childn',0] 46 | } 47 | if self.ncxidx != 0xffffffff: 48 | outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX") 49 | if DEBUG_NCX: 50 | print(ctoc_text) 51 | print(outtbl) 52 | num = 0 53 | for [text, tagMap] in outtbl: 54 | tmp = { 55 | 'name': text.decode('utf-8'), 56 | 'pos': -1, 57 | 'len': 0, 58 | 'noffs': -1, 59 | 'text' : "Unknown Text", 60 | 'hlvl' : -1, 61 | 'kind' : "Unknown Kind", 62 | 'pos_fid' : None, 63 | 'parent' : -1, 64 | 'child1' : -1, 65 | 'childn' : -1, 66 | 'num' : num 67 | } 68 | for tag in tag_fieldname_map: 69 | [fieldname, i] = tag_fieldname_map[tag] 70 | if tag in tagMap: 71 | fieldvalue = tagMap[tag][i] 72 | if tag == 6: 73 | pos_fid = 
toBase32(fieldvalue,4).decode('utf-8') 74 | fieldvalue2 = tagMap[tag][i+1] 75 | pos_off = toBase32(fieldvalue2,10).decode('utf-8') 76 | fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off) 77 | tmp[fieldname] = fieldvalue 78 | if tag == 3: 79 | toctext = ctoc_text.get(fieldvalue, 'Unknown Text') 80 | toctext = toctext.decode(self.mh.codec) 81 | tmp['text'] = toctext 82 | if tag == 5: 83 | kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind') 84 | kindtext = kindtext.decode(self.mh.codec) 85 | tmp['kind'] = kindtext 86 | indx_data.append(tmp) 87 | if DEBUG_NCX: 88 | print("record number: ", num) 89 | print("name: ", tmp['name'],) 90 | print("position", tmp['pos']," length: ", tmp['len']) 91 | print("text: ", tmp['text']) 92 | print("kind: ", tmp['kind']) 93 | print("heading level: ", tmp['hlvl']) 94 | print("parent:", tmp['parent']) 95 | print("first child: ",tmp['child1']," last child: ", tmp['childn']) 96 | print("pos_fid is ", tmp['pos_fid']) 97 | print("\n\n") 98 | num += 1 99 | self.indx_data = indx_data 100 | return indx_data 101 | 102 | def buildNCX(self, htmlfile, title, ident, lang): 103 | indx_data = self.indx_data 104 | 105 | ncx_header = \ 106 | ''' 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | %s 117 | 118 | 119 | ''' 120 | 121 | ncx_footer = \ 122 | ''' 123 | 124 | ''' 125 | 126 | ncx_entry = \ 127 | ''' 128 | 129 | %s 130 | 131 | ''' 132 | 133 | # recursive part 134 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 135 | if start>len(indx_data) or end>len(indx_data): 136 | print("Warning: missing INDX child entries", start, end, len(indx_data)) 137 | return '' 138 | if DEBUG_NCX: 139 | print("recursINDX lvl %d from %d to %d" % (lvl, start, end)) 140 | xml = '' 141 | if start <= 0: 142 | start = 0 143 | if end <= 0: 144 | end = len(indx_data) 145 | if lvl > max_lvl: 146 | max_lvl = lvl 147 | indent = ' ' * (2 + lvl) 148 | 149 | for i in range(start, end): 150 | e = indx_data[i] 151 | if not e['hlvl'] == lvl: 152 | 
continue 153 | # open entry 154 | num += 1 155 | link = '%s#filepos%d' % (htmlfile, e['pos']) 156 | tagid = 'np_%d' % num 157 | entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) 158 | entry = re.sub(re.compile('^', re.M), indent, entry, 0) 159 | xml += entry + '\n' 160 | # recurs 161 | if e['child1']>=0: 162 | xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 163 | e['child1'], e['childn'] + 1) 164 | xml += xmlrec 165 | # close entry 166 | xml += indent + '\n' 167 | return xml, max_lvl, num 168 | 169 | body, max_lvl, num = recursINDX() 170 | header = ncx_header % (lang, ident, max_lvl + 1, title) 171 | ncx = header + body + ncx_footer 172 | if not len(indx_data) == num: 173 | print("Warning: different number of entries in NCX", len(indx_data), num) 174 | return ncx 175 | 176 | def writeNCX(self, metadata): 177 | # build the xml 178 | self.isNCX = True 179 | print("Write ncx") 180 | # htmlname = os.path.basename(self.files.outbase) 181 | # htmlname += '.html' 182 | htmlname = 'book.html' 183 | xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) 184 | # write the ncx file 185 | # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') 186 | ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx') 187 | with open(pathof(ncxname), 'wb') as f: 188 | f.write(xml.encode('utf-8')) 189 | 190 | def buildK8NCX(self, indx_data, title, ident, lang): 191 | ncx_header = \ 192 | ''' 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | %s 203 | 204 | 205 | ''' 206 | 207 | ncx_footer = \ 208 | ''' 209 | 210 | ''' 211 | 212 | ncx_entry = \ 213 | ''' 214 | 215 | %s 216 | 217 | ''' 218 | 219 | # recursive part 220 | def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): 221 | if start>len(indx_data) or end>len(indx_data): 222 | print("Warning: missing INDX child entries", start, end, len(indx_data)) 223 | return '' 224 | if DEBUG_NCX: 225 | print("recursINDX lvl 
%d from %d to %d" % (lvl, start, end)) 226 | xml = '' 227 | if start <= 0: 228 | start = 0 229 | if end <= 0: 230 | end = len(indx_data) 231 | if lvl > max_lvl: 232 | max_lvl = lvl 233 | indent = ' ' * (2 + lvl) 234 | 235 | for i in range(start, end): 236 | e = indx_data[i] 237 | htmlfile = e['filename'] 238 | desttag = e['idtag'] 239 | if not e['hlvl'] == lvl: 240 | continue 241 | # open entry 242 | num += 1 243 | if desttag == '': 244 | link = 'Text/%s' % htmlfile 245 | else: 246 | link = 'Text/%s#%s' % (htmlfile, desttag) 247 | tagid = 'np_%d' % num 248 | entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link) 249 | entry = re.sub(re.compile('^', re.M), indent, entry, 0) 250 | xml += entry + '\n' 251 | # recurs 252 | if e['child1']>=0: 253 | xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1, 254 | e['child1'], e['childn'] + 1) 255 | xml += xmlrec 256 | # close entry 257 | xml += indent + '\n' 258 | return xml, max_lvl, num 259 | 260 | body, max_lvl, num = recursINDX() 261 | header = ncx_header % (lang, ident, max_lvl + 1, title) 262 | ncx = header + body + ncx_footer 263 | if not len(indx_data) == num: 264 | print("Warning: different number of entries in NCX", len(indx_data), num) 265 | return ncx 266 | 267 | def writeK8NCX(self, ncx_data, metadata): 268 | # build the xml 269 | self.isNCX = True 270 | print("Write K8 ncx") 271 | xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0]) 272 | bname = 'toc.ncx' 273 | ncxname = os.path.join(self.files.k8oebps,bname) 274 | with open(pathof(ncxname), 'wb') as f: 275 | f.write(xml.encode('utf-8')) 276 | -------------------------------------------------------------------------------- /src/baca/tools/KindleUnpack/mobi_k8resc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import 
unicode_literals, division, absolute_import, print_function 6 | 7 | DEBUG_USE_ORDERED_DICTIONARY = False # OrderedDict is supoorted >= python 2.7. 8 | """ set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr.""" 9 | 10 | if DEBUG_USE_ORDERED_DICTIONARY: 11 | from collections import OrderedDict as dict_ 12 | else: 13 | dict_ = dict 14 | 15 | from .compatibility_utils import unicode_str 16 | 17 | from .mobi_utils import fromBase32 18 | 19 | _OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', 20 | 'x-metadata', 'manifest', 'spine', 'tours', 'guide'] 21 | 22 | class K8RESCProcessor(object): 23 | 24 | def __init__(self, data, debug=False): 25 | self._debug = debug 26 | self.resc = None 27 | self.opos = 0 28 | self.extrameta = [] 29 | self.cover_name = None 30 | self.spine_idrefs = {} 31 | self.spine_order = [] 32 | self.spine_pageattributes = {} 33 | self.spine_ppd = None 34 | # need3 indicate the book has fields which require epub3. 35 | # but the estimation of the source epub version from the fields is difficult. 36 | self.need3 = False 37 | self.package_ver = None 38 | self.extra_metadata = [] 39 | self.refines_metadata = [] 40 | self.extra_attributes = [] 41 | # get header 42 | start_pos = data.find(b'<') 43 | self.resc_header = data[:start_pos] 44 | # get resc data length 45 | start = self.resc_header.find(b'=') + 1 46 | end = self.resc_header.find(b'&', start) 47 | resc_size = 0 48 | if end > 0: 49 | resc_size = fromBase32(self.resc_header[start:end]) 50 | resc_rawbytes = len(data) - start_pos 51 | if resc_rawbytes == resc_size: 52 | self.resc_length = resc_size 53 | else: 54 | # Most RESC has a nul string at its tail but some do not. 
55 | end_pos = data.find(b'\x00', start_pos) 56 | if end_pos < 0: 57 | self.resc_length = resc_rawbytes 58 | else: 59 | self.resc_length = end_pos - start_pos 60 | if self.resc_length != resc_size: 61 | print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size)) 62 | # now parse RESC after converting it to unicode from utf-8 63 | try: 64 | self.resc = unicode_str(data[start_pos:start_pos+self.resc_length]) 65 | except UnicodeDecodeError: 66 | self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1') 67 | self.parseData() 68 | 69 | def prepend_to_spine(self, key, idref, linear, properties): 70 | self.spine_order = [key] + self.spine_order 71 | self.spine_idrefs[key] = idref 72 | attributes = {} 73 | if linear is not None: 74 | attributes['linear'] = linear 75 | if properties is not None: 76 | attributes['properties'] = properties 77 | self.spine_pageattributes[key] = attributes 78 | 79 | # RESC tag iterator 80 | def resc_tag_iter(self): 81 | tcontent = last_tattr = None 82 | prefix = [''] 83 | while True: 84 | text, tag = self.parseresc() 85 | if text is None and tag is None: 86 | break 87 | if text is not None: 88 | tcontent = text.rstrip(' \r\n') 89 | else: # we have a tag 90 | ttype, tname, tattr = self.parsetag(tag) 91 | if ttype == 'begin': 92 | tcontent = None 93 | prefix.append(tname + '.') 94 | if tname in _OPF_PARENT_TAGS: 95 | yield ''.join(prefix), tname, tattr, tcontent 96 | else: 97 | last_tattr = tattr 98 | else: # single or end 99 | if ttype == 'end': 100 | prefix.pop() 101 | tattr = last_tattr 102 | last_tattr = None 103 | if tname in _OPF_PARENT_TAGS: 104 | tname += '-end' 105 | yield ''.join(prefix), tname, tattr, tcontent 106 | tcontent = None 107 | 108 | # now parse the RESC to extract spine and extra metadata info 109 | def parseData(self): 110 | for prefix, tname, tattr, tcontent in self.resc_tag_iter(): 111 | if self._debug: 112 | print(" Parsing RESC: ", 
prefix, tname, tattr, tcontent) 113 | if tname == 'package': 114 | self.package_ver = tattr.get('version', '2.0') 115 | package_prefix = tattr.get('prefix','') 116 | if self.package_ver.startswith('3') or package_prefix.startswith('rendition'): 117 | self.need3 = True 118 | if tname == 'spine': 119 | self.spine_ppd = tattr.get('page-progession-direction', None) 120 | if self.spine_ppd is not None and self.spine_ppd == 'rtl': 121 | self.need3 = True 122 | if tname == 'itemref': 123 | skelid = tattr.pop('skelid', None) 124 | if skelid is None and len(self.spine_order) == 0: 125 | # assume it was removed initial coverpage 126 | skelid = 'coverpage' 127 | tattr['linear'] = 'no' 128 | self.spine_order.append(skelid) 129 | idref = tattr.pop('idref', None) 130 | if idref is not None: 131 | idref = 'x_' + idref 132 | self.spine_idrefs[skelid] = idref 133 | if 'id' in tattr: 134 | del tattr['id'] 135 | # tattr["id"] = 'x_' + tattr["id"] 136 | if 'properties' in tattr: 137 | self.need3 = True 138 | self.spine_pageattributes[skelid] = tattr 139 | if tname == 'meta' or tname.startswith('dc:'): 140 | if 'refines' in tattr or 'property' in tattr: 141 | self.need3 = True 142 | if tattr.get('name','') == 'cover': 143 | cover_name = tattr.get('content',None) 144 | if cover_name is not None: 145 | cover_name = 'x_' + cover_name 146 | self.cover_name = cover_name 147 | else: 148 | self.extrameta.append([tname, tattr, tcontent]) 149 | 150 | # parse and return either leading text or the next tag 151 | def parseresc(self): 152 | p = self.opos 153 | if p >= len(self.resc): 154 | return None, None 155 | if self.resc[p] != '<': 156 | res = self.resc.find('<',p) 157 | if res == -1 : 158 | res = len(self.resc) 159 | self.opos = res 160 | return self.resc[p:res], None 161 | # handle comment as a special case 162 | if self.resc[p:p+4] == '',p+1) 164 | if te != -1: 165 | te = te+2 166 | else: 167 | te = self.resc.find('>',p+1) 168 | ntb = self.resc.find('<',p+1) 169 | if ntb != -1 and ntb < te: 
170 | self.opos = ntb 171 | return self.resc[p:ntb], None 172 | self.opos = te + 1 173 | return None, self.resc[p:te+1] 174 | 175 | # parses tag to identify: [tname, ttype, tattr] 176 | # tname: tag name 177 | # ttype: tag type ('begin', 'end' or 'single'); 178 | # tattr: dictionary of tag atributes 179 | def parsetag(self, s): 180 | p = 1 181 | tname = None 182 | ttype = None 183 | tattr = dict_() 184 | while s[p:p+1] == ' ' : 185 | p += 1 186 | if s[p:p+1] == '/': 187 | ttype = 'end' 188 | p += 1 189 | while s[p:p+1] == ' ' : 190 | p += 1 191 | b = p 192 | while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') : 193 | p += 1 194 | tname=s[b:p].lower() 195 | # some special cases 196 | if tname == '?xml': 197 | tname = 'xml' 198 | if tname == '!--': 199 | ttype = 'single' 200 | comment = s[p:-3].strip() 201 | tattr['comment'] = comment 202 | if ttype is None: 203 | # parse any attributes of begin or single tags 204 | while s.find('=',p) != -1 : 205 | while s[p:p+1] == ' ' : 206 | p += 1 207 | b = p 208 | while s[p:p+1] != '=' : 209 | p += 1 210 | aname = s[b:p].lower() 211 | aname = aname.rstrip(' ') 212 | p += 1 213 | while s[p:p+1] == ' ' : 214 | p += 1 215 | if s[p:p+1] in ('"', "'") : 216 | p = p + 1 217 | b = p 218 | while s[p:p+1] not in ('"', "'"): 219 | p += 1 220 | val = s[b:p] 221 | p += 1 222 | else : 223 | b = p 224 | while s[p:p+1] not in ('>', '/', ' ') : 225 | p += 1 226 | val = s[b:p] 227 | tattr[aname] = val 228 | if ttype is None: 229 | ttype = 'begin' 230 | if s.find('/',p) >= 0: 231 | ttype = 'single' 232 | return ttype, tname, tattr 233 | 234 | def taginfo_toxml(self, taginfo): 235 | res = [] 236 | tname, tattr, tcontent = taginfo 237 | res.append('<' + tname) 238 | if tattr is not None: 239 | for key in tattr: 240 | res.append(' ' + key + '="'+tattr[key]+'"') 241 | if tcontent is not None: 242 | res.append('>' + tcontent + '\n') 243 | else: 244 | res.append('/>\n') 245 | return "".join(res) 246 | 247 | def hasSpine(self): 248 | return 
len(self.spine_order) > 0 249 | 250 | def needEPUB3(self): 251 | return self.need3 252 | 253 | def hasRefines(self): 254 | for [tname, tattr, tcontent] in self.extrameta: 255 | if 'refines' in tattr: 256 | return True 257 | return False 258 | 259 | def createMetadata(self, epubver): 260 | for taginfo in self.extrameta: 261 | tname, tattr, tcontent = taginfo 262 | if 'refines' in tattr: 263 | if epubver == 'F' and 'property' in tattr: 264 | attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent) 265 | self.extra_attributes.append(attr) 266 | else: 267 | tag = self.taginfo_toxml(taginfo) 268 | self.refines_metadata.append(tag) 269 | else: 270 | tag = self.taginfo_toxml(taginfo) 271 | self.extra_metadata.append(tag) 272 | -------------------------------------------------------------------------------- /src/baca/tools/KindleUnpack/mobi_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab 4 | 5 | from __future__ import unicode_literals, division, absolute_import, print_function 6 | 7 | from .compatibility_utils import PY2, bchr, bstr, bord 8 | if PY2: 9 | range = xrange 10 | 11 | import struct 12 | # note: struct pack, unpack, unpack_from all require bytestring format 13 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring 14 | 15 | from .mobi_utils import toHex 16 | 17 | class MobiIndex: 18 | 19 | def __init__(self, sect, DEBUG=False): 20 | self.sect = sect 21 | self.DEBUG = DEBUG 22 | 23 | def getIndexData(self, idx, label="Unknown"): 24 | sect = self.sect 25 | outtbl = [] 26 | ctoc_text = {} 27 | if idx != 0xffffffff: 28 | sect.setsectiondescription(idx,"{0} Main INDX section".format(label)) 29 | data = sect.loadSection(idx) 30 | idxhdr, hordt1, hordt2 = self.parseINDXHeader(data) 31 | IndexCount = idxhdr['count'] 32 | # handle the case of multiple sections used for CTOC 
33 | rec_off = 0 34 | off = idx + IndexCount + 1 35 | for j in range(idxhdr['nctoc']): 36 | cdata = sect.loadSection(off + j) 37 | sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j)) 38 | ctocdict = self.readCTOC(cdata) 39 | for k in ctocdict: 40 | ctoc_text[k + rec_off] = ctocdict[k] 41 | rec_off += 0x10000 42 | tagSectionStart = idxhdr['len'] 43 | controlByteCount, tagTable = readTagSection(tagSectionStart, data) 44 | if self.DEBUG: 45 | print("ControlByteCount is", controlByteCount) 46 | print("IndexCount is", IndexCount) 47 | print("TagTable: %s" % tagTable) 48 | for i in range(idx + 1, idx + 1 + IndexCount): 49 | sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx)) 50 | data = sect.loadSection(i) 51 | hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data) 52 | idxtPos = hdrinfo['start'] 53 | entryCount = hdrinfo['count'] 54 | if self.DEBUG: 55 | print(idxtPos, entryCount) 56 | # loop through to build up the IDXT position starts 57 | idxPositions = [] 58 | for j in range(entryCount): 59 | pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) 60 | idxPositions.append(pos) 61 | # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
62 | idxPositions.append(idxtPos) 63 | # for each entry in the IDXT build up the tagMap and any associated text 64 | for j in range(entryCount): 65 | startPos = idxPositions[j] 66 | endPos = idxPositions[j+1] 67 | textLength = ord(data[startPos:startPos+1]) 68 | text = data[startPos+1:startPos+1+textLength] 69 | if hordt2 is not None: 70 | text = b''.join(bchr(hordt2[bord(x)]) for x in text) 71 | tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) 72 | outtbl.append([text, tagMap]) 73 | if self.DEBUG: 74 | print(tagMap) 75 | print(text) 76 | return outtbl, ctoc_text 77 | 78 | def parseINDXHeader(self, data): 79 | "read INDX header" 80 | if not data[:4] == b'INDX': 81 | print("Warning: index section is not INDX") 82 | return False 83 | words = ( 84 | 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code', 85 | 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc' 86 | ) 87 | num = len(words) 88 | values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)]) 89 | header = {} 90 | for n in range(num): 91 | header[words[n]] = values[n] 92 | 93 | ordt1 = None 94 | ordt2 = None 95 | 96 | ocnt, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL',data, 0xa4) 97 | if header['code'] == 0xfdea or ocnt != 0 or oentries > 0: 98 | # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify 99 | # them in the proper place in the header. They seem to be codepage 65002 which seems 100 | # to be some sort of strange EBCDIC utf-8 or 16 encoded strings 101 | 102 | # so we need to look for them and store them away to process leading text 103 | # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries 104 | # we only ever seem to use the seocnd but ... 
105 | assert(ocnt == 1) 106 | assert(data[op1:op1+4] == b'ORDT') 107 | assert(data[op2:op2+4] == b'ORDT') 108 | ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4) 109 | ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4) 110 | 111 | if self.DEBUG: 112 | print("parsed INDX header:") 113 | for n in words: 114 | print(n, "%X" % header[n],) 115 | print("") 116 | return header, ordt1, ordt2 117 | 118 | def readCTOC(self, txtdata): 119 | # read all blocks from CTOC 120 | ctoc_data = {} 121 | offset = 0 122 | while offset next bytes: name 134 | name = txtdata[offset:offset+ilen] 135 | offset += ilen 136 | if self.DEBUG: 137 | print("name length is ", ilen) 138 | print(idx_offs, name) 139 | ctoc_data[idx_offs] = name 140 | return ctoc_data 141 | 142 | 143 | def getVariableWidthValue(data, offset): 144 | ''' 145 | Decode variable width value from given bytes. 146 | 147 | @param data: The bytes to decode. 148 | @param offset: The start offset into data. 149 | @return: Tuple of consumed bytes count and decoded value. 150 | ''' 151 | value = 0 152 | consumed = 0 153 | finished = False 154 | while not finished: 155 | v = data[offset + consumed: offset + consumed + 1] 156 | consumed += 1 157 | if ord(v) & 0x80: 158 | finished = True 159 | value = (value << 7) | (ord(v) & 0x7f) 160 | return consumed, value 161 | 162 | 163 | def readTagSection(start, data): 164 | ''' 165 | Read tag section from given data. 166 | 167 | @param start: The start position in the data. 168 | @param data: The data to process. 169 | @return: Tuple of control byte count and list of tag tuples. 170 | ''' 171 | controlByteCount = 0 172 | tags = [] 173 | if data[start:start+4] == b"TAGX": 174 | firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04) 175 | controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08) 176 | 177 | # Skip the first 12 bytes already read above. 
        # Each TAGX entry is four bytes: (tag, valuesPerEntry, mask, endFlag).
        for i in range(12, firstEntryOffset, 4):
            pos = start + i
            tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
    return controlByteCount, tags


def countSetBits(value, bits=8):
    '''
    Count the set bits in the given value.

    @param value: Integer value.
    @param bits: The number of bits of the input value (defaults to 8).
    @return: Number of set bits.
    '''
    count = 0
    for _ in range(bits):
        if value & 0x01 == 0x01:
            count += 1
        value = value >> 1
    return count


def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
    '''
    Create a map of tags and values from the given byte section.

    @param controlByteCount: The number of control bytes.
    @param tagTable: The tag table.
    @param entryData: The data to process.
    @param startPos: The starting position in entryData.
    @param endPos: The end position in entryData or None if it is unknown.
    @return: Hashmap of tag and list of values.
    '''
    tags = []
    tagHashMap = {}
    controlByteIndex = 0
    # variable-width value data begins right after the control bytes
    dataStart = startPos + controlByteCount

    # First pass: decide, per tag, how many values (or how many value bytes)
    # follow in the data area, based on the control byte masked with mask.
    for tag, valuesPerEntry, mask, endFlag in tagTable:
        if endFlag == 0x01:
            # end-of-control-byte marker: advance to the next control byte
            controlByteIndex += 1
            continue
        cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
        if 0:
            print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))

        value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
        if value != 0:
            if value == mask:
                if countSetBits(mask) > 1:
                    # If all bits of masked value are set and the mask has more than one bit, a variable width value
                    # will follow after the control bytes which defines the length of bytes (NOT the value count!)
                    # which will contain the corresponding variable width values.
                    consumed, value = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    tags.append((tag, None, value, valuesPerEntry))
                else:
                    tags.append((tag, 1, None, valuesPerEntry))
            else:
                # Shift bits to get the masked value.
                while mask & 0x01 == 0:
                    mask = mask >> 1
                    value = value >> 1
                tags.append((tag, value, None, valuesPerEntry))
    # Second pass: actually decode the values for each recorded tag.
    for tag, valueCount, valueBytes, valuesPerEntry in tags:
        values = []
        if valueCount is not None:
            # Read valueCount * valuesPerEntry variable width values.
            for _ in range(valueCount):
                for _ in range(valuesPerEntry):
                    consumed, data = getVariableWidthValue(entryData, dataStart)
                    dataStart += consumed
                    values.append(data)
        else:
            # Convert valueBytes to variable width values.
            totalConsumed = 0
            while totalConsumed < valueBytes:
                # Does this work for valuesPerEntry != 1?
                consumed, data = getVariableWidthValue(entryData, dataStart)
                dataStart += consumed
                totalConsumed += consumed
                values.append(data)
            if totalConsumed != valueBytes:
                print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
        tagHashMap[tag] = values
    # Test that all bytes have been processed if endPos is given.
    if endPos is not None and dataStart != endPos:
        # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
        for char in entryData[dataStart:endPos]:
            if bord(char) != 0:
                print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
                if 0:
                    print("controlByteCount: %s" % controlByteCount)
                    print("tagTable: %s" % tagTable)
                    print("data: %s" % toHex(entryData[startPos:endPos]))
                    print("tagHashMap: %s" % tagHashMap)
                break

    return tagHashMap
--------------------------------------------------------------------------------
/src/baca/app.py:
--------------------------------------------------------------------------------
import asyncio
import dataclasses
from datetime import datetime
from pathlib import Path
from typing import Type

from textual import events
from textual.actions import SkipAction
from textual.app import App, ComposeResult
from textual.css.query import NoMatches
from textual.widgets import LoadingIndicator

from baca.components.contents import Content
from baca.components.events import (
    DoneLoading,
    FollowThis,
    OpenThisImage,
    Screenshot,
    SearchSubmitted,
)
from baca.components.windows import Alert, DictDisplay, SearchInputPrompt, ToC
from baca.config import load_config
from baca.ebooks import Ebook
from baca.exceptions import LaunchingFileError
from baca.models import Coordinate, KeyMap, ReadingHistory, SearchMode
from baca.utils.app_resources import get_resource_file
from baca.utils.keys_parser import dispatch_key
from baca.utils.systems import launch_file
from baca.utils.urls import is_url


class Baca(App):
    """TUI ebook reader application."""

    CSS_PATH = str(get_resource_file("style.css"))

    def __init__(self, ebook_path: Path, ebook_class: Type[Ebook]):
        # load first to resolve css variables
        self.config = load_config()
        super().__init__()
        self.ebook_path = ebook_path
        self.ebook_class = ebook_class
        # TODO: make
reactive and display percentage
        # as alternative for scrollbar
        # fraction of the document scrolled (0.0 .. 1.0)
        self.reading_progress = 0.0
        # active SearchMode or None when not searching
        self.search_mode = None

    def on_load(self, _: events.Load) -> None:
        # Parse the ebook on a worker thread so the startup loader can paint.
        assert self._loop is not None
        self._loop.run_in_executor(None, self.load_everything)

    def load_everything(self):
        # Runs off the UI thread: parse the ebook and fetch/create its history row.
        self.ebook = self.ebook_class(self.ebook_path)
        content = Content(self.config, self.ebook)
        self.ebook_state, _ = ReadingHistory.get_or_create(
            filepath=str(self.ebook.get_path()), defaults=dict(reading_progress=0.0)
        )
        # NOTE: using a message instead of calling
        # the callback directly to make sure that the app is ready
        # before calling the callback, since this message will
        # get processed after app ready and composed
        # (self._screen_stack isn't empty)
        # see: Widget.on_event(), App._process_message()
        self.post_message(DoneLoading(content))

    async def on_done_loading(self, event: DoneLoading) -> None:
        # to be safe, unnecessary?
        # while self.screen is None:
        #     await asyncio.sleep(0.1)

        # NOTE: await to prevent broken layout
        await self.mount(event.content)

        def restore_reading_progress() -> None:
            # restore reading progress
            # make sure to call this after refresh so the screen.max_scroll_y != 0
            self.reading_progress = self.ebook_state.reading_progress * self.screen.max_scroll_y
            self.screen.scroll_to(None, self.reading_progress, duration=0, animate=False)  # type: ignore

            self.get_widget_by_id("startup-loader", LoadingIndicator).remove()

        def show_images() -> None:
            self.content.show_ansi_images()
            self.refresh(layout=True)
            self.call_after_refresh(restore_reading_progress)

        self.call_after_refresh(show_images)

    def on_mount(self):
        # Wrap the screen's scroll_y watcher so every scroll keeps
        # reading_progress in sync with the current position.
        def screen_watch_scroll_y_wrapper(old_watcher, screen):
            def new_watcher(old, new):
                result = old_watcher(old, new)
                if screen.max_scroll_y != 0:
                    self.reading_progress = new / screen.max_scroll_y
                return result

            return new_watcher

        screen_scroll_y_watcher = getattr(self.screen, "watch_scroll_y")
        setattr(self.screen, "watch_scroll_y", screen_watch_scroll_y_wrapper(screen_scroll_y_watcher, self.screen))

    def get_css_variables(self):
        # Expose config values as CSS variables for style.css to resolve.
        original = super().get_css_variables()
        return {
            **original,
            **{
                "text-max-width": self.config.max_text_width,
                "text-justification": self.config.text_justification,
                "dark-bg": self.config.dark.bg,
                "dark-fg": self.config.dark.fg,
                "dark-accent": self.config.dark.accent,
                "light-bg": self.config.light.bg,
                "light-fg": self.config.light.fg,
                "light-accent": self.config.light.accent,
            },
        }

    async def on_key(self, event: events.Key) -> None:
        # Central key dispatch: first matching KeyMap wins.
        keymaps = self.config.keymaps
        await dispatch_key(
            [
                KeyMap(keymaps.close, self.action_cancel_search_or_quit),
                KeyMap(keymaps.scroll_down, self.screen.action_scroll_down),
                KeyMap(keymaps.scroll_up, self.screen.action_scroll_up),
                # KeyMap(keymaps.page_up, self.screen.action_page_up),
                # KeyMap(keymaps.page_down, self.screen.action_page_down),
                KeyMap(keymaps.page_up, self.action_page_up),
                KeyMap(keymaps.page_down, self.action_page_down),
                KeyMap(keymaps.home, self.screen.action_scroll_home),
                KeyMap(keymaps.end, self.screen.action_scroll_end),
                KeyMap(keymaps.open_toc, self.action_open_toc),
                KeyMap(keymaps.open_metadata, self.action_open_metadata),
                KeyMap(keymaps.open_help, self.action_open_help),
                KeyMap(keymaps.toggle_dark, self.action_toggle_dark),
                KeyMap(keymaps.screenshot, lambda: self.post_message(Screenshot())),
                KeyMap(keymaps.search_forward, lambda: self.action_input_search(forward=True)),
                KeyMap(keymaps.search_backward, lambda: self.action_input_search(forward=False)),
                KeyMap(keymaps.next_match, self.action_search_next),
                KeyMap(keymaps.prev_match, self.action_search_prev),
                KeyMap(keymaps.confirm, self.action_stop_search),
                # KeyMap(["D"], lambda: self.log()),
            ],
            event,
        )

    def compose(self) -> ComposeResult:
        # Only the loader is composed up-front; Content is mounted after parsing.
        yield LoadingIndicator(id="startup-loader")

    async def alert(self, message: str) -> None:
        alert = Alert(self.config, message)
        await self.mount(alert)

    async def action_open_metadata(self) -> None:
        # Guard: only one metadata window at a time.
        if self.metadata_window is None:
            metadata_window = DictDisplay(
                config=self.config, id="metadata", title="Metadata", data=dataclasses.asdict(self.ebook.get_meta())
            )
            await self.mount(metadata_window)

    def action_page_down(self) -> None:
        if not self.screen.allow_vertical_scroll:
            raise SkipAction()
        self.screen.scroll_page_down(duration=self.config.page_scroll_duration)

    def action_page_up(self) -> None:
        if not
self.screen.allow_vertical_scroll:
            raise SkipAction()
        self.screen.scroll_page_up(duration=self.config.page_scroll_duration)

    async def action_input_search(self, forward: bool) -> None:
        await self.mount(SearchInputPrompt(forward=forward))

    async def action_search_next(self) -> bool:
        # Returns True when a match was found (checked by on_search_submitted).
        if self.search_mode is not None:
            new_coord = await self.content.search_next(
                self.search_mode.pattern_str,
                self.search_mode.current_coord,
                self.search_mode.forward,
            )
            if new_coord is not None:
                self.search_mode = dataclasses.replace(self.search_mode, current_coord=new_coord)
                return True
            else:
                # TODO: inconsistent alert window size on initial search
                await self.alert(f"Found no match: '{self.search_mode.pattern_str}'")

        return False

    async def action_search_prev(self) -> None:
        # Same as action_search_next but with the direction inverted.
        if self.search_mode is not None:
            new_coord = await self.content.search_next(
                self.search_mode.pattern_str,
                self.search_mode.current_coord,
                not self.search_mode.forward,
            )
            if new_coord is not None:
                self.search_mode = dataclasses.replace(self.search_mode, current_coord=new_coord)

    async def action_stop_search(self) -> None:
        if self.search_mode is not None:
            self.search_mode = None
            await self.content.clear_search()

    async def action_open_help(self) -> None:
        if self.help_window is None:
            # "open_toc" -> "Open Toc": comma-joined key list per action
            keymap_data = {
                k.replace("_", " ").title(): ",".join(v) for k, v in dataclasses.asdict(self.config.keymaps).items()
            }
            help_window = DictDisplay(config=self.config, id="help", title="Keymaps", data=keymap_data)
            await self.mount(help_window)

    async def action_open_toc(self) -> None:
        if self.toc_window is None:
            toc_entries = list(self.ebook.get_toc())
            if len(toc_entries) == 0:
                return await self.alert("No content navigations for this ebook.")

            # Preselect the ToC entry for the section currently scrolled into view.
            initial_index = 0
            toc_values = [e.value for e in toc_entries]
            for s in self.content.get_navigables():
                if s.nav_point is not None and s.nav_point in toc_values:
                    # if round(self.screen.scroll_y) >= s.virtual_region.y:
                    if self.screen.scroll_offset.y >= s.virtual_region.y:
                        initial_index = toc_values.index(s.nav_point)
                    else:
                        break

            toc = ToC(self.config, entries=toc_entries, initial_index=initial_index)
            # NOTE: await to prevent broken layout
            await self.mount(toc)

    async def action_cancel_search_or_quit(self) -> None:
        # One key serves double duty: cancel an active search (restoring the
        # pre-search scroll position), otherwise quit the app.
        if self.search_mode is not None:
            self.screen.scroll_to(
                0, self.search_mode.saved_position * self.screen.max_scroll_y, duration=self.config.page_scroll_duration
            )
            await self.action_stop_search()
        else:
            await self.action_quit()

    async def action_link(self, link: str) -> None:
        if is_url(link):
            try:
                await launch_file(link)
            except LaunchingFileError as e:
                await self.alert(str(e))

        elif link in [n.nav_point for n in self.content.get_navigables()]:
            self.content.scroll_to_section(link)

        else:
            await self.alert(f"No nav point found in document: {link}")

    async def on_search_submitted(self, message: SearchSubmitted) -> None:
        # Start just off-screen so the first match on the current row is found.
        self.search_mode = SearchMode(
            pattern_str=message.value,
            current_coord=Coordinate(-1 if message.forward else self.content.size.width, self.screen.scroll_offset.y),
            forward=message.forward,
            saved_position=self.reading_progress,
        )
        is_found = await self.action_search_next()
        if not is_found:
            self.search_mode = None

    async def on_follow_this(self, message: FollowThis) -> None:
        self.content.scroll_to_section(message.nav_point)
        # NOTE: remove after refresh so the event get handled
        self.call_after_refresh(self.toc_window.remove)  # type: ignore

    async def on_open_this_image(self, message: OpenThisImage) -> None:
        # Extract the image into the ebook's tempdir, then hand it to a viewer.
        try:
            filename, bytestr = self.ebook.get_img_bytestr(message.value)
            tmpfilepath = self.ebook.get_tempdir() / filename
            with open(tmpfilepath, "wb") as img_tmp:
                img_tmp.write(bytestr)

            await launch_file(tmpfilepath, preferred=self.config.preferred_image_viewer)
        except LaunchingFileError as e:
            await self.alert(f"Error opening an image: {e}")

    async def on_screenshot(self, _: Screenshot) -> None:
        self.save_screenshot(f"baca_{datetime.now().isoformat()}.svg")

    def run(self, *args, **kwargs):
        try:
            return super().run(*args, **kwargs)
        finally:
            # Persist reading state and clean the tempdir even on abnormal exit.
            meta = self.ebook.get_meta()
            self.ebook_state.last_read = datetime.now()  # type: ignore
            self.ebook_state.title = meta.title  # type: ignore
            self.ebook_state.author = meta.creator  # type: ignore
            self.ebook_state.reading_progress = self.reading_progress  # type: ignore
            self.ebook_state.save()
            self.ebook.cleanup()

    @property
    def toc_window(self) -> ToC | None:
        # None when no ToC window is currently mounted.
        try:
            return self.query_one(ToC.__name__, ToC)
        except NoMatches:
            return None

    @property
    def metadata_window(self) -> DictDisplay | None:
        try:
            return self.get_widget_by_id("metadata", DictDisplay)
        except NoMatches:
            return None

    @property
    def help_window(self) -> DictDisplay | None:
        try:
            return self.get_widget_by_id("help", DictDisplay)
        except NoMatches:
            return None

    @property
    def content(self) -> Content:
        return self.query_one(Content.__name__, Content)

    # def _remove_nodes(self, widgets: list[Widget], parent: DOMNode) -> AwaitRemove:
    #     await_remove = super()._remove_nodes(widgets, parent)
    #     self.refresh(layout=True)
    #     return await_remove
    # def on_mount(self) -> None:
    #     self.screen.can_focus = True
--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_dict.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

from __future__ import unicode_literals, division, absolute_import, print_function

from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr

if PY2:
    range = xrange
    array_format = b'B'
if PY3:
    unichr = chr
    array_format = "B"

import array

import struct
# note: struct pack, unpack, unpack_from all require bytestring format
# data all the way up to at least python 2.7.5, python 3 okay with bytestring

from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
from .mobi_utils import toHex

DEBUG_DICT = False

class InflectionData(object):
    """Wraps one or more inflection data sections and maps a global lookup
    value onto the section (and local offset) that holds it."""

    def __init__(self, infldatas):
        self.infldatas = infldatas
        # per-section offset-table start (at 0x14) and entry count (at 0x18)
        self.starts = []
        self.counts = []
        for idata in self.infldatas:
            start, = struct.unpack_from(b'>L', idata, 0x14)
            count, = struct.unpack_from(b'>L', idata, 0x18)
            self.starts.append(start)
            self.counts.append(count)

    def lookup(self, lookupvalue):
        # Walk the sections, subtracting each section's count, until the
        # remaining value indexes into the current section.
        i = 0
        rvalue = lookupvalue
        while rvalue >= self.counts[i]:
            rvalue = rvalue - self.counts[i]
            i += 1
            if i == len(self.counts):
                # fall back to the first section rather than crashing
                print("Error: Problem with multiple inflections data sections")
                return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
        return rvalue, self.starts[i], self.counts[i], self.infldatas[i]

    def offsets(self, value):
        # Return (offset, nextOffset, data) for the entry; nextOffset is None
        # for the last entry of a section.
        rvalue, start, count, data = self.lookup(value)
        offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
        if rvalue + 1 < count:
            nextOffset, = struct.unpack_from(b'>H', data, start + 4 + (2 * (rvalue + 1)))
        else:
            nextOffset = None
        return offset, nextOffset, data


class dictSupport(object):
    """Builds dictionary (orthographic/inflection) markup position maps for a
    mobi with orth/infl indices."""

    def __init__(self, mh, sect):
        self.mh = mh
        self.header = mh.header
        self.sect = sect
        self.metaOrthIndex = mh.metaOrthIndex
        self.metaInflIndex = mh.metaInflIndex

    def parseHeader(self, data):
        "read INDX header"
        if not data[:4] == b'INDX':
            # NOTE(review): callers unpack three values from this return, so a
            # bare False raises TypeError at the call site -- confirm intent.
            print("Warning: index section is not INDX")
            return False
        words = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
        )
        num = len(words)
        values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
        header = {}
        for n in range(num):
            header[words[n]] = values[n]

        ordt1 = None
        ordt2 = None

        otype, oentries, op1, op2, otagx = struct.unpack_from(b'>LLLLL', data, 0xa4)
        header['otype'] = otype
        header['oentries'] = oentries

        if DEBUG_DICT:
            print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))

        if header['code'] == 0xfdea or oentries > 0:
            # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
            # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
            # So we need to look for them and store them away to process leading text
            # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
            # we only ever seem to use the second but ...
            #
            # if otype = 0, ORDT table uses 16 bit values as offsets into the table
            # if otype = 1, ORDT table uses 8 bit values as offsets into the table

            assert(data[op1:op1+4] == b'ORDT')
            assert(data[op2:op2+4] == b'ORDT')
            ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
            ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)

        if DEBUG_DICT:
            print("parsed INDX header:")
            for key in header:
                print(key, "%x" % header[key],)
            print("\n")
        return header, ordt1, ordt2

    def getPositionMap(self):
        """Return a dict mapping text positions to dictionary markup bytes to
        be inserted at those positions."""
        sect = self.sect

        positionMap = {}

        metaOrthIndex = self.metaOrthIndex
        metaInflIndex = self.metaInflIndex

        decodeInflection = True
        if metaOrthIndex != 0xFFFFFFFF:
            print("Info: Document contains orthographic index, handle as dictionary")
            if metaInflIndex == 0xFFFFFFFF:
                decodeInflection = False
            else:
                metaInflIndexData = sect.loadSection(metaInflIndex)

                print("\nParsing metaInflIndexData")
                midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)

                metaIndexCount = midxhdr['count']
                idatas = []
                for j in range(metaIndexCount):
                    idatas.append(sect.loadSection(metaInflIndex + 1 + j))
                dinfl = InflectionData(idatas)

                # inflection rule names live in the section after the data sections
                inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
                tagSectionStart = midxhdr['len']
                inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
                if DEBUG_DICT:
                    print("inflectionTagTable: %s" % inflectionTagTable)
                if self.hasTag(inflectionTagTable, 0x07):
                    print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
                    decodeInflection = False

            data = sect.loadSection(metaOrthIndex)

            print("\nParsing metaOrthIndex")
            idxhdr, hordt1, hordt2 = self.parseHeader(data)
155 | tagSectionStart = idxhdr['len'] 156 | controlByteCount, tagTable = readTagSection(tagSectionStart, data) 157 | orthIndexCount = idxhdr['count'] 158 | print("orthIndexCount is", orthIndexCount) 159 | if DEBUG_DICT: 160 | print("orthTagTable: %s" % tagTable) 161 | if hordt2 is not None: 162 | print("orth entry uses ordt2 lookup table of type ", idxhdr['otype']) 163 | hasEntryLength = self.hasTag(tagTable, 0x02) 164 | if not hasEntryLength: 165 | print("Info: Index doesn't contain entry length tags") 166 | 167 | print("Read dictionary index data") 168 | for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount): 169 | data = sect.loadSection(i) 170 | hdrinfo, ordt1, ordt2 = self.parseHeader(data) 171 | idxtPos = hdrinfo['start'] 172 | entryCount = hdrinfo['count'] 173 | idxPositions = [] 174 | for j in range(entryCount): 175 | pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j)) 176 | idxPositions.append(pos) 177 | # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!) 
178 | idxPositions.append(idxtPos) 179 | for j in range(entryCount): 180 | startPos = idxPositions[j] 181 | endPos = idxPositions[j+1] 182 | textLength = ord(data[startPos:startPos+1]) 183 | text = data[startPos+1:startPos+1+textLength] 184 | if hordt2 is not None: 185 | utext = u"" 186 | if idxhdr['otype'] == 0: 187 | pattern = b'>H' 188 | inc = 2 189 | else: 190 | pattern = b'>B' 191 | inc = 1 192 | pos = 0 193 | while pos < textLength: 194 | off, = struct.unpack_from(pattern, text, pos) 195 | if off < len(hordt2): 196 | utext += unichr(hordt2[off]) 197 | else: 198 | utext += unichr(off) 199 | pos += inc 200 | text = utext.encode('utf-8') 201 | 202 | tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos) 203 | if 0x01 in tagMap: 204 | if decodeInflection and 0x2a in tagMap: 205 | inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable, 206 | dinfl, inflNameData, tagMap[0x2a]) 207 | else: 208 | inflectionGroups = b'' 209 | assert len(tagMap[0x01]) == 1 210 | entryStartPosition = tagMap[0x01][0] 211 | if hasEntryLength: 212 | # The idx:entry attribute "scriptable" must be present to create entry length tags. 
213 | ml = b'' + inflectionGroups + b'' 214 | if entryStartPosition in positionMap: 215 | positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml 216 | else: 217 | positionMap[entryStartPosition] = ml 218 | assert len(tagMap[0x02]) == 1 219 | entryEndPosition = entryStartPosition + tagMap[0x02][0] 220 | if entryEndPosition in positionMap: 221 | positionMap[entryEndPosition] = b"" + positionMap[entryEndPosition] 222 | else: 223 | positionMap[entryEndPosition] = b"" 224 | 225 | else: 226 | indexTags = b'\n\n' + inflectionGroups + b'\n' 227 | if entryStartPosition in positionMap: 228 | positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags 229 | else: 230 | positionMap[entryStartPosition] = indexTags 231 | return positionMap 232 | 233 | def hasTag(self, tagTable, tag): 234 | ''' 235 | Test if tag table contains given tag. 236 | 237 | @param tagTable: The tag table. 238 | @param tag: The tag to search. 239 | @return: True if tag table contains given tag; False otherwise. 240 | ''' 241 | for currentTag, _, _, _ in tagTable: 242 | if currentTag == tag: 243 | return True 244 | return False 245 | 246 | def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList): 247 | ''' 248 | Create string which contains the inflection groups with inflection rules as mobipocket tags. 249 | 250 | @param mainEntry: The word to inflect. 251 | @param controlByteCount: The number of control bytes. 252 | @param tagTable: The tag table. 253 | @param data: The Inflection data object to properly select the right inflection data section to use 254 | @param inflectionNames: The inflection rule name data. 255 | @param groupList: The list of inflection groups to process. 256 | @return: String with inflection groups and rules or empty string if required tags are not available. 
257 | ''' 258 | result = b"" 259 | for value in groupList: 260 | offset, nextOffset, data = dinfl.offsets(value) 261 | 262 | # First byte seems to be always 0x00 and must be skipped. 263 | assert ord(data[offset:offset+1]) == 0x00 264 | tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset) 265 | 266 | # Make sure that the required tags are available. 267 | if 0x05 not in tagMap: 268 | print("Error: Required tag 0x05 not found in tagMap") 269 | return "" 270 | if 0x1a not in tagMap: 271 | print("Error: Required tag 0x1a not found in tagMap") 272 | return b'' 273 | 274 | result += b'' 275 | 276 | for i in range(len(tagMap[0x05])): 277 | 278 | # Get name of inflection rule. 279 | value = tagMap[0x05][i] 280 | consumed, textLength = getVariableWidthValue(inflectionNames, value) 281 | inflectionName = inflectionNames[value+consumed:value+consumed+textLength] 282 | 283 | # Get and apply inflection rule across possibly multiple inflection data sections 284 | value = tagMap[0x1a][i] 285 | rvalue, start, count, data = dinfl.lookup(value) 286 | offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue)) 287 | textLength = ord(data[offset:offset+1]) 288 | inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength) 289 | if inflection is not None: 290 | result += b' ' 291 | 292 | result += b'' 293 | return result 294 | 295 | def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end): 296 | ''' 297 | Apply inflection rule. 298 | 299 | @param mainEntry: The word to inflect. 300 | @param inflectionRuleData: The inflection rules. 301 | @param start: The start position of the inflection rule to use. 302 | @param end: The end position of the inflection rule to use. 303 | @return: The string with the inflected word or None if an error occurs. 
304 | ''' 305 | mode = -1 306 | byteArray = array.array(array_format, mainEntry) 307 | position = len(byteArray) 308 | for charOffset in range(start, end): 309 | char = inflectionRuleData[charOffset:charOffset+1] 310 | abyte = ord(char) 311 | if abyte >= 0x0a and abyte <= 0x13: 312 | # Move cursor backwards 313 | offset = abyte - 0x0a 314 | if mode not in [0x02, 0x03]: 315 | mode = 0x02 316 | position = len(byteArray) 317 | position -= offset 318 | elif abyte > 0x13: 319 | if mode == -1: 320 | print("Error: Unexpected first byte %i of inflection rule" % abyte) 321 | return None 322 | elif position == -1: 323 | print("Error: Unexpected first byte %i of inflection rule" % abyte) 324 | return None 325 | else: 326 | if mode == 0x01: 327 | # Insert at word start 328 | byteArray.insert(position, abyte) 329 | position += 1 330 | elif mode == 0x02: 331 | # Insert at word end 332 | byteArray.insert(position, abyte) 333 | elif mode == 0x03: 334 | # Delete at word end 335 | position -= 1 336 | deleted = byteArray.pop(position) 337 | if bchr(deleted) != char: 338 | if DEBUG_DICT: 339 | print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) 340 | print("Error: Delete operation of inflection rule failed") 341 | return None 342 | elif mode == 0x04: 343 | # Delete at word start 344 | deleted = byteArray.pop(position) 345 | if bchr(deleted) != char: 346 | if DEBUG_DICT: 347 | print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted))) 348 | print("Error: Delete operation of inflection rule failed") 349 | return None 350 | else: 351 | print("Error: Inflection rule mode %x is not implemented" % mode) 352 | return None 353 | elif abyte == 0x01: 354 | # Insert at word start 355 | if mode not in [0x01, 0x04]: 356 | position = 0 357 | mode = abyte 358 | elif abyte == 0x02: 359 | # Insert at word end 360 | if mode not in [0x02, 0x03]: 361 | position = len(byteArray) 362 | mode = abyte 363 | elif abyte == 
def getint(datain, ofs, sz=b'L'):
    """Read one big-endian unsigned integer out of *datain* at byte offset *ofs*.

    *sz* is the struct size code: b'L' (32-bit, the default) or b'H' (16-bit).
    """
    (value,) = struct.unpack_from(b'>' + sz, datain, ofs)
    return value
def getsecaddr(datain, secno):
    # Return (start, end) byte offsets of PDB section `secno` inside `datain`.
    # NOTE(review): this span was garbled in the dump (everything between '<'
    # and '>' was stripped, fusing getsecaddr/readsection/writesection).
    # Reconstructed from the surviving writesection tail and from how these
    # helpers are called elsewhere in this file -- verify against upstream
    # KindleUnpack before shipping.
    nsec = getint(datain, number_of_pdb_records, b'H')
    # BUGFIX: the surviving fragment read `secno>=0 & secno<nsec`; `&` binds
    # tighter than the comparisons, so that expression does not test the range
    # as intended.  Use an explicit chained comparison instead.
    assert 0 <= secno < nsec, 'secno out of range'
    secstart = getint(datain, first_pdb_record + secno*8)
    if secno == nsec-1:
        # the last section's data runs to the end of the file
        secend = len(datain)
    else:
        secend = getint(datain, first_pdb_record + (secno+1)*8)
    return secstart, secend

def readsection(datain, secno):
    # Return the raw payload bytes of PDB section `secno`.
    secstart, secend = getsecaddr(datain, secno)
    return datain[secstart:secend]

def writesection(datain, secno, secdata):  # overwrite, accounting for different length
    # Replace the payload of section `secno` with `secdata`, rebuilding the
    # whole PDB image: the offsets of all later sections shift by the size
    # difference `dif`, and the unique-id seed is refreshed.
    datalst = []
    nsec = getint(datain, number_of_pdb_records, b'H')
    secstart, secend = getsecaddr(datain, secno)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    dif = len(secdata) - (secend - secstart)  # growth (or shrink) in bytes
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L', 2*nsec+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H', nsec))
    newstart = zerosecstart
    for i in range(0, secno):
        # sections before the target keep their offsets and flags unchanged
        ofs, flgval = struct.unpack_from(b'>2L', datain, first_pdb_record+i*8)
        datalst.append(struct.pack(b'>L', ofs) + struct.pack(b'>L', flgval))
    datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
    for i in range(secno+1, nsec):
        # sections after the target shift by the payload size difference
        ofs, flgval = struct.unpack_from(b'>2L', datain, first_pdb_record+i*8)
        ofs = ofs + dif
        datalst.append(struct.pack(b'>L', ofs) + struct.pack(b'>L', flgval))
    lpad = newstart - (first_pdb_record + 8*nsec)
    if lpad > 0:
        datalst.append(b'\0' * lpad)  # preserve any gap between index and data
    datalst.append(datain[zerosecstart:secstart])
    datalst.append(secdata)
    datalst.append(datain[secend:])
    dataout = b''.join(datalst)
    return dataout
def deletesectionrange(datain,firstsec,lastsec):  # delete a range of sections
    # Remove whole PDB sections [firstsec, lastsec] (inclusive) from the
    # database image `datain` and return the rebuilt image.  The section
    # index table shrinks by (lastsec-firstsec+1) 8-byte entries, so every
    # surviving section's data offset must shift left -- by 8*removed for
    # sections before the range, and additionally by the removed payload
    # size for sections after it.
    datalst = []
    firstsecstart,firstsecend = getsecaddr(datain,firstsec)
    lastsecstart,lastsecend = getsecaddr(datain,lastsec)
    zerosecstart, zerosecend = getsecaddr(datain, 0)
    # total leftward shift for sections after the range: removed payload
    # bytes plus the removed index entries
    dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
    nsec = getint(datain,number_of_pdb_records,b'H')
    datalst.append(datain[:unique_id_seed])
    # refresh the unique-id seed for the reduced record count
    datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
    newstart = zerosecstart - 8*(lastsec-firstsec+1)
    for i in range(0,firstsec):
        # leading sections: data unchanged, offset moves left by the
        # shrunken index table only
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs-8*(lastsec-firstsec+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    for i in range(lastsec+1,nsec):
        # trailing sections: shift by the full dif and renumber the
        # per-record flag/id value (convention: 2*index)
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs - dif
        flgval = 2*(i-(lastsec-firstsec+1))
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
    if lpad > 0:
        # preserve any padding gap between the index table and section data
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:firstsecstart])
    datalst.append(datain[lastsecend:])
    dataout = b''.join(datalst)
    return dataout
def insertsection(datain,secno,secdata):  # insert a new section
    # Insert `secdata` as a brand-new section at index `secno`, rebuilding
    # the PDB image.  The index table grows by one 8-byte entry, so even
    # sections *before* the insertion point shift right by 8; sections at
    # or after `secno` additionally shift by the new payload's length and
    # are renumbered.
    datalst = []
    nsec = getint(datain,number_of_pdb_records,b'H')
    # print("inserting secno" , secno, "into" ,nsec, "sections")
    secstart,secend = getsecaddr(datain,secno)
    zerosecstart,zerosecend = getsecaddr(datain,0)
    dif = len(secdata)  # payload growth seen by the sections that follow
    datalst.append(datain[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+1)+1))  # refreshed unique-id seed
    datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+1))  # record count grows by one
    newstart = zerosecstart + 8
    for i in range(0,secno):
        # leading sections only move by the one new index entry
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs += 8
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
    # index entry for the new section: its data lands where secno's data
    # used to start (shifted by the grown table); flag/id is 2*index
    datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
    for i in range(secno,nsec):
        # displaced sections shift by payload + table growth and renumber
        ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
        ofs = ofs + dif + 8
        flgval = 2*(i+1)
        datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
    lpad = newstart - (first_pdb_record + 8*(nsec + 1))
    if lpad > 0:
        # preserve any padding gap between the index table and section data
        datalst.append(b'\0' * lpad)
    datalst.append(datain[zerosecstart:secstart])
    datalst.append(secdata)
    datalst.append(datain[secstart:])
    dataout = b''.join(datalst)
    return dataout
def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec):  # insert a range of sections
    # Copy sections [firstsec, lastsec] from `sectionsource` into
    # `sectiontarget`, starting at index `targetsec`, in a single rebuild
    # pass.  Equivalent to the commented-out insertsection loop below, but
    # avoids re-copying the whole image once per inserted section.
    # print("inserting secno" , firstsec, "to", lastsec, "into" ,targetsec, "sections")
    # dataout = sectiontarget
    # for idx in range(lastsec,firstsec-1,-1):
    #     dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
    # return dataout
    datalst = []
    nsec = getint(sectiontarget,number_of_pdb_records,b'H')
    zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
    insstart, nul = getsecaddr(sectiontarget,targetsec)
    nins = lastsec - firstsec + 1  # number of sections being inserted
    srcstart, nul = getsecaddr(sectionsource,firstsec)
    nul, srcend = getsecaddr(sectionsource,lastsec)
    newstart = zerosecstart + 8*nins

    datalst.append(sectiontarget[:unique_id_seed])
    datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))  # refreshed unique-id seed
    datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
    datalst.append(struct.pack(b'>H',nsec+nins))  # record count grows by nins
    for i in range(0,targetsec):
        # leading target sections: data unchanged, offsets move right by
        # the nins new index entries
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + 8*nins
        flgvalnew = flgval
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    srcstart0, nul = getsecaddr(sectionsource,firstsec)
    for i in range(nins):
        # index entries for the copied sections, preserving their relative
        # layout within the source; flag/id renumbered to 2*(new index)
        isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
        ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
        flgvalnew = 2*(targetsec+i)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
        # print(ofsnew, flgvalnew)
    dif = srcend - srcstart  # total payload bytes inserted
    for i in range(targetsec,nsec):
        # trailing target sections shift by inserted payload + table growth
        ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
        ofsnew = ofs + dif + 8*nins
        flgvalnew = 2*(i+nins)
        datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
        # print(ofsnew, flgvalnew, ofs, flgval)
    lpad = newstart - (first_pdb_record + 8*(nsec + nins))
    if lpad > 0:
        # preserve any padding gap between the index table and section data
        datalst.append(b'\0' * lpad)
    datalst.append(sectiontarget[zerosecstart:insstart])
    datalst.append(sectionsource[srcstart:srcend])
    datalst.append(sectiontarget[insstart:])
    dataout = b''.join(datalst)
    return dataout
def read_exth(rec0,exth_num):
    # Collect the value blobs of every EXTH record with id `exth_num` from
    # header record `rec0`.  Returns a (possibly empty) list because the
    # same EXTH id may legally occur more than once.
    exth_values = []
    ebase,elen,enum = get_exth_params(rec0)
    ebase = ebase+12  # skip the 12-byte EXTH header to the first record
    while enum>0:
        exth_id = getint(rec0,ebase)
        if exth_id == exth_num:
            # We might have multiple exths, so build a list.
            exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
        enum = enum-1
        ebase = ebase+getint(rec0,ebase+4)  # advance by the record's own length
    return exth_values

def write_exth(rec0,exth_num,exth_bytes):
    # Replace the value of the FIRST EXTH record with id `exth_num` by
    # `exth_bytes`, fixing up the EXTH block length and the title offset
    # when the record's size changes.  Returns rec0 unchanged if the id is
    # absent (only the first match is rewritten).
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = enum
    while enum_idx>0:
        exth_id = getint(rec0,ebase_idx)
        if exth_id == exth_num:
            dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
            newrec0 = rec0
            if dif != 0:
                # record grew/shrank: shift the title offset accordingly
                newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
            return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
                struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
                struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
                rec0[ebase_idx+getint(rec0,ebase_idx+4):]
        enum_idx = enum_idx-1
        ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
    return rec0

def del_exth(rec0,exth_num):
    # Delete the FIRST EXTH record with id `exth_num`: shrink the EXTH
    # block, decrement the record count and move the title offset back by
    # the removed record's size.  Returns rec0 unchanged if the id is absent.
    ebase,elen,enum = get_exth_params(rec0)
    ebase_idx = ebase+12
    enum_idx = 0
    while enum_idx < enum:
        exth_id = getint(rec0,ebase_idx)
        exth_size = getint(rec0,ebase_idx+4)
        if exth_id == exth_num:
            newrec0 = rec0
            newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
            newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
            newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
            return newrec0
        enum_idx += 1
        ebase_idx = ebase_idx+exth_size
    return rec0
    def __init__(self, infile):
        # Split a combination MOBI (mobi7 + KF8/mobi8 in one PDB container)
        # into two standalone files, kept in memory as self.result_file7 and
        # self.result_file8.  If the input is not a combo file, self.combo is
        # set False and no result files are produced.
        #
        # @param infile: path to the .mobi/.azw file to split.
        datain = b''
        with open(pathof(infile), 'rb') as f:
            datain = f.read()
        datain_rec0 = readsection(datain,0)
        ver = getint(datain_rec0,mobi_version)
        self.combo = (ver!=8)  # a pure KF8 file (version 8) is not a combo
        if not self.combo:
            return
        # EXTH 121 holds the KF8 boundary section number in a combo file
        exth121 = read_exth(datain_rec0,121)
        if len(exth121) == 0:
            self.combo = False
            return
        else:
            # only pay attention to first exth121
            # (there should only be one)
            datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
            if datain_kf8 == 0xffffffff:
                self.combo = False
                return
        datain_kfrec0 =readsection(datain,datain_kf8)

        # create the standalone mobi7
        num_sec = getint(datain,number_of_pdb_records,b'H')
        # remove BOUNDARY up to but not including ELF record
        self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
        # check if there are SRCS records and delete them
        srcs = getint(datain_rec0,srcs_index)
        num_srcs = getint(datain_rec0,srcs_count)
        if srcs != 0xffffffff and num_srcs > 0:
            self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
            datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
            datain_rec0 = writeint(datain_rec0,srcs_count,0)
        # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
        datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
        # datain_rec0 = del_exth(datain_rec0,121)
        # datain_rec0 = del_exth(datain_rec0,534)
        # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
        # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
        datain_rec0 = write_exth(datain_rec0,129, b'')
        # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
        # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
        fval = fval & 0x07FF
        datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]

        self.result_file7 = writesection(self.result_file7,0,datain_rec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(datain, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)

        # locate the shared image/resource sections so they can be copied
        # into the standalone mobi8 below
        firstimage = getint(datain_rec0,first_resc_record)
        lastimage = getint(datain_rec0,last_content_index,b'H')
        # print("Old First Image, last Image", firstimage,lastimage)
        if lastimage == 0xffff:
            # find the lowest of the next sections and copy up to that.
            ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
            for ofs,sz in ofs_list:
                n = getint(datain_rec0,ofs,sz)
                # print("n",n)
                if n > 0 and n < lastimage:
                    lastimage = n-1
        print("First Image, last Image", firstimage,lastimage)

        # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
        for i in range(firstimage,lastimage):
            imgsec = readsection(self.result_file7,i)
            if imgsec[0:4] in [b'RESC',b'FONT']:
                self.result_file7 = nullsection(self.result_file7,i)

        # mobi7 finished

        # create standalone mobi8
        self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
        target = getint(datain_kfrec0,first_resc_record)
        self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
        datain_kfrec0 =readsection(self.result_file8,0)

        # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
        kf8starts = read_exth(datain_kfrec0,116)
        # If we have multiple StartOffset, keep only the last one
        kf8start_count = len(kf8starts)
        while kf8start_count > 1:
            kf8start_count -= 1
            datain_kfrec0 = del_exth(datain_kfrec0,116)

        # update the EXTH 125 KF8 Count of Images/Fonts/Resources
        datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))

        # need to reset flags stored in 0x80-0x83
        # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
        # standalone mobi8 with exth: 0x0050
        # Bit Flags
        # 0x1000 = Bit 12 indicates if embedded fonts are used or not
        # 0x0800 = means this Header points to *shared* images/resource/fonts ??
        # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
        # 0x0040 = exth exists
        # 0x0010 = Not sure but this is always set so far
        # NOTE(review): '>L' here is a plain str format while the rest of the
        # file uses the b'>L' bytes form; struct accepts both on Python 3,
        # but this is inconsistent with the file's convention.
        fval, = struct.unpack_from('>L',datain_kfrec0, 0x80)
        fval = fval & 0x1FFF
        fval |= 0x0800
        datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]

        # properly update other index pointers that have been shifted by the insertion of images
        ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
        for ofs,sz in ofs_list:
            n = getint(datain_kfrec0,ofs,sz)
            if n != 0xffffffff:
                datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
        self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)

        # no need to replace kf8 style fcis with mobi 7 one
        # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
        # if fcis_secnum != 0xffffffff:
        #     fcis_info = readsection(self.result_file8, fcis_secnum)
        #     text_len, = struct.unpack_from(b'>L', fcis_info, 0x14)
        #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
        #     new_fcis += struct.pack(b'>L',text_len)
        #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
        #     self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)

        # mobi8 finished

    def getResult8(self):
        # Bytes of the standalone KF8 (mobi8) file; only valid when self.combo.
        return self.result_file8

    def getResult7(self):
        # Bytes of the standalone mobi7 file; only valid when self.combo.
        return self.result_file7