├── src
    └── baca
    │   ├── utils
    │       ├── __init__.py
    │       ├── urls.py
    │       ├── tempdir.py
    │       ├── app_resources.py
    │       ├── keys_parser.py
    │       ├── user_appdirs.py
    │       ├── systems.py
    │       ├── queries.py
    │       ├── html_parser.py
    │       └── cli.py
    │   ├── components
    │       ├── __init__.py
    │       ├── events.py
    │       ├── windows.py
    │       └── contents.py
    │   ├── resources
    │       ├── __init__.py
    │       ├── config.ini
    │       └── style.css
    │   ├── tools
    │       ├── KindleUnpack
    │       │   ├── __init__.py
    │       │   ├── unipath.py
    │       │   ├── mobi_uncompress.py
    │       │   ├── mobi_sectioner.py
    │       │   ├── mobi_pagemap.py
    │       │   ├── mobi_nav.py
    │       │   ├── unpack_structure.py
    │       │   ├── mobi_utils.py
    │       │   ├── mobi_cover.py
    │       │   ├── compatibility_utils.py
    │       │   ├── mobi_ncx.py
    │       │   ├── mobi_k8resc.py
    │       │   ├── mobi_index.py
    │       │   ├── mobi_dict.py
    │       │   └── mobi_split.py
    │       └── __init__.py
    │   ├── __init__.py
    │   ├── ebooks
    │       ├── __init__.py
    │       ├── azw.py
    │       ├── base.py
    │       ├── mobi.py
    │       └── epub.py
    │   ├── exceptions.py
    │   ├── __main__.py
    │   ├── db.py
    │   ├── config.py
    │   ├── models.py
    │   └── app.py
├── .gitignore
├── poetry.toml
├── Makefile
├── tools
    └── debug.py
├── pyproject.toml
├── tests
    └── test_html_parser.py
└── README.md


/src/baca/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/baca/components/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/baca/resources/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist
2 | tmp/
3 | __pycache__/
4 | *.pyc
5 | .envrc
6 | 


--------------------------------------------------------------------------------
/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | in-project = true
3 | create = true
4 | path = ".venv"
5 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3 | 


--------------------------------------------------------------------------------
/src/baca/tools/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["unpack_kindle_book"]
2 | 
3 | from .KindleUnpack.kindleunpack import unpackBook as unpack_kindle_book
4 | 


--------------------------------------------------------------------------------
/src/baca/utils/urls.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse
2 | 
3 | 
4 | def is_url(url: str) -> bool:
5 |     return urlparse(url).scheme != ""
6 | 


--------------------------------------------------------------------------------
/src/baca/__init__.py:
--------------------------------------------------------------------------------
1 | __appname__ = "baca"
2 | __version__ = "0.1.17"
3 | __license__ = "GPL-3.0"
4 | __author__ = "Benawi Adha"
5 | __email__ = "benawiadha@gmail.com"
6 | __url__ = "https://github.com/wustho/baca"
7 | 


--------------------------------------------------------------------------------
/src/baca/utils/tempdir.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 | 
4 | from baca import __appname__
5 | 
6 | 
7 | def create_tempdir() -> Path:
8 |     return Path(tempfile.mkdtemp(prefix=f"{__appname__}-"))
9 | 


--------------------------------------------------------------------------------
/src/baca/ebooks/__init__.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "Azw",
 3 |     "Ebook",
 4 |     "Epub",
 5 |     "Mobi",
 6 | ]
 7 | 
 8 | 
 9 | from baca.ebooks.azw import Azw
10 | from baca.ebooks.base import Ebook
11 | from baca.ebooks.epub import Epub
12 | from baca.ebooks.mobi import Mobi
13 | 


--------------------------------------------------------------------------------
/src/baca/utils/app_resources.py:
--------------------------------------------------------------------------------
1 | from importlib.resources import as_file, files
2 | from pathlib import Path
3 | 
4 | 
5 | def get_resource_file(filename: str) -> Path:
6 |     with as_file(files("baca.resources").joinpath(filename)) as resource_file:
7 |         return resource_file
8 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: tests
 2 | 
 3 | tests:
 4 | 	python -m pytest tests
 5 | 
 6 | format:
 7 | 	isort src
 8 | 	black src
 9 | 
10 | console:
11 | 	@textual console
12 | 
13 | debug:
14 | 	@textual run --dev tools/debug.py:Baca
15 | 
16 | publish:
17 | 	# poetry build
18 | 	poetry publish --build
19 | 
20 | typecheck:
21 | 	pyright src
22 | 


--------------------------------------------------------------------------------
/src/baca/exceptions.py:
--------------------------------------------------------------------------------
 1 | class TableDoesNotExist(Exception):
 2 |     pass
 3 | 
 4 | 
 5 | class BacaException(Exception):
 6 |     def __init__(self, message: str):
 7 |         super().__init__(f"BacaError: {message}")
 8 | 
 9 | 
10 | class EbookNotFound(BacaException):
11 |     pass
12 | 
13 | 
14 | class FormatNotSupported(BacaException):
15 |     pass
16 | 
17 | 
18 | class LaunchingFileError(Exception):
19 |     pass
20 | 


--------------------------------------------------------------------------------
/tools/debug.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from baca.app import Baca as _Baca
 4 | from baca.utils.cli import get_ebook_class
 5 | from baca.utils.queries import get_last_read_ebook
 6 | 
 7 | 
 8 | class Baca(_Baca):
 9 |     def __init__(self):
10 |         # file = Path("tmp/alice.azw3")
11 |         file = get_last_read_ebook()
12 |         assert file is not None
13 |         # file = Path("tmp/andy.epub")
14 |         # file = Path("tmp/frankenstein.older.mobi")
15 |         # file = Path("tmp/frankenstein.mobi")
16 |         super().__init__(file, get_ebook_class(file))
17 | 


--------------------------------------------------------------------------------
/src/baca/ebooks/azw.py:
--------------------------------------------------------------------------------
 1 | import contextlib
 2 | import os
 3 | import zipfile
 4 | from pathlib import Path
 5 | 
 6 | from baca.ebooks.epub import Epub
 7 | from baca.tools import unpack_kindle_book
 8 | from baca.utils.tempdir import create_tempdir
 9 | 
10 | 
11 | class Azw(Epub):
12 |     def __init__(self, ebook_path: Path):
13 |         self._path = ebook_path.resolve()
14 |         self._tempdir = create_tempdir()
15 |         self._tmpepub = self._tempdir / "mobi8" / f"{os.path.splitext(self._path)[0]}.epub"
16 |         with contextlib.redirect_stdout(None):
17 |             unpack_kindle_book(str(self._path), str(self._tempdir), epubver="A", use_hd=True)
18 |         self._file = zipfile.ZipFile(self._tmpepub, "r")
19 | 


--------------------------------------------------------------------------------
/src/baca/components/events.py:
--------------------------------------------------------------------------------
 1 | from textual.message import Message
 2 | 
 3 | 
 4 | class DoneLoading(Message):
 5 |     def __init__(self, content):
 6 |         super().__init__()
 7 |         self.content = content
 8 | 
 9 | 
10 | class FollowThis(Message):
11 |     def __init__(self, nav_point: str):
12 |         super().__init__()
13 |         self.nav_point = nav_point
14 | 
15 | 
16 | class OpenThisImage(Message):
17 |     def __init__(self, value: str):
18 |         super().__init__()
19 |         self.value = value
20 | 
21 | 
22 | class SearchSubmitted(Message):
23 |     def __init__(self, value: str, forward: bool):
24 |         super().__init__()
25 |         self.value = value
26 |         self.forward = forward
27 | 
28 | 
29 | class Screenshot(Message):
30 |     pass
31 | 


--------------------------------------------------------------------------------
/src/baca/utils/keys_parser.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | 
 3 | from textual import events
 4 | from textual.actions import SkipAction
 5 | 
 6 | from baca.models import KeyMap
 7 | 
 8 | 
 9 | async def dispatch_key(maps: list[KeyMap], event: events.Key, *, propagate: bool = True) -> None:
10 |     callback = {k: m.action for m in maps for k in m.keys}.get(event.key)
11 | 
12 |     if callback is not None:
13 |         try:
14 |             return_value = callback()
15 |             if inspect.isawaitable(return_value):
16 |                 await return_value
17 |         except SkipAction:
18 |             pass
19 | 
20 |     if propagate:
21 |         # stop propagating to base widget
22 |         event.prevent_default()
23 |         # stop propagating to parent widget
24 |         event.stop()
25 | 


--------------------------------------------------------------------------------
/src/baca/__main__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from rich.console import Console
 4 | from rich.text import Text
 5 | 
 6 | from baca.app import Baca
 7 | from baca.db import migrate
 8 | from baca.exceptions import EbookNotFound, FormatNotSupported
 9 | from baca.utils.cli import find_file, get_ebook_class
10 | 
11 | 
12 | def main():
13 |     try:
14 |         migrate()
15 |         ebook_path = find_file()
16 |         ebook_class = get_ebook_class(ebook_path)
17 |         return sys.exit(Baca(ebook_path=ebook_path, ebook_class=ebook_class).run())
18 | 
19 |     except (Exception, EbookNotFound, FormatNotSupported) as e:
20 |         console = Console()
21 |         if isinstance(e, (EbookNotFound, FormatNotSupported)):
22 |             console.print(Text(str(e), style="bold red"))
23 |         else:
24 |             console.print_exception()
25 |         sys.exit(-1)
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     main()
30 | 


--------------------------------------------------------------------------------
/src/baca/utils/user_appdirs.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pathlib import Path
 3 | 
 4 | import appdirs
 5 | 
 6 | from baca import __appname__
 7 | from baca.utils.app_resources import get_resource_file
 8 | 
 9 | DEFAULT_CONFIG = get_resource_file("config.ini")
10 | 
11 | 
12 | def retrieve_user_cache_dbfile() -> Path:
13 |     cachedir = appdirs.user_cache_dir(__appname__)
14 |     if not os.path.isdir(cachedir):
15 |         os.makedirs(cachedir)
16 | 
17 |     return Path(cachedir) / f"{__appname__}.db"
18 | 
19 | 
20 | def retrieve_user_config_file() -> Path:
21 |     configdir = Path(appdirs.user_config_dir(appname=__appname__))
22 |     if not os.path.isdir(configdir):
23 |         os.makedirs(configdir)
24 | 
25 |     configfile = configdir / "config.ini"
26 |     if not os.path.isfile(configfile):
27 |         # shutil.copyfile(str(DEFAULT_CONFIG), str(configfile))
28 |         with open(DEFAULT_CONFIG, "r", encoding="utf-8") as src, open(configfile, "w", encoding="utf-8") as dest:
29 |             dest.write(src.read())
30 | 
31 |     return configfile
32 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "baca"
 3 | version = "0.1.17"
 4 | description = "TUI Ebook Reader"
 5 | authors = ["Benawi Adha <benawiadha@gmail.com>"]
 6 | license = "GPL-3.0"
 7 | readme = "README.md"
 8 | packages = [
 9 |     { include = "baca", from = "src" }
10 | ]
11 | 
12 | [tool.poetry.scripts]
13 | baca = "baca.__main__:main"
14 | 
15 | [tool.poetry.dependencies]
16 | python = "^3.10"
17 | textual = "^0.16.0"
18 | beautifulsoup4 = "^4.12.0"
19 | markdownify = "^0.11.6"
20 | appdirs = "^1.4.4"
21 | peewee = "^3.16.0"
22 | fuzzywuzzy = "^0.18.0"
23 | climage = "^0.2.0"
24 | 
25 | [tool.poetry.group.dev.dependencies]
26 | black = "^23.1.0"
27 | isort = "^5.12.0"
28 | ipython = "^8.11.0"
29 | textual = {extras = ["dev"], version = "^0.16.0"}
30 | pytest = "^7.2.2"
31 | ipdb = "^0.13.13"
32 | 
33 | [build-system]
34 | requires = ["poetry-core>=1.0.0"]
35 | build-backend = "poetry.core.masonry.api"
36 | 
37 | [tool.pyright]
38 | exclude = ["src/baca/tools"]
39 | 
40 | [tool.black]
41 | line-length = 120
42 | target-version = ['py310']
43 | exclude = "src/baca/tools/"
44 | 
45 | [tool.isort]
46 | profile = "black"
47 | skip = "src/baca/tools/"
48 | 


--------------------------------------------------------------------------------
/src/baca/utils/systems.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | import platform
 4 | import shutil
 5 | from pathlib import Path
 6 | 
 7 | from baca.exceptions import LaunchingFileError
 8 | 
 9 | LAUNCHERS = [
10 |     "gnome-open",
11 |     "gvfs-open",
12 |     "xdg-open",
13 |     "kde-open",
14 | ]
15 | 
16 | 
17 | async def launch_file(path: Path | str, preferred: str = LAUNCHERS[0]) -> None:
18 |     if platform.system() == "Windows":
19 |         loop = asyncio.get_running_loop()
20 |         await loop.run_in_executor(None, os.startfile, path)  # type: ignore
21 |         return
22 | 
23 |     if platform.system() == "Darwin":
24 |         launcher = "open"
25 |     else:
26 |         try:
27 |             launcher = next(l for l in [preferred, *LAUNCHERS] if shutil.which(l) is not None)
28 |         except StopIteration:
29 |             raise LaunchingFileError("System launcher not found.")
30 | 
31 |     proc = await asyncio.create_subprocess_exec(launcher, path, stderr=asyncio.subprocess.PIPE)
32 |     await proc.wait()
33 |     if proc.returncode != 0:
34 |         _, stderr = await proc.communicate()
35 |         raise LaunchingFileError(stderr.decode())
36 | 


--------------------------------------------------------------------------------
/src/baca/ebooks/base.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import xml.etree.ElementTree as ET
 3 | from pathlib import Path
 4 | from typing import Iterator
 5 | 
 6 | from baca.models import BookMetadata, Segment, TocEntry
 7 | 
 8 | 
 9 | class Ebook:
10 |     def __init__(self, ebook_path: Path):
11 |         raise NotImplementedError()
12 | 
13 |     def get_tempdir(self) -> Path:
14 |         raise NotImplementedError()
15 | 
16 |     def get_path(self) -> Path:
17 |         raise NotImplementedError()
18 | 
19 |     def get_raw_text(self, content: str | ET.Element) -> str:
20 |         raise NotImplementedError()
21 | 
22 |     def get_img_bytestr(self, image_id: str) -> tuple[str, bytes]:
23 |         raise NotImplementedError()
24 | 
25 |     def cleanup(self) -> None:
26 |         shutil.rmtree(self.get_tempdir())
27 | 
28 |     # TODO: maybe cache @lru_cache
29 |     def get_toc(self) -> tuple[TocEntry, ...]:
30 |         raise NotImplementedError()
31 | 
32 |     def iter_parsed_contents(self) -> Iterator[Segment]:
33 |         raise NotImplementedError()
34 | 
35 |     def get_meta(self) -> BookMetadata:
36 |         raise NotImplementedError("Ebook.get_meta() not implemented")
37 | 


--------------------------------------------------------------------------------
/src/baca/resources/config.ini:
--------------------------------------------------------------------------------
 1 | [General]
 2 | # pick your favorite image viewer
 3 | PreferredImageViewer = auto
 4 | 
 5 | # int or css value string like 90%%
 6 | # (escape percent with double percent %%)
 7 | MaxTextWidth = 80
 8 | 
 9 | # 'justify', 'center', 'left', 'right'
10 | TextJustification = justify
11 | 
12 | # currently using pretty=yes is slow
13 | # and taking huge amount of memory
14 | Pretty = no
15 | 
16 | PageScrollDuration = 0.2
17 | 
18 | # either show image as ansi image
19 | # or text 'IMAGE' as a placehoder
20 | # (showing ansi image will affect
21 | # performance & resource usage)
22 | ShowImageAsANSI = yes
23 | 
24 | [Color Dark]
25 | Background = #1e1e1e
26 | Foreground = #f5f5f5
27 | Accent = #0178d4
28 | 
29 | [Color Light]
30 | Background = #f5f5f5
31 | Foreground = #1e1e1e
32 | Accent = #0178d4
33 | 
34 | [Keymaps]
35 | ToggleLightDark = c
36 | ScrollDown = down,j
37 | ScrollUp = up,k
38 | PageDown = ctrl+f,pagedown,l,space
39 | PageUp = ctrl+b,pageup,h
40 | Home = home,g
41 | End = end,G
42 | OpenToc = tab
43 | OpenMetadata = M
44 | OpenHelp = f1
45 | SearchForward = slash
46 | SearchBackward = question_mark
47 | NextMatch = n
48 | PreviousMatch = N
49 | Confirm = enter
50 | CloseOrQuit = q,escape
51 | Screenshot = f12
52 | 


--------------------------------------------------------------------------------
/src/baca/db.py:
--------------------------------------------------------------------------------
 1 | """
 2 | NOTE: on using peewee for non-integer primary_key
 3 | 
 4 | ```python
 5 | # This works because .create() will specify `force_insert=True`.
 6 | obj1 = UUIDModel.create(id=uuid.uuid4())
 7 | 
 8 | # This will not work, however. Peewee will attempt to do an update:
 9 | obj2 = UUIDModel(id=uuid.uuid4())
10 | obj2.save() # WRONG
11 | 
12 | obj2.save(force_insert=True) # CORRECT
13 | 
14 | # Once the object has been created, you can call save() normally.
15 | obj2.save()
16 | ```
17 | 
18 | Read more: http://docs.peewee-orm.com/en/latest/peewee/models.html?highlight=force_insert#id4
19 | """
20 | 
21 | from baca.exceptions import TableDoesNotExist
22 | from baca.models import DbMetadata, Migration, ReadingHistory, db
23 | 
24 | 
25 | def initial_migration() -> None:
26 |     db.create_tables([DbMetadata, ReadingHistory])
27 | 
28 | 
29 | MIGRATIONS: list[Migration] = [
30 |     Migration(version=0, migrate=initial_migration),
31 | ]
32 | 
33 | 
34 | def migrate() -> None:
35 |     db.connect()
36 |     try:
37 |         for migration in sorted(MIGRATIONS, key=lambda x: x.version):
38 |             try:
39 |                 if not DbMetadata.table_exists():
40 |                     raise TableDoesNotExist()
41 |                 DbMetadata.get_by_id(migration.version)
42 |             except (DbMetadata.DoesNotExist, TableDoesNotExist):
43 |                 migration.migrate()
44 |                 DbMetadata.create(version=migration.version)
45 |     finally:
46 |         db.close()
47 | 


--------------------------------------------------------------------------------
/src/baca/utils/queries.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | from pathlib import Path
 3 | from typing import Iterator
 4 | 
 5 | from baca.models import ReadingHistory
 6 | 
 7 | MIN_FUZZY_MATCH_RATIO = 10
 8 | 
 9 | 
10 | def get_all_reading_history() -> Iterator[ReadingHistory]:
11 |     for rh in ReadingHistory.select().order_by(ReadingHistory.last_read.desc()):  # type: ignore
12 |         if Path(rh.filepath).is_file():
13 |             yield rh
14 |         else:
15 |             rh.delete_instance()
16 | 
17 | 
18 | def get_best_match_from_history(pattern: str) -> Path | None:
19 |     try:
20 |         from thefuzz import fuzz
21 |     except ImportError:
22 |         with warnings.catch_warnings():
23 |             warnings.simplefilter("ignore")
24 |             from fuzzywuzzy import fuzz
25 | 
26 |     match_ratios = [
27 |         (rh.filepath, fuzz.ratio(tomatch, pattern))
28 |         for rh in get_all_reading_history()
29 |         for tomatch in [rh.filepath, f"{rh.title} {rh.author}"]
30 |     ]
31 |     matches = [(Path(path), ratio) for path, ratio in match_ratios if ratio > MIN_FUZZY_MATCH_RATIO]  # type: ignore
32 |     return None if len(matches) == 0 else sorted(matches, key=lambda x: -x[1])[0][0]
33 | 
34 | 
35 | def get_nth_file_from_history(nth: int) -> Path | None:
36 |     try:
37 |         return Path(list(get_all_reading_history())[nth - 1].filepath)  # type: ignore
38 |     except IndexError:
39 |         return None
40 | 
41 | 
42 | def get_last_read_ebook() -> Path | None:
43 |     try:
44 |         last_read_ebook = ReadingHistory.select().order_by(ReadingHistory.last_read.desc()).get()  # type: ignore
45 |         last_read_ebook = Path(last_read_ebook.filepath)
46 |         return last_read_ebook if last_read_ebook.is_file() else None
47 |     except ReadingHistory.DoesNotExist:
48 |         return None
49 | 


--------------------------------------------------------------------------------
/tests/test_html_parser.py:
--------------------------------------------------------------------------------
 1 | from baca.utils.html_parser import split_html_to_segments
 2 | from baca.models import SegmentType
 3 | 
 4 | HTML_TEST = """
 5 | <html><head><title>The Dormouse's story</title></head>
 6 | <body>
 7 | <p class="title"><b>The Dormouse's
 8 | story</b></p>
 9 | 
10 | <p class="story">Once upon a time there were three little sisters; and their names were
11 | <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
12 |   <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
13 | and they lived at the bottom of a well.</p>
14 | <img src="img_girl.jpg" alt="Girl in a jacket" width="500" height="600">
15 | 
16 | <p class="story">...</p>
17 | """
18 | 
19 | 
20 | def test_html_splitters():
21 |     segments_iterator = split_html_to_segments(HTML_TEST, "test.html")
22 | 
23 |     segment = next(segments_iterator)
24 |     assert segment.type == SegmentType.BODY
25 |     assert (
26 |         segment.content
27 |         == '<body> <p class="title"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were '
28 |     )
29 |     assert segment.nav_point == "test.html"
30 | 
31 |     segment = next(segments_iterator)
32 |     assert segment.type == SegmentType.BODY
33 |     assert segment.content == '<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,   '
34 |     assert segment.nav_point == "test.html#link1"
35 | 
36 |     segment = next(segments_iterator)
37 |     assert segment.type == SegmentType.BODY
38 |     assert (
39 |         segment.content
40 |         == '<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and and they lived at the bottom of a well.</p> '
41 |     )
42 |     assert segment.nav_point == "test.html#link2"
43 | 
44 |     segment = next(segments_iterator)
45 |     assert segment.type == SegmentType.IMAGE
46 |     assert segment.content == "img_girl.jpg"
47 |     assert segment.nav_point is None
48 | 
49 |     segment = next(segments_iterator)
50 |     assert segment.type == SegmentType.BODY
51 |     assert (
52 |         segment.content
53 |         == '<img alt="Girl in a jacket" height="600" src="img_girl.jpg" width="500"/> <p class="story">...</p> </body>'
54 |     )
55 |     assert segment.nav_point is None
56 | 


--------------------------------------------------------------------------------
/src/baca/utils/html_parser.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterator
 2 | from urllib.parse import urljoin
 3 | 
 4 | from bs4 import BeautifulSoup
 5 | from markdownify import MarkdownConverter as _MarkdownConverter
 6 | 
 7 | from baca.models import Segment, SegmentType
 8 | 
 9 | 
10 | class MarkdownConverter(_MarkdownConverter):
11 |     def convert_img(self, el, text, convert_as_inline):
12 |         return ""
13 | 
14 | 
15 | def split_html_to_segments(
16 |     html_src: str, section_name: str, *, ids_to_find: list[str] | None = None
17 | ) -> Iterator[Segment]:
18 |     """
19 |     :param ids_to_find:
20 |         ids_to_find is url fragment (eg. https://url.com/content.html#fragment) to find inside given `html_src`
21 | 
22 |         - if None will find all possible id(s)
23 |         - if [] then, will skip finding id(s) section in html_src
24 |     """
25 |     soup = BeautifulSoup(html_src, "html.parser", store_line_numbers=True)
26 |     body = soup.find("body")
27 |     body_html = str(body).replace("\n", " ")
28 |     body = BeautifulSoup(body_html, "html.parser")
29 | 
30 |     find_nav_points = ids_to_find is None or len(ids_to_find) > 0
31 |     if find_nav_points:
32 |         section_elems = body.find_all(id=True if ids_to_find is None else ids_to_find)
33 |     else:
34 |         section_elems = []
35 |     img_elems = body.find_all(["img", "image"])
36 |     all_elems = sorted(section_elems + img_elems, key=lambda x: [x.sourceline, x.sourcepos])  # type: ignore
37 | 
38 |     start = 0
39 |     nav_point = section_name
40 |     for elem in all_elems:
41 |         yield Segment(type=SegmentType.BODY, content=body_html[start : elem.sourcepos], nav_point=nav_point)  # type: ignore
42 | 
43 |         start = elem.sourcepos
44 |         fragment = elem.get("id")
45 |         nav_point = f"{section_name}#{fragment}" if find_nav_points and fragment is not None else None
46 | 
47 |         if elem.name in {"img", "image"}:  # type: ignore
48 |             img_src = elem.get("src") or elem.get("href")  # type: ignore
49 |             if img_src is not None:
50 |                 # NOTE: urljoin should be able to handle relative path. ie urljoin("a", "b") == "b"
51 |                 yield Segment(type=SegmentType.IMAGE, content=urljoin(section_name, img_src), nav_point=nav_point)
52 | 
53 |     yield Segment(type=SegmentType.BODY, content=body_html[start:], nav_point=nav_point)
54 | 
55 | 
56 | def parse_html_to_segmented_md(
57 |     html_src: str, section_name: str, *, ids_to_find: list[str] | None = None
58 | ) -> Iterator[Segment]:
59 |     for segment in split_html_to_segments(html_src, section_name, ids_to_find=ids_to_find):
60 |         yield Segment(
61 |             type=segment.type,
62 |             content=MarkdownConverter().convert(segment.content)
63 |             if segment.type == SegmentType.BODY
64 |             else segment.content,
65 |             nav_point=segment.nav_point,
66 |         )
67 | 


--------------------------------------------------------------------------------
/src/baca/ebooks/mobi.py:
--------------------------------------------------------------------------------
 1 | import contextlib
 2 | import os
 3 | import tempfile
 4 | import xml.etree.ElementTree as ET
 5 | from enum import Enum
 6 | from pathlib import Path
 7 | 
 8 | from baca import __appname__
 9 | from baca.ebooks.epub import Epub
10 | from baca.models import TocEntry
11 | from baca.tools import unpack_kindle_book
12 | 
13 | 
14 | class MobiVersion(Enum):
15 |     MOBI7 = "mobi7"
16 |     MOBI8 = "mobi8"
17 | 
18 | 
19 | # TODO: test on windows machine
20 | class Mobi(Epub):
21 |     def __init__(self, ebook_path: Path):
22 |         self._path = ebook_path.resolve()
23 |         self._tempdir = Path(tempfile.mkdtemp(prefix=f"{__appname__}-"))
24 |         with contextlib.redirect_stdout(None):
25 |             unpack_kindle_book(str(self._path), str(self._tempdir), epubver="A", use_hd=True)
26 | 
27 |     @property
28 |     def _mobi_version(self) -> MobiVersion:
29 |         if (self.get_tempdir() / "mobi8").is_dir():
30 |             return MobiVersion.MOBI8
31 |         elif (self.get_tempdir() / "mobi7").is_dir():
32 |             return MobiVersion.MOBI7
33 |         else:
34 |             raise NotImplementedError("Unsupported Mobi version")
35 | 
36 |     @property
37 |     def _book_dir(self) -> Path:
38 |         return self.get_tempdir() / ("mobi8" if self._mobi_version == MobiVersion.MOBI8 else "mobi7")
39 | 
40 |     @property
41 |     def _root_filepath(self) -> Path:
42 |         if self._mobi_version == MobiVersion.MOBI8:
43 |             container_file = ET.parse(self._book_dir / "META-INF" / "container.xml")
44 |             rootfile_elem = container_file.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE)
45 |             return self._book_dir / rootfile_elem.attrib["full-path"]  # type: ignore
46 |         else:
47 |             return self._book_dir / "content.opf"
48 | 
49 |     @property
50 |     def _root_dirpath(self) -> Path:
51 |         return self._root_filepath.parent
52 | 
53 |     @property
54 |     def _content_opf(self) -> ET.ElementTree:
55 |         return ET.parse(self._root_filepath)
56 | 
57 |     @property
58 |     def _toc_ncx(self) -> ET.Element:
59 |         toc_ncx_path = self._root_dirpath / self._relactive_toc_ncx_path  # type: ignore
60 |         return ET.parse(toc_ncx_path).getroot()
61 | 
62 |     def _get_contents(self) -> tuple[str, ...] | tuple[ET.Element, ...]:
63 |         # TODO: using path_resolver kward seems weird, refactor this!
64 |         return Epub._parse_content_opf(self._content_opf, str(self._root_dirpath), path_resolver=os.path.join)
65 | 
66 |     def get_toc(self) -> tuple[TocEntry, ...]:
67 |         # TODO: using path_resolver kward seems weird, refactor this!
68 |         return Epub._parse_toc(self._toc_ncx, self._version, self._root_dirpath, path_resolver=os.path.join)
69 | 
70 |     def get_raw_text(self, content_path: str) -> str:
71 |         with open(content_path, encoding="utf8") as f:
72 |             return f.read()
73 | 
74 |     def get_img_bytestr(self, impath: str) -> tuple[str, bytes]:
75 |         # TODO: test on windows, maybe urljoin?
76 |         # if impath "Images/asdf.png" is problematic
77 |         image_abspath = self._root_dirpath / impath
78 |         image_abspath = os.path.normpath(image_abspath)  # handle crossplatform path
79 |         with open(image_abspath, "rb") as f:
80 |             src = f.read()
81 |         return impath, src
82 | 


--------------------------------------------------------------------------------
/src/baca/config.py:
--------------------------------------------------------------------------------
 1 | from configparser import ConfigParser
 2 | from typing import Literal, cast
 3 | 
 4 | from baca.models import Color, Config, Keymaps
 5 | from baca.utils.user_appdirs import DEFAULT_CONFIG, retrieve_user_config_file
 6 | 
 7 | 
 8 | def parse_keymaps(config_map: str) -> list[str]:
 9 |     return [k.strip() for k in config_map.split(",")]
10 | 
11 | 
12 | def load_config() -> Config:
13 |     user_conf = ConfigParser()
14 |     user_conf.read(retrieve_user_config_file())
15 |     default_conf = ConfigParser()
16 |     default_conf.read(DEFAULT_CONFIG)
17 | 
18 |     def get_value(section: str, key: str, is_bool: bool = False) -> str | bool:
19 |         section_conf = user_conf[section] if section in user_conf else default_conf[section]
20 |         return (
21 |             section_conf.get(key, default_conf[section][key])
22 |             if not is_bool
23 |             else section_conf.getboolean(key, fallback=default_conf[section].getboolean(key))
24 |         )
25 | 
26 |     return Config(
27 |         preferred_image_viewer=str(get_value("General", "PreferredImageViewer")),
28 |         max_text_width=str(get_value("General", "MaxTextWidth")),
29 |         text_justification=cast(
30 |             Literal["default", "center", "full", "right", "left"], get_value("General", "TextJustification")
31 |         ),
32 |         pretty=bool(get_value("General", "Pretty", True)),
33 |         page_scroll_duration=float(get_value("General", "PageScrollDuration")),
34 |         show_image_as_ansi=bool(get_value("General", "ShowImageAsANSI", True)),
35 |         dark=Color(
36 |             bg=str(get_value("Color Dark", "Background")),
37 |             fg=str(get_value("Color Dark", "Foreground")),
38 |             accent=str(get_value("Color Dark", "Accent")),
39 |         ),
40 |         light=Color(
41 |             bg=str(get_value("Color Light", "Background")),
42 |             fg=str(get_value("Color Light", "Foreground")),
43 |             accent=str(get_value("Color Light", "Accent")),
44 |         ),
45 |         keymaps=Keymaps(
46 |             toggle_dark=parse_keymaps(str(get_value("Keymaps", "ToggleLightDark"))),
47 |             scroll_down=parse_keymaps(str(get_value("Keymaps", "ScrollDown"))),
48 |             scroll_up=parse_keymaps(str(get_value("Keymaps", "ScrollUp"))),
49 |             page_up=parse_keymaps(str(get_value("Keymaps", "PageUp"))),
50 |             page_down=parse_keymaps(str(get_value("Keymaps", "PageDown"))),
51 |             home=parse_keymaps(str(get_value("Keymaps", "Home"))),
52 |             end=parse_keymaps(str(get_value("Keymaps", "End"))),
53 |             open_toc=parse_keymaps(str(get_value("Keymaps", "OpenToc"))),
54 |             open_metadata=parse_keymaps(str(get_value("Keymaps", "OpenMetadata"))),
55 |             open_help=parse_keymaps(str(get_value("Keymaps", "OpenHelp"))),
56 |             search_forward=parse_keymaps(str(get_value("Keymaps", "SearchForward"))),
57 |             search_backward=parse_keymaps(str(get_value("Keymaps", "SearchBackward"))),
58 |             next_match=parse_keymaps(str(get_value("Keymaps", "NextMatch"))),
59 |             prev_match=parse_keymaps(str(get_value("Keymaps", "PreviousMatch"))),
60 |             confirm=parse_keymaps(str(get_value("Keymaps", "Confirm"))),
61 |             close=parse_keymaps(str(get_value("Keymaps", "CloseOrQuit"))),
62 |             screenshot=parse_keymaps(str(get_value("Keymaps", "Screenshot"))),
63 |         ),
64 |     )
65 | 


--------------------------------------------------------------------------------
/src/baca/models.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from datetime import datetime
  3 | from enum import Enum
  4 | from typing import Callable, Literal
  5 | 
  6 | from peewee import (
  7 |     CharField,
  8 |     DateTimeField,
  9 |     FloatField,
 10 |     IntegerField,
 11 |     Model,
 12 |     SqliteDatabase,
 13 | )
 14 | 
 15 | from baca.utils.user_appdirs import retrieve_user_cache_dbfile
 16 | 
 17 | db = SqliteDatabase(retrieve_user_cache_dbfile())
 18 | 
 19 | 
 20 | class BaseModel(Model):
 21 |     class Meta:
 22 |         database = db
 23 | 
 24 | 
 25 | class DbMetadata(BaseModel):
 26 |     version = IntegerField(primary_key=True)
 27 |     migrated_at = DateTimeField(default=datetime.now)
 28 | 
 29 |     class Meta:
 30 |         table_name = "metadata"
 31 | 
 32 | 
 33 | class ReadingHistory(BaseModel):
 34 |     filepath = CharField(primary_key=True)
 35 |     title = CharField(null=True)
 36 |     author = CharField(null=True)
 37 |     reading_progress = FloatField(null=False)
 38 |     last_read = DateTimeField(default=datetime.now, null=False)
 39 | 
 40 |     class Meta:
 41 |         table_name = "reading_history"
 42 | 
 43 | 
 44 | @dataclass(frozen=True)
 45 | class Migration:
 46 |     version: int
 47 |     migrate: Callable[[], None]
 48 | 
 49 | 
 50 | class SegmentType(Enum):
 51 |     IMAGE = "image"
 52 |     BODY = "body"
 53 | 
 54 | 
 55 | @dataclass(frozen=True)
 56 | class Color:
 57 |     bg: str
 58 |     fg: str
 59 |     accent: str
 60 | 
 61 | 
 62 | @dataclass(frozen=True)
 63 | class Keymaps:
 64 |     toggle_dark: list[str]
 65 |     scroll_down: list[str]
 66 |     scroll_up: list[str]
 67 |     home: list[str]
 68 |     end: list[str]
 69 |     page_up: list[str]
 70 |     page_down: list[str]
 71 |     open_toc: list[str]
 72 |     open_metadata: list[str]
 73 |     open_help: list[str]
 74 |     search_forward: list[str]
 75 |     search_backward: list[str]
 76 |     next_match: list[str]
 77 |     prev_match: list[str]
 78 |     confirm: list[str]
 79 |     close: list[str]
 80 |     screenshot: list[str]
 81 | 
 82 | 
 83 | @dataclass(frozen=True)
 84 | class Config:
 85 |     preferred_image_viewer: str
 86 |     max_text_width: str
 87 |     text_justification: Literal["default", "center", "full", "right", "left"]
 88 |     pretty: bool
 89 |     page_scroll_duration: float
 90 |     show_image_as_ansi: bool
 91 |     dark: Color
 92 |     light: Color
 93 |     keymaps: Keymaps
 94 | 
 95 | 
 96 | @dataclass(frozen=True)
 97 | class BookMetadata:
 98 |     title: str | None = None
 99 |     creator: str | None = None
100 |     description: str | None = None
101 |     publisher: str | None = None
102 |     date: str | None = None
103 |     language: str | None = None
104 |     format: str | None = None
105 |     identifier: str | None = None
106 |     source: str | None = None
107 | 
108 | 
109 | @dataclass(frozen=True)
110 | class TocEntry:
111 |     label: str
112 |     value: str
113 | 
114 | 
115 | @dataclass(frozen=True)
116 | class Segment:
117 |     type: SegmentType
118 |     content: str
119 |     nav_point: str | None
120 | 
121 | 
122 | @dataclass(frozen=True)
123 | class KeyMap:
124 |     keys: list[str]
125 |     action: Callable
126 | 
127 | 
128 | @dataclass(frozen=True)
129 | class Coordinate:
130 |     x: int
131 |     y: int
132 | 
133 | 
134 | @dataclass(frozen=True)
135 | class SearchMode:
136 |     pattern_str: str
137 |     current_coord: Coordinate
138 |     forward: bool = True
139 |     saved_position: float = 0.0
140 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/unipath.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 4 | 
 5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
 6 | # All rights reserved.
 7 | #
 8 | # Redistribution and use in source and binary forms, with or without modification,
 9 | # are permitted provided that the following conditions are met:
10 | #
11 | # 1. Redistributions of source code must retain the above copyright notice, this list of
12 | # conditions and the following disclaimer.
13 | #
14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list
15 | # of conditions and the following disclaimer in the documentation and/or other materials
16 | # provided with the distribution.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | from __future__ import unicode_literals, division, absolute_import, print_function
29 | from .compatibility_utils import PY2, text_type, binary_type
30 | 
31 | import sys
32 | import os
33 | 
34 | # utility routines to convert all paths to be full unicode
35 | 
36 | # Under Python 2, if a bytestring, try to convert it to unicode using sys.getfilesystemencoding
37 | # Under Python 3, if bytes, try to convert it to unicode using os.fsencode() to decode it
38 | 
39 | # Mac OS X and Windows will happily support full unicode paths
40 | # Linux can support full unicode paths but allows arbitrary byte paths which may be inconsistent with unicode
41 | 
42 | fsencoding = sys.getfilesystemencoding()
43 | 
44 | def pathof(s, enc=fsencoding):
45 |     if s is None:
46 |         return None
47 |     if isinstance(s, text_type):
48 |         return s
49 |     if isinstance(s, binary_type):
50 |         try:
51 |             return s.decode(enc)
52 |         except:
53 |             pass
54 |     return s
55 | 
56 | def exists(s):
57 |     return os.path.exists(pathof(s))
58 | 
59 | def isfile(s):
60 |     return os.path.isfile(pathof(s))
61 | 
62 | def isdir(s):
63 |     return os.path.isdir(pathof(s))
64 | 
65 | def mkdir(s):
66 |     return os.mkdir(pathof(s))
67 | 
68 | def listdir(s):
69 |     rv = []
70 |     for file in os.listdir(pathof(s)):
71 |         rv.append(pathof(file))
72 |     return rv
73 | 
74 | def getcwd():
75 |     if PY2:
76 |         return os.getcwdu()
77 |     return os.getcwd()
78 | 
79 | def walk(top):
80 |     top = pathof(top)
81 |     rv = []
82 |     for base, dnames, names in os.walk(top):
83 |         base = pathof(base)
84 |         for name in names:
85 |             name = pathof(name)
86 |             rv.append(relpath(os.path.join(base, name), top))
87 |     return rv
88 | 
89 | def relpath(path, start=None):
90 |     return os.path.relpath(pathof(path) , pathof(start))
91 | 
92 | def abspath(path):
93 |     return os.path.abspath(pathof(path))
94 | 


--------------------------------------------------------------------------------
/src/baca/resources/style.css:
--------------------------------------------------------------------------------
  1 | /* global */
  2 | .-dark-mode {
  3 |   background: $dark-bg;
  4 |   color: $dark-fg;
  5 | }
  6 | 
  7 | .-light-mode {
  8 |   background: $light-bg;
  9 |   color: $light-fg;
 10 | }
 11 | 
 12 | .-dark-mode * {
 13 |   scrollbar-color: $dark-accent;
 14 |   scrollbar-background: $dark-bg;
 15 | }
 16 | 
 17 | .-light-mode * {
 18 |   scrollbar-color: $light-accent;
 19 |   scrollbar-background: $light-bg;
 20 | }
 21 | 
 22 | Screen {
 23 |   align: center middle;
 24 |   height: auto;
 25 |   scrollbar-size: 1 1;
 26 |   layers: content search windows;
 27 | }
 28 | 
 29 | .-dark-mode Screen {
 30 |   background: $dark-bg;
 31 | }
 32 | 
 33 | .-light-mode Screen {
 34 |   background: $light-bg;
 35 | }
 36 | 
 37 | LoadingIndicator {
 38 |   layer: windows;
 39 | }
 40 | 
 41 | 
 42 | /* contents */
 43 | 
 44 | Table {
 45 |   /* NOTE: height & width important so table will overflow Metadata */
 46 |   /* instead of its ScrollView parent widget */
 47 |   height: auto;
 48 |   width: auto;
 49 | }
 50 | 
 51 | SegmentWidget {
 52 |   height: auto;
 53 | }
 54 | 
 55 | .-dark-mode Image {
 56 |   border: solid $dark-fg;
 57 | }
 58 | 
 59 | .-light-mode Image {
 60 |   border: solid $light-fg;
 61 | }
 62 | 
 63 | .-dark-mode Image:hover {
 64 |   border: double $dark-accent;
 65 |   color: $dark-accent;
 66 | }
 67 | 
 68 | .-light-mode Image:hover {
 69 |   border: double $light-accent;
 70 |   color: $light-accent;
 71 | }
 72 | 
 73 | Section {
 74 |   /* NOTE: this works but causing wrong index in initial toc */
 75 |   /* ie. when saved in the top of chapter 7, it's restored as chapter 6 in TOC */
 76 |   /* height: 0; */
 77 | 
 78 |   /* NOTE: this works but look a little bit weird */
 79 |   opacity: 0%;
 80 | 
 81 |   /* NOTE: this doesn't work */
 82 |   /* visibility: hidden; */
 83 |   /* display: none; */
 84 | }
 85 | 
 86 | SearchMatch {
 87 |   layer: search;
 88 |   height: 1;
 89 |   width: auto;
 90 |   text-style: bold;
 91 | }
 92 | 
 93 | .-dark-mode SearchMatch {
 94 |   background: $dark-accent;
 95 | }
 96 | 
 97 | .-light-mode SearchMatch {
 98 |   background: $light-accent;
 99 | }
100 | 
101 | Content * {
102 |   text-align: $text-justification;
103 | }
104 | 
105 | Content {
106 |   layout: vertical;
107 |   height: auto;
108 |   layer: content;
109 |   max-width: $text-max-width;
110 |   margin: 0 2;
111 | }
112 | 
113 | Markdown {
114 |   margin: 0 0;
115 | }
116 | 
117 | /* windows */
118 | 
119 | SearchInputPrompt {
120 |   layer: windows;
121 |   dock: bottom;
122 |   border-title-align: left;
123 | }
124 | 
125 | .-dark-mode SearchInputPrompt {
126 |   background: $dark-bg;
127 |   color: $dark-fg;
128 |   border: solid $dark-accent;
129 | }
130 | 
131 | .-light-mode SearchInputPrompt {
132 |   background: $light-bg;
133 |   color: $light-fg;
134 |   border: solid $light-accent;
135 | }
136 | 
137 | Window {
138 |   dock: top;
139 |   layer: windows;
140 |   padding: 1 4;
141 |   scrollbar-size: 1 1;
142 |   overflow-y: auto;
143 |   border-title-align: center;
144 |   /* NOTE: set this in Window.on_mount() */
145 |   /* so it will be responsive to screen size */
146 |   /* margin: 3 10; */
147 | }
148 | 
149 | .-dark-mode Window {
150 |   border: double $dark-accent;
151 | }
152 | 
153 | .-light-mode Window {
154 |   border: double $light-accent;
155 | }
156 | 
157 | DictDisplay {
158 |   align: center top;
159 | }
160 | 
161 | /* ToC { */
162 | /*   border: double $dark-accent; */
163 | /* } */
164 | 
165 | Alert {
166 |   layer: windows;
167 |   border: solid red;
168 |   color: red;
169 |   scrollbar-color: red;
170 | }
171 | 
172 | NavPoint {
173 |   height: auto;
174 |   border: tall grey;
175 |   margin: 0 1 1 0;
176 |   padding: 0 5;
177 | }
178 | 
179 | .-dark-mode NavPoint.selected {
180 |   background: $dark-accent;
181 | }
182 | 
183 | .-light-mode NavPoint.selected {
184 |   background: $light-accent;
185 | }
186 | 


--------------------------------------------------------------------------------
/src/baca/utils/cli.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | import textwrap
  4 | from pathlib import Path
  5 | from typing import Type
  6 | 
  7 | from rich.console import Console
  8 | from rich.table import Table
  9 | 
 10 | from baca import __appname__, __version__
 11 | from baca.ebooks import Azw, Ebook, Epub, Mobi
 12 | from baca.exceptions import EbookNotFound, FormatNotSupported
 13 | from baca.utils.queries import (
 14 |     get_all_reading_history,
 15 |     get_best_match_from_history,
 16 |     get_last_read_ebook,
 17 |     get_nth_file_from_history,
 18 | )
 19 | 
 20 | 
 21 | def format_file_size(pathstr: str) -> str:
 22 |     byte_size = Path(pathstr).stat().st_size
 23 |     return f"{round(byte_size / 1024, 2)} kb" if byte_size <= 1024**2 else f"{round(byte_size / (1024 ** 2), 2)} mb"
 24 | 
 25 | 
 26 | def print_reading_history() -> None:
 27 |     table = Table(title="Baca History")
 28 |     table.add_column("#", style="cyan", no_wrap=False, justify="right")
 29 |     table.add_column("Last Read", style="cyan", no_wrap=False)
 30 |     table.add_column("Progress", style="cyan", no_wrap=False, justify="right")
 31 |     table.add_column("Title", style="magenta", no_wrap=False)
 32 |     table.add_column("Author", style="green", no_wrap=False)
 33 |     table.add_column("Path", style="white", no_wrap=False)
 34 |     table.add_column("Size", style="blue", no_wrap=False, justify="right")
 35 | 
 36 |     for n, rh in enumerate(get_all_reading_history()):
 37 |         table.add_row(
 38 |             str(n + 1),
 39 |             f"{rh.last_read:%I:%M %p %b %d, %Y}",
 40 |             f"{round(rh.reading_progress*100, 2)}%",  # type: ignore
 41 |             rh.title,  # type: ignore
 42 |             rh.author,  # type: ignore
 43 |             rh.filepath,  # type: ignore
 44 |             format_file_size(rh.filepath),  # type: ignore
 45 |         )
 46 | 
 47 |     Console().print(table)
 48 | 
 49 | 
 50 | def parse_cli_args() -> argparse.Namespace:
 51 |     prog = __appname__
 52 |     positional_arg_help_str = "[PATH | # | PATTERN ]"
 53 |     args_parser = argparse.ArgumentParser(
 54 |         prog=prog,
 55 |         usage=f"%(prog)s [-h] [-r] [-v] {positional_arg_help_str}",
 56 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 57 |         description="TUI Ebook Reader",
 58 |         epilog=textwrap.dedent(
 59 |             f"""\
 60 |         examples:
 61 |           {prog} /path/to/ebook    read /path/to/ebook file
 62 |           {prog} 3                 read #3 file from reading history
 63 |           {prog} count monte       read file matching 'count monte'
 64 |                                  from reading history
 65 |         """
 66 |         ),
 67 |     )
 68 |     args_parser.add_argument("-r", "--history", action="store_true", help="print reading history")
 69 |     args_parser.add_argument(
 70 |         "-v",
 71 |         "--version",
 72 |         action="version",
 73 |         version=f"v{__version__}",
 74 |         help="print version and exit",
 75 |     )
 76 |     args_parser.add_argument(
 77 |         "ebook",
 78 |         action="store",
 79 |         nargs="*",
 80 |         metavar=positional_arg_help_str,
 81 |         help="ebook path, history number or pattern",
 82 |     )
 83 |     return args_parser.parse_args()
 84 | 
 85 | 
 86 | def find_file() -> Path:
 87 |     args = parse_cli_args()
 88 |     if args.history:
 89 |         print_reading_history()
 90 |         sys.exit(0)
 91 | 
 92 |     elif len(args.ebook) == 0:
 93 |         last_read = get_last_read_ebook()
 94 |         if last_read is not None:
 95 |             return last_read
 96 |         else:
 97 |             raise EbookNotFound("found no last read ebook file!")
 98 | 
 99 |     elif len(args.ebook) == 1:
100 |         arg = args.ebook[0]
101 |         try:
102 |             nth = int(arg)
103 |             ebook_path = get_nth_file_from_history(nth)
104 |             if ebook_path is None:
105 |                 print_reading_history()
106 |                 raise EbookNotFound(f"#{nth} file not found from history!")
107 | 
108 |             else:
109 |                 return ebook_path
110 | 
111 |         except ValueError:
112 |             if Path(arg).is_file():
113 |                 return Path(arg)
114 | 
115 |     pattern = " ".join(args.ebook)
116 |     ebook_path = get_best_match_from_history(pattern)
117 |     if ebook_path is None:
118 |         print_reading_history()
119 |         raise EbookNotFound("found no matching ebook from history!")
120 |     else:
121 |         return ebook_path
122 | 
123 | 
124 | def get_ebook_class(ebook_path: Path) -> Type[Ebook]:
125 |     ext = ebook_path.suffix.lower()
126 |     try:
127 |         return {
128 |             ".epub": Epub,
129 |             ".epub3": Epub,
130 |             ".azw": Azw,
131 |             ".azw3": Azw,
132 |             ".mobi": Mobi,
133 |         }[ext]
134 |     except KeyError:
135 |         raise FormatNotSupported("format not supported!")
136 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_uncompress.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import PY2, bchr, lmap, bstr
  8 | 
  9 | if PY2:
 10 |     range = xrange
 11 | 
 12 | import struct
 13 | # note:  struct pack, unpack, unpack_from all require bytestring format
 14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 15 | 
 16 | 
 17 | class unpackException(Exception):
 18 |     pass
 19 | 
 20 | class UncompressedReader:
 21 | 
 22 |     def unpack(self, data):
 23 |         return data
 24 | 
 25 | class PalmdocReader:
 26 | 
 27 |     def unpack(self, i):
 28 |         o, p = b'', 0
 29 |         while p < len(i):
 30 |             # for python 3 must use slice since i[p] returns int while slice returns character
 31 |             c = ord(i[p:p+1])
 32 |             p += 1
 33 |             if (c >= 1 and c <= 8):
 34 |                 o += i[p:p+c]
 35 |                 p += c
 36 |             elif (c < 128):
 37 |                 o += bchr(c)
 38 |             elif (c >= 192):
 39 |                 o += b' ' + bchr(c ^ 128)
 40 |             else:
 41 |                 if p < len(i):
 42 |                     c = (c << 8) | ord(i[p:p+1])
 43 |                     p += 1
 44 |                     m = (c >> 3) & 0x07ff
 45 |                     n = (c & 7) + 3
 46 |                     if (m > n):
 47 |                         o += o[-m:n-m]
 48 |                     else:
 49 |                         for _ in range(n):
 50 |                             # because of completely ass-backwards decision by python mainters for python 3
 51 |                             # we must use slice for bytes as i[p] returns int while slice returns character
 52 |                             if m == 1:
 53 |                                 o += o[-m:]
 54 |                             else:
 55 |                                 o += o[-m:-m+1]
 56 |         return o
 57 | 
 58 | class HuffcdicReader:
 59 |     q = struct.Struct(b'>Q').unpack_from
 60 | 
 61 |     def loadHuff(self, huff):
 62 |         if huff[0:8] != b'HUFF\x00\x00\x00\x18':
 63 |             raise unpackException('invalid huff header')
 64 |         off1, off2 = struct.unpack_from(b'>LL', huff, 8)
 65 | 
 66 |         def dict1_unpack(v):
 67 |             codelen, term, maxcode = v&0x1f, v&0x80, v>>8
 68 |             assert codelen != 0
 69 |             if codelen <= 8:
 70 |                 assert term
 71 |             maxcode = ((maxcode + 1) << (32 - codelen)) - 1
 72 |             return (codelen, term, maxcode)
 73 |         self.dict1 = lmap(dict1_unpack, struct.unpack_from(b'>256L', huff, off1))
 74 | 
 75 |         dict2 = struct.unpack_from(b'>64L', huff, off2)
 76 |         self.mincode, self.maxcode = (), ()
 77 |         for codelen, mincode in enumerate((0,) + dict2[0::2]):
 78 |             self.mincode += (mincode << (32 - codelen), )
 79 |         for codelen, maxcode in enumerate((0,) + dict2[1::2]):
 80 |             self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )
 81 | 
 82 |         self.dictionary = []
 83 | 
 84 |     def loadCdic(self, cdic):
 85 |         if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
 86 |             raise unpackException('invalid cdic header')
 87 |         phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
 88 |         n = min(1<<bits, phrases-len(self.dictionary))
 89 |         h = struct.Struct(b'>H').unpack_from
 90 |         def getslice(off):
 91 |             blen, = h(cdic, 16+off)
 92 |             slice = cdic[18+off:18+off+(blen&0x7fff)]
 93 |             return (slice, blen&0x8000)
 94 |         self.dictionary += lmap(getslice, struct.unpack_from(bstr('>%dH' % n), cdic, 16))
 95 | 
 96 |     def unpack(self, data):
 97 |         q = HuffcdicReader.q
 98 | 
 99 |         bitsleft = len(data) * 8
100 |         data += b"\x00\x00\x00\x00\x00\x00\x00\x00"
101 |         pos = 0
102 |         x, = q(data, pos)
103 |         n = 32
104 | 
105 |         s = b''
106 |         while True:
107 |             if n <= 0:
108 |                 pos += 4
109 |                 x, = q(data, pos)
110 |                 n += 32
111 |             code = (x >> n) & ((1 << 32) - 1)
112 | 
113 |             codelen, term, maxcode = self.dict1[code >> 24]
114 |             if not term:
115 |                 while code < self.mincode[codelen]:
116 |                     codelen += 1
117 |                 maxcode = self.maxcode[codelen]
118 | 
119 |             n -= codelen
120 |             bitsleft -= codelen
121 |             if bitsleft < 0:
122 |                 break
123 | 
124 |             r = (maxcode - code) >> (32 - codelen)
125 |             slice, flag = self.dictionary[r]
126 |             if not flag:
127 |                 self.dictionary[r] = None
128 |                 slice = self.unpack(slice)
129 |                 self.dictionary[r] = (slice, 1)
130 |             s += slice
131 |         return s
132 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_sectioner.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import PY2, hexlify, bstr, bord, bchar
  8 | 
  9 | import datetime
 10 | 
 11 | if PY2:
 12 |     range = xrange
 13 | 
 14 | # note:  struct pack, unpack, unpack_from all require bytestring format
 15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 16 | import struct
 17 | 
 18 | from .unipath import pathof
 19 | 
 20 | DUMP = False
 21 | """ Set to True to dump all possible information. """
 22 | 
 23 | class unpackException(Exception):
 24 |     pass
 25 | 
 26 | 
 27 | def describe(data):
 28 |     txtans = ''
 29 |     hexans = hexlify(data)
 30 |     for i in data:
 31 |         if bord(i) < 32 or bord(i) > 127:
 32 |             txtans += '?'
 33 |         else:
 34 |             txtans += bchar(i).decode('latin-1')
 35 |     return '"' + txtans + '"' + ' 0x'+ hexans
 36 | 
 37 | def datetimefrompalmtime(palmtime):
 38 |     if palmtime > 0x7FFFFFFF:
 39 |         pythondatetime = datetime.datetime(year=1904,month=1,day=1)+datetime.timedelta(seconds=palmtime)
 40 |     else:
 41 |         pythondatetime = datetime.datetime(year=1970,month=1,day=1)+datetime.timedelta(seconds=palmtime)
 42 |     return pythondatetime
 43 | 
 44 | 
 45 | class Sectionizer:
 46 | 
 47 |     def __init__(self, filename):
 48 |         self.data = b''
 49 |         with open(pathof(filename), 'rb') as f:
 50 |             self.data = f.read()
 51 |         self.palmheader = self.data[:78]
 52 |         self.palmname = self.data[:32]
 53 |         self.ident = self.palmheader[0x3C:0x3C+8]
 54 |         self.num_sections, = struct.unpack_from(b'>H', self.palmheader, 76)
 55 |         self.filelength = len(self.data)
 56 |         sectionsdata = struct.unpack_from(bstr('>%dL' % (self.num_sections*2)), self.data, 78) + (self.filelength, 0)
 57 |         self.sectionoffsets = sectionsdata[::2]
 58 |         self.sectionattributes = sectionsdata[1::2]
 59 |         self.sectiondescriptions = ["" for x in range(self.num_sections+1)]
 60 |         self.sectiondescriptions[-1] = "File Length Only"
 61 |         return
 62 | 
 63 |     def dumpsectionsinfo(self):
 64 |         print("Section     Offset  Length      UID Attribs Description")
 65 |         for i in range(self.num_sections):
 66 |             print("%3d %3X  0x%07X 0x%05X % 8d % 7d %s" % (i,i, self.sectionoffsets[i], self.sectionoffsets[
 67 |                   i+1] - self.sectionoffsets[i], self.sectionattributes[i]&0xFFFFFF, (self.sectionattributes[i]>>24)&0xFF, self.sectiondescriptions[i]))
 68 |         print("%3d %3X  0x%07X                          %s" %
 69 |               (self.num_sections,self.num_sections, self.sectionoffsets[self.num_sections], self.sectiondescriptions[self.num_sections]))
 70 | 
 71 |     def setsectiondescription(self, section, description):
 72 |         if section < len(self.sectiondescriptions):
 73 |             self.sectiondescriptions[section] = description
 74 |         else:
 75 |             print("Section out of range: %d, description %s" % (section,description))
 76 | 
 77 |     def dumppalmheader(self):
 78 |         print("Palm Database Header")
 79 |         print("Database name: " + repr(self.palmheader[:32]))
 80 |         dbattributes, = struct.unpack_from(b'>H', self.palmheader, 32)
 81 |         print("Bitfield attributes: 0x%0X" % dbattributes,)
 82 |         if dbattributes != 0:
 83 |             print(" (",)
 84 |             if (dbattributes & 2):
 85 |                 print("Read-only; ",)
 86 |             if (dbattributes & 4):
 87 |                 print("Dirty AppInfoArea; ",)
 88 |             if (dbattributes & 8):
 89 |                 print("Needs to be backed up; ",)
 90 |             if (dbattributes & 16):
 91 |                 print("OK to install over newer; ",)
 92 |             if (dbattributes & 32):
 93 |                 print("Reset after installation; ",)
 94 |             if (dbattributes & 64):
 95 |                 print("No copying by PalmPilot beaming; ",)
 96 |             print(")")
 97 |         else:
 98 |             print("")
 99 |         print("File version: %d" % struct.unpack_from(b'>H', self.palmheader, 34)[0])
100 |         dbcreation, = struct.unpack_from(b'>L', self.palmheader, 36)
101 |         print("Creation Date: " + str(datetimefrompalmtime(dbcreation))+ (" (0x%0X)" % dbcreation))
102 |         dbmodification, = struct.unpack_from(b'>L', self.palmheader, 40)
103 |         print("Modification Date: " + str(datetimefrompalmtime(dbmodification))+ (" (0x%0X)" % dbmodification))
104 |         dbbackup, = struct.unpack_from(b'>L', self.palmheader, 44)
105 |         if dbbackup != 0:
106 |             print("Backup Date: " + str(datetimefrompalmtime(dbbackup))+ (" (0x%0X)" % dbbackup))
107 |         print("Modification No.: %d" % struct.unpack_from(b'>L', self.palmheader, 48)[0])
108 |         print("App Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 52)[0])
109 |         print("Sort Info offset: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 56)[0])
110 |         print("Type/Creator: %s/%s" % (repr(self.palmheader[60:64]), repr(self.palmheader[64:68])))
111 |         print("Unique seed: 0x%0X" % struct.unpack_from(b'>L', self.palmheader, 68)[0])
112 |         expectedzero, = struct.unpack_from(b'>L', self.palmheader, 72)
113 |         if expectedzero != 0:
114 |             print("Should be zero but isn't: %d" % struct.unpack_from(b'>L', self.palmheader, 72)[0])
115 |         print("Number of sections: %d" % struct.unpack_from(b'>H', self.palmheader, 76)[0])
116 |         return
117 | 
118 |     def loadSection(self, section):
119 |         before, after = self.sectionoffsets[section:section+2]
120 |         return self.data[before:after]
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # `baca`: TUI E-book Reader
  2 | 
  3 | ![baca_screenshots](https://github.com/wustho/baca/assets/43810055/82d5beb0-d061-4e4c-82ed-a3bd84074d2f)
  4 | 
  5 | Meet `baca`, [epy](https://github.com/wustho/epy)'s lovely sister who lets you indulge
  6 | in your favorite e-books in the comfort of your terminal.
  7 | But with a sleek and contemporary appearance that's sure to captivate you!
  8 | 
  9 | ## Features
 10 | 
 11 | - Formats supported: Epub, Epub3, Mobi & Azw
 12 | - Remembers last reading position
 13 | - Show images as ANSI image & you can click it for more detail
 14 | - Scroll animations
 15 | - Clean & modern looks
 16 | - Text justification
 17 | - Dark & light color scheme
 18 | - Regex search
 19 | - Hyperlinks
 20 | 
 21 | ## Requirements
 22 | 
 23 | - `python>=3.10`
 24 | 
 25 | ## Installation
 26 | 
 27 | - Via pip: `pip install baca`
 28 | - Via git: `pip install git+https://github.com/wustho/baca`
 29 | - Via AUR: `yay -S baca-ereader-git`
 30 | 
 31 | ## Usage
 32 | 
 33 | ```sh
 34 | # to read an ebook
 35 | baca path/to/your/ebook.epub
 36 | 
 37 | # to read your last read ebook, just run baca without any argument
 38 | baca
 39 | 
 40 | # to see your reading history use -r as an argument
 41 | baca -r
 42 | 
 43 | # say you want to read an ebook from your reading history,
 44 | # but you forgot the path to your ebook
 45 | # just type any words you remember about your ebook
 46 | # and baca will try to match it to path or title+author
 47 | baca doc ebook.epub
 48 | baca alice wonder lewis carroll
 49 | ```
 50 | 
 51 | ## Opening an Image
 52 | 
 53 | To open an image, when you encounter an ANSI image (when `ShowImageAsANSI=yes`) or some thing like this
 54 | (if `ShowImageAsANSI=no`):
 55 | 
 56 | ```
 57 | ┌──────────────────────────────────────────────────────────────────────────────┐
 58 | │                                    IMAGE                                     │
 59 | └──────────────────────────────────────────────────────────────────────────────┘
 60 | ```
 61 | 
 62 | just click on it using mouse and it will open the image using system app.
 63 | Yeah, I know you want to use keyboard for this, me too, but bear with this for now.
 64 | 
 65 | > "Why show the images as ANSI images instead of render it directly on terminal like ranger does?"
 66 | 
 67 | 1. The main reason is that currently, rendering images directly on the terminal
 68 |    doesn't allow for partial scrolling of the image.
 69 |    This means that we can't display only a portion (e.g., 30%) of the image when scrolling,
 70 |    resulting in a broken and non-seamless scrolling experience.
 71 | 
 72 | 2. My primary intention in developing this app is for reading fiction e-books rather than technical ones,
 73 |    and most fiction e-books don't contain many images.
 74 | 
 75 | 3. Displaying images on the terminal requires different implementations for various terminal emulators,
 76 |    which requires a lot of maintenance.
 77 | 
 78 | ## Configurations
 79 | 
 80 | ![pretty_yes_no_cap](https://user-images.githubusercontent.com/43810055/228417623-ac78fb84-0ee0-4930-a843-752ef693822d.png)
 81 | 
 82 | Configuration file available at `~/.config/baca/config.ini` for linux users. Here is the default:
 83 | 
 84 | ```ini
 85 | [General]
 86 | # pick your favorite image viewer
 87 | PreferredImageViewer = auto
 88 | 
 89 | # int or css value string like 90%%
 90 | # (escape percent with double percent %%)
 91 | MaxTextWidth = 80
 92 | 
 93 | # 'justify', 'center', 'left', 'right'
 94 | TextJustification = justify
 95 | 
 96 | # currently using pretty=yes is slow
 97 | # and taking huge amount of memory
 98 | Pretty = no
 99 | 
100 | PageScrollDuration = 0.2
101 | 
102 | # either show image as ansii image
103 | # or text 'IMAGE' as a placehoder
104 | # (showing ansii image will affect
105 | # performance & resource usage)
106 | ShowImageAsANSII = yes
107 | 
108 | [Color Dark]
109 | Background = #1e1e1e
110 | Foreground = #f5f5f5
111 | Accent = #0178d4
112 | 
113 | [Color Light]
114 | Background = #f5f5f5
115 | Foreground = #1e1e1e
116 | Accent = #0178d4
117 | 
118 | [Keymaps]
119 | ToggleLightDark = c
120 | ScrollDown = down,j
121 | ScrollUp = up,k
122 | PageDown = ctrl+f,pagedown,l,space
123 | PageUp = ctrl+b,pageup,h
124 | Home = home,g
125 | End = end,G
126 | OpenToc = tab
127 | OpenMetadata = M
128 | OpenHelp = f1
129 | SearchForward = slash
130 | SearchBackward = question_mark
131 | NextMatch = n
132 | PreviousMatch = N
133 | Confirm = enter
134 | CloseOrQuit = q,escape
135 | Screenshot = f12
136 | ```
137 | 
138 | ## Known Limitations
139 | 
140 | - When searching for specific phrases in `baca`,
141 |   keep in mind that it may not be able to find them if they span across two lines,
142 |   much like in the search behavior of editor vi(m).
143 | 
144 |   For example, `baca` won't be able to find the phrase `"for it"` because it is split into two lines
145 |   in this example.
146 | 
147 |   ```
148 |   ...
149 |   she had forgotten the little golden key, and when she went back to the table for
150 |   it, she found she could not possibly reach it: she could see  it  quite  plainly
151 |   ...
152 |   ```
153 | 
154 | 
155 |   Additionally, `baca` may struggle to locate certain phrases due to adjustments made for text justification.
156 |   See the example above, `"see_it"` may become `"see__it"` due to adjusted spacing between words.
157 |   In this case, it may be more effective to use a regex search for `"see +it"` or simply search for the word `"see"` alone.
158 | 
159 |   Overall, `baca`'s search feature is most effective for locating individual words
160 |   rather than phrases that may be split across multiple lines or impacted by text justification.
161 | 
162 | - Compared to [epy](https://github.com/wustho/epy), currently `baca` has some missing features.
163 |   But these are planned to be implemented to `baca` in the near future:
164 | 
165 |   - [ ] **TODO** Bookmarks
166 |   - [ ] **TODO** FictionBook support
167 |   - [ ] **TODO** URL reading support
168 | 
169 | ## Credits
170 | 
171 | - Thanks to awesome [Textual Project](https://github.com/Textualize/textual)
172 | - [Kindle Unpack](https://github.com/kevinhendricks/KindleUnpack)
173 | - And many others!
174 | 
175 | ## License
176 | 
177 | GPL-3
178 | 
179 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_pagemap.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import PY2, unicode_str
  8 | 
  9 | if PY2:
 10 |     range = xrange
 11 | 
 12 | import struct
 13 | # note:  struct pack, unpack, unpack_from all require bytestring format
 14 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 15 | 
 16 | import re
 17 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
 18 | # but u"" is not allowed for the pattern itself only b""
 19 | 
 20 | 
 21 | _TABLE = [('m', 1000), ('cm', 900), ('d', 500), ('cd', 400), ('c', 100), ('xc', 90), ('l', 50), ('xl', 40), ('x', 10), ('ix', 9), ('v', 5), ('iv', 4), ('i', 1)]
 22 | 
 23 | def int_to_roman(i):
 24 |     parts = []
 25 |     num = i
 26 |     for letter, value in _TABLE:
 27 |         while value <= num:
 28 |             num -= value
 29 |             parts.append(letter)
 30 |     return ''.join(parts)
 31 | 
 32 | def roman_to_int(s):
 33 |     result = 0
 34 |     rnstr = s
 35 |     for letter, value in _TABLE:
 36 |         while rnstr.startswith(letter):
 37 |             result += value
 38 |             rnstr = rnstr[len(letter):]
 39 |     return result
 40 | 
 41 | _pattern = r'''\(([^\)]*)\)'''
 42 | _tup_pattern = re.compile(_pattern,re.IGNORECASE)
 43 | 
 44 | 
 45 | def _parseNames(numpages, data):
 46 |     data = unicode_str(data)
 47 |     pagenames = []
 48 |     pageMap = ''
 49 |     for i in range(numpages):
 50 |         pagenames.append(None)
 51 |     for m in re.finditer(_tup_pattern, data):
 52 |         tup = m.group(1)
 53 |         if pageMap != '':
 54 |             pageMap += ','
 55 |         pageMap += '(' + tup + ')'
 56 |         spos, nametype, svalue = tup.split(",")
 57 |         # print(spos, nametype, svalue)
 58 |         if nametype == 'a' or nametype == 'r':
 59 |             svalue = int(svalue)
 60 |         spos = int(spos)
 61 |         for i in range(spos - 1, numpages):
 62 |             if nametype == 'r':
 63 |                 pname = int_to_roman(svalue)
 64 |                 svalue += 1
 65 |             elif nametype == 'a':
 66 |                 pname = "%s" % svalue
 67 |                 svalue += 1
 68 |             elif nametype == 'c':
 69 |                 sp = svalue.find('|')
 70 |                 if sp == -1:
 71 |                     pname = svalue
 72 |                 else:
 73 |                     pname = svalue[0:sp]
 74 |                     svalue = svalue[sp+1:]
 75 |             else:
 76 |                 print("Error: unknown page numbering type", nametype)
 77 |             pagenames[i] = pname
 78 |     return pagenames, pageMap
 79 | 
 80 | 
 81 | class PageMapProcessor:
 82 | 
 83 |     def __init__(self, mh, data):
 84 |         self.mh = mh
 85 |         self.data = data
 86 |         self.pagenames = []
 87 |         self.pageoffsets = []
 88 |         self.pageMap = ''
 89 |         self.pm_len = 0
 90 |         self.pm_nn = 0
 91 |         self.pn_bits = 0
 92 |         self.pmoff = None
 93 |         self.pmstr = ''
 94 |         print("Extracting Page Map Information")
 95 |         rev_len, = struct.unpack_from(b'>L', self.data, 0x10)
 96 |         # skip over header, revision string length data, and revision string
 97 |         ptr = 0x14 + rev_len
 98 |         pm_1, self.pm_len, self.pm_nn, self.pm_bits  = struct.unpack_from(b'>4H', self.data, ptr)
 99 |         # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits)
100 |         self.pmstr = self.data[ptr+8:ptr+8+self.pm_len]
101 |         self.pmoff = self.data[ptr+8+self.pm_len:]
102 |         offsize = b">L"
103 |         offwidth = 4
104 |         if self.pm_bits == 16:
105 |             offsize = b">H"
106 |             offwidth = 2
107 |         ptr = 0
108 |         for i in range(self.pm_nn):
109 |             od, = struct.unpack_from(offsize, self.pmoff, ptr)
110 |             ptr += offwidth
111 |             self.pageoffsets.append(od)
112 |         self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr)
113 | 
114 |     def getPageMap(self):
115 |         return self.pageMap
116 | 
117 |     def getNames(self):
118 |         return self.pagenames
119 | 
120 |     def getOffsets(self):
121 |         return self.pageoffsets
122 | 
123 |     # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file
124 |     def generateKF8PageMapXML(self, k8proc):
125 |         pagemapxml = '<page-map xmlns="http://www.idpf.org/2007/opf">\n'
126 |         for i in range(len(self.pagenames)):
127 |             pos = self.pageoffsets[i]
128 |             name = self.pagenames[i]
129 |             if name is not None and name != "":
130 |                 [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos)
131 |                 idtext = unicode_str(k8proc.getPageIDTag(pos))
132 |                 linktgt = unicode_str(filename)
133 |                 if idtext != '':
134 |                     linktgt += '#' + idtext
135 |                 pagemapxml += '<page name="%s" href="%s/%s" />\n' % (name, dir, linktgt)
136 |         pagemapxml += "</page-map>\n"
137 |         return pagemapxml
138 | 
139 |     def generateAPNX(self, apnx_meta):
140 |         if apnx_meta['format'] == 'MOBI_8':
141 |             content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' %apnx_meta
142 |         else:
143 |             content_header = '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta
144 |         content_header = content_header.encode('utf-8')
145 |         page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta
146 |         page_header = page_header.encode('utf-8')
147 |         apnx = struct.pack(b'>H',1) + struct.pack(b'>H',1)
148 |         apnx += struct.pack(b'>I', 12 + len(content_header))
149 |         apnx += struct.pack(b'>I', len(content_header))
150 |         apnx += content_header
151 |         apnx += struct.pack(b'>H', 1)
152 |         apnx += struct.pack(b'>H', len(page_header))
153 |         apnx += struct.pack(b'>H', self.pm_nn)
154 |         apnx += struct.pack(b'>H', 32)
155 |         apnx += page_header
156 |         for page in self.pageoffsets:
157 |             apnx += struct.pack(b'>L', page)
158 |         return apnx
159 | 


--------------------------------------------------------------------------------
/src/baca/components/windows.py:
--------------------------------------------------------------------------------
  1 | from textual import events
  2 | from textual.app import ComposeResult
  3 | from textual.message import Message
  4 | from textual.reactive import reactive
  5 | from textual.widget import Widget
  6 | from textual.widgets import Input, Static
  7 | 
  8 | from baca.components.contents import Table
  9 | from baca.components.events import FollowThis, Screenshot, SearchSubmitted
 10 | from baca.models import Config, KeyMap, TocEntry
 11 | from baca.utils.keys_parser import dispatch_key
 12 | 
 13 | 
 14 | class SearchInputPrompt(Input):
 15 |     can_focus = True
 16 | 
 17 |     def __init__(self, forward: bool):
 18 |         super().__init__()
 19 |         self.forward = forward
 20 |         self.border_title = f"Search {'Forward' if forward else 'Backward'}"
 21 | 
 22 |     def on_mount(self):
 23 |         self.focus()
 24 | 
 25 |     async def on_key(self, event: events.Key) -> None:
 26 |         keymaps = [
 27 |             KeyMap(["backspace", "ctrl+h"], self.action_delete_left),
 28 |             KeyMap(["home", "ctrl+a"], self.action_home),
 29 |             KeyMap(["end", "ctrl+e"], self.action_end),
 30 |             KeyMap(["left"], self.action_cursor_left),
 31 |             KeyMap(["right"], self.action_cursor_right),
 32 |             KeyMap(["ctrl+w"], self.action_delete_left_word),
 33 |             KeyMap(["delete"], self.action_delete_right),
 34 |             KeyMap(["enter"], self.action_submit),
 35 |             KeyMap(["escape"], self.action_close),
 36 |         ]
 37 | 
 38 |         if event.key not in set(k for keymap in keymaps for k in keymap.keys):
 39 |             await super().on_key(event)
 40 |             event.stop()
 41 |             event.prevent_default()
 42 |         else:
 43 |             await dispatch_key(keymaps, event)
 44 | 
 45 |     def action_submit(self) -> None:
 46 |         self.post_message(SearchSubmitted(value=self.value, forward=self.forward))
 47 |         self.action_close()
 48 | 
 49 |     def action_close(self) -> None:
 50 |         self.call_after_refresh(self.remove)
 51 | 
 52 | 
 53 | class Window(Widget):
 54 |     can_focus = True
 55 | 
 56 |     def __init__(self, config: Config, id: str | None = None):
 57 |         super().__init__(**(dict() if id is None else dict(id=id)))
 58 |         self.config = config
 59 |         keymaps = self.config.keymaps
 60 |         self.keymaps = [
 61 |             KeyMap(keymaps.close, self.action_close),
 62 |             KeyMap(keymaps.scroll_down, self.action_scroll_down),
 63 |             KeyMap(keymaps.scroll_up, self.action_scroll_up),
 64 |             KeyMap(keymaps.page_down, self.action_page_down),
 65 |             KeyMap(keymaps.page_up, self.action_page_up),
 66 |             KeyMap(keymaps.screenshot, lambda: self.post_message(Screenshot())),
 67 |         ]
 68 | 
 69 |     async def on_key(self, event: events.Key) -> None:
 70 |         await dispatch_key(self.keymaps, event)
 71 | 
 72 |     def on_mount(self) -> None:
 73 |         # NOTE: somehow this method is automatically inherited
 74 |         # even if the child class overriding this without super().on_moun()
 75 |         self.focus(False)
 76 | 
 77 |         # NOTE: set here instead of in CSS file
 78 |         # so it will be responsive to screen size
 79 |         screen_size = self.screen.size
 80 |         self.styles.margin = (screen_size.height // 10, screen_size.width // 10)
 81 | 
 82 |     def action_close(self) -> None:
 83 |         self.call_after_refresh(self.remove)
 84 | 
 85 | 
 86 | class Alert(Window):
 87 |     border_title = "❗"
 88 | 
 89 |     def __init__(self, config: Config, message: str):
 90 |         super().__init__(config)
 91 |         self.message = message
 92 | 
 93 |     def compose(self) -> ComposeResult:
 94 |         yield Static(self.message)
 95 | 
 96 |     # NOTE: self.render() is low level API
 97 |     # so, this won't be any auto scroll-overflow
 98 |     # use self.compose() instead
 99 |     # def render(self):
100 | 
101 | 
102 | class DictDisplay(Window):
103 |     def __init__(self, config: Config, id: str, title: str, data: dict):
104 |         super().__init__(config, id)
105 |         self.data = data
106 |         self.border_title = title
107 | 
108 |     def compose(self) -> ComposeResult:
109 |         yield Table(headers=["key", "value"], rows=[(k, v) for k, v in self.data.items()])
110 | 
111 | 
112 | class NavPoint(Widget):
113 |     can_focus = False
114 | 
115 |     class Selected(Message):
116 |         def __init__(self, index: int) -> None:
117 |             super().__init__()
118 |             self.index = index
119 | 
120 |     class Clicked(Selected):
121 |         pass
122 | 
123 |     def __init__(self, index: int, label: str):
124 |         super().__init__()
125 |         self.index = index
126 |         self.label = label
127 | 
128 |     def render(self):
129 |         return self.label
130 | 
131 |     async def on_mouse_move(self, _: events.MouseMove) -> None:
132 |         self.post_message(self.Selected(self.index))
133 | 
134 |     async def on_click(self) -> None:
135 |         self.post_message(self.Selected(self.index))
136 |         self.post_message(self.Clicked(self.index))
137 | 
138 | 
139 | class ToC(Window):
140 |     border_title = "Table of Contents"
141 |     index = reactive(0)
142 | 
143 |     def __init__(self, config: Config, entries: list[TocEntry], initial_index: int = 0):
144 |         super().__init__(config)
145 |         self.entries = entries
146 |         self.entry_widgets = [NavPoint(n, entry.label) for n, entry in enumerate(self.entries)]
147 |         keymaps = config.keymaps
148 |         self.keymaps = [
149 |             KeyMap(keymaps.close + config.keymaps.open_toc, self.action_close),
150 |             KeyMap(keymaps.scroll_down, lambda: self.action_select_next(1)),
151 |             KeyMap(keymaps.scroll_up, lambda: self.action_select_next(-1)),
152 |             KeyMap(keymaps.home, lambda: self.action_select_index(0)),
153 |             KeyMap(keymaps.end, lambda: self.action_select_index(-1)),
154 |             KeyMap(keymaps.confirm, self.follow_nav_point),
155 |             KeyMap(keymaps.screenshot, lambda: self.post_message(Screenshot())),
156 |         ]
157 |         self.index = initial_index
158 | 
159 |     def on_focus(self) -> None:
160 |         # NOTE: by default when a widget gaining focus, in this case ToC
161 |         # it will reset the scrolling position of this widget which will hide selected NavPoint
162 |         # So, either assign new value for selected navpoint or run watch_selected_value()
163 |         self.watch_index(self.index, self.index)
164 | 
165 |     def action_select_next(self, n: int) -> None:
166 |         self.index = (self.index + n) % len(self.entries)
167 | 
168 |     def action_select_index(self, n: int) -> None:
169 |         self.index = n
170 | 
171 |     def compose(self) -> ComposeResult:
172 |         yield from self.entry_widgets
173 | 
174 |     def watch_index(self, old: int, new: int) -> None:
175 |         [entry_widget.remove_class("selected") for entry_widget in self.entry_widgets]
176 |         selected = self.entry_widgets[new]
177 |         selected.add_class("selected")
178 |         self.scroll_to_widget(selected, top=False)
179 | 
180 |     def on_nav_point_selected(self, message: NavPoint.Selected) -> None:
181 |         self.index = message.index
182 |         message.stop()
183 | 
184 |     def on_nav_point_clicked(self, message: NavPoint.Clicked) -> None:
185 |         self.follow_nav_point()
186 |         message.stop()
187 | 
188 |     def follow_nav_point(self) -> None:
189 |         self.post_message(FollowThis(self.entries[self.index].value))
190 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_nav.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import unicode_str
  8 | import os
  9 | from .unipath import pathof
 10 | 
 11 | import re
 12 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
 13 | # but u"" is not allowed for the pattern itself only b""
 14 | 
 15 | DEBUG_NAV = False
 16 | 
 17 | FORCE_DEFAULT_TITLE = False
 18 | """ Set to True to force to use the default title. """
 19 | 
 20 | NAVIGATION_FINENAME = 'nav.xhtml'
 21 | """ The name for the navigation document. """
 22 | 
 23 | DEFAULT_TITLE = 'Navigation'
 24 | """ The default title for the navigation document. """
 25 | 
 26 | class NAVProcessor(object):
 27 | 
 28 |     def __init__(self, files):
 29 |         self.files = files
 30 |         self.navname = NAVIGATION_FINENAME
 31 | 
 32 |     def buildLandmarks(self, guidetext):
 33 |         header = ''
 34 |         header += '  <nav epub:type="landmarks" id="landmarks" hidden="">\n'
 35 |         header += '    <h2>Guide</h2>\n'
 36 |         header += '    <ol>\n'
 37 |         element = '      <li><a epub:type="{:s}" href="{:s}">{:s}</a></li>\n'
 38 |         footer = ''
 39 |         footer += '    </ol>\n'
 40 |         footer += '  </nav>\n'
 41 | 
 42 |         type_map = {
 43 |             'cover' : 'cover',
 44 |             'title-page' : 'title-page',
 45 |             # ?: 'frontmatter',
 46 |             'text' : 'bodymatter',
 47 |             # ?: 'backmatter',
 48 |             'toc' : 'toc',
 49 |             'loi' : 'loi',
 50 |             'lot' : 'lot',
 51 |             'preface' : 'preface',
 52 |             'bibliography' : 'bibliography',
 53 |             'index' : 'index',
 54 |             'glossary' : 'glossary',
 55 |             'acknowledgements' : 'acknowledgements',
 56 |             'colophon' : None,
 57 |             'copyright-page' : None,
 58 |             'dedication' : None,
 59 |             'epigraph' : None,
 60 |             'foreword' : None,
 61 |             'notes' : None
 62 |             }
 63 | 
 64 |         re_type = re.compile(r'\s+type\s*=\s*"(.*?)"', re.I)
 65 |         re_title = re.compile(r'\s+title\s*=\s*"(.*?)"', re.I)
 66 |         re_link = re.compile(r'\s+href\s*=\s*"(.*?)"', re.I)
 67 |         dir_ = os.path.relpath(self.files.k8text, self.files.k8oebps).replace('\\', '/')
 68 | 
 69 |         data = ''
 70 |         references = re.findall(r'<reference\s+.*?>', unicode_str(guidetext), re.I)
 71 |         for reference in references:
 72 |             mo_type = re_type.search(reference)
 73 |             mo_title = re_title.search(reference)
 74 |             mo_link = re_link.search(reference)
 75 |             if mo_type is not None:
 76 |                 type_ = type_map.get(mo_type.group(1), None)
 77 |             else:
 78 |                 type_ = None
 79 |             if mo_title is not None:
 80 |                 title = mo_title.group(1)
 81 |             else:
 82 |                 title = None
 83 |             if mo_link is not None:
 84 |                 link = mo_link.group(1)
 85 |             else:
 86 |                 link = None
 87 | 
 88 |             if type_ is not None and title is not None and link is not None:
 89 |                 link = os.path.relpath(link, dir_).replace('\\', '/')
 90 |                 data += element.format(type_, link, title)
 91 |         if len(data) > 0:
 92 |             return header + data + footer
 93 |         else:
 94 |             return ''
 95 | 
 96 |     def buildTOC(self, indx_data):
 97 |         header = ''
 98 |         header += '  <nav epub:type="toc" id="toc">\n'
 99 |         header += '    <h1>Table of contents</h1>\n'
100 |         footer = '  </nav>\n'
101 | 
102 |         # recursive part
103 |         def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
104 |             if start>len(indx_data) or end>len(indx_data):
105 |                 print("Warning (in buildTOC): missing INDX child entries", start, end, len(indx_data))
106 |                 return ''
107 |             if DEBUG_NAV:
108 |                 print("recursINDX (in buildTOC) lvl %d from %d to %d" % (lvl, start, end))
109 |             xhtml = ''
110 |             if start <= 0:
111 |                 start = 0
112 |             if end <= 0:
113 |                 end = len(indx_data)
114 |             if lvl > max_lvl:
115 |                 max_lvl = lvl
116 | 
117 |             indent1 = '  ' * (2 + lvl * 2)
118 |             indent2 = '  ' * (3 + lvl * 2)
119 |             xhtml += indent1 + '<ol>\n'
120 |             for i in range(start, end):
121 |                 e = indx_data[i]
122 |                 htmlfile = e['filename']
123 |                 desttag = e['idtag']
124 |                 text = e['text']
125 |                 if not e['hlvl'] == lvl:
126 |                     continue
127 |                 num += 1
128 |                 if desttag == '':
129 |                     link = htmlfile
130 |                 else:
131 |                     link = '{:s}#{:s}'.format(htmlfile, desttag)
132 |                 xhtml += indent2 + '<li>'
133 |                 entry = '<a href="{:}">{:s}</a>'.format(link, text)
134 |                 xhtml += entry
135 |                 # recurs
136 |                 if e['child1'] >= 0:
137 |                     xhtml += '\n'
138 |                     xhtmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
139 |                             e['child1'], e['childn'] + 1)
140 |                     xhtml += xhtmlrec
141 |                     xhtml += indent2
142 |                 # close entry
143 |                 xhtml += '</li>\n'
144 |             xhtml += indent1 + '</ol>\n'
145 |             return xhtml, max_lvl, num
146 | 
147 |         data, max_lvl, num = recursINDX()
148 |         if not len(indx_data) == num:
149 |             print("Warning (in buildTOC): different number of entries in NCX", len(indx_data), num)
150 |         return header + data + footer
151 | 
152 |     def buildNAV(self, ncx_data, guidetext, title, lang):
153 |         print("Building Navigation Document.")
154 |         if FORCE_DEFAULT_TITLE:
155 |             title = DEFAULT_TITLE
156 |         nav_header = ''
157 |         nav_header += '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
158 |         nav_header += '<html xmlns="http://www.w3.org/1999/xhtml"'
159 |         nav_header += ' xmlns:epub="http://www.idpf.org/2007/ops"'
160 |         nav_header += ' lang="{0:s}" xml:lang="{0:s}">\n'.format(lang)
161 |         nav_header += '<head>\n<title>{:s}</title>\n'.format(title)
162 |         nav_header += '<meta charset="UTF-8" />\n'
163 |         nav_header += '<style type="text/css">\n'
164 |         nav_header += 'nav#landmarks { display:none; }\n'
165 |         nav_header += 'ol { list-style-type: none; }'
166 |         nav_header += '</style>\n</head>\n<body>\n'
167 |         nav_footer = '</body>\n</html>\n'
168 | 
169 |         landmarks =  self.buildLandmarks(guidetext)
170 |         toc = self.buildTOC(ncx_data)
171 | 
172 |         data = nav_header
173 |         data += landmarks
174 |         data += toc
175 |         data += nav_footer
176 |         return data
177 | 
178 |     def getNAVName(self):
179 |         return self.navname
180 | 
181 |     def writeNAV(self, ncx_data, guidetext, metadata):
182 |         # build the xhtml
183 |         # print("Write Navigation Document.")
184 |         xhtml = self.buildNAV(ncx_data, guidetext, metadata.get('Title')[0], metadata.get('Language')[0])
185 |         fname = os.path.join(self.files.k8text, self.navname)
186 |         with open(pathof(fname), 'wb') as f:
187 |             f.write(xhtml.encode('utf-8'))
188 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/unpack_structure.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import text_type
  8 | 
  9 | from . import unipath
 10 | from .unipath import pathof
 11 | 
 12 | DUMP = False
 13 | """ Set to True to dump all possible information. """
 14 | 
 15 | import os
 16 | 
 17 | import re
 18 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
 19 | # but u"" is not allowed for the pattern itself only b""
 20 | 
 21 | import zipfile
 22 | import binascii
 23 | from .mobi_utils import mangle_fonts
 24 | 
 25 | class unpackException(Exception):
 26 |     pass
 27 | 
 28 | class ZipInfo(zipfile.ZipInfo):
 29 | 
 30 |     def __init__(self, *args, **kwargs):
 31 |         if 'compress_type' in kwargs:
 32 |             compress_type = kwargs.pop('compress_type')
 33 |         super(ZipInfo, self).__init__(*args, **kwargs)
 34 |         self.compress_type = compress_type
 35 | 
 36 | class fileNames:
 37 | 
 38 |     def __init__(self, infile, outdir):
 39 |         self.infile = infile
 40 |         self.outdir = outdir
 41 |         if not unipath.exists(self.outdir):
 42 |             unipath.mkdir(self.outdir)
 43 |         self.mobi7dir = os.path.join(self.outdir,'mobi7')
 44 |         if not unipath.exists(self.mobi7dir):
 45 |             unipath.mkdir(self.mobi7dir)
 46 |         self.imgdir = os.path.join(self.mobi7dir, 'Images')
 47 |         if not unipath.exists(self.imgdir):
 48 |             unipath.mkdir(self.imgdir)
 49 |         self.hdimgdir = os.path.join(self.outdir,'HDImages')
 50 |         if not unipath.exists(self.hdimgdir):
 51 |             unipath.mkdir(self.hdimgdir)
 52 |         self.outbase = os.path.join(self.outdir, os.path.splitext(os.path.split(infile)[1])[0])
 53 | 
 54 |     def getInputFileBasename(self):
 55 |         return os.path.splitext(os.path.basename(self.infile))[0]
 56 | 
 57 |     def makeK8Struct(self):
 58 |         self.k8dir = os.path.join(self.outdir,'mobi8')
 59 |         if not unipath.exists(self.k8dir):
 60 |             unipath.mkdir(self.k8dir)
 61 |         self.k8metainf = os.path.join(self.k8dir,'META-INF')
 62 |         if not unipath.exists(self.k8metainf):
 63 |             unipath.mkdir(self.k8metainf)
 64 |         self.k8oebps = os.path.join(self.k8dir,'OEBPS')
 65 |         if not unipath.exists(self.k8oebps):
 66 |             unipath.mkdir(self.k8oebps)
 67 |         self.k8images = os.path.join(self.k8oebps,'Images')
 68 |         if not unipath.exists(self.k8images):
 69 |             unipath.mkdir(self.k8images)
 70 |         self.k8fonts = os.path.join(self.k8oebps,'Fonts')
 71 |         if not unipath.exists(self.k8fonts):
 72 |             unipath.mkdir(self.k8fonts)
 73 |         self.k8styles = os.path.join(self.k8oebps,'Styles')
 74 |         if not unipath.exists(self.k8styles):
 75 |             unipath.mkdir(self.k8styles)
 76 |         self.k8text = os.path.join(self.k8oebps,'Text')
 77 |         if not unipath.exists(self.k8text):
 78 |             unipath.mkdir(self.k8text)
 79 | 
 80 |     # recursive zip creation support routine
 81 |     def zipUpDir(self, myzip, tdir, localname):
 82 |         currentdir = tdir
 83 |         if localname != "":
 84 |             currentdir = os.path.join(currentdir,localname)
 85 |         list = unipath.listdir(currentdir)
 86 |         for file in list:
 87 |             afilename = file
 88 |             localfilePath = os.path.join(localname, afilename)
 89 |             realfilePath = os.path.join(currentdir,file)
 90 |             if unipath.isfile(realfilePath):
 91 |                 myzip.write(pathof(realfilePath), pathof(localfilePath), zipfile.ZIP_DEFLATED)
 92 |             elif unipath.isdir(realfilePath):
 93 |                 self.zipUpDir(myzip, tdir, localfilePath)
 94 | 
 95 |     def makeEPUB(self, usedmap, obfuscate_data, uid):
 96 |         bname = os.path.join(self.k8dir, self.getInputFileBasename() + '.epub')
 97 |         # Create an encryption key for Adobe font obfuscation
 98 |         # based on the epub's uid
 99 |         if isinstance(uid,text_type):
100 |             uid = uid.encode('ascii')
101 |         if obfuscate_data:
102 |             key = re.sub(br'[^a-fA-F0-9]', b'', uid)
103 |             key = binascii.unhexlify((key + key)[:32])
104 | 
105 |         # copy over all images and fonts that are actually used in the ebook
106 |         # and remove all font files from mobi7 since not supported
107 |         imgnames = unipath.listdir(self.imgdir)
108 |         for name in imgnames:
109 |             if usedmap.get(name,'not used') == 'used':
110 |                 filein = os.path.join(self.imgdir,name)
111 |                 if name.endswith(".ttf"):
112 |                     fileout = os.path.join(self.k8fonts,name)
113 |                 elif name.endswith(".otf"):
114 |                     fileout = os.path.join(self.k8fonts,name)
115 |                 elif name.endswith(".failed"):
116 |                     fileout = os.path.join(self.k8fonts,name)
117 |                 else:
118 |                     fileout = os.path.join(self.k8images,name)
119 |                 data = b''
120 |                 with open(pathof(filein),'rb') as f:
121 |                     data = f.read()
122 |                 if obfuscate_data:
123 |                     if name in obfuscate_data:
124 |                         data = mangle_fonts(key, data)
125 |                 open(pathof(fileout),'wb').write(data)
126 |                 if name.endswith(".ttf") or name.endswith(".otf"):
127 |                     os.remove(pathof(filein))
128 | 
129 |         # opf file name hard coded to "content.opf"
130 |         container = '<?xml version="1.0" encoding="UTF-8"?>\n'
131 |         container += '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
132 |         container += '    <rootfiles>\n'
133 |         container += '<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>'
134 |         container += '    </rootfiles>\n</container>\n'
135 |         fileout = os.path.join(self.k8metainf,'container.xml')
136 |         with open(pathof(fileout),'wb') as f:
137 |             f.write(container.encode('utf-8'))
138 | 
139 |         if obfuscate_data:
140 |             encryption = '<encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container" \
141 | xmlns:enc="http://www.w3.org/2001/04/xmlenc#" xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
142 |             for font in obfuscate_data:
143 |                 encryption += '  <enc:EncryptedData>\n'
144 |                 encryption += '    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
145 |                 encryption += '    <enc:CipherData>\n'
146 |                 encryption += '      <enc:CipherReference URI="OEBPS/Fonts/' + font + '"/>\n'
147 |                 encryption += '    </enc:CipherData>\n'
148 |                 encryption += '  </enc:EncryptedData>\n'
149 |             encryption += '</encryption>\n'
150 |             fileout = os.path.join(self.k8metainf,'encryption.xml')
151 |             with open(pathof(fileout),'wb') as f:
152 |                 f.write(encryption.encode('utf-8'))
153 | 
154 |         # ready to build epub
155 |         self.outzip = zipfile.ZipFile(pathof(bname), 'w')
156 | 
157 |         # add the mimetype file uncompressed
158 |         mimetype = b'application/epub+zip'
159 |         fileout = os.path.join(self.k8dir,'mimetype')
160 |         with open(pathof(fileout),'wb') as f:
161 |             f.write(mimetype)
162 |         nzinfo = ZipInfo('mimetype', compress_type=zipfile.ZIP_STORED)
163 |         nzinfo.external_attr = 0o600 << 16 # make this a normal file
164 |         self.outzip.writestr(nzinfo, mimetype)
165 |         self.zipUpDir(self.outzip,self.k8dir,'META-INF')
166 |         self.zipUpDir(self.outzip,self.k8dir,'OEBPS')
167 |         self.outzip.close()
168 | 


--------------------------------------------------------------------------------
/src/baca/ebooks/epub.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | import os
  3 | import xml.etree.ElementTree as ET
  4 | import zipfile
  5 | import zlib
  6 | from pathlib import Path
  7 | from typing import Callable, Iterator
  8 | from urllib.parse import unquote, urljoin, urlparse
  9 | 
 10 | from baca.ebooks.base import Ebook
 11 | from baca.models import BookMetadata, Segment, TocEntry
 12 | from baca.utils.html_parser import parse_html_to_segmented_md
 13 | from baca.utils.tempdir import create_tempdir
 14 | 
 15 | 
 16 | class Epub(Ebook):
 17 |     NAMESPACE = {
 18 |         "DAISY": "http://www.daisy.org/z3986/2005/ncx/",
 19 |         "OPF": "http://www.idpf.org/2007/opf",
 20 |         "CONT": "urn:oasis:names:tc:opendocument:xmlns:container",
 21 |         "XHTML": "http://www.w3.org/1999/xhtml",
 22 |         "EPUB": "http://www.idpf.org/2007/ops",
 23 |         # Dublin Core
 24 |         "DC": "http://purl.org/dc/elements/1.1/",
 25 |     }
 26 | 
 27 |     def __init__(self, ebook_path: Path):
 28 |         self._path = ebook_path.resolve()
 29 |         self._file: zipfile.ZipFile = zipfile.ZipFile(ebook_path, "r")
 30 |         self._tempdir = create_tempdir()
 31 | 
 32 |     @staticmethod
 33 |     def _parse_content_opf(
 34 |         content_opf: ET.ElementTree, root_dirpath: str, *, path_resolver: Callable = urljoin
 35 |     ) -> tuple[str, ...]:
 36 |         # cont = ET.parse(self.file.open(self.root_filepath)).getroot()
 37 |         manifests: list[tuple[str, str]] = []
 38 |         for manifest_elem in content_opf.findall("OPF:manifest/*", Epub.NAMESPACE):
 39 |             # EPUB3
 40 |             # if manifest_elem.get("id") != "ncx" and manifest_elem.get("properties") != "nav":
 41 |             if (
 42 |                 manifest_elem.get("media-type") != "application/x-dtbncx+xml"
 43 |                 and manifest_elem.get("properties") != "nav"
 44 |             ):
 45 |                 manifest_id = manifest_elem.get("id")
 46 |                 manifest_href = manifest_elem.get("href")
 47 |                 manifests.append((manifest_id, manifest_href))  # type: ignore
 48 | 
 49 |         spines: list[str] = []
 50 |         contents: list[str] = []
 51 |         for spine_elem in content_opf.findall("OPF:spine/*", Epub.NAMESPACE):
 52 |             idref = spine_elem.get("idref")
 53 |             spines.append(idref)  # type: ignore
 54 |         for spine in spines:
 55 |             for manifest in manifests:
 56 |                 if spine == manifest[0]:
 57 |                     # book_contents.append(root_dirpath + unquote(manifest[1]))
 58 |                     contents.append(unquote(manifest[1]))
 59 |                     manifests.remove(manifest)
 60 |                     # TODO: test is break necessary
 61 |                     break
 62 | 
 63 |         return tuple(path_resolver(root_dirpath, content) for content in contents)
 64 | 
 65 |     @staticmethod
 66 |     def _parse_toc(
 67 |         toc: ET.Element, version: str, root_dirpath, *, path_resolver: Callable = urljoin
 68 |     ) -> tuple[TocEntry, ...]:
 69 |         if version in {"1.0", "2.0"}:
 70 |             navPoints = toc.findall("DAISY:navMap//DAISY:navPoint", Epub.NAMESPACE)
 71 |         elif version == "3.0":
 72 |             navPoints = toc.findall("XHTML:body//XHTML:nav[@EPUB:type='toc']//XHTML:a", Epub.NAMESPACE)
 73 |         else:
 74 |             raise NotImplementedError(f"Unsupported Epub version: {version}")
 75 | 
 76 |         toc_entries: list[TocEntry] = []
 77 |         for navPoint in navPoints:
 78 |             if version in {"1.0", "2.0"}:
 79 |                 src_elem = navPoint.find("DAISY:content", Epub.NAMESPACE)
 80 |                 src = src_elem.get("src")  # type: ignore
 81 | 
 82 |                 name_elem = navPoint.find("DAISY:navLabel/DAISY:text", Epub.NAMESPACE)
 83 |                 name = name_elem.text  # type: ignore
 84 |             elif version == "3.0":
 85 |                 src_elem = navPoint
 86 |                 src = src_elem.get("href")
 87 | 
 88 |                 name = "".join(list(navPoint.itertext()))
 89 |             else:
 90 |                 raise NotImplementedError(f"Unsupported Epub version: {version}")
 91 | 
 92 |             if name is not None:
 93 |                 toc_entries.append(
 94 |                     TocEntry(
 95 |                         label=name,
 96 |                         # content_index=idx,
 97 |                         # section=src_id[1] if len(src_id) == 2 else None,
 98 |                         value=path_resolver(root_dirpath, unquote(src)),  # type: ignore
 99 |                     )
100 |                 )
101 |         return tuple(toc_entries)
102 | 
103 |     @property
104 |     def _root_filepath(self) -> str:
105 |         container = ET.parse(self._file.open("META-INF/container.xml"))
106 |         rootfile_elem = container.find("CONT:rootfiles/CONT:rootfile", Epub.NAMESPACE)
107 |         return rootfile_elem.attrib["full-path"]  # type: ignore
108 | 
109 |     @property
110 |     def _root_dirpath(self) -> str:
111 |         dirname = os.path.dirname(self._root_filepath)
112 |         return f"{dirname}/" if dirname != "" else ""
113 | 
114 |     @property
115 |     def _content_opf(self) -> ET.ElementTree:
116 |         return ET.parse(self._file.open(self._root_filepath))
117 | 
118 |     @property
119 |     def _relactive_toc_ncx_path(self) -> str:
120 |         if self._version in {"1.0", "2.0"}:
121 |             # "OPF:manifest/*[@id='ncx']"
122 |             relative_toc = self._content_opf.find(
123 |                 "OPF:manifest/*[@media-type='application/x-dtbncx+xml']", Epub.NAMESPACE
124 |             )
125 |         elif self._version == "3.0":
126 |             relative_toc = self._content_opf.find("OPF:manifest/*[@properties='nav']", Epub.NAMESPACE)
127 |         else:
128 |             raise NotImplementedError(f"Unsupported Epub version: {self._version}")
129 | 
130 |         return relative_toc.get("href")  # type: ignore
131 | 
132 |     @property
133 |     def _toc_ncx(self) -> ET.Element:
134 |         toc_ncx_path = urljoin(self._root_dirpath, self._relactive_toc_ncx_path)  # type: ignore
135 |         return ET.parse(self._file.open(toc_ncx_path)).getroot()
136 | 
137 |     @property
138 |     def _version(self) -> str:
139 |         return self._content_opf.getroot().get("version")  # type: ignore
140 | 
141 |     def _get_contents(self) -> tuple[str, ...] | tuple[ET.Element, ...]:
142 |         return Epub._parse_content_opf(self._content_opf, self._root_dirpath)
143 | 
144 |     def get_path(self) -> Path:
145 |         return self._path
146 | 
147 |     def get_tempdir(self) -> Path:
148 |         return self._tempdir
149 | 
150 |     def get_meta(self) -> BookMetadata:
151 |         metadata: dict[str, str | None] = {}
152 |         for field in dataclasses.fields(BookMetadata):
153 |             element = self._content_opf.find(f".//DC:{field.name}", Epub.NAMESPACE)
154 |             if element is not None:
155 |                 metadata[field.name] = element.text
156 |         return BookMetadata(**metadata)
157 | 
158 |     def get_toc(self) -> tuple[TocEntry, ...]:
159 |         return Epub._parse_toc(self._toc_ncx, self._version, self._root_dirpath)
160 | 
161 |     def get_raw_text(self, content_path: str | ET.Element) -> str:
162 |         assert isinstance(content_path, str)
163 | 
164 |         max_tries: int | None = None
165 | 
166 |         # use try-except block to catch
167 |         # zlib.error: Error -3 while decompressing data: invalid distance too far back
168 |         # seems like caused by multiprocessing
169 |         tries = 0
170 |         while True:
171 |             try:
172 |                 content = self._file.open(content_path).read()
173 |                 break
174 |             except zlib.error as e:
175 |                 tries += 1
176 |                 if max_tries is not None and tries >= max_tries:
177 |                     raise e
178 | 
179 |         return content.decode("utf-8")
180 | 
181 |     def get_img_bytestr(self, impath: str) -> tuple[str, bytes]:
182 |         assert isinstance(self._file, zipfile.ZipFile)
183 |         unquoted_impath = unquote(impath)
184 |         return os.path.basename(unquoted_impath), self._file.read(unquoted_impath)
185 | 
186 |     def iter_parsed_contents(self) -> Iterator[Segment]:
187 |         toc_entries = self.get_toc()
188 |         for content in self._get_contents():
189 |             ids_for_this_content = [
190 |                 urlparse(t.value).fragment
191 |                 for t in toc_entries
192 |                 if t.value.startswith(content) and urlparse(t.value).fragment != ""
193 |             ]
194 |             raw = self.get_raw_text(content)
195 |             for segment in parse_html_to_segmented_md(raw, str(content), ids_to_find=ids_for_this_content):
196 |                 yield segment
197 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | # flake8: noqa
  5 | 
  6 | from __future__ import unicode_literals, division, absolute_import, print_function
  7 | 
  8 | from .compatibility_utils import PY2, text_type, bchr, bord
  9 | 
 10 | import binascii
 11 | 
 12 | if PY2:
 13 |     range = xrange
 14 | 
 15 | from itertools import cycle
 16 | 
 17 | def getLanguage(langID, sublangID):
 18 |     mobilangdict = {
 19 |             54 : {0 : 'af'},  # Afrikaans
 20 |             28 : {0 : 'sq'},  # Albanian
 21 |              1 : {0 : 'ar' , 5 : 'ar-dz' , 15 : 'ar-bh' , 3 : 'ar-eg' , 2 : 'ar-iq',  11 : 'ar-jo' , 13 : 'ar-kw' , 12 : 'ar-lb' , 4: 'ar-ly',
 22 |                   6 : 'ar-ma' , 8 : 'ar-om' , 16 : 'ar-qa' , 1 : 'ar-sa' , 10 : 'ar-sy' , 7 : 'ar-tn' , 14 : 'ar-ae' , 9 : 'ar-ye'},
 23 |              # Arabic,  Arabic (Algeria),  Arabic (Bahrain),  Arabic (Egypt),  Arabic
 24 |              # (Iraq), Arabic (Jordan),  Arabic (Kuwait),  Arabic (Lebanon),  Arabic
 25 |              # (Libya), Arabic (Morocco),  Arabic (Oman),  Arabic (Qatar),  Arabic
 26 |              # (Saudi Arabia),  Arabic (Syria),  Arabic (Tunisia),  Arabic (United Arab
 27 |              # Emirates),  Arabic (Yemen)
 28 |             43 : {0 : 'hy'},  # Armenian
 29 |             77 : {0 : 'as'},  # Assamese
 30 |             44 : {0 : 'az'},  # "Azeri (IANA: Azerbaijani)
 31 |             45 : {0 : 'eu'},  # Basque
 32 |             35 : {0 : 'be'},  # Belarusian
 33 |             69 : {0 : 'bn'},  # Bengali
 34 |              2 : {0 : 'bg'},  # Bulgarian
 35 |              3 : {0 : 'ca'},  # Catalan
 36 |              4 : {0 : 'zh' , 3 : 'zh-hk' , 2 : 'zh-cn' , 4 : 'zh-sg' , 1 : 'zh-tw'},
 37 |             # Chinese,  Chinese (Hong Kong),  Chinese (PRC),  Chinese (Singapore),  Chinese (Taiwan)
 38 |             26 : {0 : 'hr', 3 : 'sr'},  # Croatian, Serbian
 39 |              5 : {0 : 'cs'},  # Czech
 40 |              6 : {0 : 'da'},  # Danish
 41 |             19 : {0: 'nl', 1 : 'nl' , 2 : 'nl-be'},  # Dutch / Flemish,  Dutch (Belgium)
 42 |              9 : {0: 'en', 1 : 'en' , 3 : 'en-au' , 40 : 'en-bz' , 4 : 'en-ca' , 6 : 'en-ie' , 8 : 'en-jm' , 5 : 'en-nz' , 13 : 'en-ph' ,
 43 |                   7 : 'en-za' , 11 : 'en-tt' , 2 : 'en-gb', 1 : 'en-us' , 12 : 'en-zw'},
 44 |              # English,  English (Australia),  English (Belize),  English (Canada),
 45 |              # English (Ireland),  English (Jamaica),  English (New Zealand),  English
 46 |              # (Philippines),  English (South Africa),  English (Trinidad),  English
 47 |              # (United Kingdom),  English (United States),  English (Zimbabwe)
 48 |             37 : {0 : 'et'},  # Estonian
 49 |             56 : {0 : 'fo'},  # Faroese
 50 |             41 : {0 : 'fa'},  # Farsi / Persian
 51 |             11 : {0 : 'fi'},  # Finnish
 52 |             12 : {0 : 'fr', 1 : 'fr' , 2 : 'fr-be' , 3 : 'fr-ca' , 5 : 'fr-lu' , 6 : 'fr-mc' , 4 : 'fr-ch'},
 53 |             # French,  French (Belgium),  French (Canada),  French (Luxembourg),  French (Monaco),  French (Switzerland)
 54 |             55 : {0 : 'ka'},  # Georgian
 55 |              7 : {0 : 'de', 1 : 'de' , 3 : 'de-at' , 5 : 'de-li' , 4 : 'de-lu' , 2 : 'de-ch'},
 56 |              # German,  German (Austria),  German (Liechtenstein),  German (Luxembourg),  German (Switzerland)
 57 |              8 : {0 : 'el'},  # Greek, Modern (1453-)
 58 |             71 : {0 : 'gu'},  # Gujarati
 59 |             13 : {0 : 'he'},  # Hebrew (also code 'iw'?)
 60 |             57 : {0 : 'hi'},  # Hindi
 61 |             14 : {0 : 'hu'},  # Hungarian
 62 |             15 : {0 : 'is'},  # Icelandic
 63 |             33 : {0 : 'id'},  # Indonesian
 64 |             16 : {0 : 'it', 1 : 'it' , 2 : 'it-ch'},  # Italian,  Italian (Switzerland)
 65 |             17 : {0 : 'ja'},  # Japanese
 66 |             75 : {0 : 'kn'},  # Kannada
 67 |             63 : {0 : 'kk'},  # Kazakh
 68 |             87 : {0 : 'x-kok'},  # Konkani (real language code is 'kok'?)
 69 |             18 : {0 : 'ko'},  # Korean
 70 |             38 : {0 : 'lv'},  # Latvian
 71 |             39 : {0 : 'lt'},  # Lithuanian
 72 |             47 : {0 : 'mk'},  # Macedonian
 73 |             62 : {0 : 'ms'},  # Malay
 74 |             76 : {0 : 'ml'},  # Malayalam
 75 |             58 : {0 : 'mt'},  # Maltese
 76 |             78 : {0 : 'mr'},  # Marathi
 77 |             97 : {0 : 'ne'},  # Nepali
 78 |             20 : {0 : 'no'},  # Norwegian
 79 |             72 : {0 : 'or'},  # Oriya
 80 |             21 : {0 : 'pl'},  # Polish
 81 |             22 : {0 : 'pt', 2 : 'pt' , 1 : 'pt-br'},  # Portuguese,  Portuguese (Brazil)
 82 |             70 : {0 : 'pa'},  # Punjabi
 83 |             23 : {0 : 'rm'},  # "Rhaeto-Romanic" (IANA: Romansh)
 84 |             24 : {0 : 'ro'},  # Romanian
 85 |             25 : {0 : 'ru'},  # Russian
 86 |             59 : {0 : 'sz'},  # "Sami (Lappish)" (not an IANA language code)
 87 |             # IANA code for "Northern Sami" is 'se'
 88 |             # 'SZ' is the IANA region code for Swaziland
 89 |             79 : {0 : 'sa'},  # Sanskrit
 90 |             27 : {0 : 'sk'},  # Slovak
 91 |             36 : {0 : 'sl'},  # Slovenian
 92 |             46 : {0 : 'sb'},  # "Sorbian" (not an IANA language code)
 93 |             # 'SB' is IANA region code for 'Solomon Islands'
 94 |             # Lower Sorbian = 'dsb'
 95 |             # Upper Sorbian = 'hsb'
 96 |             # Sorbian Languages = 'wen'
 97 |             10 : {0 : 'es' , 4 : 'es' , 44 : 'es-ar' , 64 : 'es-bo' , 52 : 'es-cl' , 36 : 'es-co' , 20 : 'es-cr' , 28 : 'es-do' ,
 98 |                   48 : 'es-ec' , 68 : 'es-sv' , 16 : 'es-gt' , 72 : 'es-hn' , 8 : 'es-mx' , 76 : 'es-ni' , 24 : 'es-pa' ,
 99 |                   60 : 'es-py' , 40 : 'es-pe' , 80 : 'es-pr' , 56 : 'es-uy' , 32 : 'es-ve'},
100 |             # Spanish,  Spanish (Mobipocket bug?),  Spanish (Argentina),  Spanish
101 |             # (Bolivia),  Spanish (Chile),  Spanish (Colombia),  Spanish (Costa Rica),
102 |             # Spanish (Dominican Republic),  Spanish (Ecuador),  Spanish (El
103 |             # Salvador),  Spanish (Guatemala),  Spanish (Honduras),  Spanish (Mexico),
104 |             # Spanish (Nicaragua),  Spanish (Panama),  Spanish (Paraguay),  Spanish
105 |             # (Peru),  Spanish (Puerto Rico),  Spanish (Uruguay),  Spanish (Venezuela)
106 |             48 : {0 : 'sx'},  # "Sutu" (not an IANA language code)
107 |             # "Sutu" is another name for "Southern Sotho"?
108 |             # IANA code for "Southern Sotho" is 'st'
109 |             65 : {0 : 'sw'},  # Swahili
110 |             29 : {0 : 'sv' , 1 : 'sv' , 8 : 'sv-fi'},  # Swedish,  Swedish (Finland)
111 |             73 : {0 : 'ta'},  # Tamil
112 |             68 : {0 : 'tt'},  # Tatar
113 |             74 : {0 : 'te'},  # Telugu
114 |             30 : {0 : 'th'},  # Thai
115 |             49 : {0 : 'ts'},  # Tsonga
116 |             50 : {0 : 'tn'},  # Tswana
117 |             31 : {0 : 'tr'},  # Turkish
118 |             34 : {0 : 'uk'},  # Ukrainian
119 |             32 : {0 : 'ur'},  # Urdu
120 |             67 : {0 : 'uz', 2 : 'uz'},  # Uzbek
121 |             42 : {0 : 'vi'},  # Vietnamese
122 |             52 : {0 : 'xh'},  # Xhosa
123 |             53 : {0 : 'zu'},  # Zulu
124 |     }
125 |     lang = "en"
126 |     if langID in mobilangdict:
127 |         subdict = mobilangdict[langID]
128 |         lang = subdict[0]
129 |         if sublangID in subdict:
130 |             lang = subdict[sublangID]
131 |     return lang
132 | 
133 | 
134 | def toHex(byteList):
135 |     return binascii.hexlify(byteList)
136 | 
137 | # returns base32 bytestring
138 | def toBase32(value, npad=4):
139 |     digits = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
140 |     num_string=b''
141 |     current = value
142 |     while current != 0:
143 |         next, remainder = divmod(current, 32)
144 |         rem_string = digits[remainder:remainder+1]
145 |         num_string = rem_string + num_string
146 |         current=next
147 |     if num_string == b'':
148 |         num_string = b'0'
149 |     pad = npad - len(num_string)
150 |     if pad > 0:
151 |         num_string = b'0' * pad + num_string
152 |     return num_string
153 | 
154 | 
155 | # converts base32 string to value
156 | def fromBase32(str_num):
157 |     if isinstance(str_num, text_type):
158 |         str_num = str_num.encode('latin-1')
159 |     scalelst = [1,32,1024,32768,1048576,33554432,1073741824,34359738368]
160 |     value = 0
161 |     j = 0
162 |     n = len(str_num)
163 |     scale = 0
164 |     for i in range(n):
165 |         c = str_num[n-i-1:n-i]
166 |         if c in b'0123456789':
167 |             v = ord(c) - ord(b'0')
168 |         else:
169 |             v = ord(c) - ord(b'A') + 10
170 |         if j < len(scalelst):
171 |             scale = scalelst[j]
172 |         else:
173 |             scale = scale * 32
174 |         j += 1
175 |         if v != 0:
176 |             value = value + (v * scale)
177 |     return value
178 | 
179 | 
180 | # note: if decode a bytestring using 'latin-1' (or any other 0-255 encoding)
181 | # in place of ascii you will get a byte to half-word or integer
182 | # one to one mapping of values from 0 - 255
183 | 
184 | def mangle_fonts(encryption_key, data):
185 |     if isinstance(encryption_key, text_type):
186 |         encryption_key = encryption_key.encode('latin-1')
187 |     crypt = data[:1024]
188 |     key = cycle(iter(map(bord, encryption_key)))
189 |     # encrypt = ''.join([chr(ord(x)^key.next()) for x in crypt])
190 |     encrypt = b''.join([bchr(bord(x)^next(key)) for x in crypt])
191 |     return encrypt + data[1024:]
192 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_cover.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import unicode_str
  8 | 
  9 | from .unipath import pathof
 10 | import os
 11 | import imghdr
 12 | 
 13 | import struct
 14 | # note:  struct pack, unpack, unpack_from all require bytestring format
 15 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 16 | 
 17 | USE_SVG_WRAPPER = True
 18 | """ Set to True to use svg wrapper for default. """
 19 | 
 20 | FORCE_DEFAULT_TITLE = False
 21 | """ Set to True to force to use the default title. """
 22 | 
 23 | COVER_PAGE_FINENAME = 'cover_page.xhtml'
 24 | """ The name for the cover page. """
 25 | 
 26 | DEFAULT_TITLE = 'Cover'
 27 | """ The default title for the cover page. """
 28 | 
 29 | MAX_WIDTH = 4096
 30 | """ The max width for the svg cover page. """
 31 | 
 32 | MAX_HEIGHT = 4096
 33 | """ The max height for the svg cover page. """
 34 | 
 35 | 
 36 | def get_image_type(imgname, imgdata=None):
 37 |     imgtype = unicode_str(imghdr.what(pathof(imgname), imgdata))
 38 | 
 39 |     # imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
 40 |     # with only the magic JPEG bytes out there...
 41 |     # ImageMagick handles those, so, do it too.
 42 |     if imgtype is None:
 43 |         if imgdata is None:
 44 |             with open(pathof(imgname), 'rb') as f:
 45 |                 imgdata = f.read()
 46 |         if imgdata[0:2] == b'\xFF\xD8':
 47 |             # Get last non-null bytes
 48 |             last = len(imgdata)
 49 |             while (imgdata[last-1:last] == b'\x00'):
 50 |                 last-=1
 51 |             # Be extra safe, check the trailing bytes, too.
 52 |             if imgdata[last-2:last] == b'\xFF\xD9':
 53 |                 imgtype = "jpeg"
 54 |     return imgtype
 55 | 
 56 | 
 57 | def get_image_size(imgname, imgdata=None):
 58 |     '''Determine the image type of imgname (or imgdata) and return its size.
 59 | 
 60 |     Originally,
 61 |     Determine the image type of fhandle and return its size.
 62 |     from draco'''
 63 |     if imgdata is None:
 64 |         fhandle = open(pathof(imgname), 'rb')
 65 |         head = fhandle.read(24)
 66 |     else:
 67 |         head = imgdata[0:24]
 68 |     if len(head) != 24:
 69 |         return
 70 | 
 71 |     imgtype = get_image_type(imgname, imgdata)
 72 |     if imgtype == 'png':
 73 |         check = struct.unpack(b'>i', head[4:8])[0]
 74 |         if check != 0x0d0a1a0a:
 75 |             return
 76 |         width, height = struct.unpack(b'>ii', head[16:24])
 77 |     elif imgtype == 'gif':
 78 |         width, height = struct.unpack(b'<HH', head[6:10])
 79 |     elif imgtype == 'jpeg' and imgdata is None:
 80 |         try:
 81 |             fhandle.seek(0)  # Read 0xff next
 82 |             size = 2
 83 |             ftype = 0
 84 |             while not 0xc0 <= ftype <= 0xcf:
 85 |                 fhandle.seek(size, 1)
 86 |                 byte = fhandle.read(1)
 87 |                 while ord(byte) == 0xff:
 88 |                     byte = fhandle.read(1)
 89 |                 ftype = ord(byte)
 90 |                 size = struct.unpack(b'>H', fhandle.read(2))[0] - 2
 91 |             # We are at a SOFn block
 92 |             fhandle.seek(1, 1)  # Skip `precision' byte.
 93 |             height, width = struct.unpack(b'>HH', fhandle.read(4))
 94 |         except Exception:  # IGNORE:W0703
 95 |             return
 96 |     elif imgtype == 'jpeg' and imgdata is not None:
 97 |         try:
 98 |             pos = 0
 99 |             size = 2
100 |             ftype = 0
101 |             while not 0xc0 <= ftype <= 0xcf:
102 |                 pos += size
103 |                 byte = imgdata[pos:pos+1]
104 |                 pos += 1
105 |                 while ord(byte) == 0xff:
106 |                     byte = imgdata[pos:pos+1]
107 |                     pos += 1
108 |                 ftype = ord(byte)
109 |                 size = struct.unpack(b'>H', imgdata[pos:pos+2])[0] - 2
110 |                 pos += 2
111 |             # We are at a SOFn block
112 |             pos += 1  # Skip `precision' byte.
113 |             height, width = struct.unpack(b'>HH', imgdata[pos:pos+4])
114 |             pos += 4
115 |         except Exception:  # IGNORE:W0703
116 |             return
117 |     else:
118 |         return
119 |     return width, height
120 | 
121 | # XXX experimental
122 | class CoverProcessor(object):
123 | 
124 |     """Create a cover page.
125 | 
126 |     """
127 |     def __init__(self, files, metadata, rscnames, imgname=None, imgdata=None):
128 |         self.files = files
129 |         self.metadata = metadata
130 |         self.rscnames = rscnames
131 |         self.cover_page = COVER_PAGE_FINENAME
132 |         self.use_svg = USE_SVG_WRAPPER  # Use svg wrapper.
133 |         self.lang = metadata.get('Language', ['en'])[0]
134 |         # This should ensure that if the methods to find the cover image's
135 |         # dimensions should fail for any reason, the SVG routine will not be used.
136 |         [self.width, self.height] = (-1,-1)
137 |         if FORCE_DEFAULT_TITLE:
138 |             self.title = DEFAULT_TITLE
139 |         else:
140 |             self.title = metadata.get('Title', [DEFAULT_TITLE])[0]
141 | 
142 |         self.cover_image = None
143 |         if imgname is not None:
144 |             self.cover_image = imgname
145 |         elif 'CoverOffset' in metadata:
146 |             imageNumber = int(metadata['CoverOffset'][0])
147 |             cover_image = self.rscnames[imageNumber]
148 |             if cover_image is not None:
149 |                 self.cover_image = cover_image
150 |             else:
151 |                 print('Warning: Cannot identify the cover image.')
152 |         if self.use_svg:
153 |             try:
154 |                 if imgdata is None:
155 |                     fname = os.path.join(files.imgdir, self.cover_image)
156 |                     [self.width, self.height] = get_image_size(fname)
157 |                 else:
158 |                     [self.width, self.height] = get_image_size(None, imgdata)
159 |             except:
160 |                 self.use_svg = False
161 |             width = self.width
162 |             height = self.height
163 |             if width < 0 or height < 0 or width > MAX_WIDTH or height > MAX_HEIGHT:
164 |                 self.use_svg = False
165 |         return
166 | 
167 |     def getImageName(self):
168 |         return self.cover_image
169 | 
170 |     def getXHTMLName(self):
171 |         return self.cover_page
172 | 
173 |     def buildXHTML(self):
174 |         print('Building a cover page.')
175 |         files = self.files
176 |         cover_image = self.cover_image
177 |         title = self.title
178 |         lang = self.lang
179 | 
180 |         image_dir = os.path.normpath(os.path.relpath(files.k8images, files.k8text))
181 |         image_path = os.path.join(image_dir, cover_image).replace('\\', '/')
182 | 
183 |         if not self.use_svg:
184 |             data = ''
185 |             data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
186 |             data += '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"'
187 |             data += ' xml:lang="{:s}">\n'.format(lang)
188 |             data += '<head>\n<title>{:s}</title>\n'.format(title)
189 |             data += '<style type="text/css">\n'
190 |             data += 'body {\n  margin: 0;\n  padding: 0;\n  text-align: center;\n}\n'
191 |             data += 'div {\n  height: 100%;\n  width: 100%;\n  text-align: center;\n  page-break-inside: avoid;\n}\n'
192 |             data += 'img {\n  display: inline-block;\n  height: 100%;\n  margin: 0 auto;\n}\n'
193 |             data += '</style>\n</head>\n'
194 |             data += '<body><div>\n'
195 |             data += '  <img src="{:s}" alt=""/>\n'.format(image_path)
196 |             data += '</div></body>\n</html>'
197 |         else:
198 |             width = self.width
199 |             height = self.height
200 |             viewBox = "0 0 {0:d} {1:d}".format(width, height)
201 | 
202 |             data = ''
203 |             data += '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html>'
204 |             data += '<html xmlns="http://www.w3.org/1999/xhtml"'
205 |             data += ' xml:lang="{:s}">\n'.format(lang)
206 |             data += '<head>\n  <title>{:s}</title>\n'.format(title)
207 |             data += '<style type="text/css">\n'
208 |             data += 'svg {padding: 0pt; margin:0pt}\n'
209 |             data += 'body { text-align: center; padding:0pt; margin: 0pt; }\n'
210 |             data += '</style>\n</head>\n'
211 |             data += '<body>\n  <div>\n'
212 |             data += '    <svg xmlns="http://www.w3.org/2000/svg" height="100%" preserveAspectRatio="xMidYMid meet"'
213 |             data += ' version="1.1" viewBox="{0:s}" width="100%" xmlns:xlink="http://www.w3.org/1999/xlink">\n'.format(viewBox)
214 |             data += '      <image height="{0}" width="{1}" xlink:href="{2}"/>\n'.format(height, width, image_path)
215 |             data += '    </svg>\n'
216 |             data += '  </div>\n</body>\n</html>'
217 |         return data
218 | 
219 |     def writeXHTML(self):
220 |         files = self.files
221 |         cover_page = self.cover_page
222 | 
223 |         data = self.buildXHTML()
224 | 
225 |         outfile = os.path.join(files.k8text, cover_page)
226 |         if os.path.exists(pathof(outfile)):
227 |             print('Warning: {:s} already exists.'.format(cover_page))
228 |             os.remove(pathof(outfile))
229 |         with open(pathof(outfile), 'wb') as f:
230 |             f.write(data.encode('utf-8'))
231 |         return
232 | 
233 |     def guide_toxml(self):
234 |         files = self.files
235 |         text_dir = os.path.relpath(files.k8text, files.k8oebps)
236 |         data = '<reference type="cover" title="Cover" href="{:s}/{:s}" />\n'.format(
237 |                 text_dir, self.cover_page)
238 |         return data
239 | 


--------------------------------------------------------------------------------
/src/baca/components/contents.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import re
  3 | from marshal import dumps
  4 | from urllib.parse import urljoin
  5 | 
  6 | from climage import climage
  7 | from PIL import Image as PILImage
  8 | from rich.markdown import Markdown
  9 | from rich.text import Text
 10 | from textual import events
 11 | from textual.app import ComposeResult
 12 | from textual.geometry import Region
 13 | from textual.strip import Strip
 14 | from textual.widget import Widget
 15 | from textual.widgets import DataTable
 16 | from textual.widgets.markdown import Markdown as PrettyMarkdown
 17 | 
 18 | from baca.components.events import OpenThisImage
 19 | from baca.ebooks import Ebook
 20 | from baca.models import Config, Coordinate, SegmentType
 21 | from baca.utils.urls import is_url
 22 | 
 23 | 
 24 | class Table(DataTable):
 25 |     can_focus = False
 26 | 
 27 |     def __init__(self, headers: list[str], rows: list[tuple]):
 28 |         super().__init__(show_header=True, zebra_stripes=True, show_cursor=False)
 29 |         self.add_columns(*headers)
 30 |         self.add_rows(rows)
 31 | 
 32 |     def on_mount(self) -> None:
 33 |         self.zebra_stripes = True
 34 |         self.show_cursor = False
 35 | 
 36 | 
 37 | class SegmentWidget(Widget):
 38 |     can_focus = False
 39 | 
 40 |     def __init__(self, config: Config, nav_point: str | None):
 41 |         super().__init__()
 42 |         self.config = config
 43 |         self.nav_point = nav_point
 44 | 
 45 |     def get_text_at(self, y: int) -> str:
 46 |         return self.render_lines(Region(0, y, self.virtual_region_with_margin.width, 1))[0].text
 47 | 
 48 | 
 49 | class Body(SegmentWidget):
 50 |     def __init__(self, _: Ebook, config: Config, content: str, nav_point: str | None = None):
 51 |         super().__init__(config, nav_point)
 52 |         self.content = content
 53 | 
 54 |     def render(self):
 55 |         # NOTE: Markdwon rich isn't widget, so we cannot set using css
 56 |         # hence this translation workaround
 57 |         return Markdown(
 58 |             self.content, justify=dict(center="center", left="left", right="right", justify="full")[self.styles.text_align]  # type: ignore
 59 |         )
 60 | 
 61 |     def render_line(self, y) -> Strip:
 62 |         strip = super().render_line(y)
 63 |         for s in strip._segments:
 64 |             if s.style is not None and s.style.link is not None:
 65 |                 link = (
 66 |                     s.style.link
 67 |                     if is_url(s.style.link) or self.nav_point is None
 68 |                     else urljoin(self.nav_point, s.style.link)
 69 |                 )
 70 |                 s.style._meta = dumps({"@click": f"link({link!r})"})
 71 |         return strip
 72 | 
 73 | 
 74 | class Image(SegmentWidget):
 75 |     def __init__(self, ebook: Ebook, config: Config, src: str, nav_point: str | None = None):
 76 |         super().__init__(config, nav_point)
 77 |         # TODO: maybe put it in Widget.id?
 78 |         self.content = src
 79 |         self.ebook = ebook
 80 |         self._renderable = Text("IMAGE", justify="center")
 81 | 
 82 |     def render(self):
 83 |         return self._renderable
 84 | 
 85 |     def show_ansi_image(self):
 86 |         img = PILImage.open(io.BytesIO(self.ebook.get_img_bytestr(self.content)[1])).convert("RGB")
 87 |         img_ansi = climage._toAnsi(
 88 |             img,
 89 |             # NOTE: -1 for precaution on rounding of screen width
 90 |             oWidth=self.size.width - 1,
 91 |             is_unicode=True,
 92 |             color_type=climage.color_types.truecolor,
 93 |             palette="default",
 94 |         )
 95 |         img.close()
 96 |         self._renderable = Text.from_ansi(img_ansi)
 97 |         self.refresh(layout=True)
 98 | 
 99 |     # TODO: "Click ot Open" on mouse hover
100 |     # def on_mouse_move(self, _: events.MouseMove) -> None:
101 |     #     self.styles.background = "red"
102 | 
103 |     async def on_click(self) -> None:
104 |         self.post_message(OpenThisImage(self.content))
105 | 
106 | 
107 | class PrettyBody(PrettyMarkdown):
108 |     def __init__(self, _: Ebook, config: Config, value: str, nav_point: str | None = None):
109 |         super().__init__(value)
110 |         self.nav_point = nav_point
111 | 
112 |     def get_text_at(self, y: int) -> str | None:
113 |         # TODO: this implementation still has issue in positioning match
114 |         # at the end of ebook segment
115 |         accumulated_height = 0
116 |         for child in self.children:
117 |             if accumulated_height + child.virtual_region_with_margin.height > y:
118 |                 return child.render_lines(Region(0, y - accumulated_height, child.virtual_region_with_margin.width, 1))[
119 |                     0
120 |                 ].text
121 |             accumulated_height += child.virtual_region_with_margin.height
122 | 
123 | 
124 | class SearchMatch(Widget):
125 |     can_focus = False
126 | 
127 |     def __init__(self, match_str: str, coordinate: Coordinate):
128 |         super().__init__()
129 |         self.match_str = match_str
130 |         self.coordinate = coordinate
131 | 
132 |     def on_mount(self):
133 |         self.styles.offset = (self.coordinate.x, self.coordinate.y)
134 | 
135 |     def render(self):
136 |         return self.match_str
137 | 
138 |     def scroll_visible(self):
139 |         # NOTE: need to override default .scroll_visible().
140 |         # Somehow this widget.virtual_region_with_margin
141 |         # will cause the screen to scroll to 0.
142 |         self.screen.scroll_to_region(
143 |             Region(
144 |                 x=self.coordinate.x,
145 |                 y=self.coordinate.y,
146 |                 width=self.virtual_size.width,
147 |                 height=self.virtual_size.height,
148 |             )
149 |         )
150 | 
151 | 
152 | class Content(Widget):
153 |     can_focus = False
154 | 
155 |     def __init__(self, config: Config, ebook: Ebook):
156 |         super().__init__()
157 |         self.config = config
158 | 
159 |         self._segments: list[SegmentWidget | PrettyBody] = []
160 |         for segment in ebook.iter_parsed_contents():
161 |             if segment.type == SegmentType.BODY:
162 |                 component_cls = Body if not config.pretty else PrettyBody
163 |             else:
164 |                 component_cls = Image
165 |             self._segments.append(component_cls(ebook, self.config, segment.content, segment.nav_point))
166 | 
167 |     def get_navigables(self):
168 |         return [s for s in self._segments if s.nav_point is not None]
169 | 
170 |     def scroll_to_section(self, nav_point: str) -> None:
171 |         # TODO: add attr TocEntry.uuid so we can query("#{uuid}")
172 |         for s in self.get_navigables():
173 |             if s.nav_point == nav_point:
174 |                 s.scroll_visible(top=True)
175 |                 break
176 | 
177 |     def on_mouse_scroll_down(self, _: events.MouseScrollDown) -> None:
178 |         self.screen.scroll_down()
179 | 
180 |     def on_mouse_scroll_up(self, _: events.MouseScrollUp) -> None:
181 |         self.screen.scroll_up()
182 | 
183 |     # NOTE: override initial message
184 |     def render(self):
185 |         return ""
186 | 
187 |     def compose(self) -> ComposeResult:
188 |         yield from iter(self._segments)
189 | 
190 |     def get_text_at(self, y: int) -> str | None:
191 |         accumulated_height = 0
192 |         for segment in self._segments:
193 |             if accumulated_height + segment.virtual_region_with_margin.height > y:
194 |                 return segment.get_text_at(y - accumulated_height)
195 |             accumulated_height += segment.virtual_region_with_margin.height
196 | 
197 |     async def search_next(
198 |         self, pattern_str: str, current_coord: Coordinate = Coordinate(-1, 0), forward: bool = True
199 |     ) -> Coordinate | None:
200 |         pattern = re.compile(pattern_str, re.IGNORECASE)
201 |         current_x = current_coord.x
202 |         line_range = (
203 |             range(current_coord.y, self.virtual_size.height) if forward else reversed(range(0, current_coord.y + 1))
204 |         )
205 |         for linenr in line_range:
206 |             line_text = self.get_text_at(linenr)
207 |             if line_text is not None:
208 |                 for match in pattern.finditer(line_text):
209 |                     is_next_match = (match.start() > current_x) if forward else (match.start() < current_x)
210 |                     if is_next_match:
211 |                         await self.clear_search()
212 | 
213 |                         match_str = match.group()
214 |                         match_coord = Coordinate(match.start(), linenr)
215 |                         match_widget = SearchMatch(match_str, match_coord)
216 |                         await self.mount(match_widget)
217 |                         match_widget.scroll_visible()
218 |                         return match_coord
219 |             current_x = -1 if forward else self.size.width  # maybe virtual_size?
220 | 
221 |     async def clear_search(self) -> None:
222 |         await self.query(SearchMatch.__name__).remove()
223 | 
224 |     def scroll_to_widget(self, *args, **kwargs) -> bool:
225 |         return self.screen.scroll_to_widget(*args, **kwargs)
226 | 
227 |     def show_ansi_images(self):
228 |         if not self.config.show_image_as_ansi:
229 |             return
230 | 
231 |         # TODO: lazy load the images
232 |         # 1. Need to change how reading prog saved
233 |         #    instead of global 30%, save local by segment (ie. segment 3, 60%)
234 |         # 2. Only load image when scrolled in view. (Checkout `scroll_visible` in Widget/Screen)
235 |         for segment in self._segments:
236 |             if isinstance(segment, Image):
237 |                 segment.show_ansi_image()
238 |         self.refresh(layout=True)
239 | 
240 |     def on_resize(self):
241 |         self.show_ansi_images()
242 | 
243 |     # Already handled by self.styles.max_width
244 |     # async def on_resize(self, event: events.Resize) -> None:
245 |     #     self.styles.width = min(WIDTH, event.size.width - 2)
246 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/compatibility_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | # Copyright (c) 2014 Kevin B. Hendricks, John Schember, and Doug Massay
  6 | # All rights reserved.
  7 | #
  8 | # Redistribution and use in source and binary forms, with or without modification,
  9 | # are permitted provided that the following conditions are met:
 10 | #
 11 | # 1. Redistributions of source code must retain the above copyright notice, this list of
 12 | # conditions and the following disclaimer.
 13 | #
 14 | # 2. Redistributions in binary form must reproduce the above copyright notice, this list
 15 | # of conditions and the following disclaimer in the documentation and/or other materials
 16 | # provided with the distribution.
 17 | #
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 19 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 20 | # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 21 | # SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 22 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 23 | # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 24 | # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 25 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 26 | # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | 
 28 | from __future__ import unicode_literals, division, absolute_import, print_function
 29 | 
 30 | import sys
 31 | import codecs
 32 | 
 33 | PY2 = sys.version_info[0] == 2
 34 | PY3 = sys.version_info[0] == 3
 35 | 
 36 | iswindows = sys.platform.startswith('win')
 37 | 
 38 | try:
 39 |     from urllib.parse import unquote
 40 | except ImportError:
 41 |     from urllib import unquote
 42 | 
 43 | if PY2:
 44 |     from HTMLParser import HTMLParser
 45 |     _h = HTMLParser()
 46 | elif sys.version_info[1] < 4:
 47 |     import html.parser
 48 |     _h = html.parser.HTMLParser()
 49 | else:
 50 |     import html as _h
 51 | 
 52 | if PY3:
 53 |     text_type = str
 54 |     binary_type = bytes
 55 |     # if will be printing arbitraty binary data to stdout on python 3
 56 |     # sys.stdin = sys.stdin.detach()
 57 |     # sys.stdout = sys.stdout.detach()
 58 |     # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
 59 | else:
 60 |     range = xrange
 61 |     text_type = unicode
 62 |     binary_type = str
 63 |     # if will be printing unicode under python 2 need to protect
 64 |     # against sys.stdout.encoding being None stupidly forcing forcing ascii encoding of unicode
 65 |     # sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
 66 |     # alternatively set environment variable as follows **before** launching python:  export PYTHONIOENCODING=UTF-8
 67 | 
 68 | # NOTE: Python 3 is completely broken when accessing single bytes in bytes strings
 69 | # (and they amazingly claim by design and no bug!)
 70 | 
 71 | # To illustrate: this works for unicode in Python 3 and for all Python 2.X for both bytestrings and unicode
 72 | # >>> o = '123456789'
 73 | # >>> o[-3]
 74 | # '7'
 75 | # >>> type(o[-3])
 76 | # <class 'str'>
 77 | # >>> type(o)
 78 | # <class 'str'>
 79 | 
 80 | # Unfortunately, this is what Python 3 does for no sane reason and only for bytestrings
 81 | # >>> o = b'123456789'
 82 | # >>> o[-3]
 83 | # 55
 84 | # >>> type(o[-3])
 85 | # <class 'int'>
 86 | # >>> type(o)
 87 | # <class 'bytes'>
 88 | 
 89 | # This mind boggling  behaviour also happens when indexing a bytestring and/or
 90 | # iteratoring over a bytestring.  In other words it will return an int but not
 91 | # the byte itself!!!!!!!
 92 | 
 93 | # The only way to access a single byte as a byte in bytestring and get the byte in both
 94 | # Python 2 and Python 3 is to use a slice
 95 | 
 96 | # This problem is so common there are horrible hacks floating around the net to **try**
 97 | # to work around it, so that code that works on both Python 2 and Python 3 is possible.
 98 | 
 99 | # So in order to write code that works on both Python 2 and Python 3
100 | # if you index or access a single byte and want its ord() then use the bord() function.
101 | # If instead you want it as a single character byte use the bchar() function
102 | # both of which are defined below.
103 | 
104 | if PY3:
105 |     # Also Note: if decode a bytestring using 'latin-1' (or any other full range 0-255 encoding)
106 |     # in place of ascii you will get a byte value to half-word or integer value
107 |     # one-to-one mapping (in the 0 - 255 range)
108 | 
109 |     def bchr(s):
110 |         return bytes([s])
111 | 
112 |     def bstr(s):
113 |         if isinstance(s, str):
114 |             return bytes(s, 'latin-1')
115 |         else:
116 |             return bytes(s)
117 | 
118 |     def bord(s):
119 |         return s
120 | 
121 |     def bchar(s):
122 |         return bytes([s])
123 | 
124 | else:
125 |     def bchr(s):
126 |         return chr(s)
127 | 
128 |     def bstr(s):
129 |         return str(s)
130 | 
131 |     def bord(s):
132 |         return ord(s)
133 | 
134 |     def bchar(s):
135 |         return s
136 | 
137 | if PY3:
138 |     # list-producing versions of the major Python iterating functions
139 |     def lrange(*args, **kwargs):
140 |         return list(range(*args, **kwargs))
141 | 
142 |     def lzip(*args, **kwargs):
143 |         return list(zip(*args, **kwargs))
144 | 
145 |     def lmap(*args, **kwargs):
146 |         return list(map(*args, **kwargs))
147 | 
148 |     def lfilter(*args, **kwargs):
149 |         return list(filter(*args, **kwargs))
150 | else:
151 |     import __builtin__
152 |     # Python 2-builtin ranges produce lists
153 |     lrange = __builtin__.range
154 |     lzip = __builtin__.zip
155 |     lmap = __builtin__.map
156 |     lfilter = __builtin__.filter
157 | 
158 | # In Python 3 you can no longer use .encode('hex') on a bytestring
159 | # instead use the following on both platforms
160 | import binascii
161 | def hexlify(bdata):
162 |     return (binascii.hexlify(bdata)).decode('ascii')
163 | 
164 | # If you: import struct
165 | # Note:  struct pack, unpack, unpack_from all *require* bytestring format
166 | # data all the way up to at least Python 2.7.5, Python 3 is okay with either
167 | 
168 | # If you: import re
169 | # note: Python 3 "re" requires the pattern to be the exact same type as the data to be
170 | # searched ... but u"" is not allowed for the pattern itself only b""
171 | # Python 2.X allows the pattern to be any type and converts it to match the data
172 | # and returns the same type as the data
173 | 
174 | # convert string to be utf-8 encoded
175 | def utf8_str(p, enc='utf-8'):
176 |     if p is None:
177 |         return None
178 |     if isinstance(p, text_type):
179 |         return p.encode('utf-8')
180 |     if enc != 'utf-8':
181 |         return p.decode(enc).encode('utf-8')
182 |     return p
183 | 
184 | # convert string to be unicode encoded
185 | def unicode_str(p, enc='utf-8'):
186 |     if p is None:
187 |         return None
188 |     if isinstance(p, text_type):
189 |         return p
190 |     return p.decode(enc)
191 | 
192 | ASCII_CHARS   = set(chr(x) for x in range(128))
193 | URL_SAFE      = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
194 |                     'abcdefghijklmnopqrstuvwxyz'
195 |                     '0123456789' '#' '_.-/~')
196 | IRI_UNSAFE = ASCII_CHARS - URL_SAFE
197 | 
198 | # returns a quoted IRI (not a URI)
199 | def quoteurl(href):
200 |     if isinstance(href,binary_type):
201 |         href = href.decode('utf-8')
202 |     result = []
203 |     for char in href:
204 |         if char in IRI_UNSAFE:
205 |             char = "%%%02x" % ord(char)
206 |         result.append(char)
207 |     return ''.join(result)
208 | 
209 | # unquotes url/iri
210 | def unquoteurl(href):
211 |     if isinstance(href,binary_type):
212 |         href = href.decode('utf-8')
213 |     href = unquote(href)
214 |     return href
215 | 
216 | # unescape html
217 | def unescapeit(sval):
218 |     return _h.unescape(sval)
219 | 
220 | # Python 2.X commandline parsing under Windows has been horribly broken for years!
221 | # Use the following code to emulate full unicode commandline parsing on Python 2
222 | # ie. To get  sys.argv arguments and properly encode them as unicode
223 | 
224 | def unicode_argv():
225 |     global iswindows
226 |     global PY3
227 |     if PY3:
228 |         return sys.argv
229 |     if iswindows:
230 |         # Versions 2.x of Python don't support Unicode in sys.argv on
231 |         # Windows, with the underlying Windows API instead replacing multi-byte
232 |         # characters with '?'.  So use shell32.GetCommandLineArgvW to get sys.argv
233 |         # as a list of Unicode strings
234 |         from ctypes import POINTER, byref, cdll, c_int, windll
235 |         from ctypes.wintypes import LPCWSTR, LPWSTR
236 | 
237 |         GetCommandLineW = cdll.kernel32.GetCommandLineW
238 |         GetCommandLineW.argtypes = []
239 |         GetCommandLineW.restype = LPCWSTR
240 | 
241 |         CommandLineToArgvW = windll.shell32.CommandLineToArgvW
242 |         CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
243 |         CommandLineToArgvW.restype = POINTER(LPWSTR)
244 | 
245 |         cmd = GetCommandLineW()
246 |         argc = c_int(0)
247 |         argv = CommandLineToArgvW(cmd, byref(argc))
248 |         if argc.value > 0:
249 |             # Remove Python executable and commands if present
250 |             start = argc.value - len(sys.argv)
251 |             return [argv[i] for i in
252 |                     range(start, argc.value)]
253 |         # this should never happen
254 |         return None
255 |     else:
256 |         argv = []
257 |         argvencoding = sys.stdin.encoding
258 |         if argvencoding is None:
259 |             argvencoding = sys.getfilesystemencoding()
260 |         if argvencoding is None:
261 |             argvencoding = 'utf-8'
262 |         for arg in sys.argv:
263 |             if isinstance(arg, text_type):
264 |                 argv.append(arg)
265 |             else:
266 |                 argv.append(arg.decode(argvencoding))
267 |         return argv
268 | 
269 | 
270 | # Python 2.X is broken in that it does not recognize CP65001 as UTF-8
271 | def add_cp65001_codec():
272 |     if PY2:
273 |         try:
274 |             codecs.lookup('cp65001')
275 |         except LookupError:
276 |             codecs.register(
277 |                 lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
278 |     return
279 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_ncx.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | import os
  8 | from .unipath import pathof
  9 | from .compatibility_utils import unescapeit
 10 | 
 11 | 
 12 | import re
 13 | # note: re requites the pattern to be the exact same type as the data to be searched in python3
 14 | # but u"" is not allowed for the pattern itself only b""
 15 | 
 16 | from xml.sax.saxutils import escape as xmlescape
 17 | 
 18 | from .mobi_utils import toBase32
 19 | from .mobi_index import MobiIndex
 20 | 
 21 | DEBUG_NCX = False
 22 | 
 23 | class ncxExtract:
 24 | 
 25 |     def __init__(self, mh, files):
 26 |         self.mh = mh
 27 |         self.sect = self.mh.sect
 28 |         self.files = files
 29 |         self.isNCX = False
 30 |         self.mi = MobiIndex(self.sect)
 31 |         self.ncxidx = self.mh.ncxidx
 32 |         self.indx_data = None
 33 | 
 34 |     def parseNCX(self):
 35 |         indx_data = []
 36 |         tag_fieldname_map = {
 37 |                 1: ['pos',0],
 38 |                 2: ['len',0],
 39 |                 3: ['noffs',0],
 40 |                 4: ['hlvl',0],
 41 |                 5: ['koffs',0],
 42 |                 6: ['pos_fid',0],
 43 |                 21: ['parent',0],
 44 |                 22: ['child1',0],
 45 |                 23: ['childn',0]
 46 |         }
 47 |         if self.ncxidx != 0xffffffff:
 48 |             outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
 49 |             if DEBUG_NCX:
 50 |                 print(ctoc_text)
 51 |                 print(outtbl)
 52 |             num = 0
 53 |             for [text, tagMap] in outtbl:
 54 |                 tmp = {
 55 |                         'name': text.decode('utf-8'),
 56 |                         'pos':  -1,
 57 |                         'len':  0,
 58 |                         'noffs': -1,
 59 |                         'text' : "Unknown Text",
 60 |                         'hlvl' : -1,
 61 |                         'kind' : "Unknown Kind",
 62 |                         'pos_fid' : None,
 63 |                         'parent' : -1,
 64 |                         'child1' : -1,
 65 |                         'childn' : -1,
 66 |                         'num'  : num
 67 |                         }
 68 |                 for tag in tag_fieldname_map:
 69 |                     [fieldname, i] = tag_fieldname_map[tag]
 70 |                     if tag in tagMap:
 71 |                         fieldvalue = tagMap[tag][i]
 72 |                         if tag == 6:
 73 |                             pos_fid = toBase32(fieldvalue,4).decode('utf-8')
 74 |                             fieldvalue2 = tagMap[tag][i+1]
 75 |                             pos_off = toBase32(fieldvalue2,10).decode('utf-8')
 76 |                             fieldvalue = 'kindle:pos:fid:%s:off:%s' % (pos_fid, pos_off)
 77 |                         tmp[fieldname] = fieldvalue
 78 |                         if tag == 3:
 79 |                             toctext = ctoc_text.get(fieldvalue, 'Unknown Text')
 80 |                             toctext = toctext.decode(self.mh.codec)
 81 |                             tmp['text'] = toctext
 82 |                         if tag == 5:
 83 |                             kindtext = ctoc_text.get(fieldvalue, 'Unknown Kind')
 84 |                             kindtext = kindtext.decode(self.mh.codec)
 85 |                             tmp['kind'] = kindtext
 86 |                 indx_data.append(tmp)
 87 |                 if DEBUG_NCX:
 88 |                     print("record number: ", num)
 89 |                     print("name: ", tmp['name'],)
 90 |                     print("position", tmp['pos']," length: ", tmp['len'])
 91 |                     print("text: ", tmp['text'])
 92 |                     print("kind: ", tmp['kind'])
 93 |                     print("heading level: ", tmp['hlvl'])
 94 |                     print("parent:", tmp['parent'])
 95 |                     print("first child: ",tmp['child1']," last child: ", tmp['childn'])
 96 |                     print("pos_fid is ", tmp['pos_fid'])
 97 |                     print("\n\n")
 98 |                 num += 1
 99 |         self.indx_data = indx_data
100 |         return indx_data
101 | 
102 |     def buildNCX(self, htmlfile, title, ident, lang):
103 |         indx_data = self.indx_data
104 | 
105 |         ncx_header = \
106 | '''<?xml version='1.0' encoding='utf-8'?>
107 | <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
108 | <head>
109 | <meta content="%s" name="dtb:uid"/>
110 | <meta content="%d" name="dtb:depth"/>
111 | <meta content="mobiunpack.py" name="dtb:generator"/>
112 | <meta content="0" name="dtb:totalPageCount"/>
113 | <meta content="0" name="dtb:maxPageNumber"/>
114 | </head>
115 | <docTitle>
116 | <text>%s</text>
117 | </docTitle>
118 | <navMap>
119 | '''
120 | 
121 |         ncx_footer = \
122 | '''  </navMap>
123 | </ncx>
124 | '''
125 | 
126 |         ncx_entry = \
127 | '''<navPoint id="%s" playOrder="%d">
128 | <navLabel>
129 | <text>%s</text>
130 | </navLabel>
131 | <content src="%s"/>'''
132 | 
133 |         # recursive part
134 |         def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
135 |             if start>len(indx_data) or end>len(indx_data):
136 |                 print("Warning: missing INDX child entries", start, end, len(indx_data))
137 |                 return ''
138 |             if DEBUG_NCX:
139 |                 print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
140 |             xml = ''
141 |             if start <= 0:
142 |                 start = 0
143 |             if end <= 0:
144 |                 end = len(indx_data)
145 |             if lvl > max_lvl:
146 |                 max_lvl = lvl
147 |             indent = '  ' * (2 + lvl)
148 | 
149 |             for i in range(start, end):
150 |                 e = indx_data[i]
151 |                 if not e['hlvl'] == lvl:
152 |                     continue
153 |                 # open entry
154 |                 num += 1
155 |                 link = '%s#filepos%d' % (htmlfile, e['pos'])
156 |                 tagid = 'np_%d' % num
157 |                 entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
158 |                 entry = re.sub(re.compile('^', re.M), indent, entry, 0)
159 |                 xml += entry + '\n'
160 |                 # recurs
161 |                 if e['child1']>=0:
162 |                     xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
163 |                             e['child1'], e['childn'] + 1)
164 |                     xml += xmlrec
165 |                 # close entry
166 |                 xml += indent + '</navPoint>\n'
167 |             return xml, max_lvl, num
168 | 
169 |         body, max_lvl, num = recursINDX()
170 |         header = ncx_header % (lang, ident, max_lvl + 1, title)
171 |         ncx =  header + body + ncx_footer
172 |         if not len(indx_data) == num:
173 |             print("Warning: different number of entries in NCX", len(indx_data), num)
174 |         return ncx
175 | 
176 |     def writeNCX(self, metadata):
177 |         # build the xml
178 |         self.isNCX = True
179 |         print("Write ncx")
180 |         # htmlname = os.path.basename(self.files.outbase)
181 |         # htmlname += '.html'
182 |         htmlname = 'book.html'
183 |         xml = self.buildNCX(htmlname, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
184 |         # write the ncx file
185 |         # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx')
186 |         ncxname = os.path.join(self.files.mobi7dir, 'toc.ncx')
187 |         with open(pathof(ncxname), 'wb') as f:
188 |             f.write(xml.encode('utf-8'))
189 | 
190 |     def buildK8NCX(self, indx_data, title, ident, lang):
191 |         ncx_header = \
192 | '''<?xml version='1.0' encoding='utf-8'?>
193 | <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="%s">
194 | <head>
195 | <meta content="%s" name="dtb:uid"/>
196 | <meta content="%d" name="dtb:depth"/>
197 | <meta content="mobiunpack.py" name="dtb:generator"/>
198 | <meta content="0" name="dtb:totalPageCount"/>
199 | <meta content="0" name="dtb:maxPageNumber"/>
200 | </head>
201 | <docTitle>
202 | <text>%s</text>
203 | </docTitle>
204 | <navMap>
205 | '''
206 | 
207 |         ncx_footer = \
208 | '''  </navMap>
209 | </ncx>
210 | '''
211 | 
212 |         ncx_entry = \
213 | '''<navPoint id="%s" playOrder="%d">
214 | <navLabel>
215 | <text>%s</text>
216 | </navLabel>
217 | <content src="%s"/>'''
218 | 
219 |         # recursive part
220 |         def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1):
221 |             if start>len(indx_data) or end>len(indx_data):
222 |                 print("Warning: missing INDX child entries", start, end, len(indx_data))
223 |                 return ''
224 |             if DEBUG_NCX:
225 |                 print("recursINDX lvl %d from %d to %d" % (lvl, start, end))
226 |             xml = ''
227 |             if start <= 0:
228 |                 start = 0
229 |             if end <= 0:
230 |                 end = len(indx_data)
231 |             if lvl > max_lvl:
232 |                 max_lvl = lvl
233 |             indent = '  ' * (2 + lvl)
234 | 
235 |             for i in range(start, end):
236 |                 e = indx_data[i]
237 |                 htmlfile = e['filename']
238 |                 desttag = e['idtag']
239 |                 if not e['hlvl'] == lvl:
240 |                     continue
241 |                 # open entry
242 |                 num += 1
243 |                 if desttag == '':
244 |                     link = 'Text/%s' % htmlfile
245 |                 else:
246 |                     link = 'Text/%s#%s' % (htmlfile, desttag)
247 |                 tagid = 'np_%d' % num
248 |                 entry = ncx_entry % (tagid, num, xmlescape(unescapeit(e['text'])), link)
249 |                 entry = re.sub(re.compile('^', re.M), indent, entry, 0)
250 |                 xml += entry + '\n'
251 |                 # recurs
252 |                 if e['child1']>=0:
253 |                     xmlrec, max_lvl, num = recursINDX(max_lvl, num, lvl + 1,
254 |                             e['child1'], e['childn'] + 1)
255 |                     xml += xmlrec
256 |                 # close entry
257 |                 xml += indent + '</navPoint>\n'
258 |             return xml, max_lvl, num
259 | 
260 |         body, max_lvl, num = recursINDX()
261 |         header = ncx_header % (lang, ident, max_lvl + 1, title)
262 |         ncx =  header + body + ncx_footer
263 |         if not len(indx_data) == num:
264 |             print("Warning: different number of entries in NCX", len(indx_data), num)
265 |         return ncx
266 | 
267 |     def writeK8NCX(self, ncx_data, metadata):
268 |         # build the xml
269 |         self.isNCX = True
270 |         print("Write K8 ncx")
271 |         xml = self.buildK8NCX(ncx_data, metadata['Title'][0], metadata['UniqueID'][0], metadata.get('Language')[0])
272 |         bname = 'toc.ncx'
273 |         ncxname = os.path.join(self.files.k8oebps,bname)
274 |         with open(pathof(ncxname), 'wb') as f:
275 |             f.write(xml.encode('utf-8'))
276 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_k8resc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | DEBUG_USE_ORDERED_DICTIONARY = False  # OrderedDict is supoorted >= python 2.7.
  8 | """ set to True to use OrderedDict for K8RESCProcessor.parsetag.tattr."""
  9 | 
 10 | if DEBUG_USE_ORDERED_DICTIONARY:
 11 |     from collections import OrderedDict as dict_
 12 | else:
 13 |     dict_ = dict
 14 | 
 15 | from .compatibility_utils import unicode_str
 16 | 
 17 | from .mobi_utils import fromBase32
 18 | 
 19 | _OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata',
 20 |                     'x-metadata', 'manifest', 'spine', 'tours', 'guide']
 21 | 
 22 | class K8RESCProcessor(object):
 23 | 
 24 |     def __init__(self, data, debug=False):
 25 |         self._debug = debug
 26 |         self.resc = None
 27 |         self.opos = 0
 28 |         self.extrameta = []
 29 |         self.cover_name = None
 30 |         self.spine_idrefs = {}
 31 |         self.spine_order = []
 32 |         self.spine_pageattributes = {}
 33 |         self.spine_ppd = None
 34 |         # need3 indicate the book has fields which require epub3.
 35 |         # but the estimation of the source epub version from the fields is difficult.
 36 |         self.need3 = False
 37 |         self.package_ver = None
 38 |         self.extra_metadata = []
 39 |         self.refines_metadata = []
 40 |         self.extra_attributes = []
 41 |         # get header
 42 |         start_pos = data.find(b'<')
 43 |         self.resc_header = data[:start_pos]
 44 |         # get resc data length
 45 |         start = self.resc_header.find(b'=') + 1
 46 |         end = self.resc_header.find(b'&', start)
 47 |         resc_size = 0
 48 |         if end > 0:
 49 |             resc_size = fromBase32(self.resc_header[start:end])
 50 |         resc_rawbytes = len(data) - start_pos
 51 |         if resc_rawbytes == resc_size:
 52 |             self.resc_length = resc_size
 53 |         else:
 54 |             # Most RESC has a nul string at its tail but some do not.
 55 |             end_pos = data.find(b'\x00', start_pos)
 56 |             if end_pos < 0:
 57 |                 self.resc_length = resc_rawbytes
 58 |             else:
 59 |                 self.resc_length = end_pos - start_pos
 60 |         if self.resc_length != resc_size:
 61 |             print("Warning: RESC section length({:d}bytes) does not match its size({:d}bytes).".format(self.resc_length, resc_size))
 62 |         # now parse RESC after converting it to unicode from utf-8
 63 |         try:
 64 |             self.resc = unicode_str(data[start_pos:start_pos+self.resc_length])
 65 |         except UnicodeDecodeError:
 66 |             self.resc = unicode_str(data[start_pos:start_pos+self.resc_length], enc='latin-1')
 67 |         self.parseData()
 68 | 
 69 |     def prepend_to_spine(self, key, idref, linear, properties):
 70 |         self.spine_order = [key] + self.spine_order
 71 |         self.spine_idrefs[key] = idref
 72 |         attributes = {}
 73 |         if linear is not None:
 74 |             attributes['linear'] = linear
 75 |         if properties is not None:
 76 |             attributes['properties'] = properties
 77 |         self.spine_pageattributes[key] = attributes
 78 | 
 79 |     # RESC tag iterator
 80 |     def resc_tag_iter(self):
 81 |         tcontent = last_tattr = None
 82 |         prefix = ['']
 83 |         while True:
 84 |             text, tag = self.parseresc()
 85 |             if text is None and tag is None:
 86 |                 break
 87 |             if text is not None:
 88 |                 tcontent = text.rstrip(' \r\n')
 89 |             else:  # we have a tag
 90 |                 ttype, tname, tattr = self.parsetag(tag)
 91 |                 if ttype == 'begin':
 92 |                     tcontent = None
 93 |                     prefix.append(tname + '.')
 94 |                     if tname in _OPF_PARENT_TAGS:
 95 |                         yield ''.join(prefix), tname, tattr, tcontent
 96 |                     else:
 97 |                         last_tattr = tattr
 98 |                 else:  # single or end
 99 |                     if ttype == 'end':
100 |                         prefix.pop()
101 |                         tattr = last_tattr
102 |                         last_tattr = None
103 |                         if tname in _OPF_PARENT_TAGS:
104 |                             tname += '-end'
105 |                     yield ''.join(prefix), tname, tattr, tcontent
106 |                     tcontent = None
107 | 
108 |     # now parse the RESC to extract spine and extra metadata info
109 |     def parseData(self):
110 |         for prefix, tname, tattr, tcontent in self.resc_tag_iter():
111 |             if self._debug:
112 |                 print("   Parsing RESC: ", prefix, tname, tattr, tcontent)
113 |             if tname == 'package':
114 |                 self.package_ver = tattr.get('version', '2.0')
115 |                 package_prefix = tattr.get('prefix','')
116 |                 if self.package_ver.startswith('3') or package_prefix.startswith('rendition'):
117 |                     self.need3 = True
118 |             if tname == 'spine':
119 |                 self.spine_ppd = tattr.get('page-progession-direction', None)
120 |                 if self.spine_ppd is not None and self.spine_ppd == 'rtl':
121 |                     self.need3 = True
122 |             if tname == 'itemref':
123 |                 skelid = tattr.pop('skelid', None)
124 |                 if skelid is None and len(self.spine_order) == 0:
125 |                     # assume it was removed initial coverpage
126 |                     skelid = 'coverpage'
127 |                     tattr['linear'] = 'no'
128 |                 self.spine_order.append(skelid)
129 |                 idref = tattr.pop('idref', None)
130 |                 if idref is not None:
131 |                     idref = 'x_' + idref
132 |                 self.spine_idrefs[skelid] = idref
133 |                 if 'id' in tattr:
134 |                     del tattr['id']
135 |                 # tattr["id"] = 'x_' + tattr["id"]
136 |                 if 'properties' in tattr:
137 |                     self.need3 = True
138 |                 self.spine_pageattributes[skelid] = tattr
139 |             if tname == 'meta' or tname.startswith('dc:'):
140 |                 if 'refines' in tattr or 'property' in tattr:
141 |                     self.need3 = True
142 |                 if tattr.get('name','') == 'cover':
143 |                     cover_name = tattr.get('content',None)
144 |                     if cover_name is not None:
145 |                         cover_name = 'x_' + cover_name
146 |                     self.cover_name = cover_name
147 |                 else:
148 |                     self.extrameta.append([tname, tattr, tcontent])
149 | 
150 |     # parse and return either leading text or the next tag
151 |     def parseresc(self):
152 |         p = self.opos
153 |         if p >= len(self.resc):
154 |             return None, None
155 |         if self.resc[p] != '<':
156 |             res = self.resc.find('<',p)
157 |             if res == -1 :
158 |                 res = len(self.resc)
159 |             self.opos = res
160 |             return self.resc[p:res], None
161 |         # handle comment as a special case
162 |         if self.resc[p:p+4] == '<!--':
163 |             te = self.resc.find('-->',p+1)
164 |             if te != -1:
165 |                 te = te+2
166 |         else:
167 |             te = self.resc.find('>',p+1)
168 |             ntb = self.resc.find('<',p+1)
169 |             if ntb != -1 and ntb < te:
170 |                 self.opos = ntb
171 |                 return self.resc[p:ntb], None
172 |         self.opos = te + 1
173 |         return None, self.resc[p:te+1]
174 | 
175 |     # parses tag to identify:  [tname, ttype, tattr]
176 |     #    tname: tag name
177 |     #    ttype: tag type ('begin', 'end' or 'single');
178 |     #    tattr: dictionary of tag atributes
179 |     def parsetag(self, s):
180 |         p = 1
181 |         tname = None
182 |         ttype = None
183 |         tattr = dict_()
184 |         while s[p:p+1] == ' ' :
185 |             p += 1
186 |         if s[p:p+1] == '/':
187 |             ttype = 'end'
188 |             p += 1
189 |             while s[p:p+1] == ' ' :
190 |                 p += 1
191 |         b = p
192 |         while s[p:p+1] not in ('>', '/', ' ', '"', "'",'\r','\n') :
193 |             p += 1
194 |         tname=s[b:p].lower()
195 |         # some special cases
196 |         if tname == '?xml':
197 |             tname = 'xml'
198 |         if tname == '!--':
199 |             ttype = 'single'
200 |             comment = s[p:-3].strip()
201 |             tattr['comment'] = comment
202 |         if ttype is None:
203 |             # parse any attributes of begin or single tags
204 |             while s.find('=',p) != -1 :
205 |                 while s[p:p+1] == ' ' :
206 |                     p += 1
207 |                 b = p
208 |                 while s[p:p+1] != '=' :
209 |                     p += 1
210 |                 aname = s[b:p].lower()
211 |                 aname = aname.rstrip(' ')
212 |                 p += 1
213 |                 while s[p:p+1] == ' ' :
214 |                     p += 1
215 |                 if s[p:p+1] in ('"', "'") :
216 |                     p = p + 1
217 |                     b = p
218 |                     while s[p:p+1] not in ('"', "'"):
219 |                         p += 1
220 |                     val = s[b:p]
221 |                     p += 1
222 |                 else :
223 |                     b = p
224 |                     while s[p:p+1] not in ('>', '/', ' ') :
225 |                         p += 1
226 |                     val = s[b:p]
227 |                 tattr[aname] = val
228 |         if ttype is None:
229 |             ttype = 'begin'
230 |             if s.find('/',p) >= 0:
231 |                 ttype = 'single'
232 |         return ttype, tname, tattr
233 | 
234 |     def taginfo_toxml(self, taginfo):
235 |         res = []
236 |         tname, tattr, tcontent = taginfo
237 |         res.append('<' + tname)
238 |         if tattr is not None:
239 |             for key in tattr:
240 |                 res.append(' ' + key + '="'+tattr[key]+'"')
241 |         if tcontent is not None:
242 |             res.append('>' + tcontent + '</' + tname + '>\n')
243 |         else:
244 |             res.append('/>\n')
245 |         return "".join(res)
246 | 
247 |     def hasSpine(self):
248 |         return len(self.spine_order) > 0
249 | 
250 |     def needEPUB3(self):
251 |         return self.need3
252 | 
253 |     def hasRefines(self):
254 |         for [tname, tattr, tcontent] in self.extrameta:
255 |             if 'refines' in tattr:
256 |                 return True
257 |         return False
258 | 
259 |     def createMetadata(self, epubver):
260 |         for taginfo in self.extrameta:
261 |             tname, tattr, tcontent = taginfo
262 |             if 'refines' in tattr:
263 |                 if epubver == 'F' and 'property' in tattr:
264 |                     attr = ' id="%s" opf:%s="%s"\n' % (tattr['refines'], tattr['property'], tcontent)
265 |                     self.extra_attributes.append(attr)
266 |                 else:
267 |                     tag = self.taginfo_toxml(taginfo)
268 |                     self.refines_metadata.append(tag)
269 |             else:
270 |                 tag = self.taginfo_toxml(taginfo)
271 |                 self.extra_metadata.append(tag)
272 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_index.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import PY2, bchr, bstr, bord
  8 | if PY2:
  9 |     range = xrange
 10 | 
 11 | import struct
 12 | # note:  struct pack, unpack, unpack_from all require bytestring format
 13 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 14 | 
 15 | from .mobi_utils import toHex
 16 | 
 17 | class MobiIndex:
 18 | 
 19 |     def __init__(self, sect, DEBUG=False):
 20 |         self.sect = sect
 21 |         self.DEBUG = DEBUG
 22 | 
 23 |     def getIndexData(self, idx, label="Unknown"):
 24 |         sect = self.sect
 25 |         outtbl = []
 26 |         ctoc_text = {}
 27 |         if idx != 0xffffffff:
 28 |             sect.setsectiondescription(idx,"{0} Main INDX section".format(label))
 29 |             data = sect.loadSection(idx)
 30 |             idxhdr, hordt1, hordt2 = self.parseINDXHeader(data)
 31 |             IndexCount = idxhdr['count']
 32 |             # handle the case of multiple sections used for CTOC
 33 |             rec_off = 0
 34 |             off = idx + IndexCount + 1
 35 |             for j in range(idxhdr['nctoc']):
 36 |                 cdata = sect.loadSection(off + j)
 37 |                 sect.setsectiondescription(off+j, label + ' CTOC Data ' + str(j))
 38 |                 ctocdict = self.readCTOC(cdata)
 39 |                 for k in ctocdict:
 40 |                     ctoc_text[k + rec_off] = ctocdict[k]
 41 |                 rec_off += 0x10000
 42 |             tagSectionStart = idxhdr['len']
 43 |             controlByteCount, tagTable = readTagSection(tagSectionStart, data)
 44 |             if self.DEBUG:
 45 |                 print("ControlByteCount is", controlByteCount)
 46 |                 print("IndexCount is", IndexCount)
 47 |                 print("TagTable: %s" % tagTable)
 48 |             for i in range(idx + 1, idx + 1 + IndexCount):
 49 |                 sect.setsectiondescription(i,"{0} Extra {1:d} INDX section".format(label,i-idx))
 50 |                 data = sect.loadSection(i)
 51 |                 hdrinfo, ordt1, ordt2 = self.parseINDXHeader(data)
 52 |                 idxtPos = hdrinfo['start']
 53 |                 entryCount = hdrinfo['count']
 54 |                 if self.DEBUG:
 55 |                     print(idxtPos, entryCount)
 56 |                 # loop through to build up the IDXT position starts
 57 |                 idxPositions = []
 58 |                 for j in range(entryCount):
 59 |                     pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
 60 |                     idxPositions.append(pos)
 61 |                 # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
 62 |                 idxPositions.append(idxtPos)
 63 |                 # for each entry in the IDXT build up the tagMap and any associated text
 64 |                 for j in range(entryCount):
 65 |                     startPos = idxPositions[j]
 66 |                     endPos = idxPositions[j+1]
 67 |                     textLength = ord(data[startPos:startPos+1])
 68 |                     text = data[startPos+1:startPos+1+textLength]
 69 |                     if hordt2 is not None:
 70 |                         text = b''.join(bchr(hordt2[bord(x)]) for x in text)
 71 |                     tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
 72 |                     outtbl.append([text, tagMap])
 73 |                     if self.DEBUG:
 74 |                         print(tagMap)
 75 |                         print(text)
 76 |         return outtbl, ctoc_text
 77 | 
 78 |     def parseINDXHeader(self, data):
 79 |         "read INDX header"
 80 |         if not data[:4] == b'INDX':
 81 |             print("Warning: index section is not INDX")
 82 |             return False
 83 |         words = (
 84 |                 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
 85 |                 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
 86 |         )
 87 |         num = len(words)
 88 |         values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
 89 |         header = {}
 90 |         for n in range(num):
 91 |             header[words[n]] = values[n]
 92 | 
 93 |         ordt1 = None
 94 |         ordt2 = None
 95 | 
 96 |         ocnt, oentries, op1, op2, otagx  = struct.unpack_from(b'>LLLLL',data, 0xa4)
 97 |         if header['code'] == 0xfdea or ocnt != 0 or oentries > 0:
 98 |             # horribly hacked up ESP (sample) mobi books use two ORDT sections but never specify
 99 |             # them in the proper place in the header.  They seem to be codepage 65002 which seems
100 |             # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
101 | 
102 |             # so we need to look for them and store them away to process leading text
103 |             # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
104 |             # we only ever seem to use the seocnd but ...
105 |             assert(ocnt == 1)
106 |             assert(data[op1:op1+4] == b'ORDT')
107 |             assert(data[op2:op2+4] == b'ORDT')
108 |             ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
109 |             ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
110 | 
111 |         if self.DEBUG:
112 |             print("parsed INDX header:")
113 |             for n in words:
114 |                 print(n, "%X" % header[n],)
115 |             print("")
116 |         return header, ordt1, ordt2
117 | 
118 |     def readCTOC(self, txtdata):
119 |         # read all blocks from CTOC
120 |         ctoc_data = {}
121 |         offset = 0
122 |         while offset<len(txtdata):
123 |             if PY2:
124 |                 if txtdata[offset] == b'\0':
125 |                     break
126 |             else:
127 |                 if txtdata[offset] == 0:
128 |                     break
129 |             idx_offs = offset
130 |             # first n bytes: name len as vwi
131 |             pos, ilen = getVariableWidthValue(txtdata, offset)
132 |             offset += pos
133 |             # <len> next bytes: name
134 |             name = txtdata[offset:offset+ilen]
135 |             offset += ilen
136 |             if self.DEBUG:
137 |                 print("name length is ", ilen)
138 |                 print(idx_offs, name)
139 |             ctoc_data[idx_offs] = name
140 |         return ctoc_data
141 | 
142 | 
143 | def getVariableWidthValue(data, offset):
144 |     '''
145 |     Decode variable width value from given bytes.
146 | 
147 |     @param data: The bytes to decode.
148 |     @param offset: The start offset into data.
149 |     @return: Tuple of consumed bytes count and decoded value.
150 |     '''
151 |     value = 0
152 |     consumed = 0
153 |     finished = False
154 |     while not finished:
155 |         v = data[offset + consumed: offset + consumed + 1]
156 |         consumed += 1
157 |         if ord(v) & 0x80:
158 |             finished = True
159 |         value = (value << 7) | (ord(v) & 0x7f)
160 |     return consumed, value
161 | 
162 | 
163 | def readTagSection(start, data):
164 |     '''
165 |     Read tag section from given data.
166 | 
167 |     @param start: The start position in the data.
168 |     @param data: The data to process.
169 |     @return: Tuple of control byte count and list of tag tuples.
170 |     '''
171 |     controlByteCount = 0
172 |     tags = []
173 |     if data[start:start+4] == b"TAGX":
174 |         firstEntryOffset, = struct.unpack_from(b'>L', data, start + 0x04)
175 |         controlByteCount, = struct.unpack_from(b'>L', data, start + 0x08)
176 | 
177 |         # Skip the first 12 bytes already read above.
178 |         for i in range(12, firstEntryOffset, 4):
179 |             pos = start + i
180 |             tags.append((ord(data[pos:pos+1]), ord(data[pos+1:pos+2]), ord(data[pos+2:pos+3]), ord(data[pos+3:pos+4])))
181 |     return controlByteCount, tags
182 | 
183 | 
184 | def countSetBits(value, bits=8):
185 |     '''
186 |     Count the set bits in the given value.
187 | 
188 |     @param value: Integer value.
189 |     @param bits: The number of bits of the input value (defaults to 8).
190 |     @return: Number of set bits.
191 |     '''
192 |     count = 0
193 |     for _ in range(bits):
194 |         if value & 0x01 == 0x01:
195 |             count += 1
196 |         value = value >> 1
197 |     return count
198 | 
199 | 
200 | def getTagMap(controlByteCount, tagTable, entryData, startPos, endPos):
201 |     '''
202 |     Create a map of tags and values from the given byte section.
203 | 
204 |     @param controlByteCount: The number of control bytes.
205 |     @param tagTable: The tag table.
206 |     @param entryData: The data to process.
207 |     @param startPos: The starting position in entryData.
208 |     @param endPos: The end position in entryData or None if it is unknown.
209 |     @return: Hashmap of tag and list of values.
210 |     '''
211 |     tags = []
212 |     tagHashMap = {}
213 |     controlByteIndex = 0
214 |     dataStart = startPos + controlByteCount
215 | 
216 |     for tag, valuesPerEntry, mask, endFlag in tagTable:
217 |         if endFlag == 0x01:
218 |             controlByteIndex += 1
219 |             continue
220 |         cbyte = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1])
221 |         if 0:
222 |             print("Control Byte Index %0x , Control Byte Value %0x" % (controlByteIndex, cbyte))
223 | 
224 |         value = ord(entryData[startPos + controlByteIndex:startPos + controlByteIndex+1]) & mask
225 |         if value != 0:
226 |             if value == mask:
227 |                 if countSetBits(mask) > 1:
228 |                     # If all bits of masked value are set and the mask has more than one bit, a variable width value
229 |                     # will follow after the control bytes which defines the length of bytes (NOT the value count!)
230 |                     # which will contain the corresponding variable width values.
231 |                     consumed, value = getVariableWidthValue(entryData, dataStart)
232 |                     dataStart += consumed
233 |                     tags.append((tag, None, value, valuesPerEntry))
234 |                 else:
235 |                     tags.append((tag, 1, None, valuesPerEntry))
236 |             else:
237 |                 # Shift bits to get the masked value.
238 |                 while mask & 0x01 == 0:
239 |                     mask = mask >> 1
240 |                     value = value >> 1
241 |                 tags.append((tag, value, None, valuesPerEntry))
242 |     for tag, valueCount, valueBytes, valuesPerEntry in tags:
243 |         values = []
244 |         if valueCount is not None:
245 |             # Read valueCount * valuesPerEntry variable width values.
246 |             for _ in range(valueCount):
247 |                 for _ in range(valuesPerEntry):
248 |                     consumed, data = getVariableWidthValue(entryData, dataStart)
249 |                     dataStart += consumed
250 |                     values.append(data)
251 |         else:
252 |             # Convert valueBytes to variable width values.
253 |             totalConsumed = 0
254 |             while totalConsumed < valueBytes:
255 |                 # Does this work for valuesPerEntry != 1?
256 |                 consumed, data = getVariableWidthValue(entryData, dataStart)
257 |                 dataStart += consumed
258 |                 totalConsumed += consumed
259 |                 values.append(data)
260 |             if totalConsumed != valueBytes:
261 |                 print("Error: Should consume %s bytes, but consumed %s" % (valueBytes, totalConsumed))
262 |         tagHashMap[tag] = values
263 |     # Test that all bytes have been processed if endPos is given.
264 |     if endPos is not None and dataStart != endPos:
265 |         # The last entry might have some zero padding bytes, so complain only if non zero bytes are left.
266 |         for char in entryData[dataStart:endPos]:
267 |             if bord(char) != 0:
268 |                 print("Warning: There are unprocessed index bytes left: %s" % toHex(entryData[dataStart:endPos]))
269 |                 if 0:
270 |                     print("controlByteCount: %s" % controlByteCount)
271 |                     print("tagTable: %s" % tagTable)
272 |                     print("data: %s" % toHex(entryData[startPos:endPos]))
273 |                     print("tagHashMap: %s" % tagHashMap)
274 |                 break
275 | 
276 |     return tagHashMap
277 | 


--------------------------------------------------------------------------------
/src/baca/app.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import dataclasses
  3 | from datetime import datetime
  4 | from pathlib import Path
  5 | from typing import Type
  6 | 
  7 | from textual import events
  8 | from textual.actions import SkipAction
  9 | from textual.app import App, ComposeResult
 10 | from textual.css.query import NoMatches
 11 | from textual.widgets import LoadingIndicator
 12 | 
 13 | from baca.components.contents import Content
 14 | from baca.components.events import (
 15 |     DoneLoading,
 16 |     FollowThis,
 17 |     OpenThisImage,
 18 |     Screenshot,
 19 |     SearchSubmitted,
 20 | )
 21 | from baca.components.windows import Alert, DictDisplay, SearchInputPrompt, ToC
 22 | from baca.config import load_config
 23 | from baca.ebooks import Ebook
 24 | from baca.exceptions import LaunchingFileError
 25 | from baca.models import Coordinate, KeyMap, ReadingHistory, SearchMode
 26 | from baca.utils.app_resources import get_resource_file
 27 | from baca.utils.keys_parser import dispatch_key
 28 | from baca.utils.systems import launch_file
 29 | from baca.utils.urls import is_url
 30 | 
 31 | 
 32 | class Baca(App):
 33 |     CSS_PATH = str(get_resource_file("style.css"))
 34 | 
 35 |     def __init__(self, ebook_path: Path, ebook_class: Type[Ebook]):
 36 |         # load first to resolve css variables
 37 |         self.config = load_config()
 38 |         super().__init__()
 39 |         self.ebook_path = ebook_path
 40 |         self.ebook_class = ebook_class
 41 |         # TODO: make reactive and display percentage
 42 |         # as alternative for scrollbar
 43 |         self.reading_progress = 0.0
 44 |         self.search_mode = None
 45 | 
 46 |     def on_load(self, _: events.Load) -> None:
 47 |         assert self._loop is not None
 48 |         self._loop.run_in_executor(None, self.load_everything)
 49 | 
 50 |     def load_everything(self):
 51 |         self.ebook = self.ebook_class(self.ebook_path)
 52 |         content = Content(self.config, self.ebook)
 53 |         self.ebook_state, _ = ReadingHistory.get_or_create(
 54 |             filepath=str(self.ebook.get_path()), defaults=dict(reading_progress=0.0)
 55 |         )
 56 |         # NOTE: using a message instead of calling
 57 |         # the callback directly to make sure that the app is ready
 58 |         # before calling the callback, since this message will
 59 |         # get processed after app ready and composed
 60 |         # (self._screen_stack isn't empty)
 61 |         # see: Widget.on_event(), App._process_message()
 62 |         self.post_message(DoneLoading(content))
 63 | 
 64 |     async def on_done_loading(self, event: DoneLoading) -> None:
 65 |         # to be safe, unnecessary?
 66 |         # while self.screen is None:
 67 |         #     await asyncio.sleep(0.1)
 68 | 
 69 |         # NOTE: await to prevent broken layout
 70 |         await self.mount(event.content)
 71 | 
 72 |         def restore_reading_progress() -> None:
 73 |             # restore reading progress
 74 |             # make sure to call this after refresh so the screen.max_scroll_y != 0
 75 |             self.reading_progress = self.ebook_state.reading_progress * self.screen.max_scroll_y
 76 |             self.screen.scroll_to(None, self.reading_progress, duration=0, animate=False)  # type: ignore
 77 | 
 78 |             self.get_widget_by_id("startup-loader", LoadingIndicator).remove()
 79 | 
 80 |         def show_images() -> None:
 81 |             self.content.show_ansi_images()
 82 |             self.refresh(layout=True)
 83 |             self.call_after_refresh(restore_reading_progress)
 84 | 
 85 |         self.call_after_refresh(show_images)
 86 | 
 87 |     def on_mount(self):
 88 |         def screen_watch_scroll_y_wrapper(old_watcher, screen):
 89 |             def new_watcher(old, new):
 90 |                 result = old_watcher(old, new)
 91 |                 if screen.max_scroll_y != 0:
 92 |                     self.reading_progress = new / screen.max_scroll_y
 93 |                 return result
 94 | 
 95 |             return new_watcher
 96 | 
 97 |         screen_scroll_y_watcher = getattr(self.screen, "watch_scroll_y")
 98 |         setattr(self.screen, "watch_scroll_y", screen_watch_scroll_y_wrapper(screen_scroll_y_watcher, self.screen))
 99 | 
100 |     def get_css_variables(self):
101 |         original = super().get_css_variables()
102 |         return {
103 |             **original,
104 |             **{
105 |                 "text-max-width": self.config.max_text_width,
106 |                 "text-justification": self.config.text_justification,
107 |                 "dark-bg": self.config.dark.bg,
108 |                 "dark-fg": self.config.dark.fg,
109 |                 "dark-accent": self.config.dark.accent,
110 |                 "light-bg": self.config.light.bg,
111 |                 "light-fg": self.config.light.fg,
112 |                 "light-accent": self.config.light.accent,
113 |             },
114 |         }
115 | 
116 |     async def on_key(self, event: events.Key) -> None:
117 |         keymaps = self.config.keymaps
118 |         await dispatch_key(
119 |             [
120 |                 KeyMap(keymaps.close, self.action_cancel_search_or_quit),
121 |                 KeyMap(keymaps.scroll_down, self.screen.action_scroll_down),
122 |                 KeyMap(keymaps.scroll_up, self.screen.action_scroll_up),
123 |                 # KeyMap(keymaps.page_up, self.screen.action_page_up),
124 |                 # KeyMap(keymaps.page_down, self.screen.action_page_down),
125 |                 KeyMap(keymaps.page_up, self.action_page_up),
126 |                 KeyMap(keymaps.page_down, self.action_page_down),
127 |                 KeyMap(keymaps.home, self.screen.action_scroll_home),
128 |                 KeyMap(keymaps.end, self.screen.action_scroll_end),
129 |                 KeyMap(keymaps.open_toc, self.action_open_toc),
130 |                 KeyMap(keymaps.open_metadata, self.action_open_metadata),
131 |                 KeyMap(keymaps.open_help, self.action_open_help),
132 |                 KeyMap(keymaps.toggle_dark, self.action_toggle_dark),
133 |                 KeyMap(keymaps.screenshot, lambda: self.post_message(Screenshot())),
134 |                 KeyMap(keymaps.search_forward, lambda: self.action_input_search(forward=True)),
135 |                 KeyMap(keymaps.search_backward, lambda: self.action_input_search(forward=False)),
136 |                 KeyMap(keymaps.next_match, self.action_search_next),
137 |                 KeyMap(keymaps.prev_match, self.action_search_prev),
138 |                 KeyMap(keymaps.confirm, self.action_stop_search),
139 |                 # KeyMap(["D"], lambda: self.log()),
140 |             ],
141 |             event,
142 |         )
143 | 
144 |     def compose(self) -> ComposeResult:
145 |         yield LoadingIndicator(id="startup-loader")
146 | 
147 |     async def alert(self, message: str) -> None:
148 |         alert = Alert(self.config, message)
149 |         await self.mount(alert)
150 | 
151 |     async def action_open_metadata(self) -> None:
152 |         if self.metadata_window is None:
153 |             metadata_window = DictDisplay(
154 |                 config=self.config, id="metadata", title="Metadata", data=dataclasses.asdict(self.ebook.get_meta())
155 |             )
156 |             await self.mount(metadata_window)
157 | 
158 |     def action_page_down(self) -> None:
159 |         if not self.screen.allow_vertical_scroll:
160 |             raise SkipAction()
161 |         self.screen.scroll_page_down(duration=self.config.page_scroll_duration)
162 | 
163 |     def action_page_up(self) -> None:
164 |         if not self.screen.allow_vertical_scroll:
165 |             raise SkipAction()
166 |         self.screen.scroll_page_up(duration=self.config.page_scroll_duration)
167 | 
168 |     async def action_input_search(self, forward: bool) -> None:
169 |         await self.mount(SearchInputPrompt(forward=forward))
170 | 
171 |     async def action_search_next(self) -> bool:
172 |         if self.search_mode is not None:
173 |             new_coord = await self.content.search_next(
174 |                 self.search_mode.pattern_str,
175 |                 self.search_mode.current_coord,
176 |                 self.search_mode.forward,
177 |             )
178 |             if new_coord is not None:
179 |                 self.search_mode = dataclasses.replace(self.search_mode, current_coord=new_coord)
180 |                 return True
181 |             else:
182 |                 # TODO: inconsistent alert window size on initial search
183 |                 await self.alert(f"Found no match: '{self.search_mode.pattern_str}'")
184 | 
185 |         return False
186 | 
187 |     async def action_search_prev(self) -> None:
188 |         if self.search_mode is not None:
189 |             new_coord = await self.content.search_next(
190 |                 self.search_mode.pattern_str,
191 |                 self.search_mode.current_coord,
192 |                 not self.search_mode.forward,
193 |             )
194 |             if new_coord is not None:
195 |                 self.search_mode = dataclasses.replace(self.search_mode, current_coord=new_coord)
196 | 
197 |     async def action_stop_search(self) -> None:
198 |         if self.search_mode is not None:
199 |             self.search_mode = None
200 |             await self.content.clear_search()
201 | 
202 |     async def action_open_help(self) -> None:
203 |         if self.help_window is None:
204 |             keymap_data = {
205 |                 k.replace("_", " ").title(): ",".join(v) for k, v in dataclasses.asdict(self.config.keymaps).items()
206 |             }
207 |             help_window = DictDisplay(config=self.config, id="help", title="Keymaps", data=keymap_data)
208 |             await self.mount(help_window)
209 | 
210 |     async def action_open_toc(self) -> None:
211 |         if self.toc_window is None:
212 |             toc_entries = list(self.ebook.get_toc())
213 |             if len(toc_entries) == 0:
214 |                 return await self.alert("No content navigations for this ebook.")
215 | 
216 |             initial_index = 0
217 |             toc_values = [e.value for e in toc_entries]
218 |             for s in self.content.get_navigables():
219 |                 if s.nav_point is not None and s.nav_point in toc_values:
220 |                     # if round(self.screen.scroll_y) >= s.virtual_region.y:
221 |                     if self.screen.scroll_offset.y >= s.virtual_region.y:
222 |                         initial_index = toc_values.index(s.nav_point)
223 |                     else:
224 |                         break
225 | 
226 |             toc = ToC(self.config, entries=toc_entries, initial_index=initial_index)
227 |             # NOTE: await to prevent broken layout
228 |             await self.mount(toc)
229 | 
230 |     async def action_cancel_search_or_quit(self) -> None:
231 |         if self.search_mode is not None:
232 |             self.screen.scroll_to(
233 |                 0, self.search_mode.saved_position * self.screen.max_scroll_y, duration=self.config.page_scroll_duration
234 |             )
235 |             await self.action_stop_search()
236 |         else:
237 |             await self.action_quit()
238 | 
239 |     async def action_link(self, link: str) -> None:
240 |         if is_url(link):
241 |             try:
242 |                 await launch_file(link)
243 |             except LaunchingFileError as e:
244 |                 await self.alert(str(e))
245 | 
246 |         elif link in [n.nav_point for n in self.content.get_navigables()]:
247 |             self.content.scroll_to_section(link)
248 | 
249 |         else:
250 |             await self.alert(f"No nav point found in document: {link}")
251 | 
252 |     async def on_search_submitted(self, message: SearchSubmitted) -> None:
253 |         self.search_mode = SearchMode(
254 |             pattern_str=message.value,
255 |             current_coord=Coordinate(-1 if message.forward else self.content.size.width, self.screen.scroll_offset.y),
256 |             forward=message.forward,
257 |             saved_position=self.reading_progress,
258 |         )
259 |         is_found = await self.action_search_next()
260 |         if not is_found:
261 |             self.search_mode = None
262 | 
263 |     async def on_follow_this(self, message: FollowThis) -> None:
264 |         self.content.scroll_to_section(message.nav_point)
265 |         # NOTE: remove after refresh so the event get handled
266 |         self.call_after_refresh(self.toc_window.remove)  # type: ignore
267 | 
268 |     async def on_open_this_image(self, message: OpenThisImage) -> None:
269 |         try:
270 |             filename, bytestr = self.ebook.get_img_bytestr(message.value)
271 |             tmpfilepath = self.ebook.get_tempdir() / filename
272 |             with open(tmpfilepath, "wb") as img_tmp:
273 |                 img_tmp.write(bytestr)
274 | 
275 |             await launch_file(tmpfilepath, preferred=self.config.preferred_image_viewer)
276 |         except LaunchingFileError as e:
277 |             await self.alert(f"Error opening an image: {e}")
278 | 
279 |     async def on_screenshot(self, _: Screenshot) -> None:
280 |         self.save_screenshot(f"baca_{datetime.now().isoformat()}.svg")
281 | 
282 |     def run(self, *args, **kwargs):
283 |         try:
284 |             return super().run(*args, **kwargs)
285 |         finally:
286 |             meta = self.ebook.get_meta()
287 |             self.ebook_state.last_read = datetime.now()  # type: ignore
288 |             self.ebook_state.title = meta.title  # type: ignore
289 |             self.ebook_state.author = meta.creator  # type: ignore
290 |             self.ebook_state.reading_progress = self.reading_progress  # type: ignore
291 |             self.ebook_state.save()
292 |             self.ebook.cleanup()
293 | 
294 |     @property
295 |     def toc_window(self) -> ToC | None:
296 |         try:
297 |             return self.query_one(ToC.__name__, ToC)
298 |         except NoMatches:
299 |             return None
300 | 
301 |     @property
302 |     def metadata_window(self) -> DictDisplay | None:
303 |         try:
304 |             return self.get_widget_by_id("metadata", DictDisplay)
305 |         except NoMatches:
306 |             return None
307 | 
308 |     @property
309 |     def help_window(self) -> DictDisplay | None:
310 |         try:
311 |             return self.get_widget_by_id("help", DictDisplay)
312 |         except NoMatches:
313 |             return None
314 | 
315 |     @property
316 |     def content(self) -> Content:
317 |         return self.query_one(Content.__name__, Content)
318 | 
319 |     # def _remove_nodes(self, widgets: list[Widget], parent: DOMNode) -> AwaitRemove:
320 |     #     await_remove = super()._remove_nodes(widgets, parent)
321 |     #     self.refresh(layout=True)
322 |     #     return await_remove
323 |     # def on_mount(self) -> None:
324 |     #     self.screen.can_focus = True
325 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_dict.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | from .compatibility_utils import PY2, PY3, utf8_str, bstr, bchr
  8 | 
  9 | if PY2:
 10 |     range = xrange
 11 |     array_format = b'B'
 12 | if PY3:
 13 |     unichr = chr
 14 |     array_format = "B"
 15 | 
 16 | import array
 17 | 
 18 | import struct
 19 | # note:  struct pack, unpack, unpack_from all require bytestring format
 20 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 21 | 
 22 | from .mobi_index import getVariableWidthValue, readTagSection, getTagMap
 23 | from .mobi_utils import toHex
 24 | 
 25 | DEBUG_DICT = False
 26 | 
 27 | class InflectionData(object):
 28 | 
 29 |     def __init__(self, infldatas):
 30 |         self.infldatas = infldatas
 31 |         self.starts = []
 32 |         self.counts = []
 33 |         for idata in self.infldatas:
 34 |             start, = struct.unpack_from(b'>L', idata, 0x14)
 35 |             count, = struct.unpack_from(b'>L', idata, 0x18)
 36 |             self.starts.append(start)
 37 |             self.counts.append(count)
 38 | 
 39 |     def lookup(self, lookupvalue):
 40 |         i = 0
 41 |         rvalue = lookupvalue
 42 |         while rvalue >= self.counts[i]:
 43 |             rvalue = rvalue - self.counts[i]
 44 |             i += 1
 45 |             if i == len(self.counts):
 46 |                 print("Error: Problem with multiple inflections data sections")
 47 |                 return lookupvalue, self.starts[0], self.counts[0], self.infldatas[0]
 48 |         return rvalue, self.starts[i], self.counts[i], self.infldatas[i]
 49 | 
 50 |     def offsets(self, value):
 51 |         rvalue, start, count, data = self.lookup(value)
 52 |         offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
 53 |         if rvalue + 1 < count:
 54 |             nextOffset, = struct.unpack_from(b'>H',data, start + 4 + (2 * (rvalue + 1)))
 55 |         else:
 56 |             nextOffset = None
 57 |         return offset, nextOffset, data
 58 | 
 59 | 
 60 | class dictSupport(object):
 61 | 
 62 |     def __init__(self, mh, sect):
 63 |         self.mh = mh
 64 |         self.header = mh.header
 65 |         self.sect = sect
 66 |         self.metaOrthIndex = mh.metaOrthIndex
 67 |         self.metaInflIndex = mh.metaInflIndex
 68 | 
 69 |     def parseHeader(self, data):
 70 |         "read INDX header"
 71 |         if not data[:4] == b'INDX':
 72 |             print("Warning: index section is not INDX")
 73 |             return False
 74 |         words = (
 75 |                 'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
 76 |                 'lng', 'total', 'ordt', 'ligt', 'nligt', 'nctoc'
 77 |         )
 78 |         num = len(words)
 79 |         values = struct.unpack(bstr('>%dL' % num), data[4:4*(num+1)])
 80 |         header = {}
 81 |         for n in range(num):
 82 |             header[words[n]] = values[n]
 83 | 
 84 |         ordt1 = None
 85 |         ordt2 = None
 86 | 
 87 |         otype, oentries, op1, op2, otagx  = struct.unpack_from(b'>LLLLL',data, 0xa4)
 88 |         header['otype'] = otype
 89 |         header['oentries'] = oentries
 90 | 
 91 |         if DEBUG_DICT:
 92 |             print("otype %d, oentries %d, op1 %d, op2 %d, otagx %d" % (otype, oentries, op1, op2, otagx))
 93 | 
 94 |         if header['code'] == 0xfdea or oentries > 0:
 95 |             # some dictionaries seem to be codepage 65002 (0xFDEA) which seems
 96 |             # to be some sort of strange EBCDIC utf-8 or 16 encoded strings
 97 |             # So we need to look for them and store them away to process leading text
 98 |             # ORDT1 has 1 byte long entries, ORDT2 has 2 byte long entries
 99 |             # we only ever seem to use the second but ...
100 |             #
101 |             # if otype = 0, ORDT table uses 16 bit values as offsets into the table
102 |             # if otype = 1, ORDT table uses 8 bit values as offsets inot the table
103 | 
104 |             assert(data[op1:op1+4] == b'ORDT')
105 |             assert(data[op2:op2+4] == b'ORDT')
106 |             ordt1 = struct.unpack_from(bstr('>%dB' % oentries), data, op1+4)
107 |             ordt2 = struct.unpack_from(bstr('>%dH' % oentries), data, op2+4)
108 | 
109 |         if DEBUG_DICT:
110 |             print("parsed INDX header:")
111 |             for key in header:
112 |                 print(key, "%x" % header[key],)
113 |             print("\n")
114 |         return header, ordt1, ordt2
115 | 
116 |     def getPositionMap(self):
117 |         sect = self.sect
118 | 
119 |         positionMap = {}
120 | 
121 |         metaOrthIndex = self.metaOrthIndex
122 |         metaInflIndex = self.metaInflIndex
123 | 
124 |         decodeInflection = True
125 |         if metaOrthIndex != 0xFFFFFFFF:
126 |             print("Info: Document contains orthographic index, handle as dictionary")
127 |             if metaInflIndex == 0xFFFFFFFF:
128 |                 decodeInflection = False
129 |             else:
130 |                 metaInflIndexData = sect.loadSection(metaInflIndex)
131 | 
132 |                 print("\nParsing metaInflIndexData")
133 |                 midxhdr, mhordt1, mhordt2 = self.parseHeader(metaInflIndexData)
134 | 
135 |                 metaIndexCount = midxhdr['count']
136 |                 idatas = []
137 |                 for j in range(metaIndexCount):
138 |                     idatas.append(sect.loadSection(metaInflIndex + 1 + j))
139 |                 dinfl = InflectionData(idatas)
140 | 
141 |                 inflNameData = sect.loadSection(metaInflIndex + 1 + metaIndexCount)
142 |                 tagSectionStart = midxhdr['len']
143 |                 inflectionControlByteCount, inflectionTagTable = readTagSection(tagSectionStart, metaInflIndexData)
144 |                 if DEBUG_DICT:
145 |                     print("inflectionTagTable: %s" % inflectionTagTable)
146 |                 if self.hasTag(inflectionTagTable, 0x07):
147 |                     print("Error: Dictionary uses obsolete inflection rule scheme which is not yet supported")
148 |                     decodeInflection = False
149 | 
150 |             data = sect.loadSection(metaOrthIndex)
151 | 
152 |             print("\nParsing metaOrthIndex")
153 |             idxhdr, hordt1, hordt2 = self.parseHeader(data)
154 | 
155 |             tagSectionStart = idxhdr['len']
156 |             controlByteCount, tagTable = readTagSection(tagSectionStart, data)
157 |             orthIndexCount = idxhdr['count']
158 |             print("orthIndexCount is", orthIndexCount)
159 |             if DEBUG_DICT:
160 |                 print("orthTagTable: %s" % tagTable)
161 |             if hordt2 is not None:
162 |                 print("orth entry uses ordt2 lookup table of type ", idxhdr['otype'])
163 |             hasEntryLength = self.hasTag(tagTable, 0x02)
164 |             if not hasEntryLength:
165 |                 print("Info: Index doesn't contain entry length tags")
166 | 
167 |             print("Read dictionary index data")
168 |             for i in range(metaOrthIndex + 1, metaOrthIndex + 1 + orthIndexCount):
169 |                 data = sect.loadSection(i)
170 |                 hdrinfo, ordt1, ordt2 = self.parseHeader(data)
171 |                 idxtPos = hdrinfo['start']
172 |                 entryCount = hdrinfo['count']
173 |                 idxPositions = []
174 |                 for j in range(entryCount):
175 |                     pos, = struct.unpack_from(b'>H', data, idxtPos + 4 + (2 * j))
176 |                     idxPositions.append(pos)
177 |                 # The last entry ends before the IDXT tag (but there might be zero fill bytes we need to ignore!)
178 |                 idxPositions.append(idxtPos)
179 |                 for j in range(entryCount):
180 |                     startPos = idxPositions[j]
181 |                     endPos = idxPositions[j+1]
182 |                     textLength = ord(data[startPos:startPos+1])
183 |                     text = data[startPos+1:startPos+1+textLength]
184 |                     if hordt2 is not None:
185 |                         utext = u""
186 |                         if idxhdr['otype'] == 0:
187 |                             pattern = b'>H'
188 |                             inc = 2
189 |                         else:
190 |                             pattern = b'>B'
191 |                             inc = 1
192 |                         pos = 0
193 |                         while pos < textLength:
194 |                             off, = struct.unpack_from(pattern, text, pos)
195 |                             if off < len(hordt2):
196 |                                 utext += unichr(hordt2[off])
197 |                             else:
198 |                                 utext += unichr(off)
199 |                             pos += inc
200 |                         text = utext.encode('utf-8')
201 | 
202 |                     tagMap = getTagMap(controlByteCount, tagTable, data, startPos+1+textLength, endPos)
203 |                     if 0x01 in tagMap:
204 |                         if decodeInflection and 0x2a in tagMap:
205 |                             inflectionGroups = self.getInflectionGroups(text, inflectionControlByteCount, inflectionTagTable,
206 |                                                                         dinfl, inflNameData, tagMap[0x2a])
207 |                         else:
208 |                             inflectionGroups = b''
209 |                         assert len(tagMap[0x01]) == 1
210 |                         entryStartPosition = tagMap[0x01][0]
211 |                         if hasEntryLength:
212 |                             # The idx:entry attribute "scriptable" must be present to create entry length tags.
213 |                             ml = b'<idx:entry scriptable="yes"><idx:orth value="' + text + b'">' + inflectionGroups + b'</idx:orth>'
214 |                             if entryStartPosition in positionMap:
215 |                                 positionMap[entryStartPosition] = positionMap[entryStartPosition] + ml
216 |                             else:
217 |                                 positionMap[entryStartPosition] = ml
218 |                             assert len(tagMap[0x02]) == 1
219 |                             entryEndPosition = entryStartPosition + tagMap[0x02][0]
220 |                             if entryEndPosition in positionMap:
221 |                                 positionMap[entryEndPosition] = b"</idx:entry>" + positionMap[entryEndPosition]
222 |                             else:
223 |                                 positionMap[entryEndPosition] = b"</idx:entry>"
224 | 
225 |                         else:
226 |                             indexTags = b'<idx:entry>\n<idx:orth value="' + text + b'">\n' + inflectionGroups + b'</idx:entry>\n'
227 |                             if entryStartPosition in positionMap:
228 |                                 positionMap[entryStartPosition] = positionMap[entryStartPosition] + indexTags
229 |                             else:
230 |                                 positionMap[entryStartPosition] = indexTags
231 |         return positionMap
232 | 
233 |     def hasTag(self, tagTable, tag):
234 |         '''
235 |         Test if tag table contains given tag.
236 | 
237 |         @param tagTable: The tag table.
238 |         @param tag: The tag to search.
239 |         @return: True if tag table contains given tag; False otherwise.
240 |         '''
241 |         for currentTag, _, _, _ in tagTable:
242 |             if currentTag == tag:
243 |                 return True
244 |         return False
245 | 
246 |     def getInflectionGroups(self, mainEntry, controlByteCount, tagTable, dinfl, inflectionNames, groupList):
247 |         '''
248 |         Create string which contains the inflection groups with inflection rules as mobipocket tags.
249 | 
250 |         @param mainEntry: The word to inflect.
251 |         @param controlByteCount: The number of control bytes.
252 |         @param tagTable: The tag table.
253 |         @param data: The Inflection data object to properly select the right inflection data section to use
254 |         @param inflectionNames: The inflection rule name data.
255 |         @param groupList: The list of inflection groups to process.
256 |         @return: String with inflection groups and rules or empty string if required tags are not available.
257 |         '''
258 |         result = b""
259 |         for value in groupList:
260 |             offset, nextOffset, data = dinfl.offsets(value)
261 | 
262 |             # First byte seems to be always 0x00 and must be skipped.
263 |             assert ord(data[offset:offset+1]) == 0x00
264 |             tagMap = getTagMap(controlByteCount, tagTable, data, offset + 1, nextOffset)
265 | 
266 |             # Make sure that the required tags are available.
267 |             if 0x05 not in tagMap:
268 |                 print("Error: Required tag 0x05 not found in tagMap")
269 |                 return ""
270 |             if 0x1a not in tagMap:
271 |                 print("Error: Required tag 0x1a not found in tagMap")
272 |                 return b''
273 | 
274 |             result += b'<idx:infl>'
275 | 
276 |             for i in range(len(tagMap[0x05])):
277 | 
278 |                 # Get name of inflection rule.
279 |                 value = tagMap[0x05][i]
280 |                 consumed, textLength = getVariableWidthValue(inflectionNames, value)
281 |                 inflectionName = inflectionNames[value+consumed:value+consumed+textLength]
282 | 
283 |                 # Get and apply inflection rule across possibly multiple inflection data sections
284 |                 value = tagMap[0x1a][i]
285 |                 rvalue, start, count, data = dinfl.lookup(value)
286 |                 offset, = struct.unpack_from(b'>H', data, start + 4 + (2 * rvalue))
287 |                 textLength = ord(data[offset:offset+1])
288 |                 inflection = self.applyInflectionRule(mainEntry, data, offset+1, offset+1+textLength)
289 |                 if inflection is not None:
290 |                     result += b'  <idx:iform name="' + inflectionName + b'" value="' + inflection + b'"/>'
291 | 
292 |             result += b'</idx:infl>'
293 |         return result
294 | 
295 |     def applyInflectionRule(self, mainEntry, inflectionRuleData, start, end):
296 |         '''
297 |         Apply inflection rule.
298 | 
299 |         @param mainEntry: The word to inflect.
300 |         @param inflectionRuleData: The inflection rules.
301 |         @param start: The start position of the inflection rule to use.
302 |         @param end: The end position of the inflection rule to use.
303 |         @return: The string with the inflected word or None if an error occurs.
304 |         '''
305 |         mode = -1
306 |         byteArray = array.array(array_format, mainEntry)
307 |         position = len(byteArray)
308 |         for charOffset in range(start, end):
309 |             char = inflectionRuleData[charOffset:charOffset+1]
310 |             abyte = ord(char)
311 |             if abyte >= 0x0a and abyte <= 0x13:
312 |                 # Move cursor backwards
313 |                 offset = abyte - 0x0a
314 |                 if mode not in [0x02, 0x03]:
315 |                     mode = 0x02
316 |                     position = len(byteArray)
317 |                 position -= offset
318 |             elif abyte > 0x13:
319 |                 if mode == -1:
320 |                     print("Error: Unexpected first byte %i of inflection rule" % abyte)
321 |                     return None
322 |                 elif position == -1:
323 |                     print("Error: Unexpected first byte %i of inflection rule" % abyte)
324 |                     return None
325 |                 else:
326 |                     if mode == 0x01:
327 |                         # Insert at word start
328 |                         byteArray.insert(position, abyte)
329 |                         position += 1
330 |                     elif mode == 0x02:
331 |                         # Insert at word end
332 |                         byteArray.insert(position, abyte)
333 |                     elif mode == 0x03:
334 |                         # Delete at word end
335 |                         position -= 1
336 |                         deleted = byteArray.pop(position)
337 |                         if bchr(deleted) != char:
338 |                             if DEBUG_DICT:
339 |                                 print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
340 |                             print("Error: Delete operation of inflection rule failed")
341 |                             return None
342 |                     elif mode == 0x04:
343 |                         # Delete at word start
344 |                         deleted = byteArray.pop(position)
345 |                         if bchr(deleted) != char:
346 |                             if DEBUG_DICT:
347 |                                 print("0x03: %s %s %s %s" % (mainEntry, toHex(inflectionRuleData[start:end]), char, bchr(deleted)))
348 |                             print("Error: Delete operation of inflection rule failed")
349 |                             return None
350 |                     else:
351 |                         print("Error: Inflection rule mode %x is not implemented" % mode)
352 |                         return None
353 |             elif abyte == 0x01:
354 |                 # Insert at word start
355 |                 if mode not in [0x01, 0x04]:
356 |                     position = 0
357 |                 mode = abyte
358 |             elif abyte == 0x02:
359 |                 # Insert at word end
360 |                 if mode not in [0x02, 0x03]:
361 |                     position = len(byteArray)
362 |                 mode = abyte
363 |             elif abyte == 0x03:
364 |                 # Delete at word end
365 |                 if mode not in [0x02, 0x03]:
366 |                     position = len(byteArray)
367 |                 mode = abyte
368 |             elif abyte == 0x04:
369 |                 # Delete at word start
370 |                 if mode not in [0x01, 0x04]:
371 |                     position = 0
372 |                 # Delete at word start
373 |                 mode = abyte
374 |             else:
375 |                 print("Error: Inflection rule mode %x is not implemented" % abyte)
376 |                 return None
377 |         return utf8_str(byteArray.tostring())
378 | 


--------------------------------------------------------------------------------
/src/baca/tools/KindleUnpack/mobi_split.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
  4 | 
  5 | from __future__ import unicode_literals, division, absolute_import, print_function
  6 | 
  7 | import struct
  8 | # note:  struct pack, unpack, unpack_from all require bytestring format
  9 | # data all the way up to at least python 2.7.5, python 3 okay with bytestring
 10 | 
 11 | from .unipath import pathof
 12 | 
 13 | 
 14 | # important  pdb header offsets
 15 | unique_id_seed = 68
 16 | number_of_pdb_records = 76
 17 | 
 18 | # important palmdoc header offsets
 19 | book_length = 4
 20 | book_record_count = 8
 21 | first_pdb_record = 78
 22 | 
 23 | # important rec0 offsets
 24 | length_of_book = 4
 25 | mobi_header_base = 16
 26 | mobi_header_length = 20
 27 | mobi_type = 24
 28 | mobi_version = 36
 29 | first_non_text = 80
 30 | title_offset = 84
 31 | first_resc_record = 108
 32 | first_content_index = 192
 33 | last_content_index = 194
 34 | kf8_fdst_index = 192  # for KF8 mobi headers
 35 | fcis_index = 200
 36 | flis_index = 208
 37 | srcs_index = 224
 38 | srcs_count = 228
 39 | primary_index = 244
 40 | datp_index = 256
 41 | huffoff = 112
 42 | hufftbloff = 120
 43 | 
 44 | def getint(datain,ofs,sz=b'L'):
 45 |     i, = struct.unpack_from(b'>'+sz,datain,ofs)
 46 |     return i
 47 | 
 48 | def writeint(datain,ofs,n,len=b'L'):
 49 |     if len==b'L':
 50 |         return datain[:ofs]+struct.pack(b'>L',n)+datain[ofs+4:]
 51 |     else:
 52 |         return datain[:ofs]+struct.pack(b'>H',n)+datain[ofs+2:]
 53 | 
 54 | def getsecaddr(datain,secno):
 55 |     nsec = getint(datain,number_of_pdb_records,b'H')
 56 |     assert secno>=0 & secno<nsec,'secno %d out of range (nsec=%d)'%(secno,nsec)
 57 |     secstart = getint(datain,first_pdb_record+secno*8)
 58 |     if secno == nsec-1:
 59 |         secend = len(datain)
 60 |     else:
 61 |         secend = getint(datain,first_pdb_record+(secno+1)*8)
 62 |     return secstart,secend
 63 | 
 64 | def readsection(datain,secno):
 65 |     secstart, secend = getsecaddr(datain,secno)
 66 |     return datain[secstart:secend]
 67 | 
 68 | def writesection(datain,secno,secdata):  # overwrite, accounting for different length
 69 |     # dataout = deletesectionrange(datain,secno, secno)
 70 |     # return insertsection(dataout, secno, secdata)
 71 |     datalst = []
 72 |     nsec = getint(datain,number_of_pdb_records,b'H')
 73 |     zerosecstart,zerosecend = getsecaddr(datain,0)
 74 |     secstart,secend = getsecaddr(datain,secno)
 75 |     dif = len(secdata) - (secend - secstart)
 76 |     datalst.append(datain[:unique_id_seed])
 77 |     datalst.append(struct.pack(b'>L',2*nsec+1))
 78 |     datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
 79 |     datalst.append(struct.pack(b'>H',nsec))
 80 |     newstart = zerosecstart
 81 |     for i in range(0,secno):
 82 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
 83 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
 84 |     datalst.append(struct.pack(b'>L', secstart) + struct.pack(b'>L', (2*secno)))
 85 |     for i in range(secno+1,nsec):
 86 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
 87 |         ofs = ofs + dif
 88 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
 89 |     lpad = newstart - (first_pdb_record + 8*nsec)
 90 |     if lpad > 0:
 91 |         datalst.append(b'\0' * lpad)
 92 |     datalst.append(datain[zerosecstart:secstart])
 93 |     datalst.append(secdata)
 94 |     datalst.append(datain[secend:])
 95 |     dataout = b''.join(datalst)
 96 |     return dataout
 97 | 
 98 | def nullsection(datain,secno):  # make it zero-length without deleting it
 99 |     datalst = []
100 |     nsec = getint(datain,number_of_pdb_records,b'H')
101 |     secstart, secend = getsecaddr(datain,secno)
102 |     zerosecstart, zerosecend = getsecaddr(datain, 0)
103 |     dif =  secend-secstart
104 |     datalst.append(datain[:first_pdb_record])
105 |     for i in range(0,secno+1):
106 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
107 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
108 |     for i in range(secno+1, nsec):
109 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
110 |         ofs = ofs - dif
111 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
112 |     lpad = zerosecstart - (first_pdb_record + 8*nsec)
113 |     if lpad > 0:
114 |         datalst.append(b'\0' * lpad)
115 |     datalst.append(datain[zerosecstart: secstart])
116 |     datalst.append(datain[secend:])
117 |     dataout = b''.join(datalst)
118 |     return dataout
119 | 
120 | def deletesectionrange(datain,firstsec,lastsec):  # delete a range of sections
121 |     datalst = []
122 |     firstsecstart,firstsecend = getsecaddr(datain,firstsec)
123 |     lastsecstart,lastsecend = getsecaddr(datain,lastsec)
124 |     zerosecstart, zerosecend = getsecaddr(datain, 0)
125 |     dif = lastsecend - firstsecstart + 8*(lastsec-firstsec+1)
126 |     nsec = getint(datain,number_of_pdb_records,b'H')
127 |     datalst.append(datain[:unique_id_seed])
128 |     datalst.append(struct.pack(b'>L',2*(nsec-(lastsec-firstsec+1))+1))
129 |     datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
130 |     datalst.append(struct.pack(b'>H',nsec-(lastsec-firstsec+1)))
131 |     newstart = zerosecstart - 8*(lastsec-firstsec+1)
132 |     for i in range(0,firstsec):
133 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
134 |         ofs = ofs-8*(lastsec-firstsec+1)
135 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
136 |     for i in range(lastsec+1,nsec):
137 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
138 |         ofs = ofs - dif
139 |         flgval = 2*(i-(lastsec-firstsec+1))
140 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
141 |     lpad = newstart - (first_pdb_record + 8*(nsec - (lastsec - firstsec + 1)))
142 |     if lpad > 0:
143 |         datalst.append(b'\0' * lpad)
144 |     datalst.append(datain[zerosecstart:firstsecstart])
145 |     datalst.append(datain[lastsecend:])
146 |     dataout = b''.join(datalst)
147 |     return dataout
148 | 
149 | def insertsection(datain,secno,secdata):  # insert a new section
150 |     datalst = []
151 |     nsec = getint(datain,number_of_pdb_records,b'H')
152 |     # print("inserting secno" , secno,  "into" ,nsec, "sections")
153 |     secstart,secend = getsecaddr(datain,secno)
154 |     zerosecstart,zerosecend = getsecaddr(datain,0)
155 |     dif = len(secdata)
156 |     datalst.append(datain[:unique_id_seed])
157 |     datalst.append(struct.pack(b'>L',2*(nsec+1)+1))
158 |     datalst.append(datain[unique_id_seed+4:number_of_pdb_records])
159 |     datalst.append(struct.pack(b'>H',nsec+1))
160 |     newstart = zerosecstart + 8
161 |     for i in range(0,secno):
162 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
163 |         ofs += 8
164 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L', flgval))
165 |     datalst.append(struct.pack(b'>L', secstart + 8) + struct.pack(b'>L', (2*secno)))
166 |     for i in range(secno,nsec):
167 |         ofs, flgval = struct.unpack_from(b'>2L',datain,first_pdb_record+i*8)
168 |         ofs = ofs + dif + 8
169 |         flgval = 2*(i+1)
170 |         datalst.append(struct.pack(b'>L',ofs) + struct.pack(b'>L',flgval))
171 |     lpad = newstart - (first_pdb_record + 8*(nsec + 1))
172 |     if lpad > 0:
173 |         datalst.append(b'\0' * lpad)
174 |     datalst.append(datain[zerosecstart:secstart])
175 |     datalst.append(secdata)
176 |     datalst.append(datain[secstart:])
177 |     dataout = b''.join(datalst)
178 |     return dataout
179 | 
180 | 
181 | def insertsectionrange(sectionsource,firstsec,lastsec,sectiontarget,targetsec):  # insert a range of sections
182 |     # print("inserting secno" , firstsec,  "to", lastsec, "into" ,targetsec, "sections")
183 |     # dataout = sectiontarget
184 |     # for idx in range(lastsec,firstsec-1,-1):
185 |     #    dataout = insertsection(dataout,targetsec,readsection(sectionsource,idx))
186 |     # return dataout
187 |     datalst = []
188 |     nsec = getint(sectiontarget,number_of_pdb_records,b'H')
189 |     zerosecstart, zerosecend = getsecaddr(sectiontarget,0)
190 |     insstart, nul = getsecaddr(sectiontarget,targetsec)
191 |     nins = lastsec - firstsec + 1
192 |     srcstart, nul = getsecaddr(sectionsource,firstsec)
193 |     nul, srcend = getsecaddr(sectionsource,lastsec)
194 |     newstart = zerosecstart + 8*nins
195 | 
196 |     datalst.append(sectiontarget[:unique_id_seed])
197 |     datalst.append(struct.pack(b'>L',2*(nsec+nins)+1))
198 |     datalst.append(sectiontarget[unique_id_seed+4:number_of_pdb_records])
199 |     datalst.append(struct.pack(b'>H',nsec+nins))
200 |     for i in range(0,targetsec):
201 |         ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
202 |         ofsnew = ofs + 8*nins
203 |         flgvalnew = flgval
204 |         datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
205 |         # print(ofsnew, flgvalnew, ofs, flgval)
206 |     srcstart0, nul = getsecaddr(sectionsource,firstsec)
207 |     for i in range(nins):
208 |         isrcstart, nul = getsecaddr(sectionsource,firstsec+i)
209 |         ofsnew = insstart + (isrcstart-srcstart0) + 8*nins
210 |         flgvalnew = 2*(targetsec+i)
211 |         datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L', flgvalnew))
212 |         # print(ofsnew, flgvalnew)
213 |     dif = srcend - srcstart
214 |     for i in range(targetsec,nsec):
215 |         ofs, flgval = struct.unpack_from(b'>2L',sectiontarget,first_pdb_record+i*8)
216 |         ofsnew = ofs + dif + 8*nins
217 |         flgvalnew = 2*(i+nins)
218 |         datalst.append(struct.pack(b'>L',ofsnew) + struct.pack(b'>L',flgvalnew))
219 |         # print(ofsnew, flgvalnew, ofs, flgval)
220 |     lpad = newstart - (first_pdb_record + 8*(nsec + nins))
221 |     if lpad > 0:
222 |         datalst.append(b'\0' * lpad)
223 |     datalst.append(sectiontarget[zerosecstart:insstart])
224 |     datalst.append(sectionsource[srcstart:srcend])
225 |     datalst.append(sectiontarget[insstart:])
226 |     dataout = b''.join(datalst)
227 |     return dataout
228 | 
229 | def get_exth_params(rec0):
230 |     ebase = mobi_header_base + getint(rec0,mobi_header_length)
231 |     elen = getint(rec0,ebase+4)
232 |     enum = getint(rec0,ebase+8)
233 |     return ebase,elen,enum
234 | 
235 | def add_exth(rec0,exth_num,exth_bytes):
236 |     ebase,elen,enum = get_exth_params(rec0)
237 |     newrecsize = 8+len(exth_bytes)
238 |     newrec0 = rec0[0:ebase+4]+struct.pack(b'>L',elen+newrecsize)+struct.pack(b'>L',enum+1)+\
239 |               struct.pack(b'>L',exth_num)+struct.pack(b'>L',newrecsize)+exth_bytes+rec0[ebase+12:]
240 |     newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+newrecsize)
241 |     return newrec0
242 | 
243 | def read_exth(rec0,exth_num):
244 |     exth_values = []
245 |     ebase,elen,enum = get_exth_params(rec0)
246 |     ebase = ebase+12
247 |     while enum>0:
248 |         exth_id = getint(rec0,ebase)
249 |         if exth_id == exth_num:
250 |             # We might have multiple exths, so build a list.
251 |             exth_values.append(rec0[ebase+8:ebase+getint(rec0,ebase+4)])
252 |         enum = enum-1
253 |         ebase = ebase+getint(rec0,ebase+4)
254 |     return exth_values
255 | 
256 | def write_exth(rec0,exth_num,exth_bytes):
257 |     ebase,elen,enum = get_exth_params(rec0)
258 |     ebase_idx = ebase+12
259 |     enum_idx = enum
260 |     while enum_idx>0:
261 |         exth_id = getint(rec0,ebase_idx)
262 |         if exth_id == exth_num:
263 |             dif = len(exth_bytes)+8-getint(rec0,ebase_idx+4)
264 |             newrec0 = rec0
265 |             if dif != 0:
266 |                 newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)+dif)
267 |             return newrec0[:ebase+4]+struct.pack(b'>L',elen+len(exth_bytes)+8-getint(rec0,ebase_idx+4))+\
268 |                                               struct.pack(b'>L',enum)+rec0[ebase+12:ebase_idx+4]+\
269 |                                               struct.pack(b'>L',len(exth_bytes)+8)+exth_bytes+\
270 |                                               rec0[ebase_idx+getint(rec0,ebase_idx+4):]
271 |         enum_idx = enum_idx-1
272 |         ebase_idx = ebase_idx+getint(rec0,ebase_idx+4)
273 |     return rec0
274 | 
275 | def del_exth(rec0,exth_num):
276 |     ebase,elen,enum = get_exth_params(rec0)
277 |     ebase_idx = ebase+12
278 |     enum_idx = 0
279 |     while enum_idx < enum:
280 |         exth_id = getint(rec0,ebase_idx)
281 |         exth_size = getint(rec0,ebase_idx+4)
282 |         if exth_id == exth_num:
283 |             newrec0 = rec0
284 |             newrec0 = writeint(newrec0,title_offset,getint(newrec0,title_offset)-exth_size)
285 |             newrec0 = newrec0[:ebase_idx]+newrec0[ebase_idx+exth_size:]
286 |             newrec0 = newrec0[0:ebase+4]+struct.pack(b'>L',elen-exth_size)+struct.pack(b'>L',enum-1)+newrec0[ebase+12:]
287 |             return newrec0
288 |         enum_idx += 1
289 |         ebase_idx = ebase_idx+exth_size
290 |     return rec0
291 | 
292 | 
293 | class mobi_split:
294 | 
295 |     def __init__(self, infile):
296 |         datain = b''
297 |         with open(pathof(infile), 'rb') as f:
298 |             datain = f.read()
299 |         datain_rec0 = readsection(datain,0)
300 |         ver = getint(datain_rec0,mobi_version)
301 |         self.combo = (ver!=8)
302 |         if not self.combo:
303 |             return
304 |         exth121 = read_exth(datain_rec0,121)
305 |         if len(exth121) == 0:
306 |             self.combo = False
307 |             return
308 |         else:
309 |             # only pay attention to first exth121
310 |             # (there should only be one)
311 |             datain_kf8, = struct.unpack_from(b'>L',exth121[0],0)
312 |             if datain_kf8 == 0xffffffff:
313 |                 self.combo = False
314 |                 return
315 |         datain_kfrec0 =readsection(datain,datain_kf8)
316 | 
317 |         # create the standalone mobi7
318 |         num_sec = getint(datain,number_of_pdb_records,b'H')
319 |         # remove BOUNDARY up to but not including ELF record
320 |         self.result_file7 = deletesectionrange(datain,datain_kf8-1,num_sec-2)
321 |         # check if there are SRCS records and delete them
322 |         srcs = getint(datain_rec0,srcs_index)
323 |         num_srcs = getint(datain_rec0,srcs_count)
324 |         if srcs != 0xffffffff and num_srcs > 0:
325 |             self.result_file7 = deletesectionrange(self.result_file7,srcs,srcs+num_srcs-1)
326 |             datain_rec0 = writeint(datain_rec0,srcs_index,0xffffffff)
327 |             datain_rec0 = writeint(datain_rec0,srcs_count,0)
328 |         # reset the EXTH 121 KF8 Boundary meta data to 0xffffffff
329 |         datain_rec0 = write_exth(datain_rec0,121, struct.pack(b'>L', 0xffffffff))
330 |         # datain_rec0 = del_exth(datain_rec0,121)
331 |         # datain_rec0 = del_exth(datain_rec0,534)
332 |         # don't remove the EXTH 125 KF8 Count of Resources, seems to be present in mobi6 files as well
333 |         # set the EXTH 129 KF8 Masthead / Cover Image string to the null string
334 |         datain_rec0 = write_exth(datain_rec0,129, b'')
335 |         # don't remove the EXTH 131 KF8 Unidentified Count, seems to be present in mobi6 files as well
336 | 
337 |         # need to reset flags stored in 0x80-0x83
338 |         # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
339 |         # Bit Flags
340 |         # 0x1000 = Bit 12 indicates if embedded fonts are used or not
341 |         # 0x0800 = means this Header points to *shared* images/resource/fonts ??
342 |         # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
343 |         # 0x0040 = exth exists
344 |         # 0x0010 = Not sure but this is always set so far
345 |         fval, = struct.unpack_from(b'>L',datain_rec0, 0x80)
346 |         # need to remove flag 0x0800 for KindlePreviewer 2.8 and unset Bit 12 for embedded fonts
347 |         fval = fval & 0x07FF
348 |         datain_rec0 = datain_rec0[:0x80] + struct.pack(b'>L',fval) + datain_rec0[0x84:]
349 | 
350 |         self.result_file7 = writesection(self.result_file7,0,datain_rec0)
351 | 
352 |         # no need to replace kf8 style fcis with mobi 7 one
353 |         # fcis_secnum, = struct.unpack_from(b'>L',datain_rec0, 0xc8)
354 |         # if fcis_secnum != 0xffffffff:
355 |         #     fcis_info = readsection(datain, fcis_secnum)
356 |         #     text_len,  = struct.unpack_from(b'>L', fcis_info, 0x14)
357 |         #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
358 |         #     new_fcis += struct.pack(b'>L',text_len)
359 |         #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
360 |         #     self.result_file7 = writesection(self.result_file7, fcis_secnum, new_fcis)
361 | 
362 |         firstimage = getint(datain_rec0,first_resc_record)
363 |         lastimage = getint(datain_rec0,last_content_index,b'H')
364 |         # print("Old First Image, last Image", firstimage,lastimage)
365 |         if lastimage == 0xffff:
366 |             # find the lowest of the next sections and copy up to that.
367 |             ofs_list = [(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
368 |             for ofs,sz in ofs_list:
369 |                 n = getint(datain_rec0,ofs,sz)
370 |                 # print("n",n)
371 |                 if n > 0 and n < lastimage:
372 |                     lastimage = n-1
373 |         print("First Image, last Image", firstimage,lastimage)
374 | 
375 |         # Try to null out FONT and RES, but leave the (empty) PDB record so image refs remain valid
376 |         for i in range(firstimage,lastimage):
377 |             imgsec = readsection(self.result_file7,i)
378 |             if imgsec[0:4] in [b'RESC',b'FONT']:
379 |                 self.result_file7 = nullsection(self.result_file7,i)
380 | 
381 |         # mobi7 finished
382 | 
383 |         # create standalone mobi8
384 |         self.result_file8 = deletesectionrange(datain,0,datain_kf8-1)
385 |         target = getint(datain_kfrec0,first_resc_record)
386 |         self.result_file8 = insertsectionrange(datain,firstimage,lastimage,self.result_file8,target)
387 |         datain_kfrec0 =readsection(self.result_file8,0)
388 | 
389 |         # Only keep the correct EXTH 116 StartOffset, KG 2.5 carries over the one from the mobi7 part, which then points at garbage in the mobi8 part, and confuses FW 3.4
390 |         kf8starts = read_exth(datain_kfrec0,116)
391 |         # If we have multiple StartOffset, keep only the last one
392 |         kf8start_count = len(kf8starts)
393 |         while kf8start_count > 1:
394 |             kf8start_count -= 1
395 |             datain_kfrec0 = del_exth(datain_kfrec0,116)
396 | 
397 |         # update the EXTH 125 KF8 Count of Images/Fonts/Resources
398 |         datain_kfrec0 = write_exth(datain_kfrec0,125,struct.pack(b'>L',lastimage-firstimage+1))
399 | 
400 |         # need to reset flags stored in 0x80-0x83
401 |         # old mobi with exth: 0x50, mobi7 part with exth: 0x1850, mobi8 part with exth: 0x1050
402 |         # standalone mobi8 with exth: 0x0050
403 |         # Bit Flags
404 |         # 0x1000 = Bit 12 indicates if embedded fonts are used or not
405 |         # 0x0800 = means this Header points to *shared* images/resource/fonts ??
406 |         # 0x0080 = unknown new flag, why is this now being set by Kindlegen 2.8?
407 |         # 0x0040 = exth exists
408 |         # 0x0010 = Not sure but this is always set so far
409 |         fval, = struct.unpack_from('>L',datain_kfrec0, 0x80)
410 |         fval = fval & 0x1FFF
411 |         fval |= 0x0800
412 |         datain_kfrec0 = datain_kfrec0[:0x80] + struct.pack(b'>L',fval) + datain_kfrec0[0x84:]
413 | 
414 |         # properly update other index pointers that have been shifted by the insertion of images
415 |         ofs_list = [(kf8_fdst_index,b'L'),(fcis_index,b'L'),(flis_index,b'L'),(datp_index,b'L'),(hufftbloff, b'L')]
416 |         for ofs,sz in ofs_list:
417 |             n = getint(datain_kfrec0,ofs,sz)
418 |             if n != 0xffffffff:
419 |                 datain_kfrec0 = writeint(datain_kfrec0,ofs,n+lastimage-firstimage+1,sz)
420 |         self.result_file8 = writesection(self.result_file8,0,datain_kfrec0)
421 | 
422 |         # no need to replace kf8 style fcis with mobi 7 one
423 |         # fcis_secnum, = struct.unpack_from(b'>L',datain_kfrec0, 0xc8)
424 |         # if fcis_secnum != 0xffffffff:
425 |         #     fcis_info = readsection(self.result_file8, fcis_secnum)
426 |         #     text_len,  = struct.unpack_from(b'>L', fcis_info, 0x14)
427 |         #     new_fcis = 'FCIS\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00'
428 |         #     new_fcis += struct.pack(b'>L',text_len)
429 |         #     new_fcis += '\x00\x00\x00\x00\x00\x00\x00\x20\x00\x00\x00\x08\x00\x01\x00\x01\x00\x00\x00\x00'
430 |         #     self.result_file8 = writesection(self.result_file8, fcis_secnum, new_fcis)
431 | 
432 |         # mobi8 finished
433 | 
434 |     def getResult8(self):
435 |         return self.result_file8
436 | 
437 |     def getResult7(self):
438 |         return self.result_file7
439 | 


--------------------------------------------------------------------------------