├── tests ├── __init__.py ├── divs │ ├── __init__.py │ ├── test_div_elements.py │ └── test_parse_divs.py ├── docs │ ├── __init__.py │ ├── test_token_mapping.py │ ├── test_wordtoks.py │ ├── test_token_diffs.py │ └── test_text_doc.py ├── html │ ├── __init__.py │ └── test_timestamps.py ├── util │ ├── __init__.py │ └── test_lemmatize.py └── transforms │ ├── __init__.py │ ├── test_sliding_windows.py │ ├── test_sliding_transforms.py │ └── test_diff_filters.py ├── src └── chopdiff │ ├── py.typed │ ├── __init__.py │ ├── util │ ├── __init__.py │ ├── tiktoken_utils.py │ └── lemmatize.py │ ├── html │ ├── html_plaintext.py │ ├── extractor.py │ ├── __init__.py │ ├── timestamps.py │ ├── html_in_md.py │ └── html_tags.py │ ├── divs │ ├── __init__.py │ ├── chunk_utils.py │ ├── div_elements.py │ ├── parse_divs.py │ └── text_node.py │ ├── docs │ ├── sizes.py │ ├── __init__.py │ ├── search_tokens.py │ ├── token_mapping.py │ ├── wordtoks.py │ └── token_diffs.py │ └── transforms │ ├── __init__.py │ ├── sliding_windows.py │ ├── window_settings.py │ ├── diff_filters.py │ └── sliding_transforms.py ├── .copier-answers.yml ├── installation.md ├── LICENSE ├── Makefile ├── .github └── workflows │ ├── publish.yml │ └── ci.yml ├── examples ├── gettysberg.txt ├── insert_para_breaks.py └── backfill_timestamps.py ├── devtools └── lint.py ├── publishing.md ├── development.md ├── .cursor └── rules │ ├── general.mdc │ └── python.mdc ├── .gitignore └── pyproject.toml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/divs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/docs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/html/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/util/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal 4 | from chopdiff.util.tiktoken_utils import tiktoken_len 5 | 6 | __all__ = [ 7 | "lemmatize", 8 | "lemmatized_equal", 9 | "tiktoken_len", 10 | ] 11 | -------------------------------------------------------------------------------- /src/chopdiff/util/tiktoken_utils.py: -------------------------------------------------------------------------------- 1 | 
import tiktoken
 2 | 
 3 | 
 4 | def tiktoken_len(string: str, encoding_name: str = "o200k_base") -> int:
 5 |     """
 6 |     Length of text in tiktokens.
 7 |     """
 8 |     encoding = tiktoken.get_encoding(encoding_name)
 9 |     num_tokens = len(encoding.encode(string))
10 |     return num_tokens
11 | 
--------------------------------------------------------------------------------
/.copier-answers.yml:
--------------------------------------------------------------------------------
 1 | # Changes here will be overwritten by Copier. Do not edit manually.
 2 | _commit: v0.2.17
 3 | _src_path: gh:jlevy/simple-modern-uv
 4 | package_author_email: joshua@cal.berkeley.edu
 5 | package_author_name: Joshua Levy
 6 | package_description: Simple tools for parsing/diffing/processing text to support LLM
 7 |   applications
 8 | package_github_org: jlevy
 9 | package_module: chopdiff
10 | package_name: chopdiff
11 | 
--------------------------------------------------------------------------------
/tests/util/test_lemmatize.py:
--------------------------------------------------------------------------------
 1 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal
 2 | 
 3 | 
 4 | def test_lemmatize():
 5 |     assert lemmatize("running") == "run"
 6 |     assert lemmatize("better") == "good"
 7 |     assert lemmatize("The cats are running") == "the cat be run"
 8 |     assert lemmatize("Hello, world!") == "hello , world !"
 9 |     assert lemmatize("I have 3 cats.") == "I have 3 cat ."
10 |     assert lemmatized_equal("The cat runs", "The cats running")
11 |     assert not lemmatized_equal("The cat runs", "The dog runs")
12 |     assert lemmatized_equal("The CAT runs", "the cats RUN")
13 |     assert not lemmatized_equal("The CAT runs", "the cats RAN", case_sensitive=True)
14 | 
--------------------------------------------------------------------------------
/src/chopdiff/html/html_plaintext.py:
--------------------------------------------------------------------------------
 1 | import html
 2 | import re
 3 | 
 4 | 
 5 | def plaintext_to_html(text: str):
 6 |     """
 7 |     Convert plaintext to HTML, also handling newlines and whitespace.
 8 |     """
 9 |     return (
10 |         html.escape(text)
11 |         .replace("\n", "<br>")
12 |         .replace("\t", "&nbsp;" * 4)
13 |         .replace("  ", "&nbsp;&nbsp;")
14 |     )
15 | 
16 | 
17 | def html_to_plaintext(text: str):
18 |     """
19 |     Convert HTML to plaintext, stripping tags and converting entities.
20 |     """
21 |     text = re.sub(r"<br>", "\n", text, flags=re.IGNORECASE)
22 |     text = re.sub(r"</p><p>
", "\n\n", text, flags=re.IGNORECASE) 23 | unescaped_text = html.unescape(text) 24 | clean_text = re.sub("<[^<]+?>", "", unescaped_text) 25 | return clean_text 26 | -------------------------------------------------------------------------------- /installation.md: -------------------------------------------------------------------------------- 1 | ## Installing uv and Python 2 | 3 | This project is set up to use [**uv**](https://docs.astral.sh/uv/), the new package 4 | manager for Python. `uv` replaces traditional use of `pyenv`, `pipx`, `poetry`, `pip`, 5 | etc. This is a quick cheat sheet on that: 6 | 7 | On macOS or Linux, if you don't have `uv` installed, a quick way to install it: 8 | 9 | ```shell 10 | curl -LsSf https://astral.sh/uv/install.sh | sh 11 | ``` 12 | 13 | For macOS, you prefer [brew](https://brew.sh/) you can install or upgrade uv with: 14 | 15 | ```shell 16 | brew update 17 | brew install uv 18 | ``` 19 | 20 | See [uv's docs](https://docs.astral.sh/uv/getting-started/installation/) for more 21 | installation methods and platforms. 22 | 23 | Now you can use uv to install a current Python environment: 24 | 25 | ```shell 26 | uv python install 3.13 # Or pick another version. 27 | ``` 28 | -------------------------------------------------------------------------------- /src/chopdiff/divs/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_generator, chunk_paras 4 | from chopdiff.divs.div_elements import ( 5 | CHUNK, 6 | GROUP, 7 | ORIGINAL, 8 | RESULT, 9 | chunk_text_as_divs, 10 | div, 11 | div_get_original, 12 | div_insert_wrapped, 13 | ) 14 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class, parse_divs_single 15 | from chopdiff.divs.text_node import TextNode 16 | 17 | __all__ = [ 18 | "chunk_children", 19 | "chunk_generator", 20 | "chunk_paras", 21 | "CHUNK", 22 | "GROUP", 23 | "ORIGINAL", 24 | "RESULT", 25 | "chunk_text_as_divs", 26 | "div", 27 | "div_get_original", 28 | "div_insert_wrapped", 29 | "parse_divs", 30 | "parse_divs_by_class", 31 | "parse_divs_single", 32 | "TextNode", 33 | ] 34 | -------------------------------------------------------------------------------- /src/chopdiff/html/extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections.abc import Iterable 3 | from typing import Generic, TypeAlias, TypeVar 4 | 5 | T = TypeVar("T") 6 | 7 | Match: TypeAlias = tuple[T, int, int] 8 | """Match, index, and offset of content found by an extractor.""" 9 | 10 | 11 | class ContentNotFound(ValueError): 12 | """ 13 | Exception raised when content is not found by an extractor. 14 | """ 15 | 16 | 17 | class Extractor(ABC, Generic[T]): 18 | """ 19 | Abstract base class for extractors that extract information from a document at a 20 | given location. We use a class and not a pure function since we may need to 21 | preprocess the document. 
22 | """ 23 | 24 | @abstractmethod 25 | def extract_all(self) -> Iterable[Match[T]]: 26 | pass 27 | 28 | @abstractmethod 29 | def extract_preceding(self, wordtok_offset: int) -> Match[T]: 30 | pass 31 | -------------------------------------------------------------------------------- /src/chopdiff/util/lemmatize.py: -------------------------------------------------------------------------------- 1 | def lemmatize(text: str, lang: str = "en") -> str: 2 | """ 3 | Returns a string of lemmatized tokens using simplemma. 4 | """ 5 | try: 6 | import simplemma 7 | except ImportError: 8 | raise ImportError( 9 | "simplemma is an optional dependency of chopdiff. Add it to use lemmatization." 10 | ) 11 | 12 | tokens = simplemma.simple_tokenizer(text) 13 | lemmatized_tokens = [simplemma.lemmatize(token, lang=lang) for token in tokens] 14 | return " ".join(lemmatized_tokens) 15 | 16 | 17 | def lemmatized_equal(text1: str, text2: str, case_sensitive: bool = False) -> bool: 18 | """ 19 | Compare two texts to see if they are the same except for lemmatization. 20 | Ignores whitespace. Does not ignore punctuation. 21 | """ 22 | if not case_sensitive: 23 | text1 = text1.lower() 24 | text2 = text2.lower() 25 | return lemmatize(text1) == lemmatize(text2) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Joshua Levy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for easy development workflows. 2 | # See development.md for docs. 3 | # Note GitHub Actions call uv directly, not this Makefile. 4 | 5 | .DEFAULT_GOAL := default 6 | 7 | .PHONY: default install lint test upgrade build clean agent-rules 8 | 9 | default: agent-rules install lint test 10 | 11 | install: 12 | uv sync --all-extras 13 | 14 | lint: 15 | uv run python devtools/lint.py 16 | 17 | test: 18 | uv run pytest 19 | 20 | upgrade: 21 | uv sync --upgrade --all-extras --dev 22 | 23 | build: 24 | uv build 25 | 26 | agent-rules: CLAUDE.md AGENTS.md 27 | 28 | # Use .cursor/rules for sources of rules. 29 | # Create Claude and Codex rules from these. 
30 | CLAUDE.md: .cursor/rules/general.mdc .cursor/rules/python.mdc 31 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > CLAUDE.md 32 | 33 | AGENTS.md: .cursor/rules/general.mdc .cursor/rules/python.mdc 34 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > AGENTS.md 35 | 36 | clean: 37 | -rm -rf dist/ 38 | -rm -rf *.egg-info/ 39 | -rm -rf .pytest_cache/ 40 | -rm -rf .mypy_cache/ 41 | -rm -rf .venv/ 42 | -rm -rf CLAUDE.md AGENTS.md 43 | -find . -type d -name "__pycache__" -exec rm -rf {} + 44 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: # Enable manual trigger. 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | id-token: write # Mandatory for OIDC. 13 | contents: read 14 | steps: 15 | - name: Checkout (official GitHub action) 16 | uses: actions/checkout@v4 17 | with: 18 | # Important for versioning plugins: 19 | fetch-depth: 0 20 | 21 | - name: Install uv (official Astral action) 22 | uses: astral-sh/setup-uv@v5 23 | with: 24 | version: "0.8.9" 25 | enable-cache: true 26 | python-version: "3.12" 27 | 28 | - name: Set up Python (using uv) 29 | run: uv python install 30 | 31 | - name: Install all dependencies 32 | run: uv sync --all-extras 33 | 34 | - name: Run tests 35 | run: uv run pytest 36 | 37 | - name: Build package 38 | run: uv build 39 | 40 | - name: Publish to PyPI 41 | run: uv publish --trusted-publishing always 42 | # Although uv is newer and faster, the "official" publishing option is the one from PyPA, 43 | # which uses twine. If desired, replace `uv publish` with: 44 | # uses: pypa/gh-action-pypi-publish@release/v1 45 | -------------------------------------------------------------------------------- /src/chopdiff/docs/sizes.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from chopdiff.docs.wordtoks import wordtokenize 4 | from chopdiff.html.html_plaintext import html_to_plaintext 5 | from chopdiff.util.tiktoken_utils import tiktoken_len 6 | 7 | 8 | def size_in_bytes(text: str) -> int: 9 | return len(text.encode("utf-8")) 10 | 11 | 12 | def size_in_wordtoks(text: str) -> int: 13 | return len(wordtokenize(text)) 14 | 15 | 16 | class TextUnit(Enum): 17 | """ 18 | Text units of measure. 19 | """ 20 | 21 | lines = "lines" 22 | bytes = "bytes" 23 | chars = "chars" 24 | words = "words" 25 | wordtoks = "wordtoks" 26 | paragraphs = "paragraphs" 27 | sentences = "sentences" 28 | tiktokens = "tiktokens" 29 | 30 | 31 | def size(text: str, unit: TextUnit) -> int: 32 | if unit == TextUnit.lines: 33 | return len(text.splitlines()) 34 | elif unit == TextUnit.bytes: 35 | return size_in_bytes(text) 36 | elif unit == TextUnit.chars: 37 | return len(text) 38 | elif unit == TextUnit.words: 39 | # Roughly accurate for HTML, text, or Markdown docs. 
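        # For example, size("<p>Hello, world!</p>", TextUnit.words) == 2: tags are stripped to plaintext before splitting on whitespace.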
40 | return len(html_to_plaintext(text).split()) 41 | elif unit == TextUnit.wordtoks: 42 | return size_in_wordtoks(text) 43 | elif unit == TextUnit.tiktokens: 44 | return tiktoken_len(text) 45 | else: 46 | raise NotImplementedError(f"Unsupported unit for string: {unit}") 47 | -------------------------------------------------------------------------------- /examples/gettysberg.txt: -------------------------------------------------------------------------------- 1 | four score and seven years ago our fathers brought forth on this continent, a new 2 | nation, conceived in Liberty, and dedicated to the proposition that all men are created 3 | equal. Now we are engaged in a great civil war, testing whether that nation, or any 4 | nation so conceived and so dedicated, can long endure. We are met on a great 5 | battle-field of that war. We have come to dedicate a portion of that field, as a final 6 | resting place for those who here gave their lives that that nation might live. It is 7 | altogether fitting and proper that we should do this. But, in a larger sense, we can not 8 | dedicate—we can not consecrate—we can not hallow—this ground. The brave men, living and 9 | dead, who struggled here, have consecrated it, far above our poor power to add or 10 | detract. The world will little note, nor long remember what we say here, but it can 11 | never forget what they did here. It is for us the living, rather, to be dedicated here 12 | to the unfinished work which they who fought here have thus far so nobly advanced. It is 13 | rather for us to be here dedicated to the great task remaining before us—that from these 14 | honored dead we take increased devotion to that cause for which they gave the last full 15 | measure of devotion—that we here highly resolve that these dead shall not have died in 16 | vain—that this nation, under God, shall have a new birth of freedom—and that government 17 | of the people, by the people, for the people, shall not perish from the earth. 
-------------------------------------------------------------------------------- /src/chopdiff/html/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match 4 | from chopdiff.html.html_in_md import ( 5 | Attrs, 6 | ClassNames, 7 | Wrapper, 8 | div_wrapper, 9 | escape_md_html, 10 | html_a, 11 | html_b, 12 | html_div, 13 | html_i, 14 | html_img, 15 | html_join_blocks, 16 | html_span, 17 | md_para, 18 | span_wrapper, 19 | tag_with_attrs, 20 | ) 21 | from chopdiff.html.html_plaintext import html_to_plaintext, plaintext_to_html 22 | from chopdiff.html.html_tags import ( 23 | TagMatch, 24 | html_extract_attribute_value, 25 | html_find_tag, 26 | rewrite_html_img_urls, 27 | rewrite_html_tag_attr, 28 | ) 29 | from chopdiff.html.timestamps import ( 30 | TimestampExtractor, 31 | extract_timestamp, 32 | has_timestamp, 33 | ) 34 | 35 | __all__ = [ 36 | "Attrs", 37 | "ClassNames", 38 | "ContentNotFound", 39 | "Extractor", 40 | "Match", 41 | "TagMatch", 42 | "html_extract_attribute_value", 43 | "html_find_tag", 44 | "rewrite_html_img_urls", 45 | "rewrite_html_tag_attr", 46 | "Wrapper", 47 | "div_wrapper", 48 | "escape_md_html", 49 | "html_a", 50 | "html_b", 51 | "html_div", 52 | "html_i", 53 | "html_img", 54 | "html_join_blocks", 55 | "html_span", 56 | "md_para", 57 | "span_wrapper", 58 | "tag_with_attrs", 59 | "html_to_plaintext", 60 | "plaintext_to_html", 61 | "TimestampExtractor", 62 | "extract_timestamp", 63 | "has_timestamp", 64 | ] 65 | -------------------------------------------------------------------------------- /devtools/lint.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from funlog import log_calls 4 | from rich import get_console, reconfigure 5 | from rich import print as rprint 6 | 7 | # Update as needed. 8 | SRC_PATHS = ["src", "tests", "devtools", "examples"] 9 | DOC_PATHS = ["README.md"] 10 | 11 | 12 | reconfigure(emoji=not get_console().options.legacy_windows) # No emojis on legacy windows. 
13 | 14 | 15 | def main(): 16 | rprint() 17 | 18 | errcount = 0 19 | errcount += run(["codespell", "--write-changes", *SRC_PATHS, *DOC_PATHS]) 20 | errcount += run(["ruff", "check", "--fix", *SRC_PATHS]) 21 | errcount += run(["ruff", "format", *SRC_PATHS]) 22 | errcount += run(["basedpyright", "--stats", *SRC_PATHS]) 23 | 24 | rprint() 25 | 26 | if errcount != 0: 27 | rprint(f"[bold red]:x: Lint failed with {errcount} errors.[/bold red]") 28 | else: 29 | rprint("[bold green]:white_check_mark: Lint passed![/bold green]") 30 | rprint() 31 | 32 | return errcount 33 | 34 | 35 | @log_calls(level="warning", show_timing_only=True) 36 | def run(cmd: list[str]) -> int: 37 | rprint() 38 | rprint(f"[bold green]>> {' '.join(cmd)}[/bold green]") 39 | errcount = 0 40 | try: 41 | subprocess.run(cmd, text=True, check=True) 42 | except KeyboardInterrupt: 43 | rprint("[yellow]Keyboard interrupt - Cancelled[/yellow]") 44 | errcount = 1 45 | except subprocess.CalledProcessError as e: 46 | rprint(f"[bold red]Error: {e}[/bold red]") 47 | errcount = 1 48 | 49 | return errcount 50 | 51 | 52 | if __name__ == "__main__": 53 | exit(main()) 54 | -------------------------------------------------------------------------------- /tests/transforms/test_sliding_windows.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from textwrap import dedent 3 | 4 | from chopdiff.docs.sizes import TextUnit, size 5 | from chopdiff.docs.text_doc import TextDoc 6 | from chopdiff.transforms.sliding_windows import sliding_word_window 7 | 8 | _example_text = dedent( 9 | """ 10 | This is the first paragraph. It has multiple sentences. 11 | 12 | This is the second paragraph. It also has multiple sentences. And it continues. 13 | 14 | Here is the third paragraph. More sentences follow. And here is another one. 
15 | """ 16 | ).strip() 17 | 18 | 19 | def test_sliding_window(): 20 | doc = TextDoc.from_text(_example_text) 21 | window_size = 80 22 | window_shift = 60 23 | 24 | windows = list(sliding_word_window(doc, window_size, window_shift, TextUnit.bytes)) 25 | pprint(windows) 26 | 27 | sentence_windows = [ 28 | [[sent.text for sent in para.sentences] for para in doc.paragraphs] for doc in windows 29 | ] 30 | 31 | assert sentence_windows == [ 32 | [["This is the first paragraph.", "It has multiple sentences."]], 33 | [["It has multiple sentences."], ["This is the second paragraph."]], 34 | [ 35 | [ 36 | "This is the second paragraph.", 37 | "It also has multiple sentences.", 38 | "And it continues.", 39 | ] 40 | ], 41 | [ 42 | ["And it continues."], 43 | ["Here is the third paragraph.", "More sentences follow."], 44 | ], 45 | ] 46 | 47 | for sub_doc in windows: 48 | sub_text = sub_doc.reassemble() 49 | 50 | print(f"\n\n---Sub-document length {size(sub_text, TextUnit.bytes)}") 51 | pprint(sub_text) 52 | 53 | assert size(sub_text, TextUnit.bytes) <= window_size 54 | 55 | assert sub_text in doc.reassemble() 56 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from chopdiff.transforms.diff_filters import ( 2 | WILDCARD_TOK, 3 | adds_headings, 4 | changes_whitespace, 5 | changes_whitespace_or_punct, 6 | make_token_sequence_filter, 7 | no_word_lemma_changes, 8 | removes_word_lemmas, 9 | removes_words, 10 | ) 11 | from chopdiff.transforms.sliding_transforms import ( 12 | TextDocTransform, 13 | filtered_transform, 14 | remove_window_br, 15 | sliding_para_window_transform, 16 | sliding_window_transform, 17 | sliding_wordtok_window_transform, 18 | ) 19 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window 20 | from chopdiff.transforms.window_settings import ( 21 | WINDOW_1_PARA, 22 | WINDOW_2_PARA, 23 | WINDOW_2K_WORDTOKS, 24 | WINDOW_4_PARA, 25 | WINDOW_8_PARA, 26 | WINDOW_16_PARA, 27 | WINDOW_32_PARA, 28 | WINDOW_64_PARA, 29 | WINDOW_128_PARA, 30 | WINDOW_256_PARA, 31 | WINDOW_512_PARA, 32 | WINDOW_1024_PARA, 33 | WINDOW_BR, 34 | WINDOW_BR_SEP, 35 | WINDOW_NONE, 36 | WindowSettings, 37 | ) 38 | 39 | __all__ = [ 40 | "WILDCARD_TOK", 41 | "adds_headings", 42 | "changes_whitespace", 43 | "changes_whitespace_or_punct", 44 | "make_token_sequence_filter", 45 | "no_word_lemma_changes", 46 | "removes_word_lemmas", 47 | "removes_words", 48 | "TextDocTransform", 49 | "filtered_transform", 50 | "remove_window_br", 51 | "sliding_para_window_transform", 52 | "sliding_window_transform", 53 | "sliding_wordtok_window_transform", 54 | "sliding_para_window", 55 | "sliding_word_window", 56 | "WINDOW_1_PARA", 57 | "WINDOW_2_PARA", 58 | "WINDOW_2K_WORDTOKS", 59 | "WINDOW_4_PARA", 60 | "WINDOW_8_PARA", 61 | "WINDOW_16_PARA", 62 | "WINDOW_32_PARA", 63 | "WINDOW_64_PARA", 64 | "WINDOW_128_PARA", 65 | "WINDOW_256_PARA", 66 | "WINDOW_512_PARA", 67 | "WINDOW_1024_PARA", 68 | "WINDOW_BR", 69 | "WINDOW_BR_SEP", 70 | "WINDOW_NONE", 71 | "WindowSettings", 72 | ] 73 | -------------------------------------------------------------------------------- /tests/html/test_timestamps.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.html.extractor import ContentNotFound 4 | from chopdiff.html.timestamps import TimestampExtractor 5 | 6 | 7 | def 
test_timestamp_extractor(): 8 | doc_str = 'Sentence one. Sentence two. Sentence three.' 9 | 10 | extractor = TimestampExtractor(doc_str) 11 | wordtoks = extractor.wordtoks 12 | 13 | results: list[str] = [] 14 | offsets: list[int] = [] 15 | for i, wordtok in enumerate(wordtoks): 16 | try: 17 | timestamp, _index, offset = extractor.extract_preceding(i) 18 | except ContentNotFound: 19 | timestamp = None 20 | offset = -1 21 | results.append(f"{i}: {timestamp} ⎪{wordtok}⎪") 22 | offsets.append(offset) 23 | 24 | print("\n".join(results)) 25 | print(offsets) 26 | 27 | assert ( 28 | "\n".join(results) 29 | == dedent( 30 | """ 31 | 0: None ⎪<-BOF->⎪ 32 | 1: None ⎪⎪ 33 | 2: 1.234 ⎪Sentence⎪ 34 | 3: 1.234 ⎪ ⎪ 35 | 4: 1.234 ⎪one⎪ 36 | 5: 1.234 ⎪.⎪ 37 | 6: 1.234 ⎪⎪ 38 | 7: 1.234 ⎪ ⎪ 39 | 8: 1.234 ⎪⎪ 40 | 9: 23.0 ⎪Sentence⎪ 41 | 10: 23.0 ⎪ ⎪ 42 | 11: 23.0 ⎪two⎪ 43 | 12: 23.0 ⎪.⎪ 44 | 13: 23.0 ⎪⎪ 45 | 14: 23.0 ⎪ ⎪ 46 | 15: 23.0 ⎪Sentence⎪ 47 | 16: 23.0 ⎪ ⎪ 48 | 17: 23.0 ⎪three⎪ 49 | 18: 23.0 ⎪.⎪ 50 | 19: 23.0 ⎪<-EOF->⎪ 51 | """ 52 | ).strip() 53 | ) 54 | 55 | assert offsets == [ 56 | -1, 57 | -1, 58 | 0, 59 | 0, 60 | 0, 61 | 0, 62 | 0, 63 | 0, 64 | 0, 65 | 50, 66 | 50, 67 | 50, 68 | 50, 69 | 50, 70 | 50, 71 | 50, 72 | 50, 73 | 50, 74 | 50, 75 | 50, 76 | ] 77 | -------------------------------------------------------------------------------- /src/chopdiff/docs/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.docs.search_tokens import search_tokens 4 | from chopdiff.docs.sizes import TextUnit 5 | from chopdiff.docs.text_doc import Paragraph, Sentence, SentIndex, TextDoc 6 | from chopdiff.docs.token_diffs import ( 7 | DIFF_FILTER_NONE, 8 | DiffFilter, 9 | DiffOp, 10 | DiffStats, 11 | OpType, 12 | TokenDiff, 13 | diff_docs, 14 | diff_wordtoks, 15 | scored_diff_wordtoks, 16 | ) 17 | from chopdiff.docs.token_mapping import TokenMapping 18 | from chopdiff.docs.wordtoks import ( 19 | BOF_STR, 20 | BOF_TOK, 21 | EOF_STR, 22 | EOF_TOK, 23 | PARA_BR_STR, 24 | PARA_BR_TOK, 25 | SENT_BR_STR, 26 | SENT_BR_TOK, 27 | SPACE_TOK, 28 | SYMBOL_SEP, 29 | Tag, 30 | first_wordtok, 31 | is_break_or_space, 32 | is_div, 33 | is_header_tag, 34 | is_tag, 35 | is_tag_close, 36 | is_tag_open, 37 | is_whitespace_or_punct, 38 | is_word, 39 | join_wordtoks, 40 | normalize_wordtok, 41 | wordtok_len, 42 | wordtok_to_str, 43 | wordtokenize, 44 | wordtokenize_with_offsets, 45 | ) 46 | 47 | __all__ = [ 48 | "search_tokens", 49 | "TextUnit", 50 | "Paragraph", 51 | "Sentence", 52 | "SentIndex", 53 | "TextDoc", 54 | "DIFF_FILTER_NONE", 55 | "DiffFilter", 56 | "DiffOp", 57 | "DiffStats", 58 | "OpType", 59 | "TokenDiff", 60 | "diff_docs", 61 | "diff_wordtoks", 62 | "scored_diff_wordtoks", 63 | "TokenMapping", 64 | "BOF_STR", 65 | "BOF_TOK", 66 | "EOF_STR", 67 | "EOF_TOK", 68 | "PARA_BR_STR", 69 | "PARA_BR_TOK", 70 | "SENT_BR_STR", 71 | "SENT_BR_TOK", 72 | "SPACE_TOK", 73 | "SYMBOL_SEP", 74 | "Tag", 75 | "first_wordtok", 76 | "is_break_or_space", 77 | "is_div", 78 | "is_header_tag", 79 | "is_tag", 80 | "is_tag_close", 81 | "is_tag_open", 82 | "is_whitespace_or_punct", 83 | "is_word", 84 | "join_wordtoks", 85 | "normalize_wordtok", 86 | "wordtok_len", 87 | "wordtok_to_str", 88 | "wordtokenize", 89 | "wordtokenize_with_offsets", 90 | ] 91 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install 
Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | # Use ["main", "master"] for CI only on the default branch. 9 | # Use ["**"] for CI on all branches. 10 | branches: ["main", "master"] 11 | pull_request: 12 | branches: ["main", "master"] 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | build: 19 | strategy: 20 | matrix: 21 | # Update this as needed: 22 | # Common platforms: ["ubuntu-latest", "macos-latest", "windows-latest"] 23 | os: ["ubuntu-latest"] 24 | python-version: ["3.11", "3.12", "3.13"] 25 | 26 | # Linux only by default. Use ${{ matrix.os }} for other OSes. 27 | runs-on: ${{ matrix.os }} 28 | 29 | steps: 30 | 31 | # Generally following uv docs: 32 | # https://docs.astral.sh/uv/guides/integration/github/ 33 | 34 | - name: Checkout (official GitHub action) 35 | uses: actions/checkout@v4 36 | with: 37 | # Important for versioning plugins: 38 | fetch-depth: 0 39 | 40 | # From debugging the cydifflib build failure. 41 | # Confirmed we have version 3.31.6 installed. 42 | - name: Display CMake Version 43 | run: cmake --version 44 | 45 | - name: Install uv (official Astral action) 46 | uses: astral-sh/setup-uv@v5 47 | with: 48 | # Update this as needed: 49 | version: "0.8.9" 50 | enable-cache: true 51 | python-version: ${{ matrix.python-version }} 52 | 53 | - name: Set up Python (using uv) 54 | run: uv python install 55 | 56 | # Alternately can use the official Python action: 57 | # - name: Set up Python (using actions/setup-python) 58 | # uses: actions/setup-python@v5 59 | # with: 60 | # python-version: ${{ matrix.python-version }} 61 | 62 | - name: Install all dependencies 63 | run: uv sync --all-extras 64 | 65 | - name: Run linting 66 | run: uv run python devtools/lint.py 67 | 68 | - name: Run tests 69 | run: uv run pytest -------------------------------------------------------------------------------- /src/chopdiff/divs/chunk_utils.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable, Generator 2 | from typing import TypeVar 3 | 4 | from chopdiff.divs.text_node import TextNode 5 | from chopdiff.docs.sizes import TextUnit 6 | from chopdiff.docs.text_doc import TextDoc 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | def chunk_generator( 12 | doc: T, 13 | condition: Callable[[T], bool], 14 | slicer: Callable[[T, int, int], T], 15 | total_size: int, 16 | ) -> Generator[T, None, None]: 17 | """ 18 | Walk through the elements of a document and yield sequential subdocs once they meet 19 | a specific condition. 20 | """ 21 | 22 | start_index = 0 23 | current_index = 0 24 | 25 | while current_index < total_size: 26 | current_doc = slicer(doc, start_index, current_index) 27 | 28 | if condition(current_doc): 29 | yield current_doc 30 | start_index = current_index + 1 31 | current_index = start_index 32 | else: 33 | current_index += 1 34 | 35 | if start_index < total_size: 36 | yield slicer(doc, start_index, total_size) 37 | 38 | 39 | def chunk_paras(doc: TextDoc, min_size: int, unit: TextUnit) -> Generator[TextDoc, None, None]: 40 | """ 41 | Generate TextDoc chunks where each chunk is at least the specified minimum size. 
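    For example, chunk_paras(doc, 1000, TextUnit.chars) yields successive runs of whole paragraphs, each at least 1000 characters, except that the final chunk may be smaller.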
42 | """ 43 | 44 | def condition(slice: TextDoc) -> bool: 45 | return slice.size(unit) >= min_size 46 | 47 | def slicer(doc: TextDoc, start: int, end: int) -> TextDoc: 48 | return doc.sub_paras(start, end) 49 | 50 | total_paragraphs = len(doc.paragraphs) 51 | 52 | yield from chunk_generator(doc, condition, slicer, total_paragraphs) 53 | 54 | 55 | def chunk_children( 56 | node: TextNode, min_size: int, unit: TextUnit 57 | ) -> Generator[TextNode, None, None]: 58 | """ 59 | Generate TextNode chunks where each chunk is at least the specified minimum size. 60 | """ 61 | 62 | def condition(slice: TextNode) -> bool: 63 | return slice.size(unit) >= min_size 64 | 65 | def slicer(node: TextNode, start: int, end: int) -> TextNode: 66 | return node.slice_children(start, end) 67 | 68 | total_children = len(node.children) 69 | 70 | yield from chunk_generator(node, condition, slicer, total_children) 71 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/sliding_windows.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sliding windows of text on a text doc. 3 | """ 4 | 5 | import logging 6 | from collections.abc import Callable, Generator 7 | 8 | from flowmark import fill_markdown 9 | 10 | from chopdiff.docs.sizes import TextUnit 11 | from chopdiff.docs.text_doc import SentIndex, TextDoc 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def sliding_word_window( 17 | doc: TextDoc, window_size: int, window_shift: int, unit: TextUnit 18 | ) -> Generator[TextDoc, None, None]: 19 | """ 20 | Generate TextDoc sub-documents in a sliding window over the given document. 21 | """ 22 | total_size = doc.size(unit) 23 | start_offset = 0 24 | start_index, _ = doc.seek_to_sent(start_offset, unit) 25 | 26 | while start_offset < total_size: 27 | end_offset = start_offset + window_size 28 | end_index, _ = doc.seek_to_sent(end_offset, unit) 29 | 30 | # Sentence may extend past the window, so back up to ensure it fits. 31 | sub_doc = doc.sub_doc(start_index, end_index) 32 | try: 33 | while sub_doc.size(unit) > window_size: 34 | end_index = doc.prev_sent(end_index) 35 | sub_doc = doc.sub_doc(start_index, end_index) 36 | except ValueError: 37 | raise ValueError( 38 | f"Window size {window_size} too small for sentence at offset {start_offset}" 39 | ) 40 | 41 | yield sub_doc 42 | 43 | start_offset += window_shift 44 | start_index = end_index 45 | 46 | 47 | def sliding_para_window( 48 | doc: TextDoc, nparas: int, normalizer: Callable[[str], str] = fill_markdown 49 | ) -> Generator[TextDoc, None, None]: 50 | """ 51 | Generate TextDoc sub-documents taking `nparas` paragraphs at a time. 52 | """ 53 | for i in range(0, len(doc.paragraphs), nparas): 54 | end_index = min(i + nparas - 1, len(doc.paragraphs) - 1) 55 | sub_doc = doc.sub_doc(SentIndex(i, 0), SentIndex(end_index, 0)) 56 | 57 | # XXX It's important we re-normalize especially because LLMs can output itemized lists with just 58 | # one newline, but for Markdown we want separate paragraphs for each list item. 
59 | formatted_sub_doc = TextDoc.from_text(normalizer(sub_doc.reassemble())) 60 | 61 | yield formatted_sub_doc 62 | -------------------------------------------------------------------------------- /src/chopdiff/html/timestamps.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import regex 4 | from typing_extensions import override 5 | 6 | from chopdiff.docs.search_tokens import search_tokens 7 | from chopdiff.docs.wordtoks import wordtokenize_with_offsets 8 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match 9 | 10 | # Match any span or div with a data-timestamp attribute. 11 | _TIMESTAMP_RE = regex.compile(r'(?:<\w+[^>]*\s)?data-timestamp=[\'"](\d+(\.\d+)?)[\'"][^>]*>') 12 | 13 | 14 | def extract_timestamp(wordtok: str) -> float | None: 15 | match = _TIMESTAMP_RE.search(wordtok) 16 | return float(match.group(1)) if match else None 17 | 18 | 19 | def has_timestamp(wordtok: str) -> bool: 20 | return extract_timestamp(wordtok) is not None 21 | 22 | 23 | class TimestampExtractor(Extractor[float]): 24 | """ 25 | Extract timestamps of the form `<... data-timestamp="123.45">` from a document. 26 | """ 27 | 28 | def __init__(self, doc_str: str): 29 | self.doc_str = doc_str 30 | self.wordtoks, self.offsets = wordtokenize_with_offsets(self.doc_str, bof_eof=True) 31 | 32 | @override 33 | def extract_all(self) -> Iterable[Match[float]]: 34 | """ 35 | Extract all timestamps from the document. 36 | """ 37 | for index, (wordtok, offset) in enumerate(zip(self.wordtoks, self.offsets, strict=False)): 38 | timestamp = extract_timestamp(wordtok) 39 | if timestamp is not None: 40 | yield timestamp, index, offset 41 | 42 | @override 43 | def extract_preceding(self, wordtok_offset: int) -> Match[float]: 44 | try: 45 | index, wordtok = ( 46 | search_tokens(self.wordtoks).at(wordtok_offset).seek_back(has_timestamp).get_token() 47 | ) 48 | if wordtok: 49 | timestamp = extract_timestamp(wordtok) 50 | if timestamp is not None: 51 | return timestamp, index, self.offsets[index] 52 | raise ContentNotFound( 53 | f"No timestamp found seeking back from token {wordtok_offset}: {wordtok!r}" 54 | ) 55 | except KeyError as e: 56 | raise ContentNotFound( 57 | f"No timestamp found searching back from token {wordtok_offset}: {e}" 58 | ) 59 | -------------------------------------------------------------------------------- /publishing.md: -------------------------------------------------------------------------------- 1 | ## Publishing Releases 2 | 3 | This is how to publish a Python package to [**PyPI**](https://pypi.org/) from GitHub 4 | Actions, when using the 5 | [**simple-modern-uv**](https://github.com/jlevy/simple-modern-uv) template. 6 | 7 | Thanks to [the dynamic versioning 8 | plugin](https://github.com/ninoseki/uv-dynamic-versioning/) and the 9 | [`publish.yml` workflow](https://github.com/jlevy/simple-modern-uv/blob/main/template/.github/workflows/publish.yml), 10 | you can simply create tagged releases (using standard format for the tag name, e.g. 11 | `v0.1.0`) on GitHub and the tag will trigger a release build, which then uploads it to 12 | PyPI. 13 | 14 | ### How to Publish the First Time 15 | 16 | This part is a little confusing the first time. 17 | Here is the simplest way to do it. 18 | For the purposes of this example replace OWNER and PROJECT with the right values. 19 | 20 | 1. **Get a PyPI account** at [pypi.org](https://pypi.org/) and sign in. 21 | 22 | 2. 
**Pick a name for the project** that isn't already taken. 23 | 24 | - Go to `https://pypi.org/project/PROJECT` to see if another project with that name 25 | already exits. 26 | 27 | - If needed, update your `pyproject.yml` with the correct name. 28 | 29 | 3. **Authorize** your repository to publish to PyPI: 30 | 31 | - Go to [the publishing settings page](https://pypi.org/manage/account/publishing/). 32 | 33 | - Find "Trusted Publisher Management" and register your GitHub repo as a new 34 | "pending" trusted publisher 35 | 36 | - Enter the project name, repo owner, repo name, and `publish.yml` as the workflow 37 | name. (You can leave the "environment name" field blank.) 38 | 39 | 4. **Create a release** on GitHub: 40 | 41 | - Commit code and make sure it's running correctly. 42 | 43 | - Go to your GitHub project page, then click on Actions tab. 44 | 45 | - Confirm all tests are passing in the last CI workflow. 46 | (If you want, you can even publish this template when it's empty as just a stub 47 | project, to try all this out.) 48 | 49 | - Go to your GitHub project page, click on Releases. 50 | 51 | - Fill in the tag and the release name. 52 | Select to create a new tag, and pick a version. 53 | A good option is `v0.1.0`. (It's wise to have it start with a `v`.) 54 | 55 | - Submit to create the release. 56 | 57 | 5. **Confirm it publishes to PyPI** 58 | 59 | - Watch for the release workflow in the GitHub Actions tab. 60 | 61 | - If it succeeds, you should see it appear at `https://pypi.org/project/PROJECT`. 62 | 63 | ### How to Publish Subsequent Releases 64 | 65 | Just create a new release! 66 | Everything is the same as the last two steps above. 67 | 68 | * * * 69 | 70 | *This file was built with 71 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).* 72 | -------------------------------------------------------------------------------- /src/chopdiff/docs/search_tokens.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import TypeAlias 3 | 4 | Predicate: TypeAlias = Callable[[str], bool] | list[str] 5 | 6 | 7 | class _TokenSearcher: 8 | def __init__(self, toks: list[str]): 9 | self.toks = toks 10 | self._cur_idx = 0 11 | 12 | def at(self, index: int): 13 | if index is None: # pyright: ignore 14 | raise KeyError("Index cannot be None") 15 | # Convert negative indices to positive ones. 
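        # For example, with 10 tokens, at(-1) sets the current index to 9 (the last token).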
16 | self._cur_idx = index if index >= 0 else len(self.toks) + index 17 | return self 18 | 19 | def start(self): 20 | self._cur_idx = 0 21 | return self 22 | 23 | def end(self): 24 | self._cur_idx = len(self.toks) 25 | return self 26 | 27 | def seek_back(self, predicate: Predicate): 28 | if isinstance(predicate, list): 29 | allowed: list[str] = predicate 30 | predicate = lambda x: x in allowed 31 | for idx in range(self._cur_idx - 1, -1, -1): 32 | if predicate(self.toks[idx]): 33 | self._cur_idx = idx 34 | return self 35 | raise KeyError("No matching token found before the current index") 36 | 37 | def seek_forward(self, predicate: Predicate): 38 | if isinstance(predicate, list): 39 | allowed: list[str] = predicate 40 | predicate = lambda x: x in allowed 41 | for idx in range(self._cur_idx + 1, len(self.toks)): 42 | if predicate(self.toks[idx]): 43 | self._cur_idx = idx 44 | return self 45 | raise KeyError("No matching token found after the current index") 46 | 47 | def prev(self): 48 | if self._cur_idx - 1 < 0: 49 | raise KeyError("No previous token available") 50 | self._cur_idx -= 1 51 | return self 52 | 53 | def next(self): 54 | if self._cur_idx + 1 >= len(self.toks): 55 | raise KeyError("No next token available") 56 | self._cur_idx += 1 57 | return self 58 | 59 | def get_index(self) -> int: 60 | return self._cur_idx 61 | 62 | def get_token(self) -> tuple[int, str]: 63 | return self._cur_idx, self.toks[self._cur_idx] 64 | 65 | 66 | def search_tokens(wordtoks: list[str]) -> _TokenSearcher: 67 | """ 68 | Fluent convenience function to search for offsets in an array of string tokens 69 | based on a predicate, previous, next, etc. Raises `KeyError` if any search 70 | has no matches. 71 | 72 | Example: 73 | ``` 74 | index, token = ( 75 | search_tokens(list_of_tokens) 76 | .at(my_offset) 77 | .seek_back(has_timestamp) 78 | .next() 79 | .get_token() 80 | ) 81 | ``` 82 | """ 83 | return _TokenSearcher(wordtoks) 84 | -------------------------------------------------------------------------------- /tests/transforms/test_sliding_transforms.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.sizes import TextUnit 4 | from chopdiff.docs.text_doc import TextDoc 5 | from chopdiff.transforms.sliding_transforms import ( 6 | sliding_para_window_transform, 7 | sliding_window_transform, 8 | ) 9 | from chopdiff.transforms.window_settings import WINDOW_BR_SEP, WindowSettings 10 | 11 | _example_text = dedent( 12 | """ 13 | This is the first paragraph. It has multiple sentences. 14 | 15 | This is the second paragraph. It also has multiple sentences. And it continues. 16 | 17 | Here is the third paragraph. More sentences follow. And here is another one. 18 | """ 19 | ).strip() 20 | 21 | 22 | def test_sliding_word_window_transform(): 23 | long_text = (_example_text + "\n\n") * 2 24 | doc = TextDoc.from_text(long_text) 25 | 26 | # Simple transformation that converts all text to uppercase. 
27 | def transform_func(window: TextDoc) -> TextDoc: 28 | transformed_text = window.reassemble().upper() 29 | return TextDoc.from_text(transformed_text) 30 | 31 | transformed_doc = sliding_window_transform( 32 | doc, 33 | transform_func, 34 | WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5, separator="|"), 35 | ) 36 | print("---Wordtok transformed doc:") 37 | print(transformed_doc.reassemble()) 38 | 39 | assert transformed_doc.reassemble().count("|") == 2 40 | 41 | long_text = (_example_text + "\n\n") * 20 42 | doc = TextDoc.from_text(long_text) 43 | transformed_doc = sliding_window_transform( 44 | doc, transform_func, WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5) 45 | ) 46 | assert transformed_doc.reassemble() == long_text.upper().strip() 47 | 48 | 49 | def test_sliding_para_window_transform(): 50 | def transform_func(window: TextDoc) -> TextDoc: 51 | transformed_text = window.reassemble().upper() 52 | return TextDoc.from_text(transformed_text) 53 | 54 | text = "\n\n".join(f"Paragraph {i}." for i in range(7)) 55 | doc = TextDoc.from_text(text) 56 | 57 | transformed_doc = sliding_para_window_transform( 58 | doc, 59 | transform_func, 60 | WindowSettings( 61 | TextUnit.paragraphs, 62 | 3, 63 | 3, 64 | separator=WINDOW_BR_SEP, 65 | ), 66 | ) 67 | 68 | print("---Paragraph transformed doc:") 69 | print(transformed_doc.reassemble()) 70 | 71 | assert ( 72 | transformed_doc.reassemble() 73 | == dedent( 74 | """ 75 | PARAGRAPH 0. 76 | 77 | PARAGRAPH 1. 78 | 79 | PARAGRAPH 2. 80 | 81 | PARAGRAPH 3. 82 | 83 | PARAGRAPH 4. 84 | 85 | PARAGRAPH 5. 86 | 87 | PARAGRAPH 6. 88 | """ 89 | ).strip() 90 | ) 91 | -------------------------------------------------------------------------------- /development.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Setting Up uv 4 | 5 | This project is set up to use [uv](https://docs.astral.sh/uv/) to manage Python and 6 | dependencies. First, be sure you 7 | [have uv installed](https://docs.astral.sh/uv/getting-started/installation/). 8 | 9 | Then [fork the jlevy/chopdiff 10 | repo](https://github.com/jlevy/chopdiff/fork) (having your own 11 | fork will make it easier to contribute) and 12 | [clone it](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). 13 | 14 | ## Basic Developer Workflows 15 | 16 | The `Makefile` simply offers shortcuts to `uv` commands for developer convenience. 17 | (For clarity, GitHub Actions don't use the Makefile and just call `uv` directly.) 18 | 19 | ```shell 20 | # First, install all dependencies and set up your virtual environment. 21 | # This simply runs `uv sync --all-extras` to install all packages, 22 | # including dev dependencies and optional dependencies. 23 | make install 24 | 25 | # Run uv sync, lint, and test (and also generate agent rules): 26 | make 27 | 28 | # Build wheel: 29 | make build 30 | 31 | # Linting: 32 | make lint 33 | 34 | # Run tests: 35 | make test 36 | 37 | # Delete all the build artifacts: 38 | make clean 39 | 40 | # Upgrade dependencies to compatible versions: 41 | make upgrade 42 | 43 | # To run tests by hand: 44 | uv run pytest # all tests 45 | uv run pytest -s src/module/some_file.py # one test, showing outputs 46 | 47 | # Build and install current dev executables, to let you use your dev copies 48 | # as local tools: 49 | uv tool install --editable . 
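# Optionally confirm the editable install is registered with uv:
uv tool list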
50 | 51 | # Dependency management directly with uv: 52 | # Add a new dependency: 53 | uv add package_name 54 | # Add a development dependency: 55 | uv add --dev package_name 56 | # Update to latest compatible versions (including dependencies on git repos): 57 | uv sync --upgrade 58 | # Update a specific package: 59 | uv lock --upgrade-package package_name 60 | # Update dependencies on a package: 61 | uv add package_name@latest 62 | 63 | # Run a shell within the Python environment: 64 | uv venv 65 | source .venv/bin/activate 66 | ``` 67 | 68 | See [uv docs](https://docs.astral.sh/uv/) for details. 69 | 70 | ## Agent Rules 71 | 72 | See [.cursor/rules](.cursor/rules) for agent rules. 73 | These are written for [Cursor](https://www.cursor.com/) but are also used by other 74 | agents because the Makefile will generate `CLAUDE.md` and `AGENTS.md` from the same 75 | rules. 76 | 77 | ```shell 78 | make agent-rules 79 | ``` 80 | 81 | ## IDE setup 82 | 83 | If you use VSCode or a fork like Cursor or Windsurf, you can install the following 84 | extensions: 85 | 86 | - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) 87 | 88 | - [Based Pyright](https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright) 89 | for type checking. Note that this extension works with non-Microsoft VSCode forks like 90 | Cursor. 91 | 92 | ## Documentation 93 | 94 | - [uv docs](https://docs.astral.sh/uv/) 95 | 96 | - [basedpyright docs](https://docs.basedpyright.com/latest/) 97 | 98 | * * * 99 | 100 | *This file was built with 101 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).* 102 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/window_settings.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from typing_extensions import override 4 | 5 | from chopdiff.docs.sizes import TextUnit 6 | 7 | WINDOW_BR = "" 8 | """Marker inserted into result documents to show where window breaks have occurred.""" 9 | 10 | WINDOW_BR_SEP = f"\n{WINDOW_BR}\n" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class WindowSettings: 15 | """ 16 | Size of the sliding window, the shift, and the min overlap required when stitching windows 17 | together. All sizes in wordtoks. 18 | """ 19 | 20 | unit: TextUnit 21 | size: int 22 | shift: int 23 | min_overlap: int = 0 24 | separator: str = "" 25 | 26 | @override 27 | def __str__(self): 28 | return f"windowing size={self.size}, shift={self.shift}, min_overlap={self.min_overlap} {self.unit.value}" 29 | 30 | 31 | WINDOW_NONE = WindowSettings(unit=TextUnit.wordtoks, size=0, shift=0, min_overlap=0, separator="") 32 | """ 33 | Do not use a sliding window. 34 | """ 35 | 36 | WINDOW_2K_WORDTOKS = WindowSettings( 37 | TextUnit.wordtoks, 38 | size=2048, 39 | shift=2048 - 256, 40 | min_overlap=8, 41 | separator=WINDOW_BR_SEP, 42 | ) 43 | """ 44 | Sliding, overlapping word-based window. Useful for finding paragraph breaks. 45 | 2K wordtoks is several paragraphs. 
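With size=2048 and shift=1792 (2048 - 256), consecutive windows overlap by 256 wordtoks.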
46 | """ 47 | 48 | 49 | WINDOW_1_PARA = WindowSettings( 50 | TextUnit.paragraphs, size=1, shift=1, min_overlap=0, separator=WINDOW_BR_SEP 51 | ) 52 | """Process 1 paragraph at a time.""" 53 | 54 | 55 | WINDOW_2_PARA = WindowSettings( 56 | TextUnit.paragraphs, size=2, shift=2, min_overlap=0, separator=WINDOW_BR_SEP 57 | ) 58 | """Process 2 paragraphs at a time.""" 59 | 60 | 61 | WINDOW_4_PARA = WindowSettings( 62 | TextUnit.paragraphs, size=4, shift=4, min_overlap=0, separator=WINDOW_BR_SEP 63 | ) 64 | """Process 4 paragraph at a time.""" 65 | 66 | 67 | WINDOW_8_PARA = WindowSettings( 68 | TextUnit.paragraphs, size=8, shift=8, min_overlap=0, separator=WINDOW_BR_SEP 69 | ) 70 | """Process 8 paragraphs at a time.""" 71 | 72 | 73 | WINDOW_16_PARA = WindowSettings( 74 | TextUnit.paragraphs, size=16, shift=16, min_overlap=0, separator=WINDOW_BR_SEP 75 | ) 76 | """Process 16 paragraphs at a time.""" 77 | 78 | WINDOW_32_PARA = WindowSettings( 79 | TextUnit.paragraphs, size=32, shift=32, min_overlap=0, separator=WINDOW_BR_SEP 80 | ) 81 | """Process 32 paragraphs at a time.""" 82 | 83 | WINDOW_64_PARA = WindowSettings( 84 | TextUnit.paragraphs, size=64, shift=64, min_overlap=0, separator=WINDOW_BR_SEP 85 | ) 86 | """Process 64 paragraphs at a time.""" 87 | 88 | WINDOW_128_PARA = WindowSettings( 89 | TextUnit.paragraphs, size=128, shift=128, min_overlap=0, separator=WINDOW_BR_SEP 90 | ) 91 | """Process 128 paragraphs at a time.""" 92 | 93 | WINDOW_256_PARA = WindowSettings( 94 | TextUnit.paragraphs, size=256, shift=256, min_overlap=0, separator=WINDOW_BR_SEP 95 | ) 96 | """Process 256 paragraphs at a time.""" 97 | 98 | WINDOW_512_PARA = WindowSettings( 99 | TextUnit.paragraphs, size=512, shift=512, min_overlap=0, separator=WINDOW_BR_SEP 100 | ) 101 | """Process 512 paragraphs at a time.""" 102 | 103 | WINDOW_1024_PARA = WindowSettings( 104 | TextUnit.paragraphs, size=1024, shift=1024, min_overlap=0, separator=WINDOW_BR_SEP 105 | ) 106 | """Process 1024 paragraphs at a time.""" 107 | -------------------------------------------------------------------------------- /.cursor/rules/general.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: General Guidelines 3 | globs: 4 | alwaysApply: true 5 | --- 6 | # Assistant Rules 7 | 8 | **Your fundamental responsibility:** Remember you are a senior engineer and have a 9 | serious responsibility to be clear, factual, think step by step and be systematic, 10 | express expert opinion, and make use of the user’s attention wisely. 11 | 12 | **Rules must be followed:** It is your responsibility to carefully read these rules as 13 | well as Python or other language-specific rules included here. 14 | 15 | Therefore: 16 | 17 | - Be concise. State answers or responses directly, without extra commentary. 18 | Or (if it is clear) directly do what is asked. 19 | 20 | - If instructions are unclear or there are two or more ways to fulfill the request that 21 | are substantially different, make a tentative plan (or offer options) and ask for 22 | confirmation. 23 | 24 | - If you can think of a much better approach that the user requests, be sure to mention 25 | it. It’s your responsibility to suggest approaches that lead to better, simpler 26 | solutions. 27 | 28 | - Give thoughtful opinions on better/worse approaches, but NEVER say “great idea!” 29 | or “good job” or other compliments, encouragement, or non-essential banter. 30 | Your job is to give expert opinions and to solve problems, not to motivate the user. 
31 | 32 | - Avoid gratuitous enthusiasm or generalizations. 33 | Use thoughtful comparisons like saying which code is “cleaner” but don’t congratulate 34 | yourself. Avoid subjective descriptions. 35 | For example, don’t say “I’ve meticulously improved the code and it is in great shape!” 36 | That is useless generalization. 37 | Instead, specifically say what you’ve done, e.g., "I’ve added types, including 38 | generics, to all the methods in `Foo` and fixed all linter errors." 39 | 40 | # General Coding Guidelines 41 | 42 | ## Using Comments 43 | 44 | - Keep all comments concise and clear and suitable for inclusion in final production. 45 | 46 | - DO use comments whenever the intent of a given piece of code is subtle or confusing or 47 | avoids a bug or is not obvious from the code itself. 48 | 49 | - DO NOT repeat in comments what is obvious from the names of functions or variables or 50 | types. 51 | 52 | - DO NOT include comments that reflect what you did, such as “Added this function” as 53 | this is meaningless to anyone reading the code later. 54 | (Instead, describe in your message to the user any other contextual information.) 55 | 56 | - DO NOT use fancy or needlessly decorated headings like “===== MIGRATION TOOLS =====” 57 | in comments 58 | 59 | - DO NOT number steps in comments. 60 | These are hard to maintain if the code changes. 61 | NEVER DO THIS: “// Step 3: Fetch the data from the cache”\ 62 | This is fine: “// Now fetch the data from the cache” 63 | 64 | - DO NOT use emojis or special unicode characters like ① or • or – or — in comments. 65 | 66 | - Use emojis in output if it enhances the clarity and can be done consistently. 67 | You may use ✔︎ and ✘ to indicate success and failure, and ∆ and ‼︎ for user-facing 68 | warnings and errors, for example, but be sure to do it consistently. 69 | DO NOT use emojis gratuitously in comments or output. 70 | You may use then ONLY when they have clear meanings (like success or failure). 71 | Unless the user says otherwise, avoid emojis and Unicode in comments as clutters the 72 | output with little benefit. 73 | -------------------------------------------------------------------------------- /src/chopdiff/docs/token_mapping.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import override 2 | 3 | from chopdiff.docs.token_diffs import SYMBOL_SEP, OpType, TokenDiff, diff_wordtoks 4 | 5 | 6 | class TokenMapping: 7 | """ 8 | Given two sequences of tokens, create a best-estimate mapping of how the tokens 9 | in the second sequence map to the tokens in the first sequence, based on an 10 | LCS-style diff. 11 | """ 12 | 13 | def __init__( 14 | self, 15 | tokens1: list[str], 16 | tokens2: list[str], 17 | diff: TokenDiff | None = None, 18 | min_tokens: int = 10, 19 | max_diff_frac: float = 0.4, 20 | ): 21 | self.tokens1 = tokens1 22 | self.tokens2 = tokens2 23 | self.diff = diff or diff_wordtoks(self.tokens1, self.tokens2) 24 | self._validate(min_tokens, max_diff_frac) 25 | self.backmap: dict[int, int] = {} 26 | self._create_mapping() 27 | 28 | def map_back(self, offset2: int) -> int: 29 | """ 30 | Map an offset in the second sequence back to the offset that most closely corresponds to it 31 | in the first sequence. This might be an exact match (e.g. the same word) or the closest token 32 | (e.g. the last word before a deleted or changed word). 
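        For example, if tokens1 is ["a", "b", "c"] and tokens2 is ["a", "c"] (with "b" deleted), map_back(1) returns 2: "c" at offset 1 in the second sequence maps to "c" at offset 2 in the first. (Illustrative only; real inputs must also satisfy the min_tokens check.)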
33 | """ 34 | return self.backmap[offset2] 35 | 36 | def _validate(self, min_wordtoks: int, max_diff_frac: float): 37 | if len(self.tokens1) < min_wordtoks or len(self.tokens2) < min_wordtoks: 38 | raise ValueError(f"Documents should have at least {min_wordtoks} wordtoks") 39 | 40 | nchanges = len(self.diff.changes()) 41 | if float(nchanges) / len(self.tokens1) > max_diff_frac: 42 | raise ValueError( 43 | f"Documents have too many changes: {nchanges}/{len(self.tokens1)} ({float(nchanges) / len(self.tokens1):.2f} > {max_diff_frac})" 44 | ) 45 | 46 | def _create_mapping(self): 47 | offset1 = 0 48 | offset2 = 0 49 | last_offset1 = 0 50 | 51 | for op in self.diff.ops: 52 | if op.action == OpType.EQUAL: 53 | for _ in op.left: 54 | self.backmap[offset2] = offset1 55 | last_offset1 = offset1 56 | offset1 += 1 57 | offset2 += 1 58 | elif op.action == OpType.DELETE: 59 | for _ in op.left: 60 | last_offset1 = offset1 61 | offset1 += 1 62 | elif op.action == OpType.INSERT: 63 | for _ in op.right: 64 | self.backmap[offset2] = last_offset1 65 | offset2 += 1 66 | elif op.action == OpType.REPLACE: 67 | for _ in op.left: 68 | last_offset1 = offset1 69 | offset1 += 1 70 | for _ in op.right: 71 | self.backmap[offset2] = last_offset1 72 | offset2 += 1 73 | 74 | def full_mapping_str(self): 75 | """ 76 | For debugging or logging, return a verbose, readable table of the mapping of each 77 | token in the second sequence to the first sequence. 78 | """ 79 | return "\n".join( 80 | f"{i} {SYMBOL_SEP}{self.tokens2[i]}{SYMBOL_SEP} -> {self.map_back(i)} {SYMBOL_SEP}{self.tokens1[self.map_back(i)]}{SYMBOL_SEP}" 81 | for i in range(len(self.tokens2)) 82 | ) 83 | 84 | @override 85 | def __str__(self): 86 | return f"OffsetMapping(doc1 len {len(self.tokens1)}, doc2 len {len(self.tokens2)}, mapping len {len(self.backmap)})" 87 | -------------------------------------------------------------------------------- /src/chopdiff/divs/div_elements.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_paras 4 | from chopdiff.divs.parse_divs import parse_divs 5 | from chopdiff.divs.text_node import TextNode 6 | from chopdiff.docs.sizes import TextUnit 7 | from chopdiff.docs.text_doc import TextDoc 8 | from chopdiff.docs.wordtoks import first_wordtok, is_div 9 | from chopdiff.html.html_in_md import Attrs, ClassNames, div_wrapper, html_join_blocks 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | CHUNK = "chunk" 15 | """Class name for a chunk of text.""" 16 | 17 | ORIGINAL = "original" 18 | """Class name for the original content.""" 19 | 20 | RESULT = "result" 21 | """Class name for the result of an LLM action.""" 22 | 23 | GROUP = "group" 24 | """Class name for a generic combination of elements.""" 25 | 26 | 27 | def div( 28 | class_name: ClassNames, 29 | *blocks: str | None, 30 | attrs: Attrs | None = None, 31 | safe: bool = True, 32 | ) -> str: 33 | """ 34 | Convenience to create Markdown-compatible div with HTML in its own paragraphs. 35 | """ 36 | return div_wrapper(class_name=class_name, attrs=attrs, safe=safe, padding="\n\n")( 37 | html_join_blocks(*blocks) 38 | ) 39 | 40 | 41 | def div_get_original(element: TextNode, child_name: str = ORIGINAL) -> str: 42 | """ 43 | Get content of the named child element if it exists, otherwise use the whole contents. 
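    For example, for a node parsed from '<div class="chunk"><div class="original">text</div></div>',
    this returns the contents of the inner "original" div rather than the whole chunk.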
44 | """ 45 | child = element.child_by_class_name(child_name) 46 | return child.contents if child else element.contents 47 | 48 | 49 | def div_insert_wrapped( 50 | element: TextNode, 51 | new_child_blocks: list[str], 52 | container_class: ClassNames = CHUNK, 53 | original_class: str = ORIGINAL, 54 | at_front: bool = True, 55 | ) -> str: 56 | """ 57 | Insert new children into a div element. As a base case, wrap the original 58 | content in a child div if it's not already present as a child. 59 | """ 60 | 61 | original_element = element.child_by_class_name(original_class) 62 | if original_element: 63 | prev_contents = element.contents 64 | else: 65 | prev_contents = div(original_class, element.contents) 66 | 67 | if at_front: 68 | blocks = [*new_child_blocks, prev_contents] 69 | else: 70 | blocks = [prev_contents, *new_child_blocks] 71 | 72 | return div(container_class, html_join_blocks(*blocks)) 73 | 74 | 75 | def chunk_text_as_divs( 76 | text: str, min_size: int, unit: TextUnit, class_name: ClassNames = CHUNK 77 | ) -> str: 78 | """ 79 | Add HTML divs around "chunks" of text paragraphs or top-level divs, where each chunk 80 | is at least the specified minimum size. 81 | """ 82 | 83 | if is_div(first_wordtok(text)): 84 | log.info("Chunking paragraphs using divs.") 85 | parsed = parse_divs(text) 86 | div_chunks = chunk_children(parsed, min_size, unit) 87 | chunk_strs = [chunk.reassemble() for chunk in div_chunks] 88 | size_summary = parsed.size_summary() 89 | else: 90 | log.info("Chunking paragraphs using newlines.") 91 | doc = TextDoc.from_text(text) 92 | doc_chunks = chunk_paras(doc, min_size, unit) 93 | chunk_strs = [chunk.reassemble() for chunk in doc_chunks] 94 | size_summary = doc.size_summary() 95 | 96 | result_divs = [div(class_name, chunk_str) for chunk_str in chunk_strs] 97 | 98 | log.info("Added %s div chunks on doc:\n%s", len(result_divs), size_summary) 99 | 100 | return "\n\n".join(result_divs) 101 | -------------------------------------------------------------------------------- /tests/divs/test_div_elements.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.divs.div_elements import CHUNK, chunk_text_as_divs, div, div_insert_wrapped 4 | from chopdiff.divs.parse_divs import parse_divs_single 5 | from chopdiff.docs.sizes import TextUnit 6 | 7 | 8 | def test_div_insert_child(): 9 | node1 = parse_divs_single("Chunk text.") 10 | node2 = parse_divs_single(div(CHUNK, "Chunk text.")) 11 | 12 | child_str = div("new", "New child text.") 13 | 14 | new_result1 = div_insert_wrapped(node1, [child_str]) 15 | new_result2 = div_insert_wrapped(node2, [child_str]) 16 | 17 | print("\n---test_div_insert_child---") 18 | print("\nnode1:") 19 | print(node1.original_text) 20 | print("\nnode2:") 21 | print(node2.original_text) 22 | print("\nnew_child_str:") 23 | print(child_str) 24 | print("\nnew_result1:") 25 | print(new_result1) 26 | print("\nnew_result2:") 27 | print(new_result2) 28 | 29 | assert ( 30 | new_result1 31 | == dedent( 32 | """ 33 |

34 | 35 |
36 | 37 | New child text. 38 | 39 |
40 | 41 |
42 | 43 | Chunk text. 44 | 45 |
46 | 47 |
48 | """ 49 | ).strip() 50 | ) 51 | 52 | assert new_result2 == new_result1 53 | 54 | node3 = parse_divs_single(new_result1) 55 | 56 | another_child_str = div("another", "Another child text.") 57 | 58 | new_result3 = div_insert_wrapped(node3, [another_child_str]) 59 | print("\nnew_result3:") 60 | print(new_result3) 61 | 62 | assert ( 63 | new_result3 64 | == dedent( 65 | """ 66 |
67 | 68 |
69 | 70 | Another child text. 71 | 72 |
73 | 74 |
75 | 76 | New child text. 77 | 78 |
79 | 80 |
81 | 82 | Chunk text. 83 | 84 |
85 | 86 |
87 | """ 88 | ).strip() 89 | ) 90 | 91 | 92 | _med_test_doc = dedent( 93 | """ 94 | # Title 95 | 96 | Hello World. This is an example sentence. And here's another one! 97 | 98 | ## Subtitle 99 | 100 | This is a new paragraph. 101 | It has several sentences. 102 | There may be line breaks within a paragraph, but these should not affect handlingof the paragraph. 103 | There are also [links](http://www.google.com) and **bold** and *italic* text. 104 | 105 | ### Itemized List 106 | 107 | - Item 1 108 | 109 | - Item 2 110 | 111 | - Item 3 112 | 113 |
extra 114 |
115 | 116 | Blah blah. 117 | """ 118 | ).strip() 119 | 120 | 121 | def test_chunk_text_into_divs(): 122 | assert chunk_text_as_divs("", 7, TextUnit.words) == "" 123 | assert ( 124 | chunk_text_as_divs("hello", 100, TextUnit.words) == '
\n\nhello\n\n
' 125 | ) 126 | 127 | chunked = chunk_text_as_divs(_med_test_doc, 7, TextUnit.words) 128 | 129 | print("\n---test_chunk_paras_as_divs---") 130 | print("Chunked doc:\n---\n" + chunked + "\n---") 131 | 132 | expected_first_chunk = dedent( 133 | """ 134 |
135 | 136 | # Title 137 | 138 | Hello World. This is an example sentence. And here's another one! 139 | 140 |
141 | """ 142 | ).strip() 143 | 144 | assert chunked.startswith(expected_first_chunk) 145 | assert chunked.endswith("") 146 | assert chunked.count("
") == 5 # Extra spurious
. 148 | -------------------------------------------------------------------------------- /tests/docs/test_token_mapping.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.text_doc import TextDoc 4 | from chopdiff.docs.token_mapping import TokenMapping 5 | from chopdiff.docs.wordtoks import wordtokenize 6 | 7 | 8 | def test_offset_mapping(): 9 | doc1 = TextDoc.from_text("This is a simple test with some words.") 10 | doc2 = TextDoc.from_text( 11 | "This is<-PARA-BR->a simple pytest adding other words.<-SENT-BR->And another sentence." 12 | ) 13 | 14 | mapping = TokenMapping(list(doc1.as_wordtoks()), list(doc2.as_wordtoks())) 15 | 16 | mapping_str = mapping.full_mapping_str() 17 | 18 | print(mapping.diff.as_diff_str(include_equal=True)) 19 | print(mapping) 20 | print(mapping.backmap) 21 | print(mapping_str) 22 | 23 | assert ( 24 | mapping_str 25 | == dedent( 26 | """ 27 | 0 ⎪This⎪ -> 0 ⎪This⎪ 28 | 1 ⎪ ⎪ -> 1 ⎪ ⎪ 29 | 2 ⎪is⎪ -> 2 ⎪is⎪ 30 | 3 ⎪<-PARA-BR->⎪ -> 3 ⎪ ⎪ 31 | 4 ⎪a⎪ -> 4 ⎪a⎪ 32 | 5 ⎪ ⎪ -> 5 ⎪ ⎪ 33 | 6 ⎪simple⎪ -> 6 ⎪simple⎪ 34 | 7 ⎪ ⎪ -> 7 ⎪ ⎪ 35 | 8 ⎪pytest⎪ -> 8 ⎪test⎪ 36 | 9 ⎪ ⎪ -> 9 ⎪ ⎪ 37 | 10 ⎪adding⎪ -> 10 ⎪with⎪ 38 | 11 ⎪ ⎪ -> 11 ⎪ ⎪ 39 | 12 ⎪other⎪ -> 12 ⎪some⎪ 40 | 13 ⎪ ⎪ -> 13 ⎪ ⎪ 41 | 14 ⎪words⎪ -> 14 ⎪words⎪ 42 | 15 ⎪.⎪ -> 15 ⎪.⎪ 43 | 16 ⎪<-SENT-BR->⎪ -> 15 ⎪.⎪ 44 | 17 ⎪And⎪ -> 15 ⎪.⎪ 45 | 18 ⎪ ⎪ -> 15 ⎪.⎪ 46 | 19 ⎪another⎪ -> 15 ⎪.⎪ 47 | 20 ⎪ ⎪ -> 15 ⎪.⎪ 48 | 21 ⎪sentence⎪ -> 15 ⎪.⎪ 49 | 22 ⎪.⎪ -> 15 ⎪.⎪ 50 | """ 51 | ).strip() 52 | ) 53 | 54 | 55 | def test_offset_mapping_longer(): 56 | doc1 = dedent( 57 | """ 58 | Alright, guys. 59 | Here's the deal. 60 | You can follow me on my daily workouts. 61 | """ 62 | ) 63 | doc2 = dedent( 64 | """ 65 | Alright, guys. Here's the deal. 66 | You can follow me on my daily workouts. 
67 | """ 68 | ) 69 | 70 | doc1_wordtoks = wordtokenize(doc1) 71 | doc2_wordtoks = list(TextDoc.from_text(doc2).as_wordtoks()) 72 | 73 | mapping = TokenMapping(doc1_wordtoks, doc2_wordtoks) 74 | 75 | mapping_str = mapping.full_mapping_str() 76 | 77 | print(mapping.diff.as_diff_str(include_equal=True)) 78 | print(mapping) 79 | print(mapping.backmap) 80 | print(mapping_str) 81 | 82 | assert ( 83 | mapping_str 84 | == dedent( 85 | """ 86 | 0 ⎪Alright⎪ -> 2 ⎪Alright⎪ 87 | 1 ⎪,⎪ -> 3 ⎪,⎪ 88 | 2 ⎪ ⎪ -> 4 ⎪ ⎪ 89 | 3 ⎪guys⎪ -> 5 ⎪guys⎪ 90 | 4 ⎪.⎪ -> 6 ⎪.⎪ 91 | 5 ⎪ ⎪ -> 8 ⎪ ⎪ 92 | 6 ⎪Here⎪ -> 10 ⎪Here⎪ 93 | 7 ⎪'⎪ -> 11 ⎪'⎪ 94 | 8 ⎪s⎪ -> 12 ⎪s⎪ 95 | 9 ⎪ ⎪ -> 13 ⎪ ⎪ 96 | 10 ⎪the⎪ -> 14 ⎪the⎪ 97 | 11 ⎪ ⎪ -> 15 ⎪ ⎪ 98 | 12 ⎪deal⎪ -> 16 ⎪deal⎪ 99 | 13 ⎪.⎪ -> 17 ⎪.⎪ 100 | 14 ⎪<-SENT-BR->⎪ -> 20 ⎪⎪ 101 | 15 ⎪You⎪ -> 21 ⎪You⎪ 102 | 16 ⎪ ⎪ -> 22 ⎪ ⎪ 103 | 17 ⎪can⎪ -> 23 ⎪can⎪ 104 | 18 ⎪ ⎪ -> 24 ⎪ ⎪ 105 | 19 ⎪follow⎪ -> 25 ⎪follow⎪ 106 | 20 ⎪ ⎪ -> 26 ⎪ ⎪ 107 | 21 ⎪me⎪ -> 27 ⎪me⎪ 108 | 22 ⎪ ⎪ -> 28 ⎪ ⎪ 109 | 23 ⎪on⎪ -> 29 ⎪on⎪ 110 | 24 ⎪ ⎪ -> 30 ⎪ ⎪ 111 | 25 ⎪my⎪ -> 31 ⎪my⎪ 112 | 26 ⎪ ⎪ -> 32 ⎪ ⎪ 113 | 27 ⎪daily⎪ -> 33 ⎪daily⎪ 114 | 28 ⎪ ⎪ -> 34 ⎪ ⎪ 115 | 29 ⎪workouts⎪ -> 35 ⎪workouts⎪ 116 | 30 ⎪.⎪ -> 36 ⎪.⎪ 117 | """ 118 | ).strip() 119 | ) 120 | -------------------------------------------------------------------------------- /examples/insert_para_breaks.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "chopdiff", 5 | # "flowmark", 6 | # "openai", 7 | # ] 8 | # /// 9 | import argparse 10 | import logging 11 | from textwrap import dedent 12 | 13 | import openai # pyright: ignore # Not a project dep. 14 | from flowmark import fill_text 15 | 16 | from chopdiff.docs import TextDoc 17 | from chopdiff.transforms import WINDOW_2K_WORDTOKS, changes_whitespace, filtered_transform 18 | 19 | logging.basicConfig(format=">> %(message)s") 20 | log = logging.getLogger(__name__) 21 | log.setLevel(logging.INFO) 22 | 23 | 24 | def heading(text: str): 25 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n" 26 | 27 | 28 | def insert_paragraph_breaks(text: str) -> str: 29 | # Create a TextDoc from the input text 30 | doc = TextDoc.from_text(text) 31 | 32 | # Handy calculations of document size in paragraphs, sentences, etc. 33 | print(f"\nInput document: {doc.size_summary()}") 34 | 35 | # Define the transformation function. 36 | # Note in this case we run the LLM on strings, but you could also work directly 37 | # on the TextDoc if appropriate. 38 | def transform(doc: TextDoc) -> TextDoc: 39 | return TextDoc.from_text(llm_insert_para_breaks(doc.reassemble())) 40 | 41 | # Apply the transformation with windowing and filtering. 42 | # 43 | # This will walk along the document in approximately 2K "wordtok" chunks 44 | # (~1000 words) and apply the transformation to each chunk. Chunks can 45 | # slightly overlap to make this more robust. 46 | # 47 | # The change on each chunk will then be filtered to only include whitespace 48 | # changes. 49 | # 50 | # Finally each change will be "stitched back" to form the original document, 51 | # by looking for the right alignment of words between the original and the 52 | # transformed chunk. 53 | # 54 | # (Turn on logging to see these details.) 
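    # For reference, a preset like WINDOW_2K_WORDTOKS can also be built by hand.
    # This is only a sketch, assuming WindowSettings and TextUnit are importable as
    # in window_settings.py; the preset's exact size/shift/min_overlap values may differ:
    #
    #   from chopdiff.docs.sizes import TextUnit
    #   from chopdiff.transforms.window_settings import WindowSettings
    #
    #   custom_windowing = WindowSettings(TextUnit.wordtoks, size=2048, shift=2048, min_overlap=8)
    #
    # Any WindowSettings value can be passed as `windowing=` below to control chunk
    # size and how far the window advances between steps.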
55 | result_doc = filtered_transform( 56 | doc, transform, windowing=WINDOW_2K_WORDTOKS, diff_filter=changes_whitespace 57 | ) 58 | 59 | print(heading("Output document")) 60 | print(f"\nOutput document: {result_doc.size_summary()}") 61 | 62 | # Return the transformed text 63 | return result_doc.reassemble() 64 | 65 | 66 | def llm_insert_para_breaks(input_text: str) -> str: 67 | """ 68 | Call OpenAI to insert paragraph breaks on a chunk of text. 69 | This works best on a smaller chunk of text and might make 70 | other non-whitespace changes. 71 | """ 72 | client: openai.OpenAI = openai.OpenAI() 73 | 74 | response = client.chat.completions.create( 75 | model="gpt-4o-mini", 76 | messages=[ 77 | {"role": "system", "content": "You are a careful and precise editor."}, 78 | { 79 | "role": "user", 80 | "content": dedent( 81 | f""" 82 | Break the following text into paragraphs. 83 | 84 | Original text: 85 | 86 | {input_text} 87 | 88 | Formatted text: 89 | """ 90 | ), 91 | }, 92 | ], 93 | temperature=0.0, 94 | ) 95 | 96 | return response.choices[0].message.content or "" 97 | 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser( 101 | description="Insert paragraph breaks in text files, making no other changes of any kind to a document." 102 | ) 103 | parser.add_argument("input_file", help="Path to the input text file") 104 | parser.add_argument("-o", "--output", help="Path to the output file (default: stdout)") 105 | args = parser.parse_args() 106 | 107 | logging.basicConfig(level=logging.INFO) 108 | 109 | with open(args.input_file, encoding="utf-8") as f: 110 | input_text = f.read() 111 | 112 | print(heading("Original")) 113 | print(fill_text(input_text)) 114 | 115 | result = insert_paragraph_breaks(input_text) 116 | 117 | print(heading("With paragraph breaks")) 118 | print(fill_text(result)) 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /tests/transforms/test_diff_filters.py: -------------------------------------------------------------------------------- 1 | from chopdiff.docs.text_doc import TextDoc 2 | from chopdiff.docs.token_diffs import DiffOp, OpType, diff_wordtoks 3 | from chopdiff.docs.wordtoks import PARA_BR_TOK, SENT_BR_TOK, is_break_or_space 4 | from chopdiff.transforms.diff_filters import ( 5 | WILDCARD_TOK, 6 | changes_whitespace, 7 | make_token_sequence_filter, 8 | no_word_lemma_changes, 9 | removes_word_lemmas, 10 | removes_words, 11 | ) 12 | 13 | 14 | def test_filter_br_and_space(): 15 | from ..docs.test_token_diffs import _short_text1, _short_text2, _short_text3 16 | 17 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 18 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 19 | wordtoks3 = list(TextDoc.from_text(_short_text3).as_wordtoks()) 20 | 21 | diff = diff_wordtoks(wordtoks1, wordtoks2) 22 | 23 | accepted, rejected = diff.filter(changes_whitespace) 24 | 25 | accepted_result = accepted.apply_to(wordtoks1) 26 | rejected_result = rejected.apply_to(wordtoks1) 27 | 28 | print("---Filtered diff:") 29 | print("Original: " + "/".join(wordtoks1)) 30 | print("Full diff:", diff) 31 | print("Accepted diff:", accepted) 32 | print("Rejected diff:", rejected) 33 | print("Accepted result: " + "/".join(accepted_result)) 34 | print("Rejected result: " + "/".join(rejected_result)) 35 | 36 | assert accepted_result == wordtoks3 37 | 38 | 39 | def test_token_sequence_filter_with_predicate(): 40 | insert_op = DiffOp(OpType.INSERT, [], [SENT_BR_TOK, "

", "Title", "

", PARA_BR_TOK]) 41 | delete_op = DiffOp(OpType.DELETE, [SENT_BR_TOK, "

", "Old Title", "

", PARA_BR_TOK], []) 42 | replace_op = DiffOp(OpType.REPLACE, ["Some", "text"], ["New", "text"]) 43 | equal_op = DiffOp(OpType.EQUAL, ["Unchanged"], ["Unchanged"]) 44 | 45 | action = OpType.INSERT 46 | filter_fn = make_token_sequence_filter( 47 | [is_break_or_space, "

", WILDCARD_TOK, "

", is_break_or_space], action 48 | ) 49 | 50 | assert filter_fn(insert_op) 51 | assert not filter_fn(delete_op) # action is INSERT 52 | assert not filter_fn(replace_op) 53 | assert not filter_fn(equal_op) 54 | 55 | ignore_whitespace_filter_fn = make_token_sequence_filter( 56 | ["

", WILDCARD_TOK, "

"], 57 | action=OpType.INSERT, 58 | ignore=is_break_or_space, 59 | ) 60 | 61 | insert_op_with_whitespace = DiffOp( 62 | OpType.INSERT, 63 | [], 64 | [" ", SENT_BR_TOK, " ", "

", "Title", "

", " ", PARA_BR_TOK, " "], 65 | ) 66 | 67 | assert ignore_whitespace_filter_fn(insert_op_with_whitespace) 68 | assert not ignore_whitespace_filter_fn(delete_op) # action is INSERT 69 | assert not ignore_whitespace_filter_fn(replace_op) 70 | assert not ignore_whitespace_filter_fn(equal_op) 71 | 72 | 73 | def test_no_word_changes_lemmatized(): 74 | assert not no_word_lemma_changes(DiffOp(OpType.INSERT, [], ["the"])) 75 | assert not no_word_lemma_changes(DiffOp(OpType.DELETE, ["the"], [])) 76 | assert not no_word_lemma_changes( 77 | DiffOp( 78 | OpType.REPLACE, 79 | ["The", "dogs", "were", "running", "fast"], 80 | ["The", "dog", "was", "running"], 81 | ) 82 | ) 83 | assert no_word_lemma_changes( 84 | DiffOp( 85 | OpType.REPLACE, 86 | ["The", "dogs", "were", "running"], 87 | ["The", "dog", "was", "running"], 88 | ) 89 | ) 90 | 91 | 92 | def test_removes_words(): 93 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", " "], [])) 94 | assert removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["world"])) 95 | assert not removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"])) 96 | assert removes_word_lemmas(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"])) 97 | 98 | assert not removes_words( 99 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"]) 100 | ) 101 | assert removes_word_lemmas( 102 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"]) 103 | ) 104 | 105 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", "world"], [])) 106 | assert removes_word_lemmas(DiffOp(OpType.DELETE, ["Hello", "world"], [])) 107 | -------------------------------------------------------------------------------- /src/chopdiff/divs/parse_divs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | 4 | from chopdiff.divs.text_node import TextNode 5 | 6 | DIV_TAGS = re.compile(r"(]*>|)", re.IGNORECASE) 7 | 8 | CLASS_NAME_PATTERN = re.compile(r"\bclass=\"([^\"]+)\"", re.IGNORECASE) 9 | 10 | 11 | def parse_divs(text: str, skip_whitespace: bool = True) -> TextNode: 12 | """ 13 | Parse a string recursively into `TextNode`s based on `
` tags. 14 | 15 | All offsets are relative to the original text. Text outside of a div tag is 16 | included as a `TextNode` with None markers. 17 | 18 | We do our own parsing to keep this simple and exactly preserve formatting. 19 | """ 20 | parsed = _parse_divs_recursive( 21 | text, 22 | 0, 23 | TextNode(original_text=text, offset=0, content_start=0, content_end=len(text)), 24 | ) 25 | 26 | if skip_whitespace: 27 | parsed = _skip_whitespace_nodes(parsed) 28 | 29 | return parsed 30 | 31 | 32 | def parse_divs_single(text: str, skip_whitespace: bool = True) -> TextNode: 33 | """ 34 | Same as parse_divs but unwraps any singleton child. 35 | """ 36 | divs = parse_divs(text, skip_whitespace=skip_whitespace) 37 | if len(divs.children) == 1: 38 | return divs.children[0] 39 | else: 40 | return divs 41 | 42 | 43 | def _skip_whitespace_nodes(node: TextNode) -> TextNode: 44 | filtered_node = copy.copy(node) 45 | filtered_node.children = [ 46 | _skip_whitespace_nodes(child) for child in node.children if not child.is_whitespace() 47 | ] 48 | return filtered_node 49 | 50 | 51 | def _parse_divs_recursive( 52 | text: str, 53 | start_offset: int, 54 | result: TextNode, 55 | ) -> TextNode: 56 | current_offset = start_offset 57 | 58 | while current_offset < len(text): 59 | match = DIV_TAGS.search(text, current_offset) 60 | 61 | if not match: 62 | # No more div tags, add remaining content as a child node 63 | if current_offset < len(text): 64 | result.children.append( 65 | TextNode( 66 | original_text=text, 67 | offset=current_offset, 68 | content_start=current_offset, 69 | content_end=len(text), 70 | ) 71 | ) 72 | break 73 | 74 | if match.start() > current_offset: 75 | # Add content before the div tag as a child node. 76 | result.children.append( 77 | TextNode( 78 | original_text=text, 79 | offset=current_offset, 80 | content_start=current_offset, 81 | content_end=match.start(), 82 | ) 83 | ) 84 | 85 | tag = match.group(1) 86 | is_end_tag = tag.startswith(" list[TextNode]: 119 | """ 120 | Parse div chunks into TextNodes. 121 | """ 122 | 123 | text_node = parse_divs(text) 124 | 125 | matched_divs = text_node.children_by_class_names(class_name, recursive=True) 126 | 127 | if not matched_divs: 128 | raise ValueError(f"No `{class_name}` divs found in text.") 129 | 130 | return matched_divs 131 | -------------------------------------------------------------------------------- /tests/docs/test_wordtoks.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.search_tokens import search_tokens 4 | from chopdiff.docs.wordtoks import ( 5 | Tag, 6 | _insert_para_wordtoks, 7 | is_entity, 8 | is_tag, 9 | is_tag_close, 10 | is_tag_open, 11 | parse_tag, 12 | visualize_wordtoks, 13 | wordtokenize, 14 | ) 15 | 16 | _test_doc = dedent( 17 | """ 18 | Hello, world! 19 | This is an "example sentence with punctuation. 20 | "Special characters: @#%^&*()" 21 | Alright, guys. 22 | 23 | Here's the deal. 24 | You can follow me on my daily workouts. 
00:10 27 | """ 28 | ).strip() 29 | 30 | 31 | def test_html_doc(): 32 | wordtoks = wordtokenize(_test_doc, bof_eof=True) 33 | 34 | print("\n---Wordtoks test:") 35 | print(visualize_wordtoks(wordtoks)) 36 | 37 | print("\n---Wordtoks with para br:") 38 | wordtoks_with_para = wordtokenize(_insert_para_wordtoks(_test_doc), bof_eof=True) 39 | print(visualize_wordtoks(wordtoks_with_para)) 40 | 41 | assert ( 42 | visualize_wordtoks(wordtoks) 43 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪⎪ ⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪00⎪:⎪10⎪⎪<-EOF->⎪""" 44 | ) 45 | 46 | assert ( 47 | visualize_wordtoks(wordtoks_with_para) 48 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪⎪<-PARA-BR->⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪00⎪:⎪10⎪⎪<-EOF->⎪""" 49 | ) 50 | 51 | print("\n---Searching tokens") 52 | 53 | print(search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token()) 54 | print(search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token()) 55 | print(search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward(is_tag).get_token()) 56 | 57 | assert search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token() == ( 58 | 14, 59 | "example", 60 | ) 61 | assert search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token() == ( 62 | 63, 63 | "follow", 64 | ) 65 | assert search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward( 66 | is_tag 67 | ).get_token() == (39, '') 68 | 69 | 70 | def test_tag_functions(): 71 | assert parse_tag("
") == Tag(name="div", is_open=True, is_close=False, attrs={}) 72 | assert parse_tag("
") == Tag(name="div", is_open=False, is_close=True, attrs={}) 73 | assert parse_tag("
") == Tag(name="div", is_open=True, is_close=True, attrs={}) 74 | assert parse_tag("") == Tag( 75 | name="", is_open=False, is_close=False, attrs={}, comment=" Comment " 76 | ) 77 | 78 | assert not is_tag("foo") 79 | assert not is_tag("") 81 | assert is_tag("
") 82 | assert is_tag("") 83 | assert is_tag("
", ["div"]) 84 | assert not is_tag("
", ["span"]) 85 | assert is_tag("
") 86 | 87 | assert is_tag_close("
") 88 | assert not is_tag_close("
") 89 | assert is_tag_close("
", ["div"]) 90 | assert not is_tag_close("
", ["span"]) 91 | assert is_tag_close("
") 92 | assert is_tag_open("
") 93 | assert not is_tag_open("
") 94 | assert is_tag_open("
", ["div"]) 95 | assert not is_tag_open("
", ["span"]) 96 | 97 | assert is_entity("&") 98 | assert not is_entity("nbsp;") 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Makefile 2 | CLAUDE.md 3 | AGENTS.md 4 | 5 | # Additions to standard GitHub .gitignore: 6 | *.bak 7 | *.orig 8 | tmp/ 9 | trash/ 10 | attic/ 11 | .kash/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # UV 110 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | #uv.lock 114 | 115 | # poetry 116 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 117 | # This is especially recommended for binary packages to ensure reproducibility, and is more 118 | # commonly ignored for libraries. 119 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 120 | #poetry.lock 121 | 122 | # pdm 123 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 124 | #pdm.lock 125 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 126 | # in version control. 127 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 128 | .pdm.toml 129 | .pdm-python 130 | .pdm-build/ 131 | 132 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 133 | __pypackages__/ 134 | 135 | # Celery stuff 136 | celerybeat-schedule 137 | celerybeat.pid 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Environments 143 | .env 144 | .venv 145 | env/ 146 | venv/ 147 | ENV/ 148 | env.bak/ 149 | venv.bak/ 150 | 151 | # Spyder project settings 152 | .spyderproject 153 | .spyproject 154 | 155 | # Rope project settings 156 | .ropeproject 157 | 158 | # mkdocs documentation 159 | /site 160 | 161 | # mypy 162 | .mypy_cache/ 163 | .dmypy.json 164 | dmypy.json 165 | 166 | # Pyre type checker 167 | .pyre/ 168 | 169 | # pytype static type analyzer 170 | .pytype/ 171 | 172 | # Cython debug symbols 173 | cython_debug/ 174 | 175 | # PyCharm 176 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 177 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 178 | # and can be added to the global gitignore or merged into this file. For a more nuclear 179 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 180 | #.idea/ 181 | 182 | # PyPI configuration file 183 | .pypirc 184 | -------------------------------------------------------------------------------- /tests/docs/test_token_diffs.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.text_doc import SentIndex, TextDoc 4 | from chopdiff.docs.token_diffs import DiffStats, diff_wordtoks, find_best_alignment 5 | 6 | _short_text1 = dedent( 7 | """ 8 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 9 | 10 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c. 11 | 12 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c. 13 | """ 14 | ).strip() 15 | 16 | 17 | _short_text2 = dedent( 18 | """ 19 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 20 | Paragraph two blah. Sentence 2a. Sentence 2b. Sentence 2c. 21 | 22 | Paragraph three! Sentence 3a. Sentence 3b. 23 | """ 24 | ).strip() 25 | 26 | # _short_text3 contains all the whitespace and break-only changes from _short_text1 to _short_text2. 27 | _short_text3 = dedent( 28 | """ 29 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 30 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c. 31 | 32 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c. 33 | """ 34 | ).strip() 35 | 36 | 37 | def test_lcs_diff_wordtoks(): 38 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 39 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 40 | 41 | diff = diff_wordtoks(wordtoks1, wordtoks2) 42 | 43 | print("---Diff:") 44 | print(diff.as_diff_str(True)) 45 | 46 | print("---Diff stats:") 47 | print(diff.stats()) 48 | assert diff.stats() == DiffStats(added=5, removed=8, input_size=59) 49 | 50 | expected_diff = dedent( 51 | """ 52 | TextDiff: add/remove +5/-8 out of 59 total: 53 | at pos 0 keep 19 toks: ⎪Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.⎪ 54 | at pos 19 repl 1 toks: - ⎪<-PARA-BR->⎪ 55 | repl 1 toks: + ⎪ ⎪ 56 | at pos 20 keep 3 toks: ⎪Paragraph two⎪ 57 | at pos 23 add 2 toks: + ⎪ blah⎪ 58 | at pos 23 keep 1 toks: ⎪.⎪ 59 | at pos 24 repl 1 toks: - ⎪ ⎪ 60 | repl 1 toks: + ⎪<-SENT-BR->⎪ 61 | at pos 25 keep 18 toks: ⎪Sentence 2a. Sentence 2b. Sentence 2c.<-PARA-BR->Paragraph three⎪ 62 | at pos 43 repl 1 toks: - ⎪.⎪ 63 | repl 1 toks: + ⎪!⎪ 64 | at pos 44 keep 10 toks: ⎪<-SENT-BR->Sentence 3a. 
Sentence 3b.⎪ 65 | at pos 54 del 5 toks: - ⎪ Sentence 3c.⎪ 66 | """ 67 | ).strip() 68 | 69 | assert str(diff.as_diff_str(True)) == expected_diff 70 | 71 | 72 | def test_apply_to(): 73 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 74 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 75 | 76 | diff = diff_wordtoks(wordtoks1, wordtoks2) 77 | 78 | print("---Before apply:") 79 | print("/".join(wordtoks1)) 80 | print(diff) 81 | result = diff.apply_to(wordtoks1) 82 | print("---Result of apply:") 83 | print("/".join(result)) 84 | print("---Expected:") 85 | print("/".join(wordtoks2)) 86 | assert result == wordtoks2 87 | 88 | wordtoks3 = ["a", "b", "c", "d", "e"] 89 | wordtoks4 = ["a", "x", "c", "y", "e"] 90 | diff2 = diff_wordtoks(wordtoks3, wordtoks4) 91 | result2 = diff2.apply_to(wordtoks3) 92 | assert result2 == wordtoks4 93 | 94 | 95 | def test_find_best_alignment(): 96 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 97 | wordtoks2 = list(TextDoc.from_text(_short_text1).sub_doc(SentIndex(1, 1)).as_wordtoks()) 98 | wordtoks3 = wordtoks2 + ["Extra", "wordtoks", "at", "the", "end"] 99 | wordtoks4 = list(wordtoks3) 100 | wordtoks4[0] = "X" 101 | wordtoks4[3] = "Y" 102 | 103 | print("---Alignment:") 104 | print("/".join(wordtoks1)) 105 | print("/".join(wordtoks2)) 106 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks2, 1) 107 | print(f"Offset: {offset}, Score: {score}") 108 | print(diff) 109 | print() 110 | assert offset == 39 111 | assert score == 0.0 112 | assert diff.changes() == [] 113 | 114 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks3, 3) 115 | print(f"Offset: {offset}, Score: {score}") 116 | print(diff) 117 | print() 118 | assert offset == 39 119 | assert score == 0.0 120 | assert diff.changes() == [] 121 | 122 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks4, 3) 123 | print(f"Offset: {offset}, Score: {score}") 124 | print(diff) 125 | print() 126 | assert offset == 39 127 | assert score > 0 and score < 0.3 128 | assert diff.stats().nchanges() == 4 129 | -------------------------------------------------------------------------------- /tests/divs/test_parse_divs.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class 4 | from chopdiff.divs.text_node import TextNode 5 | 6 | _test_text = dedent( 7 | """ 8 | 9 |
10 | Outer content paragraph 1. 11 | 12 | Outer content paragraph 2. 13 |
14 | Inner content. 15 |
16 | Nested content. 17 |
18 | 19 |
20 | 21 | Nested inner content. 22 |
23 | Deeply nested content. 24 |
25 |
26 | 27 | 28 |
29 | Outer content paragraph 3. 30 |
31 | """ 32 | ) 33 | 34 | 35 | def _strip_lines(text: str) -> list[str]: 36 | return [line.strip() for line in text.strip().split("\n")] 37 | 38 | 39 | def test_parse_divs(): 40 | def validate_node(node: TextNode, original_text: str): 41 | assert node.original_text == original_text 42 | assert 0 <= node.content_start <= len(original_text) 43 | assert 0 <= node.content_end <= len(original_text) 44 | assert node.content_start <= node.content_end 45 | assert node.contents == original_text[node.content_start : node.content_end] 46 | assert ( 47 | node.begin_marker is None 48 | or original_text[node.offset : node.offset + len(node.begin_marker)] 49 | == node.begin_marker 50 | ) 51 | assert ( 52 | node.end_marker is None 53 | or original_text[node.content_end : node.content_end + len(node.end_marker)] 54 | == node.end_marker 55 | ) 56 | 57 | for child in node.children: 58 | validate_node(child, original_text) 59 | 60 | node = parse_divs(_test_text, skip_whitespace=False) 61 | 62 | node_no_whitespace = parse_divs(_test_text, skip_whitespace=True) 63 | 64 | reassembled = node.reassemble(padding="") 65 | 66 | print() 67 | print(f"Original text (length {len(_test_text)}):") 68 | print(_test_text) 69 | 70 | print() 71 | print("Parsed text:") 72 | print(node) 73 | 74 | print() 75 | print("Parsed text (no whitespace):") 76 | print(node_no_whitespace) 77 | 78 | print() 79 | print(f"Reassembled text (length {len(reassembled)}):") 80 | print(reassembled) 81 | 82 | print() 83 | print("Reassembled text (normalized padding):") 84 | print(node.reassemble()) 85 | 86 | validate_node(node, _test_text) 87 | 88 | assert reassembled.count("Chunk1
96 |
Chunk2
97 |
Chunk3
98 | """ 99 | 100 | node = parse_divs(doc) 101 | summary_str = node.structure_summary_str() or "" 102 | 103 | print() 104 | print("Structure summary:") 105 | print(summary_str) 106 | 107 | expected_summary = dedent( 108 | """ 109 | HTML structure: 110 | 3 div.chunk 111 | """ 112 | ).strip() 113 | 114 | assert _strip_lines(summary_str) == _strip_lines(expected_summary) 115 | 116 | 117 | def test_structure_summary_str_2(): 118 | node = parse_divs(_test_text) 119 | summary_str = node.structure_summary_str() or "" 120 | 121 | print() 122 | print("Structure summary:") 123 | print(summary_str) 124 | 125 | expected_summary = dedent( 126 | """ 127 | HTML structure: 128 | 1 div.outer 129 | 1 div.outer > div.inner 130 | 1 div.outer > div.inner > div 131 | 1 div.outer > div.inner > div.nested-inner 132 | 1 div.outer > div.inner > div.nested-inner > div 133 | """ 134 | ).strip() 135 | 136 | assert _strip_lines(summary_str) == _strip_lines(expected_summary) 137 | 138 | 139 | def test_parse_chunk_divs(): 140 | text = dedent( 141 | """ 142 |
143 | 144 | Chunk 1 text. 145 | 146 |
147 | 148 |
149 | 150 | Chunk 2 text. 151 | 152 |
153 | 154 |
Empty chunk.
155 | 156 | """ 157 | ) 158 | 159 | chunk_divs = parse_divs_by_class(text, "chunk") 160 | 161 | print("\n---test_parse_chunk_divs---") 162 | for chunk_div in chunk_divs: 163 | print(chunk_div.reassemble()) 164 | print("---") 165 | 166 | assert chunk_divs[0].reassemble() == """
\n\nChunk 1 text.\n\n
""" 167 | assert chunk_divs[0].contents.strip() == "Chunk 1 text." 168 | assert len(chunk_divs) == 3 169 | -------------------------------------------------------------------------------- /examples/backfill_timestamps.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "chopdiff", 5 | # "flowmark", 6 | # ] 7 | # /// 8 | import logging 9 | from textwrap import dedent 10 | 11 | from chopdiff.docs import BOF_TOK, EOF_TOK, PARA_BR_TOK, TextDoc, TokenMapping, search_tokens 12 | from chopdiff.html import ContentNotFound, TimestampExtractor 13 | 14 | logging.basicConfig(format=">> %(message)s") 15 | log = logging.getLogger(__name__) 16 | log.setLevel(logging.INFO) 17 | 18 | 19 | def format_timestamp(timestamp: float) -> str: 20 | hours, remainder = divmod(timestamp, 3600) 21 | minutes, seconds = divmod(remainder, 60) 22 | if hours: 23 | return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}" 24 | else: 25 | return f"{int(minutes):02}:{int(seconds):02}" 26 | 27 | 28 | def add_timestamp(text: str, timestamp: float) -> str: 29 | return f'{text} ⏱️{format_timestamp(timestamp)} ' 30 | 31 | 32 | def heading(text: str): 33 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n" 34 | 35 | 36 | def backfill_timestamps(target_text: str, source_text: str) -> str: 37 | """ 38 | Backfill timestamps from a source document into a target document. 39 | The source document should have timestamps in ``s with a `data-timestamp` attribute. 40 | The target document should have mostly similar text but no timestamps. 41 | """ 42 | 43 | print(heading("Source text (with timestamps)")) 44 | print(source_text) 45 | 46 | print(heading("Target text (without timestamps)")) 47 | print(target_text) 48 | 49 | # Parse the target document into wordtoks. 50 | target_doc = TextDoc.from_text(target_text) 51 | extractor = TimestampExtractor(source_text) 52 | source_wordtoks = extractor.wordtoks 53 | 54 | # Create a mapping between source and target docs. 55 | target_wordtoks = list(target_doc.as_wordtoks(bof_eof=True)) 56 | token_mapping = TokenMapping(source_wordtoks, target_wordtoks) 57 | 58 | print(heading("Diff")) 59 | print(token_mapping.diff.as_diff_str()) 60 | 61 | print(heading("Token mapping")) 62 | print(token_mapping.full_mapping_str()) 63 | 64 | for wordtok_offset, (wordtok, sent_index) in enumerate( 65 | target_doc.as_wordtok_to_sent(bof_eof=True) 66 | ): 67 | # Look for each end of paragraph or end of doc. 68 | if wordtok in [PARA_BR_TOK, EOF_TOK]: 69 | # Find the start of the paragraph. 
70 | start_para_index, start_para_wordtok = ( 71 | search_tokens(target_wordtoks) 72 | .at(wordtok_offset) 73 | .seek_back([BOF_TOK, PARA_BR_TOK]) 74 | .next() 75 | .get_token() 76 | ) 77 | 78 | wordtok_offset = start_para_index 79 | 80 | source_wordtok_offset = token_mapping.map_back(wordtok_offset) 81 | 82 | log.info( 83 | "Seeking back tok %s (%s) to para start tok %s (%s), map back to source tok %s (%s)", 84 | wordtok_offset, 85 | wordtok, 86 | start_para_index, 87 | start_para_wordtok, 88 | source_wordtok_offset, 89 | source_wordtoks[source_wordtok_offset], 90 | ) 91 | 92 | try: 93 | timestamp, _index, _offset = extractor.extract_preceding(source_wordtok_offset) 94 | sent = target_doc.get_sent(sent_index) 95 | 96 | if sent.is_markup(): 97 | log.info("Skipping markup-only sentence: %s", sent.text) 98 | continue 99 | 100 | log.info("Adding timestamp to sentence: %s", sent) 101 | 102 | sent.text = add_timestamp(sent.text, timestamp) 103 | 104 | except ContentNotFound: 105 | # Missing timestamps shouldn't be fatal. 106 | log.warning( 107 | "Failed to extract timestamp at doc token %s (%s) -> source token %s (%s): %s", 108 | wordtok_offset, 109 | wordtok, 110 | source_wordtok_offset, 111 | source_wordtoks[source_wordtok_offset], 112 | sent_index, 113 | ) 114 | 115 | result = target_doc.reassemble() 116 | 117 | print(heading("Result (with backfilled timestamps)")) 118 | print(result) 119 | 120 | return result 121 | 122 | 123 | def main(): 124 | # Example source text with timestamps: 125 | source_text = dedent( 126 | """ 127 | Welcome to this um ... video about Python programming. 128 | First, we'll talk about variables. Variables are containers for storing data values. 129 | Then let's look at functions. Functions help us organize and reuse code. 130 | """ 131 | ) 132 | 133 | # Example target text (similar content but edited, with no timestamps): 134 | target_text = dedent( 135 | """ 136 | ## Introduction 137 | 138 | Welcome to this video about Python programming. 139 | 140 | First, we'll talk about variables. Next, let's look at functions. Functions help us organize and reuse code. 141 | """ 142 | ) 143 | 144 | backfill_timestamps(target_text, source_text) 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/diff_filters.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import TypeAlias 3 | 4 | from typing_extensions import override 5 | 6 | from chopdiff.docs.token_diffs import DiffFilter, DiffOp, OpType 7 | from chopdiff.docs.wordtoks import ( 8 | is_break_or_space, 9 | is_tag_close, 10 | is_tag_open, 11 | is_whitespace_or_punct, 12 | is_word, 13 | ) 14 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal 15 | 16 | 17 | class WildcardToken: 18 | """ 19 | Wildcard token that matches any number of tokens (including zero). 
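    For example, a pattern like [is_tag_open, WILDCARD_TOK, is_tag_close] given to
    make_token_sequence_filter matches an opening tag, any run of tokens in between,
    and a closing tag (adds_headings below uses a pattern of this shape, restricted
    to heading tags).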
20 | """ 21 | 22 | @override 23 | def __str__(self): 24 | return "*" 25 | 26 | 27 | WILDCARD_TOK = WildcardToken() 28 | 29 | TokenMatcher: TypeAlias = list[str] | Callable[[str], bool] 30 | 31 | TokenPattern: TypeAlias = str | Callable[[str], bool] | WildcardToken 32 | 33 | 34 | def _matches_pattern(tokens: list[str], pattern: list[TokenPattern]) -> bool: 35 | def match_from(i: int, j: int) -> bool: 36 | while i <= len(tokens) and j < len(pattern): 37 | pattern_elem = pattern[j] 38 | if pattern_elem == WILDCARD_TOK: 39 | # If '*' is the last pattern element, it matches any remaining tokens. 40 | if j + 1 == len(pattern): 41 | return True 42 | # Advance pattern index to next pattern after ANY_TOKEN. 43 | j += 1 44 | while i < len(tokens): 45 | if match_from(i, j): 46 | return True 47 | i += 1 48 | return False 49 | else: 50 | if i >= len(tokens): 51 | return False 52 | token = tokens[i] 53 | if isinstance(pattern_elem, str): 54 | if token != pattern_elem: 55 | return False 56 | elif callable(pattern_elem): 57 | if not pattern_elem(token): 58 | return False 59 | else: 60 | return False 61 | i += 1 62 | j += 1 63 | # Skip any remaining ANY_TOKEN in the pattern. 64 | while j < len(pattern) and pattern[j] == WILDCARD_TOK: 65 | j += 1 66 | # The tokens match the pattern if both indices are at the end. 67 | return i == len(tokens) and j == len(pattern) 68 | 69 | return match_from(0, 0) 70 | 71 | 72 | def make_token_sequence_filter( 73 | pattern: list[TokenPattern], 74 | action: OpType | None = None, 75 | ignore: TokenMatcher | None = None, 76 | ) -> DiffFilter: 77 | """ 78 | Returns a `DiffFilter` that accepts `DiffOps` where the tokens match the given pattern. 79 | The pattern is a list where each element can be a string or a predicate function that 80 | takes a token and returns a bool (True if the token matches). 81 | The '*' in the pattern list matches any number of tokens (including zero). 82 | If `action` is specified, only `DiffOps` with that action are considered. 83 | """ 84 | 85 | def filter_fn(diff_op: DiffOp) -> bool: 86 | if action and diff_op.action != action: 87 | return False 88 | 89 | tokens = diff_op.all_changed() 90 | if ignore and isinstance(ignore, str): 91 | tokens = [tok for tok in tokens if tok not in ignore] 92 | elif ignore and callable(ignore): 93 | tokens = [tok for tok in tokens if not ignore(tok)] 94 | 95 | return _matches_pattern(tokens, pattern) 96 | 97 | return filter_fn 98 | 99 | 100 | def changes_whitespace(diff_op: DiffOp) -> bool: 101 | """ 102 | Only accepts changes to sentence and paragraph breaks and whitespace. 103 | """ 104 | 105 | return all(is_break_or_space(tok) for tok in diff_op.all_changed()) 106 | 107 | 108 | def changes_whitespace_or_punct(diff_op: DiffOp) -> bool: 109 | """ 110 | Only accepts changes to punctuation and whitespace. 111 | """ 112 | 113 | return all(is_whitespace_or_punct(tok) for tok in diff_op.all_changed()) 114 | 115 | 116 | def no_word_lemma_changes(diff_op: DiffOp) -> bool: 117 | """ 118 | Only accept changes that preserve the lemmatized form of words. 119 | """ 120 | if diff_op.action == OpType.EQUAL: 121 | return True 122 | elif diff_op.action == OpType.REPLACE: 123 | return lemmatized_equal( 124 | " ".join(tok for tok in diff_op.left if is_word(tok)), 125 | " ".join(tok for tok in diff_op.right if is_word(tok)), 126 | ) 127 | else: 128 | return len([tok for tok in diff_op.all_changed() if is_word(tok)]) == 0 129 | 130 | 131 | def removes_words(diff_op: DiffOp) -> bool: 132 | """ 133 | Only accept changes that remove words. 
Changes to spaces and punctuation are allowed. 134 | """ 135 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL: 136 | return True 137 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT: 138 | return all(is_whitespace_or_punct(tok) for tok in set(diff_op.right) - set(diff_op.left)) 139 | else: 140 | return False 141 | 142 | 143 | def removes_word_lemmas(diff_op: DiffOp) -> bool: 144 | """ 145 | Only accept changes that remove words or replace them with their lemmatized forms. 146 | Changes to spaces and punctuation are allowed. 147 | """ 148 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL: 149 | return True 150 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT: 151 | left_words = [tok for tok in diff_op.left if is_word(tok)] 152 | right_words = [tok for tok in diff_op.right if is_word(tok)] 153 | 154 | left_lemmas = [lemmatize(word) for word in left_words] 155 | right_lemmas = [lemmatize(word) for word in right_words] 156 | 157 | return set(right_lemmas).issubset(set(left_lemmas)) 158 | else: 159 | return False 160 | 161 | 162 | def adds_headings(diff_op: DiffOp) -> bool: 163 | """ 164 | Only accept changes that add contents within header tags. 165 | """ 166 | headers = ["h1", "h2", "h3", "h4", "h5", "h6"] 167 | is_header = lambda tok: is_tag_open(tok, tag_names=headers) # pyright: ignore 168 | is_header_close = lambda tok: is_tag_close(tok, tag_names=headers) # pyright: ignore 169 | matcher = make_token_sequence_filter( 170 | [is_header, WILDCARD_TOK, is_header_close], 171 | action=OpType.INSERT, 172 | ignore=is_break_or_space, 173 | ) 174 | return matcher(diff_op) 175 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # ---- Project Info and Dependencies ---- 2 | 3 | [project.urls] 4 | Repository = "https://github.com/jlevy/chopdiff" 5 | # Homepage = "https://..." 6 | # Documentation = "https://..." 
7 | 8 | [project] 9 | name = "chopdiff" 10 | description = "Chunking, diff filtering, and windowed transforms of text to support LLM applications" 11 | authors = [ 12 | { name="Joshua Levy", email="joshua@cal.berkeley.edu" }, 13 | ] 14 | readme = "README.md" 15 | license = "MIT" 16 | requires-python = ">=3.11,<4.0" 17 | dynamic = ["version"] 18 | 19 | # https://pypi.org/classifiers/ 20 | # Adjust as needed: 21 | classifiers = [ 22 | # Adjust as needed: 23 | "Development Status :: 4 - Beta", 24 | # "Development Status :: 5 - Production/Stable", 25 | "Intended Audience :: Developers", 26 | "Operating System :: OS Independent", 27 | "Programming Language :: Python", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | "Typing :: Typed", 33 | # Include this to avoid accidentally publishing to PyPI: 34 | # "Private :: Do Not Upload", 35 | ] 36 | 37 | 38 | # ---- Main dependencies ---- 39 | 40 | dependencies = [ 41 | "prettyfmt>=0.3.0", 42 | "flowmark>=0.5.3", 43 | "strif>=2.1.0", 44 | "funlog>=0.2.1", 45 | "cydifflib>=1.2.0", 46 | "tiktoken>=0.9.0", 47 | "regex>=2024.11.6", 48 | "selectolax>=0.3.32", 49 | ] 50 | 51 | [project.optional-dependencies] 52 | extras = [ 53 | "simplemma>=1.1.2", 54 | ] 55 | 56 | [dependency-groups] 57 | dev = [ 58 | "pytest>=8.3.5", 59 | "pytest-sugar>=1.0.0", 60 | "ruff>=0.11.9", 61 | "codespell>=2.4.1", 62 | "rich>=14.0.0", 63 | "basedpyright==1.29.5", # TODO: Upgrade when Cursor supports it. 64 | "funlog>=0.2.1", 65 | ] 66 | 67 | [project.scripts] 68 | # Add script entry points here: 69 | chopdiff = "chopdiff:main" 70 | 71 | 72 | # ---- Build system ---- 73 | 74 | # Dynamic versioning from: 75 | # https://github.com/ninoseki/uv-dynamic-versioning/ 76 | 77 | [build-system] 78 | requires = ["hatchling", "uv-dynamic-versioning"] 79 | build-backend = "hatchling.build" 80 | 81 | [tool.hatch.version] 82 | source = "uv-dynamic-versioning" 83 | # Note JSON schemas don't seem to be right for tool.hatch.version.source so 84 | # this may cause false warnings in IDEs. 85 | # https://github.com/ninoseki/uv-dynamic-versioning/issues/21 86 | 87 | [tool.uv-dynamic-versioning] 88 | vcs = "git" 89 | style = "pep440" 90 | bump = true 91 | 92 | [tool.hatch.build.targets.wheel] 93 | # The source location for the package. 94 | packages = ["src/chopdiff"] 95 | 96 | 97 | # ---- Settings ---- 98 | 99 | [tool.ruff] 100 | # Set as desired, typically 88 (black standard) or 100 (wide). 101 | line-length = 100 102 | 103 | [tool.ruff.lint] 104 | select = [ 105 | # See: https://docs.astral.sh/ruff/rules/ 106 | # Basic list from: https://docs.astral.sh/ruff/linter/#rule-selection 107 | "E", # https://docs.astral.sh/ruff/rules/#error-e 108 | "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f 109 | "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up 110 | "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b 111 | "I", # https://docs.astral.sh/ruff/rules/#isort-i 112 | # Other possibilities: 113 | # "D" # https://docs.astral.sh/ruff/rules/#pydocstyle-d 114 | # "Q" # https://docs.astral.sh/ruff/rules/#flake8-quotes-q 115 | # "COM" # https://docs.astral.sh/ruff/rules/#flake8-commas-com 116 | # "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim 117 | 118 | ] 119 | ignore = [ 120 | # Disable some rules that are overly pedantic. 
Add/remove as desired: 121 | "E501", # https://docs.astral.sh/ruff/rules/line-too-long/ 122 | "E402", # https://docs.astral.sh/ruff/rules/module-import-not-at-top-of-file/ 123 | "E731", # https://docs.astral.sh/ruff/rules/lambda-assignment/ 124 | "B904", 125 | # We use both ruff formatter and linter so some rules should always be disabled. 126 | # See: https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules 127 | "W191", # https://docs.astral.sh/ruff/rules/tab-indentation/ 128 | "E111", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple/ 129 | "E114", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple-comment/ 130 | "E117", # https://docs.astral.sh/ruff/rules/over-indented/ 131 | "D206", # https://docs.astral.sh/ruff/rules/docstring-tab-indentation/ 132 | "D300", # https://docs.astral.sh/ruff/rules/triple-single-quotes/ 133 | "Q000", # https://docs.astral.sh/ruff/rules/bad-quotes-inline-string/ 134 | "Q001", # https://docs.astral.sh/ruff/rules/bad-quotes-multiline-string/ 135 | "Q002", # https://docs.astral.sh/ruff/rules/bad-quotes-docstring/ 136 | "Q003", # https://docs.astral.sh/ruff/rules/avoidable-escaped-quote/ 137 | "COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/ 138 | "COM819", # https://docs.astral.sh/ruff/rules/prohibited-trailing-comma/ 139 | "ISC002", # https://docs.astral.sh/ruff/rules/multi-line-implicit-string-concatenation/ 140 | ] 141 | 142 | [tool.basedpyright] 143 | # BasedPyright currently seems like the best type checker option, much faster 144 | # than mypy and with a good extension for VSCode/Cursor. 145 | # https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright 146 | # https://docs.basedpyright.com/latest/configuration/config-files/#sample-pyprojecttoml-file 147 | include = ["src", "tests", "devtools"] 148 | # By default BasedPyright is very strict, so you almost certainly want to disable 149 | # some of the rules. 150 | # First, these turn off warnings about (yes) how you ignore warnings: 151 | reportIgnoreCommentWithoutRule = false 152 | reportUnnecessaryTypeIgnoreComment = false 153 | # A few typically noisy warnings are next. 154 | # How many you enable is up to you. 
The first few are off by default, but you can 155 | # comment/uncomment these as desired: 156 | reportMissingTypeStubs = false 157 | reportUnusedCallResult = false 158 | reportAny = false 159 | reportExplicitAny = false 160 | reportImplicitStringConcatenation = false 161 | reportUnreachable = false 162 | reportUnknownMemberType = false 163 | # reportPrivateImportUsage = false 164 | # reportPrivateLocalImportUsage = false 165 | # reportMissingImports = false 166 | # reportUnnecessaryIsInstance = false 167 | reportUnknownVariableType = false 168 | # reportUnknownArgumentType = false 169 | reportUnannotatedClassAttribute = false 170 | reportUnknownLambdaType = false 171 | reportPrivateUsage = false 172 | 173 | [tool.codespell] 174 | ignore-words-list = "Numbe" 175 | # skip = "foo.py,bar.py" 176 | 177 | [tool.pytest.ini_options] 178 | python_files = ["*.py"] 179 | python_classes = ["Test*"] 180 | python_functions = ["test_*"] 181 | testpaths = [ 182 | "src", 183 | "tests", 184 | ] 185 | norecursedirs = [] 186 | filterwarnings = [] 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/chopdiff/divs/text_node.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Callable 4 | from copy import copy 5 | from dataclasses import dataclass, field 6 | 7 | from prettyfmt import fmt_lines 8 | from typing_extensions import override 9 | 10 | from chopdiff.docs.sizes import TextUnit 11 | from chopdiff.docs.text_doc import Splitter, TextDoc, default_sentence_splitter 12 | from chopdiff.html.html_in_md import div_wrapper 13 | 14 | 15 | @dataclass 16 | class TextNode: 17 | """ 18 | A node in parsed structured text, with reference offsets into the original text. 19 | Useful for parsing Markdown broken into div tags. 20 | """ 21 | 22 | original_text: str 23 | 24 | # Offsets into the original text. 25 | offset: int 26 | content_start: int 27 | content_end: int 28 | 29 | tag_name: str | None = None 30 | class_name: str | None = None 31 | begin_marker: str | None = None 32 | end_marker: str | None = None 33 | 34 | children: list[TextNode] = field(default_factory=list) 35 | 36 | @property 37 | def end_offset(self) -> int: 38 | assert self.content_end >= 0 39 | return self.content_end + len(self.end_marker) if self.end_marker else self.content_end 40 | 41 | @property 42 | def contents(self) -> str: 43 | return self.original_text[self.content_start : self.content_end] 44 | 45 | def text_doc(self, sentence_splitter: Splitter = default_sentence_splitter) -> TextDoc: 46 | return TextDoc.from_text(self.contents, sentence_splitter=sentence_splitter) 47 | 48 | def slice_children(self, start: int, end: int) -> TextNode: 49 | if not self.children: 50 | raise ValueError("Cannot slice_children on a non-container node.") 51 | else: 52 | node_copy = copy(self) 53 | node_copy.children = node_copy.children[start:end] 54 | return node_copy 55 | 56 | def size(self, unit: TextUnit) -> int: 57 | if self.children: 58 | return sum(child.size(unit) for child in self.children) 59 | else: 60 | return self.text_doc().size(unit) 61 | 62 | def structure_summary(self) -> dict[str, int]: 63 | """ 64 | Recursively tally the number of non-empty leaf nodes of different types as CSS-style paths. 
65 | For example 66 | 67 | { "_total": 7, "div.chunk": 5, "div.chunk > div.summary": 2, "div.chunk > div.content": 5 } 68 | 69 | would mean that there were 7 chunk divs, each with a content div, and 2 with 70 | a summary div within it. 71 | """ 72 | 73 | def path_join(*selectors: str) -> str: 74 | return " > ".join(selectors) 75 | 76 | def tally_recursive(node: TextNode, path: list[str], tally: dict[str, int]) -> None: 77 | # Skip leaf nodes. 78 | if not node.children and not node.tag_name and not node.class_name: 79 | return 80 | 81 | tag_selector = node.tag_name if node.tag_name else "" 82 | class_selector = f".{node.class_name}" if node.class_name else "" 83 | selector = f"{tag_selector}{class_selector}" 84 | new_path = path + [selector] if selector else path 85 | 86 | # Increment counts. 87 | path_key = path_join(*new_path) 88 | if path_key: 89 | tally[path_key] = tally.get(path_key, 0) + 1 90 | 91 | for child in node.children: 92 | tally_recursive(child, new_path, tally) 93 | 94 | tally: dict[str, int] = {} 95 | tally_recursive(self, [], tally) 96 | 97 | sorted_tally = dict(sorted(tally.items())) 98 | return sorted_tally 99 | 100 | def structure_summary_str(self) -> str | None: 101 | structure_summary = self.structure_summary() 102 | if not structure_summary: 103 | return None 104 | else: 105 | return "HTML structure:\n" + fmt_lines( 106 | [f"{count:6d} {path}" for path, count in self.structure_summary().items()], 107 | prefix="", 108 | ) 109 | 110 | def size_summary(self) -> str: 111 | """ 112 | Return a summary of the size of the doc as well as a summary of its 113 | div/HTML structure. 114 | """ 115 | summary = self.text_doc().size_summary() 116 | if structure_summary_str := self.structure_summary_str(): 117 | summary += "\n" + structure_summary_str 118 | return summary 119 | 120 | def is_whitespace(self) -> bool: 121 | """ 122 | Is this node whitespace only? 123 | """ 124 | return not self.children and self.contents.strip() == "" 125 | 126 | def children_by_class_names(self, *class_names: str, recursive: bool = False) -> list[TextNode]: 127 | def collect_children(node: TextNode) -> list[TextNode]: 128 | matching_children = [ 129 | child for child in node.children if child.class_name in class_names 130 | ] 131 | if recursive: 132 | for child in node.children: 133 | matching_children.extend(collect_children(child)) 134 | return matching_children 135 | 136 | return collect_children(self) 137 | 138 | def child_by_class_name(self, class_name: str) -> TextNode | None: 139 | nodes = self.children_by_class_names(class_name, recursive=False) 140 | if len(nodes) == 0: 141 | return None 142 | if len(nodes) > 1: 143 | raise ValueError(f"Multiple children with class name {class_name}") 144 | return nodes[0] 145 | 146 | def reassemble(self, padding: str = "\n\n") -> str: 147 | """ 148 | Reassemble as string. If padding is provided (not ""), then strip, skip whitespace, 149 | and insert our own padding. 
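        Whitespace-only children are skipped whenever padding is non-empty.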
150 | """ 151 | strip_fn: Callable[[str], str] = lambda s: s.strip() if padding else s 152 | skip_whitespace = bool(padding) 153 | 154 | if not self.children: 155 | if not self.tag_name: 156 | return strip_fn(self.contents) 157 | else: 158 | wrap = div_wrapper(self.class_name, padding=padding) 159 | return wrap(strip_fn(self.contents)) 160 | else: 161 | padded_children = (padding or "").join( 162 | child.reassemble(padding) 163 | for child in self.children 164 | if (not skip_whitespace or not child.is_whitespace()) 165 | ) 166 | if not self.tag_name: 167 | return padded_children 168 | else: 169 | wrap = div_wrapper(self.class_name, padding=padding) 170 | return wrap(padded_children) 171 | 172 | @override 173 | def __str__(self): 174 | """ 175 | Return a recursive, formatted string representation of the node and its children. 176 | """ 177 | return self._str_recursive() 178 | 179 | def _str_recursive(self, level: int = 0, max_len: int = 40) -> str: 180 | indent = " " * level 181 | content_preview = self.contents 182 | if len(content_preview) > max_len: 183 | content_preview = content_preview[:20] + "…" + content_preview[-20:] 184 | result = ( 185 | f"{indent}TextNode(tag_name={self.tag_name} class_name={self.class_name} offset={self.offset}," 186 | f" content_start={self.content_start}, content_end={self.content_end}) " 187 | f"{repr(content_preview)}\n" 188 | ) 189 | for child in self.children: 190 | result += child._str_recursive(level + 1) 191 | return result 192 | -------------------------------------------------------------------------------- /src/chopdiff/docs/wordtoks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support for treating text as a sequence of word, punctuation, whitespace 3 | (word, setnence, and paragraph breaks), or HTML tags as tokens, which we call 4 | "wordtoks". 5 | 6 | Also works well with Markdown. Wordtoks make it possible to do word-oriented 7 | parsing, diffs, and transforms, while also preserving HTML tags and significant 8 | whitespace. 9 | """ 10 | 11 | from dataclasses import dataclass 12 | 13 | import regex 14 | 15 | # Special tokens to represent sentence, paragraph, and document boundaries. 16 | # Note these parse as tokens and like HTML tags, so they can safely be mixed into inputs if desired. 17 | SENT_BR_TOK = "<-SENT-BR->" 18 | PARA_BR_TOK = "<-PARA-BR->" 19 | BOF_TOK = "<-BOF->" 20 | EOF_TOK = "<-EOF->" 21 | 22 | SENT_BR_STR = " " 23 | PARA_BR_STR = "\n\n" 24 | BOF_STR = "" 25 | EOF_STR = "" 26 | 27 | SPACE_TOK = " " 28 | 29 | SYMBOL_SEP = "⎪" 30 | 31 | # Currently break on words, spaces, or any single other/punctuation character. 32 | # HTML tags (of length <1024 chars, possibly with newlines) and entities are also a single token. 33 | # TODO: Could add nicer support for Markdown formatting as well. 34 | # Updated pattern to include HTML entities 35 | _wordtok_pattern = regex.compile( 36 | r"(<(?:[^<>]|\n){0,1024}>|\&\w+;|\&\#\d+;|\w+|[^\w\s]|\s+)", regex.DOTALL 37 | ) 38 | 39 | _para_br_pattern = regex.compile(r"\s*\n\n\s*") 40 | 41 | # TODO: Is it worth using the regex package here to get \p{L} or is there a good 42 | # enough way with re only? 
43 | _word_pat = regex.compile(r"\p{L}+", regex.UNICODE) 44 | 45 | _number_pat = regex.compile(r"\d+") 46 | 47 | _tag_pattern = regex.compile(r"<(/?)(\w+)([^>]*?)(/?)\s*>", regex.IGNORECASE) 48 | 49 | _comment_pattern = regex.compile(r"", regex.DOTALL) 50 | 51 | 52 | def wordtok_to_str(wordtok: str) -> str: 53 | """ 54 | Convert a wordtok to a string, mapping any special wordtoks to their usual 55 | representations. 56 | """ 57 | if wordtok == SENT_BR_TOK: 58 | return SENT_BR_STR 59 | if wordtok == PARA_BR_TOK: 60 | return PARA_BR_STR 61 | if wordtok == BOF_TOK: 62 | return BOF_STR 63 | if wordtok == EOF_TOK: 64 | return EOF_STR 65 | return wordtok 66 | 67 | 68 | def wordtok_len(wordtok: str) -> int: 69 | """ 70 | Char length of a wordtok. 71 | """ 72 | return len(wordtok_to_str(wordtok)) 73 | 74 | 75 | _whitespace = regex.compile(r"\s+") 76 | 77 | 78 | def normalize_wordtok(wordtok: str) -> str: 79 | if wordtok.isspace(): 80 | normalized = SPACE_TOK 81 | elif wordtok.startswith("<"): 82 | normalized = _whitespace.sub(" ", wordtok) 83 | else: 84 | normalized = wordtok 85 | return normalized 86 | 87 | 88 | def wordtokenize_with_offsets(text: str, bof_eof: bool = False) -> tuple[list[str], list[int]]: 89 | """ 90 | Same as `wordtokenize`, but returns a list of tuples `(wordtok, offset)`. 91 | """ 92 | wordtoks = [] 93 | offsets = [] 94 | offset = 0 95 | for match in _wordtok_pattern.finditer(text): 96 | wordtok = normalize_wordtok(match.group()) 97 | wordtoks.append(wordtok) 98 | offsets.append(offset) 99 | offset = match.end() 100 | 101 | if bof_eof: 102 | wordtoks = [BOF_TOK] + wordtoks + [EOF_TOK] 103 | offsets = [0] + offsets + [len(text)] 104 | 105 | return wordtoks, offsets 106 | 107 | 108 | def wordtokenize(text: str, bof_eof: bool = False) -> list[str]: 109 | """ 110 | Convert text to word tokens, including words, whitespace, punctuation, and 111 | HTML tags. Does not parse paragraph or sentence breaks. Normalizes all 112 | whitespace to a single space character. 113 | """ 114 | wordtoks, _offsets = wordtokenize_with_offsets(text, bof_eof) 115 | return wordtoks 116 | 117 | 118 | def _insert_para_wordtoks(text: str) -> str: # pyright: ignore 119 | """ 120 | Replace paragraph breaks in text with para break tokens. 121 | """ 122 | return _para_br_pattern.sub(PARA_BR_TOK, text) 123 | 124 | 125 | def _initial_wordtoks(text: str, max_chars: int) -> list[str]: 126 | sub_text = text[:max_chars] 127 | wordtoks = wordtokenize(sub_text) 128 | if wordtoks: 129 | wordtoks.pop() # Drop any cut off token. 130 | return wordtoks 131 | 132 | 133 | def first_wordtok(text: str) -> str | None: 134 | """ 135 | Get the first wordtok from the text, if it has one. 136 | """ 137 | wordtoks = _initial_wordtoks(text, 100) 138 | return wordtoks[0] if wordtoks else None 139 | 140 | 141 | def join_wordtoks(wordtoks: list[str]) -> str: 142 | """ 143 | Join wordtoks back into a sentence. 144 | """ 145 | wordtoks = [wordtok_to_str(wordtok) for wordtok in wordtoks] 146 | return "".join(wordtoks) 147 | 148 | 149 | def visualize_wordtoks(wordtoks: list[str]) -> str: 150 | """ 151 | Visualize wordtoks with a separator for debugging. 152 | """ 153 | return SYMBOL_SEP + SYMBOL_SEP.join(wordtoks) + SYMBOL_SEP 154 | 155 | 156 | def is_break_or_space(wordtok: str) -> bool: 157 | """ 158 | Any kind of paragraph break, sentence break, or space (including 159 | the beginning or end of the document). 
160 | """ 161 | return ( 162 | wordtok == PARA_BR_TOK 163 | or wordtok == SENT_BR_TOK 164 | or wordtok.isspace() 165 | or wordtok == BOF_TOK 166 | or wordtok == EOF_TOK 167 | ) 168 | 169 | 170 | def is_word(wordtok: str) -> bool: 171 | """ 172 | Is this wordtok a word, not punctuation or whitespace or a number? 173 | """ 174 | return bool(len(wordtok) > 0 and _word_pat.match(wordtok) and not _number_pat.match(wordtok)) 175 | 176 | 177 | def is_number(wordtok: str) -> bool: 178 | """ 179 | Is this wordtok a number? 180 | """ 181 | return bool(_number_pat.match(wordtok)) 182 | 183 | 184 | def is_whitespace_or_punct(wordtok: str) -> bool: 185 | """ 186 | Is this wordtok whitespace or punctuation? 187 | """ 188 | return bool(not is_word(wordtok) and not is_number(wordtok)) 189 | 190 | 191 | @dataclass(frozen=True) 192 | class Tag: 193 | """ 194 | An HTML tag or comment. 195 | """ 196 | 197 | name: str 198 | is_open: bool 199 | is_close: bool 200 | attrs: dict[str, str] 201 | comment: str | None = None 202 | 203 | 204 | def parse_tag(wordtok: str | None = None) -> Tag | None: 205 | """ 206 | Parse a wordtok to determine if it's an HTML tag and extract its components. 207 | """ 208 | if not wordtok: 209 | return None 210 | 211 | match = _tag_pattern.match(wordtok) 212 | if not match: 213 | match = _comment_pattern.match(wordtok) 214 | if not match: 215 | return None 216 | return Tag(name="", is_open=False, is_close=False, attrs={}, comment=match.group(1)) 217 | 218 | is_open = not bool(match.group(1)) 219 | is_close = bool(match.group(1) or match.group(4)) 220 | tag_name = match.group(2).lower() 221 | attrs_str = match.group(3).strip() 222 | 223 | attrs: dict[str, str] = {} 224 | if attrs_str: 225 | attr_pattern = regex.compile(r'(\w+)\s*=\s*"([^"]*)"') 226 | for attr_match in attr_pattern.finditer(attrs_str): 227 | attr_name, attr_value = attr_match.groups() 228 | attrs[attr_name] = attr_value 229 | 230 | return Tag(name=tag_name, is_open=is_open, is_close=is_close, attrs=attrs) 231 | 232 | 233 | def is_tag(wordtok: str | None = None, tag_names: list[str] | None = None) -> bool: 234 | """ 235 | Check if a wordtok is an HTML tag and optionally if it's in the specified tag names. 236 | """ 237 | tag = parse_tag(wordtok) 238 | return bool(tag and (not tag_names or tag.name in [name.lower() for name in tag_names])) 239 | 240 | 241 | def is_tag_close(wordtok: str, tag_names: list[str] | None = None) -> bool: 242 | """ 243 | Check if a wordtok is an HTML close tag and optionally if it's in the specified tag names. 244 | """ 245 | tag = parse_tag(wordtok) 246 | return bool( 247 | tag and tag.is_close and (not tag_names or tag.name in [name.lower() for name in tag_names]) 248 | ) 249 | 250 | 251 | def is_tag_open(wordtok: str, tag_names: list[str] | None = None) -> bool: 252 | """ 253 | Check if a wordtok is an HTML open tag and optionally if it's in the specified tag names. 254 | """ 255 | tag = parse_tag(wordtok) 256 | return bool( 257 | tag and tag.is_open and (not tag_names or tag.name in [name.lower() for name in tag_names]) 258 | ) 259 | 260 | 261 | def is_div(wordtok: str | None = None) -> bool: 262 | return is_tag(wordtok, tag_names=["div"]) 263 | 264 | 265 | def is_entity(wordtok: str | None = None) -> bool: 266 | """ 267 | Check if a wordtok is an HTML entity. 
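    For example, `&amp;` or `&#39;`.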
268 | """ 269 | return bool(wordtok and wordtok.startswith("&") and wordtok.endswith(";")) 270 | 271 | 272 | header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"] 273 | 274 | 275 | def is_header_tag(wordtok: str) -> bool: 276 | """ 277 | Is this wordtok an HTML header tag? 278 | """ 279 | return is_tag(wordtok, tag_names=header_tags) 280 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/sliding_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transform text using sliding windows over a document, then reassembling the 3 | transformed text. 4 | """ 5 | 6 | import logging 7 | from collections.abc import Callable 8 | from math import ceil 9 | from typing import Any, TypeAlias 10 | 11 | from flowmark import fill_markdown 12 | from prettyfmt import fmt_lines 13 | 14 | from chopdiff.docs.sizes import TextUnit 15 | from chopdiff.docs.text_doc import Paragraph, TextDoc 16 | from chopdiff.docs.token_diffs import DIFF_FILTER_NONE, DiffFilter, diff_docs, find_best_alignment 17 | from chopdiff.docs.wordtoks import join_wordtoks 18 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window 19 | from chopdiff.transforms.window_settings import WINDOW_BR, WindowSettings 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | TextDocTransform: TypeAlias = Callable[[TextDoc], TextDoc] 24 | 25 | SaveFunc: TypeAlias = Callable[[str, str, Any], None] 26 | 27 | 28 | def remove_window_br(doc: TextDoc): 29 | """ 30 | Remove `` markers in a document. 31 | """ 32 | doc.replace_str(WINDOW_BR, "") 33 | 34 | 35 | def filtered_transform( 36 | doc: TextDoc, 37 | transform_func: TextDocTransform, 38 | windowing: WindowSettings | None, 39 | diff_filter: DiffFilter | None = None, 40 | debug_save: SaveFunc | None = None, 41 | ) -> TextDoc: 42 | """ 43 | Apply a transform with sliding window across the input doc, enforcing the changes it's 44 | allowed to make with `diff_filter`. 45 | 46 | If windowing is None, apply the transform to the entire document at once. 47 | 48 | `debug_save` is an optional function that takes a message, a filename, and an object, and saves 49 | the object to a file for debugging. 50 | """ 51 | has_filter = diff_filter and diff_filter != DIFF_FILTER_NONE 52 | 53 | if not windowing or not windowing.size: 54 | transformed_doc = transform_func(doc) 55 | else: 56 | 57 | def transform_and_check_diff(input_doc: TextDoc) -> TextDoc: 58 | # Avoid having window breaks build up after multiple transforms. 59 | remove_window_br(input_doc) 60 | 61 | transformed_doc = transform_func(input_doc) 62 | 63 | if has_filter: 64 | # Check the transform did what it should have. 65 | diff = diff_docs(input_doc, transformed_doc) 66 | accepted_diff, rejected_diff = diff.filter(diff_filter) 67 | 68 | assert diff.left_size() == input_doc.size(TextUnit.wordtoks) 69 | assert accepted_diff.left_size() == input_doc.size(TextUnit.wordtoks) 70 | assert rejected_diff.left_size() == input_doc.size(TextUnit.wordtoks) 71 | 72 | log.info( 73 | "Accepted transform changes:\n%s", 74 | fmt_lines(str(accepted_diff).splitlines()), 75 | ) 76 | 77 | # Note any rejections. 78 | rejected_changes = rejected_diff.changes() 79 | if rejected_changes: 80 | log.info( 81 | "Filtering extraneous changes:\n%s", 82 | fmt_lines(rejected_diff.as_diff_str(False).splitlines()), 83 | ) 84 | 85 | # Apply only the accepted changes. 
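                # In `accepted_diff`, rejected ops have already been replaced by EQUAL ops
                # (see `TokenDiff.filter`), so applying it to the original wordtoks leaves
                # the source text untouched wherever the filter said no.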
86 | final_doc = TextDoc.from_wordtoks( 87 | accepted_diff.apply_to(list(input_doc.as_wordtoks())) 88 | ) 89 | log.info( 90 | "Word token changes:\n%s", 91 | fmt_lines( 92 | [ 93 | f"Accepted: {accepted_diff.stats()}", 94 | f"Rejected: {rejected_diff.stats()}", 95 | ] 96 | ), 97 | ) 98 | else: 99 | diff = None 100 | accepted_diff, rejected_diff = None, None 101 | final_doc = transformed_doc 102 | 103 | if debug_save: 104 | debug_save( 105 | "Input doc normalized", 106 | "filtered_transform", 107 | fill_markdown(input_doc.reassemble()), 108 | ) 109 | debug_save("Output doc raw", "filtered_transform", transformed_doc.reassemble()) 110 | # log_save( 111 | # "Output doc normalized", 112 | # "filtered_transform", 113 | # normalize_markdown(transformed_doc.reassemble()), 114 | # ) 115 | if diff: 116 | debug_save("Transform diff", "filtered_transform", diff) 117 | # if accepted_diff: 118 | # log.save_object("Accepted diff", "filtered_transform", accepted_diff) 119 | if rejected_diff: 120 | debug_save("Rejected diff", "filtered_transform", rejected_diff) 121 | 122 | debug_save("Final doc", "filtered_transform", final_doc.reassemble()) 123 | 124 | return final_doc 125 | 126 | transformed_doc = sliding_window_transform( 127 | doc, 128 | transform_and_check_diff, 129 | windowing, 130 | ) 131 | 132 | return transformed_doc 133 | 134 | 135 | def sliding_window_transform( 136 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings 137 | ) -> TextDoc: 138 | if settings.unit == TextUnit.wordtoks: 139 | return sliding_wordtok_window_transform(doc, transform_func, settings) 140 | elif settings.unit == TextUnit.paragraphs: 141 | return sliding_para_window_transform(doc, transform_func, settings) 142 | else: 143 | raise ValueError(f"Unsupported sliding transform unit: {settings.unit}") 144 | 145 | 146 | def sliding_wordtok_window_transform( 147 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings 148 | ) -> TextDoc: 149 | """ 150 | Apply a transformation function to each TextDoc in a sliding window over the given document, 151 | stepping through wordtoks, then reassemble the transformed document. Uses best effort to 152 | stitch the results together seamlessly by searching for the best alignment (minimum wordtok 153 | edit distance) of each transformed window. 
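    Windows are `settings.size` wordtoks long and advance by `settings.shift` wordtoks,
    so consecutive input windows overlap by `size - shift` wordtoks; the alignment search
    happens over (roughly) that overlap in the transformed output.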
154 | """ 155 | if settings.unit != TextUnit.wordtoks: 156 | raise ValueError(f"This sliding window expects wordtoks, not {settings.unit}") 157 | 158 | windows = sliding_word_window(doc, settings.size, settings.shift, TextUnit.wordtoks) 159 | 160 | nwordtoks = doc.size(TextUnit.wordtoks) 161 | nbytes = doc.size(TextUnit.bytes) 162 | nwindows = ceil(nwordtoks / settings.shift) 163 | sep_wordtoks = [settings.separator] if settings.separator else [] 164 | 165 | log.info( 166 | "Sliding word transform: Begin on doc: total %s wordtoks, %s bytes, %s windows, %s", 167 | nwordtoks, 168 | nbytes, 169 | nwindows, 170 | settings, 171 | ) 172 | 173 | output_wordtoks: list[str] = [] 174 | for i, window in enumerate(windows): 175 | log.info( 176 | "Sliding word transform window %s/%s (%s wordtoks, %s bytes), at %s wordtoks so far", 177 | i + 1, 178 | nwindows, 179 | window.size(TextUnit.wordtoks), 180 | window.size(TextUnit.bytes), 181 | len(output_wordtoks), 182 | ) 183 | 184 | transformed_window = transform_func(window) 185 | 186 | new_wordtoks = list(transformed_window.as_wordtoks()) 187 | 188 | if not output_wordtoks: 189 | output_wordtoks = new_wordtoks 190 | else: 191 | if len(output_wordtoks) < settings.min_overlap: 192 | raise ValueError( 193 | "Output wordtoks too short to align with min_overlap %s: %s", 194 | settings.min_overlap, 195 | output_wordtoks, 196 | ) 197 | if len(new_wordtoks) < settings.min_overlap: 198 | log.error( 199 | "New wordtoks too short to align with min_overlap %s, skipping: %s", 200 | settings.min_overlap, 201 | new_wordtoks, 202 | ) 203 | continue 204 | 205 | offset, (score, diff) = find_best_alignment( 206 | output_wordtoks, new_wordtoks, settings.min_overlap 207 | ) 208 | 209 | log.info( 210 | "Sliding word transform: Best alignment of window %s is at token offset %s (score %s, %s)", 211 | i, 212 | offset, 213 | score, 214 | diff.stats(), 215 | ) 216 | 217 | output_wordtoks = output_wordtoks[:offset] + sep_wordtoks + new_wordtoks 218 | 219 | log.info( 220 | "Sliding word transform: Done, output total %s wordtoks", 221 | len(output_wordtoks), 222 | ) 223 | 224 | # An alternate approach would be to accumulate the document sentences instead of wordtoks to 225 | # avoid re-parsing, but this is probably a little simpler. 226 | output_doc = TextDoc.from_text(join_wordtoks(output_wordtoks)) 227 | 228 | return output_doc 229 | 230 | 231 | def sliding_para_window_transform( 232 | doc: TextDoc, 233 | transform_func: TextDocTransform, 234 | settings: WindowSettings, 235 | normalizer: Callable[[str], str] = fill_markdown, 236 | ) -> TextDoc: 237 | """ 238 | Apply a transformation function to each TextDoc, stepping through paragraphs `settings.size` 239 | at a time, then reassemble the transformed document. 
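    Unlike the wordtok version, these windows do not overlap: `settings.size` must equal
    `settings.shift`.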
240 | """ 241 | if settings.unit != TextUnit.paragraphs: 242 | raise ValueError(f"This sliding window expects paragraphs, not {settings.unit}") 243 | if settings.size != settings.shift: 244 | raise ValueError("Paragraph window transform requires equal size and shift") 245 | 246 | windows = sliding_para_window(doc, settings.size, normalizer) 247 | 248 | nwindows = ceil(doc.size(TextUnit.paragraphs) / settings.size) 249 | 250 | log.info( 251 | "Sliding paragraph transform: Begin on doc: %s windows of size %s paragraphs on total %s", 252 | nwindows, 253 | settings.size, 254 | doc.size_summary(), 255 | ) 256 | 257 | transformed_paras: list[Paragraph] = [] 258 | for i, window in enumerate(windows): 259 | log.info( 260 | "Sliding paragraph transform: Window %s/%s input is %s", 261 | i, 262 | nwindows, 263 | window.size_summary(), 264 | ) 265 | 266 | new_doc = transform_func(window) 267 | if i > 0: 268 | try: 269 | new_doc.paragraphs[0].sentences[0].text = ( 270 | settings.separator + new_doc.paragraphs[0].sentences[0].text 271 | ) 272 | except (KeyError, IndexError): 273 | pass 274 | transformed_paras.extend(new_doc.paragraphs) 275 | 276 | transformed_text = "\n\n".join(para.reassemble() for para in transformed_paras) 277 | new_text_doc = TextDoc.from_text(transformed_text) 278 | 279 | log.info( 280 | "Sliding paragraph transform: Done, output total %s", 281 | new_text_doc.size_summary(), 282 | ) 283 | 284 | return new_text_doc 285 | -------------------------------------------------------------------------------- /.cursor/rules/python.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Python Coding Guidelines 3 | globs: *.py,pyproject.toml 4 | alwaysApply: false 5 | --- 6 | # Python Coding Guidelines 7 | 8 | These are rules for a modern Python project using uv. 9 | 10 | ## Python Version 11 | 12 | Write for Python 3.11-3.13. Do NOT write code to support earlier versions of Python. 13 | Always use modern Python practices appropriate for Python 3.11-3.13. 14 | 15 | Always use full type annotations, generics, and other modern practices. 16 | 17 | ## Project Setup and Developer Workflows 18 | 19 | - Important: BE SURE you read and understand the project setup by reading the 20 | pyproject.toml file and the Makefile. 21 | 22 | - ALWAYS use uv for running all code and managing dependencies. 23 | Never use direct `pip` or `python` commands. 24 | 25 | - Use modern uv commands: `uv sync`, `uv run ...`, etc. 26 | Prefer `uv add` over `uv pip install`. 27 | 28 | - You may use the following shortcuts 29 | ```shell 30 | 31 | # Install all dependencies: 32 | make install 33 | 34 | # Run linting (with ruff) and type checking (with basedpyright). 35 | # Note when you run this, ruff will auto-format and sort imports, resolving any 36 | # linter warnings about import ordering: 37 | make lint 38 | 39 | # Run tests: 40 | make test 41 | 42 | # Run uv sync, lint, and test in one command: 43 | make 44 | ``` 45 | 46 | - The usual `make test` like standard pytest does not show test output. 47 | Run individual tests and see output with `uv run pytest -s some/file.py`. 48 | 49 | - Always run `make lint` and `make test` to check your code after changes. 50 | 51 | - You must verify there are zero linter warnings/errors or test failures before 52 | considering any task complete. 53 | 54 | ## General Development Practices 55 | 56 | - Be sure to resolve the pyright (basedpyright) linter errors as you develop and make 57 | changes. 
58 | 59 | - If type checker errors are hard to resolve, you may add a comment `# pyright: ignore` 60 | to disable Pyright warnings or errors but ONLY if you know they are not a real problem 61 | and are difficult to fix. 62 | 63 | - In special cases you may consider disabling it globally it in pyproject.toml but YOU 64 | MUST ASK FOR CONFIRMATION from the user before globally disabling lint or type checker 65 | rules. 66 | 67 | - Never change an existing comment, pydoc, or a log statement, unless it is directly 68 | fixing the issue you are changing, or the user has asked you to clean up the code. 69 | Do not drop existing comments when editing code! 70 | And do not delete or change logging statements. 71 | 72 | ## Coding Conventions and Imports 73 | 74 | - Always use full, absolute imports for paths. 75 | do NOT use `from .module1.module2 import ...`. Such relative paths make it hard to 76 | refactor. Use `from toplevel_pkg.module1.modlule2 import ...` instead. 77 | 78 | - Be sure to import things like `Callable` and other types from the right modules, 79 | remembering that many are now in `collections.abc` or `typing_extensions`. For 80 | example: `from collections.abc import Callable, Coroutine` 81 | 82 | - Use `typing_extensions` for things like `@override` (you need to use this, and not 83 | `typing` since we want to support Python 3.11). 84 | 85 | - Add `from __future__ import annotations` on files with types whenever applicable. 86 | 87 | - Use pathlib `Path` instead of strings. 88 | Use `Path(filename).read_text()` instead of two-line `with open(...)` blocks. 89 | 90 | - Use strif’s `atomic_output_file` context manager when writing files to ensure output 91 | files are written atomically. 92 | 93 | ## Use Modern Python Practices 94 | 95 | - ALWAYS use `@override` decorators to override methods from base classes. 96 | This is a modern Python practice and helps avoid bugs. 97 | 98 | ## Testing 99 | 100 | - For longer tests put them in a file like `tests/test_somename.py` in the `tests/` 101 | directory (or `tests/module_name/test_somename.py` file for a submodule). 102 | 103 | - For simple tests, prefer inline functions in the original code file below a `## Tests` 104 | comment. This keeps the tests easy to maintain and close to the code. 105 | Inline tests should NOT import pytest or pytest fixtures as we do not want runtime 106 | dependency on pytest. 107 | 108 | - DO NOT write one-off test code in extra files that are throwaway. 109 | 110 | - DO NOT put `if __name__ == "__main__":` just for quick testing. 111 | Instead use the inline function tests and run them with `uv run pytest`. 112 | 113 | - You can run such individual tests with `uv run pytest -s src/.../path/to/test` 114 | 115 | - Don’t add docs to assertions unless it’s not obvious what they’re checking - the 116 | assertion appears in the stack trace. 117 | Do NOT write `assert x == 5, "x should be 5"`. Do NOT write `assert x == 5 # Check if 118 | x is 5`. That is redundant. 119 | Just write `assert x == 5`. 120 | 121 | - DO NOT write trivial or obvious tests that are evident directly from code, such as 122 | assertions that confirm the value of a constant setting. 123 | 124 | - NEVER write `assert False`. If a test reaches an unexpected branch and must fail 125 | explicitly, `raise AssertionError("Some explanation")` instead. 126 | This is best typical best practice in Python since assertions can be removed with 127 | optimization. 
128 | 129 | - DO NOT use pytest fixtures like parameterized tests or expected exception decorators 130 | unless absolutely necessary in more complex tests. 131 | It is typically simpler to use simple assertions and put the checks inside the test. 132 | This is also preferable because then simple tests have no explicit pytest dependencies 133 | and can be placed in code anywhere. 134 | 135 | - DO NOT write trivial tests that test something we know already works, like 136 | instantiating a Pydantic object. 137 | 138 | ```python 139 | class Link(BaseModel): 140 | url: str 141 | title: str = None 142 | 143 | # DO NOT write tests like this. They are trivial and only create clutter! 144 | def test_link_model(): 145 | link = Link(url="https://example.com", title="Example") 146 | assert link.url == "https://example.com" 147 | assert link.title == "Example" 148 | ``` 149 | 150 | ## Types and Type Annotations 151 | 152 | - Use modern union syntax: `str | None` instead of `Optional[str]`, `dict[str]` instead 153 | of `Dict[str]`, `list[str]` instead of `List[str]`, etc. 154 | 155 | - Never use/import `Optional` for new code. 156 | 157 | - Use modern enums like `StrEnum` if appropriate. 158 | 159 | - One exception to common practice on enums: If an enum has many values that are 160 | strings, and they have a literal value as a string (like in a JSON protocol), it’s 161 | fine to use lower_snake_case for enum values to match the actual value. 162 | This is more readable than LONG_ALL_CAPS_VALUES, and you can simply set the value to 163 | be the same as the name for each. 164 | For example: 165 | ```python 166 | class MediaType(Enum): 167 | """ 168 | Media types. For broad categories only, to determine what processing 169 | is possible. 170 | """ 171 | 172 | text = "text" 173 | image = "image" 174 | audio = "audio" 175 | video = "video" 176 | webpage = "webpage" 177 | binary = "binary" 178 | ``` 179 | 180 | ## Guidelines for Literal Strings 181 | 182 | - For multi-line strings NEVER put multi-line strings flush against the left margin. 183 | ALWAYS use a `dedent()` function to make it more readable. 184 | You may wish to add a `strip()` as well. 185 | Example: 186 | ```python 187 | from textwrap import dedent 188 | markdown_content = dedent(""" 189 | # Title 1 190 | Some text. 191 | ## Subtitle 1.1 192 | More text. 193 | """).strip() 194 | ``` 195 | 196 | ## Guidelines for Comments 197 | 198 | - Comments should be EXPLANATORY: Explain *WHY* something is done a certain way and not 199 | just *what* is done. 200 | 201 | - Comments should be CONCISE: Remove all extraneous words. 202 | 203 | - DO NOT use comments to state obvious things or repeat what is evident from the code. 204 | Here is an example of a comment that SHOULD BE REMOVED because it simply repeats the 205 | code, which is distracting and adds no value: 206 | ```python 207 | if self.failed == 0: 208 | # All successful 209 | return "All tasks finished successfully" 210 | ``` 211 | 212 | ## Guidelines for Docstrings 213 | 214 | - Here is an example of the correct style for docstrings: 215 | ```python 216 | def check_if_url( 217 | text: UnresolvedLocator, only_schemes: list[str] | None = None 218 | ) -> ParseResult | None: 219 | """ 220 | Convenience function to check if a string or Path is a URL and if so return 221 | the `urlparse.ParseResult`. 222 | 223 | Also returns false for Paths, so that it's easy to use local paths and URLs 224 | (`Locator`s) interchangeably. 
Can provide `HTTP_ONLY` or `HTTP_OR_FILE` to 225 | restrict to only certain schemes. 226 | """ 227 | # Function body 228 | 229 | def is_url(text: UnresolvedLocator, only_schemes: list[str] | None = None) -> bool: 230 | """ 231 | Check if a string is a URL. For convenience, also returns false for 232 | Paths, so that it's easy to use local paths and URLs interchangeably. 233 | """ 234 | return check_if_url(text, only_schemes) is not None 235 | ``` 236 | 237 | - Use concise pydoc strings with triple quotes on their own lines. 238 | 239 | - Use `backticks` around variable names and inline code excerpts. 240 | 241 | - Use plain fences (```) around code blocks inside of pydocs. 242 | 243 | - For classes with many methods, use a concise docstring on the class that explains all 244 | the common information, and avoid repeating the same information on every method. 245 | 246 | - Docstrings should provide context or as concisely as possible explain “why”, not 247 | obvious details evident from the class names, function names, parameter names, and 248 | type annotations. 249 | 250 | - Docstrings *should* mention any key rationale or pitfalls when using the class or 251 | function. 252 | 253 | - Avoid obvious or repetitive docstrings. 254 | Do NOT add pydocs that just repeat in English facts that are obvious from the function 255 | name, variable name, or types. 256 | That is silly and obvious and makes the code longer for no reason. 257 | 258 | - Do NOT list args and return values if they’re obvious. 259 | In the above examples, you do not need and `Arguments:` or `Returns:` section, since 260 | sections as it is obvious from context. 261 | do list these if there are many arguments and their meaning isn’t clear. 262 | If it returns a less obvious type like a tuple, do explain in the pydoc. 263 | 264 | - Exported/public variables, functions, or methods SHOULD have concise docstrings. 265 | Internal/local variables, functions, and methods DO NOT need docstrings unless their 266 | purpose is not obvious. 267 | 268 | ## General Clean Coding Practices 269 | 270 | - Avoid writing trivial wrapper functions. 271 | For example, when writing a class DO NOT blindly make delegation methods around public 272 | member variables. DO NOT write methods like this: 273 | ```python 274 | def reassemble(self) -> str: 275 | """Call the original reassemble method.""" 276 | return self.paragraph.reassemble() 277 | ``` 278 | In general, the user can just call the enclosed objects methods, reducing code bloat. 279 | 280 | - If a function does not use a parameter, but it should still be present, you can use `# 281 | pyright: ignore[reportUnusedParameter]` in a comment to suppress the linter warning. 282 | 283 | ## Guidelines for Backward Compatibility 284 | 285 | - When changing code in a library or general function, if a change to an API or library 286 | will break backward compatibility, MENTION THIS to the user. 287 | 288 | - DO NOT implement additional code for backward compatiblity (such as extra methods or 289 | variable aliases or comments about backward compatibility) UNLESS the user has 290 | confirmed that it is necessary. 
291 | -------------------------------------------------------------------------------- /src/chopdiff/docs/token_diffs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | from collections.abc import Callable 5 | from dataclasses import dataclass 6 | from enum import Enum 7 | from typing import TypeAlias 8 | 9 | import cydifflib as difflib 10 | from funlog import log_calls, tally_calls 11 | from typing_extensions import override 12 | 13 | from chopdiff.docs.text_doc import TextDoc 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | SYMBOL_SEP = "⎪" 19 | 20 | 21 | class OpType(Enum): 22 | EQUAL = "equal" 23 | INSERT = "insert" 24 | DELETE = "delete" 25 | REPLACE = "replace" 26 | 27 | def as_symbol(self): 28 | abbrev = { 29 | OpType.EQUAL: " ", 30 | OpType.INSERT: "+", 31 | OpType.DELETE: "-", 32 | OpType.REPLACE: "±", 33 | } 34 | return abbrev[self] 35 | 36 | def as_abbrev(self): 37 | abbrev = { 38 | OpType.EQUAL: "keep", 39 | OpType.INSERT: "add ", 40 | OpType.DELETE: "del ", 41 | OpType.REPLACE: "repl", 42 | } 43 | return abbrev[self] 44 | 45 | 46 | @dataclass(frozen=True) 47 | class DiffOp: 48 | action: OpType 49 | left: list[str] 50 | right: list[str] 51 | 52 | def __post_init__(self): 53 | if self.action == OpType.REPLACE: 54 | assert self.left and self.right 55 | elif self.action == OpType.EQUAL: 56 | assert self.left == self.right 57 | elif self.action == OpType.INSERT: 58 | assert not self.left 59 | elif self.action == OpType.DELETE: 60 | assert not self.right 61 | 62 | def left_str(self, show_toks: bool = True) -> str: 63 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks" 64 | if show_toks: 65 | s += f": - {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}" 66 | return s 67 | 68 | def right_str(self, show_toks: bool = True) -> str: 69 | s = f"{self.action.as_abbrev()} {len(self.right):4} toks" 70 | if show_toks: 71 | s += f": + {SYMBOL_SEP}{''.join(tok for tok in self.right)}{SYMBOL_SEP}" 72 | return s 73 | 74 | def equal_str(self, show_toks: bool = True) -> str: 75 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks" 76 | if show_toks: 77 | s += f": {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}" 78 | return s 79 | 80 | def all_changed(self) -> list[str]: 81 | return [] if self.action == OpType.EQUAL else self.left + self.right 82 | 83 | 84 | @dataclass(frozen=True) 85 | class DiffStats: 86 | added: int 87 | removed: int 88 | input_size: int 89 | 90 | def nchanges(self) -> int: 91 | return self.added + self.removed 92 | 93 | @override 94 | def __str__(self): 95 | return f"add/remove +{self.added}/-{self.removed} out of {self.input_size} total" 96 | 97 | 98 | DiffFilter: TypeAlias = Callable[[DiffOp], bool] 99 | 100 | DIFF_FILTER_NONE: DiffFilter = lambda op: True 101 | """ 102 | Diff filter that accepts all diff operations. 103 | """ 104 | 105 | 106 | @dataclass 107 | class TokenDiff: 108 | """ 109 | A diff of two texts as a sequence of EQUAL, INSERT, and DELETE operations on wordtoks. 
110 | """ 111 | 112 | ops: list[DiffOp] 113 | 114 | def left_size(self) -> int: 115 | return sum(len(op.left) for op in self.ops) 116 | 117 | def right_size(self) -> int: 118 | return sum(len(op.right) for op in self.ops) 119 | 120 | def changes(self) -> list[DiffOp]: 121 | return [op for op in self.ops if op.action != OpType.EQUAL] 122 | 123 | def stats(self) -> DiffStats: 124 | wordtoks_added = sum(len(op.right) for op in self.ops if op.action != OpType.EQUAL) 125 | wordtoks_removed = sum(len(op.left) for op in self.ops if op.action != OpType.EQUAL) 126 | return DiffStats(wordtoks_added, wordtoks_removed, self.left_size()) 127 | 128 | def apply_to(self, original_wordtoks: list[str]) -> list[str]: 129 | """ 130 | Apply a complete diff (including equality ops) to a list of wordtoks. 131 | """ 132 | result: list[str] = [] 133 | original_index = 0 134 | 135 | if len(original_wordtoks) != self.left_size(): 136 | raise AssertionError( 137 | f"Diff should be complete: original wordtoks length {len(original_wordtoks)} != diff length {self.left_size()}" 138 | ) 139 | 140 | for op in self.ops: 141 | if op.left: 142 | original_index += len(op.left) 143 | if op.right: 144 | result.extend(op.right) 145 | 146 | return result 147 | 148 | def filter(self, accept_fn: DiffFilter | None) -> tuple[TokenDiff, TokenDiff]: 149 | """ 150 | Return two diffs, one that only has accepted operations and one that only has 151 | rejected operations. 152 | """ 153 | if not accept_fn: 154 | accept_fn = DIFF_FILTER_NONE 155 | 156 | accepted_ops: list[DiffOp] = [] 157 | rejected_ops: list[DiffOp] = [] 158 | 159 | for op in self.ops: 160 | if op.action == OpType.EQUAL: 161 | # For equal ops, all tokens are both accepted and rejected. 162 | accepted_ops.append(op) 163 | rejected_ops.append(op) 164 | else: 165 | # We accept or reject the DiffOp as a whole, not token by token, since token by 166 | # token would give odd results, like deleting words but leaving whitespace. 
167 | if accept_fn(op): 168 | accepted_ops.append(op) 169 | rejected_ops.append(DiffOp(OpType.EQUAL, op.left, op.left)) 170 | else: 171 | accepted_ops.append(DiffOp(OpType.EQUAL, op.left, op.left)) 172 | rejected_ops.append(op) 173 | 174 | assert len(accepted_ops) == len(self.ops) 175 | assert len(accepted_ops) == len(rejected_ops) 176 | 177 | accepted_diff, rejected_diff = TokenDiff(accepted_ops), TokenDiff(rejected_ops) 178 | 179 | assert accepted_diff.left_size() == self.left_size() 180 | assert rejected_diff.left_size() == self.left_size() 181 | 182 | return accepted_diff, rejected_diff 183 | 184 | def _diff_lines(self, include_equal: bool = False) -> list[str]: 185 | if len(self.ops) == 0: 186 | return ["(No changes)"] 187 | 188 | pos = 0 189 | lines: list[str] = [] 190 | for op in self.ops: 191 | if op.action == OpType.EQUAL: 192 | if include_equal: 193 | lines.append(f"at pos {pos:4} {op.equal_str()}") 194 | elif op.action == OpType.INSERT: 195 | lines.append(f"at pos {pos:4} {op.right_str()}") 196 | elif op.action == OpType.DELETE: 197 | lines.append(f"at pos {pos:4} {op.left_str()}") 198 | elif op.action == OpType.REPLACE: 199 | lines.append(f"at pos {pos:4} {op.left_str()}") 200 | lines.append(f" {'':4} {op.right_str()}") 201 | 202 | pos += len(op.left) 203 | return lines 204 | 205 | def as_diff_str(self, include_equal: bool = True) -> str: 206 | diff_str = "\n".join(self._diff_lines(include_equal=include_equal)) 207 | return f"TextDiff: {self.stats()}:\n{diff_str}" 208 | 209 | @override 210 | def __str__(self): 211 | return self.as_diff_str() 212 | 213 | 214 | def diff_docs(doc1: TextDoc, doc2: TextDoc) -> TokenDiff: 215 | """ 216 | Calculate the LCS-style diff between two documents based on words. 217 | """ 218 | 219 | diff = diff_wordtoks(list(doc1.as_wordtoks()), list(doc2.as_wordtoks())) 220 | 221 | # log.save_object("doc1 wordtoks", "diff_docs", "\n".join(list(doc1.as_wordtoks()))) 222 | # log.save_object("doc2 wordtoks", "diff_docs", "\n".join(list(doc2.as_wordtoks()))) 223 | # log.save_object("diff", "diff_docs", diff) 224 | 225 | return diff 226 | 227 | 228 | @tally_calls(level="warning", min_total_runtime=5) 229 | def diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> TokenDiff: 230 | """ 231 | Perform an LCS-style diff on two lists of wordtoks. 
232 | """ 233 | s = difflib.SequenceMatcher(None, wordtoks1, wordtoks2, autojunk=False) # pyright: ignore 234 | diff: list[DiffOp] = [] 235 | 236 | # log.message(f"Diffing {len(wordtoks1)} wordtoks against {len(wordtoks2)} wordtoks") 237 | # log.save_object("wordtoks1", "diff_wordtoks", "".join(wordtoks1)) 238 | # log.save_object("wordtoks2", "diff_wordtoks", "".join(wordtoks2)) 239 | # log.save_object("diff opcodes", "diff_wordtoks", "\n".join(str(o) for o in s.get_opcodes())) 240 | 241 | for tag, i1, i2, j1, j2 in s.get_opcodes(): # pyright: ignore 242 | if tag == "equal": 243 | slice1 = wordtoks1[i1:i2] 244 | assert slice1 == wordtoks2[j1:j2] 245 | diff.append(DiffOp(OpType.EQUAL, slice1, slice1)) 246 | elif tag == "insert": 247 | diff.append(DiffOp(OpType.INSERT, [], wordtoks2[j1:j2])) 248 | elif tag == "delete": 249 | diff.append(DiffOp(OpType.DELETE, wordtoks1[i1:i2], [])) 250 | elif tag == "replace": 251 | diff.append(DiffOp(OpType.REPLACE, wordtoks1[i1:i2], wordtoks2[j1:j2])) 252 | 253 | return TokenDiff(diff) 254 | 255 | 256 | ScoredDiff: TypeAlias = tuple[float, TokenDiff] 257 | 258 | 259 | def scored_diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> ScoredDiff: 260 | """ 261 | Calculate the number of wordtoks added and removed between two lists of tokens. 262 | Score is (wordtoks_added + wordtoks_removed) / min(len(doc1), len(doc2)), 263 | which is 0 for identical docs. 264 | """ 265 | 266 | if len(wordtoks1) == 0 or len(wordtoks2) == 0: 267 | raise ValueError("Cannot score diff for empty documents") 268 | 269 | diff = diff_wordtoks(wordtoks1, wordtoks2) 270 | score = float(diff.stats().nchanges()) / min(len(wordtoks1), len(wordtoks2)) 271 | return score, diff 272 | 273 | 274 | @log_calls(level="message", if_slower_than=0.25) 275 | def find_best_alignment( 276 | list1: list[str], 277 | list2: list[str], 278 | min_overlap: int, 279 | max_overlap: int | None = None, 280 | scored_diff: Callable[[list[str], list[str]], ScoredDiff] = scored_diff_wordtoks, 281 | give_up_score: float = 0.75, 282 | give_up_count: int = 30, 283 | ) -> tuple[int, ScoredDiff]: 284 | """ 285 | Find the best alignment of two lists of values, where edit distance is smallest but overlap is 286 | at least min_overlap and at most max_overlap. Returns offset into list1 and diff object. 287 | """ 288 | len1, len2 = len(list1), len(list2) 289 | best_offset = -1 290 | best_score = float("inf") 291 | best_diff = None 292 | max_overlap = min(len1, len2, max_overlap) if max_overlap is not None else min(len1, len2) 293 | 294 | if min_overlap > len1 or min_overlap > len2: 295 | raise ValueError( 296 | f"Minimum overlap {min_overlap} should never exceed the length of one of the lists ({len1}, {len2})" 297 | ) 298 | 299 | log.info( 300 | "Finding best alignment: List lengths: lengths %s and %s with overlap of %s to %s", 301 | len1, 302 | len2, 303 | min_overlap, 304 | max_overlap, 305 | ) 306 | 307 | # To make this a bit more efficient we check if we have a run of increasing scores and 308 | # give up if we have many in a row. 309 | scores_increasing = 0 310 | prev_score = float("-inf") 311 | 312 | # Slide the second list over the first list, starting from the end of the first list. 
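    # Each candidate overlap compares the last `overlap` wordtoks of list1 against the
    # first `overlap` wordtoks of list2. For example, with len1=500, len2=300, and
    # min_overlap=10 (and no max_overlap given), the first iteration diffs list1[490:500]
    # against list2[0:10] and the last possible iteration diffs list1[200:500] against
    # list2[0:300].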
313 | # TODO: This could be much more efficient by being cleverer about reusing diff calculations.s 314 | for overlap in range(min_overlap, max_overlap + 1): 315 | start1 = len1 - overlap 316 | end1 = len1 317 | start2 = 0 318 | end2 = overlap 319 | 320 | score, diff = scored_diff(list1[start1:end1], list2[start2:end2]) 321 | 322 | log.info("Offset %s: Overlap %s: Score %f", start1, overlap, score) 323 | 324 | if score < best_score: 325 | best_score = score 326 | best_offset = start1 327 | best_diff = diff 328 | scores_increasing = 0 329 | elif score >= give_up_score and score >= prev_score: 330 | scores_increasing += 1 331 | if scores_increasing >= give_up_count: 332 | log.info( 333 | "Giving up after %s increasing scores, last score %s > %s", 334 | give_up_count, 335 | score, 336 | give_up_score, 337 | ) 338 | break 339 | 340 | prev_score = score 341 | 342 | if best_diff is None: 343 | raise ValueError("No alignment found") 344 | 345 | return best_offset, (best_score, best_diff) 346 | -------------------------------------------------------------------------------- /tests/docs/test_text_doc.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from textwrap import dedent 3 | 4 | import regex 5 | from prettyfmt import fmt_words 6 | from strif import abbrev_str 7 | 8 | from chopdiff.docs.sizes import TextUnit 9 | from chopdiff.docs.text_doc import SentIndex, TextDoc 10 | from chopdiff.docs.wordtoks import ( 11 | PARA_BR_TOK, 12 | is_break_or_space, 13 | is_entity, 14 | is_header_tag, 15 | is_number, 16 | is_tag, 17 | is_word, 18 | join_wordtoks, 19 | visualize_wordtoks, 20 | wordtok_len, 21 | wordtokenize, 22 | ) 23 | 24 | _med_test_doc = dedent( 25 | """ 26 | # Title 27 | 28 | Hello World. This is an example sentence. And here's another one! 29 | 30 | ## Subtitle 31 | 32 | This is a new paragraph. 33 | It has several sentences. 34 | There may be line breaks within a paragraph, but these should not affect handlingof the paragraph. 35 | There are also [links](http://www.google.com) and **bold** and *italic* text. 36 | 37 | ### Itemized List 38 | 39 | - Item 1 40 | 41 | - Item 2 42 | 43 | - Item 3 44 | 45 | ### Numbered List 46 | 47 | 1. Item 1 48 | 49 | 2. Item 2 50 | 51 | 3. Item 3 52 | 53 | Testing some embedded HTML tags. 54 | 55 |

<h3>An HTML header</h3>

56 | 57 | 58 | 59 | ⏱️05:52  63 | 64 | """ 65 | ).strip() 66 | 67 | 68 | def test_document_parse_reassemble(): 69 | text = _med_test_doc 70 | doc = TextDoc.from_text(text) 71 | 72 | print("\n---Original:") 73 | pprint(text) 74 | print("\n---Parsed:") 75 | pprint(doc) 76 | 77 | reassembled_text = doc.reassemble() 78 | 79 | # Should be exactly the same except for within-paragraph line breaks. 80 | def normalize(text: str) -> str: 81 | return regex.sub(r"\s+", " ", text.replace("\n\n", "")) 82 | 83 | assert normalize(reassembled_text) == normalize(text) 84 | 85 | # Check offset of a paragraph towards the end of the document. 86 | last_para = doc.paragraphs[-1] 87 | last_para_char_offset = text.rindex(last_para.original_text) 88 | assert last_para.char_offset == last_para_char_offset 89 | 90 | 91 | def test_markup_detection(): 92 | text = _med_test_doc 93 | doc = TextDoc.from_text(text) 94 | 95 | print("Paragraph markup and header detection:") 96 | result: list[str] = [] 97 | for para in doc.paragraphs: 98 | result.append( 99 | fmt_words( 100 | abbrev_str(para.original_text, 10), 101 | "is_markup" if para.is_markup() else "", 102 | "is_header" if para.is_header() else "", 103 | ) 104 | ) 105 | 106 | print("\n".join(result)) 107 | assert ( 108 | "\n".join(result) 109 | == dedent( 110 | """ 111 | # Title is_header 112 | Hello Wor… 113 | ## Subtit… is_header 114 | This is a… 115 | ### Itemi… is_header 116 | - Item 1 117 | - Item 2 118 | - Item 3 119 | ### Numbe… is_header 120 | 1. Item 1 121 | 2. Item 2 122 | 3. Item 3 123 | Testing s… 124 |

An HT… is_header 125 | " 170 | assert doc.paragraphs[-2].is_markup() 171 | assert doc.paragraphs[-1].sentences[-1].is_markup() 172 | 173 | 174 | _simple_test_doc = dedent( 175 | """ 176 | This is the first paragraph. It has multiple sentences. 177 | 178 | This is the second paragraph. It also has multiple sentences. And it continues. 179 | 180 | Here is the third paragraph. More sentences follow. And here is another one. 181 | """ 182 | ).strip() 183 | 184 | 185 | def test_doc_sizes(): 186 | text = _med_test_doc 187 | doc = TextDoc.from_text(text) 188 | print("\n---Sizes:") 189 | size_summary = doc.size_summary() 190 | print(size_summary) 191 | 192 | assert size_summary == "726 bytes (37 lines, 16 paras, 20 sents, 82 words, 215 tiktoks)" 193 | 194 | 195 | def test_seek_doc(): 196 | doc = TextDoc.from_text(_simple_test_doc) 197 | 198 | offset = 1 199 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 200 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 201 | assert sent_index == SentIndex(para_index=0, sent_index=0) 202 | assert sent_offset == 0 203 | 204 | offset = len("This is the first paragraph.") 205 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 206 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 207 | assert sent_index == SentIndex(para_index=0, sent_index=0) 208 | assert sent_offset == 0 209 | 210 | offset = len("This is the first paragraph. ") 211 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 212 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 213 | assert sent_index == SentIndex(para_index=0, sent_index=1) 214 | assert sent_offset == offset 215 | 216 | offset = len( 217 | "This is the first paragraph. It has multiple sentences.\n\nThis is the second paragraph." 218 | ) 219 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 220 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 221 | assert sent_index == SentIndex(para_index=1, sent_index=0) 222 | assert sent_offset == len("This is the first paragraph. It has multiple sentences.\n\n") 223 | 224 | offset = len(_simple_test_doc) + 10 225 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 226 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 227 | assert sent_index == SentIndex(para_index=2, sent_index=2) 228 | 229 | 230 | _short_test_doc = dedent( 231 | """ 232 | Paragraph one lorem ipsum. 233 | Sentence 1a lorem ipsum. Sentence 1b lorem ipsum. Sentence 1c lorem ipsum. 234 | 235 | Paragraph two lorem ipsum. Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum. 236 | 237 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum. Sentence 3b lorem ipsum. Sentence 3c lorem ipsum. 238 | """ 239 | ).strip() 240 | 241 | 242 | def test_sub_doc(): 243 | doc = TextDoc.from_text(_short_test_doc) 244 | 245 | sub_doc_start = SentIndex(1, 1) 246 | sub_doc_end = SentIndex(2, 1) 247 | sub_doc = doc.sub_doc(sub_doc_start, sub_doc_end) 248 | 249 | expected_text = dedent( 250 | """ 251 | Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum. 252 | 253 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum. 254 | """ 255 | ).strip() 256 | expected_sub_doc = TextDoc.from_text(expected_text) 257 | 258 | print("---Original:") 259 | pprint(doc) 260 | print("---Subdoc:") 261 | pprint(sub_doc) 262 | 263 | # Confirm reassembled text is correct. 
264 | assert sub_doc.reassemble() == expected_sub_doc.reassemble() 265 | 266 | # Confirm sentences and offsets are preserved in sub-doc. 267 | orig_sentences = [sent for _index, sent in doc.sent_iter()] 268 | sub_sentences = [sent for _index, sent in sub_doc.sent_iter()] 269 | assert orig_sentences[5:10] == sub_sentences 270 | 271 | # Confirm indexing and reverse iteration. 272 | assert doc.sub_doc(SentIndex(0, 0), None) == doc 273 | reversed_sentences = [sent for _index, sent in doc.sent_iter(reverse=True)] 274 | assert reversed_sentences == list(reversed(orig_sentences)) 275 | 276 | 277 | def test_tokenization(): 278 | doc = TextDoc.from_text(_short_test_doc) 279 | wordtoks = list(doc.as_wordtoks()) 280 | 281 | print("\n---Tokens:") 282 | pprint(wordtoks) 283 | 284 | assert wordtoks[:6] == ["Paragraph", " ", "one", " ", "lorem", " "] 285 | assert wordtoks[-7:] == [" ", "3c", " ", "lorem", " ", "ipsum", "."] 286 | assert wordtoks.count(PARA_BR_TOK) == 2 287 | assert join_wordtoks(wordtoks) == _short_test_doc.replace( 288 | "\n", " ", 1 289 | ) # First \n is not a para break. 290 | 291 | 292 | def test_wordtok_mappings(): 293 | doc = TextDoc.from_text(_short_test_doc) 294 | 295 | print("\n---Mapping:") 296 | wordtok_mapping, sent_mapping = doc.wordtok_mappings() 297 | pprint(wordtok_mapping) 298 | pprint(sent_mapping) 299 | 300 | assert wordtok_mapping[0] == SentIndex(0, 0) 301 | assert wordtok_mapping[1] == SentIndex(0, 0) 302 | assert wordtok_mapping[4] == SentIndex(0, 0) 303 | assert wordtok_mapping[9] == SentIndex(0, 1) 304 | 305 | assert sent_mapping[SentIndex(0, 0)] == [0, 1, 2, 3, 4, 5, 6, 7, 8] 306 | assert sent_mapping[SentIndex(2, 3)] == [99, 100, 101, 102, 103, 104, 105, 106] 307 | 308 | 309 | _sentence_tests = [ 310 | "Hello, world!", 311 | "This is an example sentence with punctuation.", 312 | "And here's another one!", 313 | "Special characters: @#%^&*()", 314 | ] 315 | 316 | _sentence_test_html = 'This is a test.' 
317 | 318 | 319 | def test_wordtokization(): 320 | for sentence in _sentence_tests: 321 | wordtoks = wordtokenize(sentence) 322 | reassembled_sentence = "".join(wordtoks) 323 | assert reassembled_sentence == sentence 324 | 325 | assert wordtokenize("Multiple spaces and tabs\tand\nnewlines in between.") == [ 326 | "Multiple", 327 | " ", 328 | "spaces", 329 | " ", 330 | "and", 331 | " ", 332 | "tabs", 333 | " ", 334 | "and", 335 | " ", 336 | "newlines", 337 | " ", 338 | "in", 339 | " ", 340 | "between", 341 | ".", 342 | ] 343 | assert wordtokenize("") == [] 344 | assert wordtokenize(" ") == [" "] 345 | 346 | assert wordtokenize(_sentence_test_html) == [ 347 | "This", 348 | " ", 349 | "is", 350 | " ", 351 | '', 352 | "a", 353 | " ", 354 | "test", 355 | "", 356 | ".", 357 | ] 358 | 359 | assert len(_sentence_test_html) == sum( 360 | wordtok_len(wordtok) for wordtok in wordtokenize(_sentence_test_html) 361 | ) 362 | 363 | 364 | def test_html_tokenization(): 365 | doc = TextDoc.from_text(_sentence_test_html) 366 | wordtoks = list(doc.as_wordtoks()) 367 | 368 | print("\n---HTML Tokens:") 369 | pprint(wordtoks) 370 | 371 | assert wordtoks == [ 372 | "This", 373 | " ", 374 | "is", 375 | " ", 376 | '', 377 | "a", 378 | " ", 379 | "test", 380 | "", 381 | ".", 382 | ] 383 | assert list(map(is_tag, wordtoks)) == [ 384 | False, 385 | False, 386 | False, 387 | False, 388 | True, 389 | False, 390 | False, 391 | False, 392 | True, 393 | False, 394 | ] 395 | assert list(map(is_break_or_space, wordtoks)) == [ 396 | False, 397 | True, 398 | False, 399 | True, 400 | False, 401 | False, 402 | True, 403 | False, 404 | False, 405 | False, 406 | ] 407 | 408 | 409 | def test_tiktoken_len(): 410 | doc = TextDoc.from_text(_med_test_doc) 411 | 412 | len = doc.size(TextUnit.tiktokens) 413 | print("--Tiktoken len:") 414 | print(len) 415 | 416 | assert len > 100 417 | 418 | 419 | def test_is_footnote_def_detection(): 420 | doc = TextDoc.from_text( 421 | dedent( 422 | """ 423 | Title. 424 | 425 | Body with a ref[^a1]. 426 | 427 | [^a1]: The definition line 428 | """ 429 | ).strip() 430 | ) 431 | 432 | assert len(doc.paragraphs) == 3 433 | assert not doc.paragraphs[0].is_footnote_def() 434 | assert not doc.paragraphs[1].is_footnote_def() 435 | assert doc.paragraphs[2].is_footnote_def() 436 | -------------------------------------------------------------------------------- /src/chopdiff/html/html_in_md.py: -------------------------------------------------------------------------------- 1 | """ 2 | Formatting of Markdown with a small set of known HTML classes. We do this directly 3 | ourselves to keep the HTML very minimal, control whitespace, and to avoid any 4 | confusions of using full HTML escaping (like unnecessary "s etc.) 5 | 6 | Perhaps worth using FastHTML for this? 7 | """ 8 | 9 | import re 10 | from collections.abc import Callable 11 | from typing import TypeAlias 12 | 13 | 14 | def escape_md_html(s: str, safe: bool = False) -> str: 15 | """ 16 | Escape a string for Markdown with HTML. Don't escape single and double quotes. 17 | """ 18 | if safe: 19 | return s 20 | s = s.replace("&", "&") 21 | s = s.replace("<", "<") 22 | s = s.replace(">", ">") 23 | return s 24 | 25 | 26 | def escape_attribute(s: str) -> str: 27 | """ 28 | Escape a string for use as an HTML attribute. Escape single and double quotes. 
29 |     """
30 |     s = escape_md_html(s)
31 |     s = s.replace('"', "&quot;")
32 |     s = s.replace("'", "&#x27;")
33 |     return s
34 | 
35 | 
36 | ClassNames = str | list[str] | None
37 | Attrs = dict[str, str | bool]
38 | 
39 | _TAGS_WITH_PADDING = ["div", "p"]
40 | 
41 | 
42 | def tag_with_attrs(
43 |     tag: str,
44 |     text: str | None,
45 |     class_name: ClassNames = None,
46 |     *,
47 |     attrs: Attrs | None = None,
48 |     safe: bool = False,
49 |     padding: str | None = None,
50 | ) -> str:
51 |     """
52 |     Create an HTML tag with optional class names and attributes.
53 |     Boolean attribute values: True includes the attribute, False omits it.
54 |     """
55 |     class_value = ""
56 |     if class_name is not None:
57 |         if isinstance(class_name, str):
58 |             class_value = class_name.strip()
59 |         else:  # list[str]
60 |             # Filter out empty strings and join
61 |             filtered_classes = [cls for cls in class_name if cls.strip()]
62 |             class_value = " ".join(filtered_classes)
63 | 
64 |     attr_str = f' class="{escape_attribute(class_value)}"' if class_value else ""
65 |     if attrs:
66 |         for k, v in attrs.items():
67 |             if isinstance(v, bool):
68 |                 if v:  # Only include attribute if True
69 |                     attr_str += f" {k}"
70 |             else:  # string value
71 |                 attr_str += f' {k}="{escape_attribute(v)}"'
72 |     # Default padding for div and p tags.
73 |     if text is None:
74 |         return f"<{tag}{attr_str} />"
75 |     else:
76 |         content = escape_md_html(text, safe)
77 |         if padding is None:
78 |             padding = "\n" if tag in _TAGS_WITH_PADDING else ""
79 |         if padding:
80 |             content = content.strip("\n")
81 |             if not content:
82 |                 padding = ""
83 |         return f"<{tag}{attr_str}>{padding}{content}{padding}</{tag}>"
84 | 
85 | 
86 | def html_span(
87 |     text: str,
88 |     class_name: ClassNames = None,
89 |     *,
90 |     attrs: Attrs | None = None,
91 |     safe: bool = False,
92 | ) -> str:
93 |     """
94 |     Write a span tag for use in Markdown, with the given text and optional class and attributes.
95 |     """
96 |     return tag_with_attrs("span", text, class_name, attrs=attrs, safe=safe)
97 | 
98 | 
99 | def html_div(
100 |     text: str,
101 |     class_name: ClassNames = None,
102 |     *,
103 |     attrs: Attrs | None = None,
104 |     safe: bool = False,
105 |     padding: str | None = None,
106 | ) -> str:
107 |     """
108 |     Write a div tag for use in Markdown, with the given text and optional class and attributes.
109 |     """
110 |     return tag_with_attrs("div", text, class_name, attrs=attrs, safe=safe, padding=padding)
111 | 
112 | 
113 | def html_a(
114 |     text: str,
115 |     href: str,
116 |     class_name: ClassNames = None,
117 |     *,
118 |     attrs: Attrs | None = None,
119 |     safe: bool = False,
120 | ) -> str:
121 |     """
122 |     Write an anchor tag with href, optional class and attributes.
123 |     """
124 |     link_attrs: Attrs = {"href": href}
125 |     if attrs:
126 |         link_attrs.update(attrs)
127 |     return tag_with_attrs("a", text, class_name, attrs=link_attrs, safe=safe)
128 | 
129 | 
130 | def html_b(
131 |     text: str,
132 |     class_name: ClassNames = None,
133 |     *,
134 |     attrs: Attrs | None = None,
135 |     safe: bool = False,
136 | ) -> str:
137 |     """
138 |     Write a bold tag with optional class and attributes.
139 |     """
140 |     return tag_with_attrs("b", text, class_name, attrs=attrs, safe=safe)
141 | 
142 | 
143 | def html_i(
144 |     text: str,
145 |     class_name: ClassNames = None,
146 |     *,
147 |     attrs: Attrs | None = None,
148 |     safe: bool = False,
149 | ) -> str:
150 |     """
151 |     Write an italic tag with optional class and attributes.
152 | """ 153 | return tag_with_attrs("i", text, class_name, attrs=attrs, safe=safe) 154 | 155 | 156 | def html_img( 157 | src: str, 158 | alt: str, 159 | class_name: ClassNames = None, 160 | *, 161 | attrs: Attrs | None = None, 162 | safe: bool = False, 163 | ) -> str: 164 | img_attrs: Attrs = {"src": src, "alt": alt} 165 | if attrs: 166 | for k, v in attrs.items(): 167 | img_attrs[k] = v 168 | return tag_with_attrs("img", None, class_name, attrs=img_attrs, safe=safe) 169 | 170 | 171 | def html_join_blocks(*blocks: str | None) -> str: 172 | """ 173 | Join block elements, with double newlines for better Markdown compatibility. 174 | Ignore empty strings or None. 175 | """ 176 | return "\n\n".join(block.strip("\n") for block in blocks if block) 177 | 178 | 179 | def md_para(text: str) -> str: 180 | """ 181 | Add double newlines to the start and end of the text to make it a paragraph. 182 | """ 183 | return "\n\n".join(text.split("\n")) 184 | 185 | 186 | Wrapper: TypeAlias = Callable[[str], str] 187 | """Wraps a string to identify it in some way.""" 188 | 189 | 190 | def identity_wrapper(text: str) -> str: 191 | return text 192 | 193 | 194 | def _check_class_name(class_name: ClassNames) -> None: 195 | if class_name: 196 | if isinstance(class_name, str): 197 | # Allow modern CSS class naming including BEM notation (block__element--modifier) 198 | if class_name.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", class_name): 199 | raise ValueError(f"Expected a valid CSS class name but got: '{class_name}'") 200 | else: # list[str] 201 | for cls in class_name: 202 | if cls.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", cls): 203 | raise ValueError(f"Expected a valid CSS class name but got: '{cls}'") 204 | 205 | 206 | def html_p( 207 | text: str, 208 | class_name: ClassNames = None, 209 | *, 210 | attrs: Attrs | None = None, 211 | safe: bool = False, 212 | padding: str | None = None, 213 | ) -> str: 214 | """ 215 | Write a p tag for use in Markdown, with the given text and optional class and attributes. 216 | """ 217 | return tag_with_attrs("p", text, class_name, attrs=attrs, safe=safe, padding=padding) 218 | 219 | 220 | def html_tag( 221 | tag: str, 222 | text: str | None = None, 223 | class_name: ClassNames = None, 224 | *, 225 | attrs: Attrs | None = None, 226 | safe: bool = False, 227 | padding: str | None = None, 228 | ) -> str: 229 | """ 230 | Generic function to create any HTML tag with optional class and attributes. 
231 |     """
232 |     return tag_with_attrs(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
233 | 
234 | 
235 | def div_wrapper(
236 |     class_name: ClassNames = None,
237 |     *,
238 |     attrs: Attrs | None = None,
239 |     safe: bool = True,
240 |     padding: str | None = "\n\n",
241 | ) -> Wrapper:
242 |     _check_class_name(class_name)
243 | 
244 |     def div_wrapper_func(text: str) -> str:
245 |         return html_div(text, class_name, attrs=attrs, safe=safe, padding=padding)
246 | 
247 |     return div_wrapper_func
248 | 
249 | 
250 | def span_wrapper(
251 |     class_name: ClassNames = None,
252 |     *,
253 |     attrs: Attrs | None = None,
254 |     safe: bool = True,
255 | ) -> Wrapper:
256 |     _check_class_name(class_name)
257 | 
258 |     def span_wrapper_func(text: str) -> str:
259 |         return html_span(text, class_name, attrs=attrs, safe=safe)
260 | 
261 |     return span_wrapper_func
262 | 
263 | 
264 | def tag_wrapper(
265 |     tag: str,
266 |     class_name: ClassNames = None,
267 |     *,
268 |     attrs: Attrs | None = None,
269 |     safe: bool = True,
270 |     padding: str | None = None,
271 | ) -> Wrapper:
272 |     """
273 |     Generic wrapper factory for any HTML tag.
274 |     """
275 |     _check_class_name(class_name)
276 | 
277 |     def tag_wrapper_func(text: str) -> str:
278 |         return html_tag(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
279 | 
280 |     return tag_wrapper_func
281 | 
282 | 
283 | ## Tests
284 | 
285 | 
286 | def test_html():
287 |     assert escape_md_html("&<>") == "&amp;&lt;&gt;"
288 |     assert escape_attribute("\"'&<>") == "&quot;&#x27;&amp;&lt;&gt;"
289 |     assert (
290 |         tag_with_attrs("span", "text", class_name="foo", attrs={"id": "a"})
291 |         == '<span class="foo" id="a">text</span>'
292 |     )
293 |     assert (
294 |         html_span("text", class_name="foo", attrs={"id": "a"})
295 |         == '<span class="foo" id="a">text</span>'
296 |     )
297 |     assert (
298 |         html_div("text 1<2", class_name="foo", attrs={"id": "a"})
299 |         == '<div class="foo" id="a">\ntext 1&lt;2\n</div>'
300 |     )
301 |     assert html_div("text") == "<div>\ntext\n</div>"
302 | 
303 | 
304 | def test_boolean_attrs():
305 |     assert tag_with_attrs("input", None, attrs={"disabled": True}) == "<input disabled />"
306 |     assert tag_with_attrs("input", None, attrs={"disabled": False}) == "<input />"
307 |     assert (
308 |         tag_with_attrs("input", None, attrs={"disabled": True, "required": True, "id": "test"})
309 |         == '<input disabled required id="test" />'
310 |     )
311 |     assert (
312 |         tag_with_attrs("input", None, attrs={"disabled": False, "required": True})
313 |         == "<input required />"
314 |     )
315 | 
316 | 
317 | def test_class_names():
318 |     assert (
319 |         tag_with_attrs("div", "text", class_name=["foo", "bar"])
320 |         == '<div class="foo bar">\ntext\n</div>'
321 |     )
322 |     assert tag_with_attrs("span", "text", class_name="single") == '<span class="single">text</span>'
323 |     assert tag_with_attrs("span", "text", class_name=None) == "<span>text</span>"
324 |     assert tag_with_attrs("span", "text", class_name=[]) == "<span>text</span>"
325 |     assert tag_with_attrs("span", "text", class_name="") == "<span>text</span>"
326 |     assert tag_with_attrs("span", "text", class_name=["", ""]) == "<span>text</span>"
327 |     assert (
328 |         tag_with_attrs("span", "text", class_name=["foo", "", "bar"])
329 |         == '<span class="foo bar">text</span>'
330 |     )
331 | 
332 | 
333 | def test_padding():
334 |     assert tag_with_attrs("span", "text") == "<span>text</span>"
335 |     assert tag_with_attrs("div", "text") == "<div>\ntext\n</div>"
336 |     assert tag_with_attrs("p", "text") == "<p>\ntext\n</p>"
337 |     assert tag_with_attrs("div", "text", padding="") == "<div>text</div>"
338 |     assert tag_with_attrs("div", "", padding="\n") == "<div></div>"
339 | 
340 | 
341 | def test_safe_mode():
342 |     assert tag_with_attrs("div", "