├── tests
│   ├── __init__.py
│   ├── divs
│   │   ├── __init__.py
│   │   ├── test_div_elements.py
│   │   └── test_parse_divs.py
│   ├── docs
│   │   ├── __init__.py
│   │   ├── test_token_mapping.py
│   │   ├── test_wordtoks.py
│   │   ├── test_token_diffs.py
│   │   └── test_text_doc.py
│   ├── html
│   │   ├── __init__.py
│   │   └── test_timestamps.py
│   ├── util
│   │   ├── __init__.py
│   │   └── test_lemmatize.py
│   └── transforms
│       ├── __init__.py
│       ├── test_sliding_windows.py
│       ├── test_sliding_transforms.py
│       └── test_diff_filters.py
├── src
│   └── chopdiff
│       ├── py.typed
│       ├── __init__.py
│       ├── util
│       │   ├── __init__.py
│       │   ├── tiktoken_utils.py
│       │   └── lemmatize.py
│       ├── html
│       │   ├── html_plaintext.py
│       │   ├── extractor.py
│       │   ├── __init__.py
│       │   ├── timestamps.py
│       │   ├── html_in_md.py
│       │   └── html_tags.py
│       ├── divs
│       │   ├── __init__.py
│       │   ├── chunk_utils.py
│       │   ├── div_elements.py
│       │   ├── parse_divs.py
│       │   └── text_node.py
│       ├── docs
│       │   ├── sizes.py
│       │   ├── __init__.py
│       │   ├── search_tokens.py
│       │   ├── token_mapping.py
│       │   ├── wordtoks.py
│       │   └── token_diffs.py
│       └── transforms
│           ├── __init__.py
│           ├── sliding_windows.py
│           ├── window_settings.py
│           ├── diff_filters.py
│           └── sliding_transforms.py
├── .copier-answers.yml
├── installation.md
├── LICENSE
├── Makefile
├── .github
│   └── workflows
│       ├── publish.yml
│       └── ci.yml
├── examples
│   ├── gettysberg.txt
│   ├── insert_para_breaks.py
│   └── backfill_timestamps.py
├── devtools
│   └── lint.py
├── publishing.md
├── development.md
├── .cursor
│   └── rules
│       ├── general.mdc
│       └── python.mdc
├── .gitignore
└── pyproject.toml
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/chopdiff/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/divs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/docs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/html/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/chopdiff/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/transforms/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/chopdiff/util/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F401
2 |
3 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal
4 | from chopdiff.util.tiktoken_utils import tiktoken_len
5 |
6 | __all__ = [
7 | "lemmatize",
8 | "lemmatized_equal",
9 | "tiktoken_len",
10 | ]
11 |
--------------------------------------------------------------------------------
/src/chopdiff/util/tiktoken_utils.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 |
3 |
4 | def tiktoken_len(string: str, encoding_name: str = "o200k_base") -> int:
5 | """
6 | Length of text in tiktokens.
7 | """
8 | encoding = tiktoken.get_encoding(encoding_name)
9 | num_tokens = len(encoding.encode(string))
10 | return num_tokens
11 |
--------------------------------------------------------------------------------
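A minimal usage sketch for `tiktoken_len` above (illustrative only; the exact count depends on the `o200k_base` encoding):

```python
from chopdiff.util.tiktoken_utils import tiktoken_len

# Count tokens the way an LLM tokenizer would, e.g. for sizing windows or chunks.
n_tokens = tiktoken_len("Four score and seven years ago")
print(n_tokens)  # A small integer; exact value depends on the encoding.
```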
/.copier-answers.yml:
--------------------------------------------------------------------------------
1 | # Changes here will be overwritten by Copier. Do not edit manually.
2 | _commit: v0.2.17
3 | _src_path: gh:jlevy/simple-modern-uv
4 | package_author_email: joshua@cal.berkeley.edu
5 | package_author_name: Joshua Levy
6 | package_description: Simple tools for parsing/diffing/processing text to support LLM
7 | applications
8 | package_github_org: jlevy
9 | package_module: chopdiff
10 | package_name: chopdiff
11 |
--------------------------------------------------------------------------------
/tests/util/test_lemmatize.py:
--------------------------------------------------------------------------------
1 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal
2 |
3 |
4 | def test_lemmatize():
5 | assert lemmatize("running") == "run"
6 | assert lemmatize("better") == "good"
7 | assert lemmatize("The cats are running") == "the cat be run"
8 | assert lemmatize("Hello, world!") == "hello , world !"
9 | assert lemmatize("I have 3 cats.") == "I have 3 cat ."
10 | assert lemmatized_equal("The cat runs", "The cats running")
11 | assert not lemmatized_equal("The cat runs", "The dog runs")
12 | assert lemmatized_equal("The CAT runs", "the cats RUN")
13 | assert not lemmatized_equal("The CAT runs", "the cats RAN", case_sensitive=True)
14 |
--------------------------------------------------------------------------------
/src/chopdiff/html/html_plaintext.py:
--------------------------------------------------------------------------------
1 | import html
2 | import re
3 |
4 |
5 | def plaintext_to_html(text: str):
6 | """
7 | Convert plaintext to HTML, also handling newlines and whitespace.
8 | """
9 | return (
10 | html.escape(text)
11 | .replace("\n", "<br/>")
12 | .replace("\t", "&nbsp;" * 4)
13 | .replace("  ", "&nbsp; ")
14 | )
15 |
16 |
17 | def html_to_plaintext(text: str):
18 | """
19 | Convert HTML to plaintext, stripping tags and converting entities.
20 | """
21 | text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
22 | text = re.sub(r"</p>", "\n\n", text, flags=re.IGNORECASE)
23 | unescaped_text = html.unescape(text)
24 | clean_text = re.sub("<[^<]+?>", "", unescaped_text)
25 | return clean_text
26 |
--------------------------------------------------------------------------------
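A small round-trip sketch of the two helpers above (hypothetical example, not from the repo):

```python
from chopdiff.html.html_plaintext import html_to_plaintext, plaintext_to_html

# Newlines become HTML breaks; tags and entities are stripped on the way back.
html_text = plaintext_to_html("line one\nline two")
print(html_text)
print(html_to_plaintext(html_text))
```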
/installation.md:
--------------------------------------------------------------------------------
1 | ## Installing uv and Python
2 |
3 | This project is set up to use [**uv**](https://docs.astral.sh/uv/), the new package
4 | manager for Python. `uv` replaces traditional use of `pyenv`, `pipx`, `poetry`, `pip`,
5 | etc. Here is a quick cheat sheet:
6 |
7 | On macOS or Linux, if you don't have `uv` installed, a quick way to install it:
8 |
9 | ```shell
10 | curl -LsSf https://astral.sh/uv/install.sh | sh
11 | ```
12 |
13 | On macOS, if you prefer [brew](https://brew.sh/), you can install or upgrade uv with:
14 |
15 | ```shell
16 | brew update
17 | brew install uv
18 | ```
19 |
20 | See [uv's docs](https://docs.astral.sh/uv/getting-started/installation/) for more
21 | installation methods and platforms.
22 |
23 | Now you can use uv to install a current Python environment:
24 |
25 | ```shell
26 | uv python install 3.13 # Or pick another version.
27 | ```
28 |
--------------------------------------------------------------------------------
/src/chopdiff/divs/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F401
2 |
3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_generator, chunk_paras
4 | from chopdiff.divs.div_elements import (
5 | CHUNK,
6 | GROUP,
7 | ORIGINAL,
8 | RESULT,
9 | chunk_text_as_divs,
10 | div,
11 | div_get_original,
12 | div_insert_wrapped,
13 | )
14 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class, parse_divs_single
15 | from chopdiff.divs.text_node import TextNode
16 |
17 | __all__ = [
18 | "chunk_children",
19 | "chunk_generator",
20 | "chunk_paras",
21 | "CHUNK",
22 | "GROUP",
23 | "ORIGINAL",
24 | "RESULT",
25 | "chunk_text_as_divs",
26 | "div",
27 | "div_get_original",
28 | "div_insert_wrapped",
29 | "parse_divs",
30 | "parse_divs_by_class",
31 | "parse_divs_single",
32 | "TextNode",
33 | ]
34 |
--------------------------------------------------------------------------------
/src/chopdiff/html/extractor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from collections.abc import Iterable
3 | from typing import Generic, TypeAlias, TypeVar
4 |
5 | T = TypeVar("T")
6 |
7 | Match: TypeAlias = tuple[T, int, int]
8 | """Match, index, and offset of content found by an extractor."""
9 |
10 |
11 | class ContentNotFound(ValueError):
12 | """
13 | Exception raised when content is not found by an extractor.
14 | """
15 |
16 |
17 | class Extractor(ABC, Generic[T]):
18 | """
19 | Abstract base class for extractors that extract information from a document at a
20 | given location. We use a class and not a pure function since we may need to
21 | preprocess the document.
22 | """
23 |
24 | @abstractmethod
25 | def extract_all(self) -> Iterable[Match[T]]:
26 | pass
27 |
28 | @abstractmethod
29 | def extract_preceding(self, wordtok_offset: int) -> Match[T]:
30 | pass
31 |
--------------------------------------------------------------------------------
/src/chopdiff/util/lemmatize.py:
--------------------------------------------------------------------------------
1 | def lemmatize(text: str, lang: str = "en") -> str:
2 | """
3 | Returns a string of lemmatized tokens using simplemma.
4 | """
5 | try:
6 | import simplemma
7 | except ImportError:
8 | raise ImportError(
9 | "simplemma is an optional dependency of chopdiff. Add it to use lemmatization."
10 | )
11 |
12 | tokens = simplemma.simple_tokenizer(text)
13 | lemmatized_tokens = [simplemma.lemmatize(token, lang=lang) for token in tokens]
14 | return " ".join(lemmatized_tokens)
15 |
16 |
17 | def lemmatized_equal(text1: str, text2: str, case_sensitive: bool = False) -> bool:
18 | """
19 | Compare two texts to see if they are the same except for lemmatization.
20 | Ignores whitespace. Does not ignore punctuation.
21 | """
22 | if not case_sensitive:
23 | text1 = text1.lower()
24 | text2 = text2.lower()
25 | return lemmatize(text1) == lemmatize(text2)
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Joshua Levy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for easy development workflows.
2 | # See development.md for docs.
3 | # Note GitHub Actions call uv directly, not this Makefile.
4 |
5 | .DEFAULT_GOAL := default
6 |
7 | .PHONY: default install lint test upgrade build clean agent-rules
8 |
9 | default: agent-rules install lint test
10 |
11 | install:
12 | uv sync --all-extras
13 |
14 | lint:
15 | uv run python devtools/lint.py
16 |
17 | test:
18 | uv run pytest
19 |
20 | upgrade:
21 | uv sync --upgrade --all-extras --dev
22 |
23 | build:
24 | uv build
25 |
26 | agent-rules: CLAUDE.md AGENTS.md
27 |
28 | # Use .cursor/rules for sources of rules.
29 | # Create Claude and Codex rules from these.
30 | CLAUDE.md: .cursor/rules/general.mdc .cursor/rules/python.mdc
31 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > CLAUDE.md
32 |
33 | AGENTS.md: .cursor/rules/general.mdc .cursor/rules/python.mdc
34 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > AGENTS.md
35 |
36 | clean:
37 | -rm -rf dist/
38 | -rm -rf *.egg-info/
39 | -rm -rf .pytest_cache/
40 | -rm -rf .mypy_cache/
41 | -rm -rf .venv/
42 | -rm -rf CLAUDE.md AGENTS.md
43 | -find . -type d -name "__pycache__" -exec rm -rf {} +
44 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | release:
5 | types: [published]
6 | workflow_dispatch: # Enable manual trigger.
7 |
8 | jobs:
9 | build-and-publish:
10 | runs-on: ubuntu-latest
11 | permissions:
12 | id-token: write # Mandatory for OIDC.
13 | contents: read
14 | steps:
15 | - name: Checkout (official GitHub action)
16 | uses: actions/checkout@v4
17 | with:
18 | # Important for versioning plugins:
19 | fetch-depth: 0
20 |
21 | - name: Install uv (official Astral action)
22 | uses: astral-sh/setup-uv@v5
23 | with:
24 | version: "0.8.9"
25 | enable-cache: true
26 | python-version: "3.12"
27 |
28 | - name: Set up Python (using uv)
29 | run: uv python install
30 |
31 | - name: Install all dependencies
32 | run: uv sync --all-extras
33 |
34 | - name: Run tests
35 | run: uv run pytest
36 |
37 | - name: Build package
38 | run: uv build
39 |
40 | - name: Publish to PyPI
41 | run: uv publish --trusted-publishing always
42 | # Although uv is newer and faster, the "official" publishing option is the one from PyPA,
43 | # which uses twine. If desired, replace `uv publish` with:
44 | # uses: pypa/gh-action-pypi-publish@release/v1
45 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/sizes.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | from chopdiff.docs.wordtoks import wordtokenize
4 | from chopdiff.html.html_plaintext import html_to_plaintext
5 | from chopdiff.util.tiktoken_utils import tiktoken_len
6 |
7 |
8 | def size_in_bytes(text: str) -> int:
9 | return len(text.encode("utf-8"))
10 |
11 |
12 | def size_in_wordtoks(text: str) -> int:
13 | return len(wordtokenize(text))
14 |
15 |
16 | class TextUnit(Enum):
17 | """
18 | Text units of measure.
19 | """
20 |
21 | lines = "lines"
22 | bytes = "bytes"
23 | chars = "chars"
24 | words = "words"
25 | wordtoks = "wordtoks"
26 | paragraphs = "paragraphs"
27 | sentences = "sentences"
28 | tiktokens = "tiktokens"
29 |
30 |
31 | def size(text: str, unit: TextUnit) -> int:
32 | if unit == TextUnit.lines:
33 | return len(text.splitlines())
34 | elif unit == TextUnit.bytes:
35 | return size_in_bytes(text)
36 | elif unit == TextUnit.chars:
37 | return len(text)
38 | elif unit == TextUnit.words:
39 | # Roughly accurate for HTML, text, or Markdown docs.
40 | return len(html_to_plaintext(text).split())
41 | elif unit == TextUnit.wordtoks:
42 | return size_in_wordtoks(text)
43 | elif unit == TextUnit.tiktokens:
44 | return tiktoken_len(text)
45 | else:
46 | raise NotImplementedError(f"Unsupported unit for string: {unit}")
47 |
--------------------------------------------------------------------------------
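A brief sketch of measuring the same text in different units with `size` above (hypothetical example; note that `paragraphs` and `sentences` are not implemented for plain strings):

```python
from chopdiff.docs.sizes import TextUnit, size

text = "Hello world. This is a short test."
# Word counts use the plaintext form, so HTML tags would not inflate the count.
print(size(text, TextUnit.words), size(text, TextUnit.chars), size(text, TextUnit.bytes))
```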
/examples/gettysberg.txt:
--------------------------------------------------------------------------------
1 | four score and seven years ago our fathers brought forth on this continent, a new
2 | nation, conceived in Liberty, and dedicated to the proposition that all men are created
3 | equal. Now we are engaged in a great civil war, testing whether that nation, or any
4 | nation so conceived and so dedicated, can long endure. We are met on a great
5 | battle-field of that war. We have come to dedicate a portion of that field, as a final
6 | resting place for those who here gave their lives that that nation might live. It is
7 | altogether fitting and proper that we should do this. But, in a larger sense, we can not
8 | dedicate—we can not consecrate—we can not hallow—this ground. The brave men, living and
9 | dead, who struggled here, have consecrated it, far above our poor power to add or
10 | detract. The world will little note, nor long remember what we say here, but it can
11 | never forget what they did here. It is for us the living, rather, to be dedicated here
12 | to the unfinished work which they who fought here have thus far so nobly advanced. It is
13 | rather for us to be here dedicated to the great task remaining before us—that from these
14 | honored dead we take increased devotion to that cause for which they gave the last full
15 | measure of devotion—that we here highly resolve that these dead shall not have died in
16 | vain—that this nation, under God, shall have a new birth of freedom—and that government
17 | of the people, by the people, for the people, shall not perish from the earth.
--------------------------------------------------------------------------------
/src/chopdiff/html/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F401
2 |
3 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match
4 | from chopdiff.html.html_in_md import (
5 | Attrs,
6 | ClassNames,
7 | Wrapper,
8 | div_wrapper,
9 | escape_md_html,
10 | html_a,
11 | html_b,
12 | html_div,
13 | html_i,
14 | html_img,
15 | html_join_blocks,
16 | html_span,
17 | md_para,
18 | span_wrapper,
19 | tag_with_attrs,
20 | )
21 | from chopdiff.html.html_plaintext import html_to_plaintext, plaintext_to_html
22 | from chopdiff.html.html_tags import (
23 | TagMatch,
24 | html_extract_attribute_value,
25 | html_find_tag,
26 | rewrite_html_img_urls,
27 | rewrite_html_tag_attr,
28 | )
29 | from chopdiff.html.timestamps import (
30 | TimestampExtractor,
31 | extract_timestamp,
32 | has_timestamp,
33 | )
34 |
35 | __all__ = [
36 | "Attrs",
37 | "ClassNames",
38 | "ContentNotFound",
39 | "Extractor",
40 | "Match",
41 | "TagMatch",
42 | "html_extract_attribute_value",
43 | "html_find_tag",
44 | "rewrite_html_img_urls",
45 | "rewrite_html_tag_attr",
46 | "Wrapper",
47 | "div_wrapper",
48 | "escape_md_html",
49 | "html_a",
50 | "html_b",
51 | "html_div",
52 | "html_i",
53 | "html_img",
54 | "html_join_blocks",
55 | "html_span",
56 | "md_para",
57 | "span_wrapper",
58 | "tag_with_attrs",
59 | "html_to_plaintext",
60 | "plaintext_to_html",
61 | "TimestampExtractor",
62 | "extract_timestamp",
63 | "has_timestamp",
64 | ]
65 |
--------------------------------------------------------------------------------
/devtools/lint.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
3 | from funlog import log_calls
4 | from rich import get_console, reconfigure
5 | from rich import print as rprint
6 |
7 | # Update as needed.
8 | SRC_PATHS = ["src", "tests", "devtools", "examples"]
9 | DOC_PATHS = ["README.md"]
10 |
11 |
12 | reconfigure(emoji=not get_console().options.legacy_windows) # No emojis on legacy windows.
13 |
14 |
15 | def main():
16 | rprint()
17 |
18 | errcount = 0
19 | errcount += run(["codespell", "--write-changes", *SRC_PATHS, *DOC_PATHS])
20 | errcount += run(["ruff", "check", "--fix", *SRC_PATHS])
21 | errcount += run(["ruff", "format", *SRC_PATHS])
22 | errcount += run(["basedpyright", "--stats", *SRC_PATHS])
23 |
24 | rprint()
25 |
26 | if errcount != 0:
27 | rprint(f"[bold red]:x: Lint failed with {errcount} errors.[/bold red]")
28 | else:
29 | rprint("[bold green]:white_check_mark: Lint passed![/bold green]")
30 | rprint()
31 |
32 | return errcount
33 |
34 |
35 | @log_calls(level="warning", show_timing_only=True)
36 | def run(cmd: list[str]) -> int:
37 | rprint()
38 | rprint(f"[bold green]>> {' '.join(cmd)}[/bold green]")
39 | errcount = 0
40 | try:
41 | subprocess.run(cmd, text=True, check=True)
42 | except KeyboardInterrupt:
43 | rprint("[yellow]Keyboard interrupt - Cancelled[/yellow]")
44 | errcount = 1
45 | except subprocess.CalledProcessError as e:
46 | rprint(f"[bold red]Error: {e}[/bold red]")
47 | errcount = 1
48 |
49 | return errcount
50 |
51 |
52 | if __name__ == "__main__":
53 | exit(main())
54 |
--------------------------------------------------------------------------------
/tests/transforms/test_sliding_windows.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from textwrap import dedent
3 |
4 | from chopdiff.docs.sizes import TextUnit, size
5 | from chopdiff.docs.text_doc import TextDoc
6 | from chopdiff.transforms.sliding_windows import sliding_word_window
7 |
8 | _example_text = dedent(
9 | """
10 | This is the first paragraph. It has multiple sentences.
11 |
12 | This is the second paragraph. It also has multiple sentences. And it continues.
13 |
14 | Here is the third paragraph. More sentences follow. And here is another one.
15 | """
16 | ).strip()
17 |
18 |
19 | def test_sliding_window():
20 | doc = TextDoc.from_text(_example_text)
21 | window_size = 80
22 | window_shift = 60
23 |
24 | windows = list(sliding_word_window(doc, window_size, window_shift, TextUnit.bytes))
25 | pprint(windows)
26 |
27 | sentence_windows = [
28 | [[sent.text for sent in para.sentences] for para in doc.paragraphs] for doc in windows
29 | ]
30 |
31 | assert sentence_windows == [
32 | [["This is the first paragraph.", "It has multiple sentences."]],
33 | [["It has multiple sentences."], ["This is the second paragraph."]],
34 | [
35 | [
36 | "This is the second paragraph.",
37 | "It also has multiple sentences.",
38 | "And it continues.",
39 | ]
40 | ],
41 | [
42 | ["And it continues."],
43 | ["Here is the third paragraph.", "More sentences follow."],
44 | ],
45 | ]
46 |
47 | for sub_doc in windows:
48 | sub_text = sub_doc.reassemble()
49 |
50 | print(f"\n\n---Sub-document length {size(sub_text, TextUnit.bytes)}")
51 | pprint(sub_text)
52 |
53 | assert size(sub_text, TextUnit.bytes) <= window_size
54 |
55 | assert sub_text in doc.reassemble()
56 |
--------------------------------------------------------------------------------
/src/chopdiff/transforms/__init__.py:
--------------------------------------------------------------------------------
1 | from chopdiff.transforms.diff_filters import (
2 | WILDCARD_TOK,
3 | adds_headings,
4 | changes_whitespace,
5 | changes_whitespace_or_punct,
6 | make_token_sequence_filter,
7 | no_word_lemma_changes,
8 | removes_word_lemmas,
9 | removes_words,
10 | )
11 | from chopdiff.transforms.sliding_transforms import (
12 | TextDocTransform,
13 | filtered_transform,
14 | remove_window_br,
15 | sliding_para_window_transform,
16 | sliding_window_transform,
17 | sliding_wordtok_window_transform,
18 | )
19 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window
20 | from chopdiff.transforms.window_settings import (
21 | WINDOW_1_PARA,
22 | WINDOW_2_PARA,
23 | WINDOW_2K_WORDTOKS,
24 | WINDOW_4_PARA,
25 | WINDOW_8_PARA,
26 | WINDOW_16_PARA,
27 | WINDOW_32_PARA,
28 | WINDOW_64_PARA,
29 | WINDOW_128_PARA,
30 | WINDOW_256_PARA,
31 | WINDOW_512_PARA,
32 | WINDOW_1024_PARA,
33 | WINDOW_BR,
34 | WINDOW_BR_SEP,
35 | WINDOW_NONE,
36 | WindowSettings,
37 | )
38 |
39 | __all__ = [
40 | "WILDCARD_TOK",
41 | "adds_headings",
42 | "changes_whitespace",
43 | "changes_whitespace_or_punct",
44 | "make_token_sequence_filter",
45 | "no_word_lemma_changes",
46 | "removes_word_lemmas",
47 | "removes_words",
48 | "TextDocTransform",
49 | "filtered_transform",
50 | "remove_window_br",
51 | "sliding_para_window_transform",
52 | "sliding_window_transform",
53 | "sliding_wordtok_window_transform",
54 | "sliding_para_window",
55 | "sliding_word_window",
56 | "WINDOW_1_PARA",
57 | "WINDOW_2_PARA",
58 | "WINDOW_2K_WORDTOKS",
59 | "WINDOW_4_PARA",
60 | "WINDOW_8_PARA",
61 | "WINDOW_16_PARA",
62 | "WINDOW_32_PARA",
63 | "WINDOW_64_PARA",
64 | "WINDOW_128_PARA",
65 | "WINDOW_256_PARA",
66 | "WINDOW_512_PARA",
67 | "WINDOW_1024_PARA",
68 | "WINDOW_BR",
69 | "WINDOW_BR_SEP",
70 | "WINDOW_NONE",
71 | "WindowSettings",
72 | ]
73 |
--------------------------------------------------------------------------------
/tests/html/test_timestamps.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.html.extractor import ContentNotFound
4 | from chopdiff.html.timestamps import TimestampExtractor
5 |
6 |
7 | def test_timestamp_extractor():
8 | doc_str = '<span data-timestamp="1.234">Sentence one.</span> <span data-timestamp="23.0">Sentence two.</span> Sentence three.'
9 |
10 | extractor = TimestampExtractor(doc_str)
11 | wordtoks = extractor.wordtoks
12 |
13 | results: list[str] = []
14 | offsets: list[int] = []
15 | for i, wordtok in enumerate(wordtoks):
16 | try:
17 | timestamp, _index, offset = extractor.extract_preceding(i)
18 | except ContentNotFound:
19 | timestamp = None
20 | offset = -1
21 | results.append(f"{i}: {timestamp} ⎪{wordtok}⎪")
22 | offsets.append(offset)
23 |
24 | print("\n".join(results))
25 | print(offsets)
26 |
27 | assert (
28 | "\n".join(results)
29 | == dedent(
30 | """
31 | 0: None ⎪<-BOF->⎪
32 | 1: None ⎪<span data-timestamp="1.234">⎪
33 | 2: 1.234 ⎪Sentence⎪
34 | 3: 1.234 ⎪ ⎪
35 | 4: 1.234 ⎪one⎪
36 | 5: 1.234 ⎪.⎪
37 | 6: 1.234 ⎪</span>⎪
38 | 7: 1.234 ⎪ ⎪
39 | 8: 1.234 ⎪<span data-timestamp="23.0">⎪
40 | 9: 23.0 ⎪Sentence⎪
41 | 10: 23.0 ⎪ ⎪
42 | 11: 23.0 ⎪two⎪
43 | 12: 23.0 ⎪.⎪
44 | 13: 23.0 ⎪</span>⎪
45 | 14: 23.0 ⎪ ⎪
46 | 15: 23.0 ⎪Sentence⎪
47 | 16: 23.0 ⎪ ⎪
48 | 17: 23.0 ⎪three⎪
49 | 18: 23.0 ⎪.⎪
50 | 19: 23.0 ⎪<-EOF->⎪
51 | """
52 | ).strip()
53 | )
54 |
55 | assert offsets == [
56 | -1,
57 | -1,
58 | 0,
59 | 0,
60 | 0,
61 | 0,
62 | 0,
63 | 0,
64 | 0,
65 | 50,
66 | 50,
67 | 50,
68 | 50,
69 | 50,
70 | 50,
71 | 50,
72 | 50,
73 | 50,
74 | 50,
75 | 50,
76 | ]
77 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F401
2 |
3 | from chopdiff.docs.search_tokens import search_tokens
4 | from chopdiff.docs.sizes import TextUnit
5 | from chopdiff.docs.text_doc import Paragraph, Sentence, SentIndex, TextDoc
6 | from chopdiff.docs.token_diffs import (
7 | DIFF_FILTER_NONE,
8 | DiffFilter,
9 | DiffOp,
10 | DiffStats,
11 | OpType,
12 | TokenDiff,
13 | diff_docs,
14 | diff_wordtoks,
15 | scored_diff_wordtoks,
16 | )
17 | from chopdiff.docs.token_mapping import TokenMapping
18 | from chopdiff.docs.wordtoks import (
19 | BOF_STR,
20 | BOF_TOK,
21 | EOF_STR,
22 | EOF_TOK,
23 | PARA_BR_STR,
24 | PARA_BR_TOK,
25 | SENT_BR_STR,
26 | SENT_BR_TOK,
27 | SPACE_TOK,
28 | SYMBOL_SEP,
29 | Tag,
30 | first_wordtok,
31 | is_break_or_space,
32 | is_div,
33 | is_header_tag,
34 | is_tag,
35 | is_tag_close,
36 | is_tag_open,
37 | is_whitespace_or_punct,
38 | is_word,
39 | join_wordtoks,
40 | normalize_wordtok,
41 | wordtok_len,
42 | wordtok_to_str,
43 | wordtokenize,
44 | wordtokenize_with_offsets,
45 | )
46 |
47 | __all__ = [
48 | "search_tokens",
49 | "TextUnit",
50 | "Paragraph",
51 | "Sentence",
52 | "SentIndex",
53 | "TextDoc",
54 | "DIFF_FILTER_NONE",
55 | "DiffFilter",
56 | "DiffOp",
57 | "DiffStats",
58 | "OpType",
59 | "TokenDiff",
60 | "diff_docs",
61 | "diff_wordtoks",
62 | "scored_diff_wordtoks",
63 | "TokenMapping",
64 | "BOF_STR",
65 | "BOF_TOK",
66 | "EOF_STR",
67 | "EOF_TOK",
68 | "PARA_BR_STR",
69 | "PARA_BR_TOK",
70 | "SENT_BR_STR",
71 | "SENT_BR_TOK",
72 | "SPACE_TOK",
73 | "SYMBOL_SEP",
74 | "Tag",
75 | "first_wordtok",
76 | "is_break_or_space",
77 | "is_div",
78 | "is_header_tag",
79 | "is_tag",
80 | "is_tag_close",
81 | "is_tag_open",
82 | "is_whitespace_or_punct",
83 | "is_word",
84 | "join_wordtoks",
85 | "normalize_wordtok",
86 | "wordtok_len",
87 | "wordtok_to_str",
88 | "wordtokenize",
89 | "wordtokenize_with_offsets",
90 | ]
91 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: CI
5 |
6 | on:
7 | push:
8 | # Use ["main", "master"] for CI only on the default branch.
9 | # Use ["**"] for CI on all branches.
10 | branches: ["main", "master"]
11 | pull_request:
12 | branches: ["main", "master"]
13 |
14 | permissions:
15 | contents: read
16 |
17 | jobs:
18 | build:
19 | strategy:
20 | matrix:
21 | # Update this as needed:
22 | # Common platforms: ["ubuntu-latest", "macos-latest", "windows-latest"]
23 | os: ["ubuntu-latest"]
24 | python-version: ["3.11", "3.12", "3.13"]
25 |
26 | # Linux only by default. Use ${{ matrix.os }} for other OSes.
27 | runs-on: ${{ matrix.os }}
28 |
29 | steps:
30 |
31 | # Generally following uv docs:
32 | # https://docs.astral.sh/uv/guides/integration/github/
33 |
34 | - name: Checkout (official GitHub action)
35 | uses: actions/checkout@v4
36 | with:
37 | # Important for versioning plugins:
38 | fetch-depth: 0
39 |
40 | # From debugging the cydifflib build failure.
41 | # Confirmed we have version 3.31.6 installed.
42 | - name: Display CMake Version
43 | run: cmake --version
44 |
45 | - name: Install uv (official Astral action)
46 | uses: astral-sh/setup-uv@v5
47 | with:
48 | # Update this as needed:
49 | version: "0.8.9"
50 | enable-cache: true
51 | python-version: ${{ matrix.python-version }}
52 |
53 | - name: Set up Python (using uv)
54 | run: uv python install
55 |
56 | # Alternately can use the official Python action:
57 | # - name: Set up Python (using actions/setup-python)
58 | # uses: actions/setup-python@v5
59 | # with:
60 | # python-version: ${{ matrix.python-version }}
61 |
62 | - name: Install all dependencies
63 | run: uv sync --all-extras
64 |
65 | - name: Run linting
66 | run: uv run python devtools/lint.py
67 |
68 | - name: Run tests
69 | run: uv run pytest
--------------------------------------------------------------------------------
/src/chopdiff/divs/chunk_utils.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Callable, Generator
2 | from typing import TypeVar
3 |
4 | from chopdiff.divs.text_node import TextNode
5 | from chopdiff.docs.sizes import TextUnit
6 | from chopdiff.docs.text_doc import TextDoc
7 |
8 | T = TypeVar("T")
9 |
10 |
11 | def chunk_generator(
12 | doc: T,
13 | condition: Callable[[T], bool],
14 | slicer: Callable[[T, int, int], T],
15 | total_size: int,
16 | ) -> Generator[T, None, None]:
17 | """
18 | Walk through the elements of a document and yield sequential subdocs once they meet
19 | a specific condition.
20 | """
21 |
22 | start_index = 0
23 | current_index = 0
24 |
25 | while current_index < total_size:
26 | current_doc = slicer(doc, start_index, current_index)
27 |
28 | if condition(current_doc):
29 | yield current_doc
30 | start_index = current_index + 1
31 | current_index = start_index
32 | else:
33 | current_index += 1
34 |
35 | if start_index < total_size:
36 | yield slicer(doc, start_index, total_size)
37 |
38 |
39 | def chunk_paras(doc: TextDoc, min_size: int, unit: TextUnit) -> Generator[TextDoc, None, None]:
40 | """
41 | Generate TextDoc chunks where each chunk is at least the specified minimum size.
42 | """
43 |
44 | def condition(slice: TextDoc) -> bool:
45 | return slice.size(unit) >= min_size
46 |
47 | def slicer(doc: TextDoc, start: int, end: int) -> TextDoc:
48 | return doc.sub_paras(start, end)
49 |
50 | total_paragraphs = len(doc.paragraphs)
51 |
52 | yield from chunk_generator(doc, condition, slicer, total_paragraphs)
53 |
54 |
55 | def chunk_children(
56 | node: TextNode, min_size: int, unit: TextUnit
57 | ) -> Generator[TextNode, None, None]:
58 | """
59 | Generate TextNode chunks where each chunk is at least the specified minimum size.
60 | """
61 |
62 | def condition(slice: TextNode) -> bool:
63 | return slice.size(unit) >= min_size
64 |
65 | def slicer(node: TextNode, start: int, end: int) -> TextNode:
66 | return node.slice_children(start, end)
67 |
68 | total_children = len(node.children)
69 |
70 | yield from chunk_generator(node, condition, slicer, total_children)
71 |
--------------------------------------------------------------------------------
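A minimal sketch of `chunk_paras` above (hypothetical example, not from the repo), grouping paragraphs until each chunk reaches a minimum byte size:

```python
from chopdiff.divs.chunk_utils import chunk_paras
from chopdiff.docs.sizes import TextUnit
from chopdiff.docs.text_doc import TextDoc

doc = TextDoc.from_text("Para one.\n\nPara two.\n\nPara three.\n\nPara four.")
# Each yielded chunk is itself a TextDoc made of one or more whole paragraphs.
for chunk in chunk_paras(doc, min_size=20, unit=TextUnit.bytes):
    print(repr(chunk.reassemble()))
```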
/src/chopdiff/transforms/sliding_windows.py:
--------------------------------------------------------------------------------
1 | """
2 | Sliding windows of text on a text doc.
3 | """
4 |
5 | import logging
6 | from collections.abc import Callable, Generator
7 |
8 | from flowmark import fill_markdown
9 |
10 | from chopdiff.docs.sizes import TextUnit
11 | from chopdiff.docs.text_doc import SentIndex, TextDoc
12 |
13 | log = logging.getLogger(__name__)
14 |
15 |
16 | def sliding_word_window(
17 | doc: TextDoc, window_size: int, window_shift: int, unit: TextUnit
18 | ) -> Generator[TextDoc, None, None]:
19 | """
20 | Generate TextDoc sub-documents in a sliding window over the given document.
21 | """
22 | total_size = doc.size(unit)
23 | start_offset = 0
24 | start_index, _ = doc.seek_to_sent(start_offset, unit)
25 |
26 | while start_offset < total_size:
27 | end_offset = start_offset + window_size
28 | end_index, _ = doc.seek_to_sent(end_offset, unit)
29 |
30 | # Sentence may extend past the window, so back up to ensure it fits.
31 | sub_doc = doc.sub_doc(start_index, end_index)
32 | try:
33 | while sub_doc.size(unit) > window_size:
34 | end_index = doc.prev_sent(end_index)
35 | sub_doc = doc.sub_doc(start_index, end_index)
36 | except ValueError:
37 | raise ValueError(
38 | f"Window size {window_size} too small for sentence at offset {start_offset}"
39 | )
40 |
41 | yield sub_doc
42 |
43 | start_offset += window_shift
44 | start_index = end_index
45 |
46 |
47 | def sliding_para_window(
48 | doc: TextDoc, nparas: int, normalizer: Callable[[str], str] = fill_markdown
49 | ) -> Generator[TextDoc, None, None]:
50 | """
51 | Generate TextDoc sub-documents taking `nparas` paragraphs at a time.
52 | """
53 | for i in range(0, len(doc.paragraphs), nparas):
54 | end_index = min(i + nparas - 1, len(doc.paragraphs) - 1)
55 | sub_doc = doc.sub_doc(SentIndex(i, 0), SentIndex(end_index, 0))
56 |
57 | # XXX It's important we re-normalize especially because LLMs can output itemized lists with just
58 | # one newline, but for Markdown we want separate paragraphs for each list item.
59 | formatted_sub_doc = TextDoc.from_text(normalizer(sub_doc.reassemble()))
60 |
61 | yield formatted_sub_doc
62 |
--------------------------------------------------------------------------------
/src/chopdiff/html/timestamps.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Iterable
2 |
3 | import regex
4 | from typing_extensions import override
5 |
6 | from chopdiff.docs.search_tokens import search_tokens
7 | from chopdiff.docs.wordtoks import wordtokenize_with_offsets
8 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match
9 |
10 | # Match any span or div with a data-timestamp attribute.
11 | _TIMESTAMP_RE = regex.compile(r'(?:<\w+[^>]*\s)?data-timestamp=[\'"](\d+(\.\d+)?)[\'"][^>]*>')
12 |
13 |
14 | def extract_timestamp(wordtok: str) -> float | None:
15 | match = _TIMESTAMP_RE.search(wordtok)
16 | return float(match.group(1)) if match else None
17 |
18 |
19 | def has_timestamp(wordtok: str) -> bool:
20 | return extract_timestamp(wordtok) is not None
21 |
22 |
23 | class TimestampExtractor(Extractor[float]):
24 | """
25 | Extract timestamps of the form `<... data-timestamp="123.45">` from a document.
26 | """
27 |
28 | def __init__(self, doc_str: str):
29 | self.doc_str = doc_str
30 | self.wordtoks, self.offsets = wordtokenize_with_offsets(self.doc_str, bof_eof=True)
31 |
32 | @override
33 | def extract_all(self) -> Iterable[Match[float]]:
34 | """
35 | Extract all timestamps from the document.
36 | """
37 | for index, (wordtok, offset) in enumerate(zip(self.wordtoks, self.offsets, strict=False)):
38 | timestamp = extract_timestamp(wordtok)
39 | if timestamp is not None:
40 | yield timestamp, index, offset
41 |
42 | @override
43 | def extract_preceding(self, wordtok_offset: int) -> Match[float]:
44 | try:
45 | index, wordtok = (
46 | search_tokens(self.wordtoks).at(wordtok_offset).seek_back(has_timestamp).get_token()
47 | )
48 | if wordtok:
49 | timestamp = extract_timestamp(wordtok)
50 | if timestamp is not None:
51 | return timestamp, index, self.offsets[index]
52 | raise ContentNotFound(
53 | f"No timestamp found seeking back from token {wordtok_offset}: {wordtok!r}"
54 | )
55 | except KeyError as e:
56 | raise ContentNotFound(
57 | f"No timestamp found searching back from token {wordtok_offset}: {e}"
58 | )
59 |
--------------------------------------------------------------------------------
/publishing.md:
--------------------------------------------------------------------------------
1 | ## Publishing Releases
2 |
3 | This is how to publish a Python package to [**PyPI**](https://pypi.org/) from GitHub
4 | Actions, when using the
5 | [**simple-modern-uv**](https://github.com/jlevy/simple-modern-uv) template.
6 |
7 | Thanks to [the dynamic versioning
8 | plugin](https://github.com/ninoseki/uv-dynamic-versioning/) and the
9 | [`publish.yml` workflow](https://github.com/jlevy/simple-modern-uv/blob/main/template/.github/workflows/publish.yml),
10 | you can simply create tagged releases (using standard format for the tag name, e.g.
11 | `v0.1.0`) on GitHub and the tag will trigger a release build, which then uploads it to
12 | PyPI.
13 |
14 | ### How to Publish the First Time
15 |
16 | This part is a little confusing the first time.
17 | Here is the simplest way to do it.
18 | For the purposes of this example replace OWNER and PROJECT with the right values.
19 |
20 | 1. **Get a PyPI account** at [pypi.org](https://pypi.org/) and sign in.
21 |
22 | 2. **Pick a name for the project** that isn't already taken.
23 |
24 | - Go to `https://pypi.org/project/PROJECT` to see if another project with that name
25 | already exists.
26 |
27 | - If needed, update your `pyproject.toml` with the correct name.
28 |
29 | 3. **Authorize** your repository to publish to PyPI:
30 |
31 | - Go to [the publishing settings page](https://pypi.org/manage/account/publishing/).
32 |
33 | - Find "Trusted Publisher Management" and register your GitHub repo as a new
34 | "pending" trusted publisher
35 |
36 | - Enter the project name, repo owner, repo name, and `publish.yml` as the workflow
37 | name. (You can leave the "environment name" field blank.)
38 |
39 | 4. **Create a release** on GitHub:
40 |
41 | - Commit code and make sure it's running correctly.
42 |
43 | - Go to your GitHub project page, then click on Actions tab.
44 |
45 | - Confirm all tests are passing in the last CI workflow.
46 | (If you want, you can even publish this template when it's empty as just a stub
47 | project, to try all this out.)
48 |
49 | - Go to your GitHub project page, click on Releases.
50 |
51 | - Fill in the tag and the release name.
52 | Select to create a new tag, and pick a version.
53 | A good option is `v0.1.0`. (It's wise to have it start with a `v`.)
54 |
55 | - Submit to create the release.
56 |
57 | 5. **Confirm it publishes to PyPI**
58 |
59 | - Watch for the release workflow in the GitHub Actions tab.
60 |
61 | - If it succeeds, you should see it appear at `https://pypi.org/project/PROJECT`.
62 |
63 | ### How to Publish Subsequent Releases
64 |
65 | Just create a new release!
66 | Everything is the same as the last two steps above.
67 |
68 | * * *
69 |
70 | *This file was built with
71 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).*
72 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/search_tokens.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Callable
2 | from typing import TypeAlias
3 |
4 | Predicate: TypeAlias = Callable[[str], bool] | list[str]
5 |
6 |
7 | class _TokenSearcher:
8 | def __init__(self, toks: list[str]):
9 | self.toks = toks
10 | self._cur_idx = 0
11 |
12 | def at(self, index: int):
13 | if index is None: # pyright: ignore
14 | raise KeyError("Index cannot be None")
15 | # Convert negative indices to positive ones.
16 | self._cur_idx = index if index >= 0 else len(self.toks) + index
17 | return self
18 |
19 | def start(self):
20 | self._cur_idx = 0
21 | return self
22 |
23 | def end(self):
24 | self._cur_idx = len(self.toks)
25 | return self
26 |
27 | def seek_back(self, predicate: Predicate):
28 | if isinstance(predicate, list):
29 | allowed: list[str] = predicate
30 | predicate = lambda x: x in allowed
31 | for idx in range(self._cur_idx - 1, -1, -1):
32 | if predicate(self.toks[idx]):
33 | self._cur_idx = idx
34 | return self
35 | raise KeyError("No matching token found before the current index")
36 |
37 | def seek_forward(self, predicate: Predicate):
38 | if isinstance(predicate, list):
39 | allowed: list[str] = predicate
40 | predicate = lambda x: x in allowed
41 | for idx in range(self._cur_idx + 1, len(self.toks)):
42 | if predicate(self.toks[idx]):
43 | self._cur_idx = idx
44 | return self
45 | raise KeyError("No matching token found after the current index")
46 |
47 | def prev(self):
48 | if self._cur_idx - 1 < 0:
49 | raise KeyError("No previous token available")
50 | self._cur_idx -= 1
51 | return self
52 |
53 | def next(self):
54 | if self._cur_idx + 1 >= len(self.toks):
55 | raise KeyError("No next token available")
56 | self._cur_idx += 1
57 | return self
58 |
59 | def get_index(self) -> int:
60 | return self._cur_idx
61 |
62 | def get_token(self) -> tuple[int, str]:
63 | return self._cur_idx, self.toks[self._cur_idx]
64 |
65 |
66 | def search_tokens(wordtoks: list[str]) -> _TokenSearcher:
67 | """
68 | Fluent convenience function to search for offsets in an array of string tokens
69 | based on a predicate, previous, next, etc. Raises `KeyError` if any search
70 | has no matches.
71 |
72 | Example:
73 | ```
74 | index, token = (
75 | search_tokens(list_of_tokens)
76 | .at(my_offset)
77 | .seek_back(has_timestamp)
78 | .next()
79 | .get_token()
80 | )
81 | ```
82 | """
83 | return _TokenSearcher(wordtoks)
84 |
--------------------------------------------------------------------------------
/tests/transforms/test_sliding_transforms.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.docs.sizes import TextUnit
4 | from chopdiff.docs.text_doc import TextDoc
5 | from chopdiff.transforms.sliding_transforms import (
6 | sliding_para_window_transform,
7 | sliding_window_transform,
8 | )
9 | from chopdiff.transforms.window_settings import WINDOW_BR_SEP, WindowSettings
10 |
11 | _example_text = dedent(
12 | """
13 | This is the first paragraph. It has multiple sentences.
14 |
15 | This is the second paragraph. It also has multiple sentences. And it continues.
16 |
17 | Here is the third paragraph. More sentences follow. And here is another one.
18 | """
19 | ).strip()
20 |
21 |
22 | def test_sliding_word_window_transform():
23 | long_text = (_example_text + "\n\n") * 2
24 | doc = TextDoc.from_text(long_text)
25 |
26 | # Simple transformation that converts all text to uppercase.
27 | def transform_func(window: TextDoc) -> TextDoc:
28 | transformed_text = window.reassemble().upper()
29 | return TextDoc.from_text(transformed_text)
30 |
31 | transformed_doc = sliding_window_transform(
32 | doc,
33 | transform_func,
34 | WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5, separator="|"),
35 | )
36 | print("---Wordtok transformed doc:")
37 | print(transformed_doc.reassemble())
38 |
39 | assert transformed_doc.reassemble().count("|") == 2
40 |
41 | long_text = (_example_text + "\n\n") * 20
42 | doc = TextDoc.from_text(long_text)
43 | transformed_doc = sliding_window_transform(
44 | doc, transform_func, WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5)
45 | )
46 | assert transformed_doc.reassemble() == long_text.upper().strip()
47 |
48 |
49 | def test_sliding_para_window_transform():
50 | def transform_func(window: TextDoc) -> TextDoc:
51 | transformed_text = window.reassemble().upper()
52 | return TextDoc.from_text(transformed_text)
53 |
54 | text = "\n\n".join(f"Paragraph {i}." for i in range(7))
55 | doc = TextDoc.from_text(text)
56 |
57 | transformed_doc = sliding_para_window_transform(
58 | doc,
59 | transform_func,
60 | WindowSettings(
61 | TextUnit.paragraphs,
62 | 3,
63 | 3,
64 | separator=WINDOW_BR_SEP,
65 | ),
66 | )
67 |
68 | print("---Paragraph transformed doc:")
69 | print(transformed_doc.reassemble())
70 |
71 | assert (
72 | transformed_doc.reassemble()
73 | == dedent(
74 | """
75 | PARAGRAPH 0.
76 |
77 | PARAGRAPH 1.
78 |
79 | PARAGRAPH 2.
80 |
81 | PARAGRAPH 3.
82 |
83 | PARAGRAPH 4.
84 |
85 | PARAGRAPH 5.
86 |
87 | PARAGRAPH 6.
88 | """
89 | ).strip()
90 | )
91 |
--------------------------------------------------------------------------------
/development.md:
--------------------------------------------------------------------------------
1 | # Development
2 |
3 | ## Setting Up uv
4 |
5 | This project is set up to use [uv](https://docs.astral.sh/uv/) to manage Python and
6 | dependencies. First, be sure you
7 | [have uv installed](https://docs.astral.sh/uv/getting-started/installation/).
8 |
9 | Then [fork the jlevy/chopdiff
10 | repo](https://github.com/jlevy/chopdiff/fork) (having your own
11 | fork will make it easier to contribute) and
12 | [clone it](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
13 |
14 | ## Basic Developer Workflows
15 |
16 | The `Makefile` simply offers shortcuts to `uv` commands for developer convenience.
17 | (For clarity, GitHub Actions don't use the Makefile and just call `uv` directly.)
18 |
19 | ```shell
20 | # First, install all dependencies and set up your virtual environment.
21 | # This simply runs `uv sync --all-extras` to install all packages,
22 | # including dev dependencies and optional dependencies.
23 | make install
24 |
25 | # Run uv sync, lint, and test (and also generate agent rules):
26 | make
27 |
28 | # Build wheel:
29 | make build
30 |
31 | # Linting:
32 | make lint
33 |
34 | # Run tests:
35 | make test
36 |
37 | # Delete all the build artifacts:
38 | make clean
39 |
40 | # Upgrade dependencies to compatible versions:
41 | make upgrade
42 |
43 | # To run tests by hand:
44 | uv run pytest # all tests
45 | uv run pytest -s src/module/some_file.py # one test, showing outputs
46 |
47 | # Build and install current dev executables, to let you use your dev copies
48 | # as local tools:
49 | uv tool install --editable .
50 |
51 | # Dependency management directly with uv:
52 | # Add a new dependency:
53 | uv add package_name
54 | # Add a development dependency:
55 | uv add --dev package_name
56 | # Update to latest compatible versions (including dependencies on git repos):
57 | uv sync --upgrade
58 | # Update a specific package:
59 | uv lock --upgrade-package package_name
60 | # Update dependencies on a package:
61 | uv add package_name@latest
62 |
63 | # Run a shell within the Python environment:
64 | uv venv
65 | source .venv/bin/activate
66 | ```
67 |
68 | See [uv docs](https://docs.astral.sh/uv/) for details.
69 |
70 | ## Agent Rules
71 |
72 | See [.cursor/rules](.cursor/rules) for agent rules.
73 | These are written for [Cursor](https://www.cursor.com/) but are also used by other
74 | agents because the Makefile will generate `CLAUDE.md` and `AGENTS.md` from the same
75 | rules.
76 |
77 | ```shell
78 | make agent-rules
79 | ```
80 |
81 | ## IDE setup
82 |
83 | If you use VSCode or a fork like Cursor or Windsurf, you can install the following
84 | extensions:
85 |
86 | - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python)
87 |
88 | - [Based Pyright](https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright)
89 | for type checking. Note that this extension works with non-Microsoft VSCode forks like
90 | Cursor.
91 |
92 | ## Documentation
93 |
94 | - [uv docs](https://docs.astral.sh/uv/)
95 |
96 | - [basedpyright docs](https://docs.basedpyright.com/latest/)
97 |
98 | * * *
99 |
100 | *This file was built with
101 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).*
102 |
--------------------------------------------------------------------------------
/src/chopdiff/transforms/window_settings.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from typing_extensions import override
4 |
5 | from chopdiff.docs.sizes import TextUnit
6 |
7 | WINDOW_BR = "<!--window-br-->"
8 | """Marker inserted into result documents to show where window breaks have occurred."""
9 |
10 | WINDOW_BR_SEP = f"\n{WINDOW_BR}\n"
11 |
12 |
13 | @dataclass(frozen=True)
14 | class WindowSettings:
15 | """
16 | Size of the sliding window, the shift, and the min overlap required when stitching windows
17 | together. All sizes are in the given unit.
18 | """
19 |
20 | unit: TextUnit
21 | size: int
22 | shift: int
23 | min_overlap: int = 0
24 | separator: str = ""
25 |
26 | @override
27 | def __str__(self):
28 | return f"windowing size={self.size}, shift={self.shift}, min_overlap={self.min_overlap} {self.unit.value}"
29 |
30 |
31 | WINDOW_NONE = WindowSettings(unit=TextUnit.wordtoks, size=0, shift=0, min_overlap=0, separator="")
32 | """
33 | Do not use a sliding window.
34 | """
35 |
36 | WINDOW_2K_WORDTOKS = WindowSettings(
37 | TextUnit.wordtoks,
38 | size=2048,
39 | shift=2048 - 256,
40 | min_overlap=8,
41 | separator=WINDOW_BR_SEP,
42 | )
43 | """
44 | Sliding, overlapping word-based window. Useful for finding paragraph breaks.
45 | 2K wordtoks is several paragraphs.
46 | """
47 |
48 |
49 | WINDOW_1_PARA = WindowSettings(
50 | TextUnit.paragraphs, size=1, shift=1, min_overlap=0, separator=WINDOW_BR_SEP
51 | )
52 | """Process 1 paragraph at a time."""
53 |
54 |
55 | WINDOW_2_PARA = WindowSettings(
56 | TextUnit.paragraphs, size=2, shift=2, min_overlap=0, separator=WINDOW_BR_SEP
57 | )
58 | """Process 2 paragraphs at a time."""
59 |
60 |
61 | WINDOW_4_PARA = WindowSettings(
62 | TextUnit.paragraphs, size=4, shift=4, min_overlap=0, separator=WINDOW_BR_SEP
63 | )
64 | """Process 4 paragraph at a time."""
65 |
66 |
67 | WINDOW_8_PARA = WindowSettings(
68 | TextUnit.paragraphs, size=8, shift=8, min_overlap=0, separator=WINDOW_BR_SEP
69 | )
70 | """Process 8 paragraphs at a time."""
71 |
72 |
73 | WINDOW_16_PARA = WindowSettings(
74 | TextUnit.paragraphs, size=16, shift=16, min_overlap=0, separator=WINDOW_BR_SEP
75 | )
76 | """Process 16 paragraphs at a time."""
77 |
78 | WINDOW_32_PARA = WindowSettings(
79 | TextUnit.paragraphs, size=32, shift=32, min_overlap=0, separator=WINDOW_BR_SEP
80 | )
81 | """Process 32 paragraphs at a time."""
82 |
83 | WINDOW_64_PARA = WindowSettings(
84 | TextUnit.paragraphs, size=64, shift=64, min_overlap=0, separator=WINDOW_BR_SEP
85 | )
86 | """Process 64 paragraphs at a time."""
87 |
88 | WINDOW_128_PARA = WindowSettings(
89 | TextUnit.paragraphs, size=128, shift=128, min_overlap=0, separator=WINDOW_BR_SEP
90 | )
91 | """Process 128 paragraphs at a time."""
92 |
93 | WINDOW_256_PARA = WindowSettings(
94 | TextUnit.paragraphs, size=256, shift=256, min_overlap=0, separator=WINDOW_BR_SEP
95 | )
96 | """Process 256 paragraphs at a time."""
97 |
98 | WINDOW_512_PARA = WindowSettings(
99 | TextUnit.paragraphs, size=512, shift=512, min_overlap=0, separator=WINDOW_BR_SEP
100 | )
101 | """Process 512 paragraphs at a time."""
102 |
103 | WINDOW_1024_PARA = WindowSettings(
104 | TextUnit.paragraphs, size=1024, shift=1024, min_overlap=0, separator=WINDOW_BR_SEP
105 | )
106 | """Process 1024 paragraphs at a time."""
107 |
--------------------------------------------------------------------------------
/.cursor/rules/general.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description: General Guidelines
3 | globs:
4 | alwaysApply: true
5 | ---
6 | # Assistant Rules
7 |
8 | **Your fundamental responsibility:** Remember you are a senior engineer and have a
9 | serious responsibility to be clear, factual, think step by step and be systematic,
10 | express expert opinion, and make use of the user’s attention wisely.
11 |
12 | **Rules must be followed:** It is your responsibility to carefully read these rules as
13 | well as Python or other language-specific rules included here.
14 |
15 | Therefore:
16 |
17 | - Be concise. State answers or responses directly, without extra commentary.
18 | Or (if it is clear) directly do what is asked.
19 |
20 | - If instructions are unclear or there are two or more ways to fulfill the request that
21 | are substantially different, make a tentative plan (or offer options) and ask for
22 | confirmation.
23 |
24 | - If you can think of a much better approach than what the user requests, be sure to mention
25 | it. It’s your responsibility to suggest approaches that lead to better, simpler
26 | solutions.
27 |
28 | - Give thoughtful opinions on better/worse approaches, but NEVER say “great idea!”
29 | or “good job” or other compliments, encouragement, or non-essential banter.
30 | Your job is to give expert opinions and to solve problems, not to motivate the user.
31 |
32 | - Avoid gratuitous enthusiasm or generalizations.
33 | Use thoughtful comparisons like saying which code is “cleaner” but don’t congratulate
34 | yourself. Avoid subjective descriptions.
35 | For example, don’t say “I’ve meticulously improved the code and it is in great shape!”
36 | That is useless generalization.
37 | Instead, specifically say what you’ve done, e.g., "I’ve added types, including
38 | generics, to all the methods in `Foo` and fixed all linter errors."
39 |
40 | # General Coding Guidelines
41 |
42 | ## Using Comments
43 |
44 | - Keep all comments concise and clear and suitable for inclusion in final production.
45 |
46 | - DO use comments whenever the intent of a given piece of code is subtle or confusing or
47 | avoids a bug or is not obvious from the code itself.
48 |
49 | - DO NOT repeat in comments what is obvious from the names of functions or variables or
50 | types.
51 |
52 | - DO NOT include comments that reflect what you did, such as “Added this function” as
53 | this is meaningless to anyone reading the code later.
54 | (Instead, describe in your message to the user any other contextual information.)
55 |
56 | - DO NOT use fancy or needlessly decorated headings like “===== MIGRATION TOOLS =====”
57 | in comments.
58 |
59 | - DO NOT number steps in comments.
60 | These are hard to maintain if the code changes.
61 | NEVER DO THIS: “// Step 3: Fetch the data from the cache”\
62 | This is fine: “// Now fetch the data from the cache”
63 |
64 | - DO NOT use emojis or special unicode characters like ① or • or – or — in comments.
65 |
66 | - Use emojis in output if it enhances the clarity and can be done consistently.
67 | You may use ✔︎ and ✘ to indicate success and failure, and ∆ and ‼︎ for user-facing
68 | warnings and errors, for example, but be sure to do it consistently.
69 | DO NOT use emojis gratuitously in comments or output.
70 | You may use them ONLY when they have clear meanings (like success or failure).
71 | Unless the user says otherwise, avoid emojis and Unicode in comments as they clutter the
72 | output with little benefit.
73 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/token_mapping.py:
--------------------------------------------------------------------------------
1 | from typing_extensions import override
2 |
3 | from chopdiff.docs.token_diffs import SYMBOL_SEP, OpType, TokenDiff, diff_wordtoks
4 |
5 |
6 | class TokenMapping:
7 | """
8 | Given two sequences of tokens, create a best-estimate mapping of how the tokens
9 | in the second sequence map to the tokens in the first sequence, based on an
10 | LCS-style diff.
11 | """
12 |
13 | def __init__(
14 | self,
15 | tokens1: list[str],
16 | tokens2: list[str],
17 | diff: TokenDiff | None = None,
18 | min_tokens: int = 10,
19 | max_diff_frac: float = 0.4,
20 | ):
21 | self.tokens1 = tokens1
22 | self.tokens2 = tokens2
23 | self.diff = diff or diff_wordtoks(self.tokens1, self.tokens2)
24 | self._validate(min_tokens, max_diff_frac)
25 | self.backmap: dict[int, int] = {}
26 | self._create_mapping()
27 |
28 | def map_back(self, offset2: int) -> int:
29 | """
30 | Map an offset in the second sequence back to the offset that most closely corresponds to it
31 | in the first sequence. This might be an exact match (e.g. the same word) or the closest token
32 | (e.g. the last word before a deleted or changed word).
33 | """
34 | return self.backmap[offset2]
35 |
36 | def _validate(self, min_wordtoks: int, max_diff_frac: float):
37 | if len(self.tokens1) < min_wordtoks or len(self.tokens2) < min_wordtoks:
38 | raise ValueError(f"Documents should have at least {min_wordtoks} wordtoks")
39 |
40 | nchanges = len(self.diff.changes())
41 | if float(nchanges) / len(self.tokens1) > max_diff_frac:
42 | raise ValueError(
43 | f"Documents have too many changes: {nchanges}/{len(self.tokens1)} ({float(nchanges) / len(self.tokens1):.2f} > {max_diff_frac})"
44 | )
45 |
46 | def _create_mapping(self):
47 | offset1 = 0
48 | offset2 = 0
49 | last_offset1 = 0
50 |
51 | for op in self.diff.ops:
52 | if op.action == OpType.EQUAL:
53 | for _ in op.left:
54 | self.backmap[offset2] = offset1
55 | last_offset1 = offset1
56 | offset1 += 1
57 | offset2 += 1
58 | elif op.action == OpType.DELETE:
59 | for _ in op.left:
60 | last_offset1 = offset1
61 | offset1 += 1
62 | elif op.action == OpType.INSERT:
63 | for _ in op.right:
64 | self.backmap[offset2] = last_offset1
65 | offset2 += 1
66 | elif op.action == OpType.REPLACE:
67 | for _ in op.left:
68 | last_offset1 = offset1
69 | offset1 += 1
70 | for _ in op.right:
71 | self.backmap[offset2] = last_offset1
72 | offset2 += 1
73 |
74 | def full_mapping_str(self):
75 | """
76 | For debugging or logging, return a verbose, readable table of the mapping of each
77 | token in the second sequence to the first sequence.
78 | """
79 | return "\n".join(
80 | f"{i} {SYMBOL_SEP}{self.tokens2[i]}{SYMBOL_SEP} -> {self.map_back(i)} {SYMBOL_SEP}{self.tokens1[self.map_back(i)]}{SYMBOL_SEP}"
81 | for i in range(len(self.tokens2))
82 | )
83 |
84 | @override
85 | def __str__(self):
86 | return f"OffsetMapping(doc1 len {len(self.tokens1)}, doc2 len {len(self.tokens2)}, mapping len {len(self.backmap)})"
87 |
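A minimal usage sketch for the mapping above (illustrative only: the two sample texts are invented; the API is as defined in this file and in wordtoks.py):

from chopdiff.docs.token_mapping import TokenMapping
from chopdiff.docs.wordtoks import wordtokenize

# Tokenize two similar texts into wordtoks (words, whitespace, punctuation).
old_toks = wordtokenize("This is a simple test with some words in it.")
new_toks = wordtokenize("This is a simple test adding some other words in it.")

mapping = TokenMapping(old_toks, new_toks)

# Map an offset in the edited sequence back to the closest offset in the original.
print(mapping.map_back(6))
print(mapping.full_mapping_str())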
--------------------------------------------------------------------------------
/src/chopdiff/divs/div_elements.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_paras
4 | from chopdiff.divs.parse_divs import parse_divs
5 | from chopdiff.divs.text_node import TextNode
6 | from chopdiff.docs.sizes import TextUnit
7 | from chopdiff.docs.text_doc import TextDoc
8 | from chopdiff.docs.wordtoks import first_wordtok, is_div
9 | from chopdiff.html.html_in_md import Attrs, ClassNames, div_wrapper, html_join_blocks
10 |
11 | log = logging.getLogger(__name__)
12 |
13 |
14 | CHUNK = "chunk"
15 | """Class name for a chunk of text."""
16 |
17 | ORIGINAL = "original"
18 | """Class name for the original content."""
19 |
20 | RESULT = "result"
21 | """Class name for the result of an LLM action."""
22 |
23 | GROUP = "group"
24 | """Class name for a generic combination of elements."""
25 |
26 |
27 | def div(
28 | class_name: ClassNames,
29 | *blocks: str | None,
30 | attrs: Attrs | None = None,
31 | safe: bool = True,
32 | ) -> str:
33 | """
34 | Convenience to create Markdown-compatible div with HTML in its own paragraphs.
35 | """
36 | return div_wrapper(class_name=class_name, attrs=attrs, safe=safe, padding="\n\n")(
37 | html_join_blocks(*blocks)
38 | )
39 |
40 |
41 | def div_get_original(element: TextNode, child_name: str = ORIGINAL) -> str:
42 | """
43 | Get content of the named child element if it exists, otherwise use the whole contents.
44 | """
45 | child = element.child_by_class_name(child_name)
46 | return child.contents if child else element.contents
47 |
48 |
49 | def div_insert_wrapped(
50 | element: TextNode,
51 | new_child_blocks: list[str],
52 | container_class: ClassNames = CHUNK,
53 | original_class: str = ORIGINAL,
54 | at_front: bool = True,
55 | ) -> str:
56 | """
57 | Insert new children into a div element. As a base case, wrap the original
58 | content in a child div if it's not already present as a child.
59 | """
60 |
61 | original_element = element.child_by_class_name(original_class)
62 | if original_element:
63 | prev_contents = element.contents
64 | else:
65 | prev_contents = div(original_class, element.contents)
66 |
67 | if at_front:
68 | blocks = [*new_child_blocks, prev_contents]
69 | else:
70 | blocks = [prev_contents, *new_child_blocks]
71 |
72 | return div(container_class, html_join_blocks(*blocks))
73 |
74 |
75 | def chunk_text_as_divs(
76 | text: str, min_size: int, unit: TextUnit, class_name: ClassNames = CHUNK
77 | ) -> str:
78 | """
79 | Add HTML divs around "chunks" of text paragraphs or top-level divs, where each chunk
80 | is at least the specified minimum size.
81 | """
82 |
83 | if is_div(first_wordtok(text)):
84 | log.info("Chunking paragraphs using divs.")
85 | parsed = parse_divs(text)
86 | div_chunks = chunk_children(parsed, min_size, unit)
87 | chunk_strs = [chunk.reassemble() for chunk in div_chunks]
88 | size_summary = parsed.size_summary()
89 | else:
90 | log.info("Chunking paragraphs using newlines.")
91 | doc = TextDoc.from_text(text)
92 | doc_chunks = chunk_paras(doc, min_size, unit)
93 | chunk_strs = [chunk.reassemble() for chunk in doc_chunks]
94 | size_summary = doc.size_summary()
95 |
96 | result_divs = [div(class_name, chunk_str) for chunk_str in chunk_strs]
97 |
98 | log.info("Added %s div chunks on doc:\n%s", len(result_divs), size_summary)
99 |
100 | return "\n\n".join(result_divs)
101 |
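For reference, a rough usage sketch of the helpers above (the sample text is invented; `div`, `CHUNK`, and `chunk_text_as_divs` are as defined in this file):

from chopdiff.divs.div_elements import CHUNK, chunk_text_as_divs, div
from chopdiff.docs.sizes import TextUnit

# Wrap a block of text in a Markdown-compatible div.
print(div(CHUNK, "Some chunk text."))

# Group consecutive paragraphs into <div class="chunk"> blocks of at least 5 words each.
text = "Paragraph one is here.\n\nParagraph two is here.\n\nParagraph three."
print(chunk_text_as_divs(text, 5, TextUnit.words))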
--------------------------------------------------------------------------------
/tests/divs/test_div_elements.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.divs.div_elements import CHUNK, chunk_text_as_divs, div, div_insert_wrapped
4 | from chopdiff.divs.parse_divs import parse_divs_single
5 | from chopdiff.docs.sizes import TextUnit
6 |
7 |
8 | def test_div_insert_child():
9 | node1 = parse_divs_single("Chunk text.")
10 | node2 = parse_divs_single(div(CHUNK, "Chunk text."))
11 |
12 | child_str = div("new", "New child text.")
13 |
14 | new_result1 = div_insert_wrapped(node1, [child_str])
15 | new_result2 = div_insert_wrapped(node2, [child_str])
16 |
17 | print("\n---test_div_insert_child---")
18 | print("\nnode1:")
19 | print(node1.original_text)
20 | print("\nnode2:")
21 | print(node2.original_text)
22 | print("\nnew_child_str:")
23 | print(child_str)
24 | print("\nnew_result1:")
25 | print(new_result1)
26 | print("\nnew_result2:")
27 | print(new_result2)
28 |
29 | assert (
30 | new_result1
31 | == dedent(
32 | """
33 | <div class="chunk">
34 |
35 | <div class="new">
36 |
37 | New child text.
38 |
39 | </div>
40 |
41 | <div class="original">
42 |
43 | Chunk text.
44 |
45 | </div>
46 |
47 | </div>
48 | """
49 | ).strip()
50 | )
51 |
52 | assert new_result2 == new_result1
53 |
54 | node3 = parse_divs_single(new_result1)
55 |
56 | another_child_str = div("another", "Another child text.")
57 |
58 | new_result3 = div_insert_wrapped(node3, [another_child_str])
59 | print("\nnew_result3:")
60 | print(new_result3)
61 |
62 | assert (
63 | new_result3
64 | == dedent(
65 | """
66 | <div class="chunk">
67 |
68 | <div class="another">
69 |
70 | Another child text.
71 |
72 | </div>
73 |
74 | <div class="new">
75 |
76 | New child text.
77 |
78 | </div>
79 |
80 | <div class="original">
81 |
82 | Chunk text.
83 |
84 | </div>
85 |
86 | </div>
87 | """
88 | ).strip()
89 | )
90 |
91 |
92 | _med_test_doc = dedent(
93 | """
94 | # Title
95 |
96 | Hello World. This is an example sentence. And here's another one!
97 |
98 | ## Subtitle
99 |
100 | This is a new paragraph.
101 | It has several sentences.
102 | There may be line breaks within a paragraph, but these should not affect handling of the paragraph.
103 | There are also [links](http://www.google.com) and **bold** and *italic* text.
104 |
105 | ### Itemized List
106 |
107 | - Item 1
108 |
109 | - Item 2
110 |
111 | - Item 3
112 |
113 | extra
114 |
115 |
116 | Blah blah.
117 | """
118 | ).strip()
119 |
120 |
121 | def test_chunk_text_into_divs():
122 | assert chunk_text_as_divs("", 7, TextUnit.words) == ""
123 | assert (
124 |         chunk_text_as_divs("hello", 100, TextUnit.words) == '<div class="chunk">\n\nhello\n\n</div>'
125 | )
126 |
127 | chunked = chunk_text_as_divs(_med_test_doc, 7, TextUnit.words)
128 |
129 | print("\n---test_chunk_paras_as_divs---")
130 | print("Chunked doc:\n---\n" + chunked + "\n---")
131 |
132 | expected_first_chunk = dedent(
133 | """
134 | <div class="chunk">
135 |
136 | # Title
137 |
138 | Hello World. This is an example sentence. And here's another one!
139 |
140 | </div>
141 |
142 | ).strip()
143 |
144 | assert chunked.startswith(expected_first_chunk)
145 |     assert chunked.endswith("</div>")
146 |     assert chunked.count("</div>") == 5  # Extra spurious <div>.
148 |
--------------------------------------------------------------------------------
/tests/docs/test_token_mapping.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.docs.text_doc import TextDoc
4 | from chopdiff.docs.token_mapping import TokenMapping
5 | from chopdiff.docs.wordtoks import wordtokenize
6 |
7 |
8 | def test_offset_mapping():
9 | doc1 = TextDoc.from_text("This is a simple test with some words.")
10 | doc2 = TextDoc.from_text(
11 | "This is<-PARA-BR->a simple pytest adding other words.<-SENT-BR->And another sentence."
12 | )
13 |
14 | mapping = TokenMapping(list(doc1.as_wordtoks()), list(doc2.as_wordtoks()))
15 |
16 | mapping_str = mapping.full_mapping_str()
17 |
18 | print(mapping.diff.as_diff_str(include_equal=True))
19 | print(mapping)
20 | print(mapping.backmap)
21 | print(mapping_str)
22 |
23 | assert (
24 | mapping_str
25 | == dedent(
26 | """
27 | 0 ⎪This⎪ -> 0 ⎪This⎪
28 | 1 ⎪ ⎪ -> 1 ⎪ ⎪
29 | 2 ⎪is⎪ -> 2 ⎪is⎪
30 | 3 ⎪<-PARA-BR->⎪ -> 3 ⎪ ⎪
31 | 4 ⎪a⎪ -> 4 ⎪a⎪
32 | 5 ⎪ ⎪ -> 5 ⎪ ⎪
33 | 6 ⎪simple⎪ -> 6 ⎪simple⎪
34 | 7 ⎪ ⎪ -> 7 ⎪ ⎪
35 | 8 ⎪pytest⎪ -> 8 ⎪test⎪
36 | 9 ⎪ ⎪ -> 9 ⎪ ⎪
37 | 10 ⎪adding⎪ -> 10 ⎪with⎪
38 | 11 ⎪ ⎪ -> 11 ⎪ ⎪
39 | 12 ⎪other⎪ -> 12 ⎪some⎪
40 | 13 ⎪ ⎪ -> 13 ⎪ ⎪
41 | 14 ⎪words⎪ -> 14 ⎪words⎪
42 | 15 ⎪.⎪ -> 15 ⎪.⎪
43 | 16 ⎪<-SENT-BR->⎪ -> 15 ⎪.⎪
44 | 17 ⎪And⎪ -> 15 ⎪.⎪
45 | 18 ⎪ ⎪ -> 15 ⎪.⎪
46 | 19 ⎪another⎪ -> 15 ⎪.⎪
47 | 20 ⎪ ⎪ -> 15 ⎪.⎪
48 | 21 ⎪sentence⎪ -> 15 ⎪.⎪
49 | 22 ⎪.⎪ -> 15 ⎪.⎪
50 | """
51 | ).strip()
52 | )
53 |
54 |
55 | def test_offset_mapping_longer():
56 | doc1 = dedent(
57 | """
58 | Alright, guys.
59 | Here's the deal.
60 | You can follow me on my daily workouts.
61 | """
62 | )
63 | doc2 = dedent(
64 | """
65 | Alright, guys. Here's the deal.
66 | You can follow me on my daily workouts.
67 | """
68 | )
69 |
70 | doc1_wordtoks = wordtokenize(doc1)
71 | doc2_wordtoks = list(TextDoc.from_text(doc2).as_wordtoks())
72 |
73 | mapping = TokenMapping(doc1_wordtoks, doc2_wordtoks)
74 |
75 | mapping_str = mapping.full_mapping_str()
76 |
77 | print(mapping.diff.as_diff_str(include_equal=True))
78 | print(mapping)
79 | print(mapping.backmap)
80 | print(mapping_str)
81 |
82 | assert (
83 | mapping_str
84 | == dedent(
85 | """
86 | 0 ⎪Alright⎪ -> 2 ⎪Alright⎪
87 | 1 ⎪,⎪ -> 3 ⎪,⎪
88 | 2 ⎪ ⎪ -> 4 ⎪ ⎪
89 | 3 ⎪guys⎪ -> 5 ⎪guys⎪
90 | 4 ⎪.⎪ -> 6 ⎪.⎪
91 | 5 ⎪ ⎪ -> 8 ⎪ ⎪
92 | 6 ⎪Here⎪ -> 10 ⎪Here⎪
93 | 7 ⎪'⎪ -> 11 ⎪'⎪
94 | 8 ⎪s⎪ -> 12 ⎪s⎪
95 | 9 ⎪ ⎪ -> 13 ⎪ ⎪
96 | 10 ⎪the⎪ -> 14 ⎪the⎪
97 | 11 ⎪ ⎪ -> 15 ⎪ ⎪
98 | 12 ⎪deal⎪ -> 16 ⎪deal⎪
99 | 13 ⎪.⎪ -> 17 ⎪.⎪
100 | 14 ⎪<-SENT-BR->⎪ -> 20 ⎪⎪
101 | 15 ⎪You⎪ -> 21 ⎪You⎪
102 | 16 ⎪ ⎪ -> 22 ⎪ ⎪
103 | 17 ⎪can⎪ -> 23 ⎪can⎪
104 | 18 ⎪ ⎪ -> 24 ⎪ ⎪
105 | 19 ⎪follow⎪ -> 25 ⎪follow⎪
106 | 20 ⎪ ⎪ -> 26 ⎪ ⎪
107 | 21 ⎪me⎪ -> 27 ⎪me⎪
108 | 22 ⎪ ⎪ -> 28 ⎪ ⎪
109 | 23 ⎪on⎪ -> 29 ⎪on⎪
110 | 24 ⎪ ⎪ -> 30 ⎪ ⎪
111 | 25 ⎪my⎪ -> 31 ⎪my⎪
112 | 26 ⎪ ⎪ -> 32 ⎪ ⎪
113 | 27 ⎪daily⎪ -> 33 ⎪daily⎪
114 | 28 ⎪ ⎪ -> 34 ⎪ ⎪
115 | 29 ⎪workouts⎪ -> 35 ⎪workouts⎪
116 | 30 ⎪.⎪ -> 36 ⎪.⎪
117 | """
118 | ).strip()
119 | )
120 |
--------------------------------------------------------------------------------
/examples/insert_para_breaks.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.13"
3 | # dependencies = [
4 | # "chopdiff",
5 | # "flowmark",
6 | # "openai",
7 | # ]
8 | # ///
9 | import argparse
10 | import logging
11 | from textwrap import dedent
12 |
13 | import openai # pyright: ignore # Not a project dep.
14 | from flowmark import fill_text
15 |
16 | from chopdiff.docs import TextDoc
17 | from chopdiff.transforms import WINDOW_2K_WORDTOKS, changes_whitespace, filtered_transform
18 |
19 | logging.basicConfig(format=">> %(message)s")
20 | log = logging.getLogger(__name__)
21 | log.setLevel(logging.INFO)
22 |
23 |
24 | def heading(text: str):
25 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n"
26 |
27 |
28 | def insert_paragraph_breaks(text: str) -> str:
29 | # Create a TextDoc from the input text
30 | doc = TextDoc.from_text(text)
31 |
32 | # Handy calculations of document size in paragraphs, sentences, etc.
33 | print(f"\nInput document: {doc.size_summary()}")
34 |
35 | # Define the transformation function.
36 | # Note in this case we run the LLM on strings, but you could also work directly
37 | # on the TextDoc if appropriate.
38 | def transform(doc: TextDoc) -> TextDoc:
39 | return TextDoc.from_text(llm_insert_para_breaks(doc.reassemble()))
40 |
41 | # Apply the transformation with windowing and filtering.
42 | #
43 | # This will walk along the document in approximately 2K "wordtok" chunks
44 | # (~1000 words) and apply the transformation to each chunk. Chunks can
45 | # slightly overlap to make this more robust.
46 | #
47 | # The change on each chunk will then be filtered to only include whitespace
48 | # changes.
49 | #
50 | # Finally each change will be "stitched back" to form the original document,
51 | # by looking for the right alignment of words between the original and the
52 | # transformed chunk.
53 | #
54 | # (Turn on logging to see these details.)
55 | result_doc = filtered_transform(
56 | doc, transform, windowing=WINDOW_2K_WORDTOKS, diff_filter=changes_whitespace
57 | )
58 |
59 | print(heading("Output document"))
60 | print(f"\nOutput document: {result_doc.size_summary()}")
61 |
62 | # Return the transformed text
63 | return result_doc.reassemble()
64 |
65 |
66 | def llm_insert_para_breaks(input_text: str) -> str:
67 | """
68 | Call OpenAI to insert paragraph breaks on a chunk of text.
69 | This works best on a smaller chunk of text and might make
70 | other non-whitespace changes.
71 | """
72 | client: openai.OpenAI = openai.OpenAI()
73 |
74 | response = client.chat.completions.create(
75 | model="gpt-4o-mini",
76 | messages=[
77 | {"role": "system", "content": "You are a careful and precise editor."},
78 | {
79 | "role": "user",
80 | "content": dedent(
81 | f"""
82 | Break the following text into paragraphs.
83 |
84 | Original text:
85 |
86 | {input_text}
87 |
88 | Formatted text:
89 | """
90 | ),
91 | },
92 | ],
93 | temperature=0.0,
94 | )
95 |
96 | return response.choices[0].message.content or ""
97 |
98 |
99 | def main():
100 | parser = argparse.ArgumentParser(
101 | description="Insert paragraph breaks in text files, making no other changes of any kind to a document."
102 | )
103 | parser.add_argument("input_file", help="Path to the input text file")
104 | parser.add_argument("-o", "--output", help="Path to the output file (default: stdout)")
105 | args = parser.parse_args()
106 |
107 | logging.basicConfig(level=logging.INFO)
108 |
109 | with open(args.input_file, encoding="utf-8") as f:
110 | input_text = f.read()
111 |
112 | print(heading("Original"))
113 | print(fill_text(input_text))
114 |
115 | result = insert_paragraph_breaks(input_text)
116 |
117 | print(heading("With paragraph breaks"))
118 | print(fill_text(result))
119 |
120 |
121 | if __name__ == "__main__":
122 | main()
123 |
--------------------------------------------------------------------------------
/tests/transforms/test_diff_filters.py:
--------------------------------------------------------------------------------
1 | from chopdiff.docs.text_doc import TextDoc
2 | from chopdiff.docs.token_diffs import DiffOp, OpType, diff_wordtoks
3 | from chopdiff.docs.wordtoks import PARA_BR_TOK, SENT_BR_TOK, is_break_or_space
4 | from chopdiff.transforms.diff_filters import (
5 | WILDCARD_TOK,
6 | changes_whitespace,
7 | make_token_sequence_filter,
8 | no_word_lemma_changes,
9 | removes_word_lemmas,
10 | removes_words,
11 | )
12 |
13 |
14 | def test_filter_br_and_space():
15 | from ..docs.test_token_diffs import _short_text1, _short_text2, _short_text3
16 |
17 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks())
18 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks())
19 | wordtoks3 = list(TextDoc.from_text(_short_text3).as_wordtoks())
20 |
21 | diff = diff_wordtoks(wordtoks1, wordtoks2)
22 |
23 | accepted, rejected = diff.filter(changes_whitespace)
24 |
25 | accepted_result = accepted.apply_to(wordtoks1)
26 | rejected_result = rejected.apply_to(wordtoks1)
27 |
28 | print("---Filtered diff:")
29 | print("Original: " + "/".join(wordtoks1))
30 | print("Full diff:", diff)
31 | print("Accepted diff:", accepted)
32 | print("Rejected diff:", rejected)
33 | print("Accepted result: " + "/".join(accepted_result))
34 | print("Rejected result: " + "/".join(rejected_result))
35 |
36 | assert accepted_result == wordtoks3
37 |
38 |
39 | def test_token_sequence_filter_with_predicate():
40 | insert_op = DiffOp(OpType.INSERT, [], [SENT_BR_TOK, "", "Title", " ", PARA_BR_TOK])
41 | delete_op = DiffOp(OpType.DELETE, [SENT_BR_TOK, "", "Old Title", " ", PARA_BR_TOK], [])
42 | replace_op = DiffOp(OpType.REPLACE, ["Some", "text"], ["New", "text"])
43 | equal_op = DiffOp(OpType.EQUAL, ["Unchanged"], ["Unchanged"])
44 |
45 | action = OpType.INSERT
46 | filter_fn = make_token_sequence_filter(
47 | [is_break_or_space, "", WILDCARD_TOK, " ", is_break_or_space], action
48 | )
49 |
50 | assert filter_fn(insert_op)
51 | assert not filter_fn(delete_op) # action is INSERT
52 | assert not filter_fn(replace_op)
53 | assert not filter_fn(equal_op)
54 |
55 | ignore_whitespace_filter_fn = make_token_sequence_filter(
56 | ["", WILDCARD_TOK, " "],
57 | action=OpType.INSERT,
58 | ignore=is_break_or_space,
59 | )
60 |
61 | insert_op_with_whitespace = DiffOp(
62 | OpType.INSERT,
63 | [],
64 | [" ", SENT_BR_TOK, " ", "", "Title", " ", " ", PARA_BR_TOK, " "],
65 | )
66 |
67 | assert ignore_whitespace_filter_fn(insert_op_with_whitespace)
68 | assert not ignore_whitespace_filter_fn(delete_op) # action is INSERT
69 | assert not ignore_whitespace_filter_fn(replace_op)
70 | assert not ignore_whitespace_filter_fn(equal_op)
71 |
72 |
73 | def test_no_word_changes_lemmatized():
74 | assert not no_word_lemma_changes(DiffOp(OpType.INSERT, [], ["the"]))
75 | assert not no_word_lemma_changes(DiffOp(OpType.DELETE, ["the"], []))
76 | assert not no_word_lemma_changes(
77 | DiffOp(
78 | OpType.REPLACE,
79 | ["The", "dogs", "were", "running", "fast"],
80 | ["The", "dog", "was", "running"],
81 | )
82 | )
83 | assert no_word_lemma_changes(
84 | DiffOp(
85 | OpType.REPLACE,
86 | ["The", "dogs", "were", "running"],
87 | ["The", "dog", "was", "running"],
88 | )
89 | )
90 |
91 |
92 | def test_removes_words():
93 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", " "], []))
94 | assert removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["world"]))
95 | assert not removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"]))
96 | assert removes_word_lemmas(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"]))
97 |
98 | assert not removes_words(
99 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"])
100 | )
101 | assert removes_word_lemmas(
102 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"])
103 | )
104 |
105 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", "world"], []))
106 | assert removes_word_lemmas(DiffOp(OpType.DELETE, ["Hello", "world"], []))
107 |
--------------------------------------------------------------------------------
/src/chopdiff/divs/parse_divs.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import re
3 |
4 | from chopdiff.divs.text_node import TextNode
5 |
6 | DIV_TAGS = re.compile(r"(<div[^>]*>|</div>)", re.IGNORECASE)
7 |
8 | CLASS_NAME_PATTERN = re.compile(r"\bclass=\"([^\"]+)\"", re.IGNORECASE)
9 |
10 |
11 | def parse_divs(text: str, skip_whitespace: bool = True) -> TextNode:
12 | """
13 |     Parse a string recursively into `TextNode`s based on `<div>` tags.
14 |
15 | All offsets are relative to the original text. Text outside of a div tag is
16 | included as a `TextNode` with None markers.
17 |
18 | We do our own parsing to keep this simple and exactly preserve formatting.
19 | """
20 | parsed = _parse_divs_recursive(
21 | text,
22 | 0,
23 | TextNode(original_text=text, offset=0, content_start=0, content_end=len(text)),
24 | )
25 |
26 | if skip_whitespace:
27 | parsed = _skip_whitespace_nodes(parsed)
28 |
29 | return parsed
30 |
31 |
32 | def parse_divs_single(text: str, skip_whitespace: bool = True) -> TextNode:
33 | """
34 | Same as parse_divs but unwraps any singleton child.
35 | """
36 | divs = parse_divs(text, skip_whitespace=skip_whitespace)
37 | if len(divs.children) == 1:
38 | return divs.children[0]
39 | else:
40 | return divs
41 |
42 |
43 | def _skip_whitespace_nodes(node: TextNode) -> TextNode:
44 | filtered_node = copy.copy(node)
45 | filtered_node.children = [
46 | _skip_whitespace_nodes(child) for child in node.children if not child.is_whitespace()
47 | ]
48 | return filtered_node
49 |
50 |
51 | def _parse_divs_recursive(
52 | text: str,
53 | start_offset: int,
54 | result: TextNode,
55 | ) -> TextNode:
56 | current_offset = start_offset
57 |
58 | while current_offset < len(text):
59 | match = DIV_TAGS.search(text, current_offset)
60 |
61 | if not match:
62 | # No more div tags, add remaining content as a child node
63 | if current_offset < len(text):
64 | result.children.append(
65 | TextNode(
66 | original_text=text,
67 | offset=current_offset,
68 | content_start=current_offset,
69 | content_end=len(text),
70 | )
71 | )
72 | break
73 |
74 | if match.start() > current_offset:
75 | # Add content before the div tag as a child node.
76 | result.children.append(
77 | TextNode(
78 | original_text=text,
79 | offset=current_offset,
80 | content_start=current_offset,
81 | content_end=match.start(),
82 | )
83 | )
84 |
85 | tag = match.group(1)
86 |         is_end_tag = tag.startswith("</")
87 |
88 | if is_end_tag:
89 | # Closing tag. We're done with this node.
90 | result.end_marker = tag
91 | result.content_end = match.start()
92 | current_offset = match.end()
93 | break
94 | else:
95 | # Opening tag. Create a new child node and recurse.
96 | class_match = CLASS_NAME_PATTERN.search(tag)
97 | class_name = class_match.group(1) if class_match else None
98 |
99 | child_node = TextNode(
100 | original_text=text,
101 | offset=match.start(),
102 | content_start=match.end(),
103 | content_end=len(text),
104 | tag_name="div",
105 | class_name=class_name,
106 | begin_marker=tag,
107 | )
108 |
109 | child_node = _parse_divs_recursive(text, match.end(), child_node)
110 |
111 | result.children.append(child_node)
112 |
113 | current_offset = child_node.end_offset
114 |
115 | return result
116 |
117 |
118 | def parse_divs_by_class(text: str, class_name: str) -> list[TextNode]:
119 | """
120 | Parse div chunks into TextNodes.
121 | """
122 |
123 | text_node = parse_divs(text)
124 |
125 | matched_divs = text_node.children_by_class_names(class_name, recursive=True)
126 |
127 | if not matched_divs:
128 | raise ValueError(f"No `{class_name}` divs found in text.")
129 |
130 | return matched_divs
131 |
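A short usage sketch for the parser above (the sample HTML string is made up; the functions are as defined in this file):

from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class

html = '<div class="chunk">\n\nFirst chunk.\n\n</div>\n\n<div class="chunk">\n\nSecond chunk.\n\n</div>'

# Parse into a TextNode tree; offsets index back into the original string.
tree = parse_divs(html)
print(tree.structure_summary_str() or "")

# Or pull out only the divs with a given class name.
for node in parse_divs_by_class(html, "chunk"):
    print(node.contents.strip())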
--------------------------------------------------------------------------------
/tests/docs/test_wordtoks.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.docs.search_tokens import search_tokens
4 | from chopdiff.docs.wordtoks import (
5 | Tag,
6 | _insert_para_wordtoks,
7 | is_entity,
8 | is_tag,
9 | is_tag_close,
10 | is_tag_open,
11 | parse_tag,
12 | visualize_wordtoks,
13 | wordtokenize,
14 | )
15 |
16 | _test_doc = dedent(
17 | """
18 | Hello, world!
19 | This is an "example sentence with punctuation.
20 | "Special characters: @#%^&*()"
21 |
Alright, guys.
22 |
23 |
Here's the deal.
24 |
You can follow me on my daily workouts. 00:10
27 | """
28 | ).strip()
29 |
30 |
31 | def test_html_doc():
32 | wordtoks = wordtokenize(_test_doc, bof_eof=True)
33 |
34 | print("\n---Wordtoks test:")
35 | print(visualize_wordtoks(wordtoks))
36 |
37 | print("\n---Wordtoks with para br:")
38 | wordtoks_with_para = wordtokenize(_insert_para_wordtoks(_test_doc), bof_eof=True)
39 | print(visualize_wordtoks(wordtoks_with_para))
40 |
41 | assert (
42 | visualize_wordtoks(wordtoks)
43 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪ ⎪ ⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪ ⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪⎪00⎪:⎪10⎪ ⎪ ⎪<-EOF->⎪"""
44 | )
45 |
46 | assert (
47 | visualize_wordtoks(wordtoks_with_para)
48 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪ ⎪<-PARA-BR->⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪ ⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪⎪00⎪:⎪10⎪ ⎪ ⎪<-EOF->⎪"""
49 | )
50 |
51 | print("\n---Searching tokens")
52 |
53 | print(search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token())
54 | print(search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token())
55 | print(search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward(is_tag).get_token())
56 |
57 | assert search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token() == (
58 | 14,
59 | "example",
60 | )
61 | assert search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token() == (
62 | 63,
63 | "follow",
64 | )
65 | assert search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward(
66 | is_tag
67 | ).get_token() == (39, '')
68 |
69 |
70 | def test_tag_functions():
71 |     assert parse_tag("<div>") == Tag(name="div", is_open=True, is_close=False, attrs={})
72 |     assert parse_tag("</div>") == Tag(name="div", is_open=False, is_close=True, attrs={})
73 |     assert parse_tag("<div/>") == Tag(name="div", is_open=True, is_close=True, attrs={})
74 |     assert parse_tag("<!-- Comment -->") == Tag(
75 | name="", is_open=False, is_close=False, attrs={}, comment=" Comment "
76 | )
77 |
78 | assert not is_tag("foo")
79 | assert not is_tag("")
81 | assert is_tag(" ")
82 | assert is_tag("")
83 | assert is_tag("", ["div"])
84 | assert not is_tag("
", ["span"])
85 | assert is_tag("
")
86 |
87 | assert is_tag_close("
")
88 | assert not is_tag_close("
")
89 | assert is_tag_close("
", ["div"])
90 | assert not is_tag_close("
", ["span"])
91 | assert is_tag_close("
")
92 | assert is_tag_open("")
93 | assert not is_tag_open("
")
94 | assert is_tag_open("", ["div"])
95 | assert not is_tag_open("
", ["span"])
96 |
97 |     assert is_entity("&amp;")
98 | assert not is_entity("nbsp;")
99 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Makefile
2 | CLAUDE.md
3 | AGENTS.md
4 |
5 | # Additions to standard GitHub .gitignore:
6 | *.bak
7 | *.orig
8 | tmp/
9 | trash/
10 | attic/
11 | .kash/
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .nox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | *.py,cover
62 | .hypothesis/
63 | .pytest_cache/
64 | cover/
65 |
66 | # Translations
67 | *.mo
68 | *.pot
69 |
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 | db.sqlite3
74 | db.sqlite3-journal
75 |
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 |
80 | # Scrapy stuff:
81 | .scrapy
82 |
83 | # Sphinx documentation
84 | docs/_build/
85 |
86 | # PyBuilder
87 | .pybuilder/
88 | target/
89 |
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 |
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 |
97 | # pyenv
98 | # For a library or package, you might want to ignore these files since the code is
99 | # intended to run in multiple environments; otherwise, check them in:
100 | # .python-version
101 |
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 |
109 | # UV
110 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
111 | # This is especially recommended for binary packages to ensure reproducibility, and is more
112 | # commonly ignored for libraries.
113 | #uv.lock
114 |
115 | # poetry
116 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
117 | # This is especially recommended for binary packages to ensure reproducibility, and is more
118 | # commonly ignored for libraries.
119 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
120 | #poetry.lock
121 |
122 | # pdm
123 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
124 | #pdm.lock
125 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
126 | # in version control.
127 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
128 | .pdm.toml
129 | .pdm-python
130 | .pdm-build/
131 |
132 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133 | __pypackages__/
134 |
135 | # Celery stuff
136 | celerybeat-schedule
137 | celerybeat.pid
138 |
139 | # SageMath parsed files
140 | *.sage.py
141 |
142 | # Environments
143 | .env
144 | .venv
145 | env/
146 | venv/
147 | ENV/
148 | env.bak/
149 | venv.bak/
150 |
151 | # Spyder project settings
152 | .spyderproject
153 | .spyproject
154 |
155 | # Rope project settings
156 | .ropeproject
157 |
158 | # mkdocs documentation
159 | /site
160 |
161 | # mypy
162 | .mypy_cache/
163 | .dmypy.json
164 | dmypy.json
165 |
166 | # Pyre type checker
167 | .pyre/
168 |
169 | # pytype static type analyzer
170 | .pytype/
171 |
172 | # Cython debug symbols
173 | cython_debug/
174 |
175 | # PyCharm
176 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
177 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
178 | # and can be added to the global gitignore or merged into this file. For a more nuclear
179 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
180 | #.idea/
181 |
182 | # PyPI configuration file
183 | .pypirc
184 |
--------------------------------------------------------------------------------
/tests/docs/test_token_diffs.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.docs.text_doc import SentIndex, TextDoc
4 | from chopdiff.docs.token_diffs import DiffStats, diff_wordtoks, find_best_alignment
5 |
6 | _short_text1 = dedent(
7 | """
8 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.
9 |
10 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c.
11 |
12 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c.
13 | """
14 | ).strip()
15 |
16 |
17 | _short_text2 = dedent(
18 | """
19 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.
20 | Paragraph two blah. Sentence 2a. Sentence 2b. Sentence 2c.
21 |
22 | Paragraph three! Sentence 3a. Sentence 3b.
23 | """
24 | ).strip()
25 |
26 | # _short_text3 contains all the whitespace and break-only changes from _short_text1 to _short_text2.
27 | _short_text3 = dedent(
28 | """
29 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.
30 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c.
31 |
32 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c.
33 | """
34 | ).strip()
35 |
36 |
37 | def test_lcs_diff_wordtoks():
38 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks())
39 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks())
40 |
41 | diff = diff_wordtoks(wordtoks1, wordtoks2)
42 |
43 | print("---Diff:")
44 | print(diff.as_diff_str(True))
45 |
46 | print("---Diff stats:")
47 | print(diff.stats())
48 | assert diff.stats() == DiffStats(added=5, removed=8, input_size=59)
49 |
50 | expected_diff = dedent(
51 | """
52 | TextDiff: add/remove +5/-8 out of 59 total:
53 | at pos 0 keep 19 toks: ⎪Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.⎪
54 | at pos 19 repl 1 toks: - ⎪<-PARA-BR->⎪
55 | repl 1 toks: + ⎪ ⎪
56 | at pos 20 keep 3 toks: ⎪Paragraph two⎪
57 | at pos 23 add 2 toks: + ⎪ blah⎪
58 | at pos 23 keep 1 toks: ⎪.⎪
59 | at pos 24 repl 1 toks: - ⎪ ⎪
60 | repl 1 toks: + ⎪<-SENT-BR->⎪
61 | at pos 25 keep 18 toks: ⎪Sentence 2a. Sentence 2b. Sentence 2c.<-PARA-BR->Paragraph three⎪
62 | at pos 43 repl 1 toks: - ⎪.⎪
63 | repl 1 toks: + ⎪!⎪
64 | at pos 44 keep 10 toks: ⎪<-SENT-BR->Sentence 3a. Sentence 3b.⎪
65 | at pos 54 del 5 toks: - ⎪ Sentence 3c.⎪
66 | """
67 | ).strip()
68 |
69 | assert str(diff.as_diff_str(True)) == expected_diff
70 |
71 |
72 | def test_apply_to():
73 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks())
74 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks())
75 |
76 | diff = diff_wordtoks(wordtoks1, wordtoks2)
77 |
78 | print("---Before apply:")
79 | print("/".join(wordtoks1))
80 | print(diff)
81 | result = diff.apply_to(wordtoks1)
82 | print("---Result of apply:")
83 | print("/".join(result))
84 | print("---Expected:")
85 | print("/".join(wordtoks2))
86 | assert result == wordtoks2
87 |
88 | wordtoks3 = ["a", "b", "c", "d", "e"]
89 | wordtoks4 = ["a", "x", "c", "y", "e"]
90 | diff2 = diff_wordtoks(wordtoks3, wordtoks4)
91 | result2 = diff2.apply_to(wordtoks3)
92 | assert result2 == wordtoks4
93 |
94 |
95 | def test_find_best_alignment():
96 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks())
97 | wordtoks2 = list(TextDoc.from_text(_short_text1).sub_doc(SentIndex(1, 1)).as_wordtoks())
98 | wordtoks3 = wordtoks2 + ["Extra", "wordtoks", "at", "the", "end"]
99 | wordtoks4 = list(wordtoks3)
100 | wordtoks4[0] = "X"
101 | wordtoks4[3] = "Y"
102 |
103 | print("---Alignment:")
104 | print("/".join(wordtoks1))
105 | print("/".join(wordtoks2))
106 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks2, 1)
107 | print(f"Offset: {offset}, Score: {score}")
108 | print(diff)
109 | print()
110 | assert offset == 39
111 | assert score == 0.0
112 | assert diff.changes() == []
113 |
114 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks3, 3)
115 | print(f"Offset: {offset}, Score: {score}")
116 | print(diff)
117 | print()
118 | assert offset == 39
119 | assert score == 0.0
120 | assert diff.changes() == []
121 |
122 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks4, 3)
123 | print(f"Offset: {offset}, Score: {score}")
124 | print(diff)
125 | print()
126 | assert offset == 39
127 | assert score > 0 and score < 0.3
128 | assert diff.stats().nchanges() == 4
129 |
--------------------------------------------------------------------------------
/tests/divs/test_parse_divs.py:
--------------------------------------------------------------------------------
1 | from textwrap import dedent
2 |
3 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class
4 | from chopdiff.divs.text_node import TextNode
5 |
6 | _test_text = dedent(
7 | """
8 |
9 | <div class="outer">
10 | Outer content paragraph 1.
11 |
12 | Outer content paragraph 2.
13 | <div class="inner">
14 | Inner content.
15 | <div>
16 | Nested content.
17 | </div>
18 |
19 | <div class="nested-inner">
20 |
21 | Nested inner content.
22 | <div>
23 | Deeply nested content.
24 | </div>
25 |
26 | </div>
27 | </div>
28 |
29 | Outer content paragraph 3.
30 | </div>
31 | """
32 | )
33 |
34 |
35 | def _strip_lines(text: str) -> list[str]:
36 | return [line.strip() for line in text.strip().split("\n")]
37 |
38 |
39 | def test_parse_divs():
40 | def validate_node(node: TextNode, original_text: str):
41 | assert node.original_text == original_text
42 | assert 0 <= node.content_start <= len(original_text)
43 | assert 0 <= node.content_end <= len(original_text)
44 | assert node.content_start <= node.content_end
45 | assert node.contents == original_text[node.content_start : node.content_end]
46 | assert (
47 | node.begin_marker is None
48 | or original_text[node.offset : node.offset + len(node.begin_marker)]
49 | == node.begin_marker
50 | )
51 | assert (
52 | node.end_marker is None
53 | or original_text[node.content_end : node.content_end + len(node.end_marker)]
54 | == node.end_marker
55 | )
56 |
57 | for child in node.children:
58 | validate_node(child, original_text)
59 |
60 | node = parse_divs(_test_text, skip_whitespace=False)
61 |
62 | node_no_whitespace = parse_divs(_test_text, skip_whitespace=True)
63 |
64 | reassembled = node.reassemble(padding="")
65 |
66 | print()
67 | print(f"Original text (length {len(_test_text)}):")
68 | print(_test_text)
69 |
70 | print()
71 | print("Parsed text:")
72 | print(node)
73 |
74 | print()
75 | print("Parsed text (no whitespace):")
76 | print(node_no_whitespace)
77 |
78 | print()
79 | print(f"Reassembled text (length {len(reassembled)}):")
80 | print(reassembled)
81 |
82 | print()
83 | print("Reassembled text (normalized padding):")
84 | print(node.reassemble())
85 |
86 | validate_node(node, _test_text)
87 |
88 |     assert reassembled.count("<div") == _test_text.count("<div")
89 |
90 |
91 | def test_structure_summary_str():
92 |     doc = """
93 |
94 |
95 |     <div class="chunk">Chunk1</div>
96 |     <div class="chunk">Chunk2</div>
97 |     <div class="chunk">Chunk3</div>
98 |     """
99 |
100 | node = parse_divs(doc)
101 | summary_str = node.structure_summary_str() or ""
102 |
103 | print()
104 | print("Structure summary:")
105 | print(summary_str)
106 |
107 | expected_summary = dedent(
108 | """
109 | HTML structure:
110 | 3 div.chunk
111 | """
112 | ).strip()
113 |
114 | assert _strip_lines(summary_str) == _strip_lines(expected_summary)
115 |
116 |
117 | def test_structure_summary_str_2():
118 | node = parse_divs(_test_text)
119 | summary_str = node.structure_summary_str() or ""
120 |
121 | print()
122 | print("Structure summary:")
123 | print(summary_str)
124 |
125 | expected_summary = dedent(
126 | """
127 | HTML structure:
128 | 1 div.outer
129 | 1 div.outer > div.inner
130 | 1 div.outer > div.inner > div
131 | 1 div.outer > div.inner > div.nested-inner
132 | 1 div.outer > div.inner > div.nested-inner > div
133 | """
134 | ).strip()
135 |
136 | assert _strip_lines(summary_str) == _strip_lines(expected_summary)
137 |
138 |
139 | def test_parse_chunk_divs():
140 | text = dedent(
141 | """
142 | <div class="chunk">
143 |
144 | Chunk 1 text.
145 |
146 | </div>
147 |
148 | <div class="chunk">
149 |
150 | Chunk 2 text.
151 |
152 | </div>
153 |
154 | <div class="chunk">Empty chunk.</div>
155 |
156 | """
157 | )
158 |
159 | chunk_divs = parse_divs_by_class(text, "chunk")
160 |
161 | print("\n---test_parse_chunk_divs---")
162 | for chunk_div in chunk_divs:
163 | print(chunk_div.reassemble())
164 | print("---")
165 |
166 |     assert chunk_divs[0].reassemble() == """<div class="chunk">\n\nChunk 1 text.\n\n</div>"""
167 | assert chunk_divs[0].contents.strip() == "Chunk 1 text."
168 | assert len(chunk_divs) == 3
169 |
--------------------------------------------------------------------------------
/examples/backfill_timestamps.py:
--------------------------------------------------------------------------------
1 | # /// script
2 | # requires-python = ">=3.13"
3 | # dependencies = [
4 | # "chopdiff",
5 | # "flowmark",
6 | # ]
7 | # ///
8 | import logging
9 | from textwrap import dedent
10 |
11 | from chopdiff.docs import BOF_TOK, EOF_TOK, PARA_BR_TOK, TextDoc, TokenMapping, search_tokens
12 | from chopdiff.html import ContentNotFound, TimestampExtractor
13 |
14 | logging.basicConfig(format=">> %(message)s")
15 | log = logging.getLogger(__name__)
16 | log.setLevel(logging.INFO)
17 |
18 |
19 | def format_timestamp(timestamp: float) -> str:
20 | hours, remainder = divmod(timestamp, 3600)
21 | minutes, seconds = divmod(remainder, 60)
22 | if hours:
23 | return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"
24 | else:
25 | return f"{int(minutes):02}:{int(seconds):02}"
26 |
27 |
28 | def add_timestamp(text: str, timestamp: float) -> str:
29 |     return f'{text}<span data-timestamp="{timestamp}">⏱️{format_timestamp(timestamp)}</span>'
30 |
31 |
32 | def heading(text: str):
33 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n"
34 |
35 |
36 | def backfill_timestamps(target_text: str, source_text: str) -> str:
37 | """
38 | Backfill timestamps from a source document into a target document.
39 |     The source document should have timestamps in `<span>`s with a `data-timestamp` attribute.
40 | The target document should have mostly similar text but no timestamps.
41 | """
42 |
43 | print(heading("Source text (with timestamps)"))
44 | print(source_text)
45 |
46 | print(heading("Target text (without timestamps)"))
47 | print(target_text)
48 |
49 | # Parse the target document into wordtoks.
50 | target_doc = TextDoc.from_text(target_text)
51 | extractor = TimestampExtractor(source_text)
52 | source_wordtoks = extractor.wordtoks
53 |
54 | # Create a mapping between source and target docs.
55 | target_wordtoks = list(target_doc.as_wordtoks(bof_eof=True))
56 | token_mapping = TokenMapping(source_wordtoks, target_wordtoks)
57 |
58 | print(heading("Diff"))
59 | print(token_mapping.diff.as_diff_str())
60 |
61 | print(heading("Token mapping"))
62 | print(token_mapping.full_mapping_str())
63 |
64 | for wordtok_offset, (wordtok, sent_index) in enumerate(
65 | target_doc.as_wordtok_to_sent(bof_eof=True)
66 | ):
67 | # Look for each end of paragraph or end of doc.
68 | if wordtok in [PARA_BR_TOK, EOF_TOK]:
69 | # Find the start of the paragraph.
70 | start_para_index, start_para_wordtok = (
71 | search_tokens(target_wordtoks)
72 | .at(wordtok_offset)
73 | .seek_back([BOF_TOK, PARA_BR_TOK])
74 | .next()
75 | .get_token()
76 | )
77 |
78 | wordtok_offset = start_para_index
79 |
80 | source_wordtok_offset = token_mapping.map_back(wordtok_offset)
81 |
82 | log.info(
83 | "Seeking back tok %s (%s) to para start tok %s (%s), map back to source tok %s (%s)",
84 | wordtok_offset,
85 | wordtok,
86 | start_para_index,
87 | start_para_wordtok,
88 | source_wordtok_offset,
89 | source_wordtoks[source_wordtok_offset],
90 | )
91 |
92 | try:
93 | timestamp, _index, _offset = extractor.extract_preceding(source_wordtok_offset)
94 | sent = target_doc.get_sent(sent_index)
95 |
96 | if sent.is_markup():
97 | log.info("Skipping markup-only sentence: %s", sent.text)
98 | continue
99 |
100 | log.info("Adding timestamp to sentence: %s", sent)
101 |
102 | sent.text = add_timestamp(sent.text, timestamp)
103 |
104 | except ContentNotFound:
105 | # Missing timestamps shouldn't be fatal.
106 | log.warning(
107 | "Failed to extract timestamp at doc token %s (%s) -> source token %s (%s): %s",
108 | wordtok_offset,
109 | wordtok,
110 | source_wordtok_offset,
111 | source_wordtoks[source_wordtok_offset],
112 | sent_index,
113 | )
114 |
115 | result = target_doc.reassemble()
116 |
117 | print(heading("Result (with backfilled timestamps)"))
118 | print(result)
119 |
120 | return result
121 |
122 |
123 | def main():
124 | # Example source text with timestamps:
125 | source_text = dedent(
126 | """
127 | Welcome to this um ... video about Python programming.
128 | First, we'll talk about variables. Variables are containers for storing data values.
129 | Then let's look at functions. Functions help us organize and reuse code.
130 | """
131 | )
132 |
133 | # Example target text (similar content but edited, with no timestamps):
134 | target_text = dedent(
135 | """
136 | ## Introduction
137 |
138 | Welcome to this video about Python programming.
139 |
140 | First, we'll talk about variables. Next, let's look at functions. Functions help us organize and reuse code.
141 | """
142 | )
143 |
144 | backfill_timestamps(target_text, source_text)
145 |
146 |
147 | if __name__ == "__main__":
148 | main()
149 |
--------------------------------------------------------------------------------
/src/chopdiff/transforms/diff_filters.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Callable
2 | from typing import TypeAlias
3 |
4 | from typing_extensions import override
5 |
6 | from chopdiff.docs.token_diffs import DiffFilter, DiffOp, OpType
7 | from chopdiff.docs.wordtoks import (
8 | is_break_or_space,
9 | is_tag_close,
10 | is_tag_open,
11 | is_whitespace_or_punct,
12 | is_word,
13 | )
14 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal
15 |
16 |
17 | class WildcardToken:
18 | """
19 | Wildcard token that matches any number of tokens (including zero).
20 | """
21 |
22 | @override
23 | def __str__(self):
24 | return "*"
25 |
26 |
27 | WILDCARD_TOK = WildcardToken()
28 |
29 | TokenMatcher: TypeAlias = list[str] | Callable[[str], bool]
30 |
31 | TokenPattern: TypeAlias = str | Callable[[str], bool] | WildcardToken
32 |
33 |
34 | def _matches_pattern(tokens: list[str], pattern: list[TokenPattern]) -> bool:
35 | def match_from(i: int, j: int) -> bool:
36 | while i <= len(tokens) and j < len(pattern):
37 | pattern_elem = pattern[j]
38 | if pattern_elem == WILDCARD_TOK:
39 | # If '*' is the last pattern element, it matches any remaining tokens.
40 | if j + 1 == len(pattern):
41 | return True
42 |                 # Advance the pattern index to the next element after the wildcard.
43 | j += 1
44 | while i < len(tokens):
45 | if match_from(i, j):
46 | return True
47 | i += 1
48 | return False
49 | else:
50 | if i >= len(tokens):
51 | return False
52 | token = tokens[i]
53 | if isinstance(pattern_elem, str):
54 | if token != pattern_elem:
55 | return False
56 | elif callable(pattern_elem):
57 | if not pattern_elem(token):
58 | return False
59 | else:
60 | return False
61 | i += 1
62 | j += 1
63 |         # Skip any remaining wildcard tokens in the pattern.
64 | while j < len(pattern) and pattern[j] == WILDCARD_TOK:
65 | j += 1
66 | # The tokens match the pattern if both indices are at the end.
67 | return i == len(tokens) and j == len(pattern)
68 |
69 | return match_from(0, 0)
70 |
71 |
72 | def make_token_sequence_filter(
73 | pattern: list[TokenPattern],
74 | action: OpType | None = None,
75 | ignore: TokenMatcher | None = None,
76 | ) -> DiffFilter:
77 | """
78 | Returns a `DiffFilter` that accepts `DiffOps` where the tokens match the given pattern.
79 | The pattern is a list where each element can be a string or a predicate function that
80 | takes a token and returns a bool (True if the token matches).
81 |     A `WILDCARD_TOK` in the pattern list matches any number of tokens (including zero).
82 | If `action` is specified, only `DiffOps` with that action are considered.
83 | """
84 |
85 | def filter_fn(diff_op: DiffOp) -> bool:
86 | if action and diff_op.action != action:
87 | return False
88 |
89 | tokens = diff_op.all_changed()
90 |         if ignore and isinstance(ignore, list):
91 | tokens = [tok for tok in tokens if tok not in ignore]
92 | elif ignore and callable(ignore):
93 | tokens = [tok for tok in tokens if not ignore(tok)]
94 |
95 | return _matches_pattern(tokens, pattern)
96 |
97 | return filter_fn
98 |
99 |
100 | def changes_whitespace(diff_op: DiffOp) -> bool:
101 | """
102 | Only accepts changes to sentence and paragraph breaks and whitespace.
103 | """
104 |
105 | return all(is_break_or_space(tok) for tok in diff_op.all_changed())
106 |
107 |
108 | def changes_whitespace_or_punct(diff_op: DiffOp) -> bool:
109 | """
110 | Only accepts changes to punctuation and whitespace.
111 | """
112 |
113 | return all(is_whitespace_or_punct(tok) for tok in diff_op.all_changed())
114 |
115 |
116 | def no_word_lemma_changes(diff_op: DiffOp) -> bool:
117 | """
118 | Only accept changes that preserve the lemmatized form of words.
119 | """
120 | if diff_op.action == OpType.EQUAL:
121 | return True
122 | elif diff_op.action == OpType.REPLACE:
123 | return lemmatized_equal(
124 | " ".join(tok for tok in diff_op.left if is_word(tok)),
125 | " ".join(tok for tok in diff_op.right if is_word(tok)),
126 | )
127 | else:
128 | return len([tok for tok in diff_op.all_changed() if is_word(tok)]) == 0
129 |
130 |
131 | def removes_words(diff_op: DiffOp) -> bool:
132 | """
133 | Only accept changes that remove words. Changes to spaces and punctuation are allowed.
134 | """
135 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL:
136 | return True
137 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT:
138 | return all(is_whitespace_or_punct(tok) for tok in set(diff_op.right) - set(diff_op.left))
139 | else:
140 | return False
141 |
142 |
143 | def removes_word_lemmas(diff_op: DiffOp) -> bool:
144 | """
145 | Only accept changes that remove words or replace them with their lemmatized forms.
146 | Changes to spaces and punctuation are allowed.
147 | """
148 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL:
149 | return True
150 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT:
151 | left_words = [tok for tok in diff_op.left if is_word(tok)]
152 | right_words = [tok for tok in diff_op.right if is_word(tok)]
153 |
154 | left_lemmas = [lemmatize(word) for word in left_words]
155 | right_lemmas = [lemmatize(word) for word in right_words]
156 |
157 | return set(right_lemmas).issubset(set(left_lemmas))
158 | else:
159 | return False
160 |
161 |
162 | def adds_headings(diff_op: DiffOp) -> bool:
163 | """
164 | Only accept changes that add contents within header tags.
165 | """
166 | headers = ["h1", "h2", "h3", "h4", "h5", "h6"]
167 | is_header = lambda tok: is_tag_open(tok, tag_names=headers) # pyright: ignore
168 | is_header_close = lambda tok: is_tag_close(tok, tag_names=headers) # pyright: ignore
169 | matcher = make_token_sequence_filter(
170 | [is_header, WILDCARD_TOK, is_header_close],
171 | action=OpType.INSERT,
172 | ignore=is_break_or_space,
173 | )
174 | return matcher(diff_op)
175 |
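A brief sketch of applying one of these filters to a wordtok diff (the sample sentences are invented; the diff API is as in token_diffs.py):

from chopdiff.docs.text_doc import TextDoc
from chopdiff.docs.token_diffs import diff_wordtoks
from chopdiff.transforms.diff_filters import changes_whitespace

before = list(TextDoc.from_text("One sentence. Another sentence here.").as_wordtoks())
after = list(TextDoc.from_text("One sentence.\n\nAnother phrase here.").as_wordtoks())

diff = diff_wordtoks(before, after)

# Keep only whitespace and break changes; reject the word substitution.
accepted, rejected = diff.filter(changes_whitespace)
print("/".join(accepted.apply_to(before)))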
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # ---- Project Info and Dependencies ----
2 |
3 | [project.urls]
4 | Repository = "https://github.com/jlevy/chopdiff"
5 | # Homepage = "https://..."
6 | # Documentation = "https://..."
7 |
8 | [project]
9 | name = "chopdiff"
10 | description = "Chunking, diff filtering, and windowed transforms of text to support LLM applications"
11 | authors = [
12 | { name="Joshua Levy", email="joshua@cal.berkeley.edu" },
13 | ]
14 | readme = "README.md"
15 | license = "MIT"
16 | requires-python = ">=3.11,<4.0"
17 | dynamic = ["version"]
18 |
19 | # https://pypi.org/classifiers/
20 | # Adjust as needed:
21 | classifiers = [
22 | # Adjust as needed:
23 | "Development Status :: 4 - Beta",
24 | # "Development Status :: 5 - Production/Stable",
25 | "Intended Audience :: Developers",
26 | "Operating System :: OS Independent",
27 | "Programming Language :: Python",
28 | "Programming Language :: Python :: 3",
29 | "Programming Language :: Python :: 3.11",
30 | "Programming Language :: Python :: 3.12",
31 | "Programming Language :: Python :: 3.13",
32 | "Typing :: Typed",
33 | # Include this to avoid accidentally publishing to PyPI:
34 | # "Private :: Do Not Upload",
35 | ]
36 |
37 |
38 | # ---- Main dependencies ----
39 |
40 | dependencies = [
41 | "prettyfmt>=0.3.0",
42 | "flowmark>=0.5.3",
43 | "strif>=2.1.0",
44 | "funlog>=0.2.1",
45 | "cydifflib>=1.2.0",
46 | "tiktoken>=0.9.0",
47 | "regex>=2024.11.6",
48 | "selectolax>=0.3.32",
49 | ]
50 |
51 | [project.optional-dependencies]
52 | extras = [
53 | "simplemma>=1.1.2",
54 | ]
55 |
56 | [dependency-groups]
57 | dev = [
58 | "pytest>=8.3.5",
59 | "pytest-sugar>=1.0.0",
60 | "ruff>=0.11.9",
61 | "codespell>=2.4.1",
62 | "rich>=14.0.0",
63 | "basedpyright==1.29.5", # TODO: Upgrade when Cursor supports it.
64 | "funlog>=0.2.1",
65 | ]
66 |
67 | [project.scripts]
68 | # Add script entry points here:
69 | chopdiff = "chopdiff:main"
70 |
71 |
72 | # ---- Build system ----
73 |
74 | # Dynamic versioning from:
75 | # https://github.com/ninoseki/uv-dynamic-versioning/
76 |
77 | [build-system]
78 | requires = ["hatchling", "uv-dynamic-versioning"]
79 | build-backend = "hatchling.build"
80 |
81 | [tool.hatch.version]
82 | source = "uv-dynamic-versioning"
83 | # Note JSON schemas don't seem to be right for tool.hatch.version.source so
84 | # this may cause false warnings in IDEs.
85 | # https://github.com/ninoseki/uv-dynamic-versioning/issues/21
86 |
87 | [tool.uv-dynamic-versioning]
88 | vcs = "git"
89 | style = "pep440"
90 | bump = true
91 |
92 | [tool.hatch.build.targets.wheel]
93 | # The source location for the package.
94 | packages = ["src/chopdiff"]
95 |
96 |
97 | # ---- Settings ----
98 |
99 | [tool.ruff]
100 | # Set as desired, typically 88 (black standard) or 100 (wide).
101 | line-length = 100
102 |
103 | [tool.ruff.lint]
104 | select = [
105 | # See: https://docs.astral.sh/ruff/rules/
106 | # Basic list from: https://docs.astral.sh/ruff/linter/#rule-selection
107 | "E", # https://docs.astral.sh/ruff/rules/#error-e
108 | "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f
109 | "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up
110 | "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b
111 | "I", # https://docs.astral.sh/ruff/rules/#isort-i
112 | # Other possibilities:
113 | # "D" # https://docs.astral.sh/ruff/rules/#pydocstyle-d
114 | # "Q" # https://docs.astral.sh/ruff/rules/#flake8-quotes-q
115 | # "COM" # https://docs.astral.sh/ruff/rules/#flake8-commas-com
116 | # "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim
117 |
118 | ]
119 | ignore = [
120 | # Disable some rules that are overly pedantic. Add/remove as desired:
121 | "E501", # https://docs.astral.sh/ruff/rules/line-too-long/
122 | "E402", # https://docs.astral.sh/ruff/rules/module-import-not-at-top-of-file/
123 | "E731", # https://docs.astral.sh/ruff/rules/lambda-assignment/
124 | "B904",
125 | # We use both ruff formatter and linter so some rules should always be disabled.
126 | # See: https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
127 | "W191", # https://docs.astral.sh/ruff/rules/tab-indentation/
128 | "E111", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple/
129 | "E114", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple-comment/
130 | "E117", # https://docs.astral.sh/ruff/rules/over-indented/
131 | "D206", # https://docs.astral.sh/ruff/rules/docstring-tab-indentation/
132 | "D300", # https://docs.astral.sh/ruff/rules/triple-single-quotes/
133 | "Q000", # https://docs.astral.sh/ruff/rules/bad-quotes-inline-string/
134 | "Q001", # https://docs.astral.sh/ruff/rules/bad-quotes-multiline-string/
135 | "Q002", # https://docs.astral.sh/ruff/rules/bad-quotes-docstring/
136 | "Q003", # https://docs.astral.sh/ruff/rules/avoidable-escaped-quote/
137 | "COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/
138 | "COM819", # https://docs.astral.sh/ruff/rules/prohibited-trailing-comma/
139 | "ISC002", # https://docs.astral.sh/ruff/rules/multi-line-implicit-string-concatenation/
140 | ]
141 |
142 | [tool.basedpyright]
143 | # BasedPyright currently seems like the best type checker option, much faster
144 | # than mypy and with a good extension for VSCode/Cursor.
145 | # https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright
146 | # https://docs.basedpyright.com/latest/configuration/config-files/#sample-pyprojecttoml-file
147 | include = ["src", "tests", "devtools"]
148 | # By default BasedPyright is very strict, so you almost certainly want to disable
149 | # some of the rules.
150 | # First, these turn off warnings about (yes) how you ignore warnings:
151 | reportIgnoreCommentWithoutRule = false
152 | reportUnnecessaryTypeIgnoreComment = false
153 | # A few typically noisy warnings are next.
154 | # How many you enable is up to you. The first few are off by default, but you can
155 | # comment/uncomment these as desired:
156 | reportMissingTypeStubs = false
157 | reportUnusedCallResult = false
158 | reportAny = false
159 | reportExplicitAny = false
160 | reportImplicitStringConcatenation = false
161 | reportUnreachable = false
162 | reportUnknownMemberType = false
163 | # reportPrivateImportUsage = false
164 | # reportPrivateLocalImportUsage = false
165 | # reportMissingImports = false
166 | # reportUnnecessaryIsInstance = false
167 | reportUnknownVariableType = false
168 | # reportUnknownArgumentType = false
169 | reportUnannotatedClassAttribute = false
170 | reportUnknownLambdaType = false
171 | reportPrivateUsage = false
172 |
173 | [tool.codespell]
174 | ignore-words-list = "Numbe"
175 | # skip = "foo.py,bar.py"
176 |
177 | [tool.pytest.ini_options]
178 | python_files = ["*.py"]
179 | python_classes = ["Test*"]
180 | python_functions = ["test_*"]
181 | testpaths = [
182 | "src",
183 | "tests",
184 | ]
185 | norecursedirs = []
186 | filterwarnings = []
187 |
188 |
189 |
--------------------------------------------------------------------------------
/src/chopdiff/divs/text_node.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from collections.abc import Callable
4 | from copy import copy
5 | from dataclasses import dataclass, field
6 |
7 | from prettyfmt import fmt_lines
8 | from typing_extensions import override
9 |
10 | from chopdiff.docs.sizes import TextUnit
11 | from chopdiff.docs.text_doc import Splitter, TextDoc, default_sentence_splitter
12 | from chopdiff.html.html_in_md import div_wrapper
13 |
14 |
15 | @dataclass
16 | class TextNode:
17 | """
18 | A node in parsed structured text, with reference offsets into the original text.
19 | Useful for parsing Markdown broken into div tags.
20 | """
21 |
22 | original_text: str
23 |
24 | # Offsets into the original text.
25 | offset: int
26 | content_start: int
27 | content_end: int
28 |
29 | tag_name: str | None = None
30 | class_name: str | None = None
31 | begin_marker: str | None = None
32 | end_marker: str | None = None
33 |
34 | children: list[TextNode] = field(default_factory=list)
35 |
36 | @property
37 | def end_offset(self) -> int:
38 | assert self.content_end >= 0
39 | return self.content_end + len(self.end_marker) if self.end_marker else self.content_end
40 |
41 | @property
42 | def contents(self) -> str:
43 | return self.original_text[self.content_start : self.content_end]
44 |
45 | def text_doc(self, sentence_splitter: Splitter = default_sentence_splitter) -> TextDoc:
46 | return TextDoc.from_text(self.contents, sentence_splitter=sentence_splitter)
47 |
48 | def slice_children(self, start: int, end: int) -> TextNode:
49 | if not self.children:
50 | raise ValueError("Cannot slice_children on a non-container node.")
51 | else:
52 | node_copy = copy(self)
53 | node_copy.children = node_copy.children[start:end]
54 | return node_copy
55 |
56 | def size(self, unit: TextUnit) -> int:
57 | if self.children:
58 | return sum(child.size(unit) for child in self.children)
59 | else:
60 | return self.text_doc().size(unit)
61 |
62 | def structure_summary(self) -> dict[str, int]:
63 | """
64 | Recursively tally the number of non-empty leaf nodes of different types as CSS-style paths.
65 |         For example:
66 |
67 | { "_total": 7, "div.chunk": 5, "div.chunk > div.summary": 2, "div.chunk > div.content": 5 }
68 |
69 |         would mean that there were 5 chunk divs, each with a content div, and 2 with
70 | a summary div within it.
71 | """
72 |
73 | def path_join(*selectors: str) -> str:
74 | return " > ".join(selectors)
75 |
76 | def tally_recursive(node: TextNode, path: list[str], tally: dict[str, int]) -> None:
77 | # Skip leaf nodes.
78 | if not node.children and not node.tag_name and not node.class_name:
79 | return
80 |
81 | tag_selector = node.tag_name if node.tag_name else ""
82 | class_selector = f".{node.class_name}" if node.class_name else ""
83 | selector = f"{tag_selector}{class_selector}"
84 | new_path = path + [selector] if selector else path
85 |
86 | # Increment counts.
87 | path_key = path_join(*new_path)
88 | if path_key:
89 | tally[path_key] = tally.get(path_key, 0) + 1
90 |
91 | for child in node.children:
92 | tally_recursive(child, new_path, tally)
93 |
94 | tally: dict[str, int] = {}
95 | tally_recursive(self, [], tally)
96 |
97 | sorted_tally = dict(sorted(tally.items()))
98 | return sorted_tally
99 |
100 | def structure_summary_str(self) -> str | None:
101 | structure_summary = self.structure_summary()
102 | if not structure_summary:
103 | return None
104 | else:
105 | return "HTML structure:\n" + fmt_lines(
106 | [f"{count:6d} {path}" for path, count in self.structure_summary().items()],
107 | prefix="",
108 | )
109 |
110 | def size_summary(self) -> str:
111 | """
112 | Return a summary of the size of the doc as well as a summary of its
113 | div/HTML structure.
114 | """
115 | summary = self.text_doc().size_summary()
116 | if structure_summary_str := self.structure_summary_str():
117 | summary += "\n" + structure_summary_str
118 | return summary
119 |
120 | def is_whitespace(self) -> bool:
121 | """
122 | Is this node whitespace only?
123 | """
124 | return not self.children and self.contents.strip() == ""
125 |
126 | def children_by_class_names(self, *class_names: str, recursive: bool = False) -> list[TextNode]:
127 | def collect_children(node: TextNode) -> list[TextNode]:
128 | matching_children = [
129 | child for child in node.children if child.class_name in class_names
130 | ]
131 | if recursive:
132 | for child in node.children:
133 | matching_children.extend(collect_children(child))
134 | return matching_children
135 |
136 | return collect_children(self)
137 |
138 | def child_by_class_name(self, class_name: str) -> TextNode | None:
139 | nodes = self.children_by_class_names(class_name, recursive=False)
140 | if len(nodes) == 0:
141 | return None
142 | if len(nodes) > 1:
143 | raise ValueError(f"Multiple children with class name {class_name}")
144 | return nodes[0]
145 |
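# Illustrative usage of the class-name lookups above (a sketch only; it assumes `node`
# is a TextNode produced by the div parser in chopdiff.divs.parse_divs, not shown here):
#
#   summaries = node.children_by_class_names("summary", recursive=True)
#   content = node.child_by_class_name("content")  # None if absent, error if duplicated.
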
146 | def reassemble(self, padding: str = "\n\n") -> str:
147 | """
148 |         Reassemble as a string. If padding is non-empty, strip each part, skip whitespace-only
149 |         children, and insert our own padding.
150 | """
151 | strip_fn: Callable[[str], str] = lambda s: s.strip() if padding else s
152 | skip_whitespace = bool(padding)
153 |
154 | if not self.children:
155 | if not self.tag_name:
156 | return strip_fn(self.contents)
157 | else:
158 | wrap = div_wrapper(self.class_name, padding=padding)
159 | return wrap(strip_fn(self.contents))
160 | else:
161 | padded_children = (padding or "").join(
162 | child.reassemble(padding)
163 | for child in self.children
164 | if (not skip_whitespace or not child.is_whitespace())
165 | )
166 | if not self.tag_name:
167 | return padded_children
168 | else:
169 | wrap = div_wrapper(self.class_name, padding=padding)
170 | return wrap(padded_children)
171 |
172 | @override
173 | def __str__(self):
174 | """
175 | Return a recursive, formatted string representation of the node and its children.
176 | """
177 | return self._str_recursive()
178 |
179 | def _str_recursive(self, level: int = 0, max_len: int = 40) -> str:
180 | indent = " " * level
181 | content_preview = self.contents
182 | if len(content_preview) > max_len:
183 | content_preview = content_preview[:20] + "…" + content_preview[-20:]
184 | result = (
185 | f"{indent}TextNode(tag_name={self.tag_name} class_name={self.class_name} offset={self.offset},"
186 | f" content_start={self.content_start}, content_end={self.content_end}) "
187 | f"{repr(content_preview)}\n"
188 | )
189 | for child in self.children:
190 | result += child._str_recursive(level + 1)
191 | return result
192 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/wordtoks.py:
--------------------------------------------------------------------------------
1 | """
2 | Support for treating text as a sequence of word, punctuation, whitespace
3 | (word, sentence, and paragraph breaks), or HTML tags as tokens, which we call
4 | "wordtoks".
5 |
6 | Also works well with Markdown. Wordtoks make it possible to do word-oriented
7 | parsing, diffs, and transforms, while also preserving HTML tags and significant
8 | whitespace.
9 | """
10 |
11 | from dataclasses import dataclass
12 |
13 | import regex
14 |
15 | # Special tokens to represent sentence, paragraph, and document boundaries.
16 | # Note these parse as single tokens, just like HTML tags, so they can safely be mixed into inputs if desired.
17 | SENT_BR_TOK = "<-SENT-BR->"
18 | PARA_BR_TOK = "<-PARA-BR->"
19 | BOF_TOK = "<-BOF->"
20 | EOF_TOK = "<-EOF->"
21 |
22 | SENT_BR_STR = " "
23 | PARA_BR_STR = "\n\n"
24 | BOF_STR = ""
25 | EOF_STR = ""
26 |
27 | SPACE_TOK = " "
28 |
29 | SYMBOL_SEP = "⎪"
30 |
31 | # Currently break on words, spaces, or any single other/punctuation character.
32 | # HTML tags (of length <1024 chars, possibly with newlines) and entities are also a single token.
33 | # TODO: Could add nicer support for Markdown formatting as well.
34 | # Updated pattern to include HTML entities
35 | _wordtok_pattern = regex.compile(
36 | r"(<(?:[^<>]|\n){0,1024}>|\&\w+;|\&\#\d+;|\w+|[^\w\s]|\s+)", regex.DOTALL
37 | )
38 |
39 | _para_br_pattern = regex.compile(r"\s*\n\n\s*")
40 |
41 | # TODO: Is it worth using the regex package here to get \p{L} or is there a good
42 | # enough way with re only?
43 | _word_pat = regex.compile(r"\p{L}+", regex.UNICODE)
44 |
45 | _number_pat = regex.compile(r"\d+")
46 |
47 | _tag_pattern = regex.compile(r"<(/?)(\w+)([^>]*?)(/?)\s*>", regex.IGNORECASE)
48 |
49 | _comment_pattern = regex.compile(r"<!--(.*?)-->", regex.DOTALL)
50 |
51 |
52 | def wordtok_to_str(wordtok: str) -> str:
53 | """
54 | Convert a wordtok to a string, mapping any special wordtoks to their usual
55 | representations.
56 | """
57 | if wordtok == SENT_BR_TOK:
58 | return SENT_BR_STR
59 | if wordtok == PARA_BR_TOK:
60 | return PARA_BR_STR
61 | if wordtok == BOF_TOK:
62 | return BOF_STR
63 | if wordtok == EOF_TOK:
64 | return EOF_STR
65 | return wordtok
66 |
67 |
68 | def wordtok_len(wordtok: str) -> int:
69 | """
70 | Char length of a wordtok.
71 | """
72 | return len(wordtok_to_str(wordtok))
73 |
74 |
75 | _whitespace = regex.compile(r"\s+")
76 |
77 |
78 | def normalize_wordtok(wordtok: str) -> str:
79 | if wordtok.isspace():
80 | normalized = SPACE_TOK
81 | elif wordtok.startswith("<"):
82 | normalized = _whitespace.sub(" ", wordtok)
83 | else:
84 | normalized = wordtok
85 | return normalized
86 |
87 |
88 | def wordtokenize_with_offsets(text: str, bof_eof: bool = False) -> tuple[list[str], list[int]]:
89 | """
90 |     Same as `wordtokenize`, but also returns a parallel list of character offsets into the text.
91 | """
92 | wordtoks = []
93 | offsets = []
94 | offset = 0
95 | for match in _wordtok_pattern.finditer(text):
96 | wordtok = normalize_wordtok(match.group())
97 | wordtoks.append(wordtok)
98 | offsets.append(offset)
99 | offset = match.end()
100 |
101 | if bof_eof:
102 | wordtoks = [BOF_TOK] + wordtoks + [EOF_TOK]
103 | offsets = [0] + offsets + [len(text)]
104 |
105 | return wordtoks, offsets
106 |
107 |
108 | def wordtokenize(text: str, bof_eof: bool = False) -> list[str]:
109 | """
110 | Convert text to word tokens, including words, whitespace, punctuation, and
111 | HTML tags. Does not parse paragraph or sentence breaks. Normalizes all
112 | whitespace to a single space character.
113 | """
114 | wordtoks, _offsets = wordtokenize_with_offsets(text, bof_eof)
115 | return wordtoks
116 |
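# Example (illustrative): words, spaces, punctuation, HTML tags, and entities each
# become a single wordtok:
#
#   wordtokenize("Hello <b>world</b> &amp; more!")
#   == ["Hello", " ", "<b>", "world", "</b>", " ", "&amp;", " ", "more", "!"]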
117 |
118 | def _insert_para_wordtoks(text: str) -> str: # pyright: ignore
119 | """
120 | Replace paragraph breaks in text with para break tokens.
121 | """
122 | return _para_br_pattern.sub(PARA_BR_TOK, text)
123 |
124 |
125 | def _initial_wordtoks(text: str, max_chars: int) -> list[str]:
126 | sub_text = text[:max_chars]
127 | wordtoks = wordtokenize(sub_text)
128 | if wordtoks:
129 | wordtoks.pop() # Drop any cut off token.
130 | return wordtoks
131 |
132 |
133 | def first_wordtok(text: str) -> str | None:
134 | """
135 | Get the first wordtok from the text, if it has one.
136 | """
137 | wordtoks = _initial_wordtoks(text, 100)
138 | return wordtoks[0] if wordtoks else None
139 |
140 |
141 | def join_wordtoks(wordtoks: list[str]) -> str:
142 | """
143 | Join wordtoks back into a sentence.
144 | """
145 | wordtoks = [wordtok_to_str(wordtok) for wordtok in wordtoks]
146 | return "".join(wordtoks)
147 |
148 |
149 | def visualize_wordtoks(wordtoks: list[str]) -> str:
150 | """
151 | Visualize wordtoks with a separator for debugging.
152 | """
153 | return SYMBOL_SEP + SYMBOL_SEP.join(wordtoks) + SYMBOL_SEP
154 |
155 |
156 | def is_break_or_space(wordtok: str) -> bool:
157 | """
158 | Any kind of paragraph break, sentence break, or space (including
159 | the beginning or end of the document).
160 | """
161 | return (
162 | wordtok == PARA_BR_TOK
163 | or wordtok == SENT_BR_TOK
164 | or wordtok.isspace()
165 | or wordtok == BOF_TOK
166 | or wordtok == EOF_TOK
167 | )
168 |
169 |
170 | def is_word(wordtok: str) -> bool:
171 | """
172 | Is this wordtok a word, not punctuation or whitespace or a number?
173 | """
174 | return bool(len(wordtok) > 0 and _word_pat.match(wordtok) and not _number_pat.match(wordtok))
175 |
176 |
177 | def is_number(wordtok: str) -> bool:
178 | """
179 | Is this wordtok a number?
180 | """
181 | return bool(_number_pat.match(wordtok))
182 |
183 |
184 | def is_whitespace_or_punct(wordtok: str) -> bool:
185 | """
186 | Is this wordtok whitespace or punctuation?
187 | """
188 | return bool(not is_word(wordtok) and not is_number(wordtok))
189 |
190 |
191 | @dataclass(frozen=True)
192 | class Tag:
193 | """
194 | An HTML tag or comment.
195 | """
196 |
197 | name: str
198 | is_open: bool
199 | is_close: bool
200 | attrs: dict[str, str]
201 | comment: str | None = None
202 |
203 |
204 | def parse_tag(wordtok: str | None = None) -> Tag | None:
205 | """
206 | Parse a wordtok to determine if it's an HTML tag and extract its components.
207 | """
208 | if not wordtok:
209 | return None
210 |
211 | match = _tag_pattern.match(wordtok)
212 | if not match:
213 | match = _comment_pattern.match(wordtok)
214 | if not match:
215 | return None
216 | return Tag(name="", is_open=False, is_close=False, attrs={}, comment=match.group(1))
217 |
218 | is_open = not bool(match.group(1))
219 | is_close = bool(match.group(1) or match.group(4))
220 | tag_name = match.group(2).lower()
221 | attrs_str = match.group(3).strip()
222 |
223 | attrs: dict[str, str] = {}
224 | if attrs_str:
225 | attr_pattern = regex.compile(r'(\w+)\s*=\s*"([^"]*)"')
226 | for attr_match in attr_pattern.finditer(attrs_str):
227 | attr_name, attr_value = attr_match.groups()
228 | attrs[attr_name] = attr_value
229 |
230 | return Tag(name=tag_name, is_open=is_open, is_close=is_close, attrs=attrs)
231 |
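# Example (illustrative):
#
#   parse_tag('<div class="chunk">')
#   == Tag(name="div", is_open=True, is_close=False, attrs={"class": "chunk"})
#   parse_tag("</div>")
#   == Tag(name="div", is_open=False, is_close=True, attrs={})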
232 |
233 | def is_tag(wordtok: str | None = None, tag_names: list[str] | None = None) -> bool:
234 | """
235 | Check if a wordtok is an HTML tag and optionally if it's in the specified tag names.
236 | """
237 | tag = parse_tag(wordtok)
238 | return bool(tag and (not tag_names or tag.name in [name.lower() for name in tag_names]))
239 |
240 |
241 | def is_tag_close(wordtok: str, tag_names: list[str] | None = None) -> bool:
242 | """
243 | Check if a wordtok is an HTML close tag and optionally if it's in the specified tag names.
244 | """
245 | tag = parse_tag(wordtok)
246 | return bool(
247 | tag and tag.is_close and (not tag_names or tag.name in [name.lower() for name in tag_names])
248 | )
249 |
250 |
251 | def is_tag_open(wordtok: str, tag_names: list[str] | None = None) -> bool:
252 | """
253 | Check if a wordtok is an HTML open tag and optionally if it's in the specified tag names.
254 | """
255 | tag = parse_tag(wordtok)
256 | return bool(
257 | tag and tag.is_open and (not tag_names or tag.name in [name.lower() for name in tag_names])
258 | )
259 |
260 |
261 | def is_div(wordtok: str | None = None) -> bool:
262 | return is_tag(wordtok, tag_names=["div"])
263 |
264 |
265 | def is_entity(wordtok: str | None = None) -> bool:
266 | """
267 | Check if a wordtok is an HTML entity.
268 | """
269 | return bool(wordtok and wordtok.startswith("&") and wordtok.endswith(";"))
270 |
271 |
272 | header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
273 |
274 |
275 | def is_header_tag(wordtok: str) -> bool:
276 | """
277 | Is this wordtok an HTML header tag?
278 | """
279 | return is_tag(wordtok, tag_names=header_tags)
280 |
--------------------------------------------------------------------------------
/src/chopdiff/transforms/sliding_transforms.py:
--------------------------------------------------------------------------------
1 | """
2 | Transform text using sliding windows over a document, then reassembling the
3 | transformed text.
4 | """
5 |
6 | import logging
7 | from collections.abc import Callable
8 | from math import ceil
9 | from typing import Any, TypeAlias
10 |
11 | from flowmark import fill_markdown
12 | from prettyfmt import fmt_lines
13 |
14 | from chopdiff.docs.sizes import TextUnit
15 | from chopdiff.docs.text_doc import Paragraph, TextDoc
16 | from chopdiff.docs.token_diffs import DIFF_FILTER_NONE, DiffFilter, diff_docs, find_best_alignment
17 | from chopdiff.docs.wordtoks import join_wordtoks
18 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window
19 | from chopdiff.transforms.window_settings import WINDOW_BR, WindowSettings
20 |
21 | log = logging.getLogger(__name__)
22 |
23 | TextDocTransform: TypeAlias = Callable[[TextDoc], TextDoc]
24 |
25 | SaveFunc: TypeAlias = Callable[[str, str, Any], None]
26 |
27 |
28 | def remove_window_br(doc: TextDoc):
29 | """
30 |     Remove `WINDOW_BR` markers from a document.
31 | """
32 | doc.replace_str(WINDOW_BR, "")
33 |
34 |
35 | def filtered_transform(
36 | doc: TextDoc,
37 | transform_func: TextDocTransform,
38 | windowing: WindowSettings | None,
39 | diff_filter: DiffFilter | None = None,
40 | debug_save: SaveFunc | None = None,
41 | ) -> TextDoc:
42 | """
43 | Apply a transform with sliding window across the input doc, enforcing the changes it's
44 | allowed to make with `diff_filter`.
45 |
46 | If windowing is None, apply the transform to the entire document at once.
47 |
48 | `debug_save` is an optional function that takes a message, a filename, and an object, and saves
49 | the object to a file for debugging.
50 | """
51 | has_filter = diff_filter and diff_filter != DIFF_FILTER_NONE
52 |
53 | if not windowing or not windowing.size:
54 | transformed_doc = transform_func(doc)
55 | else:
56 |
57 | def transform_and_check_diff(input_doc: TextDoc) -> TextDoc:
58 | # Avoid having window breaks build up after multiple transforms.
59 | remove_window_br(input_doc)
60 |
61 | transformed_doc = transform_func(input_doc)
62 |
63 | if has_filter:
64 | # Check the transform did what it should have.
65 | diff = diff_docs(input_doc, transformed_doc)
66 | accepted_diff, rejected_diff = diff.filter(diff_filter)
67 |
68 | assert diff.left_size() == input_doc.size(TextUnit.wordtoks)
69 | assert accepted_diff.left_size() == input_doc.size(TextUnit.wordtoks)
70 | assert rejected_diff.left_size() == input_doc.size(TextUnit.wordtoks)
71 |
72 | log.info(
73 | "Accepted transform changes:\n%s",
74 | fmt_lines(str(accepted_diff).splitlines()),
75 | )
76 |
77 | # Note any rejections.
78 | rejected_changes = rejected_diff.changes()
79 | if rejected_changes:
80 | log.info(
81 | "Filtering extraneous changes:\n%s",
82 | fmt_lines(rejected_diff.as_diff_str(False).splitlines()),
83 | )
84 |
85 | # Apply only the accepted changes.
86 | final_doc = TextDoc.from_wordtoks(
87 | accepted_diff.apply_to(list(input_doc.as_wordtoks()))
88 | )
89 | log.info(
90 | "Word token changes:\n%s",
91 | fmt_lines(
92 | [
93 | f"Accepted: {accepted_diff.stats()}",
94 | f"Rejected: {rejected_diff.stats()}",
95 | ]
96 | ),
97 | )
98 | else:
99 | diff = None
100 | accepted_diff, rejected_diff = None, None
101 | final_doc = transformed_doc
102 |
103 | if debug_save:
104 | debug_save(
105 | "Input doc normalized",
106 | "filtered_transform",
107 | fill_markdown(input_doc.reassemble()),
108 | )
109 | debug_save("Output doc raw", "filtered_transform", transformed_doc.reassemble())
110 | # log_save(
111 | # "Output doc normalized",
112 | # "filtered_transform",
113 | # normalize_markdown(transformed_doc.reassemble()),
114 | # )
115 | if diff:
116 | debug_save("Transform diff", "filtered_transform", diff)
117 | # if accepted_diff:
118 | # log.save_object("Accepted diff", "filtered_transform", accepted_diff)
119 | if rejected_diff:
120 | debug_save("Rejected diff", "filtered_transform", rejected_diff)
121 |
122 | debug_save("Final doc", "filtered_transform", final_doc.reassemble())
123 |
124 | return final_doc
125 |
126 | transformed_doc = sliding_window_transform(
127 | doc,
128 | transform_and_check_diff,
129 | windowing,
130 | )
131 |
132 | return transformed_doc
133 |
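# Illustrative sketch of a typical call (the exact WindowSettings constructor lives in
# window_settings.py and its argument names are assumed here):
#
#   settings = WindowSettings(unit=TextUnit.wordtoks, size=2048, shift=1536,
#                             min_overlap=8, separator=WINDOW_BR)
#   new_doc = filtered_transform(doc, my_transform, windowing=settings, diff_filter=my_filter)
#
# where `my_transform` is any Callable[[TextDoc], TextDoc] and `my_filter` is a DiffFilter
# limiting which token-level changes are accepted.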
134 |
135 | def sliding_window_transform(
136 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings
137 | ) -> TextDoc:
138 | if settings.unit == TextUnit.wordtoks:
139 | return sliding_wordtok_window_transform(doc, transform_func, settings)
140 | elif settings.unit == TextUnit.paragraphs:
141 | return sliding_para_window_transform(doc, transform_func, settings)
142 | else:
143 | raise ValueError(f"Unsupported sliding transform unit: {settings.unit}")
144 |
145 |
146 | def sliding_wordtok_window_transform(
147 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings
148 | ) -> TextDoc:
149 | """
150 | Apply a transformation function to each TextDoc in a sliding window over the given document,
151 | stepping through wordtoks, then reassemble the transformed document. Uses best effort to
152 | stitch the results together seamlessly by searching for the best alignment (minimum wordtok
153 | edit distance) of each transformed window.
154 | """
155 | if settings.unit != TextUnit.wordtoks:
156 | raise ValueError(f"This sliding window expects wordtoks, not {settings.unit}")
157 |
158 | windows = sliding_word_window(doc, settings.size, settings.shift, TextUnit.wordtoks)
159 |
160 | nwordtoks = doc.size(TextUnit.wordtoks)
161 | nbytes = doc.size(TextUnit.bytes)
162 | nwindows = ceil(nwordtoks / settings.shift)
163 | sep_wordtoks = [settings.separator] if settings.separator else []
164 |
165 | log.info(
166 | "Sliding word transform: Begin on doc: total %s wordtoks, %s bytes, %s windows, %s",
167 | nwordtoks,
168 | nbytes,
169 | nwindows,
170 | settings,
171 | )
172 |
173 | output_wordtoks: list[str] = []
174 | for i, window in enumerate(windows):
175 | log.info(
176 | "Sliding word transform window %s/%s (%s wordtoks, %s bytes), at %s wordtoks so far",
177 | i + 1,
178 | nwindows,
179 | window.size(TextUnit.wordtoks),
180 | window.size(TextUnit.bytes),
181 | len(output_wordtoks),
182 | )
183 |
184 | transformed_window = transform_func(window)
185 |
186 | new_wordtoks = list(transformed_window.as_wordtoks())
187 |
188 | if not output_wordtoks:
189 | output_wordtoks = new_wordtoks
190 | else:
191 | if len(output_wordtoks) < settings.min_overlap:
192 | raise ValueError(
193 | "Output wordtoks too short to align with min_overlap %s: %s",
194 | settings.min_overlap,
195 | output_wordtoks,
196 | )
197 | if len(new_wordtoks) < settings.min_overlap:
198 | log.error(
199 | "New wordtoks too short to align with min_overlap %s, skipping: %s",
200 | settings.min_overlap,
201 | new_wordtoks,
202 | )
203 | continue
204 |
205 | offset, (score, diff) = find_best_alignment(
206 | output_wordtoks, new_wordtoks, settings.min_overlap
207 | )
208 |
209 | log.info(
210 | "Sliding word transform: Best alignment of window %s is at token offset %s (score %s, %s)",
211 | i,
212 | offset,
213 | score,
214 | diff.stats(),
215 | )
216 |
217 | output_wordtoks = output_wordtoks[:offset] + sep_wordtoks + new_wordtoks
218 |
219 | log.info(
220 | "Sliding word transform: Done, output total %s wordtoks",
221 | len(output_wordtoks),
222 | )
223 |
224 | # An alternate approach would be to accumulate the document sentences instead of wordtoks to
225 | # avoid re-parsing, but this is probably a little simpler.
226 | output_doc = TextDoc.from_text(join_wordtoks(output_wordtoks))
227 |
228 | return output_doc
229 |
230 |
231 | def sliding_para_window_transform(
232 | doc: TextDoc,
233 | transform_func: TextDocTransform,
234 | settings: WindowSettings,
235 | normalizer: Callable[[str], str] = fill_markdown,
236 | ) -> TextDoc:
237 | """
238 | Apply a transformation function to each TextDoc, stepping through paragraphs `settings.size`
239 | at a time, then reassemble the transformed document.
240 | """
241 | if settings.unit != TextUnit.paragraphs:
242 | raise ValueError(f"This sliding window expects paragraphs, not {settings.unit}")
243 | if settings.size != settings.shift:
244 | raise ValueError("Paragraph window transform requires equal size and shift")
245 |
246 | windows = sliding_para_window(doc, settings.size, normalizer)
247 |
248 | nwindows = ceil(doc.size(TextUnit.paragraphs) / settings.size)
249 |
250 | log.info(
251 | "Sliding paragraph transform: Begin on doc: %s windows of size %s paragraphs on total %s",
252 | nwindows,
253 | settings.size,
254 | doc.size_summary(),
255 | )
256 |
257 | transformed_paras: list[Paragraph] = []
258 | for i, window in enumerate(windows):
259 | log.info(
260 | "Sliding paragraph transform: Window %s/%s input is %s",
261 | i,
262 | nwindows,
263 | window.size_summary(),
264 | )
265 |
266 | new_doc = transform_func(window)
267 | if i > 0:
268 | try:
269 | new_doc.paragraphs[0].sentences[0].text = (
270 | settings.separator + new_doc.paragraphs[0].sentences[0].text
271 | )
272 | except (KeyError, IndexError):
273 | pass
274 | transformed_paras.extend(new_doc.paragraphs)
275 |
276 | transformed_text = "\n\n".join(para.reassemble() for para in transformed_paras)
277 | new_text_doc = TextDoc.from_text(transformed_text)
278 |
279 | log.info(
280 | "Sliding paragraph transform: Done, output total %s",
281 | new_text_doc.size_summary(),
282 | )
283 |
284 | return new_text_doc
285 |
--------------------------------------------------------------------------------
/.cursor/rules/python.mdc:
--------------------------------------------------------------------------------
1 | ---
2 | description: Python Coding Guidelines
3 | globs: *.py,pyproject.toml
4 | alwaysApply: false
5 | ---
6 | # Python Coding Guidelines
7 |
8 | These are rules for a modern Python project using uv.
9 |
10 | ## Python Version
11 |
12 | Write for Python 3.11-3.13. Do NOT write code to support earlier versions of Python.
13 | Always use modern Python practices appropriate for Python 3.11-3.13.
14 |
15 | Always use full type annotations, generics, and other modern practices.
16 |
17 | ## Project Setup and Developer Workflows
18 |
19 | - Important: BE SURE you read and understand the project setup by reading the
20 | pyproject.toml file and the Makefile.
21 |
22 | - ALWAYS use uv for running all code and managing dependencies.
23 | Never use direct `pip` or `python` commands.
24 |
25 | - Use modern uv commands: `uv sync`, `uv run ...`, etc.
26 | Prefer `uv add` over `uv pip install`.
27 |
28 | - You may use the following shortcuts:
29 | ```shell
30 |
31 | # Install all dependencies:
32 | make install
33 |
34 | # Run linting (with ruff) and type checking (with basedpyright).
35 | # Note when you run this, ruff will auto-format and sort imports, resolving any
36 | # linter warnings about import ordering:
37 | make lint
38 |
39 | # Run tests:
40 | make test
41 |
42 | # Run uv sync, lint, and test in one command:
43 | make
44 | ```
45 |
46 | - Like standard pytest, the usual `make test` does not show test output.
47 | Run individual tests and see output with `uv run pytest -s some/file.py`.
48 |
49 | - Always run `make lint` and `make test` to check your code after changes.
50 |
51 | - You must verify there are zero linter warnings/errors or test failures before
52 | considering any task complete.
53 |
54 | ## General Development Practices
55 |
56 | - Be sure to resolve the pyright (basedpyright) linter errors as you develop and make
57 | changes.
58 |
59 | - If type checker errors are hard to resolve, you may add a comment `# pyright: ignore`
60 | to disable Pyright warnings or errors but ONLY if you know they are not a real problem
61 | and are difficult to fix.
62 |
63 | - In special cases you may consider disabling it globally in pyproject.toml but YOU
64 | MUST ASK FOR CONFIRMATION from the user before globally disabling lint or type checker
65 | rules.
66 |
67 | - Never change an existing comment, pydoc, or a log statement, unless it is directly
68 | fixing the issue you are changing, or the user has asked you to clean up the code.
69 | Do not drop existing comments when editing code!
70 | And do not delete or change logging statements.
71 |
72 | ## Coding Conventions and Imports
73 |
74 | - Always use full, absolute imports for paths.
75 | do NOT use `from .module1.module2 import ...`. Such relative paths make it hard to
76 |   refactor. Use `from toplevel_pkg.module1.module2 import ...` instead.
77 |
78 | - Be sure to import things like `Callable` and other types from the right modules,
79 | remembering that many are now in `collections.abc` or `typing_extensions`. For
80 | example: `from collections.abc import Callable, Coroutine`
81 |
82 | - Use `typing_extensions` for things like `@override` (you need to use this, and not
83 | `typing` since we want to support Python 3.11).
84 |
85 | - Add `from __future__ import annotations` on files with types whenever applicable.
86 |
87 | - Use pathlib `Path` instead of strings.
88 | Use `Path(filename).read_text()` instead of two-line `with open(...)` blocks.
89 |
90 | - Use strif’s `atomic_output_file` context manager when writing files to ensure output
91 | files are written atomically.
92 |
93 | ## Use Modern Python Practices
94 |
95 | - ALWAYS use `@override` decorators to override methods from base classes.
96 | This is a modern Python practice and helps avoid bugs.
97 |
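For example (an illustrative sketch with made-up class names):
```python
from typing_extensions import override


class Animal:
    def speak(self) -> str:
        return "..."


class Dog(Animal):
    @override
    def speak(self) -> str:
        return "Woof"
```
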
98 | ## Testing
99 |
100 | - For longer tests put them in a file like `tests/test_somename.py` in the `tests/`
101 | directory (or `tests/module_name/test_somename.py` file for a submodule).
102 |
103 | - For simple tests, prefer inline functions in the original code file below a `## Tests`
104 | comment. This keeps the tests easy to maintain and close to the code.
105 | Inline tests should NOT import pytest or pytest fixtures as we do not want runtime
106 | dependency on pytest.
107 |
108 | - DO NOT write one-off test code in extra files that are throwaway.
109 |
110 | - DO NOT put `if __name__ == "__main__":` just for quick testing.
111 | Instead use the inline function tests and run them with `uv run pytest`.
112 |
113 | - You can run such individual tests with `uv run pytest -s src/.../path/to/test`
114 |
115 | - Don’t add docs to assertions unless it’s not obvious what they’re checking - the
116 | assertion appears in the stack trace.
117 | Do NOT write `assert x == 5, "x should be 5"`. Do NOT write `assert x == 5 # Check if
118 | x is 5`. That is redundant.
119 | Just write `assert x == 5`.
120 |
121 | - DO NOT write trivial or obvious tests that are evident directly from code, such as
122 | assertions that confirm the value of a constant setting.
123 |
124 | - NEVER write `assert False`. If a test reaches an unexpected branch and must fail
125 | explicitly, `raise AssertionError("Some explanation")` instead.
126 |   This is typical best practice in Python since assertions can be removed with
127 | optimization.
128 |
129 | - DO NOT use pytest fixtures like parameterized tests or expected exception decorators
130 | unless absolutely necessary in more complex tests.
131 | It is typically simpler to use simple assertions and put the checks inside the test.
132 | This is also preferable because then simple tests have no explicit pytest dependencies
133 | and can be placed in code anywhere.
134 |
135 | - DO NOT write trivial tests that test something we know already works, like
136 | instantiating a Pydantic object.
137 |
138 | ```python
139 | class Link(BaseModel):
140 | url: str
141 |       title: str | None = None
142 |
143 | # DO NOT write tests like this. They are trivial and only create clutter!
144 | def test_link_model():
145 | link = Link(url="https://example.com", title="Example")
146 | assert link.url == "https://example.com"
147 | assert link.title == "Example"
148 | ```
149 |
150 | ## Types and Type Annotations
151 |
152 | - Use modern union syntax: `str | None` instead of `Optional[str]`, `dict[str]` instead
153 | of `Dict[str]`, `list[str]` instead of `List[str]`, etc.
154 |
155 | - Never use/import `Optional` for new code.
156 |
157 | - Use modern enums like `StrEnum` if appropriate.
158 |
159 | - One exception to common practice on enums: If an enum has many values that are
160 | strings, and they have a literal value as a string (like in a JSON protocol), it’s
161 | fine to use lower_snake_case for enum values to match the actual value.
162 | This is more readable than LONG_ALL_CAPS_VALUES, and you can simply set the value to
163 | be the same as the name for each.
164 | For example:
165 | ```python
166 | class MediaType(Enum):
167 | """
168 | Media types. For broad categories only, to determine what processing
169 | is possible.
170 | """
171 |
172 | text = "text"
173 | image = "image"
174 | audio = "audio"
175 | video = "video"
176 | webpage = "webpage"
177 | binary = "binary"
178 | ```
179 |
180 | ## Guidelines for Literal Strings
181 |
182 | - For multi-line strings NEVER put multi-line strings flush against the left margin.
183 | ALWAYS use a `dedent()` function to make it more readable.
184 | You may wish to add a `strip()` as well.
185 | Example:
186 | ```python
187 | from textwrap import dedent
188 | markdown_content = dedent("""
189 | # Title 1
190 | Some text.
191 | ## Subtitle 1.1
192 | More text.
193 | """).strip()
194 | ```
195 |
196 | ## Guidelines for Comments
197 |
198 | - Comments should be EXPLANATORY: Explain *WHY* something is done a certain way and not
199 | just *what* is done.
200 |
201 | - Comments should be CONCISE: Remove all extraneous words.
202 |
203 | - DO NOT use comments to state obvious things or repeat what is evident from the code.
204 | Here is an example of a comment that SHOULD BE REMOVED because it simply repeats the
205 | code, which is distracting and adds no value:
206 | ```python
207 | if self.failed == 0:
208 | # All successful
209 | return "All tasks finished successfully"
210 | ```
211 |
212 | ## Guidelines for Docstrings
213 |
214 | - Here is an example of the correct style for docstrings:
215 | ```python
216 | def check_if_url(
217 | text: UnresolvedLocator, only_schemes: list[str] | None = None
218 | ) -> ParseResult | None:
219 | """
220 | Convenience function to check if a string or Path is a URL and if so return
221 | the `urlparse.ParseResult`.
222 |
223 | Also returns false for Paths, so that it's easy to use local paths and URLs
224 | (`Locator`s) interchangeably. Can provide `HTTP_ONLY` or `HTTP_OR_FILE` to
225 | restrict to only certain schemes.
226 | """
227 | # Function body
228 |
229 | def is_url(text: UnresolvedLocator, only_schemes: list[str] | None = None) -> bool:
230 | """
231 | Check if a string is a URL. For convenience, also returns false for
232 | Paths, so that it's easy to use local paths and URLs interchangeably.
233 | """
234 | return check_if_url(text, only_schemes) is not None
235 | ```
236 |
237 | - Use concise pydoc strings with triple quotes on their own lines.
238 |
239 | - Use `backticks` around variable names and inline code excerpts.
240 |
241 | - Use plain fences (```) around code blocks inside of pydocs.
242 |
243 | - For classes with many methods, use a concise docstring on the class that explains all
244 | the common information, and avoid repeating the same information on every method.
245 |
246 | - Docstrings should provide context or as concisely as possible explain “why”, not
247 | obvious details evident from the class names, function names, parameter names, and
248 | type annotations.
249 |
250 | - Docstrings *should* mention any key rationale or pitfalls when using the class or
251 | function.
252 |
253 | - Avoid obvious or repetitive docstrings.
254 | Do NOT add pydocs that just repeat in English facts that are obvious from the function
255 | name, variable name, or types.
256 | That is silly and obvious and makes the code longer for no reason.
257 |
258 | - Do NOT list args and return values if they’re obvious.
259 |   In the above examples, you do not need an `Arguments:` or `Returns:` section, since
260 |   the details are obvious from context.
261 |   Do list these if there are many arguments and their meaning isn’t clear.
262 | If it returns a less obvious type like a tuple, do explain in the pydoc.
263 |
264 | - Exported/public variables, functions, or methods SHOULD have concise docstrings.
265 | Internal/local variables, functions, and methods DO NOT need docstrings unless their
266 | purpose is not obvious.
267 |
268 | ## General Clean Coding Practices
269 |
270 | - Avoid writing trivial wrapper functions.
271 | For example, when writing a class DO NOT blindly make delegation methods around public
272 | member variables. DO NOT write methods like this:
273 | ```python
274 | def reassemble(self) -> str:
275 | """Call the original reassemble method."""
276 | return self.paragraph.reassemble()
277 | ```
278 |   In general, the user can just call the enclosed object’s methods, reducing code bloat.
279 |
280 | - If a function does not use a parameter, but it should still be present, you can use `#
281 | pyright: ignore[reportUnusedParameter]` in a comment to suppress the linter warning.
282 |
283 | ## Guidelines for Backward Compatibility
284 |
285 | - When changing code in a library or general function, if a change to an API or library
286 | will break backward compatibility, MENTION THIS to the user.
287 |
288 | - DO NOT implement additional code for backward compatibility (such as extra methods or
289 | variable aliases or comments about backward compatibility) UNLESS the user has
290 | confirmed that it is necessary.
291 |
--------------------------------------------------------------------------------
/src/chopdiff/docs/token_diffs.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | from collections.abc import Callable
5 | from dataclasses import dataclass
6 | from enum import Enum
7 | from typing import TypeAlias
8 |
9 | import cydifflib as difflib
10 | from funlog import log_calls, tally_calls
11 | from typing_extensions import override
12 |
13 | from chopdiff.docs.text_doc import TextDoc
14 |
15 | log = logging.getLogger(__name__)
16 |
17 |
18 | SYMBOL_SEP = "⎪"
19 |
20 |
21 | class OpType(Enum):
22 | EQUAL = "equal"
23 | INSERT = "insert"
24 | DELETE = "delete"
25 | REPLACE = "replace"
26 |
27 | def as_symbol(self):
28 | abbrev = {
29 | OpType.EQUAL: " ",
30 | OpType.INSERT: "+",
31 | OpType.DELETE: "-",
32 | OpType.REPLACE: "±",
33 | }
34 | return abbrev[self]
35 |
36 | def as_abbrev(self):
37 | abbrev = {
38 | OpType.EQUAL: "keep",
39 | OpType.INSERT: "add ",
40 | OpType.DELETE: "del ",
41 | OpType.REPLACE: "repl",
42 | }
43 | return abbrev[self]
44 |
45 |
46 | @dataclass(frozen=True)
47 | class DiffOp:
48 | action: OpType
49 | left: list[str]
50 | right: list[str]
51 |
52 | def __post_init__(self):
53 | if self.action == OpType.REPLACE:
54 | assert self.left and self.right
55 | elif self.action == OpType.EQUAL:
56 | assert self.left == self.right
57 | elif self.action == OpType.INSERT:
58 | assert not self.left
59 | elif self.action == OpType.DELETE:
60 | assert not self.right
61 |
62 | def left_str(self, show_toks: bool = True) -> str:
63 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks"
64 | if show_toks:
65 | s += f": - {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}"
66 | return s
67 |
68 | def right_str(self, show_toks: bool = True) -> str:
69 | s = f"{self.action.as_abbrev()} {len(self.right):4} toks"
70 | if show_toks:
71 | s += f": + {SYMBOL_SEP}{''.join(tok for tok in self.right)}{SYMBOL_SEP}"
72 | return s
73 |
74 | def equal_str(self, show_toks: bool = True) -> str:
75 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks"
76 | if show_toks:
77 | s += f": {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}"
78 | return s
79 |
80 | def all_changed(self) -> list[str]:
81 | return [] if self.action == OpType.EQUAL else self.left + self.right
82 |
83 |
84 | @dataclass(frozen=True)
85 | class DiffStats:
86 | added: int
87 | removed: int
88 | input_size: int
89 |
90 | def nchanges(self) -> int:
91 | return self.added + self.removed
92 |
93 | @override
94 | def __str__(self):
95 | return f"add/remove +{self.added}/-{self.removed} out of {self.input_size} total"
96 |
97 |
98 | DiffFilter: TypeAlias = Callable[[DiffOp], bool]
99 |
100 | DIFF_FILTER_NONE: DiffFilter = lambda op: True
101 | """
102 | Diff filter that accepts all diff operations.
103 | """
104 |
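# Example (illustrative): a custom DiffFilter that only accepts changes touching
# whitespace or punctuation (see also chopdiff.transforms.diff_filters). A sketch only:
#
#   from chopdiff.docs.wordtoks import is_whitespace_or_punct
#
#   only_punct_changes: DiffFilter = lambda op: all(
#       is_whitespace_or_punct(tok) for tok in op.all_changed()
#   )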
105 |
106 | @dataclass
107 | class TokenDiff:
108 | """
109 | A diff of two texts as a sequence of EQUAL, INSERT, and DELETE operations on wordtoks.
110 | """
111 |
112 | ops: list[DiffOp]
113 |
114 | def left_size(self) -> int:
115 | return sum(len(op.left) for op in self.ops)
116 |
117 | def right_size(self) -> int:
118 | return sum(len(op.right) for op in self.ops)
119 |
120 | def changes(self) -> list[DiffOp]:
121 | return [op for op in self.ops if op.action != OpType.EQUAL]
122 |
123 | def stats(self) -> DiffStats:
124 | wordtoks_added = sum(len(op.right) for op in self.ops if op.action != OpType.EQUAL)
125 | wordtoks_removed = sum(len(op.left) for op in self.ops if op.action != OpType.EQUAL)
126 | return DiffStats(wordtoks_added, wordtoks_removed, self.left_size())
127 |
128 | def apply_to(self, original_wordtoks: list[str]) -> list[str]:
129 | """
130 | Apply a complete diff (including equality ops) to a list of wordtoks.
131 | """
132 | result: list[str] = []
133 | original_index = 0
134 |
135 | if len(original_wordtoks) != self.left_size():
136 | raise AssertionError(
137 | f"Diff should be complete: original wordtoks length {len(original_wordtoks)} != diff length {self.left_size()}"
138 | )
139 |
140 | for op in self.ops:
141 | if op.left:
142 | original_index += len(op.left)
143 | if op.right:
144 | result.extend(op.right)
145 |
146 | return result
147 |
148 | def filter(self, accept_fn: DiffFilter | None) -> tuple[TokenDiff, TokenDiff]:
149 | """
150 | Return two diffs, one that only has accepted operations and one that only has
151 | rejected operations.
152 | """
153 | if not accept_fn:
154 | accept_fn = DIFF_FILTER_NONE
155 |
156 | accepted_ops: list[DiffOp] = []
157 | rejected_ops: list[DiffOp] = []
158 |
159 | for op in self.ops:
160 | if op.action == OpType.EQUAL:
161 | # For equal ops, all tokens are both accepted and rejected.
162 | accepted_ops.append(op)
163 | rejected_ops.append(op)
164 | else:
165 | # We accept or reject the DiffOp as a whole, not token by token, since token by
166 | # token would give odd results, like deleting words but leaving whitespace.
167 | if accept_fn(op):
168 | accepted_ops.append(op)
169 | rejected_ops.append(DiffOp(OpType.EQUAL, op.left, op.left))
170 | else:
171 | accepted_ops.append(DiffOp(OpType.EQUAL, op.left, op.left))
172 | rejected_ops.append(op)
173 |
174 | assert len(accepted_ops) == len(self.ops)
175 | assert len(accepted_ops) == len(rejected_ops)
176 |
177 | accepted_diff, rejected_diff = TokenDiff(accepted_ops), TokenDiff(rejected_ops)
178 |
179 | assert accepted_diff.left_size() == self.left_size()
180 | assert rejected_diff.left_size() == self.left_size()
181 |
182 | return accepted_diff, rejected_diff
183 |
184 | def _diff_lines(self, include_equal: bool = False) -> list[str]:
185 | if len(self.ops) == 0:
186 | return ["(No changes)"]
187 |
188 | pos = 0
189 | lines: list[str] = []
190 | for op in self.ops:
191 | if op.action == OpType.EQUAL:
192 | if include_equal:
193 | lines.append(f"at pos {pos:4} {op.equal_str()}")
194 | elif op.action == OpType.INSERT:
195 | lines.append(f"at pos {pos:4} {op.right_str()}")
196 | elif op.action == OpType.DELETE:
197 | lines.append(f"at pos {pos:4} {op.left_str()}")
198 | elif op.action == OpType.REPLACE:
199 | lines.append(f"at pos {pos:4} {op.left_str()}")
200 | lines.append(f" {'':4} {op.right_str()}")
201 |
202 | pos += len(op.left)
203 | return lines
204 |
205 | def as_diff_str(self, include_equal: bool = True) -> str:
206 | diff_str = "\n".join(self._diff_lines(include_equal=include_equal))
207 | return f"TextDiff: {self.stats()}:\n{diff_str}"
208 |
209 | @override
210 | def __str__(self):
211 | return self.as_diff_str()
212 |
213 |
214 | def diff_docs(doc1: TextDoc, doc2: TextDoc) -> TokenDiff:
215 | """
216 | Calculate the LCS-style diff between two documents based on words.
217 | """
218 |
219 | diff = diff_wordtoks(list(doc1.as_wordtoks()), list(doc2.as_wordtoks()))
220 |
221 | # log.save_object("doc1 wordtoks", "diff_docs", "\n".join(list(doc1.as_wordtoks())))
222 | # log.save_object("doc2 wordtoks", "diff_docs", "\n".join(list(doc2.as_wordtoks())))
223 | # log.save_object("diff", "diff_docs", diff)
224 |
225 | return diff
226 |
227 |
228 | @tally_calls(level="warning", min_total_runtime=5)
229 | def diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> TokenDiff:
230 | """
231 | Perform an LCS-style diff on two lists of wordtoks.
232 | """
233 | s = difflib.SequenceMatcher(None, wordtoks1, wordtoks2, autojunk=False) # pyright: ignore
234 | diff: list[DiffOp] = []
235 |
236 | # log.message(f"Diffing {len(wordtoks1)} wordtoks against {len(wordtoks2)} wordtoks")
237 | # log.save_object("wordtoks1", "diff_wordtoks", "".join(wordtoks1))
238 | # log.save_object("wordtoks2", "diff_wordtoks", "".join(wordtoks2))
239 | # log.save_object("diff opcodes", "diff_wordtoks", "\n".join(str(o) for o in s.get_opcodes()))
240 |
241 | for tag, i1, i2, j1, j2 in s.get_opcodes(): # pyright: ignore
242 | if tag == "equal":
243 | slice1 = wordtoks1[i1:i2]
244 | assert slice1 == wordtoks2[j1:j2]
245 | diff.append(DiffOp(OpType.EQUAL, slice1, slice1))
246 | elif tag == "insert":
247 | diff.append(DiffOp(OpType.INSERT, [], wordtoks2[j1:j2]))
248 | elif tag == "delete":
249 | diff.append(DiffOp(OpType.DELETE, wordtoks1[i1:i2], []))
250 | elif tag == "replace":
251 | diff.append(DiffOp(OpType.REPLACE, wordtoks1[i1:i2], wordtoks2[j1:j2]))
252 |
253 | return TokenDiff(diff)
254 |
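# Example (illustrative):
#
#   diff = diff_wordtoks(["Hello", " ", "world"], ["Hello", " ", "there"])
#   diff.ops == [
#       DiffOp(OpType.EQUAL, ["Hello", " "], ["Hello", " "]),
#       DiffOp(OpType.REPLACE, ["world"], ["there"]),
#   ]
#   str(diff.stats()) == "add/remove +1/-1 out of 3 total"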
255 |
256 | ScoredDiff: TypeAlias = tuple[float, TokenDiff]
257 |
258 |
259 | def scored_diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> ScoredDiff:
260 | """
261 | Calculate the number of wordtoks added and removed between two lists of tokens.
262 | Score is (wordtoks_added + wordtoks_removed) / min(len(doc1), len(doc2)),
263 | which is 0 for identical docs.
264 | """
265 |
266 | if len(wordtoks1) == 0 or len(wordtoks2) == 0:
267 | raise ValueError("Cannot score diff for empty documents")
268 |
269 | diff = diff_wordtoks(wordtoks1, wordtoks2)
270 | score = float(diff.stats().nchanges()) / min(len(wordtoks1), len(wordtoks2))
271 | return score, diff
272 |
273 |
274 | @log_calls(level="message", if_slower_than=0.25)
275 | def find_best_alignment(
276 | list1: list[str],
277 | list2: list[str],
278 | min_overlap: int,
279 | max_overlap: int | None = None,
280 | scored_diff: Callable[[list[str], list[str]], ScoredDiff] = scored_diff_wordtoks,
281 | give_up_score: float = 0.75,
282 | give_up_count: int = 30,
283 | ) -> tuple[int, ScoredDiff]:
284 | """
285 | Find the best alignment of two lists of values, where edit distance is smallest but overlap is
286 | at least min_overlap and at most max_overlap. Returns offset into list1 and diff object.
287 | """
288 | len1, len2 = len(list1), len(list2)
289 | best_offset = -1
290 | best_score = float("inf")
291 | best_diff = None
292 | max_overlap = min(len1, len2, max_overlap) if max_overlap is not None else min(len1, len2)
293 |
294 | if min_overlap > len1 or min_overlap > len2:
295 | raise ValueError(
296 | f"Minimum overlap {min_overlap} should never exceed the length of one of the lists ({len1}, {len2})"
297 | )
298 |
299 | log.info(
300 | "Finding best alignment: List lengths: lengths %s and %s with overlap of %s to %s",
301 | len1,
302 | len2,
303 | min_overlap,
304 | max_overlap,
305 | )
306 |
307 | # To make this a bit more efficient we check if we have a run of increasing scores and
308 | # give up if we have many in a row.
309 | scores_increasing = 0
310 | prev_score = float("-inf")
311 |
312 | # Slide the second list over the first list, starting from the end of the first list.
313 |     # TODO: This could be much more efficient by being cleverer about reusing diff calculations.
314 | for overlap in range(min_overlap, max_overlap + 1):
315 | start1 = len1 - overlap
316 | end1 = len1
317 | start2 = 0
318 | end2 = overlap
319 |
320 | score, diff = scored_diff(list1[start1:end1], list2[start2:end2])
321 |
322 | log.info("Offset %s: Overlap %s: Score %f", start1, overlap, score)
323 |
324 | if score < best_score:
325 | best_score = score
326 | best_offset = start1
327 | best_diff = diff
328 | scores_increasing = 0
329 | elif score >= give_up_score and score >= prev_score:
330 | scores_increasing += 1
331 | if scores_increasing >= give_up_count:
332 | log.info(
333 | "Giving up after %s increasing scores, last score %s > %s",
334 | give_up_count,
335 | score,
336 | give_up_score,
337 | )
338 | break
339 |
340 | prev_score = score
341 |
342 | if best_diff is None:
343 | raise ValueError("No alignment found")
344 |
345 | return best_offset, (best_score, best_diff)
346 |
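# Example (illustrative): aligning a new window whose start overlaps the prior output.
#
#   offset, (score, diff) = find_best_alignment(["a", "b", "c", "d"], ["c", "d", "e"], min_overlap=2)
#   offset == 2 and score == 0.0  # list1[2:4] matches list2[0:2] exactly.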
--------------------------------------------------------------------------------
/tests/docs/test_text_doc.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from textwrap import dedent
3 |
4 | import regex
5 | from prettyfmt import fmt_words
6 | from strif import abbrev_str
7 |
8 | from chopdiff.docs.sizes import TextUnit
9 | from chopdiff.docs.text_doc import SentIndex, TextDoc
10 | from chopdiff.docs.wordtoks import (
11 | PARA_BR_TOK,
12 | is_break_or_space,
13 | is_entity,
14 | is_header_tag,
15 | is_number,
16 | is_tag,
17 | is_word,
18 | join_wordtoks,
19 | visualize_wordtoks,
20 | wordtok_len,
21 | wordtokenize,
22 | )
23 |
24 | _med_test_doc = dedent(
25 | """
26 | # Title
27 |
28 | Hello World. This is an example sentence. And here's another one!
29 |
30 | ## Subtitle
31 |
32 | This is a new paragraph.
33 | It has several sentences.
34 | There may be line breaks within a paragraph, but these should not affect handlingof the paragraph.
35 | There are also [links](http://www.google.com) and **bold** and *italic* text.
36 |
37 | ### Itemized List
38 |
39 | - Item 1
40 |
41 | - Item 2
42 |
43 | - Item 3
44 |
45 | ### Numbered List
46 |
47 | 1. Item 1
48 |
49 | 2. Item 2
50 |
51 | 3. Item 3
52 |
53 | Testing some embedded HTML tags.
54 |
55 | An HTML header
56 |
57 |
58 |
59 | ⏱️05:52
63 |
64 | """
65 | ).strip()
66 |
67 |
68 | def test_document_parse_reassemble():
69 | text = _med_test_doc
70 | doc = TextDoc.from_text(text)
71 |
72 | print("\n---Original:")
73 | pprint(text)
74 | print("\n---Parsed:")
75 | pprint(doc)
76 |
77 | reassembled_text = doc.reassemble()
78 |
79 | # Should be exactly the same except for within-paragraph line breaks.
80 | def normalize(text: str) -> str:
81 | return regex.sub(r"\s+", " ", text.replace("\n\n", ""))
82 |
83 | assert normalize(reassembled_text) == normalize(text)
84 |
85 | # Check offset of a paragraph towards the end of the document.
86 | last_para = doc.paragraphs[-1]
87 | last_para_char_offset = text.rindex(last_para.original_text)
88 | assert last_para.char_offset == last_para_char_offset
89 |
90 |
91 | def test_markup_detection():
92 | text = _med_test_doc
93 | doc = TextDoc.from_text(text)
94 |
95 | print("Paragraph markup and header detection:")
96 | result: list[str] = []
97 | for para in doc.paragraphs:
98 | result.append(
99 | fmt_words(
100 | abbrev_str(para.original_text, 10),
101 | "is_markup" if para.is_markup() else "",
102 | "is_header" if para.is_header() else "",
103 | )
104 | )
105 |
106 | print("\n".join(result))
107 | assert (
108 | "\n".join(result)
109 | == dedent(
110 | """
111 | # Title is_header
112 | Hello Wor…
113 | ## Subtit… is_header
114 | This is a…
115 | ### Itemi… is_header
116 | - Item 1
117 | - Item 2
118 | - Item 3
119 | ### Numbe… is_header
120 | 1. Item 1
121 | 2. Item 2
122 | 3. Item 3
123 | Testing s…
124 | An HT… is_header
125 | "
170 | assert doc.paragraphs[-2].is_markup()
171 | assert doc.paragraphs[-1].sentences[-1].is_markup()
172 |
173 |
174 | _simple_test_doc = dedent(
175 | """
176 | This is the first paragraph. It has multiple sentences.
177 |
178 | This is the second paragraph. It also has multiple sentences. And it continues.
179 |
180 | Here is the third paragraph. More sentences follow. And here is another one.
181 | """
182 | ).strip()
183 |
184 |
185 | def test_doc_sizes():
186 | text = _med_test_doc
187 | doc = TextDoc.from_text(text)
188 | print("\n---Sizes:")
189 | size_summary = doc.size_summary()
190 | print(size_summary)
191 |
192 | assert size_summary == "726 bytes (37 lines, 16 paras, 20 sents, 82 words, 215 tiktoks)"
193 |
194 |
195 | def test_seek_doc():
196 | doc = TextDoc.from_text(_simple_test_doc)
197 |
198 | offset = 1
199 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes)
200 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes")
201 | assert sent_index == SentIndex(para_index=0, sent_index=0)
202 | assert sent_offset == 0
203 |
204 | offset = len("This is the first paragraph.")
205 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes)
206 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes")
207 | assert sent_index == SentIndex(para_index=0, sent_index=0)
208 | assert sent_offset == 0
209 |
210 | offset = len("This is the first paragraph. ")
211 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes)
212 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes")
213 | assert sent_index == SentIndex(para_index=0, sent_index=1)
214 | assert sent_offset == offset
215 |
216 | offset = len(
217 | "This is the first paragraph. It has multiple sentences.\n\nThis is the second paragraph."
218 | )
219 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes)
220 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes")
221 | assert sent_index == SentIndex(para_index=1, sent_index=0)
222 | assert sent_offset == len("This is the first paragraph. It has multiple sentences.\n\n")
223 |
224 | offset = len(_simple_test_doc) + 10
225 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes)
226 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes")
227 | assert sent_index == SentIndex(para_index=2, sent_index=2)
228 |
229 |
230 | _short_test_doc = dedent(
231 | """
232 | Paragraph one lorem ipsum.
233 | Sentence 1a lorem ipsum. Sentence 1b lorem ipsum. Sentence 1c lorem ipsum.
234 |
235 | Paragraph two lorem ipsum. Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum.
236 |
237 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum. Sentence 3b lorem ipsum. Sentence 3c lorem ipsum.
238 | """
239 | ).strip()
240 |
241 |
242 | def test_sub_doc():
243 | doc = TextDoc.from_text(_short_test_doc)
244 |
245 | sub_doc_start = SentIndex(1, 1)
246 | sub_doc_end = SentIndex(2, 1)
247 | sub_doc = doc.sub_doc(sub_doc_start, sub_doc_end)
248 |
249 | expected_text = dedent(
250 | """
251 | Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum.
252 |
253 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum.
254 | """
255 | ).strip()
256 | expected_sub_doc = TextDoc.from_text(expected_text)
257 |
258 | print("---Original:")
259 | pprint(doc)
260 | print("---Subdoc:")
261 | pprint(sub_doc)
262 |
263 | # Confirm reassembled text is correct.
264 | assert sub_doc.reassemble() == expected_sub_doc.reassemble()
265 |
266 | # Confirm sentences and offsets are preserved in sub-doc.
267 | orig_sentences = [sent for _index, sent in doc.sent_iter()]
268 | sub_sentences = [sent for _index, sent in sub_doc.sent_iter()]
269 | assert orig_sentences[5:10] == sub_sentences
270 |
271 | # Confirm indexing and reverse iteration.
272 | assert doc.sub_doc(SentIndex(0, 0), None) == doc
273 | reversed_sentences = [sent for _index, sent in doc.sent_iter(reverse=True)]
274 | assert reversed_sentences == list(reversed(orig_sentences))
275 |
276 |
277 | def test_tokenization():
278 | doc = TextDoc.from_text(_short_test_doc)
279 | wordtoks = list(doc.as_wordtoks())
280 |
281 | print("\n---Tokens:")
282 | pprint(wordtoks)
283 |
284 | assert wordtoks[:6] == ["Paragraph", " ", "one", " ", "lorem", " "]
285 | assert wordtoks[-7:] == [" ", "3c", " ", "lorem", " ", "ipsum", "."]
286 | assert wordtoks.count(PARA_BR_TOK) == 2
287 | assert join_wordtoks(wordtoks) == _short_test_doc.replace(
288 | "\n", " ", 1
289 | ) # First \n is not a para break.
290 |
291 |
292 | def test_wordtok_mappings():
293 | doc = TextDoc.from_text(_short_test_doc)
294 |
295 | print("\n---Mapping:")
296 | wordtok_mapping, sent_mapping = doc.wordtok_mappings()
297 | pprint(wordtok_mapping)
298 | pprint(sent_mapping)
299 |
300 | assert wordtok_mapping[0] == SentIndex(0, 0)
301 | assert wordtok_mapping[1] == SentIndex(0, 0)
302 | assert wordtok_mapping[4] == SentIndex(0, 0)
303 | assert wordtok_mapping[9] == SentIndex(0, 1)
304 |
305 | assert sent_mapping[SentIndex(0, 0)] == [0, 1, 2, 3, 4, 5, 6, 7, 8]
306 | assert sent_mapping[SentIndex(2, 3)] == [99, 100, 101, 102, 103, 104, 105, 106]
307 |
308 |
309 | _sentence_tests = [
310 | "Hello, world!",
311 | "This is an example sentence with punctuation.",
312 | "And here's another one!",
313 | "Special characters: @#%^&*()",
314 | ]
315 |
316 | _sentence_test_html = 'This is <span data-timestamp="1.234">a test</span>.'
317 |
318 |
319 | def test_wordtokization():
320 | for sentence in _sentence_tests:
321 | wordtoks = wordtokenize(sentence)
322 | reassembled_sentence = "".join(wordtoks)
323 | assert reassembled_sentence == sentence
324 |
325 | assert wordtokenize("Multiple spaces and tabs\tand\nnewlines in between.") == [
326 | "Multiple",
327 | " ",
328 | "spaces",
329 | " ",
330 | "and",
331 | " ",
332 | "tabs",
333 | " ",
334 | "and",
335 | " ",
336 | "newlines",
337 | " ",
338 | "in",
339 | " ",
340 | "between",
341 | ".",
342 | ]
343 | assert wordtokenize("") == []
344 | assert wordtokenize(" ") == [" "]
345 |
346 | assert wordtokenize(_sentence_test_html) == [
347 | "This",
348 | " ",
349 | "is",
350 | " ",
351 | '<span data-timestamp="1.234">',
352 | "a",
353 | " ",
354 | "test",
355 | " ",
356 | ".",
357 | ]
358 |
359 | assert len(_sentence_test_html) == sum(
360 | wordtok_len(wordtok) for wordtok in wordtokenize(_sentence_test_html)
361 | )
362 |
363 |
364 | def test_html_tokenization():
365 | doc = TextDoc.from_text(_sentence_test_html)
366 | wordtoks = list(doc.as_wordtoks())
367 |
368 | print("\n---HTML Tokens:")
369 | pprint(wordtoks)
370 |
371 | assert wordtoks == [
372 | "This",
373 | " ",
374 | "is",
375 | " ",
376 | '<span data-timestamp="1.234">',
377 | "a",
378 | " ",
379 | "test",
380 | " ",
381 | ".",
382 | ]
383 | assert list(map(is_tag, wordtoks)) == [
384 | False,
385 | False,
386 | False,
387 | False,
388 | True,
389 | False,
390 | False,
391 | False,
392 | True,
393 | False,
394 | ]
395 | assert list(map(is_break_or_space, wordtoks)) == [
396 | False,
397 | True,
398 | False,
399 | True,
400 | False,
401 | False,
402 | True,
403 | False,
404 | False,
405 | False,
406 | ]
407 |
408 |
409 | def test_tiktoken_len():
410 | doc = TextDoc.from_text(_med_test_doc)
411 |
412 | len = doc.size(TextUnit.tiktokens)
413 | print("--Tiktoken len:")
414 | print(len)
415 |
416 | assert len > 100
417 |
418 |
419 | def test_is_footnote_def_detection():
420 | doc = TextDoc.from_text(
421 | dedent(
422 | """
423 | Title.
424 |
425 | Body with a ref[^a1].
426 |
427 | [^a1]: The definition line
428 | """
429 | ).strip()
430 | )
431 |
432 | assert len(doc.paragraphs) == 3
433 | assert not doc.paragraphs[0].is_footnote_def()
434 | assert not doc.paragraphs[1].is_footnote_def()
435 | assert doc.paragraphs[2].is_footnote_def()
436 |
--------------------------------------------------------------------------------
/src/chopdiff/html/html_in_md.py:
--------------------------------------------------------------------------------
1 | """
2 | Formatting of Markdown with a small set of known HTML classes. We do this directly
3 | ourselves to keep the HTML very minimal, control whitespace, and to avoid any
4 | confusions of using full HTML escaping (like unnecessary &quot;s etc.)
5 |
6 | Perhaps worth using FastHTML for this?
7 | """
8 |
9 | import re
10 | from collections.abc import Callable
11 | from typing import TypeAlias
12 |
13 |
14 | def escape_md_html(s: str, safe: bool = False) -> str:
15 | """
16 | Escape a string for Markdown with HTML. Don't escape single and double quotes.
17 | """
18 | if safe:
19 | return s
20 | s = s.replace("&", "&")
21 | s = s.replace("<", "<")
22 | s = s.replace(">", ">")
23 | return s
24 |
25 |
26 | def escape_attribute(s: str) -> str:
27 | """
28 | Escape a string for use as an HTML attribute. Escape single and double quotes.
29 | """
30 | s = escape_md_html(s)
31 | s = s.replace('"', """)
32 | s = s.replace("'", "'")
33 | return s
34 |
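# Illustrative examples (added sketch, assuming the entity replacements above):
#
#   escape_md_html('5 < 6 & "ok"')  -> '5 &lt; 6 &amp; "ok"'   (quotes left alone)
#   escape_attribute('say "hi"')    -> 'say &quot;hi&quot;'    (quotes escaped)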
35 |
36 | ClassNames = str | list[str] | None
37 | Attrs = dict[str, str | bool]
38 |
39 | _TAGS_WITH_PADDING = ["div", "p"]
40 |
41 |
42 | def tag_with_attrs(
43 | tag: str,
44 | text: str | None,
45 | class_name: ClassNames = None,
46 | *,
47 | attrs: Attrs | None = None,
48 | safe: bool = False,
49 | padding: str | None = None,
50 | ) -> str:
51 | """
52 | Create an HTML tag with optional class names and attributes.
53 | Boolean attribute values: True includes the attribute, False omits it.
54 | """
55 | class_value = ""
56 | if class_name is not None:
57 | if isinstance(class_name, str):
58 | class_value = class_name.strip()
59 | else: # list[str]
60 | # Filter out empty strings and join
61 | filtered_classes = [cls for cls in class_name if cls.strip()]
62 | class_value = " ".join(filtered_classes)
63 |
64 | attr_str = f' class="{escape_attribute(class_value)}"' if class_value else ""
65 | if attrs:
66 | for k, v in attrs.items():
67 | if isinstance(v, bool):
68 | if v: # Only include attribute if True
69 | attr_str += f" {k}"
70 | else: # string value
71 | attr_str += f' {k}="{escape_attribute(v)}"'
72 | # Default padding for div and p tags.
73 | if text is None:
74 | return f"<{tag}{attr_str} />"
75 | else:
76 | content = escape_md_html(text, safe)
77 | if padding is None:
78 | padding = "\n" if tag in _TAGS_WITH_PADDING else ""
79 | if padding:
80 | content = content.strip("\n")
81 | if not content:
82 | padding = ""
83 | return f"<{tag}{attr_str}>{padding}{content}{padding}{tag}>"
84 |
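# Rough usage sketch (illustrative, assuming the defaults above, where div/p get
# newline padding and span does not):
#
#   tag_with_attrs("span", "hi", class_name="note")          -> '<span class="note">hi</span>'
#   tag_with_attrs("input", None, attrs={"disabled": True})  -> '<input disabled />'
#   tag_with_attrs("div", "body")                             -> '<div>\nbody\n</div>'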
85 |
86 | def html_span(
87 | text: str,
88 | class_name: ClassNames = None,
89 | *,
90 | attrs: Attrs | None = None,
91 | safe: bool = False,
92 | ) -> str:
93 | """
94 | Write a span tag for use in Markdown, with the given text and optional class and attributes.
95 | """
96 | return tag_with_attrs("span", text, class_name, attrs=attrs, safe=safe)
97 |
98 |
99 | def html_div(
100 | text: str,
101 | class_name: ClassNames = None,
102 | *,
103 | attrs: Attrs | None = None,
104 | safe: bool = False,
105 | padding: str | None = None,
106 | ) -> str:
107 | """
108 | Write a div tag for use in Markdown, with the given text and optional class and attributes.
109 | """
110 | return tag_with_attrs("div", text, class_name, attrs=attrs, safe=safe, padding=padding)
111 |
112 |
113 | def html_a(
114 | text: str,
115 | href: str,
116 | class_name: ClassNames = None,
117 | *,
118 | attrs: Attrs | None = None,
119 | safe: bool = False,
120 | ) -> str:
121 | """
122 | Write an anchor tag with href, optional class and attributes.
123 | """
124 | link_attrs: Attrs = {"href": href}
125 | if attrs:
126 | link_attrs.update(attrs)
127 | return tag_with_attrs("a", text, class_name, attrs=link_attrs, safe=safe)
128 |
129 |
130 | def html_b(
131 | text: str,
132 | class_name: ClassNames = None,
133 | *,
134 | attrs: Attrs | None = None,
135 | safe: bool = False,
136 | ) -> str:
137 | """
138 | Write a bold tag with optional class and attributes.
139 | """
140 | return tag_with_attrs("b", text, class_name, attrs=attrs, safe=safe)
141 |
142 |
143 | def html_i(
144 | text: str,
145 | class_name: ClassNames = None,
146 | *,
147 | attrs: Attrs | None = None,
148 | safe: bool = False,
149 | ) -> str:
150 | """
151 | Write an italic tag with optional class and attributes.
152 | """
153 | return tag_with_attrs("i", text, class_name, attrs=attrs, safe=safe)
154 |
155 |
156 | def html_img(
157 | src: str,
158 | alt: str,
159 | class_name: ClassNames = None,
160 | *,
161 | attrs: Attrs | None = None,
162 | safe: bool = False,
163 | ) -> str:
164 | img_attrs: Attrs = {"src": src, "alt": alt}
165 | if attrs:
166 | for k, v in attrs.items():
167 | img_attrs[k] = v
168 | return tag_with_attrs("img", None, class_name, attrs=img_attrs, safe=safe)
169 |
170 |
171 | def html_join_blocks(*blocks: str | None) -> str:
172 | """
173 | Join block elements, with double newlines for better Markdown compatibility.
174 | Ignore empty strings or None.
175 | """
176 | return "\n\n".join(block.strip("\n") for block in blocks if block)
177 |
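# Illustrative example: None and empty blocks are dropped, the rest are separated
# by a blank line:
#
#   html_join_blocks("<div>a</div>\n", None, "<p>b</p>")  -> '<div>a</div>\n\n<p>b</p>'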
178 |
179 | def md_para(text: str) -> str:
180 | """
181 | Convert newlines within the text into double newlines (paragraph breaks) for Markdown.
182 | """
183 | return "\n\n".join(text.split("\n"))
184 |
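# Illustrative example of the behavior above: each newline becomes a paragraph break:
#
#   md_para("line one\nline two")  -> 'line one\n\nline two'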
185 |
186 | Wrapper: TypeAlias = Callable[[str], str]
187 | """Wraps a string to identify it in some way."""
188 |
189 |
190 | def identity_wrapper(text: str) -> str:
191 | return text
192 |
193 |
194 | def _check_class_name(class_name: ClassNames) -> None:
195 | if class_name:
196 | if isinstance(class_name, str):
197 | # Allow modern CSS class naming including BEM notation (block__element--modifier)
198 | if class_name.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", class_name):
199 | raise ValueError(f"Expected a valid CSS class name but got: '{class_name}'")
200 | else: # list[str]
201 | for cls in class_name:
202 | if cls.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", cls):
203 | raise ValueError(f"Expected a valid CSS class name but got: '{cls}'")
204 |
205 |
206 | def html_p(
207 | text: str,
208 | class_name: ClassNames = None,
209 | *,
210 | attrs: Attrs | None = None,
211 | safe: bool = False,
212 | padding: str | None = None,
213 | ) -> str:
214 | """
215 | Write a p tag for use in Markdown, with the given text and optional class and attributes.
216 | """
217 | return tag_with_attrs("p", text, class_name, attrs=attrs, safe=safe, padding=padding)
218 |
219 |
220 | def html_tag(
221 | tag: str,
222 | text: str | None = None,
223 | class_name: ClassNames = None,
224 | *,
225 | attrs: Attrs | None = None,
226 | safe: bool = False,
227 | padding: str | None = None,
228 | ) -> str:
229 | """
230 | Generic function to create any HTML tag with optional class and attributes.
231 | """
232 | return tag_with_attrs(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
233 |
234 |
235 | def div_wrapper(
236 | class_name: ClassNames = None,
237 | *,
238 | attrs: Attrs | None = None,
239 | safe: bool = True,
240 | padding: str | None = "\n\n",
241 | ) -> Wrapper:
242 | _check_class_name(class_name)
243 |
244 | def div_wrapper_func(text: str) -> str:
245 | return html_div(text, class_name, attrs=attrs, safe=safe, padding=padding)
246 |
247 | return div_wrapper_func
248 |
249 |
250 | def span_wrapper(
251 | class_name: ClassNames = None,
252 | *,
253 | attrs: Attrs | None = None,
254 | safe: bool = True,
255 | ) -> Wrapper:
256 | _check_class_name(class_name)
257 |
258 | def span_wrapper_func(text: str) -> str:
259 | return html_span(text, class_name, attrs=attrs, safe=safe)
260 |
261 | return span_wrapper_func
262 |
263 |
264 | def tag_wrapper(
265 | tag: str,
266 | class_name: ClassNames = None,
267 | *,
268 | attrs: Attrs | None = None,
269 | safe: bool = True,
270 | padding: str | None = None,
271 | ) -> Wrapper:
272 | """
273 | Generic wrapper factory for any HTML tag.
274 | """
275 | _check_class_name(class_name)
276 |
277 | def tag_wrapper_func(text: str) -> str:
278 | return html_tag(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
279 |
280 | return tag_wrapper_func
281 |
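# Sketch of wrapper usage (illustrative; the class name "chunk" is only an example):
#
#   chunk_div = div_wrapper(class_name="chunk")
#   chunk_div("Some **Markdown** text.")  -> '<div class="chunk">\n\nSome **Markdown** text.\n\n</div>'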
282 |
283 | ## Tests
284 |
285 |
286 | def test_html():
287 | assert escape_md_html("&<>") == "&amp;&lt;&gt;"
288 | assert escape_attribute("\"'&<>") == "&quot;&#x27;&amp;&lt;&gt;"
289 | assert (
290 | tag_with_attrs("span", "text", class_name="foo", attrs={"id": "a"})
291 | == '<span class="foo" id="a">text</span>'
292 | )
293 | assert (
294 | html_span("text", class_name="foo", attrs={"id": "a"})
295 | == '<span class="foo" id="a">text</span>'
296 | )
297 | assert (
298 | html_div("text 1<2", class_name="foo", attrs={"id": "a"})
299 | == '<div class="foo" id="a">\ntext 1&lt;2\n</div>'
300 | )
301 | assert html_div("text") == "\ntext\n
"
302 |
303 |
304 | def test_boolean_attrs():
305 | assert tag_with_attrs("input", None, attrs={"disabled": True}) == " "
306 | assert tag_with_attrs("input", None, attrs={"disabled": False}) == " "
307 | assert (
308 | tag_with_attrs("input", None, attrs={"disabled": True, "required": True, "id": "test"})
309 | == '<input disabled required id="test" />'
310 | )
311 | assert (
312 | tag_with_attrs("input", None, attrs={"disabled": False, "required": True})
313 | == " "
314 | )
315 |
316 |
317 | def test_class_names():
318 | assert (
319 | tag_with_attrs("div", "text", class_name=["foo", "bar"])
320 | == '<div class="foo bar">\ntext\n</div>'
321 | )
322 | assert tag_with_attrs("span", "text", class_name="single") == '<span class="single">text</span>'
323 | assert tag_with_attrs("span", "text", class_name=None) == "<span>text</span>"
324 | assert tag_with_attrs("span", "text", class_name=[]) == "<span>text</span>"
325 | assert tag_with_attrs("span", "text", class_name="") == "<span>text</span>"
326 | assert tag_with_attrs("span", "text", class_name=["", ""]) == "<span>text</span>"
327 | assert (
328 | tag_with_attrs("span", "text", class_name=["foo", "", "bar"])
329 | == '<span class="foo bar">text</span>'
330 | )
331 |
332 |
333 | def test_padding():
334 | assert tag_with_attrs("span", "text") == "text "
335 | assert tag_with_attrs("div", "text") == "\ntext\n
"
336 | assert tag_with_attrs("p", "text") == "\ntext\n
"
337 | assert tag_with_attrs("div", "text", padding="") == "text
"
338 | assert tag_with_attrs("div", "", padding="\n") == "
"
339 |
340 |
341 | def test_safe_mode():
342 | assert tag_with_attrs("div", "