├── tests ├── __init__.py ├── divs │ ├── __init__.py │ ├── test_div_elements.py │ └── test_parse_divs.py ├── docs │ ├── __init__.py │ ├── test_token_mapping.py │ ├── test_wordtoks.py │ ├── test_token_diffs.py │ └── test_text_doc.py ├── html │ ├── __init__.py │ └── test_timestamps.py ├── util │ ├── __init__.py │ └── test_lemmatize.py └── transforms │ ├── __init__.py │ ├── test_sliding_windows.py │ ├── test_sliding_transforms.py │ └── test_diff_filters.py ├── src └── chopdiff │ ├── py.typed │ ├── __init__.py │ ├── util │ ├── __init__.py │ ├── tiktoken_utils.py │ └── lemmatize.py │ ├── html │ ├── html_plaintext.py │ ├── extractor.py │ ├── __init__.py │ ├── timestamps.py │ ├── html_in_md.py │ └── html_tags.py │ ├── divs │ ├── __init__.py │ ├── chunk_utils.py │ ├── div_elements.py │ ├── parse_divs.py │ └── text_node.py │ ├── docs │ ├── sizes.py │ ├── __init__.py │ ├── search_tokens.py │ ├── token_mapping.py │ ├── wordtoks.py │ └── token_diffs.py │ └── transforms │ ├── __init__.py │ ├── sliding_windows.py │ ├── window_settings.py │ ├── diff_filters.py │ └── sliding_transforms.py ├── .copier-answers.yml ├── installation.md ├── LICENSE ├── Makefile ├── .github └── workflows │ ├── publish.yml │ └── ci.yml ├── examples ├── gettysberg.txt ├── insert_para_breaks.py └── backfill_timestamps.py ├── devtools └── lint.py ├── publishing.md ├── development.md ├── .cursor └── rules │ ├── general.mdc │ └── python.mdc ├── .gitignore └── pyproject.toml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/divs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/docs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/html/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/chopdiff/util/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal 4 | from chopdiff.util.tiktoken_utils import tiktoken_len 5 | 6 | __all__ = [ 7 | "lemmatize", 8 | "lemmatized_equal", 9 | "tiktoken_len", 10 | ] 11 | -------------------------------------------------------------------------------- /src/chopdiff/util/tiktoken_utils.py: -------------------------------------------------------------------------------- 1 | 
import tiktoken
 2 | 
 3 | 
 4 | def tiktoken_len(string: str, encoding_name: str = "o200k_base") -> int:
 5 |     """
 6 |     Length of text in tiktokens.
 7 |     """
 8 |     encoding = tiktoken.get_encoding(encoding_name)
 9 |     num_tokens = len(encoding.encode(string))
10 |     return num_tokens
11 | 
--------------------------------------------------------------------------------
/.copier-answers.yml:
--------------------------------------------------------------------------------
 1 | # Changes here will be overwritten by Copier. Do not edit manually.
 2 | _commit: v0.2.17
 3 | _src_path: gh:jlevy/simple-modern-uv
 4 | package_author_email: joshua@cal.berkeley.edu
 5 | package_author_name: Joshua Levy
 6 | package_description: Simple tools for parsing/diffing/processing text to support LLM
 7 |   applications
 8 | package_github_org: jlevy
 9 | package_module: chopdiff
10 | package_name: chopdiff
11 | 
--------------------------------------------------------------------------------
/tests/util/test_lemmatize.py:
--------------------------------------------------------------------------------
 1 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal
 2 | 
 3 | 
 4 | def test_lemmatize():
 5 |     assert lemmatize("running") == "run"
 6 |     assert lemmatize("better") == "good"
 7 |     assert lemmatize("The cats are running") == "the cat be run"
 8 |     assert lemmatize("Hello, world!") == "hello , world !"
 9 |     assert lemmatize("I have 3 cats.") == "I have 3 cat ."
10 |     assert lemmatized_equal("The cat runs", "The cats running")
11 |     assert not lemmatized_equal("The cat runs", "The dog runs")
12 |     assert lemmatized_equal("The CAT runs", "the cats RUN")
13 |     assert not lemmatized_equal("The CAT runs", "the cats RAN", case_sensitive=True)
14 | 
--------------------------------------------------------------------------------
/src/chopdiff/html/html_plaintext.py:
--------------------------------------------------------------------------------
 1 | import html
 2 | import re
 3 | 
 4 | 
 5 | def plaintext_to_html(text: str):
 6 |     """
 7 |     Convert plaintext to HTML, also handling newlines and whitespace.
 8 |     """
 9 |     return (
10 |         html.escape(text)
11 |         .replace("\n", "<br>")
12 |         .replace("\t", "&nbsp;" * 4)
13 |         .replace("  ", "&nbsp;&nbsp;")
14 |     )
15 | 
16 | 
17 | def html_to_plaintext(text: str):
18 |     """
19 |     Convert HTML to plaintext, stripping tags and converting entities.
20 |     """
21 |     text = re.sub(r"<br>", "\n", text, flags=re.IGNORECASE)
22 |     text = re.sub(r"</p><p>
", "\n\n", text, flags=re.IGNORECASE) 23 | unescaped_text = html.unescape(text) 24 | clean_text = re.sub("<[^<]+?>", "", unescaped_text) 25 | return clean_text 26 | -------------------------------------------------------------------------------- /installation.md: -------------------------------------------------------------------------------- 1 | ## Installing uv and Python 2 | 3 | This project is set up to use [**uv**](https://docs.astral.sh/uv/), the new package 4 | manager for Python. `uv` replaces traditional use of `pyenv`, `pipx`, `poetry`, `pip`, 5 | etc. This is a quick cheat sheet on that: 6 | 7 | On macOS or Linux, if you don't have `uv` installed, a quick way to install it: 8 | 9 | ```shell 10 | curl -LsSf https://astral.sh/uv/install.sh | sh 11 | ``` 12 | 13 | For macOS, you prefer [brew](https://brew.sh/) you can install or upgrade uv with: 14 | 15 | ```shell 16 | brew update 17 | brew install uv 18 | ``` 19 | 20 | See [uv's docs](https://docs.astral.sh/uv/getting-started/installation/) for more 21 | installation methods and platforms. 22 | 23 | Now you can use uv to install a current Python environment: 24 | 25 | ```shell 26 | uv python install 3.13 # Or pick another version. 27 | ``` 28 | -------------------------------------------------------------------------------- /src/chopdiff/divs/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_generator, chunk_paras 4 | from chopdiff.divs.div_elements import ( 5 | CHUNK, 6 | GROUP, 7 | ORIGINAL, 8 | RESULT, 9 | chunk_text_as_divs, 10 | div, 11 | div_get_original, 12 | div_insert_wrapped, 13 | ) 14 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class, parse_divs_single 15 | from chopdiff.divs.text_node import TextNode 16 | 17 | __all__ = [ 18 | "chunk_children", 19 | "chunk_generator", 20 | "chunk_paras", 21 | "CHUNK", 22 | "GROUP", 23 | "ORIGINAL", 24 | "RESULT", 25 | "chunk_text_as_divs", 26 | "div", 27 | "div_get_original", 28 | "div_insert_wrapped", 29 | "parse_divs", 30 | "parse_divs_by_class", 31 | "parse_divs_single", 32 | "TextNode", 33 | ] 34 | -------------------------------------------------------------------------------- /src/chopdiff/html/extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections.abc import Iterable 3 | from typing import Generic, TypeAlias, TypeVar 4 | 5 | T = TypeVar("T") 6 | 7 | Match: TypeAlias = tuple[T, int, int] 8 | """Match, index, and offset of content found by an extractor.""" 9 | 10 | 11 | class ContentNotFound(ValueError): 12 | """ 13 | Exception raised when content is not found by an extractor. 14 | """ 15 | 16 | 17 | class Extractor(ABC, Generic[T]): 18 | """ 19 | Abstract base class for extractors that extract information from a document at a 20 | given location. We use a class and not a pure function since we may need to 21 | preprocess the document. 
22 | """ 23 | 24 | @abstractmethod 25 | def extract_all(self) -> Iterable[Match[T]]: 26 | pass 27 | 28 | @abstractmethod 29 | def extract_preceding(self, wordtok_offset: int) -> Match[T]: 30 | pass 31 | -------------------------------------------------------------------------------- /src/chopdiff/util/lemmatize.py: -------------------------------------------------------------------------------- 1 | def lemmatize(text: str, lang: str = "en") -> str: 2 | """ 3 | Returns a string of lemmatized tokens using simplemma. 4 | """ 5 | try: 6 | import simplemma 7 | except ImportError: 8 | raise ImportError( 9 | "simplemma is an optional dependency of chopdiff. Add it to use lemmatization." 10 | ) 11 | 12 | tokens = simplemma.simple_tokenizer(text) 13 | lemmatized_tokens = [simplemma.lemmatize(token, lang=lang) for token in tokens] 14 | return " ".join(lemmatized_tokens) 15 | 16 | 17 | def lemmatized_equal(text1: str, text2: str, case_sensitive: bool = False) -> bool: 18 | """ 19 | Compare two texts to see if they are the same except for lemmatization. 20 | Ignores whitespace. Does not ignore punctuation. 21 | """ 22 | if not case_sensitive: 23 | text1 = text1.lower() 24 | text2 = text2.lower() 25 | return lemmatize(text1) == lemmatize(text2) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Joshua Levy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for easy development workflows. 2 | # See development.md for docs. 3 | # Note GitHub Actions call uv directly, not this Makefile. 4 | 5 | .DEFAULT_GOAL := default 6 | 7 | .PHONY: default install lint test upgrade build clean agent-rules 8 | 9 | default: agent-rules install lint test 10 | 11 | install: 12 | uv sync --all-extras 13 | 14 | lint: 15 | uv run python devtools/lint.py 16 | 17 | test: 18 | uv run pytest 19 | 20 | upgrade: 21 | uv sync --upgrade --all-extras --dev 22 | 23 | build: 24 | uv build 25 | 26 | agent-rules: CLAUDE.md AGENTS.md 27 | 28 | # Use .cursor/rules for sources of rules. 29 | # Create Claude and Codex rules from these. 
30 | CLAUDE.md: .cursor/rules/general.mdc .cursor/rules/python.mdc 31 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > CLAUDE.md 32 | 33 | AGENTS.md: .cursor/rules/general.mdc .cursor/rules/python.mdc 34 | cat .cursor/rules/general.mdc .cursor/rules/python.mdc > AGENTS.md 35 | 36 | clean: 37 | -rm -rf dist/ 38 | -rm -rf *.egg-info/ 39 | -rm -rf .pytest_cache/ 40 | -rm -rf .mypy_cache/ 41 | -rm -rf .venv/ 42 | -rm -rf CLAUDE.md AGENTS.md 43 | -find . -type d -name "__pycache__" -exec rm -rf {} + 44 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: # Enable manual trigger. 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | id-token: write # Mandatory for OIDC. 13 | contents: read 14 | steps: 15 | - name: Checkout (official GitHub action) 16 | uses: actions/checkout@v4 17 | with: 18 | # Important for versioning plugins: 19 | fetch-depth: 0 20 | 21 | - name: Install uv (official Astral action) 22 | uses: astral-sh/setup-uv@v5 23 | with: 24 | version: "0.8.9" 25 | enable-cache: true 26 | python-version: "3.12" 27 | 28 | - name: Set up Python (using uv) 29 | run: uv python install 30 | 31 | - name: Install all dependencies 32 | run: uv sync --all-extras 33 | 34 | - name: Run tests 35 | run: uv run pytest 36 | 37 | - name: Build package 38 | run: uv build 39 | 40 | - name: Publish to PyPI 41 | run: uv publish --trusted-publishing always 42 | # Although uv is newer and faster, the "official" publishing option is the one from PyPA, 43 | # which uses twine. If desired, replace `uv publish` with: 44 | # uses: pypa/gh-action-pypi-publish@release/v1 45 | -------------------------------------------------------------------------------- /src/chopdiff/docs/sizes.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from chopdiff.docs.wordtoks import wordtokenize 4 | from chopdiff.html.html_plaintext import html_to_plaintext 5 | from chopdiff.util.tiktoken_utils import tiktoken_len 6 | 7 | 8 | def size_in_bytes(text: str) -> int: 9 | return len(text.encode("utf-8")) 10 | 11 | 12 | def size_in_wordtoks(text: str) -> int: 13 | return len(wordtokenize(text)) 14 | 15 | 16 | class TextUnit(Enum): 17 | """ 18 | Text units of measure. 19 | """ 20 | 21 | lines = "lines" 22 | bytes = "bytes" 23 | chars = "chars" 24 | words = "words" 25 | wordtoks = "wordtoks" 26 | paragraphs = "paragraphs" 27 | sentences = "sentences" 28 | tiktokens = "tiktokens" 29 | 30 | 31 | def size(text: str, unit: TextUnit) -> int: 32 | if unit == TextUnit.lines: 33 | return len(text.splitlines()) 34 | elif unit == TextUnit.bytes: 35 | return size_in_bytes(text) 36 | elif unit == TextUnit.chars: 37 | return len(text) 38 | elif unit == TextUnit.words: 39 | # Roughly accurate for HTML, text, or Markdown docs. 
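        # For example, size("<p>Hello, world!</p>", TextUnit.words) == 2: tags are stripped to plaintext before splitting on whitespace.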
40 | return len(html_to_plaintext(text).split()) 41 | elif unit == TextUnit.wordtoks: 42 | return size_in_wordtoks(text) 43 | elif unit == TextUnit.tiktokens: 44 | return tiktoken_len(text) 45 | else: 46 | raise NotImplementedError(f"Unsupported unit for string: {unit}") 47 | -------------------------------------------------------------------------------- /examples/gettysberg.txt: -------------------------------------------------------------------------------- 1 | four score and seven years ago our fathers brought forth on this continent, a new 2 | nation, conceived in Liberty, and dedicated to the proposition that all men are created 3 | equal. Now we are engaged in a great civil war, testing whether that nation, or any 4 | nation so conceived and so dedicated, can long endure. We are met on a great 5 | battle-field of that war. We have come to dedicate a portion of that field, as a final 6 | resting place for those who here gave their lives that that nation might live. It is 7 | altogether fitting and proper that we should do this. But, in a larger sense, we can not 8 | dedicate—we can not consecrate—we can not hallow—this ground. The brave men, living and 9 | dead, who struggled here, have consecrated it, far above our poor power to add or 10 | detract. The world will little note, nor long remember what we say here, but it can 11 | never forget what they did here. It is for us the living, rather, to be dedicated here 12 | to the unfinished work which they who fought here have thus far so nobly advanced. It is 13 | rather for us to be here dedicated to the great task remaining before us—that from these 14 | honored dead we take increased devotion to that cause for which they gave the last full 15 | measure of devotion—that we here highly resolve that these dead shall not have died in 16 | vain—that this nation, under God, shall have a new birth of freedom—and that government 17 | of the people, by the people, for the people, shall not perish from the earth. 
-------------------------------------------------------------------------------- /src/chopdiff/html/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match 4 | from chopdiff.html.html_in_md import ( 5 | Attrs, 6 | ClassNames, 7 | Wrapper, 8 | div_wrapper, 9 | escape_md_html, 10 | html_a, 11 | html_b, 12 | html_div, 13 | html_i, 14 | html_img, 15 | html_join_blocks, 16 | html_span, 17 | md_para, 18 | span_wrapper, 19 | tag_with_attrs, 20 | ) 21 | from chopdiff.html.html_plaintext import html_to_plaintext, plaintext_to_html 22 | from chopdiff.html.html_tags import ( 23 | TagMatch, 24 | html_extract_attribute_value, 25 | html_find_tag, 26 | rewrite_html_img_urls, 27 | rewrite_html_tag_attr, 28 | ) 29 | from chopdiff.html.timestamps import ( 30 | TimestampExtractor, 31 | extract_timestamp, 32 | has_timestamp, 33 | ) 34 | 35 | __all__ = [ 36 | "Attrs", 37 | "ClassNames", 38 | "ContentNotFound", 39 | "Extractor", 40 | "Match", 41 | "TagMatch", 42 | "html_extract_attribute_value", 43 | "html_find_tag", 44 | "rewrite_html_img_urls", 45 | "rewrite_html_tag_attr", 46 | "Wrapper", 47 | "div_wrapper", 48 | "escape_md_html", 49 | "html_a", 50 | "html_b", 51 | "html_div", 52 | "html_i", 53 | "html_img", 54 | "html_join_blocks", 55 | "html_span", 56 | "md_para", 57 | "span_wrapper", 58 | "tag_with_attrs", 59 | "html_to_plaintext", 60 | "plaintext_to_html", 61 | "TimestampExtractor", 62 | "extract_timestamp", 63 | "has_timestamp", 64 | ] 65 | -------------------------------------------------------------------------------- /devtools/lint.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | from funlog import log_calls 4 | from rich import get_console, reconfigure 5 | from rich import print as rprint 6 | 7 | # Update as needed. 8 | SRC_PATHS = ["src", "tests", "devtools", "examples"] 9 | DOC_PATHS = ["README.md"] 10 | 11 | 12 | reconfigure(emoji=not get_console().options.legacy_windows) # No emojis on legacy windows. 
13 | 14 | 15 | def main(): 16 | rprint() 17 | 18 | errcount = 0 19 | errcount += run(["codespell", "--write-changes", *SRC_PATHS, *DOC_PATHS]) 20 | errcount += run(["ruff", "check", "--fix", *SRC_PATHS]) 21 | errcount += run(["ruff", "format", *SRC_PATHS]) 22 | errcount += run(["basedpyright", "--stats", *SRC_PATHS]) 23 | 24 | rprint() 25 | 26 | if errcount != 0: 27 | rprint(f"[bold red]:x: Lint failed with {errcount} errors.[/bold red]") 28 | else: 29 | rprint("[bold green]:white_check_mark: Lint passed![/bold green]") 30 | rprint() 31 | 32 | return errcount 33 | 34 | 35 | @log_calls(level="warning", show_timing_only=True) 36 | def run(cmd: list[str]) -> int: 37 | rprint() 38 | rprint(f"[bold green]>> {' '.join(cmd)}[/bold green]") 39 | errcount = 0 40 | try: 41 | subprocess.run(cmd, text=True, check=True) 42 | except KeyboardInterrupt: 43 | rprint("[yellow]Keyboard interrupt - Cancelled[/yellow]") 44 | errcount = 1 45 | except subprocess.CalledProcessError as e: 46 | rprint(f"[bold red]Error: {e}[/bold red]") 47 | errcount = 1 48 | 49 | return errcount 50 | 51 | 52 | if __name__ == "__main__": 53 | exit(main()) 54 | -------------------------------------------------------------------------------- /tests/transforms/test_sliding_windows.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from textwrap import dedent 3 | 4 | from chopdiff.docs.sizes import TextUnit, size 5 | from chopdiff.docs.text_doc import TextDoc 6 | from chopdiff.transforms.sliding_windows import sliding_word_window 7 | 8 | _example_text = dedent( 9 | """ 10 | This is the first paragraph. It has multiple sentences. 11 | 12 | This is the second paragraph. It also has multiple sentences. And it continues. 13 | 14 | Here is the third paragraph. More sentences follow. And here is another one. 
15 | """ 16 | ).strip() 17 | 18 | 19 | def test_sliding_window(): 20 | doc = TextDoc.from_text(_example_text) 21 | window_size = 80 22 | window_shift = 60 23 | 24 | windows = list(sliding_word_window(doc, window_size, window_shift, TextUnit.bytes)) 25 | pprint(windows) 26 | 27 | sentence_windows = [ 28 | [[sent.text for sent in para.sentences] for para in doc.paragraphs] for doc in windows 29 | ] 30 | 31 | assert sentence_windows == [ 32 | [["This is the first paragraph.", "It has multiple sentences."]], 33 | [["It has multiple sentences."], ["This is the second paragraph."]], 34 | [ 35 | [ 36 | "This is the second paragraph.", 37 | "It also has multiple sentences.", 38 | "And it continues.", 39 | ] 40 | ], 41 | [ 42 | ["And it continues."], 43 | ["Here is the third paragraph.", "More sentences follow."], 44 | ], 45 | ] 46 | 47 | for sub_doc in windows: 48 | sub_text = sub_doc.reassemble() 49 | 50 | print(f"\n\n---Sub-document length {size(sub_text, TextUnit.bytes)}") 51 | pprint(sub_text) 52 | 53 | assert size(sub_text, TextUnit.bytes) <= window_size 54 | 55 | assert sub_text in doc.reassemble() 56 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from chopdiff.transforms.diff_filters import ( 2 | WILDCARD_TOK, 3 | adds_headings, 4 | changes_whitespace, 5 | changes_whitespace_or_punct, 6 | make_token_sequence_filter, 7 | no_word_lemma_changes, 8 | removes_word_lemmas, 9 | removes_words, 10 | ) 11 | from chopdiff.transforms.sliding_transforms import ( 12 | TextDocTransform, 13 | filtered_transform, 14 | remove_window_br, 15 | sliding_para_window_transform, 16 | sliding_window_transform, 17 | sliding_wordtok_window_transform, 18 | ) 19 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window 20 | from chopdiff.transforms.window_settings import ( 21 | WINDOW_1_PARA, 22 | WINDOW_2_PARA, 23 | WINDOW_2K_WORDTOKS, 24 | WINDOW_4_PARA, 25 | WINDOW_8_PARA, 26 | WINDOW_16_PARA, 27 | WINDOW_32_PARA, 28 | WINDOW_64_PARA, 29 | WINDOW_128_PARA, 30 | WINDOW_256_PARA, 31 | WINDOW_512_PARA, 32 | WINDOW_1024_PARA, 33 | WINDOW_BR, 34 | WINDOW_BR_SEP, 35 | WINDOW_NONE, 36 | WindowSettings, 37 | ) 38 | 39 | __all__ = [ 40 | "WILDCARD_TOK", 41 | "adds_headings", 42 | "changes_whitespace", 43 | "changes_whitespace_or_punct", 44 | "make_token_sequence_filter", 45 | "no_word_lemma_changes", 46 | "removes_word_lemmas", 47 | "removes_words", 48 | "TextDocTransform", 49 | "filtered_transform", 50 | "remove_window_br", 51 | "sliding_para_window_transform", 52 | "sliding_window_transform", 53 | "sliding_wordtok_window_transform", 54 | "sliding_para_window", 55 | "sliding_word_window", 56 | "WINDOW_1_PARA", 57 | "WINDOW_2_PARA", 58 | "WINDOW_2K_WORDTOKS", 59 | "WINDOW_4_PARA", 60 | "WINDOW_8_PARA", 61 | "WINDOW_16_PARA", 62 | "WINDOW_32_PARA", 63 | "WINDOW_64_PARA", 64 | "WINDOW_128_PARA", 65 | "WINDOW_256_PARA", 66 | "WINDOW_512_PARA", 67 | "WINDOW_1024_PARA", 68 | "WINDOW_BR", 69 | "WINDOW_BR_SEP", 70 | "WINDOW_NONE", 71 | "WindowSettings", 72 | ] 73 | -------------------------------------------------------------------------------- /tests/html/test_timestamps.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.html.extractor import ContentNotFound 4 | from chopdiff.html.timestamps import TimestampExtractor 5 | 6 | 7 | def 
test_timestamp_extractor(): 8 | doc_str = 'Sentence one. Sentence two. Sentence three.' 9 | 10 | extractor = TimestampExtractor(doc_str) 11 | wordtoks = extractor.wordtoks 12 | 13 | results: list[str] = [] 14 | offsets: list[int] = [] 15 | for i, wordtok in enumerate(wordtoks): 16 | try: 17 | timestamp, _index, offset = extractor.extract_preceding(i) 18 | except ContentNotFound: 19 | timestamp = None 20 | offset = -1 21 | results.append(f"{i}: {timestamp} ⎪{wordtok}⎪") 22 | offsets.append(offset) 23 | 24 | print("\n".join(results)) 25 | print(offsets) 26 | 27 | assert ( 28 | "\n".join(results) 29 | == dedent( 30 | """ 31 | 0: None ⎪<-BOF->⎪ 32 | 1: None ⎪⎪ 33 | 2: 1.234 ⎪Sentence⎪ 34 | 3: 1.234 ⎪ ⎪ 35 | 4: 1.234 ⎪one⎪ 36 | 5: 1.234 ⎪.⎪ 37 | 6: 1.234 ⎪⎪ 38 | 7: 1.234 ⎪ ⎪ 39 | 8: 1.234 ⎪⎪ 40 | 9: 23.0 ⎪Sentence⎪ 41 | 10: 23.0 ⎪ ⎪ 42 | 11: 23.0 ⎪two⎪ 43 | 12: 23.0 ⎪.⎪ 44 | 13: 23.0 ⎪⎪ 45 | 14: 23.0 ⎪ ⎪ 46 | 15: 23.0 ⎪Sentence⎪ 47 | 16: 23.0 ⎪ ⎪ 48 | 17: 23.0 ⎪three⎪ 49 | 18: 23.0 ⎪.⎪ 50 | 19: 23.0 ⎪<-EOF->⎪ 51 | """ 52 | ).strip() 53 | ) 54 | 55 | assert offsets == [ 56 | -1, 57 | -1, 58 | 0, 59 | 0, 60 | 0, 61 | 0, 62 | 0, 63 | 0, 64 | 0, 65 | 50, 66 | 50, 67 | 50, 68 | 50, 69 | 50, 70 | 50, 71 | 50, 72 | 50, 73 | 50, 74 | 50, 75 | 50, 76 | ] 77 | -------------------------------------------------------------------------------- /src/chopdiff/docs/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa: F401 2 | 3 | from chopdiff.docs.search_tokens import search_tokens 4 | from chopdiff.docs.sizes import TextUnit 5 | from chopdiff.docs.text_doc import Paragraph, Sentence, SentIndex, TextDoc 6 | from chopdiff.docs.token_diffs import ( 7 | DIFF_FILTER_NONE, 8 | DiffFilter, 9 | DiffOp, 10 | DiffStats, 11 | OpType, 12 | TokenDiff, 13 | diff_docs, 14 | diff_wordtoks, 15 | scored_diff_wordtoks, 16 | ) 17 | from chopdiff.docs.token_mapping import TokenMapping 18 | from chopdiff.docs.wordtoks import ( 19 | BOF_STR, 20 | BOF_TOK, 21 | EOF_STR, 22 | EOF_TOK, 23 | PARA_BR_STR, 24 | PARA_BR_TOK, 25 | SENT_BR_STR, 26 | SENT_BR_TOK, 27 | SPACE_TOK, 28 | SYMBOL_SEP, 29 | Tag, 30 | first_wordtok, 31 | is_break_or_space, 32 | is_div, 33 | is_header_tag, 34 | is_tag, 35 | is_tag_close, 36 | is_tag_open, 37 | is_whitespace_or_punct, 38 | is_word, 39 | join_wordtoks, 40 | normalize_wordtok, 41 | wordtok_len, 42 | wordtok_to_str, 43 | wordtokenize, 44 | wordtokenize_with_offsets, 45 | ) 46 | 47 | __all__ = [ 48 | "search_tokens", 49 | "TextUnit", 50 | "Paragraph", 51 | "Sentence", 52 | "SentIndex", 53 | "TextDoc", 54 | "DIFF_FILTER_NONE", 55 | "DiffFilter", 56 | "DiffOp", 57 | "DiffStats", 58 | "OpType", 59 | "TokenDiff", 60 | "diff_docs", 61 | "diff_wordtoks", 62 | "scored_diff_wordtoks", 63 | "TokenMapping", 64 | "BOF_STR", 65 | "BOF_TOK", 66 | "EOF_STR", 67 | "EOF_TOK", 68 | "PARA_BR_STR", 69 | "PARA_BR_TOK", 70 | "SENT_BR_STR", 71 | "SENT_BR_TOK", 72 | "SPACE_TOK", 73 | "SYMBOL_SEP", 74 | "Tag", 75 | "first_wordtok", 76 | "is_break_or_space", 77 | "is_div", 78 | "is_header_tag", 79 | "is_tag", 80 | "is_tag_close", 81 | "is_tag_open", 82 | "is_whitespace_or_punct", 83 | "is_word", 84 | "join_wordtoks", 85 | "normalize_wordtok", 86 | "wordtok_len", 87 | "wordtok_to_str", 88 | "wordtokenize", 89 | "wordtokenize_with_offsets", 90 | ] 91 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install 
Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | # Use ["main", "master"] for CI only on the default branch. 9 | # Use ["**"] for CI on all branches. 10 | branches: ["main", "master"] 11 | pull_request: 12 | branches: ["main", "master"] 13 | 14 | permissions: 15 | contents: read 16 | 17 | jobs: 18 | build: 19 | strategy: 20 | matrix: 21 | # Update this as needed: 22 | # Common platforms: ["ubuntu-latest", "macos-latest", "windows-latest"] 23 | os: ["ubuntu-latest"] 24 | python-version: ["3.11", "3.12", "3.13"] 25 | 26 | # Linux only by default. Use ${{ matrix.os }} for other OSes. 27 | runs-on: ${{ matrix.os }} 28 | 29 | steps: 30 | 31 | # Generally following uv docs: 32 | # https://docs.astral.sh/uv/guides/integration/github/ 33 | 34 | - name: Checkout (official GitHub action) 35 | uses: actions/checkout@v4 36 | with: 37 | # Important for versioning plugins: 38 | fetch-depth: 0 39 | 40 | # From debugging the cydifflib build failure. 41 | # Confirmed we have version 3.31.6 installed. 42 | - name: Display CMake Version 43 | run: cmake --version 44 | 45 | - name: Install uv (official Astral action) 46 | uses: astral-sh/setup-uv@v5 47 | with: 48 | # Update this as needed: 49 | version: "0.8.9" 50 | enable-cache: true 51 | python-version: ${{ matrix.python-version }} 52 | 53 | - name: Set up Python (using uv) 54 | run: uv python install 55 | 56 | # Alternately can use the official Python action: 57 | # - name: Set up Python (using actions/setup-python) 58 | # uses: actions/setup-python@v5 59 | # with: 60 | # python-version: ${{ matrix.python-version }} 61 | 62 | - name: Install all dependencies 63 | run: uv sync --all-extras 64 | 65 | - name: Run linting 66 | run: uv run python devtools/lint.py 67 | 68 | - name: Run tests 69 | run: uv run pytest -------------------------------------------------------------------------------- /src/chopdiff/divs/chunk_utils.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable, Generator 2 | from typing import TypeVar 3 | 4 | from chopdiff.divs.text_node import TextNode 5 | from chopdiff.docs.sizes import TextUnit 6 | from chopdiff.docs.text_doc import TextDoc 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | def chunk_generator( 12 | doc: T, 13 | condition: Callable[[T], bool], 14 | slicer: Callable[[T, int, int], T], 15 | total_size: int, 16 | ) -> Generator[T, None, None]: 17 | """ 18 | Walk through the elements of a document and yield sequential subdocs once they meet 19 | a specific condition. 20 | """ 21 | 22 | start_index = 0 23 | current_index = 0 24 | 25 | while current_index < total_size: 26 | current_doc = slicer(doc, start_index, current_index) 27 | 28 | if condition(current_doc): 29 | yield current_doc 30 | start_index = current_index + 1 31 | current_index = start_index 32 | else: 33 | current_index += 1 34 | 35 | if start_index < total_size: 36 | yield slicer(doc, start_index, total_size) 37 | 38 | 39 | def chunk_paras(doc: TextDoc, min_size: int, unit: TextUnit) -> Generator[TextDoc, None, None]: 40 | """ 41 | Generate TextDoc chunks where each chunk is at least the specified minimum size. 
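    For example, chunk_paras(doc, 1000, TextUnit.chars) yields successive runs of whole paragraphs, each at least 1000 characters, except that the final chunk may be smaller.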
42 | """ 43 | 44 | def condition(slice: TextDoc) -> bool: 45 | return slice.size(unit) >= min_size 46 | 47 | def slicer(doc: TextDoc, start: int, end: int) -> TextDoc: 48 | return doc.sub_paras(start, end) 49 | 50 | total_paragraphs = len(doc.paragraphs) 51 | 52 | yield from chunk_generator(doc, condition, slicer, total_paragraphs) 53 | 54 | 55 | def chunk_children( 56 | node: TextNode, min_size: int, unit: TextUnit 57 | ) -> Generator[TextNode, None, None]: 58 | """ 59 | Generate TextNode chunks where each chunk is at least the specified minimum size. 60 | """ 61 | 62 | def condition(slice: TextNode) -> bool: 63 | return slice.size(unit) >= min_size 64 | 65 | def slicer(node: TextNode, start: int, end: int) -> TextNode: 66 | return node.slice_children(start, end) 67 | 68 | total_children = len(node.children) 69 | 70 | yield from chunk_generator(node, condition, slicer, total_children) 71 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/sliding_windows.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sliding windows of text on a text doc. 3 | """ 4 | 5 | import logging 6 | from collections.abc import Callable, Generator 7 | 8 | from flowmark import fill_markdown 9 | 10 | from chopdiff.docs.sizes import TextUnit 11 | from chopdiff.docs.text_doc import SentIndex, TextDoc 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def sliding_word_window( 17 | doc: TextDoc, window_size: int, window_shift: int, unit: TextUnit 18 | ) -> Generator[TextDoc, None, None]: 19 | """ 20 | Generate TextDoc sub-documents in a sliding window over the given document. 21 | """ 22 | total_size = doc.size(unit) 23 | start_offset = 0 24 | start_index, _ = doc.seek_to_sent(start_offset, unit) 25 | 26 | while start_offset < total_size: 27 | end_offset = start_offset + window_size 28 | end_index, _ = doc.seek_to_sent(end_offset, unit) 29 | 30 | # Sentence may extend past the window, so back up to ensure it fits. 31 | sub_doc = doc.sub_doc(start_index, end_index) 32 | try: 33 | while sub_doc.size(unit) > window_size: 34 | end_index = doc.prev_sent(end_index) 35 | sub_doc = doc.sub_doc(start_index, end_index) 36 | except ValueError: 37 | raise ValueError( 38 | f"Window size {window_size} too small for sentence at offset {start_offset}" 39 | ) 40 | 41 | yield sub_doc 42 | 43 | start_offset += window_shift 44 | start_index = end_index 45 | 46 | 47 | def sliding_para_window( 48 | doc: TextDoc, nparas: int, normalizer: Callable[[str], str] = fill_markdown 49 | ) -> Generator[TextDoc, None, None]: 50 | """ 51 | Generate TextDoc sub-documents taking `nparas` paragraphs at a time. 52 | """ 53 | for i in range(0, len(doc.paragraphs), nparas): 54 | end_index = min(i + nparas - 1, len(doc.paragraphs) - 1) 55 | sub_doc = doc.sub_doc(SentIndex(i, 0), SentIndex(end_index, 0)) 56 | 57 | # XXX It's important we re-normalize especially because LLMs can output itemized lists with just 58 | # one newline, but for Markdown we want separate paragraphs for each list item. 
59 | formatted_sub_doc = TextDoc.from_text(normalizer(sub_doc.reassemble())) 60 | 61 | yield formatted_sub_doc 62 | -------------------------------------------------------------------------------- /src/chopdiff/html/timestamps.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | 3 | import regex 4 | from typing_extensions import override 5 | 6 | from chopdiff.docs.search_tokens import search_tokens 7 | from chopdiff.docs.wordtoks import wordtokenize_with_offsets 8 | from chopdiff.html.extractor import ContentNotFound, Extractor, Match 9 | 10 | # Match any span or div with a data-timestamp attribute. 11 | _TIMESTAMP_RE = regex.compile(r'(?:<\w+[^>]*\s)?data-timestamp=[\'"](\d+(\.\d+)?)[\'"][^>]*>') 12 | 13 | 14 | def extract_timestamp(wordtok: str) -> float | None: 15 | match = _TIMESTAMP_RE.search(wordtok) 16 | return float(match.group(1)) if match else None 17 | 18 | 19 | def has_timestamp(wordtok: str) -> bool: 20 | return extract_timestamp(wordtok) is not None 21 | 22 | 23 | class TimestampExtractor(Extractor[float]): 24 | """ 25 | Extract timestamps of the form `<... data-timestamp="123.45">` from a document. 26 | """ 27 | 28 | def __init__(self, doc_str: str): 29 | self.doc_str = doc_str 30 | self.wordtoks, self.offsets = wordtokenize_with_offsets(self.doc_str, bof_eof=True) 31 | 32 | @override 33 | def extract_all(self) -> Iterable[Match[float]]: 34 | """ 35 | Extract all timestamps from the document. 36 | """ 37 | for index, (wordtok, offset) in enumerate(zip(self.wordtoks, self.offsets, strict=False)): 38 | timestamp = extract_timestamp(wordtok) 39 | if timestamp is not None: 40 | yield timestamp, index, offset 41 | 42 | @override 43 | def extract_preceding(self, wordtok_offset: int) -> Match[float]: 44 | try: 45 | index, wordtok = ( 46 | search_tokens(self.wordtoks).at(wordtok_offset).seek_back(has_timestamp).get_token() 47 | ) 48 | if wordtok: 49 | timestamp = extract_timestamp(wordtok) 50 | if timestamp is not None: 51 | return timestamp, index, self.offsets[index] 52 | raise ContentNotFound( 53 | f"No timestamp found seeking back from token {wordtok_offset}: {wordtok!r}" 54 | ) 55 | except KeyError as e: 56 | raise ContentNotFound( 57 | f"No timestamp found searching back from token {wordtok_offset}: {e}" 58 | ) 59 | -------------------------------------------------------------------------------- /publishing.md: -------------------------------------------------------------------------------- 1 | ## Publishing Releases 2 | 3 | This is how to publish a Python package to [**PyPI**](https://pypi.org/) from GitHub 4 | Actions, when using the 5 | [**simple-modern-uv**](https://github.com/jlevy/simple-modern-uv) template. 6 | 7 | Thanks to [the dynamic versioning 8 | plugin](https://github.com/ninoseki/uv-dynamic-versioning/) and the 9 | [`publish.yml` workflow](https://github.com/jlevy/simple-modern-uv/blob/main/template/.github/workflows/publish.yml), 10 | you can simply create tagged releases (using standard format for the tag name, e.g. 11 | `v0.1.0`) on GitHub and the tag will trigger a release build, which then uploads it to 12 | PyPI. 13 | 14 | ### How to Publish the First Time 15 | 16 | This part is a little confusing the first time. 17 | Here is the simplest way to do it. 18 | For the purposes of this example replace OWNER and PROJECT with the right values. 19 | 20 | 1. **Get a PyPI account** at [pypi.org](https://pypi.org/) and sign in. 21 | 22 | 2. 
**Pick a name for the project** that isn't already taken. 23 | 24 | - Go to `https://pypi.org/project/PROJECT` to see if another project with that name 25 | already exits. 26 | 27 | - If needed, update your `pyproject.yml` with the correct name. 28 | 29 | 3. **Authorize** your repository to publish to PyPI: 30 | 31 | - Go to [the publishing settings page](https://pypi.org/manage/account/publishing/). 32 | 33 | - Find "Trusted Publisher Management" and register your GitHub repo as a new 34 | "pending" trusted publisher 35 | 36 | - Enter the project name, repo owner, repo name, and `publish.yml` as the workflow 37 | name. (You can leave the "environment name" field blank.) 38 | 39 | 4. **Create a release** on GitHub: 40 | 41 | - Commit code and make sure it's running correctly. 42 | 43 | - Go to your GitHub project page, then click on Actions tab. 44 | 45 | - Confirm all tests are passing in the last CI workflow. 46 | (If you want, you can even publish this template when it's empty as just a stub 47 | project, to try all this out.) 48 | 49 | - Go to your GitHub project page, click on Releases. 50 | 51 | - Fill in the tag and the release name. 52 | Select to create a new tag, and pick a version. 53 | A good option is `v0.1.0`. (It's wise to have it start with a `v`.) 54 | 55 | - Submit to create the release. 56 | 57 | 5. **Confirm it publishes to PyPI** 58 | 59 | - Watch for the release workflow in the GitHub Actions tab. 60 | 61 | - If it succeeds, you should see it appear at `https://pypi.org/project/PROJECT`. 62 | 63 | ### How to Publish Subsequent Releases 64 | 65 | Just create a new release! 66 | Everything is the same as the last two steps above. 67 | 68 | * * * 69 | 70 | *This file was built with 71 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).* 72 | -------------------------------------------------------------------------------- /src/chopdiff/docs/search_tokens.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import TypeAlias 3 | 4 | Predicate: TypeAlias = Callable[[str], bool] | list[str] 5 | 6 | 7 | class _TokenSearcher: 8 | def __init__(self, toks: list[str]): 9 | self.toks = toks 10 | self._cur_idx = 0 11 | 12 | def at(self, index: int): 13 | if index is None: # pyright: ignore 14 | raise KeyError("Index cannot be None") 15 | # Convert negative indices to positive ones. 
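        # For example, with 10 tokens, at(-1) sets the current index to 9 (the last token).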
16 | self._cur_idx = index if index >= 0 else len(self.toks) + index 17 | return self 18 | 19 | def start(self): 20 | self._cur_idx = 0 21 | return self 22 | 23 | def end(self): 24 | self._cur_idx = len(self.toks) 25 | return self 26 | 27 | def seek_back(self, predicate: Predicate): 28 | if isinstance(predicate, list): 29 | allowed: list[str] = predicate 30 | predicate = lambda x: x in allowed 31 | for idx in range(self._cur_idx - 1, -1, -1): 32 | if predicate(self.toks[idx]): 33 | self._cur_idx = idx 34 | return self 35 | raise KeyError("No matching token found before the current index") 36 | 37 | def seek_forward(self, predicate: Predicate): 38 | if isinstance(predicate, list): 39 | allowed: list[str] = predicate 40 | predicate = lambda x: x in allowed 41 | for idx in range(self._cur_idx + 1, len(self.toks)): 42 | if predicate(self.toks[idx]): 43 | self._cur_idx = idx 44 | return self 45 | raise KeyError("No matching token found after the current index") 46 | 47 | def prev(self): 48 | if self._cur_idx - 1 < 0: 49 | raise KeyError("No previous token available") 50 | self._cur_idx -= 1 51 | return self 52 | 53 | def next(self): 54 | if self._cur_idx + 1 >= len(self.toks): 55 | raise KeyError("No next token available") 56 | self._cur_idx += 1 57 | return self 58 | 59 | def get_index(self) -> int: 60 | return self._cur_idx 61 | 62 | def get_token(self) -> tuple[int, str]: 63 | return self._cur_idx, self.toks[self._cur_idx] 64 | 65 | 66 | def search_tokens(wordtoks: list[str]) -> _TokenSearcher: 67 | """ 68 | Fluent convenience function to search for offsets in an array of string tokens 69 | based on a predicate, previous, next, etc. Raises `KeyError` if any search 70 | has no matches. 71 | 72 | Example: 73 | ``` 74 | index, token = ( 75 | search_tokens(list_of_tokens) 76 | .at(my_offset) 77 | .seek_back(has_timestamp) 78 | .next() 79 | .get_token() 80 | ) 81 | ``` 82 | """ 83 | return _TokenSearcher(wordtoks) 84 | -------------------------------------------------------------------------------- /tests/transforms/test_sliding_transforms.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.sizes import TextUnit 4 | from chopdiff.docs.text_doc import TextDoc 5 | from chopdiff.transforms.sliding_transforms import ( 6 | sliding_para_window_transform, 7 | sliding_window_transform, 8 | ) 9 | from chopdiff.transforms.window_settings import WINDOW_BR_SEP, WindowSettings 10 | 11 | _example_text = dedent( 12 | """ 13 | This is the first paragraph. It has multiple sentences. 14 | 15 | This is the second paragraph. It also has multiple sentences. And it continues. 16 | 17 | Here is the third paragraph. More sentences follow. And here is another one. 18 | """ 19 | ).strip() 20 | 21 | 22 | def test_sliding_word_window_transform(): 23 | long_text = (_example_text + "\n\n") * 2 24 | doc = TextDoc.from_text(long_text) 25 | 26 | # Simple transformation that converts all text to uppercase. 
27 | def transform_func(window: TextDoc) -> TextDoc: 28 | transformed_text = window.reassemble().upper() 29 | return TextDoc.from_text(transformed_text) 30 | 31 | transformed_doc = sliding_window_transform( 32 | doc, 33 | transform_func, 34 | WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5, separator="|"), 35 | ) 36 | print("---Wordtok transformed doc:") 37 | print(transformed_doc.reassemble()) 38 | 39 | assert transformed_doc.reassemble().count("|") == 2 40 | 41 | long_text = (_example_text + "\n\n") * 20 42 | doc = TextDoc.from_text(long_text) 43 | transformed_doc = sliding_window_transform( 44 | doc, transform_func, WindowSettings(TextUnit.wordtoks, 80, 60, min_overlap=5) 45 | ) 46 | assert transformed_doc.reassemble() == long_text.upper().strip() 47 | 48 | 49 | def test_sliding_para_window_transform(): 50 | def transform_func(window: TextDoc) -> TextDoc: 51 | transformed_text = window.reassemble().upper() 52 | return TextDoc.from_text(transformed_text) 53 | 54 | text = "\n\n".join(f"Paragraph {i}." for i in range(7)) 55 | doc = TextDoc.from_text(text) 56 | 57 | transformed_doc = sliding_para_window_transform( 58 | doc, 59 | transform_func, 60 | WindowSettings( 61 | TextUnit.paragraphs, 62 | 3, 63 | 3, 64 | separator=WINDOW_BR_SEP, 65 | ), 66 | ) 67 | 68 | print("---Paragraph transformed doc:") 69 | print(transformed_doc.reassemble()) 70 | 71 | assert ( 72 | transformed_doc.reassemble() 73 | == dedent( 74 | """ 75 | PARAGRAPH 0. 76 | 77 | PARAGRAPH 1. 78 | 79 | PARAGRAPH 2. 80 | 81 | PARAGRAPH 3. 82 | 83 | PARAGRAPH 4. 84 | 85 | PARAGRAPH 5. 86 | 87 | PARAGRAPH 6. 88 | """ 89 | ).strip() 90 | ) 91 | -------------------------------------------------------------------------------- /development.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Setting Up uv 4 | 5 | This project is set up to use [uv](https://docs.astral.sh/uv/) to manage Python and 6 | dependencies. First, be sure you 7 | [have uv installed](https://docs.astral.sh/uv/getting-started/installation/). 8 | 9 | Then [fork the jlevy/chopdiff 10 | repo](https://github.com/jlevy/chopdiff/fork) (having your own 11 | fork will make it easier to contribute) and 12 | [clone it](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). 13 | 14 | ## Basic Developer Workflows 15 | 16 | The `Makefile` simply offers shortcuts to `uv` commands for developer convenience. 17 | (For clarity, GitHub Actions don't use the Makefile and just call `uv` directly.) 18 | 19 | ```shell 20 | # First, install all dependencies and set up your virtual environment. 21 | # This simply runs `uv sync --all-extras` to install all packages, 22 | # including dev dependencies and optional dependencies. 23 | make install 24 | 25 | # Run uv sync, lint, and test (and also generate agent rules): 26 | make 27 | 28 | # Build wheel: 29 | make build 30 | 31 | # Linting: 32 | make lint 33 | 34 | # Run tests: 35 | make test 36 | 37 | # Delete all the build artifacts: 38 | make clean 39 | 40 | # Upgrade dependencies to compatible versions: 41 | make upgrade 42 | 43 | # To run tests by hand: 44 | uv run pytest # all tests 45 | uv run pytest -s src/module/some_file.py # one test, showing outputs 46 | 47 | # Build and install current dev executables, to let you use your dev copies 48 | # as local tools: 49 | uv tool install --editable . 
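# Optionally confirm the editable install is registered with uv:
uv tool list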
50 | 51 | # Dependency management directly with uv: 52 | # Add a new dependency: 53 | uv add package_name 54 | # Add a development dependency: 55 | uv add --dev package_name 56 | # Update to latest compatible versions (including dependencies on git repos): 57 | uv sync --upgrade 58 | # Update a specific package: 59 | uv lock --upgrade-package package_name 60 | # Update dependencies on a package: 61 | uv add package_name@latest 62 | 63 | # Run a shell within the Python environment: 64 | uv venv 65 | source .venv/bin/activate 66 | ``` 67 | 68 | See [uv docs](https://docs.astral.sh/uv/) for details. 69 | 70 | ## Agent Rules 71 | 72 | See [.cursor/rules](.cursor/rules) for agent rules. 73 | These are written for [Cursor](https://www.cursor.com/) but are also used by other 74 | agents because the Makefile will generate `CLAUDE.md` and `AGENTS.md` from the same 75 | rules. 76 | 77 | ```shell 78 | make agent-rules 79 | ``` 80 | 81 | ## IDE setup 82 | 83 | If you use VSCode or a fork like Cursor or Windsurf, you can install the following 84 | extensions: 85 | 86 | - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) 87 | 88 | - [Based Pyright](https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright) 89 | for type checking. Note that this extension works with non-Microsoft VSCode forks like 90 | Cursor. 91 | 92 | ## Documentation 93 | 94 | - [uv docs](https://docs.astral.sh/uv/) 95 | 96 | - [basedpyright docs](https://docs.basedpyright.com/latest/) 97 | 98 | * * * 99 | 100 | *This file was built with 101 | [simple-modern-uv](https://github.com/jlevy/simple-modern-uv).* 102 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/window_settings.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from typing_extensions import override 4 | 5 | from chopdiff.docs.sizes import TextUnit 6 | 7 | WINDOW_BR = "" 8 | """Marker inserted into result documents to show where window breaks have occurred.""" 9 | 10 | WINDOW_BR_SEP = f"\n{WINDOW_BR}\n" 11 | 12 | 13 | @dataclass(frozen=True) 14 | class WindowSettings: 15 | """ 16 | Size of the sliding window, the shift, and the min overlap required when stitching windows 17 | together. All sizes in wordtoks. 18 | """ 19 | 20 | unit: TextUnit 21 | size: int 22 | shift: int 23 | min_overlap: int = 0 24 | separator: str = "" 25 | 26 | @override 27 | def __str__(self): 28 | return f"windowing size={self.size}, shift={self.shift}, min_overlap={self.min_overlap} {self.unit.value}" 29 | 30 | 31 | WINDOW_NONE = WindowSettings(unit=TextUnit.wordtoks, size=0, shift=0, min_overlap=0, separator="") 32 | """ 33 | Do not use a sliding window. 34 | """ 35 | 36 | WINDOW_2K_WORDTOKS = WindowSettings( 37 | TextUnit.wordtoks, 38 | size=2048, 39 | shift=2048 - 256, 40 | min_overlap=8, 41 | separator=WINDOW_BR_SEP, 42 | ) 43 | """ 44 | Sliding, overlapping word-based window. Useful for finding paragraph breaks. 45 | 2K wordtoks is several paragraphs. 
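With size=2048 and shift=1792 (2048 - 256), consecutive windows overlap by 256 wordtoks.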
46 | """ 47 | 48 | 49 | WINDOW_1_PARA = WindowSettings( 50 | TextUnit.paragraphs, size=1, shift=1, min_overlap=0, separator=WINDOW_BR_SEP 51 | ) 52 | """Process 1 paragraph at a time.""" 53 | 54 | 55 | WINDOW_2_PARA = WindowSettings( 56 | TextUnit.paragraphs, size=2, shift=2, min_overlap=0, separator=WINDOW_BR_SEP 57 | ) 58 | """Process 2 paragraphs at a time.""" 59 | 60 | 61 | WINDOW_4_PARA = WindowSettings( 62 | TextUnit.paragraphs, size=4, shift=4, min_overlap=0, separator=WINDOW_BR_SEP 63 | ) 64 | """Process 4 paragraph at a time.""" 65 | 66 | 67 | WINDOW_8_PARA = WindowSettings( 68 | TextUnit.paragraphs, size=8, shift=8, min_overlap=0, separator=WINDOW_BR_SEP 69 | ) 70 | """Process 8 paragraphs at a time.""" 71 | 72 | 73 | WINDOW_16_PARA = WindowSettings( 74 | TextUnit.paragraphs, size=16, shift=16, min_overlap=0, separator=WINDOW_BR_SEP 75 | ) 76 | """Process 16 paragraphs at a time.""" 77 | 78 | WINDOW_32_PARA = WindowSettings( 79 | TextUnit.paragraphs, size=32, shift=32, min_overlap=0, separator=WINDOW_BR_SEP 80 | ) 81 | """Process 32 paragraphs at a time.""" 82 | 83 | WINDOW_64_PARA = WindowSettings( 84 | TextUnit.paragraphs, size=64, shift=64, min_overlap=0, separator=WINDOW_BR_SEP 85 | ) 86 | """Process 64 paragraphs at a time.""" 87 | 88 | WINDOW_128_PARA = WindowSettings( 89 | TextUnit.paragraphs, size=128, shift=128, min_overlap=0, separator=WINDOW_BR_SEP 90 | ) 91 | """Process 128 paragraphs at a time.""" 92 | 93 | WINDOW_256_PARA = WindowSettings( 94 | TextUnit.paragraphs, size=256, shift=256, min_overlap=0, separator=WINDOW_BR_SEP 95 | ) 96 | """Process 256 paragraphs at a time.""" 97 | 98 | WINDOW_512_PARA = WindowSettings( 99 | TextUnit.paragraphs, size=512, shift=512, min_overlap=0, separator=WINDOW_BR_SEP 100 | ) 101 | """Process 512 paragraphs at a time.""" 102 | 103 | WINDOW_1024_PARA = WindowSettings( 104 | TextUnit.paragraphs, size=1024, shift=1024, min_overlap=0, separator=WINDOW_BR_SEP 105 | ) 106 | """Process 1024 paragraphs at a time.""" 107 | -------------------------------------------------------------------------------- /.cursor/rules/general.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: General Guidelines 3 | globs: 4 | alwaysApply: true 5 | --- 6 | # Assistant Rules 7 | 8 | **Your fundamental responsibility:** Remember you are a senior engineer and have a 9 | serious responsibility to be clear, factual, think step by step and be systematic, 10 | express expert opinion, and make use of the user’s attention wisely. 11 | 12 | **Rules must be followed:** It is your responsibility to carefully read these rules as 13 | well as Python or other language-specific rules included here. 14 | 15 | Therefore: 16 | 17 | - Be concise. State answers or responses directly, without extra commentary. 18 | Or (if it is clear) directly do what is asked. 19 | 20 | - If instructions are unclear or there are two or more ways to fulfill the request that 21 | are substantially different, make a tentative plan (or offer options) and ask for 22 | confirmation. 23 | 24 | - If you can think of a much better approach that the user requests, be sure to mention 25 | it. It’s your responsibility to suggest approaches that lead to better, simpler 26 | solutions. 27 | 28 | - Give thoughtful opinions on better/worse approaches, but NEVER say “great idea!” 29 | or “good job” or other compliments, encouragement, or non-essential banter. 30 | Your job is to give expert opinions and to solve problems, not to motivate the user. 
31 | 32 | - Avoid gratuitous enthusiasm or generalizations. 33 | Use thoughtful comparisons like saying which code is “cleaner” but don’t congratulate 34 | yourself. Avoid subjective descriptions. 35 | For example, don’t say “I’ve meticulously improved the code and it is in great shape!” 36 | That is useless generalization. 37 | Instead, specifically say what you’ve done, e.g., "I’ve added types, including 38 | generics, to all the methods in `Foo` and fixed all linter errors." 39 | 40 | # General Coding Guidelines 41 | 42 | ## Using Comments 43 | 44 | - Keep all comments concise and clear and suitable for inclusion in final production. 45 | 46 | - DO use comments whenever the intent of a given piece of code is subtle or confusing or 47 | avoids a bug or is not obvious from the code itself. 48 | 49 | - DO NOT repeat in comments what is obvious from the names of functions or variables or 50 | types. 51 | 52 | - DO NOT include comments that reflect what you did, such as “Added this function” as 53 | this is meaningless to anyone reading the code later. 54 | (Instead, describe in your message to the user any other contextual information.) 55 | 56 | - DO NOT use fancy or needlessly decorated headings like “===== MIGRATION TOOLS =====” 57 | in comments 58 | 59 | - DO NOT number steps in comments. 60 | These are hard to maintain if the code changes. 61 | NEVER DO THIS: “// Step 3: Fetch the data from the cache”\ 62 | This is fine: “// Now fetch the data from the cache” 63 | 64 | - DO NOT use emojis or special unicode characters like ① or • or – or — in comments. 65 | 66 | - Use emojis in output if it enhances the clarity and can be done consistently. 67 | You may use ✔︎ and ✘ to indicate success and failure, and ∆ and ‼︎ for user-facing 68 | warnings and errors, for example, but be sure to do it consistently. 69 | DO NOT use emojis gratuitously in comments or output. 70 | You may use then ONLY when they have clear meanings (like success or failure). 71 | Unless the user says otherwise, avoid emojis and Unicode in comments as clutters the 72 | output with little benefit. 73 | -------------------------------------------------------------------------------- /src/chopdiff/docs/token_mapping.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import override 2 | 3 | from chopdiff.docs.token_diffs import SYMBOL_SEP, OpType, TokenDiff, diff_wordtoks 4 | 5 | 6 | class TokenMapping: 7 | """ 8 | Given two sequences of tokens, create a best-estimate mapping of how the tokens 9 | in the second sequence map to the tokens in the first sequence, based on an 10 | LCS-style diff. 11 | """ 12 | 13 | def __init__( 14 | self, 15 | tokens1: list[str], 16 | tokens2: list[str], 17 | diff: TokenDiff | None = None, 18 | min_tokens: int = 10, 19 | max_diff_frac: float = 0.4, 20 | ): 21 | self.tokens1 = tokens1 22 | self.tokens2 = tokens2 23 | self.diff = diff or diff_wordtoks(self.tokens1, self.tokens2) 24 | self._validate(min_tokens, max_diff_frac) 25 | self.backmap: dict[int, int] = {} 26 | self._create_mapping() 27 | 28 | def map_back(self, offset2: int) -> int: 29 | """ 30 | Map an offset in the second sequence back to the offset that most closely corresponds to it 31 | in the first sequence. This might be an exact match (e.g. the same word) or the closest token 32 | (e.g. the last word before a deleted or changed word). 
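        For example, if tokens1 is ["a", "b", "c"] and tokens2 is ["a", "c"] (with "b" deleted), map_back(1) returns 2: "c" at offset 1 in the second sequence maps to "c" at offset 2 in the first. (Illustrative only; real inputs must also satisfy the min_tokens check.)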
33 | """ 34 | return self.backmap[offset2] 35 | 36 | def _validate(self, min_wordtoks: int, max_diff_frac: float): 37 | if len(self.tokens1) < min_wordtoks or len(self.tokens2) < min_wordtoks: 38 | raise ValueError(f"Documents should have at least {min_wordtoks} wordtoks") 39 | 40 | nchanges = len(self.diff.changes()) 41 | if float(nchanges) / len(self.tokens1) > max_diff_frac: 42 | raise ValueError( 43 | f"Documents have too many changes: {nchanges}/{len(self.tokens1)} ({float(nchanges) / len(self.tokens1):.2f} > {max_diff_frac})" 44 | ) 45 | 46 | def _create_mapping(self): 47 | offset1 = 0 48 | offset2 = 0 49 | last_offset1 = 0 50 | 51 | for op in self.diff.ops: 52 | if op.action == OpType.EQUAL: 53 | for _ in op.left: 54 | self.backmap[offset2] = offset1 55 | last_offset1 = offset1 56 | offset1 += 1 57 | offset2 += 1 58 | elif op.action == OpType.DELETE: 59 | for _ in op.left: 60 | last_offset1 = offset1 61 | offset1 += 1 62 | elif op.action == OpType.INSERT: 63 | for _ in op.right: 64 | self.backmap[offset2] = last_offset1 65 | offset2 += 1 66 | elif op.action == OpType.REPLACE: 67 | for _ in op.left: 68 | last_offset1 = offset1 69 | offset1 += 1 70 | for _ in op.right: 71 | self.backmap[offset2] = last_offset1 72 | offset2 += 1 73 | 74 | def full_mapping_str(self): 75 | """ 76 | For debugging or logging, return a verbose, readable table of the mapping of each 77 | token in the second sequence to the first sequence. 78 | """ 79 | return "\n".join( 80 | f"{i} {SYMBOL_SEP}{self.tokens2[i]}{SYMBOL_SEP} -> {self.map_back(i)} {SYMBOL_SEP}{self.tokens1[self.map_back(i)]}{SYMBOL_SEP}" 81 | for i in range(len(self.tokens2)) 82 | ) 83 | 84 | @override 85 | def __str__(self): 86 | return f"OffsetMapping(doc1 len {len(self.tokens1)}, doc2 len {len(self.tokens2)}, mapping len {len(self.backmap)})" 87 | -------------------------------------------------------------------------------- /src/chopdiff/divs/div_elements.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from chopdiff.divs.chunk_utils import chunk_children, chunk_paras 4 | from chopdiff.divs.parse_divs import parse_divs 5 | from chopdiff.divs.text_node import TextNode 6 | from chopdiff.docs.sizes import TextUnit 7 | from chopdiff.docs.text_doc import TextDoc 8 | from chopdiff.docs.wordtoks import first_wordtok, is_div 9 | from chopdiff.html.html_in_md import Attrs, ClassNames, div_wrapper, html_join_blocks 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | CHUNK = "chunk" 15 | """Class name for a chunk of text.""" 16 | 17 | ORIGINAL = "original" 18 | """Class name for the original content.""" 19 | 20 | RESULT = "result" 21 | """Class name for the result of an LLM action.""" 22 | 23 | GROUP = "group" 24 | """Class name for a generic combination of elements.""" 25 | 26 | 27 | def div( 28 | class_name: ClassNames, 29 | *blocks: str | None, 30 | attrs: Attrs | None = None, 31 | safe: bool = True, 32 | ) -> str: 33 | """ 34 | Convenience to create Markdown-compatible div with HTML in its own paragraphs. 35 | """ 36 | return div_wrapper(class_name=class_name, attrs=attrs, safe=safe, padding="\n\n")( 37 | html_join_blocks(*blocks) 38 | ) 39 | 40 | 41 | def div_get_original(element: TextNode, child_name: str = ORIGINAL) -> str: 42 | """ 43 | Get content of the named child element if it exists, otherwise use the whole contents. 
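    For example, for a node parsed from '<div class="chunk"><div class="original">text</div></div>',
    this returns the contents of the inner "original" div rather than the whole chunk.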
44 | """ 45 | child = element.child_by_class_name(child_name) 46 | return child.contents if child else element.contents 47 | 48 | 49 | def div_insert_wrapped( 50 | element: TextNode, 51 | new_child_blocks: list[str], 52 | container_class: ClassNames = CHUNK, 53 | original_class: str = ORIGINAL, 54 | at_front: bool = True, 55 | ) -> str: 56 | """ 57 | Insert new children into a div element. As a base case, wrap the original 58 | content in a child div if it's not already present as a child. 59 | """ 60 | 61 | original_element = element.child_by_class_name(original_class) 62 | if original_element: 63 | prev_contents = element.contents 64 | else: 65 | prev_contents = div(original_class, element.contents) 66 | 67 | if at_front: 68 | blocks = [*new_child_blocks, prev_contents] 69 | else: 70 | blocks = [prev_contents, *new_child_blocks] 71 | 72 | return div(container_class, html_join_blocks(*blocks)) 73 | 74 | 75 | def chunk_text_as_divs( 76 | text: str, min_size: int, unit: TextUnit, class_name: ClassNames = CHUNK 77 | ) -> str: 78 | """ 79 | Add HTML divs around "chunks" of text paragraphs or top-level divs, where each chunk 80 | is at least the specified minimum size. 81 | """ 82 | 83 | if is_div(first_wordtok(text)): 84 | log.info("Chunking paragraphs using divs.") 85 | parsed = parse_divs(text) 86 | div_chunks = chunk_children(parsed, min_size, unit) 87 | chunk_strs = [chunk.reassemble() for chunk in div_chunks] 88 | size_summary = parsed.size_summary() 89 | else: 90 | log.info("Chunking paragraphs using newlines.") 91 | doc = TextDoc.from_text(text) 92 | doc_chunks = chunk_paras(doc, min_size, unit) 93 | chunk_strs = [chunk.reassemble() for chunk in doc_chunks] 94 | size_summary = doc.size_summary() 95 | 96 | result_divs = [div(class_name, chunk_str) for chunk_str in chunk_strs] 97 | 98 | log.info("Added %s div chunks on doc:\n%s", len(result_divs), size_summary) 99 | 100 | return "\n\n".join(result_divs) 101 | -------------------------------------------------------------------------------- /tests/divs/test_div_elements.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.divs.div_elements import CHUNK, chunk_text_as_divs, div, div_insert_wrapped 4 | from chopdiff.divs.parse_divs import parse_divs_single 5 | from chopdiff.docs.sizes import TextUnit 6 | 7 | 8 | def test_div_insert_child(): 9 | node1 = parse_divs_single("Chunk text.") 10 | node2 = parse_divs_single(div(CHUNK, "Chunk text.")) 11 | 12 | child_str = div("new", "New child text.") 13 | 14 | new_result1 = div_insert_wrapped(node1, [child_str]) 15 | new_result2 = div_insert_wrapped(node2, [child_str]) 16 | 17 | print("\n---test_div_insert_child---") 18 | print("\nnode1:") 19 | print(node1.original_text) 20 | print("\nnode2:") 21 | print(node2.original_text) 22 | print("\nnew_child_str:") 23 | print(child_str) 24 | print("\nnew_result1:") 25 | print(new_result1) 26 | print("\nnew_result2:") 27 | print(new_result2) 28 | 29 | assert ( 30 | new_result1 31 | == dedent( 32 | """ 33 |

34 | 35 |
36 | 37 | New child text. 38 | 39 |
40 | 41 |
42 | 43 | Chunk text. 44 | 45 |
46 | 47 |
48 | """ 49 | ).strip() 50 | ) 51 | 52 | assert new_result2 == new_result1 53 | 54 | node3 = parse_divs_single(new_result1) 55 | 56 | another_child_str = div("another", "Another child text.") 57 | 58 | new_result3 = div_insert_wrapped(node3, [another_child_str]) 59 | print("\nnew_result3:") 60 | print(new_result3) 61 | 62 | assert ( 63 | new_result3 64 | == dedent( 65 | """ 66 |
67 | 68 |
69 | 70 | Another child text. 71 | 72 |
73 | 74 |
75 | 76 | New child text. 77 | 78 |
79 | 80 |
81 | 82 | Chunk text. 83 | 84 |
85 | 86 |
87 | """ 88 | ).strip() 89 | ) 90 | 91 | 92 | _med_test_doc = dedent( 93 | """ 94 | # Title 95 | 96 | Hello World. This is an example sentence. And here's another one! 97 | 98 | ## Subtitle 99 | 100 | This is a new paragraph. 101 | It has several sentences. 102 | There may be line breaks within a paragraph, but these should not affect handlingof the paragraph. 103 | There are also [links](http://www.google.com) and **bold** and *italic* text. 104 | 105 | ### Itemized List 106 | 107 | - Item 1 108 | 109 | - Item 2 110 | 111 | - Item 3 112 | 113 |
extra 114 |
115 | 116 | Blah blah. 117 | """ 118 | ).strip() 119 | 120 | 121 | def test_chunk_text_into_divs(): 122 | assert chunk_text_as_divs("", 7, TextUnit.words) == "" 123 | assert ( 124 | chunk_text_as_divs("hello", 100, TextUnit.words) == '
\n\nhello\n\n
' 125 | ) 126 | 127 | chunked = chunk_text_as_divs(_med_test_doc, 7, TextUnit.words) 128 | 129 | print("\n---test_chunk_paras_as_divs---") 130 | print("Chunked doc:\n---\n" + chunked + "\n---") 131 | 132 | expected_first_chunk = dedent( 133 | """ 134 |
135 | 136 | # Title 137 | 138 | Hello World. This is an example sentence. And here's another one! 139 | 140 |
141 | """ 142 | ).strip() 143 | 144 | assert chunked.startswith(expected_first_chunk) 145 | assert chunked.endswith("") 146 | assert chunked.count("
") == 5 # Extra spurious
. 148 | -------------------------------------------------------------------------------- /tests/docs/test_token_mapping.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.text_doc import TextDoc 4 | from chopdiff.docs.token_mapping import TokenMapping 5 | from chopdiff.docs.wordtoks import wordtokenize 6 | 7 | 8 | def test_offset_mapping(): 9 | doc1 = TextDoc.from_text("This is a simple test with some words.") 10 | doc2 = TextDoc.from_text( 11 | "This is<-PARA-BR->a simple pytest adding other words.<-SENT-BR->And another sentence." 12 | ) 13 | 14 | mapping = TokenMapping(list(doc1.as_wordtoks()), list(doc2.as_wordtoks())) 15 | 16 | mapping_str = mapping.full_mapping_str() 17 | 18 | print(mapping.diff.as_diff_str(include_equal=True)) 19 | print(mapping) 20 | print(mapping.backmap) 21 | print(mapping_str) 22 | 23 | assert ( 24 | mapping_str 25 | == dedent( 26 | """ 27 | 0 ⎪This⎪ -> 0 ⎪This⎪ 28 | 1 ⎪ ⎪ -> 1 ⎪ ⎪ 29 | 2 ⎪is⎪ -> 2 ⎪is⎪ 30 | 3 ⎪<-PARA-BR->⎪ -> 3 ⎪ ⎪ 31 | 4 ⎪a⎪ -> 4 ⎪a⎪ 32 | 5 ⎪ ⎪ -> 5 ⎪ ⎪ 33 | 6 ⎪simple⎪ -> 6 ⎪simple⎪ 34 | 7 ⎪ ⎪ -> 7 ⎪ ⎪ 35 | 8 ⎪pytest⎪ -> 8 ⎪test⎪ 36 | 9 ⎪ ⎪ -> 9 ⎪ ⎪ 37 | 10 ⎪adding⎪ -> 10 ⎪with⎪ 38 | 11 ⎪ ⎪ -> 11 ⎪ ⎪ 39 | 12 ⎪other⎪ -> 12 ⎪some⎪ 40 | 13 ⎪ ⎪ -> 13 ⎪ ⎪ 41 | 14 ⎪words⎪ -> 14 ⎪words⎪ 42 | 15 ⎪.⎪ -> 15 ⎪.⎪ 43 | 16 ⎪<-SENT-BR->⎪ -> 15 ⎪.⎪ 44 | 17 ⎪And⎪ -> 15 ⎪.⎪ 45 | 18 ⎪ ⎪ -> 15 ⎪.⎪ 46 | 19 ⎪another⎪ -> 15 ⎪.⎪ 47 | 20 ⎪ ⎪ -> 15 ⎪.⎪ 48 | 21 ⎪sentence⎪ -> 15 ⎪.⎪ 49 | 22 ⎪.⎪ -> 15 ⎪.⎪ 50 | """ 51 | ).strip() 52 | ) 53 | 54 | 55 | def test_offset_mapping_longer(): 56 | doc1 = dedent( 57 | """ 58 | Alright, guys. 59 | Here's the deal. 60 | You can follow me on my daily workouts. 61 | """ 62 | ) 63 | doc2 = dedent( 64 | """ 65 | Alright, guys. Here's the deal. 66 | You can follow me on my daily workouts. 
67 | """ 68 | ) 69 | 70 | doc1_wordtoks = wordtokenize(doc1) 71 | doc2_wordtoks = list(TextDoc.from_text(doc2).as_wordtoks()) 72 | 73 | mapping = TokenMapping(doc1_wordtoks, doc2_wordtoks) 74 | 75 | mapping_str = mapping.full_mapping_str() 76 | 77 | print(mapping.diff.as_diff_str(include_equal=True)) 78 | print(mapping) 79 | print(mapping.backmap) 80 | print(mapping_str) 81 | 82 | assert ( 83 | mapping_str 84 | == dedent( 85 | """ 86 | 0 ⎪Alright⎪ -> 2 ⎪Alright⎪ 87 | 1 ⎪,⎪ -> 3 ⎪,⎪ 88 | 2 ⎪ ⎪ -> 4 ⎪ ⎪ 89 | 3 ⎪guys⎪ -> 5 ⎪guys⎪ 90 | 4 ⎪.⎪ -> 6 ⎪.⎪ 91 | 5 ⎪ ⎪ -> 8 ⎪ ⎪ 92 | 6 ⎪Here⎪ -> 10 ⎪Here⎪ 93 | 7 ⎪'⎪ -> 11 ⎪'⎪ 94 | 8 ⎪s⎪ -> 12 ⎪s⎪ 95 | 9 ⎪ ⎪ -> 13 ⎪ ⎪ 96 | 10 ⎪the⎪ -> 14 ⎪the⎪ 97 | 11 ⎪ ⎪ -> 15 ⎪ ⎪ 98 | 12 ⎪deal⎪ -> 16 ⎪deal⎪ 99 | 13 ⎪.⎪ -> 17 ⎪.⎪ 100 | 14 ⎪<-SENT-BR->⎪ -> 20 ⎪⎪ 101 | 15 ⎪You⎪ -> 21 ⎪You⎪ 102 | 16 ⎪ ⎪ -> 22 ⎪ ⎪ 103 | 17 ⎪can⎪ -> 23 ⎪can⎪ 104 | 18 ⎪ ⎪ -> 24 ⎪ ⎪ 105 | 19 ⎪follow⎪ -> 25 ⎪follow⎪ 106 | 20 ⎪ ⎪ -> 26 ⎪ ⎪ 107 | 21 ⎪me⎪ -> 27 ⎪me⎪ 108 | 22 ⎪ ⎪ -> 28 ⎪ ⎪ 109 | 23 ⎪on⎪ -> 29 ⎪on⎪ 110 | 24 ⎪ ⎪ -> 30 ⎪ ⎪ 111 | 25 ⎪my⎪ -> 31 ⎪my⎪ 112 | 26 ⎪ ⎪ -> 32 ⎪ ⎪ 113 | 27 ⎪daily⎪ -> 33 ⎪daily⎪ 114 | 28 ⎪ ⎪ -> 34 ⎪ ⎪ 115 | 29 ⎪workouts⎪ -> 35 ⎪workouts⎪ 116 | 30 ⎪.⎪ -> 36 ⎪.⎪ 117 | """ 118 | ).strip() 119 | ) 120 | -------------------------------------------------------------------------------- /examples/insert_para_breaks.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "chopdiff", 5 | # "flowmark", 6 | # "openai", 7 | # ] 8 | # /// 9 | import argparse 10 | import logging 11 | from textwrap import dedent 12 | 13 | import openai # pyright: ignore # Not a project dep. 14 | from flowmark import fill_text 15 | 16 | from chopdiff.docs import TextDoc 17 | from chopdiff.transforms import WINDOW_2K_WORDTOKS, changes_whitespace, filtered_transform 18 | 19 | logging.basicConfig(format=">> %(message)s") 20 | log = logging.getLogger(__name__) 21 | log.setLevel(logging.INFO) 22 | 23 | 24 | def heading(text: str): 25 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n" 26 | 27 | 28 | def insert_paragraph_breaks(text: str) -> str: 29 | # Create a TextDoc from the input text 30 | doc = TextDoc.from_text(text) 31 | 32 | # Handy calculations of document size in paragraphs, sentences, etc. 33 | print(f"\nInput document: {doc.size_summary()}") 34 | 35 | # Define the transformation function. 36 | # Note in this case we run the LLM on strings, but you could also work directly 37 | # on the TextDoc if appropriate. 38 | def transform(doc: TextDoc) -> TextDoc: 39 | return TextDoc.from_text(llm_insert_para_breaks(doc.reassemble())) 40 | 41 | # Apply the transformation with windowing and filtering. 42 | # 43 | # This will walk along the document in approximately 2K "wordtok" chunks 44 | # (~1000 words) and apply the transformation to each chunk. Chunks can 45 | # slightly overlap to make this more robust. 46 | # 47 | # The change on each chunk will then be filtered to only include whitespace 48 | # changes. 49 | # 50 | # Finally each change will be "stitched back" to form the original document, 51 | # by looking for the right alignment of words between the original and the 52 | # transformed chunk. 53 | # 54 | # (Turn on logging to see these details.) 
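    # For reference, a preset like WINDOW_2K_WORDTOKS can also be built by hand.
    # This is only a sketch, assuming WindowSettings and TextUnit are importable as
    # in window_settings.py; the preset's exact size/shift/min_overlap values may differ:
    #
    #   from chopdiff.docs.sizes import TextUnit
    #   from chopdiff.transforms.window_settings import WindowSettings
    #
    #   custom_windowing = WindowSettings(TextUnit.wordtoks, size=2048, shift=2048, min_overlap=8)
    #
    # Any WindowSettings value can be passed as `windowing=` below to control chunk
    # size and how far the window advances between steps.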
55 | result_doc = filtered_transform( 56 | doc, transform, windowing=WINDOW_2K_WORDTOKS, diff_filter=changes_whitespace 57 | ) 58 | 59 | print(heading("Output document")) 60 | print(f"\nOutput document: {result_doc.size_summary()}") 61 | 62 | # Return the transformed text 63 | return result_doc.reassemble() 64 | 65 | 66 | def llm_insert_para_breaks(input_text: str) -> str: 67 | """ 68 | Call OpenAI to insert paragraph breaks on a chunk of text. 69 | This works best on a smaller chunk of text and might make 70 | other non-whitespace changes. 71 | """ 72 | client: openai.OpenAI = openai.OpenAI() 73 | 74 | response = client.chat.completions.create( 75 | model="gpt-4o-mini", 76 | messages=[ 77 | {"role": "system", "content": "You are a careful and precise editor."}, 78 | { 79 | "role": "user", 80 | "content": dedent( 81 | f""" 82 | Break the following text into paragraphs. 83 | 84 | Original text: 85 | 86 | {input_text} 87 | 88 | Formatted text: 89 | """ 90 | ), 91 | }, 92 | ], 93 | temperature=0.0, 94 | ) 95 | 96 | return response.choices[0].message.content or "" 97 | 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser( 101 | description="Insert paragraph breaks in text files, making no other changes of any kind to a document." 102 | ) 103 | parser.add_argument("input_file", help="Path to the input text file") 104 | parser.add_argument("-o", "--output", help="Path to the output file (default: stdout)") 105 | args = parser.parse_args() 106 | 107 | logging.basicConfig(level=logging.INFO) 108 | 109 | with open(args.input_file, encoding="utf-8") as f: 110 | input_text = f.read() 111 | 112 | print(heading("Original")) 113 | print(fill_text(input_text)) 114 | 115 | result = insert_paragraph_breaks(input_text) 116 | 117 | print(heading("With paragraph breaks")) 118 | print(fill_text(result)) 119 | 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /tests/transforms/test_diff_filters.py: -------------------------------------------------------------------------------- 1 | from chopdiff.docs.text_doc import TextDoc 2 | from chopdiff.docs.token_diffs import DiffOp, OpType, diff_wordtoks 3 | from chopdiff.docs.wordtoks import PARA_BR_TOK, SENT_BR_TOK, is_break_or_space 4 | from chopdiff.transforms.diff_filters import ( 5 | WILDCARD_TOK, 6 | changes_whitespace, 7 | make_token_sequence_filter, 8 | no_word_lemma_changes, 9 | removes_word_lemmas, 10 | removes_words, 11 | ) 12 | 13 | 14 | def test_filter_br_and_space(): 15 | from ..docs.test_token_diffs import _short_text1, _short_text2, _short_text3 16 | 17 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 18 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 19 | wordtoks3 = list(TextDoc.from_text(_short_text3).as_wordtoks()) 20 | 21 | diff = diff_wordtoks(wordtoks1, wordtoks2) 22 | 23 | accepted, rejected = diff.filter(changes_whitespace) 24 | 25 | accepted_result = accepted.apply_to(wordtoks1) 26 | rejected_result = rejected.apply_to(wordtoks1) 27 | 28 | print("---Filtered diff:") 29 | print("Original: " + "/".join(wordtoks1)) 30 | print("Full diff:", diff) 31 | print("Accepted diff:", accepted) 32 | print("Rejected diff:", rejected) 33 | print("Accepted result: " + "/".join(accepted_result)) 34 | print("Rejected result: " + "/".join(rejected_result)) 35 | 36 | assert accepted_result == wordtoks3 37 | 38 | 39 | def test_token_sequence_filter_with_predicate(): 40 | insert_op = DiffOp(OpType.INSERT, [], [SENT_BR_TOK, "

", "Title", "

", PARA_BR_TOK]) 41 | delete_op = DiffOp(OpType.DELETE, [SENT_BR_TOK, "

", "Old Title", "

", PARA_BR_TOK], []) 42 | replace_op = DiffOp(OpType.REPLACE, ["Some", "text"], ["New", "text"]) 43 | equal_op = DiffOp(OpType.EQUAL, ["Unchanged"], ["Unchanged"]) 44 | 45 | action = OpType.INSERT 46 | filter_fn = make_token_sequence_filter( 47 | [is_break_or_space, "

", WILDCARD_TOK, "

", is_break_or_space], action 48 | ) 49 | 50 | assert filter_fn(insert_op) 51 | assert not filter_fn(delete_op) # action is INSERT 52 | assert not filter_fn(replace_op) 53 | assert not filter_fn(equal_op) 54 | 55 | ignore_whitespace_filter_fn = make_token_sequence_filter( 56 | ["

", WILDCARD_TOK, "

"], 57 | action=OpType.INSERT, 58 | ignore=is_break_or_space, 59 | ) 60 | 61 | insert_op_with_whitespace = DiffOp( 62 | OpType.INSERT, 63 | [], 64 | [" ", SENT_BR_TOK, " ", "

", "Title", "

", " ", PARA_BR_TOK, " "], 65 | ) 66 | 67 | assert ignore_whitespace_filter_fn(insert_op_with_whitespace) 68 | assert not ignore_whitespace_filter_fn(delete_op) # action is INSERT 69 | assert not ignore_whitespace_filter_fn(replace_op) 70 | assert not ignore_whitespace_filter_fn(equal_op) 71 | 72 | 73 | def test_no_word_changes_lemmatized(): 74 | assert not no_word_lemma_changes(DiffOp(OpType.INSERT, [], ["the"])) 75 | assert not no_word_lemma_changes(DiffOp(OpType.DELETE, ["the"], [])) 76 | assert not no_word_lemma_changes( 77 | DiffOp( 78 | OpType.REPLACE, 79 | ["The", "dogs", "were", "running", "fast"], 80 | ["The", "dog", "was", "running"], 81 | ) 82 | ) 83 | assert no_word_lemma_changes( 84 | DiffOp( 85 | OpType.REPLACE, 86 | ["The", "dogs", "were", "running"], 87 | ["The", "dog", "was", "running"], 88 | ) 89 | ) 90 | 91 | 92 | def test_removes_words(): 93 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", " "], [])) 94 | assert removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["world"])) 95 | assert not removes_words(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"])) 96 | assert removes_word_lemmas(DiffOp(OpType.REPLACE, ["Hello", " ", "world"], ["World"])) 97 | 98 | assert not removes_words( 99 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"]) 100 | ) 101 | assert removes_word_lemmas( 102 | DiffOp(OpType.REPLACE, ["Hello", "*", "world"], ["hello", "*", "world"]) 103 | ) 104 | 105 | assert removes_words(DiffOp(OpType.DELETE, ["Hello", "world"], [])) 106 | assert removes_word_lemmas(DiffOp(OpType.DELETE, ["Hello", "world"], [])) 107 | -------------------------------------------------------------------------------- /src/chopdiff/divs/parse_divs.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import re 3 | 4 | from chopdiff.divs.text_node import TextNode 5 | 6 | DIV_TAGS = re.compile(r"(]*>|)", re.IGNORECASE) 7 | 8 | CLASS_NAME_PATTERN = re.compile(r"\bclass=\"([^\"]+)\"", re.IGNORECASE) 9 | 10 | 11 | def parse_divs(text: str, skip_whitespace: bool = True) -> TextNode: 12 | """ 13 | Parse a string recursively into `TextNode`s based on `
` tags. 14 | 15 | All offsets are relative to the original text. Text outside of a div tag is 16 | included as a `TextNode` with None markers. 17 | 18 | We do our own parsing to keep this simple and exactly preserve formatting. 19 | """ 20 | parsed = _parse_divs_recursive( 21 | text, 22 | 0, 23 | TextNode(original_text=text, offset=0, content_start=0, content_end=len(text)), 24 | ) 25 | 26 | if skip_whitespace: 27 | parsed = _skip_whitespace_nodes(parsed) 28 | 29 | return parsed 30 | 31 | 32 | def parse_divs_single(text: str, skip_whitespace: bool = True) -> TextNode: 33 | """ 34 | Same as parse_divs but unwraps any singleton child. 35 | """ 36 | divs = parse_divs(text, skip_whitespace=skip_whitespace) 37 | if len(divs.children) == 1: 38 | return divs.children[0] 39 | else: 40 | return divs 41 | 42 | 43 | def _skip_whitespace_nodes(node: TextNode) -> TextNode: 44 | filtered_node = copy.copy(node) 45 | filtered_node.children = [ 46 | _skip_whitespace_nodes(child) for child in node.children if not child.is_whitespace() 47 | ] 48 | return filtered_node 49 | 50 | 51 | def _parse_divs_recursive( 52 | text: str, 53 | start_offset: int, 54 | result: TextNode, 55 | ) -> TextNode: 56 | current_offset = start_offset 57 | 58 | while current_offset < len(text): 59 | match = DIV_TAGS.search(text, current_offset) 60 | 61 | if not match: 62 | # No more div tags, add remaining content as a child node 63 | if current_offset < len(text): 64 | result.children.append( 65 | TextNode( 66 | original_text=text, 67 | offset=current_offset, 68 | content_start=current_offset, 69 | content_end=len(text), 70 | ) 71 | ) 72 | break 73 | 74 | if match.start() > current_offset: 75 | # Add content before the div tag as a child node. 76 | result.children.append( 77 | TextNode( 78 | original_text=text, 79 | offset=current_offset, 80 | content_start=current_offset, 81 | content_end=match.start(), 82 | ) 83 | ) 84 | 85 | tag = match.group(1) 86 | is_end_tag = tag.startswith(" list[TextNode]: 119 | """ 120 | Parse div chunks into TextNodes. 121 | """ 122 | 123 | text_node = parse_divs(text) 124 | 125 | matched_divs = text_node.children_by_class_names(class_name, recursive=True) 126 | 127 | if not matched_divs: 128 | raise ValueError(f"No `{class_name}` divs found in text.") 129 | 130 | return matched_divs 131 | -------------------------------------------------------------------------------- /tests/docs/test_wordtoks.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.search_tokens import search_tokens 4 | from chopdiff.docs.wordtoks import ( 5 | Tag, 6 | _insert_para_wordtoks, 7 | is_entity, 8 | is_tag, 9 | is_tag_close, 10 | is_tag_open, 11 | parse_tag, 12 | visualize_wordtoks, 13 | wordtokenize, 14 | ) 15 | 16 | _test_doc = dedent( 17 | """ 18 | Hello, world! 19 | This is an "example sentence with punctuation. 20 | "Special characters: @#%^&*()" 21 | Alright, guys. 22 | 23 | Here's the deal. 24 | You can follow me on my daily workouts. 
00:10 27 | """ 28 | ).strip() 29 | 30 | 31 | def test_html_doc(): 32 | wordtoks = wordtokenize(_test_doc, bof_eof=True) 33 | 34 | print("\n---Wordtoks test:") 35 | print(visualize_wordtoks(wordtoks)) 36 | 37 | print("\n---Wordtoks with para br:") 38 | wordtoks_with_para = wordtokenize(_insert_para_wordtoks(_test_doc), bof_eof=True) 39 | print(visualize_wordtoks(wordtoks_with_para)) 40 | 41 | assert ( 42 | visualize_wordtoks(wordtoks) 43 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪⎪ ⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪00⎪:⎪10⎪⎪<-EOF->⎪""" 44 | ) 45 | 46 | assert ( 47 | visualize_wordtoks(wordtoks_with_para) 48 | == """⎪<-BOF->⎪Hello⎪,⎪ ⎪world⎪!⎪ ⎪This⎪ ⎪is⎪ ⎪an⎪ ⎪"⎪example⎪ ⎪sentence⎪ ⎪with⎪ ⎪punctuation⎪.⎪ ⎪"⎪Special⎪ ⎪characters⎪:⎪ ⎪@⎪#⎪%⎪^⎪&⎪*⎪(⎪)⎪"⎪ ⎪⎪Alright⎪,⎪ ⎪guys⎪.⎪⎪<-PARA-BR->⎪⎪Here⎪'⎪s⎪ ⎪the⎪ ⎪deal⎪.⎪⎪ ⎪⎪You⎪ ⎪can⎪ ⎪follow⎪ ⎪me⎪ ⎪on⎪ ⎪my⎪ ⎪daily⎪ ⎪workouts⎪.⎪ ⎪⎪00⎪:⎪10⎪⎪<-EOF->⎪""" 49 | ) 50 | 51 | print("\n---Searching tokens") 52 | 53 | print(search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token()) 54 | print(search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token()) 55 | print(search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward(is_tag).get_token()) 56 | 57 | assert search_tokens(wordtoks).at(0).seek_forward(["example"]).get_token() == ( 58 | 14, 59 | "example", 60 | ) 61 | assert search_tokens(wordtoks).at(-1).seek_back(["follow"]).get_token() == ( 62 | 63, 63 | "follow", 64 | ) 65 | assert search_tokens(wordtoks).at(-1).seek_back(["Special"]).seek_forward( 66 | is_tag 67 | ).get_token() == (39, '') 68 | 69 | 70 | def test_tag_functions(): 71 | assert parse_tag("
") == Tag(name="div", is_open=True, is_close=False, attrs={}) 72 | assert parse_tag("
") == Tag(name="div", is_open=False, is_close=True, attrs={}) 73 | assert parse_tag("
") == Tag(name="div", is_open=True, is_close=True, attrs={}) 74 | assert parse_tag("") == Tag( 75 | name="", is_open=False, is_close=False, attrs={}, comment=" Comment " 76 | ) 77 | 78 | assert not is_tag("foo") 79 | assert not is_tag("") 81 | assert is_tag("
") 82 | assert is_tag("") 83 | assert is_tag("
", ["div"]) 84 | assert not is_tag("
", ["span"]) 85 | assert is_tag("
") 86 | 87 | assert is_tag_close("
") 88 | assert not is_tag_close("
") 89 | assert is_tag_close("
", ["div"]) 90 | assert not is_tag_close("
", ["span"]) 91 | assert is_tag_close("
") 92 | assert is_tag_open("
") 93 | assert not is_tag_open("
") 94 | assert is_tag_open("
", ["div"]) 95 | assert not is_tag_open("
", ["span"]) 96 | 97 | assert is_entity("&") 98 | assert not is_entity("nbsp;") 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Makefile 2 | CLAUDE.md 3 | AGENTS.md 4 | 5 | # Additions to standard GitHub .gitignore: 6 | *.bak 7 | *.orig 8 | tmp/ 9 | trash/ 10 | attic/ 11 | .kash/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # UV 110 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | #uv.lock 114 | 115 | # poetry 116 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 117 | # This is especially recommended for binary packages to ensure reproducibility, and is more 118 | # commonly ignored for libraries. 119 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 120 | #poetry.lock 121 | 122 | # pdm 123 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 124 | #pdm.lock 125 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 126 | # in version control. 127 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 128 | .pdm.toml 129 | .pdm-python 130 | .pdm-build/ 131 | 132 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 133 | __pypackages__/ 134 | 135 | # Celery stuff 136 | celerybeat-schedule 137 | celerybeat.pid 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Environments 143 | .env 144 | .venv 145 | env/ 146 | venv/ 147 | ENV/ 148 | env.bak/ 149 | venv.bak/ 150 | 151 | # Spyder project settings 152 | .spyderproject 153 | .spyproject 154 | 155 | # Rope project settings 156 | .ropeproject 157 | 158 | # mkdocs documentation 159 | /site 160 | 161 | # mypy 162 | .mypy_cache/ 163 | .dmypy.json 164 | dmypy.json 165 | 166 | # Pyre type checker 167 | .pyre/ 168 | 169 | # pytype static type analyzer 170 | .pytype/ 171 | 172 | # Cython debug symbols 173 | cython_debug/ 174 | 175 | # PyCharm 176 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 177 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 178 | # and can be added to the global gitignore or merged into this file. For a more nuclear 179 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 180 | #.idea/ 181 | 182 | # PyPI configuration file 183 | .pypirc 184 | -------------------------------------------------------------------------------- /tests/docs/test_token_diffs.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.docs.text_doc import SentIndex, TextDoc 4 | from chopdiff.docs.token_diffs import DiffStats, diff_wordtoks, find_best_alignment 5 | 6 | _short_text1 = dedent( 7 | """ 8 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 9 | 10 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c. 11 | 12 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c. 13 | """ 14 | ).strip() 15 | 16 | 17 | _short_text2 = dedent( 18 | """ 19 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 20 | Paragraph two blah. Sentence 2a. Sentence 2b. Sentence 2c. 21 | 22 | Paragraph three! Sentence 3a. Sentence 3b. 23 | """ 24 | ).strip() 25 | 26 | # _short_text3 contains all the whitespace and break-only changes from _short_text1 to _short_text2. 27 | _short_text3 = dedent( 28 | """ 29 | Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c. 30 | Paragraph two. Sentence 2a. Sentence 2b. Sentence 2c. 31 | 32 | Paragraph three. Sentence 3a. Sentence 3b. Sentence 3c. 33 | """ 34 | ).strip() 35 | 36 | 37 | def test_lcs_diff_wordtoks(): 38 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 39 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 40 | 41 | diff = diff_wordtoks(wordtoks1, wordtoks2) 42 | 43 | print("---Diff:") 44 | print(diff.as_diff_str(True)) 45 | 46 | print("---Diff stats:") 47 | print(diff.stats()) 48 | assert diff.stats() == DiffStats(added=5, removed=8, input_size=59) 49 | 50 | expected_diff = dedent( 51 | """ 52 | TextDiff: add/remove +5/-8 out of 59 total: 53 | at pos 0 keep 19 toks: ⎪Paragraph one. Sentence 1a. Sentence 1b. Sentence 1c.⎪ 54 | at pos 19 repl 1 toks: - ⎪<-PARA-BR->⎪ 55 | repl 1 toks: + ⎪ ⎪ 56 | at pos 20 keep 3 toks: ⎪Paragraph two⎪ 57 | at pos 23 add 2 toks: + ⎪ blah⎪ 58 | at pos 23 keep 1 toks: ⎪.⎪ 59 | at pos 24 repl 1 toks: - ⎪ ⎪ 60 | repl 1 toks: + ⎪<-SENT-BR->⎪ 61 | at pos 25 keep 18 toks: ⎪Sentence 2a. Sentence 2b. Sentence 2c.<-PARA-BR->Paragraph three⎪ 62 | at pos 43 repl 1 toks: - ⎪.⎪ 63 | repl 1 toks: + ⎪!⎪ 64 | at pos 44 keep 10 toks: ⎪<-SENT-BR->Sentence 3a. 
Sentence 3b.⎪ 65 | at pos 54 del 5 toks: - ⎪ Sentence 3c.⎪ 66 | """ 67 | ).strip() 68 | 69 | assert str(diff.as_diff_str(True)) == expected_diff 70 | 71 | 72 | def test_apply_to(): 73 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 74 | wordtoks2 = list(TextDoc.from_text(_short_text2).as_wordtoks()) 75 | 76 | diff = diff_wordtoks(wordtoks1, wordtoks2) 77 | 78 | print("---Before apply:") 79 | print("/".join(wordtoks1)) 80 | print(diff) 81 | result = diff.apply_to(wordtoks1) 82 | print("---Result of apply:") 83 | print("/".join(result)) 84 | print("---Expected:") 85 | print("/".join(wordtoks2)) 86 | assert result == wordtoks2 87 | 88 | wordtoks3 = ["a", "b", "c", "d", "e"] 89 | wordtoks4 = ["a", "x", "c", "y", "e"] 90 | diff2 = diff_wordtoks(wordtoks3, wordtoks4) 91 | result2 = diff2.apply_to(wordtoks3) 92 | assert result2 == wordtoks4 93 | 94 | 95 | def test_find_best_alignment(): 96 | wordtoks1 = list(TextDoc.from_text(_short_text1).as_wordtoks()) 97 | wordtoks2 = list(TextDoc.from_text(_short_text1).sub_doc(SentIndex(1, 1)).as_wordtoks()) 98 | wordtoks3 = wordtoks2 + ["Extra", "wordtoks", "at", "the", "end"] 99 | wordtoks4 = list(wordtoks3) 100 | wordtoks4[0] = "X" 101 | wordtoks4[3] = "Y" 102 | 103 | print("---Alignment:") 104 | print("/".join(wordtoks1)) 105 | print("/".join(wordtoks2)) 106 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks2, 1) 107 | print(f"Offset: {offset}, Score: {score}") 108 | print(diff) 109 | print() 110 | assert offset == 39 111 | assert score == 0.0 112 | assert diff.changes() == [] 113 | 114 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks3, 3) 115 | print(f"Offset: {offset}, Score: {score}") 116 | print(diff) 117 | print() 118 | assert offset == 39 119 | assert score == 0.0 120 | assert diff.changes() == [] 121 | 122 | offset, (score, diff) = find_best_alignment(wordtoks1, wordtoks4, 3) 123 | print(f"Offset: {offset}, Score: {score}") 124 | print(diff) 125 | print() 126 | assert offset == 39 127 | assert score > 0 and score < 0.3 128 | assert diff.stats().nchanges() == 4 129 | -------------------------------------------------------------------------------- /tests/divs/test_parse_divs.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from chopdiff.divs.parse_divs import parse_divs, parse_divs_by_class 4 | from chopdiff.divs.text_node import TextNode 5 | 6 | _test_text = dedent( 7 | """ 8 | 9 |
10 | Outer content paragraph 1. 11 | 12 | Outer content paragraph 2. 13 |
14 | Inner content. 15 |
16 | Nested content. 17 |
18 | 19 |
20 | 21 | Nested inner content. 22 |
23 | Deeply nested content. 24 |
25 |
26 | 27 | 28 |
29 | Outer content paragraph 3. 30 |
31 | """ 32 | ) 33 | 34 | 35 | def _strip_lines(text: str) -> list[str]: 36 | return [line.strip() for line in text.strip().split("\n")] 37 | 38 | 39 | def test_parse_divs(): 40 | def validate_node(node: TextNode, original_text: str): 41 | assert node.original_text == original_text 42 | assert 0 <= node.content_start <= len(original_text) 43 | assert 0 <= node.content_end <= len(original_text) 44 | assert node.content_start <= node.content_end 45 | assert node.contents == original_text[node.content_start : node.content_end] 46 | assert ( 47 | node.begin_marker is None 48 | or original_text[node.offset : node.offset + len(node.begin_marker)] 49 | == node.begin_marker 50 | ) 51 | assert ( 52 | node.end_marker is None 53 | or original_text[node.content_end : node.content_end + len(node.end_marker)] 54 | == node.end_marker 55 | ) 56 | 57 | for child in node.children: 58 | validate_node(child, original_text) 59 | 60 | node = parse_divs(_test_text, skip_whitespace=False) 61 | 62 | node_no_whitespace = parse_divs(_test_text, skip_whitespace=True) 63 | 64 | reassembled = node.reassemble(padding="") 65 | 66 | print() 67 | print(f"Original text (length {len(_test_text)}):") 68 | print(_test_text) 69 | 70 | print() 71 | print("Parsed text:") 72 | print(node) 73 | 74 | print() 75 | print("Parsed text (no whitespace):") 76 | print(node_no_whitespace) 77 | 78 | print() 79 | print(f"Reassembled text (length {len(reassembled)}):") 80 | print(reassembled) 81 | 82 | print() 83 | print("Reassembled text (normalized padding):") 84 | print(node.reassemble()) 85 | 86 | validate_node(node, _test_text) 87 | 88 | assert reassembled.count("Chunk1
96 |
Chunk2
97 |
Chunk3
98 | """ 99 | 100 | node = parse_divs(doc) 101 | summary_str = node.structure_summary_str() or "" 102 | 103 | print() 104 | print("Structure summary:") 105 | print(summary_str) 106 | 107 | expected_summary = dedent( 108 | """ 109 | HTML structure: 110 | 3 div.chunk 111 | """ 112 | ).strip() 113 | 114 | assert _strip_lines(summary_str) == _strip_lines(expected_summary) 115 | 116 | 117 | def test_structure_summary_str_2(): 118 | node = parse_divs(_test_text) 119 | summary_str = node.structure_summary_str() or "" 120 | 121 | print() 122 | print("Structure summary:") 123 | print(summary_str) 124 | 125 | expected_summary = dedent( 126 | """ 127 | HTML structure: 128 | 1 div.outer 129 | 1 div.outer > div.inner 130 | 1 div.outer > div.inner > div 131 | 1 div.outer > div.inner > div.nested-inner 132 | 1 div.outer > div.inner > div.nested-inner > div 133 | """ 134 | ).strip() 135 | 136 | assert _strip_lines(summary_str) == _strip_lines(expected_summary) 137 | 138 | 139 | def test_parse_chunk_divs(): 140 | text = dedent( 141 | """ 142 |
143 | 144 | Chunk 1 text. 145 | 146 |
147 | 148 |
149 | 150 | Chunk 2 text. 151 | 152 |
153 | 154 |
Empty chunk.
155 | 156 | """ 157 | ) 158 | 159 | chunk_divs = parse_divs_by_class(text, "chunk") 160 | 161 | print("\n---test_parse_chunk_divs---") 162 | for chunk_div in chunk_divs: 163 | print(chunk_div.reassemble()) 164 | print("---") 165 | 166 | assert chunk_divs[0].reassemble() == """
\n\nChunk 1 text.\n\n
""" 167 | assert chunk_divs[0].contents.strip() == "Chunk 1 text." 168 | assert len(chunk_divs) == 3 169 | -------------------------------------------------------------------------------- /examples/backfill_timestamps.py: -------------------------------------------------------------------------------- 1 | # /// script 2 | # requires-python = ">=3.13" 3 | # dependencies = [ 4 | # "chopdiff", 5 | # "flowmark", 6 | # ] 7 | # /// 8 | import logging 9 | from textwrap import dedent 10 | 11 | from chopdiff.docs import BOF_TOK, EOF_TOK, PARA_BR_TOK, TextDoc, TokenMapping, search_tokens 12 | from chopdiff.html import ContentNotFound, TimestampExtractor 13 | 14 | logging.basicConfig(format=">> %(message)s") 15 | log = logging.getLogger(__name__) 16 | log.setLevel(logging.INFO) 17 | 18 | 19 | def format_timestamp(timestamp: float) -> str: 20 | hours, remainder = divmod(timestamp, 3600) 21 | minutes, seconds = divmod(remainder, 60) 22 | if hours: 23 | return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}" 24 | else: 25 | return f"{int(minutes):02}:{int(seconds):02}" 26 | 27 | 28 | def add_timestamp(text: str, timestamp: float) -> str: 29 | return f'{text} ⏱️{format_timestamp(timestamp)} ' 30 | 31 | 32 | def heading(text: str): 33 | return "\n--- " + text + " " + "-" * (70 - len(text)) + "\n" 34 | 35 | 36 | def backfill_timestamps(target_text: str, source_text: str) -> str: 37 | """ 38 | Backfill timestamps from a source document into a target document. 39 | The source document should have timestamps in ``s with a `data-timestamp` attribute. 40 | The target document should have mostly similar text but no timestamps. 41 | """ 42 | 43 | print(heading("Source text (with timestamps)")) 44 | print(source_text) 45 | 46 | print(heading("Target text (without timestamps)")) 47 | print(target_text) 48 | 49 | # Parse the target document into wordtoks. 50 | target_doc = TextDoc.from_text(target_text) 51 | extractor = TimestampExtractor(source_text) 52 | source_wordtoks = extractor.wordtoks 53 | 54 | # Create a mapping between source and target docs. 55 | target_wordtoks = list(target_doc.as_wordtoks(bof_eof=True)) 56 | token_mapping = TokenMapping(source_wordtoks, target_wordtoks) 57 | 58 | print(heading("Diff")) 59 | print(token_mapping.diff.as_diff_str()) 60 | 61 | print(heading("Token mapping")) 62 | print(token_mapping.full_mapping_str()) 63 | 64 | for wordtok_offset, (wordtok, sent_index) in enumerate( 65 | target_doc.as_wordtok_to_sent(bof_eof=True) 66 | ): 67 | # Look for each end of paragraph or end of doc. 68 | if wordtok in [PARA_BR_TOK, EOF_TOK]: 69 | # Find the start of the paragraph. 
70 | start_para_index, start_para_wordtok = ( 71 | search_tokens(target_wordtoks) 72 | .at(wordtok_offset) 73 | .seek_back([BOF_TOK, PARA_BR_TOK]) 74 | .next() 75 | .get_token() 76 | ) 77 | 78 | wordtok_offset = start_para_index 79 | 80 | source_wordtok_offset = token_mapping.map_back(wordtok_offset) 81 | 82 | log.info( 83 | "Seeking back tok %s (%s) to para start tok %s (%s), map back to source tok %s (%s)", 84 | wordtok_offset, 85 | wordtok, 86 | start_para_index, 87 | start_para_wordtok, 88 | source_wordtok_offset, 89 | source_wordtoks[source_wordtok_offset], 90 | ) 91 | 92 | try: 93 | timestamp, _index, _offset = extractor.extract_preceding(source_wordtok_offset) 94 | sent = target_doc.get_sent(sent_index) 95 | 96 | if sent.is_markup(): 97 | log.info("Skipping markup-only sentence: %s", sent.text) 98 | continue 99 | 100 | log.info("Adding timestamp to sentence: %s", sent) 101 | 102 | sent.text = add_timestamp(sent.text, timestamp) 103 | 104 | except ContentNotFound: 105 | # Missing timestamps shouldn't be fatal. 106 | log.warning( 107 | "Failed to extract timestamp at doc token %s (%s) -> source token %s (%s): %s", 108 | wordtok_offset, 109 | wordtok, 110 | source_wordtok_offset, 111 | source_wordtoks[source_wordtok_offset], 112 | sent_index, 113 | ) 114 | 115 | result = target_doc.reassemble() 116 | 117 | print(heading("Result (with backfilled timestamps)")) 118 | print(result) 119 | 120 | return result 121 | 122 | 123 | def main(): 124 | # Example source text with timestamps: 125 | source_text = dedent( 126 | """ 127 | Welcome to this um ... video about Python programming. 128 | First, we'll talk about variables. Variables are containers for storing data values. 129 | Then let's look at functions. Functions help us organize and reuse code. 130 | """ 131 | ) 132 | 133 | # Example target text (similar content but edited, with no timestamps): 134 | target_text = dedent( 135 | """ 136 | ## Introduction 137 | 138 | Welcome to this video about Python programming. 139 | 140 | First, we'll talk about variables. Next, let's look at functions. Functions help us organize and reuse code. 141 | """ 142 | ) 143 | 144 | backfill_timestamps(target_text, source_text) 145 | 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/diff_filters.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import TypeAlias 3 | 4 | from typing_extensions import override 5 | 6 | from chopdiff.docs.token_diffs import DiffFilter, DiffOp, OpType 7 | from chopdiff.docs.wordtoks import ( 8 | is_break_or_space, 9 | is_tag_close, 10 | is_tag_open, 11 | is_whitespace_or_punct, 12 | is_word, 13 | ) 14 | from chopdiff.util.lemmatize import lemmatize, lemmatized_equal 15 | 16 | 17 | class WildcardToken: 18 | """ 19 | Wildcard token that matches any number of tokens (including zero). 
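    For example, a pattern like [is_tag_open, WILDCARD_TOK, is_tag_close] given to
    make_token_sequence_filter matches an opening tag, any run of tokens in between,
    and a closing tag (adds_headings below uses a pattern of this shape, restricted
    to heading tags).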
20 | """ 21 | 22 | @override 23 | def __str__(self): 24 | return "*" 25 | 26 | 27 | WILDCARD_TOK = WildcardToken() 28 | 29 | TokenMatcher: TypeAlias = list[str] | Callable[[str], bool] 30 | 31 | TokenPattern: TypeAlias = str | Callable[[str], bool] | WildcardToken 32 | 33 | 34 | def _matches_pattern(tokens: list[str], pattern: list[TokenPattern]) -> bool: 35 | def match_from(i: int, j: int) -> bool: 36 | while i <= len(tokens) and j < len(pattern): 37 | pattern_elem = pattern[j] 38 | if pattern_elem == WILDCARD_TOK: 39 | # If '*' is the last pattern element, it matches any remaining tokens. 40 | if j + 1 == len(pattern): 41 | return True 42 | # Advance pattern index to next pattern after ANY_TOKEN. 43 | j += 1 44 | while i < len(tokens): 45 | if match_from(i, j): 46 | return True 47 | i += 1 48 | return False 49 | else: 50 | if i >= len(tokens): 51 | return False 52 | token = tokens[i] 53 | if isinstance(pattern_elem, str): 54 | if token != pattern_elem: 55 | return False 56 | elif callable(pattern_elem): 57 | if not pattern_elem(token): 58 | return False 59 | else: 60 | return False 61 | i += 1 62 | j += 1 63 | # Skip any remaining ANY_TOKEN in the pattern. 64 | while j < len(pattern) and pattern[j] == WILDCARD_TOK: 65 | j += 1 66 | # The tokens match the pattern if both indices are at the end. 67 | return i == len(tokens) and j == len(pattern) 68 | 69 | return match_from(0, 0) 70 | 71 | 72 | def make_token_sequence_filter( 73 | pattern: list[TokenPattern], 74 | action: OpType | None = None, 75 | ignore: TokenMatcher | None = None, 76 | ) -> DiffFilter: 77 | """ 78 | Returns a `DiffFilter` that accepts `DiffOps` where the tokens match the given pattern. 79 | The pattern is a list where each element can be a string or a predicate function that 80 | takes a token and returns a bool (True if the token matches). 81 | The '*' in the pattern list matches any number of tokens (including zero). 82 | If `action` is specified, only `DiffOps` with that action are considered. 83 | """ 84 | 85 | def filter_fn(diff_op: DiffOp) -> bool: 86 | if action and diff_op.action != action: 87 | return False 88 | 89 | tokens = diff_op.all_changed() 90 | if ignore and isinstance(ignore, str): 91 | tokens = [tok for tok in tokens if tok not in ignore] 92 | elif ignore and callable(ignore): 93 | tokens = [tok for tok in tokens if not ignore(tok)] 94 | 95 | return _matches_pattern(tokens, pattern) 96 | 97 | return filter_fn 98 | 99 | 100 | def changes_whitespace(diff_op: DiffOp) -> bool: 101 | """ 102 | Only accepts changes to sentence and paragraph breaks and whitespace. 103 | """ 104 | 105 | return all(is_break_or_space(tok) for tok in diff_op.all_changed()) 106 | 107 | 108 | def changes_whitespace_or_punct(diff_op: DiffOp) -> bool: 109 | """ 110 | Only accepts changes to punctuation and whitespace. 111 | """ 112 | 113 | return all(is_whitespace_or_punct(tok) for tok in diff_op.all_changed()) 114 | 115 | 116 | def no_word_lemma_changes(diff_op: DiffOp) -> bool: 117 | """ 118 | Only accept changes that preserve the lemmatized form of words. 119 | """ 120 | if diff_op.action == OpType.EQUAL: 121 | return True 122 | elif diff_op.action == OpType.REPLACE: 123 | return lemmatized_equal( 124 | " ".join(tok for tok in diff_op.left if is_word(tok)), 125 | " ".join(tok for tok in diff_op.right if is_word(tok)), 126 | ) 127 | else: 128 | return len([tok for tok in diff_op.all_changed() if is_word(tok)]) == 0 129 | 130 | 131 | def removes_words(diff_op: DiffOp) -> bool: 132 | """ 133 | Only accept changes that remove words. 
Changes to spaces and punctuation are allowed. 134 | """ 135 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL: 136 | return True 137 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT: 138 | return all(is_whitespace_or_punct(tok) for tok in set(diff_op.right) - set(diff_op.left)) 139 | else: 140 | return False 141 | 142 | 143 | def removes_word_lemmas(diff_op: DiffOp) -> bool: 144 | """ 145 | Only accept changes that remove words or replace them with their lemmatized forms. 146 | Changes to spaces and punctuation are allowed. 147 | """ 148 | if diff_op.action == OpType.DELETE or diff_op.action == OpType.EQUAL: 149 | return True 150 | elif diff_op.action == OpType.REPLACE or diff_op.action == OpType.INSERT: 151 | left_words = [tok for tok in diff_op.left if is_word(tok)] 152 | right_words = [tok for tok in diff_op.right if is_word(tok)] 153 | 154 | left_lemmas = [lemmatize(word) for word in left_words] 155 | right_lemmas = [lemmatize(word) for word in right_words] 156 | 157 | return set(right_lemmas).issubset(set(left_lemmas)) 158 | else: 159 | return False 160 | 161 | 162 | def adds_headings(diff_op: DiffOp) -> bool: 163 | """ 164 | Only accept changes that add contents within header tags. 165 | """ 166 | headers = ["h1", "h2", "h3", "h4", "h5", "h6"] 167 | is_header = lambda tok: is_tag_open(tok, tag_names=headers) # pyright: ignore 168 | is_header_close = lambda tok: is_tag_close(tok, tag_names=headers) # pyright: ignore 169 | matcher = make_token_sequence_filter( 170 | [is_header, WILDCARD_TOK, is_header_close], 171 | action=OpType.INSERT, 172 | ignore=is_break_or_space, 173 | ) 174 | return matcher(diff_op) 175 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # ---- Project Info and Dependencies ---- 2 | 3 | [project.urls] 4 | Repository = "https://github.com/jlevy/chopdiff" 5 | # Homepage = "https://..." 6 | # Documentation = "https://..." 
7 | 8 | [project] 9 | name = "chopdiff" 10 | description = "Chunking, diff filtering, and windowed transforms of text to support LLM applications" 11 | authors = [ 12 | { name="Joshua Levy", email="joshua@cal.berkeley.edu" }, 13 | ] 14 | readme = "README.md" 15 | license = "MIT" 16 | requires-python = ">=3.11,<4.0" 17 | dynamic = ["version"] 18 | 19 | # https://pypi.org/classifiers/ 20 | # Adjust as needed: 21 | classifiers = [ 22 | # Adjust as needed: 23 | "Development Status :: 4 - Beta", 24 | # "Development Status :: 5 - Production/Stable", 25 | "Intended Audience :: Developers", 26 | "Operating System :: OS Independent", 27 | "Programming Language :: Python", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | "Typing :: Typed", 33 | # Include this to avoid accidentally publishing to PyPI: 34 | # "Private :: Do Not Upload", 35 | ] 36 | 37 | 38 | # ---- Main dependencies ---- 39 | 40 | dependencies = [ 41 | "prettyfmt>=0.3.0", 42 | "flowmark>=0.5.3", 43 | "strif>=2.1.0", 44 | "funlog>=0.2.1", 45 | "cydifflib>=1.2.0", 46 | "tiktoken>=0.9.0", 47 | "regex>=2024.11.6", 48 | "selectolax>=0.3.32", 49 | ] 50 | 51 | [project.optional-dependencies] 52 | extras = [ 53 | "simplemma>=1.1.2", 54 | ] 55 | 56 | [dependency-groups] 57 | dev = [ 58 | "pytest>=8.3.5", 59 | "pytest-sugar>=1.0.0", 60 | "ruff>=0.11.9", 61 | "codespell>=2.4.1", 62 | "rich>=14.0.0", 63 | "basedpyright==1.29.5", # TODO: Upgrade when Cursor supports it. 64 | "funlog>=0.2.1", 65 | ] 66 | 67 | [project.scripts] 68 | # Add script entry points here: 69 | chopdiff = "chopdiff:main" 70 | 71 | 72 | # ---- Build system ---- 73 | 74 | # Dynamic versioning from: 75 | # https://github.com/ninoseki/uv-dynamic-versioning/ 76 | 77 | [build-system] 78 | requires = ["hatchling", "uv-dynamic-versioning"] 79 | build-backend = "hatchling.build" 80 | 81 | [tool.hatch.version] 82 | source = "uv-dynamic-versioning" 83 | # Note JSON schemas don't seem to be right for tool.hatch.version.source so 84 | # this may cause false warnings in IDEs. 85 | # https://github.com/ninoseki/uv-dynamic-versioning/issues/21 86 | 87 | [tool.uv-dynamic-versioning] 88 | vcs = "git" 89 | style = "pep440" 90 | bump = true 91 | 92 | [tool.hatch.build.targets.wheel] 93 | # The source location for the package. 94 | packages = ["src/chopdiff"] 95 | 96 | 97 | # ---- Settings ---- 98 | 99 | [tool.ruff] 100 | # Set as desired, typically 88 (black standard) or 100 (wide). 101 | line-length = 100 102 | 103 | [tool.ruff.lint] 104 | select = [ 105 | # See: https://docs.astral.sh/ruff/rules/ 106 | # Basic list from: https://docs.astral.sh/ruff/linter/#rule-selection 107 | "E", # https://docs.astral.sh/ruff/rules/#error-e 108 | "F", # https://docs.astral.sh/ruff/rules/#pyflakes-f 109 | "UP", # https://docs.astral.sh/ruff/rules/#pyupgrade-up 110 | "B", # https://docs.astral.sh/ruff/rules/#flake8-bugbear-b 111 | "I", # https://docs.astral.sh/ruff/rules/#isort-i 112 | # Other possibilities: 113 | # "D" # https://docs.astral.sh/ruff/rules/#pydocstyle-d 114 | # "Q" # https://docs.astral.sh/ruff/rules/#flake8-quotes-q 115 | # "COM" # https://docs.astral.sh/ruff/rules/#flake8-commas-com 116 | # "SIM", # https://docs.astral.sh/ruff/rules/#flake8-simplify-sim 117 | 118 | ] 119 | ignore = [ 120 | # Disable some rules that are overly pedantic. 
Add/remove as desired: 121 | "E501", # https://docs.astral.sh/ruff/rules/line-too-long/ 122 | "E402", # https://docs.astral.sh/ruff/rules/module-import-not-at-top-of-file/ 123 | "E731", # https://docs.astral.sh/ruff/rules/lambda-assignment/ 124 | "B904", 125 | # We use both ruff formatter and linter so some rules should always be disabled. 126 | # See: https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules 127 | "W191", # https://docs.astral.sh/ruff/rules/tab-indentation/ 128 | "E111", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple/ 129 | "E114", # https://docs.astral.sh/ruff/rules/indentation-with-invalid-multiple-comment/ 130 | "E117", # https://docs.astral.sh/ruff/rules/over-indented/ 131 | "D206", # https://docs.astral.sh/ruff/rules/docstring-tab-indentation/ 132 | "D300", # https://docs.astral.sh/ruff/rules/triple-single-quotes/ 133 | "Q000", # https://docs.astral.sh/ruff/rules/bad-quotes-inline-string/ 134 | "Q001", # https://docs.astral.sh/ruff/rules/bad-quotes-multiline-string/ 135 | "Q002", # https://docs.astral.sh/ruff/rules/bad-quotes-docstring/ 136 | "Q003", # https://docs.astral.sh/ruff/rules/avoidable-escaped-quote/ 137 | "COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/ 138 | "COM819", # https://docs.astral.sh/ruff/rules/prohibited-trailing-comma/ 139 | "ISC002", # https://docs.astral.sh/ruff/rules/multi-line-implicit-string-concatenation/ 140 | ] 141 | 142 | [tool.basedpyright] 143 | # BasedPyright currently seems like the best type checker option, much faster 144 | # than mypy and with a good extension for VSCode/Cursor. 145 | # https://marketplace.visualstudio.com/items?itemName=detachhead.basedpyright 146 | # https://docs.basedpyright.com/latest/configuration/config-files/#sample-pyprojecttoml-file 147 | include = ["src", "tests", "devtools"] 148 | # By default BasedPyright is very strict, so you almost certainly want to disable 149 | # some of the rules. 150 | # First, these turn off warnings about (yes) how you ignore warnings: 151 | reportIgnoreCommentWithoutRule = false 152 | reportUnnecessaryTypeIgnoreComment = false 153 | # A few typically noisy warnings are next. 154 | # How many you enable is up to you. 
The first few are off by default, but you can 155 | # comment/uncomment these as desired: 156 | reportMissingTypeStubs = false 157 | reportUnusedCallResult = false 158 | reportAny = false 159 | reportExplicitAny = false 160 | reportImplicitStringConcatenation = false 161 | reportUnreachable = false 162 | reportUnknownMemberType = false 163 | # reportPrivateImportUsage = false 164 | # reportPrivateLocalImportUsage = false 165 | # reportMissingImports = false 166 | # reportUnnecessaryIsInstance = false 167 | reportUnknownVariableType = false 168 | # reportUnknownArgumentType = false 169 | reportUnannotatedClassAttribute = false 170 | reportUnknownLambdaType = false 171 | reportPrivateUsage = false 172 | 173 | [tool.codespell] 174 | ignore-words-list = "Numbe" 175 | # skip = "foo.py,bar.py" 176 | 177 | [tool.pytest.ini_options] 178 | python_files = ["*.py"] 179 | python_classes = ["Test*"] 180 | python_functions = ["test_*"] 181 | testpaths = [ 182 | "src", 183 | "tests", 184 | ] 185 | norecursedirs = [] 186 | filterwarnings = [] 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/chopdiff/divs/text_node.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Callable 4 | from copy import copy 5 | from dataclasses import dataclass, field 6 | 7 | from prettyfmt import fmt_lines 8 | from typing_extensions import override 9 | 10 | from chopdiff.docs.sizes import TextUnit 11 | from chopdiff.docs.text_doc import Splitter, TextDoc, default_sentence_splitter 12 | from chopdiff.html.html_in_md import div_wrapper 13 | 14 | 15 | @dataclass 16 | class TextNode: 17 | """ 18 | A node in parsed structured text, with reference offsets into the original text. 19 | Useful for parsing Markdown broken into div tags. 20 | """ 21 | 22 | original_text: str 23 | 24 | # Offsets into the original text. 25 | offset: int 26 | content_start: int 27 | content_end: int 28 | 29 | tag_name: str | None = None 30 | class_name: str | None = None 31 | begin_marker: str | None = None 32 | end_marker: str | None = None 33 | 34 | children: list[TextNode] = field(default_factory=list) 35 | 36 | @property 37 | def end_offset(self) -> int: 38 | assert self.content_end >= 0 39 | return self.content_end + len(self.end_marker) if self.end_marker else self.content_end 40 | 41 | @property 42 | def contents(self) -> str: 43 | return self.original_text[self.content_start : self.content_end] 44 | 45 | def text_doc(self, sentence_splitter: Splitter = default_sentence_splitter) -> TextDoc: 46 | return TextDoc.from_text(self.contents, sentence_splitter=sentence_splitter) 47 | 48 | def slice_children(self, start: int, end: int) -> TextNode: 49 | if not self.children: 50 | raise ValueError("Cannot slice_children on a non-container node.") 51 | else: 52 | node_copy = copy(self) 53 | node_copy.children = node_copy.children[start:end] 54 | return node_copy 55 | 56 | def size(self, unit: TextUnit) -> int: 57 | if self.children: 58 | return sum(child.size(unit) for child in self.children) 59 | else: 60 | return self.text_doc().size(unit) 61 | 62 | def structure_summary(self) -> dict[str, int]: 63 | """ 64 | Recursively tally the number of non-empty leaf nodes of different types as CSS-style paths. 
65 | For example 66 | 67 | { "_total": 7, "div.chunk": 5, "div.chunk > div.summary": 2, "div.chunk > div.content": 5 } 68 | 69 | would mean that there were 7 chunk divs, each with a content div, and 2 with 70 | a summary div within it. 71 | """ 72 | 73 | def path_join(*selectors: str) -> str: 74 | return " > ".join(selectors) 75 | 76 | def tally_recursive(node: TextNode, path: list[str], tally: dict[str, int]) -> None: 77 | # Skip leaf nodes. 78 | if not node.children and not node.tag_name and not node.class_name: 79 | return 80 | 81 | tag_selector = node.tag_name if node.tag_name else "" 82 | class_selector = f".{node.class_name}" if node.class_name else "" 83 | selector = f"{tag_selector}{class_selector}" 84 | new_path = path + [selector] if selector else path 85 | 86 | # Increment counts. 87 | path_key = path_join(*new_path) 88 | if path_key: 89 | tally[path_key] = tally.get(path_key, 0) + 1 90 | 91 | for child in node.children: 92 | tally_recursive(child, new_path, tally) 93 | 94 | tally: dict[str, int] = {} 95 | tally_recursive(self, [], tally) 96 | 97 | sorted_tally = dict(sorted(tally.items())) 98 | return sorted_tally 99 | 100 | def structure_summary_str(self) -> str | None: 101 | structure_summary = self.structure_summary() 102 | if not structure_summary: 103 | return None 104 | else: 105 | return "HTML structure:\n" + fmt_lines( 106 | [f"{count:6d} {path}" for path, count in self.structure_summary().items()], 107 | prefix="", 108 | ) 109 | 110 | def size_summary(self) -> str: 111 | """ 112 | Return a summary of the size of the doc as well as a summary of its 113 | div/HTML structure. 114 | """ 115 | summary = self.text_doc().size_summary() 116 | if structure_summary_str := self.structure_summary_str(): 117 | summary += "\n" + structure_summary_str 118 | return summary 119 | 120 | def is_whitespace(self) -> bool: 121 | """ 122 | Is this node whitespace only? 123 | """ 124 | return not self.children and self.contents.strip() == "" 125 | 126 | def children_by_class_names(self, *class_names: str, recursive: bool = False) -> list[TextNode]: 127 | def collect_children(node: TextNode) -> list[TextNode]: 128 | matching_children = [ 129 | child for child in node.children if child.class_name in class_names 130 | ] 131 | if recursive: 132 | for child in node.children: 133 | matching_children.extend(collect_children(child)) 134 | return matching_children 135 | 136 | return collect_children(self) 137 | 138 | def child_by_class_name(self, class_name: str) -> TextNode | None: 139 | nodes = self.children_by_class_names(class_name, recursive=False) 140 | if len(nodes) == 0: 141 | return None 142 | if len(nodes) > 1: 143 | raise ValueError(f"Multiple children with class name {class_name}") 144 | return nodes[0] 145 | 146 | def reassemble(self, padding: str = "\n\n") -> str: 147 | """ 148 | Reassemble as string. If padding is provided (not ""), then strip, skip whitespace, 149 | and insert our own padding. 
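        Whitespace-only children are skipped whenever padding is non-empty.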
150 | """ 151 | strip_fn: Callable[[str], str] = lambda s: s.strip() if padding else s 152 | skip_whitespace = bool(padding) 153 | 154 | if not self.children: 155 | if not self.tag_name: 156 | return strip_fn(self.contents) 157 | else: 158 | wrap = div_wrapper(self.class_name, padding=padding) 159 | return wrap(strip_fn(self.contents)) 160 | else: 161 | padded_children = (padding or "").join( 162 | child.reassemble(padding) 163 | for child in self.children 164 | if (not skip_whitespace or not child.is_whitespace()) 165 | ) 166 | if not self.tag_name: 167 | return padded_children 168 | else: 169 | wrap = div_wrapper(self.class_name, padding=padding) 170 | return wrap(padded_children) 171 | 172 | @override 173 | def __str__(self): 174 | """ 175 | Return a recursive, formatted string representation of the node and its children. 176 | """ 177 | return self._str_recursive() 178 | 179 | def _str_recursive(self, level: int = 0, max_len: int = 40) -> str: 180 | indent = " " * level 181 | content_preview = self.contents 182 | if len(content_preview) > max_len: 183 | content_preview = content_preview[:20] + "…" + content_preview[-20:] 184 | result = ( 185 | f"{indent}TextNode(tag_name={self.tag_name} class_name={self.class_name} offset={self.offset}," 186 | f" content_start={self.content_start}, content_end={self.content_end}) " 187 | f"{repr(content_preview)}\n" 188 | ) 189 | for child in self.children: 190 | result += child._str_recursive(level + 1) 191 | return result 192 | -------------------------------------------------------------------------------- /src/chopdiff/docs/wordtoks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support for treating text as a sequence of word, punctuation, whitespace 3 | (word, setnence, and paragraph breaks), or HTML tags as tokens, which we call 4 | "wordtoks". 5 | 6 | Also works well with Markdown. Wordtoks make it possible to do word-oriented 7 | parsing, diffs, and transforms, while also preserving HTML tags and significant 8 | whitespace. 9 | """ 10 | 11 | from dataclasses import dataclass 12 | 13 | import regex 14 | 15 | # Special tokens to represent sentence, paragraph, and document boundaries. 16 | # Note these parse as tokens and like HTML tags, so they can safely be mixed into inputs if desired. 17 | SENT_BR_TOK = "<-SENT-BR->" 18 | PARA_BR_TOK = "<-PARA-BR->" 19 | BOF_TOK = "<-BOF->" 20 | EOF_TOK = "<-EOF->" 21 | 22 | SENT_BR_STR = " " 23 | PARA_BR_STR = "\n\n" 24 | BOF_STR = "" 25 | EOF_STR = "" 26 | 27 | SPACE_TOK = " " 28 | 29 | SYMBOL_SEP = "⎪" 30 | 31 | # Currently break on words, spaces, or any single other/punctuation character. 32 | # HTML tags (of length <1024 chars, possibly with newlines) and entities are also a single token. 33 | # TODO: Could add nicer support for Markdown formatting as well. 34 | # Updated pattern to include HTML entities 35 | _wordtok_pattern = regex.compile( 36 | r"(<(?:[^<>]|\n){0,1024}>|\&\w+;|\&\#\d+;|\w+|[^\w\s]|\s+)", regex.DOTALL 37 | ) 38 | 39 | _para_br_pattern = regex.compile(r"\s*\n\n\s*") 40 | 41 | # TODO: Is it worth using the regex package here to get \p{L} or is there a good 42 | # enough way with re only? 
43 | _word_pat = regex.compile(r"\p{L}+", regex.UNICODE) 44 | 45 | _number_pat = regex.compile(r"\d+") 46 | 47 | _tag_pattern = regex.compile(r"<(/?)(\w+)([^>]*?)(/?)\s*>", regex.IGNORECASE) 48 | 49 | _comment_pattern = regex.compile(r"", regex.DOTALL) 50 | 51 | 52 | def wordtok_to_str(wordtok: str) -> str: 53 | """ 54 | Convert a wordtok to a string, mapping any special wordtoks to their usual 55 | representations. 56 | """ 57 | if wordtok == SENT_BR_TOK: 58 | return SENT_BR_STR 59 | if wordtok == PARA_BR_TOK: 60 | return PARA_BR_STR 61 | if wordtok == BOF_TOK: 62 | return BOF_STR 63 | if wordtok == EOF_TOK: 64 | return EOF_STR 65 | return wordtok 66 | 67 | 68 | def wordtok_len(wordtok: str) -> int: 69 | """ 70 | Char length of a wordtok. 71 | """ 72 | return len(wordtok_to_str(wordtok)) 73 | 74 | 75 | _whitespace = regex.compile(r"\s+") 76 | 77 | 78 | def normalize_wordtok(wordtok: str) -> str: 79 | if wordtok.isspace(): 80 | normalized = SPACE_TOK 81 | elif wordtok.startswith("<"): 82 | normalized = _whitespace.sub(" ", wordtok) 83 | else: 84 | normalized = wordtok 85 | return normalized 86 | 87 | 88 | def wordtokenize_with_offsets(text: str, bof_eof: bool = False) -> tuple[list[str], list[int]]: 89 | """ 90 | Same as `wordtokenize`, but returns a list of tuples `(wordtok, offset)`. 91 | """ 92 | wordtoks = [] 93 | offsets = [] 94 | offset = 0 95 | for match in _wordtok_pattern.finditer(text): 96 | wordtok = normalize_wordtok(match.group()) 97 | wordtoks.append(wordtok) 98 | offsets.append(offset) 99 | offset = match.end() 100 | 101 | if bof_eof: 102 | wordtoks = [BOF_TOK] + wordtoks + [EOF_TOK] 103 | offsets = [0] + offsets + [len(text)] 104 | 105 | return wordtoks, offsets 106 | 107 | 108 | def wordtokenize(text: str, bof_eof: bool = False) -> list[str]: 109 | """ 110 | Convert text to word tokens, including words, whitespace, punctuation, and 111 | HTML tags. Does not parse paragraph or sentence breaks. Normalizes all 112 | whitespace to a single space character. 113 | """ 114 | wordtoks, _offsets = wordtokenize_with_offsets(text, bof_eof) 115 | return wordtoks 116 | 117 | 118 | def _insert_para_wordtoks(text: str) -> str: # pyright: ignore 119 | """ 120 | Replace paragraph breaks in text with para break tokens. 121 | """ 122 | return _para_br_pattern.sub(PARA_BR_TOK, text) 123 | 124 | 125 | def _initial_wordtoks(text: str, max_chars: int) -> list[str]: 126 | sub_text = text[:max_chars] 127 | wordtoks = wordtokenize(sub_text) 128 | if wordtoks: 129 | wordtoks.pop() # Drop any cut off token. 130 | return wordtoks 131 | 132 | 133 | def first_wordtok(text: str) -> str | None: 134 | """ 135 | Get the first wordtok from the text, if it has one. 136 | """ 137 | wordtoks = _initial_wordtoks(text, 100) 138 | return wordtoks[0] if wordtoks else None 139 | 140 | 141 | def join_wordtoks(wordtoks: list[str]) -> str: 142 | """ 143 | Join wordtoks back into a sentence. 144 | """ 145 | wordtoks = [wordtok_to_str(wordtok) for wordtok in wordtoks] 146 | return "".join(wordtoks) 147 | 148 | 149 | def visualize_wordtoks(wordtoks: list[str]) -> str: 150 | """ 151 | Visualize wordtoks with a separator for debugging. 152 | """ 153 | return SYMBOL_SEP + SYMBOL_SEP.join(wordtoks) + SYMBOL_SEP 154 | 155 | 156 | def is_break_or_space(wordtok: str) -> bool: 157 | """ 158 | Any kind of paragraph break, sentence break, or space (including 159 | the beginning or end of the document). 
160 | """ 161 | return ( 162 | wordtok == PARA_BR_TOK 163 | or wordtok == SENT_BR_TOK 164 | or wordtok.isspace() 165 | or wordtok == BOF_TOK 166 | or wordtok == EOF_TOK 167 | ) 168 | 169 | 170 | def is_word(wordtok: str) -> bool: 171 | """ 172 | Is this wordtok a word, not punctuation or whitespace or a number? 173 | """ 174 | return bool(len(wordtok) > 0 and _word_pat.match(wordtok) and not _number_pat.match(wordtok)) 175 | 176 | 177 | def is_number(wordtok: str) -> bool: 178 | """ 179 | Is this wordtok a number? 180 | """ 181 | return bool(_number_pat.match(wordtok)) 182 | 183 | 184 | def is_whitespace_or_punct(wordtok: str) -> bool: 185 | """ 186 | Is this wordtok whitespace or punctuation? 187 | """ 188 | return bool(not is_word(wordtok) and not is_number(wordtok)) 189 | 190 | 191 | @dataclass(frozen=True) 192 | class Tag: 193 | """ 194 | An HTML tag or comment. 195 | """ 196 | 197 | name: str 198 | is_open: bool 199 | is_close: bool 200 | attrs: dict[str, str] 201 | comment: str | None = None 202 | 203 | 204 | def parse_tag(wordtok: str | None = None) -> Tag | None: 205 | """ 206 | Parse a wordtok to determine if it's an HTML tag and extract its components. 207 | """ 208 | if not wordtok: 209 | return None 210 | 211 | match = _tag_pattern.match(wordtok) 212 | if not match: 213 | match = _comment_pattern.match(wordtok) 214 | if not match: 215 | return None 216 | return Tag(name="", is_open=False, is_close=False, attrs={}, comment=match.group(1)) 217 | 218 | is_open = not bool(match.group(1)) 219 | is_close = bool(match.group(1) or match.group(4)) 220 | tag_name = match.group(2).lower() 221 | attrs_str = match.group(3).strip() 222 | 223 | attrs: dict[str, str] = {} 224 | if attrs_str: 225 | attr_pattern = regex.compile(r'(\w+)\s*=\s*"([^"]*)"') 226 | for attr_match in attr_pattern.finditer(attrs_str): 227 | attr_name, attr_value = attr_match.groups() 228 | attrs[attr_name] = attr_value 229 | 230 | return Tag(name=tag_name, is_open=is_open, is_close=is_close, attrs=attrs) 231 | 232 | 233 | def is_tag(wordtok: str | None = None, tag_names: list[str] | None = None) -> bool: 234 | """ 235 | Check if a wordtok is an HTML tag and optionally if it's in the specified tag names. 236 | """ 237 | tag = parse_tag(wordtok) 238 | return bool(tag and (not tag_names or tag.name in [name.lower() for name in tag_names])) 239 | 240 | 241 | def is_tag_close(wordtok: str, tag_names: list[str] | None = None) -> bool: 242 | """ 243 | Check if a wordtok is an HTML close tag and optionally if it's in the specified tag names. 244 | """ 245 | tag = parse_tag(wordtok) 246 | return bool( 247 | tag and tag.is_close and (not tag_names or tag.name in [name.lower() for name in tag_names]) 248 | ) 249 | 250 | 251 | def is_tag_open(wordtok: str, tag_names: list[str] | None = None) -> bool: 252 | """ 253 | Check if a wordtok is an HTML open tag and optionally if it's in the specified tag names. 254 | """ 255 | tag = parse_tag(wordtok) 256 | return bool( 257 | tag and tag.is_open and (not tag_names or tag.name in [name.lower() for name in tag_names]) 258 | ) 259 | 260 | 261 | def is_div(wordtok: str | None = None) -> bool: 262 | return is_tag(wordtok, tag_names=["div"]) 263 | 264 | 265 | def is_entity(wordtok: str | None = None) -> bool: 266 | """ 267 | Check if a wordtok is an HTML entity. 
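    For example, `&amp;` or `&#39;`.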
268 | """ 269 | return bool(wordtok and wordtok.startswith("&") and wordtok.endswith(";")) 270 | 271 | 272 | header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"] 273 | 274 | 275 | def is_header_tag(wordtok: str) -> bool: 276 | """ 277 | Is this wordtok an HTML header tag? 278 | """ 279 | return is_tag(wordtok, tag_names=header_tags) 280 | -------------------------------------------------------------------------------- /src/chopdiff/transforms/sliding_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Transform text using sliding windows over a document, then reassembling the 3 | transformed text. 4 | """ 5 | 6 | import logging 7 | from collections.abc import Callable 8 | from math import ceil 9 | from typing import Any, TypeAlias 10 | 11 | from flowmark import fill_markdown 12 | from prettyfmt import fmt_lines 13 | 14 | from chopdiff.docs.sizes import TextUnit 15 | from chopdiff.docs.text_doc import Paragraph, TextDoc 16 | from chopdiff.docs.token_diffs import DIFF_FILTER_NONE, DiffFilter, diff_docs, find_best_alignment 17 | from chopdiff.docs.wordtoks import join_wordtoks 18 | from chopdiff.transforms.sliding_windows import sliding_para_window, sliding_word_window 19 | from chopdiff.transforms.window_settings import WINDOW_BR, WindowSettings 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | TextDocTransform: TypeAlias = Callable[[TextDoc], TextDoc] 24 | 25 | SaveFunc: TypeAlias = Callable[[str, str, Any], None] 26 | 27 | 28 | def remove_window_br(doc: TextDoc): 29 | """ 30 | Remove `` markers in a document. 31 | """ 32 | doc.replace_str(WINDOW_BR, "") 33 | 34 | 35 | def filtered_transform( 36 | doc: TextDoc, 37 | transform_func: TextDocTransform, 38 | windowing: WindowSettings | None, 39 | diff_filter: DiffFilter | None = None, 40 | debug_save: SaveFunc | None = None, 41 | ) -> TextDoc: 42 | """ 43 | Apply a transform with sliding window across the input doc, enforcing the changes it's 44 | allowed to make with `diff_filter`. 45 | 46 | If windowing is None, apply the transform to the entire document at once. 47 | 48 | `debug_save` is an optional function that takes a message, a filename, and an object, and saves 49 | the object to a file for debugging. 50 | """ 51 | has_filter = diff_filter and diff_filter != DIFF_FILTER_NONE 52 | 53 | if not windowing or not windowing.size: 54 | transformed_doc = transform_func(doc) 55 | else: 56 | 57 | def transform_and_check_diff(input_doc: TextDoc) -> TextDoc: 58 | # Avoid having window breaks build up after multiple transforms. 59 | remove_window_br(input_doc) 60 | 61 | transformed_doc = transform_func(input_doc) 62 | 63 | if has_filter: 64 | # Check the transform did what it should have. 65 | diff = diff_docs(input_doc, transformed_doc) 66 | accepted_diff, rejected_diff = diff.filter(diff_filter) 67 | 68 | assert diff.left_size() == input_doc.size(TextUnit.wordtoks) 69 | assert accepted_diff.left_size() == input_doc.size(TextUnit.wordtoks) 70 | assert rejected_diff.left_size() == input_doc.size(TextUnit.wordtoks) 71 | 72 | log.info( 73 | "Accepted transform changes:\n%s", 74 | fmt_lines(str(accepted_diff).splitlines()), 75 | ) 76 | 77 | # Note any rejections. 78 | rejected_changes = rejected_diff.changes() 79 | if rejected_changes: 80 | log.info( 81 | "Filtering extraneous changes:\n%s", 82 | fmt_lines(rejected_diff.as_diff_str(False).splitlines()), 83 | ) 84 | 85 | # Apply only the accepted changes. 
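                # In `accepted_diff`, rejected ops have already been replaced by EQUAL ops
                # (see `TokenDiff.filter`), so applying it to the original wordtoks leaves
                # the source text untouched wherever the filter said no.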
86 | final_doc = TextDoc.from_wordtoks( 87 | accepted_diff.apply_to(list(input_doc.as_wordtoks())) 88 | ) 89 | log.info( 90 | "Word token changes:\n%s", 91 | fmt_lines( 92 | [ 93 | f"Accepted: {accepted_diff.stats()}", 94 | f"Rejected: {rejected_diff.stats()}", 95 | ] 96 | ), 97 | ) 98 | else: 99 | diff = None 100 | accepted_diff, rejected_diff = None, None 101 | final_doc = transformed_doc 102 | 103 | if debug_save: 104 | debug_save( 105 | "Input doc normalized", 106 | "filtered_transform", 107 | fill_markdown(input_doc.reassemble()), 108 | ) 109 | debug_save("Output doc raw", "filtered_transform", transformed_doc.reassemble()) 110 | # log_save( 111 | # "Output doc normalized", 112 | # "filtered_transform", 113 | # normalize_markdown(transformed_doc.reassemble()), 114 | # ) 115 | if diff: 116 | debug_save("Transform diff", "filtered_transform", diff) 117 | # if accepted_diff: 118 | # log.save_object("Accepted diff", "filtered_transform", accepted_diff) 119 | if rejected_diff: 120 | debug_save("Rejected diff", "filtered_transform", rejected_diff) 121 | 122 | debug_save("Final doc", "filtered_transform", final_doc.reassemble()) 123 | 124 | return final_doc 125 | 126 | transformed_doc = sliding_window_transform( 127 | doc, 128 | transform_and_check_diff, 129 | windowing, 130 | ) 131 | 132 | return transformed_doc 133 | 134 | 135 | def sliding_window_transform( 136 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings 137 | ) -> TextDoc: 138 | if settings.unit == TextUnit.wordtoks: 139 | return sliding_wordtok_window_transform(doc, transform_func, settings) 140 | elif settings.unit == TextUnit.paragraphs: 141 | return sliding_para_window_transform(doc, transform_func, settings) 142 | else: 143 | raise ValueError(f"Unsupported sliding transform unit: {settings.unit}") 144 | 145 | 146 | def sliding_wordtok_window_transform( 147 | doc: TextDoc, transform_func: TextDocTransform, settings: WindowSettings 148 | ) -> TextDoc: 149 | """ 150 | Apply a transformation function to each TextDoc in a sliding window over the given document, 151 | stepping through wordtoks, then reassemble the transformed document. Uses best effort to 152 | stitch the results together seamlessly by searching for the best alignment (minimum wordtok 153 | edit distance) of each transformed window. 
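    Windows are `settings.size` wordtoks long and advance by `settings.shift` wordtoks,
    so consecutive input windows overlap by `size - shift` wordtoks; the alignment search
    happens over (roughly) that overlap in the transformed output.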
154 | """ 155 | if settings.unit != TextUnit.wordtoks: 156 | raise ValueError(f"This sliding window expects wordtoks, not {settings.unit}") 157 | 158 | windows = sliding_word_window(doc, settings.size, settings.shift, TextUnit.wordtoks) 159 | 160 | nwordtoks = doc.size(TextUnit.wordtoks) 161 | nbytes = doc.size(TextUnit.bytes) 162 | nwindows = ceil(nwordtoks / settings.shift) 163 | sep_wordtoks = [settings.separator] if settings.separator else [] 164 | 165 | log.info( 166 | "Sliding word transform: Begin on doc: total %s wordtoks, %s bytes, %s windows, %s", 167 | nwordtoks, 168 | nbytes, 169 | nwindows, 170 | settings, 171 | ) 172 | 173 | output_wordtoks: list[str] = [] 174 | for i, window in enumerate(windows): 175 | log.info( 176 | "Sliding word transform window %s/%s (%s wordtoks, %s bytes), at %s wordtoks so far", 177 | i + 1, 178 | nwindows, 179 | window.size(TextUnit.wordtoks), 180 | window.size(TextUnit.bytes), 181 | len(output_wordtoks), 182 | ) 183 | 184 | transformed_window = transform_func(window) 185 | 186 | new_wordtoks = list(transformed_window.as_wordtoks()) 187 | 188 | if not output_wordtoks: 189 | output_wordtoks = new_wordtoks 190 | else: 191 | if len(output_wordtoks) < settings.min_overlap: 192 | raise ValueError( 193 | "Output wordtoks too short to align with min_overlap %s: %s", 194 | settings.min_overlap, 195 | output_wordtoks, 196 | ) 197 | if len(new_wordtoks) < settings.min_overlap: 198 | log.error( 199 | "New wordtoks too short to align with min_overlap %s, skipping: %s", 200 | settings.min_overlap, 201 | new_wordtoks, 202 | ) 203 | continue 204 | 205 | offset, (score, diff) = find_best_alignment( 206 | output_wordtoks, new_wordtoks, settings.min_overlap 207 | ) 208 | 209 | log.info( 210 | "Sliding word transform: Best alignment of window %s is at token offset %s (score %s, %s)", 211 | i, 212 | offset, 213 | score, 214 | diff.stats(), 215 | ) 216 | 217 | output_wordtoks = output_wordtoks[:offset] + sep_wordtoks + new_wordtoks 218 | 219 | log.info( 220 | "Sliding word transform: Done, output total %s wordtoks", 221 | len(output_wordtoks), 222 | ) 223 | 224 | # An alternate approach would be to accumulate the document sentences instead of wordtoks to 225 | # avoid re-parsing, but this is probably a little simpler. 226 | output_doc = TextDoc.from_text(join_wordtoks(output_wordtoks)) 227 | 228 | return output_doc 229 | 230 | 231 | def sliding_para_window_transform( 232 | doc: TextDoc, 233 | transform_func: TextDocTransform, 234 | settings: WindowSettings, 235 | normalizer: Callable[[str], str] = fill_markdown, 236 | ) -> TextDoc: 237 | """ 238 | Apply a transformation function to each TextDoc, stepping through paragraphs `settings.size` 239 | at a time, then reassemble the transformed document. 
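    Unlike the wordtok version, these windows do not overlap: `settings.size` must equal
    `settings.shift`.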
240 | """ 241 | if settings.unit != TextUnit.paragraphs: 242 | raise ValueError(f"This sliding window expects paragraphs, not {settings.unit}") 243 | if settings.size != settings.shift: 244 | raise ValueError("Paragraph window transform requires equal size and shift") 245 | 246 | windows = sliding_para_window(doc, settings.size, normalizer) 247 | 248 | nwindows = ceil(doc.size(TextUnit.paragraphs) / settings.size) 249 | 250 | log.info( 251 | "Sliding paragraph transform: Begin on doc: %s windows of size %s paragraphs on total %s", 252 | nwindows, 253 | settings.size, 254 | doc.size_summary(), 255 | ) 256 | 257 | transformed_paras: list[Paragraph] = [] 258 | for i, window in enumerate(windows): 259 | log.info( 260 | "Sliding paragraph transform: Window %s/%s input is %s", 261 | i, 262 | nwindows, 263 | window.size_summary(), 264 | ) 265 | 266 | new_doc = transform_func(window) 267 | if i > 0: 268 | try: 269 | new_doc.paragraphs[0].sentences[0].text = ( 270 | settings.separator + new_doc.paragraphs[0].sentences[0].text 271 | ) 272 | except (KeyError, IndexError): 273 | pass 274 | transformed_paras.extend(new_doc.paragraphs) 275 | 276 | transformed_text = "\n\n".join(para.reassemble() for para in transformed_paras) 277 | new_text_doc = TextDoc.from_text(transformed_text) 278 | 279 | log.info( 280 | "Sliding paragraph transform: Done, output total %s", 281 | new_text_doc.size_summary(), 282 | ) 283 | 284 | return new_text_doc 285 | -------------------------------------------------------------------------------- /.cursor/rules/python.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Python Coding Guidelines 3 | globs: *.py,pyproject.toml 4 | alwaysApply: false 5 | --- 6 | # Python Coding Guidelines 7 | 8 | These are rules for a modern Python project using uv. 9 | 10 | ## Python Version 11 | 12 | Write for Python 3.11-3.13. Do NOT write code to support earlier versions of Python. 13 | Always use modern Python practices appropriate for Python 3.11-3.13. 14 | 15 | Always use full type annotations, generics, and other modern practices. 16 | 17 | ## Project Setup and Developer Workflows 18 | 19 | - Important: BE SURE you read and understand the project setup by reading the 20 | pyproject.toml file and the Makefile. 21 | 22 | - ALWAYS use uv for running all code and managing dependencies. 23 | Never use direct `pip` or `python` commands. 24 | 25 | - Use modern uv commands: `uv sync`, `uv run ...`, etc. 26 | Prefer `uv add` over `uv pip install`. 27 | 28 | - You may use the following shortcuts 29 | ```shell 30 | 31 | # Install all dependencies: 32 | make install 33 | 34 | # Run linting (with ruff) and type checking (with basedpyright). 35 | # Note when you run this, ruff will auto-format and sort imports, resolving any 36 | # linter warnings about import ordering: 37 | make lint 38 | 39 | # Run tests: 40 | make test 41 | 42 | # Run uv sync, lint, and test in one command: 43 | make 44 | ``` 45 | 46 | - The usual `make test` like standard pytest does not show test output. 47 | Run individual tests and see output with `uv run pytest -s some/file.py`. 48 | 49 | - Always run `make lint` and `make test` to check your code after changes. 50 | 51 | - You must verify there are zero linter warnings/errors or test failures before 52 | considering any task complete. 53 | 54 | ## General Development Practices 55 | 56 | - Be sure to resolve the pyright (basedpyright) linter errors as you develop and make 57 | changes. 
58 | 59 | - If type checker errors are hard to resolve, you may add a comment `# pyright: ignore` 60 | to disable Pyright warnings or errors but ONLY if you know they are not a real problem 61 | and are difficult to fix. 62 | 63 | - In special cases you may consider disabling it globally it in pyproject.toml but YOU 64 | MUST ASK FOR CONFIRMATION from the user before globally disabling lint or type checker 65 | rules. 66 | 67 | - Never change an existing comment, pydoc, or a log statement, unless it is directly 68 | fixing the issue you are changing, or the user has asked you to clean up the code. 69 | Do not drop existing comments when editing code! 70 | And do not delete or change logging statements. 71 | 72 | ## Coding Conventions and Imports 73 | 74 | - Always use full, absolute imports for paths. 75 | do NOT use `from .module1.module2 import ...`. Such relative paths make it hard to 76 | refactor. Use `from toplevel_pkg.module1.modlule2 import ...` instead. 77 | 78 | - Be sure to import things like `Callable` and other types from the right modules, 79 | remembering that many are now in `collections.abc` or `typing_extensions`. For 80 | example: `from collections.abc import Callable, Coroutine` 81 | 82 | - Use `typing_extensions` for things like `@override` (you need to use this, and not 83 | `typing` since we want to support Python 3.11). 84 | 85 | - Add `from __future__ import annotations` on files with types whenever applicable. 86 | 87 | - Use pathlib `Path` instead of strings. 88 | Use `Path(filename).read_text()` instead of two-line `with open(...)` blocks. 89 | 90 | - Use strif’s `atomic_output_file` context manager when writing files to ensure output 91 | files are written atomically. 92 | 93 | ## Use Modern Python Practices 94 | 95 | - ALWAYS use `@override` decorators to override methods from base classes. 96 | This is a modern Python practice and helps avoid bugs. 97 | 98 | ## Testing 99 | 100 | - For longer tests put them in a file like `tests/test_somename.py` in the `tests/` 101 | directory (or `tests/module_name/test_somename.py` file for a submodule). 102 | 103 | - For simple tests, prefer inline functions in the original code file below a `## Tests` 104 | comment. This keeps the tests easy to maintain and close to the code. 105 | Inline tests should NOT import pytest or pytest fixtures as we do not want runtime 106 | dependency on pytest. 107 | 108 | - DO NOT write one-off test code in extra files that are throwaway. 109 | 110 | - DO NOT put `if __name__ == "__main__":` just for quick testing. 111 | Instead use the inline function tests and run them with `uv run pytest`. 112 | 113 | - You can run such individual tests with `uv run pytest -s src/.../path/to/test` 114 | 115 | - Don’t add docs to assertions unless it’s not obvious what they’re checking - the 116 | assertion appears in the stack trace. 117 | Do NOT write `assert x == 5, "x should be 5"`. Do NOT write `assert x == 5 # Check if 118 | x is 5`. That is redundant. 119 | Just write `assert x == 5`. 120 | 121 | - DO NOT write trivial or obvious tests that are evident directly from code, such as 122 | assertions that confirm the value of a constant setting. 123 | 124 | - NEVER write `assert False`. If a test reaches an unexpected branch and must fail 125 | explicitly, `raise AssertionError("Some explanation")` instead. 126 | This is best typical best practice in Python since assertions can be removed with 127 | optimization. 
128 | 129 | - DO NOT use pytest fixtures like parameterized tests or expected exception decorators 130 | unless absolutely necessary in more complex tests. 131 | It is typically simpler to use simple assertions and put the checks inside the test. 132 | This is also preferable because then simple tests have no explicit pytest dependencies 133 | and can be placed in code anywhere. 134 | 135 | - DO NOT write trivial tests that test something we know already works, like 136 | instantiating a Pydantic object. 137 | 138 | ```python 139 | class Link(BaseModel): 140 | url: str 141 | title: str = None 142 | 143 | # DO NOT write tests like this. They are trivial and only create clutter! 144 | def test_link_model(): 145 | link = Link(url="https://example.com", title="Example") 146 | assert link.url == "https://example.com" 147 | assert link.title == "Example" 148 | ``` 149 | 150 | ## Types and Type Annotations 151 | 152 | - Use modern union syntax: `str | None` instead of `Optional[str]`, `dict[str]` instead 153 | of `Dict[str]`, `list[str]` instead of `List[str]`, etc. 154 | 155 | - Never use/import `Optional` for new code. 156 | 157 | - Use modern enums like `StrEnum` if appropriate. 158 | 159 | - One exception to common practice on enums: If an enum has many values that are 160 | strings, and they have a literal value as a string (like in a JSON protocol), it’s 161 | fine to use lower_snake_case for enum values to match the actual value. 162 | This is more readable than LONG_ALL_CAPS_VALUES, and you can simply set the value to 163 | be the same as the name for each. 164 | For example: 165 | ```python 166 | class MediaType(Enum): 167 | """ 168 | Media types. For broad categories only, to determine what processing 169 | is possible. 170 | """ 171 | 172 | text = "text" 173 | image = "image" 174 | audio = "audio" 175 | video = "video" 176 | webpage = "webpage" 177 | binary = "binary" 178 | ``` 179 | 180 | ## Guidelines for Literal Strings 181 | 182 | - For multi-line strings NEVER put multi-line strings flush against the left margin. 183 | ALWAYS use a `dedent()` function to make it more readable. 184 | You may wish to add a `strip()` as well. 185 | Example: 186 | ```python 187 | from textwrap import dedent 188 | markdown_content = dedent(""" 189 | # Title 1 190 | Some text. 191 | ## Subtitle 1.1 192 | More text. 193 | """).strip() 194 | ``` 195 | 196 | ## Guidelines for Comments 197 | 198 | - Comments should be EXPLANATORY: Explain *WHY* something is done a certain way and not 199 | just *what* is done. 200 | 201 | - Comments should be CONCISE: Remove all extraneous words. 202 | 203 | - DO NOT use comments to state obvious things or repeat what is evident from the code. 204 | Here is an example of a comment that SHOULD BE REMOVED because it simply repeats the 205 | code, which is distracting and adds no value: 206 | ```python 207 | if self.failed == 0: 208 | # All successful 209 | return "All tasks finished successfully" 210 | ``` 211 | 212 | ## Guidelines for Docstrings 213 | 214 | - Here is an example of the correct style for docstrings: 215 | ```python 216 | def check_if_url( 217 | text: UnresolvedLocator, only_schemes: list[str] | None = None 218 | ) -> ParseResult | None: 219 | """ 220 | Convenience function to check if a string or Path is a URL and if so return 221 | the `urlparse.ParseResult`. 222 | 223 | Also returns false for Paths, so that it's easy to use local paths and URLs 224 | (`Locator`s) interchangeably. 
Can provide `HTTP_ONLY` or `HTTP_OR_FILE` to 225 | restrict to only certain schemes. 226 | """ 227 | # Function body 228 | 229 | def is_url(text: UnresolvedLocator, only_schemes: list[str] | None = None) -> bool: 230 | """ 231 | Check if a string is a URL. For convenience, also returns false for 232 | Paths, so that it's easy to use local paths and URLs interchangeably. 233 | """ 234 | return check_if_url(text, only_schemes) is not None 235 | ``` 236 | 237 | - Use concise pydoc strings with triple quotes on their own lines. 238 | 239 | - Use `backticks` around variable names and inline code excerpts. 240 | 241 | - Use plain fences (```) around code blocks inside of pydocs. 242 | 243 | - For classes with many methods, use a concise docstring on the class that explains all 244 | the common information, and avoid repeating the same information on every method. 245 | 246 | - Docstrings should provide context or as concisely as possible explain “why”, not 247 | obvious details evident from the class names, function names, parameter names, and 248 | type annotations. 249 | 250 | - Docstrings *should* mention any key rationale or pitfalls when using the class or 251 | function. 252 | 253 | - Avoid obvious or repetitive docstrings. 254 | Do NOT add pydocs that just repeat in English facts that are obvious from the function 255 | name, variable name, or types. 256 | That is silly and obvious and makes the code longer for no reason. 257 | 258 | - Do NOT list args and return values if they’re obvious. 259 | In the above examples, you do not need and `Arguments:` or `Returns:` section, since 260 | sections as it is obvious from context. 261 | do list these if there are many arguments and their meaning isn’t clear. 262 | If it returns a less obvious type like a tuple, do explain in the pydoc. 263 | 264 | - Exported/public variables, functions, or methods SHOULD have concise docstrings. 265 | Internal/local variables, functions, and methods DO NOT need docstrings unless their 266 | purpose is not obvious. 267 | 268 | ## General Clean Coding Practices 269 | 270 | - Avoid writing trivial wrapper functions. 271 | For example, when writing a class DO NOT blindly make delegation methods around public 272 | member variables. DO NOT write methods like this: 273 | ```python 274 | def reassemble(self) -> str: 275 | """Call the original reassemble method.""" 276 | return self.paragraph.reassemble() 277 | ``` 278 | In general, the user can just call the enclosed objects methods, reducing code bloat. 279 | 280 | - If a function does not use a parameter, but it should still be present, you can use `# 281 | pyright: ignore[reportUnusedParameter]` in a comment to suppress the linter warning. 282 | 283 | ## Guidelines for Backward Compatibility 284 | 285 | - When changing code in a library or general function, if a change to an API or library 286 | will break backward compatibility, MENTION THIS to the user. 287 | 288 | - DO NOT implement additional code for backward compatiblity (such as extra methods or 289 | variable aliases or comments about backward compatibility) UNLESS the user has 290 | confirmed that it is necessary. 
291 | -------------------------------------------------------------------------------- /src/chopdiff/docs/token_diffs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | from collections.abc import Callable 5 | from dataclasses import dataclass 6 | from enum import Enum 7 | from typing import TypeAlias 8 | 9 | import cydifflib as difflib 10 | from funlog import log_calls, tally_calls 11 | from typing_extensions import override 12 | 13 | from chopdiff.docs.text_doc import TextDoc 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | SYMBOL_SEP = "⎪" 19 | 20 | 21 | class OpType(Enum): 22 | EQUAL = "equal" 23 | INSERT = "insert" 24 | DELETE = "delete" 25 | REPLACE = "replace" 26 | 27 | def as_symbol(self): 28 | abbrev = { 29 | OpType.EQUAL: " ", 30 | OpType.INSERT: "+", 31 | OpType.DELETE: "-", 32 | OpType.REPLACE: "±", 33 | } 34 | return abbrev[self] 35 | 36 | def as_abbrev(self): 37 | abbrev = { 38 | OpType.EQUAL: "keep", 39 | OpType.INSERT: "add ", 40 | OpType.DELETE: "del ", 41 | OpType.REPLACE: "repl", 42 | } 43 | return abbrev[self] 44 | 45 | 46 | @dataclass(frozen=True) 47 | class DiffOp: 48 | action: OpType 49 | left: list[str] 50 | right: list[str] 51 | 52 | def __post_init__(self): 53 | if self.action == OpType.REPLACE: 54 | assert self.left and self.right 55 | elif self.action == OpType.EQUAL: 56 | assert self.left == self.right 57 | elif self.action == OpType.INSERT: 58 | assert not self.left 59 | elif self.action == OpType.DELETE: 60 | assert not self.right 61 | 62 | def left_str(self, show_toks: bool = True) -> str: 63 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks" 64 | if show_toks: 65 | s += f": - {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}" 66 | return s 67 | 68 | def right_str(self, show_toks: bool = True) -> str: 69 | s = f"{self.action.as_abbrev()} {len(self.right):4} toks" 70 | if show_toks: 71 | s += f": + {SYMBOL_SEP}{''.join(tok for tok in self.right)}{SYMBOL_SEP}" 72 | return s 73 | 74 | def equal_str(self, show_toks: bool = True) -> str: 75 | s = f"{self.action.as_abbrev()} {len(self.left):4} toks" 76 | if show_toks: 77 | s += f": {SYMBOL_SEP}{''.join(tok for tok in self.left)}{SYMBOL_SEP}" 78 | return s 79 | 80 | def all_changed(self) -> list[str]: 81 | return [] if self.action == OpType.EQUAL else self.left + self.right 82 | 83 | 84 | @dataclass(frozen=True) 85 | class DiffStats: 86 | added: int 87 | removed: int 88 | input_size: int 89 | 90 | def nchanges(self) -> int: 91 | return self.added + self.removed 92 | 93 | @override 94 | def __str__(self): 95 | return f"add/remove +{self.added}/-{self.removed} out of {self.input_size} total" 96 | 97 | 98 | DiffFilter: TypeAlias = Callable[[DiffOp], bool] 99 | 100 | DIFF_FILTER_NONE: DiffFilter = lambda op: True 101 | """ 102 | Diff filter that accepts all diff operations. 103 | """ 104 | 105 | 106 | @dataclass 107 | class TokenDiff: 108 | """ 109 | A diff of two texts as a sequence of EQUAL, INSERT, and DELETE operations on wordtoks. 
110 | """ 111 | 112 | ops: list[DiffOp] 113 | 114 | def left_size(self) -> int: 115 | return sum(len(op.left) for op in self.ops) 116 | 117 | def right_size(self) -> int: 118 | return sum(len(op.right) for op in self.ops) 119 | 120 | def changes(self) -> list[DiffOp]: 121 | return [op for op in self.ops if op.action != OpType.EQUAL] 122 | 123 | def stats(self) -> DiffStats: 124 | wordtoks_added = sum(len(op.right) for op in self.ops if op.action != OpType.EQUAL) 125 | wordtoks_removed = sum(len(op.left) for op in self.ops if op.action != OpType.EQUAL) 126 | return DiffStats(wordtoks_added, wordtoks_removed, self.left_size()) 127 | 128 | def apply_to(self, original_wordtoks: list[str]) -> list[str]: 129 | """ 130 | Apply a complete diff (including equality ops) to a list of wordtoks. 131 | """ 132 | result: list[str] = [] 133 | original_index = 0 134 | 135 | if len(original_wordtoks) != self.left_size(): 136 | raise AssertionError( 137 | f"Diff should be complete: original wordtoks length {len(original_wordtoks)} != diff length {self.left_size()}" 138 | ) 139 | 140 | for op in self.ops: 141 | if op.left: 142 | original_index += len(op.left) 143 | if op.right: 144 | result.extend(op.right) 145 | 146 | return result 147 | 148 | def filter(self, accept_fn: DiffFilter | None) -> tuple[TokenDiff, TokenDiff]: 149 | """ 150 | Return two diffs, one that only has accepted operations and one that only has 151 | rejected operations. 152 | """ 153 | if not accept_fn: 154 | accept_fn = DIFF_FILTER_NONE 155 | 156 | accepted_ops: list[DiffOp] = [] 157 | rejected_ops: list[DiffOp] = [] 158 | 159 | for op in self.ops: 160 | if op.action == OpType.EQUAL: 161 | # For equal ops, all tokens are both accepted and rejected. 162 | accepted_ops.append(op) 163 | rejected_ops.append(op) 164 | else: 165 | # We accept or reject the DiffOp as a whole, not token by token, since token by 166 | # token would give odd results, like deleting words but leaving whitespace. 
167 | if accept_fn(op): 168 | accepted_ops.append(op) 169 | rejected_ops.append(DiffOp(OpType.EQUAL, op.left, op.left)) 170 | else: 171 | accepted_ops.append(DiffOp(OpType.EQUAL, op.left, op.left)) 172 | rejected_ops.append(op) 173 | 174 | assert len(accepted_ops) == len(self.ops) 175 | assert len(accepted_ops) == len(rejected_ops) 176 | 177 | accepted_diff, rejected_diff = TokenDiff(accepted_ops), TokenDiff(rejected_ops) 178 | 179 | assert accepted_diff.left_size() == self.left_size() 180 | assert rejected_diff.left_size() == self.left_size() 181 | 182 | return accepted_diff, rejected_diff 183 | 184 | def _diff_lines(self, include_equal: bool = False) -> list[str]: 185 | if len(self.ops) == 0: 186 | return ["(No changes)"] 187 | 188 | pos = 0 189 | lines: list[str] = [] 190 | for op in self.ops: 191 | if op.action == OpType.EQUAL: 192 | if include_equal: 193 | lines.append(f"at pos {pos:4} {op.equal_str()}") 194 | elif op.action == OpType.INSERT: 195 | lines.append(f"at pos {pos:4} {op.right_str()}") 196 | elif op.action == OpType.DELETE: 197 | lines.append(f"at pos {pos:4} {op.left_str()}") 198 | elif op.action == OpType.REPLACE: 199 | lines.append(f"at pos {pos:4} {op.left_str()}") 200 | lines.append(f" {'':4} {op.right_str()}") 201 | 202 | pos += len(op.left) 203 | return lines 204 | 205 | def as_diff_str(self, include_equal: bool = True) -> str: 206 | diff_str = "\n".join(self._diff_lines(include_equal=include_equal)) 207 | return f"TextDiff: {self.stats()}:\n{diff_str}" 208 | 209 | @override 210 | def __str__(self): 211 | return self.as_diff_str() 212 | 213 | 214 | def diff_docs(doc1: TextDoc, doc2: TextDoc) -> TokenDiff: 215 | """ 216 | Calculate the LCS-style diff between two documents based on words. 217 | """ 218 | 219 | diff = diff_wordtoks(list(doc1.as_wordtoks()), list(doc2.as_wordtoks())) 220 | 221 | # log.save_object("doc1 wordtoks", "diff_docs", "\n".join(list(doc1.as_wordtoks()))) 222 | # log.save_object("doc2 wordtoks", "diff_docs", "\n".join(list(doc2.as_wordtoks()))) 223 | # log.save_object("diff", "diff_docs", diff) 224 | 225 | return diff 226 | 227 | 228 | @tally_calls(level="warning", min_total_runtime=5) 229 | def diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> TokenDiff: 230 | """ 231 | Perform an LCS-style diff on two lists of wordtoks. 
232 | """ 233 | s = difflib.SequenceMatcher(None, wordtoks1, wordtoks2, autojunk=False) # pyright: ignore 234 | diff: list[DiffOp] = [] 235 | 236 | # log.message(f"Diffing {len(wordtoks1)} wordtoks against {len(wordtoks2)} wordtoks") 237 | # log.save_object("wordtoks1", "diff_wordtoks", "".join(wordtoks1)) 238 | # log.save_object("wordtoks2", "diff_wordtoks", "".join(wordtoks2)) 239 | # log.save_object("diff opcodes", "diff_wordtoks", "\n".join(str(o) for o in s.get_opcodes())) 240 | 241 | for tag, i1, i2, j1, j2 in s.get_opcodes(): # pyright: ignore 242 | if tag == "equal": 243 | slice1 = wordtoks1[i1:i2] 244 | assert slice1 == wordtoks2[j1:j2] 245 | diff.append(DiffOp(OpType.EQUAL, slice1, slice1)) 246 | elif tag == "insert": 247 | diff.append(DiffOp(OpType.INSERT, [], wordtoks2[j1:j2])) 248 | elif tag == "delete": 249 | diff.append(DiffOp(OpType.DELETE, wordtoks1[i1:i2], [])) 250 | elif tag == "replace": 251 | diff.append(DiffOp(OpType.REPLACE, wordtoks1[i1:i2], wordtoks2[j1:j2])) 252 | 253 | return TokenDiff(diff) 254 | 255 | 256 | ScoredDiff: TypeAlias = tuple[float, TokenDiff] 257 | 258 | 259 | def scored_diff_wordtoks(wordtoks1: list[str], wordtoks2: list[str]) -> ScoredDiff: 260 | """ 261 | Calculate the number of wordtoks added and removed between two lists of tokens. 262 | Score is (wordtoks_added + wordtoks_removed) / min(len(doc1), len(doc2)), 263 | which is 0 for identical docs. 264 | """ 265 | 266 | if len(wordtoks1) == 0 or len(wordtoks2) == 0: 267 | raise ValueError("Cannot score diff for empty documents") 268 | 269 | diff = diff_wordtoks(wordtoks1, wordtoks2) 270 | score = float(diff.stats().nchanges()) / min(len(wordtoks1), len(wordtoks2)) 271 | return score, diff 272 | 273 | 274 | @log_calls(level="message", if_slower_than=0.25) 275 | def find_best_alignment( 276 | list1: list[str], 277 | list2: list[str], 278 | min_overlap: int, 279 | max_overlap: int | None = None, 280 | scored_diff: Callable[[list[str], list[str]], ScoredDiff] = scored_diff_wordtoks, 281 | give_up_score: float = 0.75, 282 | give_up_count: int = 30, 283 | ) -> tuple[int, ScoredDiff]: 284 | """ 285 | Find the best alignment of two lists of values, where edit distance is smallest but overlap is 286 | at least min_overlap and at most max_overlap. Returns offset into list1 and diff object. 287 | """ 288 | len1, len2 = len(list1), len(list2) 289 | best_offset = -1 290 | best_score = float("inf") 291 | best_diff = None 292 | max_overlap = min(len1, len2, max_overlap) if max_overlap is not None else min(len1, len2) 293 | 294 | if min_overlap > len1 or min_overlap > len2: 295 | raise ValueError( 296 | f"Minimum overlap {min_overlap} should never exceed the length of one of the lists ({len1}, {len2})" 297 | ) 298 | 299 | log.info( 300 | "Finding best alignment: List lengths: lengths %s and %s with overlap of %s to %s", 301 | len1, 302 | len2, 303 | min_overlap, 304 | max_overlap, 305 | ) 306 | 307 | # To make this a bit more efficient we check if we have a run of increasing scores and 308 | # give up if we have many in a row. 309 | scores_increasing = 0 310 | prev_score = float("-inf") 311 | 312 | # Slide the second list over the first list, starting from the end of the first list. 
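    # Each candidate overlap compares the last `overlap` wordtoks of list1 against the
    # first `overlap` wordtoks of list2. For example, with len1=500, len2=300, and
    # min_overlap=10 (and no max_overlap given), the first iteration diffs list1[490:500]
    # against list2[0:10] and the last possible iteration diffs list1[200:500] against
    # list2[0:300].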
313 | # TODO: This could be much more efficient by being cleverer about reusing diff calculations.s 314 | for overlap in range(min_overlap, max_overlap + 1): 315 | start1 = len1 - overlap 316 | end1 = len1 317 | start2 = 0 318 | end2 = overlap 319 | 320 | score, diff = scored_diff(list1[start1:end1], list2[start2:end2]) 321 | 322 | log.info("Offset %s: Overlap %s: Score %f", start1, overlap, score) 323 | 324 | if score < best_score: 325 | best_score = score 326 | best_offset = start1 327 | best_diff = diff 328 | scores_increasing = 0 329 | elif score >= give_up_score and score >= prev_score: 330 | scores_increasing += 1 331 | if scores_increasing >= give_up_count: 332 | log.info( 333 | "Giving up after %s increasing scores, last score %s > %s", 334 | give_up_count, 335 | score, 336 | give_up_score, 337 | ) 338 | break 339 | 340 | prev_score = score 341 | 342 | if best_diff is None: 343 | raise ValueError("No alignment found") 344 | 345 | return best_offset, (best_score, best_diff) 346 | -------------------------------------------------------------------------------- /tests/docs/test_text_doc.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from textwrap import dedent 3 | 4 | import regex 5 | from prettyfmt import fmt_words 6 | from strif import abbrev_str 7 | 8 | from chopdiff.docs.sizes import TextUnit 9 | from chopdiff.docs.text_doc import SentIndex, TextDoc 10 | from chopdiff.docs.wordtoks import ( 11 | PARA_BR_TOK, 12 | is_break_or_space, 13 | is_entity, 14 | is_header_tag, 15 | is_number, 16 | is_tag, 17 | is_word, 18 | join_wordtoks, 19 | visualize_wordtoks, 20 | wordtok_len, 21 | wordtokenize, 22 | ) 23 | 24 | _med_test_doc = dedent( 25 | """ 26 | # Title 27 | 28 | Hello World. This is an example sentence. And here's another one! 29 | 30 | ## Subtitle 31 | 32 | This is a new paragraph. 33 | It has several sentences. 34 | There may be line breaks within a paragraph, but these should not affect handlingof the paragraph. 35 | There are also [links](http://www.google.com) and **bold** and *italic* text. 36 | 37 | ### Itemized List 38 | 39 | - Item 1 40 | 41 | - Item 2 42 | 43 | - Item 3 44 | 45 | ### Numbered List 46 | 47 | 1. Item 1 48 | 49 | 2. Item 2 50 | 51 | 3. Item 3 52 | 53 | Testing some embedded HTML tags. 54 | 55 |

<h3>An HTML header</h3>

56 | 57 | 58 | 59 | ⏱️05:52  63 | 64 | """ 65 | ).strip() 66 | 67 | 68 | def test_document_parse_reassemble(): 69 | text = _med_test_doc 70 | doc = TextDoc.from_text(text) 71 | 72 | print("\n---Original:") 73 | pprint(text) 74 | print("\n---Parsed:") 75 | pprint(doc) 76 | 77 | reassembled_text = doc.reassemble() 78 | 79 | # Should be exactly the same except for within-paragraph line breaks. 80 | def normalize(text: str) -> str: 81 | return regex.sub(r"\s+", " ", text.replace("\n\n", "")) 82 | 83 | assert normalize(reassembled_text) == normalize(text) 84 | 85 | # Check offset of a paragraph towards the end of the document. 86 | last_para = doc.paragraphs[-1] 87 | last_para_char_offset = text.rindex(last_para.original_text) 88 | assert last_para.char_offset == last_para_char_offset 89 | 90 | 91 | def test_markup_detection(): 92 | text = _med_test_doc 93 | doc = TextDoc.from_text(text) 94 | 95 | print("Paragraph markup and header detection:") 96 | result: list[str] = [] 97 | for para in doc.paragraphs: 98 | result.append( 99 | fmt_words( 100 | abbrev_str(para.original_text, 10), 101 | "is_markup" if para.is_markup() else "", 102 | "is_header" if para.is_header() else "", 103 | ) 104 | ) 105 | 106 | print("\n".join(result)) 107 | assert ( 108 | "\n".join(result) 109 | == dedent( 110 | """ 111 | # Title is_header 112 | Hello Wor… 113 | ## Subtit… is_header 114 | This is a… 115 | ### Itemi… is_header 116 | - Item 1 117 | - Item 2 118 | - Item 3 119 | ### Numbe… is_header 120 | 1. Item 1 121 | 2. Item 2 122 | 3. Item 3 123 | Testing s… 124 |

An HT… is_header 125 | " 170 | assert doc.paragraphs[-2].is_markup() 171 | assert doc.paragraphs[-1].sentences[-1].is_markup() 172 | 173 | 174 | _simple_test_doc = dedent( 175 | """ 176 | This is the first paragraph. It has multiple sentences. 177 | 178 | This is the second paragraph. It also has multiple sentences. And it continues. 179 | 180 | Here is the third paragraph. More sentences follow. And here is another one. 181 | """ 182 | ).strip() 183 | 184 | 185 | def test_doc_sizes(): 186 | text = _med_test_doc 187 | doc = TextDoc.from_text(text) 188 | print("\n---Sizes:") 189 | size_summary = doc.size_summary() 190 | print(size_summary) 191 | 192 | assert size_summary == "726 bytes (37 lines, 16 paras, 20 sents, 82 words, 215 tiktoks)" 193 | 194 | 195 | def test_seek_doc(): 196 | doc = TextDoc.from_text(_simple_test_doc) 197 | 198 | offset = 1 199 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 200 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 201 | assert sent_index == SentIndex(para_index=0, sent_index=0) 202 | assert sent_offset == 0 203 | 204 | offset = len("This is the first paragraph.") 205 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 206 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 207 | assert sent_index == SentIndex(para_index=0, sent_index=0) 208 | assert sent_offset == 0 209 | 210 | offset = len("This is the first paragraph. ") 211 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 212 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 213 | assert sent_index == SentIndex(para_index=0, sent_index=1) 214 | assert sent_offset == offset 215 | 216 | offset = len( 217 | "This is the first paragraph. It has multiple sentences.\n\nThis is the second paragraph." 218 | ) 219 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 220 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 221 | assert sent_index == SentIndex(para_index=1, sent_index=0) 222 | assert sent_offset == len("This is the first paragraph. It has multiple sentences.\n\n") 223 | 224 | offset = len(_simple_test_doc) + 10 225 | sent_index, sent_offset = doc.seek_to_sent(offset, TextUnit.bytes) 226 | print(f"Seeked to {sent_index} offset {sent_offset} for offset {offset} bytes") 227 | assert sent_index == SentIndex(para_index=2, sent_index=2) 228 | 229 | 230 | _short_test_doc = dedent( 231 | """ 232 | Paragraph one lorem ipsum. 233 | Sentence 1a lorem ipsum. Sentence 1b lorem ipsum. Sentence 1c lorem ipsum. 234 | 235 | Paragraph two lorem ipsum. Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum. 236 | 237 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum. Sentence 3b lorem ipsum. Sentence 3c lorem ipsum. 238 | """ 239 | ).strip() 240 | 241 | 242 | def test_sub_doc(): 243 | doc = TextDoc.from_text(_short_test_doc) 244 | 245 | sub_doc_start = SentIndex(1, 1) 246 | sub_doc_end = SentIndex(2, 1) 247 | sub_doc = doc.sub_doc(sub_doc_start, sub_doc_end) 248 | 249 | expected_text = dedent( 250 | """ 251 | Sentence 2a lorem ipsum. Sentence 2b lorem ipsum. Sentence 2c lorem ipsum. 252 | 253 | Paragraph three lorem ipsum. Sentence 3a lorem ipsum. 254 | """ 255 | ).strip() 256 | expected_sub_doc = TextDoc.from_text(expected_text) 257 | 258 | print("---Original:") 259 | pprint(doc) 260 | print("---Subdoc:") 261 | pprint(sub_doc) 262 | 263 | # Confirm reassembled text is correct. 
264 | assert sub_doc.reassemble() == expected_sub_doc.reassemble() 265 | 266 | # Confirm sentences and offsets are preserved in sub-doc. 267 | orig_sentences = [sent for _index, sent in doc.sent_iter()] 268 | sub_sentences = [sent for _index, sent in sub_doc.sent_iter()] 269 | assert orig_sentences[5:10] == sub_sentences 270 | 271 | # Confirm indexing and reverse iteration. 272 | assert doc.sub_doc(SentIndex(0, 0), None) == doc 273 | reversed_sentences = [sent for _index, sent in doc.sent_iter(reverse=True)] 274 | assert reversed_sentences == list(reversed(orig_sentences)) 275 | 276 | 277 | def test_tokenization(): 278 | doc = TextDoc.from_text(_short_test_doc) 279 | wordtoks = list(doc.as_wordtoks()) 280 | 281 | print("\n---Tokens:") 282 | pprint(wordtoks) 283 | 284 | assert wordtoks[:6] == ["Paragraph", " ", "one", " ", "lorem", " "] 285 | assert wordtoks[-7:] == [" ", "3c", " ", "lorem", " ", "ipsum", "."] 286 | assert wordtoks.count(PARA_BR_TOK) == 2 287 | assert join_wordtoks(wordtoks) == _short_test_doc.replace( 288 | "\n", " ", 1 289 | ) # First \n is not a para break. 290 | 291 | 292 | def test_wordtok_mappings(): 293 | doc = TextDoc.from_text(_short_test_doc) 294 | 295 | print("\n---Mapping:") 296 | wordtok_mapping, sent_mapping = doc.wordtok_mappings() 297 | pprint(wordtok_mapping) 298 | pprint(sent_mapping) 299 | 300 | assert wordtok_mapping[0] == SentIndex(0, 0) 301 | assert wordtok_mapping[1] == SentIndex(0, 0) 302 | assert wordtok_mapping[4] == SentIndex(0, 0) 303 | assert wordtok_mapping[9] == SentIndex(0, 1) 304 | 305 | assert sent_mapping[SentIndex(0, 0)] == [0, 1, 2, 3, 4, 5, 6, 7, 8] 306 | assert sent_mapping[SentIndex(2, 3)] == [99, 100, 101, 102, 103, 104, 105, 106] 307 | 308 | 309 | _sentence_tests = [ 310 | "Hello, world!", 311 | "This is an example sentence with punctuation.", 312 | "And here's another one!", 313 | "Special characters: @#%^&*()", 314 | ] 315 | 316 | _sentence_test_html = 'This is a test.' 
317 | 318 | 319 | def test_wordtokization(): 320 | for sentence in _sentence_tests: 321 | wordtoks = wordtokenize(sentence) 322 | reassembled_sentence = "".join(wordtoks) 323 | assert reassembled_sentence == sentence 324 | 325 | assert wordtokenize("Multiple spaces and tabs\tand\nnewlines in between.") == [ 326 | "Multiple", 327 | " ", 328 | "spaces", 329 | " ", 330 | "and", 331 | " ", 332 | "tabs", 333 | " ", 334 | "and", 335 | " ", 336 | "newlines", 337 | " ", 338 | "in", 339 | " ", 340 | "between", 341 | ".", 342 | ] 343 | assert wordtokenize("") == [] 344 | assert wordtokenize(" ") == [" "] 345 | 346 | assert wordtokenize(_sentence_test_html) == [ 347 | "This", 348 | " ", 349 | "is", 350 | " ", 351 | '', 352 | "a", 353 | " ", 354 | "test", 355 | "", 356 | ".", 357 | ] 358 | 359 | assert len(_sentence_test_html) == sum( 360 | wordtok_len(wordtok) for wordtok in wordtokenize(_sentence_test_html) 361 | ) 362 | 363 | 364 | def test_html_tokenization(): 365 | doc = TextDoc.from_text(_sentence_test_html) 366 | wordtoks = list(doc.as_wordtoks()) 367 | 368 | print("\n---HTML Tokens:") 369 | pprint(wordtoks) 370 | 371 | assert wordtoks == [ 372 | "This", 373 | " ", 374 | "is", 375 | " ", 376 | '', 377 | "a", 378 | " ", 379 | "test", 380 | "", 381 | ".", 382 | ] 383 | assert list(map(is_tag, wordtoks)) == [ 384 | False, 385 | False, 386 | False, 387 | False, 388 | True, 389 | False, 390 | False, 391 | False, 392 | True, 393 | False, 394 | ] 395 | assert list(map(is_break_or_space, wordtoks)) == [ 396 | False, 397 | True, 398 | False, 399 | True, 400 | False, 401 | False, 402 | True, 403 | False, 404 | False, 405 | False, 406 | ] 407 | 408 | 409 | def test_tiktoken_len(): 410 | doc = TextDoc.from_text(_med_test_doc) 411 | 412 | len = doc.size(TextUnit.tiktokens) 413 | print("--Tiktoken len:") 414 | print(len) 415 | 416 | assert len > 100 417 | 418 | 419 | def test_is_footnote_def_detection(): 420 | doc = TextDoc.from_text( 421 | dedent( 422 | """ 423 | Title. 424 | 425 | Body with a ref[^a1]. 426 | 427 | [^a1]: The definition line 428 | """ 429 | ).strip() 430 | ) 431 | 432 | assert len(doc.paragraphs) == 3 433 | assert not doc.paragraphs[0].is_footnote_def() 434 | assert not doc.paragraphs[1].is_footnote_def() 435 | assert doc.paragraphs[2].is_footnote_def() 436 | -------------------------------------------------------------------------------- /src/chopdiff/html/html_in_md.py: -------------------------------------------------------------------------------- 1 | """ 2 | Formatting of Markdown with a small set of known HTML classes. We do this directly 3 | ourselves to keep the HTML very minimal, control whitespace, and to avoid any 4 | confusions of using full HTML escaping (like unnecessary "s etc.) 5 | 6 | Perhaps worth using FastHTML for this? 7 | """ 8 | 9 | import re 10 | from collections.abc import Callable 11 | from typing import TypeAlias 12 | 13 | 14 | def escape_md_html(s: str, safe: bool = False) -> str: 15 | """ 16 | Escape a string for Markdown with HTML. Don't escape single and double quotes. 17 | """ 18 | if safe: 19 | return s 20 | s = s.replace("&", "&") 21 | s = s.replace("<", "<") 22 | s = s.replace(">", ">") 23 | return s 24 | 25 | 26 | def escape_attribute(s: str) -> str: 27 | """ 28 | Escape a string for use as an HTML attribute. Escape single and double quotes. 
29 |     """
30 |     s = escape_md_html(s)
31 |     s = s.replace('"', "&quot;")
32 |     s = s.replace("'", "&#x27;")
33 |     return s
34 | 
35 | 
36 | ClassNames = str | list[str] | None
37 | Attrs = dict[str, str | bool]
38 | 
39 | _TAGS_WITH_PADDING = ["div", "p"]
40 | 
41 | 
42 | def tag_with_attrs(
43 |     tag: str,
44 |     text: str | None,
45 |     class_name: ClassNames = None,
46 |     *,
47 |     attrs: Attrs | None = None,
48 |     safe: bool = False,
49 |     padding: str | None = None,
50 | ) -> str:
51 |     """
52 |     Create an HTML tag with optional class names and attributes.
53 |     Boolean attribute values: True includes the attribute, False omits it.
54 |     """
55 |     class_value = ""
56 |     if class_name is not None:
57 |         if isinstance(class_name, str):
58 |             class_value = class_name.strip()
59 |         else:  # list[str]
60 |             # Filter out empty strings and join
61 |             filtered_classes = [cls for cls in class_name if cls.strip()]
62 |             class_value = " ".join(filtered_classes)
63 | 
64 |     attr_str = f' class="{escape_attribute(class_value)}"' if class_value else ""
65 |     if attrs:
66 |         for k, v in attrs.items():
67 |             if isinstance(v, bool):
68 |                 if v:  # Only include attribute if True
69 |                     attr_str += f" {k}"
70 |             else:  # string value
71 |                 attr_str += f' {k}="{escape_attribute(v)}"'
72 |     # Default padding for div and p tags.
73 |     if text is None:
74 |         return f"<{tag}{attr_str} />"
75 |     else:
76 |         content = escape_md_html(text, safe)
77 |         if padding is None:
78 |             padding = "\n" if tag in _TAGS_WITH_PADDING else ""
79 |         if padding:
80 |             content = content.strip("\n")
81 |             if not content:
82 |                 padding = ""
83 |         return f"<{tag}{attr_str}>{padding}{content}{padding}</{tag}>"
84 | 
85 | 
86 | def html_span(
87 |     text: str,
88 |     class_name: ClassNames = None,
89 |     *,
90 |     attrs: Attrs | None = None,
91 |     safe: bool = False,
92 | ) -> str:
93 |     """
94 |     Write a span tag for use in Markdown, with the given text and optional class and attributes.
95 |     """
96 |     return tag_with_attrs("span", text, class_name, attrs=attrs, safe=safe)
97 | 
98 | 
99 | def html_div(
100 |     text: str,
101 |     class_name: ClassNames = None,
102 |     *,
103 |     attrs: Attrs | None = None,
104 |     safe: bool = False,
105 |     padding: str | None = None,
106 | ) -> str:
107 |     """
108 |     Write a div tag for use in Markdown, with the given text and optional class and attributes.
109 |     """
110 |     return tag_with_attrs("div", text, class_name, attrs=attrs, safe=safe, padding=padding)
111 | 
112 | 
113 | def html_a(
114 |     text: str,
115 |     href: str,
116 |     class_name: ClassNames = None,
117 |     *,
118 |     attrs: Attrs | None = None,
119 |     safe: bool = False,
120 | ) -> str:
121 |     """
122 |     Write an anchor tag with href, optional class and attributes.
123 |     """
124 |     link_attrs: Attrs = {"href": href}
125 |     if attrs:
126 |         link_attrs.update(attrs)
127 |     return tag_with_attrs("a", text, class_name, attrs=link_attrs, safe=safe)
128 | 
129 | 
130 | def html_b(
131 |     text: str,
132 |     class_name: ClassNames = None,
133 |     *,
134 |     attrs: Attrs | None = None,
135 |     safe: bool = False,
136 | ) -> str:
137 |     """
138 |     Write a bold tag with optional class and attributes.
139 |     """
140 |     return tag_with_attrs("b", text, class_name, attrs=attrs, safe=safe)
141 | 
142 | 
143 | def html_i(
144 |     text: str,
145 |     class_name: ClassNames = None,
146 |     *,
147 |     attrs: Attrs | None = None,
148 |     safe: bool = False,
149 | ) -> str:
150 |     """
151 |     Write an italic tag with optional class and attributes.
152 | """ 153 | return tag_with_attrs("i", text, class_name, attrs=attrs, safe=safe) 154 | 155 | 156 | def html_img( 157 | src: str, 158 | alt: str, 159 | class_name: ClassNames = None, 160 | *, 161 | attrs: Attrs | None = None, 162 | safe: bool = False, 163 | ) -> str: 164 | img_attrs: Attrs = {"src": src, "alt": alt} 165 | if attrs: 166 | for k, v in attrs.items(): 167 | img_attrs[k] = v 168 | return tag_with_attrs("img", None, class_name, attrs=img_attrs, safe=safe) 169 | 170 | 171 | def html_join_blocks(*blocks: str | None) -> str: 172 | """ 173 | Join block elements, with double newlines for better Markdown compatibility. 174 | Ignore empty strings or None. 175 | """ 176 | return "\n\n".join(block.strip("\n") for block in blocks if block) 177 | 178 | 179 | def md_para(text: str) -> str: 180 | """ 181 | Add double newlines to the start and end of the text to make it a paragraph. 182 | """ 183 | return "\n\n".join(text.split("\n")) 184 | 185 | 186 | Wrapper: TypeAlias = Callable[[str], str] 187 | """Wraps a string to identify it in some way.""" 188 | 189 | 190 | def identity_wrapper(text: str) -> str: 191 | return text 192 | 193 | 194 | def _check_class_name(class_name: ClassNames) -> None: 195 | if class_name: 196 | if isinstance(class_name, str): 197 | # Allow modern CSS class naming including BEM notation (block__element--modifier) 198 | if class_name.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", class_name): 199 | raise ValueError(f"Expected a valid CSS class name but got: '{class_name}'") 200 | else: # list[str] 201 | for cls in class_name: 202 | if cls.strip() and not re.match(r"^[a-zA-Z_][\w_-]*$", cls): 203 | raise ValueError(f"Expected a valid CSS class name but got: '{cls}'") 204 | 205 | 206 | def html_p( 207 | text: str, 208 | class_name: ClassNames = None, 209 | *, 210 | attrs: Attrs | None = None, 211 | safe: bool = False, 212 | padding: str | None = None, 213 | ) -> str: 214 | """ 215 | Write a p tag for use in Markdown, with the given text and optional class and attributes. 216 | """ 217 | return tag_with_attrs("p", text, class_name, attrs=attrs, safe=safe, padding=padding) 218 | 219 | 220 | def html_tag( 221 | tag: str, 222 | text: str | None = None, 223 | class_name: ClassNames = None, 224 | *, 225 | attrs: Attrs | None = None, 226 | safe: bool = False, 227 | padding: str | None = None, 228 | ) -> str: 229 | """ 230 | Generic function to create any HTML tag with optional class and attributes. 
231 |     """
232 |     return tag_with_attrs(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
233 | 
234 | 
235 | def div_wrapper(
236 |     class_name: ClassNames = None,
237 |     *,
238 |     attrs: Attrs | None = None,
239 |     safe: bool = True,
240 |     padding: str | None = "\n\n",
241 | ) -> Wrapper:
242 |     _check_class_name(class_name)
243 | 
244 |     def div_wrapper_func(text: str) -> str:
245 |         return html_div(text, class_name, attrs=attrs, safe=safe, padding=padding)
246 | 
247 |     return div_wrapper_func
248 | 
249 | 
250 | def span_wrapper(
251 |     class_name: ClassNames = None,
252 |     *,
253 |     attrs: Attrs | None = None,
254 |     safe: bool = True,
255 | ) -> Wrapper:
256 |     _check_class_name(class_name)
257 | 
258 |     def span_wrapper_func(text: str) -> str:
259 |         return html_span(text, class_name, attrs=attrs, safe=safe)
260 | 
261 |     return span_wrapper_func
262 | 
263 | 
264 | def tag_wrapper(
265 |     tag: str,
266 |     class_name: ClassNames = None,
267 |     *,
268 |     attrs: Attrs | None = None,
269 |     safe: bool = True,
270 |     padding: str | None = None,
271 | ) -> Wrapper:
272 |     """
273 |     Generic wrapper factory for any HTML tag.
274 |     """
275 |     _check_class_name(class_name)
276 | 
277 |     def tag_wrapper_func(text: str) -> str:
278 |         return html_tag(tag, text, class_name, attrs=attrs, safe=safe, padding=padding)
279 | 
280 |     return tag_wrapper_func
281 | 
282 | 
283 | ## Tests
284 | 
285 | 
286 | def test_html():
287 |     assert escape_md_html("&<>") == "&amp;&lt;&gt;"
288 |     assert escape_attribute("\"'&<>") == "&quot;&#x27;&amp;&lt;&gt;"
289 |     assert (
290 |         tag_with_attrs("span", "text", class_name="foo", attrs={"id": "a"})
291 |         == '<span class="foo" id="a">text</span>'
292 |     )
293 |     assert (
294 |         html_span("text", class_name="foo", attrs={"id": "a"})
295 |         == '<span class="foo" id="a">text</span>'
296 |     )
297 |     assert (
298 |         html_div("text 1<2", class_name="foo", attrs={"id": "a"})
299 |         == '<div class="foo" id="a">\ntext 1&lt;2\n</div>'
300 |     )
301 |     assert html_div("text") == "<div>\ntext\n</div>"
302 | 
303 | 
304 | def test_boolean_attrs():
305 |     assert tag_with_attrs("input", None, attrs={"disabled": True}) == "<input disabled />"
306 |     assert tag_with_attrs("input", None, attrs={"disabled": False}) == "<input />"
307 |     assert (
308 |         tag_with_attrs("input", None, attrs={"disabled": True, "required": True, "id": "test"})
309 |         == '<input disabled required id="test" />'
310 |     )
311 |     assert (
312 |         tag_with_attrs("input", None, attrs={"disabled": False, "required": True})
313 |         == "<input required />"
314 |     )
315 | 
316 | 
317 | def test_class_names():
318 |     assert (
319 |         tag_with_attrs("div", "text", class_name=["foo", "bar"])
320 |         == '<div class="foo bar">\ntext\n</div>'
321 |     )
322 |     assert tag_with_attrs("span", "text", class_name="single") == '<span class="single">text</span>'
323 |     assert tag_with_attrs("span", "text", class_name=None) == "<span>text</span>"
324 |     assert tag_with_attrs("span", "text", class_name=[]) == "<span>text</span>"
325 |     assert tag_with_attrs("span", "text", class_name="") == "<span>text</span>"
326 |     assert tag_with_attrs("span", "text", class_name=["", ""]) == "<span>text</span>"
327 |     assert (
328 |         tag_with_attrs("span", "text", class_name=["foo", "", "bar"])
329 |         == '<span class="foo bar">text</span>'
330 |     )
331 | 
332 | 
333 | def test_padding():
334 |     assert tag_with_attrs("span", "text") == "<span>text</span>"
335 |     assert tag_with_attrs("div", "text") == "<div>\ntext\n</div>"
336 |     assert tag_with_attrs("p", "text") == "<p>\ntext\n</p>"
337 |     assert tag_with_attrs("div", "text", padding="") == "<div>text</div>"
338 |     assert tag_with_attrs("div", "", padding="\n") == "<div></div>"
339 | 
340 | 
341 | def test_safe_mode():
342 |     assert tag_with_attrs("div", "