├── .gitignore ├── CHANGELOG.md ├── .github └── workflows │ ├── test.yml │ └── release.yml ├── pyproject.toml ├── Makefile ├── CLAUDE.md ├── tests ├── test_enhanced_recovery.py └── test_sloppy_xml.py ├── README.md ├── logo.svg ├── LICENSE └── sloppy_xml.py /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | *.egg-info 3 | uv.lock 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## 0.3.0 6 | 7 | - Added entity resolution in attributes 8 | 9 | ## 0.2.1 10 | 11 | - Fixed readme reference 12 | 13 | ## 0.2.0 14 | 15 | - Added `tree` parameter to parsing functions for more flexible tree building 16 | - Made parse options internal for cleaner API 17 | - Converted `namedtuple` usage to `NamedTuple` for better type hints 18 | 19 | ## 0.1.0 20 | 21 | - Initial implementation of sloppy XML parser 22 | - Streaming XML parser with event-based architecture 23 | - Tree-building functionality with ElementTree support 24 | - Error recovery mechanisms for malformed XML 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.10", "3.11", "3.12"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Install uv 20 | uses: astral-sh/setup-uv@v4 21 | with: 22 | version: "latest" 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | run: uv python install ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: uv sync 
29 | 30 | - name: Run tests 31 | run: uv run pytest -v 32 | 33 | - name: Check code style 34 | run: uv run ruff check 35 | 36 | - name: Check formatting 37 | run: uv run ruff format --check -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Install uv 19 | uses: astral-sh/setup-uv@v4 20 | with: 21 | version: "latest" 22 | 23 | - name: Set up Python 24 | run: uv python install 3.12 25 | 26 | - name: Install dependencies 27 | run: uv sync 28 | 29 | - name: Build package 30 | run: uv build 31 | 32 | - name: Publish to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | 35 | - name: Create GitHub Release 36 | uses: softprops/action-gh-release@v2 37 | with: 38 | files: dist/* 39 | generate_release_notes: true -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sloppy-xml" 7 | version = "0.3.0" 8 | description = "A sloppy XML parser for Python designed to be used with LLMs" 9 | readme = "README.md" 10 | authors = [{name = "Armin Ronacher", email = "armin.ronacher@active-4.com"}] 11 | requires-python = ">=3.10" 12 | license = "Apache-2.0" 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "Operating System :: OS Independent", 16 | ] 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/mitsuhiko/sloppy-xml-py" 20 | Repository = "https://github.com/mitsuhiko/sloppy-xml-py" 21 | 22 | [tool.setuptools] 
23 | py-modules = ["sloppy_xml"] 24 | 25 | [project.optional-dependencies] 26 | lxml = [ 27 | "lxml>=4.6.0", 28 | ] 29 | 30 | [dependency-groups] 31 | dev = [ 32 | "pytest>=8.3.5", 33 | "ruff>=0.12.0", 34 | ] 35 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all format check format-check lint sync test build clean help 2 | 3 | all: sync check test 4 | 5 | # Format code automatically 6 | format: 7 | @uv run ruff format 8 | 9 | # Check code style and quality 10 | check: format-check lint 11 | 12 | # Check if code formatting is correct without changing files 13 | format-check: 14 | @uv run ruff format --check 15 | 16 | # Run linting 17 | lint: 18 | @uv run ruff check 19 | 20 | # Install/update dependencies and sync environment 21 | sync: 22 | @uv sync 23 | 24 | # Run all tests 25 | test: 26 | @uv run pytest 27 | 28 | # Build source distribution and wheel packages 29 | build: 30 | @uv build 31 | 32 | # Clean build artifacts 33 | clean: 34 | @rm -rf dist/ build/ *.egg-info/ 35 | 36 | # Show available targets 37 | help: 38 | @echo "Available targets:" 39 | @echo " format - Format code automatically" 40 | @echo " check - Check code style and quality" 41 | @echo " format-check - Check if code formatting is correct" 42 | @echo " lint - Run linting" 43 | @echo " sync - Install/update dependencies" 44 | @echo " test - Run all tests" 45 | @echo " build - Build packages" 46 | @echo " clean - Clean build artifacts" 47 | @echo " help - Show this help message" 48 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
4 | 5 | ## Project Overview 6 | 7 | This is a sloppy XML parser library for Python - a single-file XML parser designed to handle malformed XML gracefully while maintaining high performance through pre-compiled regular expressions. The library provides both streaming and tree-building XML parsing capabilities with robust error recovery mechanisms for handling malformed XML commonly generated by LLMs and other automated systems. 8 | 9 | ## Development Commands 10 | 11 | ### Package Management 12 | This project uses **uv** as the package manager: 13 | - `uv sync` - Install/update dependencies and sync the environment 14 | - `uv add <package>` - Add a new dependency 15 | - `uv add --dev <package>` - Add a dev only dependency 16 | - `uv remove <package>` - Remove a dependency 17 | - `uv remove --dev <package>` - Remove a dev only dependency 18 | - `uv run <command>` - Run commands in the project environment 19 | 20 | ### Testing 21 | - `uv run pytest` - Run all tests 22 | - `uv run pytest tests/test_sloppy_xml.py` - Run specific test file 23 | - `uv run pytest -v` - Run tests with verbose output 24 | - `uv run pytest tests/test_enhanced_recovery.py` - Run enhanced recovery tests 25 | 26 | ### Linting & Formatting 27 | - `uv run ruff check` - Check code style and quality 28 | - `uv run ruff format` - Format code automatically 29 | - `uv run ruff check --fix` - Fix auto-fixable issues 30 | 31 | ### Building 32 | - `uv build` - Build source distribution and wheel packages 33 | 34 | ## Architecture 35 | 36 | The parser implements an event-based streaming architecture with these key components: 37 | 38 | ### Core Event Types (Named Tuples) 39 | - `StartElement` - Opening XML tags with attributes, line/column info 40 | - `EndElement` - Closing XML tags with auto-close detection 41 | - `Text` - Text content between tags with CDATA flag 42 | - `Comment` - XML comments 43 | - `ProcessingInstruction` - Processing instructions like `<?xml version="1.0"?>` 44 | - `EntityRef` - Entity references with resolution 45 | - `ParseError` - Error events
with recovery information 46 | 47 | ### State Machine 48 | Parser uses enum-based states: `INITIAL`, `IN_TAG`, `IN_TEXT`, `IN_COMMENT`, `IN_CDATA`, `IN_PI`, `ERROR_RECOVERY`, `COMPLETE` 49 | 50 | ### Error Recovery 51 | - Tag stack management for auto-closing mismatched tags 52 | - Graceful handling of malformed attributes and entities 53 | - CDATA fallback mechanisms 54 | - Entity resolution for HTML entities and numeric entities 55 | 56 | ### API Functions 57 | - `stream_parse()` - Main streaming parser returning event iterator 58 | - `tree_parse()` - Convenience function for ElementTree construction 59 | - `ETreeBuilder` - Tree builder for constructing ElementTree objects 60 | 61 | ## File Structure 62 | 63 | - `sloppy_xml.py` - Main parser implementation (single file) 64 | - `tests/test_sloppy_xml.py` - Comprehensive test suite 65 | - `tests/test_enhanced_recovery.py` - Enhanced error recovery tests 66 | - `ARCHITECTURE.md` - Detailed architectural specification 67 | - `TODO.md` - Development todo list 68 | - `pyproject.toml` - Project configuration with dependencies 69 | 70 | ## Current Development Status 71 | 72 | See `TODO.md` for pending tasks. 
Key areas of active development: 73 | - Optimizing text event emission 74 | - Making ElementTreeBuilder generic for lxml compatibility 75 | - Improving error handling and reporting 76 | - Performance optimizations 77 | 78 | ## Testing Strategy 79 | 80 | The test suite covers: 81 | - Well-formed XML parsing validation 82 | - Malformed XML recovery scenarios 83 | - Entity resolution (standard HTML entities + numeric) 84 | - Performance benchmarks 85 | - Edge cases and boundary conditions 86 | - Real-world malformed XML from LLM outputs -------------------------------------------------------------------------------- /tests/test_enhanced_recovery.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test script demonstrating enhanced error recovery features in the sloppy XML parser. 4 | """ 5 | 6 | import pytest 7 | import sloppy_xml 8 | from sloppy_xml import RecoveryStrategy 9 | 10 | 11 | def test_basic_recovery(): 12 | """Test basic error recovery functionality.""" 13 | # Malformed XML with unclosed tags 14 | xml = "textmore text" 15 | 16 | events = list(sloppy_xml.stream_parse(xml)) 17 | assert len(events) > 0 18 | 19 | # Check that we get various event types 20 | event_types = [type(event).__name__ for event in events] 21 | assert "StartElement" in event_types 22 | 23 | 24 | def test_advanced_recovery(): 25 | """Test advanced recovery with detailed error reporting.""" 26 | # XML with multiple issues 27 | malformed_xml = """ 28 | text" 142 | events = list(sloppy_xml.stream_parse(xml)) 143 | 144 | event_types = [type(e).__name__ for e in events] 145 | expected = [ 146 | "Comment", 147 | "StartElement", 148 | "Text", 149 | "EndElement", 150 | "ProcessingInstruction", 151 | ] 152 | assert event_types == expected 153 | 154 | 155 | def test_position_tracking(): 156 | """Test line and column position tracking.""" 157 | xml = """ 158 | text 159 | """ 160 | events = list(sloppy_xml.stream_parse(xml)) 161 
| 162 | # First element should be at line 1 163 | start_event = events[0] 164 | assert start_event.line == 1 165 | assert start_event.column == 1 166 | 167 | # Child element should be at line 2 168 | child_events = [ 169 | e for e in events if isinstance(e, StartElement) and e.name == "child" 170 | ] 171 | assert len(child_events) == 1 172 | assert child_events[0].line == 2 173 | 174 | 175 | def test_streaming_large_input(): 176 | """Test streaming behavior with large input.""" 177 | # Create a large XML document 178 | large_xml = ( 179 | "" 180 | + "".join(f"content{i}" for i in range(1000)) 181 | + "" 182 | ) 183 | 184 | # Parse as stream - should not load everything into memory at once 185 | event_count = 0 186 | for event in sloppy_xml.stream_parse(large_xml): 187 | event_count += 1 188 | # Verify we can process events one by one 189 | assert isinstance(event, XMLEvent) 190 | 191 | # Should have 1 + (3 * 1000) + 1 = 3002 events 192 | assert event_count == 3002 193 | 194 | 195 | def test_file_input(): 196 | """Test parsing from file-like objects.""" 197 | xml = "text" 198 | 199 | # Test with StringIO 200 | file_obj = io.StringIO(xml) 201 | events = list(sloppy_xml.stream_parse(file_obj)) 202 | assert len(events) == 5 # StartElement, StartElement, Text, EndElement, EndElement 203 | 204 | # Test with BytesIO 205 | file_obj = io.BytesIO(xml.encode("utf-8")) 206 | events = list(sloppy_xml.stream_parse(file_obj)) 207 | assert len(events) == 5 # StartElement, StartElement, Text, EndElement, EndElement 208 | 209 | 210 | def test_legacy_parameters(): 211 | """Test backward compatibility with legacy parameters.""" 212 | xml = "text & more" 213 | 214 | # Test legacy parameter passing 215 | events = list( 216 | sloppy_xml.stream_parse( 217 | xml, recover=True, emit_errors=False, resolve_entities=True 218 | ) 219 | ) 220 | 221 | assert len(events) == 3 222 | text_event = [e for e in events if isinstance(e, Text)][0] 223 | assert ( 224 | "text & more" in text_event.content 
or "text & more" in text_event.content 225 | ) 226 | 227 | 228 | def test_basic_tree_construction(): 229 | """Test basic ElementTree construction.""" 230 | xml = "text" 231 | tree = sloppy_xml.tree_parse(xml) 232 | 233 | assert tree is not None 234 | assert tree.tag == "root" 235 | assert len(tree) == 1 236 | assert tree[0].tag == "child" 237 | assert tree[0].text == "text" 238 | 239 | 240 | def test_tree_with_attributes(): 241 | """Test tree construction preserves attributes.""" 242 | xml = 'content' 243 | tree = sloppy_xml.tree_parse(xml) 244 | 245 | assert tree.attrib["id"] == "1" 246 | assert tree[0].attrib["class"] == "test" 247 | assert tree[0].text == "content" 248 | 249 | 250 | def test_mixed_content(): 251 | """Test tree with mixed text and element content.""" 252 | xml = "beforeinnerafter" 253 | tree = sloppy_xml.tree_parse(xml) 254 | 255 | assert tree.text == "before" 256 | assert tree[0].text == "inner" 257 | assert tree[0].tail == "after" 258 | 259 | 260 | def test_multiple_children(): 261 | """Test tree with multiple child elements.""" 262 | xml = "text1text2" 263 | tree = sloppy_xml.tree_parse(xml) 264 | 265 | assert len(tree) == 2 266 | assert tree[0].tag == "child1" 267 | assert tree[1].tag == "child2" 268 | assert tree[0].text == "text1" 269 | assert tree[1].text == "text2" 270 | 271 | 272 | def test_custom_tree_builder(): 273 | """Test using custom tree builder.""" 274 | 275 | class MockTreeBuilder(TreeBuilder): 276 | def __init__(self): 277 | self.events = [] 278 | 279 | def start_element(self, event): 280 | self.events.append(("start", event.name)) 281 | 282 | def end_element(self, event): 283 | self.events.append(("end", event.name)) 284 | 285 | def text(self, event): 286 | self.events.append(("text", event.content)) 287 | 288 | def comment(self, event): 289 | self.events.append(("comment", event.content)) 290 | 291 | def processing_instruction(self, event): 292 | self.events.append(("pi", event.target)) 293 | 294 | def entity_ref(self, 
event): 295 | self.events.append(("entity", event.name)) 296 | 297 | def parse_error(self, event): 298 | self.events.append(("error", event.message)) 299 | 300 | def get_root(self): 301 | return self.events 302 | 303 | xml = "text" 304 | builder = MockTreeBuilder() 305 | result = sloppy_xml.tree_parse(xml, tree_builder=builder) 306 | 307 | assert result == [("start", "root"), ("text", "text"), ("end", "root")] 308 | 309 | 310 | def test_tree_parse_from_events(): 311 | """Test tree parsing from pre-generated events.""" 312 | xml = "text" 313 | events = sloppy_xml.stream_parse(xml) 314 | tree = sloppy_xml.tree_parse(events) 315 | 316 | assert tree.tag == "root" 317 | assert tree[0].text == "text" 318 | 319 | 320 | def test_tree_parameter(): 321 | """Test the tree parameter for different backend types.""" 322 | xml = "text" 323 | 324 | # Test default etree backend 325 | tree_etree = sloppy_xml.tree_parse(xml, tree="etree") 326 | assert tree_etree.tag == "root" 327 | assert tree_etree[0].text == "text" 328 | 329 | # Verify it's an ElementTree element 330 | import xml.etree.ElementTree as ET 331 | 332 | assert isinstance(tree_etree, ET.Element) 333 | 334 | # Test lxml backend if available 335 | if sloppy_xml.HAS_LXML: 336 | tree_lxml = sloppy_xml.tree_parse(xml, tree="lxml") 337 | assert tree_lxml.tag == "root" 338 | assert tree_lxml[0].text == "text" 339 | 340 | # Verify it's an lxml element 341 | from lxml import etree as lxml_etree 342 | 343 | assert isinstance(tree_lxml, lxml_etree._Element) 344 | 345 | # Test with invalid tree backend should raise an error 346 | import pytest 347 | 348 | with pytest.raises(KeyError): 349 | sloppy_xml.tree_parse(xml, tree="invalid_backend") 350 | 351 | # Test that tree parameter overrides custom tree_builder when both are provided 352 | class MockTreeBuilder(TreeBuilder): 353 | def __init__(self): 354 | self.events = [] 355 | 356 | def start_element(self, event): 357 | self.events.append(("start", event.name)) 358 | 359 | def 
end_element(self, event): 360 | self.events.append(("end", event.name)) 361 | 362 | def text(self, event): 363 | self.events.append(("text", event.content)) 364 | 365 | def comment(self, event): 366 | pass 367 | 368 | def processing_instruction(self, event): 369 | pass 370 | 371 | def entity_ref(self, event): 372 | pass 373 | 374 | def parse_error(self, event): 375 | pass 376 | 377 | def get_root(self): 378 | return self.events 379 | 380 | # When both tree_builder and tree are provided, tree parameter should be ignored 381 | # and custom tree_builder should be used 382 | custom_builder = MockTreeBuilder() 383 | result = sloppy_xml.tree_parse(xml, tree_builder=custom_builder, tree="etree") 384 | # The custom tree builder should be used, not the etree backend 385 | assert result == [ 386 | ("start", "root"), 387 | ("start", "child"), 388 | ("text", "text"), 389 | ("end", "child"), 390 | ("end", "root"), 391 | ] 392 | 393 | 394 | def test_standard_html_entities(): 395 | """Test resolution of standard HTML entities.""" 396 | xml = "<>&"'" 397 | events = list(sloppy_xml.stream_parse(xml)) 398 | 399 | text_event = [e for e in events if isinstance(e, Text)][0] 400 | assert text_event.content == "<>&\"'" 401 | 402 | 403 | def test_numeric_entities(): 404 | """Test numeric entity resolution.""" 405 | xml = "AA€" # A, A, Euro symbol 406 | events = list(sloppy_xml.stream_parse(xml)) 407 | 408 | text_event = [e for e in events if isinstance(e, Text)][0] 409 | assert "A" in text_event.content 410 | assert "€" in text_event.content or "€" in text_event.content 411 | 412 | 413 | def test_extended_html_entities(): 414 | """Test extended HTML entity resolution.""" 415 | xml = "©® " 416 | events = list(sloppy_xml.stream_parse(xml)) 417 | 418 | text_event = [e for e in events if isinstance(e, Text)][0] 419 | expected_chars = {"©", "®", "\u00a0"} 420 | # Check if any expected characters are present (some might not resolve) 421 | content = text_event.content 422 | has_resolved = 
any(char in content for char in expected_chars) 423 | has_original = any(entity in content for entity in ["©", "®", " "]) 424 | assert has_resolved or has_original 425 | 426 | 427 | def test_invalid_entities(): 428 | """Test handling of invalid entities.""" 429 | xml = "&invalid;¬entity;" 430 | events = list(sloppy_xml.stream_parse(xml)) 431 | 432 | text_event = [e for e in events if isinstance(e, Text)][0] 433 | # Invalid entities should be left as-is or handled gracefully 434 | assert "&invalid;" in text_event.content or "invalid" in text_event.content 435 | 436 | 437 | def test_entity_resolution_disabled(): 438 | """Test disabling entity resolution.""" 439 | xml = "<&" 440 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=False)) 441 | 442 | text_event = [e for e in events if isinstance(e, Text)][0] 443 | assert "<" in text_event.content 444 | assert "&" in text_event.content 445 | 446 | 447 | def test_malformed_numeric_entities(): 448 | """Test handling of malformed numeric entities.""" 449 | xml = "&#invalid;&#x;�" 450 | events = list(sloppy_xml.stream_parse(xml)) 451 | 452 | # Should not crash and should handle gracefully 453 | text_events = [e for e in events if isinstance(e, Text)] 454 | assert len(text_events) > 0 455 | 456 | 457 | def test_entity_resolution_in_attributes(): 458 | """Test that entities are resolved in attribute values.""" 459 | # Test the specific bug case reported 460 | xml = '' 461 | 462 | # Test with entity resolution enabled (default) 463 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=True)) 464 | start_events = [ 465 | e for e in events if isinstance(e, StartElement) and e.name == "link" 466 | ] 467 | assert len(start_events) == 1 468 | href = start_events[0].attrs.get("href") 469 | assert ( 470 | href 471 | == "https://secure.booking.invalid/myreservations.en-us.html?bn=4759;pincode=4391&entrypoint=email_wakeup" 472 | ) 473 | assert "&" not in href # Entity should be resolved 474 | 475 | # Test with entity 
resolution disabled 476 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=False)) 477 | start_events = [ 478 | e for e in events if isinstance(e, StartElement) and e.name == "link" 479 | ] 480 | assert len(start_events) == 1 481 | href = start_events[0].attrs.get("href") 482 | assert "&" in href # Entity should remain unresolved 483 | 484 | # Test various entity types in attribute values 485 | test_cases = [ 486 | ('', "&"), 487 | ('', "<>"), 488 | ('', "\"'"), 489 | ('', "\"'"), 490 | ('', "ABC"), 491 | ('', "ABC"), 492 | ] 493 | 494 | for xml, expected in test_cases: 495 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=True)) 496 | start_events = [ 497 | e for e in events if isinstance(e, StartElement) and e.name == "test" 498 | ] 499 | assert len(start_events) == 1 500 | attr_value = start_events[0].attrs.get("attr") 501 | assert attr_value == expected, ( 502 | f"Expected {expected!r}, got {attr_value!r} for {xml}" 503 | ) 504 | 505 | # Test tree parsing also works correctly 506 | original_xml = '' 507 | root = sloppy_xml.tree_parse(original_xml) 508 | link = root.find("link") 509 | assert link is not None 510 | href = link.get("href") 511 | assert ( 512 | href 513 | == "https://secure.booking.invalid/myreservations.en-us.html?bn=4759;pincode=4391&entrypoint=email_wakeup" 514 | ) 515 | 516 | 517 | def test_basic_comments(): 518 | """Test parsing basic comments.""" 519 | xml = "content" 520 | events = list(sloppy_xml.stream_parse(xml)) 521 | 522 | comment_events = [e for e in events if isinstance(e, Comment)] 523 | assert len(comment_events) == 1 524 | assert comment_events[0].content == "comment" 525 | 526 | 527 | def test_multiline_comments(): 528 | """Test multiline comments.""" 529 | xml = """""" 533 | events = list(sloppy_xml.stream_parse(xml)) 534 | 535 | comment_events = [e for e in events if isinstance(e, Comment)] 536 | assert len(comment_events) == 1 537 | assert "Multi-line" in comment_events[0].content 538 | 539 | 540 | def 
test_comments_with_special_chars(): 541 | """Test comments containing special characters.""" 542 | xml = "" 543 | events = list(sloppy_xml.stream_parse(xml)) 544 | 545 | comment_events = [e for e in events if isinstance(e, Comment)] 546 | assert len(comment_events) == 1 547 | assert comment_events[0].content == "<>&\"'" 548 | 549 | 550 | def test_nested_comment_chars(): 551 | """Test comments containing -- sequences.""" 552 | xml = "" 553 | events = list(sloppy_xml.stream_parse(xml)) 554 | 555 | comment_events = [e for e in events if isinstance(e, Comment)] 556 | assert len(comment_events) == 1 557 | 558 | 559 | def test_basic_pi(): 560 | """Test basic processing instructions.""" 561 | xml = '' 562 | events = list(sloppy_xml.stream_parse(xml)) 563 | 564 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 565 | assert len(pi_events) == 1 566 | assert pi_events[0].target == "xml" 567 | assert 'version="1.0"' in pi_events[0].data 568 | 569 | 570 | def test_pi_without_data(): 571 | """Test processing instructions without data.""" 572 | xml = "" 573 | events = list(sloppy_xml.stream_parse(xml)) 574 | 575 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 576 | assert len(pi_events) == 1 577 | assert pi_events[0].target == "target" 578 | assert pi_events[0].data is None or pi_events[0].data == "" 579 | 580 | 581 | def test_multiple_pis(): 582 | """Test multiple processing instructions.""" 583 | xml = '' 584 | events = list(sloppy_xml.stream_parse(xml)) 585 | 586 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 587 | assert len(pi_events) == 2 588 | assert pi_events[0].target == "xml" 589 | assert pi_events[1].target == "stylesheet" 590 | 591 | 592 | def test_basic_cdata(): 593 | """Test basic CDATA sections.""" 594 | xml = "" 595 | events = list(sloppy_xml.stream_parse(xml)) 596 | 597 | text_events = [e for e in events if isinstance(e, Text)] 598 | assert len(text_events) == 1 599 | assert 
text_events[0].content == "some data" 600 | assert text_events[0].is_cdata 601 | 602 | 603 | def test_cdata_with_special_chars(): 604 | """Test CDATA with special characters.""" 605 | xml = "&\"']]>" 606 | events = list(sloppy_xml.stream_parse(xml)) 607 | 608 | text_events = [e for e in events if isinstance(e, Text)] 609 | assert len(text_events) == 1 610 | assert text_events[0].content == "<>&\"'" 611 | assert text_events[0].is_cdata 612 | 613 | 614 | def test_cdata_with_xml_content(): 615 | """Test CDATA containing XML-like content.""" 616 | xml = "content]]>" 617 | events = list(sloppy_xml.stream_parse(xml)) 618 | 619 | text_events = [e for e in events if isinstance(e, Text)] 620 | assert len(text_events) == 1 621 | assert text_events[0].content == "content" 622 | assert text_events[0].is_cdata 623 | 624 | 625 | def test_unclosed_tags(): 626 | """Test recovery from unclosed tags.""" 627 | xml = "text" 628 | events = list(sloppy_xml.stream_parse(xml, recover=True, auto_close_tags=True)) 629 | 630 | # Should auto-close both tags 631 | end_events = [e for e in events if isinstance(e, EndElement)] 632 | assert len(end_events) >= 1 # At least the child should be auto-closed 633 | 634 | # Build tree to verify structure 635 | tree = sloppy_xml.tree_parse(xml, recover=True, auto_close_tags=True) 636 | assert tree is not None 637 | 638 | 639 | def test_mismatched_tags(): 640 | """Test recovery from mismatched tags.""" 641 | xml = "" 642 | events = list(sloppy_xml.stream_parse(xml, recover=True, emit_errors=True)) 643 | 644 | error_events = [e for e in events if isinstance(e, ParseError)] 645 | assert len(error_events) > 0 646 | 647 | # Should still be able to build a tree 648 | tree = sloppy_xml.tree_parse(xml, recover=True, emit_errors=True) 649 | assert tree is not None 650 | 651 | 652 | def test_malformed_attributes(): 653 | """Test recovery from malformed attributes.""" 654 | xml = '= 0 # May or may not recover comment 679 | assert len(element_events) == 1 # Should 
definitely get the element 680 | 681 | 682 | def test_broken_cdata(): 683 | """Test recovery from broken CDATA.""" 684 | xml = "" 685 | events = list( 686 | sloppy_xml.stream_parse(xml, recovery_strategy=RecoveryStrategy.AGGRESSIVE) 687 | ) 688 | 689 | # Should not crash and should produce some events 690 | assert len(events) > 0 691 | 692 | 693 | def test_unescaped_characters(): 694 | """Test recovery from unescaped special characters.""" 695 | xml = "text with < and & characters" 696 | events = list( 697 | sloppy_xml.stream_parse(xml, recovery_strategy=RecoveryStrategy.LENIENT) 698 | ) 699 | 700 | # Should handle gracefully 701 | text_events = [e for e in events if isinstance(e, Text)] 702 | assert len(text_events) > 0 703 | 704 | 705 | def test_recovery_strategies(): 706 | """Test different recovery strategies.""" 707 | malformed_xml = '= 0 745 | 746 | 747 | def test_single_character(): 748 | """Test parsing single character.""" 749 | events = list(sloppy_xml.stream_parse("a")) 750 | text_events = [e for e in events if isinstance(e, Text)] 751 | assert len(text_events) == 1 752 | assert text_events[0].content == "a" 753 | 754 | 755 | def test_very_long_tag_names(): 756 | """Test very long tag names.""" 757 | long_name = "a" * 1000 758 | xml = f"<{long_name}>content" 759 | events = list(sloppy_xml.stream_parse(xml)) 760 | 761 | start_events = [e for e in events if isinstance(e, StartElement)] 762 | assert len(start_events) == 1 763 | assert start_events[0].name == long_name 764 | 765 | 766 | def test_very_long_attribute_values(): 767 | """Test very long attribute values.""" 768 | long_value = "x" * 10000 769 | xml = f'content' 770 | events = list(sloppy_xml.stream_parse(xml)) 771 | 772 | start_events = [e for e in events if isinstance(e, StartElement)] 773 | assert len(start_events) == 1 774 | assert start_events[0].attrs["attr"] == long_value 775 | 776 | 777 | def test_deeply_nested_elements(): 778 | """Test deeply nested elements.""" 779 | depth = 100 780 | 
open_tags = "".join(f"" for i in range(depth)) 781 | close_tags = "".join(f"" for i in range(depth - 1, -1, -1)) 782 | xml = open_tags + "content" + close_tags 783 | 784 | events = list(sloppy_xml.stream_parse(xml)) 785 | start_events = [e for e in events if isinstance(e, StartElement)] 786 | assert len(start_events) == depth 787 | 788 | 789 | def test_maximum_nesting_depth(): 790 | """Test maximum nesting depth limit.""" 791 | depth = 20 792 | xml = "".join(f"" for i in range(depth)) + "content" 793 | 794 | events = list(sloppy_xml.stream_parse(xml, max_depth=10)) 795 | error_events = [e for e in events if isinstance(e, ParseError)] 796 | 797 | # Should hit depth limit and generate error 798 | depth_errors = [e for e in error_events if "depth" in e.error_type.lower()] 799 | assert len(depth_errors) > 0 or len(events) > 0 # Either error or truncation 800 | 801 | 802 | def test_many_attributes(): 803 | """Test elements with many attributes.""" 804 | attrs = " ".join(f'attr{i}="value{i}"' for i in range(100)) 805 | xml = f"content" 806 | events = list(sloppy_xml.stream_parse(xml)) 807 | 808 | start_events = [e for e in events if isinstance(e, StartElement)] 809 | assert len(start_events) == 1 810 | assert len(start_events[0].attrs) == 100 811 | 812 | 813 | def test_special_characters_in_content(): 814 | """Test special characters in text content.""" 815 | special_chars = "áéíóú ñç 中文 🚀 \U0001f600" # Unicode chars 816 | xml = f"{special_chars}" 817 | events = list(sloppy_xml.stream_parse(xml)) 818 | 819 | text_events = [e for e in events if isinstance(e, Text)] 820 | assert len(text_events) == 1 821 | assert special_chars in text_events[0].content 822 | 823 | 824 | def test_xml_declaration(): 825 | """Test XML declaration handling.""" 826 | xml = 'content' 827 | events = list(sloppy_xml.stream_parse(xml)) 828 | 829 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 830 | # Should have XML declaration as processing instruction 831 | xml_decl = [e 
for e in pi_events if e.target == "xml"] 832 | assert len(xml_decl) == 1 833 | 834 | 835 | def test_basic_namespaces(): 836 | """Test basic namespace-aware parsing.""" 837 | xml = 'content' 838 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=True)) 839 | 840 | start_events = [e for e in events if isinstance(e, StartElement)] 841 | ns_elements = [e for e in start_events if ":" in e.name] 842 | assert len(ns_elements) > 0 843 | 844 | 845 | def test_default_namespace(): 846 | """Test default namespace handling.""" 847 | xml = 'content' 848 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=True)) 849 | 850 | # Should parse without errors 851 | start_events = [e for e in events if isinstance(e, StartElement)] 852 | assert len(start_events) == 2 853 | 854 | 855 | def test_namespace_disabled(): 856 | """Test parsing with namespaces disabled.""" 857 | xml = ( 858 | 'content' 859 | ) 860 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=False)) # Default 861 | 862 | start_events = [e for e in events if isinstance(e, StartElement)] 863 | # Should treat ns:root as a regular tag name 864 | assert any(e.name == "ns:root" for e in start_events) 865 | 866 | 867 | def test_large_document_performance(): 868 | """Test performance with large documents.""" 869 | # Create a moderately large XML document 870 | num_elements = 5000 871 | xml_parts = [""] 872 | xml_parts.extend( 873 | f"Content for item {i}" for i in range(num_elements) 874 | ) 875 | xml_parts.append("") 876 | large_xml = "".join(xml_parts) 877 | 878 | start_time = time.time() 879 | events = list(sloppy_xml.stream_parse(large_xml)) 880 | parse_time = time.time() - start_time 881 | 882 | # Should complete in reasonable time (less than 1 second for 5000 elements) 883 | assert parse_time < 5.0, f"Parsing took {parse_time:.2f} seconds, too slow" 884 | 885 | # Should produce correct number of events 886 | start_events = [e for e in events if isinstance(e, StartElement)] 887 | assert 
len(start_events) == num_elements + 1 # +1 for root 888 | 889 | 890 | def test_deep_nesting_performance(): 891 | """Test performance with deeply nested documents.""" 892 | depth = 500 893 | open_tags = "".join(f"" for i in range(depth)) 894 | close_tags = "".join(f"" for i in range(depth - 1, -1, -1)) 895 | deep_xml = open_tags + "content" + close_tags 896 | 897 | start_time = time.time() 898 | events = list(sloppy_xml.stream_parse(deep_xml, max_depth=600)) 899 | parse_time = time.time() - start_time 900 | 901 | # Should complete in reasonable time 902 | assert parse_time < 2.0, f"Deep nesting parsing took {parse_time:.2f} seconds" 903 | assert len(events) > 0 904 | 905 | 906 | def test_memory_usage_streaming(): 907 | """Test that streaming doesn't accumulate excessive memory.""" 908 | # Create a large document 909 | large_xml = ( 910 | "" + "".join(f"data" for i in range(1000)) + "" 911 | ) 912 | 913 | # Parse as generator - should not load everything into memory 914 | event_generator = sloppy_xml.stream_parse(large_xml) 915 | 916 | # Process a few events to ensure generator works 917 | first_few_events = [] 918 | for i, event in enumerate(event_generator): 919 | first_few_events.append(event) 920 | if i >= 10: # Just get first 10 events 921 | break 922 | 923 | assert len(first_few_events) == 11 924 | # Generator should still have more events available 925 | next_event = next(event_generator, None) 926 | assert next_event is not None 927 | 928 | 929 | def test_entity_heavy_performance(): 930 | """Test performance with many entities.""" 931 | # Create XML with many entity references 932 | content_with_entities = "Text with & < > entities " * 1000 933 | xml = f"{content_with_entities}" 934 | 935 | start_time = time.time() 936 | events = list(sloppy_xml.stream_parse(xml)) 937 | parse_time = time.time() - start_time 938 | 939 | assert parse_time < 2.0 940 | text_events = [e for e in events if isinstance(e, Text)] 941 | assert len(text_events) == 1 942 | 943 | 944 | 
def test_llm_generated_malformed_xml(): 945 | """Test typical LLM-generated malformed XML.""" 946 | malformed_examples = [ 947 | # Missing quotes in attributes 948 | "content", 949 | # Mixed quotes 950 | "content", 951 | # Unclosed tags 952 | "content", 953 | # Tag soup 954 | "

textmore

", 955 | # Unescaped characters 956 | "text with < and & chars", 957 | # Broken CDATA 958 | "", 959 | # Malformed comments 960 | " 1022 | 1023 | Text content 1024 | alert('hello')]]> 1025 | 1026 | More text & entities 1027 | 1028 | 1029 | 1030 | 1031 | """ 1032 | 1033 | events = list(sloppy_xml.stream_parse(mixed_xml)) 1034 | 1035 | # Should have various event types 1036 | event_types = {type(e).__name__ for e in events} 1037 | expected_types = { 1038 | "StartElement", 1039 | "EndElement", 1040 | "Text", 1041 | "Comment", 1042 | "ProcessingInstruction", 1043 | } 1044 | 1045 | # Should have most of the expected types 1046 | assert len(event_types.intersection(expected_types)) >= 3 1047 | 1048 | 1049 | def test_encoding_issues(): 1050 | """Test documents with encoding issues.""" 1051 | # Simulate common encoding problems 1052 | xml_with_issues = 'Text with em—dash and "smart quotes"' 1053 | 1054 | events = list( 1055 | sloppy_xml.stream_parse(xml_with_issues, fix_encoding=True, emit_errors=True) 1056 | ) 1057 | 1058 | # Should handle without crashing 1059 | assert len(events) > 0 1060 | 1061 | # Should be able to build tree 1062 | tree = sloppy_xml.tree_parse(xml_with_issues, fix_encoding=True, emit_errors=True) 1063 | assert tree is not None 1064 | 1065 | 1066 | def test_fragment_parsing(): 1067 | """Test parsing XML fragments.""" 1068 | fragments = [ 1069 | "Just text content", 1070 | "content", # No root 1071 | "

para1

para2

", # Multiple roots 1072 | "Text before content text after", 1073 | ] 1074 | 1075 | for fragment in fragments: 1076 | events = list(sloppy_xml.stream_parse(fragment, allow_fragments=True)) 1077 | assert len(events) > 0 1078 | 1079 | # Should be able to handle fragments in tree parsing too 1080 | try: 1081 | tree = sloppy_xml.tree_parse(fragment, allow_fragments=True) 1082 | assert tree is not None 1083 | except ValueError: 1084 | # Some fragments might not produce valid trees, that's ok 1085 | pass 1086 | 1087 | 1088 | def test_full_pipeline_wellformed(): 1089 | """Test complete parsing pipeline with well-formed XML.""" 1090 | xml = """ 1091 | 1092 | 1093 | Test Document 1094 | 1095 |

Paragraph with emphasis and & entities.

1096 | 1097 | First item 1098 | Second item 1099 | 1100 | raw data]]> 1101 |
1102 |
""" 1103 | 1104 | # Test stream parsing 1105 | events = list(sloppy_xml.stream_parse(xml)) 1106 | assert len(events) > 10 1107 | 1108 | # Test tree building 1109 | tree = sloppy_xml.tree_parse(xml) 1110 | assert tree.tag == "document" 1111 | assert len(tree) == 2 # title and content 1112 | 1113 | # Test with various options 1114 | tree_with_opts = sloppy_xml.tree_parse( 1115 | xml, preserve_whitespace=False, resolve_entities=True, namespace_aware=True 1116 | ) 1117 | assert tree_with_opts.tag == "document" 1118 | 1119 | 1120 | def test_full_pipeline_malformed(): 1121 | """Test complete parsing pipeline with malformed XML.""" 1122 | malformed_xml = """", FLAGS), 232 | # Malformed comment patterns 233 | "broken_comment": re.compile(r")", 852 | "added missing comment close", 853 | severity="warning", 854 | ) 855 | pos = broken_comment_match.end() 856 | line, column = _update_position( 857 | text, broken_comment_match.start(), pos, line, column 858 | ) 859 | recovery_attempts += 1 860 | continue 861 | 862 | # Check for CDATA 863 | cdata_match = PATTERNS["cdata"].match(text, pos) 864 | if cdata_match: 865 | yield from flush_text() 866 | cdata_content = cdata_match.group(1) 867 | yield Text(cdata_content, line, column, is_cdata=True) 868 | pos = cdata_match.end() 869 | line, column = _update_position( 870 | text, cdata_match.start(), pos, line, column 871 | ) 872 | continue 873 | 874 | # Try to recover from broken CDATA 875 | if options.recovery_strategy in [ 876 | RecoveryStrategy.LENIENT, 877 | RecoveryStrategy.AGGRESSIVE, 878 | ]: 879 | broken_cdata_match = PATTERNS["broken_cdata"].match(text, pos) 880 | if broken_cdata_match: 881 | yield from flush_text() 882 | cdata_content = broken_cdata_match.group(1) 883 | yield Text(cdata_content, line, column, is_cdata=True) 884 | yield from emit_error( 885 | "broken_cdata", 886 | "Fixed malformed CDATA section (missing closing ]]>)", 887 | "added missing CDATA close", 888 | severity="warning", 889 | ) 890 | pos = 
broken_cdata_match.end() 891 | line, column = _update_position( 892 | text, broken_cdata_match.start(), pos, line, column 893 | ) 894 | recovery_attempts += 1 895 | continue 896 | 897 | # Check for processing instructions 898 | pi_match = PATTERNS["pi"].match(text, pos) 899 | if pi_match: 900 | yield from flush_text() 901 | target = pi_match.group(1) 902 | data = pi_match.group(2).strip() if pi_match.group(2) else None 903 | yield ProcessingInstruction(target, data, line, column) 904 | pos = pi_match.end() 905 | line, column = _update_position( 906 | text, pi_match.start(), pos, line, column 907 | ) 908 | continue 909 | 910 | # Check for end tags 911 | end_tag_match = PATTERNS["end_tag"].match(text, pos) 912 | if end_tag_match: 913 | yield from flush_text() 914 | tag_name = end_tag_match.group(1) 915 | 916 | # Handle tag mismatch with recovery 917 | if tag_stack and tag_stack[-1] != tag_name: 918 | if options.recover: 919 | # Attempt recovery 920 | recovery_events = _recover_tag_mismatch( 921 | tag_stack, tag_name, line, column 922 | ) 923 | for event in recovery_events: 924 | yield event 925 | yield from emit_error( 926 | "tag_mismatch", 927 | f"Mismatched end tag '{tag_name}', expected '{tag_stack[-1] if tag_stack else 'none'}'", 928 | f"auto-closed {len(recovery_events)} tags", 929 | severity="warning", 930 | ) 931 | recovery_attempts += 1 932 | else: 933 | yield from emit_error( 934 | "tag_mismatch", 935 | f"Mismatched end tag '{tag_name}'", 936 | fatal=True, 937 | ) 938 | break 939 | elif tag_stack: 940 | tag_stack.pop() 941 | 942 | yield EndElement(tag_name, line, column, auto_closed=False) 943 | pos = end_tag_match.end() 944 | line, column = _update_position( 945 | text, end_tag_match.start(), pos, line, column 946 | ) 947 | continue 948 | 949 | # Check for start tags 950 | start_tag_match = PATTERNS["start_tag"].match(text, pos) 951 | if start_tag_match: 952 | yield from flush_text() 953 | tag_name = start_tag_match.group(1) 954 | attr_string = 
start_tag_match.group(2) 955 | self_closing = bool(start_tag_match.group(3)) 956 | 957 | # Parse attributes with enhanced recovery 958 | try: 959 | if options.repair_attributes: 960 | attributes, attr_recovery_messages = _repair_attributes( 961 | attr_string, 962 | options.smart_quotes, 963 | options.recovery_strategy, 964 | options.resolve_entities, 965 | ) 966 | 967 | # Emit recovery messages 968 | for recovery_msg in attr_recovery_messages: 969 | yield from emit_error( 970 | "attribute_repair", 971 | recovery_msg, 972 | recovery_msg, 973 | severity="warning", 974 | ) 975 | recovery_attempts += 1 976 | else: 977 | attributes = _parse_attributes( 978 | attr_string, options.resolve_entities 979 | ) 980 | except Exception as e: 981 | attributes = {} 982 | yield from emit_error( 983 | "attribute_parse", 984 | f"Failed to parse attributes: {e}", 985 | "using empty attributes", 986 | ) 987 | 988 | # Handle namespaces if enabled 989 | namespace = None 990 | if options.namespace_aware and ":" in tag_name: 991 | # Simple namespace handling - could be enhanced 992 | prefix, local_name = tag_name.split(":", 1) 993 | namespace = attributes.get(f"xmlns:{prefix}") 994 | 995 | yield StartElement(tag_name, attributes, line, column, namespace) 996 | 997 | # Track tag for matching (unless self-closing) 998 | if not self_closing: 999 | tag_stack.append(tag_name) 1000 | else: 1001 | # Emit matching end element for self-closing tag 1002 | yield EndElement(tag_name, line, column, auto_closed=False) 1003 | 1004 | pos = start_tag_match.end() 1005 | line, column = _update_position( 1006 | text, start_tag_match.start(), pos, line, column 1007 | ) 1008 | continue 1009 | 1010 | # Regular text content 1011 | text_match = PATTERNS["text_content"].match(text, pos) 1012 | if text_match: 1013 | text_content = text_match.group(0) 1014 | text_buffer.append(text_content) 1015 | pos = text_match.end() 1016 | line, column = _update_position( 1017 | text, text_match.start(), pos, line, column 1018 | ) 
1019 | continue 1020 | 1021 | # Handle entity references as part of text content 1022 | entity_match = PATTERNS["entity_ref"].match(text, pos) 1023 | if entity_match: 1024 | # Add entity to text buffer (will be resolved in flush_text) 1025 | entity_text = entity_match.group(0) 1026 | text_buffer.append(entity_text) 1027 | pos = entity_match.end() 1028 | line, column = _update_position( 1029 | text, entity_match.start(), pos, line, column 1030 | ) 1031 | continue 1032 | 1033 | # Check for incomplete tags at end of input 1034 | if options.recovery_strategy in [ 1035 | RecoveryStrategy.LENIENT, 1036 | RecoveryStrategy.AGGRESSIVE, 1037 | ]: 1038 | incomplete_tag, new_pos, incomplete_recovery = _handle_incomplete_tag( 1039 | text, pos, line, column, options.recovery_strategy 1040 | ) 1041 | if incomplete_tag: 1042 | yield from flush_text() 1043 | # Create a start element for the incomplete tag 1044 | yield StartElement(incomplete_tag, {}, line, column, None) 1045 | if options.auto_close_tags: 1046 | yield EndElement(incomplete_tag, line, column, auto_closed=True) 1047 | 1048 | for recovery_msg in incomplete_recovery: 1049 | yield from emit_error( 1050 | "incomplete_tag", 1051 | recovery_msg, 1052 | recovery_msg, 1053 | severity="warning", 1054 | ) 1055 | 1056 | pos = new_pos 1057 | recovery_attempts += 1 1058 | continue 1059 | 1060 | # Handle smart quote recovery if enabled 1061 | if options.smart_quotes and options.recovery_strategy in [ 1062 | RecoveryStrategy.LENIENT, 1063 | RecoveryStrategy.AGGRESSIVE, 1064 | ]: 1065 | if pos < len(text) and text[pos] in {'"', "'"}: 1066 | fixed_text, new_pos, quote_recovery = _smart_quote_recovery( 1067 | text, pos 1068 | ) 1069 | if quote_recovery: 1070 | text = fixed_text 1071 | for recovery_msg in quote_recovery: 1072 | yield from emit_error( 1073 | "quote_mismatch", 1074 | recovery_msg, 1075 | recovery_msg, 1076 | severity="warning", 1077 | ) 1078 | recovery_attempts += 1 1079 | # Continue parsing from current position 1080 | 
continue 1081 | 1082 | # If nothing matches, advance by one character to avoid infinite loop 1083 | if pos < len(text): 1084 | char = text[pos] 1085 | if char not in " \t\r\n" or options.preserve_whitespace: 1086 | text_buffer.append(char) 1087 | pos += 1 1088 | if char == "\n": 1089 | line += 1 1090 | column = 1 1091 | else: 1092 | column += 1 1093 | 1094 | # Flush any remaining text 1095 | yield from flush_text() 1096 | 1097 | # Auto-close any remaining open tags if recovery is enabled 1098 | if options.recover and options.auto_close_tags and tag_stack: 1099 | yield from emit_error( 1100 | "unclosed_tags", 1101 | f"{len(tag_stack)} unclosed tags at end of input", 1102 | "auto-closing all remaining tags", 1103 | severity="warning", 1104 | ) 1105 | while tag_stack: 1106 | tag_name = tag_stack.pop() 1107 | yield EndElement(tag_name, line, column, auto_closed=True) 1108 | 1109 | # Emit collected errors if requested 1110 | if options.collect_errors and collected_errors: 1111 | for error in collected_errors: 1112 | yield error 1113 | 1114 | 1115 | def tree_parse( 1116 | xml_input: Union[str, TextIO, Iterator[XMLEvent]], 1117 | tree_builder: Optional[TreeBuilder] = None, 1118 | tree: TreeType = "etree", 1119 | **parse_options, 1120 | ) -> ET.Element: 1121 | """ 1122 | Build XML tree from input or parsing events. 1123 | 1124 | This is a convenience function that combines streaming parsing with tree 1125 | building. It can accept either raw XML input (which will be parsed) or 1126 | an iterator of parsing events. 
1127 | 1128 | Args: 1129 | xml_input: XML string, file-like object, or iterator of XMLEvents 1130 | tree_builder: Tree builder instance (defaults to ElementTreeBuilder) 1131 | **parse_options: Options passed to stream_parse if needed 1132 | 1133 | Returns: 1134 | ET.Element: Root element of the constructed XML tree 1135 | 1136 | Raises: 1137 | ValueError: If input is invalid or tree construction fails 1138 | TypeError: If tree_builder doesn't implement TreeBuilder interface 1139 | 1140 | Examples: 1141 | >>> xml = "text" 1142 | >>> root = tree_parse(xml) 1143 | >>> root.tag 1144 | 'root' 1145 | >>> root[0].text 1146 | 'text' 1147 | 1148 | >>> # Use enhanced options for fragments 1149 | >>> root = tree_parse("text only", allow_fragments=True) 1150 | """ 1151 | # Create the right tree builder if none provided 1152 | if tree_builder is None: 1153 | tree_builder = _backends[tree]() 1154 | 1155 | # Check if input is already an event stream (but not a file-like object) 1156 | if ( 1157 | hasattr(xml_input, "__iter__") 1158 | and not isinstance(xml_input, (str, bytes)) 1159 | and not hasattr(xml_input, "read") 1160 | ): 1161 | # Assume it's an iterator of events 1162 | events = xml_input 1163 | else: 1164 | # Parse raw XML input (string or file-like object) 1165 | events = stream_parse(xml_input, **parse_options) 1166 | 1167 | # Process events through tree builder 1168 | for event in events: 1169 | if isinstance(event, StartElement): 1170 | tree_builder.start_element(event) 1171 | elif isinstance(event, EndElement): 1172 | tree_builder.end_element(event) 1173 | elif isinstance(event, Text): 1174 | tree_builder.text(event) 1175 | elif isinstance(event, Comment): 1176 | tree_builder.comment(event) 1177 | elif isinstance(event, ProcessingInstruction): 1178 | tree_builder.processing_instruction(event) 1179 | elif isinstance(event, EntityRef): 1180 | tree_builder.entity_ref(event) 1181 | elif isinstance(event, ParseError): 1182 | tree_builder.parse_error(event) 1183 | 1184 | # 
Return constructed tree root 1185 | root = tree_builder.get_root() 1186 | if root is None: 1187 | # Check if fragments are allowed (default behavior) 1188 | allow_fragments = parse_options.get("allow_fragments", True) 1189 | if allow_fragments: 1190 | # Create a synthetic root element for fragments 1191 | from xml.etree.ElementTree import Element 1192 | 1193 | synthetic_root = Element("fragment") 1194 | return synthetic_root 1195 | else: 1196 | raise ValueError("No valid XML root element found") 1197 | return root 1198 | 1199 | 1200 | # ============================================================================= 1201 | # Internal Helper Functions 1202 | # ============================================================================= 1203 | 1204 | 1205 | def _resolve_entity(entity_name: str, is_numeric: bool = False) -> str: 1206 | """ 1207 | Resolve entity reference to character. 1208 | 1209 | Args: 1210 | entity_name: Entity name (without & and ;) 1211 | is_numeric: True for numeric entities ({ or ) 1212 | 1213 | Returns: 1214 | str: Resolved character or original entity if unresolvable 1215 | """ 1216 | if is_numeric: 1217 | try: 1218 | if entity_name.startswith("x") or entity_name.startswith("X"): 1219 | # Hexadecimal numeric entity 1220 | code_point = int(entity_name[1:], 16) 1221 | else: 1222 | # Decimal numeric entity 1223 | code_point = int(entity_name) 1224 | 1225 | # Validate Unicode code point range 1226 | if 0 <= code_point <= 0x10FFFF: 1227 | return chr(code_point) 1228 | except (ValueError, OverflowError): 1229 | pass 1230 | else: 1231 | # Named entity - try both with and without semicolon 1232 | if entity_name in HTML_ENTITIES: 1233 | return HTML_ENTITIES[entity_name] 1234 | elif entity_name + ";" in HTML_ENTITIES: 1235 | return HTML_ENTITIES[entity_name + ";"] 1236 | 1237 | # Return original entity if unresolvable 1238 | return f"&{entity_name};" 1239 | 1240 | 1241 | def _resolve_entities_in_text(text: str) -> str: 1242 | """ 1243 | Resolve entity 
references in text content.

    Args:
        text: Text content that may contain entity references

    Returns:
        str: Text with entities resolved to their character equivalents
    """
    # Fast path: nothing to do without an ampersand.
    if not text or "&" not in text:
        return text

    # Find and resolve entities in text
    entity_pos = 0
    resolved_parts = []
    has_entities = False

    for entity_match in PATTERNS["entity_ref"].finditer(text):
        has_entities = True
        # Add text before entity
        resolved_parts.append(text[entity_pos : entity_match.start()])

        # Resolve entity
        if entity_match.group(1):  # Named entity
            resolved = _resolve_entity(entity_match.group(1), False)
        elif entity_match.group(2):  # Decimal numeric
            resolved = _resolve_entity(entity_match.group(2), True)
        elif entity_match.group(3):  # Hex numeric
            # Re-add the "x" prefix so _resolve_entity parses base 16.
            resolved = _resolve_entity("x" + entity_match.group(3), True)
        else:
            resolved = entity_match.group(0)  # Keep original

        resolved_parts.append(resolved)
        entity_pos = entity_match.end()

    if has_entities:
        # Add remaining text
        resolved_parts.append(text[entity_pos:])
        return "".join(resolved_parts)

    return text


def _parse_attributes(
    attr_string: str, resolve_entities: bool = True
) -> Dict[str, str]:
    """
    Parse attribute string into name-value dictionary.

    Args:
        attr_string: Raw attribute string from tag
        resolve_entities: Whether to resolve entity references in attribute values

    Returns:
        Dict[str, str]: Dictionary of attribute names to values
    """
    attributes = {}
    if not attr_string.strip():
        return attributes

    # Find all attribute matches
    for match in PATTERNS["attributes"].finditer(attr_string):
        name = match.group(1)
        # Groups: 1=name, 2=double-quoted, 3=single-quoted, 4=unquoted
        if match.group(2) is not None:
            value = match.group(2)  # Double-quoted value
        elif match.group(3) is not None:
            value = match.group(3)  # Single-quoted value
        elif match.group(4) is not None:
            value = match.group(4)  # Unquoted value
        else:
            value = ""  # Attribute without value

        # Resolve entities in attribute value if enabled
        if resolve_entities and value:
            value = _resolve_entities_in_text(value)

        # Duplicate attribute names: last occurrence wins.
        attributes[name] = value

    return attributes


def _update_position(
    text: str, start_pos: int, end_pos: int, current_line: int, current_column: int
) -> tuple[int, int]:
    """
    Update line and column numbers based on text consumed.
1329 | 1330 | Args: 1331 | text: Source text 1332 | start_pos: Starting position 1333 | end_pos: Ending position 1334 | current_line: Current line number 1335 | current_column: Current column number 1336 | 1337 | Returns: 1338 | tuple[int, int]: Updated (line, column) position 1339 | """ 1340 | consumed_text = text[start_pos:end_pos] 1341 | 1342 | # Count newlines in the consumed text 1343 | newline_count = consumed_text.count("\n") 1344 | 1345 | if newline_count == 0: 1346 | # No newlines, just advance column 1347 | return current_line, current_column + (end_pos - start_pos) 1348 | else: 1349 | # Update line number and reset column to position after last newline 1350 | new_line = current_line + newline_count 1351 | last_newline_pos = consumed_text.rfind("\n") 1352 | new_column = len(consumed_text) - last_newline_pos - 1 1353 | return new_line, new_column 1354 | 1355 | 1356 | def _recover_tag_mismatch( 1357 | tag_stack: List[str], end_tag: str, line: int, column: int 1358 | ) -> List[EndElement]: 1359 | """ 1360 | Recover from mismatched end tags by auto-closing open tags. 
1361 | 1362 | Args: 1363 | tag_stack: Stack of currently open tags 1364 | end_tag: The end tag that was encountered 1365 | line: Current line number 1366 | column: Current column number 1367 | 1368 | Returns: 1369 | List[EndElement]: Auto-generated end elements for recovery 1370 | """ 1371 | recovery_events = [] 1372 | 1373 | # Look for the matching start tag in the stack 1374 | matching_index = -1 1375 | for i in range(len(tag_stack) - 1, -1, -1): 1376 | if tag_stack[i] == end_tag: 1377 | matching_index = i 1378 | break 1379 | 1380 | if matching_index >= 0: 1381 | # Auto-close all tags above the matching one 1382 | for i in range(len(tag_stack) - 1, matching_index, -1): 1383 | tag_name = tag_stack.pop() 1384 | recovery_events.append(EndElement(tag_name, line, column, auto_closed=True)) 1385 | # Remove the matching tag as well 1386 | tag_stack.pop() 1387 | 1388 | return recovery_events 1389 | 1390 | 1391 | def _create_error_with_context( 1392 | error_type: str, 1393 | message: str, 1394 | line: int, 1395 | column: int, 1396 | text: str, 1397 | pos: int, 1398 | recovery: str = "", 1399 | fatal: bool = False, 1400 | severity: str = "error", 1401 | ) -> ParseError: 1402 | """ 1403 | Create a ParseError with surrounding context for better debugging. 

    Args:
        error_type: Category of error
        message: Human-readable error description
        line: Line number where error occurred
        column: Column number where error occurred
        text: Full source text
        pos: Position in text where error occurred
        recovery: Description of recovery action
        fatal: Whether parsing cannot continue
        severity: Error severity level

    Returns:
        ParseError: Enhanced error with context
    """
    # Extract context around the error position (up to 50 chars each side)
    context_start = max(0, pos - 50)
    context_end = min(len(text), pos + 50)
    context = text[context_start:context_end]

    # Mark the error position in context
    if pos >= context_start and pos < context_end:
        error_pos = pos - context_start
        context = context[:error_pos] + ">>>" + context[error_pos:]

    # Clean up context (remove newlines for readability)
    context = context.replace("\n", "\\n").replace("\r", "\\r")

    return ParseError(
        error_type=error_type,
        message=message,
        line=line,
        column=column,
        recovery=recovery,
        fatal=fatal,
        context=context,
        severity=severity,
    )


def _repair_attributes(
    attr_string: str,
    smart_quotes: bool = True,
    recovery_strategy: RecoveryStrategy = RecoveryStrategy.LENIENT,
    resolve_entities: bool = True,
) -> Tuple[Dict[str, str], List[str]]:
    """
    Enhanced attribute parsing with error recovery.

    Args:
        attr_string: Raw attribute string from tag
        smart_quotes: Enable smart quote matching
        recovery_strategy: Recovery approach to use
        resolve_entities: Whether to resolve entity references in attribute values

    Returns:
        Tuple of (attributes dict, list of recovery messages)
    """
    attributes = {}
    recovery_messages = []

    if not attr_string.strip():
        return attributes, recovery_messages

    # First try normal parsing
    for match in PATTERNS["attributes"].finditer(attr_string):
        name = match.group(1)
        # Groups: 1=name, 2=double-quoted, 3=single-quoted, 4=unquoted
        if match.group(2) is not None:
            value = match.group(2)  # Double-quoted value
        elif match.group(3) is not None:
            value = match.group(3)  # Single-quoted value
        elif match.group(4) is not None:
            value = match.group(4)  # Unquoted value
        else:
            value = ""  # Attribute without value

        # Resolve entities in attribute value if enabled
        if resolve_entities and value:
            value = _resolve_entities_in_text(value)

        attributes[name] = value

    # If normal parsing didn't capture everything, try recovery
    if recovery_strategy != RecoveryStrategy.STRICT:
        remaining = attr_string

        # Remove successfully parsed attributes
        # NOTE(review): str.replace removes the first identical-looking span,
        # which may not be the matched occurrence for pathological inputs —
        # confirm acceptable for the lenient path.
        for match in PATTERNS["attributes"].finditer(attr_string):
            remaining = remaining.replace(match.group(0), "", 1)

        remaining = remaining.strip()
        if remaining:
            # Try to recover malformed attributes
            if smart_quotes:
                # Handle mixed quotes
                for match in PATTERNS["mixed_quotes"].finditer(remaining):
                    name = match.group(1)
                    value = match.group(2) or match.group(3)

                    # Resolve entities in attribute value if enabled
                    if resolve_entities and value:
                        value = _resolve_entities_in_text(value)

                    attributes[name] = value
                    recovery_messages.append(
                        f"Fixed mixed quotes in attribute '{name}'"
                    )
                    remaining = remaining.replace(match.group(0), "", 1)

            # Handle unquoted values
            for match in PATTERNS["malformed_attr"].finditer(remaining):
                name = match.group(1)
                value = match.group(2)

                # Resolve entities in attribute value if enabled
                if resolve_entities and value:
                    value = _resolve_entities_in_text(value)

                attributes[name] = value
                recovery_messages.append(f"Added quotes to unquoted attribute '{name}'")
                remaining = remaining.replace(match.group(0), "", 1)

    return attributes, recovery_messages


def _fix_encoding_issues(text: str) -> Tuple[str, List[str]]:
    """
    Attempt to fix common encoding issues in XML text.

    Args:
        text: Input text that may have encoding issues

    Returns:
        Tuple of (fixed text, list of fix messages)
    """
    fixes = []
    result = text

    # Remove or replace control characters
    if PATTERNS["encoding_issues"].search(result):
        original_len = len(result)
        result = PATTERNS["encoding_issues"].sub("", result)
        fixes.append(f"Removed {original_len - len(result)} control characters")

    # Common encoding fixes
    encoding_fixes = {
        "\u2013": "-",  # en dash
        "\u2014": "--",  # em dash
        "\u2018": "'",  # left single quote
        "\u2019": "'",  # right single quote
        "\u201c": '"',  # left double quote
        "\u201d": '"',  # right double quote
        "\u2026": "...",  # ellipsis
        "\u00a0": " ",  # non-breaking space
    }

    for bad_char, replacement in encoding_fixes.items():
        if bad_char in result:
            result = result.replace(bad_char, replacement)
            fixes.append(f"Replaced '{bad_char}' with 
'{replacement}'") 1565 | 1566 | return result, fixes 1567 | 1568 | 1569 | def _smart_quote_recovery(text: str, pos: int) -> Tuple[str, int, List[str]]: 1570 | """ 1571 | Attempt to recover from quote mismatches using smart matching. 1572 | 1573 | Args: 1574 | text: Full text being parsed 1575 | pos: Current position in text 1576 | 1577 | Returns: 1578 | Tuple of (recovered text, new position, recovery messages) 1579 | """ 1580 | recovery_messages = [] 1581 | 1582 | # Look for common quote patterns that can be fixed 1583 | # This is a simplified implementation - could be much more sophisticated 1584 | 1585 | # Find the nearest quote characters 1586 | quote_chars = {'"', "'"} 1587 | 1588 | # Look ahead for potential quote issues 1589 | look_ahead = text[pos : pos + 100] # Look at next 100 chars 1590 | 1591 | quote_positions = [] 1592 | for i, char in enumerate(look_ahead): 1593 | if char in quote_chars: 1594 | quote_positions.append((i + pos, char)) 1595 | 1596 | if len(quote_positions) >= 2: 1597 | # Try to match quotes intelligently 1598 | if quote_positions[0][1] != quote_positions[1][1]: 1599 | # Mismatched quotes - try to fix 1600 | first_pos, first_char = quote_positions[0] 1601 | second_pos, second_char = quote_positions[1] 1602 | 1603 | # Replace the second quote with the first to match 1604 | fixed_text = text[:second_pos] + first_char + text[second_pos + 1 :] 1605 | recovery_messages.append( 1606 | f"Fixed mismatched quotes: '{second_char}' -> '{first_char}'" 1607 | ) 1608 | return fixed_text, pos, recovery_messages 1609 | 1610 | return text, pos, recovery_messages 1611 | 1612 | 1613 | def _handle_incomplete_tag( 1614 | text: str, 1615 | pos: int, 1616 | line: int, 1617 | column: int, 1618 | recovery_strategy: RecoveryStrategy = RecoveryStrategy.LENIENT, 1619 | ) -> Tuple[Optional[str], int, List[str]]: 1620 | """ 1621 | Handle incomplete or malformed tags at end of input. 
1622 | 1623 | Args: 1624 | text: Full text being parsed 1625 | pos: Current position in text 1626 | line: Current line number 1627 | column: Current column number 1628 | recovery_strategy: Recovery approach to use 1629 | 1630 | Returns: 1631 | Tuple of (recovered tag name or None, new position, recovery messages) 1632 | """ 1633 | recovery_messages = [] 1634 | 1635 | # Check if we have an incomplete tag 1636 | incomplete_match = PATTERNS["incomplete_tag"].match(text, pos) 1637 | if incomplete_match: 1638 | tag_name = incomplete_match.group(1) 1639 | incomplete_match.group(2) 1640 | 1641 | if recovery_strategy in [RecoveryStrategy.LENIENT, RecoveryStrategy.AGGRESSIVE]: 1642 | # Try to recover by adding missing > 1643 | recovery_messages.append( 1644 | f"Added missing '>' to incomplete tag '{tag_name}'" 1645 | ) 1646 | return tag_name, len(text), recovery_messages 1647 | 1648 | # Look for other common incomplete patterns 1649 | remaining = text[pos:].strip() 1650 | if remaining.startswith("<"): 1651 | # Incomplete tag start 1652 | if recovery_strategy == RecoveryStrategy.AGGRESSIVE: 1653 | # Try to extract tag name 1654 | tag_match = re.match(r"<\s*([a-zA-Z_:][a-zA-Z0-9_:.-]*)", remaining) 1655 | if tag_match: 1656 | tag_name = tag_match.group(1) 1657 | recovery_messages.append(f"Recovered incomplete tag '{tag_name}'") 1658 | return tag_name, len(text), recovery_messages 1659 | 1660 | return None, pos, recovery_messages 1661 | --------------------------------------------------------------------------------