├── .gitignore ├── CHANGELOG.md ├── .github └── workflows │ ├── test.yml │ └── release.yml ├── pyproject.toml ├── Makefile ├── CLAUDE.md ├── tests ├── test_enhanced_recovery.py └── test_sloppy_xml.py ├── README.md ├── logo.svg ├── LICENSE └── sloppy_xml.py /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | *.egg-info 3 | uv.lock 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## 0.3.0 6 | 7 | - Added entity resolution in attributes 8 | 9 | ## 0.2.1 10 | 11 | - Fixed readme reference 12 | 13 | ## 0.2.0 14 | 15 | - Added `tree` parameter to parsing functions for more flexible tree building 16 | - Made parse options internal for cleaner API 17 | - Converted `namedtuple` usage to `NamedTuple` for better type hints 18 | 19 | ## 0.1.0 20 | 21 | - Initial implementation of sloppy XML parser 22 | - Streaming XML parser with event-based architecture 23 | - Tree-building functionality with ElementTree support 24 | - Error recovery mechanisms for malformed XML 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ["3.10", "3.11", "3.12"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Install uv 20 | uses: astral-sh/setup-uv@v4 21 | with: 22 | version: "latest" 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | run: uv python install ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: uv sync 
29 | 30 | - name: Run tests 31 | run: uv run pytest -v 32 | 33 | - name: Check code style 34 | run: uv run ruff check 35 | 36 | - name: Check formatting 37 | run: uv run ruff format --check -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Install uv 19 | uses: astral-sh/setup-uv@v4 20 | with: 21 | version: "latest" 22 | 23 | - name: Set up Python 24 | run: uv python install 3.12 25 | 26 | - name: Install dependencies 27 | run: uv sync 28 | 29 | - name: Build package 30 | run: uv build 31 | 32 | - name: Publish to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | 35 | - name: Create GitHub Release 36 | uses: softprops/action-gh-release@v2 37 | with: 38 | files: dist/* 39 | generate_release_notes: true -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sloppy-xml" 7 | version = "0.3.0" 8 | description = "A sloppy XML parser for Python designed to be used with LLMs" 9 | readme = "README.md" 10 | authors = [{name = "Armin Ronacher", email = "armin.ronacher@active-4.com"}] 11 | requires-python = ">=3.10" 12 | license = "Apache-2.0" 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "Operating System :: OS Independent", 16 | ] 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/mitsuhiko/sloppy-xml-py" 20 | Repository = "https://github.com/mitsuhiko/sloppy-xml-py" 21 | 22 | [tool.setuptools] 
23 | py-modules = ["sloppy_xml"] 24 | 25 | [project.optional-dependencies] 26 | lxml = [ 27 | "lxml>=4.6.0", 28 | ] 29 | 30 | [dependency-groups] 31 | dev = [ 32 | "pytest>=8.3.5", 33 | "ruff>=0.12.0", 34 | ] 35 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all format check format-check lint sync test build clean help 2 | 3 | all: sync check test 4 | 5 | # Format code automatically 6 | format: 7 | @uv run ruff format 8 | 9 | # Check code style and quality 10 | check: format-check lint 11 | 12 | # Check if code formatting is correct without changing files 13 | format-check: 14 | @uv run ruff format --check 15 | 16 | # Run linting 17 | lint: 18 | @uv run ruff check 19 | 20 | # Install/update dependencies and sync environment 21 | sync: 22 | @uv sync 23 | 24 | # Run all tests 25 | test: 26 | @uv run pytest 27 | 28 | # Build source distribution and wheel packages 29 | build: 30 | @uv build 31 | 32 | # Clean build artifacts 33 | clean: 34 | @rm -rf dist/ build/ *.egg-info/ 35 | 36 | # Show available targets 37 | help: 38 | @echo "Available targets:" 39 | @echo " format - Format code automatically" 40 | @echo " check - Check code style and quality" 41 | @echo " format-check - Check if code formatting is correct" 42 | @echo " lint - Run linting" 43 | @echo " sync - Install/update dependencies" 44 | @echo " test - Run all tests" 45 | @echo " build - Build packages" 46 | @echo " clean - Clean build artifacts" 47 | @echo " help - Show this help message" 48 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
4 | 5 | ## Project Overview 6 | 7 | This is a sloppy XML parser library for Python - a single-file XML parser designed to handle malformed XML gracefully while maintaining high performance through pre-compiled regular expressions. The library provides both streaming and tree-building XML parsing capabilities with robust error recovery mechanisms for handling malformed XML commonly generated by LLMs and other automated systems. 8 | 9 | ## Development Commands 10 | 11 | ### Package Management 12 | This project uses **uv** as the package manager: 13 | - `uv sync` - Install/update dependencies and sync the environment 14 | - `uv add <package>` - Add a new dependency 15 | - `uv add --dev <package>` - Add a dev only dependency 16 | - `uv remove <package>` - Remove a dependency 17 | - `uv remove --dev <package>` - Remove a dev only dependency 18 | - `uv run <command>` - Run commands in the project environment 19 | 20 | ### Testing 21 | - `uv run pytest` - Run all tests 22 | - `uv run pytest tests/test_sloppy_xml.py` - Run specific test file 23 | - `uv run pytest -v` - Run tests with verbose output 24 | - `uv run pytest tests/test_enhanced_recovery.py` - Run enhanced recovery tests 25 | 26 | ### Linting & Formatting 27 | - `uv run ruff check` - Check code style and quality 28 | - `uv run ruff format` - Format code automatically 29 | - `uv run ruff check --fix` - Fix auto-fixable issues 30 | 31 | ### Building 32 | - `uv build` - Build source distribution and wheel packages 33 | 34 | ## Architecture 35 | 36 | The parser implements an event-based streaming architecture with these key components: 37 | 38 | ### Core Event Types (Named Tuples) 39 | - `StartElement` - Opening XML tags with attributes, line/column info 40 | - `EndElement` - Closing XML tags with auto-close detection 41 | - `Text` - Text content between tags with CDATA flag 42 | - `Comment` - XML comments 43 | - `ProcessingInstruction` - Processing instructions like `<?xml version="1.0"?>` 44 | - `EntityRef` - Entity references with resolution 45 | - `ParseError` - Error events
with recovery information 46 | 47 | ### State Machine 48 | Parser uses enum-based states: `INITIAL`, `IN_TAG`, `IN_TEXT`, `IN_COMMENT`, `IN_CDATA`, `IN_PI`, `ERROR_RECOVERY`, `COMPLETE` 49 | 50 | ### Error Recovery 51 | - Tag stack management for auto-closing mismatched tags 52 | - Graceful handling of malformed attributes and entities 53 | - CDATA fallback mechanisms 54 | - Entity resolution for HTML entities and numeric entities 55 | 56 | ### API Functions 57 | - `stream_parse()` - Main streaming parser returning event iterator 58 | - `tree_parse()` - Convenience function for ElementTree construction 59 | - `ETreeBuilder` - Tree builder for constructing ElementTree objects 60 | 61 | ## File Structure 62 | 63 | - `sloppy_xml.py` - Main parser implementation (single file) 64 | - `tests/test_sloppy_xml.py` - Comprehensive test suite 65 | - `tests/test_enhanced_recovery.py` - Enhanced error recovery tests 66 | - `ARCHITECTURE.md` - Detailed architectural specification 67 | - `TODO.md` - Development todo list 68 | - `pyproject.toml` - Project configuration with dependencies 69 | 70 | ## Current Development Status 71 | 72 | See `TODO.md` for pending tasks. 
Key areas of active development: 73 | - Optimizing text event emission 74 | - Making ElementTreeBuilder generic for lxml compatibility 75 | - Improving error handling and reporting 76 | - Performance optimizations 77 | 78 | ## Testing Strategy 79 | 80 | The test suite covers: 81 | - Well-formed XML parsing validation 82 | - Malformed XML recovery scenarios 83 | - Entity resolution (standard HTML entities + numeric) 84 | - Performance benchmarks 85 | - Edge cases and boundary conditions 86 | - Real-world malformed XML from LLM outputs -------------------------------------------------------------------------------- /tests/test_enhanced_recovery.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test script demonstrating enhanced error recovery features in the sloppy XML parser. 4 | """ 5 | 6 | import pytest 7 | import sloppy_xml 8 | from sloppy_xml import RecoveryStrategy 9 | 10 | 11 | def test_basic_recovery(): 12 | """Test basic error recovery functionality.""" 13 | # Malformed XML with unclosed tags 14 | xml = "textmore text" 15 | 16 | events = list(sloppy_xml.stream_parse(xml)) 17 | assert len(events) > 0 18 | 19 | # Check that we get various event types 20 | event_types = [type(event).__name__ for event in events] 21 | assert "StartElement" in event_types 22 | 23 | 24 | def test_advanced_recovery(): 25 | """Test advanced recovery with detailed error reporting.""" 26 | # XML with multiple issues 27 | malformed_xml = """ 28 | text" 142 | events = list(sloppy_xml.stream_parse(xml)) 143 | 144 | event_types = [type(e).__name__ for e in events] 145 | expected = [ 146 | "Comment", 147 | "StartElement", 148 | "Text", 149 | "EndElement", 150 | "ProcessingInstruction", 151 | ] 152 | assert event_types == expected 153 | 154 | 155 | def test_position_tracking(): 156 | """Test line and column position tracking.""" 157 | xml = """ 158 | text 159 | """ 160 | events = list(sloppy_xml.stream_parse(xml)) 161 
| 162 | # First element should be at line 1 163 | start_event = events[0] 164 | assert start_event.line == 1 165 | assert start_event.column == 1 166 | 167 | # Child element should be at line 2 168 | child_events = [ 169 | e for e in events if isinstance(e, StartElement) and e.name == "child" 170 | ] 171 | assert len(child_events) == 1 172 | assert child_events[0].line == 2 173 | 174 | 175 | def test_streaming_large_input(): 176 | """Test streaming behavior with large input.""" 177 | # Create a large XML document 178 | large_xml = ( 179 | "" 180 | + "".join(f"content{i}" for i in range(1000)) 181 | + "" 182 | ) 183 | 184 | # Parse as stream - should not load everything into memory at once 185 | event_count = 0 186 | for event in sloppy_xml.stream_parse(large_xml): 187 | event_count += 1 188 | # Verify we can process events one by one 189 | assert isinstance(event, XMLEvent) 190 | 191 | # Should have 1 + (3 * 1000) + 1 = 3002 events 192 | assert event_count == 3002 193 | 194 | 195 | def test_file_input(): 196 | """Test parsing from file-like objects.""" 197 | xml = "text" 198 | 199 | # Test with StringIO 200 | file_obj = io.StringIO(xml) 201 | events = list(sloppy_xml.stream_parse(file_obj)) 202 | assert len(events) == 5 # StartElement, StartElement, Text, EndElement, EndElement 203 | 204 | # Test with BytesIO 205 | file_obj = io.BytesIO(xml.encode("utf-8")) 206 | events = list(sloppy_xml.stream_parse(file_obj)) 207 | assert len(events) == 5 # StartElement, StartElement, Text, EndElement, EndElement 208 | 209 | 210 | def test_legacy_parameters(): 211 | """Test backward compatibility with legacy parameters.""" 212 | xml = "text & more" 213 | 214 | # Test legacy parameter passing 215 | events = list( 216 | sloppy_xml.stream_parse( 217 | xml, recover=True, emit_errors=False, resolve_entities=True 218 | ) 219 | ) 220 | 221 | assert len(events) == 3 222 | text_event = [e for e in events if isinstance(e, Text)][0] 223 | assert ( 224 | "text & more" in text_event.content 
or "text & more" in text_event.content 225 | ) 226 | 227 | 228 | def test_basic_tree_construction(): 229 | """Test basic ElementTree construction.""" 230 | xml = "text" 231 | tree = sloppy_xml.tree_parse(xml) 232 | 233 | assert tree is not None 234 | assert tree.tag == "root" 235 | assert len(tree) == 1 236 | assert tree[0].tag == "child" 237 | assert tree[0].text == "text" 238 | 239 | 240 | def test_tree_with_attributes(): 241 | """Test tree construction preserves attributes.""" 242 | xml = 'content' 243 | tree = sloppy_xml.tree_parse(xml) 244 | 245 | assert tree.attrib["id"] == "1" 246 | assert tree[0].attrib["class"] == "test" 247 | assert tree[0].text == "content" 248 | 249 | 250 | def test_mixed_content(): 251 | """Test tree with mixed text and element content.""" 252 | xml = "beforeinnerafter" 253 | tree = sloppy_xml.tree_parse(xml) 254 | 255 | assert tree.text == "before" 256 | assert tree[0].text == "inner" 257 | assert tree[0].tail == "after" 258 | 259 | 260 | def test_multiple_children(): 261 | """Test tree with multiple child elements.""" 262 | xml = "text1text2" 263 | tree = sloppy_xml.tree_parse(xml) 264 | 265 | assert len(tree) == 2 266 | assert tree[0].tag == "child1" 267 | assert tree[1].tag == "child2" 268 | assert tree[0].text == "text1" 269 | assert tree[1].text == "text2" 270 | 271 | 272 | def test_custom_tree_builder(): 273 | """Test using custom tree builder.""" 274 | 275 | class MockTreeBuilder(TreeBuilder): 276 | def __init__(self): 277 | self.events = [] 278 | 279 | def start_element(self, event): 280 | self.events.append(("start", event.name)) 281 | 282 | def end_element(self, event): 283 | self.events.append(("end", event.name)) 284 | 285 | def text(self, event): 286 | self.events.append(("text", event.content)) 287 | 288 | def comment(self, event): 289 | self.events.append(("comment", event.content)) 290 | 291 | def processing_instruction(self, event): 292 | self.events.append(("pi", event.target)) 293 | 294 | def entity_ref(self, 
event): 295 | self.events.append(("entity", event.name)) 296 | 297 | def parse_error(self, event): 298 | self.events.append(("error", event.message)) 299 | 300 | def get_root(self): 301 | return self.events 302 | 303 | xml = "text" 304 | builder = MockTreeBuilder() 305 | result = sloppy_xml.tree_parse(xml, tree_builder=builder) 306 | 307 | assert result == [("start", "root"), ("text", "text"), ("end", "root")] 308 | 309 | 310 | def test_tree_parse_from_events(): 311 | """Test tree parsing from pre-generated events.""" 312 | xml = "text" 313 | events = sloppy_xml.stream_parse(xml) 314 | tree = sloppy_xml.tree_parse(events) 315 | 316 | assert tree.tag == "root" 317 | assert tree[0].text == "text" 318 | 319 | 320 | def test_tree_parameter(): 321 | """Test the tree parameter for different backend types.""" 322 | xml = "text" 323 | 324 | # Test default etree backend 325 | tree_etree = sloppy_xml.tree_parse(xml, tree="etree") 326 | assert tree_etree.tag == "root" 327 | assert tree_etree[0].text == "text" 328 | 329 | # Verify it's an ElementTree element 330 | import xml.etree.ElementTree as ET 331 | 332 | assert isinstance(tree_etree, ET.Element) 333 | 334 | # Test lxml backend if available 335 | if sloppy_xml.HAS_LXML: 336 | tree_lxml = sloppy_xml.tree_parse(xml, tree="lxml") 337 | assert tree_lxml.tag == "root" 338 | assert tree_lxml[0].text == "text" 339 | 340 | # Verify it's an lxml element 341 | from lxml import etree as lxml_etree 342 | 343 | assert isinstance(tree_lxml, lxml_etree._Element) 344 | 345 | # Test with invalid tree backend should raise an error 346 | import pytest 347 | 348 | with pytest.raises(KeyError): 349 | sloppy_xml.tree_parse(xml, tree="invalid_backend") 350 | 351 | # Test that tree parameter overrides custom tree_builder when both are provided 352 | class MockTreeBuilder(TreeBuilder): 353 | def __init__(self): 354 | self.events = [] 355 | 356 | def start_element(self, event): 357 | self.events.append(("start", event.name)) 358 | 359 | def 
end_element(self, event): 360 | self.events.append(("end", event.name)) 361 | 362 | def text(self, event): 363 | self.events.append(("text", event.content)) 364 | 365 | def comment(self, event): 366 | pass 367 | 368 | def processing_instruction(self, event): 369 | pass 370 | 371 | def entity_ref(self, event): 372 | pass 373 | 374 | def parse_error(self, event): 375 | pass 376 | 377 | def get_root(self): 378 | return self.events 379 | 380 | # When both tree_builder and tree are provided, tree parameter should be ignored 381 | # and custom tree_builder should be used 382 | custom_builder = MockTreeBuilder() 383 | result = sloppy_xml.tree_parse(xml, tree_builder=custom_builder, tree="etree") 384 | # The custom tree builder should be used, not the etree backend 385 | assert result == [ 386 | ("start", "root"), 387 | ("start", "child"), 388 | ("text", "text"), 389 | ("end", "child"), 390 | ("end", "root"), 391 | ] 392 | 393 | 394 | def test_standard_html_entities(): 395 | """Test resolution of standard HTML entities.""" 396 | xml = "<>&"'" 397 | events = list(sloppy_xml.stream_parse(xml)) 398 | 399 | text_event = [e for e in events if isinstance(e, Text)][0] 400 | assert text_event.content == "<>&\"'" 401 | 402 | 403 | def test_numeric_entities(): 404 | """Test numeric entity resolution.""" 405 | xml = "AA€" # A, A, Euro symbol 406 | events = list(sloppy_xml.stream_parse(xml)) 407 | 408 | text_event = [e for e in events if isinstance(e, Text)][0] 409 | assert "A" in text_event.content 410 | assert "€" in text_event.content or "€" in text_event.content 411 | 412 | 413 | def test_extended_html_entities(): 414 | """Test extended HTML entity resolution.""" 415 | xml = "©® " 416 | events = list(sloppy_xml.stream_parse(xml)) 417 | 418 | text_event = [e for e in events if isinstance(e, Text)][0] 419 | expected_chars = {"©", "®", "\u00a0"} 420 | # Check if any expected characters are present (some might not resolve) 421 | content = text_event.content 422 | has_resolved = 
any(char in content for char in expected_chars) 423 | has_original = any(entity in content for entity in ["©", "®", " "]) 424 | assert has_resolved or has_original 425 | 426 | 427 | def test_invalid_entities(): 428 | """Test handling of invalid entities.""" 429 | xml = "&invalid;¬entity;" 430 | events = list(sloppy_xml.stream_parse(xml)) 431 | 432 | text_event = [e for e in events if isinstance(e, Text)][0] 433 | # Invalid entities should be left as-is or handled gracefully 434 | assert "&invalid;" in text_event.content or "invalid" in text_event.content 435 | 436 | 437 | def test_entity_resolution_disabled(): 438 | """Test disabling entity resolution.""" 439 | xml = "<&" 440 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=False)) 441 | 442 | text_event = [e for e in events if isinstance(e, Text)][0] 443 | assert "<" in text_event.content 444 | assert "&" in text_event.content 445 | 446 | 447 | def test_malformed_numeric_entities(): 448 | """Test handling of malformed numeric entities.""" 449 | xml = "&#invalid;&#x;�" 450 | events = list(sloppy_xml.stream_parse(xml)) 451 | 452 | # Should not crash and should handle gracefully 453 | text_events = [e for e in events if isinstance(e, Text)] 454 | assert len(text_events) > 0 455 | 456 | 457 | def test_entity_resolution_in_attributes(): 458 | """Test that entities are resolved in attribute values.""" 459 | # Test the specific bug case reported 460 | xml = '' 461 | 462 | # Test with entity resolution enabled (default) 463 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=True)) 464 | start_events = [ 465 | e for e in events if isinstance(e, StartElement) and e.name == "link" 466 | ] 467 | assert len(start_events) == 1 468 | href = start_events[0].attrs.get("href") 469 | assert ( 470 | href 471 | == "https://secure.booking.invalid/myreservations.en-us.html?bn=4759;pincode=4391&entrypoint=email_wakeup" 472 | ) 473 | assert "&" not in href # Entity should be resolved 474 | 475 | # Test with entity 
resolution disabled 476 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=False)) 477 | start_events = [ 478 | e for e in events if isinstance(e, StartElement) and e.name == "link" 479 | ] 480 | assert len(start_events) == 1 481 | href = start_events[0].attrs.get("href") 482 | assert "&" in href # Entity should remain unresolved 483 | 484 | # Test various entity types in attribute values 485 | test_cases = [ 486 | ('', "&"), 487 | ('', "<>"), 488 | ('', "\"'"), 489 | ('', "\"'"), 490 | ('', "ABC"), 491 | ('', "ABC"), 492 | ] 493 | 494 | for xml, expected in test_cases: 495 | events = list(sloppy_xml.stream_parse(xml, resolve_entities=True)) 496 | start_events = [ 497 | e for e in events if isinstance(e, StartElement) and e.name == "test" 498 | ] 499 | assert len(start_events) == 1 500 | attr_value = start_events[0].attrs.get("attr") 501 | assert attr_value == expected, ( 502 | f"Expected {expected!r}, got {attr_value!r} for {xml}" 503 | ) 504 | 505 | # Test tree parsing also works correctly 506 | original_xml = '' 507 | root = sloppy_xml.tree_parse(original_xml) 508 | link = root.find("link") 509 | assert link is not None 510 | href = link.get("href") 511 | assert ( 512 | href 513 | == "https://secure.booking.invalid/myreservations.en-us.html?bn=4759;pincode=4391&entrypoint=email_wakeup" 514 | ) 515 | 516 | 517 | def test_basic_comments(): 518 | """Test parsing basic comments.""" 519 | xml = "content" 520 | events = list(sloppy_xml.stream_parse(xml)) 521 | 522 | comment_events = [e for e in events if isinstance(e, Comment)] 523 | assert len(comment_events) == 1 524 | assert comment_events[0].content == "comment" 525 | 526 | 527 | def test_multiline_comments(): 528 | """Test multiline comments.""" 529 | xml = """""" 533 | events = list(sloppy_xml.stream_parse(xml)) 534 | 535 | comment_events = [e for e in events if isinstance(e, Comment)] 536 | assert len(comment_events) == 1 537 | assert "Multi-line" in comment_events[0].content 538 | 539 | 540 | def 
test_comments_with_special_chars(): 541 | """Test comments containing special characters.""" 542 | xml = "" 543 | events = list(sloppy_xml.stream_parse(xml)) 544 | 545 | comment_events = [e for e in events if isinstance(e, Comment)] 546 | assert len(comment_events) == 1 547 | assert comment_events[0].content == "<>&\"'" 548 | 549 | 550 | def test_nested_comment_chars(): 551 | """Test comments containing -- sequences.""" 552 | xml = "" 553 | events = list(sloppy_xml.stream_parse(xml)) 554 | 555 | comment_events = [e for e in events if isinstance(e, Comment)] 556 | assert len(comment_events) == 1 557 | 558 | 559 | def test_basic_pi(): 560 | """Test basic processing instructions.""" 561 | xml = '' 562 | events = list(sloppy_xml.stream_parse(xml)) 563 | 564 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 565 | assert len(pi_events) == 1 566 | assert pi_events[0].target == "xml" 567 | assert 'version="1.0"' in pi_events[0].data 568 | 569 | 570 | def test_pi_without_data(): 571 | """Test processing instructions without data.""" 572 | xml = "" 573 | events = list(sloppy_xml.stream_parse(xml)) 574 | 575 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 576 | assert len(pi_events) == 1 577 | assert pi_events[0].target == "target" 578 | assert pi_events[0].data is None or pi_events[0].data == "" 579 | 580 | 581 | def test_multiple_pis(): 582 | """Test multiple processing instructions.""" 583 | xml = '' 584 | events = list(sloppy_xml.stream_parse(xml)) 585 | 586 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 587 | assert len(pi_events) == 2 588 | assert pi_events[0].target == "xml" 589 | assert pi_events[1].target == "stylesheet" 590 | 591 | 592 | def test_basic_cdata(): 593 | """Test basic CDATA sections.""" 594 | xml = "" 595 | events = list(sloppy_xml.stream_parse(xml)) 596 | 597 | text_events = [e for e in events if isinstance(e, Text)] 598 | assert len(text_events) == 1 599 | assert 
text_events[0].content == "some data" 600 | assert text_events[0].is_cdata 601 | 602 | 603 | def test_cdata_with_special_chars(): 604 | """Test CDATA with special characters.""" 605 | xml = "&\"']]>" 606 | events = list(sloppy_xml.stream_parse(xml)) 607 | 608 | text_events = [e for e in events if isinstance(e, Text)] 609 | assert len(text_events) == 1 610 | assert text_events[0].content == "<>&\"'" 611 | assert text_events[0].is_cdata 612 | 613 | 614 | def test_cdata_with_xml_content(): 615 | """Test CDATA containing XML-like content.""" 616 | xml = "content]]>" 617 | events = list(sloppy_xml.stream_parse(xml)) 618 | 619 | text_events = [e for e in events if isinstance(e, Text)] 620 | assert len(text_events) == 1 621 | assert text_events[0].content == "content" 622 | assert text_events[0].is_cdata 623 | 624 | 625 | def test_unclosed_tags(): 626 | """Test recovery from unclosed tags.""" 627 | xml = "text" 628 | events = list(sloppy_xml.stream_parse(xml, recover=True, auto_close_tags=True)) 629 | 630 | # Should auto-close both tags 631 | end_events = [e for e in events if isinstance(e, EndElement)] 632 | assert len(end_events) >= 1 # At least the child should be auto-closed 633 | 634 | # Build tree to verify structure 635 | tree = sloppy_xml.tree_parse(xml, recover=True, auto_close_tags=True) 636 | assert tree is not None 637 | 638 | 639 | def test_mismatched_tags(): 640 | """Test recovery from mismatched tags.""" 641 | xml = "" 642 | events = list(sloppy_xml.stream_parse(xml, recover=True, emit_errors=True)) 643 | 644 | error_events = [e for e in events if isinstance(e, ParseError)] 645 | assert len(error_events) > 0 646 | 647 | # Should still be able to build a tree 648 | tree = sloppy_xml.tree_parse(xml, recover=True, emit_errors=True) 649 | assert tree is not None 650 | 651 | 652 | def test_malformed_attributes(): 653 | """Test recovery from malformed attributes.""" 654 | xml = '= 0 # May or may not recover comment 679 | assert len(element_events) == 1 # Should 
definitely get the element 680 | 681 | 682 | def test_broken_cdata(): 683 | """Test recovery from broken CDATA.""" 684 | xml = "" 685 | events = list( 686 | sloppy_xml.stream_parse(xml, recovery_strategy=RecoveryStrategy.AGGRESSIVE) 687 | ) 688 | 689 | # Should not crash and should produce some events 690 | assert len(events) > 0 691 | 692 | 693 | def test_unescaped_characters(): 694 | """Test recovery from unescaped special characters.""" 695 | xml = "text with < and & characters" 696 | events = list( 697 | sloppy_xml.stream_parse(xml, recovery_strategy=RecoveryStrategy.LENIENT) 698 | ) 699 | 700 | # Should handle gracefully 701 | text_events = [e for e in events if isinstance(e, Text)] 702 | assert len(text_events) > 0 703 | 704 | 705 | def test_recovery_strategies(): 706 | """Test different recovery strategies.""" 707 | malformed_xml = '= 0 745 | 746 | 747 | def test_single_character(): 748 | """Test parsing single character.""" 749 | events = list(sloppy_xml.stream_parse("a")) 750 | text_events = [e for e in events if isinstance(e, Text)] 751 | assert len(text_events) == 1 752 | assert text_events[0].content == "a" 753 | 754 | 755 | def test_very_long_tag_names(): 756 | """Test very long tag names.""" 757 | long_name = "a" * 1000 758 | xml = f"<{long_name}>content" 759 | events = list(sloppy_xml.stream_parse(xml)) 760 | 761 | start_events = [e for e in events if isinstance(e, StartElement)] 762 | assert len(start_events) == 1 763 | assert start_events[0].name == long_name 764 | 765 | 766 | def test_very_long_attribute_values(): 767 | """Test very long attribute values.""" 768 | long_value = "x" * 10000 769 | xml = f'content' 770 | events = list(sloppy_xml.stream_parse(xml)) 771 | 772 | start_events = [e for e in events if isinstance(e, StartElement)] 773 | assert len(start_events) == 1 774 | assert start_events[0].attrs["attr"] == long_value 775 | 776 | 777 | def test_deeply_nested_elements(): 778 | """Test deeply nested elements.""" 779 | depth = 100 780 | 
open_tags = "".join(f"" for i in range(depth)) 781 | close_tags = "".join(f"" for i in range(depth - 1, -1, -1)) 782 | xml = open_tags + "content" + close_tags 783 | 784 | events = list(sloppy_xml.stream_parse(xml)) 785 | start_events = [e for e in events if isinstance(e, StartElement)] 786 | assert len(start_events) == depth 787 | 788 | 789 | def test_maximum_nesting_depth(): 790 | """Test maximum nesting depth limit.""" 791 | depth = 20 792 | xml = "".join(f"" for i in range(depth)) + "content" 793 | 794 | events = list(sloppy_xml.stream_parse(xml, max_depth=10)) 795 | error_events = [e for e in events if isinstance(e, ParseError)] 796 | 797 | # Should hit depth limit and generate error 798 | depth_errors = [e for e in error_events if "depth" in e.error_type.lower()] 799 | assert len(depth_errors) > 0 or len(events) > 0 # Either error or truncation 800 | 801 | 802 | def test_many_attributes(): 803 | """Test elements with many attributes.""" 804 | attrs = " ".join(f'attr{i}="value{i}"' for i in range(100)) 805 | xml = f"content" 806 | events = list(sloppy_xml.stream_parse(xml)) 807 | 808 | start_events = [e for e in events if isinstance(e, StartElement)] 809 | assert len(start_events) == 1 810 | assert len(start_events[0].attrs) == 100 811 | 812 | 813 | def test_special_characters_in_content(): 814 | """Test special characters in text content.""" 815 | special_chars = "áéíóú ñç 中文 🚀 \U0001f600" # Unicode chars 816 | xml = f"{special_chars}" 817 | events = list(sloppy_xml.stream_parse(xml)) 818 | 819 | text_events = [e for e in events if isinstance(e, Text)] 820 | assert len(text_events) == 1 821 | assert special_chars in text_events[0].content 822 | 823 | 824 | def test_xml_declaration(): 825 | """Test XML declaration handling.""" 826 | xml = 'content' 827 | events = list(sloppy_xml.stream_parse(xml)) 828 | 829 | pi_events = [e for e in events if isinstance(e, ProcessingInstruction)] 830 | # Should have XML declaration as processing instruction 831 | xml_decl = [e 
for e in pi_events if e.target == "xml"] 832 | assert len(xml_decl) == 1 833 | 834 | 835 | def test_basic_namespaces(): 836 | """Test basic namespace-aware parsing.""" 837 | xml = 'content' 838 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=True)) 839 | 840 | start_events = [e for e in events if isinstance(e, StartElement)] 841 | ns_elements = [e for e in start_events if ":" in e.name] 842 | assert len(ns_elements) > 0 843 | 844 | 845 | def test_default_namespace(): 846 | """Test default namespace handling.""" 847 | xml = 'content' 848 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=True)) 849 | 850 | # Should parse without errors 851 | start_events = [e for e in events if isinstance(e, StartElement)] 852 | assert len(start_events) == 2 853 | 854 | 855 | def test_namespace_disabled(): 856 | """Test parsing with namespaces disabled.""" 857 | xml = ( 858 | 'content' 859 | ) 860 | events = list(sloppy_xml.stream_parse(xml, namespace_aware=False)) # Default 861 | 862 | start_events = [e for e in events if isinstance(e, StartElement)] 863 | # Should treat ns:root as a regular tag name 864 | assert any(e.name == "ns:root" for e in start_events) 865 | 866 | 867 | def test_large_document_performance(): 868 | """Test performance with large documents.""" 869 | # Create a moderately large XML document 870 | num_elements = 5000 871 | xml_parts = [""] 872 | xml_parts.extend( 873 | f"Content for item {i}" for i in range(num_elements) 874 | ) 875 | xml_parts.append("") 876 | large_xml = "".join(xml_parts) 877 | 878 | start_time = time.time() 879 | events = list(sloppy_xml.stream_parse(large_xml)) 880 | parse_time = time.time() - start_time 881 | 882 | # Should complete in reasonable time (less than 1 second for 5000 elements) 883 | assert parse_time < 5.0, f"Parsing took {parse_time:.2f} seconds, too slow" 884 | 885 | # Should produce correct number of events 886 | start_events = [e for e in events if isinstance(e, StartElement)] 887 | assert 
len(start_events) == num_elements + 1 # +1 for root 888 | 889 | 890 | def test_deep_nesting_performance(): 891 | """Test performance with deeply nested documents.""" 892 | depth = 500 893 | open_tags = "".join(f"" for i in range(depth)) 894 | close_tags = "".join(f"" for i in range(depth - 1, -1, -1)) 895 | deep_xml = open_tags + "content" + close_tags 896 | 897 | start_time = time.time() 898 | events = list(sloppy_xml.stream_parse(deep_xml, max_depth=600)) 899 | parse_time = time.time() - start_time 900 | 901 | # Should complete in reasonable time 902 | assert parse_time < 2.0, f"Deep nesting parsing took {parse_time:.2f} seconds" 903 | assert len(events) > 0 904 | 905 | 906 | def test_memory_usage_streaming(): 907 | """Test that streaming doesn't accumulate excessive memory.""" 908 | # Create a large document 909 | large_xml = ( 910 | "" + "".join(f"data" for i in range(1000)) + "" 911 | ) 912 | 913 | # Parse as generator - should not load everything into memory 914 | event_generator = sloppy_xml.stream_parse(large_xml) 915 | 916 | # Process a few events to ensure generator works 917 | first_few_events = [] 918 | for i, event in enumerate(event_generator): 919 | first_few_events.append(event) 920 | if i >= 10: # Just get first 10 events 921 | break 922 | 923 | assert len(first_few_events) == 11 924 | # Generator should still have more events available 925 | next_event = next(event_generator, None) 926 | assert next_event is not None 927 | 928 | 929 | def test_entity_heavy_performance(): 930 | """Test performance with many entities.""" 931 | # Create XML with many entity references 932 | content_with_entities = "Text with & < > entities " * 1000 933 | xml = f"{content_with_entities}" 934 | 935 | start_time = time.time() 936 | events = list(sloppy_xml.stream_parse(xml)) 937 | parse_time = time.time() - start_time 938 | 939 | assert parse_time < 2.0 940 | text_events = [e for e in events if isinstance(e, Text)] 941 | assert len(text_events) == 1 942 | 943 | 944 | 
def test_llm_generated_malformed_xml(): 945 | """Test typical LLM-generated malformed XML.""" 946 | malformed_examples = [ 947 | # Missing quotes in attributes 948 | "content", 949 | # Mixed quotes 950 | "content", 951 | # Unclosed tags 952 | "content", 953 | # Tag soup 954 | "

textmore

", 955 | # Unescaped characters 956 | "text with < and & chars", 957 | # Broken CDATA 958 | "", 959 | # Malformed comments 960 | " 1022 | 1023 | Text content 1024 | alert('hello')]]> 1025 | 1026 | More text & entities 1027 | 1028 | 1029 | 1030 | 1031 | """ 1032 | 1033 | events = list(sloppy_xml.stream_parse(mixed_xml)) 1034 | 1035 | # Should have various event types 1036 | event_types = {type(e).__name__ for e in events} 1037 | expected_types = { 1038 | "StartElement", 1039 | "EndElement", 1040 | "Text", 1041 | "Comment", 1042 | "ProcessingInstruction", 1043 | } 1044 | 1045 | # Should have most of the expected types 1046 | assert len(event_types.intersection(expected_types)) >= 3 1047 | 1048 | 1049 | def test_encoding_issues(): 1050 | """Test documents with encoding issues.""" 1051 | # Simulate common encoding problems 1052 | xml_with_issues = 'Text with em—dash and "smart quotes"' 1053 | 1054 | events = list( 1055 | sloppy_xml.stream_parse(xml_with_issues, fix_encoding=True, emit_errors=True) 1056 | ) 1057 | 1058 | # Should handle without crashing 1059 | assert len(events) > 0 1060 | 1061 | # Should be able to build tree 1062 | tree = sloppy_xml.tree_parse(xml_with_issues, fix_encoding=True, emit_errors=True) 1063 | assert tree is not None 1064 | 1065 | 1066 | def test_fragment_parsing(): 1067 | """Test parsing XML fragments.""" 1068 | fragments = [ 1069 | "Just text content", 1070 | "content", # No root 1071 | "

para1

para2

", # Multiple roots 1072 | "Text before content text after", 1073 | ] 1074 | 1075 | for fragment in fragments: 1076 | events = list(sloppy_xml.stream_parse(fragment, allow_fragments=True)) 1077 | assert len(events) > 0 1078 | 1079 | # Should be able to handle fragments in tree parsing too 1080 | try: 1081 | tree = sloppy_xml.tree_parse(fragment, allow_fragments=True) 1082 | assert tree is not None 1083 | except ValueError: 1084 | # Some fragments might not produce valid trees, that's ok 1085 | pass 1086 | 1087 | 1088 | def test_full_pipeline_wellformed(): 1089 | """Test complete parsing pipeline with well-formed XML.""" 1090 | xml = """ 1091 | 1092 | 1093 | Test Document 1094 | 1095 |

Paragraph with emphasis and & entities.

1096 | 1097 | First item 1098 | Second item 1099 | 1100 | raw data]]> 1101 |
1102 |
""" 1103 | 1104 | # Test stream parsing 1105 | events = list(sloppy_xml.stream_parse(xml)) 1106 | assert len(events) > 10 1107 | 1108 | # Test tree building 1109 | tree = sloppy_xml.tree_parse(xml) 1110 | assert tree.tag == "document" 1111 | assert len(tree) == 2 # title and content 1112 | 1113 | # Test with various options 1114 | tree_with_opts = sloppy_xml.tree_parse( 1115 | xml, preserve_whitespace=False, resolve_entities=True, namespace_aware=True 1116 | ) 1117 | assert tree_with_opts.tag == "document" 1118 | 1119 | 1120 | def test_full_pipeline_malformed(): 1121 | """Test complete parsing pipeline with malformed XML.""" 1122 | malformed_xml = """", FLAGS), 232 | # Malformed comment patterns 233 | "broken_comment": re.compile(r")", 852 | "added missing comment close", 853 | severity="warning", 854 | ) 855 | pos = broken_comment_match.end() 856 | line, column = _update_position( 857 | text, broken_comment_match.start(), pos, line, column 858 | ) 859 | recovery_attempts += 1 860 | continue 861 | 862 | # Check for CDATA 863 | cdata_match = PATTERNS["cdata"].match(text, pos) 864 | if cdata_match: 865 | yield from flush_text() 866 | cdata_content = cdata_match.group(1) 867 | yield Text(cdata_content, line, column, is_cdata=True) 868 | pos = cdata_match.end() 869 | line, column = _update_position( 870 | text, cdata_match.start(), pos, line, column 871 | ) 872 | continue 873 | 874 | # Try to recover from broken CDATA 875 | if options.recovery_strategy in [ 876 | RecoveryStrategy.LENIENT, 877 | RecoveryStrategy.AGGRESSIVE, 878 | ]: 879 | broken_cdata_match = PATTERNS["broken_cdata"].match(text, pos) 880 | if broken_cdata_match: 881 | yield from flush_text() 882 | cdata_content = broken_cdata_match.group(1) 883 | yield Text(cdata_content, line, column, is_cdata=True) 884 | yield from emit_error( 885 | "broken_cdata", 886 | "Fixed malformed CDATA section (missing closing ]]>)", 887 | "added missing CDATA close", 888 | severity="warning", 889 | ) 890 | pos = 
broken_cdata_match.end() 891 | line, column = _update_position( 892 | text, broken_cdata_match.start(), pos, line, column 893 | ) 894 | recovery_attempts += 1 895 | continue 896 | 897 | # Check for processing instructions 898 | pi_match = PATTERNS["pi"].match(text, pos) 899 | if pi_match: 900 | yield from flush_text() 901 | target = pi_match.group(1) 902 | data = pi_match.group(2).strip() if pi_match.group(2) else None 903 | yield ProcessingInstruction(target, data, line, column) 904 | pos = pi_match.end() 905 | line, column = _update_position( 906 | text, pi_match.start(), pos, line, column 907 | ) 908 | continue 909 | 910 | # Check for end tags 911 | end_tag_match = PATTERNS["end_tag"].match(text, pos) 912 | if end_tag_match: 913 | yield from flush_text() 914 | tag_name = end_tag_match.group(1) 915 | 916 | # Handle tag mismatch with recovery 917 | if tag_stack and tag_stack[-1] != tag_name: 918 | if options.recover: 919 | # Attempt recovery 920 | recovery_events = _recover_tag_mismatch( 921 | tag_stack, tag_name, line, column 922 | ) 923 | for event in recovery_events: 924 | yield event 925 | yield from emit_error( 926 | "tag_mismatch", 927 | f"Mismatched end tag '{tag_name}', expected '{tag_stack[-1] if tag_stack else 'none'}'", 928 | f"auto-closed {len(recovery_events)} tags", 929 | severity="warning", 930 | ) 931 | recovery_attempts += 1 932 | else: 933 | yield from emit_error( 934 | "tag_mismatch", 935 | f"Mismatched end tag '{tag_name}'", 936 | fatal=True, 937 | ) 938 | break 939 | elif tag_stack: 940 | tag_stack.pop() 941 | 942 | yield EndElement(tag_name, line, column, auto_closed=False) 943 | pos = end_tag_match.end() 944 | line, column = _update_position( 945 | text, end_tag_match.start(), pos, line, column 946 | ) 947 | continue 948 | 949 | # Check for start tags 950 | start_tag_match = PATTERNS["start_tag"].match(text, pos) 951 | if start_tag_match: 952 | yield from flush_text() 953 | tag_name = start_tag_match.group(1) 954 | attr_string = 
start_tag_match.group(2) 955 | self_closing = bool(start_tag_match.group(3)) 956 | 957 | # Parse attributes with enhanced recovery 958 | try: 959 | if options.repair_attributes: 960 | attributes, attr_recovery_messages = _repair_attributes( 961 | attr_string, 962 | options.smart_quotes, 963 | options.recovery_strategy, 964 | options.resolve_entities, 965 | ) 966 | 967 | # Emit recovery messages 968 | for recovery_msg in attr_recovery_messages: 969 | yield from emit_error( 970 | "attribute_repair", 971 | recovery_msg, 972 | recovery_msg, 973 | severity="warning", 974 | ) 975 | recovery_attempts += 1 976 | else: 977 | attributes = _parse_attributes( 978 | attr_string, options.resolve_entities 979 | ) 980 | except Exception as e: 981 | attributes = {} 982 | yield from emit_error( 983 | "attribute_parse", 984 | f"Failed to parse attributes: {e}", 985 | "using empty attributes", 986 | ) 987 | 988 | # Handle namespaces if enabled 989 | namespace = None 990 | if options.namespace_aware and ":" in tag_name: 991 | # Simple namespace handling - could be enhanced 992 | prefix, local_name = tag_name.split(":", 1) 993 | namespace = attributes.get(f"xmlns:{prefix}") 994 | 995 | yield StartElement(tag_name, attributes, line, column, namespace) 996 | 997 | # Track tag for matching (unless self-closing) 998 | if not self_closing: 999 | tag_stack.append(tag_name) 1000 | else: 1001 | # Emit matching end element for self-closing tag 1002 | yield EndElement(tag_name, line, column, auto_closed=False) 1003 | 1004 | pos = start_tag_match.end() 1005 | line, column = _update_position( 1006 | text, start_tag_match.start(), pos, line, column 1007 | ) 1008 | continue 1009 | 1010 | # Regular text content 1011 | text_match = PATTERNS["text_content"].match(text, pos) 1012 | if text_match: 1013 | text_content = text_match.group(0) 1014 | text_buffer.append(text_content) 1015 | pos = text_match.end() 1016 | line, column = _update_position( 1017 | text, text_match.start(), pos, line, column 1018 | ) 
1019 | continue 1020 | 1021 | # Handle entity references as part of text content 1022 | entity_match = PATTERNS["entity_ref"].match(text, pos) 1023 | if entity_match: 1024 | # Add entity to text buffer (will be resolved in flush_text) 1025 | entity_text = entity_match.group(0) 1026 | text_buffer.append(entity_text) 1027 | pos = entity_match.end() 1028 | line, column = _update_position( 1029 | text, entity_match.start(), pos, line, column 1030 | ) 1031 | continue 1032 | 1033 | # Check for incomplete tags at end of input 1034 | if options.recovery_strategy in [ 1035 | RecoveryStrategy.LENIENT, 1036 | RecoveryStrategy.AGGRESSIVE, 1037 | ]: 1038 | incomplete_tag, new_pos, incomplete_recovery = _handle_incomplete_tag( 1039 | text, pos, line, column, options.recovery_strategy 1040 | ) 1041 | if incomplete_tag: 1042 | yield from flush_text() 1043 | # Create a start element for the incomplete tag 1044 | yield StartElement(incomplete_tag, {}, line, column, None) 1045 | if options.auto_close_tags: 1046 | yield EndElement(incomplete_tag, line, column, auto_closed=True) 1047 | 1048 | for recovery_msg in incomplete_recovery: 1049 | yield from emit_error( 1050 | "incomplete_tag", 1051 | recovery_msg, 1052 | recovery_msg, 1053 | severity="warning", 1054 | ) 1055 | 1056 | pos = new_pos 1057 | recovery_attempts += 1 1058 | continue 1059 | 1060 | # Handle smart quote recovery if enabled 1061 | if options.smart_quotes and options.recovery_strategy in [ 1062 | RecoveryStrategy.LENIENT, 1063 | RecoveryStrategy.AGGRESSIVE, 1064 | ]: 1065 | if pos < len(text) and text[pos] in {'"', "'"}: 1066 | fixed_text, new_pos, quote_recovery = _smart_quote_recovery( 1067 | text, pos 1068 | ) 1069 | if quote_recovery: 1070 | text = fixed_text 1071 | for recovery_msg in quote_recovery: 1072 | yield from emit_error( 1073 | "quote_mismatch", 1074 | recovery_msg, 1075 | recovery_msg, 1076 | severity="warning", 1077 | ) 1078 | recovery_attempts += 1 1079 | # Continue parsing from current position 1080 | 
continue 1081 | 1082 | # If nothing matches, advance by one character to avoid infinite loop 1083 | if pos < len(text): 1084 | char = text[pos] 1085 | if char not in " \t\r\n" or options.preserve_whitespace: 1086 | text_buffer.append(char) 1087 | pos += 1 1088 | if char == "\n": 1089 | line += 1 1090 | column = 1 1091 | else: 1092 | column += 1 1093 | 1094 | # Flush any remaining text 1095 | yield from flush_text() 1096 | 1097 | # Auto-close any remaining open tags if recovery is enabled 1098 | if options.recover and options.auto_close_tags and tag_stack: 1099 | yield from emit_error( 1100 | "unclosed_tags", 1101 | f"{len(tag_stack)} unclosed tags at end of input", 1102 | "auto-closing all remaining tags", 1103 | severity="warning", 1104 | ) 1105 | while tag_stack: 1106 | tag_name = tag_stack.pop() 1107 | yield EndElement(tag_name, line, column, auto_closed=True) 1108 | 1109 | # Emit collected errors if requested 1110 | if options.collect_errors and collected_errors: 1111 | for error in collected_errors: 1112 | yield error 1113 | 1114 | 1115 | def tree_parse( 1116 | xml_input: Union[str, TextIO, Iterator[XMLEvent]], 1117 | tree_builder: Optional[TreeBuilder] = None, 1118 | tree: TreeType = "etree", 1119 | **parse_options, 1120 | ) -> ET.Element: 1121 | """ 1122 | Build XML tree from input or parsing events. 1123 | 1124 | This is a convenience function that combines streaming parsing with tree 1125 | building. It can accept either raw XML input (which will be parsed) or 1126 | an iterator of parsing events. 
1127 | 1128 | Args: 1129 | xml_input: XML string, file-like object, or iterator of XMLEvents 1130 | tree_builder: Tree builder instance (defaults to ElementTreeBuilder) 1131 | **parse_options: Options passed to stream_parse if needed 1132 | 1133 | Returns: 1134 | ET.Element: Root element of the constructed XML tree 1135 | 1136 | Raises: 1137 | ValueError: If input is invalid or tree construction fails 1138 | TypeError: If tree_builder doesn't implement TreeBuilder interface 1139 | 1140 | Examples: 1141 | >>> xml = "text" 1142 | >>> root = tree_parse(xml) 1143 | >>> root.tag 1144 | 'root' 1145 | >>> root[0].text 1146 | 'text' 1147 | 1148 | >>> # Use enhanced options for fragments 1149 | >>> root = tree_parse("text only", allow_fragments=True) 1150 | """ 1151 | # Create the right tree builder if none provided 1152 | if tree_builder is None: 1153 | tree_builder = _backends[tree]() 1154 | 1155 | # Check if input is already an event stream (but not a file-like object) 1156 | if ( 1157 | hasattr(xml_input, "__iter__") 1158 | and not isinstance(xml_input, (str, bytes)) 1159 | and not hasattr(xml_input, "read") 1160 | ): 1161 | # Assume it's an iterator of events 1162 | events = xml_input 1163 | else: 1164 | # Parse raw XML input (string or file-like object) 1165 | events = stream_parse(xml_input, **parse_options) 1166 | 1167 | # Process events through tree builder 1168 | for event in events: 1169 | if isinstance(event, StartElement): 1170 | tree_builder.start_element(event) 1171 | elif isinstance(event, EndElement): 1172 | tree_builder.end_element(event) 1173 | elif isinstance(event, Text): 1174 | tree_builder.text(event) 1175 | elif isinstance(event, Comment): 1176 | tree_builder.comment(event) 1177 | elif isinstance(event, ProcessingInstruction): 1178 | tree_builder.processing_instruction(event) 1179 | elif isinstance(event, EntityRef): 1180 | tree_builder.entity_ref(event) 1181 | elif isinstance(event, ParseError): 1182 | tree_builder.parse_error(event) 1183 | 1184 | # 
Return constructed tree root 1185 | root = tree_builder.get_root() 1186 | if root is None: 1187 | # Check if fragments are allowed (default behavior) 1188 | allow_fragments = parse_options.get("allow_fragments", True) 1189 | if allow_fragments: 1190 | # Create a synthetic root element for fragments 1191 | from xml.etree.ElementTree import Element 1192 | 1193 | synthetic_root = Element("fragment") 1194 | return synthetic_root 1195 | else: 1196 | raise ValueError("No valid XML root element found") 1197 | return root 1198 | 1199 | 1200 | # ============================================================================= 1201 | # Internal Helper Functions 1202 | # ============================================================================= 1203 | 1204 | 1205 | def _resolve_entity(entity_name: str, is_numeric: bool = False) -> str: 1206 | """ 1207 | Resolve entity reference to character. 1208 | 1209 | Args: 1210 | entity_name: Entity name (without & and ;) 1211 | is_numeric: True for numeric entities ({ or ) 1212 | 1213 | Returns: 1214 | str: Resolved character or original entity if unresolvable 1215 | """ 1216 | if is_numeric: 1217 | try: 1218 | if entity_name.startswith("x") or entity_name.startswith("X"): 1219 | # Hexadecimal numeric entity 1220 | code_point = int(entity_name[1:], 16) 1221 | else: 1222 | # Decimal numeric entity 1223 | code_point = int(entity_name) 1224 | 1225 | # Validate Unicode code point range 1226 | if 0 <= code_point <= 0x10FFFF: 1227 | return chr(code_point) 1228 | except (ValueError, OverflowError): 1229 | pass 1230 | else: 1231 | # Named entity - try both with and without semicolon 1232 | if entity_name in HTML_ENTITIES: 1233 | return HTML_ENTITIES[entity_name] 1234 | elif entity_name + ";" in HTML_ENTITIES: 1235 | return HTML_ENTITIES[entity_name + ";"] 1236 | 1237 | # Return original entity if unresolvable 1238 | return f"&{entity_name};" 1239 | 1240 | 1241 | def _resolve_entities_in_text(text: str) -> str: 1242 | """ 1243 | Resolve entity 
references in text content.

    Args:
        text: Text content that may contain entity references

    Returns:
        str: Text with entities resolved to their character equivalents
    """
    # Fast path: nothing to do without an ampersand.
    if not text or "&" not in text:
        return text

    # Find and resolve entities in text
    entity_pos = 0
    resolved_parts = []
    has_entities = False

    for entity_match in PATTERNS["entity_ref"].finditer(text):
        has_entities = True
        # Add text before entity
        resolved_parts.append(text[entity_pos : entity_match.start()])

        # Resolve entity
        if entity_match.group(1):  # Named entity
            resolved = _resolve_entity(entity_match.group(1), False)
        elif entity_match.group(2):  # Decimal numeric
            resolved = _resolve_entity(entity_match.group(2), True)
        elif entity_match.group(3):  # Hex numeric
            # Re-add the "x" prefix so _resolve_entity parses base 16.
            resolved = _resolve_entity("x" + entity_match.group(3), True)
        else:
            resolved = entity_match.group(0)  # Keep original

        resolved_parts.append(resolved)
        entity_pos = entity_match.end()

    if has_entities:
        # Add remaining text
        resolved_parts.append(text[entity_pos:])
        return "".join(resolved_parts)

    return text


def _parse_attributes(
    attr_string: str, resolve_entities: bool = True
) -> Dict[str, str]:
    """
    Parse attribute string into name-value dictionary.

    Args:
        attr_string: Raw attribute string from tag
        resolve_entities: Whether to resolve entity references in attribute values

    Returns:
        Dict[str, str]: Dictionary of attribute names to values
    """
    attributes = {}
    if not attr_string.strip():
        return attributes

    # Find all attribute matches
    for match in PATTERNS["attributes"].finditer(attr_string):
        name = match.group(1)
        # Groups: 1=name, 2=double-quoted, 3=single-quoted, 4=unquoted
        if match.group(2) is not None:
            value = match.group(2)  # Double-quoted value
        elif match.group(3) is not None:
            value = match.group(3)  # Single-quoted value
        elif match.group(4) is not None:
            value = match.group(4)  # Unquoted value
        else:
            value = ""  # Attribute without value

        # Resolve entities in attribute value if enabled
        if resolve_entities and value:
            value = _resolve_entities_in_text(value)

        # Duplicate attribute names: last occurrence wins.
        attributes[name] = value

    return attributes


def _update_position(
    text: str, start_pos: int, end_pos: int, current_line: int, current_column: int
) -> tuple[int, int]:
    """
    Update line and column numbers based on text consumed.
1329 | 1330 | Args: 1331 | text: Source text 1332 | start_pos: Starting position 1333 | end_pos: Ending position 1334 | current_line: Current line number 1335 | current_column: Current column number 1336 | 1337 | Returns: 1338 | tuple[int, int]: Updated (line, column) position 1339 | """ 1340 | consumed_text = text[start_pos:end_pos] 1341 | 1342 | # Count newlines in the consumed text 1343 | newline_count = consumed_text.count("\n") 1344 | 1345 | if newline_count == 0: 1346 | # No newlines, just advance column 1347 | return current_line, current_column + (end_pos - start_pos) 1348 | else: 1349 | # Update line number and reset column to position after last newline 1350 | new_line = current_line + newline_count 1351 | last_newline_pos = consumed_text.rfind("\n") 1352 | new_column = len(consumed_text) - last_newline_pos - 1 1353 | return new_line, new_column 1354 | 1355 | 1356 | def _recover_tag_mismatch( 1357 | tag_stack: List[str], end_tag: str, line: int, column: int 1358 | ) -> List[EndElement]: 1359 | """ 1360 | Recover from mismatched end tags by auto-closing open tags. 
1361 | 1362 | Args: 1363 | tag_stack: Stack of currently open tags 1364 | end_tag: The end tag that was encountered 1365 | line: Current line number 1366 | column: Current column number 1367 | 1368 | Returns: 1369 | List[EndElement]: Auto-generated end elements for recovery 1370 | """ 1371 | recovery_events = [] 1372 | 1373 | # Look for the matching start tag in the stack 1374 | matching_index = -1 1375 | for i in range(len(tag_stack) - 1, -1, -1): 1376 | if tag_stack[i] == end_tag: 1377 | matching_index = i 1378 | break 1379 | 1380 | if matching_index >= 0: 1381 | # Auto-close all tags above the matching one 1382 | for i in range(len(tag_stack) - 1, matching_index, -1): 1383 | tag_name = tag_stack.pop() 1384 | recovery_events.append(EndElement(tag_name, line, column, auto_closed=True)) 1385 | # Remove the matching tag as well 1386 | tag_stack.pop() 1387 | 1388 | return recovery_events 1389 | 1390 | 1391 | def _create_error_with_context( 1392 | error_type: str, 1393 | message: str, 1394 | line: int, 1395 | column: int, 1396 | text: str, 1397 | pos: int, 1398 | recovery: str = "", 1399 | fatal: bool = False, 1400 | severity: str = "error", 1401 | ) -> ParseError: 1402 | """ 1403 | Create a ParseError with surrounding context for better debugging. 

    Args:
        error_type: Category of error
        message: Human-readable error description
        line: Line number where error occurred
        column: Column number where error occurred
        text: Full source text
        pos: Position in text where error occurred
        recovery: Description of recovery action
        fatal: Whether parsing cannot continue
        severity: Error severity level

    Returns:
        ParseError: Enhanced error with context
    """
    # Extract context around the error position (up to 50 chars each side)
    context_start = max(0, pos - 50)
    context_end = min(len(text), pos + 50)
    context = text[context_start:context_end]

    # Mark the error position in context
    if pos >= context_start and pos < context_end:
        error_pos = pos - context_start
        context = context[:error_pos] + ">>>" + context[error_pos:]

    # Clean up context (remove newlines for readability)
    context = context.replace("\n", "\\n").replace("\r", "\\r")

    return ParseError(
        error_type=error_type,
        message=message,
        line=line,
        column=column,
        recovery=recovery,
        fatal=fatal,
        context=context,
        severity=severity,
    )


def _repair_attributes(
    attr_string: str,
    smart_quotes: bool = True,
    recovery_strategy: RecoveryStrategy = RecoveryStrategy.LENIENT,
    resolve_entities: bool = True,
) -> Tuple[Dict[str, str], List[str]]:
    """
    Enhanced attribute parsing with error recovery.

    Args:
        attr_string: Raw attribute string from tag
        smart_quotes: Enable smart quote matching
        recovery_strategy: Recovery approach to use
        resolve_entities: Whether to resolve entity references in attribute values

    Returns:
        Tuple of (attributes dict, list of recovery messages)
    """
    attributes = {}
    recovery_messages = []

    if not attr_string.strip():
        return attributes, recovery_messages

    # First try normal parsing
    for match in PATTERNS["attributes"].finditer(attr_string):
        name = match.group(1)
        # Groups: 1=name, 2=double-quoted, 3=single-quoted, 4=unquoted
        if match.group(2) is not None:
            value = match.group(2)  # Double-quoted value
        elif match.group(3) is not None:
            value = match.group(3)  # Single-quoted value
        elif match.group(4) is not None:
            value = match.group(4)  # Unquoted value
        else:
            value = ""  # Attribute without value

        # Resolve entities in attribute value if enabled
        if resolve_entities and value:
            value = _resolve_entities_in_text(value)

        attributes[name] = value

    # If normal parsing didn't capture everything, try recovery
    if recovery_strategy != RecoveryStrategy.STRICT:
        remaining = attr_string

        # Remove successfully parsed attributes
        # NOTE(review): str.replace removes the first identical-looking span,
        # which may not be the matched occurrence for pathological inputs —
        # confirm acceptable for the lenient path.
        for match in PATTERNS["attributes"].finditer(attr_string):
            remaining = remaining.replace(match.group(0), "", 1)

        remaining = remaining.strip()
        if remaining:
            # Try to recover malformed attributes
            if smart_quotes:
                # Handle mixed quotes
                for match in PATTERNS["mixed_quotes"].finditer(remaining):
                    name = match.group(1)
                    value = match.group(2) or match.group(3)

                    # Resolve entities in attribute value if enabled
                    if resolve_entities and value:
                        value = _resolve_entities_in_text(value)

                    attributes[name] = value
                    recovery_messages.append(
                        f"Fixed mixed quotes in attribute '{name}'"
                    )
                    remaining = remaining.replace(match.group(0), "", 1)

            # Handle unquoted values
            for match in PATTERNS["malformed_attr"].finditer(remaining):
                name = match.group(1)
                value = match.group(2)

                # Resolve entities in attribute value if enabled
                if resolve_entities and value:
                    value = _resolve_entities_in_text(value)

                attributes[name] = value
                recovery_messages.append(f"Added quotes to unquoted attribute '{name}'")
                remaining = remaining.replace(match.group(0), "", 1)

    return attributes, recovery_messages


def _fix_encoding_issues(text: str) -> Tuple[str, List[str]]:
    """
    Attempt to fix common encoding issues in XML text.

    Args:
        text: Input text that may have encoding issues

    Returns:
        Tuple of (fixed text, list of fix messages)
    """
    fixes = []
    result = text

    # Remove or replace control characters
    if PATTERNS["encoding_issues"].search(result):
        original_len = len(result)
        result = PATTERNS["encoding_issues"].sub("", result)
        fixes.append(f"Removed {original_len - len(result)} control characters")

    # Common encoding fixes
    encoding_fixes = {
        "\u2013": "-",  # en dash
        "\u2014": "--",  # em dash
        "\u2018": "'",  # left single quote
        "\u2019": "'",  # right single quote
        "\u201c": '"',  # left double quote
        "\u201d": '"',  # right double quote
        "\u2026": "...",  # ellipsis
        "\u00a0": " ",  # non-breaking space
    }

    for bad_char, replacement in encoding_fixes.items():
        if bad_char in result:
            result = result.replace(bad_char, replacement)
            fixes.append(f"Replaced '{bad_char}' with 
'{replacement}'") 1565 | 1566 | return result, fixes 1567 | 1568 | 1569 | def _smart_quote_recovery(text: str, pos: int) -> Tuple[str, int, List[str]]: 1570 | """ 1571 | Attempt to recover from quote mismatches using smart matching. 1572 | 1573 | Args: 1574 | text: Full text being parsed 1575 | pos: Current position in text 1576 | 1577 | Returns: 1578 | Tuple of (recovered text, new position, recovery messages) 1579 | """ 1580 | recovery_messages = [] 1581 | 1582 | # Look for common quote patterns that can be fixed 1583 | # This is a simplified implementation - could be much more sophisticated 1584 | 1585 | # Find the nearest quote characters 1586 | quote_chars = {'"', "'"} 1587 | 1588 | # Look ahead for potential quote issues 1589 | look_ahead = text[pos : pos + 100] # Look at next 100 chars 1590 | 1591 | quote_positions = [] 1592 | for i, char in enumerate(look_ahead): 1593 | if char in quote_chars: 1594 | quote_positions.append((i + pos, char)) 1595 | 1596 | if len(quote_positions) >= 2: 1597 | # Try to match quotes intelligently 1598 | if quote_positions[0][1] != quote_positions[1][1]: 1599 | # Mismatched quotes - try to fix 1600 | first_pos, first_char = quote_positions[0] 1601 | second_pos, second_char = quote_positions[1] 1602 | 1603 | # Replace the second quote with the first to match 1604 | fixed_text = text[:second_pos] + first_char + text[second_pos + 1 :] 1605 | recovery_messages.append( 1606 | f"Fixed mismatched quotes: '{second_char}' -> '{first_char}'" 1607 | ) 1608 | return fixed_text, pos, recovery_messages 1609 | 1610 | return text, pos, recovery_messages 1611 | 1612 | 1613 | def _handle_incomplete_tag( 1614 | text: str, 1615 | pos: int, 1616 | line: int, 1617 | column: int, 1618 | recovery_strategy: RecoveryStrategy = RecoveryStrategy.LENIENT, 1619 | ) -> Tuple[Optional[str], int, List[str]]: 1620 | """ 1621 | Handle incomplete or malformed tags at end of input. 
1622 | 1623 | Args: 1624 | text: Full text being parsed 1625 | pos: Current position in text 1626 | line: Current line number 1627 | column: Current column number 1628 | recovery_strategy: Recovery approach to use 1629 | 1630 | Returns: 1631 | Tuple of (recovered tag name or None, new position, recovery messages) 1632 | """ 1633 | recovery_messages = [] 1634 | 1635 | # Check if we have an incomplete tag 1636 | incomplete_match = PATTERNS["incomplete_tag"].match(text, pos) 1637 | if incomplete_match: 1638 | tag_name = incomplete_match.group(1) 1639 | incomplete_match.group(2) 1640 | 1641 | if recovery_strategy in [RecoveryStrategy.LENIENT, RecoveryStrategy.AGGRESSIVE]: 1642 | # Try to recover by adding missing > 1643 | recovery_messages.append( 1644 | f"Added missing '>' to incomplete tag '{tag_name}'" 1645 | ) 1646 | return tag_name, len(text), recovery_messages 1647 | 1648 | # Look for other common incomplete patterns 1649 | remaining = text[pos:].strip() 1650 | if remaining.startswith("<"): 1651 | # Incomplete tag start 1652 | if recovery_strategy == RecoveryStrategy.AGGRESSIVE: 1653 | # Try to extract tag name 1654 | tag_match = re.match(r"<\s*([a-zA-Z_:][a-zA-Z0-9_:.-]*)", remaining) 1655 | if tag_match: 1656 | tag_name = tag_match.group(1) 1657 | recovery_messages.append(f"Recovered incomplete tag '{tag_name}'") 1658 | return tag_name, len(text), recovery_messages 1659 | 1660 | return None, pos, recovery_messages 1661 | --------------------------------------------------------------------------------