├── py.typed ├── tests ├── __init__.py ├── samples │ ├── html_fragment.html │ ├── html_doc.html │ └── html_unicorn_fragment.html ├── element │ ├── test_content.py │ ├── test_create_tags.py │ └── test_element.py ├── html │ ├── test_root_element.py │ ├── test_query.py │ ├── test_html.py │ ├── test_prettify.py │ └── samples │ │ ├── hacker-news.html │ │ └── expected │ │ └── hacker-news.html └── test_benchmarks.py ├── README.md ├── .readthedocs.yml ├── docs ├── source │ ├── installation.md │ ├── changelog.md │ ├── conf.py │ ├── editing.md │ ├── index.md │ ├── querying.md │ ├── element.md │ └── parsing.md ├── Makefile └── make.bat ├── minestrone ├── formatter.py ├── __init__.py └── element │ ├── prettifier.py │ └── __init__.py ├── conftest.py ├── LICENSE ├── .gitignore └── pyproject.toml /py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minestrone 2 | 3 | Search, modify, and parse messy HTML with ease. 4 | 5 | Documentation at https://minestrone.readthedocs.io/. 
6 | -------------------------------------------------------------------------------- /tests/samples/html_fragment.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | fail_on_warning: true 6 | builder: dirhtml 7 | 8 | formats: 9 | - pdf 10 | - epub 11 | 12 | python: 13 | version: 3 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: 18 | - docs 19 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | To use `minestrone`, first install it using `poetry`: 4 | 5 | ```shell 6 | poetry add minestrone 7 | ``` 8 | 9 | OR install it using `pip`: 10 | 11 | ```shell 12 | pip install minestrone 13 | ``` 14 | 15 | ```{note} 16 | `minestrone[lxml]` or `minestrone[html5]` can be installed to include support for external HTML parsers. More information in [parsing](parsing.md). 
17 | ``` -------------------------------------------------------------------------------- /minestrone/formatter.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | 3 | 4 | class UnsortedAttributes(bs4.formatter.HTMLFormatter): 5 | """Prevent `beautifulsoup` from re-ordering HTML attributes.""" 6 | 7 | def __init__(self): 8 | super().__init__( 9 | entity_substitution=bs4.dammit.EntitySubstitution.substitute_html 10 | ) 11 | 12 | def attributes(self, tag: bs4.element.Tag): 13 | for k, v in tag.attrs.items(): 14 | yield k, v 15 | -------------------------------------------------------------------------------- /tests/samples/html_doc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | The Dormouse's story 4 | 5 | 6 |

The Dormouse's story

7 | 8 | 13 | 14 | -------------------------------------------------------------------------------- /docs/source/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.8.0 4 | 5 | - Add `Element.insert` and `Element.remove_children`. 6 | 7 | ## 0.7.0 8 | 9 | - Add `HTML.elements`. 10 | 11 | ## 0.6.2 12 | 13 | - Optimize `prettify` method to be as fast as possible. 14 | - Support HTML doctype, comments, void elements, and other improvements for `prettify`. 15 | 16 | ## 0.6.1 17 | 18 | - Fix a few bugs for `HTML.prettify()` and `Element.prettify()`. 19 | 20 | ## 0.6.0 21 | 22 | - Add `Element.prettify()`. 23 | 24 | ## 0.5.1 25 | 26 | - Handle HTML tags when getting `Element.text`. 27 | 28 | ## 0.5.0 29 | 30 | - Add setter for `Element.id`. 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/element/test_content.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone.element import Content 4 | 5 | 6 | @pytest.fixture 7 | def content(): 8 | return Content() 9 | 10 | 11 | def test_convert_attributes_klass(content): 12 | attributes = {"klass": "test1"} 13 | actual = content._convert_attributes(attributes) 14 | 15 | assert actual == {"class": "test1"} 16 | 17 | 18 | def test_convert_attributes_css(content): 19 | attributes = {"css": "test1"} 20 | actual = content._convert_attributes(attributes) 21 | 22 | assert actual == {"class": "test1"} 23 | 24 | 25 | def test_convert_attributes_true_value(content): 26 | attributes = {"disabled": True} 27 | actual = content._convert_attributes(attributes) 28 | 29 | assert actual == {"disabled": "disabled"} 30 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone import HTML 4 | 5 | 6 | @pytest.fixture 7 | def html_doc(html_doc_str) -> HTML: 8 | return HTML(html_doc_str) 9 | 10 | 11 | @pytest.fixture 12 | def html_doc_str() -> str: 13 | with open("tests/samples/html_doc.html", "r") as f: 14 | return f.read() 15 | 16 | 17 | @pytest.fixture 18 | def html_fragment(html_fragment_str) -> HTML: 19 | return HTML(html_fragment_str) 20 | 21 | 22 | @pytest.fixture 23 | def html_fragment_str() -> str: 24 | with open("tests/samples/html_fragment.html", "r") as f: 25 | return f.read() 26 | 27 | 28 | @pytest.fixture 29 | def html_unicorn_fragment(html_unicorn_fragment_str) -> HTML: 30 | return HTML(html_unicorn_fragment_str) 31 | 32 | 33 | @pytest.fixture 34 | def html_unicorn_fragment_str() -> str: 35 | with 
open("tests/samples/html_unicorn_fragment.html", "r") as f: 36 | return f.read() 37 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /tests/samples/html_unicorn_fragment.html: -------------------------------------------------------------------------------- 1 |
3 |
6 |
Step 2
7 |
8 | 9 | 10 | 11 | 12 |
13 |
14 | Address: 123 Main St
15 | City: Anytown
16 | State: CA
17 | Zip code: 12345
18 |
19 | 20 | 21 |
22 |
-------------------------------------------------------------------------------- /tests/html/test_root_element.py: -------------------------------------------------------------------------------- 1 | from minestrone import HTML, Element 2 | 3 | 4 | def test_root_element(html_doc): 5 | root_element = html_doc.root_element 6 | assert isinstance(root_element, Element) 7 | assert root_element.name == "html" 8 | 9 | 10 | def test_root_element_with_extra_linebreaks(): 11 | html = HTML( 12 | """ 13 | 14 |

The Dormouse's story

15 | """ 16 | ) 17 | 18 | root_element = html.root_element 19 | assert isinstance(root_element, Element) 20 | assert root_element.name == "p" 21 | 22 | 23 | def test_root_element_with_comment(): 24 | html = HTML( 25 | """ 26 | 27 |

The Dormouse's story

28 | """ 29 | ) 30 | 31 | root_element = html.root_element 32 | assert isinstance(root_element, Element) 33 | assert root_element.name == "p" 34 | 35 | 36 | def test_root_element_missing(): 37 | html = HTML( 38 | """ 39 | 40 | testing 41 | """ 42 | ) 43 | 44 | root_element = html.root_element 45 | assert root_element is None 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Adam Hill 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import toml 2 | 3 | project = "minestrone" 4 | copyright = "2021, Adam Hill" 5 | author = "Adam Hill" 6 | 7 | pyproject = toml.load("../../pyproject.toml") 8 | version = pyproject["tool"]["poetry"]["version"] 9 | release = version 10 | 11 | # -- General configuration 12 | 13 | extensions = [ 14 | "sphinx.ext.duration", 15 | "sphinx.ext.doctest", 16 | "sphinx.ext.autodoc", 17 | "sphinx.ext.autosummary", 18 | "sphinx.ext.intersphinx", 19 | "myst_parser", 20 | "sphinx_copybutton", 21 | "sphinx.ext.napoleon", 22 | "sphinx.ext.autosectionlabel", 23 | ] 24 | 25 | intersphinx_mapping = { 26 | "python": ("https://docs.python.org/3/", None), 27 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 28 | } 29 | intersphinx_disabled_domains = ["std"] 30 | 31 | templates_path = ["_templates"] 32 | 33 | # -- Options for HTML output 34 | 35 | html_theme = "furo" 36 | 37 | # -- Options for EPUB output 38 | epub_show_urls = "footnote" 39 | 40 | autosectionlabel_prefix_document = True 41 | autosectionlabel_maxdepth = 3 42 | 43 | myst_heading_anchors = 3 44 | myst_enable_extensions = ["linkify", "colon_fence"] 45 | -------------------------------------------------------------------------------- /docs/source/editing.md: -------------------------------------------------------------------------------- 1 | # Editing 2 | 3 | To edit HTML, first query for an `Element` and then call one of the following methods. 4 | 5 | ## prepend 6 | 7 | Adds new text or an element **before** the calling element. 
8 | 9 | ### Prepend an element 10 | 11 | ```python 12 | from minestrone import HTML 13 | html = HTML("Dormouse") 14 | html.root_element.prepend(name="span", text="The", klass="mr-2") 15 | 16 | assert str(html) == "TheDormouse" 17 | ``` 18 | 19 | ### Prepend text 20 | 21 | ```python 22 | from minestrone import HTML 23 | html = HTML("Dormouse") 24 | html.root_element.prepend(text="The ") 25 | 26 | assert html == "The Dormouse" 27 | ``` 28 | 29 | ## append 30 | 31 | Adds text content or a new element **after** the calling element. 32 | 33 | ### Append an element 34 | 35 | ```python 36 | from minestrone import HTML 37 | html = HTML("Dormouse") 38 | html.root_element.append(name="span", text="Story", klass="ml-2") 39 | 40 | assert str(html) == "DormouseStory" 41 | ``` 42 | 43 | ### Append text 44 | 45 | ```python 46 | from minestrone import HTML 47 | html = HTML("Dormouse") 48 | html.root_element.append(text=" Story") 49 | 50 | assert html == "Dormouse Story" 51 | ``` 52 | -------------------------------------------------------------------------------- /tests/html/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone import Element 4 | 5 | 6 | def test_query_id(html_doc): 7 | elements = list(html_doc.query("a#elsie")) 8 | assert len(elements) == 1 9 | assert isinstance(elements[0], Element) 10 | 11 | expected = ( 12 | 'Elsie' 13 | ) 14 | actual = elements[0] 15 | 16 | assert str(actual) == expected 17 | 18 | 19 | def test_query_class(html_doc): 20 | assert 3 == len(list(html_doc.query("a.sister"))) 21 | 22 | 23 | def test_query_tag(html_doc): 24 | assert 3 == len(list(html_doc.query("a"))) 25 | 26 | 27 | def test_elements_with_one_parent(html_unicorn_fragment): 28 | actual = len(list(html_unicorn_fragment.elements)) 29 | expected = 15 30 | 31 | assert actual == expected 32 | 33 | 34 | def test_elements_with_multiple_parents(): 35 | from minestrone import HTML 36 | 37 | html = HTML( 38 | 
""" 39 |
40 | Dormouse 41 | """ 42 | ) 43 | 44 | assert [e.name for e in html.elements] == ["div", "span"] 45 | 46 | 47 | def test_query_len_raises(html_doc): 48 | with pytest.raises(TypeError) as e: 49 | len(html_doc.query("a")) 50 | 51 | assert e.exconly() == "TypeError: object of type 'generator' has no len()" 52 | 53 | 54 | def test_query_to_list(html_doc): 55 | assert 3 == len(html_doc.query_to_list("a")) 56 | 57 | 58 | def test_query_css_selector(html_doc): 59 | for a in html_doc.query("ul li a.sister"): 60 | assert ( 61 | str(a) 62 | == 'Elsie' 63 | ) 64 | break 65 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `minestrone` is an opinionated Python library that lets you search, modify, and parse messy HTML with ease. 4 | 5 | ## Behind the scenes 6 | 7 | `minestrone` utilizes [`Beautiful Soup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to do all the real work, but aims to provide a simple, consistent, and intuitive API to interact with an HTML document. `Beautiful Soup` provides a _lot_ of functionality, although it can be hard to grok the documentation. The hope is that `minestrone` makes that functionality easier. 8 | 9 | ## Related projects 10 | 11 | There are a few other libraries to interact with HTML in Python, but most are focused on the retrieval of HTML and searching through the document. However, they are listed below in case they might be useful. 12 | 13 | ### Beautiful Soup related 14 | 15 | - [`SoupSieve`](https://facelessuser.github.io/soupsieve/): provides selecting, matching, and filtering using modern CSS selectors. It provides the functionality used by the `select` function in `Beautiful Soup` which is also used by `minestrone`, however it can be used separately. 
16 | - [`soupy`](https://soupy.readthedocs.io/): wrapper around `Beautiful Soup` that makes it easier to search through HTML and XML documents. 17 | - [`fast-soup`](https://pypi.org/project/fast-soup/): faster `Beautiful Soup` search via `lxml`. 18 | - [`BeautifulSauce`](https://github.com/nateraw/BeautifulSauce): `Beautiful Soup`'s saucy sibling! 19 | - [`SoupCan`](https://pypi.org/project/soupcan/): simplifies the process of designing a Python tool for extracting and displaying webpage content. 20 | 21 | ### Beautiful Soup replacements 22 | 23 | - [`lxml.html`](https://lxml.de/lxmlhtml.html): based on `lxml`, but provides a special Element API for HTML elements, as well as a number of utilities for common HTML processing tasks. 24 | - [`html.parser`](https://docs.python.org/3/library/html.parser.html): simple HTML and XHTML parser in the standard library. 25 | - [`parsel`](https://parsel.readthedocs.io/): Parsel is a BSD-licensed Python library to extract data from HTML, JSON, and XML documents. 26 | - [`selectolax`](https://github.com/rushter/selectolax): a fast HTML5 parser with CSS selectors. 27 | - [`gazpacho`](https://pypi.org/project/gazpacho/): simple, fast, and modern web scraping library. The library is stable, actively maintained, and installed with zero dependencies. 28 | - [`Requests-HTML`](https://requests-html.kennethreitz.org/): HTML Parsing for Humans. It intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. 29 | - [`regex`](https://docs.python.org/3/library/re.html): just kidding, you probably shouldn't use `regex` to parse HTML unless you are a masochist. 
30 | 31 | ```{toctree} 32 | :maxdepth: 2 33 | :hidden: 34 | 35 | self 36 | installation 37 | changelog 38 | ``` 39 | 40 | ```{toctree} 41 | :caption: HTML 42 | :maxdepth: 2 43 | :hidden: 44 | 45 | parsing 46 | querying 47 | element 48 | editing 49 | ``` 50 | 51 | ```{toctree} 52 | :caption: Links 53 | :maxdepth: 2 54 | :hidden: 55 | 56 | GitHub 57 | Sponsor 58 | ``` 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### Python ### 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | cover/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | db.sqlite3 81 | db.sqlite3-journal 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | .pybuilder/ 95 | target/ 96 | 97 | # Jupyter Notebook 98 | 99 | # IPython 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks 157 | -------------------------------------------------------------------------------- /docs/source/querying.md: -------------------------------------------------------------------------------- 1 | # Querying 2 | 3 | `minestrone` allows searching through HTML via CSS selectors (similar to jQuery or other frontend libraries). 4 | 5 | ```{note} 6 | Querying uses the [`select`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors) method in `Beautiful Soup` which delegates to `SoupSieve`. More details about `SoupSieve` are available in [their documentation](https://facelessuser.github.io/soupsieve/). 7 | ``` 8 | 9 | ## root_element 10 | 11 | Gets the root [element](element.md) of the HTML. 12 | 13 | ```python 14 | from minestrone import HTML 15 | html = HTML(""" 16 |
17 | Dormouse 18 |
19 | """) 20 | 21 | assert html.root_element.name == "div" 22 | ``` 23 | 24 | ## elements 25 | 26 | Recursively get all [elements](element.md) in the HTML. 27 | 28 | ```python 29 | from minestrone import HTML 30 | html = HTML(""" 31 |
32 | Dormouse 33 |
34 | """) 35 | 36 | assert [e.name for e in html.elements] == ["div", "span"] 37 | ``` 38 | 39 | ## query 40 | 41 | Takes a CSS selector and returns an iterator of [`Element`](element.md) items. 42 | 43 | ### Query by element name 44 | 45 | ```python 46 | from minestrone import HTML 47 | html = HTML(""" 48 |

The Dormouse's Story

49 |

There was a table...

50 | """) 51 | 52 | for h1 in html.query("h1"): 53 | assert str(h1) == "

The Dormouse's Story

" 54 | ``` 55 | 56 | ### Query by id 57 | 58 | ```python 59 | from minestrone import HTML 60 | html = HTML(""" 61 | 65 | """) 66 | 67 | for a in html.query("a#elsie"): 68 | assert str(a) == 'Elsie' 69 | ``` 70 | 71 | ### Query by class 72 | 73 | ```python 74 | from minestrone import HTML 75 | html = HTML(""" 76 | 80 | """) 81 | 82 | elsie_link = next(html.query("ul li a.sister")) 83 | assert str(elsie_link) == 'Elsie' 84 | 85 | lacie_link = next(html.query("ul li a.sister")) 86 | assert str(lacie_link) == 'Lacie' 87 | ``` 88 | 89 | ## query_to_list 90 | 91 | Exactly the same as [query](querying.md#query) except it returns a list of [`Element`](element.md) items instead of a generator. This is sometimes more useful than the `query` above, but it can take more time to parse and more memory to store the data if the HTML document is large. 92 | 93 | ```python 94 | from minestrone import HTML 95 | html = HTML(""" 96 | 100 | """) 101 | 102 | assert len(html.query_to_list("a")) == 2 103 | assert str(html.query_to_list("a")[0]) == 'Elsie' 104 | assert html.query_to_list("a") == list(html.query("a")) 105 | ``` 106 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "minestrone" 3 | authors = [{name = "Adam Hill", email = "adam@adamghill.com"}] 4 | dynamic = ["version", "description"] 5 | license = { file = "LICENSE" } 6 | 7 | [tool.poetry] 8 | name = "minestrone" 9 | version = "0.8.0" 10 | description = "Search, modify, and parse messy HTML with ease." 
11 | authors = ["adamghill "] 12 | license = "MIT" 13 | readme = "README.md" 14 | repository = "https://github.com/adamghill/minestrone/" 15 | homepage = "https://github.com/adamghill/minestrone/" 16 | documentation = "http://minestrone.readthedocs.io/" 17 | keywords = ["python", "html", "beautifulsoup"] 18 | 19 | [tool.poetry.urls] 20 | "Funding" = "https://github.com/sponsors/adamghill" 21 | 22 | [tool.poetry.dependencies] 23 | python = ">=3.7,<4.0.0" 24 | beautifulsoup4 = "^4.10.0" 25 | 26 | # optional parsers 27 | lxml = { version = "4.9.1", optional = true } 28 | html5lib = { version = "1.1", optional = true } 29 | 30 | # docs 31 | Sphinx = { version = "^4.3.2", optional = true } 32 | linkify-it-py = { version = "^1.0.3", optional = true } 33 | myst-parser = { version = "^0.16.1", optional = true } 34 | furo = { version = "^2021.11.23", optional = true } 35 | sphinx-copybutton = { version = "^0.4.0", optional = true } 36 | attrs = { version = "^21.4.0", optional = true } 37 | toml = { version = "*", optional = true } 38 | 39 | [tool.poetry.group.dev.dependencies] 40 | pytest = "^6" 41 | black = "^22" 42 | isort = "^5" 43 | sphinx-autobuild = "^2021.3.14" 44 | types-beautifulsoup4 = "^4" 45 | mypy = "^0" 46 | coverage = {extras = ["toml"], version = "^6.2"} 47 | pytest-cov = "^3" 48 | ruff = "^0" 49 | pytest-benchmark = "^4.0.0" 50 | 51 | [tool.poetry.extras] 52 | docs = ["Sphinx", "linkify-it-py", "myst-parser", "furo", "sphinx-copybutton", "toml", "attrs"] 53 | lxml = ["lxml"] 54 | html5 = ["html5lib"] 55 | 56 | [tool.black] 57 | line-length = 88 58 | 59 | [tool.isort] 60 | profile = "black" 61 | multi_line_output = 3 62 | 63 | [tool.pytest.ini_options] 64 | addopts = "--quiet --failed-first -p no:warnings --benchmark-skip" 65 | testpaths = [ 66 | "tests" 67 | ] 68 | 69 | [tool.ruff] 70 | line-length = 88 71 | select = ["E", "F"] 72 | ignore = [] 73 | extend-select = ["D"] 74 | extend-ignore = [ 75 | "D101", 76 | "D102", 77 | "D103", 78 | "D105", 79 | "D106", 
80 | "D202", 81 | "D203", 82 | "D204", 83 | "D213", 84 | "D215", 85 | "D400", 86 | "D404", 87 | "D406", 88 | "D407", 89 | "D408", 90 | "D409", 91 | "D413", 92 | "D100", 93 | ] 94 | 95 | [tool.coverage.run] 96 | branch = true 97 | parallel = true 98 | 99 | [tool.coverage.report] 100 | show_missing = true 101 | skip_covered = true 102 | skip_empty = true 103 | 104 | [tool.poe.tasks] 105 | t = { cmd = "pytest", help = "Run tests" } 106 | tc = { cmd = "pytest --cov=minestrone", help = "Run tests with coverage" } 107 | r = { cmd = "coverage report", help = "Show coverage report" } 108 | my = { cmd = "mypy .", help = "Run mypy" } 109 | b = { cmd = "black . --check --quiet", help = "Run black" } 110 | i = { cmd = "isort . --check --quiet", help = "Run isort" } 111 | tm = ["b", "i", "t", "my"] 112 | sa = { cmd = "sphinx-autobuild -W docs/source docs/build", help = "Sphinx autobuild" } 113 | sb = { cmd = "sphinx-build -W docs/source docs/build", help = "Build documentation" } 114 | publish = { shell = "poetry publish --build -r test && poetry publish" } 115 | 116 | [build-system] 117 | requires = ["poetry-core>=1.0.0"] 118 | build-backend = "poetry.core.masonry.api" 119 | -------------------------------------------------------------------------------- /docs/source/element.md: -------------------------------------------------------------------------------- 1 | # Element 2 | 3 | `Element`s are returned from [querying](querying.md) methods. 4 | 5 | ## properties 6 | 7 | ### name 8 | 9 | Gets the name of the `Element`. 
10 | 11 | ```python 12 | html = HTML("Dormouse") 13 | span_element = html.root_element 14 | 15 | assert span_element.name == "span" 16 | ``` 17 | 18 | ### id 19 | 20 | #### Get the id 21 | 22 | ```python 23 | html = HTML('Dormouse') 24 | span_element = html.root_element 25 | 26 | assert span_element.id == "dormouse" 27 | ``` 28 | 29 | #### Set the id 30 | 31 | ```python 32 | html = HTML("Dormouse") 33 | span_element = html.root_element 34 | 35 | span_element.id = "dormouse" 36 | assert span_element.id == "dormouse" 37 | ``` 38 | 39 | ### attributes 40 | 41 | #### Get attributes 42 | 43 | ```python 44 | html = HTML('') 45 | button_element = html.root_element 46 | 47 | assert button_element.attributes == {"class": "mt-2 pb-2", "disabled": True} 48 | ``` 49 | 50 | #### Set attributes 51 | 52 | ```python 53 | html = HTML("") 54 | button_element = html.root_element 55 | button_element.attributes = {"class": "mt-2 pb-2", "disabled": True} 56 | 57 | assert str(button_element) == '' 58 | ``` 59 | 60 | ### classes 61 | 62 | Gets a list of classes for the element. 63 | 64 | ```python 65 | html = HTML('') 66 | button_element = html.root_element 67 | 68 | assert button_element.classes == ["mt-2", "pb-2"] 69 | ``` 70 | 71 | ### text 72 | 73 | #### Get text content 74 | 75 | ```python 76 | html = HTML("") 77 | button_element = html.root_element 78 | 79 | assert button_element.text == "Wake Up" 80 | ``` 81 | 82 | #### Set text content 83 | 84 | ```python 85 | html = HTML("") 86 | button_element = html.root_element 87 | 88 | button_element.text = "Go back to sleep" 89 | 90 | assert str(button_element) == "" 91 | ``` 92 | 93 | ### children 94 | 95 | Gets an iterator of the children for the element. 96 | 97 | ```python 98 | html = HTML(""" 99 |
    100 |
  • 1
  • 101 |
  • 2
  • 102 |
  • 3
  • 103 |
104 | """) 105 | ul_element = html.root_element 106 | 107 | assert len(list(ul_element.children)) == 3 108 | ``` 109 | 110 | ### parent 111 | 112 | Gets the parent for the element. 113 | 114 | ```python 115 | html = HTML(""" 116 |
    117 |
  • 1
  • 118 |
119 | """) 120 | li_element = next(html.query("#li-1")) 121 | 122 | assert li_element.parent.name == "ul" 123 | ``` 124 | 125 | ## methods 126 | 127 | ### insert 128 | 129 | Inserts an element into an element. 130 | 131 | ```python 132 | html = HTML("
    ") 133 | ul_element = next(html.query("ul")) 134 | 135 | li_element = Element.create("li", "item") 136 | ul_element.insert(li_element) 137 | 138 | assert str(ul_element) == "
    • item
    " 139 | ``` 140 | 141 | ```python 142 | html = HTML("
    • item
    ") 143 | ul_element = next(html.query("ul")) 144 | 145 | li_element = Element.create("li", "another item") 146 | ul_element.insert(li_element, -1) 147 | 148 | assert str(ul_element) == "
    • item
    • another item
    " 149 | ``` 150 | 151 | ### prettify 152 | 153 | Returns a prettified version of the element. 154 | 155 | ```python 156 | html = HTML('
    • 1
    ') 157 | ul_element = next(html.query("ul")) 158 | 159 | assert ul_element.prettify() == """ 160 |
      161 |
    • 1
    • 162 |
    163 | """ 164 | ``` 165 | 166 | ### remove_children 167 | 168 | Removes all children from an element. 169 | 170 | ```python 171 | html = HTML(''' 172 |
      173 |
    • 1
    • 174 |
    • 2
    • 175 |
    176 | ''') 177 | ul_element = next(html.query("ul")) 178 | ul_element.remove_children() 179 | 180 | assert str(ul_element) == "
      " 181 | ``` 182 | -------------------------------------------------------------------------------- /tests/html/test_html.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | # Test that `HTML` is imported from the asterisk 6 | from minestrone import * 7 | 8 | 9 | def test_html_str_unsorted_attributes(html_doc): 10 | with open("tests/samples/html_doc.html", "r") as f: 11 | original_html_doc = f.read() 12 | 13 | # All spaces are replaced which isn't great 14 | expected = original_html_doc 15 | expected = re.sub(r"^\s+", "", expected, flags=re.MULTILINE) 16 | actual = str(html_doc) 17 | 18 | assert actual == expected 19 | 20 | 21 | def test_html_str_closes_tags(): 22 | html = HTML( 23 | """ 24 | 25 | The Dormouse's story 26 | 27 | 28 |

      The Dormouse's story

      """ 29 | ) 30 | 31 | expected = """ 32 | 33 | The Dormouse's story 34 | 35 | 36 |

      The Dormouse's story

      """ 37 | 38 | actual = str(html) 39 | 40 | assert actual == expected 41 | 42 | 43 | def test_html_fragments(): 44 | html = HTML( 45 | """

      The Dormouse's story

      46 | """ 51 | ) 52 | 53 | expected = """

      The Dormouse's story

      54 | """ 59 | 60 | actual = str(html) 61 | 62 | assert actual == expected 63 | 64 | 65 | def test_html_html_strings(html_fragment): 66 | html = HTML(html_fragment) 67 | 68 | assert str(html_fragment) == str(html) 69 | 70 | 71 | def test_html_html_soups(html_fragment): 72 | html = HTML(html_fragment) 73 | 74 | assert html_fragment._soup == html._soup 75 | 76 | 77 | def test_html_wrong_type(): 78 | with pytest.raises(Exception): 79 | HTML(1) 80 | 81 | 82 | def test_html_repr(html_fragment): 83 | assert repr(html_fragment) == str(html_fragment) 84 | 85 | 86 | def test_html_parser_fragment(html_fragment_str): 87 | html_parsed_with_html = HTML(html_fragment_str, parser=Parser.HTML) 88 | assert html_parsed_with_html 89 | 90 | html_parsed_with_lxml = HTML(html_fragment_str, parser=Parser.LXML) 91 | assert html_parsed_with_lxml 92 | 93 | html_parsed_with_html5 = HTML(html_fragment_str, parser=Parser.HTML5) 94 | assert html_parsed_with_html5 95 | 96 | 97 | def test_html_parser_fragment_html(): 98 | assert str(HTML("dormouse", parser=Parser.HTML)) == "dormouse" 99 | 100 | 101 | def test_html_parser_fragment_lxml(): 102 | assert ( 103 | str(HTML("dormouse", parser=Parser.LXML)) 104 | == "dormouse" 105 | ) 106 | 107 | 108 | def test_html_parser_fragment_html5(): 109 | assert ( 110 | str(HTML("dormouse", parser=Parser.HTML5)) 111 | == "dormouse" 112 | ) 113 | 114 | 115 | def test_html_parser_doc(html_doc_str): 116 | html_parsed_with_html = HTML(html_doc_str, parser=Parser.HTML) 117 | assert html_parsed_with_html 118 | 119 | html_parsed_with_lxml = HTML(html_doc_str, parser=Parser.LXML) 120 | assert html_parsed_with_lxml 121 | 122 | html_parsed_with_html5 = HTML(html_doc_str, parser=Parser.HTML5) 123 | assert html_parsed_with_html5 124 | 125 | 126 | def test_html_encoding(): 127 | html = HTML(b"

      \xed\xe5\xec\xf9

      ") 128 | assert str(html) == "

      翴檛

      " 129 | assert html.encoding == "big5" 130 | 131 | html = HTML(b"

      \xed\xe5\xec\xf9

      ", encoding="iso-8859-8") 132 | assert str(html) == "

      םולש

      " 133 | assert html.encoding == "iso-8859-8" 134 | -------------------------------------------------------------------------------- /minestrone/__init__.py: -------------------------------------------------------------------------------- 1 | """minestrone - Search, modify, and parse messy HTML with ease.""" 2 | 3 | from enum import Enum 4 | from typing import Iterator, List, Optional, Union 5 | 6 | import bs4 7 | 8 | from .element import Element, Text 9 | from .formatter import UnsortedAttributes 10 | 11 | __all__ = [ 12 | "HTML", 13 | "Element", 14 | "Text", 15 | "Parser", 16 | ] 17 | 18 | 19 | class Parser(Enum): 20 | HTML = "html.parser" 21 | LXML = "lxml" 22 | HTML5 = "html5lib" 23 | 24 | 25 | class HTML: 26 | encoding: Optional[str] = "utf-8" 27 | 28 | def __init__( 29 | self, 30 | html: Union[str, "HTML"], 31 | parser: Parser = Parser.HTML, 32 | encoding: str = None, 33 | ): 34 | self.html = html 35 | self.parser = parser 36 | 37 | if isinstance(html, str) or isinstance(html, bytes): 38 | self._soup = bs4.BeautifulSoup( 39 | html, features=parser.value, from_encoding=encoding 40 | ) 41 | elif isinstance(html, HTML): 42 | self._soup = html._soup 43 | self.html = html.html 44 | else: 45 | raise Exception("Unknown type to init HTML") 46 | 47 | if encoding: 48 | self.encoding = encoding 49 | else: 50 | self.encoding = self._soup.original_encoding 51 | 52 | def query(self, selector: str) -> Iterator[Element]: 53 | """Returns an iterator of `Element`s that match the CSS selector.""" 54 | 55 | for _tag in self._soup.select(selector): 56 | yield Element.convert_from_tag(self._soup, _tag) 57 | 58 | def query_to_list(self, selector: str) -> List[Element]: 59 | """Returns a list of `Element`s that match the CSS selector.""" 60 | 61 | return list(self.query(selector)) 62 | 63 | def prettify( 64 | self, indent: int = 2, max_line_length: int = 88, use_bs4: bool = False 65 | ) -> str: 66 | """Prettify HTML. 
67 | 68 | Args: 69 | indent: How many spaces to indent for each level in the hierarchy. Defaults to 2. 70 | max_line_length: How long the line can reach before indenting another level. Defaults to 88. If `None` it will never used. 71 | use_bs4: Whether to use the `BeautifulSoup` `prettify` function or `minestrone`. Defaults to `False`. 72 | """ 73 | 74 | if use_bs4: 75 | return self._soup.prettify() 76 | 77 | strings = [] 78 | 79 | for top_level_child in self._soup.contents: 80 | if isinstance(top_level_child, bs4.Doctype) and top_level_child: 81 | strings.append("\n") 84 | elif isinstance(top_level_child, bs4.Tag): 85 | element = Element.convert_from_tag(self._soup, top_level_child) 86 | strings.append(element.prettify(indent, max_line_length)) 87 | elif isinstance(top_level_child, bs4.Comment): 88 | strings.append("") 91 | strings.append("\n") 92 | elif isinstance(top_level_child, str) and top_level_child != "\n": 93 | strings.append(top_level_child.strip()) 94 | strings.append("\n") 95 | 96 | return "".join(strings) 97 | 98 | @property 99 | def root_element(self) -> Optional[Element]: 100 | """Gets the root `Element` for the HTML.""" 101 | 102 | for _element in self._soup.contents: 103 | if isinstance(_element, bs4.element.Tag) and _element.name: 104 | return Element.convert_from_tag(self._soup, _element) 105 | 106 | return None 107 | 108 | @property 109 | def elements(self) -> Iterator[Element]: 110 | """Recursively yield all `Element`s in the HTML.""" 111 | 112 | for _element in self._soup.descendants: 113 | if isinstance(_element, bs4.element.Tag) and _element.name: 114 | yield Element.convert_from_tag(self._soup, _element) 115 | 116 | def __str__(self): 117 | # Cleans up `BeautifulSoup` modifications 118 | self._soup.smooth() 119 | 120 | # Prevent `BeautifulSoup` from re-ordering attributes in alphabetical order 121 | return self._soup.encode(formatter=UnsortedAttributes()).decode() 122 | 123 | def __repr__(self): 124 | return self.__str__() 125 | 
-------------------------------------------------------------------------------- /minestrone/element/prettifier.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | 3 | from . import VOID_ELEMENTS, Element 4 | 5 | 6 | def prettify_element( 7 | element: Element, 8 | indent: int, 9 | max_line_length: int, 10 | spaces: str = "", 11 | ) -> str: 12 | def __increase_spaces(_spaces): 13 | return " " * (len(_spaces) + indent) 14 | 15 | def __decrease_spaces(_spaces): 16 | return " " * (len(_spaces) - indent) 17 | 18 | def __append_newline_if_needed(_strings): 19 | if not _strings[-1].endswith("\n"): 20 | _strings.append("\n") 21 | 22 | def __append_string(_strings, _string): 23 | if _string: 24 | _strings.append(_string) 25 | 26 | strings = [] 27 | __append_string(strings, spaces) 28 | __append_string(strings, element.tag_string) 29 | 30 | content_children = [ 31 | c 32 | for c in element._self.contents 33 | if (isinstance(c, str) and c != "\n") 34 | or isinstance(c, bs4.element.Tag) 35 | or isinstance(c, bs4.element.Comment) 36 | ] 37 | has_children = False 38 | 39 | for child in element.children: 40 | if has_children is False: 41 | if content_children: 42 | extra_child_spaces = __increase_spaces(spaces) 43 | 44 | for content_child in content_children.copy(): 45 | content_children.pop(0) 46 | 47 | if isinstance(content_child, str): 48 | child_text = content_child.strip() 49 | 50 | if child_text: 51 | # Make sure that any newlines are indented to the correct number of spaces 52 | child_text = child_text.replace( 53 | "\n", f"\n{extra_child_spaces}" 54 | ) 55 | 56 | __append_string(strings, "\n") 57 | __append_string(strings, extra_child_spaces) 58 | __append_string(strings, child_text) 59 | else: 60 | break 61 | 62 | if element.name not in VOID_ELEMENTS: 63 | # Only increase the number of spaces if the current element can have children 64 | # and it's the first child 65 | spaces = __increase_spaces(spaces) 66 | 67 | 
__append_newline_if_needed(strings) 68 | 69 | has_children = True 70 | __append_string( 71 | strings, prettify_element(child, indent, max_line_length, spaces=spaces) 72 | ) 73 | 74 | if content_children: 75 | extra_child_spaces = __increase_spaces(spaces) 76 | 77 | for child in content_children.copy(): 78 | content_children.pop(0) 79 | 80 | if isinstance(child, str): 81 | child_text = child.strip() 82 | 83 | if child_text: 84 | # Make sure that any newlines are indented to the correct number of spaces 85 | child_text = child_text.replace("\n", f"\n{spaces}") 86 | __append_string(strings, spaces) 87 | 88 | if isinstance(child, bs4.Comment): 89 | __append_string(strings, "") 95 | else: 96 | break 97 | 98 | if has_children: 99 | spaces = __decrease_spaces(spaces) 100 | 101 | __append_newline_if_needed(strings) 102 | __append_string(strings, spaces) 103 | __append_string(strings, element.closing_tag_string) 104 | else: 105 | is_long_line = False 106 | 107 | if max_line_length is not None and len(element._self.text) > max_line_length: 108 | is_long_line = True 109 | 110 | if is_long_line: 111 | spaces = __increase_spaces(spaces) 112 | __append_string(strings, "\n") 113 | __append_string(strings, spaces) 114 | 115 | __append_string(strings, element._self.text) 116 | 117 | if is_long_line: 118 | spaces = __decrease_spaces(spaces) 119 | __append_string(strings, "\n") 120 | __append_string(strings, spaces) 121 | 122 | __append_string(strings, element.closing_tag_string) 123 | 124 | __append_newline_if_needed(strings) 125 | 126 | return "".join(strings) 127 | -------------------------------------------------------------------------------- /docs/source/parsing.md: -------------------------------------------------------------------------------- 1 | # Parsing 2 | 3 | The `HTML` class parses a string of HTML and provides methods to [query](querying.md) the DOM for specific elements. 4 | 5 | ## \_\_init\_\_ 6 | 7 | Creates an `HTML` object from a `str` or `bytes`. 
8 | 9 | ```python 10 | from minestrone import HTML 11 | html = HTML(""" 12 | 13 | 14 | The Dormouse's Story 15 | 16 | 17 |

      The Dormouse's Story

      18 | 19 | 23 | 24 | 25 | """) 26 | ``` 27 | 28 | If closing tags are missing, then they will be added as needed to make the HTML valid. 29 | 30 | ```python 31 | from minestrone import HTML 32 | assert str(HTML("dormouse")) == "dormouse" 33 | ``` 34 | 35 | ### parser 36 | Three parsers are available in `minestrone` and they all have different trade-offs. By default, the built-in, pure Python `html.parser` is used. `lxml` can be used for faster parsing speed. `html5lib` is another option to ensure a valid HTML5 document. 37 | 38 | ```{note} 39 | `lxml` and `html5lib` are not installed with `minestrone` by default and must be specifically installed. 40 | 41 | - `poetry add minestrone[lxml]` or `pip install minestrone[lxml]` 42 | - `poetry add minestrone[html5]` or `pip install minestrone[html5]` 43 | ``` 44 | 45 | ```{note} 46 | BeautifulSoup has a [summary table](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser) of the three parsers. There is also a more detailed [breakdown of the differences](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers) between the parsers. 47 | ``` 48 | 49 | ### Parser.HTML 50 | 51 | ```python 52 | from minestrone import HTML, Parser 53 | assert str(HTML("dormouse", parser=Parser.HTML)) == "dormouse" 54 | ``` 55 | 56 | ### Parser.LXML 57 | 58 | ```python 59 | from minestrone import HTML, Parser 60 | assert str(HTML("dormouse", parser=Parser.LXML)) == "dormouse" 61 | ``` 62 | 63 | ### Parser.HTML5 64 | 65 | ```python 66 | from minestrone import HTML, Parser 67 | assert str(HTML("dormouse", parser=Parser.HTML5)) == "dormouse" 68 | ``` 69 | 70 | ## encoding 71 | 72 | `Beautiful Soup` [attempts to decipher the encoding](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings) of the HTML string, however it isn't always correct. An encoding can be passed along if necessary. 73 | 74 | ```python 75 | from minestrone import HTML, Parser 76 | html_bytes = b"

      \xed\xe5\xec\xf9

      " 77 | 78 | assert str(HTML(html_bytes)) == "

      翴檛

      " 79 | assert HTML(html_bytes).encoding == "big5" 80 | 81 | assert str(HTML(html_bytes, encoding="iso-8859-8")) == "

      םולש

      " 82 | assert HTML(html_bytes, encoding="iso-8859-8").encoding == "iso-8859-8" 83 | ``` 84 | 85 | ## prettify 86 | 87 | Returns a prettified version of the HTML. 88 | 89 | ```python 90 | html = HTML(""" 91 | 92 | 93 | The Dormouse's Story 94 | 95 | 96 |

      The Dormouse's Story

      97 | 98 | 102 | 103 | 104 | """) 105 | 106 | assert html.prettify() == """ 107 | 108 | The Dormouse's Story 109 | 110 | 111 |

      The Dormouse's Story

      112 |
        113 |
      • 114 | Elsie 115 |
      • 116 |
      • 117 | Lacie 118 |
      • 119 |
      120 | 121 | 122 | """ 123 | ``` 124 | 125 | 126 | ## \_\_str\_\_ 127 | 128 | Returns the `HTML` object as a string. 129 | 130 | ```python 131 | from minestrone import HTML 132 | html = HTML(""" 133 | 134 | 135 | The Dormouse's Story 136 | 137 | 138 |

      The Dormouse's Story

      139 | 140 | 144 | 145 | 146 | """) 147 | 148 | assert str(html) == """ 149 | 150 | The Dormouse's Story 151 | 152 | 153 |

      The Dormouse's Story

      154 | 158 | 159 | """ 160 | ``` 161 | 162 | ```{note} 163 | Rendering the `HTML` into a string _will_ remove preceding spaces. 164 | ``` 165 | -------------------------------------------------------------------------------- /tests/element/test_create_tags.py: -------------------------------------------------------------------------------- 1 | from minestrone import HTML, Text 2 | 3 | 4 | def test_create_tag_with_true_attribute_value(html_fragment: HTML): 5 | elsie = next(html_fragment.query("a#elsie")) 6 | actual = elsie._create_tag("button", "Save", disabled=True) 7 | 8 | expected = '' 9 | 10 | assert str(actual) == expected 11 | 12 | 13 | def test_create_tag_with_klass(html_fragment: HTML): 14 | elsie = next(html_fragment.query("a#elsie")) 15 | actual = elsie._create_tag("button", "Save", klass="test-class") 16 | 17 | expected = '' 18 | 19 | assert str(actual) == expected 20 | 21 | 22 | def test_prepend(html_fragment: HTML): 23 | elsie = next(html_fragment.query("a#elsie")) 24 | elsie.prepend("span", "test prepend content", klass="test-class") 25 | 26 | expected = """""" 31 | 32 | actual = str(html_fragment) 33 | 34 | assert actual == expected 35 | 36 | 37 | def test_text_str(html_fragment: HTML): 38 | elsie = next(html_fragment.query("a#elsie")) 39 | text = elsie.prepend(text="test prepend text content") 40 | 41 | expected = "test prepend text content" 42 | actual = str(text) 43 | 44 | assert actual == expected 45 | 46 | 47 | def test_text_repr(html_fragment: HTML): 48 | elsie = next(html_fragment.query("a#elsie")) 49 | text = elsie.prepend(text="test prepend text content") 50 | 51 | expected = "test prepend text content" 52 | actual = repr(text) 53 | 54 | assert actual == expected 55 | 56 | 57 | def test_prepend_text(html_fragment: HTML): 58 | elsie = next(html_fragment.query("a#elsie")) 59 | elsie.prepend(text="test prepend text content") 60 | 61 | expected = """""" 66 | 67 | actual = str(html_fragment) 68 | 69 | assert actual == expected 70 | 71 | 72 | 
def test_append(html_fragment: HTML): 73 | elsie = next(html_fragment.query("a#elsie")) 74 | elsie.append("span", "test append content", klass="test-class") 75 | 76 | expected = """""" 81 | 82 | actual = str(html_fragment) 83 | 84 | assert actual == expected 85 | 86 | 87 | def test_append_text(html_fragment: HTML): 88 | elsie = next(html_fragment.query("a#elsie")) 89 | 90 | text = elsie.append(text="test append text content") 91 | assert text 92 | assert isinstance(text, Text) 93 | 94 | expected = """""" 99 | 100 | actual = str(html_fragment) 101 | 102 | assert actual == expected 103 | 104 | 105 | def test_append_text_to_text(html_fragment: HTML): 106 | elsie = next(html_fragment.query("a#elsie")) 107 | first_text = elsie.append(text="test append 1") 108 | assert first_text 109 | assert isinstance(first_text, Text) 110 | 111 | second_text = first_text.append(text=" test append 2") 112 | assert second_text 113 | assert isinstance(second_text, Text) 114 | 115 | expected = """""" 120 | 121 | actual = str(html_fragment) 122 | 123 | assert actual == expected 124 | 125 | 126 | def test_append_multiple(html_fragment: HTML): 127 | elsie = next(html_fragment.query("a#elsie")) 128 | new_tag = elsie.append("span", "test append content 1", klass="test-class") 129 | new_tag.append("span", "test append content 2", klass="test-class") 130 | 131 | expected = """
        132 |
      • Elsietest append content 1test append content 2
      • 133 |
      • Lacie
      • 134 |
      • Tillie
      • 135 |
      """ 136 | 137 | actual = str(html_fragment) 138 | 139 | assert actual == expected 140 | -------------------------------------------------------------------------------- /tests/test_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Run this with `poe t tests/test_benchmarks.py --benchmark-only` 2 | from typing import Iterator 3 | 4 | import re 5 | from minestrone import HTML, Element 6 | from html.parser import HTMLParser 7 | 8 | from lxml import html as lxml_html 9 | from bs4 import BeautifulSoup, Tag 10 | 11 | UNICORN_MODEL_REGEX = re.compile( 12 | r"(unicorn:model|u:model)(\.[^=]+)?=[\"'](?P[^\"']+)[\"']" 13 | ) 14 | 15 | HTML_FRAGMENT = """
      16 |
      17 |
      Step 2
      18 |
      19 | 20 | 21 | 22 | 23 |
      24 |
      25 | Address: 123 Main St
      26 | City: Anytown
      27 | State: CA
      28 | Zip code: 12345
      29 |
      30 | 31 | 32 |
      33 |
      """ 34 | 35 | EXPECTED = ["address", "city", "state", "zip_code"] 36 | 37 | 38 | def _parse_beautiful_soup(soup: BeautifulSoup): 39 | unicorn_model_names = [] 40 | 41 | for tag in soup.descendants: 42 | if isinstance(tag, Tag) and tag.name: 43 | for attr in tag.attrs.keys(): 44 | if attr.startswith("unicorn:model") or attr.startswith("u:model"): 45 | unicorn_model_names.append(tag.attrs[attr]) 46 | 47 | return unicorn_model_names 48 | 49 | 50 | def test_beautiful_soup_html_parser_with_existing_soup(benchmark): 51 | html_parser_soup = BeautifulSoup(HTML_FRAGMENT, features="html.parser") 52 | 53 | def _(): 54 | return _parse_beautiful_soup(html_parser_soup) 55 | 56 | actual = benchmark(_) 57 | assert EXPECTED == actual 58 | 59 | 60 | def test_beautiful_soup_html_parser(benchmark): 61 | def _(): 62 | soup = BeautifulSoup(HTML_FRAGMENT, features="html.parser") 63 | return _parse_beautiful_soup(soup) 64 | 65 | actual = benchmark(_) 66 | assert EXPECTED == actual 67 | 68 | 69 | def test_beautiful_soup_lxml(benchmark): 70 | def _(): 71 | soup = BeautifulSoup(HTML_FRAGMENT, features="lxml") 72 | return _parse_beautiful_soup(soup) 73 | 74 | actual = benchmark(_) 75 | assert EXPECTED == actual 76 | 77 | 78 | def test_beautiful_soup_lxml_with_existing_soup(benchmark): 79 | lxml_soup = BeautifulSoup(HTML_FRAGMENT, features="lxml") 80 | 81 | def _(): 82 | return _parse_beautiful_soup(lxml_soup) 83 | 84 | actual = benchmark(_) 85 | assert EXPECTED == actual 86 | 87 | 88 | def test_regex(benchmark): 89 | def _(): 90 | return [ 91 | m.group("unicorn_model_name") 92 | for m in UNICORN_MODEL_REGEX.finditer(HTML_FRAGMENT) 93 | ] 94 | 95 | actual = benchmark(_) 96 | 97 | assert EXPECTED == actual 98 | 99 | 100 | def _minestrone_get_unicorn_models(element: Element) -> Iterator[str]: 101 | for attribute in element.attributes.keys(): 102 | if attribute.startswith("unicorn:model") or attribute.startswith("u:model"): 103 | yield element.attributes[attribute] 104 | 105 | 106 | def 
test_minestrone(benchmark): 107 | def _(): 108 | minestrone_html = HTML(HTML_FRAGMENT) 109 | unicorn_model_names = [] 110 | 111 | for element in minestrone_html.elements: 112 | for attribute_value in _minestrone_get_unicorn_models(element): 113 | unicorn_model_names.append(attribute_value) 114 | 115 | return unicorn_model_names 116 | 117 | actual = benchmark(_) 118 | 119 | assert EXPECTED == actual 120 | 121 | 122 | def test_minestrone_with_existing_html(benchmark): 123 | minestrone_html = HTML(HTML_FRAGMENT) 124 | 125 | def _(): 126 | unicorn_model_names = [] 127 | 128 | for element in minestrone_html.elements: 129 | for attribute_value in _minestrone_get_unicorn_models(element): 130 | unicorn_model_names.append(attribute_value) 131 | 132 | return unicorn_model_names 133 | 134 | actual = benchmark(_) 135 | 136 | assert EXPECTED == actual 137 | 138 | 139 | # def test_parsel(benchmark): 140 | # from parsel import Selector 141 | # def _(): 142 | # selector = Selector(html) 143 | 144 | # r = selector.css("input::attr(unicorn:model)").getall() 145 | 146 | # print(r) 147 | 148 | # return [] 149 | 150 | # unicorn_model_names = benchmark(_) 151 | # assert len(unicorn_model_names) == 4 152 | 153 | 154 | def test_lxml_html(benchmark): 155 | def _(): 156 | unicorn_model_names = [] 157 | 158 | for element in lxml_html.fragment_fromstring(HTML_FRAGMENT).iter(): 159 | for attrs in element.items(): 160 | if attrs[0].startswith("unicorn:model") or attrs[0].startswith( 161 | "u:model" 162 | ): 163 | unicorn_model_names.append(attrs[1]) 164 | 165 | return unicorn_model_names 166 | 167 | actual = benchmark(_) 168 | 169 | assert EXPECTED == actual 170 | 171 | 172 | class UnicornModelParser(HTMLParser): 173 | def feed(self, data): 174 | self.unicorn_model_names = [] 175 | 176 | super().reset() 177 | super().feed(data) 178 | 179 | def handle_starttag(self, tag, attrs: list): 180 | for attr in attrs: 181 | if attr[0].startswith("unicorn:model") or attr[0].startswith("u:model"): 182 | 
self.unicorn_model_names.append(attr[1]) 183 | 184 | 185 | def test_html_parser(benchmark): 186 | def _(): 187 | parser = UnicornModelParser() 188 | parser.feed(HTML_FRAGMENT) 189 | 190 | return parser.unicorn_model_names 191 | 192 | actual = benchmark(_) 193 | 194 | assert EXPECTED == actual 195 | -------------------------------------------------------------------------------- /tests/element/test_element.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import pytest 3 | 4 | from minestrone import HTML, Element 5 | 6 | 7 | def test_get_text(html_doc): 8 | tillie = next(html_doc.query("a#tillie")) 9 | 10 | expected = "Tillie" 11 | actual = tillie.text 12 | 13 | assert actual == expected 14 | 15 | 16 | def test_get_text_with_tags(): 17 | html = HTML( 18 | """ 19 |

      stuff in here header

      20 | """ 21 | ) 22 | el = next(html.query("h1")) 23 | 24 | expected = "stuff in here header" 25 | actual = el.text 26 | 27 | assert actual == expected 28 | 29 | 30 | def test_set_text(html_doc): 31 | tillie = next(html_doc.query("a#tillie")) 32 | 33 | tillie.text = "Billie" 34 | 35 | expected = "Billie" 36 | actual = tillie.text 37 | 38 | assert actual == expected 39 | 40 | 41 | def test_get_name(html_doc): 42 | tillie = next(html_doc.query("a#tillie")) 43 | 44 | expected = "a" 45 | actual = tillie.name 46 | 47 | assert actual == expected 48 | 49 | 50 | def test_element_klass(): 51 | actual = Element.create("button", "Save", klass="test-class") 52 | 53 | expected = '' 54 | 55 | assert str(actual) == expected 56 | 57 | 58 | def test_element_get_attributes(): 59 | span = Element.create( 60 | "span", 61 | "test attrs content", 62 | klass="test-class1 test-class2", 63 | disabled=True, 64 | id="span1", 65 | ) 66 | 67 | assert span.name == "span" 68 | assert span.text == "test attrs content" 69 | assert span.attributes == { 70 | "class": "test-class1 test-class2", 71 | "disabled": "disabled", 72 | "id": "span1", 73 | } 74 | 75 | 76 | def test_element_set_attributes_id(): 77 | span = Element.create( 78 | "span", 79 | ) 80 | 81 | assert span.id is None 82 | 83 | span.attributes = {"id": "test-id"} 84 | 85 | assert span.id == "test-id" 86 | assert span.attributes == { 87 | "id": "test-id", 88 | } 89 | 90 | 91 | def test_element_set_id(): 92 | span = Element.create( 93 | "span", 94 | ) 95 | 96 | assert span.id is None 97 | 98 | span.id = "test-id" 99 | 100 | assert span.id == "test-id" 101 | assert span.attributes == { 102 | "id": "test-id", 103 | } 104 | 105 | 106 | def test_element_set_attributes_klass(): 107 | span = Element.create( 108 | "span", 109 | klass="test-class1", 110 | ) 111 | 112 | span.attributes = {"klass": "test-class2 test-class3"} 113 | 114 | assert span.name == "span" 115 | assert span.attributes == { 116 | "class": "test-class2 test-class3", 117 | 
} 118 | assert span.classes == ["test-class2", "test-class3"] 119 | 120 | 121 | def test_element_set_attributes_css(): 122 | span = Element.create( 123 | "span", 124 | klass="test-class1", 125 | ) 126 | 127 | span.attributes = {"css": "test-class2 test-class3"} 128 | 129 | assert span.name == "span" 130 | assert span.attributes == { 131 | "class": "test-class2 test-class3", 132 | } 133 | assert span.classes == ["test-class2", "test-class3"] 134 | 135 | 136 | def test_element_set_attributes_class_list(): 137 | span = Element.create( 138 | "span", 139 | klass="test-class1", 140 | ) 141 | 142 | span.attributes = {"css": ["test-class2", "test-class3"]} 143 | 144 | assert span.name == "span" 145 | assert span.attributes == { 146 | "class": "test-class2 test-class3", 147 | } 148 | assert span.classes == ["test-class2", "test-class3"] 149 | 150 | 151 | def test_element_set_attributes_class_tuple(): 152 | span = Element.create( 153 | "span", 154 | klass="test-class1", 155 | ) 156 | 157 | span.attributes = {"css": ("test-class2", "test-class3")} 158 | 159 | assert span.name == "span" 160 | assert span.attributes == { 161 | "class": "test-class2 test-class3", 162 | } 163 | assert span.classes == ["test-class2", "test-class3"] 164 | 165 | 166 | def test_element_set_attributes_invalid_type(): 167 | span = Element.create("span") 168 | 169 | with pytest.raises(Exception): 170 | span.attributes = {"css": 0} 171 | 172 | 173 | def test_element_classes_from_klass_kwarg(): 174 | span = Element.create( 175 | "span", 176 | klass="test-class1 test-class2", 177 | ) 178 | 179 | assert span.classes == ["test-class1", "test-class2"] 180 | 181 | 182 | def test_element_classes_from_css_kwarg(): 183 | span = Element.create( 184 | "span", 185 | css="test-class1 test-class2", 186 | ) 187 | 188 | assert span.classes == ["test-class1", "test-class2"] 189 | 190 | 191 | def test_element_classes_empty(): 192 | span = Element.create("span") 193 | 194 | assert span.classes == [] 195 | 196 | 197 | def 
test_element_children(html_doc): 198 | ul = next(html_doc.query("ul")) 199 | 200 | assert len(list(ul.children)) == 3 201 | 202 | # get generator so next() will work to get all children 203 | children = ul.children 204 | 205 | first_li = next(children) 206 | elsie = next(first_li.children) 207 | assert elsie.id == "elsie" 208 | assert len(list(elsie.children)) == 0 209 | 210 | second_li = next(children) 211 | lacie = next(second_li.children) 212 | assert lacie.id == "lacie" 213 | 214 | third_li = next(children) 215 | tillie = next(third_li.children) 216 | assert tillie.id == "tillie" 217 | 218 | 219 | def test_element_parent(html_doc): 220 | elsie = next(html_doc.query("#elsie")) 221 | 222 | assert elsie.parent 223 | assert elsie.parent.name == "li" 224 | assert elsie.parent.parent.name == "ul" 225 | 226 | 227 | def test_element_parent_none(html_doc: HTML): 228 | assert html_doc.root_element 229 | assert html_doc.root_element.parent 230 | assert html_doc.root_element.parent.parent is None 231 | 232 | 233 | def test_create_without_soup(): 234 | span = Element.create( 235 | "span", 236 | ) 237 | 238 | assert span._soup 239 | 240 | 241 | def test_create_with_soup(): 242 | soup = bs4.BeautifulSoup() 243 | 244 | span = Element.create( 245 | "span", 246 | soup=soup, 247 | ) 248 | 249 | assert span._soup 250 | assert id(span._soup) == id(soup) 251 | 252 | 253 | def test_repr(): 254 | span = Element.create( 255 | "span", 256 | ) 257 | 258 | assert repr(span) == "" 259 | 260 | 261 | def test_tag_string(html_doc): 262 | ul = next(html_doc.query("ul")) 263 | 264 | expected = "