├── py.typed ├── tests ├── __init__.py ├── samples │ ├── html_fragment.html │ ├── html_doc.html │ └── html_unicorn_fragment.html ├── element │ ├── test_content.py │ ├── test_create_tags.py │ └── test_element.py ├── html │ ├── test_root_element.py │ ├── test_query.py │ ├── test_html.py │ ├── test_prettify.py │ └── samples │ │ ├── hacker-news.html │ │ └── expected │ │ └── hacker-news.html └── test_benchmarks.py ├── README.md ├── .readthedocs.yml ├── docs ├── source │ ├── installation.md │ ├── changelog.md │ ├── conf.py │ ├── editing.md │ ├── index.md │ ├── querying.md │ ├── element.md │ └── parsing.md ├── Makefile └── make.bat ├── minestrone ├── formatter.py ├── __init__.py └── element │ ├── prettifier.py │ └── __init__.py ├── conftest.py ├── LICENSE ├── .gitignore └── pyproject.toml /py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minestrone 2 | 3 | Search, modify, and parse messy HTML with ease. 4 | 5 | Documentation at https://minestrone.readthedocs.io/. 6 | -------------------------------------------------------------------------------- /tests/samples/html_fragment.html: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | fail_on_warning: true 6 | builder: dirhtml 7 | 8 | formats: 9 | - pdf 10 | - epub 11 | 12 | python: 13 | version: 3 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: 18 | - docs 19 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | To use `minestrone`, first install it using `poetry`: 4 | 5 | ```shell 6 | poetry add minestrone 7 | ``` 8 | 9 | OR install it using `pip`: 10 | 11 | ```shell 12 | pip install minestrone 13 | ``` 14 | 15 | ```{note} 16 | `minestrone[lxml]` or `minestrone[html5]` can be installed to include support for external HTML parsers. More information in [parsing](parsing.md). 17 | ``` -------------------------------------------------------------------------------- /minestrone/formatter.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | 3 | 4 | class UnsortedAttributes(bs4.formatter.HTMLFormatter): 5 | """Prevent `beautifulsoup` from re-ordering HTML attributes.""" 6 | 7 | def __init__(self): 8 | super().__init__( 9 | entity_substitution=bs4.dammit.EntitySubstitution.substitute_html 10 | ) 11 | 12 | def attributes(self, tag: bs4.element.Tag): 13 | for k, v in tag.attrs.items(): 14 | yield k, v 15 | -------------------------------------------------------------------------------- /tests/samples/html_doc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |The Dormouse's story
15 | """ 16 | ) 17 | 18 | root_element = html.root_element 19 | assert isinstance(root_element, Element) 20 | assert root_element.name == "p" 21 | 22 | 23 | def test_root_element_with_comment(): 24 | html = HTML( 25 | """ 26 | 27 |The Dormouse's story
28 | """ 29 | ) 30 | 31 | root_element = html.root_element 32 | assert isinstance(root_element, Element) 33 | assert root_element.name == "p" 34 | 35 | 36 | def test_root_element_missing(): 37 | html = HTML( 38 | """ 39 | 40 | testing 41 | """ 42 | ) 43 | 44 | root_element = html.root_element 45 | assert root_element is None 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Adam Hill 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import toml 2 | 3 | project = "minestrone" 4 | copyright = "2021, Adam Hill" 5 | author = "Adam Hill" 6 | 7 | pyproject = toml.load("../../pyproject.toml") 8 | version = pyproject["tool"]["poetry"]["version"] 9 | release = version 10 | 11 | # -- General configuration 12 | 13 | extensions = [ 14 | "sphinx.ext.duration", 15 | "sphinx.ext.doctest", 16 | "sphinx.ext.autodoc", 17 | "sphinx.ext.autosummary", 18 | "sphinx.ext.intersphinx", 19 | "myst_parser", 20 | "sphinx_copybutton", 21 | "sphinx.ext.napoleon", 22 | "sphinx.ext.autosectionlabel", 23 | ] 24 | 25 | intersphinx_mapping = { 26 | "python": ("https://docs.python.org/3/", None), 27 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 28 | } 29 | intersphinx_disabled_domains = ["std"] 30 | 31 | templates_path = ["_templates"] 32 | 33 | # -- Options for HTML output 34 | 35 | html_theme = "furo" 36 | 37 | # -- Options for EPUB output 38 | epub_show_urls = "footnote" 39 | 40 | autosectionlabel_prefix_document = True 41 | autosectionlabel_maxdepth = 3 42 | 43 | myst_heading_anchors = 3 44 | myst_enable_extensions = ["linkify", "colon_fence"] 45 | -------------------------------------------------------------------------------- /docs/source/editing.md: -------------------------------------------------------------------------------- 1 | # Editing 2 | 3 | To edit HTML, first query for an `Element` and then call one of the following methods. 4 | 5 | ## prepend 6 | 7 | Adds new text or an element **before** the calling element. 
8 | 9 | ### Prepend an element 10 | 11 | ```python 12 | from minestrone import HTML 13 | html = HTML("Dormouse") 14 | html.root_element.prepend(name="span", text="The", klass="mr-2") 15 | 16 | assert str(html) == "TheDormouse" 17 | ``` 18 | 19 | ### Prepend text 20 | 21 | ```python 22 | from minestrone import HTML 23 | html = HTML("Dormouse") 24 | html.root_element.prepend(text="The ") 25 | 26 | assert html == "The Dormouse" 27 | ``` 28 | 29 | ## append 30 | 31 | Adds text content or a new element **after** the calling element. 32 | 33 | ### Append an element 34 | 35 | ```python 36 | from minestrone import HTML 37 | html = HTML("Dormouse") 38 | html.root_element.append(name="span", text="Story", klass="ml-2") 39 | 40 | assert str(html) == "DormouseStory" 41 | ``` 42 | 43 | ### Append text 44 | 45 | ```python 46 | from minestrone import HTML 47 | html = HTML("Dormouse") 48 | html.root_element.append(text=" Story") 49 | 50 | assert html == "Dormouse Story" 51 | ``` 52 | -------------------------------------------------------------------------------- /tests/html/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone import Element 4 | 5 | 6 | def test_query_id(html_doc): 7 | elements = list(html_doc.query("a#elsie")) 8 | assert len(elements) == 1 9 | assert isinstance(elements[0], Element) 10 | 11 | expected = ( 12 | 'Elsie' 13 | ) 14 | actual = elements[0] 15 | 16 | assert str(actual) == expected 17 | 18 | 19 | def test_query_class(html_doc): 20 | assert 3 == len(list(html_doc.query("a.sister"))) 21 | 22 | 23 | def test_query_tag(html_doc): 24 | assert 3 == len(list(html_doc.query("a"))) 25 | 26 | 27 | def test_elements_with_one_parent(html_unicorn_fragment): 28 | actual = len(list(html_unicorn_fragment.elements)) 29 | expected = 15 30 | 31 | assert actual == expected 32 | 33 | 34 | def test_elements_with_multiple_parents(): 35 | from minestrone import HTML 36 | 37 | html = HTML( 38 | 
""" 39 | 40 | Dormouse 41 | """ 42 | ) 43 | 44 | assert [e.name for e in html.elements] == ["div", "span"] 45 | 46 | 47 | def test_query_len_raises(html_doc): 48 | with pytest.raises(TypeError) as e: 49 | len(html_doc.query("a")) 50 | 51 | assert e.exconly() == "TypeError: object of type 'generator' has no len()" 52 | 53 | 54 | def test_query_to_list(html_doc): 55 | assert 3 == len(html_doc.query_to_list("a")) 56 | 57 | 58 | def test_query_css_selector(html_doc): 59 | for a in html_doc.query("ul li a.sister"): 60 | assert ( 61 | str(a) 62 | == 'Elsie' 63 | ) 64 | break 65 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `minestrone` is an opinionated Python library that lets you search, modify, and parse messy HTML with ease. 4 | 5 | ## Behind the scenes 6 | 7 | `minestrone` utilizes [`Beautiful Soup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to do all the real work, but aims to provide a simple, consistent, and intuitive API to interact with an HTML document. `Beautiful Soup` provides a _lot_ of functionality, although it can be hard to grok the documentation. The hope is that `minestrone` makes that functionality easier. 8 | 9 | ## Related projects 10 | 11 | There are a few other libraries to interact with HTML in Python, but most are focused on the retrieval of HTML and searching through the document. However, they are listed below in case they might be useful. 12 | 13 | ### Beautiful Soup related 14 | 15 | - [`SoupSieve`](https://facelessuser.github.io/soupsieve/): provides selecting, matching, and filtering using modern CSS selectors. It provides the functionality used by the `select` function in `Beautiful Soup` which is also used by `minestrone`, however it can be used separately. 
16 | - [`soupy`](https://soupy.readthedocs.io/): wrapper around `Beautiful Soup` that makes it easier to search through HTML and XML documents. 17 | - [`fast-soup`](https://pypi.org/project/fast-soup/): faster `Beautiful Soup` search via `lxml`. 18 | - [`BeautifulSauce`](https://github.com/nateraw/BeautifulSauce): `Beautiful Soup`'s saucy sibling! 19 | - [`SoupCan`](https://pypi.org/project/soupcan/): simplifies the process of designing a Python tool for extracting and displaying webpage content. 20 | 21 | ### Beautiful Soup replacements 22 | 23 | - [`lxml.html`](https://lxml.de/lxmlhtml.html): based on `lxml`, but provides a special Element API for HTML elements, as well as a number of utilities for common HTML processing tasks. 24 | - [`html.parser`](https://docs.python.org/3/library/html.parser.html): simple HTML and XHTML parser in standard library. 25 | - [`parsel`](https://parsel.readthedocs.io/): Parsel is a BSD-licensed Python library to extract data from HTML, JSON, and XML documents. 26 | - [`selectolax`](https://github.com/rushter/selectolax): a fast HTML5 parser with CSS selectors. 27 | - [`gazpacho`](https://pypi.org/project/gazpacho/): simple, fast, and modern web scraping library. The library is stable, actively maintained, and installed with zero dependencies. 28 | - [`Requests-HTML`](https://requests-html.kennethreitz.org/): HTML Parsing for Humans. It intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. 29 | - [`regex`](https://docs.python.org/3/library/re.html): just kidding, you probably shouldn't use `regex` to parse HTML unless you are a masochist. 30 | 31 | ```{toctree} 32 | :maxdepth: 2 33 | :hidden: 34 | 35 | self 36 | installation 37 | changelog 38 | ``` 39 | 40 | ```{toctree} 41 | :caption: HTML 42 | :maxdepth: 2 43 | :hidden: 44 | 45 | parsing 46 | querying 47 | element 48 | editing 49 | ``` 50 | 51 | ```{toctree} 52 | :caption: Links 53 | :maxdepth: 2 54 | :hidden: 55 | 56 | GitHubThere was a table...
50 | """) 51 | 52 | for h1 in html.query("h1"): 53 | assert str(h1) == "stuff in here headerstuff in here header"
25 | actual = el.text
26 |
27 | assert actual == expected
28 |
29 |
def test_set_text(html_doc):
    """A value assigned to `text` is what a later read of `text` returns."""
    anchor = next(html_doc.query("a#tillie"))
    anchor.text = "Billie"

    assert anchor.text == "Billie"
39 |
40 |
def test_get_name(html_doc):
    """The element matched by `a#tillie` reports `a` as its name."""
    anchor = next(html_doc.query("a#tillie"))

    assert anchor.name == "a"
48 |
49 |
def test_element_klass():
    """A `klass` keyword given to `Element.create` is rendered as `class`."""
    actual = Element.create("button", "Save", klass="test-class")

    # The expected markup had been reduced to an empty string, which the
    # serialized element can never equal; spell out the full tag instead.
    expected = '<button class="test-class">Save</button>'

    assert str(actual) == expected
56 |
57 |
def test_element_get_attributes():
    """Name, text and attributes passed to `Element.create` can be read back."""
    expected_attributes = {
        "class": "test-class1 test-class2",
        "disabled": "disabled",
        "id": "span1",
    }

    element = Element.create(
        "span",
        "test attrs content",
        klass="test-class1 test-class2",
        disabled=True,
        id="span1",
    )

    assert element.name == "span"
    assert element.text == "test attrs content"
    assert element.attributes == expected_attributes
74 |
75 |
def test_element_set_attributes_id():
    """Assigning an `id` through `attributes` is visible on `id` as well."""
    element = Element.create("span")
    assert element.id is None

    element.attributes = {"id": "test-id"}

    assert element.id == "test-id"
    assert element.attributes == {"id": "test-id"}
89 |
90 |
def test_element_set_id():
    """Assigning to `id` is visible through `attributes` as well."""
    element = Element.create("span")
    assert element.id is None

    element.id = "test-id"

    assert element.id == "test-id"
    assert element.attributes == {"id": "test-id"}
104 |
105 |
def test_element_set_attributes_klass():
    """A `klass` key assigned through `attributes` replaces the classes."""
    element = Element.create("span", klass="test-class1")

    element.attributes = {"klass": "test-class2 test-class3"}

    assert element.name == "span"
    assert element.attributes == {"class": "test-class2 test-class3"}
    assert element.classes == ["test-class2", "test-class3"]
119 |
120 |
def test_element_set_attributes_css():
    """A `css` key assigned through `attributes` replaces the classes."""
    element = Element.create("span", klass="test-class1")

    element.attributes = {"css": "test-class2 test-class3"}

    assert element.name == "span"
    assert element.attributes == {"class": "test-class2 test-class3"}
    assert element.classes == ["test-class2", "test-class3"]
134 |
135 |
def test_element_set_attributes_class_list():
    """A list of class names under `css` ends up as one `class` string."""
    element = Element.create("span", klass="test-class1")

    element.attributes = {"css": ["test-class2", "test-class3"]}

    assert element.name == "span"
    assert element.attributes == {"class": "test-class2 test-class3"}
    assert element.classes == ["test-class2", "test-class3"]
149 |
150 |
def test_element_set_attributes_class_tuple():
    """A tuple of class names under `css` ends up as one `class` string."""
    element = Element.create("span", klass="test-class1")

    element.attributes = {"css": ("test-class2", "test-class3")}

    assert element.name == "span"
    assert element.attributes == {"class": "test-class2 test-class3"}
    assert element.classes == ["test-class2", "test-class3"]
164 |
165 |
def test_element_set_attributes_invalid_type():
    """An integer `css` value assigned through `attributes` raises."""
    element = Element.create("span")
    bad_attributes = {"css": 0}

    with pytest.raises(Exception):
        element.attributes = bad_attributes
171 |
172 |
def test_element_classes_from_klass_kwarg():
    """A space-separated `klass` keyword is exposed as a list by `classes`."""
    element = Element.create("span", klass="test-class1 test-class2")

    assert element.classes == ["test-class1", "test-class2"]
180 |
181 |
def test_element_classes_from_css_kwarg():
    """A space-separated `css` keyword is exposed as a list by `classes`."""
    element = Element.create("span", css="test-class1 test-class2")

    assert element.classes == ["test-class1", "test-class2"]
189 |
190 |
def test_element_classes_empty():
    """An element created without any class reports an empty `classes` list."""
    element = Element.create("span")
    no_classes = []

    assert element.classes == no_classes
195 |
196 |
def test_element_children(html_doc):
    """`children` of the `ul` yields three items whose first children carry
    the ids `elsie`, `lacie` and `tillie`, in that order."""
    ul = next(html_doc.query("ul"))
    assert len(list(ul.children)) == 3

    # Hold on to a single generator so successive next() calls walk the children.
    items = ul.children

    elsie = next(next(items).children)
    assert elsie.id == "elsie"
    assert len(list(elsie.children)) == 0

    lacie = next(next(items).children)
    assert lacie.id == "lacie"

    tillie = next(next(items).children)
    assert tillie.id == "tillie"
217 |
218 |
def test_element_parent(html_doc):
    """`parent` climbs from the `#elsie` element to an `li` and then a `ul`."""
    anchor = next(html_doc.query("#elsie"))

    assert anchor.parent
    assert anchor.parent.name == "li"

    grandparent = anchor.parent.parent
    assert grandparent.name == "ul"
225 |
226 |
def test_element_parent_none(html_doc: HTML):
    """The root element has a parent, and that parent's `parent` is `None`."""
    assert html_doc.root_element
    assert html_doc.root_element.parent

    grandparent = html_doc.root_element.parent.parent
    assert grandparent is None
231 |
232 |
def test_create_without_soup():
    """An element created without a `soup` argument still has a truthy `_soup`."""
    element = Element.create("span")

    assert element._soup
239 |
240 |
def test_create_with_soup():
    """A soup passed to `Element.create` is the very object kept as `_soup`."""
    soup = bs4.BeautifulSoup()
    element = Element.create("span", soup=soup)

    assert element._soup
    # Identity, not equality: the element must hold the soup it was given.
    assert element._soup is soup
251 |
252 |
def test_repr():
    """`repr()` of an element is its serialized markup."""
    span = Element.create(
        "span",
    )

    # The expected markup had been reduced to an empty string; an empty
    # `span` serializes to an opening tag followed by its closing tag.
    assert repr(span) == "<span></span>"
259 |
260 |
261 | def test_tag_string(html_doc):
262 | ul = next(html_doc.query("ul"))
263 |
264 | expected = "Hello WorldHello World"
277 | """
278 |
279 | return "".join([str(c) for c in self._self.contents])
280 |
281 | @text.setter
282 | def text(self, string: str) -> None:
283 | """Set the `text content` of the element."""
284 |
285 | self._self.clear()
286 | self._self.append(string.__class__(string))
287 |
288 | @property
289 | def tag_string(self) -> str:
290 | _attributes = self.attributes.items()
291 |
292 | if not _attributes:
293 | return f"<{self.name}>"
294 |
295 | _tag_string = f"<{self.name}"
296 |
297 | for key, value in _attributes:
298 | if isinstance(value, list):
299 | value = " ".join(value)
300 |
301 | _tag_string += f' {key}="{value}"'
302 |
303 | _tag_string = f"{_tag_string}>"
304 |
305 | return _tag_string
306 |
307 | @property
308 | def closing_tag_string(self) -> str:
309 | if self._self.is_empty_element or self.name in VOID_ELEMENTS:
310 | return HTML5_CLOSING_TAG
311 |
312 | return f"{self.name}>"
313 |
314 | def insert(self, element: "Element", index: int = 0) -> None:
315 | """Insert a child element into this element."""
316 |
317 | self._self.insert(index, element._self)
318 |
319 | def remove_children(self) -> None:
320 | """Remove all children from the element."""
321 |
322 | self._self.clear()
323 |
324 | def prettify(self, indent: int = 2, max_line_length: int = 88):
325 | # Import here to avoid circular imports
326 | from .prettifier import prettify_element
327 |
328 | return prettify_element(self, indent, max_line_length)
329 |
330 | def __str__(self):
331 | return self._self.encode(formatter=UnsortedAttributes()).decode()
332 |
333 | def __repr__(self):
334 | return self.__str__()
335 |
--------------------------------------------------------------------------------
/tests/html/test_prettify.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import pytest
4 |
5 | from minestrone import HTML
6 |
7 | PRINT_TIMINGS = True
8 |
9 |
def eq(actual, expected):
    """Print `expected`, a `===` divider and `actual`, then assert equality."""
    # One value per line, in the same order as three separate print() calls.
    print(expected, "===", actual, sep="\n")

    assert actual == expected
16 |
17 |
18 | def test_html_prettify_no_root():
19 | expected = """
351 |
|
381 |
|
400 |
15 |
|
50 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
54 |
|
1227 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1230 |
1237 | 1260 | 1261 | 1265 | |
1267 | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||