├── py.typed ├── tests ├── __init__.py ├── samples │ ├── html_fragment.html │ ├── html_doc.html │ └── html_unicorn_fragment.html ├── element │ ├── test_content.py │ ├── test_create_tags.py │ └── test_element.py ├── html │ ├── test_root_element.py │ ├── test_query.py │ ├── test_html.py │ ├── test_prettify.py │ └── samples │ │ ├── hacker-news.html │ │ └── expected │ │ └── hacker-news.html └── test_benchmarks.py ├── README.md ├── .readthedocs.yml ├── docs ├── source │ ├── installation.md │ ├── changelog.md │ ├── conf.py │ ├── editing.md │ ├── index.md │ ├── querying.md │ ├── element.md │ └── parsing.md ├── Makefile └── make.bat ├── minestrone ├── formatter.py ├── __init__.py └── element │ ├── prettifier.py │ └── __init__.py ├── conftest.py ├── LICENSE ├── .gitignore └── pyproject.toml /py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # minestrone 2 | 3 | Search, modify, and parse messy HTML with ease. 4 | 5 | Documentation at https://minestrone.readthedocs.io/. 6 | -------------------------------------------------------------------------------- /tests/samples/html_fragment.html: -------------------------------------------------------------------------------- 1 |

-------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/source/conf.py 5 | fail_on_warning: true 6 | builder: dirhtml 7 | 8 | formats: 9 | - pdf 10 | - epub 11 | 12 | python: 13 | version: 3 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: 18 | - docs 19 | -------------------------------------------------------------------------------- /docs/source/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | To use `minestrone`, first install it using `poetry`: 4 | 5 | ```shell 6 | poetry add minestrone 7 | ``` 8 | 9 | OR install it using `pip`: 10 | 11 | ```shell 12 | pip install minestrone 13 | ``` 14 | 15 | ```{note} 16 | `minestrone[lxml]` or `minestrone[html5]` can be installed to include support for external HTML parsers. More information in [parsing](parsing.md). 17 | ``` -------------------------------------------------------------------------------- /minestrone/formatter.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | 3 | 4 | class UnsortedAttributes(bs4.formatter.HTMLFormatter): 5 | """Prevent `beautifulsoup` from re-ordering HTML attributes.""" 6 | 7 | def __init__(self): 8 | super().__init__( 9 | entity_substitution=bs4.dammit.EntitySubstitution.substitute_html 10 | ) 11 | 12 | def attributes(self, tag: bs4.element.Tag): 13 | for k, v in tag.attrs.items(): 14 | yield k, v 15 | -------------------------------------------------------------------------------- /tests/samples/html_doc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | The Dormouse's story 4 | 5 | 6 |

The Dormouse's story

7 | 8 |

Elsie
Lacie
Tillie

13 | 14 | -------------------------------------------------------------------------------- /docs/source/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.8.0 4 | 5 | - Add `Element.insert` and `Element.remove_children`. 6 | 7 | ## 0.7.0 8 | 9 | - Add `HTML.elements`. 10 | 11 | ## 0.6.2 12 | 13 | - Optimize `prettify` method to be as fast as possible. 14 | - Support HTML doctype, comments, void elements, and other improvements for `prettify`. 15 | 16 | ## 0.6.1 17 | 18 | - Fix a few bugs for `HTML.prettify()` and `Element.prettify()`. 19 | 20 | ## 0.6.0 21 | 22 | - Add `Element.prettify()`. 23 | 24 | ## 0.5.1 25 | 26 | - Handle HTML tags when getting `Element.text`. 27 | 28 | ## 0.5.0 29 | 30 | - Add setter for `Element.id`. 31 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/element/test_content.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone.element import Content 4 | 5 | 6 | @pytest.fixture 7 | def content(): 8 | return Content() 9 | 10 | 11 | def test_convert_attributes_klass(content): 12 | attributes = {"klass": "test1"} 13 | actual = content._convert_attributes(attributes) 14 | 15 | assert actual == {"class": "test1"} 16 | 17 | 18 | def test_convert_attributes_css(content): 19 | attributes = {"css": "test1"} 20 | actual = content._convert_attributes(attributes) 21 | 22 | assert actual == {"class": "test1"} 23 | 24 | 25 | def test_convert_attributes_true_value(content): 26 | attributes = {"disabled": True} 27 | actual = content._convert_attributes(attributes) 28 | 29 | assert actual == {"disabled": "disabled"} 30 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone import HTML 4 | 5 | 6 | @pytest.fixture 7 | def html_doc(html_doc_str) -> HTML: 8 | return HTML(html_doc_str) 9 | 10 | 11 | @pytest.fixture 12 | def html_doc_str() -> str: 13 | with open("tests/samples/html_doc.html", "r") as f: 14 | return f.read() 15 | 16 | 17 | @pytest.fixture 18 | def html_fragment(html_fragment_str) -> HTML: 19 | return HTML(html_fragment_str) 20 | 21 | 22 | @pytest.fixture 23 | def html_fragment_str() -> str: 24 | with open("tests/samples/html_fragment.html", "r") as f: 25 | return f.read() 26 | 27 | 28 | @pytest.fixture 29 | def html_unicorn_fragment(html_unicorn_fragment_str) -> HTML: 30 | return HTML(html_unicorn_fragment_str) 31 | 32 | 33 | @pytest.fixture 34 | def html_unicorn_fragment_str() -> str: 35 | with 
open("tests/samples/html_unicorn_fragment.html", "r") as f: 36 | return f.read() 37 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /tests/samples/html_unicorn_fragment.html: -------------------------------------------------------------------------------- 1 |

3 |

6 |

Step 2

7 |

8 | 9 | 10 | 11 | 12 |

13 |

19 | 20 | 21 |

22 |

-------------------------------------------------------------------------------- /tests/html/test_root_element.py: -------------------------------------------------------------------------------- 1 | from minestrone import HTML, Element 2 | 3 | 4 | def test_root_element(html_doc): 5 | root_element = html_doc.root_element 6 | assert isinstance(root_element, Element) 7 | assert root_element.name == "html" 8 | 9 | 10 | def test_root_element_with_extra_linebreaks(): 11 | html = HTML( 12 | """ 13 | 14 |

The Dormouse's story

15 | """ 16 | ) 17 | 18 | root_element = html.root_element 19 | assert isinstance(root_element, Element) 20 | assert root_element.name == "p" 21 | 22 | 23 | def test_root_element_with_comment(): 24 | html = HTML( 25 | """ 26 | 27 |

The Dormouse's story

28 | """ 29 | ) 30 | 31 | root_element = html.root_element 32 | assert isinstance(root_element, Element) 33 | assert root_element.name == "p" 34 | 35 | 36 | def test_root_element_missing(): 37 | html = HTML( 38 | """ 39 | 40 | testing 41 | """ 42 | ) 43 | 44 | root_element = html.root_element 45 | assert root_element is None 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Adam Hill 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import toml 2 | 3 | project = "minestrone" 4 | copyright = "2021, Adam Hill" 5 | author = "Adam Hill" 6 | 7 | pyproject = toml.load("../../pyproject.toml") 8 | version = pyproject["tool"]["poetry"]["version"] 9 | release = version 10 | 11 | # -- General configuration 12 | 13 | extensions = [ 14 | "sphinx.ext.duration", 15 | "sphinx.ext.doctest", 16 | "sphinx.ext.autodoc", 17 | "sphinx.ext.autosummary", 18 | "sphinx.ext.intersphinx", 19 | "myst_parser", 20 | "sphinx_copybutton", 21 | "sphinx.ext.napoleon", 22 | "sphinx.ext.autosectionlabel", 23 | ] 24 | 25 | intersphinx_mapping = { 26 | "python": ("https://docs.python.org/3/", None), 27 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 28 | } 29 | intersphinx_disabled_domains = ["std"] 30 | 31 | templates_path = ["_templates"] 32 | 33 | # -- Options for HTML output 34 | 35 | html_theme = "furo" 36 | 37 | # -- Options for EPUB output 38 | epub_show_urls = "footnote" 39 | 40 | autosectionlabel_prefix_document = True 41 | autosectionlabel_maxdepth = 3 42 | 43 | myst_heading_anchors = 3 44 | myst_enable_extensions = ["linkify", "colon_fence"] 45 | -------------------------------------------------------------------------------- /docs/source/editing.md: -------------------------------------------------------------------------------- 1 | # Editing 2 | 3 | To edit HTML, first query for an `Element` and then call one of the following methods. 4 | 5 | ## prepend 6 | 7 | Adds new text or an element **before** the calling element. 
8 | 9 | ### Prepend an element 10 | 11 | ```python 12 | from minestrone import HTML 13 | html = HTML("Dormouse") 14 | html.root_element.prepend(name="span", text="The", klass="mr-2") 15 | 16 | assert str(html) == "TheDormouse" 17 | ``` 18 | 19 | ### Prepend text 20 | 21 | ```python 22 | from minestrone import HTML 23 | html = HTML("Dormouse") 24 | html.root_element.prepend(text="The ") 25 | 26 | assert html == "The Dormouse" 27 | ``` 28 | 29 | ## append 30 | 31 | Adds text content or a new element **after** the calling element. 32 | 33 | ### Append an element 34 | 35 | ```python 36 | from minestrone import HTML 37 | html = HTML("Dormouse") 38 | html.root_element.append(name="span", text="Story", klass="ml-2") 39 | 40 | assert str(html) == "DormouseStory" 41 | ``` 42 | 43 | ### Append text 44 | 45 | ```python 46 | from minestrone import HTML 47 | html = HTML("Dormouse") 48 | html.root_element.append(text=" Story") 49 | 50 | assert html == "Dormouse Story" 51 | ``` 52 | -------------------------------------------------------------------------------- /tests/html/test_query.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from minestrone import Element 4 | 5 | 6 | def test_query_id(html_doc): 7 | elements = list(html_doc.query("a#elsie")) 8 | assert len(elements) == 1 9 | assert isinstance(elements[0], Element) 10 | 11 | expected = ( 12 | 'Elsie' 13 | ) 14 | actual = elements[0] 15 | 16 | assert str(actual) == expected 17 | 18 | 19 | def test_query_class(html_doc): 20 | assert 3 == len(list(html_doc.query("a.sister"))) 21 | 22 | 23 | def test_query_tag(html_doc): 24 | assert 3 == len(list(html_doc.query("a"))) 25 | 26 | 27 | def test_elements_with_one_parent(html_unicorn_fragment): 28 | actual = len(list(html_unicorn_fragment.elements)) 29 | expected = 15 30 | 31 | assert actual == expected 32 | 33 | 34 | def test_elements_with_multiple_parents(): 35 | from minestrone import HTML 36 | 37 | html = HTML( 38 | 
""" 39 |

40 | Dormouse 41 | """ 42 | ) 43 | 44 | assert [e.name for e in html.elements] == ["div", "span"] 45 | 46 | 47 | def test_query_len_raises(html_doc): 48 | with pytest.raises(TypeError) as e: 49 | len(html_doc.query("a")) 50 | 51 | assert e.exconly() == "TypeError: object of type 'generator' has no len()" 52 | 53 | 54 | def test_query_to_list(html_doc): 55 | assert 3 == len(html_doc.query_to_list("a")) 56 | 57 | 58 | def test_query_css_selector(html_doc): 59 | for a in html_doc.query("ul li a.sister"): 60 | assert ( 61 | str(a) 62 | == 'Elsie' 63 | ) 64 | break 65 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `minestrone` is an opinionated Python library that lets you search, modify, and parse messy HTML with ease. 4 | 5 | ## Behind the scenes 6 | 7 | `minestrone` utilizes [`Beautiful Soup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to do all the real work, but aims to provide a simple, consistent, and intuitive API to interact with an HTML document. `Beautiful Soup` provides a _lot_ of functionality, although it can be hard to grok the documentation. The hope is that `minestrone` makes that functionality easier. 8 | 9 | ## Related projects 10 | 11 | There are a few other libraries to interact with HTML in Python, but most are focused on the retrieval of HTML and searching through the document. However, they are listed below in case they might be useful. 12 | 13 | ### Beautiful Soup related 14 | 15 | - [`SoupSieve`](https://facelessuser.github.io/soupsieve/): provides selecting, matching, and filtering using modern CSS selectors. It provides the functionality used by the `select` function in `Beautiful Soup` which is also used by `minestrone`, however it can be used separately. 
16 | - [`soupy`](https://soupy.readthedocs.io/): wrapper around `Beautiful Soup` that makes it easier to search through HTML and XML documents. 17 | - [`fast-soup`](https://pypi.org/project/fast-soup/): faster `Beautiful Soup` search via `lxml`. 18 | - [`BeautifulSauce`](https://github.com/nateraw/BeautifulSauce): `Beautiful Soup`'s saucy sibling! 19 | - [`SoupCan`](https://pypi.org/project/soupcan/): simplifies the process of designing a Python tool for extracting and displaying webpage content. 20 | 21 | ### Beautiful Soup replacements 22 | 23 | - [`lxml.html`](https://lxml.de/lxmlhtml.html): based on `lxml`, but provides a special Element API for HTML elements, as well as a number of utilities for common HTML processing tasks. 24 | - [`html.parser`](https://docs.python.org/3/library/html.parser.html): simple HTML and XHTML parser in the standard library. 25 | - [`parsel`](https://parsel.readthedocs.io/): Parsel is a BSD-licensed Python library to extract data from HTML, JSON, and XML documents. 26 | - [`selectolax`](https://github.com/rushter/selectolax): a fast HTML5 parser with CSS selectors. 27 | - [`gazpacho`](https://pypi.org/project/gazpacho/): simple, fast, and modern web scraping library. The library is stable, actively maintained, and installed with zero dependencies. 28 | - [`Requests-HTML`](https://requests-html.kennethreitz.org/): HTML Parsing for Humans. It intends to make parsing HTML (e.g. scraping the web) as simple and intuitive as possible. 29 | - [`regex`](https://docs.python.org/3/library/re.html): just kidding, you probably shouldn't use `regex` to parse HTML unless you are a masochist.
30 | 31 | ```{toctree} 32 | :maxdepth: 2 33 | :hidden: 34 | 35 | self 36 | installation 37 | changelog 38 | ``` 39 | 40 | ```{toctree} 41 | :caption: HTML 42 | :maxdepth: 2 43 | :hidden: 44 | 45 | parsing 46 | querying 47 | element 48 | editing 49 | ``` 50 | 51 | ```{toctree} 52 | :caption: Links 53 | :maxdepth: 2 54 | :hidden: 55 | 56 | GitHub 57 | Sponsor 58 | ``` 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,jupyternotebooks 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### Python ### 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | cover/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | db.sqlite3 81 | db.sqlite3-journal 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | .pybuilder/ 95 | target/ 96 | 97 | # Jupyter Notebook 98 | 99 | # IPython 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # End of https://www.toptal.com/developers/gitignore/api/python,jupyternotebooks 157 | -------------------------------------------------------------------------------- /docs/source/querying.md: -------------------------------------------------------------------------------- 1 | # Querying 2 | 3 | `minestrone` allows searching through HTML via CSS selectors (similar to jQuery or other frontend libraries). 4 | 5 | ```{note} 6 | Querying uses the [`select`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors) method in `Beautiful Soup` which delegates to `SoupSieve`. More details about `SoupSieve` are available in [their documentation](https://facelessuser.github.io/soupsieve/). 7 | ``` 8 | 9 | ## root_element 10 | 11 | Gets the root [element](element.md) of the HTML. 12 | 13 | ```python 14 | from minestrone import HTML 15 | html = HTML(""" 16 |

17 | Dormouse 18 |

19 | """) 20 | 21 | assert html.root_element.name == "div" 22 | ``` 23 | 24 | ## elements 25 | 26 | Recursively get all [elements](element.md) in the HTML. 27 | 28 | ```python 29 | from minestrone import HTML 30 | html = HTML(""" 31 |

32 | Dormouse 33 |

34 | """) 35 | 36 | assert [e.name for e in html.elements] == ["div", "span"] 37 | ``` 38 | 39 | ## query 40 | 41 | Takes a CSS selector and returns an iterator of [`Element`](element.md) items. 42 | 43 | ### Query by element name 44 | 45 | ```python 46 | from minestrone import HTML 47 | html = HTML(""" 48 |

The Dormouse's Story

49 |

There was a table...

50 | """) 51 | 52 | for h1 in html.query("h1"): 53 | assert str(h1) == "

The Dormouse's Story

" 54 | ``` 55 | 56 | ### Query by id 57 | 58 | ```python 59 | from minestrone import HTML 60 | html = HTML(""" 61 |

Elsie
Lacie

65 | """) 66 | 67 | for a in html.query("a#elsie"): 68 | assert str(a) == 'Elsie' 69 | ``` 70 | 71 | ### Query by class 72 | 73 | ```python 74 | from minestrone import HTML 75 | html = HTML(""" 76 |

Elsie
Lacie

80 | """) 81 | 82 | elsie_link = next(html.query("ul li a.sister")) 83 | assert str(elsie_link) == 'Elsie' 84 | 85 | lacie_link = next(html.query("ul li a.sister")) 86 | assert str(lacie_link) == 'Lacie' 87 | ``` 88 | 89 | ## query_to_list 90 | 91 | Exactly the same as [query](querying.md#query) except it returns a list of [`Element`](element.md) items instead of a generator. This is sometimes more useful than the `query` above, but it can take more time to parse and more memory to store the data if the HTML document is large. 92 | 93 | ```python 94 | from minestrone import HTML 95 | html = HTML(""" 96 |

Elsie
Lacie

100 | """) 101 | 102 | assert len(html.query_to_list("a")) == 2 103 | assert str(html.query_to_list("a")[0]) == 'Elsie' 104 | assert html.query_to_list("a") == list(html.query("a")) 105 | ``` 106 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "minestrone" 3 | authors = [{name = "Adam Hill", email = "adam@adamghill.com"}] 4 | dynamic = ["version", "description"] 5 | license = { file = "LICENSE" } 6 | 7 | [tool.poetry] 8 | name = "minestrone" 9 | version = "0.8.0" 10 | description = "Search, modify, and parse messy HTML with ease." 11 | authors = ["adamghill "] 12 | license = "MIT" 13 | readme = "README.md" 14 | repository = "https://github.com/adamghill/minestrone/" 15 | homepage = "https://github.com/adamghill/minestrone/" 16 | documentation = "http://minestrone.readthedocs.io/" 17 | keywords = ["python", "html", "beautifulsoup"] 18 | 19 | [tool.poetry.urls] 20 | "Funding" = "https://github.com/sponsors/adamghill" 21 | 22 | [tool.poetry.dependencies] 23 | python = ">=3.7,<4.0.0" 24 | beautifulsoup4 = "^4.10.0" 25 | 26 | # optional parsers 27 | lxml = { version = "4.9.1", optional = true } 28 | html5lib = { version = "1.1", optional = true } 29 | 30 | # docs 31 | Sphinx = { version = "^4.3.2", optional = true } 32 | linkify-it-py = { version = "^1.0.3", optional = true } 33 | myst-parser = { version = "^0.16.1", optional = true } 34 | furo = { version = "^2021.11.23", optional = true } 35 | sphinx-copybutton = { version = "^0.4.0", optional = true } 36 | attrs = { version = "^21.4.0", optional = true } 37 | toml = { version = "*", optional = true } 38 | 39 | [tool.poetry.group.dev.dependencies] 40 | pytest = "^6" 41 | black = "^22" 42 | isort = "^5" 43 | sphinx-autobuild = "^2021.3.14" 44 | types-beautifulsoup4 = "^4" 45 | mypy = "^0" 46 | coverage = {extras = ["toml"], version = "^6.2"} 47 | pytest-cov = "^3" 
48 | ruff = "^0" 49 | pytest-benchmark = "^4.0.0" 50 | 51 | [tool.poetry.extras] 52 | docs = ["Sphinx", "linkify-it-py", "myst-parser", "furo", "sphinx-copybutton", "toml", "attrs"] 53 | lxml = ["lxml"] 54 | html5 = ["html5lib"] 55 | 56 | [tool.black] 57 | line-length = 88 58 | 59 | [tool.isort] 60 | profile = "black" 61 | multi_line_output = 3 62 | 63 | [tool.pytest.ini_options] 64 | addopts = "--quiet --failed-first -p no:warnings --benchmark-skip" 65 | testpaths = [ 66 | "tests" 67 | ] 68 | 69 | [tool.ruff] 70 | line-length = 88 71 | select = ["E", "F"] 72 | ignore = [] 73 | extend-select = ["D"] 74 | extend-ignore = [ 75 | "D101", 76 | "D102", 77 | "D103", 78 | "D105", 79 | "D106", 80 | "D202", 81 | "D203", 82 | "D204", 83 | "D213", 84 | "D215", 85 | "D400", 86 | "D404", 87 | "D406", 88 | "D407", 89 | "D408", 90 | "D409", 91 | "D413", 92 | "D100", 93 | ] 94 | 95 | [tool.coverage.run] 96 | branch = true 97 | parallel = true 98 | 99 | [tool.coverage.report] 100 | show_missing = true 101 | skip_covered = true 102 | skip_empty = true 103 | 104 | [tool.poe.tasks] 105 | t = { cmd = "pytest", help = "Run tests" } 106 | tc = { cmd = "pytest --cov=minestrone", help = "Run tests with coverage" } 107 | r = { cmd = "coverage report", help = "Show coverage report" } 108 | my = { cmd = "mypy .", help = "Run mypy" } 109 | b = { cmd = "black . --check --quiet", help = "Run black" } 110 | i = { cmd = "isort . 
--check --quiet", help = "Run isort" } 111 | tm = ["b", "i", "t", "my"] 112 | sa = { cmd = "sphinx-autobuild -W docs/source docs/build", help = "Sphinx autobuild" } 113 | sb = { cmd = "sphinx-build -W docs/source docs/build", help = "Build documentation" } 114 | publish = { shell = "poetry publish --build -r test && poetry publish" } 115 | 116 | [build-system] 117 | requires = ["poetry-core>=1.0.0"] 118 | build-backend = "poetry.core.masonry.api" 119 | -------------------------------------------------------------------------------- /docs/source/element.md: -------------------------------------------------------------------------------- 1 | # Element 2 | 3 | `Element`s are returned from [querying](querying.md) methods. 4 | 5 | ## properties 6 | 7 | ### name 8 | 9 | Gets the name of the `Element`. 10 | 11 | ```python 12 | html = HTML("Dormouse") 13 | span_element = html.root_element 14 | 15 | assert span_element.name == "span" 16 | ``` 17 | 18 | ### id 19 | 20 | #### Get the id 21 | 22 | ```python 23 | html = HTML('Dormouse') 24 | span_element = html.root_element 25 | 26 | assert span_element.id == "dormouse" 27 | ``` 28 | 29 | #### Set the id 30 | 31 | ```python 32 | html = HTML("Dormouse") 33 | span_element = html.root_element 34 | 35 | span_element.id = "dormouse" 36 | assert span_element.id == "dormouse" 37 | ``` 38 | 39 | ### attributes 40 | 41 | #### Get attributes 42 | 43 | ```python 44 | html = HTML('') 45 | button_element = html.root_element 46 | 47 | assert button_element.attributes == {"class": "mt-2 pb-2", "disabled": True} 48 | ``` 49 | 50 | #### Set attributes 51 | 52 | ```python 53 | html = HTML("") 54 | button_element = html.root_element 55 | button_element.attributes = {"class": "mt-2 pb-2", "disabled": True} 56 | 57 | assert str(button_element) == '' 58 | ``` 59 | 60 | ### classes 61 | 62 | Gets a list of classes for the element. 
63 | 64 | ```python 65 | html = HTML('') 66 | button_element = html.root_element 67 | 68 | assert button_element.classes == ["mt-2", "pb-2"] 69 | ``` 70 | 71 | ### text 72 | 73 | #### Get text content 74 | 75 | ```python 76 | html = HTML("") 77 | button_element = html.root_element 78 | 79 | assert button_element.text == "Wake Up" 80 | ``` 81 | 82 | #### Set text content 83 | 84 | ```python 85 | html = HTML("") 86 | button_element = html.root_element 87 | 88 | button_element.text = "Go back to sleep" 89 | 90 | assert str(button_element) == "" 91 | ``` 92 | 93 | ### children 94 | 95 | Gets an iterator of the children for the element. 96 | 97 | ```python 98 | html = HTML(""" 99 |

104 | """) 105 | ul_element = html.root_element 106 | 107 | assert len(list(ul_element.children)) == 3 108 | ``` 109 | 110 | ### parent 111 | 112 | Gets the parent for the element. 113 | 114 | ```python 115 | html = HTML(""" 116 |

119 | """) 120 | li_element = next(html.query("#li-1")) 121 | 122 | assert li_element.parent.name == "ul" 123 | ``` 124 | 125 | ## methods 126 | 127 | ### insert 128 | 129 | Inserts an element into an element. 130 | 131 | ```python 132 | html = HTML("

") 133 | ul_element = next(html.query("ul")) 134 | 135 | li_element = Element.create("li", "item") 136 | ul_element.insert(li_element) 137 | 138 | assert str(ul_element) == "

item

" 139 | ``` 140 | 141 | ```python 142 | html = HTML("

item

") 143 | ul_element = next(html.query("ul")) 144 | 145 | li_element = Element.create("li", "another item") 146 | ul_element.insert(li_element, -1) 147 | 148 | assert str(ul_element) == "

item
another item

" 149 | ``` 150 | 151 | ### prettify 152 | 153 | Returns a prettified version of the element. 154 | 155 | ```python 156 | html = HTML('

') 157 | ul_element = next(html.query("ul")) 158 | 159 | assert ul_element.prettify() == """ 160 |

163 | """ 164 | ``` 165 | 166 | ### remove_children 167 | 168 | Removes all children from an element. 169 | 170 | ```python 171 | html = HTML(''' 172 |

176 | ''') 177 | ul_element = next(html.query("ul")) 178 | ul_element.remove_children() 179 | 180 | assert str(ul_element) == "

" 181 | ``` 182 | -------------------------------------------------------------------------------- /tests/html/test_html.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | # Test that `HTML` is imported from the asterisk 6 | from minestrone import * 7 | 8 | 9 | def test_html_str_unsorted_attributes(html_doc): 10 | with open("tests/samples/html_doc.html", "r") as f: 11 | original_html_doc = f.read() 12 | 13 | # All spaces are replaced which isn't great 14 | expected = original_html_doc 15 | expected = re.sub(r"^\s+", "", expected, flags=re.MULTILINE) 16 | actual = str(html_doc) 17 | 18 | assert actual == expected 19 | 20 | 21 | def test_html_str_closes_tags(): 22 | html = HTML( 23 | """ 24 | 25 | The Dormouse's story 26 | 27 | 28 |

The Dormouse's story

""" 29 | ) 30 | 31 | expected = """ 32 | 33 | The Dormouse's story 34 | 35 | 36 |

The Dormouse's story

""" 37 | 38 | actual = str(html) 39 | 40 | assert actual == expected 41 | 42 | 43 | def test_html_fragments(): 44 | html = HTML( 45 | """

The Dormouse's story

46 |

Elsie
Lacie
Tillie

""" 51 | ) 52 | 53 | expected = """

The Dormouse's story

54 |

Elsie
Lacie
Tillie

""" 59 | 60 | actual = str(html) 61 | 62 | assert actual == expected 63 | 64 | 65 | def test_html_html_strings(html_fragment): 66 | html = HTML(html_fragment) 67 | 68 | assert str(html_fragment) == str(html) 69 | 70 | 71 | def test_html_html_soups(html_fragment): 72 | html = HTML(html_fragment) 73 | 74 | assert html_fragment._soup == html._soup 75 | 76 | 77 | def test_html_wrong_type(): 78 | with pytest.raises(Exception): 79 | HTML(1) 80 | 81 | 82 | def test_html_repr(html_fragment): 83 | assert repr(html_fragment) == str(html_fragment) 84 | 85 | 86 | def test_html_parser_fragment(html_fragment_str): 87 | html_parsed_with_html = HTML(html_fragment_str, parser=Parser.HTML) 88 | assert html_parsed_with_html 89 | 90 | html_parsed_with_lxml = HTML(html_fragment_str, parser=Parser.LXML) 91 | assert html_parsed_with_lxml 92 | 93 | html_parsed_with_html5 = HTML(html_fragment_str, parser=Parser.HTML5) 94 | assert html_parsed_with_html5 95 | 96 | 97 | def test_html_parser_fragment_html(): 98 | assert str(HTML("dormouse", parser=Parser.HTML)) == "dormouse" 99 | 100 | 101 | def test_html_parser_fragment_lxml(): 102 | assert ( 103 | str(HTML("dormouse", parser=Parser.LXML)) 104 | == "dormouse" 105 | ) 106 | 107 | 108 | def test_html_parser_fragment_html5(): 109 | assert ( 110 | str(HTML("dormouse", parser=Parser.HTML5)) 111 | == "dormouse" 112 | ) 113 | 114 | 115 | def test_html_parser_doc(html_doc_str): 116 | html_parsed_with_html = HTML(html_doc_str, parser=Parser.HTML) 117 | assert html_parsed_with_html 118 | 119 | html_parsed_with_lxml = HTML(html_doc_str, parser=Parser.LXML) 120 | assert html_parsed_with_lxml 121 | 122 | html_parsed_with_html5 = HTML(html_doc_str, parser=Parser.HTML5) 123 | assert html_parsed_with_html5 124 | 125 | 126 | def test_html_encoding(): 127 | html = HTML(b"

\xed\xe5\xec\xf9

") 128 | assert str(html) == "

翴檛

" 129 | assert html.encoding == "big5" 130 | 131 | html = HTML(b"

\xed\xe5\xec\xf9

", encoding="iso-8859-8") 132 | assert str(html) == "

םולש

" 133 | assert html.encoding == "iso-8859-8" 134 | -------------------------------------------------------------------------------- /minestrone/__init__.py: -------------------------------------------------------------------------------- 1 | """minestrone - Search, modify, and parse messy HTML with ease.""" 2 | 3 | from enum import Enum 4 | from typing import Iterator, List, Optional, Union 5 | 6 | import bs4 7 | 8 | from .element import Element, Text 9 | from .formatter import UnsortedAttributes 10 | 11 | __all__ = [ 12 | "HTML", 13 | "Element", 14 | "Text", 15 | "Parser", 16 | ] 17 | 18 | 19 | class Parser(Enum): 20 | HTML = "html.parser" 21 | LXML = "lxml" 22 | HTML5 = "html5lib" 23 | 24 | 25 | class HTML: 26 | encoding: Optional[str] = "utf-8" 27 | 28 | def __init__( 29 | self, 30 | html: Union[str, "HTML"], 31 | parser: Parser = Parser.HTML, 32 | encoding: str = None, 33 | ): 34 | self.html = html 35 | self.parser = parser 36 | 37 | if isinstance(html, str) or isinstance(html, bytes): 38 | self._soup = bs4.BeautifulSoup( 39 | html, features=parser.value, from_encoding=encoding 40 | ) 41 | elif isinstance(html, HTML): 42 | self._soup = html._soup 43 | self.html = html.html 44 | else: 45 | raise Exception("Unknown type to init HTML") 46 | 47 | if encoding: 48 | self.encoding = encoding 49 | else: 50 | self.encoding = self._soup.original_encoding 51 | 52 | def query(self, selector: str) -> Iterator[Element]: 53 | """Returns an iterator of `Element`s that match the CSS selector.""" 54 | 55 | for _tag in self._soup.select(selector): 56 | yield Element.convert_from_tag(self._soup, _tag) 57 | 58 | def query_to_list(self, selector: str) -> List[Element]: 59 | """Returns a list of `Element`s that match the CSS selector.""" 60 | 61 | return list(self.query(selector)) 62 | 63 | def prettify( 64 | self, indent: int = 2, max_line_length: int = 88, use_bs4: bool = False 65 | ) -> str: 66 | """Prettify HTML. 
67 | 68 | Args: 69 | indent: How many spaces to indent for each level in the hierarchy. Defaults to 2. 70 | max_line_length: How long the line can reach before indenting another level. Defaults to 88. If `None` it will never used. 71 | use_bs4: Whether to use the `BeautifulSoup` `prettify` function or `minestrone`. Defaults to `False`. 72 | """ 73 | 74 | if use_bs4: 75 | return self._soup.prettify() 76 | 77 | strings = [] 78 | 79 | for top_level_child in self._soup.contents: 80 | if isinstance(top_level_child, bs4.Doctype) and top_level_child: 81 | strings.append("\n") 84 | elif isinstance(top_level_child, bs4.Tag): 85 | element = Element.convert_from_tag(self._soup, top_level_child) 86 | strings.append(element.prettify(indent, max_line_length)) 87 | elif isinstance(top_level_child, bs4.Comment): 88 | strings.append("") 91 | strings.append("\n") 92 | elif isinstance(top_level_child, str) and top_level_child != "\n": 93 | strings.append(top_level_child.strip()) 94 | strings.append("\n") 95 | 96 | return "".join(strings) 97 | 98 | @property 99 | def root_element(self) -> Optional[Element]: 100 | """Gets the root `Element` for the HTML.""" 101 | 102 | for _element in self._soup.contents: 103 | if isinstance(_element, bs4.element.Tag) and _element.name: 104 | return Element.convert_from_tag(self._soup, _element) 105 | 106 | return None 107 | 108 | @property 109 | def elements(self) -> Iterator[Element]: 110 | """Recursively yield all `Element`s in the HTML.""" 111 | 112 | for _element in self._soup.descendants: 113 | if isinstance(_element, bs4.element.Tag) and _element.name: 114 | yield Element.convert_from_tag(self._soup, _element) 115 | 116 | def __str__(self): 117 | # Cleans up `BeautifulSoup` modifications 118 | self._soup.smooth() 119 | 120 | # Prevent `BeautifulSoup` from re-ordering attributes in alphabetical order 121 | return self._soup.encode(formatter=UnsortedAttributes()).decode() 122 | 123 | def __repr__(self): 124 | return self.__str__() 125 | 
-------------------------------------------------------------------------------- /minestrone/element/prettifier.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | 3 | from . import VOID_ELEMENTS, Element 4 | 5 | 6 | def prettify_element( 7 | element: Element, 8 | indent: int, 9 | max_line_length: int, 10 | spaces: str = "", 11 | ) -> str: 12 | def __increase_spaces(_spaces): 13 | return " " * (len(_spaces) + indent) 14 | 15 | def __decrease_spaces(_spaces): 16 | return " " * (len(_spaces) - indent) 17 | 18 | def __append_newline_if_needed(_strings): 19 | if not _strings[-1].endswith("\n"): 20 | _strings.append("\n") 21 | 22 | def __append_string(_strings, _string): 23 | if _string: 24 | _strings.append(_string) 25 | 26 | strings = [] 27 | __append_string(strings, spaces) 28 | __append_string(strings, element.tag_string) 29 | 30 | content_children = [ 31 | c 32 | for c in element._self.contents 33 | if (isinstance(c, str) and c != "\n") 34 | or isinstance(c, bs4.element.Tag) 35 | or isinstance(c, bs4.element.Comment) 36 | ] 37 | has_children = False 38 | 39 | for child in element.children: 40 | if has_children is False: 41 | if content_children: 42 | extra_child_spaces = __increase_spaces(spaces) 43 | 44 | for content_child in content_children.copy(): 45 | content_children.pop(0) 46 | 47 | if isinstance(content_child, str): 48 | child_text = content_child.strip() 49 | 50 | if child_text: 51 | # Make sure that any newlines are indented to the correct number of spaces 52 | child_text = child_text.replace( 53 | "\n", f"\n{extra_child_spaces}" 54 | ) 55 | 56 | __append_string(strings, "\n") 57 | __append_string(strings, extra_child_spaces) 58 | __append_string(strings, child_text) 59 | else: 60 | break 61 | 62 | if element.name not in VOID_ELEMENTS: 63 | # Only increase the number of spaces if the current element can have children 64 | # and it's the first child 65 | spaces = __increase_spaces(spaces) 66 | 67 | 
__append_newline_if_needed(strings) 68 | 69 | has_children = True 70 | __append_string( 71 | strings, prettify_element(child, indent, max_line_length, spaces=spaces) 72 | ) 73 | 74 | if content_children: 75 | extra_child_spaces = __increase_spaces(spaces) 76 | 77 | for child in content_children.copy(): 78 | content_children.pop(0) 79 | 80 | if isinstance(child, str): 81 | child_text = child.strip() 82 | 83 | if child_text: 84 | # Make sure that any newlines are indented to the correct number of spaces 85 | child_text = child_text.replace("\n", f"\n{spaces}") 86 | __append_string(strings, spaces) 87 | 88 | if isinstance(child, bs4.Comment): 89 | __append_string(strings, "") 95 | else: 96 | break 97 | 98 | if has_children: 99 | spaces = __decrease_spaces(spaces) 100 | 101 | __append_newline_if_needed(strings) 102 | __append_string(strings, spaces) 103 | __append_string(strings, element.closing_tag_string) 104 | else: 105 | is_long_line = False 106 | 107 | if max_line_length is not None and len(element._self.text) > max_line_length: 108 | is_long_line = True 109 | 110 | if is_long_line: 111 | spaces = __increase_spaces(spaces) 112 | __append_string(strings, "\n") 113 | __append_string(strings, spaces) 114 | 115 | __append_string(strings, element._self.text) 116 | 117 | if is_long_line: 118 | spaces = __decrease_spaces(spaces) 119 | __append_string(strings, "\n") 120 | __append_string(strings, spaces) 121 | 122 | __append_string(strings, element.closing_tag_string) 123 | 124 | __append_newline_if_needed(strings) 125 | 126 | return "".join(strings) 127 | -------------------------------------------------------------------------------- /docs/source/parsing.md: -------------------------------------------------------------------------------- 1 | # Parsing 2 | 3 | The `HTML` class parses a string of HTML and provides methods to [query](querying.md) the DOM for specific elements. 4 | 5 | ## \_\_init\_\_ 6 | 7 | Creates an `HTML` object from a `str` or `bytes`. 
8 | 9 | ```python 10 | from minestrone import HTML 11 | html = HTML(""" 12 | 13 | 14 | The Dormouse's Story 15 | 16 | 17 |

The Dormouse's Story

18 | 19 |

Elsie
Lacie

23 | 24 | 25 | """) 26 | ``` 27 | 28 | If closing tags are missing, then they will be added as needed to make the HTML valid. 29 | 30 | ```python 31 | from minestrone import HTML 32 | assert str(HTML("dormouse")) == "dormouse" 33 | ``` 34 | 35 | ### parser 36 | Three parsers are available in `minestrone` and they all have different trade-offs. By default, the built-in, pure Python `html.parser` is used. `lxml` can be used for faster parsing speed. `html5lib` is another option to ensure a valid HTML5 document. 37 | 38 | ```{note} 39 | `lxml` and `html5lib` are not installed with `minestrone` by default and must be specifically installed. 40 | 41 | - `poetry add minestrone[lxml]` or `pip install minestrone[lxml]` 42 | - `poetry add minestrone[html5]` or `pip install minestrone[html5]` 43 | ``` 44 | 45 | ```{note} 46 | BeautifulSoup has a [summary table](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser) of the three parsers. There is also a more detailed [breakdown of the differences](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers) between the parsers. 47 | ``` 48 | 49 | ### Parser.HTML 50 | 51 | ```python 52 | from minestrone import HTML, Parser 53 | assert str(HTML("dormouse", parser=Parser.HTML)) == "dormouse" 54 | ``` 55 | 56 | ### Parser.LXML 57 | 58 | ```python 59 | from minestrone import HTML, Parser 60 | assert str(HTML("dormouse", parser=Parser.LXML)) == "dormouse" 61 | ``` 62 | 63 | ### Parser.HTML5 64 | 65 | ```python 66 | from minestrone import HTML, Parser 67 | assert str(HTML("dormouse", parser=Parser.HTML5)) == "dormouse" 68 | ``` 69 | 70 | ## encoding 71 | 72 | `Beautiful Soup` [attempts to decipher the encoding](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings) of the HTML string, however it isn't always correct. An encoding can be passed along if necessary. 73 | 74 | ```python 75 | from minestrone import HTML, Parser 76 | html_bytes = b"

\xed\xe5\xec\xf9

" 77 | 78 | assert str(HTML(html_bytes)) == "

翴檛

" 79 | assert HTML(html_bytes).encoding == "big5" 80 | 81 | assert str(HTML(html_bytes, encoding="iso-8859-8")) == "

םולש

" 82 | assert HTML(html_bytes, encoding="iso-8859-8").encoding == "iso-8859-8" 83 | ``` 84 | 85 | ## prettify 86 | 87 | Returns a prettified version of the HTML. 88 | 89 | ```python 90 | html = HTML(""" 91 | 92 | 93 | The Dormouse's Story 94 | 95 | 96 |

The Dormouse's Story

97 | 98 |

Elsie
Lacie

102 | 103 | 104 | """) 105 | 106 | assert html.prettify() == """ 107 | 108 | The Dormouse's Story 109 | 110 | 111 |

The Dormouse's Story

112 |

114 | Elsie 115 |
117 | Lacie 118 |

120 | 121 | 122 | """ 123 | ``` 124 | 125 | 126 | ## \_\_str\_\_ 127 | 128 | Returns the `HTML` object as a string. 129 | 130 | ```python 131 | from minestrone import HTML 132 | html = HTML(""" 133 | 134 | 135 | The Dormouse's Story 136 | 137 | 138 |

The Dormouse's Story

139 | 140 |

Elsie
Lacie

144 | 145 | 146 | """) 147 | 148 | assert str(html) == """ 149 | 150 | The Dormouse's Story 151 | 152 | 153 |

The Dormouse's Story

154 |

Elsie
Lacie

158 | 159 | """ 160 | ``` 161 | 162 | ```{note} 163 | Rendering the `HTML` into a string _will_ remove preceding spaces. 164 | ``` 165 | -------------------------------------------------------------------------------- /tests/element/test_create_tags.py: -------------------------------------------------------------------------------- 1 | from minestrone import HTML, Text 2 | 3 | 4 | def test_create_tag_with_true_attribute_value(html_fragment: HTML): 5 | elsie = next(html_fragment.query("a#elsie")) 6 | actual = elsie._create_tag("button", "Save", disabled=True) 7 | 8 | expected = '' 9 | 10 | assert str(actual) == expected 11 | 12 | 13 | def test_create_tag_with_klass(html_fragment: HTML): 14 | elsie = next(html_fragment.query("a#elsie")) 15 | actual = elsie._create_tag("button", "Save", klass="test-class") 16 | 17 | expected = '' 18 | 19 | assert str(actual) == expected 20 | 21 | 22 | def test_prepend(html_fragment: HTML): 23 | elsie = next(html_fragment.query("a#elsie")) 24 | elsie.prepend("span", "test prepend content", klass="test-class") 25 | 26 | expected = """

test prepend contentElsie
Lacie
Tillie

""" 31 | 32 | actual = str(html_fragment) 33 | 34 | assert actual == expected 35 | 36 | 37 | def test_text_str(html_fragment: HTML): 38 | elsie = next(html_fragment.query("a#elsie")) 39 | text = elsie.prepend(text="test prepend text content") 40 | 41 | expected = "test prepend text content" 42 | actual = str(text) 43 | 44 | assert actual == expected 45 | 46 | 47 | def test_text_repr(html_fragment: HTML): 48 | elsie = next(html_fragment.query("a#elsie")) 49 | text = elsie.prepend(text="test prepend text content") 50 | 51 | expected = "test prepend text content" 52 | actual = repr(text) 53 | 54 | assert actual == expected 55 | 56 | 57 | def test_prepend_text(html_fragment: HTML): 58 | elsie = next(html_fragment.query("a#elsie")) 59 | elsie.prepend(text="test prepend text content") 60 | 61 | expected = """

test prepend text contentElsie
Lacie
Tillie

""" 66 | 67 | actual = str(html_fragment) 68 | 69 | assert actual == expected 70 | 71 | 72 | def test_append(html_fragment: HTML): 73 | elsie = next(html_fragment.query("a#elsie")) 74 | elsie.append("span", "test append content", klass="test-class") 75 | 76 | expected = """

Elsietest append content
Lacie
Tillie

""" 81 | 82 | actual = str(html_fragment) 83 | 84 | assert actual == expected 85 | 86 | 87 | def test_append_text(html_fragment: HTML): 88 | elsie = next(html_fragment.query("a#elsie")) 89 | 90 | text = elsie.append(text="test append text content") 91 | assert text 92 | assert isinstance(text, Text) 93 | 94 | expected = """

Elsietest append text content
Lacie
Tillie

Elsietest append 1 test append 2
Lacie
Tillie

""" 120 | 121 | actual = str(html_fragment) 122 | 123 | assert actual == expected 124 | 125 | 126 | def test_append_multiple(html_fragment: HTML): 127 | elsie = next(html_fragment.query("a#elsie")) 128 | new_tag = elsie.append("span", "test append content 1", klass="test-class") 129 | new_tag.append("span", "test append content 2", klass="test-class") 130 | 131 | expected = """

Elsietest append content 1test append content 2
Lacie
Tillie

""" 136 | 137 | actual = str(html_fragment) 138 | 139 | assert actual == expected 140 | -------------------------------------------------------------------------------- /tests/test_benchmarks.py: -------------------------------------------------------------------------------- 1 | # Run this with `poe t tests/test_benchmarks.py --benchmark-only` 2 | from typing import Iterator 3 | 4 | import re 5 | from minestrone import HTML, Element 6 | from html.parser import HTMLParser 7 | 8 | from lxml import html as lxml_html 9 | from bs4 import BeautifulSoup, Tag 10 | 11 | UNICORN_MODEL_REGEX = re.compile( 12 | r"(unicorn:model|u:model)(\.[^=]+)?=[\"'](?P[^\"']+)[\"']" 13 | ) 14 | 15 | HTML_FRAGMENT = """

16 |

17 |

Step 2

18 |

19 | 20 | 21 | 22 | 23 |

24 |

30 | 31 | 32 |

33 |

""" 34 | 35 | EXPECTED = ["address", "city", "state", "zip_code"] 36 | 37 | 38 | def _parse_beautiful_soup(soup: BeautifulSoup): 39 | unicorn_model_names = [] 40 | 41 | for tag in soup.descendants: 42 | if isinstance(tag, Tag) and tag.name: 43 | for attr in tag.attrs.keys(): 44 | if attr.startswith("unicorn:model") or attr.startswith("u:model"): 45 | unicorn_model_names.append(tag.attrs[attr]) 46 | 47 | return unicorn_model_names 48 | 49 | 50 | def test_beautiful_soup_html_parser_with_existing_soup(benchmark): 51 | html_parser_soup = BeautifulSoup(HTML_FRAGMENT, features="html.parser") 52 | 53 | def _(): 54 | return _parse_beautiful_soup(html_parser_soup) 55 | 56 | actual = benchmark(_) 57 | assert EXPECTED == actual 58 | 59 | 60 | def test_beautiful_soup_html_parser(benchmark): 61 | def _(): 62 | soup = BeautifulSoup(HTML_FRAGMENT, features="html.parser") 63 | return _parse_beautiful_soup(soup) 64 | 65 | actual = benchmark(_) 66 | assert EXPECTED == actual 67 | 68 | 69 | def test_beautiful_soup_lxml(benchmark): 70 | def _(): 71 | soup = BeautifulSoup(HTML_FRAGMENT, features="lxml") 72 | return _parse_beautiful_soup(soup) 73 | 74 | actual = benchmark(_) 75 | assert EXPECTED == actual 76 | 77 | 78 | def test_beautiful_soup_lxml_with_existing_soup(benchmark): 79 | lxml_soup = BeautifulSoup(HTML_FRAGMENT, features="lxml") 80 | 81 | def _(): 82 | return _parse_beautiful_soup(lxml_soup) 83 | 84 | actual = benchmark(_) 85 | assert EXPECTED == actual 86 | 87 | 88 | def test_regex(benchmark): 89 | def _(): 90 | return [ 91 | m.group("unicorn_model_name") 92 | for m in UNICORN_MODEL_REGEX.finditer(HTML_FRAGMENT) 93 | ] 94 | 95 | actual = benchmark(_) 96 | 97 | assert EXPECTED == actual 98 | 99 | 100 | def _minestrone_get_unicorn_models(element: Element) -> Iterator[str]: 101 | for attribute in element.attributes.keys(): 102 | if attribute.startswith("unicorn:model") or attribute.startswith("u:model"): 103 | yield element.attributes[attribute] 104 | 105 | 106 | def 
test_minestrone(benchmark): 107 | def _(): 108 | minestrone_html = HTML(HTML_FRAGMENT) 109 | unicorn_model_names = [] 110 | 111 | for element in minestrone_html.elements: 112 | for attribute_value in _minestrone_get_unicorn_models(element): 113 | unicorn_model_names.append(attribute_value) 114 | 115 | return unicorn_model_names 116 | 117 | actual = benchmark(_) 118 | 119 | assert EXPECTED == actual 120 | 121 | 122 | def test_minestrone_with_existing_html(benchmark): 123 | minestrone_html = HTML(HTML_FRAGMENT) 124 | 125 | def _(): 126 | unicorn_model_names = [] 127 | 128 | for element in minestrone_html.elements: 129 | for attribute_value in _minestrone_get_unicorn_models(element): 130 | unicorn_model_names.append(attribute_value) 131 | 132 | return unicorn_model_names 133 | 134 | actual = benchmark(_) 135 | 136 | assert EXPECTED == actual 137 | 138 | 139 | # def test_parsel(benchmark): 140 | # from parsel import Selector 141 | # def _(): 142 | # selector = Selector(html) 143 | 144 | # r = selector.css("input::attr(unicorn:model)").getall() 145 | 146 | # print(r) 147 | 148 | # return [] 149 | 150 | # unicorn_model_names = benchmark(_) 151 | # assert len(unicorn_model_names) == 4 152 | 153 | 154 | def test_lxml_html(benchmark): 155 | def _(): 156 | unicorn_model_names = [] 157 | 158 | for element in lxml_html.fragment_fromstring(HTML_FRAGMENT).iter(): 159 | for attrs in element.items(): 160 | if attrs[0].startswith("unicorn:model") or attrs[0].startswith( 161 | "u:model" 162 | ): 163 | unicorn_model_names.append(attrs[1]) 164 | 165 | return unicorn_model_names 166 | 167 | actual = benchmark(_) 168 | 169 | assert EXPECTED == actual 170 | 171 | 172 | class UnicornModelParser(HTMLParser): 173 | def feed(self, data): 174 | self.unicorn_model_names = [] 175 | 176 | super().reset() 177 | super().feed(data) 178 | 179 | def handle_starttag(self, tag, attrs: list): 180 | for attr in attrs: 181 | if attr[0].startswith("unicorn:model") or attr[0].startswith("u:model"): 182 | 
self.unicorn_model_names.append(attr[1]) 183 | 184 | 185 | def test_html_parser(benchmark): 186 | def _(): 187 | parser = UnicornModelParser() 188 | parser.feed(HTML_FRAGMENT) 189 | 190 | return parser.unicorn_model_names 191 | 192 | actual = benchmark(_) 193 | 194 | assert EXPECTED == actual 195 | -------------------------------------------------------------------------------- /tests/element/test_element.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import pytest 3 | 4 | from minestrone import HTML, Element 5 | 6 | 7 | def test_get_text(html_doc): 8 | tillie = next(html_doc.query("a#tillie")) 9 | 10 | expected = "Tillie" 11 | actual = tillie.text 12 | 13 | assert actual == expected 14 | 15 | 16 | def test_get_text_with_tags(): 17 | html = HTML( 18 | """ 19 |

`stuff in here` header

20 | """ 21 | ) 22 | el = next(html.query("h1")) 23 | 24 | expected = "stuff in here header" 25 | actual = el.text 26 | 27 | assert actual == expected 28 | 29 | 30 | def test_set_text(html_doc): 31 | tillie = next(html_doc.query("a#tillie")) 32 | 33 | tillie.text = "Billie" 34 | 35 | expected = "Billie" 36 | actual = tillie.text 37 | 38 | assert actual == expected 39 | 40 | 41 | def test_get_name(html_doc): 42 | tillie = next(html_doc.query("a#tillie")) 43 | 44 | expected = "a" 45 | actual = tillie.name 46 | 47 | assert actual == expected 48 | 49 | 50 | def test_element_klass(): 51 | actual = Element.create("button", "Save", klass="test-class") 52 | 53 | expected = '' 54 | 55 | assert str(actual) == expected 56 | 57 | 58 | def test_element_get_attributes(): 59 | span = Element.create( 60 | "span", 61 | "test attrs content", 62 | klass="test-class1 test-class2", 63 | disabled=True, 64 | id="span1", 65 | ) 66 | 67 | assert span.name == "span" 68 | assert span.text == "test attrs content" 69 | assert span.attributes == { 70 | "class": "test-class1 test-class2", 71 | "disabled": "disabled", 72 | "id": "span1", 73 | } 74 | 75 | 76 | def test_element_set_attributes_id(): 77 | span = Element.create( 78 | "span", 79 | ) 80 | 81 | assert span.id is None 82 | 83 | span.attributes = {"id": "test-id"} 84 | 85 | assert span.id == "test-id" 86 | assert span.attributes == { 87 | "id": "test-id", 88 | } 89 | 90 | 91 | def test_element_set_id(): 92 | span = Element.create( 93 | "span", 94 | ) 95 | 96 | assert span.id is None 97 | 98 | span.id = "test-id" 99 | 100 | assert span.id == "test-id" 101 | assert span.attributes == { 102 | "id": "test-id", 103 | } 104 | 105 | 106 | def test_element_set_attributes_klass(): 107 | span = Element.create( 108 | "span", 109 | klass="test-class1", 110 | ) 111 | 112 | span.attributes = {"klass": "test-class2 test-class3"} 113 | 114 | assert span.name == "span" 115 | assert span.attributes == { 116 | "class": "test-class2 test-class3", 117 | } 118 
| assert span.classes == ["test-class2", "test-class3"] 119 | 120 | 121 | def test_element_set_attributes_css(): 122 | span = Element.create( 123 | "span", 124 | klass="test-class1", 125 | ) 126 | 127 | span.attributes = {"css": "test-class2 test-class3"} 128 | 129 | assert span.name == "span" 130 | assert span.attributes == { 131 | "class": "test-class2 test-class3", 132 | } 133 | assert span.classes == ["test-class2", "test-class3"] 134 | 135 | 136 | def test_element_set_attributes_class_list(): 137 | span = Element.create( 138 | "span", 139 | klass="test-class1", 140 | ) 141 | 142 | span.attributes = {"css": ["test-class2", "test-class3"]} 143 | 144 | assert span.name == "span" 145 | assert span.attributes == { 146 | "class": "test-class2 test-class3", 147 | } 148 | assert span.classes == ["test-class2", "test-class3"] 149 | 150 | 151 | def test_element_set_attributes_class_tuple(): 152 | span = Element.create( 153 | "span", 154 | klass="test-class1", 155 | ) 156 | 157 | span.attributes = {"css": ("test-class2", "test-class3")} 158 | 159 | assert span.name == "span" 160 | assert span.attributes == { 161 | "class": "test-class2 test-class3", 162 | } 163 | assert span.classes == ["test-class2", "test-class3"] 164 | 165 | 166 | def test_element_set_attributes_invalid_type(): 167 | span = Element.create("span") 168 | 169 | with pytest.raises(Exception): 170 | span.attributes = {"css": 0} 171 | 172 | 173 | def test_element_classes_from_klass_kwarg(): 174 | span = Element.create( 175 | "span", 176 | klass="test-class1 test-class2", 177 | ) 178 | 179 | assert span.classes == ["test-class1", "test-class2"] 180 | 181 | 182 | def test_element_classes_from_css_kwarg(): 183 | span = Element.create( 184 | "span", 185 | css="test-class1 test-class2", 186 | ) 187 | 188 | assert span.classes == ["test-class1", "test-class2"] 189 | 190 | 191 | def test_element_classes_empty(): 192 | span = Element.create("span") 193 | 194 | assert span.classes == [] 195 | 196 | 197 | def 
test_element_children(html_doc): 198 | ul = next(html_doc.query("ul")) 199 | 200 | assert len(list(ul.children)) == 3 201 | 202 | # get generator so next() will work to get all children 203 | children = ul.children 204 | 205 | first_li = next(children) 206 | elsie = next(first_li.children) 207 | assert elsie.id == "elsie" 208 | assert len(list(elsie.children)) == 0 209 | 210 | second_li = next(children) 211 | lacie = next(second_li.children) 212 | assert lacie.id == "lacie" 213 | 214 | third_li = next(children) 215 | tillie = next(third_li.children) 216 | assert tillie.id == "tillie" 217 | 218 | 219 | def test_element_parent(html_doc): 220 | elsie = next(html_doc.query("#elsie")) 221 | 222 | assert elsie.parent 223 | assert elsie.parent.name == "li" 224 | assert elsie.parent.parent.name == "ul" 225 | 226 | 227 | def test_element_parent_none(html_doc: HTML): 228 | assert html_doc.root_element 229 | assert html_doc.root_element.parent 230 | assert html_doc.root_element.parent.parent is None 231 | 232 | 233 | def test_create_without_soup(): 234 | span = Element.create( 235 | "span", 236 | ) 237 | 238 | assert span._soup 239 | 240 | 241 | def test_create_with_soup(): 242 | soup = bs4.BeautifulSoup() 243 | 244 | span = Element.create( 245 | "span", 246 | soup=soup, 247 | ) 248 | 249 | assert span._soup 250 | assert id(span._soup) == id(soup) 251 | 252 | 253 | def test_repr(): 254 | span = Element.create( 255 | "span", 256 | ) 257 | 258 | assert repr(span) == "" 259 | 260 | 261 | def test_tag_string(html_doc): 262 | ul = next(html_doc.query("ul")) 263 | 264 | expected = "

' 274 | actual = tillie.tag_string 275 | 276 | assert actual == expected 277 | 278 | 279 | def test_tag_string_with_list_attributes(): 280 | tillie = HTML('').root_element 281 | 282 | expected = '' 283 | actual = tillie.tag_string 284 | 285 | assert actual == expected 286 | 287 | 288 | def test_closing_tag_string(html_doc): 289 | tillie = next(html_doc.query("a#tillie")) 290 | 291 | expected = "

insert
insert-index

test

Hello World

`Hello` World

Hello

20 | Elsie 21 |
23 | Lacie 24 |
26 | Tillie 27 |
33 | Elsie 34 |
36 | Lacie 37 |
39 | Tillie 40 |

51 | Elsie 52 |
54 | Lacie 55 |
57 | Tillie 58 |

66 | Elsie 67 |
69 | Lacie 70 |
72 | Tillie 73 |

85 | Elsie 86 |
88 | Lacie 89 |
91 | Tillie 92 |

100 | Elsie 101 |
103 | Lacie 104 |
106 | Tillie 107 |

119 | 120 | Voluptatum qui magni omnis molestias beatae sint dolor eius aliquid aut consequatur. Possimus optio dolores veniam voluptatibus autem iste ut et ut nostrum tempora quia facere. Reprehenderit at aut laboriosam consequatur id nulla. 121 | 122 |

Voluptatum qui magni omnis molestias beatae sint dolor eius aliquid aut consequatur. Possimus optio dolores veniam voluptatibus autem iste ut et ut nostrum tempora quia facere. Reprehenderit at aut laboriosam consequatur id nulla.

137 | Voluptatum qui magni omnis molestias beatae sint dolor eius aliquid aut consequatur. Possimus optio dolores veniam voluptatibus autem iste ut et ut nostrum tempora quia facere. Reprehenderit at aut laboriosam consequatur id nulla. 138 |

Voluptatum qui magni omnis molestias beatae sint dolor eius aliquid aut consequatur. Possimus optio dolores veniam voluptatibus autem iste ut et ut nostrum tempora quia facere. Reprehenderit at aut laboriosam consequatur id nulla.

153 | extra text 154 | Voluptatum qui 155 | even more text 156 |

extra textVoluptatum quieven more text

171 | extra text1 172 | Voluptatum qui1 173 | even more text1 174 |
176 | extra text2 177 | Voluptatum qui2 178 | even more text2 179 |

extra text1Voluptatum qui1even more text1
extra text2Voluptatum qui2even more text2

216 | 0 217 | 1 218 | Voluptatum qui1 219 | 2 220 | 3 221 | Voluptatum qui2 222 | 4 223 | Voluptatum qui3 224 | 5 225 | 6 226 | Voluptatum qui4 227 | 7 228 |

0 237 | 1Voluptatum qui12 238 | 3Voluptatum qui24 239 | Voluptatum qui35 240 | 6Voluptatum qui4 241 | 7

The Dormouse's story

259 | Elsie 260 |
262 | Lacie 263 |
265 | Tillie 266 |

The Dormouse's Story

286 | Elsie 287 |
289 | Lacie 290 |

The Dormouse's Story

Elsie
Lacie

Hacker News

348 | 349 | 350 | 381 | 382 |

351 | 352 | 353 | 358 | 378 | 379 |

354 | 355 |

356 | 357 |

359 | 360 | 361 | Hacker News 362 | 363 | new 364 | | 365 | past 366 | | 367 | comments 368 | | 369 | ask 370 | | 371 | show 372 | | 373 | jobs 374 | | 375 | submit 376 | 377 |

380 |

383 |

Hacker News

394 | 400 | 401 |

395 | 397 | 398 |

399 |

402 |

433 | 434 | 435 | 436 |

441 | 442 | 443 |

Hacker News

5 | 11 | 135 | 147 |

6 | 10 |

8 | login 9 |

12 | 13 | 15 | 16 | 17 | 19 | 20 | 21 | 23 | 24 | 25 | 27 | 28 | 29 | 31 | 32 | 33 | 35 | 36 | 37 | 39 | 40 | 41 | 43 | 44 | 45 | 47 | 48 | 49 | 51 | 52 | 53 | 55 | 56 | 57 | 59 | 60 | 61 | 63 | 64 | 65 | 67 | 68 | 69 | 71 | 72 | 73 | 75 | 76 | 77 | 79 | 80 | 81 | 83 | 84 | 85 | 87 | 88 | 89 | 91 | 92 | 93 | 95 | 96 | 97 | 99 | 100 | 101 | 103 | 104 | 105 | 107 | 108 | 109 | 111 | 112 | 113 | 115 | 116 | 117 | 119 | 120 | 121 | 123 | 124 | 125 | 127 | 128 | 129 | 131 | 132 | 133 |

1.		Thank HN: You helped me get a new job
		14 \| 1328 points by atum47 11 hours ago \| hide \| 115 comments
2.		Waterfalls of the Great Lakes Region and Beyond (gowaterfalling.com)
		18 \| 38 points by rubidium 45 minutes ago \| hide \| 21 comments
3.		Patch Critical Cryptographic Vulnerability in Microsoft Windows [pdf] (defense.gov)
		22 \| 649 points by Moral_ 17 hours ago \| hide \| 192 comments
4.		Apple can be sued by app developers “on a monopsony theory” (npr.org)
		26 \| 90 points by moorage 6 hours ago \| hide \| 63 comments
5.		DevDegree: Work at Shopify and get a free CS degree in parallel (devdegree.ca)
		30 \| 244 points by PandawanFr 11 hours ago \| hide \| 84 comments
6.		Hipmunk Says Goodbye (hipmunk.com)
		34 \| 546 points by ienjoythebeach 16 hours ago \| hide \| 308 comments
7.		Bug #915: Solved (nedbatchelder.com)
		38 \| 306 points by ingve 14 hours ago \| hide \| 72 comments
8.		Stripe can make automatic LLCs but a wire transfer from Citi nearly ended me (abe-winter.github.io)
		42 \| 281 points by awinter-py 15 hours ago \| hide \| 187 comments
9.		Show HN: An open-source distributed graph database written in C++ (github.com)
		46 \| 98 points by jamie-vesoft 9 hours ago \| hide \| 39 comments
10.		NOBUS (Nobody but Us) (wikipedia.org)
		50 \| 135 points by apsec112 11 hours ago \| hide \| 35 comments
11.		Many games are held together by duct tape (polygon.com)
		54 \| 29 points by misotaur 4 hours ago \| hide \| 18 comments
12.		The few remaining uses of the name “Macintosh” (tidbits.com)
		58 \| 56 points by fanf2 4 hours ago \| hide \| 47 comments
13.		Working for someone vs. doing your own thing (tik.dev)
		62 \| 280 points by thakobyan 15 hours ago \| hide \| 175 comments
14.		Show HN: Ouroboros – A Decentralized Packet Network (ouroboros.rocks)
		66 \| 20 points by despair3435 4 hours ago \| hide \| 3 comments
15.		Life's clockwork: Scientist shows how molecular engines keep us ticking (phys.org)
		70 \| 18 points by lelf 5 hours ago \| hide \| 2 comments
16.		Reusable vs. Re-editable Code (2018) [pdf] (archives-ouvertes.fr)
		74 \| 31 points by akkartik 6 hours ago \| hide \| discuss
17.		Nobody Cares (2011) (a16z.com)
		78 \| 93 points by prostoalex 11 hours ago \| hide \| 48 comments
18.		Pragmatic Array Oriented Functional Programming [video] (jiotalks.com)
		82 \| 21 points by tosh 4 hours ago \| hide \| 2 comments
19.		Show HN: A pure reference counting GC in Go (github.com)
		86 \| 30 points by sendilkumarn 5 hours ago \| hide \| 28 comments
20.		Thinking Fast and Slow, Deep Learning, and AI [video] (lexfridman.com)
		90 \| 224 points by AlanTuring 17 hours ago \| hide \| 94 comments
21.		Get Me Off Your Fucking Mailing List (2005) [pdf] (stanford.edu)
		94 \| 134 points by af16090 6 hours ago \| hide \| 33 comments
22.		How to Make a Raspberry Pi VPN Server (electromaker.io)
		98 \| 200 points by FoxMulder23 18 hours ago \| hide \| 98 comments
23.		Firefox Multi-Account Containers (addons.mozilla.org)
		102 \| 486 points by rahuldottech 18 hours ago \| hide \| 143 comments
24.		Video Gaming Will Take Over (matthewball.vc)
		106 \| 147 points by thesauri 16 hours ago \| hide \| 246 comments
25.		Frank Abagnale on the death of the con artist and the rise of cybercrime (wired.co.uk)
		110 \| 135 points by cryptozeus 18 hours ago \| hide \| 63 comments
26.		Real-Time Ray-Tracing in WebGPU (maierfelix.github.io)
		114 \| 100 points by Schampu 15 hours ago \| hide \| 25 comments
27.		How the U.S. military thinks about AI [audio] (changelog.com)
		118 \| 39 points by killjoywashere 9 hours ago \| hide \| 36 comments
28.		AWS EC2 Spot instances can now be stopped and started like On-Demand instances (amazon.com)
		122 \| 82 points by HedgedHuman 15 hours ago \| hide \| 12 comments
29.		Stack-Oriented Programming (wikipedia.org)
		126 \| 86 points by azhenley 16 hours ago \| hide \| 58 comments
30.		Why Scientists Fall for Precariously Balanced Rocks (atlasobscura.com)
		130 \| 87 points by brundolf 16 hours ago \| hide \| 5 comments
		More

134 |

Hacker News

12 | 13 | 14 | 50 | 51 | 52 | 53 | 1227 | 1228 | 1229 | 1267 | 1268 |

15 | 16 | 17 | 22 | 42 | 47 | 48 |

18 | 19 |

20 | 21 |

23 | 24 | 25 | Hacker News 26 | 27 | new 28 | | 29 | past 30 | | 31 | comments 32 | | 33 | ask 34 | | 35 | show 36 | | 37 | jobs 38 | | 39 | submit 40 | 41 |

43 | 44 | login 45 | 46 |

49 |

54 | 55 | 56 | 59 | 66 | 69 | 70 | 71 | 72 | 85 | 86 | 87 | 88 | 91 | 98 | 108 | 109 | 110 | 111 | 124 | 125 | 126 | 127 | 130 | 137 | 147 | 148 | 149 | 150 | 163 | 164 | 165 | 166 | 169 | 176 | 186 | 187 | 188 | 189 | 202 | 203 | 204 | 205 | 208 | 215 | 225 | 226 | 227 | 228 | 241 | 242 | 243 | 244 | 247 | 254 | 264 | 265 | 266 | 267 | 280 | 281 | 282 | 283 | 286 | 293 | 303 | 304 | 305 | 306 | 319 | 320 | 321 | 322 | 325 | 332 | 342 | 343 | 344 | 345 | 358 | 359 | 360 | 361 | 364 | 371 | 381 | 382 | 383 | 384 | 397 | 398 | 399 | 400 | 403 | 410 | 420 | 421 | 422 | 423 | 436 | 437 | 438 | 439 | 442 | 449 | 459 | 460 | 461 | 462 | 475 | 476 | 477 | 478 | 481 | 488 | 498 | 499 | 500 | 501 | 514 | 515 | 516 | 517 | 520 | 527 | 537 | 538 | 539 | 540 | 553 | 554 | 555 | 556 | 559 | 566 | 576 | 577 | 578 | 579 | 592 | 593 | 594 | 595 | 598 | 605 | 615 | 616 | 617 | 618 | 631 | 632 | 633 | 634 | 637 | 644 | 654 | 655 | 656 | 657 | 670 | 671 | 672 | 673 | 676 | 683 | 693 | 694 | 695 | 696 | 709 | 710 | 711 | 712 | 715 | 722 | 732 | 733 | 734 | 735 | 748 | 749 | 750 | 751 | 754 | 761 | 771 | 772 | 773 | 774 | 787 | 788 | 789 | 790 | 793 | 800 | 810 | 811 | 812 | 813 | 826 | 827 | 828 | 829 | 832 | 839 | 849 | 850 | 851 | 852 | 865 | 866 | 867 | 868 | 871 | 878 | 888 | 889 | 890 | 891 | 904 | 905 | 906 | 907 | 910 | 917 | 927 | 928 | 929 | 930 | 943 | 944 | 945 | 946 | 949 | 956 | 966 | 967 | 968 | 969 | 982 | 983 | 984 | 985 | 988 | 995 | 1005 | 1006 | 1007 | 1008 | 1021 | 1022 | 1023 | 1024 | 1027 | 1034 | 1044 | 1045 | 1046 | 1047 | 1060 | 1061 | 1062 | 1063 | 1066 | 1073 | 1083 | 1084 | 1085 | 1086 | 1099 | 1100 | 1101 | 1102 | 1105 | 1112 | 1122 | 1123 | 1124 | 1125 | 1138 | 1139 | 1140 | 1141 | 1144 | 1151 | 1161 | 1162 | 1163 | 1164 | 1177 | 1178 | 1179 | 1180 | 1183 | 1190 | 1200 | 1201 | 1202 | 1203 | 1216 | 1217 | 1218 | 1219 | 1220 | 1221 | 1224 | 1225 |

57 \| 1. 58 \|	60 \| 61 \| 62 \| 63 \| 64 \| 65 \|	67 \| Thank HN: You helped me get a new job 68 \|
		73 \| 1328 points 74 \| by 75 \| atum47 76 \| 77 \| 11 hours ago 78 \| 79 \| 80 \| \| 81 \| hide 82 \| \| 83 \| 115 comments 84 \|
89 \| 2. 90 \|	92 \| 93 \| 94 \| 95 \| 96 \| 97 \|	99 \| Waterfalls of the Great Lakes Region and Beyond 100 \| 101 \| ( 102 \| 103 \| gowaterfalling.com 104 \| 105 \| ) 106 \| 107 \|
		112 \| 38 points 113 \| by 114 \| rubidium 115 \| 116 \| 45 minutes ago 117 \| 118 \| 119 \| \| 120 \| hide 121 \| \| 122 \| 21 comments 123 \|
128 \| 3. 129 \|	131 \| 132 \| 133 \| 134 \| 135 \| 136 \|	138 \| Patch Critical Cryptographic Vulnerability in Microsoft Windows [pdf] 139 \| 140 \| ( 141 \| 142 \| defense.gov 143 \| 144 \| ) 145 \| 146 \|
		151 \| 649 points 152 \| by 153 \| Moral_ 154 \| 155 \| 17 hours ago 156 \| 157 \| 158 \| \| 159 \| hide 160 \| \| 161 \| 192 comments 162 \|
167 \| 4. 168 \|	170 \| 171 \| 172 \| 173 \| 174 \| 175 \|	177 \| Apple can be sued by app developers “on a monopsony theory” 178 \| 179 \| ( 180 \| 181 \| npr.org 182 \| 183 \| ) 184 \| 185 \|
		190 \| 90 points 191 \| by 192 \| moorage 193 \| 194 \| 6 hours ago 195 \| 196 \| 197 \| \| 198 \| hide 199 \| \| 200 \| 63 comments 201 \|
206 \| 5. 207 \|	209 \| 210 \| 211 \| 212 \| 213 \| 214 \|	216 \| DevDegree: Work at Shopify and get a free CS degree in parallel 217 \| 218 \| ( 219 \| 220 \| devdegree.ca 221 \| 222 \| ) 223 \| 224 \|
		229 \| 244 points 230 \| by 231 \| PandawanFr 232 \| 233 \| 11 hours ago 234 \| 235 \| 236 \| \| 237 \| hide 238 \| \| 239 \| 84 comments 240 \|
245 \| 6. 246 \|	248 \| 249 \| 250 \| 251 \| 252 \| 253 \|	255 \| Hipmunk Says Goodbye 256 \| 257 \| ( 258 \| 259 \| hipmunk.com 260 \| 261 \| ) 262 \| 263 \|
		268 \| 546 points 269 \| by 270 \| ienjoythebeach 271 \| 272 \| 16 hours ago 273 \| 274 \| 275 \| \| 276 \| hide 277 \| \| 278 \| 308 comments 279 \|
284 \| 7. 285 \|	287 \| 288 \| 289 \| 290 \| 291 \| 292 \|	294 \| Bug #915: Solved 295 \| 296 \| ( 297 \| 298 \| nedbatchelder.com 299 \| 300 \| ) 301 \| 302 \|
		307 \| 306 points 308 \| by 309 \| ingve 310 \| 311 \| 14 hours ago 312 \| 313 \| 314 \| \| 315 \| hide 316 \| \| 317 \| 72 comments 318 \|
323 \| 8. 324 \|	326 \| 327 \| 328 \| 329 \| 330 \| 331 \|	333 \| Stripe can make automatic LLCs but a wire transfer from Citi nearly ended me 334 \| 335 \| ( 336 \| 337 \| abe-winter.github.io 338 \| 339 \| ) 340 \| 341 \|
		346 \| 281 points 347 \| by 348 \| awinter-py 349 \| 350 \| 15 hours ago 351 \| 352 \| 353 \| \| 354 \| hide 355 \| \| 356 \| 187 comments 357 \|
362 \| 9. 363 \|	365 \| 366 \| 367 \| 368 \| 369 \| 370 \|	372 \| Show HN: An open-source distributed graph database written in C++ 373 \| 374 \| ( 375 \| 376 \| github.com 377 \| 378 \| ) 379 \| 380 \|
		385 \| 98 points 386 \| by 387 \| jamie-vesoft 388 \| 389 \| 9 hours ago 390 \| 391 \| 392 \| \| 393 \| hide 394 \| \| 395 \| 39 comments 396 \|
401 \| 10. 402 \|	404 \| 405 \| 406 \| 407 \| 408 \| 409 \|	411 \| NOBUS (Nobody but Us) 412 \| 413 \| ( 414 \| 415 \| wikipedia.org 416 \| 417 \| ) 418 \| 419 \|
		424 \| 135 points 425 \| by 426 \| apsec112 427 \| 428 \| 11 hours ago 429 \| 430 \| 431 \| \| 432 \| hide 433 \| \| 434 \| 35 comments 435 \|
440 \| 11. 441 \|	443 \| 444 \| 445 \| 446 \| 447 \| 448 \|	450 \| Many games are held together by duct tape 451 \| 452 \| ( 453 \| 454 \| polygon.com 455 \| 456 \| ) 457 \| 458 \|
		463 \| 29 points 464 \| by 465 \| misotaur 466 \| 467 \| 4 hours ago 468 \| 469 \| 470 \| \| 471 \| hide 472 \| \| 473 \| 18 comments 474 \|
479 \| 12. 480 \|	482 \| 483 \| 484 \| 485 \| 486 \| 487 \|	489 \| The few remaining uses of the name “Macintosh” 490 \| 491 \| ( 492 \| 493 \| tidbits.com 494 \| 495 \| ) 496 \| 497 \|
		502 \| 56 points 503 \| by 504 \| fanf2 505 \| 506 \| 4 hours ago 507 \| 508 \| 509 \| \| 510 \| hide 511 \| \| 512 \| 47 comments 513 \|
518 \| 13. 519 \|	521 \| 522 \| 523 \| 524 \| 525 \| 526 \|	528 \| Working for someone vs. doing your own thing 529 \| 530 \| ( 531 \| 532 \| tik.dev 533 \| 534 \| ) 535 \| 536 \|
		541 \| 280 points 542 \| by 543 \| thakobyan 544 \| 545 \| 15 hours ago 546 \| 547 \| 548 \| \| 549 \| hide 550 \| \| 551 \| 175 comments 552 \|
557 \| 14. 558 \|	560 \| 561 \| 562 \| 563 \| 564 \| 565 \|	567 \| Show HN: Ouroboros – A Decentralized Packet Network 568 \| 569 \| ( 570 \| 571 \| ouroboros.rocks 572 \| 573 \| ) 574 \| 575 \|
		580 \| 20 points 581 \| by 582 \| despair3435 583 \| 584 \| 4 hours ago 585 \| 586 \| 587 \| \| 588 \| hide 589 \| \| 590 \| 3 comments 591 \|
596 \| 15. 597 \|	599 \| 600 \| 601 \| 602 \| 603 \| 604 \|	606 \| Life's clockwork: Scientist shows how molecular engines keep us ticking 607 \| 608 \| ( 609 \| 610 \| phys.org 611 \| 612 \| ) 613 \| 614 \|
		619 \| 18 points 620 \| by 621 \| lelf 622 \| 623 \| 5 hours ago 624 \| 625 \| 626 \| \| 627 \| hide 628 \| \| 629 \| 2 comments 630 \|
635 \| 16. 636 \|	638 \| 639 \| 640 \| 641 \| 642 \| 643 \|	645 \| Reusable vs. Re-editable Code (2018) [pdf] 646 \| 647 \| ( 648 \| 649 \| archives-ouvertes.fr 650 \| 651 \| ) 652 \| 653 \|
		658 \| 31 points 659 \| by 660 \| akkartik 661 \| 662 \| 6 hours ago 663 \| 664 \| 665 \| \| 666 \| hide 667 \| \| 668 \| discuss 669 \|
674 \| 17. 675 \|	677 \| 678 \| 679 \| 680 \| 681 \| 682 \|	684 \| Nobody Cares (2011) 685 \| 686 \| ( 687 \| 688 \| a16z.com 689 \| 690 \| ) 691 \| 692 \|
		697 \| 93 points 698 \| by 699 \| prostoalex 700 \| 701 \| 11 hours ago 702 \| 703 \| 704 \| \| 705 \| hide 706 \| \| 707 \| 48 comments 708 \|
713 \| 18. 714 \|	716 \| 717 \| 718 \| 719 \| 720 \| 721 \|	723 \| Pragmatic Array Oriented Functional Programming [video] 724 \| 725 \| ( 726 \| 727 \| jiotalks.com 728 \| 729 \| ) 730 \| 731 \|
		736 \| 21 points 737 \| by 738 \| tosh 739 \| 740 \| 4 hours ago 741 \| 742 \| 743 \| \| 744 \| hide 745 \| \| 746 \| 2 comments 747 \|
752 \| 19. 753 \|	755 \| 756 \| 757 \| 758 \| 759 \| 760 \|	762 \| Show HN: A pure reference counting GC in Go 763 \| 764 \| ( 765 \| 766 \| github.com 767 \| 768 \| ) 769 \| 770 \|
		775 \| 30 points 776 \| by 777 \| sendilkumarn 778 \| 779 \| 5 hours ago 780 \| 781 \| 782 \| \| 783 \| hide 784 \| \| 785 \| 28 comments 786 \|
791 \| 20. 792 \|	794 \| 795 \| 796 \| 797 \| 798 \| 799 \|	801 \| Thinking Fast and Slow, Deep Learning, and AI [video] 802 \| 803 \| ( 804 \| 805 \| lexfridman.com 806 \| 807 \| ) 808 \| 809 \|
		814 \| 224 points 815 \| by 816 \| AlanTuring 817 \| 818 \| 17 hours ago 819 \| 820 \| 821 \| \| 822 \| hide 823 \| \| 824 \| 94 comments 825 \|
830 \| 21. 831 \|	833 \| 834 \| 835 \| 836 \| 837 \| 838 \|	840 \| Get Me Off Your Fucking Mailing List (2005) [pdf] 841 \| 842 \| ( 843 \| 844 \| stanford.edu 845 \| 846 \| ) 847 \| 848 \|
		853 \| 134 points 854 \| by 855 \| af16090 856 \| 857 \| 6 hours ago 858 \| 859 \| 860 \| \| 861 \| hide 862 \| \| 863 \| 33 comments 864 \|
869 \| 22. 870 \|	872 \| 873 \| 874 \| 875 \| 876 \| 877 \|	879 \| How to Make a Raspberry Pi VPN Server 880 \| 881 \| ( 882 \| 883 \| electromaker.io 884 \| 885 \| ) 886 \| 887 \|
		892 \| 200 points 893 \| by 894 \| FoxMulder23 895 \| 896 \| 18 hours ago 897 \| 898 \| 899 \| \| 900 \| hide 901 \| \| 902 \| 98 comments 903 \|
908 \| 23. 909 \|	911 \| 912 \| 913 \| 914 \| 915 \| 916 \|	918 \| Firefox Multi-Account Containers 919 \| 920 \| ( 921 \| 922 \| addons.mozilla.org 923 \| 924 \| ) 925 \| 926 \|
		931 \| 486 points 932 \| by 933 \| rahuldottech 934 \| 935 \| 18 hours ago 936 \| 937 \| 938 \| \| 939 \| hide 940 \| \| 941 \| 143 comments 942 \|
947 \| 24. 948 \|	950 \| 951 \| 952 \| 953 \| 954 \| 955 \|	957 \| Video Gaming Will Take Over 958 \| 959 \| ( 960 \| 961 \| matthewball.vc 962 \| 963 \| ) 964 \| 965 \|
		970 \| 147 points 971 \| by 972 \| thesauri 973 \| 974 \| 16 hours ago 975 \| 976 \| 977 \| \| 978 \| hide 979 \| \| 980 \| 246 comments 981 \|
986 \| 25. 987 \|	989 \| 990 \| 991 \| 992 \| 993 \| 994 \|	996 \| Frank Abagnale on the death of the con artist and the rise of cybercrime 997 \| 998 \| ( 999 \| 1000 \| wired.co.uk 1001 \| 1002 \| ) 1003 \| 1004 \|
		1009 \| 135 points 1010 \| by 1011 \| cryptozeus 1012 \| 1013 \| 18 hours ago 1014 \| 1015 \| 1016 \| \| 1017 \| hide 1018 \| \| 1019 \| 63 comments 1020 \|
1025 \| 26. 1026 \|	1028 \| 1029 \| 1030 \| 1031 \| 1032 \| 1033 \|	1035 \| Real-Time Ray-Tracing in WebGPU 1036 \| 1037 \| ( 1038 \| 1039 \| maierfelix.github.io 1040 \| 1041 \| ) 1042 \| 1043 \|
		1048 \| 100 points 1049 \| by 1050 \| Schampu 1051 \| 1052 \| 15 hours ago 1053 \| 1054 \| 1055 \| \| 1056 \| hide 1057 \| \| 1058 \| 25 comments 1059 \|
1064 \| 27. 1065 \|	1067 \| 1068 \| 1069 \| 1070 \| 1071 \| 1072 \|	1074 \| How the U.S. military thinks about AI [audio] 1075 \| 1076 \| ( 1077 \| 1078 \| changelog.com 1079 \| 1080 \| ) 1081 \| 1082 \|
		1087 \| 39 points 1088 \| by 1089 \| killjoywashere 1090 \| 1091 \| 9 hours ago 1092 \| 1093 \| 1094 \| \| 1095 \| hide 1096 \| \| 1097 \| 36 comments 1098 \|
1103 \| 28. 1104 \|	1106 \| 1107 \| 1108 \| 1109 \| 1110 \| 1111 \|	1113 \| AWS EC2 Spot instances can now be stopped and started like On-Demand instances 1114 \| 1115 \| ( 1116 \| 1117 \| amazon.com 1118 \| 1119 \| ) 1120 \| 1121 \|
		1126 \| 82 points 1127 \| by 1128 \| HedgedHuman 1129 \| 1130 \| 15 hours ago 1131 \| 1132 \| 1133 \| \| 1134 \| hide 1135 \| \| 1136 \| 12 comments 1137 \|
1142 \| 29. 1143 \|	1145 \| 1146 \| 1147 \| 1148 \| 1149 \| 1150 \|	1152 \| Stack-Oriented Programming 1153 \| 1154 \| ( 1155 \| 1156 \| wikipedia.org 1157 \| 1158 \| ) 1159 \| 1160 \|
		1165 \| 86 points 1166 \| by 1167 \| azhenley 1168 \| 1169 \| 16 hours ago 1170 \| 1171 \| 1172 \| \| 1173 \| hide 1174 \| \| 1175 \| 58 comments 1176 \|
1181 \| 30. 1182 \|	1184 \| 1185 \| 1186 \| 1187 \| 1188 \| 1189 \|	1191 \| Why Scientists Fall for Precariously Balanced Rocks 1192 \| 1193 \| ( 1194 \| 1195 \| atlasobscura.com 1196 \| 1197 \| ) 1198 \| 1199 \|
		1204 \| 87 points 1205 \| by 1206 \| brundolf 1207 \| 1208 \| 16 hours ago 1209 \| 1210 \| 1211 \| \| 1212 \| hide 1213 \| \| 1214 \| 5 comments 1215 \|
		1222 \| More 1223 \|

1226 |

1230 |

1231 | 1232 | 1233 | 1234 | 1235 |

1236 |
1237 | 1238 | 1239 | Guidelines 1240 | | 1241 | FAQ 1242 | | 1243 | Support 1244 | | 1245 | API 1246 | | 1247 | Security 1248 | | 1249 | Lists 1250 | | 1251 | Bookmarklet 1252 | | 1253 | Legal 1254 | | 1255 | Apply to YC 1256 | | 1257 | Contact 1258 | 1259 |
1260 |
1261 | 1265 | 1266 |

1269 |

The Dormouse's story

The Dormouse's Story

The Dormouse's Story

The Dormouse's story

The Dormouse's story

The Dormouse's story

The Dormouse's story

\xed\xe5\xec\xf9

翴檛

\xed\xe5\xec\xf9

םולש

The Dormouse's Story

\xed\xe5\xec\xf9

翴檛

םולש

The Dormouse's Story

The Dormouse's Story

The Dormouse's Story

The Dormouse's Story

stuff in here header

Hello World

The Dormouse's story

The Dormouse's Story

The Dormouse's Story

`stuff in here` header

`Hello` World