├── .coveragerc
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       └── tests.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs
│   ├── Makefile
│   ├── README.rst
│   ├── conf.py
│   └── index.rst
├── examples.py
├── htmlement.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── test_module.py
└── tox.ini

/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=htmlement
3 | branch=True
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 |   # Maintain dependencies for GitHub Actions
9 |   - package-ecosystem: "github-actions"
10 |     target-branch: "master"
11 |     directory: "/"
12 |     schedule:
13 |       interval: "daily"
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 |   release:
5 |     types: [published]
6 |
7 | jobs:
8 |   pypi:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v3
12 |         with:
13 |           fetch-depth: 0
14 |
15 |       - name: Install dependencies
16 |         run: |
17 |           python3 -m pip install --upgrade build
18 |           python3 -m build
19 |
20 |       - name: Publish package
21 |         uses: pypa/gh-action-pypi-publish@release/v1
22 |         with:
23 |           password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 |   - push
5 |   - pull_request
6 |
7 | jobs:
8 |   tests:
9 |     runs-on: ubuntu-latest
10 |     strategy:
11 |       matrix:
12 |         python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
13 |     steps:
14 |       - uses: actions/checkout@v3
15 |
16 |       - name: Set up Python ${{ matrix.python-version }}
17 |         uses: actions/setup-python@v4
18 |         with:
19 |           python-version: ${{ matrix.python-version }}
20 |
21 |       - name: Install test dependencies
22 |         run: |
23 |           python -m pip install --upgrade --no-cache-dir pip
24 |           pip install --no-cache-dir tox tox-gh-actions
25 |
26 |       - name: Test using tox
27 |         run: tox
28 |
29 |       - name: Upload test coverage to Codecov
30 |         continue-on-error: true
31 |         uses: codecov/codecov-action@v3.1.2
32 |         with:
33 |           flags: unittests
34 |
35 |   linting:
36 |     runs-on: ubuntu-latest
37 |     steps:
38 |       - uses: actions/checkout@v3
39 |
40 |       - name: Install test dependencies
41 |         run: |
42 |           python -m pip install --upgrade --no-cache-dir pip
43 |           pip install --no-cache-dir flake8
44 |
45 |       - name: Test linting with flake8
46 |         run: flake8 --max-line-length=127
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # general things to ignore
2 | docs/_static/
3 | docs/build/
4 | dist/
5 | *.egg-info/
6 | *.egg
7 | *.py[cod]
8 | __pycache__/
9 | .idea/
10 | build/
11 | .cache/
12 | .tox/
13 | .coverage
14 |
docs/_build/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | arch: amd64 3 | os: linux 4 | dist: bionic # Has python 2.7, 3.6, 3.7, 3.8 pre installed 5 | 6 | matrix: 7 | include: 8 | - python: "2.7" 9 | env: TOXENV=py27 10 | stage: Tests 11 | - python: "3.6" 12 | env: TOXENV=py36 13 | - python: "3.7" 14 | env: TOXENV=py37 15 | - python: "3.8" 16 | env: TOXENV=py38 17 | - python: "3.9" 18 | env: TOXENV=py39 19 | - python: "3.10" 20 | env: TOXENV=py39 21 | - env: TOXENV=flake8 22 | 23 | install: 24 | - pip install coveralls 25 | - pip install tox 26 | 27 | script: tox 28 | after_success: coveralls 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 William Forde 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/htmlement.svg 2 | :target: https://pypi.python.org/pypi/htmlement 3 | 4 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable 5 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable 6 | 7 | .. image:: https://github.com/willforde/python-htmlement/actions/workflows/tests.yml/badge.svg?branch=master&event=push 8 | :target: https://github.com/willforde/python-htmlement/actions 9 | 10 | .. image:: https://codecov.io/gh/willforde/python-htmlement/branch/master/graph/badge.svg?token=D5EKKLIVBP 11 | :target: https://codecov.io/gh/willforde/python-htmlement 12 | 13 | .. image:: https://api.codeclimate.com/v1/badges/7d593426acc83cba5ef7/maintainability 14 | :target: https://codeclimate.com/github/willforde/python-htmlement/maintainability 15 | :alt: Maintainability 16 | 17 | 18 | HTMLement 19 | --------- 20 | 21 | HTMLement is a pure Python HTML Parser. 
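
A quick taste of the API (a minimal sketch; the sections below walk through fuller examples)::

    import htmlement
    root = htmlement.fromstring("<p>Hello <b>world</b></p>")
    print(root.find(".//b").text)  # -> world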
22 |
23 | The aim of this project is to provide a pure-Python HTML parser that is also faster than BeautifulSoup.
24 | And, like BeautifulSoup, it will also parse invalid HTML.
25 |
26 | The simplest way to do this is to use ElementTree `XPath expressions`__.
27 | Python does support a simple (read: limited) XPath engine inside its ElementTree module.
28 | A benefit of using ElementTree is that it can use a C implementation whenever one is available.
29 |
30 | This HTML parser extends `html.parser.HTMLParser`_ to build a tree of `ElementTree.Element`_ instances.
31 |
32 | Install
33 | -------
34 | Run ::
35 |
36 |     pip install htmlement
37 |
38 | -or- ::
39 |
40 |     pip install git+https://github.com/willforde/python-htmlement.git
41 |
42 | Parsing HTML
43 | ------------
44 | Here I’ll be using a sample HTML document that will be parsed using htmlement: ::
45 |
46 |     html = """
47 |     <html>
48 |       <head>
49 |         <title>GitHub</title>
50 |       </head>
51 |       <body>
52 |         <a href="https://github.com/willforde">GitHub</a>
53 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
54 |       </body>
55 |     </html>
56 |     """
57 |
58 |     # Parse the document
59 |     import htmlement
60 |     root = htmlement.fromstring(html)
61 |
62 | Root is an ElementTree.Element_ and supports the ElementTree API
63 | with XPath expressions. With this I'm easily able to get both the title and all anchors in the document. ::
64 |
65 |     # Get title
66 |     title = root.find("head/title").text
67 |     print("Parsing: %s" % title)
68 |
69 |     # Get all anchors
70 |     for a in root.iterfind(".//a"):
71 |         print(a.get("href"))
72 |
73 | And the output is as follows: ::
74 |
75 |     Parsing: GitHub
76 |     https://github.com/willforde
77 |     https://github.com/willforde/python-htmlement
78 |
79 |
80 | Parsing HTML with a filter
81 | --------------------------
82 | Here I’ll be using a slightly more complex HTML document that will be parsed using htmlement with a filter, to fetch
83 | only the menu items. This can be very useful when dealing with large HTML documents, since it is a lot faster to
84 | parse only the required section and to ignore everything else. ::
85 |
86 |     html = """
87 |     <html>
88 |       <head>
89 |         <title>Coffee shop</title>
90 |       </head>
91 |       <body>
92 |         <ul class="menu">
93 |           <li>Coffee</li>
94 |           <li>Tea</li>
95 |           <li>Milk</li>
96 |         </ul>
97 |         <ul class="extras">
98 |           <li>Sugar</li>
99 |           <li>Cream</li>
100 |         </ul>
101 |       </body>
102 |     </html>
103 |     """
104 |
105 |     # Parse the document
106 |     import htmlement
107 |     root = htmlement.fromstring(html, "ul", attrs={"class": "menu"})
108 |
109 | In this case I'm not able to get the title, since all elements outside the filter were ignored.
110 | But it does allow me to extract all the list items within the menu list and nothing else. ::
111 |
112 |     # Get all list items
113 |     for item in root.iterfind(".//li"):
114 |         # Get text from list item
115 |         print(item.text)
116 |
117 | And the output is as follows: ::
118 |
119 |     Coffee
120 |     Tea
121 |     Milk
122 |
123 | .. _html.parser.HTMLParser: https://docs.python.org/3.6/library/html.parser.html#html.parser.HTMLParser
124 | .. _ElementTree.Element: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
125 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support
126 | __ XPath_
127 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = HTMLement
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable 2 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable 3 | 4 | Please GoTo: http://python-htmlement.readthedocs.io/en/latest/?badge=stable 5 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # HTMLement documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 16 03:44:03 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('../')) 22 | 23 | # Fetch version number 24 | from htmlement import __version__ 25 | 26 | # General information about the project. 27 | project = 'HTMLement' 28 | author = "William Forde" 29 | 30 | # The version info for the project you're documenting, acts as replacement for 31 | # |version| and |release|, also used in various other places throughout the 32 | # built documents. 33 | # 34 | # The short X.Y version. 35 | version = __version__ 36 | # The full version, including alpha/beta/rc tags. 37 | release = __version__ 38 | 39 | # -- General configuration ------------------------------------------------ 40 | 41 | # If your documentation needs a minimal Sphinx version, state it here. 42 | # 43 | # needs_sphinx = '1.0' 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] 49 | 50 | # Add any paths that contain templates here, relative to this directory. 51 | templates_path = ['_templates'] 52 | 53 | # The suffix(es) of source filenames. 54 | # You can specify multiple suffix as a list of string: 55 | # 56 | # source_suffix = ['.rst', '.md'] 57 | source_suffix = '.rst' 58 | 59 | # The master toctree document. 60 | master_doc = 'index' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 
67 | language = "en" 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.rst'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, todo and todoList produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'default' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # This will exclude any warnings of 'nonlocal image URI found'. 100 | suppress_warnings = ['image.nonlocal_uri'] 101 | 102 | 103 | # -- Options for HTMLHelp output ------------------------------------------ 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'HTMLementdoc' 107 | 108 | 109 | # -- Options for LaTeX output --------------------------------------------- 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'HTMLement.tex', 'HTMLement Documentation', 134 | 'William Forde', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output --------------------------------------- 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 142 | man_pages = [ 143 | (master_doc, 'htmlement', 'HTMLement Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'HTMLement', 'HTMLement Documentation', 155 | author, 'HTMLement', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | 160 | # Example configuration for intersphinx: refer to the Python standard library. 161 | intersphinx_mapping = {'https://docs.python.org/3.6': None} 162 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
HTMLement documentation master file, created by
2 |    sphinx-quickstart on Mon Jan 16 03:44:03 2017.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | Welcome to HTMLement's documentation!
7 | =====================================
8 |
9 | .. include::
10 |     ../README.rst
11 |
12 | .. seealso::
13 |     More examples can be found in `examples.py`_.
14 |
15 | API
16 | ---
17 |
18 | .. automodule:: htmlement
19 |     :members:
20 |
21 | External Links
22 | --------------
23 | ElementTree: https://docs.python.org/3/library/xml.etree.elementtree.html
24 |
25 | Bug Tracker: https://github.com/willforde/python-htmlement/issues
26 |
27 | .. _examples.py: https://github.com/willforde/python-htmlement/blob/master/examples.py
--------------------------------------------------------------------------------
/examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | For more information, see:
4 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
5 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support
6 | """
7 | from __future__ import print_function, unicode_literals
8 | from htmlement import HTMLement
9 |
10 |
11 | def example_simple():
12 |     """
13 |     This example will parse a simple html tree and
14 |     extract the website title and all anchors
15 |
16 |     >>> example_simple()
17 |     Parsing: GitHub
18 |     GitHub => https://github.com/willforde
19 |     GitHub Project => https://github.com/willforde/python-htmlement
20 |     """
21 |     html = """
22 |     <html>
23 |       <head>
24 |         <title>GitHub</title>
25 |       </head>
26 |       <body>
27 |         <a href="https://github.com/willforde">GitHub</a>
28 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
29 |       </body>
30 |     </html>
31 |     """
32 |
33 |     # Parse the document
34 |     parser = HTMLement()
35 |     parser.feed(html)
36 |     root = parser.close()
37 |
38 |     # Root is an xml.etree.Element and supports the ElementTree API
39 |     # (e.g. you may use its limited support for XPath expressions)
40 |
41 |     # Get title
42 |     title = root.find('head/title').text
43 |     print("Parsing: {}".format(title))
44 |
45 |     # Get all anchors
46 |     for a in root.iterfind(".//a"):
47 |         # Get href attribute
48 |         url = a.get("href")
49 |         # Get anchor name
50 |         name = a.text
51 |
52 |         print("{} => {}".format(name, url))
53 |
54 |
55 | def example_filter():
56 |     """
57 |     This example will parse a simple html tree and
58 |     extract all the list items within the ul menu element using a tree filter.
59 |
60 |     The tree filter will tell the parser to only parse the elements within the
61 |     requested section and to ignore all other elements.
62 |     Useful for speeding up the parsing of html pages.
63 |
64 |     >>> example_filter()
65 |     Menu Items
66 |     - Coffee
67 |     - Tea
68 |     - Milk
69 |     """
70 |     html = """
71 |     <html>
72 |       <head>
73 |         <title>Coffee shop</title>
74 |       </head>
75 |       <body>
76 |         <ul class="menu">
77 |           <li>Coffee</li>
78 |           <li>Tea</li>
79 |           <li>Milk</li>
80 |         </ul>
81 |         <ul class="extras">
82 |           <li>Sugar</li>
83 |           <li>Cream</li>
84 |         </ul>
85 |       </body>
86 |     </html>
87 |     """
88 |
89 |     # Parse the document
90 |     parser = HTMLement("ul", attrs={"class": "menu"})
91 |     parser.feed(html)
92 |     root = parser.close()
93 |
94 |     # Root should now be a 'ul' xml.etree.Element with all its child elements available.
95 |     # All other elements have been ignored. Way faster to parse.
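    # (A side note on filters: the attrs values may also be booleans. {"id": True}
    # matches a "ul" that has an "id" attribute with any value, while {"id": False}
    # only matches a "ul" that has no "id" attribute at all, e.g.
    # HTMLement("ul", attrs={"class": "menu", "id": False}).)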
96 |
97 |     # We are unable to get the title here since all
98 |     # elements outside the filter were ignored
99 |     print("Menu Items")
100 |
101 |     # Get all list items
102 |     for item in root.iterfind(".//li"):
103 |         # Get text from list item
104 |         print("- {}".format(item.text))
105 |
106 |
107 | def example_complex():
108 |     """
109 |     This example will parse a more complex html tree of python talks and will
110 |     extract the image, title, url and date of each talk.
111 |
112 |     A filter will be used to extract the main talks div element.
113 |
114 |     >>> example_complex()
115 |     Image = /presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg
116 |     Url = /pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3
117 |     Title = Alex Martelli - Exception and error handling in Python 2 and Python 3
118 |     Date = Jun 1, 2016
119 |
120 |     Image = /presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg
121 |     Url = /presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg
122 |     Title = Jake Vanderplas - Statistics for Hackers
123 |     Date = May 29, 2016
124 |
125 |     Image = /presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg
126 |     Url = /pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code
127 |     Title = Brett Slatkin - Refactoring Python: Why and how to restructure your code
128 |     Date = May 29, 2016
129 |
130 |     """
131 |     html = """
132 |     <html>
133 |     <head>
134 |         <title>PyCon 2016</title>
135 |     </head>
136 |     <body>
137 |         <div class="talks" id="main-content">
138 |             <h1>Talks by PyCon 2016</h1>
139 |
140 |             <div class="talk">
141 |                 <a href="/pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3">
142 |                     <img src="/presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg" alt="">
143 |                 </a>
144 |                 <div class="talk-details">
145 |                     <h3>
146 |                         <a href="/pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3"
147 |                         >Alex Martelli - Exception and error handling in Python 2 and Python 3</a>
148 |                     </h3>
149 |                     <p>Jun 1, 2016</p>
150 |                 </div>
151 |             </div>
152 |
153 |             <div class="talk">
154 |                 <a href="/presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg">
155 |                     <img src="/presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg" alt="">
156 |                 </a>
157 |                 <div class="talk-details">
158 |                     <h3>
159 |                         <a href="/presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg"
160 |                         >Jake Vanderplas - Statistics for Hackers</a>
161 |                     </h3>
162 |                     <p>May 29, 2016</p>
163 |                 </div>
164 |             </div>
165 |
166 |             <div class="talk">
167 |                 <a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code">
168 |                     <img src="/presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg" alt="">
169 |                 </a>
170 |                 <div class="talk-details">
171 |                     <h3>
172 |                         <a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code"
173 |                         >Brett Slatkin - Refactoring Python: Why and how to restructure your code</a>
174 |                     </h3>
175 |                     <p>May 29, 2016</p>
176 |                 </div>
177 |             </div>
178 |         </div>
179 |     </body>
180 |     </html>
181 | 182 | 183 | """ 184 | 185 | # Parse the document 186 | parser = HTMLement("div", attrs={"class": "talks", "id": True}) 187 | parser.feed(html) 188 | root = parser.close() 189 | 190 | # Extract all div tags with class of talk 191 | for talk in root.iterfind("./div[@class='talk']"): 192 | # Fetch image 193 | img = talk.find(".//img").get("src") 194 | print("Image = {}".format(img)) 195 | 196 | # Fetch title and url 197 | title_anchor = talk.find("./div/h3/a") 198 | url = title_anchor.get("href") 199 | print("Url = {}".format(url)) 200 | title = title_anchor.text 201 | print("Title = {}".format(title)) 202 | 203 | # Fetch date 204 | date = talk.find("./div/p").text 205 | print("Date = {}".format(date)) 206 | print("") 207 | 208 | 209 | if __name__ == "__main__": 210 | example_simple() 211 | print("") 212 | example_filter() 213 | print("") 214 | example_complex() 215 | -------------------------------------------------------------------------------- /htmlement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2016 William Forde 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | # this software and associated documentation files (the "Software"), to deal in 10 | # the Software without restriction, including without limitation the rights to 11 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 12 | # the Software, and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 20 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 21 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 22 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | """ 26 | HTMLement 27 | --------- 28 | Simple lightweight HTML parser with XPath support. 29 | 30 | Github: https://github.com/willforde/python-htmlement 31 | Documentation: https://python-htmlement.readthedocs.io/en/stable/?badge=stable 32 | Testing: https://github.com/willforde/python-htmlement/actions 33 | Coverage: https://codecov.io/gh/willforde/python-htmlement 34 | Maintainability: https://codeclimate.com/github/willforde/python-htmlement/maintainability 35 | """ 36 | 37 | # Standard Lib 38 | import xml.etree.ElementTree as Etree 39 | import warnings 40 | import re 41 | 42 | # HTML Parser 43 | from html.entities import name2codepoint 44 | from html.parser import HTMLParser 45 | 46 | __all__ = ["HTMLement", "fromstring", "fromstringlist", "parse"] 47 | __version__ = "2.0.0" 48 | 49 | # Add missing codepoints 50 | # TODO: This may no longer be required 51 | name2codepoint["apos"] = 0x0027 52 | 53 | 54 | def fromstring(text, tag="", attrs=None, encoding=None): 55 | """ 56 | Parse's "HTML" document from a string into an element tree. 57 | 58 | :param text: The "HTML" document to parse. 
59 | :type text: str or bytes 60 | 61 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 62 | :type tag: str 63 | 64 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 65 | :type attrs: dict(str, str) 66 | 67 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 68 | :type encoding: str 69 | 70 | :return: The root element of the element tree. 71 | :rtype: xml.etree.ElementTree.Element 72 | 73 | :raises UnicodeDecodeError: If decoding of *text* fails. 74 | """ 75 | parser = HTMLement(tag, attrs, encoding) 76 | parser.feed(text) 77 | return parser.close() 78 | 79 | 80 | def fromstringlist(sequence, tag="", attrs=None, encoding=None): 81 | """ 82 | Parses an "HTML document" from a sequence of "HTML sections" into an element tree. 83 | 84 | :param sequence: A sequence of "HTML sections" to parse. 85 | :type sequence: list(str or bytes) 86 | 87 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 88 | :type tag: str 89 | 90 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 91 | :type attrs: dict(str, str) 92 | 93 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 94 | :type encoding: str 95 | 96 | :return: The root element of the element tree. 97 | :rtype: xml.etree.ElementTree.Element 98 | 99 | :raises UnicodeDecodeError: If decoding of a section within *sequence* fails. 100 | """ 101 | parser = HTMLement(tag, attrs, encoding) 102 | for text in sequence: 103 | parser.feed(text) 104 | return parser.close() 105 | 106 | 107 | def parse(source, tag="", attrs=None, encoding=None): 108 | """ 109 | Load an external "HTML document" into an element tree. 110 | 111 | :param source: A filename or file like object containing HTML data. 112 | :type source: str or io.TextIOBase 113 | 114 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 115 | :type tag: str 116 | 117 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 118 | :type attrs: dict(str, str) 119 | 120 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 121 | :type encoding: str 122 | 123 | :return: The root element of the element tree. 124 | :rtype: xml.etree.ElementTree.Element 125 | 126 | :raises UnicodeDecodeError: If decoding of *source* fails. 127 | """ 128 | # Assume that source is a file pointer if no read methods is found 129 | if not hasattr(source, "read"): 130 | source = open(source, "r", encoding=encoding) 131 | close_source = True 132 | else: 133 | close_source = False 134 | 135 | try: 136 | parser = HTMLement(tag, attrs, encoding) 137 | while True: 138 | # Read in 64k at a time 139 | data = source.read(65536) 140 | if not data: 141 | break 142 | 143 | # Feed the parser 144 | parser.feed(data) 145 | 146 | # Return the root element 147 | return parser.close() 148 | 149 | finally: 150 | if close_source: 151 | source.close() 152 | 153 | 154 | class HTMLement(object): 155 | """ 156 | Python HTMLParser extension with ElementTree Parser support. 157 | 158 | This HTML Parser extends :class:`html.parser.HTMLParser`, returning an :class:`xml.etree.ElementTree.Element` 159 | instance. 
The returned root element natively supports the ElementTree API. 160 | (e.g. you may use its limited support for `XPath expressions`__) 161 | 162 | When a "tag" and "tag attributes" are given the parser will search for a required section. Only when the required 163 | section is found, does the parser start parsing the "HTML document". The element that matches the search criteria 164 | will then become the new "root element". 165 | 166 | Attributes are given as a dict of {'name': 'value'}. Value can be the string to match, `True` or `False.` 167 | `True` will match any attribute with given name and any value. 168 | `False` will only give a match if given attribute does not exist in the element. 169 | 170 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 171 | :type tag: str 172 | 173 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 174 | :type attrs: dict(str, str) 175 | 176 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 177 | :type encoding: str 178 | 179 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support 180 | __ XPath_ 181 | """ 182 | def __init__(self, tag="", attrs=None, encoding=None): 183 | self._parser = ParseHTML(tag, attrs) 184 | self.encoding = encoding 185 | self._finished = False 186 | 187 | def feed(self, data): 188 | """ 189 | Feeds data to the parser. 190 | 191 | If *data*, is of type :class:`bytes` and where no encoding was specified, then the encoding 192 | will be extracted from *data* using "meta tags", if available. 193 | Otherwise encoding will default to "ISO-8859-1" 194 | 195 | :param data: HTML data 196 | :type data: str or bytes 197 | 198 | :raises UnicodeDecodeError: If decoding of *data* fails. 199 | """ 200 | # Skip feeding data into parser if we already have what we want 201 | if self._finished == 1: 202 | return None 203 | 204 | # Make sure that we have unicode before continuing 205 | if isinstance(data, bytes): 206 | if self.encoding: 207 | data = data.decode(self.encoding) 208 | else: 209 | data = self._make_unicode(data) 210 | 211 | # Parse the html document 212 | try: 213 | self._parser.feed(data) 214 | except EOFError: 215 | self._finished = True 216 | self._parser.reset() 217 | 218 | def close(self): 219 | """ 220 | Close the "tree builder" and return the "root element" of the "element tree". 221 | 222 | :return: The "root element" of the "element tree". 223 | :rtype: xml.etree.ElementTree.Element 224 | 225 | :raises RuntimeError: If no element matching search criteria was found. 226 | """ 227 | return self._parser.close() 228 | 229 | def _make_unicode(self, data): 230 | """ 231 | Convert *data* from type :class:`bytes` to type :class:`str`. 232 | 233 | :param data: The html document. 234 | :type data: bytes 235 | 236 | :return: HTML data decoded. 
237 | :rtype: str 238 | """ 239 | # Atemp to find the encoding from the html source 240 | end_head_tag = data.find(b"") 241 | if end_head_tag: 242 | # Search for the charset attribute within the meta tags 243 | charset_refind = b'' 244 | charset = re.search(charset_refind, data[:end_head_tag], re.IGNORECASE) 245 | if charset: 246 | self.encoding = encoding = charset.group(1).decode() 247 | return data.decode(encoding) 248 | 249 | # Decode the string into unicode using default encoding 250 | warn_msg = "Unable to determine encoding, defaulting to iso-8859-1" 251 | warnings.warn(warn_msg, UnicodeWarning, stacklevel=2) 252 | self.encoding = "iso-8859-1" 253 | return data.decode("iso-8859-1") 254 | 255 | 256 | # noinspection PyAbstractClass 257 | class ParseHTML(HTMLParser): 258 | def __init__(self, tag="", attrs=None): 259 | # Initiate HTMLParser 260 | HTMLParser.__init__(self) 261 | self.convert_charrefs = True 262 | self._root = None # root element 263 | self._data = [] # data collector 264 | self._factory = Etree.Element 265 | self.enabled = not tag 266 | self._unw_attrs = [] 267 | self.tag = tag 268 | 269 | # Split attributes into wanted and unwanted attributes 270 | if attrs: 271 | self.attrs = attrs 272 | for key, value in attrs.copy().items(): 273 | if value == 0: 274 | self._unw_attrs.append(key) 275 | del attrs[key] 276 | else: 277 | self.attrs = {} 278 | 279 | # Some tags in html do not require closing tags so thoes tags will need to be auto closed (Void elements) 280 | # Refer to: https://www.w3.org/TR/html/syntax.html#void-elements 281 | self._voids = frozenset(("area", "base", "br", "col", "hr", "img", "input", "link", "meta", "param", 282 | # Only in HTML5 283 | "embed", "keygen", "source", "track", 284 | # Not supported in HTML5 285 | "basefont", "frame", "isindex", 286 | # SVG self closing tags 287 | "rect", "circle", "ellipse", "line", "polyline", "polygon", 288 | "path", "stop", "use", "image", "animatetransform")) 289 | 290 | # Create temporary root element to protect from badly written sites that either 291 | # have no html starting tag or multiple top level elements 292 | elem = self._factory("html") 293 | self._elem = [elem] 294 | self._last = elem 295 | self._tail = 0 296 | 297 | def handle_starttag(self, tag, attrs): 298 | self._handle_starttag(tag, attrs, self_closing=tag in self._voids) 299 | 300 | def handle_startendtag(self, tag, attrs): 301 | self._handle_starttag(tag, attrs, self_closing=True) 302 | 303 | def _handle_starttag(self, tag, attrs, self_closing=False): 304 | enabled = self.enabled 305 | # Add tag element to tree if we have no filter or that the filter matches 306 | if enabled or self._search(tag, attrs): 307 | # Convert attrs to dictionary 308 | attrs = {k: v or "" for k, v in attrs} 309 | self._flush() 310 | 311 | # Create the new element 312 | elem = self._factory(tag, attrs) 313 | self._elem[-1].append(elem) 314 | self._last = elem 315 | 316 | # Only append the element to the list of elements if it's not a self closing element 317 | if self_closing: 318 | self._tail = 1 319 | else: 320 | self._elem.append(elem) 321 | self._tail = 0 322 | 323 | # Set this element as the root element when the filter search matches 324 | if not enabled: 325 | self._root = elem 326 | self.enabled = True 327 | 328 | def handle_endtag(self, tag): 329 | # Only process end tags when we have no filter or that the filter has been matched 330 | if self.enabled and tag not in self._voids: 331 | _elem = self._elem 332 | _root = self._root 333 | # Check that the closing tag is 
what's actualy expected 334 | if _elem[-1].tag == tag: 335 | self._flush() 336 | self._tail = 1 337 | self._last = elem = _elem.pop() 338 | if elem is _root: 339 | raise EOFError 340 | 341 | # If a previous element is what we actually have then the expected element was not 342 | # properly closed so we must close that before closing what we have now 343 | elif len(_elem) >= 2 and any(_item.tag == tag for _item in _elem): 344 | self._flush() 345 | self._tail = 1 346 | while True: 347 | self._last = elem = _elem.pop() 348 | if elem.tag == tag: 349 | break 350 | if elem is _root: 351 | raise EOFError 352 | else: 353 | # Unable to match the tag to an element, ignoring it 354 | return None 355 | 356 | def handle_data(self, data): 357 | if data.strip() and self.enabled: 358 | self._data.append(data) 359 | 360 | def handle_entityref(self, name): 361 | if self.enabled: 362 | try: 363 | name = chr(name2codepoint[name]) 364 | except KeyError: 365 | pass 366 | self._data.append(name) 367 | 368 | def handle_charref(self, name): 369 | if self.enabled: 370 | try: 371 | if name[0].lower() == "x": 372 | name = chr(int(name[1:], 16)) 373 | else: 374 | name = chr(int(name)) 375 | except ValueError: 376 | pass 377 | self._data.append(name) 378 | 379 | def handle_comment(self, data): 380 | data = data.strip() 381 | if data and self.enabled: 382 | elem = Etree.Comment(data) 383 | self._elem[-1].append(elem) 384 | 385 | def close(self): 386 | self._flush() 387 | if self.enabled == 0: 388 | msg = "Unable to find requested section with tag of '{}' and attributes of {}" 389 | raise RuntimeError(msg.format(self.tag, self.attrs)) 390 | elif self._root is not None: 391 | return self._root 392 | else: 393 | # Search the root element to find a proper html root element if one exists 394 | tmp_root = self._elem[0] 395 | proper_root = tmp_root.find("html") 396 | if proper_root is None: 397 | # Not proper root was found 398 | return tmp_root 399 | else: 400 | # Proper root found 401 | return proper_root 402 | 403 | def _flush(self): 404 | if self._data: 405 | if self._last is not None: 406 | text = "".join(self._data) 407 | if self._tail: 408 | self._last.tail = text 409 | else: 410 | self._last.text = text 411 | self._data = [] 412 | 413 | def _search(self, tag, attrs): 414 | # Only search when the tag matches 415 | if tag == self.tag: 416 | # If we have required attrs to match then search all attrs for wanted attrs 417 | # And also check that we do not have any attrs that are unwanted 418 | if self.attrs or self._unw_attrs: 419 | if attrs: 420 | wanted_attrs = self.attrs.copy() 421 | unwanted_attrs = self._unw_attrs 422 | for key, value in attrs: 423 | # Check for unwanted attrs 424 | if key in unwanted_attrs: 425 | return False 426 | 427 | # Check for wanted attrs 428 | elif key in wanted_attrs: 429 | c_value = wanted_attrs[key] 430 | if c_value == value or c_value == 1: 431 | # Remove this attribute from the wanted dict of attributes 432 | # to indicate that this attribute has been found 433 | del wanted_attrs[key] 434 | 435 | # If wanted_attrs is now empty then all attributes must have been found 436 | if not wanted_attrs: 437 | return True 438 | else: 439 | # We only need to match tag 440 | return True 441 | 442 | # Unable to find required section 443 | return False 444 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = 
LICENSE 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from codecs import open 3 | from os import path 4 | import re 5 | 6 | # Path to local directory 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | 10 | def readfile(filename): # type: (str) -> str 11 | """Get the long description from the README file""" 12 | readme_file = path.join(here, filename) 13 | with open(readme_file, "r", encoding="utf-8") as stream: 14 | return stream.read() 15 | 16 | 17 | def extract_variable(filename, variable): # type: (str, str) -> str 18 | """Extract the version number from a python file that contains the '__version__' variable.""" 19 | with open(filename, "r", encoding="utf8") as stream: 20 | search_refind = r'{} = ["\'](\d+\.\d+\.\d+)["\']'.format(variable) 21 | verdata = re.search(search_refind, stream.read()) 22 | if verdata: 23 | return verdata.group(1) 24 | else: 25 | raise RuntimeError("Unable to extract version number") 26 | 27 | 28 | setup( 29 | name='htmlement', 30 | version=extract_variable('htmlement.py', '__version__'), 31 | description='Pure-Python HTML parser with ElementTree support.', 32 | long_description=readfile('README.rst'), 33 | extras_require={"dev": ["pytest", "pytest-cov"]}, 34 | keywords='html html5 parsehtml htmlparser elementtree dom', 35 | classifiers=[ 36 | 'Development Status :: 5 - Production/Stable', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Natural Language :: English', 40 | 'Operating System :: OS Independent', 41 | 'Programming Language :: Python :: 3', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Programming Language :: Python :: 3.8', 44 | 'Programming Language :: Python :: 3.9', 45 | 'Programming Language :: Python :: 3.10', 46 | 'Programming Language :: Python :: 3.11', 47 | 'Topic :: Text Processing :: Markup :: HTML', 48 | 'Topic :: Software Development :: Libraries :: Python Modules' 49 | ], 50 | url='https://github.com/willforde/python-htmlement', 51 | platforms=['OS Independent'], 52 | author='William Forde', 53 | author_email='willforde@gmail.com', 54 | license='MIT License', 55 | py_modules=['htmlement'] 56 | ) 57 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # dummy 2 | -------------------------------------------------------------------------------- /tests/test_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Python 2 compatibility 5 | import xml.etree.ElementTree as Etree 6 | import htmlement 7 | import examples 8 | import tempfile 9 | import pytest 10 | import io 11 | import os 12 | 13 | 14 | def quick_parsehtml(html, encoding=""): 15 | obj = htmlement.HTMLement(encoding=encoding) 16 | obj.feed(html) 17 | root = obj.close() 18 | assert Etree.iselement(root) 19 | return root 20 | 21 | 22 | def quick_parse_filter(html, tag, attrs=None, encoding=""): 23 | obj = htmlement.HTMLement(tag, attrs, encoding=encoding) 24 | obj.feed(html) 25 | return obj.close() 26 | 27 | 28 | def test_initialization(): 29 | # Check that the parser even starts 30 | obj = htmlement.HTMLement() 31 | assert isinstance(obj, htmlement.HTMLement) 32 | 33 | 34 | # ############################# HTML Test 
############################## # 35 | 36 | 37 | def test_basic_tree(): 38 | # Check that I can parse a simple tree 39 | html = "" 40 | root = quick_parsehtml(html) 41 | assert root.tag == "html" 42 | assert root[0].tag == "body" 43 | 44 | 45 | def test_basic_partial(): 46 | # Check that I can parse a simple tree segment at a time 47 | html = "" 48 | obj = htmlement.HTMLement() 49 | obj.feed(html[:9]) 50 | obj.feed(html[9:]) 51 | root = obj.close() 52 | assert Etree.iselement(root) 53 | assert root.tag == "html" 54 | assert root[0].tag == "body" 55 | 56 | 57 | def test_nohtml_tree(): 58 | # Check that the missing html starting tag is created 59 | html = "" 60 | root = quick_parsehtml(html) 61 | assert root.tag == "html" 62 | assert root[0].tag == "body" 63 | assert Etree.tostring(root, method="html") == b'' 64 | 65 | 66 | def test_text(): 67 | html = "text" 68 | root = quick_parsehtml(html) 69 | assert root.tag == "html" 70 | assert root[0].tag == "body" 71 | assert root[0].attrib == {} 72 | assert root[0].text == "text" 73 | 74 | 75 | def test_attrib(): 76 | html = "text" 77 | root = quick_parsehtml(html) 78 | assert root[0].attrib == {"test": "yes"} 79 | 80 | 81 | def test_tail(): 82 | html = "

text

tail" 83 | root = quick_parsehtml(html) 84 | assert root[0][0].tail == "tail" 85 | 86 | 87 | def test_self_closing_normal(): 88 | html = "" 89 | root = quick_parsehtml(html) 90 | assert root[0].attrib.get("test") == "self closing" 91 | 92 | 93 | def test_self_closing_void(): 94 | html = "" 95 | root = quick_parsehtml(html) 96 | assert root[0].tag == "img" 97 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg" 98 | 99 | 100 | def test_open_void(): 101 | html = "" 102 | root = quick_parsehtml(html) 103 | assert root[0].tag == "img" 104 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg" 105 | 106 | 107 | def test_comment(): 108 | html = "

This is a paragraph.

" 109 | root = quick_parsehtml(html) 110 | assert root[0][0].text == "This is a comment." 111 | assert root[0][1].tag == "p" 112 | assert root[0][1].text == "This is a paragraph." 113 | 114 | 115 | def test_missing_end_tag(): 116 | # Test for a missing 'a' end tag 117 | html = "link" 118 | root = quick_parsehtml(html) 119 | assert root.find(".//a").get("href") == "http://google.ie/" 120 | assert Etree.tostring(root, method="html") == b'link' 121 | 122 | 123 | def test_extra_tag(): 124 | # Check that a extra tag that should not exist was removed 125 | html = "" 126 | root = quick_parsehtml(html) 127 | assert len(root[0]) == 0 128 | assert Etree.tostring(root, method="html") == b'' 129 | 130 | 131 | def test_find_empty_attribute(): 132 | # Check whether we can find an element with an empty-valued attribute 133 | html = "
" 134 | form = quick_parse_filter(html, "form", {"autofocus": True}) 135 | assert "autofocus" in form.attrib 136 | assert form.find(".//input[@checked]") is not None 137 | 138 | 139 | # ############################# HTML Entity ############################## # 140 | 141 | 142 | def test_entity_name_euro(): 143 | html = "cost is €49.99" 144 | root = quick_parsehtml(html) 145 | assert root[0].text == "cost is €49.99" 146 | 147 | 148 | def test_entity_number_euro(): 149 | html = "cost is €49.99" 150 | root = quick_parsehtml(html) 151 | assert root[0].text == "cost is €49.99" 152 | 153 | 154 | def test_entity_hex_euro(): 155 | html = "cost is €49.99" 156 | root = quick_parsehtml(html) 157 | assert root[0].text == "cost is €49.99" 158 | 159 | 160 | def test_entity_name_euro_fail(): 161 | html = "cost is &euros;49.99" 162 | root = quick_parsehtml(html) 163 | assert "euros" in root[0].text 164 | 165 | 166 | def test_entity_hex_euro_fail(): 167 | html = "cost is �49.99" 168 | root = quick_parsehtml(html) 169 | assert "€" not in root[0].text 170 | 171 | 172 | # ############################# Text Content ############################# # 173 | 174 | 175 | def test_text_iterator(): 176 | html = "sample text content" 177 | root = quick_parsehtml(html) 178 | body = root.find(".//body") 179 | assert "".join(body.itertext()) == "sample text content" 180 | 181 | 182 | def test_text_iterator_unclosed_tag(): 183 | html = "
hello to the world!
unrelated
" 184 | root = quick_parsehtml(html) 185 | body = root.find(".//body") 186 | assert "".join(body.itertext()) == "hello to the world!" 187 | 188 | 189 | # ############################# Filter Test ############################## # 190 | 191 | 192 | def test_tag_match(): 193 | html = "

text

" 194 | root = quick_parse_filter(html, "div") 195 | assert root.tag == "div" 196 | assert root[0].tag == "p" 197 | 198 | 199 | def test_tag_no_match(): 200 | html = "" 201 | with pytest.raises(RuntimeError) as excinfo: 202 | quick_parse_filter(html, "div") 203 | excinfo.match("Unable to find requested section with tag of") 204 | 205 | 206 | def test_attrib_match(): 207 | html = "

text

text
" 208 | root = quick_parse_filter(html, "div", {"test": "yes"}) 209 | assert root.tag == "div" 210 | assert root.get("test") == "yes" 211 | assert root.text == "text" 212 | 213 | 214 | def test_attrib_no_match(): 215 | html = "

text

text
" 216 | with pytest.raises(RuntimeError) as excinfo: 217 | quick_parse_filter(html, "div", {"test": "yes"}) 218 | excinfo.match("Unable to find requested section with tag of") 219 | 220 | 221 | def test_attrib_match_name(): 222 | # Search for any div tag with a attribute of src of any value 223 | html = "

text

text
" 224 | root = quick_parse_filter(html, "div", {"src": True}) 225 | assert root.tag == "div" 226 | assert root.get("src") 227 | assert root.text == "text" 228 | 229 | 230 | def test_attrib_match_unwanted(): 231 | # Search for a div with a test attribute but not a src attribute 232 | html = "

text

text
" 233 | root = quick_parse_filter(html, "div", {"test": "yes", "src": False}) 234 | assert root.tag == "div" 235 | assert root.get("test") == "yes" 236 | assert "src" not in root.attrib 237 | assert root.text == "text" 238 | 239 | 240 | def test_tag_match_badhtml(): 241 | html = "

text

" 242 | root = quick_parse_filter(html, "div") 243 | assert root.tag == "div" 244 | assert root[0].tag == "p" 245 | 246 | 247 | def test_partial_filter(): 248 | # Check that the 249 | html = "

text

" 250 | obj = htmlement.HTMLement("div") 251 | obj.feed(html[:51]) 252 | obj.feed(html[51:]) 253 | root = obj.close() 254 | assert root.tag == "div" 255 | assert root[0].tag == "p" 256 | 257 | 258 | # ####################### Unicode Decoding Test ####################### # 259 | 260 | 261 | def test_with_encoding(): 262 | # Check that I can parse a simple tree 263 | html = b"" 264 | root = quick_parsehtml(html, encoding="utf-8") 265 | assert root.tag == "html" 266 | assert root[0].tag == "body" 267 | 268 | 269 | def test_no_encoding_with_header_type1(recwarn): 270 | # Check for charset header type one 271 | html = b"text" 272 | quick_parsehtml(html) 273 | # Check that no warnings ware raised 274 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 275 | for w in recwarn.list: 276 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 277 | 278 | 279 | def test_no_encoding_with_header_type2(recwarn): 280 | # Check for charset header type one 281 | html = b'text' 282 | quick_parsehtml(html) 283 | # Check that no warnings ware raised 284 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 285 | for w in recwarn.list: 286 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 287 | 288 | 289 | def test_no_encoding_with_header_type3(recwarn): 290 | # Check for charset header type one 291 | html = b"text" 292 | quick_parsehtml(html) 293 | # Check that no warnings ware raised 294 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 295 | for w in recwarn.list: 296 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 297 | 298 | 299 | def test_no_encoding_with_header_type4(recwarn): 300 | # Check for charset header type one 301 | html = b'text' 302 | quick_parsehtml(html) 303 | # Check that no warnings ware raised 304 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 305 | for w in recwarn.list: 306 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 307 | 308 | 309 | def test_no_encoding_with_header_type5(recwarn): 310 | # Check for charset header type one 311 | html = b"text" 312 | quick_parsehtml(html) 313 | # Check that no warnings ware raised 314 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 315 | for w in recwarn.list: 316 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 317 | 318 | 319 | def test_no_encoding_with_header_type6(recwarn): 320 | # Check for charset header type one 321 | html = b'text' 322 | quick_parsehtml(html) 323 | # Check that no warnings ware raised 324 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 325 | for w in recwarn.list: 326 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 327 | 328 | 329 | def test_no_encoding_with_header_type7(recwarn): 330 | # Check for charset header type one 331 | html = b"text" 332 | quick_parsehtml(html) 333 | # Check that no warnings ware raised 334 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 335 | for w in recwarn.list: 336 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 337 | 338 | 339 | def test_no_encoding_with_header_type8(recwarn): 340 | # Check for charset header type one 341 | html = b'text' 342 | quick_parsehtml(html) 343 | # Check that no warnings ware raised 344 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 345 | for w in recwarn.list: 346 | 
assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 347 | 348 | 349 | def test_no_encoding_no_header(): 350 | # Check that I can parse a simple tree 351 | html = b"text" 352 | with pytest.warns(UnicodeWarning): 353 | quick_parsehtml(html) 354 | 355 | 356 | # ####################### Funtion Tests ####################### # 357 | 358 | 359 | def test_fromstring(): 360 | # Check that I can parse a simple tree 361 | html = "" 362 | root = htmlement.fromstring(html) 363 | assert Etree.iselement(root) 364 | assert root.tag == "html" 365 | assert root[0].tag == "body" 366 | 367 | 368 | def test_fromstringlist(): 369 | # Check that I can parse a simple tree 370 | sequence = ["", ""] 371 | root = htmlement.fromstringlist(sequence) 372 | assert Etree.iselement(root) 373 | assert root.tag == "html" 374 | assert root[0].tag == "body" 375 | 376 | 377 | def test_parse_file_object(): 378 | html = "" 379 | fileobj = io.StringIO(html) 380 | root = htmlement.parse(fileobj, encoding="utf8") 381 | assert Etree.iselement(root) 382 | assert root.tag == "html" 383 | assert root[0].tag == "body" 384 | 385 | 386 | def test_parse_filename(): 387 | # Create temp file and add html data to it 388 | html = "" 389 | fileobj = tempfile.NamedTemporaryFile("w", delete=False) 390 | fileobj.write(html) 391 | filename = fileobj.name 392 | fileobj.close() 393 | 394 | try: 395 | root = htmlement.parse(filename, encoding="utf8") 396 | assert Etree.iselement(root) 397 | assert root.tag == "html" 398 | assert root[0].tag == "body" 399 | finally: 400 | os.remove(filename) 401 | 402 | 403 | # ####################### Examples Tests ####################### # 404 | 405 | 406 | def test_example_simple(): 407 | # Check that there is no errors 408 | examples.example_simple() 409 | 410 | 411 | def test_example_filter(): 412 | # Check that there is no errors 413 | examples.example_filter() 414 | 415 | 416 | def test_example_complex(): 417 | # Check that there is no errors 418 | examples.example_complex() 419 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{37,38,39,310,311},flake8 3 | skip_missing_interpreters = true 4 | 5 | [gh-actions] 6 | python = 7 | 3.7: py37 8 | 3.8: py38 9 | 3.9: py39 10 | 3.10: py310 11 | 3.11: py311 12 | 13 | [testenv] 14 | extras = dev 15 | commands = pytest --cov=htmlement --cov-report xml 16 | 17 | # Flake8 Environment 18 | [testenv:flake8] 19 | basepython = python3 20 | skip_install = true 21 | deps = 22 | flake8 23 | commands = 24 | flake8 --max-line-length=127 25 | 26 | # Flake8 Configuration 27 | [flake8] 28 | ignore = 29 | F821, # undefined name 'unichr' 30 | exclude = 31 | .tox, 32 | .git, 33 | docs, 34 | tests 35 | 36 | [coverage:run] 37 | source=htmlement 38 | branch=True 39 | 40 | [coverage:report] 41 | exclude_lines = 42 | if __name__ == .__main__.: 43 | def __repr__ 44 | pragma: no cover 45 | 46 | [coverage:paths] 47 | source = 48 | htmlement 49 | .tox/*/lib/python*/site-packages/htmlement 50 | --------------------------------------------------------------------------------