├── .coveragerc
├── .github
├── dependabot.yml
└── workflows
│ ├── publish.yml
│ └── tests.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs
├── Makefile
├── README.rst
├── conf.py
└── index.rst
├── examples.py
├── htmlement.py
├── setup.cfg
├── setup.py
├── tests
├── __init__.py
└── test_module.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=htmlement
3 | branch=True
4 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | # Maintain dependencies for GitHub Actions
9 | - package-ecosystem: "github-actions"
10 | target-branch: "master"
11 | directory: "/"
12 | schedule:
13 | interval: "daily"
14 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | pypi:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v3
12 | with:
13 | fetch-depth: 0
14 |
15 | - name: Install dependencies
16 | run: |
17 | python3 -m pip install --upgrade build
18 | python3 -m build
19 |
20 | - name: Publish package
21 | uses: pypa/gh-action-pypi-publish@release/v1
22 | with:
23 | password: ${{ secrets.PYPI_API_TOKEN }}
24 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | - push
5 | - pull_request
6 |
7 | jobs:
8 | tests:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
13 | steps:
14 | - uses: actions/checkout@v3
15 |
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v4
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 |
21 | - name: Install test dependencies
22 | run: |
23 | python -m pip install --upgrade --no-cache-dir pip
24 | pip install --no-cache-dir tox tox-gh-actions
25 |
26 | - name: Test using tox
27 | run: tox
28 |
29 | - name: Upload test coverage to Codecov
30 | continue-on-error: true
31 | uses: codecov/codecov-action@v3.1.2
32 | with:
33 | flags: unittests
34 |
35 | linting:
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v3
39 |
40 | - name: Install test dependencies
41 | run: |
42 | python -m pip install --upgrade --no-cache-dir pip
43 | pip install --no-cache-dir flake8
44 |
45 | - name: Test linting with flake8
46 | run: flake8 --max-line-length=127
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # general things to ignore
2 | docs/_static/
3 | docs/build/
4 | dist/
5 | *.egg-info/
6 | *.egg
7 | *.py[cod]
8 | __pycache__/
9 | .idea/
10 | build/
11 | .cache/
12 | .tox/
13 | .coverage
14 | docs/_build/
15 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | arch: amd64
3 | os: linux
4 | dist: bionic # Has python 2.7, 3.6, 3.7, 3.8 pre installed
5 |
6 | matrix:
7 | include:
8 | - python: "2.7"
9 | env: TOXENV=py27
10 | stage: Tests
11 | - python: "3.6"
12 | env: TOXENV=py36
13 | - python: "3.7"
14 | env: TOXENV=py37
15 | - python: "3.8"
16 | env: TOXENV=py38
17 | - python: "3.9"
18 | env: TOXENV=py39
19 | - python: "3.10"
20 | env: TOXENV=py310
21 | - env: TOXENV=flake8
22 |
23 | install:
24 | - pip install coveralls
25 | - pip install tox
26 |
27 | script: tox
28 | after_success: coveralls
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 William Forde
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst LICENSE
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://badge.fury.io/py/htmlement.svg
2 | :target: https://pypi.python.org/pypi/htmlement
3 |
4 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable
5 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable
6 |
7 | .. image:: https://github.com/willforde/python-htmlement/actions/workflows/tests.yml/badge.svg?branch=master&event=push
8 | :target: https://github.com/willforde/python-htmlement/actions
9 |
10 | .. image:: https://codecov.io/gh/willforde/python-htmlement/branch/master/graph/badge.svg?token=D5EKKLIVBP
11 | :target: https://codecov.io/gh/willforde/python-htmlement
12 |
13 | .. image:: https://api.codeclimate.com/v1/badges/7d593426acc83cba5ef7/maintainability
14 | :target: https://codeclimate.com/github/willforde/python-htmlement/maintainability
15 | :alt: Maintainability
16 |
17 |
18 | HTMLement
19 | ---------
20 |
21 | HTMLement is a pure Python HTML Parser.
22 |
23 | The object of this project is to be a "pure-python HTML parser" which is also "faster" than "beautifulsoup".
24 | And like "beautifulsoup", will also parse invalid html.
25 |
26 | The most simple way to do this is to use ElementTree `XPath expressions`__.
27 | Python does support a simple (read limited) XPath engine inside its "ElementTree" module.
28 | A benefit of using "ElementTree" is that it can use a "C implementation" whenever available.
29 |
30 | This "HTML Parser" extends `html.parser.HTMLParser`_ to build a tree of `ElementTree.Element`_ instances.
31 |
32 | Install
33 | -------
34 | Run ::
35 |
36 | pip install htmlement
37 |
38 | -or- ::
39 |
40 | pip install git+https://github.com/willforde/python-htmlement.git
41 |
42 | Parsing HTML
43 | ------------
44 | Here I’ll be using a sample "HTML document" that will be "parsed" using "htmlement": ::
45 |
46 | html = """
47 |     <html>
48 |       <head>
49 |         <title>GitHub</title>
50 |       </head>
51 |       <body>
52 |         <a href="https://github.com/willforde">GitHub</a>
53 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
54 |       </body>
55 |     </html>
56 | """
57 |
58 | # Parse the document
59 | import htmlement
60 | root = htmlement.fromstring(html)
61 |
62 | Root is an ElementTree.Element_ and supports the ElementTree API
63 | with XPath expressions. With this I'm easily able to get both the title and all anchors in the document. ::
64 |
65 | # Get title
66 | title = root.find("head/title").text
67 | print("Parsing: %s" % title)
68 |
69 | # Get all anchors
70 | for a in root.iterfind(".//a"):
71 | print(a.get("href"))
72 |
73 | And the output is as follows: ::
74 |
75 | Parsing: GitHub
76 | https://github.com/willforde
77 | https://github.com/willforde/python-htmlement
78 |
79 |
80 | Parsing HTML with a filter
81 | --------------------------
82 | Here I’ll be using a slightly more complex "HTML document" that will be "parsed" using "htmlement with a filter" to fetch
83 | only the menu items. This can be very useful when dealing with large "HTML documents" since it can be a lot faster to
84 | only "parse the required section" and to ignore everything else. ::
85 |
86 | html = """
87 |
88 |
89 | Coffee shop
90 |
91 |
92 |
97 |
101 |
102 |
103 | """
104 |
105 | # Parse the document
106 | import htmlement
107 | root = htmlement.fromstring(html, "ul", attrs={"class": "menu"})
108 |
109 | In this case I'm unable to get the title, since all elements outside the filter were ignored.
110 | But this allows me to be able to extract all "list_item elements" within the menu list and nothing else. ::
111 |
112 | # Get all listitems
113 | for item in root.iterfind(".//li"):
114 | # Get text from listitem
115 | print(item.text)
116 |
117 | And the output is as follows: ::
118 |
119 | Coffee
120 | Tea
121 | Milk
122 |
123 | .. _html.parser.HTMLParser: https://docs.python.org/3.6/library/html.parser.html#html.parser.HTMLParser
124 | .. _ElementTree.Element: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
125 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support
126 | __ XPath_
127 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = HTMLement
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable
2 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable
3 |
4 | Please GoTo: http://python-htmlement.readthedocs.io/en/latest/?badge=stable
5 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # HTMLement documentation build configuration file, created by
5 | # sphinx-quickstart on Mon Jan 16 03:44:03 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 | import sys
16 | import os
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | sys.path.insert(0, os.path.abspath('../'))
22 |
23 | # Fetch version number
24 | from htmlement import __version__
25 |
26 | # General information about the project.
27 | project = 'HTMLement'
28 | author = "William Forde"
29 |
30 | # The version info for the project you're documenting, acts as replacement for
31 | # |version| and |release|, also used in various other places throughout the
32 | # built documents.
33 | #
34 | # The short X.Y version.
35 | version = __version__
36 | # The full version, including alpha/beta/rc tags.
37 | release = __version__
38 |
39 | # -- General configuration ------------------------------------------------
40 |
41 | # If your documentation needs a minimal Sphinx version, state it here.
42 | #
43 | # needs_sphinx = '1.0'
44 |
45 | # Add any Sphinx extension module names here, as strings. They can be
46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
47 | # ones.
48 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode']
49 |
50 | # Add any paths that contain templates here, relative to this directory.
51 | templates_path = ['_templates']
52 |
53 | # The suffix(es) of source filenames.
54 | # You can specify multiple suffix as a list of string:
55 | #
56 | # source_suffix = ['.rst', '.md']
57 | source_suffix = '.rst'
58 |
59 | # The master toctree document.
60 | master_doc = 'index'
61 |
62 | # The language for content autogenerated by Sphinx. Refer to documentation
63 | # for a list of supported languages.
64 | #
65 | # This is also used if you do content translation via gettext catalogs.
66 | # Usually you set "language" from the command line for these cases.
67 | language = "en"
68 |
69 | # List of patterns, relative to source directory, that match files and
70 | # directories to ignore when looking for source files.
71 | # This patterns also effect to html_static_path and html_extra_path
72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.rst']
73 |
74 | # The name of the Pygments (syntax highlighting) style to use.
75 | pygments_style = 'sphinx'
76 |
77 | # If true, todo and todoList produce output, else they produce nothing.
78 | todo_include_todos = False
79 |
80 |
81 | # -- Options for HTML output ----------------------------------------------
82 |
83 | # The theme to use for HTML and HTML Help pages. See the documentation for
84 | # a list of builtin themes.
85 | #
86 | html_theme = 'default'
87 |
88 | # Theme options are theme-specific and customize the look and feel of a theme
89 | # further. For a list of options available for each theme, see the
90 | # documentation.
91 | #
92 | # html_theme_options = {}
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_static_path = ['_static']
98 |
99 | # This will exclude any warnings of 'nonlocal image URI found'.
100 | suppress_warnings = ['image.nonlocal_uri']
101 |
102 |
103 | # -- Options for HTMLHelp output ------------------------------------------
104 |
105 | # Output file base name for HTML help builder.
106 | htmlhelp_basename = 'HTMLementdoc'
107 |
108 |
109 | # -- Options for LaTeX output ---------------------------------------------
110 |
111 | latex_elements = {
112 | # The paper size ('letterpaper' or 'a4paper').
113 | #
114 | # 'papersize': 'letterpaper',
115 |
116 | # The font size ('10pt', '11pt' or '12pt').
117 | #
118 | # 'pointsize': '10pt',
119 |
120 | # Additional stuff for the LaTeX preamble.
121 | #
122 | # 'preamble': '',
123 |
124 | # Latex figure (float) alignment
125 | #
126 | # 'figure_align': 'htbp',
127 | }
128 |
129 | # Grouping the document tree into LaTeX files. List of tuples
130 | # (source start file, target name, title,
131 | # author, documentclass [howto, manual, or own class]).
132 | latex_documents = [
133 | (master_doc, 'HTMLement.tex', 'HTMLement Documentation',
134 | 'William Forde', 'manual'),
135 | ]
136 |
137 |
138 | # -- Options for manual page output ---------------------------------------
139 |
140 | # One entry per manual page. List of tuples
141 | # (source start file, name, description, authors, manual section).
142 | man_pages = [
143 | (master_doc, 'htmlement', 'HTMLement Documentation',
144 | [author], 1)
145 | ]
146 |
147 |
148 | # -- Options for Texinfo output -------------------------------------------
149 |
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | # dir menu entry, description, category)
153 | texinfo_documents = [
154 | (master_doc, 'HTMLement', 'HTMLement Documentation',
155 | author, 'HTMLement', 'One line description of project.',
156 | 'Miscellaneous'),
157 | ]
158 |
159 |
160 | # Example configuration for intersphinx: refer to the Python standard library.
161 | intersphinx_mapping = {'https://docs.python.org/3.6': None}
162 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. HTMLement documentation master file, created by
2 | sphinx-quickstart on Mon Jan 16 03:44:03 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to HTMLement's documentation!
7 | =====================================
8 |
9 | .. include::
10 | ../README.rst
11 |
12 | .. seealso::
13 | More examples can be found in `examples.py`_.
14 |
15 | API
16 | ---
17 |
18 | .. automodule:: htmlement
19 | :members:
20 |
21 | External Links
22 | --------------
23 | ElementTree: https://docs.python.org/3/library/xml.etree.elementtree.html
24 |
25 | Bug Tracker: https://github.com/willforde/python-htmlement/issues
26 |
27 | .. _examples.py: https://github.com/willforde/python-htmlement/blob/master/examples.py
28 |
--------------------------------------------------------------------------------
/examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | For more information, see:
4 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
5 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support
6 | """
7 | from __future__ import print_function, unicode_literals
8 | from htmlement import HTMLement
9 |
10 |
11 | def example_simple():
12 | """
13 | This example will parse a simple html tree and
14 | extract the website title and all anchors
15 |
16 | >>> example_simple()
17 | Parsing: GitHub
18 | GitHub => https://github.com/willforde
19 | GitHub Project => https://github.com/willforde/python-htmlement
20 | """
21 | html = """
22 |     <html>
23 |       <head>
24 |         <title>GitHub</title>
25 |       </head>
26 |       <body>
27 |         <a href="https://github.com/willforde">GitHub</a>
28 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
29 |       </body>
30 |     </html>
31 | """
32 |
33 | # Parse the document
34 | parser = HTMLement()
35 | parser.feed(html)
36 | root = parser.close()
37 |
38 | # Root is an xml.etree.Element and supports the ElementTree API
39 | # (e.g. you may use its limited support for XPath expressions)
40 |
41 | # Get title
42 | title = root.find('head/title').text
43 | print("Parsing: {}".format(title))
44 |
45 | # Get all anchors
46 | for a in root.iterfind(".//a"):
47 | # Get href attribute
48 | url = a.get("href")
49 | # Get anchor name
50 | name = a.text
51 |
52 | print("{} => {}".format(name, url))
53 |
54 |
55 | def example_filter():
56 | """
57 | This example will parse a simple html tree and
58 | extract all the list items within the ul menu element using a tree filter.
59 |
60 | The tree filter will tell the parser to only parse the elements within the
61 | requested section and to ignore all other elements.
62 | Useful for speeding up the parsing of html pages.
63 |
64 | >>> example_filter()
65 | Menu Items
66 | - Coffee
67 | - Tea
68 | - Milk
69 | """
70 | html = """
71 |
72 |
73 | Coffee shop
74 |
75 |
76 |
81 |
85 |
86 |
87 | """
88 |
89 | # Parse the document
90 | parser = HTMLement("ul", attrs={"class": "menu"})
91 | parser.feed(html)
92 | root = parser.close()
93 |
94 | # Root should now be a 'ul' xml.etree.Element with all its child elements available
95 | # All other elements have been ignored. Way faster to parse.
96 |
97 | # We are unable to get the title here since all
98 | # elements outside the filter were ignored
99 | print("Menu Items")
100 |
101 | # Get all listitems
102 | for item in root.iterfind(".//li"):
103 | # Get text from listitem
104 | print("- {}".format(item.text))
105 |
106 |
107 | def example_complex():
108 | """
109 | This example will parse a more complex html tree of python talk's and will
110 | extract the image, title, url and date of each talk.
111 |
112 | A filter will be used to extract the main talks div element
113 |
114 | >>> example_complex()
115 | Image = /presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg
116 | Url = /pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3
117 | Title = Alex Martelli - Exception and error handling in Python 2 and Python 3
118 | Date = Jun 1, 2016
119 |
120 | Image = /presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg
121 | Url = /presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg
122 | Title = Jake Vanderplas - Statistics for Hackers
123 | Date = May 29, 2016
124 |
125 | Image = /presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg
126 | Url = /pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code
127 | Title = Brett Slatkin - Refactoring Python: Why and how to restructure your code
128 | Date = May 29, 2016
129 |
130 | """
131 | html = """
132 |
133 |
134 | PyCon 2016
135 |
136 |
137 |
138 |
Talks by PyCon 2016
139 |
180 |
181 |
182 |
183 | """
184 |
185 | # Parse the document
186 | parser = HTMLement("div", attrs={"class": "talks", "id": True})
187 | parser.feed(html)
188 | root = parser.close()
189 |
190 | # Extract all div tags with class of talk
191 | for talk in root.iterfind("./div[@class='talk']"):
192 | # Fetch image
193 | img = talk.find(".//img").get("src")
194 | print("Image = {}".format(img))
195 |
196 | # Fetch title and url
197 | title_anchor = talk.find("./div/h3/a")
198 | url = title_anchor.get("href")
199 | print("Url = {}".format(url))
200 | title = title_anchor.text
201 | print("Title = {}".format(title))
202 |
203 | # Fetch date
204 | date = talk.find("./div/p").text
205 | print("Date = {}".format(date))
206 | print("")
207 |
208 |
209 | if __name__ == "__main__":
210 | example_simple()
211 | print("")
212 | example_filter()
213 | print("")
214 | example_complex()
215 |
--------------------------------------------------------------------------------
/htmlement.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # The MIT License (MIT)
5 | #
6 | # Copyright (c) 2016 William Forde
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
9 | # this software and associated documentation files (the "Software"), to deal in
10 | # the Software without restriction, including without limitation the rights to
11 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
12 | # the Software, and to permit persons to whom the Software is furnished to do so,
13 | # subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
20 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
21 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
22 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 |
25 | """
26 | HTMLement
27 | ---------
28 | Simple lightweight HTML parser with XPath support.
29 |
30 | Github: https://github.com/willforde/python-htmlement
31 | Documentation: https://python-htmlement.readthedocs.io/en/stable/?badge=stable
32 | Testing: https://github.com/willforde/python-htmlement/actions
33 | Coverage: https://codecov.io/gh/willforde/python-htmlement
34 | Maintainability: https://codeclimate.com/github/willforde/python-htmlement/maintainability
35 | """
36 |
37 | # Standard Lib
38 | import xml.etree.ElementTree as Etree
39 | import warnings
40 | import re
41 |
42 | # HTML Parser
43 | from html.entities import name2codepoint
44 | from html.parser import HTMLParser
45 |
46 | __all__ = ["HTMLement", "fromstring", "fromstringlist", "parse"]
47 | __version__ = "2.0.0"
48 |
49 | # Add missing codepoints
50 | # TODO: This may no longer be required
51 | name2codepoint["apos"] = 0x0027
52 |
53 |
54 | def fromstring(text, tag="", attrs=None, encoding=None):
55 | """
56 | Parses an "HTML" document from a string into an element tree.
57 |
58 | :param text: The "HTML" document to parse.
59 | :type text: str or bytes
60 |
61 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section.
62 | :type tag: str
63 |
64 | :param attrs: (optional) The attributes of the element, that will be used, when searching for the required section.
65 | :type attrs: dict(str, str)
66 |
67 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser.
68 | :type encoding: str
69 |
70 | :return: The root element of the element tree.
71 | :rtype: xml.etree.ElementTree.Element
72 |
73 | :raises UnicodeDecodeError: If decoding of *text* fails.
74 | """
75 | parser = HTMLement(tag, attrs, encoding)
76 | parser.feed(text)
77 | return parser.close()
78 |
79 |
80 | def fromstringlist(sequence, tag="", attrs=None, encoding=None):
81 | """
82 | Parses an "HTML document" from a sequence of "HTML sections" into an element tree.
83 |
84 | :param sequence: A sequence of "HTML sections" to parse.
85 | :type sequence: list(str or bytes)
86 |
87 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section.
88 | :type tag: str
89 |
90 | :param attrs: (optional) The attributes of the element, that will be used, when searching for the required section.
91 | :type attrs: dict(str, str)
92 |
93 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser.
94 | :type encoding: str
95 |
96 | :return: The root element of the element tree.
97 | :rtype: xml.etree.ElementTree.Element
98 |
99 | :raises UnicodeDecodeError: If decoding of a section within *sequence* fails.
100 | """
101 | parser = HTMLement(tag, attrs, encoding)
102 | for text in sequence:
103 | parser.feed(text)
104 | return parser.close()
105 |
106 |
107 | def parse(source, tag="", attrs=None, encoding=None):
108 | """
109 | Load an external "HTML document" into an element tree.
110 |
111 | :param source: A filename or file like object containing HTML data.
112 | :type source: str or io.TextIOBase
113 |
114 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section.
115 | :type tag: str
116 |
117 | :param attrs: (optional) The attributes of the element, that will be used, when searching for the required section.
118 | :type attrs: dict(str, str)
119 |
120 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser.
121 | :type encoding: str
122 |
123 | :return: The root element of the element tree.
124 | :rtype: xml.etree.ElementTree.Element
125 |
126 | :raises UnicodeDecodeError: If decoding of *source* fails.
127 | """
128 | # Assume that source is a file pointer if no read methods is found
129 | if not hasattr(source, "read"):
130 | source = open(source, "r", encoding=encoding)
131 | close_source = True
132 | else:
133 | close_source = False
134 |
135 | try:
136 | parser = HTMLement(tag, attrs, encoding)
137 | while True:
138 | # Read in 64k at a time
139 | data = source.read(65536)
140 | if not data:
141 | break
142 |
143 | # Feed the parser
144 | parser.feed(data)
145 |
146 | # Return the root element
147 | return parser.close()
148 |
149 | finally:
150 | if close_source:
151 | source.close()
152 |
153 |
154 | class HTMLement(object):
155 | """
156 | Python HTMLParser extension with ElementTree Parser support.
157 |
158 | This HTML Parser extends :class:`html.parser.HTMLParser`, returning an :class:`xml.etree.ElementTree.Element`
159 | instance. The returned root element natively supports the ElementTree API.
160 | (e.g. you may use its limited support for `XPath expressions`__)
161 |
162 | When a "tag" and "tag attributes" are given the parser will search for a required section. Only when the required
163 | section is found, does the parser start parsing the "HTML document". The element that matches the search criteria
164 | will then become the new "root element".
165 |
166 | Attributes are given as a dict of {'name': 'value'}. Value can be the string to match, `True` or `False.`
167 | `True` will match any attribute with given name and any value.
168 | `False` will only give a match if given attribute does not exist in the element.
169 |
170 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section.
171 | :type tag: str
172 |
173 | :param attrs: (optional) The attributes of the element, that will be used, when searching for the required section.
174 | :type attrs: dict(str, str)
175 |
176 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser.
177 | :type encoding: str
178 |
179 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support
180 | __ XPath_
181 | """
182 | def __init__(self, tag="", attrs=None, encoding=None):
183 | self._parser = ParseHTML(tag, attrs)
184 | self.encoding = encoding
185 | self._finished = False
186 |
187 | def feed(self, data):
188 | """
189 | Feeds data to the parser.
190 |
191 | If *data*, is of type :class:`bytes` and where no encoding was specified, then the encoding
192 | will be extracted from *data* using "meta tags", if available.
193 | Otherwise encoding will default to "ISO-8859-1"
194 |
195 | :param data: HTML data
196 | :type data: str or bytes
197 |
198 | :raises UnicodeDecodeError: If decoding of *data* fails.
199 | """
200 | # Skip feeding data into parser if we already have what we want
201 | if self._finished == 1:
202 | return None
203 |
204 | # Make sure that we have unicode before continuing
205 | if isinstance(data, bytes):
206 | if self.encoding:
207 | data = data.decode(self.encoding)
208 | else:
209 | data = self._make_unicode(data)
210 |
211 | # Parse the html document
212 | try:
213 | self._parser.feed(data)
214 | except EOFError:
215 | self._finished = True
216 | self._parser.reset()
217 |
218 | def close(self):
219 | """
220 | Close the "tree builder" and return the "root element" of the "element tree".
221 |
222 | :return: The "root element" of the "element tree".
223 | :rtype: xml.etree.ElementTree.Element
224 |
225 | :raises RuntimeError: If no element matching search criteria was found.
226 | """
227 | return self._parser.close()
228 |
229 | def _make_unicode(self, data):
230 | """
231 | Convert *data* from type :class:`bytes` to type :class:`str`.
232 |
233 | :param data: The html document.
234 | :type data: bytes
235 |
236 | :return: HTML data decoded.
237 | :rtype: str
238 | """
239 | # Atemp to find the encoding from the html source
240 | end_head_tag = data.find(b"")
241 | if end_head_tag:
242 | # Search for the charset attribute within the meta tags
243 | charset_refind = b''
244 | charset = re.search(charset_refind, data[:end_head_tag], re.IGNORECASE)
245 | if charset:
246 | self.encoding = encoding = charset.group(1).decode()
247 | return data.decode(encoding)
248 |
249 | # Decode the string into unicode using default encoding
250 | warn_msg = "Unable to determine encoding, defaulting to iso-8859-1"
251 | warnings.warn(warn_msg, UnicodeWarning, stacklevel=2)
252 | self.encoding = "iso-8859-1"
253 | return data.decode("iso-8859-1")
254 |
255 |
256 | # noinspection PyAbstractClass
class ParseHTML(HTMLParser):
    """
    HTML parser that builds an :mod:`xml.etree.ElementTree` tree as it parses.

    When *tag* is given, tree building stays disabled until an element with
    that tag (and matching *attrs*, if any) is found; that element becomes the
    root, and :class:`EOFError` is raised as soon as its end tag is reached so
    the caller can stop feeding data early.

    :param tag: (optional) Tag name of the element to search for; empty means
                collect the whole document.
    :param attrs: (optional) Attributes of the element to search for. A falsy
                  value marks the attribute as unwanted; the value ``1``/``True``
                  matches any attribute value.
    """
    def __init__(self, tag="", attrs=None):
        # Initiate HTMLParser
        HTMLParser.__init__(self)
        # Have HTMLParser resolve character/entity references inside data, so
        # handle_data receives already-converted text
        self.convert_charrefs = True
        self._root = None  # root element
        self._data = []  # data collector
        self._factory = Etree.Element
        # Tree building is enabled from the start only when there is no tag filter
        self.enabled = not tag
        self._unw_attrs = []
        self.tag = tag

        # Split attributes into wanted and unwanted attributes
        if attrs:
            self.attrs = attrs
            for key, value in attrs.copy().items():
                if value == 0:
                    self._unw_attrs.append(key)
                    del attrs[key]
        else:
            self.attrs = {}

        # Some tags in html do not require closing tags so those tags will need to be auto closed (Void elements)
        # Refer to: https://www.w3.org/TR/html/syntax.html#void-elements
        self._voids = frozenset(("area", "base", "br", "col", "hr", "img", "input", "link", "meta", "param",
                                 # Only in HTML5
                                 "embed", "keygen", "source", "track",
                                 # Not supported in HTML5
                                 "basefont", "frame", "isindex",
                                 # SVG self closing tags
                                 "rect", "circle", "ellipse", "line", "polyline", "polygon",
                                 "path", "stop", "use", "image", "animatetransform"))

        # Create temporary root element to protect from badly written sites that either
        # have no html starting tag or multiple top level elements
        elem = self._factory("html")
        self._elem = [elem]
        self._last = elem
        self._tail = 0

    def handle_starttag(self, tag, attrs):
        # Void elements never get an end tag, so treat them as self closing
        self._handle_starttag(tag, attrs, self_closing=tag in self._voids)

    def handle_startendtag(self, tag, attrs):
        # Explicit "<tag/>" style elements are always self closing
        self._handle_starttag(tag, attrs, self_closing=True)

    def _handle_starttag(self, tag, attrs, self_closing=False):
        # Create an element for *tag* and attach it at the current tree position.
        enabled = self.enabled
        # Add tag element to tree if we have no filter or that the filter matches
        if enabled or self._search(tag, attrs):
            # Convert attrs to dictionary
            attrs = {k: v or "" for k, v in attrs}
            self._flush()

            # Create the new element
            elem = self._factory(tag, attrs)
            self._elem[-1].append(elem)
            self._last = elem

            # Only append the element to the list of elements if it's not a self closing element
            if self_closing:
                self._tail = 1
            else:
                self._elem.append(elem)
                self._tail = 0

            # Set this element as the root element when the filter search matches
            if not enabled:
                self._root = elem
                self.enabled = True

    def handle_endtag(self, tag):
        # Only process end tags when we have no filter or that the filter has been matched
        if self.enabled and tag not in self._voids:
            _elem = self._elem
            _root = self._root
            # Check that the closing tag is what's actually expected
            if _elem[-1].tag == tag:
                self._flush()
                self._tail = 1
                self._last = elem = _elem.pop()
                # EOFError signals that the requested (filtered) section is complete
                if elem is _root:
                    raise EOFError

            # If a previous element is what we actually have then the expected element was not
            # properly closed so we must close that before closing what we have now
            elif len(_elem) >= 2 and any(_item.tag == tag for _item in _elem):
                self._flush()
                self._tail = 1
                while True:
                    self._last = elem = _elem.pop()
                    if elem.tag == tag:
                        break
                if elem is _root:
                    raise EOFError
            else:
                # Unable to match the tag to an element, ignoring it
                return None

    def handle_data(self, data):
        # Collect text, ignoring whitespace-only runs and text outside the filter
        if data.strip() and self.enabled:
            self._data.append(data)

    def handle_entityref(self, name):
        # Resolve a named entity (e.g. "amp"); unknown names pass through as-is.
        # NOTE: only invoked by HTMLParser when convert_charrefs is False.
        if self.enabled:
            try:
                name = chr(name2codepoint[name])
            except KeyError:
                pass
            self._data.append(name)

    def handle_charref(self, name):
        # Resolve a numeric character reference (decimal, or hex when prefixed
        # with "x"); invalid values pass through as-is.
        # NOTE: only invoked by HTMLParser when convert_charrefs is False.
        if self.enabled:
            try:
                if name[0].lower() == "x":
                    name = chr(int(name[1:], 16))
                else:
                    name = chr(int(name))
            except ValueError:
                pass
            self._data.append(name)

    def handle_comment(self, data):
        # Attach non-empty comments to the current element
        data = data.strip()
        if data and self.enabled:
            elem = Etree.Comment(data)
            self._elem[-1].append(elem)

    def close(self):
        """
        Finalise parsing and return the root element of the tree.

        :return: The root element (the filtered element if a filter matched,
                 the real "html" element if one exists, else the temporary root).
        :raises RuntimeError: If a filter was given but never matched.
        """
        self._flush()
        if self.enabled == 0:
            msg = "Unable to find requested section with tag of '{}' and attributes of {}"
            raise RuntimeError(msg.format(self.tag, self.attrs))
        elif self._root is not None:
            return self._root
        else:
            # Search the root element to find a proper html root element if one exists
            tmp_root = self._elem[0]
            proper_root = tmp_root.find("html")
            if proper_root is None:
                # No proper root was found
                return tmp_root
            else:
                # Proper root found
                return proper_root

    def _flush(self):
        # Attach the collected text to the last seen element: as tail text when
        # that element was just closed (self._tail), otherwise as its body text.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    self._last.tail = text
                else:
                    self._last.text = text
            self._data = []

    def _search(self, tag, attrs):
        # Return True when *tag* and *attrs* satisfy the configured filter.
        # Only search when the tag matches
        if tag == self.tag:
            # If we have required attrs to match then search all attrs for wanted attrs
            # And also check that we do not have any attrs that are unwanted
            if self.attrs or self._unw_attrs:
                if attrs:
                    wanted_attrs = self.attrs.copy()
                    unwanted_attrs = self._unw_attrs
                    for key, value in attrs:
                        # Check for unwanted attrs
                        if key in unwanted_attrs:
                            return False

                        # Check for wanted attrs
                        elif key in wanted_attrs:
                            c_value = wanted_attrs[key]
                            if c_value == value or c_value == 1:
                                # Remove this attribute from the wanted dict of attributes
                                # to indicate that this attribute has been found
                                del wanted_attrs[key]

                                # If wanted_attrs is now empty then all attributes must have been found
                                if not wanted_attrs:
                                    return True
            else:
                # We only need to match tag
                return True

        # Unable to find required section
        return False
444 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 |
4 | [metadata]
5 | license_file = LICENSE
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from codecs import open
3 | from os import path
4 | import re
5 |
6 | # Path to local directory
7 | here = path.abspath(path.dirname(__file__))
8 |
9 |
def readfile(filename):  # type: (str) -> str
    """Get the long description from the README file"""
    with open(path.join(here, filename), "r", encoding="utf-8") as stream:
        return stream.read()
15 |
16 |
def extract_variable(filename, variable):  # type: (str, str) -> str
    """Extract the version number from a python file that contains the '__version__' variable."""
    pattern = r'{} = ["\'](\d+\.\d+\.\d+)["\']'.format(variable)
    with open(filename, "r", encoding="utf8") as stream:
        match = re.search(pattern, stream.read())
    if match is None:
        raise RuntimeError("Unable to extract version number")
    return match.group(1)
26 |
27 |
# Package metadata. The version number is extracted from htmlement.py's
# __version__ variable and the long description from the README, so each is
# defined in exactly one place.
setup(
    name='htmlement',
    version=extract_variable('htmlement.py', '__version__'),
    description='Pure-Python HTML parser with ElementTree support.',
    long_description=readfile('README.rst'),
    extras_require={"dev": ["pytest", "pytest-cov"]},
    keywords='html html5 parsehtml htmlparser elementtree dom',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Topic :: Text Processing :: Markup :: HTML',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
    url='https://github.com/willforde/python-htmlement',
    platforms=['OS Independent'],
    author='William Forde',
    author_email='willforde@gmail.com',
    license='MIT License',
    py_modules=['htmlement']
)
57 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # dummy
2 |
--------------------------------------------------------------------------------
/tests/test_module.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Python 2 compatibility
5 | import xml.etree.ElementTree as Etree
6 | import htmlement
7 | import examples
8 | import tempfile
9 | import pytest
10 | import io
11 | import os
12 |
13 |
14 | def quick_parsehtml(html, encoding=""):
15 | obj = htmlement.HTMLement(encoding=encoding)
16 | obj.feed(html)
17 | root = obj.close()
18 | assert Etree.iselement(root)
19 | return root
20 |
21 |
22 | def quick_parse_filter(html, tag, attrs=None, encoding=""):
23 | obj = htmlement.HTMLement(tag, attrs, encoding=encoding)
24 | obj.feed(html)
25 | return obj.close()
26 |
27 |
28 | def test_initialization():
29 | # Check that the parser even starts
30 | obj = htmlement.HTMLement()
31 | assert isinstance(obj, htmlement.HTMLement)
32 |
33 |
34 | # ############################# HTML Test ############################## #
35 |
36 |
37 | def test_basic_tree():
38 | # Check that I can parse a simple tree
39 | html = ""
40 | root = quick_parsehtml(html)
41 | assert root.tag == "html"
42 | assert root[0].tag == "body"
43 |
44 |
45 | def test_basic_partial():
46 | # Check that I can parse a simple tree segment at a time
47 | html = ""
48 | obj = htmlement.HTMLement()
49 | obj.feed(html[:9])
50 | obj.feed(html[9:])
51 | root = obj.close()
52 | assert Etree.iselement(root)
53 | assert root.tag == "html"
54 | assert root[0].tag == "body"
55 |
56 |
57 | def test_nohtml_tree():
58 | # Check that the missing html starting tag is created
59 | html = ""
60 | root = quick_parsehtml(html)
61 | assert root.tag == "html"
62 | assert root[0].tag == "body"
63 | assert Etree.tostring(root, method="html") == b''
64 |
65 |
66 | def test_text():
67 | html = "text"
68 | root = quick_parsehtml(html)
69 | assert root.tag == "html"
70 | assert root[0].tag == "body"
71 | assert root[0].attrib == {}
72 | assert root[0].text == "text"
73 |
74 |
75 | def test_attrib():
76 | html = "text"
77 | root = quick_parsehtml(html)
78 | assert root[0].attrib == {"test": "yes"}
79 |
80 |
81 | def test_tail():
82 | html = "text
tail"
83 | root = quick_parsehtml(html)
84 | assert root[0][0].tail == "tail"
85 |
86 |
87 | def test_self_closing_normal():
88 | html = ""
89 | root = quick_parsehtml(html)
90 | assert root[0].attrib.get("test") == "self closing"
91 |
92 |
93 | def test_self_closing_void():
94 | html = "
"
95 | root = quick_parsehtml(html)
96 | assert root[0].tag == "img"
97 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg"
98 |
99 |
100 | def test_open_void():
101 | html = "
"
102 | root = quick_parsehtml(html)
103 | assert root[0].tag == "img"
104 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg"
105 |
106 |
107 | def test_comment():
108 | html = "This is a paragraph.
"
109 | root = quick_parsehtml(html)
110 | assert root[0][0].text == "This is a comment."
111 | assert root[0][1].tag == "p"
112 | assert root[0][1].text == "This is a paragraph."
113 |
114 |
115 | def test_missing_end_tag():
116 | # Test for a missing 'a' end tag
117 | html = "link"
118 | root = quick_parsehtml(html)
119 | assert root.find(".//a").get("href") == "http://google.ie/"
120 | assert Etree.tostring(root, method="html") == b'link'
121 |
122 |
123 | def test_extra_tag():
124 | # Check that a extra tag that should not exist was removed
125 | html = ""
126 | root = quick_parsehtml(html)
127 | assert len(root[0]) == 0
128 | assert Etree.tostring(root, method="html") == b''
129 |
130 |
131 | def test_find_empty_attribute():
132 | # Check whether we can find an element with an empty-valued attribute
133 | html = ""
134 | form = quick_parse_filter(html, "form", {"autofocus": True})
135 | assert "autofocus" in form.attrib
136 | assert form.find(".//input[@checked]") is not None
137 |
138 |
139 | # ############################# HTML Entity ############################## #
140 |
141 |
142 | def test_entity_name_euro():
143 | html = "cost is €49.99"
144 | root = quick_parsehtml(html)
145 | assert root[0].text == "cost is €49.99"
146 |
147 |
148 | def test_entity_number_euro():
149 | html = "cost is €49.99"
150 | root = quick_parsehtml(html)
151 | assert root[0].text == "cost is €49.99"
152 |
153 |
154 | def test_entity_hex_euro():
155 | html = "cost is €49.99"
156 | root = quick_parsehtml(html)
157 | assert root[0].text == "cost is €49.99"
158 |
159 |
160 | def test_entity_name_euro_fail():
161 | html = "cost is &euros;49.99"
162 | root = quick_parsehtml(html)
163 | assert "euros" in root[0].text
164 |
165 |
166 | def test_entity_hex_euro_fail():
167 | html = "cost is 49.99"
168 | root = quick_parsehtml(html)
169 | assert "€" not in root[0].text
170 |
171 |
172 | # ############################# Text Content ############################# #
173 |
174 |
175 | def test_text_iterator():
176 | html = "sample text content"
177 | root = quick_parsehtml(html)
178 | body = root.find(".//body")
179 | assert "".join(body.itertext()) == "sample text content"
180 |
181 |
182 | def test_text_iterator_unclosed_tag():
183 | html = "hello to the world!
"
184 | root = quick_parsehtml(html)
185 | body = root.find(".//body")
186 | assert "".join(body.itertext()) == "hello to the world!"
187 |
188 |
189 | # ############################# Filter Test ############################## #
190 |
191 |
192 | def test_tag_match():
193 | html = ""
194 | root = quick_parse_filter(html, "div")
195 | assert root.tag == "div"
196 | assert root[0].tag == "p"
197 |
198 |
199 | def test_tag_no_match():
200 | html = ""
201 | with pytest.raises(RuntimeError) as excinfo:
202 | quick_parse_filter(html, "div")
203 | excinfo.match("Unable to find requested section with tag of")
204 |
205 |
206 | def test_attrib_match():
207 | html = "text
"
208 | root = quick_parse_filter(html, "div", {"test": "yes"})
209 | assert root.tag == "div"
210 | assert root.get("test") == "yes"
211 | assert root.text == "text"
212 |
213 |
214 | def test_attrib_no_match():
215 | html = "text
"
216 | with pytest.raises(RuntimeError) as excinfo:
217 | quick_parse_filter(html, "div", {"test": "yes"})
218 | excinfo.match("Unable to find requested section with tag of")
219 |
220 |
221 | def test_attrib_match_name():
222 | # Search for any div tag with a attribute of src of any value
223 | html = "text
"
224 | root = quick_parse_filter(html, "div", {"src": True})
225 | assert root.tag == "div"
226 | assert root.get("src")
227 | assert root.text == "text"
228 |
229 |
230 | def test_attrib_match_unwanted():
231 | # Search for a div with a test attribute but not a src attribute
232 | html = "text
"
233 | root = quick_parse_filter(html, "div", {"test": "yes", "src": False})
234 | assert root.tag == "div"
235 | assert root.get("test") == "yes"
236 | assert "src" not in root.attrib
237 | assert root.text == "text"
238 |
239 |
240 | def test_tag_match_badhtml():
241 | html = ""
242 | root = quick_parse_filter(html, "div")
243 | assert root.tag == "div"
244 | assert root[0].tag == "p"
245 |
246 |
247 | def test_partial_filter():
248 | # Check that the
249 | html = ""
250 | obj = htmlement.HTMLement("div")
251 | obj.feed(html[:51])
252 | obj.feed(html[51:])
253 | root = obj.close()
254 | assert root.tag == "div"
255 | assert root[0].tag == "p"
256 |
257 |
258 | # ####################### Unicode Decoding Test ####################### #
259 |
260 |
261 | def test_with_encoding():
262 | # Check that I can parse a simple tree
263 | html = b""
264 | root = quick_parsehtml(html, encoding="utf-8")
265 | assert root.tag == "html"
266 | assert root[0].tag == "body"
267 |
268 |
269 | def test_no_encoding_with_header_type1(recwarn):
270 | # Check for charset header type one
271 | html = b"text"
272 | quick_parsehtml(html)
273 | # Check that no warnings ware raised
274 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
275 | for w in recwarn.list:
276 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
277 |
278 |
279 | def test_no_encoding_with_header_type2(recwarn):
280 | # Check for charset header type one
281 | html = b'text'
282 | quick_parsehtml(html)
283 | # Check that no warnings ware raised
284 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
285 | for w in recwarn.list:
286 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
287 |
288 |
289 | def test_no_encoding_with_header_type3(recwarn):
290 | # Check for charset header type one
291 | html = b"text"
292 | quick_parsehtml(html)
293 | # Check that no warnings ware raised
294 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
295 | for w in recwarn.list:
296 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
297 |
298 |
299 | def test_no_encoding_with_header_type4(recwarn):
300 | # Check for charset header type one
301 | html = b'text'
302 | quick_parsehtml(html)
303 | # Check that no warnings ware raised
304 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
305 | for w in recwarn.list:
306 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
307 |
308 |
309 | def test_no_encoding_with_header_type5(recwarn):
310 | # Check for charset header type one
311 | html = b"text"
312 | quick_parsehtml(html)
313 | # Check that no warnings ware raised
314 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
315 | for w in recwarn.list:
316 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
317 |
318 |
319 | def test_no_encoding_with_header_type6(recwarn):
320 | # Check for charset header type one
321 | html = b'text'
322 | quick_parsehtml(html)
323 | # Check that no warnings ware raised
324 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
325 | for w in recwarn.list:
326 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
327 |
328 |
329 | def test_no_encoding_with_header_type7(recwarn):
330 | # Check for charset header type one
331 | html = b"text"
332 | quick_parsehtml(html)
333 | # Check that no warnings ware raised
334 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
335 | for w in recwarn.list:
336 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
337 |
338 |
339 | def test_no_encoding_with_header_type8(recwarn):
340 | # Check for charset header type one
341 | html = b'text'
342 | quick_parsehtml(html)
343 | # Check that no warnings ware raised
344 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1"
345 | for w in recwarn.list:
346 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg
347 |
348 |
349 | def test_no_encoding_no_header():
350 | # Check that I can parse a simple tree
351 | html = b"text"
352 | with pytest.warns(UnicodeWarning):
353 | quick_parsehtml(html)
354 |
355 |
# ####################### Function Tests ####################### #
357 |
358 |
359 | def test_fromstring():
360 | # Check that I can parse a simple tree
361 | html = ""
362 | root = htmlement.fromstring(html)
363 | assert Etree.iselement(root)
364 | assert root.tag == "html"
365 | assert root[0].tag == "body"
366 |
367 |
368 | def test_fromstringlist():
369 | # Check that I can parse a simple tree
370 | sequence = ["", ""]
371 | root = htmlement.fromstringlist(sequence)
372 | assert Etree.iselement(root)
373 | assert root.tag == "html"
374 | assert root[0].tag == "body"
375 |
376 |
377 | def test_parse_file_object():
378 | html = ""
379 | fileobj = io.StringIO(html)
380 | root = htmlement.parse(fileobj, encoding="utf8")
381 | assert Etree.iselement(root)
382 | assert root.tag == "html"
383 | assert root[0].tag == "body"
384 |
385 |
386 | def test_parse_filename():
387 | # Create temp file and add html data to it
388 | html = ""
389 | fileobj = tempfile.NamedTemporaryFile("w", delete=False)
390 | fileobj.write(html)
391 | filename = fileobj.name
392 | fileobj.close()
393 |
394 | try:
395 | root = htmlement.parse(filename, encoding="utf8")
396 | assert Etree.iselement(root)
397 | assert root.tag == "html"
398 | assert root[0].tag == "body"
399 | finally:
400 | os.remove(filename)
401 |
402 |
403 | # ####################### Examples Tests ####################### #
404 |
405 |
406 | def test_example_simple():
407 | # Check that there is no errors
408 | examples.example_simple()
409 |
410 |
411 | def test_example_filter():
412 | # Check that there is no errors
413 | examples.example_filter()
414 |
415 |
416 | def test_example_complex():
417 | # Check that there is no errors
418 | examples.example_complex()
419 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py{37,38,39,310,311},flake8
3 | skip_missing_interpreters = true
4 |
5 | [gh-actions]
6 | python =
7 | 3.7: py37
8 | 3.8: py38
9 | 3.9: py39
10 | 3.10: py310
11 | 3.11: py311
12 |
13 | [testenv]
14 | extras = dev
15 | commands = pytest --cov=htmlement --cov-report xml
16 |
17 | # Flake8 Environment
18 | [testenv:flake8]
19 | basepython = python3
20 | skip_install = true
21 | deps =
22 | flake8
23 | commands =
24 | flake8 --max-line-length=127
25 |
26 | # Flake8 Configuration
27 | [flake8]
28 | ignore =
29 | F821, # undefined name 'unichr'
30 | exclude =
31 | .tox,
32 | .git,
33 | docs,
34 | tests
35 |
36 | [coverage:run]
37 | source=htmlement
38 | branch=True
39 |
40 | [coverage:report]
41 | exclude_lines =
42 | if __name__ == .__main__.:
43 | def __repr__
44 | pragma: no cover
45 |
46 | [coverage:paths]
47 | source =
48 | htmlement
49 | .tox/*/lib/python*/site-packages/htmlement
50 |
--------------------------------------------------------------------------------