├── .coveragerc
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       └── tests.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.rst
├── docs
│   ├── Makefile
│   ├── README.rst
│   ├── conf.py
│   └── index.rst
├── examples.py
├── htmlement.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── test_module.py
└── tox.ini

/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | source=htmlement
3 | branch=True
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 |   # Maintain dependencies for GitHub Actions
9 |   - package-ecosystem: "github-actions"
10 |     target-branch: "master"
11 |     directory: "/"
12 |     schedule:
13 |       interval: "daily"
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to PyPI
2 |
3 | on:
4 |   release:
5 |     types: [published]
6 |
7 | jobs:
8 |   pypi:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v3
12 |         with:
13 |           fetch-depth: 0
14 |
15 |       - name: Install dependencies
16 |         run: |
17 |           python3 -m pip install --upgrade build
18 |           python3 -m build
19 |
20 |       - name: Publish package
21 |         uses: pypa/gh-action-pypi-publish@release/v1
22 |         with:
23 |           password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 |   - push
5 |   - pull_request
6 |
7 | jobs:
8 |   tests:
9 |     runs-on: ubuntu-latest
10 |     strategy:
11 |       matrix:
12 |         python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
13 |     steps:
14 |       - uses: actions/checkout@v3
15 |
16 |       - name: Set up Python ${{ matrix.python-version }}
17 |         uses: actions/setup-python@v4
18 |         with:
19 |           python-version: ${{ matrix.python-version }}
20 |
21 |       - name: Install test dependencies
22 |         run: |
23 |           python -m pip install --upgrade --no-cache-dir pip
24 |           pip install --no-cache-dir tox tox-gh-actions
25 |
26 |       - name: Test using tox
27 |         run: tox
28 |
29 |       - name: Upload test coverage to Codecov
30 |         continue-on-error: true
31 |         uses: codecov/codecov-action@v3.1.2
32 |         with:
33 |           flags: unittests
34 |
35 |   linting:
36 |     runs-on: ubuntu-latest
37 |     steps:
38 |       - uses: actions/checkout@v3
39 |
40 |       - name: Install test dependencies
41 |         run: |
42 |           python -m pip install --upgrade --no-cache-dir pip
43 |           pip install --no-cache-dir flake8
44 |
45 |       - name: Test linting with flake8
46 |         run: flake8 --max-line-length=127
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # general things to ignore
2 | docs/_static/
3 | docs/build/
4 | dist/
5 | *.egg-info/
6 | *.egg
7 | *.py[cod]
8 | __pycache__/
9 | .idea/
10 | build/
11 | .cache/
12 | .tox/
13 | .coverage
14 |
docs/_build/ 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | arch: amd64 3 | os: linux 4 | dist: bionic # Has python 2.7, 3.6, 3.7, 3.8 pre installed 5 | 6 | matrix: 7 | include: 8 | - python: "2.7" 9 | env: TOXENV=py27 10 | stage: Tests 11 | - python: "3.6" 12 | env: TOXENV=py36 13 | - python: "3.7" 14 | env: TOXENV=py37 15 | - python: "3.8" 16 | env: TOXENV=py38 17 | - python: "3.9" 18 | env: TOXENV=py39 19 | - python: "3.10" 20 | env: TOXENV=py39 21 | - env: TOXENV=flake8 22 | 23 | install: 24 | - pip install coveralls 25 | - pip install tox 26 | 27 | script: tox 28 | after_success: coveralls 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 William Forde 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE 2 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://badge.fury.io/py/htmlement.svg 2 | :target: https://pypi.python.org/pypi/htmlement 3 | 4 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable 5 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable 6 | 7 | .. image:: https://github.com/willforde/python-htmlement/actions/workflows/tests.yml/badge.svg?branch=master&event=push 8 | :target: https://github.com/willforde/python-htmlement/actions 9 | 10 | .. image:: https://codecov.io/gh/willforde/python-htmlement/branch/master/graph/badge.svg?token=D5EKKLIVBP 11 | :target: https://codecov.io/gh/willforde/python-htmlement 12 | 13 | .. image:: https://api.codeclimate.com/v1/badges/7d593426acc83cba5ef7/maintainability 14 | :target: https://codeclimate.com/github/willforde/python-htmlement/maintainability 15 | :alt: Maintainability 16 | 17 | 18 | HTMLement 19 | --------- 20 | 21 | HTMLement is a pure Python HTML Parser. 
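
A quick taste of the API (a minimal sketch; the sections below walk through fuller examples)::

    import htmlement
    root = htmlement.fromstring("<p>Hello <b>world</b></p>")
    print(root.find(".//b").text)  # -> world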
22 |
23 | The aim of this project is to provide a pure-Python HTML parser that is also faster than BeautifulSoup.
24 | And, like BeautifulSoup, it will also parse invalid HTML.
25 |
26 | The simplest way to do this is to use ElementTree `XPath expressions`__.
27 | Python does support a simple (read: limited) XPath engine inside its ElementTree module.
28 | A benefit of using ElementTree is that it can use a C implementation whenever one is available.
29 |
30 | This HTML parser extends `html.parser.HTMLParser`_ to build a tree of `ElementTree.Element`_ instances.
31 |
32 | Install
33 | -------
34 | Run ::
35 |
36 |     pip install htmlement
37 |
38 | -or- ::
39 |
40 |     pip install git+https://github.com/willforde/python-htmlement.git
41 |
42 | Parsing HTML
43 | ------------
44 | Here I’ll be using a sample HTML document that will be parsed using htmlement: ::
45 |
46 |     html = """
47 |     <html>
48 |       <head>
49 |         <title>GitHub</title>
50 |       </head>
51 |       <body>
52 |         <a href="https://github.com/willforde">GitHub</a>
53 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
54 |       </body>
55 |     </html>
56 |     """
57 |
58 |     # Parse the document
59 |     import htmlement
60 |     root = htmlement.fromstring(html)
61 |
62 | Root is an ElementTree.Element_ and supports the ElementTree API
63 | with XPath expressions. With this I'm easily able to get both the title and all anchors in the document. ::
64 |
65 |     # Get title
66 |     title = root.find("head/title").text
67 |     print("Parsing: %s" % title)
68 |
69 |     # Get all anchors
70 |     for a in root.iterfind(".//a"):
71 |         print(a.get("href"))
72 |
73 | And the output is as follows: ::
74 |
75 |     Parsing: GitHub
76 |     https://github.com/willforde
77 |     https://github.com/willforde/python-htmlement
78 |
79 |
80 | Parsing HTML with a filter
81 | --------------------------
82 | Here I’ll be using a slightly more complex HTML document that will be parsed using htmlement with a filter, to fetch
83 | only the menu items. This can be very useful when dealing with large HTML documents, since it is a lot faster to
84 | parse only the required section and to ignore everything else. ::
85 |
86 |     html = """
87 |     <html>
88 |       <head>
89 |         <title>Coffee shop</title>
90 |       </head>
91 |       <body>
92 |         <ul class="menu">
93 |           <li>Coffee</li>
94 |           <li>Tea</li>
95 |           <li>Milk</li>
96 |         </ul>
97 |         <ul class="extras">
98 |           <li>Sugar</li>
99 |           <li>Cream</li>
100 |         </ul>
101 |       </body>
102 |     </html>
103 |     """
104 |
105 |     # Parse the document
106 |     import htmlement
107 |     root = htmlement.fromstring(html, "ul", attrs={"class": "menu"})
108 |
109 | In this case I'm not able to get the title, since all elements outside the filter were ignored.
110 | But it does allow me to extract all the list items within the menu list and nothing else. ::
111 |
112 |     # Get all list items
113 |     for item in root.iterfind(".//li"):
114 |         # Get text from list item
115 |         print(item.text)
116 |
117 | And the output is as follows: ::
118 |
119 |     Coffee
120 |     Tea
121 |     Milk
122 |
123 | .. _html.parser.HTMLParser: https://docs.python.org/3.6/library/html.parser.html#html.parser.HTMLParser
124 | .. _ElementTree.Element: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
125 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support
126 | __ XPath_
127 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = HTMLement
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://readthedocs.org/projects/python-htmlement/badge/?version=stable 2 | :target: http://python-htmlement.readthedocs.io/en/stable/?badge=stable 3 | 4 | Please GoTo: http://python-htmlement.readthedocs.io/en/latest/?badge=stable 5 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # HTMLement documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 16 03:44:03 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('../')) 22 | 23 | # Fetch version number 24 | from htmlement import __version__ 25 | 26 | # General information about the project. 27 | project = 'HTMLement' 28 | author = "William Forde" 29 | 30 | # The version info for the project you're documenting, acts as replacement for 31 | # |version| and |release|, also used in various other places throughout the 32 | # built documents. 33 | # 34 | # The short X.Y version. 35 | version = __version__ 36 | # The full version, including alpha/beta/rc tags. 37 | release = __version__ 38 | 39 | # -- General configuration ------------------------------------------------ 40 | 41 | # If your documentation needs a minimal Sphinx version, state it here. 42 | # 43 | # needs_sphinx = '1.0' 44 | 45 | # Add any Sphinx extension module names here, as strings. They can be 46 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 47 | # ones. 48 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.viewcode'] 49 | 50 | # Add any paths that contain templates here, relative to this directory. 51 | templates_path = ['_templates'] 52 | 53 | # The suffix(es) of source filenames. 54 | # You can specify multiple suffix as a list of string: 55 | # 56 | # source_suffix = ['.rst', '.md'] 57 | source_suffix = '.rst' 58 | 59 | # The master toctree document. 60 | master_doc = 'index' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 
67 | language = "en" 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.rst'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, todo and todoList produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ---------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'default' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a theme 89 | # further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | # This will exclude any warnings of 'nonlocal image URI found'. 100 | suppress_warnings = ['image.nonlocal_uri'] 101 | 102 | 103 | # -- Options for HTMLHelp output ------------------------------------------ 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'HTMLementdoc' 107 | 108 | 109 | # -- Options for LaTeX output --------------------------------------------- 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'HTMLement.tex', 'HTMLement Documentation', 134 | 'William Forde', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output --------------------------------------- 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 142 | man_pages = [ 143 | (master_doc, 'htmlement', 'HTMLement Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'HTMLement', 'HTMLement Documentation', 155 | author, 'HTMLement', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | 160 | # Example configuration for intersphinx: refer to the Python standard library. 161 | intersphinx_mapping = {'https://docs.python.org/3.6': None} 162 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
HTMLement documentation master file, created by
2 |    sphinx-quickstart on Mon Jan 16 03:44:03 2017.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 |
6 | Welcome to HTMLement's documentation!
7 | =====================================
8 |
9 | .. include::
10 |     ../README.rst
11 |
12 | .. seealso::
13 |     More examples can be found in `examples.py`_.
14 |
15 | API
16 | ---
17 |
18 | .. automodule:: htmlement
19 |     :members:
20 |
21 | External Links
22 | --------------
23 | ElementTree: https://docs.python.org/3/library/xml.etree.elementtree.html
24 |
25 | Bug Tracker: https://github.com/willforde/python-htmlement/issues
26 |
27 | .. _examples.py: https://github.com/willforde/python-htmlement/blob/master/examples.py
--------------------------------------------------------------------------------
/examples.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | For more information, see:
4 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
5 | @see https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support
6 | """
7 | from __future__ import print_function, unicode_literals
8 | from htmlement import HTMLement
9 |
10 |
11 | def example_simple():
12 |     """
13 |     This example will parse a simple html tree and
14 |     extract the website title and all anchors
15 |
16 |     >>> example_simple()
17 |     Parsing: GitHub
18 |     GitHub => https://github.com/willforde
19 |     GitHub Project => https://github.com/willforde/python-htmlement
20 |     """
21 |     html = """
22 |     <html>
23 |       <head>
24 |         <title>GitHub</title>
25 |       </head>
26 |       <body>
27 |         <a href="https://github.com/willforde">GitHub</a>
28 |         <a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
29 |       </body>
30 |     </html>
31 |     """
32 |
33 |     # Parse the document
34 |     parser = HTMLement()
35 |     parser.feed(html)
36 |     root = parser.close()
37 |
38 |     # Root is an xml.etree.Element and supports the ElementTree API
39 |     # (e.g. you may use its limited support for XPath expressions)
40 |
41 |     # Get title
42 |     title = root.find('head/title').text
43 |     print("Parsing: {}".format(title))
44 |
45 |     # Get all anchors
46 |     for a in root.iterfind(".//a"):
47 |         # Get href attribute
48 |         url = a.get("href")
49 |         # Get anchor name
50 |         name = a.text
51 |
52 |         print("{} => {}".format(name, url))
53 |
54 |
55 | def example_filter():
56 |     """
57 |     This example will parse a simple html tree and
58 |     extract all the list items within the ul menu element using a tree filter.
59 |
60 |     The tree filter will tell the parser to only parse the elements within the
61 |     requested section and to ignore all other elements.
62 |     Useful for speeding up the parsing of html pages.
63 |
64 |     >>> example_filter()
65 |     Menu Items
66 |     - Coffee
67 |     - Tea
68 |     - Milk
69 |     """
70 |     html = """
71 |     <html>
72 |       <head>
73 |         <title>Coffee shop</title>
74 |       </head>
75 |       <body>
76 |         <ul class="menu">
77 |           <li>Coffee</li>
78 |           <li>Tea</li>
79 |           <li>Milk</li>
80 |         </ul>
81 |         <ul class="extras">
82 |           <li>Sugar</li>
83 |           <li>Cream</li>
84 |         </ul>
85 |       </body>
86 |     </html>
87 |     """
88 |
89 |     # Parse the document
90 |     parser = HTMLement("ul", attrs={"class": "menu"})
91 |     parser.feed(html)
92 |     root = parser.close()
93 |
94 |     # Root should now be a 'ul' xml.etree.Element with all its child elements available.
95 |     # All other elements have been ignored. Way faster to parse.
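    # (A side note on filters: the attrs values may also be booleans. {"id": True}
    # matches a "ul" that has an "id" attribute with any value, while {"id": False}
    # only matches a "ul" that has no "id" attribute at all, e.g.
    # HTMLement("ul", attrs={"class": "menu", "id": False}).)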
96 |
97 |     # We are unable to get the title here since all
98 |     # elements outside the filter were ignored
99 |     print("Menu Items")
100 |
101 |     # Get all list items
102 |     for item in root.iterfind(".//li"):
103 |         # Get text from list item
104 |         print("- {}".format(item.text))
105 |
106 |
107 | def example_complex():
108 |     """
109 |     This example will parse a more complex html tree of python talks and will
110 |     extract the image, title, url and date of each talk.
111 |
112 |     A filter will be used to extract the main talks div element.
113 |
114 |     >>> example_complex()
115 |     Image = /presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg
116 |     Url = /pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3
117 |     Title = Alex Martelli - Exception and error handling in Python 2 and Python 3
118 |     Date = Jun 1, 2016
119 |
120 |     Image = /presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg
121 |     Url = /presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg
122 |     Title = Jake Vanderplas - Statistics for Hackers
123 |     Date = May 29, 2016
124 |
125 |     Image = /presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg
126 |     Url = /pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code
127 |     Title = Brett Slatkin - Refactoring Python: Why and how to restructure your code
128 |     Date = May 29, 2016
129 |
130 |     """
131 |     html = """
132 |     <html>
133 |     <head>
134 |         <title>PyCon 2016</title>
135 |     </head>
136 |     <body>
137 |         <div class="talks" id="main-content">
138 |             <h1>Talks by PyCon 2016</h1>
139 |
140 |             <div class="talk">
141 |                 <a href="/pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3">
142 |                     <img src="/presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg" alt="">
143 |                 </a>
144 |                 <div class="talk-details">
145 |                     <h3>
146 |                         <a href="/pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3"
147 |                         >Alex Martelli - Exception and error handling in Python 2 and Python 3</a>
148 |                     </h3>
149 |                     <p>Jun 1, 2016</p>
150 |                 </div>
151 |             </div>
152 |
153 |             <div class="talk">
154 |                 <a href="/presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg">
155 |                     <img src="/presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg" alt="">
156 |                 </a>
157 |                 <div class="talk-details">
158 |                     <h3>
159 |                         <a href="/presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg"
160 |                         >Jake Vanderplas - Statistics for Hackers</a>
161 |                     </h3>
162 |                     <p>May 29, 2016</p>
163 |                 </div>
164 |             </div>
165 |
166 |             <div class="talk">
167 |                 <a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code">
168 |                     <img src="/presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg" alt="">
169 |                 </a>
170 |                 <div class="talk-details">
171 |                     <h3>
172 |                         <a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code"
173 |                         >Brett Slatkin - Refactoring Python: Why and how to restructure your code</a>
174 |                     </h3>
175 |                     <p>May 29, 2016</p>
176 |                 </div>
177 |             </div>
178 |         </div>
179 |     </body>
180 |     </html>
181 | 182 | 183 | """ 184 | 185 | # Parse the document 186 | parser = HTMLement("div", attrs={"class": "talks", "id": True}) 187 | parser.feed(html) 188 | root = parser.close() 189 | 190 | # Extract all div tags with class of talk 191 | for talk in root.iterfind("./div[@class='talk']"): 192 | # Fetch image 193 | img = talk.find(".//img").get("src") 194 | print("Image = {}".format(img)) 195 | 196 | # Fetch title and url 197 | title_anchor = talk.find("./div/h3/a") 198 | url = title_anchor.get("href") 199 | print("Url = {}".format(url)) 200 | title = title_anchor.text 201 | print("Title = {}".format(title)) 202 | 203 | # Fetch date 204 | date = talk.find("./div/p").text 205 | print("Date = {}".format(date)) 206 | print("") 207 | 208 | 209 | if __name__ == "__main__": 210 | example_simple() 211 | print("") 212 | example_filter() 213 | print("") 214 | example_complex() 215 | -------------------------------------------------------------------------------- /htmlement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # The MIT License (MIT) 5 | # 6 | # Copyright (c) 2016 William Forde 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | # this software and associated documentation files (the "Software"), to deal in 10 | # the Software without restriction, including without limitation the rights to 11 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 12 | # the Software, and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 20 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 21 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 22 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | """ 26 | HTMLement 27 | --------- 28 | Simple lightweight HTML parser with XPath support. 29 | 30 | Github: https://github.com/willforde/python-htmlement 31 | Documentation: https://python-htmlement.readthedocs.io/en/stable/?badge=stable 32 | Testing: https://github.com/willforde/python-htmlement/actions 33 | Coverage: https://codecov.io/gh/willforde/python-htmlement 34 | Maintainability: https://codeclimate.com/github/willforde/python-htmlement/maintainability 35 | """ 36 | 37 | # Standard Lib 38 | import xml.etree.ElementTree as Etree 39 | import warnings 40 | import re 41 | 42 | # HTML Parser 43 | from html.entities import name2codepoint 44 | from html.parser import HTMLParser 45 | 46 | __all__ = ["HTMLement", "fromstring", "fromstringlist", "parse"] 47 | __version__ = "2.0.0" 48 | 49 | # Add missing codepoints 50 | # TODO: This may no longer be required 51 | name2codepoint["apos"] = 0x0027 52 | 53 | 54 | def fromstring(text, tag="", attrs=None, encoding=None): 55 | """ 56 | Parse's "HTML" document from a string into an element tree. 57 | 58 | :param text: The "HTML" document to parse. 
59 | :type text: str or bytes 60 | 61 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 62 | :type tag: str 63 | 64 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 65 | :type attrs: dict(str, str) 66 | 67 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 68 | :type encoding: str 69 | 70 | :return: The root element of the element tree. 71 | :rtype: xml.etree.ElementTree.Element 72 | 73 | :raises UnicodeDecodeError: If decoding of *text* fails. 74 | """ 75 | parser = HTMLement(tag, attrs, encoding) 76 | parser.feed(text) 77 | return parser.close() 78 | 79 | 80 | def fromstringlist(sequence, tag="", attrs=None, encoding=None): 81 | """ 82 | Parses an "HTML document" from a sequence of "HTML sections" into an element tree. 83 | 84 | :param sequence: A sequence of "HTML sections" to parse. 85 | :type sequence: list(str or bytes) 86 | 87 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 88 | :type tag: str 89 | 90 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 91 | :type attrs: dict(str, str) 92 | 93 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 94 | :type encoding: str 95 | 96 | :return: The root element of the element tree. 97 | :rtype: xml.etree.ElementTree.Element 98 | 99 | :raises UnicodeDecodeError: If decoding of a section within *sequence* fails. 100 | """ 101 | parser = HTMLement(tag, attrs, encoding) 102 | for text in sequence: 103 | parser.feed(text) 104 | return parser.close() 105 | 106 | 107 | def parse(source, tag="", attrs=None, encoding=None): 108 | """ 109 | Load an external "HTML document" into an element tree. 110 | 111 | :param source: A filename or file like object containing HTML data. 112 | :type source: str or io.TextIOBase 113 | 114 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 115 | :type tag: str 116 | 117 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 118 | :type attrs: dict(str, str) 119 | 120 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 121 | :type encoding: str 122 | 123 | :return: The root element of the element tree. 124 | :rtype: xml.etree.ElementTree.Element 125 | 126 | :raises UnicodeDecodeError: If decoding of *source* fails. 127 | """ 128 | # Assume that source is a file pointer if no read methods is found 129 | if not hasattr(source, "read"): 130 | source = open(source, "r", encoding=encoding) 131 | close_source = True 132 | else: 133 | close_source = False 134 | 135 | try: 136 | parser = HTMLement(tag, attrs, encoding) 137 | while True: 138 | # Read in 64k at a time 139 | data = source.read(65536) 140 | if not data: 141 | break 142 | 143 | # Feed the parser 144 | parser.feed(data) 145 | 146 | # Return the root element 147 | return parser.close() 148 | 149 | finally: 150 | if close_source: 151 | source.close() 152 | 153 | 154 | class HTMLement(object): 155 | """ 156 | Python HTMLParser extension with ElementTree Parser support. 157 | 158 | This HTML Parser extends :class:`html.parser.HTMLParser`, returning an :class:`xml.etree.ElementTree.Element` 159 | instance. 
The returned root element natively supports the ElementTree API. 160 | (e.g. you may use its limited support for `XPath expressions`__) 161 | 162 | When a "tag" and "tag attributes" are given the parser will search for a required section. Only when the required 163 | section is found, does the parser start parsing the "HTML document". The element that matches the search criteria 164 | will then become the new "root element". 165 | 166 | Attributes are given as a dict of {'name': 'value'}. Value can be the string to match, `True` or `False.` 167 | `True` will match any attribute with given name and any value. 168 | `False` will only give a match if given attribute does not exist in the element. 169 | 170 | :param str tag: (optional) Name of "tag / element" which is used to filter down "the tree" to a required section. 171 | :type tag: str 172 | 173 | :param attrs: (optional) The attributes of the element, that will be used, when searchingfor the required section. 174 | :type attrs: dict(str, str) 175 | 176 | :param encoding: (optional) Encoding used, when decoding the source data before feeding it to the parser. 177 | :type encoding: str 178 | 179 | .. _Xpath: https://docs.python.org/3.6/library/xml.etree.elementtree.html#xpath-support 180 | __ XPath_ 181 | """ 182 | def __init__(self, tag="", attrs=None, encoding=None): 183 | self._parser = ParseHTML(tag, attrs) 184 | self.encoding = encoding 185 | self._finished = False 186 | 187 | def feed(self, data): 188 | """ 189 | Feeds data to the parser. 190 | 191 | If *data*, is of type :class:`bytes` and where no encoding was specified, then the encoding 192 | will be extracted from *data* using "meta tags", if available. 193 | Otherwise encoding will default to "ISO-8859-1" 194 | 195 | :param data: HTML data 196 | :type data: str or bytes 197 | 198 | :raises UnicodeDecodeError: If decoding of *data* fails. 199 | """ 200 | # Skip feeding data into parser if we already have what we want 201 | if self._finished == 1: 202 | return None 203 | 204 | # Make sure that we have unicode before continuing 205 | if isinstance(data, bytes): 206 | if self.encoding: 207 | data = data.decode(self.encoding) 208 | else: 209 | data = self._make_unicode(data) 210 | 211 | # Parse the html document 212 | try: 213 | self._parser.feed(data) 214 | except EOFError: 215 | self._finished = True 216 | self._parser.reset() 217 | 218 | def close(self): 219 | """ 220 | Close the "tree builder" and return the "root element" of the "element tree". 221 | 222 | :return: The "root element" of the "element tree". 223 | :rtype: xml.etree.ElementTree.Element 224 | 225 | :raises RuntimeError: If no element matching search criteria was found. 226 | """ 227 | return self._parser.close() 228 | 229 | def _make_unicode(self, data): 230 | """ 231 | Convert *data* from type :class:`bytes` to type :class:`str`. 232 | 233 | :param data: The html document. 234 | :type data: bytes 235 | 236 | :return: HTML data decoded. 
237 | :rtype: str 238 | """ 239 | # Atemp to find the encoding from the html source 240 | end_head_tag = data.find(b"") 241 | if end_head_tag: 242 | # Search for the charset attribute within the meta tags 243 | charset_refind = b'' 244 | charset = re.search(charset_refind, data[:end_head_tag], re.IGNORECASE) 245 | if charset: 246 | self.encoding = encoding = charset.group(1).decode() 247 | return data.decode(encoding) 248 | 249 | # Decode the string into unicode using default encoding 250 | warn_msg = "Unable to determine encoding, defaulting to iso-8859-1" 251 | warnings.warn(warn_msg, UnicodeWarning, stacklevel=2) 252 | self.encoding = "iso-8859-1" 253 | return data.decode("iso-8859-1") 254 | 255 | 256 | # noinspection PyAbstractClass 257 | class ParseHTML(HTMLParser): 258 | def __init__(self, tag="", attrs=None): 259 | # Initiate HTMLParser 260 | HTMLParser.__init__(self) 261 | self.convert_charrefs = True 262 | self._root = None # root element 263 | self._data = [] # data collector 264 | self._factory = Etree.Element 265 | self.enabled = not tag 266 | self._unw_attrs = [] 267 | self.tag = tag 268 | 269 | # Split attributes into wanted and unwanted attributes 270 | if attrs: 271 | self.attrs = attrs 272 | for key, value in attrs.copy().items(): 273 | if value == 0: 274 | self._unw_attrs.append(key) 275 | del attrs[key] 276 | else: 277 | self.attrs = {} 278 | 279 | # Some tags in html do not require closing tags so thoes tags will need to be auto closed (Void elements) 280 | # Refer to: https://www.w3.org/TR/html/syntax.html#void-elements 281 | self._voids = frozenset(("area", "base", "br", "col", "hr", "img", "input", "link", "meta", "param", 282 | # Only in HTML5 283 | "embed", "keygen", "source", "track", 284 | # Not supported in HTML5 285 | "basefont", "frame", "isindex", 286 | # SVG self closing tags 287 | "rect", "circle", "ellipse", "line", "polyline", "polygon", 288 | "path", "stop", "use", "image", "animatetransform")) 289 | 290 | # Create temporary root element to protect from badly written sites that either 291 | # have no html starting tag or multiple top level elements 292 | elem = self._factory("html") 293 | self._elem = [elem] 294 | self._last = elem 295 | self._tail = 0 296 | 297 | def handle_starttag(self, tag, attrs): 298 | self._handle_starttag(tag, attrs, self_closing=tag in self._voids) 299 | 300 | def handle_startendtag(self, tag, attrs): 301 | self._handle_starttag(tag, attrs, self_closing=True) 302 | 303 | def _handle_starttag(self, tag, attrs, self_closing=False): 304 | enabled = self.enabled 305 | # Add tag element to tree if we have no filter or that the filter matches 306 | if enabled or self._search(tag, attrs): 307 | # Convert attrs to dictionary 308 | attrs = {k: v or "" for k, v in attrs} 309 | self._flush() 310 | 311 | # Create the new element 312 | elem = self._factory(tag, attrs) 313 | self._elem[-1].append(elem) 314 | self._last = elem 315 | 316 | # Only append the element to the list of elements if it's not a self closing element 317 | if self_closing: 318 | self._tail = 1 319 | else: 320 | self._elem.append(elem) 321 | self._tail = 0 322 | 323 | # Set this element as the root element when the filter search matches 324 | if not enabled: 325 | self._root = elem 326 | self.enabled = True 327 | 328 | def handle_endtag(self, tag): 329 | # Only process end tags when we have no filter or that the filter has been matched 330 | if self.enabled and tag not in self._voids: 331 | _elem = self._elem 332 | _root = self._root 333 | # Check that the closing tag is 
what's actualy expected 334 | if _elem[-1].tag == tag: 335 | self._flush() 336 | self._tail = 1 337 | self._last = elem = _elem.pop() 338 | if elem is _root: 339 | raise EOFError 340 | 341 | # If a previous element is what we actually have then the expected element was not 342 | # properly closed so we must close that before closing what we have now 343 | elif len(_elem) >= 2 and any(_item.tag == tag for _item in _elem): 344 | self._flush() 345 | self._tail = 1 346 | while True: 347 | self._last = elem = _elem.pop() 348 | if elem.tag == tag: 349 | break 350 | if elem is _root: 351 | raise EOFError 352 | else: 353 | # Unable to match the tag to an element, ignoring it 354 | return None 355 | 356 | def handle_data(self, data): 357 | if data.strip() and self.enabled: 358 | self._data.append(data) 359 | 360 | def handle_entityref(self, name): 361 | if self.enabled: 362 | try: 363 | name = chr(name2codepoint[name]) 364 | except KeyError: 365 | pass 366 | self._data.append(name) 367 | 368 | def handle_charref(self, name): 369 | if self.enabled: 370 | try: 371 | if name[0].lower() == "x": 372 | name = chr(int(name[1:], 16)) 373 | else: 374 | name = chr(int(name)) 375 | except ValueError: 376 | pass 377 | self._data.append(name) 378 | 379 | def handle_comment(self, data): 380 | data = data.strip() 381 | if data and self.enabled: 382 | elem = Etree.Comment(data) 383 | self._elem[-1].append(elem) 384 | 385 | def close(self): 386 | self._flush() 387 | if self.enabled == 0: 388 | msg = "Unable to find requested section with tag of '{}' and attributes of {}" 389 | raise RuntimeError(msg.format(self.tag, self.attrs)) 390 | elif self._root is not None: 391 | return self._root 392 | else: 393 | # Search the root element to find a proper html root element if one exists 394 | tmp_root = self._elem[0] 395 | proper_root = tmp_root.find("html") 396 | if proper_root is None: 397 | # Not proper root was found 398 | return tmp_root 399 | else: 400 | # Proper root found 401 | return proper_root 402 | 403 | def _flush(self): 404 | if self._data: 405 | if self._last is not None: 406 | text = "".join(self._data) 407 | if self._tail: 408 | self._last.tail = text 409 | else: 410 | self._last.text = text 411 | self._data = [] 412 | 413 | def _search(self, tag, attrs): 414 | # Only search when the tag matches 415 | if tag == self.tag: 416 | # If we have required attrs to match then search all attrs for wanted attrs 417 | # And also check that we do not have any attrs that are unwanted 418 | if self.attrs or self._unw_attrs: 419 | if attrs: 420 | wanted_attrs = self.attrs.copy() 421 | unwanted_attrs = self._unw_attrs 422 | for key, value in attrs: 423 | # Check for unwanted attrs 424 | if key in unwanted_attrs: 425 | return False 426 | 427 | # Check for wanted attrs 428 | elif key in wanted_attrs: 429 | c_value = wanted_attrs[key] 430 | if c_value == value or c_value == 1: 431 | # Remove this attribute from the wanted dict of attributes 432 | # to indicate that this attribute has been found 433 | del wanted_attrs[key] 434 | 435 | # If wanted_attrs is now empty then all attributes must have been found 436 | if not wanted_attrs: 437 | return True 438 | else: 439 | # We only need to match tag 440 | return True 441 | 442 | # Unable to find required section 443 | return False 444 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = 
LICENSE 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from codecs import open 3 | from os import path 4 | import re 5 | 6 | # Path to local directory 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | 10 | def readfile(filename): # type: (str) -> str 11 | """Get the long description from the README file""" 12 | readme_file = path.join(here, filename) 13 | with open(readme_file, "r", encoding="utf-8") as stream: 14 | return stream.read() 15 | 16 | 17 | def extract_variable(filename, variable): # type: (str, str) -> str 18 | """Extract the version number from a python file that contains the '__version__' variable.""" 19 | with open(filename, "r", encoding="utf8") as stream: 20 | search_refind = r'{} = ["\'](\d+\.\d+\.\d+)["\']'.format(variable) 21 | verdata = re.search(search_refind, stream.read()) 22 | if verdata: 23 | return verdata.group(1) 24 | else: 25 | raise RuntimeError("Unable to extract version number") 26 | 27 | 28 | setup( 29 | name='htmlement', 30 | version=extract_variable('htmlement.py', '__version__'), 31 | description='Pure-Python HTML parser with ElementTree support.', 32 | long_description=readfile('README.rst'), 33 | extras_require={"dev": ["pytest", "pytest-cov"]}, 34 | keywords='html html5 parsehtml htmlparser elementtree dom', 35 | classifiers=[ 36 | 'Development Status :: 5 - Production/Stable', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Natural Language :: English', 40 | 'Operating System :: OS Independent', 41 | 'Programming Language :: Python :: 3', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Programming Language :: Python :: 3.8', 44 | 'Programming Language :: Python :: 3.9', 45 | 'Programming Language :: Python :: 3.10', 46 | 'Programming Language :: Python :: 3.11', 47 | 'Topic :: Text Processing :: Markup :: HTML', 48 | 'Topic :: Software Development :: Libraries :: Python Modules' 49 | ], 50 | url='https://github.com/willforde/python-htmlement', 51 | platforms=['OS Independent'], 52 | author='William Forde', 53 | author_email='willforde@gmail.com', 54 | license='MIT License', 55 | py_modules=['htmlement'] 56 | ) 57 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # dummy 2 | -------------------------------------------------------------------------------- /tests/test_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Python 2 compatibility 5 | import xml.etree.ElementTree as Etree 6 | import htmlement 7 | import examples 8 | import tempfile 9 | import pytest 10 | import io 11 | import os 12 | 13 | 14 | def quick_parsehtml(html, encoding=""): 15 | obj = htmlement.HTMLement(encoding=encoding) 16 | obj.feed(html) 17 | root = obj.close() 18 | assert Etree.iselement(root) 19 | return root 20 | 21 | 22 | def quick_parse_filter(html, tag, attrs=None, encoding=""): 23 | obj = htmlement.HTMLement(tag, attrs, encoding=encoding) 24 | obj.feed(html) 25 | return obj.close() 26 | 27 | 28 | def test_initialization(): 29 | # Check that the parser even starts 30 | obj = htmlement.HTMLement() 31 | assert isinstance(obj, htmlement.HTMLement) 32 | 33 | 34 | # ############################# HTML Test 
############################## # 35 | 36 | 37 | def test_basic_tree(): 38 | # Check that I can parse a simple tree 39 | html = "" 40 | root = quick_parsehtml(html) 41 | assert root.tag == "html" 42 | assert root[0].tag == "body" 43 | 44 | 45 | def test_basic_partial(): 46 | # Check that I can parse a simple tree segment at a time 47 | html = "" 48 | obj = htmlement.HTMLement() 49 | obj.feed(html[:9]) 50 | obj.feed(html[9:]) 51 | root = obj.close() 52 | assert Etree.iselement(root) 53 | assert root.tag == "html" 54 | assert root[0].tag == "body" 55 | 56 | 57 | def test_nohtml_tree(): 58 | # Check that the missing html starting tag is created 59 | html = "" 60 | root = quick_parsehtml(html) 61 | assert root.tag == "html" 62 | assert root[0].tag == "body" 63 | assert Etree.tostring(root, method="html") == b'' 64 | 65 | 66 | def test_text(): 67 | html = "text" 68 | root = quick_parsehtml(html) 69 | assert root.tag == "html" 70 | assert root[0].tag == "body" 71 | assert root[0].attrib == {} 72 | assert root[0].text == "text" 73 | 74 | 75 | def test_attrib(): 76 | html = "text" 77 | root = quick_parsehtml(html) 78 | assert root[0].attrib == {"test": "yes"} 79 | 80 | 81 | def test_tail(): 82 | html = "

text

tail" 83 | root = quick_parsehtml(html) 84 | assert root[0][0].tail == "tail" 85 | 86 | 87 | def test_self_closing_normal(): 88 | html = "" 89 | root = quick_parsehtml(html) 90 | assert root[0].attrib.get("test") == "self closing" 91 | 92 | 93 | def test_self_closing_void(): 94 | html = "" 95 | root = quick_parsehtml(html) 96 | assert root[0].tag == "img" 97 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg" 98 | 99 | 100 | def test_open_void(): 101 | html = "" 102 | root = quick_parsehtml(html) 103 | assert root[0].tag == "img" 104 | assert root[0].attrib.get("src") == "http://myimages.com/myimage.jpg" 105 | 106 | 107 | def test_comment(): 108 | html = "

This is a paragraph.

" 109 | root = quick_parsehtml(html) 110 | assert root[0][0].text == "This is a comment." 111 | assert root[0][1].tag == "p" 112 | assert root[0][1].text == "This is a paragraph." 113 | 114 | 115 | def test_missing_end_tag(): 116 | # Test for a missing 'a' end tag 117 | html = "link" 118 | root = quick_parsehtml(html) 119 | assert root.find(".//a").get("href") == "http://google.ie/" 120 | assert Etree.tostring(root, method="html") == b'link' 121 | 122 | 123 | def test_extra_tag(): 124 | # Check that a extra tag that should not exist was removed 125 | html = "" 126 | root = quick_parsehtml(html) 127 | assert len(root[0]) == 0 128 | assert Etree.tostring(root, method="html") == b'' 129 | 130 | 131 | def test_find_empty_attribute(): 132 | # Check whether we can find an element with an empty-valued attribute 133 | html = "
" 134 | form = quick_parse_filter(html, "form", {"autofocus": True}) 135 | assert "autofocus" in form.attrib 136 | assert form.find(".//input[@checked]") is not None 137 | 138 | 139 | # ############################# HTML Entity ############################## # 140 | 141 | 142 | def test_entity_name_euro(): 143 | html = "cost is €49.99" 144 | root = quick_parsehtml(html) 145 | assert root[0].text == "cost is €49.99" 146 | 147 | 148 | def test_entity_number_euro(): 149 | html = "cost is €49.99" 150 | root = quick_parsehtml(html) 151 | assert root[0].text == "cost is €49.99" 152 | 153 | 154 | def test_entity_hex_euro(): 155 | html = "cost is €49.99" 156 | root = quick_parsehtml(html) 157 | assert root[0].text == "cost is €49.99" 158 | 159 | 160 | def test_entity_name_euro_fail(): 161 | html = "cost is &euros;49.99" 162 | root = quick_parsehtml(html) 163 | assert "euros" in root[0].text 164 | 165 | 166 | def test_entity_hex_euro_fail(): 167 | html = "cost is �49.99" 168 | root = quick_parsehtml(html) 169 | assert "€" not in root[0].text 170 | 171 | 172 | # ############################# Text Content ############################# # 173 | 174 | 175 | def test_text_iterator(): 176 | html = "sample text content" 177 | root = quick_parsehtml(html) 178 | body = root.find(".//body") 179 | assert "".join(body.itertext()) == "sample text content" 180 | 181 | 182 | def test_text_iterator_unclosed_tag(): 183 | html = "
hello to the world!
unrelated
" 184 | root = quick_parsehtml(html) 185 | body = root.find(".//body") 186 | assert "".join(body.itertext()) == "hello to the world!" 187 | 188 | 189 | # ############################# Filter Test ############################## # 190 | 191 | 192 | def test_tag_match(): 193 | html = "

text

" 194 | root = quick_parse_filter(html, "div") 195 | assert root.tag == "div" 196 | assert root[0].tag == "p" 197 | 198 | 199 | def test_tag_no_match(): 200 | html = "" 201 | with pytest.raises(RuntimeError) as excinfo: 202 | quick_parse_filter(html, "div") 203 | excinfo.match("Unable to find requested section with tag of") 204 | 205 | 206 | def test_attrib_match(): 207 | html = "

text

text
" 208 | root = quick_parse_filter(html, "div", {"test": "yes"}) 209 | assert root.tag == "div" 210 | assert root.get("test") == "yes" 211 | assert root.text == "text" 212 | 213 | 214 | def test_attrib_no_match(): 215 | html = "

text

text
" 216 | with pytest.raises(RuntimeError) as excinfo: 217 | quick_parse_filter(html, "div", {"test": "yes"}) 218 | excinfo.match("Unable to find requested section with tag of") 219 | 220 | 221 | def test_attrib_match_name(): 222 | # Search for any div tag with a attribute of src of any value 223 | html = "

text

text
" 224 | root = quick_parse_filter(html, "div", {"src": True}) 225 | assert root.tag == "div" 226 | assert root.get("src") 227 | assert root.text == "text" 228 | 229 | 230 | def test_attrib_match_unwanted(): 231 | # Search for a div with a test attribute but not a src attribute 232 | html = "

text

text
" 233 | root = quick_parse_filter(html, "div", {"test": "yes", "src": False}) 234 | assert root.tag == "div" 235 | assert root.get("test") == "yes" 236 | assert "src" not in root.attrib 237 | assert root.text == "text" 238 | 239 | 240 | def test_tag_match_badhtml(): 241 | html = "

text

" 242 | root = quick_parse_filter(html, "div") 243 | assert root.tag == "div" 244 | assert root[0].tag == "p" 245 | 246 | 247 | def test_partial_filter(): 248 | # Check that the 249 | html = "

text

" 250 | obj = htmlement.HTMLement("div") 251 | obj.feed(html[:51]) 252 | obj.feed(html[51:]) 253 | root = obj.close() 254 | assert root.tag == "div" 255 | assert root[0].tag == "p" 256 | 257 | 258 | # ####################### Unicode Decoding Test ####################### # 259 | 260 | 261 | def test_with_encoding(): 262 | # Check that I can parse a simple tree 263 | html = b"" 264 | root = quick_parsehtml(html, encoding="utf-8") 265 | assert root.tag == "html" 266 | assert root[0].tag == "body" 267 | 268 | 269 | def test_no_encoding_with_header_type1(recwarn): 270 | # Check for charset header type one 271 | html = b"text" 272 | quick_parsehtml(html) 273 | # Check that no warnings ware raised 274 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 275 | for w in recwarn.list: 276 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 277 | 278 | 279 | def test_no_encoding_with_header_type2(recwarn): 280 | # Check for charset header type one 281 | html = b'text' 282 | quick_parsehtml(html) 283 | # Check that no warnings ware raised 284 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 285 | for w in recwarn.list: 286 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 287 | 288 | 289 | def test_no_encoding_with_header_type3(recwarn): 290 | # Check for charset header type one 291 | html = b"text" 292 | quick_parsehtml(html) 293 | # Check that no warnings ware raised 294 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 295 | for w in recwarn.list: 296 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 297 | 298 | 299 | def test_no_encoding_with_header_type4(recwarn): 300 | # Check for charset header type one 301 | html = b'text' 302 | quick_parsehtml(html) 303 | # Check that no warnings ware raised 304 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 305 | for w in recwarn.list: 306 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 307 | 308 | 309 | def test_no_encoding_with_header_type5(recwarn): 310 | # Check for charset header type one 311 | html = b"text" 312 | quick_parsehtml(html) 313 | # Check that no warnings ware raised 314 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 315 | for w in recwarn.list: 316 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 317 | 318 | 319 | def test_no_encoding_with_header_type6(recwarn): 320 | # Check for charset header type one 321 | html = b'text' 322 | quick_parsehtml(html) 323 | # Check that no warnings ware raised 324 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 325 | for w in recwarn.list: 326 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 327 | 328 | 329 | def test_no_encoding_with_header_type7(recwarn): 330 | # Check for charset header type one 331 | html = b"text" 332 | quick_parsehtml(html) 333 | # Check that no warnings ware raised 334 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 335 | for w in recwarn.list: 336 | assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 337 | 338 | 339 | def test_no_encoding_with_header_type8(recwarn): 340 | # Check for charset header type one 341 | html = b'text' 342 | quick_parsehtml(html) 343 | # Check that no warnings ware raised 344 | warnmsg = "Unable to determine encoding, defaulting to iso-8859-1" 345 | for w in recwarn.list: 346 | 
assert issubclass(w.category, UnicodeWarning) is False or not w.message == warnmsg 347 | 348 | 349 | def test_no_encoding_no_header(): 350 | # Check that I can parse a simple tree 351 | html = b"text" 352 | with pytest.warns(UnicodeWarning): 353 | quick_parsehtml(html) 354 | 355 | 356 | # ####################### Funtion Tests ####################### # 357 | 358 | 359 | def test_fromstring(): 360 | # Check that I can parse a simple tree 361 | html = "" 362 | root = htmlement.fromstring(html) 363 | assert Etree.iselement(root) 364 | assert root.tag == "html" 365 | assert root[0].tag == "body" 366 | 367 | 368 | def test_fromstringlist(): 369 | # Check that I can parse a simple tree 370 | sequence = ["", ""] 371 | root = htmlement.fromstringlist(sequence) 372 | assert Etree.iselement(root) 373 | assert root.tag == "html" 374 | assert root[0].tag == "body" 375 | 376 | 377 | def test_parse_file_object(): 378 | html = "" 379 | fileobj = io.StringIO(html) 380 | root = htmlement.parse(fileobj, encoding="utf8") 381 | assert Etree.iselement(root) 382 | assert root.tag == "html" 383 | assert root[0].tag == "body" 384 | 385 | 386 | def test_parse_filename(): 387 | # Create temp file and add html data to it 388 | html = "" 389 | fileobj = tempfile.NamedTemporaryFile("w", delete=False) 390 | fileobj.write(html) 391 | filename = fileobj.name 392 | fileobj.close() 393 | 394 | try: 395 | root = htmlement.parse(filename, encoding="utf8") 396 | assert Etree.iselement(root) 397 | assert root.tag == "html" 398 | assert root[0].tag == "body" 399 | finally: 400 | os.remove(filename) 401 | 402 | 403 | # ####################### Examples Tests ####################### # 404 | 405 | 406 | def test_example_simple(): 407 | # Check that there is no errors 408 | examples.example_simple() 409 | 410 | 411 | def test_example_filter(): 412 | # Check that there is no errors 413 | examples.example_filter() 414 | 415 | 416 | def test_example_complex(): 417 | # Check that there is no errors 418 | examples.example_complex() 419 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py{37,38,39,310,311},flake8 3 | skip_missing_interpreters = true 4 | 5 | [gh-actions] 6 | python = 7 | 3.7: py37 8 | 3.8: py38 9 | 3.9: py39 10 | 3.10: py310 11 | 3.11: py311 12 | 13 | [testenv] 14 | extras = dev 15 | commands = pytest --cov=htmlement --cov-report xml 16 | 17 | # Flake8 Environment 18 | [testenv:flake8] 19 | basepython = python3 20 | skip_install = true 21 | deps = 22 | flake8 23 | commands = 24 | flake8 --max-line-length=127 25 | 26 | # Flake8 Configuration 27 | [flake8] 28 | ignore = 29 | F821, # undefined name 'unichr' 30 | exclude = 31 | .tox, 32 | .git, 33 | docs, 34 | tests 35 | 36 | [coverage:run] 37 | source=htmlement 38 | branch=True 39 | 40 | [coverage:report] 41 | exclude_lines = 42 | if __name__ == .__main__.: 43 | def __repr__ 44 | pragma: no cover 45 | 46 | [coverage:paths] 47 | source = 48 | htmlement 49 | .tox/*/lib/python*/site-packages/htmlement 50 | --------------------------------------------------------------------------------