├── .git-blame-ignore-revs
├── .github
│   └── workflows
│       ├── checks.yml
│       ├── publish.yml
│       ├── tests-macos.yml
│       ├── tests-ubuntu.yml
│       └── tests-windows.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── LICENSE
├── MANIFEST.in
├── NEWS
├── README.rst
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── multiroot.html
│   │   ├── python-insider.xml
│   │   └── selectors-sample1.html
│   ├── conf.py
│   ├── conftest.py
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── parsel.rst
│   ├── requirements.txt
│   └── usage.rst
├── parsel
│   ├── __init__.py
│   ├── csstranslator.py
│   ├── py.typed
│   ├── selector.py
│   ├── utils.py
│   └── xpathfuncs.py
├── pyproject.toml
├── release.rst
├── setup.py
├── tests
│   ├── requirements.txt
│   ├── test_selector.py
│   ├── test_selector_csstranslator.py
│   ├── test_selector_jmespath.py
│   ├── test_utils.py
│   ├── test_xml_attacks.py
│   ├── test_xpathfuncs.py
│   ├── typing
│   │   └── selector.py
│   └── xml_attacks
│       └── billion_laughs.xml
└── tox.ini
/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # applying pre-commit hooks to the project 2 | a57c23e3b7be0f001595bd8767fe05e40a66e730 --------------------------------------------------------------------------------
/.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | checks: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | include: 11 | - python-version: "3.13" 12 | env: 13 | TOXENV: pre-commit 14 | - python-version: "3.13" 15 | env: 16 | TOXENV: pylint 17 | - python-version: "3.13" # Keep in sync with .readthedocs.yml 18 | env: 19 | TOXENV: docs 20 | - python-version: "3.13" 21 | env: 22 | TOXENV: typing 23 | - python-version: "3.13" 24 | env: 25 | TOXENV: twinecheck 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Run check 36 | env: ${{ matrix.env }} 37 | run: | 38 | pip install -U tox 39 | tox 40 | --------------------------------------------------------------------------------
/.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: [push] 3 | 4 | jobs: 5 | publish: 6 | runs-on: ubuntu-latest 7 | if: startsWith(github.event.ref, 'refs/tags/') 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Set up Python 3.13 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version: "3.13" 16 | 17 | - name: Check Tag 18 | id: check-release-tag 19 | run: | 20 | if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$ ]]; then 21 | echo ::set-output name=release_tag::true 22 | fi 23 | 24 | - name: Publish to PyPI 25 | if: steps.check-release-tag.outputs.release_tag == 'true' 26 | run: | 27 | pip install --upgrade setuptools wheel twine 28 | python setup.py sdist bdist_wheel 29 | export TWINE_USERNAME=__token__ 30 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 31 | twine upload dist/* 32 | --------------------------------------------------------------------------------
/.github/workflows/tests-macos.yml: -------------------------------------------------------------------------------- 1 | name: macOS 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: macos-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version:
["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U tox 23 | tox -e py 24 | 25 | - name: Upload coverage report 26 | uses: codecov/codecov-action@v5 27 | -------------------------------------------------------------------------------- /.github/workflows/tests-ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | include: 11 | - python-version: "3.9" 12 | env: 13 | TOXENV: py 14 | - python-version: "3.10" 15 | env: 16 | TOXENV: py 17 | - python-version: "3.11" 18 | env: 19 | TOXENV: py 20 | - python-version: "3.12" 21 | env: 22 | TOXENV: py 23 | - python-version: "3.13" 24 | env: 25 | TOXENV: py 26 | - python-version: pypy3.10 27 | env: 28 | TOXENV: pypy3 29 | - python-version: pypy3.11 30 | env: 31 | TOXENV: pypy3 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Install system libraries 37 | if: contains(matrix.python-version, 'pypy') 38 | run: | 39 | sudo apt-get update 40 | sudo apt-get install libxml2-dev libxslt-dev 41 | 42 | - name: Set up Python ${{ matrix.python-version }} 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: ${{ matrix.python-version }} 46 | 47 | - name: Run tests 48 | env: ${{ matrix.env }} 49 | run: | 50 | pip install -U tox 51 | tox 52 | 53 | - name: Upload coverage report 54 | uses: codecov/codecov-action@v5 55 | -------------------------------------------------------------------------------- /.github/workflows/tests-windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: windows-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U tox 23 | tox -e py 24 | 25 | - name: Upload coverage report 26 | uses: codecov/codecov-action@v5 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.eggs 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | /coverage.xml 28 | .tox 29 | nosetests.xml 30 | htmlcov 31 | .pytest_cache 32 | 33 | # Translations 34 | *.mo 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | 41 | # Complexity 42 | output/*.html 43 | output/*/index.html 44 | 45 | # Sphinx 46 | docs/_build 47 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: 
https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.2 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | build: 7 | os: ubuntu-24.04 8 | tools: 9 | # For available versions, see: 10 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 11 | python: "3.13" # Keep in sync with .github/workflows/checks.yml 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | - path: . 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include NEWS 2 | include LICENSE 3 | include README.rst 4 | include parsel/py.typed 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat 11 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 1.10.0 (2024-12-16) 7 | ~~~~~~~~~~~~~~~~~~~ 8 | 9 | * Removed support for Python 3.8. 10 | * Added support for Python 3.13. 11 | * Changed the default encoding name from ``"utf8"`` to ``"utf-8"`` everywhere. 12 | The former name is not supported in certain environments. 13 | * CI fixes and improvements. 
14 | 15 | 1.9.1 (2024-04-08) 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | * Removed the dependency on ``pytest-runner``. 19 | * Removed the obsolete ``Makefile``. 20 | 21 | 1.9.0 (2024-03-14) 22 | ~~~~~~~~~~~~~~~~~~ 23 | 24 | * Now requires ``cssselect >= 1.2.0`` (this minimum version was required since 25 | 1.8.0 but that wasn't properly recorded) 26 | * Removed support for Python 3.7 27 | * Added support for Python 3.12 and PyPy 3.10 28 | * Fixed an exception when calling ``__str__`` or ``__repr__`` on some JSON 29 | selectors 30 | * Code formatted with ``black`` 31 | * CI fixes and improvements 32 | 33 | 1.8.1 (2023-04-18) 34 | ~~~~~~~~~~~~~~~~~~ 35 | 36 | * Remove a Sphinx reference from NEWS to fix the PyPI description 37 | * Add a ``twine check`` CI check to detect such problems 38 | 39 | 1.8.0 (2023-04-18) 40 | ~~~~~~~~~~~~~~~~~~ 41 | 42 | * Add support for JMESPath: you can now create a selector for a JSON document 43 | and call ``Selector.jmespath()``. See `the documentation`_ for more 44 | information and examples. 45 | * Selectors can now be constructed from ``bytes`` (using the ``body`` and 46 | ``encoding`` arguments) instead of ``str`` (using the ``text`` argument), so 47 | that there is no internal conversion from ``str`` to ``bytes`` and the memory 48 | usage is lower. 49 | * Typing improvements 50 | * The ``pkg_resources`` module (which was absent from the requirements) is no 51 | longer used 52 | * Documentation build fixes 53 | * New requirements: 54 | 55 | * ``jmespath`` 56 | * ``typing_extensions`` (on Python 3.7) 57 | 58 | .. _the documentation: https://parsel.readthedocs.io/en/latest/usage.html 59 | 60 | 1.7.0 (2022-11-01) 61 | ~~~~~~~~~~~~~~~~~~ 62 | 63 | * Add PEP 561-style type information 64 | * Support for Python 2.7, 3.5 and 3.6 is removed 65 | * Support for Python 3.9-3.11 is added 66 | * Very large documents (with deep nesting or long tag content) can now be 67 | parsed, and ``Selector`` now takes a new argument ``huge_tree`` to disable 68 | this 69 | * Support for new features of cssselect 1.2.0 is added 70 | * The ``Selector.remove()`` and ``SelectorList.remove()`` methods are 71 | deprecated and replaced with the new ``Selector.drop()`` and 72 | ``SelectorList.drop()`` methods which don't delete text after the dropped 73 | elements when used in the HTML mode. 74 | 75 | 76 | 1.6.0 (2020-05-07) 77 | ~~~~~~~~~~~~~~~~~~ 78 | 79 | * Python 3.4 is no longer supported 80 | * New ``Selector.remove()`` and ``SelectorList.remove()`` methods to remove 81 | selected elements from the parsed document tree 82 | * Improvements to error reporting, test coverage and documentation, and code 83 | cleanup 84 | 85 | 86 | 1.5.2 (2019-08-09) 87 | ~~~~~~~~~~~~~~~~~~ 88 | 89 | * ``Selector.remove_namespaces`` received a significant performance improvement 90 | * The value of ``data`` within the printable representation of a selector 91 | (``repr(selector)``) now ends in ``...`` when truncated, to make the 92 | truncation obvious. 93 | * Minor documentation improvements. 94 | 95 | 96 | 1.5.1 (2018-10-25) 97 | ~~~~~~~~~~~~~~~~~~ 98 | 99 | * ``has-class`` XPath function handles newlines and other separators 100 | in class names properly; 101 | * fixed parsing of HTML documents with null bytes; 102 | * documentation improvements; 103 | * Python 3.7 tests are run on CI; other test improvements. 104 | 105 | 106 | 1.5.0 (2018-07-04) 107 | ~~~~~~~~~~~~~~~~~~ 108 | 109 | * New ``Selector.attrib`` and ``SelectorList.attrib`` properties which make 110 | it easier to get attributes of HTML elements. 
111 | * CSS selectors became faster: compilation results are cached 112 | (LRU cache is used for ``css2xpath``), so there is 113 | less overhead when the same CSS expression is used several times. 114 | * ``.get()`` and ``.getall()`` selector methods are documented and recommended 115 | over ``.extract_first()`` and ``.extract()``. 116 | * Various documentation tweaks and improvements. 117 | 118 | One more change is that ``.extract()`` and ``.extract_first()`` methods 119 | are now implemented using ``.get()`` and ``.getall()``, not the other 120 | way around, and instead of calling ``Selector.extract`` all other methods 121 | now call ``Selector.get`` internally. It can be **backwards incompatible** 122 | in case of custom Selector subclasses which override ``Selector.extract`` 123 | without doing the same for ``Selector.get``. If you have such Selector 124 | subclass, make sure ``get`` method is also overridden. For example, this:: 125 | 126 | class MySelector(parsel.Selector): 127 | def extract(self): 128 | return super().extract() + " foo" 129 | 130 | should be changed to this:: 131 | 132 | class MySelector(parsel.Selector): 133 | def get(self): 134 | return super().get() + " foo" 135 | extract = get 136 | 137 | 138 | 1.4.0 (2018-02-08) 139 | ~~~~~~~~~~~~~~~~~~ 140 | 141 | * ``Selector`` and ``SelectorList`` can't be pickled because 142 | pickling/unpickling doesn't work for ``lxml.html.HtmlElement``; 143 | parsel now raises TypeError explicitly instead of allowing pickle to 144 | silently produce wrong output. This is technically backwards-incompatible 145 | if you're using Python < 3.6. 146 | 147 | 148 | 1.3.1 (2017-12-28) 149 | ~~~~~~~~~~~~~~~~~~ 150 | 151 | * Fix artifact uploads to pypi. 152 | 153 | 154 | 1.3.0 (2017-12-28) 155 | ~~~~~~~~~~~~~~~~~~ 156 | 157 | * ``has-class`` XPath extension function; 158 | * ``parsel.xpathfuncs.set_xpathfunc`` is a simplified way to register 159 | XPath extensions; 160 | * ``Selector.remove_namespaces`` now removes namespace declarations; 161 | * Python 3.3 support is dropped; 162 | * ``make htmlview`` command for easier Parsel docs development. 163 | * CI: PyPy installation is fixed; parsel now runs tests for PyPy3 as well. 
164 | 165 | 166 | 1.2.0 (2017-05-17) 167 | ~~~~~~~~~~~~~~~~~~ 168 | 169 | * Add ``SelectorList.get`` and ``SelectorList.getall`` 170 | methods as aliases for ``SelectorList.extract_first`` 171 | and ``SelectorList.extract`` respectively 172 | * Add default value parameter to ``SelectorList.re_first`` method 173 | * Add ``Selector.re_first`` method 174 | * Add ``replace_entities`` argument on ``.re()`` and ``.re_first()`` 175 | to turn off replacing of character entity references 176 | * Bug fix: detect ``None`` result from lxml parsing and fallback with an empty document 177 | * Rearrange XML/HTML examples in the selectors usage docs 178 | * Travis CI: 179 | 180 | * Test against Python 3.6 181 | * Test against PyPy using "Portable PyPy for Linux" distribution 182 | 183 | 184 | 1.1.0 (2016-11-22) 185 | ~~~~~~~~~~~~~~~~~~ 186 | 187 | * Change default HTML parser to `lxml.html.HTMLParser <https://lxml.de/api/lxml.html.HTMLParser-class.html>`_, 188 | which makes it easier to use some HTML-specific features 189 | * Add css2xpath function to translate CSS to XPath 190 | * Add support for ad-hoc namespaces declarations 191 | * Add support for XPath variables 192 | * Documentation improvements and updates 193 | 194 | 195 | 1.0.3 (2016-07-29) 196 | ~~~~~~~~~~~~~~~~~~ 197 | 198 | * Add BSD-3-Clause license file 199 | * Re-enable PyPy tests 200 | * Integrate py.test runs with setuptools (needed for Debian packaging) 201 | * Changelog is now called ``NEWS`` 202 | 203 | 204 | 1.0.2 (2016-04-26) 205 | ~~~~~~~~~~~~~~~~~~ 206 | 207 | * Fix bug in exception handling causing original traceback to be lost 208 | * Added docstrings and other doc fixes 209 | 210 | 211 | 1.0.1 (2015-08-24) 212 | ~~~~~~~~~~~~~~~~~~ 213 | 214 | * Updated PyPI classifiers 215 | * Added docstrings for csstranslator module and other doc fixes 216 | 217 | 218 | 1.0.0 (2015-08-22) 219 | ~~~~~~~~~~~~~~~~~~ 220 | 221 | * Documentation fixes 222 | 223 | 224 | 0.9.6 (2015-08-14) 225 | ~~~~~~~~~~~~~~~~~~ 226 | 227 | * Updated documentation 228 | * Extended test coverage 229 | 230 | 231 | 0.9.5 (2015-08-11) 232 | ~~~~~~~~~~~~~~~~~~ 233 | 234 | * Support for extending SelectorList 235 | 236 | 237 | 0.9.4 (2015-08-10) 238 | ~~~~~~~~~~~~~~~~~~ 239 | 240 | * Try workaround for travis-ci/dpl#253 241 | 242 | 243 | 0.9.3 (2015-08-07) 244 | ~~~~~~~~~~~~~~~~~~ 245 | 246 | * Add base_url argument 247 | 248 | 249 | 0.9.2 (2015-08-07) 250 | ~~~~~~~~~~~~~~~~~~ 251 | 252 | * Rename module unified -> selector and promoted root attribute 253 | * Add create_root_node function 254 | 255 | 256 | 0.9.1 (2015-08-04) 257 | ~~~~~~~~~~~~~~~~~~ 258 | 259 | * Setup Sphinx build and docs structure 260 | * Build universal wheels 261 | * Rename some leftovers from package extraction 262 | 263 | 264 | 0.9.0 (2015-07-30) 265 | ~~~~~~~~~~~~~~~~~~ 266 | 267 | * First release on PyPI. 268 | --------------------------------------------------------------------------------
/README.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Parsel 3 | ====== 4 | 5 | .. image:: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml/badge.svg 6 | :target: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml 7 | :alt: Tests 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/parsel.svg 10 | :target: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml 11 | :alt: Supported Python versions 12 | 13 | .. image:: https://img.shields.io/pypi/v/parsel.svg 14 | :target: https://pypi.python.org/pypi/parsel 15 | :alt: PyPI Version 16 | 17 | ..
image:: https://img.shields.io/codecov/c/github/scrapy/parsel/master.svg 18 | :target: https://codecov.io/github/scrapy/parsel?branch=master 19 | :alt: Coverage report 20 | 21 | 22 | Parsel is a BSD-licensed Python_ library to extract data from HTML_, JSON_, and 23 | XML_ documents. 24 | 25 | It supports: 26 | 27 | - CSS_ and XPath_ expressions for HTML and XML documents 28 | 29 | - JMESPath_ expressions for JSON documents 30 | 31 | - `Regular expressions`_ 32 | 33 | Find the Parsel online documentation at https://parsel.readthedocs.org. 34 | 35 | Example (`open online demo`_): 36 | 37 | .. code-block:: python 38 | 39 | >>> from parsel import Selector 40 | >>> text = """ 41 | <html> 42 | <body> 43 | <h1>Hello, Parsel!</h1> 44 | <ul> 45 | <li><a href="http://example.com">Link 1</a></li> 46 | <li><a href="http://scrapy.org">Link 2</a></li> 47 | </ul> 48 | <script type="application/json">{"a": ["b", "c"]}</script> 49 | </body> 50 | </html>""" 51 | >>> selector = Selector(text=text) 52 | >>> selector.css('h1::text').get() 53 | 'Hello, Parsel!' 54 | >>> selector.xpath('//h1/text()').re(r'\w+') 55 | ['Hello', 'Parsel'] 56 | >>> for li in selector.css('ul > li'): 57 | ... print(li.xpath('.//@href').get()) 58 | http://example.com 59 | http://scrapy.org 60 | >>> selector.css('script::text').jmespath("a").get() 61 | 'b' 62 | >>> selector.css('script::text').jmespath("a").getall() 63 | ['b', 'c']
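A selector can also be created from raw ``bytes`` instead of ``str``. The snippet below is an editor's sketch rather than part of the original example (the byte string is made up for illustration); it uses only the ``body`` and ``encoding`` arguments documented in the 1.8.0 entry of ``NEWS``:

.. code-block:: python

    >>> from parsel import Selector
    >>> raw = b"<html><body><h1>Hello, bytes!</h1></body></html>"
    >>> Selector(body=raw, encoding="utf-8").css("h1::text").get()
    'Hello, bytes!'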
64 | 65 | .. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets 66 | .. _HTML: https://en.wikipedia.org/wiki/HTML 67 | .. _JMESPath: https://jmespath.org/ 68 | .. _JSON: https://en.wikipedia.org/wiki/JSON 69 | .. _open online demo: https://colab.research.google.com/drive/149VFa6Px3wg7S3SEnUqk--TyBrKplxCN#forceEdit=true&sandboxMode=true 70 | .. _Python: https://www.python.org/ 71 | .. _regular expressions: https://docs.python.org/library/re.html 72 | .. _XML: https://en.wikipedia.org/wiki/XML 73 | .. _XPath: https://en.wikipedia.org/wiki/XPath 74 | --------------------------------------------------------------------------------
/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | PYTHON = python 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build 8 | PAPER = 9 | BUILDDIR = _build 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables. 17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
22 | 23 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 24 | 25 | help: 26 | @echo "Please use \`make ' where is one of" 27 | @echo " html to make standalone HTML files" 28 | @echo " dirhtml to make HTML files named index.html in directories" 29 | @echo " singlehtml to make a single large HTML file" 30 | @echo " pickle to make pickle files" 31 | @echo " json to make JSON files" 32 | @echo " htmlhelp to make HTML files and a HTML help project" 33 | @echo " qthelp to make HTML files and a qthelp project" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " htmlview to view the compiled HTML files in browser" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/parsel.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/parsel.qhc" 93 | 94 | devhelp: 95 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 96 | @echo 97 | @echo "Build finished." 98 | @echo "To view the help file:" 99 | @echo "# mkdir -p $$HOME/.local/share/devhelp/parsel" 100 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/parsel" 101 | @echo "# devhelp" 102 | 103 | epub: 104 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 105 | @echo 106 | @echo "Build finished. 
The epub file is in $(BUILDDIR)/epub." 107 | 108 | latex: 109 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 110 | @echo 111 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 112 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 113 | "(use \`make latexpdf' here to do that automatically)." 114 | 115 | latexpdf: 116 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 117 | @echo "Running LaTeX files through pdflatex..." 118 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 119 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 120 | 121 | latexpdfja: 122 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 123 | @echo "Running LaTeX files through platex and dvipdfmx..." 124 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 125 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 126 | 127 | text: 128 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 129 | @echo 130 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 131 | 132 | man: 133 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 134 | @echo 135 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 136 | 137 | texinfo: 138 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 139 | @echo 140 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 141 | @echo "Run \`make' in that directory to run these through makeinfo" \ 142 | "(use \`make info' here to do that automatically)." 143 | 144 | info: 145 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 146 | @echo "Running Texinfo files through makeinfo..." 147 | make -C $(BUILDDIR)/texinfo info 148 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 149 | 150 | gettext: 151 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 152 | @echo 153 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 154 | 155 | changes: 156 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 157 | @echo 158 | @echo "The overview file is in $(BUILDDIR)/changes." 159 | 160 | linkcheck: 161 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 162 | @echo 163 | @echo "Link check complete; look for any errors in the above output " \ 164 | "or in $(BUILDDIR)/linkcheck/output.txt." 165 | 166 | doctest: 167 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 168 | @echo "Testing of doctests in the sources finished, look at the " \ 169 | "results in $(BUILDDIR)/doctest/output.txt." 170 | 171 | xml: 172 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 173 | @echo 174 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 175 | 176 | pseudoxml: 177 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 178 | @echo 179 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
180 | 181 | htmlview: html 182 | $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ 183 | os.path.realpath('_build/html/index.html'))" 184 | --------------------------------------------------------------------------------
/docs/_static/selectors-sample1.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <base href='http://example.com/' /> 4 | <title>Example website</title> 5 | </head> 6 | <body> 7 | <div id='images'> 8 | <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> 9 | <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> 10 | <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> 11 | <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> 12 | <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> 13 | </div> 14 | </body> 15 | </html> 16 | --------------------------------------------------------------------------------
/docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from pathlib import Path 5 | 6 | # Get the project root dir, which is the parent dir of this 7 | project_root = str(Path.cwd().parent) 8 | 9 | # Insert the project root dir as the first element in the PYTHONPATH. 10 | # This lets us ensure that the source package is imported, and that its 11 | # version is used. 12 | sys.path.insert(0, project_root) 13 | 14 | import parsel  # noqa: E402 15 | 16 | # -- General configuration --------------------------------------------- 17 | 18 | # Add any Sphinx extension module names here, as strings. They can be 19 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 20 | extensions = [ 21 | "notfound.extension", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.intersphinx", 24 | "sphinx.ext.viewcode", 25 | ] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ["_templates"] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = ".rst" 32 | 33 | # The master toctree document. 34 | master_doc = "index" 35 | 36 | # General information about the project. 37 | project = "Parsel" 38 | copyright = "2015, Scrapy Project" 39 | 40 | # The version info for the project you're documenting, acts as replacement 41 | # for |version| and |release|, also used in various other places throughout 42 | # the built documents. 43 | # 44 | # The short X.Y version. 45 | version = parsel.__version__ 46 | # The full version, including alpha/beta/rc tags. 47 | release = parsel.__version__ 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | exclude_patterns = ["_build"] 52 | 53 | # The name of the Pygments (syntax highlighting) style to use. 54 | pygments_style = "sphinx" 55 | 56 | suppress_warnings = ["epub.unknown_project_files"] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | html_theme = "sphinx_rtd_theme" 64 | 65 | # Add any paths that contain custom static files (such as style sheets) 66 | # here, relative to this directory. They are copied after the builtin 67 | # static files, so a file named "default.css" will overwrite the builtin 68 | # "default.css". 69 | html_static_path = ["_static"] 70 | 71 | # Output file base name for HTML help builder. 72 | htmlhelp_basename = "parseldoc" 73 | 74 | 75 | # -- Options for LaTeX output ------------------------------------------ 76 | 77 | latex_elements = {} 78 | 79 | # Grouping the document tree into LaTeX files. List of tuples 80 | # (source start file, target name, title, author, documentclass 81 | # [howto/manual]).
82 | latex_documents = [ 83 | ( 84 | "index", 85 | "parsel.tex", 86 | "Parsel Documentation", 87 | "Scrapy Project", 88 | "manual", 89 | ), 90 | ] 91 | 92 | 93 | # -- Options for manual page output ------------------------------------ 94 | 95 | # One entry per manual page. List of tuples 96 | # (source start file, name, description, authors, manual section). 97 | man_pages = [ 98 | ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1), 99 | ] 100 | 101 | # -- Options for Texinfo output ---------------------------------------- 102 | 103 | # Grouping the document tree into Texinfo files. List of tuples 104 | # (source start file, target name, title, author, 105 | # dir menu entry, description, category) 106 | texinfo_documents = [ 107 | ( 108 | "index", 109 | "parsel", 110 | "Parsel Documentation", 111 | "Scrapy Project", 112 | "parsel", 113 | "One line description of project.", 114 | "Miscellaneous", 115 | ), 116 | ] 117 | 118 | 119 | # -- Options for the InterSphinx extension ------------------------------------ 120 | 121 | intersphinx_mapping = { 122 | "cssselect": ("https://cssselect.readthedocs.io/en/latest", None), 123 | "python": ("https://docs.python.org/3", None), 124 | "requests": ("https://requests.kennethreitz.org/en/latest", None), 125 | "lxml": ("https://lxml.de/apidoc/", None), 126 | } 127 | 128 | 129 | # --- Nitpicking options ------------------------------------------------------ 130 | 131 | # nitpicky = True # https://github.com/scrapy/cssselect/pull/110 132 | nitpick_ignore = [ 133 | ("py:class", "ExpressionError"), 134 | ("py:class", "SelectorSyntaxError"), 135 | ("py:class", "cssselect.xpath.GenericTranslator"), 136 | ("py:class", "cssselect.xpath.HTMLTranslator"), 137 | ("py:class", "cssselect.xpath.XPathExpr"), 138 | ("py:class", "lxml.etree.XMLParser"), 139 | ] 140 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | from doctest import ELLIPSIS, NORMALIZE_WHITESPACE 2 | from pathlib import Path 3 | 4 | from sybil import Sybil 5 | 6 | try: 7 | from sybil.parsers.codeblock import PythonCodeBlockParser 8 | except ImportError: 9 | from sybil.parsers.codeblock import ( 10 | CodeBlockParser as PythonCodeBlockParser, 11 | ) 12 | from sybil.parsers.doctest import DocTestParser 13 | from sybil.parsers.skip import skip 14 | 15 | from parsel import Selector 16 | 17 | 18 | def load_selector(filename, **kwargs): 19 | input_path = Path(__file__).parent / "_static" / filename 20 | return Selector(text=input_path.read_text(encoding="utf-8"), **kwargs) 21 | 22 | 23 | def setup(namespace): 24 | namespace["load_selector"] = load_selector 25 | 26 | 27 | pytest_collect_file = Sybil( 28 | parsers=[ 29 | DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), 30 | PythonCodeBlockParser(future_imports=["print_function"]), 31 | skip, 32 | ], 33 | pattern="*.rst", 34 | setup=setup, 35 | ).pytest() 36 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../NEWS 2 | 3 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. parsel documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: ../README.rst 7 | 8 | Parsel Documentation Contents 9 | ============================= 10 | 11 | Contents: 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | installation 17 | usage 18 | parsel 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | --------------------------------------------------------------------------------
/docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | To install Parsel, we recommend using `pip <https://pip.pypa.io/en/latest/>`_:: 6 | 7 | $ pip install parsel 8 | 9 | You `probably shouldn't 10 | <https://packaging.python.org/discussions/pip-vs-easy-install/>`_, 11 | but you can also install it with easy_install:: 12 | 13 | $ easy_install parsel 14 | --------------------------------------------------------------------------------
/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\parsel.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\parsel.qhc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/parsel.rst: -------------------------------------------------------------------------------- 1 | API reference 2 | ============= 3 | 4 | parsel.csstranslator 5 | -------------------- 6 | 7 | .. automodule:: parsel.csstranslator 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | 13 | .. _topics-selectors-ref: 14 | 15 | parsel.selector 16 | --------------- 17 | 18 | .. automodule:: parsel.selector 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | parsel.utils 25 | ------------ 26 | 27 | .. automodule:: parsel.utils 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-notfound-page 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | .. _topics-selectors: 2 | 3 | ===== 4 | Usage 5 | ===== 6 | 7 | Create a :class:`~parsel.selector.Selector` object for your input text. 8 | 9 | For HTML or XML, use `CSS`_ or `XPath`_ expressions to select data:: 10 | 11 | >>> from parsel import Selector 12 | >>> html_text = "
<html><body><h1>Hello, Parsel!</h1></body></html>" 13 | >>> html_selector = Selector(text=html_text) 14 | >>> html_selector.css('h1') 15 | [<Selector query='descendant-or-self::h1' data='<h1>Hello, Parsel!</h1>'>] 16 | >>> html_selector.xpath('//h1') # the same, but now with XPath 17 | [<Selector query='//h1' data='<h1>Hello, Parsel!</h1>'>] 18 | 19 | For JSON, use `JMESPath`_ expressions to select data:: 20 | 21 | >>> json_text = '{"title":"Hello, Parsel!"}' 22 | >>> json_selector = Selector(text=json_text) 23 | >>> json_selector.jmespath('title') 24 | [<Selector query='title' data='Hello, Parsel!'>] 25 | 26 | And extract data from those elements:: 27 | 28 | >>> html_selector.xpath('//h1/text()').get() 29 | 'Hello, Parsel!' 30 | >>> json_selector.jmespath('title').getall() 31 | ['Hello, Parsel!'] 32 | 33 | .. _CSS: https://www.w3.org/TR/selectors 34 | .. _XPath: https://www.w3.org/TR/xpath 35 | .. _JMESPath: https://jmespath.org/ 36 | 37 | Learning expression languages 38 | ============================= 39 | 40 | `CSS`_ is a language for applying styles to HTML documents. It defines 41 | selectors to associate those styles with specific HTML elements. Resources to 42 | learn CSS_ selectors include: 43 | 44 | - `CSS selectors in the MDN`_ 45 | 46 | - `XPath/CSS Equivalents in Wikibooks`_ 47 | 48 | Parsel support for CSS selectors comes from cssselect, so read about `CSS 49 | selectors supported by cssselect`_. 50 | 51 | .. _CSS selectors supported by cssselect: https://cssselect.readthedocs.io/en/latest/#supported-selectors 52 | 53 | `XPath`_ is a language for selecting nodes in XML documents, which can also be 54 | used with HTML. Resources to learn XPath_ include: 55 | 56 | - `XPath Tutorial in W3Schools`_ 57 | 58 | - `XPath cheatsheet`_ 59 | 60 | For HTML and XML input, you can use either CSS_ or XPath_. CSS_ is usually 61 | more readable, but some things can only be done with XPath_. 62 | 63 | JMESPath_ allows you to declaratively specify how to extract elements from 64 | a JSON document. Resources to learn JMESPath_ include: 65 | 66 | - `JMESPath Tutorial`_ 67 | 68 | - `JMESPath Specification`_ 69 | 70 | .. _CSS selectors in the MDN: https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors 71 | .. _XPath cheatsheet: https://devhints.io/xpath 72 | .. _XPath Tutorial in W3Schools: https://www.w3schools.com/xml/xpath_intro.asp 73 | .. _XPath/CSS Equivalents in Wikibooks: https://en.wikibooks.org/wiki/XPath/CSS_Equivalents 74 | .. _JMESPath Tutorial: https://jmespath.org/tutorial.html 75 | .. _JMESPath Specification: https://jmespath.org/specification.html 76 | 77 | 78 | Using selectors 79 | =============== 80 | 81 | To explain how to use the selectors we'll use the :mod:`requests` library 82 | to download an example page located in Parsel's documentation: 83 | 84 | https://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html 85 | 86 | .. _topics-selectors-htmlcode: 87 | 88 | For the sake of completeness, here's its full HTML code: 89 | 90 | .. literalinclude:: _static/selectors-sample1.html 91 | :language: html 92 | 93 | .. highlight:: python 94 | 95 | So, let's download that page and create a selector for it: 96 | 97 | .. skip: start 98 | 99 | >>> import requests 100 | >>> from parsel import Selector 101 | >>> url = 'https://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html' 102 | >>> text = requests.get(url).text 103 | >>> selector = Selector(text=text) 104 | 105 | .. skip: end 106 | 107 | .. invisible-code-block: python 108 | 109 | selector = load_selector('selectors-sample1.html') 110 | 111 | Since we're dealing with HTML, the default type for Selector, we don't need 112 | to specify the ``type`` argument.
113 | 114 | So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that 115 | page, let's construct an XPath for selecting the text inside the title tag:: 116 | 117 | >>> selector.xpath('//title/text()') 118 | [<Selector query='//title/text()' data='Example website'>] 119 | 120 | You can also ask the same thing using CSS instead:: 121 | 122 | >>> selector.css('title::text') 123 | [<Selector query='descendant-or-self::title/text()' data='Example website'>] 124 | 125 | To actually extract the textual data, you must call the selector ``.get()`` 126 | or ``.getall()`` methods, as follows:: 127 | 128 | >>> selector.xpath('//title/text()').getall() 129 | ['Example website'] 130 | >>> selector.xpath('//title/text()').get() 131 | 'Example website' 132 | 133 | ``.get()`` always returns a single result; if there are several matches, 134 | the content of the first match is returned; if there are no matches, ``None`` 135 | is returned. ``.getall()`` returns a list with all results. 136 | 137 | Notice that CSS selectors can select text or attribute nodes using CSS3 138 | pseudo-elements:: 139 | 140 | >>> selector.css('title::text').get() 141 | 'Example website' 142 | 143 | As you can see, the ``.xpath()`` and ``.css()`` methods return a 144 | :class:`~parsel.selector.SelectorList` instance, which is a list of new 145 | selectors. This API can be used for quickly selecting nested data:: 146 | 147 | >>> selector.css('img').xpath('@src').getall() 148 | ['image1_thumb.jpg', 149 | 'image2_thumb.jpg', 150 | 'image3_thumb.jpg', 151 | 'image4_thumb.jpg', 152 | 'image5_thumb.jpg'] 153 | 154 | If you want to extract only the first matched element, you can call the 155 | selector ``.get()`` (or its alias ``.extract_first()`` commonly used in 156 | previous parsel versions):: 157 | 158 | >>> selector.xpath('//div[@id="images"]/a/text()').get() 159 | 'Name: My image 1 ' 160 | 161 | It returns ``None`` if no element was found:: 162 | 163 | >>> selector.xpath('//div[@id="not-exists"]/text()').get() is None 164 | True 165 | 166 | Instead of using e.g. ``'@src'`` XPath it is possible to query for attributes 167 | using the ``.attrib`` property of a :class:`~parsel.selector.Selector`:: 168 | 169 | >>> [img.attrib['src'] for img in selector.css('img')] 170 | ['image1_thumb.jpg', 171 | 'image2_thumb.jpg', 172 | 'image3_thumb.jpg', 173 | 'image4_thumb.jpg', 174 | 'image5_thumb.jpg'] 175 | 176 | As a shortcut, ``.attrib`` is also available on SelectorList directly; 177 | it returns attributes for the first matching element:: 178 | 179 | >>> selector.css('img').attrib['src'] 180 | 'image1_thumb.jpg' 181 | 182 | This is most useful when only a single result is expected, e.g.
when selecting 183 | by id, or selecting unique elements on a web page:: 184 | 185 | >>> selector.css('base').attrib['href'] 186 | 'http://example.com/' 187 | 188 | Now we're going to get the base URL and some image links:: 189 | 190 | >>> selector.xpath('//base/@href').get() 191 | 'http://example.com/' 192 | 193 | >>> selector.css('base::attr(href)').get() 194 | 'http://example.com/' 195 | 196 | >>> selector.css('base').attrib['href'] 197 | 'http://example.com/' 198 | 199 | >>> selector.xpath('//a[contains(@href, "image")]/@href').getall() 200 | ['image1.html', 201 | 'image2.html', 202 | 'image3.html', 203 | 'image4.html', 204 | 'image5.html'] 205 | 206 | >>> selector.css('a[href*=image]::attr(href)').getall() 207 | ['image1.html', 208 | 'image2.html', 209 | 'image3.html', 210 | 'image4.html', 211 | 'image5.html'] 212 | 213 | >>> selector.xpath('//a[contains(@href, "image")]/img/@src').getall() 214 | ['image1_thumb.jpg', 215 | 'image2_thumb.jpg', 216 | 'image3_thumb.jpg', 217 | 'image4_thumb.jpg', 218 | 'image5_thumb.jpg'] 219 | 220 | >>> selector.css('a[href*=image] img::attr(src)').getall() 221 | ['image1_thumb.jpg', 222 | 'image2_thumb.jpg', 223 | 'image3_thumb.jpg', 224 | 'image4_thumb.jpg', 225 | 'image5_thumb.jpg'] 226 | 227 | .. _topics-selectors-css-extensions: 228 | 229 | Extensions to CSS Selectors 230 | --------------------------- 231 | 232 | Per W3C standards, `CSS selectors`_ do not support selecting text nodes 233 | or attribute values. 234 | But selecting these is so essential in a web scraping context 235 | that Parsel implements a couple of **non-standard pseudo-elements**: 236 | 237 | * to select text nodes, use ``::text`` 238 | * to select attribute values, use ``::attr(name)`` where *name* is the 239 | name of the attribute that you want the value of 240 | 241 | .. warning:: 242 | These pseudo-elements are Scrapy-/Parsel-specific. 243 | They will most probably not work with other libraries like `lxml`_ or `PyQuery`_. 244 | 245 | 246 | Examples: 247 | 248 | * ``title::text`` selects children text nodes of a descendant ```` element:: 249 | 250 | >>> selector.css('title::text').get() 251 | 'Example website' 252 | 253 | * ``*::text`` selects all descendant text nodes of the current selector context:: 254 | 255 | >>> selector.css('#images *::text').getall() 256 | ['\n ', 257 | 'Name: My image 1 ', 258 | '\n ', 259 | 'Name: My image 2 ', 260 | '\n ', 261 | 'Name: My image 3 ', 262 | '\n ', 263 | 'Name: My image 4 ', 264 | '\n ', 265 | 'Name: My image 5 ', 266 | '\n '] 267 | 268 | * ``a::attr(href)`` selects the *href* attribute value of descendant links:: 269 | 270 | >>> selector.css('a::attr(href)').getall() 271 | ['image1.html', 272 | 'image2.html', 273 | 'image3.html', 274 | 'image4.html', 275 | 'image5.html'] 276 | 277 | .. note:: 278 | You cannot chain these pseudo-elements. But in practice it would not 279 | make much sense: text nodes do not have attributes, and attribute values 280 | are string values already and do not have children nodes. 281 | 282 | .. note:: 283 | See also: :ref:`selecting-attributes`. 284 | 285 | 286 | .. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors 287 | 288 | .. _topics-selectors-nesting-selectors: 289 | 290 | Nesting selectors 291 | ----------------- 292 | 293 | The selection methods (``.xpath()`` or ``.css()``) return a list of selectors 294 | of the same type, so you can call the selection methods for those selectors 295 | too. 
Here's an example:: 296 | 297 | >>> links = selector.xpath('//a[contains(@href, "image")]') 298 | >>> links.getall() 299 | ['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>', 300 | '<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>', 301 | '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>', 302 | '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>', 303 | '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>'] 304 | 305 | >>> for index, link in enumerate(links): 306 | ... args = (index, link.xpath('@href').get(), link.xpath('img/@src').get()) 307 | ... print('Link number %d points to url %r and image %r' % args) 308 | Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg' 309 | Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg' 310 | Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg' 311 | Link number 3 points to url 'image4.html' and image 'image4_thumb.jpg' 312 | Link number 4 points to url 'image5.html' and image 'image5_thumb.jpg' 313 | 314 | .. _selecting-attributes: 315 | 316 | Selecting element attributes 317 | ---------------------------- 318 | 319 | There are several ways to get the value of an attribute. First, one can use 320 | XPath syntax:: 321 | 322 | >>> selector.xpath("//a/@href").getall() 323 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 324 | 325 | XPath syntax has a few advantages: it is a standard XPath feature, and 326 | ``@attributes`` can be used in other parts of an XPath expression - e.g. 327 | it is possible to filter by attribute value. 328 | 329 | parsel also provides an extension to CSS selectors (``::attr(...)``) 330 | which allows getting attribute values:: 331 | 332 | >>> selector.css('a::attr(href)').getall() 333 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 334 | 335 | In addition to that, there is a ``.attrib`` property of Selector. 336 | You can use it if you prefer to look up attributes in Python 337 | code, without using XPaths or CSS extensions:: 338 | 339 | >>> [a.attrib['href'] for a in selector.css('a')] 340 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 341 | 342 | This property is also available on SelectorList; it returns a dictionary 343 | with attributes of the first matching element. It is convenient to use when 344 | a selector is expected to give a single result (e.g. when selecting by element 345 | ID, or when selecting a unique element on a page):: 346 | 347 | >>> selector.css('base').attrib 348 | {'href': 'http://example.com/'} 349 | >>> selector.css('base').attrib['href'] 350 | 'http://example.com/' 351 | 352 | The ``.attrib`` property of an empty SelectorList is empty:: 353 | 354 | >>> selector.css('foo').attrib 355 | {} 356 | 357 | Using selectors with regular expressions 358 | ---------------------------------------- 359 | 360 | :class:`~parsel.selector.Selector` also has a ``.re()`` method for extracting 361 | data using regular expressions. However, unlike using ``.xpath()`` or 362 | ``.css()`` methods, ``.re()`` returns a list of strings. So you 363 | can't construct nested ``.re()`` calls.
364 | 
365 | Here's an example used to extract image names from the :ref:`HTML code
366 | <topics-selectors-htmlcode>` above::
367 | 
368 |     >>> selector.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
369 |     ['My image 1 ',
370 |      'My image 2 ',
371 |      'My image 3 ',
372 |      'My image 4 ',
373 |      'My image 5 ']
374 | 
375 | There's an additional helper for ``.re()``, named ``.re_first()``, which plays
376 | the same role as ``.get()`` (and its alias ``.extract_first()``).
377 | Use it to extract just the first matching string::
378 | 
379 |     >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
380 |     'My image 1 '
381 | 
382 | .. _topics-selectors-relative-xpaths:
383 | 
384 | Working with relative XPaths
385 | ----------------------------
386 | 
387 | Keep in mind that if you are nesting selectors and use an XPath that starts
388 | with ``/``, that XPath will be absolute to the document and not relative to the
389 | selector you're calling it from.
390 | 
391 | For example, suppose you want to extract all ``<p>`` elements inside ``<div>``
392 | elements. First, you would get all ``<div>`` elements::
393 | 
394 |     >>> divs = selector.xpath('//div')
395 | 
396 | At first, you may be tempted to use the following approach, which is wrong, as
397 | it actually extracts all ``<p>`` elements from the document, not only those
398 | inside ``<div>`` elements::
399 | 
400 |     >>> for p in divs.xpath('//p'):  # this is wrong - gets all <p> from the whole document
401 |     ...     print(p.get())
402 | 
403 | This is the proper way to do it (note the dot prefixing the ``.//p`` XPath)::
404 | 
405 |     >>> for p in divs.xpath('.//p'):  # extracts all <p> inside
406 |     ...     print(p.get())
407 | 
408 | Another common case would be to extract all direct ``<p>`` children::
409 | 
410 |     >>> for p in divs.xpath('p'):
411 |     ...     print(p.get())
412 | 
413 | For more details about relative XPaths see the `Location Paths`_ section in the
414 | XPath specification.
415 | 
416 | .. _Location Paths: https://www.w3.org/TR/xpath#location-paths
417 | 
418 | 
419 | Removing elements
420 | -----------------
421 | 
422 | If for any reason you need to remove elements based on a Selector or
423 | a SelectorList, you can do it with the ``drop()`` method, available for both
424 | classes.
425 | 
426 | .. warning:: This is a destructive action and cannot be undone. The original
427 |    content of the selector is removed from the element tree. This could be useful
428 |    when trying to reduce the memory footprint of Responses.
429 | 
430 | Example removing an ad from a blog post:
431 | 
432 |     >>> from parsel import Selector
433 |     >>> doc = """
434 |     ... <article>
435 |     ...     <div class="row">Content paragraph...</div>
436 |     ...     <div class="row">
437 |     ...         <div class="ad">
438 |     ...             Ad content...
439 |     ...             <a href="http://...">Link</a>
440 |     ...         </div>
441 |     ...     </div>
442 |     ...     <div class="row">More content...</div>
443 |     ... </article>
444 |     ... 
""" 445 | >>> sel = Selector(text=doc) 446 | >>> sel.xpath('//div/text()').getall() 447 | ['Content paragraph...', '\n ', '\n Ad content...\n ', '\n ', '\n ', 'More content...'] 448 | >>> sel.xpath('//div[@class="ad"]').drop() 449 | >>> sel.xpath('//div//text()').getall() 450 | ['Content paragraph...', 'More content...'] 451 | 452 | 453 | Using EXSLT extensions 454 | ---------------------- 455 | 456 | Being built atop `lxml`_, parsel selectors support some `EXSLT`_ extensions 457 | and come with these pre-registered namespaces to use in XPath expressions: 458 | 459 | 460 | ====== ===================================== ======================= 461 | prefix namespace usage 462 | ====== ===================================== ======================= 463 | re \http://exslt.org/regular-expressions `regular expressions`_ 464 | set \http://exslt.org/sets `set manipulation`_ 465 | ====== ===================================== ======================= 466 | 467 | Regular expressions 468 | ~~~~~~~~~~~~~~~~~~~ 469 | 470 | The ``test()`` function, for example, can prove quite useful when XPath's 471 | ``starts-with()`` or ``contains()`` are not sufficient. 472 | 473 | Example selecting links in list item with a "class" attribute ending with a digit:: 474 | 475 | >>> from parsel import Selector 476 | >>> doc = """ 477 | ... <div> 478 | ... <ul> 479 | ... <li class="item-0"><a href="link1.html">first item</a></li> 480 | ... <li class="item-1"><a href="link2.html">second item</a></li> 481 | ... <li class="item-inactive"><a href="link3.html">third item</a></li> 482 | ... <li class="item-1"><a href="link4.html">fourth item</a></li> 483 | ... <li class="item-0"><a href="link5.html">fifth item</a></li> 484 | ... </ul> 485 | ... </div> 486 | ... """ 487 | >>> sel = Selector(text=doc) 488 | >>> sel.xpath('//li//@href').getall() 489 | ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html'] 490 | >>> sel.xpath(r'//li[re:test(@class, "item-\d$")]//@href').getall() 491 | ['link1.html', 'link2.html', 'link4.html', 'link5.html'] 492 | >>> 493 | 494 | .. warning:: C library ``libxslt`` doesn't natively support EXSLT regular 495 | expressions so `lxml`_'s implementation uses hooks to Python's ``re`` module. 496 | Thus, using regexp functions in your XPath expressions may add a small 497 | performance penalty. 498 | 499 | Set operations 500 | ~~~~~~~~~~~~~~ 501 | 502 | These can be handy for excluding parts of a document tree before 503 | extracting text elements for example. 504 | 505 | Example extracting microdata (sample content taken from http://schema.org/Product) 506 | with groups of itemscopes and corresponding itemprops:: 507 | 508 | >>> doc = """ 509 | ... <div itemscope itemtype="http://schema.org/Product"> 510 | ... <span itemprop="name">Kenmore White 17" Microwave</span> 511 | ... <img src="kenmore-microwave-17in.jpg" alt='Kenmore 17" Microwave' /> 512 | ... <div itemprop="aggregateRating" 513 | ... itemscope itemtype="http://schema.org/AggregateRating"> 514 | ... Rated <span itemprop="ratingValue">3.5</span>/5 515 | ... based on <span itemprop="reviewCount">11</span> customer reviews 516 | ... </div> 517 | ... 518 | ... <div itemprop="offers" itemscope itemtype="http://schema.org/Offer"> 519 | ... <span itemprop="price">$55.00</span> 520 | ... <link itemprop="availability" href="http://schema.org/InStock" />In stock 521 | ... </div> 522 | ... 523 | ... Product description: 524 | ... <span itemprop="description">0.7 cubic feet countertop microwave. 525 | ... 
Has six preset cooking categories and convenience features like 526 | ... Add-A-Minute and Child Lock.</span> 527 | ... 528 | ... Customer reviews: 529 | ... 530 | ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> 531 | ... <span itemprop="name">Not a happy camper</span> - 532 | ... by <span itemprop="author">Ellie</span>, 533 | ... <meta itemprop="datePublished" content="2011-04-01">April 1, 2011 534 | ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> 535 | ... <meta itemprop="worstRating" content = "1"> 536 | ... <span itemprop="ratingValue">1</span>/ 537 | ... <span itemprop="bestRating">5</span>stars 538 | ... </div> 539 | ... <span itemprop="description">The lamp burned out and now I have to replace 540 | ... it. </span> 541 | ... </div> 542 | ... 543 | ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> 544 | ... <span itemprop="name">Value purchase</span> - 545 | ... by <span itemprop="author">Lucas</span>, 546 | ... <meta itemprop="datePublished" content="2011-03-25">March 25, 2011 547 | ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> 548 | ... <meta itemprop="worstRating" content = "1"/> 549 | ... <span itemprop="ratingValue">4</span>/ 550 | ... <span itemprop="bestRating">5</span>stars 551 | ... </div> 552 | ... <span itemprop="description">Great microwave for the price. It is small and 553 | ... fits in my apartment.</span> 554 | ... </div> 555 | ... ... 556 | ... </div> 557 | ... """ 558 | >>> sel = Selector(text=doc, type="html") 559 | >>> for scope in sel.xpath('//div[@itemscope]'): 560 | ... print("current scope:", scope.xpath('@itemtype').getall()) 561 | ... props = scope.xpath(''' 562 | ... set:difference(./descendant::*/@itemprop, 563 | ... .//*[@itemscope]/*/@itemprop)''') 564 | ... print(" properties: %s" % (props.getall())) 565 | ... print("") 566 | current scope: ['http://schema.org/Product'] 567 | properties: ['name', 'aggregateRating', 'offers', 'description', 'review', 'review'] 568 | <BLANKLINE> 569 | current scope: ['http://schema.org/AggregateRating'] 570 | properties: ['ratingValue', 'reviewCount'] 571 | <BLANKLINE> 572 | current scope: ['http://schema.org/Offer'] 573 | properties: ['price', 'availability'] 574 | <BLANKLINE> 575 | current scope: ['http://schema.org/Review'] 576 | properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] 577 | <BLANKLINE> 578 | current scope: ['http://schema.org/Rating'] 579 | properties: ['worstRating', 'ratingValue', 'bestRating'] 580 | <BLANKLINE> 581 | current scope: ['http://schema.org/Review'] 582 | properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] 583 | <BLANKLINE> 584 | current scope: ['http://schema.org/Rating'] 585 | properties: ['worstRating', 'ratingValue', 'bestRating'] 586 | 587 | 588 | Here we first iterate over ``itemscope`` elements, and for each one, 589 | we look for all ``itemprops`` elements and exclude those that are themselves 590 | inside another ``itemscope``. 591 | 592 | .. _EXSLT: http://exslt.org/ 593 | .. _regular expressions: http://exslt.org/regexp/index.html 594 | .. _set manipulation: http://exslt.org/set/index.html 595 | 596 | .. 
_topics-xpath-other-extensions:
597 | 
598 | Other XPath extensions
599 | ----------------------
600 | 
601 | Parsel also defines a sorely missed XPath extension function ``has-class`` that
602 | returns ``True`` for nodes that have all of the specified HTML classes::
603 | 
604 |     >>> from parsel import Selector
605 |     >>> sel = Selector("""
606 |     ... <p class="foo bar-baz">First</p>
607 |     ... <p class="foo">Second</p>
608 |     ... <p class="bar">Third</p>
609 |     ... <p>Fourth</p>
610 |     ... """)
611 |     ...
612 |     >>> sel.xpath('//p[has-class("foo")]')
613 |     [<Selector query='//p[has-class("foo")]' data='<p class="foo bar-baz">First</p>'>,
614 |      <Selector query='//p[has-class("foo")]' data='<p class="foo">Second</p>'>]
615 |     >>> sel.xpath('//p[has-class("foo", "bar-baz")]')
616 |     [<Selector query='//p[has-class("foo", "bar-baz")]' data='<p class="foo bar-baz">First</p>'>]
617 |     >>> sel.xpath('//p[has-class("foo", "bar")]')
618 |     []
619 | 
620 | So XPath ``//p[has-class("foo", "bar-baz")]`` is roughly equivalent to CSS
621 | ``p.foo.bar-baz``. Note, however, that it is slower in most cases,
622 | because it's a pure-Python function that's invoked for every node in question,
623 | whereas the CSS lookup is translated into XPath and thus runs more efficiently.
624 | Performance-wise, its use is limited to situations that are not easily
625 | described with CSS selectors.
626 | 
627 | Parsel also simplifies adding your own XPath extensions.
628 | 
629 | .. autofunction:: parsel.xpathfuncs.set_xpathfunc
630 | 
631 | 
632 | 
633 | Some XPath tips
634 | ---------------
635 | 
636 | Here are some tips that you may find useful when using XPath
637 | with Parsel, based on `this post from Zyte's blog`_.
638 | If you are not very familiar with XPath yet,
639 | you may want to take a look first at this `XPath tutorial`_.
640 | 
641 | 
642 | .. _`XPath tutorial`: http://www.zvon.org/comp/r/tut-XPath_1.html
643 | .. _`this post from Zyte's blog`: https://www.zyte.com/blog/xpath-tips-from-the-web-scraping-trenches/
644 | 
645 | 
646 | Using text nodes in a condition
647 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
648 | 
649 | When you need to use the text content as an argument to an `XPath string function`_,
650 | avoid using ``.//text()`` and use just ``.`` instead.
651 | 
652 | This is because the expression ``.//text()`` yields a collection of text elements -- a *node-set*.
653 | And when a node-set is converted to a string, which happens when it is passed as an argument to
654 | a string function like ``contains()`` or ``starts-with()``, it results in the text for the first element only.
655 | 
656 | Example::
657 | 
658 |     >>> from parsel import Selector
659 |     >>> sel = Selector(text='<a href="#">Click here to go to the <strong>Next Page</strong></a>')
660 | 
661 | Converting a *node-set* to string::
662 | 
663 |     >>> sel.xpath('//a//text()').getall()  # take a peek at the node-set
664 |     ['Click here to go to the ', 'Next Page']
665 |     >>> sel.xpath("string(//a[1]//text())").getall()  # convert it to string
666 |     ['Click here to go to the ']
667 | 
668 | A *node* converted to a string, however, puts together its own text plus the text of all its descendants::
669 | 
670 |     >>> sel.xpath("//a[1]").getall()  # select the first node
671 |     ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
672 |     >>> sel.xpath("string(//a[1])").getall()  # convert it to string
673 |     ['Click here to go to the Next Page']
674 | 
675 | So, using the ``.//text()`` node-set won't select anything in this case::
676 | 
677 |     >>> sel.xpath("//a[contains(.//text(), 'Next Page')]").getall()
678 |     []
679 | 
680 | But using ``.`` to mean the node works::
681 | 
682 |     >>> sel.xpath("//a[contains(., 'Next Page')]").getall()
683 |     ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
684 | 
685 | .. _`XPath string function`: https://www.w3.org/TR/xpath/#section-String-Functions
686 | 
687 | Beware of the difference between //node[1] and (//node)[1]
688 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
689 | 
690 | ``//node[1]`` selects all the nodes occurring first under their respective parents.
691 | 
692 | ``(//node)[1]`` selects all the nodes in the document, and then gets only the first of them.
693 | 
694 | Example::
695 | 
696 |     >>> from parsel import Selector
697 |     >>> sel = Selector(text="""
698 |     ...     <ul class="list">
699 |     ...         <li>1</li>
700 |     ...         <li>2</li>
701 |     ...         <li>3</li>
702 |     ...     </ul>
703 |     ...     <ul class="list">
704 |     ...         <li>4</li>
705 |     ...         <li>5</li>
706 |     ...         <li>6</li>
707 |     ...     </ul>""")
708 |     >>> xp = lambda x: sel.xpath(x).getall()
709 | 
710 | This gets all first ``<li>`` elements under their respective parents::
711 | 
712 |     >>> xp("//li[1]")
713 |     ['<li>1</li>', '<li>4</li>']
714 | 
715 | And this gets the first ``<li>`` element in the whole document::
716 | 
717 |     >>> xp("(//li)[1]")
718 |     ['<li>1</li>']
719 | 
720 | This gets all first ``<li>`` elements under an ``<ul>`` parent::
721 | 
722 |     >>> xp("//ul/li[1]")
723 |     ['<li>1</li>', '<li>4</li>']
724 | 
725 | And this gets the first ``<li>`` element under an ``<ul>`` parent in the whole document::
726 | 
727 |     >>> xp("(//ul/li)[1]")
728 |     ['<li>1</li>']
729 | 
730 | When querying by class, consider using CSS
731 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
732 | 
733 | Because an element can contain multiple CSS classes, the XPath way to select elements
734 | by class is rather verbose::
735 | 
736 |     *[contains(concat(' ', normalize-space(@class), ' '), ' someclass ')]
737 | 
738 | If you use ``@class='someclass'`` you may end up missing elements that have
739 | other classes, and if you just use ``contains(@class, 'someclass')`` to make up
740 | for that you may end up with more elements than you want, if they have a different
741 | class name that shares the string ``someclass``.
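For instance, ``contains(@class, 'someclass')`` happily matches an element
whose class is ``someclass-wide``. A minimal sketch with hypothetical markup::

    >>> from parsel import Selector
    >>> sel = Selector(text='<p class="someclass-wide">oops</p>')
    >>> sel.xpath("//p[contains(@class, 'someclass')]").getall()
    ['<p class="someclass-wide">oops</p>']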
742 | 743 | As it turns out, parsel selectors allow you to chain selectors, so most of the time 744 | you can just select by class using CSS and then switch to XPath when needed:: 745 | 746 | >>> from parsel import Selector 747 | >>> sel = Selector(text='<div class="hero shout"><time datetime="2014-07-23 19:00">Special date</time></div>') 748 | >>> sel.css('.shout').xpath('./time/@datetime').getall() 749 | ['2014-07-23 19:00'] 750 | 751 | This is cleaner than using the verbose XPath trick shown above. Just remember 752 | to use the ``.`` in the XPath expressions that will follow. 753 | 754 | 755 | Beware of how script and style tags differ from other tags 756 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 757 | 758 | `Following the standard`__, the contents of ``script`` and ``style`` elements 759 | are parsed as plain text. 760 | 761 | __ https://www.w3.org/TR/html401/types.html#type-cdata 762 | 763 | This means that XML-like structures found within them, including comments, are 764 | all treated as part of the element text, and not as separate nodes. 765 | 766 | For example:: 767 | 768 | >>> from parsel import Selector 769 | >>> selector = Selector(text=""" 770 | ... <script> 771 | ... text 772 | ... <!-- comment --> 773 | ... <br/> 774 | ... </script> 775 | ... <style> 776 | ... text 777 | ... <!-- comment --> 778 | ... <br/> 779 | ... </style> 780 | ... <div> 781 | ... text 782 | ... <!-- comment --> 783 | ... <br/> 784 | ... </div>""") 785 | >>> for tag in selector.xpath('//*[contains(text(), "text")]'): 786 | ... print(tag.xpath('name()').get()) 787 | ... print(' Text: ' + (tag.xpath('text()').get() or '')) 788 | ... print(' Comment: ' + (tag.xpath('comment()').get() or '')) 789 | ... print(' Children: ' + ''.join(tag.xpath('*').getall())) 790 | ... 791 | script 792 | Text: 793 | text 794 | <!-- comment --> 795 | <br/> 796 | <BLANKLINE> 797 | Comment: 798 | Children: 799 | style 800 | Text: 801 | text 802 | <!-- comment --> 803 | <br/> 804 | <BLANKLINE> 805 | Comment: 806 | Children: 807 | div 808 | Text: 809 | text 810 | <BLANKLINE> 811 | Comment: <!-- comment --> 812 | Children: <br> 813 | 814 | .. _old-extraction-api: 815 | 816 | extract() and extract_first() 817 | ----------------------------- 818 | 819 | If you're a long-time parsel (or Scrapy) user, you're probably familiar 820 | with ``.extract()`` and ``.extract_first()`` selector methods. These methods 821 | are still supported by parsel, there are no plans to deprecate them. 822 | 823 | However, ``parsel`` usage docs are now written using ``.get()`` and 824 | ``.getall()`` methods. We feel that these new methods result in more concise 825 | and readable code. 826 | 827 | The following examples show how these methods map to each other. 828 | 829 | .. invisible-code-block: python 830 | 831 | selector = load_selector('selectors-sample1.html') 832 | 833 | 1. ``SelectorList.get()`` is the same as ``SelectorList.extract_first()``:: 834 | 835 | >>> selector.css('a::attr(href)').get() 836 | 'image1.html' 837 | >>> selector.css('a::attr(href)').extract_first() 838 | 'image1.html' 839 | 840 | 2. ``SelectorList.getall()`` is the same as ``SelectorList.extract()``:: 841 | 842 | >>> selector.css('a::attr(href)').getall() 843 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 844 | >>> selector.css('a::attr(href)').extract() 845 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 846 | 847 | 3. 
``Selector.get()`` is the same as ``Selector.extract()``:: 848 | 849 | >>> selector.css('a::attr(href)')[0].get() 850 | 'image1.html' 851 | >>> selector.css('a::attr(href)')[0].extract() 852 | 'image1.html' 853 | 854 | 4. For consistency, there is also ``Selector.getall()``, which returns a list:: 855 | 856 | >>> selector.css('a::attr(href)')[0].getall() 857 | ['image1.html'] 858 | 859 | With the ``.extract()`` method it was not always obvious if a result is a list 860 | or not; to get a single result either ``.extract()`` or ``.extract_first()`` 861 | needed to be called, depending whether you had a ``Selector`` or ``SelectorList``. 862 | 863 | So, the main difference is that the outputs of ``.get()`` and ``.getall()`` 864 | are more predictable: ``.get()`` always returns a single result, 865 | ``.getall()`` always returns a list of all extracted results. 866 | 867 | 868 | Using CSS selectors in multi-root documents 869 | ------------------------------------------- 870 | 871 | Some webpages may have multiple root elements. It can happen, for example, when 872 | a webpage has broken code, such as missing closing tags. 873 | 874 | .. invisible-code-block: python 875 | 876 | selector = load_selector('multiroot.html') 877 | 878 | You can use XPath to determine if a page has multiple root elements: 879 | 880 | >>> len(selector.xpath('/*')) > 1 881 | True 882 | 883 | CSS selectors only work on the first root element, because the first root 884 | element is always used as the starting current element, and CSS selectors do 885 | not allow selecting parent elements (XPath’s ``..``) or elements relative to 886 | the document root (XPath’s ``/``). 887 | 888 | If you want to use a CSS selector that takes into account all root elements, 889 | you need to precede your CSS query by an XPath query that reaches all root 890 | elements:: 891 | 892 | selector.xpath('/*').css('<your CSS selector>') 893 | 894 | 895 | Command-Line Interface Tools 896 | ============================ 897 | 898 | There are third-party tools that allow using Parsel from the command line: 899 | 900 | - `Parsel CLI <https://github.com/rmax/parsel-cli>`_ allows applying 901 | Parsel selectors to the standard input. For example, you can apply a Parsel 902 | selector to the output of cURL_. 903 | 904 | - `parselcli 905 | <https://github.com/Granitosaurus/parsel-cli>`_ provides an interactive 906 | shell that allows applying Parsel selectors to a remote URL or a local 907 | file. 908 | 909 | .. _cURL: https://curl.haxx.se/ 910 | 911 | 912 | .. _selector-examples-html: 913 | 914 | Examples 915 | ======== 916 | 917 | Working on HTML 918 | --------------- 919 | 920 | Here are some :class:`~parsel.selector.Selector` examples to illustrate 921 | several concepts. In all cases, we assume there is already 922 | a :class:`~parsel.selector.Selector` instantiated with an HTML text like this:: 923 | 924 | sel = Selector(text=html_text) 925 | 926 | 1. Select all ``<h1>`` elements from an HTML text, returning a list of 927 | :class:`~parsel.selector.Selector` objects 928 | (ie. a :class:`~parsel.selector.SelectorList` object):: 929 | 930 | sel.xpath("//h1") 931 | 932 | 2. Extract the text of all ``<h1>`` elements from an HTML text, 933 | returning a list of strings:: 934 | 935 | sel.xpath("//h1").getall() # this includes the h1 tag 936 | sel.xpath("//h1/text()").getall() # this excludes the h1 tag 937 | 938 | 3. 
Iterate over all ``<p>`` tags and print their class attribute:: 939 | 940 | for node in sel.xpath("//p"): 941 | print(node.attrib['class']) 942 | 943 | 944 | .. _selector-examples-xml: 945 | 946 | Working on XML (and namespaces) 947 | ------------------------------- 948 | 949 | Here are some examples to illustrate concepts for 950 | :class:`~parsel.selector.Selector` objects instantiated with an XML text 951 | like this:: 952 | 953 | sel = Selector(text=xml_text, type='xml') 954 | 955 | 1. Select all ``<product>`` elements from an XML text, returning a list 956 | of :class:`~parsel.selector.Selector` objects 957 | (ie. a :class:`~parsel.selector.SelectorList` object):: 958 | 959 | sel.xpath("//product") 960 | 961 | 2. Extract all prices from a `Google Base XML feed`_ which requires registering 962 | a namespace:: 963 | 964 | sel.register_namespace("g", "http://base.google.com/ns/1.0") 965 | sel.xpath("//g:price").getall() 966 | 967 | .. _removing-namespaces: 968 | 969 | Removing namespaces 970 | ~~~~~~~~~~~~~~~~~~~ 971 | 972 | When dealing with scraping projects, it is often quite convenient to get rid of 973 | namespaces altogether and just work with element names, to write more 974 | simple/convenient XPaths. You can use the 975 | :meth:`Selector.remove_namespaces <parsel.selector.Selector.remove_namespaces>` 976 | method for that. 977 | 978 | Let's show an example that illustrates this with the Python Insider blog atom feed. 979 | 980 | Let's download the atom feed using :mod:`requests` and create a selector: 981 | 982 | .. skip: start 983 | 984 | >>> import requests 985 | >>> from parsel import Selector 986 | >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text 987 | >>> sel = Selector(text=text, type='xml') 988 | 989 | .. skip: end 990 | 991 | .. invisible-code-block: python 992 | 993 | sel = load_selector('python-insider.xml', type='xml') 994 | 995 | This is how the file starts: 996 | 997 | .. code-block:: xml 998 | 999 | <?xml version="1.0" encoding="UTF-8"?> 1000 | <?xml-stylesheet ... ?> 1001 | <feed xmlns="http://www.w3.org/2005/Atom" 1002 | xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/" 1003 | xmlns:blogger="http://schemas.google.com/blogger/2008" 1004 | xmlns:georss="http://www.georss.org/georss" 1005 | xmlns:gd="http://schemas.google.com/g/2005" 1006 | xmlns:thr="http://purl.org/syndication/thread/1.0" 1007 | xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> 1008 | ... 1009 | </feed> 1010 | 1011 | You can see several namespace declarations including a default 1012 | "http://www.w3.org/2005/Atom" and another one using the "gd:" prefix for 1013 | "http://schemas.google.com/g/2005". 1014 | 1015 | We can try selecting all ``<link>`` objects and then see that it doesn't work 1016 | (because the Atom XML namespace is obfuscating those nodes):: 1017 | 1018 | >>> sel.xpath("//link") 1019 | [] 1020 | 1021 | But once we call the :meth:`Selector.remove_namespaces 1022 | <parsel.selector.Selector.remove_namespaces>` method, all nodes can be accessed 1023 | directly by their names:: 1024 | 1025 | >>> sel.remove_namespaces() 1026 | >>> sel.xpath("//link") 1027 | [<Selector query='//link' data='<link rel="alternate" type="text/html...'>, 1028 | <Selector query='//link' data='<link rel="next" type="application/at...'>, 1029 | ...] 
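Note that once the namespaces have been removed, the original namespace-qualified
queries no longer match anything, since the element tags themselves were stripped
of their namespaces; a quick check against the same ``sel`` object::

    >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
    []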
1030 | 
1031 | If you wonder why the namespace removal procedure isn't simply performed by default
1032 | instead of having to be called manually, this is for two reasons which, in order
1033 | of relevance, are:
1034 | 
1035 | 1. Removing namespaces requires iterating over and modifying all nodes in the
1036 |    document, which is a reasonably expensive operation to perform by default
1037 |    for all documents.
1038 | 
1039 | 2. There could be some cases where using namespaces is actually required, in
1040 |    case some element names clash between namespaces. These cases are very rare
1041 |    though.
1042 | 
1043 | .. _Google Base XML feed: https://support.google.com/merchants/answer/160589?hl=en&ref_topic=2473799
1044 | .. _requests: https://www.python-requests.org/
1045 | 
1046 | 
1047 | Ad-hoc namespace references
1048 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
1049 | 
1050 | :class:`~parsel.selector.Selector` objects also allow passing namespace
1051 | references along with the query, through a ``namespaces`` argument,
1052 | with the prefixes you declare being used in your XPath or CSS query.
1053 | 
1054 | Let's use the same Python Insider Atom feed:
1055 | 
1056 | .. skip: start
1057 | 
1058 | >>> import requests
1059 | >>> from parsel import Selector
1060 | >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
1061 | >>> sel = Selector(text=text, type='xml')
1062 | 
1063 | .. skip: end
1064 | 
1065 | .. invisible-code-block: python
1066 | 
1067 |     sel = load_selector('python-insider.xml', type='xml')
1068 | 
1069 | And try to select the links again, now using an "atom:" prefix
1070 | for the "link" node test::
1071 | 
1072 |     >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
1073 |     [<Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1074 |      <Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1075 |      ...]
1076 | 
1077 | You can pass several namespaces (here we're using shorter 1-letter prefixes)::
1078 | 
1079 |     >>> sel.xpath("//a:entry/a:author/g:image/@src",
1080 |     ...           namespaces={"a": "http://www.w3.org/2005/Atom",
1081 |     ...                       "g": "http://schemas.google.com/g/2005"}).getall()
1082 |     ['https://img1.blogblog.com/img/b16-rounded.gif',
1083 |      'https://img1.blogblog.com/img/b16-rounded.gif',
1084 |      ...]
1085 | 
1086 | .. _topics-xpath-variables:
1087 | 
1088 | Variables in XPath expressions
1089 | ------------------------------
1090 | 
1091 | XPath allows you to reference variables in your XPath expressions, using
1092 | the ``$somevariable`` syntax. This is somewhat similar to parameterized
1093 | queries or prepared statements in the SQL world, where you replace
1094 | some arguments in your queries with placeholders like ``?``,
1095 | which are then substituted with values passed with the query.
1096 | 
1097 | .. invisible-code-block: python
1098 | 
1099 |     selector = load_selector('selectors-sample1.html')
1100 | 
1101 | Here's an example to match an element based on its normalized string-value::
1102 | 
1103 |     >>> str_to_match = "Name: My image 3"
1104 |     >>> selector.xpath('//a[normalize-space(.)=$match]',
1105 |     ...                match=str_to_match).get()
1106 |     '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>'
1107 | 
1108 | All variable references must have a binding value when calling ``.xpath()``
1109 | (otherwise you'll get a ``ValueError: XPath error:`` exception).
1110 | This is done by passing as many named arguments as necessary.
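For instance, here is a quick sketch of what happens when a referenced variable
is left unbound (the exact error message comes from the underlying XPath
engine)::

    >>> try:
    ...     selector.xpath('//a[normalize-space(.)=$match]')
    ... except ValueError:
    ...     print("variable $match is not bound")
    variable $match is not bound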
1111 | 
1112 | Here's another example using a position range passed as two integers::
1113 | 
1114 |     >>> start, stop = 2, 4
1115 |     >>> selector.xpath('//a[position()>=$_from and position()<=$_to]',
1116 |     ...                _from=start, _to=stop).getall()
1117 |     ['<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
1118 |      '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
1119 |      '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>']
1120 | 
1121 | Named variables can be useful when strings need to be escaped for single
1122 | or double quote characters. The example below would be a bit tricky to
1123 | get right (or legible) without a variable reference::
1124 | 
1125 |     >>> html = '''<html>
1126 |     ... <body>
1127 |     ...     <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
1128 |     ... </body>
1129 |     ... </html>'''
1130 |     >>> selector = Selector(text=html)
1131 |     >>>
1132 |     >>> selector.xpath('//p[contains(., $mystring)]',
1133 |     ...                mystring='''He said: "I don't know''').get()
1134 |     '<p>He said: "I don\'t know why, but I like mixing single and double quotes!"</p>'
1135 | 
1136 | 
1137 | Converting CSS to XPath
1138 | -----------------------
1139 | 
1140 | .. autofunction:: parsel.css2xpath
1141 | 
1142 | When you're using an API that only accepts XPath expressions, it's sometimes
1143 | useful to convert CSS to XPath. This allows you to combine the
1144 | conciseness of CSS for querying elements by class with the ease of
1145 | manipulating XPath expressions.
1146 | 
1147 | On those occasions, use the function :func:`~parsel.css2xpath`:
1148 | 
1149 | ::
1150 | 
1151 |     >>> from parsel import css2xpath
1152 |     >>> css2xpath('h1.title')
1153 |     "descendant-or-self::h1[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]"
1154 |     >>> css2xpath('.profile-data') + '//h2'
1155 |     "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' profile-data ')]//h2"
1156 | 
1157 | As you can see from the examples above, it translates the CSS query into an
1158 | XPath expression and returns it as a string, which you can use as-is or combine
1159 | with others to build a more complex expression before feeding it to a function expecting XPath.
1160 | 
1161 | 
1162 | Similar libraries
1163 | =================
1164 | 
1165 | 
1166 | * `BeautifulSoup`_ is a very popular screen scraping library among Python
1167 |   programmers which constructs a Python object based on the structure of the
1168 |   HTML code and also deals with bad markup reasonably well.
1169 | 
1170 | * `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
1171 |   API based on `ElementTree`_. (lxml is not part of the Python standard
1172 |   library.) Parsel uses it under the hood.
1173 | 
1174 | * `PyQuery`_ is a library that, like Parsel, uses `lxml`_ and
1175 |   :doc:`cssselect <cssselect:index>` under the hood, but it offers a jQuery-like API to
1176 |   traverse and manipulate XML/HTML documents.
1177 | 
1178 | Parsel is built on top of the `lxml`_ library, which means they're very similar
1179 | in speed and parsing accuracy. The advantage of using Parsel over `lxml`_ is
1180 | that Parsel is simpler to use and extend, unlike the `lxml`_ API, which is much
1181 | bigger because the `lxml`_ library can be used for many other tasks besides
1182 | selecting markup documents.
1183 | 
1184 | 
1185 | .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
1186 | .. _lxml: https://lxml.de/
1187 | ..
_PyQuery: https://pypi.python.org/pypi/pyquery 1188 | .. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html 1189 | -------------------------------------------------------------------------------- /parsel/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parsel lets you extract text from XML/HTML documents using XPath 3 | or CSS selectors 4 | """ 5 | 6 | __author__ = "Scrapy project" 7 | __email__ = "info@scrapy.org" 8 | __version__ = "1.10.0" 9 | __all__ = [ 10 | "Selector", 11 | "SelectorList", 12 | "css2xpath", 13 | "xpathfuncs", 14 | ] 15 | 16 | from parsel import xpathfuncs 17 | from parsel.csstranslator import css2xpath 18 | from parsel.selector import Selector, SelectorList 19 | 20 | xpathfuncs.setup() 21 | -------------------------------------------------------------------------------- /parsel/csstranslator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import lru_cache 4 | from typing import TYPE_CHECKING, Any, Protocol 5 | 6 | from cssselect import GenericTranslator as OriginalGenericTranslator 7 | from cssselect import HTMLTranslator as OriginalHTMLTranslator 8 | from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement 9 | from cssselect.xpath import ExpressionError 10 | from cssselect.xpath import XPathExpr as OriginalXPathExpr 11 | 12 | if TYPE_CHECKING: 13 | # typing.Self requires Python 3.11 14 | from typing_extensions import Self 15 | 16 | 17 | class XPathExpr(OriginalXPathExpr): 18 | textnode: bool = False 19 | attribute: str | None = None 20 | 21 | @classmethod 22 | def from_xpath( 23 | cls, 24 | xpath: OriginalXPathExpr, 25 | textnode: bool = False, 26 | attribute: str | None = None, 27 | ) -> Self: 28 | x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) 29 | x.textnode = textnode 30 | x.attribute = attribute 31 | return x 32 | 33 | def __str__(self) -> str: 34 | path = super().__str__() 35 | if self.textnode: 36 | if path == "*": 37 | path = "text()" 38 | elif path.endswith("::*/*"): 39 | path = path[:-3] + "text()" 40 | else: 41 | path += "/text()" 42 | 43 | if self.attribute is not None: 44 | if path.endswith("::*/*"): 45 | path = path[:-2] 46 | path += f"/@{self.attribute}" 47 | 48 | return path 49 | 50 | def join( 51 | self: Self, 52 | combiner: str, 53 | other: OriginalXPathExpr, 54 | *args: Any, 55 | **kwargs: Any, 56 | ) -> Self: 57 | if not isinstance(other, XPathExpr): 58 | raise ValueError( 59 | f"Expressions of type {__name__}.XPathExpr can ony join expressions" 60 | f" of the same type (or its descendants), got {type(other)}" 61 | ) 62 | super().join(combiner, other, *args, **kwargs) 63 | self.textnode = other.textnode 64 | self.attribute = other.attribute 65 | return self 66 | 67 | 68 | # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator 69 | class TranslatorProtocol(Protocol): 70 | def xpath_element(self, selector: Element) -> OriginalXPathExpr: 71 | pass 72 | 73 | def css_to_xpath(self, css: str, prefix: str = ...) -> str: 74 | pass 75 | 76 | 77 | class TranslatorMixin: 78 | """This mixin adds support to CSS pseudo elements via dynamic dispatch. 79 | 80 | Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. 
81 | """ 82 | 83 | def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr: 84 | # https://github.com/python/mypy/issues/14757 85 | xpath = super().xpath_element(selector) # type: ignore[safe-super] 86 | return XPathExpr.from_xpath(xpath) 87 | 88 | def xpath_pseudo_element( 89 | self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement 90 | ) -> OriginalXPathExpr: 91 | """ 92 | Dispatch method that transforms XPath to support pseudo-element 93 | """ 94 | if isinstance(pseudo_element, FunctionalPseudoElement): 95 | method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element" 96 | method = getattr(self, method_name, None) 97 | if not method: 98 | raise ExpressionError( 99 | f"The functional pseudo-element ::{pseudo_element.name}() is unknown" 100 | ) 101 | xpath = method(xpath, pseudo_element) 102 | else: 103 | method_name = ( 104 | f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" 105 | ) 106 | method = getattr(self, method_name, None) 107 | if not method: 108 | raise ExpressionError( 109 | f"The pseudo-element ::{pseudo_element} is unknown" 110 | ) 111 | xpath = method(xpath) 112 | return xpath 113 | 114 | def xpath_attr_functional_pseudo_element( 115 | self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement 116 | ) -> XPathExpr: 117 | """Support selecting attribute values using ::attr() pseudo-element""" 118 | if function.argument_types() not in (["STRING"], ["IDENT"]): 119 | raise ExpressionError( 120 | f"Expected a single string or ident for ::attr(), got {function.arguments!r}" 121 | ) 122 | return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) 123 | 124 | def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr: 125 | """Support selecting text nodes using ::text pseudo-element""" 126 | return XPathExpr.from_xpath(xpath, textnode=True) 127 | 128 | 129 | class GenericTranslator(TranslatorMixin, OriginalGenericTranslator): 130 | @lru_cache(maxsize=256) 131 | def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: 132 | return super().css_to_xpath(css, prefix) 133 | 134 | 135 | class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): 136 | @lru_cache(maxsize=256) 137 | def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: 138 | return super().css_to_xpath(css, prefix) 139 | 140 | 141 | _translator = HTMLTranslator() 142 | 143 | 144 | def css2xpath(query: str) -> str: 145 | """Return translated XPath version of a given CSS query""" 146 | return _translator.css_to_xpath(query) 147 | -------------------------------------------------------------------------------- /parsel/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/parsel/dba69f70ca4e875017c14b5cb658f5580c6cb794/parsel/py.typed -------------------------------------------------------------------------------- /parsel/selector.py: -------------------------------------------------------------------------------- 1 | """XPath and JMESPath selectors based on the lxml and jmespath Python 2 | packages.""" 3 | 4 | from __future__ import annotations 5 | 6 | import json 7 | import typing 8 | import warnings 9 | from io import BytesIO 10 | from typing import TYPE_CHECKING, Any, Literal, SupportsIndex, TypedDict, TypeVar, Union 11 | 12 | import jmespath 13 | from lxml import etree, html 14 | from packaging.version import Version 15 | 16 | from .csstranslator import GenericTranslator, 
HTMLTranslator 17 | from .utils import extract_regex, flatten, iflatten, shorten 18 | 19 | if TYPE_CHECKING: 20 | from collections.abc import Mapping 21 | from re import Pattern 22 | 23 | # typing.Self requires Python 3.11 24 | from typing_extensions import Self 25 | 26 | 27 | _SelectorType = TypeVar("_SelectorType", bound="Selector") 28 | _ParserType = Union[etree.XMLParser, etree.HTMLParser] 29 | # simplified _OutputMethodArg from types-lxml 30 | _TostringMethodType = Literal[ 31 | "html", 32 | "xml", 33 | ] 34 | 35 | lxml_version = Version(etree.__version__) 36 | lxml_huge_tree_version = Version("4.2") 37 | LXML_SUPPORTS_HUGE_TREE = lxml_version >= lxml_huge_tree_version 38 | 39 | 40 | class CannotRemoveElementWithoutRoot(Exception): 41 | pass 42 | 43 | 44 | class CannotRemoveElementWithoutParent(Exception): 45 | pass 46 | 47 | 48 | class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent): 49 | pass 50 | 51 | 52 | class SafeXMLParser(etree.XMLParser): 53 | def __init__(self, *args: Any, **kwargs: Any) -> None: 54 | kwargs.setdefault("resolve_entities", False) 55 | super().__init__(*args, **kwargs) 56 | 57 | 58 | class CTGroupValue(TypedDict): 59 | _parser: type[etree.XMLParser | html.HTMLParser] 60 | _csstranslator: GenericTranslator | HTMLTranslator 61 | _tostring_method: _TostringMethodType 62 | 63 | 64 | _ctgroup: dict[str, CTGroupValue] = { 65 | "html": { 66 | "_parser": html.HTMLParser, 67 | "_csstranslator": HTMLTranslator(), 68 | "_tostring_method": "html", 69 | }, 70 | "xml": { 71 | "_parser": SafeXMLParser, 72 | "_csstranslator": GenericTranslator(), 73 | "_tostring_method": "xml", 74 | }, 75 | } 76 | 77 | 78 | def _xml_or_html(type: str | None) -> str: 79 | return "xml" if type == "xml" else "html" 80 | 81 | 82 | def create_root_node( 83 | text: str, 84 | parser_cls: type[_ParserType], 85 | base_url: str | None = None, 86 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 87 | body: bytes = b"", 88 | encoding: str = "utf-8", 89 | ) -> etree._Element: 90 | """Create root node for text using given parser class.""" 91 | if not text: 92 | body = body.replace(b"\x00", b"").strip() 93 | else: 94 | body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>" 95 | 96 | if huge_tree and LXML_SUPPORTS_HUGE_TREE: 97 | parser = parser_cls(recover=True, encoding=encoding, huge_tree=True) 98 | root = etree.fromstring(body, parser=parser, base_url=base_url) 99 | else: 100 | parser = parser_cls(recover=True, encoding=encoding) 101 | root = etree.fromstring(body, parser=parser, base_url=base_url) 102 | for error in parser.error_log: 103 | if "use XML_PARSE_HUGE option" in error.message: 104 | warnings.warn( 105 | f"Input data is too big. Upgrade to lxml " 106 | f"{lxml_huge_tree_version} or later for huge_tree support.", 107 | stacklevel=2, 108 | ) 109 | if root is None: 110 | root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url) 111 | return root 112 | 113 | 114 | class SelectorList(list[_SelectorType]): 115 | """ 116 | The :class:`SelectorList` class is a subclass of the builtin ``list`` 117 | class, which provides a few additional methods. 
118 | """ 119 | 120 | @typing.overload 121 | def __getitem__(self, pos: SupportsIndex) -> _SelectorType: 122 | pass 123 | 124 | @typing.overload 125 | def __getitem__(self, pos: slice) -> SelectorList[_SelectorType]: 126 | pass 127 | 128 | def __getitem__( 129 | self, pos: SupportsIndex | slice 130 | ) -> _SelectorType | SelectorList[_SelectorType]: 131 | o = super().__getitem__(pos) 132 | if isinstance(pos, slice): 133 | return self.__class__(typing.cast("SelectorList[_SelectorType]", o)) 134 | return typing.cast("_SelectorType", o) 135 | 136 | def __getstate__(self) -> None: 137 | raise TypeError("can't pickle SelectorList objects") 138 | 139 | def jmespath(self, query: str, **kwargs: Any) -> SelectorList[_SelectorType]: 140 | """ 141 | Call the ``.jmespath()`` method for each element in this list and return 142 | their results flattened as another :class:`SelectorList`. 143 | 144 | ``query`` is the same argument as the one in :meth:`Selector.jmespath`. 145 | 146 | Any additional named arguments are passed to the underlying 147 | ``jmespath.search`` call, e.g.:: 148 | 149 | selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) 150 | """ 151 | return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) 152 | 153 | def xpath( 154 | self, 155 | xpath: str, 156 | namespaces: Mapping[str, str] | None = None, 157 | **kwargs: Any, 158 | ) -> SelectorList[_SelectorType]: 159 | """ 160 | Call the ``.xpath()`` method for each element in this list and return 161 | their results flattened as another :class:`SelectorList`. 162 | 163 | ``xpath`` is the same argument as the one in :meth:`Selector.xpath` 164 | 165 | ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) 166 | for additional prefixes to those registered with ``register_namespace(prefix, uri)``. 167 | Contrary to ``register_namespace()``, these prefixes are not 168 | saved for future calls. 169 | 170 | Any additional named arguments can be used to pass values for XPath 171 | variables in the XPath expression, e.g.:: 172 | 173 | selector.xpath('//a[href=$url]', url="http://www.example.com") 174 | """ 175 | return self.__class__( 176 | flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) 177 | ) 178 | 179 | def css(self, query: str) -> SelectorList[_SelectorType]: 180 | """ 181 | Call the ``.css()`` method for each element in this list and return 182 | their results flattened as another :class:`SelectorList`. 183 | 184 | ``query`` is the same argument as the one in :meth:`Selector.css` 185 | """ 186 | return self.__class__(flatten([x.css(query) for x in self])) 187 | 188 | def re(self, regex: str | Pattern[str], replace_entities: bool = True) -> list[str]: 189 | """ 190 | Call the ``.re()`` method for each element in this list and return 191 | their results flattened, as a list of strings. 192 | 193 | By default, character entity references are replaced by their 194 | corresponding character (except for ``&`` and ``<``. 195 | Passing ``replace_entities`` as ``False`` switches off these 196 | replacements. 
197 | """ 198 | return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) 199 | 200 | @typing.overload 201 | def re_first( 202 | self, 203 | regex: str | Pattern[str], 204 | default: None = None, 205 | replace_entities: bool = True, 206 | ) -> str | None: 207 | pass 208 | 209 | @typing.overload 210 | def re_first( 211 | self, 212 | regex: str | Pattern[str], 213 | default: str, 214 | replace_entities: bool = True, 215 | ) -> str: 216 | pass 217 | 218 | def re_first( 219 | self, 220 | regex: str | Pattern[str], 221 | default: str | None = None, 222 | replace_entities: bool = True, 223 | ) -> str | None: 224 | """ 225 | Call the ``.re()`` method for the first element in this list and 226 | return the result in an string. If the list is empty or the 227 | regex doesn't match anything, return the default value (``None`` if 228 | the argument is not provided). 229 | 230 | By default, character entity references are replaced by their 231 | corresponding character (except for ``&`` and ``<``. 232 | Passing ``replace_entities`` as ``False`` switches off these 233 | replacements. 234 | """ 235 | for el in iflatten( 236 | x.re(regex, replace_entities=replace_entities) for x in self 237 | ): 238 | return typing.cast("str", el) 239 | return default 240 | 241 | def getall(self) -> list[str]: 242 | """ 243 | Call the ``.get()`` method for each element is this list and return 244 | their results flattened, as a list of strings. 245 | """ 246 | return [x.get() for x in self] 247 | 248 | extract = getall 249 | 250 | @typing.overload 251 | def get(self, default: None = None) -> str | None: 252 | pass 253 | 254 | @typing.overload 255 | def get(self, default: str) -> str: 256 | pass 257 | 258 | def get(self, default: str | None = None) -> Any: 259 | """ 260 | Return the result of ``.get()`` for the first element in this list. 261 | If the list is empty, return the default value. 262 | """ 263 | for x in self: 264 | return x.get() 265 | return default 266 | 267 | extract_first = get 268 | 269 | @property 270 | def attrib(self) -> Mapping[str, str]: 271 | """Return the attributes dictionary for the first element. 272 | If the list is empty, return an empty dict. 273 | """ 274 | for x in self: 275 | return x.attrib 276 | return {} 277 | 278 | def drop(self) -> None: 279 | """ 280 | Drop matched nodes from the parent for each element in this list. 
281 | """ 282 | for x in self: 283 | x.drop() 284 | 285 | 286 | _NOT_SET = object() 287 | 288 | 289 | def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element: 290 | return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs) 291 | 292 | 293 | def _get_root_and_type_from_bytes( 294 | body: bytes, 295 | encoding: str, 296 | *, 297 | input_type: str | None, 298 | **lxml_kwargs: Any, 299 | ) -> tuple[Any, str]: 300 | if input_type == "text": 301 | return body.decode(encoding), input_type 302 | if encoding == "utf-8": 303 | try: 304 | data = json.load(BytesIO(body)) 305 | except ValueError: 306 | data = _NOT_SET 307 | if data is not _NOT_SET: 308 | return data, "json" 309 | if input_type == "json": 310 | return None, "json" 311 | assert input_type in ("html", "xml", None) # nosec 312 | type = _xml_or_html(input_type) 313 | root = create_root_node( 314 | text="", 315 | body=body, 316 | encoding=encoding, 317 | parser_cls=_ctgroup[type]["_parser"], 318 | **lxml_kwargs, 319 | ) 320 | return root, type 321 | 322 | 323 | def _get_root_and_type_from_text( 324 | text: str, *, input_type: str | None, **lxml_kwargs: Any 325 | ) -> tuple[Any, str]: 326 | if input_type == "text": 327 | return text, input_type 328 | try: 329 | data = json.loads(text) 330 | except ValueError: 331 | data = _NOT_SET 332 | if data is not _NOT_SET: 333 | return data, "json" 334 | if input_type == "json": 335 | return None, "json" 336 | assert input_type in ("html", "xml", None) # nosec 337 | type = _xml_or_html(input_type) 338 | root = _get_root_from_text(text, type=type, **lxml_kwargs) 339 | return root, type 340 | 341 | 342 | def _get_root_type(root: Any, *, input_type: str | None) -> str: 343 | if isinstance(root, etree._Element): 344 | if input_type in {"json", "text"}: 345 | raise ValueError( 346 | f"Selector got an lxml.etree._Element object as root, " 347 | f"and {input_type!r} as type." 348 | ) 349 | return _xml_or_html(input_type) 350 | if isinstance(root, (dict, list)) or _is_valid_json(root): 351 | return "json" 352 | return input_type or "json" 353 | 354 | 355 | def _is_valid_json(text: str) -> bool: 356 | try: 357 | json.loads(text) 358 | except (TypeError, ValueError): 359 | return False 360 | return True 361 | 362 | 363 | def _load_json_or_none(text: str) -> Any: 364 | if isinstance(text, (str, bytes, bytearray)): 365 | try: 366 | return json.loads(text) 367 | except ValueError: 368 | return None 369 | return None 370 | 371 | 372 | class Selector: 373 | """Wrapper for input data in HTML, JSON, or XML format, that allows 374 | selecting parts of it using selection expressions. 375 | 376 | You can write selection expressions in CSS or XPath for HTML and XML 377 | inputs, or in JMESPath for JSON inputs. 378 | 379 | ``text`` is an ``str`` object. 380 | 381 | ``body`` is a ``bytes`` object. It can be used together with the 382 | ``encoding`` argument instead of the ``text`` argument. 383 | 384 | ``type`` defines the selector type. It can be ``"html"`` (default), 385 | ``"json"``, or ``"xml"``. 386 | 387 | ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths. 388 | See the documentation for :func:`lxml.etree.fromstring` for more information. 389 | 390 | ``huge_tree`` controls the lxml/libxml2 feature that forbids parsing 391 | certain large documents to protect from possible memory exhaustion. 
The 392 | argument is ``True`` by default if the installed lxml version supports it, 393 | which disables the protection to allow parsing such documents. Set it to 394 | ``False`` if you want to enable the protection. 395 | See `this lxml FAQ entry <https://lxml.de/FAQ.html#is-lxml-vulnerable-to-xml-bombs>`_ 396 | for more information. 397 | """ 398 | 399 | __slots__ = [ 400 | "__weakref__", 401 | "_expr", 402 | "_huge_tree", 403 | "_text", 404 | "body", 405 | "namespaces", 406 | "root", 407 | "type", 408 | ] 409 | 410 | _default_namespaces = { 411 | "re": "http://exslt.org/regular-expressions", 412 | # supported in libxslt: 413 | # set:difference 414 | # set:has-same-node 415 | # set:intersection 416 | # set:leading 417 | # set:trailing 418 | "set": "http://exslt.org/sets", 419 | } 420 | _lxml_smart_strings = False 421 | selectorlist_cls = SelectorList["Selector"] 422 | 423 | def __init__( 424 | self, 425 | text: str | None = None, 426 | type: str | None = None, 427 | body: bytes | bytearray = b"", 428 | encoding: str = "utf-8", 429 | namespaces: Mapping[str, str] | None = None, 430 | root: Any | None = _NOT_SET, 431 | base_url: str | None = None, 432 | _expr: str | None = None, 433 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 434 | ) -> None: 435 | self.root: Any 436 | if type not in ("html", "json", "text", "xml", None): 437 | raise ValueError(f"Invalid type: {type}") 438 | 439 | if text is None and not body and root is _NOT_SET: 440 | raise ValueError("Selector needs text, body, or root arguments") 441 | 442 | if text is not None and not isinstance(text, str): 443 | msg = f"text argument should be of type str, got {text.__class__}" 444 | raise TypeError(msg) 445 | 446 | if text is not None: 447 | if root is not _NOT_SET: 448 | warnings.warn( 449 | "Selector got both text and root, root is being ignored.", 450 | stacklevel=2, 451 | ) 452 | if not isinstance(text, str): 453 | msg = f"text argument should be of type str, got {text.__class__}" 454 | raise TypeError(msg) 455 | 456 | root, type = _get_root_and_type_from_text( 457 | text, 458 | input_type=type, 459 | base_url=base_url, 460 | huge_tree=huge_tree, 461 | ) 462 | self.root = root 463 | self.type = type 464 | elif body: 465 | if not isinstance(body, (bytes, bytearray)): 466 | msg = f"body argument should be of type bytes or bytearray, got {body.__class__}" 467 | raise TypeError(msg) 468 | root, type = _get_root_and_type_from_bytes( 469 | body=bytes(body), 470 | encoding=encoding, 471 | input_type=type, 472 | base_url=base_url, 473 | huge_tree=huge_tree, 474 | ) 475 | self.root = root 476 | self.type = type 477 | elif root is _NOT_SET: 478 | raise ValueError("Selector needs text, body, or root arguments") 479 | else: 480 | self.root = root 481 | self.type = _get_root_type(root, input_type=type) 482 | 483 | self.namespaces = dict(self._default_namespaces) 484 | if namespaces is not None: 485 | self.namespaces.update(namespaces) 486 | 487 | self._expr = _expr 488 | self._huge_tree = huge_tree 489 | self._text = text 490 | 491 | def __getstate__(self) -> Any: 492 | raise TypeError("can't pickle Selector objects") 493 | 494 | def _get_root( 495 | self, 496 | text: str = "", 497 | base_url: str | None = None, 498 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 499 | type: str | None = None, 500 | body: bytes = b"", 501 | encoding: str = "utf-8", 502 | ) -> etree._Element: 503 | return create_root_node( 504 | text, 505 | body=body, 506 | encoding=encoding, 507 | parser_cls=_ctgroup[type or self.type]["_parser"], 508 | base_url=base_url, 509 
| huge_tree=huge_tree, 510 | ) 511 | 512 | def jmespath( 513 | self, 514 | query: str, 515 | **kwargs: Any, 516 | ) -> SelectorList[Self]: 517 | """ 518 | Find objects matching the JMESPath ``query`` and return the result as a 519 | :class:`SelectorList` instance with all elements flattened. List 520 | elements implement :class:`Selector` interface too. 521 | 522 | ``query`` is a string containing the `JMESPath 523 | <https://jmespath.org/>`_ query to apply. 524 | 525 | Any additional named arguments are passed to the underlying 526 | ``jmespath.search`` call, e.g.:: 527 | 528 | selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) 529 | """ 530 | if self.type == "json": 531 | if isinstance(self.root, str): 532 | # Selector received a JSON string as root. 533 | data = _load_json_or_none(self.root) 534 | else: 535 | data = self.root 536 | else: 537 | assert self.type in {"html", "xml"} # nosec 538 | data = _load_json_or_none(self.root.text) 539 | 540 | result = jmespath.search(query, data, **kwargs) 541 | if result is None: 542 | result = [] 543 | elif not isinstance(result, list): 544 | result = [result] 545 | 546 | def make_selector(x: Any) -> Selector: # closure function 547 | if isinstance(x, str): 548 | return self.__class__(text=x, _expr=query, type="text") 549 | return self.__class__(root=x, _expr=query) 550 | 551 | result = [make_selector(x) for x in result] 552 | return typing.cast("SelectorList[Self]", self.selectorlist_cls(result)) 553 | 554 | def xpath( 555 | self, 556 | query: str, 557 | namespaces: Mapping[str, str] | None = None, 558 | **kwargs: Any, 559 | ) -> SelectorList[Self]: 560 | """ 561 | Find nodes matching the xpath ``query`` and return the result as a 562 | :class:`SelectorList` instance with all elements flattened. List 563 | elements implement :class:`Selector` interface too. 564 | 565 | ``query`` is a string containing the XPATH query to apply. 566 | 567 | ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) 568 | for additional prefixes to those registered with ``register_namespace(prefix, uri)``. 569 | Contrary to ``register_namespace()``, these prefixes are not 570 | saved for future calls. 
571 | 572 | Any additional named arguments can be used to pass values for XPath 573 | variables in the XPath expression, e.g.:: 574 | 575 | selector.xpath('//a[href=$url]', url="http://www.example.com") 576 | """ 577 | if self.type not in ("html", "xml", "text"): 578 | raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}") 579 | if self.type in ("html", "xml"): 580 | try: 581 | xpathev = self.root.xpath 582 | except AttributeError: 583 | return typing.cast("SelectorList[Self]", self.selectorlist_cls([])) 584 | else: 585 | try: 586 | xpathev = self._get_root(self._text or "", type="html").xpath 587 | except AttributeError: 588 | return typing.cast("SelectorList[Self]", self.selectorlist_cls([])) 589 | 590 | nsp = dict(self.namespaces) 591 | if namespaces is not None: 592 | nsp.update(namespaces) 593 | try: 594 | result = xpathev( 595 | query, 596 | namespaces=nsp, 597 | smart_strings=self._lxml_smart_strings, 598 | **kwargs, 599 | ) 600 | except etree.XPathError as exc: 601 | raise ValueError(f"XPath error: {exc} in {query}") 602 | 603 | if not isinstance(result, list): 604 | result = [result] 605 | 606 | result = [ 607 | self.__class__( 608 | root=x, 609 | _expr=query, 610 | namespaces=self.namespaces, 611 | type=_xml_or_html(self.type), 612 | ) 613 | for x in result 614 | ] 615 | return typing.cast("SelectorList[Self]", self.selectorlist_cls(result)) 616 | 617 | def css(self, query: str) -> SelectorList[Self]: 618 | """ 619 | Apply the given CSS selector and return a :class:`SelectorList` instance. 620 | 621 | ``query`` is a string containing the CSS selector to apply. 622 | 623 | In the background, CSS queries are translated into XPath queries using 624 | the `cssselect`_ library and executed with the ``.xpath()`` method. 625 | 626 | .. _cssselect: https://pypi.python.org/pypi/cssselect/ 627 | """ 628 | if self.type not in ("html", "xml", "text"): 629 | raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") 630 | return self.xpath(self._css2xpath(query)) 631 | 632 | def _css2xpath(self, query: str) -> str: 633 | type = _xml_or_html(self.type) 634 | return _ctgroup[type]["_csstranslator"].css_to_xpath(query) 635 | 636 | def re(self, regex: str | Pattern[str], replace_entities: bool = True) -> list[str]: 637 | """ 638 | Apply the given regex and return a list of strings with the 639 | matches. 640 | 641 | ``regex`` can be either a compiled regular expression or a string which 642 | will be compiled to a regular expression using ``re.compile(regex)``. 643 | 644 | By default, character entity references are replaced by their 645 | corresponding character (except for ``&amp;`` and ``&lt;``). 646 | Passing ``replace_entities`` as ``False`` switches off these 647 | replacements. 648 | """ 649 | data = self.get() 650 | return extract_regex(regex, data, replace_entities=replace_entities) 651 | 652 | @typing.overload 653 | def re_first( 654 | self, 655 | regex: str | Pattern[str], 656 | default: None = None, 657 | replace_entities: bool = True, 658 | ) -> str | None: 659 | pass 660 | 661 | @typing.overload 662 | def re_first( 663 | self, 664 | regex: str | Pattern[str], 665 | default: str, 666 | replace_entities: bool = True, 667 | ) -> str: 668 | pass 669 | 670 | def re_first( 671 | self, 672 | regex: str | Pattern[str], 673 | default: str | None = None, 674 | replace_entities: bool = True, 675 | ) -> str | None: 676 | """ 677 | Apply the given regex and return the first string which matches. 
If 678 | there is no match, return the default value (``None`` if the argument 679 | is not provided). 680 | 681 | By default, character entity references are replaced by their 682 | corresponding character (except for ``&amp;`` and ``&lt;``). 683 | Passing ``replace_entities`` as ``False`` switches off these 684 | replacements. 685 | """ 686 | return next( 687 | iflatten(self.re(regex, replace_entities=replace_entities)), 688 | default, 689 | ) 690 | 691 | def get(self) -> Any: 692 | """ 693 | Serialize and return the matched nodes. 694 | 695 | For HTML and XML, the result is always a string, and percent-encoded 696 | content is unquoted. 697 | """ 698 | if self.type in ("text", "json"): 699 | return self.root 700 | try: 701 | return etree.tostring( 702 | self.root, 703 | method=_ctgroup[self.type]["_tostring_method"], 704 | encoding="unicode", 705 | with_tail=False, 706 | ) 707 | except (AttributeError, TypeError): 708 | if self.root is True: 709 | return "1" 710 | if self.root is False: 711 | return "0" 712 | return str(self.root) 713 | 714 | extract = get 715 | 716 | def getall(self) -> list[str]: 717 | """ 718 | Serialize and return the matched node in a 1-element list of strings. 719 | """ 720 | return [self.get()] 721 | 722 | def register_namespace(self, prefix: str, uri: str) -> None: 723 | """ 724 | Register the given namespace to be used in this :class:`Selector`. 725 | Without registering namespaces you can't select or extract data from 726 | non-standard namespaces. See :ref:`selector-examples-xml`. 727 | """ 728 | self.namespaces[prefix] = uri 729 | 730 | def remove_namespaces(self) -> None: 731 | """ 732 | Remove all namespaces, allowing you to traverse the document using 733 | namespace-less XPaths. See :ref:`removing-namespaces`. 734 | """ 735 | for el in self.root.iter("*"): 736 | if el.tag.startswith("{"): 737 | el.tag = el.tag.split("}", 1)[1] 738 | # loop on element attributes also 739 | for an in el.attrib: 740 | if an.startswith("{"): 741 | el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an) 742 | # remove namespace declarations 743 | etree.cleanup_namespaces(self.root) 744 | 745 | def drop(self) -> None: 746 | """ 747 | Drop matched nodes from the parent element. 748 | """ 749 | try: 750 | parent = self.root.getparent() 751 | except AttributeError: 752 | # 'str' object has no attribute 'getparent' 753 | raise CannotRemoveElementWithoutRoot( 754 | "The node you're trying to drop has no root, " 755 | "are you trying to drop a pseudo-element? " 756 | "Try to use 'li' as a selector instead of 'li::text' or " 757 | "'//li' instead of '//li/text()', for example." 758 | ) 759 | 760 | try: 761 | if self.type == "xml": 762 | if parent is None: 763 | raise ValueError("This node has no parent") 764 | parent.remove(self.root) 765 | else: 766 | typing.cast("html.HtmlElement", self.root).drop_tree() 767 | except (AttributeError, AssertionError): 768 | # 'NoneType' object has no attribute 'drop' 769 | raise CannotDropElementWithoutParent( 770 | "The node you're trying to remove has no parent, " 771 | "are you trying to remove a root element?" 772 | ) 773 | 774 | @property 775 | def attrib(self) -> dict[str, str]: 776 | """Return the attributes dictionary for the underlying element.""" 777 | return dict(self.root.attrib) 778 | 779 | def __bool__(self) -> bool: 780 | """ 781 | Return ``True`` if there is any real content selected or ``False`` 782 | otherwise. In other words, the boolean value of a :class:`Selector` is 783 | given by the contents it selects. 
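For example, a selector matching an empty attribute value extracts ``''`` and is therefore falsy, while one matching non-empty content is truthy (mirroring ``test_bool`` in the test suite).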
784 | """ 785 | return bool(self.get()) 786 | 787 | __nonzero__ = __bool__ 788 | 789 | def __str__(self) -> str: 790 | return str(self.get()) 791 | 792 | def __repr__(self) -> str: 793 | data = repr(shorten(str(self.get()), width=40)) 794 | return f"<{type(self).__name__} query={self._expr!r} data={data}>" 795 | -------------------------------------------------------------------------------- /parsel/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import TYPE_CHECKING, Any, cast 5 | 6 | from w3lib.html import replace_entities as w3lib_replace_entities 7 | 8 | if TYPE_CHECKING: 9 | from collections.abc import Iterable, Iterator 10 | 11 | 12 | def flatten(x: Iterable[Any]) -> list[Any]: 13 | """flatten(sequence) -> list 14 | Returns a single, flat list which contains all elements retrieved 15 | from the sequence and all recursively contained sub-sequences 16 | (iterables). 17 | Examples: 18 | >>> [1, 2, [3,4], (5,6)] 19 | [1, 2, [3, 4], (5, 6)] 20 | >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)]) 21 | [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10] 22 | >>> flatten(["foo", "bar"]) 23 | ['foo', 'bar'] 24 | >>> flatten(["foo", ["baz", 42], "bar"]) 25 | ['foo', 'baz', 42, 'bar'] 26 | """ 27 | return list(iflatten(x)) 28 | 29 | 30 | def iflatten(x: Iterable[Any]) -> Iterator[Any]: 31 | """iflatten(sequence) -> Iterator 32 | Similar to ``.flatten()``, but returns iterator instead""" 33 | for el in x: 34 | if _is_listlike(el): 35 | yield from flatten(el) 36 | else: 37 | yield el 38 | 39 | 40 | def _is_listlike(x: Any) -> bool: 41 | """ 42 | >>> _is_listlike("foo") 43 | False 44 | >>> _is_listlike(5) 45 | False 46 | >>> _is_listlike(b"foo") 47 | False 48 | >>> _is_listlike([b"foo"]) 49 | True 50 | >>> _is_listlike((b"foo",)) 51 | True 52 | >>> _is_listlike({}) 53 | True 54 | >>> _is_listlike(set()) 55 | True 56 | >>> _is_listlike((x for x in range(3))) 57 | True 58 | >>> _is_listlike(range(5)) 59 | True 60 | """ 61 | return hasattr(x, "__iter__") and not isinstance(x, (str, bytes)) 62 | 63 | 64 | def extract_regex( 65 | regex: str | re.Pattern[str], text: str, replace_entities: bool = True 66 | ) -> list[str]: 67 | """Extract a list of strings from the given text/encoding using the following policies: 68 | * if the regex contains a named group called "extract" that will be returned 69 | * if the regex contains multiple numbered groups, all those will be returned (flattened) 70 | * if the regex doesn't contain any group the entire regex matching is returned 71 | """ 72 | if isinstance(regex, str): 73 | regex = re.compile(regex, re.UNICODE) 74 | 75 | if "extract" in regex.groupindex: 76 | # named group 77 | try: 78 | extracted = cast("re.Match[str]", regex.search(text)).group("extract") 79 | except AttributeError: 80 | strings = [] 81 | else: 82 | strings = [extracted] if extracted is not None else [] 83 | else: 84 | # full regex or numbered groups 85 | strings = regex.findall(text) 86 | 87 | strings = flatten(strings) 88 | if not replace_entities: 89 | return strings 90 | return [w3lib_replace_entities(s, keep=["lt", "amp"]) for s in strings] 91 | 92 | 93 | def shorten(text: str, width: int, suffix: str = "...") -> str: 94 | """Truncate the given text to fit in the given width.""" 95 | if len(text) <= width: 96 | return text 97 | if width > len(suffix): 98 | return text[: width - len(suffix)] + suffix 99 | if width >= 0: 100 | return suffix[len(suffix) - width :] 101 | raise 
ValueError("width must be equal or greater than 0") 102 | -------------------------------------------------------------------------------- /parsel/xpathfuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import Any, Callable 5 | 6 | from lxml import etree 7 | from w3lib.html import HTML5_WHITESPACE 8 | 9 | regex = f"[{HTML5_WHITESPACE}]+" 10 | replace_html5_whitespaces = re.compile(regex).sub 11 | 12 | 13 | def set_xpathfunc(fname: str, func: Callable | None) -> None: # type: ignore[type-arg] 14 | """Register a custom extension function to use in XPath expressions. 15 | 16 | The function ``func`` registered under ``fname`` identifier will be called 17 | for every matching node, being passed a ``context`` parameter as well as 18 | any parameters passed from the corresponding XPath expression. 19 | 20 | If ``func`` is ``None``, the extension function will be removed. 21 | 22 | See more `in lxml documentation`_. 23 | 24 | .. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions 25 | 26 | """ 27 | ns_fns = etree.FunctionNamespace(None) 28 | if func is not None: 29 | ns_fns[fname] = func 30 | else: 31 | del ns_fns[fname] 32 | 33 | 34 | def setup() -> None: 35 | set_xpathfunc("has-class", has_class) 36 | 37 | 38 | def has_class(context: Any, *classes: str) -> bool: 39 | """has-class function. 40 | 41 | Return True if all ``classes`` are present in element's class attr. 42 | 43 | """ 44 | if not context.eval_context.get("args_checked"): 45 | if not classes: 46 | raise ValueError("XPath error: has-class must have at least 1 argument") 47 | for c in classes: 48 | if not isinstance(c, str): 49 | raise ValueError("XPath error: has-class arguments must be strings") 50 | context.eval_context["args_checked"] = True 51 | 52 | node_cls = context.context_node.get("class") 53 | if node_cls is None: 54 | return False 55 | node_cls = " " + node_cls + " " 56 | node_cls = replace_html5_whitespaces(" ", node_cls) 57 | return all(" " + cls + " " in node_cls for cls in classes) 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "1.10.0" 3 | commit = true 4 | tag = true 5 | tag_name = "v{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = "setup.py" 9 | 10 | [[tool.bumpversion.files]] 11 | filename = "parsel/__init__.py" 12 | 13 | [tool.coverage.run] 14 | branch = true 15 | 16 | [tool.coverage.report] 17 | exclude_also = [ 18 | "@typing.overload", 19 | "if TYPE_CHECKING:", 20 | ] 21 | 22 | [tool.pylint.MASTER] 23 | ignore = "typing" 24 | persistent = "no" 25 | extension-pkg-allow-list = [ 26 | "lxml", 27 | ] 28 | 29 | [tool.pylint."MESSAGES CONTROL"] 30 | enable = [ 31 | "useless-suppression", 32 | ] 33 | disable = [ 34 | "fixme", 35 | "import-error", 36 | "import-outside-toplevel", 37 | "invalid-name", 38 | "line-too-long", 39 | "missing-class-docstring", 40 | "missing-function-docstring", 41 | "missing-module-docstring", 42 | "no-member", 43 | "not-callable", 44 | "protected-access", 45 | "raise-missing-from", 46 | "redefined-builtin", 47 | "too-few-public-methods", 48 | "too-many-arguments", 49 | "too-many-lines", 50 | "too-many-positional-arguments", 51 | "too-many-public-methods", 52 | "unused-argument", 53 | "wrong-import-position", 54 | ] 55 | 56 | [tool.pytest.ini_options] 57 | 
addopts = "--assert=plain --doctest-modules --ignore=setup.py" 58 | 59 | [tool.ruff.lint] 60 | extend-select = [ 61 | # flake8-bugbear 62 | "B", 63 | # flake8-comprehensions 64 | "C4", 65 | # pydocstyle 66 | "D", 67 | # flake8-future-annotations 68 | "FA", 69 | # flynt 70 | "FLY", 71 | # refurb 72 | "FURB", 73 | # isort 74 | "I", 75 | # flake8-implicit-str-concat 76 | "ISC", 77 | # flake8-logging 78 | "LOG", 79 | # Perflint 80 | "PERF", 81 | # pygrep-hooks 82 | "PGH", 83 | # flake8-pie 84 | "PIE", 85 | # pylint 86 | "PL", 87 | # flake8-use-pathlib 88 | "PTH", 89 | # flake8-pyi 90 | "PYI", 91 | # flake8-quotes 92 | "Q", 93 | # flake8-return 94 | "RET", 95 | # flake8-raise 96 | "RSE", 97 | # Ruff-specific rules 98 | "RUF", 99 | # flake8-bandit 100 | "S", 101 | # flake8-simplify 102 | "SIM", 103 | # flake8-slots 104 | "SLOT", 105 | # flake8-debugger 106 | "T10", 107 | # flake8-type-checking 108 | "TC", 109 | # pyupgrade 110 | "UP", 111 | # pycodestyle warnings 112 | "W", 113 | # flake8-2020 114 | "YTT", 115 | ] 116 | ignore = [ 117 | # Within an `except` clause, raise exceptions with `raise ... from` 118 | "B904", 119 | # Missing docstring in public module 120 | "D100", 121 | # Missing docstring in public class 122 | "D101", 123 | # Missing docstring in public method 124 | "D102", 125 | # Missing docstring in public function 126 | "D103", 127 | # Missing docstring in public package 128 | "D104", 129 | # Missing docstring in magic method 130 | "D105", 131 | # Missing docstring in public nested class 132 | "D106", 133 | # Missing docstring in __init__ 134 | "D107", 135 | # One-line docstring should fit on one line with quotes 136 | "D200", 137 | # No blank lines allowed after function docstring 138 | "D202", 139 | # 1 blank line required between summary line and description 140 | "D205", 141 | # Multi-line docstring closing quotes should be on a separate line 142 | "D209", 143 | # First line should end with a period 144 | "D400", 145 | # First line should be in imperative mood; try rephrasing 146 | "D401", 147 | # First line should not be the function's "signature" 148 | "D402", 149 | # First word of the first line should be properly capitalized 150 | "D403", 151 | # No blank lines allowed between a section header and its content 152 | "D412", 153 | # Too many return statements 154 | "PLR0911", 155 | # Too many branches 156 | "PLR0912", 157 | # Too many arguments in function definition 158 | "PLR0913", 159 | # Too many statements 160 | "PLR0915", 161 | # Magic value used in comparison 162 | "PLR2004", 163 | # String contains ambiguous {}. 164 | "RUF001", 165 | # Docstring contains ambiguous {}. 166 | "RUF002", 167 | # Comment contains ambiguous {}. 168 | "RUF003", 169 | # Mutable class attributes should be annotated with `typing.ClassVar` 170 | "RUF012", 171 | # Use of `assert` detected 172 | "S101", 173 | # Using lxml to parse untrusted data is known to be vulnerable to XML attacks 174 | "S320", 175 | 176 | # pending: https://github.com/scrapy/parsel/issues/312 177 | "B019", 178 | ] 179 | 180 | [tool.ruff.lint.per-file-ignores] 181 | "tests/typing/selector.py" = ["F841"] 182 | 183 | [tool.ruff.lint.pydocstyle] 184 | convention = "pep257" 185 | -------------------------------------------------------------------------------- /release.rst: -------------------------------------------------------------------------------- 1 | Release procedures 2 | ------------------ 3 | 4 | * Update NEWS file with the release notes. 
5 | Review changes using: ``restview --pypi-strict <(cat README.rst NEWS | grep -v ':changelog')`` 6 | * Run bumpversion with the proper release type 7 | * Push code and tags to GitHub to trigger build 8 | * Copy release notes to https://github.com/scrapy/parsel/releases 9 | * Verify in a temporary virtualenv that ``pip install parsel`` installs the 10 | latest version 11 | * Update version builds at: https://readthedocs.org/projects/parsel/versions/ 12 | You should ensure that previous stable version is active and point stable to the new tag 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from pathlib import Path 3 | 4 | from setuptools import setup 5 | 6 | readme = Path("README.rst").read_text(encoding="utf-8") 7 | history = Path("NEWS").read_text(encoding="utf-8").replace(".. :changelog:", "") 8 | 9 | setup( 10 | name="parsel", 11 | version="1.10.0", 12 | description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors", 13 | long_description=readme + "\n\n" + history, 14 | long_description_content_type="text/x-rst", 15 | author="Scrapy project", 16 | author_email="info@scrapy.org", 17 | url="https://github.com/scrapy/parsel", 18 | packages=[ 19 | "parsel", 20 | ], 21 | package_dir={ 22 | "parsel": "parsel", 23 | }, 24 | include_package_data=True, 25 | install_requires=[ 26 | "cssselect>=1.2.0", 27 | "jmespath", 28 | "lxml", 29 | "packaging", 30 | "w3lib>=1.19.0", 31 | ], 32 | python_requires=">=3.9", 33 | license="BSD", 34 | zip_safe=False, 35 | keywords="parsel", 36 | classifiers=[ 37 | "Development Status :: 5 - Production/Stable", 38 | "Intended Audience :: Developers", 39 | "License :: OSI Approved :: BSD License", 40 | "Natural Language :: English", 41 | "Topic :: Text Processing :: Markup", 42 | "Topic :: Text Processing :: Markup :: HTML", 43 | "Topic :: Text Processing :: Markup :: XML", 44 | "Programming Language :: Python :: 3", 45 | "Programming Language :: Python :: 3.9", 46 | "Programming Language :: Python :: 3.10", 47 | "Programming Language :: Python :: 3.11", 48 | "Programming Language :: Python :: 3.12", 49 | "Programming Language :: Python :: 3.13", 50 | "Programming Language :: Python :: Implementation :: CPython", 51 | "Programming Language :: Python :: Implementation :: PyPy", 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | psutil==5.6.3 # https://github.com/giampaolo/psutil/issues/1659#issuecomment-586032229 2 | pytest 3 | pytest-cov 4 | sybil 5 | -------------------------------------------------------------------------------- /tests/test_selector.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pickle 4 | import re 5 | import typing 6 | import unittest 7 | import warnings 8 | import weakref 9 | from typing import TYPE_CHECKING, Any, cast 10 | 11 | from lxml import etree 12 | from packaging.version import Version 13 | 14 | from parsel import Selector, SelectorList 15 | from parsel.selector import ( 16 | _NOT_SET, 17 | LXML_SUPPORTS_HUGE_TREE, 18 | CannotRemoveElementWithoutParent, 19 | CannotRemoveElementWithoutRoot, 20 | ) 21 | 22 | if TYPE_CHECKING: 23 | from collections.abc import Mapping 24 | 25 | from lxml.html import 
HtmlElement 26 | 27 | 28 | class SelectorTestCase(unittest.TestCase): 29 | sscls = Selector 30 | 31 | def assertIsSelector(self, value: Any) -> None: 32 | self.assertEqual(type(value), type(self.sscls(text=""))) 33 | 34 | def assertIsSelectorList(self, value: Any) -> None: 35 | self.assertEqual(type(value), type(self.sscls.selectorlist_cls())) 36 | 37 | def test_pickle_selector(self) -> None: 38 | sel = self.sscls(text="<html><body><p>some text</p></body></html>") 39 | self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) 40 | 41 | def test_pickle_selector_list(self) -> None: 42 | sel = self.sscls( 43 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 44 | ) 45 | sel_list = sel.css("li") 46 | empty_sel_list = sel.css("p") 47 | self.assertIsSelectorList(sel_list) 48 | self.assertIsSelectorList(empty_sel_list) 49 | self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) 50 | self.assertRaises( 51 | TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list 52 | ) 53 | 54 | def test_simple_selection(self) -> None: 55 | """Simple selector tests""" 56 | body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" 57 | sel = self.sscls(text=body) 58 | 59 | xl = sel.xpath("//input") 60 | self.assertEqual(2, len(xl)) 61 | for x in xl: 62 | self.assertIsSelector(x) 63 | 64 | self.assertEqual( 65 | sel.xpath("//input").extract(), 66 | [x.extract() for x in sel.xpath("//input")], 67 | ) 68 | 69 | self.assertEqual( 70 | [x.extract() for x in sel.xpath("//input[@name='a']/@name")], 71 | ["a"], 72 | ) 73 | self.assertEqual( 74 | [ 75 | x.extract() 76 | for x in sel.xpath( 77 | "number(concat(//input[@name='a']/@value, //input[@name='b']/@value))" 78 | ) 79 | ], 80 | ["12.0"], 81 | ) 82 | 83 | self.assertEqual( 84 | sel.xpath("concat('xpath', 'rules')").extract(), ["xpathrules"] 85 | ) 86 | self.assertEqual( 87 | [ 88 | x.extract() 89 | for x in sel.xpath( 90 | "concat(//input[@name='a']/@value, //input[@name='b']/@value)" 91 | ) 92 | ], 93 | ["12"], 94 | ) 95 | 96 | def test_simple_selection_with_variables(self) -> None: 97 | """Using XPath variables""" 98 | body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>" 99 | sel = self.sscls(text=body) 100 | 101 | self.assertEqual( 102 | [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], 103 | ["a"], 104 | ) 105 | self.assertEqual( 106 | [ 107 | x.extract() 108 | for x in sel.xpath("//input[@name=$letter]/@value", letter="b") 109 | ], 110 | ["2"], 111 | ) 112 | 113 | self.assertEqual( 114 | sel.xpath( 115 | "count(//input[@value=$number or @name=$letter])", 116 | number=2, 117 | letter="a", 118 | ).extract(), 119 | ["2.0"], 120 | ) 121 | 122 | # you can also pass booleans 123 | self.assertEqual( 124 | sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), 125 | ["1"], 126 | ) 127 | self.assertEqual( 128 | sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), 129 | ["0"], 130 | ) 131 | self.assertEqual( 132 | sel.xpath( 133 | "boolean(count(//input)=$cnt)=$test", cnt=4, test=False 134 | ).extract(), 135 | ["1"], 136 | ) 137 | 138 | # for named nodes, you need to use "name()=node_name" 139 | self.assertEqual( 140 | sel.xpath( 141 | "boolean(count(//*[name()=$tag])=$cnt)=$test", 142 | tag="input", 143 | cnt=2, 144 | test=True, 145 | ).extract(), 146 | ["1"], 147 | ) 148 | 149 | def test_simple_selection_with_variables_escape_friendly(self) -> None: 150 | """Using XPath variables with quotes that would 
need escaping with string formatting""" 151 | body = """<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/> 152 | "double quotes" and I don't care :)</p>""" 153 | sel = self.sscls(text=body) 154 | 155 | t = 'I say "Yeah!"' 156 | # naive string formatting will give something like: 157 | # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name 158 | self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name') 159 | 160 | # with XPath variables, escaping is done for you 161 | self.assertEqual( 162 | [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], 163 | ["a"], 164 | ) 165 | lt = """I'm mixing single and "double quotes" and I don't care :)""" 166 | # the following gives you something like 167 | # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name 168 | self.assertRaises( 169 | ValueError, sel.xpath, f"//p[normalize-space()='{lt}']//@name" 170 | ) 171 | 172 | self.assertEqual( 173 | [ 174 | x.extract() 175 | for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt) 176 | ], 177 | ["a"], 178 | ) 179 | 180 | def test_accessing_attributes(self) -> None: 181 | body = """ 182 | <html lang="en" version="1.0"> 183 | <body> 184 | <ul id="some-list" class="list-cls" class="list-cls"> 185 | <li class="item-cls" id="list-item-1"> 186 | <li class="item-cls active" id="list-item-2"> 187 | <li class="item-cls" id="list-item-3"> 188 | </ul> 189 | </body> 190 | </html> 191 | """ 192 | sel = self.sscls(text=body) 193 | self.assertEqual({"lang": "en", "version": "1.0"}, sel.attrib) 194 | self.assertEqual( 195 | {"id": "some-list", "class": "list-cls"}, sel.css("ul")[0].attrib 196 | ) 197 | 198 | # for a SelectorList, bring the attributes of first-element only 199 | self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib) 200 | self.assertEqual( 201 | {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib 202 | ) 203 | self.assertEqual({}, sel.css("body").attrib) 204 | self.assertEqual({}, sel.css("non-existing-element").attrib) 205 | 206 | self.assertEqual( 207 | [ 208 | {"class": "item-cls", "id": "list-item-1"}, 209 | {"class": "item-cls active", "id": "list-item-2"}, 210 | {"class": "item-cls", "id": "list-item-3"}, 211 | ], 212 | [e.attrib for e in sel.css("li")], 213 | ) 214 | 215 | def test_representation_slice(self) -> None: 216 | body = f"<p><input name='{50 * 'b'}' value='\xa9'/></p>" 217 | sel = self.sscls(text=body) 218 | 219 | representation = f"<Selector query='//input/@name' data='{37 * 'b'}...'>" 220 | 221 | self.assertEqual( 222 | [repr(it) for it in sel.xpath("//input/@name")], [representation] 223 | ) 224 | 225 | def test_representation_unicode_query(self) -> None: 226 | body = f"<p><input name='{50 * 'b'}' value='\xa9'/></p>" 227 | 228 | representation = "<Selector query='//input[@value=\"©\"]/@value' data='©'>" 229 | 230 | sel = self.sscls(text=body) 231 | self.assertEqual( 232 | [repr(it) for it in sel.xpath('//input[@value="\xa9"]/@value')], 233 | [representation], 234 | ) 235 | 236 | def test_check_text_argument_type(self) -> None: 237 | self.assertRaisesRegex( 238 | TypeError, 239 | "text argument should be of type", 240 | self.sscls, 241 | b"<html/>", 242 | ) 243 | 244 | def test_extract_first(self) -> None: 245 | """Test if extract_first() returns first element""" 246 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 247 | sel = self.sscls(text=body) 248 | 249 | self.assertEqual( 
250 | sel.xpath("//ul/li/text()").extract_first(), 251 | sel.xpath("//ul/li/text()").extract()[0], 252 | ) 253 | 254 | self.assertEqual( 255 | sel.xpath('//ul/li[@id="1"]/text()').extract_first(), 256 | sel.xpath('//ul/li[@id="1"]/text()').extract()[0], 257 | ) 258 | 259 | self.assertEqual( 260 | sel.xpath("//ul/li[2]/text()").extract_first(), 261 | sel.xpath("//ul/li/text()").extract()[1], 262 | ) 263 | 264 | self.assertEqual( 265 | sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), 266 | None, 267 | ) 268 | 269 | def test_extract_first_default(self) -> None: 270 | """Test if extract_first() returns default value when no results found""" 271 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 272 | sel = self.sscls(text=body) 273 | 274 | self.assertEqual( 275 | sel.xpath("//div/text()").extract_first(default="missing"), 276 | "missing", 277 | ) 278 | 279 | def test_selector_get_alias(self) -> None: 280 | """Test if get() returns extracted value on a Selector""" 281 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 282 | sel = self.sscls(text=body) 283 | 284 | self.assertEqual( 285 | sel.xpath("//ul/li[position()>1]")[0].get(), '<li id="2">2</li>' 286 | ) 287 | self.assertEqual(sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2") 288 | 289 | def test_selector_getall_alias(self) -> None: 290 | """Test if get() returns extracted value on a Selector""" 291 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 292 | sel = self.sscls(text=body) 293 | 294 | self.assertListEqual( 295 | sel.xpath("//ul/li[position()>1]")[0].getall(), 296 | ['<li id="2">2</li>'], 297 | ) 298 | self.assertListEqual( 299 | sel.xpath("//ul/li[position()>1]/text()")[0].getall(), ["2"] 300 | ) 301 | 302 | def test_selectorlist_get_alias(self) -> None: 303 | """Test if get() returns first element for a selection call""" 304 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 305 | sel = self.sscls(text=body) 306 | 307 | self.assertEqual(sel.xpath("//ul/li").get(), '<li id="1">1</li>') 308 | self.assertEqual(sel.xpath("//ul/li/text()").get(), "1") 309 | 310 | def test_re_first(self) -> None: 311 | """Test if re_first() returns first matched element""" 312 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 313 | sel = self.sscls(text=body) 314 | 315 | self.assertEqual( 316 | sel.xpath("//ul/li/text()").re_first(r"\d"), 317 | sel.xpath("//ul/li/text()").re(r"\d")[0], 318 | ) 319 | 320 | self.assertEqual( 321 | sel.xpath('//ul/li[@id="1"]/text()').re_first(r"\d"), 322 | sel.xpath('//ul/li[@id="1"]/text()').re(r"\d")[0], 323 | ) 324 | 325 | self.assertEqual( 326 | sel.xpath("//ul/li[2]/text()").re_first(r"\d"), 327 | sel.xpath("//ul/li/text()").re(r"\d")[1], 328 | ) 329 | 330 | self.assertEqual(sel.xpath("/ul/li/text()").re_first(r"\w+"), None) 331 | self.assertEqual( 332 | sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r"\d"), 333 | None, 334 | ) 335 | 336 | self.assertEqual(sel.re_first(r'id="(\d+)'), "1") 337 | self.assertEqual(sel.re_first(r"foo"), None) 338 | self.assertEqual(sel.re_first(r"foo", default="bar"), "bar") 339 | 340 | def test_extract_first_re_default(self) -> None: 341 | """Test if re_first() returns default value when no results found""" 342 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 343 | sel = self.sscls(text=body) 344 | 345 | self.assertEqual( 346 | sel.xpath("//div/text()").re_first(r"\w+", default="missing"), 347 | "missing", 348 | ) 349 | self.assertEqual( 350 | 
sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), 351 | "missing", 352 | ) 353 | 354 | def test_select_unicode_query(self) -> None: 355 | body = "<p><input name='\xa9' value='1'/></p>" 356 | sel = self.sscls(text=body) 357 | self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"]) 358 | 359 | def test_list_elements_type(self) -> None: 360 | """Test Selector returning the same type in selection methods""" 361 | text = "<p>test<p>" 362 | self.assertEqual( 363 | type(self.sscls(text=text).xpath("//p")[0]), 364 | type(self.sscls(text=text)), 365 | ) 366 | self.assertEqual( 367 | type(self.sscls(text=text).css("p")[0]), 368 | type(self.sscls(text=text)), 369 | ) 370 | 371 | def test_boolean_result(self) -> None: 372 | body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" 373 | xs = self.sscls(text=body) 374 | self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"]) 375 | self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"]) 376 | 377 | def test_differences_parsing_xml_vs_html(self) -> None: 378 | """Test that XML and HTML Selector's behave differently""" 379 | # some text which is parsed differently by XML and HTML flavors 380 | text = '<div><img src="a.jpg"><p>Hello</div>' 381 | hs = self.sscls(text=text, type="html") 382 | self.assertEqual( 383 | hs.xpath("//div").extract(), 384 | ['<div><img src="a.jpg"><p>Hello</p></div>'], 385 | ) 386 | 387 | xs = self.sscls(text=text, type="xml") 388 | self.assertEqual( 389 | xs.xpath("//div").extract(), 390 | ['<div><img src="a.jpg"><p>Hello</p></img></div>'], 391 | ) 392 | 393 | def test_error_for_unknown_selector_type(self) -> None: 394 | self.assertRaises(ValueError, self.sscls, text="", type="_na_") 395 | 396 | def test_text_or_root_is_required(self) -> None: 397 | self.assertRaisesRegex( 398 | ValueError, 399 | "Selector needs text, body, or root arguments", 400 | self.sscls, 401 | ) 402 | 403 | def test_bool(self) -> None: 404 | text = '<a href="" >false</a><a href="nonempty">true</a>' 405 | hs = self.sscls(text=text, type="html") 406 | falsish = hs.xpath("//a/@href")[0] 407 | self.assertEqual(falsish.extract(), "") 408 | self.assertFalse(falsish) 409 | trueish = hs.xpath("//a/@href")[1] 410 | self.assertEqual(trueish.extract(), "nonempty") 411 | self.assertTrue(trueish) 412 | 413 | def test_slicing(self) -> None: 414 | text = "<div><p>1</p><p>2</p><p>3</p></div>" 415 | hs = self.sscls(text=text, type="html") 416 | self.assertIsSelector(hs.css("p")[2]) 417 | self.assertIsSelectorList(hs.css("p")[2:3]) 418 | self.assertIsSelectorList(hs.css("p")[:2]) 419 | self.assertEqual(hs.css("p")[2:3].extract(), ["<p>3</p>"]) 420 | self.assertEqual(hs.css("p")[1:3].extract(), ["<p>2</p>", "<p>3</p>"]) 421 | 422 | def test_nested_selectors(self) -> None: 423 | """Nested selector tests""" 424 | body = """<body> 425 | <div class='one'> 426 | <ul> 427 | <li>one</li><li>two</li> 428 | </ul> 429 | </div> 430 | <div class='two'> 431 | <ul> 432 | <li>four</li><li>five</li><li>six</li> 433 | </ul> 434 | </div> 435 | </body>""" 436 | 437 | x = self.sscls(text=body) 438 | divtwo = x.xpath('//div[@class="two"]') 439 | self.assertEqual( 440 | divtwo.xpath("//li").extract(), 441 | [ 442 | "<li>one</li>", 443 | "<li>two</li>", 444 | "<li>four</li>", 445 | "<li>five</li>", 446 | "<li>six</li>", 447 | ], 448 | ) 449 | self.assertEqual( 450 | divtwo.xpath("./ul/li").extract(), 451 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 452 | ) 453 | self.assertEqual( 454 | 
divtwo.xpath(".//li").extract(), 455 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 456 | ) 457 | self.assertEqual(divtwo.xpath("./li").extract(), []) 458 | 459 | def test_selectorlist_getall_alias(self) -> None: 460 | """Nested selector tests using getall()""" 461 | body = """<body> 462 | <div class='one'> 463 | <ul> 464 | <li>one</li><li>two</li> 465 | </ul> 466 | </div> 467 | <div class='two'> 468 | <ul> 469 | <li>four</li><li>five</li><li>six</li> 470 | </ul> 471 | </div> 472 | </body>""" 473 | 474 | x = self.sscls(text=body) 475 | divtwo = x.xpath('//div[@class="two"]') 476 | self.assertEqual( 477 | divtwo.xpath("//li").getall(), 478 | [ 479 | "<li>one</li>", 480 | "<li>two</li>", 481 | "<li>four</li>", 482 | "<li>five</li>", 483 | "<li>six</li>", 484 | ], 485 | ) 486 | self.assertEqual( 487 | divtwo.xpath("./ul/li").getall(), 488 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 489 | ) 490 | self.assertEqual( 491 | divtwo.xpath(".//li").getall(), 492 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 493 | ) 494 | self.assertEqual(divtwo.xpath("./li").getall(), []) 495 | 496 | def test_mixed_nested_selectors(self) -> None: 497 | body = """<body> 498 | <div id=1>not<span>me</span></div> 499 | <div class="dos"><p>text</p><a href='#'>foo</a></div> 500 | </body>""" 501 | sel = self.sscls(text=body) 502 | self.assertEqual( 503 | sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"] 504 | ) 505 | self.assertEqual(sel.css("#1").xpath("./span/text()").extract(), ["me"]) 506 | 507 | def test_dont_strip(self) -> None: 508 | sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>') 509 | self.assertEqual(sel.xpath("//text()").extract(), ["fff: ", "zzz"]) 510 | 511 | def test_namespaces_simple(self) -> None: 512 | body = """ 513 | <test xmlns:somens="http://scrapy.org"> 514 | <somens:a id="foo">take this</a> 515 | <a id="bar">found</a> 516 | </test> 517 | """ 518 | 519 | x = self.sscls(text=body, type="xml") 520 | 521 | x.register_namespace("somens", "http://scrapy.org") 522 | self.assertEqual(x.xpath("//somens:a/text()").extract(), ["take this"]) 523 | 524 | def test_namespaces_adhoc(self) -> None: 525 | body = """ 526 | <test xmlns:somens="http://scrapy.org"> 527 | <somens:a id="foo">take this</a> 528 | <a id="bar">found</a> 529 | </test> 530 | """ 531 | 532 | x = self.sscls(text=body, type="xml") 533 | 534 | self.assertEqual( 535 | x.xpath( 536 | "//somens:a/text()", 537 | namespaces={"somens": "http://scrapy.org"}, 538 | ).extract(), 539 | ["take this"], 540 | ) 541 | 542 | def test_namespaces_adhoc_variables(self) -> None: 543 | body = """ 544 | <test xmlns:somens="http://scrapy.org"> 545 | <somens:a id="foo">take this</a> 546 | <a id="bar">found</a> 547 | </test> 548 | """ 549 | 550 | x = self.sscls(text=body, type="xml") 551 | 552 | self.assertEqual( 553 | x.xpath( 554 | "//somens:a/following-sibling::a[@id=$identifier]/text()", 555 | namespaces={"somens": "http://scrapy.org"}, 556 | identifier="bar", 557 | ).extract(), 558 | ["found"], 559 | ) 560 | 561 | def test_namespaces_multiple(self) -> None: 562 | body = """<?xml version="1.0" encoding="UTF-8"?> 563 | <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" 564 | xmlns:b="http://somens.com" 565 | xmlns:p="http://www.scrapy.org/product" > 566 | <b:Operation>hello</b:Operation> 567 | <TestTag b:att="value"><Other>value</Other></TestTag> 568 | <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> 569 | </BrowseNode> 570 | """ 571 | 
x = self.sscls(text=body, type="xml") 572 | x.register_namespace( 573 | "xmlns", 574 | "http://webservices.amazon.com/AWSECommerceService/2005-10-05", 575 | ) 576 | x.register_namespace("p", "http://www.scrapy.org/product") 577 | x.register_namespace("b", "http://somens.com") 578 | self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) 579 | self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello") 580 | self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value") 581 | self.assertEqual( 582 | x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90" 583 | ) 584 | self.assertEqual( 585 | x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), 586 | "90", 587 | ) 588 | self.assertEqual( 589 | x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 590 | "iron", 591 | ) 592 | 593 | def test_namespaces_multiple_adhoc(self) -> None: 594 | body = """<?xml version="1.0" encoding="UTF-8"?> 595 | <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" 596 | xmlns:b="http://somens.com" 597 | xmlns:p="http://www.scrapy.org/product" > 598 | <b:Operation>hello</b:Operation> 599 | <TestTag b:att="value"><Other>value</Other></TestTag> 600 | <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> 601 | </BrowseNode> 602 | """ 603 | x = self.sscls(text=body, type="xml") 604 | x.register_namespace( 605 | "xmlns", 606 | "http://webservices.amazon.com/AWSECommerceService/2005-10-05", 607 | ) 608 | self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) 609 | 610 | # "b" namespace is not declared yet 611 | self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") 612 | 613 | # "b" namespace being passed ad-hoc 614 | self.assertEqual( 615 | x.xpath( 616 | "//b:Operation/text()", namespaces={"b": "http://somens.com"} 617 | ).extract()[0], 618 | "hello", 619 | ) 620 | 621 | # "b" namespace declaration is not cached 622 | self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") 623 | 624 | # "xmlns" is still defined 625 | self.assertEqual( 626 | x.xpath( 627 | "//xmlns:TestTag/@b:att", 628 | namespaces={"b": "http://somens.com"}, 629 | ).extract()[0], 630 | "value", 631 | ) 632 | 633 | # chained selectors still have knowledge of register_namespace() operations 634 | self.assertEqual( 635 | x.xpath( 636 | "//p:SecondTestTag", 637 | namespaces={"p": "http://www.scrapy.org/product"}, 638 | ) 639 | .xpath("./xmlns:price/text()")[0] 640 | .extract(), 641 | "90", 642 | ) 643 | 644 | # but chained selector don't know about parent ad-hoc declarations 645 | self.assertRaises( 646 | ValueError, 647 | x.xpath( 648 | "//p:SecondTestTag", 649 | namespaces={"p": "http://www.scrapy.org/product"}, 650 | ).xpath, 651 | "p:name/text()", 652 | ) 653 | 654 | # ad-hoc declarations need repeats when chaining 655 | self.assertEqual( 656 | x.xpath( 657 | "//p:SecondTestTag", 658 | namespaces={"p": "http://www.scrapy.org/product"}, 659 | ) 660 | .xpath( 661 | "p:name/text()", 662 | namespaces={"p": "http://www.scrapy.org/product"}, 663 | ) 664 | .extract_first(), 665 | "Dried Rose", 666 | ) 667 | 668 | # declaring several ad-hoc namespaces 669 | self.assertEqual( 670 | x.xpath( 671 | "string(//b:Operation/following-sibling::xmlns:TestTag" 672 | "/following-sibling::*//p:name)", 673 | namespaces={ 674 | "b": "http://somens.com", 675 | "p": "http://www.scrapy.org/product", 676 | }, 677 | ).extract_first(), 678 | "Dried Rose", 679 | ) 680 | 681 | # "p" prefix is not cached from previous calls 682 
| self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") 683 | 684 | x.register_namespace("p", "http://www.scrapy.org/product") 685 | self.assertEqual( 686 | x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 687 | "iron", 688 | ) 689 | 690 | def test_make_links_absolute(self) -> None: 691 | text = '<a href="file.html">link to file</a>' 692 | sel = Selector(text=text, base_url="http://example.com") 693 | typing.cast("HtmlElement", sel.root).make_links_absolute() 694 | self.assertEqual( 695 | "http://example.com/file.html", 696 | sel.xpath("//a/@href").extract_first(), 697 | ) 698 | 699 | def test_re(self) -> None: 700 | body = """<div>Name: Mary 701 | <ul> 702 | <li>Name: John</li> 703 | <li>Age: 10</li> 704 | <li>Name: Paul</li> 705 | <li>Age: 20</li> 706 | </ul> 707 | Age: 20 708 | </div>""" 709 | x = self.sscls(text=body) 710 | 711 | name_re = re.compile(r"Name: (\w+)") 712 | self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) 713 | self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) 714 | 715 | # Test named group, hit and miss 716 | x = self.sscls(text="foobar") 717 | self.assertEqual(x.re("(?P<extract>foo)"), ["foo"]) 718 | self.assertEqual(x.re("(?P<extract>baz)"), []) 719 | 720 | # A purposely constructed test for an edge case 721 | x = self.sscls(text="baz") 722 | self.assertEqual(x.re("(?P<extract>foo)|(?P<bar>baz)"), []) 723 | 724 | def test_re_replace_entities(self) -> None: 725 | body = """<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>""" 726 | x = self.sscls(text=body) 727 | 728 | name_re = re.compile('{"foo":(.*)}') 729 | 730 | # by default, only &amp; and &lt; are preserved ; 731 | # other entities are converted 732 | expected = '"bar &amp; "baz""' 733 | self.assertEqual(x.xpath("//script/text()").re(name_re), [expected]) 734 | self.assertEqual(x.xpath("//script").re(name_re), [expected]) 735 | self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected]) 736 | self.assertEqual(x.xpath("//script")[0].re(name_re), [expected]) 737 | 738 | # check that re_first() works the same way for single value output 739 | self.assertEqual(x.xpath("//script").re_first(name_re), expected) 740 | self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected) 741 | 742 | # switching off replace_entities will preserve &quot; also 743 | expected = '"bar &amp; &quot;baz&quot;"' 744 | self.assertEqual( 745 | x.xpath("//script/text()").re(name_re, replace_entities=False), 746 | [expected], 747 | ) 748 | self.assertEqual( 749 | x.xpath("//script")[0].re(name_re, replace_entities=False), 750 | [expected], 751 | ) 752 | 753 | self.assertEqual( 754 | x.xpath("//script/text()").re_first(name_re, replace_entities=False), 755 | expected, 756 | ) 757 | self.assertEqual( 758 | x.xpath("//script")[0].re_first(name_re, replace_entities=False), 759 | expected, 760 | ) 761 | 762 | def test_re_intl(self) -> None: 763 | body = "<div>Evento: cumplea\xf1os</div>" 764 | x = self.sscls(text=body) 765 | self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]) 766 | 767 | def test_selector_over_text(self) -> None: 768 | hs = self.sscls(text="<root>lala</root>") 769 | self.assertEqual(hs.extract(), "<html><body><root>lala</root></body></html>") 770 | xs = self.sscls(text="<root>lala</root>", type="xml") 771 | self.assertEqual(xs.extract(), "<root>lala</root>") 772 | self.assertEqual(xs.xpath(".").extract(), ["<root>lala</root>"]) 773 | 774 | def test_invalid_xpath(self) -> None: 775 | """Test invalid xpath raises ValueError with the invalid xpath""" 776 | 
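# The expression below contains an unterminated string literal ('bar is missing its closing quote), so lxml raises an XPath error, which Selector.xpath() re-raises as ValueError with the offending query in the message.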
x = self.sscls(text="<html></html>") 777 | xpath = "//test[@foo='bar]" 778 | self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) 779 | 780 | def test_invalid_xpath_unicode(self) -> None: 781 | """Test *Unicode* invalid xpath raises ValueError with the invalid xpath""" 782 | x = self.sscls(text="<html></html>") 783 | xpath = "//test[@foo='\u0431ar]" 784 | self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) 785 | 786 | def test_http_header_encoding_precedence(self) -> None: 787 | # '\xa3' = pound symbol in unicode 788 | # '\xc2\xa3' = pound symbol in utf-8 789 | # '\xa3' = pound symbol in latin-1 (iso-8859-1) 790 | 791 | text = """<html> 792 | <head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head> 793 | <body><span id="blank">\xa3</span></body></html>""" 794 | x = self.sscls(text=text) 795 | self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"]) 796 | 797 | def test_empty_bodies_shouldnt_raise_errors(self) -> None: 798 | self.sscls(text="").xpath("//text()").extract() 799 | 800 | def test_bodies_with_comments_only(self) -> None: 801 | sel = self.sscls(text="<!-- hello world -->", base_url="http://example.com") 802 | self.assertEqual("http://example.com", sel.root.base) 803 | 804 | def test_null_bytes_shouldnt_raise_errors(self) -> None: 805 | text = "<root>pre\x00post</root>" 806 | self.sscls(text).xpath("//text()").extract() 807 | 808 | def test_replacement_char_from_badly_encoded_body(self) -> None: 809 | # \xe9 alone isn't valid utf8 sequence 810 | text = "<html><p>an Jos\ufffd de</p><html>" 811 | self.assertEqual( 812 | ["an Jos\ufffd de"], self.sscls(text).xpath("//text()").extract() 813 | ) 814 | 815 | def test_select_on_unevaluable_nodes(self) -> None: 816 | r = self.sscls(text='<span class="big">some text</span>') 817 | # Text node 818 | x1 = r.xpath("//text()") 819 | self.assertEqual(x1.extract(), ["some text"]) 820 | self.assertEqual(x1.xpath(".//b").extract(), []) 821 | # Tag attribute 822 | x1 = r.xpath("//span/@class") 823 | self.assertEqual(x1.extract(), ["big"]) 824 | self.assertEqual(x1.xpath(".//text()").extract(), []) 825 | 826 | def test_select_on_text_nodes(self) -> None: 827 | r = self.sscls(text="<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>") 828 | x1 = r.xpath( 829 | "//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]" 830 | ) 831 | self.assertEqual(x1.extract(), ["opt1"]) 832 | 833 | x1 = r.xpath( 834 | "//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]" 835 | ) 836 | self.assertEqual(x1.extract(), ["<b>Options:</b>"]) 837 | 838 | @unittest.skip("Text nodes lost parent node reference in lxml") 839 | def test_nested_select_on_text_nodes(self) -> None: 840 | # FIXME: does not work with lxml backend [upstream] 841 | r = self.sscls(text="<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>") 842 | x1 = r.xpath("//div/descendant::text()") 843 | x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") 844 | self.assertEqual(x2.extract(), ["<b>Options:</b>"]) 845 | 846 | def test_weakref_slots(self) -> None: 847 | """Check that classes are using slots and are weak-referenceable""" 848 | x = self.sscls(text="") 849 | weakref.ref(x) 850 | assert not hasattr(x, "__dict__"), ( 851 | f"{x.__class__.__name__} does not use __slots__" 852 | ) 853 | 854 | def test_remove_namespaces(self) -> None: 855 | xml = """<?xml version="1.0" encoding="UTF-8"?> 856 | <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" 
xmlns:media="http://search.yahoo.com/mrss/"> 857 | <link type="text/html"/> 858 | <entry> 859 | <link type="text/html"/> 860 | </entry> 861 | <link type="application/atom+xml"/> 862 | </feed> 863 | """ 864 | sel = self.sscls(text=xml, type="xml") 865 | self.assertEqual(len(sel.xpath("//link")), 0) 866 | self.assertEqual(len(sel.xpath("./namespace::*")), 3) 867 | sel.remove_namespaces() 868 | self.assertEqual(len(sel.xpath("//link")), 3) 869 | self.assertEqual(len(sel.xpath("./namespace::*")), 1) 870 | 871 | def test_remove_namespaces_embedded(self) -> None: 872 | xml = """ 873 | <feed xmlns="http://www.w3.org/2005/Atom"> 874 | <link type="text/html"/> 875 | <entry> 876 | <link type="text/html"/> 877 | </entry> 878 | <svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 100 100"> 879 | <linearGradient id="gradient"> 880 | <stop class="begin" offset="0%" style="stop-color:yellow;"/> 881 | <stop class="end" offset="80%" style="stop-color:green;"/> 882 | </linearGradient> 883 | <circle cx="50" cy="50" r="30" style="fill:url(#gradient)" /> 884 | </svg> 885 | </feed> 886 | """ 887 | sel = self.sscls(text=xml, type="xml") 888 | self.assertEqual(len(sel.xpath("//link")), 0) 889 | self.assertEqual(len(sel.xpath("//stop")), 0) 890 | self.assertEqual(len(sel.xpath("./namespace::*")), 2) 891 | self.assertEqual( 892 | len( 893 | sel.xpath( 894 | "//f:link", 895 | namespaces={"f": "http://www.w3.org/2005/Atom"}, 896 | ) 897 | ), 898 | 2, 899 | ) 900 | self.assertEqual( 901 | len(sel.xpath("//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"})), 902 | 2, 903 | ) 904 | sel.remove_namespaces() 905 | self.assertEqual(len(sel.xpath("//link")), 2) 906 | self.assertEqual(len(sel.xpath("//stop")), 2) 907 | self.assertEqual(len(sel.xpath("./namespace::*")), 1) 908 | 909 | def test_remove_attributes_namespaces(self) -> None: 910 | xml = """<?xml version="1.0" encoding="UTF-8"?> 911 | <feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/"> 912 | <link atom:type="text/html"/> 913 | <entry> 914 | <link atom:type="text/html"/> 915 | </entry> 916 | <link atom:type="application/atom+xml"/> 917 | </feed> 918 | """ 919 | sel = self.sscls(text=xml, type="xml") 920 | self.assertEqual(len(sel.xpath("//link/@type")), 0) 921 | sel.remove_namespaces() 922 | self.assertEqual(len(sel.xpath("//link/@type")), 3) 923 | 924 | def test_smart_strings(self) -> None: 925 | """Lxml smart strings return values""" 926 | 927 | class SmartStringsSelector(Selector): 928 | _lxml_smart_strings = True 929 | 930 | body = """<body> 931 | <div class='one'> 932 | <ul> 933 | <li>one</li><li>two</li> 934 | </ul> 935 | </div> 936 | <div class='two'> 937 | <ul> 938 | <li>four</li><li>five</li><li>six</li> 939 | </ul> 940 | </div> 941 | </body>""" 942 | 943 | # .getparent() is available for text nodes and attributes 944 | # only when smart_strings are on 945 | x = self.sscls(text=body) 946 | li_text = x.xpath("//li/text()") 947 | self.assertFalse(any(hasattr(e.root, "getparent") for e in li_text)) 948 | div_class = x.xpath("//div/@class") 949 | self.assertFalse(any(hasattr(e.root, "getparent") for e in div_class)) 950 | 951 | smart_x = SmartStringsSelector(text=body) 952 | smart_li_text = smart_x.xpath("//li/text()") 953 | self.assertTrue(all(hasattr(e.root, "getparent") for e in smart_li_text)) 954 | smart_div_class = smart_x.xpath("//div/@class") 955 | self.assertTrue(all(hasattr(e.root, "getparent") for e in smart_div_class)) 956 | 957 | def test_xml_entity_expansion(self) -> None: 
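# XXE guard: the external entity must not be resolved, so the serialized result should keep &xxe; verbatim rather than embed the contents of /etc/passwd.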
958 | malicious_xml = ( 959 | '<?xml version="1.0" encoding="ISO-8859-1"?>' 960 | "<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM " 961 | '"file:///etc/passwd" >]><foo>&xxe;</foo>' 962 | ) 963 | 964 | sel = self.sscls(text=malicious_xml, type="xml") 965 | 966 | self.assertEqual(sel.extract(), "<foo>&xxe;</foo>") 967 | 968 | def test_configure_base_url(self) -> None: 969 | sel = self.sscls(text="nothing", base_url="http://example.com") 970 | self.assertEqual("http://example.com", sel.root.base) 971 | 972 | def test_extending_selector(self) -> None: 973 | class MySelectorList(SelectorList["MySelector"]): 974 | pass 975 | 976 | class MySelector(Selector): 977 | selectorlist_cls = MySelectorList 978 | 979 | def extra_method(self) -> str: 980 | return "extra" + cast("str", self.get()) 981 | 982 | sel = MySelector(text="<html><div>foo</div></html>") 983 | self.assertIsInstance(sel.xpath("//div"), MySelectorList) 984 | self.assertIsInstance(sel.xpath("//div")[0], MySelector) 985 | self.assertIsInstance(sel.css("div"), MySelectorList) 986 | self.assertIsInstance(sel.css("div")[0], MySelector) 987 | content: str = sel.css("div")[0].extra_method() 988 | self.assertEqual("extra<div>foo</div>", content) 989 | 990 | def test_replacement_null_char_from_body(self) -> None: 991 | text = "<html>\x00<body><p>Grainy</p></body></html>" 992 | self.assertEqual( 993 | "<html><body><p>Grainy</p></body></html>", 994 | self.sscls(text).extract(), 995 | ) 996 | 997 | def test_remove_selector_list(self) -> None: 998 | sel = self.sscls( 999 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1000 | ) 1001 | sel_list = sel.css("li") 1002 | sel_list.drop() 1003 | self.assertIsSelectorList(sel.css("li")) 1004 | self.assertEqual(sel.css("li"), []) 1005 | 1006 | def test_remove_selector(self) -> None: 1007 | sel = self.sscls( 1008 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1009 | ) 1010 | sel_list = sel.css("li") 1011 | sel_list[0].drop() 1012 | self.assertIsSelectorList(sel.css("li")) 1013 | self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) 1014 | 1015 | def test_remove_pseudo_element_selector_list(self) -> None: 1016 | sel = self.sscls( 1017 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1018 | ) 1019 | sel_list = sel.css("li::text") 1020 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1021 | with self.assertRaises(CannotRemoveElementWithoutRoot): 1022 | sel_list.drop() 1023 | 1024 | self.assertIsSelectorList(sel.css("li")) 1025 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1026 | 1027 | def test_remove_pseudo_element_selector(self) -> None: 1028 | sel = self.sscls( 1029 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1030 | ) 1031 | sel_list = sel.css("li::text") 1032 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1033 | with self.assertRaises(CannotRemoveElementWithoutRoot): 1034 | sel_list[0].drop() 1035 | 1036 | self.assertIsSelectorList(sel.css("li")) 1037 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1038 | 1039 | def test_remove_root_element_selector(self) -> None: 1040 | sel = self.sscls( 1041 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1042 | ) 1043 | sel_list = sel.css("li::text") 1044 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1045 | with self.assertRaises(CannotRemoveElementWithoutParent): 1046 | sel.drop() 1047 | 1048 | with self.assertRaises(CannotRemoveElementWithoutParent): 1049 | 
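# The root <html> element has no parent, so dropping it must raise rather than fail silently.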
sel.css("html").drop() 1050 | 1051 | self.assertIsSelectorList(sel.css("li")) 1052 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1053 | 1054 | sel.css("body").drop() 1055 | self.assertEqual(sel.get(), "<html></html>") 1056 | 1057 | def test_deep_nesting(self) -> None: 1058 | lxml_version = Version(etree.__version__) 1059 | lxml_huge_tree_version = Version("4.2") 1060 | 1061 | content = """ 1062 | <html> 1063 | <body> 1064 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1065 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1066 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1067 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1068 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1069 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1070 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1071 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1072 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1073 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1074 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1075 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1076 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1077 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1078 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1079 | <span><span><span><span><span><span><span><span><span><span><span><span> 1080 | hello world 1081 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1082 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1083 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1084 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1085 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1086 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1087 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1088 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1089 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1090 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1091 | 
</span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1092 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1093 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1094 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1095 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1096 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1097 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1098 | </span></span></span></span></span></span></span></span></span></span> 1099 | <table> 1100 | <tr><td>some test</td></tr> 1101 | </table> 1102 | </body> 1103 | </html> 1104 | """ 1105 | 1106 | # If lxml doesn't support huge trees expect wrong results and a warning 1107 | if lxml_version < lxml_huge_tree_version: 1108 | with warnings.catch_warnings(record=True) as w: 1109 | sel = Selector(text=content) 1110 | self.assertIn("huge_tree", str(w[0].message)) 1111 | self.assertLessEqual(len(sel.css("span")), 256) 1112 | self.assertEqual(len(sel.css("td")), 0) 1113 | return 1114 | 1115 | # Same goes for explicitly disabling huge trees 1116 | with warnings.catch_warnings(record=True) as w: 1117 | sel = Selector(text=content, huge_tree=False) 1118 | self.assertIn("huge_tree", str(w[0].message)) 1119 | self.assertLessEqual(len(sel.css("span")), 256) 1120 | self.assertEqual(len(sel.css("td")), 0) 1121 | 1122 | # If huge trees are enabled, elements with a depth > 255 should be found 1123 | sel = Selector(text=content) 1124 | nest_level = 282 1125 | self.assertEqual(len(sel.css("span")), nest_level) 1126 | self.assertEqual(len(sel.css("td")), 1) 1127 | 1128 | def test_invalid_type(self) -> None: 1129 | with self.assertRaises(ValueError): 1130 | self.sscls("", type="xhtml") 1131 | 1132 | def test_default_type(self) -> None: 1133 | text = "foo" 1134 | selector = self.sscls(text) 1135 | self.assertEqual(selector.type, "html") 1136 | 1137 | def test_json_type(self) -> None: 1138 | obj = 1 1139 | selector = self.sscls(str(obj), type="json") 1140 | self.assertEqual(selector.root, obj) 1141 | self.assertEqual(selector.type, "json") 1142 | 1143 | def test_html_root(self) -> None: 1144 | root = etree.fromstring("<html/>") 1145 | selector = self.sscls(root=root) 1146 | self.assertEqual(selector.root, root) 1147 | self.assertEqual(selector.type, "html") 1148 | 1149 | def test_json_root(self) -> None: 1150 | obj = 1 1151 | selector = self.sscls(root=obj) 1152 | self.assertEqual(selector.root, obj) 1153 | self.assertEqual(selector.type, "json") 1154 | 1155 | def test_json_xpath(self) -> None: 1156 | obj = 1 1157 | selector = self.sscls(root=obj) 1158 | with self.assertRaises(ValueError): 1159 | selector.xpath("//*") 1160 | 1161 | def test_json_css(self) -> None: 1162 | obj = 1 1163 | selector = self.sscls(root=obj) 1164 | with self.assertRaises(ValueError): 1165 | selector.css("*") 1166 | 1167 | def test_invalid_json(self) -> None: 1168 | text = "<html/>" 1169 | selector = self.sscls(text, type="json") 1170 | self.assertEqual(selector.root, None) 1171 | self.assertEqual(selector.type, "json") 1172 | 1173 | def test_text_and_root_warning(self) -> None: 1174 | with 
warnings.catch_warnings(record=True) as w: 1175 | Selector(text="a", root="b") 1176 | self.assertIn("both text and root", str(w[0].message)) 1177 | 1178 | def test_etree_root_invalid_type(self) -> None: 1179 | selector = Selector("<html></html>") 1180 | self.assertRaisesRegex( 1181 | ValueError, 1182 | "object as root", 1183 | Selector, 1184 | root=selector.root, 1185 | type="text", 1186 | ) 1187 | self.assertRaisesRegex( 1188 | ValueError, 1189 | "object as root", 1190 | Selector, 1191 | root=selector.root, 1192 | type="json", 1193 | ) 1194 | 1195 | def test_json_selector_representation(self) -> None: 1196 | selector = Selector(text="true") 1197 | assert repr(selector) == "<Selector query=None data='True'>" 1198 | assert str(selector) == "True" 1199 | selector = Selector(text="1") 1200 | assert repr(selector) == "<Selector query=None data='1'>" 1201 | assert str(selector) == "1" 1202 | 1203 | def test_body_bytearray_support(self) -> None: 1204 | selector = Selector(body=bytearray("<h1>Hello World</h1>", "utf-8")) 1205 | assert selector.xpath("//h1/text()").get() == "Hello World" 1206 | 1207 | 1208 | class ExsltTestCase(unittest.TestCase): 1209 | sscls = Selector 1210 | 1211 | def test_regexp(self) -> None: 1212 | """EXSLT regular expression tests""" 1213 | body = """ 1214 | <p><input name='a' value='1'/><input name='b' value='2'/></p> 1215 | <div class="links"> 1216 | <a href="/first.html">first link</a> 1217 | <a href="/second.html">second link</a> 1218 | <a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a> 1219 | </div> 1220 | """ 1221 | sel = self.sscls(text=body) 1222 | 1223 | # re:test() 1224 | self.assertEqual( 1225 | sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(), 1226 | [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')], 1227 | ) 1228 | self.assertEqual( 1229 | [x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')], 1230 | ["first link", "second link"], 1231 | ) 1232 | self.assertEqual( 1233 | [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')], 1234 | ["first link"], 1235 | ) 1236 | self.assertEqual( 1237 | [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')], 1238 | ["second link"], 1239 | ) 1240 | 1241 | # re:match() is rather special: it returns a node-set of <match> nodes 1242 | # ['<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>', 1243 | # '<match>http</match>', 1244 | # '<match>www.bayes.co.uk</match>', 1245 | # '<match></match>', 1246 | # '<match>/xml/index.xml?/xml/utils/rechecker.xml</match>'] 1247 | self.assertEqual( 1248 | sel.xpath( 1249 | r're:match(//a[re:test(@href, "\.xml$")]/@href,' 1250 | r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()' 1251 | ).extract(), 1252 | [ 1253 | "http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml", 1254 | "http", 1255 | "www.bayes.co.uk", 1256 | "", 1257 | "/xml/index.xml?/xml/utils/rechecker.xml", 1258 | ], 1259 | ) 1260 | 1261 | # re:replace() 1262 | self.assertEqual( 1263 | sel.xpath( 1264 | r're:replace(//a[re:test(@href, "\.xml$")]/@href,' 1265 | r'"(\w+)://(.+)(\.xml)", "","https://\2.html")' 1266 | ).extract(), 1267 | ["https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"], 1268 | ) 1269 | 1270 | def test_set(self) -> None: 1271 | """EXSLT set manipulation tests""" 1272 | # microdata example from http://schema.org/Event 1273 | body = """ 1274 | <div itemscope itemtype="http://schema.org/Event"> 1275 | <a itemprop="url" 
href="nba-miami-philidelphia-game3.html"> 1276 | NBA Eastern Conference First Round Playoff Tickets: 1277 | <span itemprop="name"> Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) </span> 1278 | </a> 1279 | 1280 | <meta itemprop="startDate" content="2016-04-21T20:00"> 1281 | Thu, 04/21/16 1282 | 8:00 p.m. 1283 | 1284 | <div itemprop="location" itemscope itemtype="http://schema.org/Place"> 1285 | <a itemprop="url" href="wells-fargo-center.html"> 1286 | Wells Fargo Center 1287 | </a> 1288 | <div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"> 1289 | <span itemprop="addressLocality">Philadelphia</span>, 1290 | <span itemprop="addressRegion">PA</span> 1291 | </div> 1292 | </div> 1293 | 1294 | <div itemprop="offers" itemscope itemtype="http://schema.org/AggregateOffer"> 1295 | Priced from: <span itemprop="lowPrice">$35</span> 1296 | <span itemprop="offerCount">1938</span> tickets left 1297 | </div> 1298 | </div> 1299 | """ 1300 | sel = self.sscls(text=body) 1301 | 1302 | self.assertEqual( 1303 | sel.xpath( 1304 | """//div[@itemtype="http://schema.org/Event"] 1305 | //@itemprop""" 1306 | ).extract(), 1307 | [ 1308 | "url", 1309 | "name", 1310 | "startDate", 1311 | "location", 1312 | "url", 1313 | "address", 1314 | "addressLocality", 1315 | "addressRegion", 1316 | "offers", 1317 | "lowPrice", 1318 | "offerCount", 1319 | ], 1320 | ) 1321 | 1322 | self.assertEqual( 1323 | sel.xpath( 1324 | """ 1325 | set:difference(//div[@itemtype="http://schema.org/Event"] 1326 | //@itemprop, 1327 | //div[@itemtype="http://schema.org/Event"] 1328 | //*[@itemscope]/*/@itemprop)""" 1329 | ).extract(), 1330 | ["url", "name", "startDate", "location", "offers"], 1331 | ) 1332 | 1333 | def test_dont_remove_text_after_deleted_element(self) -> None: 1334 | sel = self.sscls( 1335 | text="""<html><body>Text before.<span>Text in.</span> Text after.</body></html> 1336 | """ 1337 | ) 1338 | sel.css("span").drop() 1339 | self.assertEqual( 1340 | sel.get(), "<html><body>Text before. 
Text after.</body></html>" 1341 | ) 1342 | 1343 | def test_drop_with_xml_type(self) -> None: 1344 | sel = self.sscls(text="<a><b></b><c/></a>", type="xml") 1345 | el = sel.xpath("//b")[0] 1346 | assert el.root.getparent() is not None 1347 | el.drop() 1348 | assert sel.get() == "<a><c/></a>" 1349 | 1350 | 1351 | class SelectorBytesInput(Selector): 1352 | def __init__( 1353 | self, 1354 | text: str | None = None, 1355 | type: str | None = None, 1356 | body: bytes = b"", 1357 | encoding: str = "utf-8", 1358 | namespaces: Mapping[str, str] | None = None, 1359 | root: Any | None = _NOT_SET, 1360 | base_url: str | None = None, 1361 | _expr: str | None = None, 1362 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 1363 | ) -> None: 1364 | if text: 1365 | body = bytes(text, encoding=encoding) 1366 | text = None 1367 | super().__init__( 1368 | text=text, 1369 | type=type, 1370 | body=body, 1371 | encoding=encoding, 1372 | namespaces=namespaces, 1373 | root=root, 1374 | base_url=base_url, 1375 | _expr=_expr, 1376 | huge_tree=huge_tree, 1377 | ) 1378 | 1379 | 1380 | class SelectorTestCaseBytes(SelectorTestCase): 1381 | sscls = SelectorBytesInput 1382 | 1383 | def test_representation_slice(self) -> None: 1384 | pass 1385 | 1386 | def test_representation_unicode_query(self) -> None: 1387 | pass 1388 | 1389 | def test_weakref_slots(self) -> None: 1390 | pass 1391 | 1392 | def test_check_text_argument_type(self) -> None: 1393 | self.assertRaisesRegex( 1394 | TypeError, 1395 | "body argument should be of type", 1396 | self.sscls, 1397 | body="<html/>", 1398 | ) 1399 | 1400 | 1401 | class ExsltTestCaseBytes(ExsltTestCase): 1402 | sscls = SelectorBytesInput 1403 | -------------------------------------------------------------------------------- /tests/test_selector_csstranslator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selector tests for cssselect backend 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import unittest 8 | from typing import Any, Callable, Protocol 9 | 10 | import cssselect 11 | import pytest 12 | from cssselect.parser import SelectorSyntaxError 13 | from cssselect.xpath import ExpressionError 14 | from packaging.version import Version 15 | 16 | from parsel import Selector 17 | from parsel.csstranslator import GenericTranslator, HTMLTranslator, TranslatorProtocol 18 | 19 | HTMLBODY = """ 20 | <html> 21 | <body> 22 | <div> 23 | <a id="name-anchor" name="foo"></a> 24 | <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a> 25 | <a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a> 26 | <p id="paragraph"> 27 | lorem ipsum text 28 | <b id="p-b">hi</b> <em id="p-em">there</em> 29 | <b id="p-b2">guy</b> 30 | <input type="checkbox" id="checkbox-unchecked" /> 31 | <input type="checkbox" id="checkbox-disabled" disabled="" /> 32 | <input type="text" id="text-checked" checked="checked" /> 33 | <input type="hidden" /> 34 | <input type="hidden" disabled="disabled" /> 35 | <input type="checkbox" id="checkbox-checked" checked="checked" /> 36 | <input type="checkbox" id="checkbox-disabled-checked" 37 | disabled="disabled" checked="checked" /> 38 | <fieldset id="fieldset" disabled="disabled"> 39 | <input type="checkbox" id="checkbox-fieldset-disabled" /> 40 | <input type="hidden" /> 41 | </fieldset> 42 | </p> 43 | <map name="dummymap"> 44 | <area shape="circle" coords="200,250,25" href="foo.html" id="area-href" /> 45 | <area shape="default" id="area-nohref" /> 46 | </map> 47 | </div> 48 | <div class="cool-footer" 
id="foobar-div" foobar="ab bc cde"> 49 | <span id="foobar-span">foo ter</span> 50 | </div> 51 | </body></html> 52 | """ 53 | 54 | 55 | class TranslatorTestProtocol(Protocol): 56 | tr_cls: type[TranslatorProtocol] 57 | tr: TranslatorProtocol 58 | 59 | def c2x(self, css: str, prefix: str = ...) -> str: 60 | pass 61 | 62 | def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None: 63 | pass 64 | 65 | def assertRaises( 66 | self, 67 | expected_exception: type[BaseException] | tuple[type[BaseException], ...], 68 | callable: Callable[..., object], 69 | *args: Any, 70 | **kwargs: Any, 71 | ) -> None: 72 | pass 73 | 74 | 75 | class TranslatorTestMixin: 76 | def setUp(self: TranslatorTestProtocol) -> None: 77 | self.tr = self.tr_cls() 78 | self.c2x = self.tr.css_to_xpath 79 | 80 | def test_attr_function(self: TranslatorTestProtocol) -> None: 81 | cases = [ 82 | ("::attr(name)", "descendant-or-self::*/@name"), 83 | ("a::attr(href)", "descendant-or-self::a/@href"), 84 | ( 85 | "a ::attr(img)", 86 | "descendant-or-self::a/descendant-or-self::*/@img", 87 | ), 88 | ("a > ::attr(class)", "descendant-or-self::a/*/@class"), 89 | ] 90 | for css, xpath in cases: 91 | self.assertEqual(self.c2x(css), xpath, css) 92 | 93 | def test_attr_function_exception(self: TranslatorTestProtocol) -> None: 94 | cases = [ 95 | ("::attr(12)", ExpressionError), 96 | ("::attr(34test)", ExpressionError), 97 | ("::attr(@href)", SelectorSyntaxError), 98 | ] 99 | for css, exc in cases: 100 | self.assertRaises(exc, self.c2x, css) 101 | 102 | def test_text_pseudo_element(self: TranslatorTestProtocol) -> None: 103 | cases = [ 104 | ("::text", "descendant-or-self::text()"), 105 | ("p::text", "descendant-or-self::p/text()"), 106 | ("p ::text", "descendant-or-self::p/descendant-or-self::text()"), 107 | ("#id::text", "descendant-or-self::*[@id = 'id']/text()"), 108 | ("p#id::text", "descendant-or-self::p[@id = 'id']/text()"), 109 | ( 110 | "p#id ::text", 111 | "descendant-or-self::p[@id = 'id']/descendant-or-self::text()", 112 | ), 113 | ("p#id > ::text", "descendant-or-self::p[@id = 'id']/*/text()"), 114 | ( 115 | "p#id ~ ::text", 116 | "descendant-or-self::p[@id = 'id']/following-sibling::*/text()", 117 | ), 118 | ("a[href]::text", "descendant-or-self::a[@href]/text()"), 119 | ( 120 | "a[href] ::text", 121 | "descendant-or-self::a[@href]/descendant-or-self::text()", 122 | ), 123 | ( 124 | "p::text, a::text", 125 | "descendant-or-self::p/text() | descendant-or-self::a/text()", 126 | ), 127 | ] 128 | for css, xpath in cases: 129 | self.assertEqual(self.c2x(css), xpath, css) 130 | 131 | def test_pseudo_function_exception(self: TranslatorTestProtocol) -> None: 132 | cases = [ 133 | ("::attribute(12)", ExpressionError), 134 | ("::text()", ExpressionError), 135 | ("::attr(@href)", SelectorSyntaxError), 136 | ] 137 | for css, exc in cases: 138 | self.assertRaises(exc, self.c2x, css) 139 | 140 | def test_unknown_pseudo_element(self: TranslatorTestProtocol) -> None: 141 | cases = [ 142 | ("::text-node", ExpressionError), 143 | ] 144 | for css, exc in cases: 145 | self.assertRaises(exc, self.c2x, css) 146 | 147 | def test_unknown_pseudo_class(self: TranslatorTestProtocol) -> None: 148 | cases = [ 149 | (":text", ExpressionError), 150 | (":attribute(name)", ExpressionError), 151 | ] 152 | for css, exc in cases: 153 | self.assertRaises(exc, self.c2x, css) 154 | 155 | 156 | class HTMLTranslatorTest(TranslatorTestMixin, unittest.TestCase): 157 | tr_cls = HTMLTranslator 158 | 159 | 160 | class GenericTranslatorTest(TranslatorTestMixin, 
unittest.TestCase): 161 | tr_cls = GenericTranslator 162 | 163 | 164 | class UtilCss2XPathTest(unittest.TestCase): 165 | def test_css2xpath(self) -> None: 166 | from parsel import css2xpath 167 | 168 | expected_xpath = ( 169 | "descendant-or-self::*[@class and contains(" 170 | "concat(' ', normalize-space(@class), ' '), ' some-class ')]" 171 | ) 172 | self.assertEqual(css2xpath(".some-class"), expected_xpath) 173 | 174 | 175 | class CSSSelectorTest(unittest.TestCase): 176 | sscls = Selector 177 | 178 | def setUp(self) -> None: 179 | self.sel = self.sscls(text=HTMLBODY) 180 | 181 | def x(self, *a: Any, **kw: Any) -> list[str]: 182 | return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()] 183 | 184 | def test_selector_simple(self) -> None: 185 | for x in self.sel.css("input"): 186 | self.assertTrue(isinstance(x, self.sel.__class__), x) 187 | self.assertEqual( 188 | self.sel.css("input").extract(), 189 | [x.extract() for x in self.sel.css("input")], 190 | ) 191 | 192 | def test_text_pseudo_element(self) -> None: 193 | self.assertEqual(self.x("#p-b2"), ['<b id="p-b2">guy</b>']) 194 | self.assertEqual(self.x("#p-b2::text"), ["guy"]) 195 | self.assertEqual(self.x("#p-b2 ::text"), ["guy"]) 196 | self.assertEqual(self.x("#paragraph::text"), ["lorem ipsum text"]) 197 | self.assertEqual( 198 | self.x("#paragraph ::text"), 199 | ["lorem ipsum text", "hi", "there", "guy"], 200 | ) 201 | self.assertEqual(self.x("p::text"), ["lorem ipsum text"]) 202 | self.assertEqual(self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"]) 203 | 204 | def test_attribute_function(self) -> None: 205 | self.assertEqual(self.x("#p-b2::attr(id)"), ["p-b2"]) 206 | self.assertEqual(self.x(".cool-footer::attr(class)"), ["cool-footer"]) 207 | self.assertEqual( 208 | self.x(".cool-footer ::attr(id)"), ["foobar-div", "foobar-span"] 209 | ) 210 | self.assertEqual( 211 | self.x('map[name="dummymap"] ::attr(shape)'), ["circle", "default"] 212 | ) 213 | 214 | def test_nested_selector(self) -> None: 215 | self.assertEqual(self.sel.css("p").css("b::text").extract(), ["hi", "guy"]) 216 | self.assertEqual( 217 | self.sel.css("div").css("area:last-child").extract(), 218 | ['<area shape="default" id="area-nohref">'], 219 | ) 220 | 221 | @pytest.mark.xfail( 222 | Version(cssselect.__version__) < Version("1.2.0"), 223 | reason="Support added in cssselect 1.2.0", 224 | ) 225 | def test_pseudoclass_has(self) -> None: 226 | self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"]) 227 | 228 | 229 | class CSSSelectorTestBytes(CSSSelectorTest): 230 | def setUp(self) -> None: 231 | self.sel = self.sscls(body=bytes(HTMLBODY, encoding="utf-8")) 232 | -------------------------------------------------------------------------------- /tests/test_selector_jmespath.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | 5 | from parsel import Selector 6 | from parsel.selector import _NOT_SET 7 | 8 | 9 | class JMESPathTestCase(unittest.TestCase): 10 | def test_json_has_html(self) -> None: 11 | """Sometimes the information is returned in a json wrapper""" 12 | data = """ 13 | { 14 | "content": [ 15 | { 16 | "name": "A", 17 | "value": "a" 18 | }, 19 | { 20 | "name": { 21 | "age": 18 22 | }, 23 | "value": "b" 24 | }, 25 | { 26 | "name": "C", 27 | "value": "c" 28 | }, 29 | { 30 | "name": "<a>D</a>", 31 | "value": "<div>d</div>" 32 | } 33 | ], 34 | "html": "<div><a>a<br>b</a>c</div><div><a>d</a>e<b>f</b></div>" 35 | } 36 | """ 37 | sel 
= Selector(text=data) 38 | self.assertEqual( 39 | sel.jmespath("html").get(), 40 | "<div><a>a<br>b</a>c</div><div><a>d</a>e<b>f</b></div>", 41 | ) 42 | self.assertEqual( 43 | sel.jmespath("html").xpath("//div/a/text()").getall(), 44 | ["a", "b", "d"], 45 | ) 46 | self.assertEqual(sel.jmespath("html").css("div > b").getall(), ["<b>f</b>"]) 47 | self.assertEqual(sel.jmespath("content").jmespath("name.age").get(), 18) 48 | 49 | def test_html_has_json(self) -> None: 50 | html_text = """ 51 | <div> 52 | <h1>Information</h1> 53 | <content> 54 | { 55 | "user": [ 56 | { 57 | "name": "A", 58 | "age": 18 59 | }, 60 | { 61 | "name": "B", 62 | "age": 32 63 | }, 64 | { 65 | "name": "C", 66 | "age": 22 67 | }, 68 | { 69 | "name": "D", 70 | "age": 25 71 | } 72 | ], 73 | "total": 4, 74 | "status": "ok" 75 | } 76 | </content> 77 | </div> 78 | """ 79 | sel = Selector(text=html_text) 80 | self.assertEqual( 81 | sel.xpath("//div/content/text()").jmespath("user[*].name").getall(), 82 | ["A", "B", "C", "D"], 83 | ) 84 | self.assertEqual( 85 | sel.xpath("//div/content").jmespath("user[*].name").getall(), 86 | ["A", "B", "C", "D"], 87 | ) 88 | self.assertEqual(sel.xpath("//div/content").jmespath("total").get(), 4) 89 | 90 | def test_jmestpath_with_re(self) -> None: 91 | html_text = """ 92 | <div> 93 | <h1>Information</h1> 94 | <content> 95 | { 96 | "user": [ 97 | { 98 | "name": "A", 99 | "age": 18 100 | }, 101 | { 102 | "name": "B", 103 | "age": 32 104 | }, 105 | { 106 | "name": "C", 107 | "age": 22 108 | }, 109 | { 110 | "name": "D", 111 | "age": 25 112 | } 113 | ], 114 | "total": 4, 115 | "status": "ok" 116 | } 117 | </content> 118 | </div> 119 | """ 120 | sel = Selector(text=html_text) 121 | self.assertEqual( 122 | sel.xpath("//div/content/text()").jmespath("user[*].name").re(r"(\w+)"), 123 | ["A", "B", "C", "D"], 124 | ) 125 | self.assertEqual( 126 | sel.xpath("//div/content").jmespath("user[*].name").re(r"(\w+)"), 127 | ["A", "B", "C", "D"], 128 | ) 129 | 130 | with self.assertRaises(TypeError): 131 | sel.xpath("//div/content").jmespath("user[*].age").re(r"(\d+)") 132 | 133 | self.assertEqual( 134 | sel.xpath("//div/content").jmespath("unavailable").re(r"(\d+)"), [] 135 | ) 136 | 137 | self.assertEqual( 138 | sel.xpath("//div/content").jmespath("unavailable").re_first(r"(\d+)"), 139 | None, 140 | ) 141 | 142 | self.assertEqual( 143 | sel.xpath("//div/content") 144 | .jmespath("user[*].age.to_string(@)") 145 | .re(r"(\d+)"), 146 | ["18", "32", "22", "25"], 147 | ) 148 | 149 | def test_json_types(self) -> None: 150 | for text, root in ( 151 | ("{}", {}), 152 | ('{"a": "b"}', {"a": "b"}), 153 | ("[]", []), 154 | ('["a"]', ["a"]), 155 | ('""', ""), 156 | ("0", 0), 157 | ("1", 1), 158 | ("true", True), 159 | ("false", False), 160 | ("null", None), 161 | ): 162 | selector = Selector(text=text, root=_NOT_SET) 163 | self.assertEqual(selector.type, "json") 164 | self.assertEqual(selector._text, text) 165 | self.assertEqual(selector.root, root) 166 | 167 | selector = Selector(text=None, root=root) 168 | self.assertEqual(selector.type, "json") 169 | self.assertEqual(selector._text, None) 170 | self.assertEqual(selector.root, root) 171 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pytest import mark, raises 6 | 7 | from parsel.utils import extract_regex, shorten 8 | 9 | if TYPE_CHECKING: 10 
| from re import Pattern 11 | 12 | 13 | @mark.parametrize( 14 | "width,expected", 15 | ( 16 | (-1, ValueError), 17 | (0, ""), 18 | (1, "."), 19 | (2, ".."), 20 | (3, "..."), 21 | (4, "f..."), 22 | (5, "fo..."), 23 | (6, "foobar"), 24 | (7, "foobar"), 25 | ), 26 | ) 27 | def test_shorten(width: int, expected: str | type[Exception]) -> None: 28 | if isinstance(expected, str): 29 | assert shorten("foobar", width) == expected 30 | else: 31 | with raises(expected): 32 | shorten("foobar", width) 33 | 34 | 35 | @mark.parametrize( 36 | "regex, text, replace_entities, expected", 37 | ( 38 | [ 39 | r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 40 | "October 25, 2019", 41 | True, 42 | ["October", "25", "2019"], 43 | ], 44 | [ 45 | r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 46 | "October 25 2019", 47 | True, 48 | ["October", "25", "2019"], 49 | ], 50 | [ 51 | r"(?P<extract>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 52 | "October 25 2019", 53 | True, 54 | ["October"], 55 | ], 56 | [ 57 | r"\w+\s*\d+\s*\,?\s*\d+", 58 | "October 25 2019", 59 | True, 60 | ["October 25 2019"], 61 | ], 62 | [ 63 | r"^.*$", 64 | "&quot;sometext&quot; &amp; &quot;moretext&quot;", 65 | True, 66 | ['"sometext" &amp; "moretext"'], 67 | ], 68 | [ 69 | r"^.*$", 70 | "&quot;sometext&quot; &amp; &quot;moretext&quot;", 71 | False, 72 | ["&quot;sometext&quot; &amp; &quot;moretext&quot;"], 73 | ], 74 | ), 75 | ) 76 | def test_extract_regex( 77 | regex: str | Pattern[str], 78 | text: str, 79 | replace_entities: bool, 80 | expected: list[str], 81 | ) -> None: 82 | assert extract_regex(regex, text, replace_entities) == expected 83 | -------------------------------------------------------------------------------- /tests/test_xml_attacks.py: -------------------------------------------------------------------------------- 1 | """Tests for known XML attacks""" 2 | 3 | from pathlib import Path 4 | from unittest import TestCase 5 | 6 | from psutil import Process 7 | 8 | from parsel import Selector 9 | 10 | MiB_1 = 1024**2 11 | 12 | 13 | def _load(attack: str) -> str: 14 | folder_path = Path(__file__).parent 15 | file_path = folder_path / "xml_attacks" / f"{attack}.xml" 16 | return file_path.read_bytes().decode("utf-8") 17 | 18 | 19 | # List of known attacks: 20 | # https://github.com/tiran/defusedxml#python-xml-libraries 21 | class XMLAttackTestCase(TestCase): 22 | def test_billion_laughs(self) -> None: 23 | process = Process() 24 | memory_usage_before = process.memory_info().rss 25 | selector = Selector(text=_load("billion_laughs")) 26 | lolz = selector.css("lolz::text").get() 27 | memory_usage_after = process.memory_info().rss 28 | memory_change = memory_usage_after - memory_usage_before 29 | assert_message = f"Memory change: {memory_change}B" 30 | assert memory_change <= MiB_1, assert_message 31 | assert lolz == "&lol9;" 32 | -------------------------------------------------------------------------------- /tests/test_xpathfuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | from typing import Any 5 | 6 | from parsel import Selector 7 | from parsel.xpathfuncs import set_xpathfunc 8 | 9 | 10 | class XPathFuncsTestCase(unittest.TestCase): 11 | def test_has_class_simple(self) -> None: 12 | body = """ 13 | <p class="foo bar-baz">First</p> 14 | <p class="foo">Second</p> 15 | <p class="bar">Third</p> 16 | <p>Fourth</p> 17 | """ 18 | sel = Selector(text=body) 19 | self.assertEqual( 20 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 21 | ["First", "Second"], 22 | ) 23 | 
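# A single has-class() argument matches any element whose class attribute contains that class; passing several arguments requires all of them on the same element, which is why ("foo", "bar") below matches nothing while ("foo", "bar-baz") matches only "First".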
self.assertEqual( 24 | [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')], 25 | ["Third"], 26 | ) 27 | self.assertEqual( 28 | [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')], 29 | [], 30 | ) 31 | self.assertEqual( 32 | [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')], 33 | ["First"], 34 | ) 35 | 36 | def test_has_class_error_no_args(self) -> None: 37 | body = """ 38 | <p CLASS="foo">First</p> 39 | """ 40 | sel = Selector(text=body) 41 | self.assertRaisesRegex( 42 | ValueError, 43 | "has-class must have at least 1 argument", 44 | sel.xpath, 45 | "has-class()", 46 | ) 47 | 48 | def test_has_class_error_invalid_arg_type(self) -> None: 49 | body = """ 50 | <p CLASS="foo">First</p> 51 | """ 52 | sel = Selector(text=body) 53 | self.assertRaisesRegex( 54 | ValueError, 55 | "has-class arguments must be strings", 56 | sel.xpath, 57 | "has-class(.)", 58 | ) 59 | 60 | def test_has_class_error_invalid_unicode(self) -> None: 61 | body = """ 62 | <p CLASS="foo">First</p> 63 | """ 64 | sel = Selector(text=body) 65 | self.assertRaisesRegex( 66 | ValueError, 67 | "All strings must be XML compatible", 68 | sel.xpath, 69 | 'has-class("héllö")'.encode(), 70 | ) 71 | 72 | def test_has_class_unicode(self) -> None: 73 | body = """ 74 | <p CLASS="fóó">First</p> 75 | """ 76 | sel = Selector(text=body) 77 | self.assertEqual( 78 | [x.extract() for x in sel.xpath('//p[has-class("fóó")]/text()')], 79 | ["First"], 80 | ) 81 | 82 | def test_has_class_uppercase(self) -> None: 83 | body = """ 84 | <p CLASS="foo">First</p> 85 | """ 86 | sel = Selector(text=body) 87 | self.assertEqual( 88 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 89 | ["First"], 90 | ) 91 | 92 | def test_has_class_newline(self) -> None: 93 | body = """ 94 | <p CLASS="foo 95 | bar">First</p> 96 | """ 97 | sel = Selector(text=body) 98 | self.assertEqual( 99 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 100 | ["First"], 101 | ) 102 | 103 | def test_has_class_tab(self) -> None: 104 | body = """ 105 | <p CLASS="foo\tbar">First</p> 106 | """ 107 | sel = Selector(text=body) 108 | self.assertEqual( 109 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 110 | ["First"], 111 | ) 112 | 113 | def test_set_xpathfunc(self) -> None: 114 | def myfunc(ctx: Any) -> None: 115 | myfunc.call_count += 1 # type: ignore[attr-defined] 116 | 117 | myfunc.call_count = 0 # type: ignore[attr-defined] 118 | 119 | body = """ 120 | <p CLASS="foo">First</p> 121 | """ 122 | sel = Selector(text=body) 123 | self.assertRaisesRegex( 124 | ValueError, 125 | "Unregistered function in myfunc", 126 | sel.xpath, 127 | "myfunc()", 128 | ) 129 | 130 | set_xpathfunc("myfunc", myfunc) 131 | sel.xpath("myfunc()") 132 | self.assertEqual(myfunc.call_count, 1) # type: ignore[attr-defined] 133 | 134 | set_xpathfunc("myfunc", None) 135 | self.assertRaisesRegex( 136 | ValueError, 137 | "Unregistered function in myfunc", 138 | sel.xpath, 139 | "myfunc()", 140 | ) 141 | -------------------------------------------------------------------------------- /tests/typing/selector.py: -------------------------------------------------------------------------------- 1 | # Basic usage of the Selector, strongly typed to test the typing of parsel's API. 
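# These functions are never executed; they only have to satisfy "mypy --strict" (see the typing env in tox.ini). In incorrect() below, each "# type: ignore[...]" also acts as a negative assertion: strict mode enables --warn-unused-ignores, so mypy complains if an ignored line ever stops being a type error.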
2 | from __future__ import annotations 3 | 4 | import re 5 | 6 | from parsel import Selector 7 | 8 | 9 | def correct() -> None: 10 | selector = Selector( 11 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 12 | ) 13 | 14 | li_values: list[str] = selector.css("li").getall() 15 | selector.re_first(re.compile(r"[32]"), "").strip() 16 | xpath_values: list[str] = selector.xpath( 17 | "//somens:a/text()", namespaces={"somens": "http://scrapy.org"} 18 | ).extract() 19 | 20 | class MySelector(Selector): 21 | def my_own_func(self) -> int: 22 | return 3 23 | 24 | my_selector = MySelector() 25 | res: int = my_selector.my_own_func() 26 | sub_res: int = my_selector.xpath("//somens:a/text()")[0].my_own_func() 27 | 28 | 29 | # Negative checks: all the code lines below have typing errors. 30 | # the "# type: ignore" comment makes sure that mypy identifies them as errors. 31 | 32 | 33 | def incorrect() -> None: 34 | selector = Selector( 35 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 36 | ) 37 | 38 | # Wrong query type in css. 39 | selector.css(5).getall() # type: ignore[arg-type] 40 | 41 | # Cannot assign a list of str to an int. 42 | li_values: int = selector.css("li").getall() # type: ignore[assignment] 43 | 44 | # Cannot use a string to define namespaces in xpath. 45 | selector.xpath( 46 | "//somens:a/text()", 47 | namespaces='{"somens": "http://scrapy.org"}', # type: ignore[arg-type] 48 | ).extract() 49 | 50 | # Typo in the extract method name. 51 | selector.css("li").extact() # type: ignore[attr-defined] 52 | 53 | class MySelector(Selector): 54 | def my_own_func(self) -> int: 55 | return 3 56 | 57 | my_selector = MySelector() 58 | res: str = my_selector.my_own_func() # type: ignore[assignment] 59 | sub_res: str = my_selector.xpath("//somens:a/text()")[0].my_own_func() # type: ignore[assignment] 60 | -------------------------------------------------------------------------------- /tests/xml_attacks/billion_laughs.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!DOCTYPE lolz [ 3 | <!ENTITY lol "lol"> 4 | <!ELEMENT lolz (#PCDATA)> 5 | <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;"> 6 | <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;"> 7 | <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;"> 8 | <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;"> 9 | <!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;"> 10 | <!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;"> 11 | <!ENTITY lol7 "&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;"> 12 | <!ENTITY lol8 "&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;"> 13 | <!ENTITY lol9 "&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;"> 14 | ]> 15 | <lolz>&lol9;</lolz> 16 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = typing,pylint,docs,twinecheck,pre-commit,py39,py310,py311,py312,py313,pypy3.10 3 | 4 | [testenv] 5 | usedevelop = True 6 | deps = 7 | -r{toxinidir}/tests/requirements.txt 8 | commands = py.test --cov=parsel --cov-report=xml {posargs:docs parsel tests} 9 | 10 | [testenv:typing] 11 | deps = 12 | {[testenv]deps} 13 | types-jmespath==1.0.2.20240106 14 | types-lxml==2025.3.4 15 | types-psutil==7.0.0.20250218 16 
| py==1.11.0 17 | mypy==1.15.0 18 | commands = 19 | mypy {posargs:parsel tests} --strict 20 | 21 | [testenv:pylint] 22 | deps = 23 | {[testenv]deps} 24 | pylint==3.3.6 25 | commands = 26 | pylint docs parsel tests setup.py 27 | 28 | [docs] 29 | changedir = docs 30 | deps = -rdocs/requirements.txt 31 | 32 | [testenv:docs] 33 | changedir = {[docs]changedir} 34 | deps = {[docs]deps} 35 | # No -W in LaTeX, because ReadTheDocs does not use it either, and there are 36 | # image conversion warnings that cannot be addressed in ReadTheDocs 37 | commands = 38 | sphinx-build -W -b html . {envtmpdir}/html 39 | sphinx-build -b latex . {envtmpdir}/latex 40 | sphinx-build -b epub . {envtmpdir}/epub 41 | 42 | [testenv:twinecheck] 43 | basepython = python3 44 | deps = 45 | twine==6.1.0 46 | build==1.2.2.post1 47 | commands = 48 | python -m build --sdist 49 | twine check dist/* 50 | 51 | [testenv:pre-commit] 52 | deps = pre-commit 53 | commands = pre-commit run --all-files --show-diff-on-failure 54 | skip_install = true 55 | --------------------------------------------------------------------------------