├── .git-blame-ignore-revs
├── .github
│   └── workflows
│       ├── checks.yml
│       ├── publish.yml
│       ├── tests-macos.yml
│       ├── tests-ubuntu.yml
│       └── tests-windows.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── LICENSE
├── MANIFEST.in
├── NEWS
├── README.rst
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── multiroot.html
│   │   ├── python-insider.xml
│   │   └── selectors-sample1.html
│   ├── conf.py
│   ├── conftest.py
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── parsel.rst
│   ├── requirements.txt
│   └── usage.rst
├── parsel
│   ├── __init__.py
│   ├── csstranslator.py
│   ├── py.typed
│   ├── selector.py
│   ├── utils.py
│   └── xpathfuncs.py
├── pyproject.toml
├── release.rst
├── setup.py
├── tests
│   ├── requirements.txt
│   ├── test_selector.py
│   ├── test_selector_csstranslator.py
│   ├── test_selector_jmespath.py
│   ├── test_utils.py
│   ├── test_xml_attacks.py
│   ├── test_xpathfuncs.py
│   ├── typing
│   │   └── selector.py
│   └── xml_attacks
│       └── billion_laughs.xml
└── tox.ini
/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # applying pre-commit hooks to the project 2 | a57c23e3b7be0f001595bd8767fe05e40a66e730 --------------------------------------------------------------------------------
/.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | checks: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | include: 11 | - python-version: "3.13" 12 | env: 13 | TOXENV: pre-commit 14 | - python-version: "3.13" 15 | env: 16 | TOXENV: pylint 17 | - python-version: "3.13" # Keep in sync with .readthedocs.yml 18 | env: 19 | TOXENV: docs 20 | - python-version: "3.13" 21 | env: 22 | TOXENV: typing 23 | - python-version: "3.13" 24 | env: 25 | TOXENV: twinecheck 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | 35 | - name: Run check 36 | env: ${{ matrix.env }} 37 | run: | 38 | pip install -U tox 39 | tox 40 | --------------------------------------------------------------------------------
/.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: [push] 3 | 4 | jobs: 5 | publish: 6 | runs-on: ubuntu-latest 7 | if: startsWith(github.event.ref, 'refs/tags/') 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | 12 | - name: Set up Python 3.13 13 | uses: actions/setup-python@v5 14 | with: 15 | python-version: "3.13" 16 | 17 | - name: Check Tag 18 | id: check-release-tag 19 | run: | 20 | if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|[.]dev[0-9]+)?$ ]]; then 21 | echo ::set-output name=release_tag::true 22 | fi 23 | 24 | - name: Publish to PyPI 25 | if: steps.check-release-tag.outputs.release_tag == 'true' 26 | run: | 27 | pip install --upgrade setuptools wheel twine 28 | python setup.py sdist bdist_wheel 29 | export TWINE_USERNAME=__token__ 30 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 31 | twine upload dist/* 32 | --------------------------------------------------------------------------------
/.github/workflows/tests-macos.yml: -------------------------------------------------------------------------------- 1 | name: macOS 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: macos-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version:
["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U tox 23 | tox -e py 24 | 25 | - name: Upload coverage report 26 | uses: codecov/codecov-action@v5 27 | -------------------------------------------------------------------------------- /.github/workflows/tests-ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | include: 11 | - python-version: "3.9" 12 | env: 13 | TOXENV: py 14 | - python-version: "3.10" 15 | env: 16 | TOXENV: py 17 | - python-version: "3.11" 18 | env: 19 | TOXENV: py 20 | - python-version: "3.12" 21 | env: 22 | TOXENV: py 23 | - python-version: "3.13" 24 | env: 25 | TOXENV: py 26 | - python-version: pypy3.10 27 | env: 28 | TOXENV: pypy3 29 | - python-version: pypy3.11 30 | env: 31 | TOXENV: pypy3 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - name: Install system libraries 37 | if: contains(matrix.python-version, 'pypy') 38 | run: | 39 | sudo apt-get update 40 | sudo apt-get install libxml2-dev libxslt-dev 41 | 42 | - name: Set up Python ${{ matrix.python-version }} 43 | uses: actions/setup-python@v5 44 | with: 45 | python-version: ${{ matrix.python-version }} 46 | 47 | - name: Run tests 48 | env: ${{ matrix.env }} 49 | run: | 50 | pip install -U tox 51 | tox 52 | 53 | - name: Upload coverage report 54 | uses: codecov/codecov-action@v5 55 | -------------------------------------------------------------------------------- /.github/workflows/tests-windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: windows-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U tox 23 | tox -e py 24 | 25 | - name: Upload coverage report 26 | uses: codecov/codecov-action@v5 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.eggs 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | /coverage.xml 28 | .tox 29 | nosetests.xml 30 | htmlcov 31 | .pytest_cache 32 | 33 | # Translations 34 | *.mo 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | 41 | # Complexity 42 | output/*.html 43 | output/*/index.html 44 | 45 | # Sphinx 46 | docs/_build 47 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: 
https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.2 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | build: 7 | os: ubuntu-24.04 8 | tools: 9 | # For available versions, see: 10 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 11 | python: "3.13" # Keep in sync with .github/workflows/checks.yml 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | - path: . 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Scrapy developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of Scrapy nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include NEWS 2 | include LICENSE 3 | include README.rst 4 | include parsel/py.typed 5 | 6 | recursive-include tests * 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | 10 | recursive-include docs *.rst conf.py Makefile make.bat 11 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | History 4 | ------- 5 | 6 | 1.10.0 (2024-12-16) 7 | ~~~~~~~~~~~~~~~~~~~ 8 | 9 | * Removed support for Python 3.8. 10 | * Added support for Python 3.13. 11 | * Changed the default encoding name from ``"utf8"`` to ``"utf-8"`` everywhere. 12 | The former name is not supported in certain environments. 13 | * CI fixes and improvements. 
14 | 15 | 1.9.1 (2024-04-08) 16 | ~~~~~~~~~~~~~~~~~~ 17 | 18 | * Removed the dependency on ``pytest-runner``. 19 | * Removed the obsolete ``Makefile``. 20 | 21 | 1.9.0 (2024-03-14) 22 | ~~~~~~~~~~~~~~~~~~ 23 | 24 | * Now requires ``cssselect >= 1.2.0`` (this minimum version was required since 25 | 1.8.0 but that wasn't properly recorded) 26 | * Removed support for Python 3.7 27 | * Added support for Python 3.12 and PyPy 3.10 28 | * Fixed an exception when calling ``__str__`` or ``__repr__`` on some JSON 29 | selectors 30 | * Code formatted with ``black`` 31 | * CI fixes and improvements 32 | 33 | 1.8.1 (2023-04-18) 34 | ~~~~~~~~~~~~~~~~~~ 35 | 36 | * Remove a Sphinx reference from NEWS to fix the PyPI description 37 | * Add a ``twine check`` CI check to detect such problems 38 | 39 | 1.8.0 (2023-04-18) 40 | ~~~~~~~~~~~~~~~~~~ 41 | 42 | * Add support for JMESPath: you can now create a selector for a JSON document 43 | and call ``Selector.jmespath()``. See `the documentation`_ for more 44 | information and examples. 45 | * Selectors can now be constructed from ``bytes`` (using the ``body`` and 46 | ``encoding`` arguments) instead of ``str`` (using the ``text`` argument), so 47 | that there is no internal conversion from ``str`` to ``bytes`` and the memory 48 | usage is lower. 49 | * Typing improvements 50 | * The ``pkg_resources`` module (which was absent from the requirements) is no 51 | longer used 52 | * Documentation build fixes 53 | * New requirements: 54 | 55 | * ``jmespath`` 56 | * ``typing_extensions`` (on Python 3.7) 57 | 58 | .. _the documentation: https://parsel.readthedocs.io/en/latest/usage.html 59 | 60 | 1.7.0 (2022-11-01) 61 | ~~~~~~~~~~~~~~~~~~ 62 | 63 | * Add PEP 561-style type information 64 | * Support for Python 2.7, 3.5 and 3.6 is removed 65 | * Support for Python 3.9-3.11 is added 66 | * Very large documents (with deep nesting or long tag content) can now be 67 | parsed, and ``Selector`` now takes a new argument ``huge_tree`` to disable 68 | this 69 | * Support for new features of cssselect 1.2.0 is added 70 | * The ``Selector.remove()`` and ``SelectorList.remove()`` methods are 71 | deprecated and replaced with the new ``Selector.drop()`` and 72 | ``SelectorList.drop()`` methods which don't delete text after the dropped 73 | elements when used in the HTML mode. 74 | 75 | 76 | 1.6.0 (2020-05-07) 77 | ~~~~~~~~~~~~~~~~~~ 78 | 79 | * Python 3.4 is no longer supported 80 | * New ``Selector.remove()`` and ``SelectorList.remove()`` methods to remove 81 | selected elements from the parsed document tree 82 | * Improvements to error reporting, test coverage and documentation, and code 83 | cleanup 84 | 85 | 86 | 1.5.2 (2019-08-09) 87 | ~~~~~~~~~~~~~~~~~~ 88 | 89 | * ``Selector.remove_namespaces`` received a significant performance improvement 90 | * The value of ``data`` within the printable representation of a selector 91 | (``repr(selector)``) now ends in ``...`` when truncated, to make the 92 | truncation obvious. 93 | * Minor documentation improvements. 94 | 95 | 96 | 1.5.1 (2018-10-25) 97 | ~~~~~~~~~~~~~~~~~~ 98 | 99 | * ``has-class`` XPath function handles newlines and other separators 100 | in class names properly; 101 | * fixed parsing of HTML documents with null bytes; 102 | * documentation improvements; 103 | * Python 3.7 tests are run on CI; other test improvements. 104 | 105 | 106 | 1.5.0 (2018-07-04) 107 | ~~~~~~~~~~~~~~~~~~ 108 | 109 | * New ``Selector.attrib`` and ``SelectorList.attrib`` properties which make 110 | it easier to get attributes of HTML elements. 
111 | * CSS selectors became faster: compilation results are cached 112 | (LRU cache is used for ``css2xpath``), so there is 113 | less overhead when the same CSS expression is used several times. 114 | * ``.get()`` and ``.getall()`` selector methods are documented and recommended 115 | over ``.extract_first()`` and ``.extract()``. 116 | * Various documentation tweaks and improvements. 117 | 118 | One more change is that ``.extract()`` and ``.extract_first()`` methods 119 | are now implemented using ``.get()`` and ``.getall()``, not the other 120 | way around, and instead of calling ``Selector.extract`` all other methods 121 | now call ``Selector.get`` internally. It can be **backwards incompatible** 122 | in case of custom Selector subclasses which override ``Selector.extract`` 123 | without doing the same for ``Selector.get``. If you have such Selector 124 | subclass, make sure ``get`` method is also overridden. For example, this:: 125 | 126 | class MySelector(parsel.Selector): 127 | def extract(self): 128 | return super().extract() + " foo" 129 | 130 | should be changed to this:: 131 | 132 | class MySelector(parsel.Selector): 133 | def get(self): 134 | return super().get() + " foo" 135 | extract = get 136 | 137 | 138 | 1.4.0 (2018-02-08) 139 | ~~~~~~~~~~~~~~~~~~ 140 | 141 | * ``Selector`` and ``SelectorList`` can't be pickled because 142 | pickling/unpickling doesn't work for ``lxml.html.HtmlElement``; 143 | parsel now raises TypeError explicitly instead of allowing pickle to 144 | silently produce wrong output. This is technically backwards-incompatible 145 | if you're using Python < 3.6. 146 | 147 | 148 | 1.3.1 (2017-12-28) 149 | ~~~~~~~~~~~~~~~~~~ 150 | 151 | * Fix artifact uploads to pypi. 152 | 153 | 154 | 1.3.0 (2017-12-28) 155 | ~~~~~~~~~~~~~~~~~~ 156 | 157 | * ``has-class`` XPath extension function; 158 | * ``parsel.xpathfuncs.set_xpathfunc`` is a simplified way to register 159 | XPath extensions; 160 | * ``Selector.remove_namespaces`` now removes namespace declarations; 161 | * Python 3.3 support is dropped; 162 | * ``make htmlview`` command for easier Parsel docs development. 163 | * CI: PyPy installation is fixed; parsel now runs tests for PyPy3 as well. 
164 | 165 | 166 | 1.2.0 (2017-05-17) 167 | ~~~~~~~~~~~~~~~~~~ 168 | 169 | * Add ``SelectorList.get`` and ``SelectorList.getall`` 170 | methods as aliases for ``SelectorList.extract_first`` 171 | and ``SelectorList.extract`` respectively 172 | * Add default value parameter to ``SelectorList.re_first`` method 173 | * Add ``Selector.re_first`` method 174 | * Add ``replace_entities`` argument on ``.re()`` and ``.re_first()`` 175 | to turn off replacing of character entity references 176 | * Bug fix: detect ``None`` result from lxml parsing and fallback with an empty document 177 | * Rearrange XML/HTML examples in the selectors usage docs 178 | * Travis CI: 179 | 180 | * Test against Python 3.6 181 | * Test against PyPy using "Portable PyPy for Linux" distribution 182 | 183 | 184 | 1.1.0 (2016-11-22) 185 | ~~~~~~~~~~~~~~~~~~ 186 | 187 | * Change default HTML parser to `lxml.html.HTMLParser <https://lxml.de/api/lxml.html.HTMLParser-class.html>`_, 188 | which makes it easier to use some HTML-specific features 189 | * Add css2xpath function to translate CSS to XPath 190 | * Add support for ad-hoc namespaces declarations 191 | * Add support for XPath variables 192 | * Documentation improvements and updates 193 | 194 | 195 | 1.0.3 (2016-07-29) 196 | ~~~~~~~~~~~~~~~~~~ 197 | 198 | * Add BSD-3-Clause license file 199 | * Re-enable PyPy tests 200 | * Integrate py.test runs with setuptools (needed for Debian packaging) 201 | * Changelog is now called ``NEWS`` 202 | 203 | 204 | 1.0.2 (2016-04-26) 205 | ~~~~~~~~~~~~~~~~~~ 206 | 207 | * Fix bug in exception handling causing original traceback to be lost 208 | * Added docstrings and other doc fixes 209 | 210 | 211 | 1.0.1 (2015-08-24) 212 | ~~~~~~~~~~~~~~~~~~ 213 | 214 | * Updated PyPI classifiers 215 | * Added docstrings for csstranslator module and other doc fixes 216 | 217 | 218 | 1.0.0 (2015-08-22) 219 | ~~~~~~~~~~~~~~~~~~ 220 | 221 | * Documentation fixes 222 | 223 | 224 | 0.9.6 (2015-08-14) 225 | ~~~~~~~~~~~~~~~~~~ 226 | 227 | * Updated documentation 228 | * Extended test coverage 229 | 230 | 231 | 0.9.5 (2015-08-11) 232 | ~~~~~~~~~~~~~~~~~~ 233 | 234 | * Support for extending SelectorList 235 | 236 | 237 | 0.9.4 (2015-08-10) 238 | ~~~~~~~~~~~~~~~~~~ 239 | 240 | * Try workaround for travis-ci/dpl#253 241 | 242 | 243 | 0.9.3 (2015-08-07) 244 | ~~~~~~~~~~~~~~~~~~ 245 | 246 | * Add base_url argument 247 | 248 | 249 | 0.9.2 (2015-08-07) 250 | ~~~~~~~~~~~~~~~~~~ 251 | 252 | * Rename module unified -> selector and promoted root attribute 253 | * Add create_root_node function 254 | 255 | 256 | 0.9.1 (2015-08-04) 257 | ~~~~~~~~~~~~~~~~~~ 258 | 259 | * Setup Sphinx build and docs structure 260 | * Build universal wheels 261 | * Rename some leftovers from package extraction 262 | 263 | 264 | 0.9.0 (2015-07-30) 265 | ~~~~~~~~~~~~~~~~~~ 266 | 267 | * First release on PyPI. 268 | --------------------------------------------------------------------------------
/README.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Parsel 3 | ====== 4 | 5 | .. image:: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml/badge.svg 6 | :target: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml 7 | :alt: Tests 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/parsel.svg 10 | :target: https://github.com/scrapy/parsel/actions/workflows/tests-ubuntu.yml 11 | :alt: Supported Python versions 12 | 13 | .. image:: https://img.shields.io/pypi/v/parsel.svg 14 | :target: https://pypi.python.org/pypi/parsel 15 | :alt: PyPI Version 16 | 17 | ..
image:: https://img.shields.io/codecov/c/github/scrapy/parsel/master.svg 18 | :target: https://codecov.io/github/scrapy/parsel?branch=master 19 | :alt: Coverage report 20 | 21 | 22 | Parsel is a BSD-licensed Python_ library to extract data from HTML_, JSON_, and 23 | XML_ documents. 24 | 25 | It supports: 26 | 27 | - CSS_ and XPath_ expressions for HTML and XML documents 28 | 29 | - JMESPath_ expressions for JSON documents 30 | 31 | - `Regular expressions`_ 32 | 33 | Find the Parsel online documentation at https://parsel.readthedocs.org. 34 | 35 | Example (`open online demo`_): 36 | 37 | .. code-block:: python 38 | 39 | >>> from parsel import Selector 40 | >>> text = """ 41 | <html> 42 | <body> 43 | <h1>Hello, Parsel!</h1> 44 | <ul> 45 | <li><a href="http://example.com">Link 1</a></li> 46 | <li><a href="http://scrapy.org">Link 2</a></li> 47 | </ul> 48 | <script type="application/json">{"a": ["b", "c"]}</script> 49 | </body> 50 | </html>""" 51 | >>> selector = Selector(text=text) 52 | >>> selector.css('h1::text').get() 53 | 'Hello, Parsel!' 54 | >>> selector.xpath('//h1/text()').re(r'\w+') 55 | ['Hello', 'Parsel'] 56 | >>> for li in selector.css('ul > li'): 57 | ... print(li.xpath('.//@href').get()) 58 | http://example.com 59 | http://scrapy.org 60 | >>> selector.css('script::text').jmespath("a").get() 61 | 'b' 62 | >>> selector.css('script::text').jmespath("a").getall() 63 | ['b', 'c']
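A selector can also be created from raw ``bytes`` instead of ``str``. The snippet below is an editor's sketch rather than part of the original example (the byte string is made up for illustration); it uses only the ``body`` and ``encoding`` arguments documented in the 1.8.0 entry of ``NEWS``:

.. code-block:: python

    >>> from parsel import Selector
    >>> raw = b"<html><body><h1>Hello, bytes!</h1></body></html>"
    >>> Selector(body=raw, encoding="utf-8").css("h1::text").get()
    'Hello, bytes!'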
64 | 65 | .. _CSS: https://en.wikipedia.org/wiki/Cascading_Style_Sheets 66 | .. _HTML: https://en.wikipedia.org/wiki/HTML 67 | .. _JMESPath: https://jmespath.org/ 68 | .. _JSON: https://en.wikipedia.org/wiki/JSON 69 | .. _open online demo: https://colab.research.google.com/drive/149VFa6Px3wg7S3SEnUqk--TyBrKplxCN#forceEdit=true&sandboxMode=true 70 | .. _Python: https://www.python.org/ 71 | .. _regular expressions: https://docs.python.org/library/re.html 72 | .. _XML: https://en.wikipedia.org/wiki/XML 73 | .. _XPath: https://en.wikipedia.org/wiki/XPath 74 | --------------------------------------------------------------------------------
/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | PYTHON = python 6 | SPHINXOPTS = 7 | SPHINXBUILD = sphinx-build 8 | PAPER = 9 | BUILDDIR = _build 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables. 17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
22 | 23 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 24 | 25 | help: 26 | @echo "Please use \`make ' where is one of" 27 | @echo " html to make standalone HTML files" 28 | @echo " dirhtml to make HTML files named index.html in directories" 29 | @echo " singlehtml to make a single large HTML file" 30 | @echo " pickle to make pickle files" 31 | @echo " json to make JSON files" 32 | @echo " htmlhelp to make HTML files and a HTML help project" 33 | @echo " qthelp to make HTML files and a qthelp project" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " htmlview to view the compiled HTML files in browser" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/parsel.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/parsel.qhc" 93 | 94 | devhelp: 95 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 96 | @echo 97 | @echo "Build finished." 98 | @echo "To view the help file:" 99 | @echo "# mkdir -p $$HOME/.local/share/devhelp/parsel" 100 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/parsel" 101 | @echo "# devhelp" 102 | 103 | epub: 104 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 105 | @echo 106 | @echo "Build finished. 
The epub file is in $(BUILDDIR)/epub." 107 | 108 | latex: 109 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 110 | @echo 111 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 112 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 113 | "(use \`make latexpdf' here to do that automatically)." 114 | 115 | latexpdf: 116 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 117 | @echo "Running LaTeX files through pdflatex..." 118 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 119 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 120 | 121 | latexpdfja: 122 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 123 | @echo "Running LaTeX files through platex and dvipdfmx..." 124 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 125 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 126 | 127 | text: 128 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 129 | @echo 130 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 131 | 132 | man: 133 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 134 | @echo 135 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 136 | 137 | texinfo: 138 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 139 | @echo 140 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 141 | @echo "Run \`make' in that directory to run these through makeinfo" \ 142 | "(use \`make info' here to do that automatically)." 143 | 144 | info: 145 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 146 | @echo "Running Texinfo files through makeinfo..." 147 | make -C $(BUILDDIR)/texinfo info 148 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 149 | 150 | gettext: 151 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 152 | @echo 153 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 154 | 155 | changes: 156 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 157 | @echo 158 | @echo "The overview file is in $(BUILDDIR)/changes." 159 | 160 | linkcheck: 161 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 162 | @echo 163 | @echo "Link check complete; look for any errors in the above output " \ 164 | "or in $(BUILDDIR)/linkcheck/output.txt." 165 | 166 | doctest: 167 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 168 | @echo "Testing of doctests in the sources finished, look at the " \ 169 | "results in $(BUILDDIR)/doctest/output.txt." 170 | 171 | xml: 172 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 173 | @echo 174 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 175 | 176 | pseudoxml: 177 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 178 | @echo 179 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
180 | 181 | htmlview: html 182 | $(PYTHON) -c "import webbrowser, os; webbrowser.open('file://' + \ 183 | os.path.realpath('_build/html/index.html'))" 184 | --------------------------------------------------------------------------------
/docs/_static/selectors-sample1.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <base href='http://example.com/' /> 4 | <title>Example website</title> 5 | </head> 6 | <body> 7 | <div id='images'> 8 | <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a> 9 | <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a> 10 | <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a> 11 | <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a> 12 | <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a> 13 | </div> 14 | </body> 15 | </html> 16 | --------------------------------------------------------------------------------
/docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from pathlib import Path 5 | 6 | # Get the project root dir, which is the parent dir of this 7 | project_root = str(Path.cwd().parent) 8 | 9 | # Insert the project root dir as the first element in the PYTHONPATH. 10 | # This lets us ensure that the source package is imported, and that its 11 | # version is used. 12 | sys.path.insert(0, project_root) 13 | 14 | import parsel  # noqa: E402 15 | 16 | # -- General configuration --------------------------------------------- 17 | 18 | # Add any Sphinx extension module names here, as strings. They can be 19 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 20 | extensions = [ 21 | "notfound.extension", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.intersphinx", 24 | "sphinx.ext.viewcode", 25 | ] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ["_templates"] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = ".rst" 32 | 33 | # The master toctree document. 34 | master_doc = "index" 35 | 36 | # General information about the project. 37 | project = "Parsel" 38 | copyright = "2015, Scrapy Project" 39 | 40 | # The version info for the project you're documenting, acts as replacement 41 | # for |version| and |release|, also used in various other places throughout 42 | # the built documents. 43 | # 44 | # The short X.Y version. 45 | version = parsel.__version__ 46 | # The full version, including alpha/beta/rc tags. 47 | release = parsel.__version__ 48 | 49 | # List of patterns, relative to source directory, that match files and 50 | # directories to ignore when looking for source files. 51 | exclude_patterns = ["_build"] 52 | 53 | # The name of the Pygments (syntax highlighting) style to use. 54 | pygments_style = "sphinx" 55 | 56 | suppress_warnings = ["epub.unknown_project_files"] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | html_theme = "sphinx_rtd_theme" 64 | 65 | # Add any paths that contain custom static files (such as style sheets) 66 | # here, relative to this directory. They are copied after the builtin 67 | # static files, so a file named "default.css" will overwrite the builtin 68 | # "default.css". 69 | html_static_path = ["_static"] 70 | 71 | # Output file base name for HTML help builder. 72 | htmlhelp_basename = "parseldoc" 73 | 74 | 75 | # -- Options for LaTeX output ------------------------------------------ 76 | 77 | latex_elements = {} 78 | 79 | # Grouping the document tree into LaTeX files. List of tuples 80 | # (source start file, target name, title, author, documentclass 81 | # [howto/manual]).
82 | latex_documents = [ 83 | ( 84 | "index", 85 | "parsel.tex", 86 | "Parsel Documentation", 87 | "Scrapy Project", 88 | "manual", 89 | ), 90 | ] 91 | 92 | 93 | # -- Options for manual page output ------------------------------------ 94 | 95 | # One entry per manual page. List of tuples 96 | # (source start file, name, description, authors, manual section). 97 | man_pages = [ 98 | ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1), 99 | ] 100 | 101 | # -- Options for Texinfo output ---------------------------------------- 102 | 103 | # Grouping the document tree into Texinfo files. List of tuples 104 | # (source start file, target name, title, author, 105 | # dir menu entry, description, category) 106 | texinfo_documents = [ 107 | ( 108 | "index", 109 | "parsel", 110 | "Parsel Documentation", 111 | "Scrapy Project", 112 | "parsel", 113 | "One line description of project.", 114 | "Miscellaneous", 115 | ), 116 | ] 117 | 118 | 119 | # -- Options for the InterSphinx extension ------------------------------------ 120 | 121 | intersphinx_mapping = { 122 | "cssselect": ("https://cssselect.readthedocs.io/en/latest", None), 123 | "python": ("https://docs.python.org/3", None), 124 | "requests": ("https://requests.kennethreitz.org/en/latest", None), 125 | "lxml": ("https://lxml.de/apidoc/", None), 126 | } 127 | 128 | 129 | # --- Nitpicking options ------------------------------------------------------ 130 | 131 | # nitpicky = True # https://github.com/scrapy/cssselect/pull/110 132 | nitpick_ignore = [ 133 | ("py:class", "ExpressionError"), 134 | ("py:class", "SelectorSyntaxError"), 135 | ("py:class", "cssselect.xpath.GenericTranslator"), 136 | ("py:class", "cssselect.xpath.HTMLTranslator"), 137 | ("py:class", "cssselect.xpath.XPathExpr"), 138 | ("py:class", "lxml.etree.XMLParser"), 139 | ] 140 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | from doctest import ELLIPSIS, NORMALIZE_WHITESPACE 2 | from pathlib import Path 3 | 4 | from sybil import Sybil 5 | 6 | try: 7 | from sybil.parsers.codeblock import PythonCodeBlockParser 8 | except ImportError: 9 | from sybil.parsers.codeblock import ( 10 | CodeBlockParser as PythonCodeBlockParser, 11 | ) 12 | from sybil.parsers.doctest import DocTestParser 13 | from sybil.parsers.skip import skip 14 | 15 | from parsel import Selector 16 | 17 | 18 | def load_selector(filename, **kwargs): 19 | input_path = Path(__file__).parent / "_static" / filename 20 | return Selector(text=input_path.read_text(encoding="utf-8"), **kwargs) 21 | 22 | 23 | def setup(namespace): 24 | namespace["load_selector"] = load_selector 25 | 26 | 27 | pytest_collect_file = Sybil( 28 | parsers=[ 29 | DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), 30 | PythonCodeBlockParser(future_imports=["print_function"]), 31 | skip, 32 | ], 33 | pattern="*.rst", 34 | setup=setup, 35 | ).pytest() 36 | -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../NEWS 2 | 3 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. parsel documentation master file, created by 2 | sphinx-quickstart on Tue Jul 9 22:26:36 2013. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: ../README.rst 7 | 8 | Parsel Documentation Contents 9 | ============================= 10 | 11 | Contents: 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | installation 17 | usage 18 | parsel 19 | history 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | --------------------------------------------------------------------------------
/docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | To install Parsel, we recommend using `pip <https://pip.pypa.io/en/latest/>`_:: 6 | 7 | $ pip install parsel 8 | 9 | You `probably shouldn't 10 | <https://packaging.python.org/discussions/pip-vs-easy-install/>`_, 11 | but you can also install it with easy_install:: 12 | 13 | $ easy_install parsel 14 | --------------------------------------------------------------------------------
/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\parsel.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\parsel.qhc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/parsel.rst: -------------------------------------------------------------------------------- 1 | API reference 2 | ============= 3 | 4 | parsel.csstranslator 5 | -------------------- 6 | 7 | .. automodule:: parsel.csstranslator 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | 13 | .. _topics-selectors-ref: 14 | 15 | parsel.selector 16 | --------------- 17 | 18 | .. automodule:: parsel.selector 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | parsel.utils 25 | ------------ 26 | 27 | .. automodule:: parsel.utils 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-notfound-page 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | .. _topics-selectors: 2 | 3 | ===== 4 | Usage 5 | ===== 6 | 7 | Create a :class:`~parsel.selector.Selector` object for your input text. 8 | 9 | For HTML or XML, use `CSS`_ or `XPath`_ expressions to select data:: 10 | 11 | >>> from parsel import Selector 12 | >>> html_text = "
<html><body><h1>Hello, Parsel!</h1></body></html>" 13 | >>> html_selector = Selector(text=html_text) 14 | >>> html_selector.css('h1') 15 | [<Selector query='descendant-or-self::h1' data='<h1>Hello, Parsel!</h1>'>] 16 | >>> html_selector.xpath('//h1') # the same, but now with XPath 17 | [<Selector query='//h1' data='<h1>Hello, Parsel!</h1>'>] 18 | 19 | For JSON, use `JMESPath`_ expressions to select data:: 20 | 21 | >>> json_text = '{"title":"Hello, Parsel!"}' 22 | >>> json_selector = Selector(text=json_text) 23 | >>> json_selector.jmespath('title') 24 | [<Selector query='title' data='Hello, Parsel!'>] 25 | 26 | And extract data from those elements:: 27 | 28 | >>> html_selector.xpath('//h1/text()').get() 29 | 'Hello, Parsel!' 30 | >>> json_selector.jmespath('title').getall() 31 | ['Hello, Parsel!'] 32 | 33 | .. _CSS: https://www.w3.org/TR/selectors 34 | .. _XPath: https://www.w3.org/TR/xpath 35 | .. _JMESPath: https://jmespath.org/ 36 | 37 | Learning expression languages 38 | ============================= 39 | 40 | `CSS`_ is a language for applying styles to HTML documents. It defines 41 | selectors to associate those styles with specific HTML elements. Resources to 42 | learn CSS_ selectors include: 43 | 44 | - `CSS selectors in the MDN`_ 45 | 46 | - `XPath/CSS Equivalents in Wikibooks`_ 47 | 48 | Parsel support for CSS selectors comes from cssselect, so read about `CSS 49 | selectors supported by cssselect`_. 50 | 51 | .. _CSS selectors supported by cssselect: https://cssselect.readthedocs.io/en/latest/#supported-selectors 52 | 53 | `XPath`_ is a language for selecting nodes in XML documents, which can also be 54 | used with HTML. Resources to learn XPath_ include: 55 | 56 | - `XPath Tutorial in W3Schools`_ 57 | 58 | - `XPath cheatsheet`_ 59 | 60 | For HTML and XML input, you can use either CSS_ or XPath_. CSS_ is usually 61 | more readable, but some things can only be done with XPath_. 62 | 63 | JMESPath_ allows you to declaratively specify how to extract elements from 64 | a JSON document. Resources to learn JMESPath_ include: 65 | 66 | - `JMESPath Tutorial`_ 67 | 68 | - `JMESPath Specification`_ 69 | 70 | .. _CSS selectors in the MDN: https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors 71 | .. _XPath cheatsheet: https://devhints.io/xpath 72 | .. _XPath Tutorial in W3Schools: https://www.w3schools.com/xml/xpath_intro.asp 73 | .. _XPath/CSS Equivalents in Wikibooks: https://en.wikibooks.org/wiki/XPath/CSS_Equivalents 74 | .. _JMESPath Tutorial: https://jmespath.org/tutorial.html 75 | .. _JMESPath Specification: https://jmespath.org/specification.html 76 | 77 | 78 | Using selectors 79 | =============== 80 | 81 | To explain how to use the selectors we'll use the :mod:`requests` library 82 | to download an example page located in Parsel's documentation: 83 | 84 | https://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html 85 | 86 | .. _topics-selectors-htmlcode: 87 | 88 | For the sake of completeness, here's its full HTML code: 89 | 90 | .. literalinclude:: _static/selectors-sample1.html 91 | :language: html 92 | 93 | .. highlight:: python 94 | 95 | So, let's download that page and create a selector for it: 96 | 97 | .. skip: start 98 | 99 | >>> import requests 100 | >>> from parsel import Selector 101 | >>> url = 'https://parsel.readthedocs.org/en/latest/_static/selectors-sample1.html' 102 | >>> text = requests.get(url).text 103 | >>> selector = Selector(text=text) 104 | 105 | .. skip: end 106 | 107 | .. invisible-code-block: python 108 | 109 | selector = load_selector('selectors-sample1.html') 110 | 111 | Since we're dealing with HTML, the default type for Selector, we don't need 112 | to specify the ``type`` argument.
113 | 114 | So, by looking at the :ref:`HTML code <topics-selectors-htmlcode>` of that 115 | page, let's construct an XPath for selecting the text inside the title tag:: 116 | 117 | >>> selector.xpath('//title/text()') 118 | [<Selector query='//title/text()' data='Example website'>] 119 | 120 | You can also ask the same thing using CSS instead:: 121 | 122 | >>> selector.css('title::text') 123 | [<Selector query='descendant-or-self::title/text()' data='Example website'>] 124 | 125 | To actually extract the textual data, you must call the selector ``.get()`` 126 | or ``.getall()`` methods, as follows:: 127 | 128 | >>> selector.xpath('//title/text()').getall() 129 | ['Example website'] 130 | >>> selector.xpath('//title/text()').get() 131 | 'Example website' 132 | 133 | ``.get()`` always returns a single result; if there are several matches, 134 | the content of the first match is returned; if there are no matches, ``None`` 135 | is returned. ``.getall()`` returns a list with all results. 136 | 137 | Notice that CSS selectors can select text or attribute nodes using CSS3 138 | pseudo-elements:: 139 | 140 | >>> selector.css('title::text').get() 141 | 'Example website' 142 | 143 | As you can see, the ``.xpath()`` and ``.css()`` methods return a 144 | :class:`~parsel.selector.SelectorList` instance, which is a list of new 145 | selectors. This API can be used for quickly selecting nested data:: 146 | 147 | >>> selector.css('img').xpath('@src').getall() 148 | ['image1_thumb.jpg', 149 | 'image2_thumb.jpg', 150 | 'image3_thumb.jpg', 151 | 'image4_thumb.jpg', 152 | 'image5_thumb.jpg'] 153 | 154 | If you want to extract only the first matched element, you can call the 155 | selector ``.get()`` (or its alias ``.extract_first()`` commonly used in 156 | previous parsel versions):: 157 | 158 | >>> selector.xpath('//div[@id="images"]/a/text()').get() 159 | 'Name: My image 1 ' 160 | 161 | It returns ``None`` if no element was found:: 162 | 163 | >>> selector.xpath('//div[@id="not-exists"]/text()').get() is None 164 | True 165 | 166 | Instead of using e.g. ``'@src'`` XPath it is possible to query for attributes 167 | using the ``.attrib`` property of a :class:`~parsel.selector.Selector`:: 168 | 169 | >>> [img.attrib['src'] for img in selector.css('img')] 170 | ['image1_thumb.jpg', 171 | 'image2_thumb.jpg', 172 | 'image3_thumb.jpg', 173 | 'image4_thumb.jpg', 174 | 'image5_thumb.jpg'] 175 | 176 | As a shortcut, ``.attrib`` is also available on SelectorList directly; 177 | it returns attributes for the first matching element:: 178 | 179 | >>> selector.css('img').attrib['src'] 180 | 'image1_thumb.jpg' 181 | 182 | This is most useful when only a single result is expected, e.g.
when selecting 183 | by id, or selecting unique elements on a web page:: 184 | 185 | >>> selector.css('base').attrib['href'] 186 | 'http://example.com/' 187 | 188 | Now we're going to get the base URL and some image links:: 189 | 190 | >>> selector.xpath('//base/@href').get() 191 | 'http://example.com/' 192 | 193 | >>> selector.css('base::attr(href)').get() 194 | 'http://example.com/' 195 | 196 | >>> selector.css('base').attrib['href'] 197 | 'http://example.com/' 198 | 199 | >>> selector.xpath('//a[contains(@href, "image")]/@href').getall() 200 | ['image1.html', 201 | 'image2.html', 202 | 'image3.html', 203 | 'image4.html', 204 | 'image5.html'] 205 | 206 | >>> selector.css('a[href*=image]::attr(href)').getall() 207 | ['image1.html', 208 | 'image2.html', 209 | 'image3.html', 210 | 'image4.html', 211 | 'image5.html'] 212 | 213 | >>> selector.xpath('//a[contains(@href, "image")]/img/@src').getall() 214 | ['image1_thumb.jpg', 215 | 'image2_thumb.jpg', 216 | 'image3_thumb.jpg', 217 | 'image4_thumb.jpg', 218 | 'image5_thumb.jpg'] 219 | 220 | >>> selector.css('a[href*=image] img::attr(src)').getall() 221 | ['image1_thumb.jpg', 222 | 'image2_thumb.jpg', 223 | 'image3_thumb.jpg', 224 | 'image4_thumb.jpg', 225 | 'image5_thumb.jpg'] 226 | 227 | .. _topics-selectors-css-extensions: 228 | 229 | Extensions to CSS Selectors 230 | --------------------------- 231 | 232 | Per W3C standards, `CSS selectors`_ do not support selecting text nodes 233 | or attribute values. 234 | But selecting these is so essential in a web scraping context 235 | that Parsel implements a couple of **non-standard pseudo-elements**: 236 | 237 | * to select text nodes, use ``::text`` 238 | * to select attribute values, use ``::attr(name)`` where *name* is the 239 | name of the attribute that you want the value of 240 | 241 | .. warning:: 242 | These pseudo-elements are Scrapy-/Parsel-specific. 243 | They will most probably not work with other libraries like `lxml`_ or `PyQuery`_. 244 | 245 | 246 | Examples: 247 | 248 | * ``title::text`` selects children text nodes of a descendant ```` element:: 249 | 250 | >>> selector.css('title::text').get() 251 | 'Example website' 252 | 253 | * ``*::text`` selects all descendant text nodes of the current selector context:: 254 | 255 | >>> selector.css('#images *::text').getall() 256 | ['\n ', 257 | 'Name: My image 1 ', 258 | '\n ', 259 | 'Name: My image 2 ', 260 | '\n ', 261 | 'Name: My image 3 ', 262 | '\n ', 263 | 'Name: My image 4 ', 264 | '\n ', 265 | 'Name: My image 5 ', 266 | '\n '] 267 | 268 | * ``a::attr(href)`` selects the *href* attribute value of descendant links:: 269 | 270 | >>> selector.css('a::attr(href)').getall() 271 | ['image1.html', 272 | 'image2.html', 273 | 'image3.html', 274 | 'image4.html', 275 | 'image5.html'] 276 | 277 | .. note:: 278 | You cannot chain these pseudo-elements. But in practice it would not 279 | make much sense: text nodes do not have attributes, and attribute values 280 | are string values already and do not have children nodes. 281 | 282 | .. note:: 283 | See also: :ref:`selecting-attributes`. 284 | 285 | 286 | .. _CSS Selectors: https://www.w3.org/TR/css3-selectors/#selectors 287 | 288 | .. _topics-selectors-nesting-selectors: 289 | 290 | Nesting selectors 291 | ----------------- 292 | 293 | The selection methods (``.xpath()`` or ``.css()``) return a list of selectors 294 | of the same type, so you can call the selection methods for those selectors 295 | too. 
Here's an example:: 296 | 297 | >>> links = selector.xpath('//a[contains(@href, "image")]') 298 | >>> links.getall() 299 | ['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>', 300 | '<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>', 301 | '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>', 302 | '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>', 303 | '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>'] 304 | 305 | >>> for index, link in enumerate(links): 306 | ... args = (index, link.xpath('@href').get(), link.xpath('img/@src').get()) 307 | ... print('Link number %d points to url %r and image %r' % args) 308 | Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg' 309 | Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg' 310 | Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg' 311 | Link number 3 points to url 'image4.html' and image 'image4_thumb.jpg' 312 | Link number 4 points to url 'image5.html' and image 'image5_thumb.jpg' 313 | 314 | .. _selecting-attributes: 315 | 316 | Selecting element attributes 317 | ---------------------------- 318 | 319 | There are several ways to get the value of an attribute. First, one can use 320 | XPath syntax:: 321 | 322 | >>> selector.xpath("//a/@href").getall() 323 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 324 | 325 | XPath syntax has a few advantages: it is a standard XPath feature, and 326 | ``@attributes`` can be used in other parts of an XPath expression - e.g. 327 | it is possible to filter by attribute value. 328 | 329 | parsel also provides an extension to CSS selectors (``::attr(...)``) 330 | which allows getting attribute values:: 331 | 332 | >>> selector.css('a::attr(href)').getall() 333 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 334 | 335 | In addition to that, there is a ``.attrib`` property of Selector. 336 | You can use it if you prefer to look up attributes in Python 337 | code, without using XPaths or CSS extensions:: 338 | 339 | >>> [a.attrib['href'] for a in selector.css('a')] 340 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 341 | 342 | This property is also available on SelectorList; it returns a dictionary 343 | with attributes of the first matching element. It is convenient to use when 344 | a selector is expected to give a single result (e.g. when selecting by element 345 | ID, or when selecting a unique element on a page):: 346 | 347 | >>> selector.css('base').attrib 348 | {'href': 'http://example.com/'} 349 | >>> selector.css('base').attrib['href'] 350 | 'http://example.com/' 351 | 352 | The ``.attrib`` property of an empty SelectorList is empty:: 353 | 354 | >>> selector.css('foo').attrib 355 | {} 356 | 357 | Using selectors with regular expressions 358 | ---------------------------------------- 359 | 360 | :class:`~parsel.selector.Selector` also has a ``.re()`` method for extracting 361 | data using regular expressions. However, unlike using ``.xpath()`` or 362 | ``.css()`` methods, ``.re()`` returns a list of strings. So you 363 | can't construct nested ``.re()`` calls.
364 | 
365 | Here's an example used to extract image names from the :ref:`HTML code
366 | <topics-selectors-htmlcode>` above::
367 | 
368 |     >>> selector.xpath('//a[contains(@href, "image")]/text()').re(r'Name:\s*(.*)')
369 |     ['My image 1 ',
370 |      'My image 2 ',
371 |      'My image 3 ',
372 |      'My image 4 ',
373 |      'My image 5 ']
374 | 
375 | There's an additional helper for ``.re()``, named ``.re_first()``, which plays
376 | the same role as ``.get()`` (and its alias ``.extract_first()``).
377 | Use it to extract just the first matching string::
378 | 
379 |     >>> selector.xpath('//a[contains(@href, "image")]/text()').re_first(r'Name:\s*(.*)')
380 |     'My image 1 '
381 | 
382 | .. _topics-selectors-relative-xpaths:
383 | 
384 | Working with relative XPaths
385 | ----------------------------
386 | 
387 | Keep in mind that if you are nesting selectors and use an XPath that starts
388 | with ``/``, that XPath will be absolute to the document and not relative to the
389 | selector you're calling it from.
390 | 
391 | For example, suppose you want to extract all ``<p>`` elements inside ``<div>``
392 | elements. First, you would get all ``<div>`` elements::
393 | 
394 |     >>> divs = selector.xpath('//div')
395 | 
396 | At first, you may be tempted to use the following approach, which is wrong, as
397 | it actually extracts all ``<p>`` elements from the document, not only those
398 | inside ``<div>`` elements::
399 | 
400 |     >>> for p in divs.xpath('//p'):  # this is wrong - gets all <p> from the whole document
401 |     ...     print(p.get())
402 | 
403 | This is the proper way to do it (note the dot prefixing the ``.//p`` XPath)::
404 | 
405 |     >>> for p in divs.xpath('.//p'):  # extracts all <p> inside
406 |     ...     print(p.get())
407 | 
408 | Another common case would be to extract all direct ``<p>`` children::
409 | 
410 |     >>> for p in divs.xpath('p'):
411 |     ...     print(p.get())
412 | 
413 | For more details about relative XPaths see the `Location Paths`_ section in the
414 | XPath specification.
415 | 
416 | .. _Location Paths: https://www.w3.org/TR/xpath#location-paths
417 | 
418 | 
419 | Removing elements
420 | -----------------
421 | 
422 | If for any reason you need to remove elements based on a Selector or
423 | a SelectorList, you can do it with the ``drop()`` method, available for both
424 | classes.
425 | 
426 | .. warning:: This is a destructive action and cannot be undone. The original
427 |    content of the selector is removed from the element tree. This could be useful
428 |    when trying to reduce the memory footprint of Responses.
429 | 
430 | Example removing an ad from a blog post:
431 | 
432 |     >>> from parsel import Selector
433 |     >>> doc = """
434 |     ... <article>
435 |     ...     <div class="row">Content paragraph...</div>
436 |     ...     <div class="row">
437 |     ...         <div class="ad">
438 |     ...             Ad content...
439 |     ...             <a href="http://...">Link</a>
440 |     ...         </div>
441 |     ...     </div>
442 |     ...     <div class="row">More content...</div>
443 |     ... </article>
444 |     ... 
""" 445 | >>> sel = Selector(text=doc) 446 | >>> sel.xpath('//div/text()').getall() 447 | ['Content paragraph...', '\n ', '\n Ad content...\n ', '\n ', '\n ', 'More content...'] 448 | >>> sel.xpath('//div[@class="ad"]').drop() 449 | >>> sel.xpath('//div//text()').getall() 450 | ['Content paragraph...', 'More content...'] 451 | 452 | 453 | Using EXSLT extensions 454 | ---------------------- 455 | 456 | Being built atop `lxml`_, parsel selectors support some `EXSLT`_ extensions 457 | and come with these pre-registered namespaces to use in XPath expressions: 458 | 459 | 460 | ====== ===================================== ======================= 461 | prefix namespace usage 462 | ====== ===================================== ======================= 463 | re \http://exslt.org/regular-expressions `regular expressions`_ 464 | set \http://exslt.org/sets `set manipulation`_ 465 | ====== ===================================== ======================= 466 | 467 | Regular expressions 468 | ~~~~~~~~~~~~~~~~~~~ 469 | 470 | The ``test()`` function, for example, can prove quite useful when XPath's 471 | ``starts-with()`` or ``contains()`` are not sufficient. 472 | 473 | Example selecting links in list item with a "class" attribute ending with a digit:: 474 | 475 | >>> from parsel import Selector 476 | >>> doc = """ 477 | ... <div> 478 | ... <ul> 479 | ... <li class="item-0"><a href="link1.html">first item</a></li> 480 | ... <li class="item-1"><a href="link2.html">second item</a></li> 481 | ... <li class="item-inactive"><a href="link3.html">third item</a></li> 482 | ... <li class="item-1"><a href="link4.html">fourth item</a></li> 483 | ... <li class="item-0"><a href="link5.html">fifth item</a></li> 484 | ... </ul> 485 | ... </div> 486 | ... """ 487 | >>> sel = Selector(text=doc) 488 | >>> sel.xpath('//li//@href').getall() 489 | ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html'] 490 | >>> sel.xpath(r'//li[re:test(@class, "item-\d$")]//@href').getall() 491 | ['link1.html', 'link2.html', 'link4.html', 'link5.html'] 492 | >>> 493 | 494 | .. warning:: C library ``libxslt`` doesn't natively support EXSLT regular 495 | expressions so `lxml`_'s implementation uses hooks to Python's ``re`` module. 496 | Thus, using regexp functions in your XPath expressions may add a small 497 | performance penalty. 498 | 499 | Set operations 500 | ~~~~~~~~~~~~~~ 501 | 502 | These can be handy for excluding parts of a document tree before 503 | extracting text elements for example. 504 | 505 | Example extracting microdata (sample content taken from http://schema.org/Product) 506 | with groups of itemscopes and corresponding itemprops:: 507 | 508 | >>> doc = """ 509 | ... <div itemscope itemtype="http://schema.org/Product"> 510 | ... <span itemprop="name">Kenmore White 17" Microwave</span> 511 | ... <img src="kenmore-microwave-17in.jpg" alt='Kenmore 17" Microwave' /> 512 | ... <div itemprop="aggregateRating" 513 | ... itemscope itemtype="http://schema.org/AggregateRating"> 514 | ... Rated <span itemprop="ratingValue">3.5</span>/5 515 | ... based on <span itemprop="reviewCount">11</span> customer reviews 516 | ... </div> 517 | ... 518 | ... <div itemprop="offers" itemscope itemtype="http://schema.org/Offer"> 519 | ... <span itemprop="price">$55.00</span> 520 | ... <link itemprop="availability" href="http://schema.org/InStock" />In stock 521 | ... </div> 522 | ... 523 | ... Product description: 524 | ... <span itemprop="description">0.7 cubic feet countertop microwave. 525 | ... 
Has six preset cooking categories and convenience features like 526 | ... Add-A-Minute and Child Lock.</span> 527 | ... 528 | ... Customer reviews: 529 | ... 530 | ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> 531 | ... <span itemprop="name">Not a happy camper</span> - 532 | ... by <span itemprop="author">Ellie</span>, 533 | ... <meta itemprop="datePublished" content="2011-04-01">April 1, 2011 534 | ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> 535 | ... <meta itemprop="worstRating" content = "1"> 536 | ... <span itemprop="ratingValue">1</span>/ 537 | ... <span itemprop="bestRating">5</span>stars 538 | ... </div> 539 | ... <span itemprop="description">The lamp burned out and now I have to replace 540 | ... it. </span> 541 | ... </div> 542 | ... 543 | ... <div itemprop="review" itemscope itemtype="http://schema.org/Review"> 544 | ... <span itemprop="name">Value purchase</span> - 545 | ... by <span itemprop="author">Lucas</span>, 546 | ... <meta itemprop="datePublished" content="2011-03-25">March 25, 2011 547 | ... <div itemprop="reviewRating" itemscope itemtype="http://schema.org/Rating"> 548 | ... <meta itemprop="worstRating" content = "1"/> 549 | ... <span itemprop="ratingValue">4</span>/ 550 | ... <span itemprop="bestRating">5</span>stars 551 | ... </div> 552 | ... <span itemprop="description">Great microwave for the price. It is small and 553 | ... fits in my apartment.</span> 554 | ... </div> 555 | ... ... 556 | ... </div> 557 | ... """ 558 | >>> sel = Selector(text=doc, type="html") 559 | >>> for scope in sel.xpath('//div[@itemscope]'): 560 | ... print("current scope:", scope.xpath('@itemtype').getall()) 561 | ... props = scope.xpath(''' 562 | ... set:difference(./descendant::*/@itemprop, 563 | ... .//*[@itemscope]/*/@itemprop)''') 564 | ... print(" properties: %s" % (props.getall())) 565 | ... print("") 566 | current scope: ['http://schema.org/Product'] 567 | properties: ['name', 'aggregateRating', 'offers', 'description', 'review', 'review'] 568 | <BLANKLINE> 569 | current scope: ['http://schema.org/AggregateRating'] 570 | properties: ['ratingValue', 'reviewCount'] 571 | <BLANKLINE> 572 | current scope: ['http://schema.org/Offer'] 573 | properties: ['price', 'availability'] 574 | <BLANKLINE> 575 | current scope: ['http://schema.org/Review'] 576 | properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] 577 | <BLANKLINE> 578 | current scope: ['http://schema.org/Rating'] 579 | properties: ['worstRating', 'ratingValue', 'bestRating'] 580 | <BLANKLINE> 581 | current scope: ['http://schema.org/Review'] 582 | properties: ['name', 'author', 'datePublished', 'reviewRating', 'description'] 583 | <BLANKLINE> 584 | current scope: ['http://schema.org/Rating'] 585 | properties: ['worstRating', 'ratingValue', 'bestRating'] 586 | 587 | 588 | Here we first iterate over ``itemscope`` elements, and for each one, 589 | we look for all ``itemprops`` elements and exclude those that are themselves 590 | inside another ``itemscope``. 591 | 592 | .. _EXSLT: http://exslt.org/ 593 | .. _regular expressions: http://exslt.org/regexp/index.html 594 | .. _set manipulation: http://exslt.org/set/index.html 595 | 596 | .. 
_topics-xpath-other-extensions:
597 | 
598 | Other XPath extensions
599 | ----------------------
600 | 
601 | Parsel also defines a sorely missed XPath extension function ``has-class`` that
602 | returns ``True`` for nodes that have all of the specified HTML classes::
603 | 
604 |     >>> from parsel import Selector
605 |     >>> sel = Selector("""
606 |     ... <p class="foo bar-baz">First</p>
607 |     ... <p class="foo">Second</p>
608 |     ... <p class="bar">Third</p>
609 |     ... <p>Fourth</p>
610 |     ... """)
611 |     ...
612 |     >>> sel.xpath('//p[has-class("foo")]')
613 |     [<Selector query='//p[has-class("foo")]' data='<p class="foo bar-baz">First</p>'>,
614 |      <Selector query='//p[has-class("foo")]' data='<p class="foo">Second</p>'>]
615 |     >>> sel.xpath('//p[has-class("foo", "bar-baz")]')
616 |     [<Selector query='//p[has-class("foo", "bar-baz")]' data='<p class="foo bar-baz">First</p>'>]
617 |     >>> sel.xpath('//p[has-class("foo", "bar")]')
618 |     []
619 | 
620 | So XPath ``//p[has-class("foo", "bar-baz")]`` is roughly equivalent to CSS
621 | ``p.foo.bar-baz``. Note, however, that it is slower in most cases,
622 | because it's a pure-Python function that's invoked for every node in question,
623 | whereas the CSS lookup is translated into XPath and thus runs more efficiently.
624 | Performance-wise, its use is limited to situations that are not easily
625 | described with CSS selectors.
626 | 
627 | Parsel also simplifies adding your own XPath extensions.
628 | 
629 | .. autofunction:: parsel.xpathfuncs.set_xpathfunc
630 | 
631 | 
632 | 
633 | Some XPath tips
634 | ---------------
635 | 
636 | Here are some tips that you may find useful when using XPath
637 | with Parsel, based on `this post from Zyte's blog`_.
638 | If you are not very familiar with XPath yet,
639 | you may want to take a look first at this `XPath tutorial`_.
640 | 
641 | 
642 | .. _`XPath tutorial`: http://www.zvon.org/comp/r/tut-XPath_1.html
643 | .. _`this post from Zyte's blog`: https://www.zyte.com/blog/xpath-tips-from-the-web-scraping-trenches/
644 | 
645 | 
646 | Using text nodes in a condition
647 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
648 | 
649 | When you need to use the text content as an argument to an `XPath string function`_,
650 | avoid using ``.//text()`` and use just ``.`` instead.
651 | 
652 | This is because the expression ``.//text()`` yields a collection of text elements -- a *node-set*.
653 | And when a node-set is converted to a string, which happens when it is passed as an argument to
654 | a string function like ``contains()`` or ``starts-with()``, it results in the text for the first element only.
655 | 
656 | Example::
657 | 
658 |     >>> from parsel import Selector
659 |     >>> sel = Selector(text='<a href="#">Click here to go to the <strong>Next Page</strong></a>')
660 | 
661 | Converting a *node-set* to string::
662 | 
663 |     >>> sel.xpath('//a//text()').getall()  # take a peek at the node-set
664 |     ['Click here to go to the ', 'Next Page']
665 |     >>> sel.xpath("string(//a[1]//text())").getall()  # convert it to string
666 |     ['Click here to go to the ']
667 | 
668 | A *node* converted to a string, however, puts together its own text plus the text of all its descendants::
669 | 
670 |     >>> sel.xpath("//a[1]").getall()  # select the first node
671 |     ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
672 |     >>> sel.xpath("string(//a[1])").getall()  # convert it to string
673 |     ['Click here to go to the Next Page']
674 | 
675 | So, using the ``.//text()`` node-set won't select anything in this case::
676 | 
677 |     >>> sel.xpath("//a[contains(.//text(), 'Next Page')]").getall()
678 |     []
679 | 
680 | But using ``.`` to mean the node works::
681 | 
682 |     >>> sel.xpath("//a[contains(., 'Next Page')]").getall()
683 |     ['<a href="#">Click here to go to the <strong>Next Page</strong></a>']
684 | 
685 | .. _`XPath string function`: https://www.w3.org/TR/xpath/#section-String-Functions
686 | 
687 | Beware of the difference between //node[1] and (//node)[1]
688 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
689 | 
690 | ``//node[1]`` selects all the nodes occurring first under their respective parents.
691 | 
692 | ``(//node)[1]`` selects all the nodes in the document, and then gets only the first of them.
693 | 
694 | Example::
695 | 
696 |     >>> from parsel import Selector
697 |     >>> sel = Selector(text="""
698 |     ...     <ul class="list">
699 |     ...         <li>1</li>
700 |     ...         <li>2</li>
701 |     ...         <li>3</li>
702 |     ...     </ul>
703 |     ...     <ul class="list">
704 |     ...         <li>4</li>
705 |     ...         <li>5</li>
706 |     ...         <li>6</li>
707 |     ...     </ul>""")
708 |     >>> xp = lambda x: sel.xpath(x).getall()
709 | 
710 | This gets all first ``<li>`` elements under their respective parents::
711 | 
712 |     >>> xp("//li[1]")
713 |     ['<li>1</li>', '<li>4</li>']
714 | 
715 | And this gets the first ``<li>`` element in the whole document::
716 | 
717 |     >>> xp("(//li)[1]")
718 |     ['<li>1</li>']
719 | 
720 | This gets all first ``<li>`` elements under an ``<ul>`` parent::
721 | 
722 |     >>> xp("//ul/li[1]")
723 |     ['<li>1</li>', '<li>4</li>']
724 | 
725 | And this gets the first ``<li>`` element under an ``<ul>`` parent in the whole document::
726 | 
727 |     >>> xp("(//ul/li)[1]")
728 |     ['<li>1</li>']
729 | 
730 | When querying by class, consider using CSS
731 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
732 | 
733 | Because an element can contain multiple CSS classes, the XPath way to select elements
734 | by class is rather verbose::
735 | 
736 |     *[contains(concat(' ', normalize-space(@class), ' '), ' someclass ')]
737 | 
738 | If you use ``@class='someclass'`` you may end up missing elements that have
739 | other classes, and if you just use ``contains(@class, 'someclass')`` to make up
740 | for that you may end up with more elements than you want, if they have a different
741 | class name that shares the string ``someclass``.
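For instance, ``contains(@class, 'someclass')`` happily matches an element
whose class is ``someclass-wide``. A minimal sketch with hypothetical markup::

    >>> from parsel import Selector
    >>> sel = Selector(text='<p class="someclass-wide">oops</p>')
    >>> sel.xpath("//p[contains(@class, 'someclass')]").getall()
    ['<p class="someclass-wide">oops</p>']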
742 | 743 | As it turns out, parsel selectors allow you to chain selectors, so most of the time 744 | you can just select by class using CSS and then switch to XPath when needed:: 745 | 746 | >>> from parsel import Selector 747 | >>> sel = Selector(text='<div class="hero shout"><time datetime="2014-07-23 19:00">Special date</time></div>') 748 | >>> sel.css('.shout').xpath('./time/@datetime').getall() 749 | ['2014-07-23 19:00'] 750 | 751 | This is cleaner than using the verbose XPath trick shown above. Just remember 752 | to use the ``.`` in the XPath expressions that will follow. 753 | 754 | 755 | Beware of how script and style tags differ from other tags 756 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 757 | 758 | `Following the standard`__, the contents of ``script`` and ``style`` elements 759 | are parsed as plain text. 760 | 761 | __ https://www.w3.org/TR/html401/types.html#type-cdata 762 | 763 | This means that XML-like structures found within them, including comments, are 764 | all treated as part of the element text, and not as separate nodes. 765 | 766 | For example:: 767 | 768 | >>> from parsel import Selector 769 | >>> selector = Selector(text=""" 770 | ... <script> 771 | ... text 772 | ... <!-- comment --> 773 | ... <br/> 774 | ... </script> 775 | ... <style> 776 | ... text 777 | ... <!-- comment --> 778 | ... <br/> 779 | ... </style> 780 | ... <div> 781 | ... text 782 | ... <!-- comment --> 783 | ... <br/> 784 | ... </div>""") 785 | >>> for tag in selector.xpath('//*[contains(text(), "text")]'): 786 | ... print(tag.xpath('name()').get()) 787 | ... print(' Text: ' + (tag.xpath('text()').get() or '')) 788 | ... print(' Comment: ' + (tag.xpath('comment()').get() or '')) 789 | ... print(' Children: ' + ''.join(tag.xpath('*').getall())) 790 | ... 791 | script 792 | Text: 793 | text 794 | <!-- comment --> 795 | <br/> 796 | <BLANKLINE> 797 | Comment: 798 | Children: 799 | style 800 | Text: 801 | text 802 | <!-- comment --> 803 | <br/> 804 | <BLANKLINE> 805 | Comment: 806 | Children: 807 | div 808 | Text: 809 | text 810 | <BLANKLINE> 811 | Comment: <!-- comment --> 812 | Children: <br> 813 | 814 | .. _old-extraction-api: 815 | 816 | extract() and extract_first() 817 | ----------------------------- 818 | 819 | If you're a long-time parsel (or Scrapy) user, you're probably familiar 820 | with ``.extract()`` and ``.extract_first()`` selector methods. These methods 821 | are still supported by parsel, there are no plans to deprecate them. 822 | 823 | However, ``parsel`` usage docs are now written using ``.get()`` and 824 | ``.getall()`` methods. We feel that these new methods result in more concise 825 | and readable code. 826 | 827 | The following examples show how these methods map to each other. 828 | 829 | .. invisible-code-block: python 830 | 831 | selector = load_selector('selectors-sample1.html') 832 | 833 | 1. ``SelectorList.get()`` is the same as ``SelectorList.extract_first()``:: 834 | 835 | >>> selector.css('a::attr(href)').get() 836 | 'image1.html' 837 | >>> selector.css('a::attr(href)').extract_first() 838 | 'image1.html' 839 | 840 | 2. ``SelectorList.getall()`` is the same as ``SelectorList.extract()``:: 841 | 842 | >>> selector.css('a::attr(href)').getall() 843 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 844 | >>> selector.css('a::attr(href)').extract() 845 | ['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html'] 846 | 847 | 3. 
``Selector.get()`` is the same as ``Selector.extract()``:: 848 | 849 | >>> selector.css('a::attr(href)')[0].get() 850 | 'image1.html' 851 | >>> selector.css('a::attr(href)')[0].extract() 852 | 'image1.html' 853 | 854 | 4. For consistency, there is also ``Selector.getall()``, which returns a list:: 855 | 856 | >>> selector.css('a::attr(href)')[0].getall() 857 | ['image1.html'] 858 | 859 | With the ``.extract()`` method it was not always obvious if a result is a list 860 | or not; to get a single result either ``.extract()`` or ``.extract_first()`` 861 | needed to be called, depending whether you had a ``Selector`` or ``SelectorList``. 862 | 863 | So, the main difference is that the outputs of ``.get()`` and ``.getall()`` 864 | are more predictable: ``.get()`` always returns a single result, 865 | ``.getall()`` always returns a list of all extracted results. 866 | 867 | 868 | Using CSS selectors in multi-root documents 869 | ------------------------------------------- 870 | 871 | Some webpages may have multiple root elements. It can happen, for example, when 872 | a webpage has broken code, such as missing closing tags. 873 | 874 | .. invisible-code-block: python 875 | 876 | selector = load_selector('multiroot.html') 877 | 878 | You can use XPath to determine if a page has multiple root elements: 879 | 880 | >>> len(selector.xpath('/*')) > 1 881 | True 882 | 883 | CSS selectors only work on the first root element, because the first root 884 | element is always used as the starting current element, and CSS selectors do 885 | not allow selecting parent elements (XPath’s ``..``) or elements relative to 886 | the document root (XPath’s ``/``). 887 | 888 | If you want to use a CSS selector that takes into account all root elements, 889 | you need to precede your CSS query by an XPath query that reaches all root 890 | elements:: 891 | 892 | selector.xpath('/*').css('<your CSS selector>') 893 | 894 | 895 | Command-Line Interface Tools 896 | ============================ 897 | 898 | There are third-party tools that allow using Parsel from the command line: 899 | 900 | - `Parsel CLI <https://github.com/rmax/parsel-cli>`_ allows applying 901 | Parsel selectors to the standard input. For example, you can apply a Parsel 902 | selector to the output of cURL_. 903 | 904 | - `parselcli 905 | <https://github.com/Granitosaurus/parsel-cli>`_ provides an interactive 906 | shell that allows applying Parsel selectors to a remote URL or a local 907 | file. 908 | 909 | .. _cURL: https://curl.haxx.se/ 910 | 911 | 912 | .. _selector-examples-html: 913 | 914 | Examples 915 | ======== 916 | 917 | Working on HTML 918 | --------------- 919 | 920 | Here are some :class:`~parsel.selector.Selector` examples to illustrate 921 | several concepts. In all cases, we assume there is already 922 | a :class:`~parsel.selector.Selector` instantiated with an HTML text like this:: 923 | 924 | sel = Selector(text=html_text) 925 | 926 | 1. Select all ``<h1>`` elements from an HTML text, returning a list of 927 | :class:`~parsel.selector.Selector` objects 928 | (ie. a :class:`~parsel.selector.SelectorList` object):: 929 | 930 | sel.xpath("//h1") 931 | 932 | 2. Extract the text of all ``<h1>`` elements from an HTML text, 933 | returning a list of strings:: 934 | 935 | sel.xpath("//h1").getall() # this includes the h1 tag 936 | sel.xpath("//h1/text()").getall() # this excludes the h1 tag 937 | 938 | 3. 
Iterate over all ``<p>`` tags and print their class attribute:: 939 | 940 | for node in sel.xpath("//p"): 941 | print(node.attrib['class']) 942 | 943 | 944 | .. _selector-examples-xml: 945 | 946 | Working on XML (and namespaces) 947 | ------------------------------- 948 | 949 | Here are some examples to illustrate concepts for 950 | :class:`~parsel.selector.Selector` objects instantiated with an XML text 951 | like this:: 952 | 953 | sel = Selector(text=xml_text, type='xml') 954 | 955 | 1. Select all ``<product>`` elements from an XML text, returning a list 956 | of :class:`~parsel.selector.Selector` objects 957 | (ie. a :class:`~parsel.selector.SelectorList` object):: 958 | 959 | sel.xpath("//product") 960 | 961 | 2. Extract all prices from a `Google Base XML feed`_ which requires registering 962 | a namespace:: 963 | 964 | sel.register_namespace("g", "http://base.google.com/ns/1.0") 965 | sel.xpath("//g:price").getall() 966 | 967 | .. _removing-namespaces: 968 | 969 | Removing namespaces 970 | ~~~~~~~~~~~~~~~~~~~ 971 | 972 | When dealing with scraping projects, it is often quite convenient to get rid of 973 | namespaces altogether and just work with element names, to write more 974 | simple/convenient XPaths. You can use the 975 | :meth:`Selector.remove_namespaces <parsel.selector.Selector.remove_namespaces>` 976 | method for that. 977 | 978 | Let's show an example that illustrates this with the Python Insider blog atom feed. 979 | 980 | Let's download the atom feed using :mod:`requests` and create a selector: 981 | 982 | .. skip: start 983 | 984 | >>> import requests 985 | >>> from parsel import Selector 986 | >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text 987 | >>> sel = Selector(text=text, type='xml') 988 | 989 | .. skip: end 990 | 991 | .. invisible-code-block: python 992 | 993 | sel = load_selector('python-insider.xml', type='xml') 994 | 995 | This is how the file starts: 996 | 997 | .. code-block:: xml 998 | 999 | <?xml version="1.0" encoding="UTF-8"?> 1000 | <?xml-stylesheet ... ?> 1001 | <feed xmlns="http://www.w3.org/2005/Atom" 1002 | xmlns:openSearch="http://a9.com/-/spec/opensearchrss/1.0/" 1003 | xmlns:blogger="http://schemas.google.com/blogger/2008" 1004 | xmlns:georss="http://www.georss.org/georss" 1005 | xmlns:gd="http://schemas.google.com/g/2005" 1006 | xmlns:thr="http://purl.org/syndication/thread/1.0" 1007 | xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0"> 1008 | ... 1009 | </feed> 1010 | 1011 | You can see several namespace declarations including a default 1012 | "http://www.w3.org/2005/Atom" and another one using the "gd:" prefix for 1013 | "http://schemas.google.com/g/2005". 1014 | 1015 | We can try selecting all ``<link>`` objects and then see that it doesn't work 1016 | (because the Atom XML namespace is obfuscating those nodes):: 1017 | 1018 | >>> sel.xpath("//link") 1019 | [] 1020 | 1021 | But once we call the :meth:`Selector.remove_namespaces 1022 | <parsel.selector.Selector.remove_namespaces>` method, all nodes can be accessed 1023 | directly by their names:: 1024 | 1025 | >>> sel.remove_namespaces() 1026 | >>> sel.xpath("//link") 1027 | [<Selector query='//link' data='<link rel="alternate" type="text/html...'>, 1028 | <Selector query='//link' data='<link rel="next" type="application/at...'>, 1029 | ...] 
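Note that once the namespaces have been removed, the original namespace-qualified
queries no longer match anything, since the element tags themselves were stripped
of their namespaces; a quick check against the same ``sel`` object::

    >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
    []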
1030 | 
1031 | If you wonder why the namespace removal procedure isn't simply performed by default
1032 | instead of having to be called manually, this is for two reasons which, in order
1033 | of relevance, are:
1034 | 
1035 | 1. Removing namespaces requires iterating over and modifying all nodes in the
1036 |    document, which is a reasonably expensive operation to perform by default
1037 |    for all documents.
1038 | 
1039 | 2. There could be some cases where using namespaces is actually required, in
1040 |    case some element names clash between namespaces. These cases are very rare
1041 |    though.
1042 | 
1043 | .. _Google Base XML feed: https://support.google.com/merchants/answer/160589?hl=en&ref_topic=2473799
1044 | .. _requests: https://www.python-requests.org/
1045 | 
1046 | 
1047 | Ad-hoc namespace references
1048 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
1049 | 
1050 | :class:`~parsel.selector.Selector` objects also allow passing namespace
1051 | references along with the query, through a ``namespaces`` argument,
1052 | with the prefixes you declare being used in your XPath or CSS query.
1053 | 
1054 | Let's use the same Python Insider Atom feed:
1055 | 
1056 | .. skip: start
1057 | 
1058 | >>> import requests
1059 | >>> from parsel import Selector
1060 | >>> text = requests.get('https://feeds.feedburner.com/PythonInsider').text
1061 | >>> sel = Selector(text=text, type='xml')
1062 | 
1063 | .. skip: end
1064 | 
1065 | .. invisible-code-block: python
1066 | 
1067 |     sel = load_selector('python-insider.xml', type='xml')
1068 | 
1069 | And try to select the links again, now using an "atom:" prefix
1070 | for the "link" node test::
1071 | 
1072 |     >>> sel.xpath("//atom:link", namespaces={"atom": "http://www.w3.org/2005/Atom"})
1073 |     [<Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1074 |      <Selector query='//atom:link' data='<link xmlns="http://www.w3.org/2005/A...'>,
1075 |      ...]
1076 | 
1077 | You can pass several namespaces (here we're using shorter 1-letter prefixes)::
1078 | 
1079 |     >>> sel.xpath("//a:entry/a:author/g:image/@src",
1080 |     ...           namespaces={"a": "http://www.w3.org/2005/Atom",
1081 |     ...                       "g": "http://schemas.google.com/g/2005"}).getall()
1082 |     ['https://img1.blogblog.com/img/b16-rounded.gif',
1083 |      'https://img1.blogblog.com/img/b16-rounded.gif',
1084 |      ...]
1085 | 
1086 | .. _topics-xpath-variables:
1087 | 
1088 | Variables in XPath expressions
1089 | ------------------------------
1090 | 
1091 | XPath allows you to reference variables in your XPath expressions, using
1092 | the ``$somevariable`` syntax. This is somewhat similar to parameterized
1093 | queries or prepared statements in the SQL world, where you replace
1094 | some arguments in your queries with placeholders like ``?``,
1095 | which are then substituted with values passed with the query.
1096 | 
1097 | .. invisible-code-block: python
1098 | 
1099 |     selector = load_selector('selectors-sample1.html')
1100 | 
1101 | Here's an example to match an element based on its normalized string-value::
1102 | 
1103 |     >>> str_to_match = "Name: My image 3"
1104 |     >>> selector.xpath('//a[normalize-space(.)=$match]',
1105 |     ...                match=str_to_match).get()
1106 |     '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>'
1107 | 
1108 | All variable references must have a binding value when calling ``.xpath()``
1109 | (otherwise you'll get a ``ValueError: XPath error:`` exception).
1110 | This is done by passing as many named arguments as necessary.
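For instance, here is a quick sketch of what happens when a referenced variable
is left unbound (the exact error message comes from the underlying XPath
engine)::

    >>> try:
    ...     selector.xpath('//a[normalize-space(.)=$match]')
    ... except ValueError:
    ...     print("variable $match is not bound")
    variable $match is not bound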
1111 | 
1112 | Here's another example using a position range passed as two integers::
1113 | 
1114 |     >>> start, stop = 2, 4
1115 |     >>> selector.xpath('//a[position()>=$_from and position()<=$_to]',
1116 |     ...                _from=start, _to=stop).getall()
1117 |     ['<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
1118 |      '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
1119 |      '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>']
1120 | 
1121 | Named variables can be useful when strings need to be escaped for single
1122 | or double quote characters. The example below would be a bit tricky to
1123 | get right (or legible) without a variable reference::
1124 | 
1125 |     >>> html = '''<html>
1126 |     ... <body>
1127 |     ...     <p>He said: "I don't know why, but I like mixing single and double quotes!"</p>
1128 |     ... </body>
1129 |     ... </html>'''
1130 |     >>> selector = Selector(text=html)
1131 |     >>>
1132 |     >>> selector.xpath('//p[contains(., $mystring)]',
1133 |     ...                mystring='''He said: "I don't know''').get()
1134 |     '<p>He said: "I don\'t know why, but I like mixing single and double quotes!"</p>'
1135 | 
1136 | 
1137 | Converting CSS to XPath
1138 | -----------------------
1139 | 
1140 | .. autofunction:: parsel.css2xpath
1141 | 
1142 | When you're using an API that only accepts XPath expressions, it's sometimes
1143 | useful to convert CSS to XPath. This allows you to combine the
1144 | conciseness of CSS for querying elements by class with the ease of
1145 | manipulating XPath expressions.
1146 | 
1147 | On those occasions, use the function :func:`~parsel.css2xpath`:
1148 | 
1149 | ::
1150 | 
1151 |     >>> from parsel import css2xpath
1152 |     >>> css2xpath('h1.title')
1153 |     "descendant-or-self::h1[@class and contains(concat(' ', normalize-space(@class), ' '), ' title ')]"
1154 |     >>> css2xpath('.profile-data') + '//h2'
1155 |     "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' profile-data ')]//h2"
1156 | 
1157 | As you can see from the examples above, it translates the CSS query into an
1158 | XPath expression and returns it as a string, which you can use as-is or combine
1159 | with others to build a more complex expression before feeding it to a function expecting XPath.
1160 | 
1161 | 
1162 | Similar libraries
1163 | =================
1164 | 
1165 | 
1166 | * `BeautifulSoup`_ is a very popular screen scraping library among Python
1167 |   programmers which constructs a Python object based on the structure of the
1168 |   HTML code and also deals with bad markup reasonably well.
1169 | 
1170 | * `lxml`_ is an XML parsing library (which also parses HTML) with a pythonic
1171 |   API based on `ElementTree`_. (lxml is not part of the Python standard
1172 |   library.) Parsel uses it under the hood.
1173 | 
1174 | * `PyQuery`_ is a library that, like Parsel, uses `lxml`_ and
1175 |   :doc:`cssselect <cssselect:index>` under the hood, but it offers a jQuery-like API to
1176 |   traverse and manipulate XML/HTML documents.
1177 | 
1178 | Parsel is built on top of the `lxml`_ library, which means they're very similar
1179 | in speed and parsing accuracy. The advantage of using Parsel over `lxml`_ is
1180 | that Parsel is simpler to use and extend, unlike the `lxml`_ API, which is much
1181 | bigger because the `lxml`_ library can be used for many other tasks besides
1182 | selecting markup documents.
1183 | 
1184 | 
1185 | .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
1186 | .. _lxml: https://lxml.de/
1187 | ..
_PyQuery: https://pypi.python.org/pypi/pyquery 1188 | .. _ElementTree: https://docs.python.org/2/library/xml.etree.elementtree.html 1189 | -------------------------------------------------------------------------------- /parsel/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parsel lets you extract text from XML/HTML documents using XPath 3 | or CSS selectors 4 | """ 5 | 6 | __author__ = "Scrapy project" 7 | __email__ = "info@scrapy.org" 8 | __version__ = "1.10.0" 9 | __all__ = [ 10 | "Selector", 11 | "SelectorList", 12 | "css2xpath", 13 | "xpathfuncs", 14 | ] 15 | 16 | from parsel import xpathfuncs 17 | from parsel.csstranslator import css2xpath 18 | from parsel.selector import Selector, SelectorList 19 | 20 | xpathfuncs.setup() 21 | -------------------------------------------------------------------------------- /parsel/csstranslator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import lru_cache 4 | from typing import TYPE_CHECKING, Any, Protocol 5 | 6 | from cssselect import GenericTranslator as OriginalGenericTranslator 7 | from cssselect import HTMLTranslator as OriginalHTMLTranslator 8 | from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement 9 | from cssselect.xpath import ExpressionError 10 | from cssselect.xpath import XPathExpr as OriginalXPathExpr 11 | 12 | if TYPE_CHECKING: 13 | # typing.Self requires Python 3.11 14 | from typing_extensions import Self 15 | 16 | 17 | class XPathExpr(OriginalXPathExpr): 18 | textnode: bool = False 19 | attribute: str | None = None 20 | 21 | @classmethod 22 | def from_xpath( 23 | cls, 24 | xpath: OriginalXPathExpr, 25 | textnode: bool = False, 26 | attribute: str | None = None, 27 | ) -> Self: 28 | x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) 29 | x.textnode = textnode 30 | x.attribute = attribute 31 | return x 32 | 33 | def __str__(self) -> str: 34 | path = super().__str__() 35 | if self.textnode: 36 | if path == "*": 37 | path = "text()" 38 | elif path.endswith("::*/*"): 39 | path = path[:-3] + "text()" 40 | else: 41 | path += "/text()" 42 | 43 | if self.attribute is not None: 44 | if path.endswith("::*/*"): 45 | path = path[:-2] 46 | path += f"/@{self.attribute}" 47 | 48 | return path 49 | 50 | def join( 51 | self: Self, 52 | combiner: str, 53 | other: OriginalXPathExpr, 54 | *args: Any, 55 | **kwargs: Any, 56 | ) -> Self: 57 | if not isinstance(other, XPathExpr): 58 | raise ValueError( 59 | f"Expressions of type {__name__}.XPathExpr can ony join expressions" 60 | f" of the same type (or its descendants), got {type(other)}" 61 | ) 62 | super().join(combiner, other, *args, **kwargs) 63 | self.textnode = other.textnode 64 | self.attribute = other.attribute 65 | return self 66 | 67 | 68 | # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator 69 | class TranslatorProtocol(Protocol): 70 | def xpath_element(self, selector: Element) -> OriginalXPathExpr: 71 | pass 72 | 73 | def css_to_xpath(self, css: str, prefix: str = ...) -> str: 74 | pass 75 | 76 | 77 | class TranslatorMixin: 78 | """This mixin adds support to CSS pseudo elements via dynamic dispatch. 79 | 80 | Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``. 
81 | """ 82 | 83 | def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr: 84 | # https://github.com/python/mypy/issues/14757 85 | xpath = super().xpath_element(selector) # type: ignore[safe-super] 86 | return XPathExpr.from_xpath(xpath) 87 | 88 | def xpath_pseudo_element( 89 | self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement 90 | ) -> OriginalXPathExpr: 91 | """ 92 | Dispatch method that transforms XPath to support pseudo-element 93 | """ 94 | if isinstance(pseudo_element, FunctionalPseudoElement): 95 | method_name = f"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element" 96 | method = getattr(self, method_name, None) 97 | if not method: 98 | raise ExpressionError( 99 | f"The functional pseudo-element ::{pseudo_element.name}() is unknown" 100 | ) 101 | xpath = method(xpath, pseudo_element) 102 | else: 103 | method_name = ( 104 | f"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element" 105 | ) 106 | method = getattr(self, method_name, None) 107 | if not method: 108 | raise ExpressionError( 109 | f"The pseudo-element ::{pseudo_element} is unknown" 110 | ) 111 | xpath = method(xpath) 112 | return xpath 113 | 114 | def xpath_attr_functional_pseudo_element( 115 | self, xpath: OriginalXPathExpr, function: FunctionalPseudoElement 116 | ) -> XPathExpr: 117 | """Support selecting attribute values using ::attr() pseudo-element""" 118 | if function.argument_types() not in (["STRING"], ["IDENT"]): 119 | raise ExpressionError( 120 | f"Expected a single string or ident for ::attr(), got {function.arguments!r}" 121 | ) 122 | return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) 123 | 124 | def xpath_text_simple_pseudo_element(self, xpath: OriginalXPathExpr) -> XPathExpr: 125 | """Support selecting text nodes using ::text pseudo-element""" 126 | return XPathExpr.from_xpath(xpath, textnode=True) 127 | 128 | 129 | class GenericTranslator(TranslatorMixin, OriginalGenericTranslator): 130 | @lru_cache(maxsize=256) 131 | def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: 132 | return super().css_to_xpath(css, prefix) 133 | 134 | 135 | class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator): 136 | @lru_cache(maxsize=256) 137 | def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: 138 | return super().css_to_xpath(css, prefix) 139 | 140 | 141 | _translator = HTMLTranslator() 142 | 143 | 144 | def css2xpath(query: str) -> str: 145 | """Return translated XPath version of a given CSS query""" 146 | return _translator.css_to_xpath(query) 147 | -------------------------------------------------------------------------------- /parsel/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/parsel/dba69f70ca4e875017c14b5cb658f5580c6cb794/parsel/py.typed -------------------------------------------------------------------------------- /parsel/selector.py: -------------------------------------------------------------------------------- 1 | """XPath and JMESPath selectors based on the lxml and jmespath Python 2 | packages.""" 3 | 4 | from __future__ import annotations 5 | 6 | import json 7 | import typing 8 | import warnings 9 | from io import BytesIO 10 | from typing import TYPE_CHECKING, Any, Literal, SupportsIndex, TypedDict, TypeVar, Union 11 | 12 | import jmespath 13 | from lxml import etree, html 14 | from packaging.version import Version 15 | 16 | from .csstranslator import GenericTranslator, 
HTMLTranslator 17 | from .utils import extract_regex, flatten, iflatten, shorten 18 | 19 | if TYPE_CHECKING: 20 | from collections.abc import Mapping 21 | from re import Pattern 22 | 23 | # typing.Self requires Python 3.11 24 | from typing_extensions import Self 25 | 26 | 27 | _SelectorType = TypeVar("_SelectorType", bound="Selector") 28 | _ParserType = Union[etree.XMLParser, etree.HTMLParser] 29 | # simplified _OutputMethodArg from types-lxml 30 | _TostringMethodType = Literal[ 31 | "html", 32 | "xml", 33 | ] 34 | 35 | lxml_version = Version(etree.__version__) 36 | lxml_huge_tree_version = Version("4.2") 37 | LXML_SUPPORTS_HUGE_TREE = lxml_version >= lxml_huge_tree_version 38 | 39 | 40 | class CannotRemoveElementWithoutRoot(Exception): 41 | pass 42 | 43 | 44 | class CannotRemoveElementWithoutParent(Exception): 45 | pass 46 | 47 | 48 | class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent): 49 | pass 50 | 51 | 52 | class SafeXMLParser(etree.XMLParser): 53 | def __init__(self, *args: Any, **kwargs: Any) -> None: 54 | kwargs.setdefault("resolve_entities", False) 55 | super().__init__(*args, **kwargs) 56 | 57 | 58 | class CTGroupValue(TypedDict): 59 | _parser: type[etree.XMLParser | html.HTMLParser] 60 | _csstranslator: GenericTranslator | HTMLTranslator 61 | _tostring_method: _TostringMethodType 62 | 63 | 64 | _ctgroup: dict[str, CTGroupValue] = { 65 | "html": { 66 | "_parser": html.HTMLParser, 67 | "_csstranslator": HTMLTranslator(), 68 | "_tostring_method": "html", 69 | }, 70 | "xml": { 71 | "_parser": SafeXMLParser, 72 | "_csstranslator": GenericTranslator(), 73 | "_tostring_method": "xml", 74 | }, 75 | } 76 | 77 | 78 | def _xml_or_html(type: str | None) -> str: 79 | return "xml" if type == "xml" else "html" 80 | 81 | 82 | def create_root_node( 83 | text: str, 84 | parser_cls: type[_ParserType], 85 | base_url: str | None = None, 86 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 87 | body: bytes = b"", 88 | encoding: str = "utf-8", 89 | ) -> etree._Element: 90 | """Create root node for text using given parser class.""" 91 | if not text: 92 | body = body.replace(b"\x00", b"").strip() 93 | else: 94 | body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>" 95 | 96 | if huge_tree and LXML_SUPPORTS_HUGE_TREE: 97 | parser = parser_cls(recover=True, encoding=encoding, huge_tree=True) 98 | root = etree.fromstring(body, parser=parser, base_url=base_url) 99 | else: 100 | parser = parser_cls(recover=True, encoding=encoding) 101 | root = etree.fromstring(body, parser=parser, base_url=base_url) 102 | for error in parser.error_log: 103 | if "use XML_PARSE_HUGE option" in error.message: 104 | warnings.warn( 105 | f"Input data is too big. Upgrade to lxml " 106 | f"{lxml_huge_tree_version} or later for huge_tree support.", 107 | stacklevel=2, 108 | ) 109 | if root is None: 110 | root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url) 111 | return root 112 | 113 | 114 | class SelectorList(list[_SelectorType]): 115 | """ 116 | The :class:`SelectorList` class is a subclass of the builtin ``list`` 117 | class, which provides a few additional methods. 
118 | """ 119 | 120 | @typing.overload 121 | def __getitem__(self, pos: SupportsIndex) -> _SelectorType: 122 | pass 123 | 124 | @typing.overload 125 | def __getitem__(self, pos: slice) -> SelectorList[_SelectorType]: 126 | pass 127 | 128 | def __getitem__( 129 | self, pos: SupportsIndex | slice 130 | ) -> _SelectorType | SelectorList[_SelectorType]: 131 | o = super().__getitem__(pos) 132 | if isinstance(pos, slice): 133 | return self.__class__(typing.cast("SelectorList[_SelectorType]", o)) 134 | return typing.cast("_SelectorType", o) 135 | 136 | def __getstate__(self) -> None: 137 | raise TypeError("can't pickle SelectorList objects") 138 | 139 | def jmespath(self, query: str, **kwargs: Any) -> SelectorList[_SelectorType]: 140 | """ 141 | Call the ``.jmespath()`` method for each element in this list and return 142 | their results flattened as another :class:`SelectorList`. 143 | 144 | ``query`` is the same argument as the one in :meth:`Selector.jmespath`. 145 | 146 | Any additional named arguments are passed to the underlying 147 | ``jmespath.search`` call, e.g.:: 148 | 149 | selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) 150 | """ 151 | return self.__class__(flatten([x.jmespath(query, **kwargs) for x in self])) 152 | 153 | def xpath( 154 | self, 155 | xpath: str, 156 | namespaces: Mapping[str, str] | None = None, 157 | **kwargs: Any, 158 | ) -> SelectorList[_SelectorType]: 159 | """ 160 | Call the ``.xpath()`` method for each element in this list and return 161 | their results flattened as another :class:`SelectorList`. 162 | 163 | ``xpath`` is the same argument as the one in :meth:`Selector.xpath` 164 | 165 | ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) 166 | for additional prefixes to those registered with ``register_namespace(prefix, uri)``. 167 | Contrary to ``register_namespace()``, these prefixes are not 168 | saved for future calls. 169 | 170 | Any additional named arguments can be used to pass values for XPath 171 | variables in the XPath expression, e.g.:: 172 | 173 | selector.xpath('//a[href=$url]', url="http://www.example.com") 174 | """ 175 | return self.__class__( 176 | flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) 177 | ) 178 | 179 | def css(self, query: str) -> SelectorList[_SelectorType]: 180 | """ 181 | Call the ``.css()`` method for each element in this list and return 182 | their results flattened as another :class:`SelectorList`. 183 | 184 | ``query`` is the same argument as the one in :meth:`Selector.css` 185 | """ 186 | return self.__class__(flatten([x.css(query) for x in self])) 187 | 188 | def re(self, regex: str | Pattern[str], replace_entities: bool = True) -> list[str]: 189 | """ 190 | Call the ``.re()`` method for each element in this list and return 191 | their results flattened, as a list of strings. 192 | 193 | By default, character entity references are replaced by their 194 | corresponding character (except for ``&`` and ``<``. 195 | Passing ``replace_entities`` as ``False`` switches off these 196 | replacements. 
197 | """ 198 | return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) 199 | 200 | @typing.overload 201 | def re_first( 202 | self, 203 | regex: str | Pattern[str], 204 | default: None = None, 205 | replace_entities: bool = True, 206 | ) -> str | None: 207 | pass 208 | 209 | @typing.overload 210 | def re_first( 211 | self, 212 | regex: str | Pattern[str], 213 | default: str, 214 | replace_entities: bool = True, 215 | ) -> str: 216 | pass 217 | 218 | def re_first( 219 | self, 220 | regex: str | Pattern[str], 221 | default: str | None = None, 222 | replace_entities: bool = True, 223 | ) -> str | None: 224 | """ 225 | Call the ``.re()`` method for the first element in this list and 226 | return the result in an string. If the list is empty or the 227 | regex doesn't match anything, return the default value (``None`` if 228 | the argument is not provided). 229 | 230 | By default, character entity references are replaced by their 231 | corresponding character (except for ``&`` and ``<``. 232 | Passing ``replace_entities`` as ``False`` switches off these 233 | replacements. 234 | """ 235 | for el in iflatten( 236 | x.re(regex, replace_entities=replace_entities) for x in self 237 | ): 238 | return typing.cast("str", el) 239 | return default 240 | 241 | def getall(self) -> list[str]: 242 | """ 243 | Call the ``.get()`` method for each element is this list and return 244 | their results flattened, as a list of strings. 245 | """ 246 | return [x.get() for x in self] 247 | 248 | extract = getall 249 | 250 | @typing.overload 251 | def get(self, default: None = None) -> str | None: 252 | pass 253 | 254 | @typing.overload 255 | def get(self, default: str) -> str: 256 | pass 257 | 258 | def get(self, default: str | None = None) -> Any: 259 | """ 260 | Return the result of ``.get()`` for the first element in this list. 261 | If the list is empty, return the default value. 262 | """ 263 | for x in self: 264 | return x.get() 265 | return default 266 | 267 | extract_first = get 268 | 269 | @property 270 | def attrib(self) -> Mapping[str, str]: 271 | """Return the attributes dictionary for the first element. 272 | If the list is empty, return an empty dict. 273 | """ 274 | for x in self: 275 | return x.attrib 276 | return {} 277 | 278 | def drop(self) -> None: 279 | """ 280 | Drop matched nodes from the parent for each element in this list. 
281 | """ 282 | for x in self: 283 | x.drop() 284 | 285 | 286 | _NOT_SET = object() 287 | 288 | 289 | def _get_root_from_text(text: str, *, type: str, **lxml_kwargs: Any) -> etree._Element: 290 | return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs) 291 | 292 | 293 | def _get_root_and_type_from_bytes( 294 | body: bytes, 295 | encoding: str, 296 | *, 297 | input_type: str | None, 298 | **lxml_kwargs: Any, 299 | ) -> tuple[Any, str]: 300 | if input_type == "text": 301 | return body.decode(encoding), input_type 302 | if encoding == "utf-8": 303 | try: 304 | data = json.load(BytesIO(body)) 305 | except ValueError: 306 | data = _NOT_SET 307 | if data is not _NOT_SET: 308 | return data, "json" 309 | if input_type == "json": 310 | return None, "json" 311 | assert input_type in ("html", "xml", None) # nosec 312 | type = _xml_or_html(input_type) 313 | root = create_root_node( 314 | text="", 315 | body=body, 316 | encoding=encoding, 317 | parser_cls=_ctgroup[type]["_parser"], 318 | **lxml_kwargs, 319 | ) 320 | return root, type 321 | 322 | 323 | def _get_root_and_type_from_text( 324 | text: str, *, input_type: str | None, **lxml_kwargs: Any 325 | ) -> tuple[Any, str]: 326 | if input_type == "text": 327 | return text, input_type 328 | try: 329 | data = json.loads(text) 330 | except ValueError: 331 | data = _NOT_SET 332 | if data is not _NOT_SET: 333 | return data, "json" 334 | if input_type == "json": 335 | return None, "json" 336 | assert input_type in ("html", "xml", None) # nosec 337 | type = _xml_or_html(input_type) 338 | root = _get_root_from_text(text, type=type, **lxml_kwargs) 339 | return root, type 340 | 341 | 342 | def _get_root_type(root: Any, *, input_type: str | None) -> str: 343 | if isinstance(root, etree._Element): 344 | if input_type in {"json", "text"}: 345 | raise ValueError( 346 | f"Selector got an lxml.etree._Element object as root, " 347 | f"and {input_type!r} as type." 348 | ) 349 | return _xml_or_html(input_type) 350 | if isinstance(root, (dict, list)) or _is_valid_json(root): 351 | return "json" 352 | return input_type or "json" 353 | 354 | 355 | def _is_valid_json(text: str) -> bool: 356 | try: 357 | json.loads(text) 358 | except (TypeError, ValueError): 359 | return False 360 | return True 361 | 362 | 363 | def _load_json_or_none(text: str) -> Any: 364 | if isinstance(text, (str, bytes, bytearray)): 365 | try: 366 | return json.loads(text) 367 | except ValueError: 368 | return None 369 | return None 370 | 371 | 372 | class Selector: 373 | """Wrapper for input data in HTML, JSON, or XML format, that allows 374 | selecting parts of it using selection expressions. 375 | 376 | You can write selection expressions in CSS or XPath for HTML and XML 377 | inputs, or in JMESPath for JSON inputs. 378 | 379 | ``text`` is an ``str`` object. 380 | 381 | ``body`` is a ``bytes`` object. It can be used together with the 382 | ``encoding`` argument instead of the ``text`` argument. 383 | 384 | ``type`` defines the selector type. It can be ``"html"`` (default), 385 | ``"json"``, or ``"xml"``. 386 | 387 | ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths. 388 | See the documentation for :func:`lxml.etree.fromstring` for more information. 389 | 390 | ``huge_tree`` controls the lxml/libxml2 feature that forbids parsing 391 | certain large documents to protect from possible memory exhaustion. 
The 392 | argument is ``True`` by default if the installed lxml version supports it, 393 | which disables the protection to allow parsing such documents. Set it to 394 | ``False`` if you want to enable the protection. 395 | See `this lxml FAQ entry <https://lxml.de/FAQ.html#is-lxml-vulnerable-to-xml-bombs>`_ 396 | for more information. 397 | """ 398 | 399 | __slots__ = [ 400 | "__weakref__", 401 | "_expr", 402 | "_huge_tree", 403 | "_text", 404 | "body", 405 | "namespaces", 406 | "root", 407 | "type", 408 | ] 409 | 410 | _default_namespaces = { 411 | "re": "http://exslt.org/regular-expressions", 412 | # supported in libxslt: 413 | # set:difference 414 | # set:has-same-node 415 | # set:intersection 416 | # set:leading 417 | # set:trailing 418 | "set": "http://exslt.org/sets", 419 | } 420 | _lxml_smart_strings = False 421 | selectorlist_cls = SelectorList["Selector"] 422 | 423 | def __init__( 424 | self, 425 | text: str | None = None, 426 | type: str | None = None, 427 | body: bytes | bytearray = b"", 428 | encoding: str = "utf-8", 429 | namespaces: Mapping[str, str] | None = None, 430 | root: Any | None = _NOT_SET, 431 | base_url: str | None = None, 432 | _expr: str | None = None, 433 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 434 | ) -> None: 435 | self.root: Any 436 | if type not in ("html", "json", "text", "xml", None): 437 | raise ValueError(f"Invalid type: {type}") 438 | 439 | if text is None and not body and root is _NOT_SET: 440 | raise ValueError("Selector needs text, body, or root arguments") 441 | 442 | if text is not None and not isinstance(text, str): 443 | msg = f"text argument should be of type str, got {text.__class__}" 444 | raise TypeError(msg) 445 | 446 | if text is not None: 447 | if root is not _NOT_SET: 448 | warnings.warn( 449 | "Selector got both text and root, root is being ignored.", 450 | stacklevel=2, 451 | ) 452 | if not isinstance(text, str): 453 | msg = f"text argument should be of type str, got {text.__class__}" 454 | raise TypeError(msg) 455 | 456 | root, type = _get_root_and_type_from_text( 457 | text, 458 | input_type=type, 459 | base_url=base_url, 460 | huge_tree=huge_tree, 461 | ) 462 | self.root = root 463 | self.type = type 464 | elif body: 465 | if not isinstance(body, (bytes, bytearray)): 466 | msg = f"body argument should be of type bytes or bytearray, got {body.__class__}" 467 | raise TypeError(msg) 468 | root, type = _get_root_and_type_from_bytes( 469 | body=bytes(body), 470 | encoding=encoding, 471 | input_type=type, 472 | base_url=base_url, 473 | huge_tree=huge_tree, 474 | ) 475 | self.root = root 476 | self.type = type 477 | elif root is _NOT_SET: 478 | raise ValueError("Selector needs text, body, or root arguments") 479 | else: 480 | self.root = root 481 | self.type = _get_root_type(root, input_type=type) 482 | 483 | self.namespaces = dict(self._default_namespaces) 484 | if namespaces is not None: 485 | self.namespaces.update(namespaces) 486 | 487 | self._expr = _expr 488 | self._huge_tree = huge_tree 489 | self._text = text 490 | 491 | def __getstate__(self) -> Any: 492 | raise TypeError("can't pickle Selector objects") 493 | 494 | def _get_root( 495 | self, 496 | text: str = "", 497 | base_url: str | None = None, 498 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 499 | type: str | None = None, 500 | body: bytes = b"", 501 | encoding: str = "utf-8", 502 | ) -> etree._Element: 503 | return create_root_node( 504 | text, 505 | body=body, 506 | encoding=encoding, 507 | parser_cls=_ctgroup[type or self.type]["_parser"], 508 | base_url=base_url, 509 
| huge_tree=huge_tree, 510 | ) 511 | 512 | def jmespath( 513 | self, 514 | query: str, 515 | **kwargs: Any, 516 | ) -> SelectorList[Self]: 517 | """ 518 | Find objects matching the JMESPath ``query`` and return the result as a 519 | :class:`SelectorList` instance with all elements flattened. List 520 | elements implement :class:`Selector` interface too. 521 | 522 | ``query`` is a string containing the `JMESPath 523 | <https://jmespath.org/>`_ query to apply. 524 | 525 | Any additional named arguments are passed to the underlying 526 | ``jmespath.search`` call, e.g.:: 527 | 528 | selector.jmespath('author.name', options=jmespath.Options(dict_cls=collections.OrderedDict)) 529 | """ 530 | if self.type == "json": 531 | if isinstance(self.root, str): 532 | # Selector received a JSON string as root. 533 | data = _load_json_or_none(self.root) 534 | else: 535 | data = self.root 536 | else: 537 | assert self.type in {"html", "xml"} # nosec 538 | data = _load_json_or_none(self.root.text) 539 | 540 | result = jmespath.search(query, data, **kwargs) 541 | if result is None: 542 | result = [] 543 | elif not isinstance(result, list): 544 | result = [result] 545 | 546 | def make_selector(x: Any) -> Selector: # closure function 547 | if isinstance(x, str): 548 | return self.__class__(text=x, _expr=query, type="text") 549 | return self.__class__(root=x, _expr=query) 550 | 551 | result = [make_selector(x) for x in result] 552 | return typing.cast("SelectorList[Self]", self.selectorlist_cls(result)) 553 | 554 | def xpath( 555 | self, 556 | query: str, 557 | namespaces: Mapping[str, str] | None = None, 558 | **kwargs: Any, 559 | ) -> SelectorList[Self]: 560 | """ 561 | Find nodes matching the xpath ``query`` and return the result as a 562 | :class:`SelectorList` instance with all elements flattened. List 563 | elements implement :class:`Selector` interface too. 564 | 565 | ``query`` is a string containing the XPATH query to apply. 566 | 567 | ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) 568 | for additional prefixes to those registered with ``register_namespace(prefix, uri)``. 569 | Contrary to ``register_namespace()``, these prefixes are not 570 | saved for future calls. 
571 | 572 | Any additional named arguments can be used to pass values for XPath 573 | variables in the XPath expression, e.g.:: 574 | 575 | selector.xpath('//a[href=$url]', url="http://www.example.com") 576 | """ 577 | if self.type not in ("html", "xml", "text"): 578 | raise ValueError(f"Cannot use xpath on a Selector of type {self.type!r}") 579 | if self.type in ("html", "xml"): 580 | try: 581 | xpathev = self.root.xpath 582 | except AttributeError: 583 | return typing.cast("SelectorList[Self]", self.selectorlist_cls([])) 584 | else: 585 | try: 586 | xpathev = self._get_root(self._text or "", type="html").xpath 587 | except AttributeError: 588 | return typing.cast("SelectorList[Self]", self.selectorlist_cls([])) 589 | 590 | nsp = dict(self.namespaces) 591 | if namespaces is not None: 592 | nsp.update(namespaces) 593 | try: 594 | result = xpathev( 595 | query, 596 | namespaces=nsp, 597 | smart_strings=self._lxml_smart_strings, 598 | **kwargs, 599 | ) 600 | except etree.XPathError as exc: 601 | raise ValueError(f"XPath error: {exc} in {query}") 602 | 603 | if not isinstance(result, list): 604 | result = [result] 605 | 606 | result = [ 607 | self.__class__( 608 | root=x, 609 | _expr=query, 610 | namespaces=self.namespaces, 611 | type=_xml_or_html(self.type), 612 | ) 613 | for x in result 614 | ] 615 | return typing.cast("SelectorList[Self]", self.selectorlist_cls(result)) 616 | 617 | def css(self, query: str) -> SelectorList[Self]: 618 | """ 619 | Apply the given CSS selector and return a :class:`SelectorList` instance. 620 | 621 | ``query`` is a string containing the CSS selector to apply. 622 | 623 | In the background, CSS queries are translated into XPath queries using 624 | the `cssselect`_ library and executed with the ``.xpath()`` method. 625 | 626 | .. _cssselect: https://pypi.python.org/pypi/cssselect/ 627 | """ 628 | if self.type not in ("html", "xml", "text"): 629 | raise ValueError(f"Cannot use css on a Selector of type {self.type!r}") 630 | return self.xpath(self._css2xpath(query)) 631 | 632 | def _css2xpath(self, query: str) -> str: 633 | type = _xml_or_html(self.type) 634 | return _ctgroup[type]["_csstranslator"].css_to_xpath(query) 635 | 636 | def re(self, regex: str | Pattern[str], replace_entities: bool = True) -> list[str]: 637 | """ 638 | Apply the given regex and return a list of strings with the 639 | matches. 640 | 641 | ``regex`` can be either a compiled regular expression or a string which 642 | will be compiled to a regular expression using ``re.compile(regex)``. 643 | 644 | By default, character entity references are replaced by their 645 | corresponding character (except for ``&amp;`` and ``&lt;``). 646 | Passing ``replace_entities`` as ``False`` switches off these 647 | replacements. 648 | """ 649 | data = self.get() 650 | return extract_regex(regex, data, replace_entities=replace_entities) 651 | 652 | @typing.overload 653 | def re_first( 654 | self, 655 | regex: str | Pattern[str], 656 | default: None = None, 657 | replace_entities: bool = True, 658 | ) -> str | None: 659 | pass 660 | 661 | @typing.overload 662 | def re_first( 663 | self, 664 | regex: str | Pattern[str], 665 | default: str, 666 | replace_entities: bool = True, 667 | ) -> str: 668 | pass 669 | 670 | def re_first( 671 | self, 672 | regex: str | Pattern[str], 673 | default: str | None = None, 674 | replace_entities: bool = True, 675 | ) -> str | None: 676 | """ 677 | Apply the given regex and return the first string which matches. 
If 678 | there is no match, return the default value (``None`` if the argument 679 | is not provided). 680 | 681 | By default, character entity references are replaced by their 682 | corresponding character (except for ``&amp;`` and ``&lt;``). 683 | Passing ``replace_entities`` as ``False`` switches off these 684 | replacements. 685 | """ 686 | return next( 687 | iflatten(self.re(regex, replace_entities=replace_entities)), 688 | default, 689 | ) 690 | 691 | def get(self) -> Any: 692 | """ 693 | Serialize and return the matched nodes. 694 | 695 | For HTML and XML, the result is always a string, and percent-encoded 696 | content is unquoted. 697 | """ 698 | if self.type in ("text", "json"): 699 | return self.root 700 | try: 701 | return etree.tostring( 702 | self.root, 703 | method=_ctgroup[self.type]["_tostring_method"], 704 | encoding="unicode", 705 | with_tail=False, 706 | ) 707 | except (AttributeError, TypeError): 708 | if self.root is True: 709 | return "1" 710 | if self.root is False: 711 | return "0" 712 | return str(self.root) 713 | 714 | extract = get 715 | 716 | def getall(self) -> list[str]: 717 | """ 718 | Serialize and return the matched node in a 1-element list of strings. 719 | """ 720 | return [self.get()] 721 | 722 | def register_namespace(self, prefix: str, uri: str) -> None: 723 | """ 724 | Register the given namespace to be used in this :class:`Selector`. 725 | Without registering namespaces you can't select or extract data from 726 | non-standard namespaces. See :ref:`selector-examples-xml`. 727 | """ 728 | self.namespaces[prefix] = uri 729 | 730 | def remove_namespaces(self) -> None: 731 | """ 732 | Remove all namespaces, allowing you to traverse the document using 733 | namespace-less XPaths. See :ref:`removing-namespaces`. 734 | """ 735 | for el in self.root.iter("*"): 736 | if el.tag.startswith("{"): 737 | el.tag = el.tag.split("}", 1)[1] 738 | # loop on element attributes also 739 | for an in el.attrib: 740 | if an.startswith("{"): 741 | el.attrib[an.split("}", 1)[1]] = el.attrib.pop(an) 742 | # remove namespace declarations 743 | etree.cleanup_namespaces(self.root) 744 | 745 | def drop(self) -> None: 746 | """ 747 | Drop matched nodes from the parent element. 748 | """ 749 | try: 750 | parent = self.root.getparent() 751 | except AttributeError: 752 | # 'str' object has no attribute 'getparent' 753 | raise CannotRemoveElementWithoutRoot( 754 | "The node you're trying to drop has no root, " 755 | "are you trying to drop a pseudo-element? " 756 | "Try to use 'li' as a selector instead of 'li::text' or " 757 | "'//li' instead of '//li/text()', for example." 758 | ) 759 | 760 | try: 761 | if self.type == "xml": 762 | if parent is None: 763 | raise ValueError("This node has no parent") 764 | parent.remove(self.root) 765 | else: 766 | typing.cast("html.HtmlElement", self.root).drop_tree() 767 | except (AttributeError, AssertionError): 768 | # 'NoneType' object has no attribute 'drop' 769 | raise CannotDropElementWithoutParent( 770 | "The node you're trying to remove has no parent, " 771 | "are you trying to remove a root element?" 772 | ) 773 | 774 | @property 775 | def attrib(self) -> dict[str, str]: 776 | """Return the attributes dictionary for the underlying element.""" 777 | return dict(self.root.attrib) 778 | 779 | def __bool__(self) -> bool: 780 | """ 781 | Return ``True`` if there is any real content selected or ``False`` 782 | otherwise. In other words, the boolean value of a :class:`Selector` is 783 | given by the contents it selects. 
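For example, a selector matching an empty attribute value extracts ``''`` and is therefore falsy, while one matching non-empty content is truthy (mirroring ``test_bool`` in the test suite).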
784 | """ 785 | return bool(self.get()) 786 | 787 | __nonzero__ = __bool__ 788 | 789 | def __str__(self) -> str: 790 | return str(self.get()) 791 | 792 | def __repr__(self) -> str: 793 | data = repr(shorten(str(self.get()), width=40)) 794 | return f"<{type(self).__name__} query={self._expr!r} data={data}>" 795 | -------------------------------------------------------------------------------- /parsel/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import TYPE_CHECKING, Any, cast 5 | 6 | from w3lib.html import replace_entities as w3lib_replace_entities 7 | 8 | if TYPE_CHECKING: 9 | from collections.abc import Iterable, Iterator 10 | 11 | 12 | def flatten(x: Iterable[Any]) -> list[Any]: 13 | """flatten(sequence) -> list 14 | Returns a single, flat list which contains all elements retrieved 15 | from the sequence and all recursively contained sub-sequences 16 | (iterables). 17 | Examples: 18 | >>> [1, 2, [3,4], (5,6)] 19 | [1, 2, [3, 4], (5, 6)] 20 | >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)]) 21 | [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10] 22 | >>> flatten(["foo", "bar"]) 23 | ['foo', 'bar'] 24 | >>> flatten(["foo", ["baz", 42], "bar"]) 25 | ['foo', 'baz', 42, 'bar'] 26 | """ 27 | return list(iflatten(x)) 28 | 29 | 30 | def iflatten(x: Iterable[Any]) -> Iterator[Any]: 31 | """iflatten(sequence) -> Iterator 32 | Similar to ``.flatten()``, but returns iterator instead""" 33 | for el in x: 34 | if _is_listlike(el): 35 | yield from flatten(el) 36 | else: 37 | yield el 38 | 39 | 40 | def _is_listlike(x: Any) -> bool: 41 | """ 42 | >>> _is_listlike("foo") 43 | False 44 | >>> _is_listlike(5) 45 | False 46 | >>> _is_listlike(b"foo") 47 | False 48 | >>> _is_listlike([b"foo"]) 49 | True 50 | >>> _is_listlike((b"foo",)) 51 | True 52 | >>> _is_listlike({}) 53 | True 54 | >>> _is_listlike(set()) 55 | True 56 | >>> _is_listlike((x for x in range(3))) 57 | True 58 | >>> _is_listlike(range(5)) 59 | True 60 | """ 61 | return hasattr(x, "__iter__") and not isinstance(x, (str, bytes)) 62 | 63 | 64 | def extract_regex( 65 | regex: str | re.Pattern[str], text: str, replace_entities: bool = True 66 | ) -> list[str]: 67 | """Extract a list of strings from the given text/encoding using the following policies: 68 | * if the regex contains a named group called "extract" that will be returned 69 | * if the regex contains multiple numbered groups, all those will be returned (flattened) 70 | * if the regex doesn't contain any group the entire regex matching is returned 71 | """ 72 | if isinstance(regex, str): 73 | regex = re.compile(regex, re.UNICODE) 74 | 75 | if "extract" in regex.groupindex: 76 | # named group 77 | try: 78 | extracted = cast("re.Match[str]", regex.search(text)).group("extract") 79 | except AttributeError: 80 | strings = [] 81 | else: 82 | strings = [extracted] if extracted is not None else [] 83 | else: 84 | # full regex or numbered groups 85 | strings = regex.findall(text) 86 | 87 | strings = flatten(strings) 88 | if not replace_entities: 89 | return strings 90 | return [w3lib_replace_entities(s, keep=["lt", "amp"]) for s in strings] 91 | 92 | 93 | def shorten(text: str, width: int, suffix: str = "...") -> str: 94 | """Truncate the given text to fit in the given width.""" 95 | if len(text) <= width: 96 | return text 97 | if width > len(suffix): 98 | return text[: width - len(suffix)] + suffix 99 | if width >= 0: 100 | return suffix[len(suffix) - width :] 101 | raise 
ValueError("width must be equal or greater than 0") 102 | -------------------------------------------------------------------------------- /parsel/xpathfuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from typing import Any, Callable 5 | 6 | from lxml import etree 7 | from w3lib.html import HTML5_WHITESPACE 8 | 9 | regex = f"[{HTML5_WHITESPACE}]+" 10 | replace_html5_whitespaces = re.compile(regex).sub 11 | 12 | 13 | def set_xpathfunc(fname: str, func: Callable | None) -> None: # type: ignore[type-arg] 14 | """Register a custom extension function to use in XPath expressions. 15 | 16 | The function ``func`` registered under ``fname`` identifier will be called 17 | for every matching node, being passed a ``context`` parameter as well as 18 | any parameters passed from the corresponding XPath expression. 19 | 20 | If ``func`` is ``None``, the extension function will be removed. 21 | 22 | See more `in lxml documentation`_. 23 | 24 | .. _`in lxml documentation`: https://lxml.de/extensions.html#xpath-extension-functions 25 | 26 | """ 27 | ns_fns = etree.FunctionNamespace(None) 28 | if func is not None: 29 | ns_fns[fname] = func 30 | else: 31 | del ns_fns[fname] 32 | 33 | 34 | def setup() -> None: 35 | set_xpathfunc("has-class", has_class) 36 | 37 | 38 | def has_class(context: Any, *classes: str) -> bool: 39 | """has-class function. 40 | 41 | Return True if all ``classes`` are present in element's class attr. 42 | 43 | """ 44 | if not context.eval_context.get("args_checked"): 45 | if not classes: 46 | raise ValueError("XPath error: has-class must have at least 1 argument") 47 | for c in classes: 48 | if not isinstance(c, str): 49 | raise ValueError("XPath error: has-class arguments must be strings") 50 | context.eval_context["args_checked"] = True 51 | 52 | node_cls = context.context_node.get("class") 53 | if node_cls is None: 54 | return False 55 | node_cls = " " + node_cls + " " 56 | node_cls = replace_html5_whitespaces(" ", node_cls) 57 | return all(" " + cls + " " in node_cls for cls in classes) 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.bumpversion] 2 | current_version = "1.10.0" 3 | commit = true 4 | tag = true 5 | tag_name = "v{new_version}" 6 | 7 | [[tool.bumpversion.files]] 8 | filename = "setup.py" 9 | 10 | [[tool.bumpversion.files]] 11 | filename = "parsel/__init__.py" 12 | 13 | [tool.coverage.run] 14 | branch = true 15 | 16 | [tool.coverage.report] 17 | exclude_also = [ 18 | "@typing.overload", 19 | "if TYPE_CHECKING:", 20 | ] 21 | 22 | [tool.pylint.MASTER] 23 | ignore = "typing" 24 | persistent = "no" 25 | extension-pkg-allow-list = [ 26 | "lxml", 27 | ] 28 | 29 | [tool.pylint."MESSAGES CONTROL"] 30 | enable = [ 31 | "useless-suppression", 32 | ] 33 | disable = [ 34 | "fixme", 35 | "import-error", 36 | "import-outside-toplevel", 37 | "invalid-name", 38 | "line-too-long", 39 | "missing-class-docstring", 40 | "missing-function-docstring", 41 | "missing-module-docstring", 42 | "no-member", 43 | "not-callable", 44 | "protected-access", 45 | "raise-missing-from", 46 | "redefined-builtin", 47 | "too-few-public-methods", 48 | "too-many-arguments", 49 | "too-many-lines", 50 | "too-many-positional-arguments", 51 | "too-many-public-methods", 52 | "unused-argument", 53 | "wrong-import-position", 54 | ] 55 | 56 | [tool.pytest.ini_options] 57 | 
addopts = "--assert=plain --doctest-modules --ignore=setup.py" 58 | 59 | [tool.ruff.lint] 60 | extend-select = [ 61 | # flake8-bugbear 62 | "B", 63 | # flake8-comprehensions 64 | "C4", 65 | # pydocstyle 66 | "D", 67 | # flake8-future-annotations 68 | "FA", 69 | # flynt 70 | "FLY", 71 | # refurb 72 | "FURB", 73 | # isort 74 | "I", 75 | # flake8-implicit-str-concat 76 | "ISC", 77 | # flake8-logging 78 | "LOG", 79 | # Perflint 80 | "PERF", 81 | # pygrep-hooks 82 | "PGH", 83 | # flake8-pie 84 | "PIE", 85 | # pylint 86 | "PL", 87 | # flake8-use-pathlib 88 | "PTH", 89 | # flake8-pyi 90 | "PYI", 91 | # flake8-quotes 92 | "Q", 93 | # flake8-return 94 | "RET", 95 | # flake8-raise 96 | "RSE", 97 | # Ruff-specific rules 98 | "RUF", 99 | # flake8-bandit 100 | "S", 101 | # flake8-simplify 102 | "SIM", 103 | # flake8-slots 104 | "SLOT", 105 | # flake8-debugger 106 | "T10", 107 | # flake8-type-checking 108 | "TC", 109 | # pyupgrade 110 | "UP", 111 | # pycodestyle warnings 112 | "W", 113 | # flake8-2020 114 | "YTT", 115 | ] 116 | ignore = [ 117 | # Within an `except` clause, raise exceptions with `raise ... from` 118 | "B904", 119 | # Missing docstring in public module 120 | "D100", 121 | # Missing docstring in public class 122 | "D101", 123 | # Missing docstring in public method 124 | "D102", 125 | # Missing docstring in public function 126 | "D103", 127 | # Missing docstring in public package 128 | "D104", 129 | # Missing docstring in magic method 130 | "D105", 131 | # Missing docstring in public nested class 132 | "D106", 133 | # Missing docstring in __init__ 134 | "D107", 135 | # One-line docstring should fit on one line with quotes 136 | "D200", 137 | # No blank lines allowed after function docstring 138 | "D202", 139 | # 1 blank line required between summary line and description 140 | "D205", 141 | # Multi-line docstring closing quotes should be on a separate line 142 | "D209", 143 | # First line should end with a period 144 | "D400", 145 | # First line should be in imperative mood; try rephrasing 146 | "D401", 147 | # First line should not be the function's "signature" 148 | "D402", 149 | # First word of the first line should be properly capitalized 150 | "D403", 151 | # No blank lines allowed between a section header and its content 152 | "D412", 153 | # Too many return statements 154 | "PLR0911", 155 | # Too many branches 156 | "PLR0912", 157 | # Too many arguments in function definition 158 | "PLR0913", 159 | # Too many statements 160 | "PLR0915", 161 | # Magic value used in comparison 162 | "PLR2004", 163 | # String contains ambiguous {}. 164 | "RUF001", 165 | # Docstring contains ambiguous {}. 166 | "RUF002", 167 | # Comment contains ambiguous {}. 168 | "RUF003", 169 | # Mutable class attributes should be annotated with `typing.ClassVar` 170 | "RUF012", 171 | # Use of `assert` detected 172 | "S101", 173 | # Using lxml to parse untrusted data is known to be vulnerable to XML attacks 174 | "S320", 175 | 176 | # pending: https://github.com/scrapy/parsel/issues/312 177 | "B019", 178 | ] 179 | 180 | [tool.ruff.lint.per-file-ignores] 181 | "tests/typing/selector.py" = ["F841"] 182 | 183 | [tool.ruff.lint.pydocstyle] 184 | convention = "pep257" 185 | -------------------------------------------------------------------------------- /release.rst: -------------------------------------------------------------------------------- 1 | Release procedures 2 | ------------------ 3 | 4 | * Update NEWS file with the release notes. 
5 | Review changes using: ``restview --pypi-strict <(cat README.rst NEWS | grep -v ':changelog')`` 6 | * Run bumpversion with the proper release type 7 | * Push code and tags to GitHub to trigger build 8 | * Copy release notes to https://github.com/scrapy/parsel/releases 9 | * Verify in a temporary virtualenv that ``pip install parsel`` installs the 10 | latest version 11 | * Update version builds at: https://readthedocs.org/projects/parsel/versions/ 12 | You should ensure that previous stable version is active and point stable to the new tag 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from pathlib import Path 3 | 4 | from setuptools import setup 5 | 6 | readme = Path("README.rst").read_text(encoding="utf-8") 7 | history = Path("NEWS").read_text(encoding="utf-8").replace(".. :changelog:", "") 8 | 9 | setup( 10 | name="parsel", 11 | version="1.10.0", 12 | description="Parsel is a library to extract data from HTML and XML using XPath and CSS selectors", 13 | long_description=readme + "\n\n" + history, 14 | long_description_content_type="text/x-rst", 15 | author="Scrapy project", 16 | author_email="info@scrapy.org", 17 | url="https://github.com/scrapy/parsel", 18 | packages=[ 19 | "parsel", 20 | ], 21 | package_dir={ 22 | "parsel": "parsel", 23 | }, 24 | include_package_data=True, 25 | install_requires=[ 26 | "cssselect>=1.2.0", 27 | "jmespath", 28 | "lxml", 29 | "packaging", 30 | "w3lib>=1.19.0", 31 | ], 32 | python_requires=">=3.9", 33 | license="BSD", 34 | zip_safe=False, 35 | keywords="parsel", 36 | classifiers=[ 37 | "Development Status :: 5 - Production/Stable", 38 | "Intended Audience :: Developers", 39 | "License :: OSI Approved :: BSD License", 40 | "Natural Language :: English", 41 | "Topic :: Text Processing :: Markup", 42 | "Topic :: Text Processing :: Markup :: HTML", 43 | "Topic :: Text Processing :: Markup :: XML", 44 | "Programming Language :: Python :: 3", 45 | "Programming Language :: Python :: 3.9", 46 | "Programming Language :: Python :: 3.10", 47 | "Programming Language :: Python :: 3.11", 48 | "Programming Language :: Python :: 3.12", 49 | "Programming Language :: Python :: 3.13", 50 | "Programming Language :: Python :: Implementation :: CPython", 51 | "Programming Language :: Python :: Implementation :: PyPy", 52 | ], 53 | ) 54 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | psutil==5.6.3 # https://github.com/giampaolo/psutil/issues/1659#issuecomment-586032229 2 | pytest 3 | pytest-cov 4 | sybil 5 | -------------------------------------------------------------------------------- /tests/test_selector.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pickle 4 | import re 5 | import typing 6 | import unittest 7 | import warnings 8 | import weakref 9 | from typing import TYPE_CHECKING, Any, cast 10 | 11 | from lxml import etree 12 | from packaging.version import Version 13 | 14 | from parsel import Selector, SelectorList 15 | from parsel.selector import ( 16 | _NOT_SET, 17 | LXML_SUPPORTS_HUGE_TREE, 18 | CannotRemoveElementWithoutParent, 19 | CannotRemoveElementWithoutRoot, 20 | ) 21 | 22 | if TYPE_CHECKING: 23 | from collections.abc import Mapping 24 | 25 | from lxml.html import 
HtmlElement 26 | 27 | 28 | class SelectorTestCase(unittest.TestCase): 29 | sscls = Selector 30 | 31 | def assertIsSelector(self, value: Any) -> None: 32 | self.assertEqual(type(value), type(self.sscls(text=""))) 33 | 34 | def assertIsSelectorList(self, value: Any) -> None: 35 | self.assertEqual(type(value), type(self.sscls.selectorlist_cls())) 36 | 37 | def test_pickle_selector(self) -> None: 38 | sel = self.sscls(text="<html><body><p>some text</p></body></html>") 39 | self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) 40 | 41 | def test_pickle_selector_list(self) -> None: 42 | sel = self.sscls( 43 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 44 | ) 45 | sel_list = sel.css("li") 46 | empty_sel_list = sel.css("p") 47 | self.assertIsSelectorList(sel_list) 48 | self.assertIsSelectorList(empty_sel_list) 49 | self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) 50 | self.assertRaises( 51 | TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list 52 | ) 53 | 54 | def test_simple_selection(self) -> None: 55 | """Simple selector tests""" 56 | body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" 57 | sel = self.sscls(text=body) 58 | 59 | xl = sel.xpath("//input") 60 | self.assertEqual(2, len(xl)) 61 | for x in xl: 62 | self.assertIsSelector(x) 63 | 64 | self.assertEqual( 65 | sel.xpath("//input").extract(), 66 | [x.extract() for x in sel.xpath("//input")], 67 | ) 68 | 69 | self.assertEqual( 70 | [x.extract() for x in sel.xpath("//input[@name='a']/@name")], 71 | ["a"], 72 | ) 73 | self.assertEqual( 74 | [ 75 | x.extract() 76 | for x in sel.xpath( 77 | "number(concat(//input[@name='a']/@value, //input[@name='b']/@value))" 78 | ) 79 | ], 80 | ["12.0"], 81 | ) 82 | 83 | self.assertEqual( 84 | sel.xpath("concat('xpath', 'rules')").extract(), ["xpathrules"] 85 | ) 86 | self.assertEqual( 87 | [ 88 | x.extract() 89 | for x in sel.xpath( 90 | "concat(//input[@name='a']/@value, //input[@name='b']/@value)" 91 | ) 92 | ], 93 | ["12"], 94 | ) 95 | 96 | def test_simple_selection_with_variables(self) -> None: 97 | """Using XPath variables""" 98 | body = "<p><input name='a' value='1'/><input name='b' value='2'/></p>" 99 | sel = self.sscls(text=body) 100 | 101 | self.assertEqual( 102 | [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], 103 | ["a"], 104 | ) 105 | self.assertEqual( 106 | [ 107 | x.extract() 108 | for x in sel.xpath("//input[@name=$letter]/@value", letter="b") 109 | ], 110 | ["2"], 111 | ) 112 | 113 | self.assertEqual( 114 | sel.xpath( 115 | "count(//input[@value=$number or @name=$letter])", 116 | number=2, 117 | letter="a", 118 | ).extract(), 119 | ["2.0"], 120 | ) 121 | 122 | # you can also pass booleans 123 | self.assertEqual( 124 | sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), 125 | ["1"], 126 | ) 127 | self.assertEqual( 128 | sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), 129 | ["0"], 130 | ) 131 | self.assertEqual( 132 | sel.xpath( 133 | "boolean(count(//input)=$cnt)=$test", cnt=4, test=False 134 | ).extract(), 135 | ["1"], 136 | ) 137 | 138 | # for named nodes, you need to use "name()=node_name" 139 | self.assertEqual( 140 | sel.xpath( 141 | "boolean(count(//*[name()=$tag])=$cnt)=$test", 142 | tag="input", 143 | cnt=2, 144 | test=True, 145 | ).extract(), 146 | ["1"], 147 | ) 148 | 149 | def test_simple_selection_with_variables_escape_friendly(self) -> None: 150 | """Using XPath variables with quotes that would 
need escaping with string formatting""" 151 | body = """<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/> 152 | "double quotes" and I don't care :)</p>""" 153 | sel = self.sscls(text=body) 154 | 155 | t = 'I say "Yeah!"' 156 | # naive string formatting will give something like: 157 | # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name 158 | self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name') 159 | 160 | # with XPath variables, escaping is done for you 161 | self.assertEqual( 162 | [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], 163 | ["a"], 164 | ) 165 | lt = """I'm mixing single and "double quotes" and I don't care :)""" 166 | # the following gives you something like 167 | # ValueError: XPath error: Invalid predicate in //p[normalize-space()='I'm mixing single and "double quotes" and I don't care :)']//@name 168 | self.assertRaises( 169 | ValueError, sel.xpath, f"//p[normalize-space()='{lt}']//@name" 170 | ) 171 | 172 | self.assertEqual( 173 | [ 174 | x.extract() 175 | for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt) 176 | ], 177 | ["a"], 178 | ) 179 | 180 | def test_accessing_attributes(self) -> None: 181 | body = """ 182 | <html lang="en" version="1.0"> 183 | <body> 184 | <ul id="some-list" class="list-cls" class="list-cls"> 185 | <li class="item-cls" id="list-item-1"> 186 | <li class="item-cls active" id="list-item-2"> 187 | <li class="item-cls" id="list-item-3"> 188 | </ul> 189 | </body> 190 | </html> 191 | """ 192 | sel = self.sscls(text=body) 193 | self.assertEqual({"lang": "en", "version": "1.0"}, sel.attrib) 194 | self.assertEqual( 195 | {"id": "some-list", "class": "list-cls"}, sel.css("ul")[0].attrib 196 | ) 197 | 198 | # for a SelectorList, bring the attributes of first-element only 199 | self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib) 200 | self.assertEqual( 201 | {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib 202 | ) 203 | self.assertEqual({}, sel.css("body").attrib) 204 | self.assertEqual({}, sel.css("non-existing-element").attrib) 205 | 206 | self.assertEqual( 207 | [ 208 | {"class": "item-cls", "id": "list-item-1"}, 209 | {"class": "item-cls active", "id": "list-item-2"}, 210 | {"class": "item-cls", "id": "list-item-3"}, 211 | ], 212 | [e.attrib for e in sel.css("li")], 213 | ) 214 | 215 | def test_representation_slice(self) -> None: 216 | body = f"<p><input name='{50 * 'b'}' value='\xa9'/></p>" 217 | sel = self.sscls(text=body) 218 | 219 | representation = f"<Selector query='//input/@name' data='{37 * 'b'}...'>" 220 | 221 | self.assertEqual( 222 | [repr(it) for it in sel.xpath("//input/@name")], [representation] 223 | ) 224 | 225 | def test_representation_unicode_query(self) -> None: 226 | body = f"<p><input name='{50 * 'b'}' value='\xa9'/></p>" 227 | 228 | representation = "<Selector query='//input[@value=\"©\"]/@value' data='©'>" 229 | 230 | sel = self.sscls(text=body) 231 | self.assertEqual( 232 | [repr(it) for it in sel.xpath('//input[@value="\xa9"]/@value')], 233 | [representation], 234 | ) 235 | 236 | def test_check_text_argument_type(self) -> None: 237 | self.assertRaisesRegex( 238 | TypeError, 239 | "text argument should be of type", 240 | self.sscls, 241 | b"<html/>", 242 | ) 243 | 244 | def test_extract_first(self) -> None: 245 | """Test if extract_first() returns first element""" 246 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 247 | sel = self.sscls(text=body) 248 | 249 | self.assertEqual( 
250 | sel.xpath("//ul/li/text()").extract_first(), 251 | sel.xpath("//ul/li/text()").extract()[0], 252 | ) 253 | 254 | self.assertEqual( 255 | sel.xpath('//ul/li[@id="1"]/text()').extract_first(), 256 | sel.xpath('//ul/li[@id="1"]/text()').extract()[0], 257 | ) 258 | 259 | self.assertEqual( 260 | sel.xpath("//ul/li[2]/text()").extract_first(), 261 | sel.xpath("//ul/li/text()").extract()[1], 262 | ) 263 | 264 | self.assertEqual( 265 | sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), 266 | None, 267 | ) 268 | 269 | def test_extract_first_default(self) -> None: 270 | """Test if extract_first() returns default value when no results found""" 271 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 272 | sel = self.sscls(text=body) 273 | 274 | self.assertEqual( 275 | sel.xpath("//div/text()").extract_first(default="missing"), 276 | "missing", 277 | ) 278 | 279 | def test_selector_get_alias(self) -> None: 280 | """Test if get() returns extracted value on a Selector""" 281 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 282 | sel = self.sscls(text=body) 283 | 284 | self.assertEqual( 285 | sel.xpath("//ul/li[position()>1]")[0].get(), '<li id="2">2</li>' 286 | ) 287 | self.assertEqual(sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2") 288 | 289 | def test_selector_getall_alias(self) -> None: 290 | """Test if get() returns extracted value on a Selector""" 291 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 292 | sel = self.sscls(text=body) 293 | 294 | self.assertListEqual( 295 | sel.xpath("//ul/li[position()>1]")[0].getall(), 296 | ['<li id="2">2</li>'], 297 | ) 298 | self.assertListEqual( 299 | sel.xpath("//ul/li[position()>1]/text()")[0].getall(), ["2"] 300 | ) 301 | 302 | def test_selectorlist_get_alias(self) -> None: 303 | """Test if get() returns first element for a selection call""" 304 | body = '<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>' 305 | sel = self.sscls(text=body) 306 | 307 | self.assertEqual(sel.xpath("//ul/li").get(), '<li id="1">1</li>') 308 | self.assertEqual(sel.xpath("//ul/li/text()").get(), "1") 309 | 310 | def test_re_first(self) -> None: 311 | """Test if re_first() returns first matched element""" 312 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 313 | sel = self.sscls(text=body) 314 | 315 | self.assertEqual( 316 | sel.xpath("//ul/li/text()").re_first(r"\d"), 317 | sel.xpath("//ul/li/text()").re(r"\d")[0], 318 | ) 319 | 320 | self.assertEqual( 321 | sel.xpath('//ul/li[@id="1"]/text()').re_first(r"\d"), 322 | sel.xpath('//ul/li[@id="1"]/text()').re(r"\d")[0], 323 | ) 324 | 325 | self.assertEqual( 326 | sel.xpath("//ul/li[2]/text()").re_first(r"\d"), 327 | sel.xpath("//ul/li/text()").re(r"\d")[1], 328 | ) 329 | 330 | self.assertEqual(sel.xpath("/ul/li/text()").re_first(r"\w+"), None) 331 | self.assertEqual( 332 | sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r"\d"), 333 | None, 334 | ) 335 | 336 | self.assertEqual(sel.re_first(r'id="(\d+)'), "1") 337 | self.assertEqual(sel.re_first(r"foo"), None) 338 | self.assertEqual(sel.re_first(r"foo", default="bar"), "bar") 339 | 340 | def test_extract_first_re_default(self) -> None: 341 | """Test if re_first() returns default value when no results found""" 342 | body = '<ul><li id="1">1</li><li id="2">2</li></ul>' 343 | sel = self.sscls(text=body) 344 | 345 | self.assertEqual( 346 | sel.xpath("//div/text()").re_first(r"\w+", default="missing"), 347 | "missing", 348 | ) 349 | self.assertEqual( 350 | 
sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), 351 | "missing", 352 | ) 353 | 354 | def test_select_unicode_query(self) -> None: 355 | body = "<p><input name='\xa9' value='1'/></p>" 356 | sel = self.sscls(text=body) 357 | self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"]) 358 | 359 | def test_list_elements_type(self) -> None: 360 | """Test Selector returning the same type in selection methods""" 361 | text = "<p>test<p>" 362 | self.assertEqual( 363 | type(self.sscls(text=text).xpath("//p")[0]), 364 | type(self.sscls(text=text)), 365 | ) 366 | self.assertEqual( 367 | type(self.sscls(text=text).css("p")[0]), 368 | type(self.sscls(text=text)), 369 | ) 370 | 371 | def test_boolean_result(self) -> None: 372 | body = "<p><input name='a'value='1'/><input name='b'value='2'/></p>" 373 | xs = self.sscls(text=body) 374 | self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"]) 375 | self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"]) 376 | 377 | def test_differences_parsing_xml_vs_html(self) -> None: 378 | """Test that XML and HTML Selector's behave differently""" 379 | # some text which is parsed differently by XML and HTML flavors 380 | text = '<div><img src="a.jpg"><p>Hello</div>' 381 | hs = self.sscls(text=text, type="html") 382 | self.assertEqual( 383 | hs.xpath("//div").extract(), 384 | ['<div><img src="a.jpg"><p>Hello</p></div>'], 385 | ) 386 | 387 | xs = self.sscls(text=text, type="xml") 388 | self.assertEqual( 389 | xs.xpath("//div").extract(), 390 | ['<div><img src="a.jpg"><p>Hello</p></img></div>'], 391 | ) 392 | 393 | def test_error_for_unknown_selector_type(self) -> None: 394 | self.assertRaises(ValueError, self.sscls, text="", type="_na_") 395 | 396 | def test_text_or_root_is_required(self) -> None: 397 | self.assertRaisesRegex( 398 | ValueError, 399 | "Selector needs text, body, or root arguments", 400 | self.sscls, 401 | ) 402 | 403 | def test_bool(self) -> None: 404 | text = '<a href="" >false</a><a href="nonempty">true</a>' 405 | hs = self.sscls(text=text, type="html") 406 | falsish = hs.xpath("//a/@href")[0] 407 | self.assertEqual(falsish.extract(), "") 408 | self.assertFalse(falsish) 409 | trueish = hs.xpath("//a/@href")[1] 410 | self.assertEqual(trueish.extract(), "nonempty") 411 | self.assertTrue(trueish) 412 | 413 | def test_slicing(self) -> None: 414 | text = "<div><p>1</p><p>2</p><p>3</p></div>" 415 | hs = self.sscls(text=text, type="html") 416 | self.assertIsSelector(hs.css("p")[2]) 417 | self.assertIsSelectorList(hs.css("p")[2:3]) 418 | self.assertIsSelectorList(hs.css("p")[:2]) 419 | self.assertEqual(hs.css("p")[2:3].extract(), ["<p>3</p>"]) 420 | self.assertEqual(hs.css("p")[1:3].extract(), ["<p>2</p>", "<p>3</p>"]) 421 | 422 | def test_nested_selectors(self) -> None: 423 | """Nested selector tests""" 424 | body = """<body> 425 | <div class='one'> 426 | <ul> 427 | <li>one</li><li>two</li> 428 | </ul> 429 | </div> 430 | <div class='two'> 431 | <ul> 432 | <li>four</li><li>five</li><li>six</li> 433 | </ul> 434 | </div> 435 | </body>""" 436 | 437 | x = self.sscls(text=body) 438 | divtwo = x.xpath('//div[@class="two"]') 439 | self.assertEqual( 440 | divtwo.xpath("//li").extract(), 441 | [ 442 | "<li>one</li>", 443 | "<li>two</li>", 444 | "<li>four</li>", 445 | "<li>five</li>", 446 | "<li>six</li>", 447 | ], 448 | ) 449 | self.assertEqual( 450 | divtwo.xpath("./ul/li").extract(), 451 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 452 | ) 453 | self.assertEqual( 454 | 
divtwo.xpath(".//li").extract(), 455 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 456 | ) 457 | self.assertEqual(divtwo.xpath("./li").extract(), []) 458 | 459 | def test_selectorlist_getall_alias(self) -> None: 460 | """Nested selector tests using getall()""" 461 | body = """<body> 462 | <div class='one'> 463 | <ul> 464 | <li>one</li><li>two</li> 465 | </ul> 466 | </div> 467 | <div class='two'> 468 | <ul> 469 | <li>four</li><li>five</li><li>six</li> 470 | </ul> 471 | </div> 472 | </body>""" 473 | 474 | x = self.sscls(text=body) 475 | divtwo = x.xpath('//div[@class="two"]') 476 | self.assertEqual( 477 | divtwo.xpath("//li").getall(), 478 | [ 479 | "<li>one</li>", 480 | "<li>two</li>", 481 | "<li>four</li>", 482 | "<li>five</li>", 483 | "<li>six</li>", 484 | ], 485 | ) 486 | self.assertEqual( 487 | divtwo.xpath("./ul/li").getall(), 488 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 489 | ) 490 | self.assertEqual( 491 | divtwo.xpath(".//li").getall(), 492 | ["<li>four</li>", "<li>five</li>", "<li>six</li>"], 493 | ) 494 | self.assertEqual(divtwo.xpath("./li").getall(), []) 495 | 496 | def test_mixed_nested_selectors(self) -> None: 497 | body = """<body> 498 | <div id=1>not<span>me</span></div> 499 | <div class="dos"><p>text</p><a href='#'>foo</a></div> 500 | </body>""" 501 | sel = self.sscls(text=body) 502 | self.assertEqual( 503 | sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"] 504 | ) 505 | self.assertEqual(sel.css("#1").xpath("./span/text()").extract(), ["me"]) 506 | 507 | def test_dont_strip(self) -> None: 508 | sel = self.sscls(text='<div>fff: <a href="#">zzz</a></div>') 509 | self.assertEqual(sel.xpath("//text()").extract(), ["fff: ", "zzz"]) 510 | 511 | def test_namespaces_simple(self) -> None: 512 | body = """ 513 | <test xmlns:somens="http://scrapy.org"> 514 | <somens:a id="foo">take this</a> 515 | <a id="bar">found</a> 516 | </test> 517 | """ 518 | 519 | x = self.sscls(text=body, type="xml") 520 | 521 | x.register_namespace("somens", "http://scrapy.org") 522 | self.assertEqual(x.xpath("//somens:a/text()").extract(), ["take this"]) 523 | 524 | def test_namespaces_adhoc(self) -> None: 525 | body = """ 526 | <test xmlns:somens="http://scrapy.org"> 527 | <somens:a id="foo">take this</a> 528 | <a id="bar">found</a> 529 | </test> 530 | """ 531 | 532 | x = self.sscls(text=body, type="xml") 533 | 534 | self.assertEqual( 535 | x.xpath( 536 | "//somens:a/text()", 537 | namespaces={"somens": "http://scrapy.org"}, 538 | ).extract(), 539 | ["take this"], 540 | ) 541 | 542 | def test_namespaces_adhoc_variables(self) -> None: 543 | body = """ 544 | <test xmlns:somens="http://scrapy.org"> 545 | <somens:a id="foo">take this</a> 546 | <a id="bar">found</a> 547 | </test> 548 | """ 549 | 550 | x = self.sscls(text=body, type="xml") 551 | 552 | self.assertEqual( 553 | x.xpath( 554 | "//somens:a/following-sibling::a[@id=$identifier]/text()", 555 | namespaces={"somens": "http://scrapy.org"}, 556 | identifier="bar", 557 | ).extract(), 558 | ["found"], 559 | ) 560 | 561 | def test_namespaces_multiple(self) -> None: 562 | body = """<?xml version="1.0" encoding="UTF-8"?> 563 | <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" 564 | xmlns:b="http://somens.com" 565 | xmlns:p="http://www.scrapy.org/product" > 566 | <b:Operation>hello</b:Operation> 567 | <TestTag b:att="value"><Other>value</Other></TestTag> 568 | <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> 569 | </BrowseNode> 570 | """ 571 | 
x = self.sscls(text=body, type="xml") 572 | x.register_namespace( 573 | "xmlns", 574 | "http://webservices.amazon.com/AWSECommerceService/2005-10-05", 575 | ) 576 | x.register_namespace("p", "http://www.scrapy.org/product") 577 | x.register_namespace("b", "http://somens.com") 578 | self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) 579 | self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello") 580 | self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value") 581 | self.assertEqual( 582 | x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90" 583 | ) 584 | self.assertEqual( 585 | x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), 586 | "90", 587 | ) 588 | self.assertEqual( 589 | x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 590 | "iron", 591 | ) 592 | 593 | def test_namespaces_multiple_adhoc(self) -> None: 594 | body = """<?xml version="1.0" encoding="UTF-8"?> 595 | <BrowseNode xmlns="http://webservices.amazon.com/AWSECommerceService/2005-10-05" 596 | xmlns:b="http://somens.com" 597 | xmlns:p="http://www.scrapy.org/product" > 598 | <b:Operation>hello</b:Operation> 599 | <TestTag b:att="value"><Other>value</Other></TestTag> 600 | <p:SecondTestTag><material>iron</material><price>90</price><p:name>Dried Rose</p:name></p:SecondTestTag> 601 | </BrowseNode> 602 | """ 603 | x = self.sscls(text=body, type="xml") 604 | x.register_namespace( 605 | "xmlns", 606 | "http://webservices.amazon.com/AWSECommerceService/2005-10-05", 607 | ) 608 | self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) 609 | 610 | # "b" namespace is not declared yet 611 | self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") 612 | 613 | # "b" namespace being passed ad-hoc 614 | self.assertEqual( 615 | x.xpath( 616 | "//b:Operation/text()", namespaces={"b": "http://somens.com"} 617 | ).extract()[0], 618 | "hello", 619 | ) 620 | 621 | # "b" namespace declaration is not cached 622 | self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") 623 | 624 | # "xmlns" is still defined 625 | self.assertEqual( 626 | x.xpath( 627 | "//xmlns:TestTag/@b:att", 628 | namespaces={"b": "http://somens.com"}, 629 | ).extract()[0], 630 | "value", 631 | ) 632 | 633 | # chained selectors still have knowledge of register_namespace() operations 634 | self.assertEqual( 635 | x.xpath( 636 | "//p:SecondTestTag", 637 | namespaces={"p": "http://www.scrapy.org/product"}, 638 | ) 639 | .xpath("./xmlns:price/text()")[0] 640 | .extract(), 641 | "90", 642 | ) 643 | 644 | # but chained selector don't know about parent ad-hoc declarations 645 | self.assertRaises( 646 | ValueError, 647 | x.xpath( 648 | "//p:SecondTestTag", 649 | namespaces={"p": "http://www.scrapy.org/product"}, 650 | ).xpath, 651 | "p:name/text()", 652 | ) 653 | 654 | # ad-hoc declarations need repeats when chaining 655 | self.assertEqual( 656 | x.xpath( 657 | "//p:SecondTestTag", 658 | namespaces={"p": "http://www.scrapy.org/product"}, 659 | ) 660 | .xpath( 661 | "p:name/text()", 662 | namespaces={"p": "http://www.scrapy.org/product"}, 663 | ) 664 | .extract_first(), 665 | "Dried Rose", 666 | ) 667 | 668 | # declaring several ad-hoc namespaces 669 | self.assertEqual( 670 | x.xpath( 671 | "string(//b:Operation/following-sibling::xmlns:TestTag" 672 | "/following-sibling::*//p:name)", 673 | namespaces={ 674 | "b": "http://somens.com", 675 | "p": "http://www.scrapy.org/product", 676 | }, 677 | ).extract_first(), 678 | "Dried Rose", 679 | ) 680 | 681 | # "p" prefix is not cached from previous calls 682 
| self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") 683 | 684 | x.register_namespace("p", "http://www.scrapy.org/product") 685 | self.assertEqual( 686 | x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], 687 | "iron", 688 | ) 689 | 690 | def test_make_links_absolute(self) -> None: 691 | text = '<a href="file.html">link to file</a>' 692 | sel = Selector(text=text, base_url="http://example.com") 693 | typing.cast("HtmlElement", sel.root).make_links_absolute() 694 | self.assertEqual( 695 | "http://example.com/file.html", 696 | sel.xpath("//a/@href").extract_first(), 697 | ) 698 | 699 | def test_re(self) -> None: 700 | body = """<div>Name: Mary 701 | <ul> 702 | <li>Name: John</li> 703 | <li>Age: 10</li> 704 | <li>Name: Paul</li> 705 | <li>Age: 20</li> 706 | </ul> 707 | Age: 20 708 | </div>""" 709 | x = self.sscls(text=body) 710 | 711 | name_re = re.compile(r"Name: (\w+)") 712 | self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) 713 | self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) 714 | 715 | # Test named group, hit and miss 716 | x = self.sscls(text="foobar") 717 | self.assertEqual(x.re("(?P<extract>foo)"), ["foo"]) 718 | self.assertEqual(x.re("(?P<extract>baz)"), []) 719 | 720 | # A purposely constructed test for an edge case 721 | x = self.sscls(text="baz") 722 | self.assertEqual(x.re("(?P<extract>foo)|(?P<bar>baz)"), []) 723 | 724 | def test_re_replace_entities(self) -> None: 725 | body = """<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>""" 726 | x = self.sscls(text=body) 727 | 728 | name_re = re.compile('{"foo":(.*)}') 729 | 730 | # by default, only &amp; and &lt; are preserved ; 731 | # other entities are converted 732 | expected = '"bar &amp; "baz""' 733 | self.assertEqual(x.xpath("//script/text()").re(name_re), [expected]) 734 | self.assertEqual(x.xpath("//script").re(name_re), [expected]) 735 | self.assertEqual(x.xpath("//script/text()")[0].re(name_re), [expected]) 736 | self.assertEqual(x.xpath("//script")[0].re(name_re), [expected]) 737 | 738 | # check that re_first() works the same way for single value output 739 | self.assertEqual(x.xpath("//script").re_first(name_re), expected) 740 | self.assertEqual(x.xpath("//script")[0].re_first(name_re), expected) 741 | 742 | # switching off replace_entities will preserve &quot; also 743 | expected = '"bar &amp; &quot;baz&quot;"' 744 | self.assertEqual( 745 | x.xpath("//script/text()").re(name_re, replace_entities=False), 746 | [expected], 747 | ) 748 | self.assertEqual( 749 | x.xpath("//script")[0].re(name_re, replace_entities=False), 750 | [expected], 751 | ) 752 | 753 | self.assertEqual( 754 | x.xpath("//script/text()").re_first(name_re, replace_entities=False), 755 | expected, 756 | ) 757 | self.assertEqual( 758 | x.xpath("//script")[0].re_first(name_re, replace_entities=False), 759 | expected, 760 | ) 761 | 762 | def test_re_intl(self) -> None: 763 | body = "<div>Evento: cumplea\xf1os</div>" 764 | x = self.sscls(text=body) 765 | self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]) 766 | 767 | def test_selector_over_text(self) -> None: 768 | hs = self.sscls(text="<root>lala</root>") 769 | self.assertEqual(hs.extract(), "<html><body><root>lala</root></body></html>") 770 | xs = self.sscls(text="<root>lala</root>", type="xml") 771 | self.assertEqual(xs.extract(), "<root>lala</root>") 772 | self.assertEqual(xs.xpath(".").extract(), ["<root>lala</root>"]) 773 | 774 | def test_invalid_xpath(self) -> None: 775 | """Test invalid xpath raises ValueError with the invalid xpath""" 776 | 
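# The expression below contains an unterminated string literal ('bar is missing its closing quote), so lxml raises an XPath error, which Selector.xpath() re-raises as ValueError with the offending query in the message.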
x = self.sscls(text="<html></html>") 777 | xpath = "//test[@foo='bar]" 778 | self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) 779 | 780 | def test_invalid_xpath_unicode(self) -> None: 781 | """Test *Unicode* invalid xpath raises ValueError with the invalid xpath""" 782 | x = self.sscls(text="<html></html>") 783 | xpath = "//test[@foo='\u0431ar]" 784 | self.assertRaisesRegex(ValueError, re.escape(xpath), x.xpath, xpath) 785 | 786 | def test_http_header_encoding_precedence(self) -> None: 787 | # '\xa3' = pound symbol in unicode 788 | # '\xc2\xa3' = pound symbol in utf-8 789 | # '\xa3' = pound symbol in latin-1 (iso-8859-1) 790 | 791 | text = """<html> 792 | <head><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></head> 793 | <body><span id="blank">\xa3</span></body></html>""" 794 | x = self.sscls(text=text) 795 | self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"]) 796 | 797 | def test_empty_bodies_shouldnt_raise_errors(self) -> None: 798 | self.sscls(text="").xpath("//text()").extract() 799 | 800 | def test_bodies_with_comments_only(self) -> None: 801 | sel = self.sscls(text="<!-- hello world -->", base_url="http://example.com") 802 | self.assertEqual("http://example.com", sel.root.base) 803 | 804 | def test_null_bytes_shouldnt_raise_errors(self) -> None: 805 | text = "<root>pre\x00post</root>" 806 | self.sscls(text).xpath("//text()").extract() 807 | 808 | def test_replacement_char_from_badly_encoded_body(self) -> None: 809 | # \xe9 alone isn't valid utf8 sequence 810 | text = "<html><p>an Jos\ufffd de</p><html>" 811 | self.assertEqual( 812 | ["an Jos\ufffd de"], self.sscls(text).xpath("//text()").extract() 813 | ) 814 | 815 | def test_select_on_unevaluable_nodes(self) -> None: 816 | r = self.sscls(text='<span class="big">some text</span>') 817 | # Text node 818 | x1 = r.xpath("//text()") 819 | self.assertEqual(x1.extract(), ["some text"]) 820 | self.assertEqual(x1.xpath(".//b").extract(), []) 821 | # Tag attribute 822 | x1 = r.xpath("//span/@class") 823 | self.assertEqual(x1.extract(), ["big"]) 824 | self.assertEqual(x1.xpath(".//text()").extract(), []) 825 | 826 | def test_select_on_text_nodes(self) -> None: 827 | r = self.sscls(text="<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>") 828 | x1 = r.xpath( 829 | "//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]" 830 | ) 831 | self.assertEqual(x1.extract(), ["opt1"]) 832 | 833 | x1 = r.xpath( 834 | "//div/descendant::text()/preceding-sibling::b[contains(text(), 'Options')]" 835 | ) 836 | self.assertEqual(x1.extract(), ["<b>Options:</b>"]) 837 | 838 | @unittest.skip("Text nodes lost parent node reference in lxml") 839 | def test_nested_select_on_text_nodes(self) -> None: 840 | # FIXME: does not work with lxml backend [upstream] 841 | r = self.sscls(text="<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>") 842 | x1 = r.xpath("//div/descendant::text()") 843 | x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") 844 | self.assertEqual(x2.extract(), ["<b>Options:</b>"]) 845 | 846 | def test_weakref_slots(self) -> None: 847 | """Check that classes are using slots and are weak-referenceable""" 848 | x = self.sscls(text="") 849 | weakref.ref(x) 850 | assert not hasattr(x, "__dict__"), ( 851 | f"{x.__class__.__name__} does not use __slots__" 852 | ) 853 | 854 | def test_remove_namespaces(self) -> None: 855 | xml = """<?xml version="1.0" encoding="UTF-8"?> 856 | <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-US" 
xmlns:media="http://search.yahoo.com/mrss/"> 857 | <link type="text/html"/> 858 | <entry> 859 | <link type="text/html"/> 860 | </entry> 861 | <link type="application/atom+xml"/> 862 | </feed> 863 | """ 864 | sel = self.sscls(text=xml, type="xml") 865 | self.assertEqual(len(sel.xpath("//link")), 0) 866 | self.assertEqual(len(sel.xpath("./namespace::*")), 3) 867 | sel.remove_namespaces() 868 | self.assertEqual(len(sel.xpath("//link")), 3) 869 | self.assertEqual(len(sel.xpath("./namespace::*")), 1) 870 | 871 | def test_remove_namespaces_embedded(self) -> None: 872 | xml = """ 873 | <feed xmlns="http://www.w3.org/2005/Atom"> 874 | <link type="text/html"/> 875 | <entry> 876 | <link type="text/html"/> 877 | </entry> 878 | <svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 100 100"> 879 | <linearGradient id="gradient"> 880 | <stop class="begin" offset="0%" style="stop-color:yellow;"/> 881 | <stop class="end" offset="80%" style="stop-color:green;"/> 882 | </linearGradient> 883 | <circle cx="50" cy="50" r="30" style="fill:url(#gradient)" /> 884 | </svg> 885 | </feed> 886 | """ 887 | sel = self.sscls(text=xml, type="xml") 888 | self.assertEqual(len(sel.xpath("//link")), 0) 889 | self.assertEqual(len(sel.xpath("//stop")), 0) 890 | self.assertEqual(len(sel.xpath("./namespace::*")), 2) 891 | self.assertEqual( 892 | len( 893 | sel.xpath( 894 | "//f:link", 895 | namespaces={"f": "http://www.w3.org/2005/Atom"}, 896 | ) 897 | ), 898 | 2, 899 | ) 900 | self.assertEqual( 901 | len(sel.xpath("//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"})), 902 | 2, 903 | ) 904 | sel.remove_namespaces() 905 | self.assertEqual(len(sel.xpath("//link")), 2) 906 | self.assertEqual(len(sel.xpath("//stop")), 2) 907 | self.assertEqual(len(sel.xpath("./namespace::*")), 1) 908 | 909 | def test_remove_attributes_namespaces(self) -> None: 910 | xml = """<?xml version="1.0" encoding="UTF-8"?> 911 | <feed xmlns:atom="http://www.w3.org/2005/Atom" xml:lang="en-US" xmlns:media="http://search.yahoo.com/mrss/"> 912 | <link atom:type="text/html"/> 913 | <entry> 914 | <link atom:type="text/html"/> 915 | </entry> 916 | <link atom:type="application/atom+xml"/> 917 | </feed> 918 | """ 919 | sel = self.sscls(text=xml, type="xml") 920 | self.assertEqual(len(sel.xpath("//link/@type")), 0) 921 | sel.remove_namespaces() 922 | self.assertEqual(len(sel.xpath("//link/@type")), 3) 923 | 924 | def test_smart_strings(self) -> None: 925 | """Lxml smart strings return values""" 926 | 927 | class SmartStringsSelector(Selector): 928 | _lxml_smart_strings = True 929 | 930 | body = """<body> 931 | <div class='one'> 932 | <ul> 933 | <li>one</li><li>two</li> 934 | </ul> 935 | </div> 936 | <div class='two'> 937 | <ul> 938 | <li>four</li><li>five</li><li>six</li> 939 | </ul> 940 | </div> 941 | </body>""" 942 | 943 | # .getparent() is available for text nodes and attributes 944 | # only when smart_strings are on 945 | x = self.sscls(text=body) 946 | li_text = x.xpath("//li/text()") 947 | self.assertFalse(any(hasattr(e.root, "getparent") for e in li_text)) 948 | div_class = x.xpath("//div/@class") 949 | self.assertFalse(any(hasattr(e.root, "getparent") for e in div_class)) 950 | 951 | smart_x = SmartStringsSelector(text=body) 952 | smart_li_text = smart_x.xpath("//li/text()") 953 | self.assertTrue(all(hasattr(e.root, "getparent") for e in smart_li_text)) 954 | smart_div_class = smart_x.xpath("//div/@class") 955 | self.assertTrue(all(hasattr(e.root, "getparent") for e in smart_div_class)) 956 | 957 | def test_xml_entity_expansion(self) -> None: 
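# XXE guard: the external entity must not be resolved, so the serialized result should keep &xxe; verbatim rather than embed the contents of /etc/passwd.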
958 | malicious_xml = ( 959 | '<?xml version="1.0" encoding="ISO-8859-1"?>' 960 | "<!DOCTYPE foo [ <!ELEMENT foo ANY > <!ENTITY xxe SYSTEM " 961 | '"file:///etc/passwd" >]><foo>&xxe;</foo>' 962 | ) 963 | 964 | sel = self.sscls(text=malicious_xml, type="xml") 965 | 966 | self.assertEqual(sel.extract(), "<foo>&xxe;</foo>") 967 | 968 | def test_configure_base_url(self) -> None: 969 | sel = self.sscls(text="nothing", base_url="http://example.com") 970 | self.assertEqual("http://example.com", sel.root.base) 971 | 972 | def test_extending_selector(self) -> None: 973 | class MySelectorList(SelectorList["MySelector"]): 974 | pass 975 | 976 | class MySelector(Selector): 977 | selectorlist_cls = MySelectorList 978 | 979 | def extra_method(self) -> str: 980 | return "extra" + cast("str", self.get()) 981 | 982 | sel = MySelector(text="<html><div>foo</div></html>") 983 | self.assertIsInstance(sel.xpath("//div"), MySelectorList) 984 | self.assertIsInstance(sel.xpath("//div")[0], MySelector) 985 | self.assertIsInstance(sel.css("div"), MySelectorList) 986 | self.assertIsInstance(sel.css("div")[0], MySelector) 987 | content: str = sel.css("div")[0].extra_method() 988 | self.assertEqual("extra<div>foo</div>", content) 989 | 990 | def test_replacement_null_char_from_body(self) -> None: 991 | text = "<html>\x00<body><p>Grainy</p></body></html>" 992 | self.assertEqual( 993 | "<html><body><p>Grainy</p></body></html>", 994 | self.sscls(text).extract(), 995 | ) 996 | 997 | def test_remove_selector_list(self) -> None: 998 | sel = self.sscls( 999 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1000 | ) 1001 | sel_list = sel.css("li") 1002 | sel_list.drop() 1003 | self.assertIsSelectorList(sel.css("li")) 1004 | self.assertEqual(sel.css("li"), []) 1005 | 1006 | def test_remove_selector(self) -> None: 1007 | sel = self.sscls( 1008 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1009 | ) 1010 | sel_list = sel.css("li") 1011 | sel_list[0].drop() 1012 | self.assertIsSelectorList(sel.css("li")) 1013 | self.assertEqual(sel.css("li::text").getall(), ["2", "3"]) 1014 | 1015 | def test_remove_pseudo_element_selector_list(self) -> None: 1016 | sel = self.sscls( 1017 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1018 | ) 1019 | sel_list = sel.css("li::text") 1020 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1021 | with self.assertRaises(CannotRemoveElementWithoutRoot): 1022 | sel_list.drop() 1023 | 1024 | self.assertIsSelectorList(sel.css("li")) 1025 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1026 | 1027 | def test_remove_pseudo_element_selector(self) -> None: 1028 | sel = self.sscls( 1029 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1030 | ) 1031 | sel_list = sel.css("li::text") 1032 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1033 | with self.assertRaises(CannotRemoveElementWithoutRoot): 1034 | sel_list[0].drop() 1035 | 1036 | self.assertIsSelectorList(sel.css("li")) 1037 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1038 | 1039 | def test_remove_root_element_selector(self) -> None: 1040 | sel = self.sscls( 1041 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 1042 | ) 1043 | sel_list = sel.css("li::text") 1044 | self.assertEqual(sel_list.getall(), ["1", "2", "3"]) 1045 | with self.assertRaises(CannotRemoveElementWithoutParent): 1046 | sel.drop() 1047 | 1048 | with self.assertRaises(CannotRemoveElementWithoutParent): 1049 | 
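# The root <html> element has no parent, so dropping it must raise rather than fail silently.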
sel.css("html").drop() 1050 | 1051 | self.assertIsSelectorList(sel.css("li")) 1052 | self.assertEqual(sel.css("li::text").getall(), ["1", "2", "3"]) 1053 | 1054 | sel.css("body").drop() 1055 | self.assertEqual(sel.get(), "<html></html>") 1056 | 1057 | def test_deep_nesting(self) -> None: 1058 | lxml_version = Version(etree.__version__) 1059 | lxml_huge_tree_version = Version("4.2") 1060 | 1061 | content = """ 1062 | <html> 1063 | <body> 1064 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1065 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1066 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1067 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1068 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1069 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1070 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1071 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1072 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1073 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1074 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1075 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1076 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1077 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1078 | <span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span><span> 1079 | <span><span><span><span><span><span><span><span><span><span><span><span> 1080 | hello world 1081 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1082 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1083 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1084 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1085 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1086 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1087 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1088 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1089 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1090 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1091 | 
</span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1092 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1093 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1094 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1095 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1096 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1097 | </span></span></span></span></span></span></span></span></span></span></span></span></span></span></span></span> 1098 | </span></span></span></span></span></span></span></span></span></span> 1099 | <table> 1100 | <tr><td>some test</td></tr> 1101 | </table> 1102 | </body> 1103 | </html> 1104 | """ 1105 | 1106 | # If lxml doesn't support huge trees expect wrong results and a warning 1107 | if lxml_version < lxml_huge_tree_version: 1108 | with warnings.catch_warnings(record=True) as w: 1109 | sel = Selector(text=content) 1110 | self.assertIn("huge_tree", str(w[0].message)) 1111 | self.assertLessEqual(len(sel.css("span")), 256) 1112 | self.assertEqual(len(sel.css("td")), 0) 1113 | return 1114 | 1115 | # Same goes for explicitly disabling huge trees 1116 | with warnings.catch_warnings(record=True) as w: 1117 | sel = Selector(text=content, huge_tree=False) 1118 | self.assertIn("huge_tree", str(w[0].message)) 1119 | self.assertLessEqual(len(sel.css("span")), 256) 1120 | self.assertEqual(len(sel.css("td")), 0) 1121 | 1122 | # If huge trees are enabled, elements with a depth > 255 should be found 1123 | sel = Selector(text=content) 1124 | nest_level = 282 1125 | self.assertEqual(len(sel.css("span")), nest_level) 1126 | self.assertEqual(len(sel.css("td")), 1) 1127 | 1128 | def test_invalid_type(self) -> None: 1129 | with self.assertRaises(ValueError): 1130 | self.sscls("", type="xhtml") 1131 | 1132 | def test_default_type(self) -> None: 1133 | text = "foo" 1134 | selector = self.sscls(text) 1135 | self.assertEqual(selector.type, "html") 1136 | 1137 | def test_json_type(self) -> None: 1138 | obj = 1 1139 | selector = self.sscls(str(obj), type="json") 1140 | self.assertEqual(selector.root, obj) 1141 | self.assertEqual(selector.type, "json") 1142 | 1143 | def test_html_root(self) -> None: 1144 | root = etree.fromstring("<html/>") 1145 | selector = self.sscls(root=root) 1146 | self.assertEqual(selector.root, root) 1147 | self.assertEqual(selector.type, "html") 1148 | 1149 | def test_json_root(self) -> None: 1150 | obj = 1 1151 | selector = self.sscls(root=obj) 1152 | self.assertEqual(selector.root, obj) 1153 | self.assertEqual(selector.type, "json") 1154 | 1155 | def test_json_xpath(self) -> None: 1156 | obj = 1 1157 | selector = self.sscls(root=obj) 1158 | with self.assertRaises(ValueError): 1159 | selector.xpath("//*") 1160 | 1161 | def test_json_css(self) -> None: 1162 | obj = 1 1163 | selector = self.sscls(root=obj) 1164 | with self.assertRaises(ValueError): 1165 | selector.css("*") 1166 | 1167 | def test_invalid_json(self) -> None: 1168 | text = "<html/>" 1169 | selector = self.sscls(text, type="json") 1170 | self.assertEqual(selector.root, None) 1171 | self.assertEqual(selector.type, "json") 1172 | 1173 | def test_text_and_root_warning(self) -> None: 1174 | with 
warnings.catch_warnings(record=True) as w: 1175 | Selector(text="a", root="b") 1176 | self.assertIn("both text and root", str(w[0].message)) 1177 | 1178 | def test_etree_root_invalid_type(self) -> None: 1179 | selector = Selector("<html></html>") 1180 | self.assertRaisesRegex( 1181 | ValueError, 1182 | "object as root", 1183 | Selector, 1184 | root=selector.root, 1185 | type="text", 1186 | ) 1187 | self.assertRaisesRegex( 1188 | ValueError, 1189 | "object as root", 1190 | Selector, 1191 | root=selector.root, 1192 | type="json", 1193 | ) 1194 | 1195 | def test_json_selector_representation(self) -> None: 1196 | selector = Selector(text="true") 1197 | assert repr(selector) == "<Selector query=None data='True'>" 1198 | assert str(selector) == "True" 1199 | selector = Selector(text="1") 1200 | assert repr(selector) == "<Selector query=None data='1'>" 1201 | assert str(selector) == "1" 1202 | 1203 | def test_body_bytearray_support(self) -> None: 1204 | selector = Selector(body=bytearray("<h1>Hello World</h1>", "utf-8")) 1205 | assert selector.xpath("//h1/text()").get() == "Hello World" 1206 | 1207 | 1208 | class ExsltTestCase(unittest.TestCase): 1209 | sscls = Selector 1210 | 1211 | def test_regexp(self) -> None: 1212 | """EXSLT regular expression tests""" 1213 | body = """ 1214 | <p><input name='a' value='1'/><input name='b' value='2'/></p> 1215 | <div class="links"> 1216 | <a href="/first.html">first link</a> 1217 | <a href="/second.html">second link</a> 1218 | <a href="http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml">EXSLT match example</a> 1219 | </div> 1220 | """ 1221 | sel = self.sscls(text=body) 1222 | 1223 | # re:test() 1224 | self.assertEqual( 1225 | sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(), 1226 | [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')], 1227 | ) 1228 | self.assertEqual( 1229 | [x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')], 1230 | ["first link", "second link"], 1231 | ) 1232 | self.assertEqual( 1233 | [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')], 1234 | ["first link"], 1235 | ) 1236 | self.assertEqual( 1237 | [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')], 1238 | ["second link"], 1239 | ) 1240 | 1241 | # re:match() is rather special: it returns a node-set of <match> nodes 1242 | # ['<match>http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml</match>', 1243 | # '<match>http</match>', 1244 | # '<match>www.bayes.co.uk</match>', 1245 | # '<match></match>', 1246 | # '<match>/xml/index.xml?/xml/utils/rechecker.xml</match>'] 1247 | self.assertEqual( 1248 | sel.xpath( 1249 | r're:match(//a[re:test(@href, "\.xml$")]/@href,' 1250 | r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()' 1251 | ).extract(), 1252 | [ 1253 | "http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml", 1254 | "http", 1255 | "www.bayes.co.uk", 1256 | "", 1257 | "/xml/index.xml?/xml/utils/rechecker.xml", 1258 | ], 1259 | ) 1260 | 1261 | # re:replace() 1262 | self.assertEqual( 1263 | sel.xpath( 1264 | r're:replace(//a[re:test(@href, "\.xml$")]/@href,' 1265 | r'"(\w+)://(.+)(\.xml)", "","https://\2.html")' 1266 | ).extract(), 1267 | ["https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"], 1268 | ) 1269 | 1270 | def test_set(self) -> None: 1271 | """EXSLT set manipulation tests""" 1272 | # microdata example from http://schema.org/Event 1273 | body = """ 1274 | <div itemscope itemtype="http://schema.org/Event"> 1275 | <a itemprop="url" 
href="nba-miami-philidelphia-game3.html"> 1276 | NBA Eastern Conference First Round Playoff Tickets: 1277 | <span itemprop="name"> Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1) </span> 1278 | </a> 1279 | 1280 | <meta itemprop="startDate" content="2016-04-21T20:00"> 1281 | Thu, 04/21/16 1282 | 8:00 p.m. 1283 | 1284 | <div itemprop="location" itemscope itemtype="http://schema.org/Place"> 1285 | <a itemprop="url" href="wells-fargo-center.html"> 1286 | Wells Fargo Center 1287 | </a> 1288 | <div itemprop="address" itemscope itemtype="http://schema.org/PostalAddress"> 1289 | <span itemprop="addressLocality">Philadelphia</span>, 1290 | <span itemprop="addressRegion">PA</span> 1291 | </div> 1292 | </div> 1293 | 1294 | <div itemprop="offers" itemscope itemtype="http://schema.org/AggregateOffer"> 1295 | Priced from: <span itemprop="lowPrice">$35</span> 1296 | <span itemprop="offerCount">1938</span> tickets left 1297 | </div> 1298 | </div> 1299 | """ 1300 | sel = self.sscls(text=body) 1301 | 1302 | self.assertEqual( 1303 | sel.xpath( 1304 | """//div[@itemtype="http://schema.org/Event"] 1305 | //@itemprop""" 1306 | ).extract(), 1307 | [ 1308 | "url", 1309 | "name", 1310 | "startDate", 1311 | "location", 1312 | "url", 1313 | "address", 1314 | "addressLocality", 1315 | "addressRegion", 1316 | "offers", 1317 | "lowPrice", 1318 | "offerCount", 1319 | ], 1320 | ) 1321 | 1322 | self.assertEqual( 1323 | sel.xpath( 1324 | """ 1325 | set:difference(//div[@itemtype="http://schema.org/Event"] 1326 | //@itemprop, 1327 | //div[@itemtype="http://schema.org/Event"] 1328 | //*[@itemscope]/*/@itemprop)""" 1329 | ).extract(), 1330 | ["url", "name", "startDate", "location", "offers"], 1331 | ) 1332 | 1333 | def test_dont_remove_text_after_deleted_element(self) -> None: 1334 | sel = self.sscls( 1335 | text="""<html><body>Text before.<span>Text in.</span> Text after.</body></html> 1336 | """ 1337 | ) 1338 | sel.css("span").drop() 1339 | self.assertEqual( 1340 | sel.get(), "<html><body>Text before. 
Text after.</body></html>" 1341 | ) 1342 | 1343 | def test_drop_with_xml_type(self) -> None: 1344 | sel = self.sscls(text="<a><b></b><c/></a>", type="xml") 1345 | el = sel.xpath("//b")[0] 1346 | assert el.root.getparent() is not None 1347 | el.drop() 1348 | assert sel.get() == "<a><c/></a>" 1349 | 1350 | 1351 | class SelectorBytesInput(Selector): 1352 | def __init__( 1353 | self, 1354 | text: str | None = None, 1355 | type: str | None = None, 1356 | body: bytes = b"", 1357 | encoding: str = "utf-8", 1358 | namespaces: Mapping[str, str] | None = None, 1359 | root: Any | None = _NOT_SET, 1360 | base_url: str | None = None, 1361 | _expr: str | None = None, 1362 | huge_tree: bool = LXML_SUPPORTS_HUGE_TREE, 1363 | ) -> None: 1364 | if text: 1365 | body = bytes(text, encoding=encoding) 1366 | text = None 1367 | super().__init__( 1368 | text=text, 1369 | type=type, 1370 | body=body, 1371 | encoding=encoding, 1372 | namespaces=namespaces, 1373 | root=root, 1374 | base_url=base_url, 1375 | _expr=_expr, 1376 | huge_tree=huge_tree, 1377 | ) 1378 | 1379 | 1380 | class SelectorTestCaseBytes(SelectorTestCase): 1381 | sscls = SelectorBytesInput 1382 | 1383 | def test_representation_slice(self) -> None: 1384 | pass 1385 | 1386 | def test_representation_unicode_query(self) -> None: 1387 | pass 1388 | 1389 | def test_weakref_slots(self) -> None: 1390 | pass 1391 | 1392 | def test_check_text_argument_type(self) -> None: 1393 | self.assertRaisesRegex( 1394 | TypeError, 1395 | "body argument should be of type", 1396 | self.sscls, 1397 | body="<html/>", 1398 | ) 1399 | 1400 | 1401 | class ExsltTestCaseBytes(ExsltTestCase): 1402 | sscls = SelectorBytesInput 1403 | -------------------------------------------------------------------------------- /tests/test_selector_csstranslator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selector tests for cssselect backend 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import unittest 8 | from typing import Any, Callable, Protocol 9 | 10 | import cssselect 11 | import pytest 12 | from cssselect.parser import SelectorSyntaxError 13 | from cssselect.xpath import ExpressionError 14 | from packaging.version import Version 15 | 16 | from parsel import Selector 17 | from parsel.csstranslator import GenericTranslator, HTMLTranslator, TranslatorProtocol 18 | 19 | HTMLBODY = """ 20 | <html> 21 | <body> 22 | <div> 23 | <a id="name-anchor" name="foo"></a> 24 | <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a> 25 | <a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a> 26 | <p id="paragraph"> 27 | lorem ipsum text 28 | <b id="p-b">hi</b> <em id="p-em">there</em> 29 | <b id="p-b2">guy</b> 30 | <input type="checkbox" id="checkbox-unchecked" /> 31 | <input type="checkbox" id="checkbox-disabled" disabled="" /> 32 | <input type="text" id="text-checked" checked="checked" /> 33 | <input type="hidden" /> 34 | <input type="hidden" disabled="disabled" /> 35 | <input type="checkbox" id="checkbox-checked" checked="checked" /> 36 | <input type="checkbox" id="checkbox-disabled-checked" 37 | disabled="disabled" checked="checked" /> 38 | <fieldset id="fieldset" disabled="disabled"> 39 | <input type="checkbox" id="checkbox-fieldset-disabled" /> 40 | <input type="hidden" /> 41 | </fieldset> 42 | </p> 43 | <map name="dummymap"> 44 | <area shape="circle" coords="200,250,25" href="foo.html" id="area-href" /> 45 | <area shape="default" id="area-nohref" /> 46 | </map> 47 | </div> 48 | <div class="cool-footer" 
id="foobar-div" foobar="ab bc cde"> 49 | <span id="foobar-span">foo ter</span> 50 | </div> 51 | </body></html> 52 | """ 53 | 54 | 55 | class TranslatorTestProtocol(Protocol): 56 | tr_cls: type[TranslatorProtocol] 57 | tr: TranslatorProtocol 58 | 59 | def c2x(self, css: str, prefix: str = ...) -> str: 60 | pass 61 | 62 | def assertEqual(self, first: Any, second: Any, msg: Any = ...) -> None: 63 | pass 64 | 65 | def assertRaises( 66 | self, 67 | expected_exception: type[BaseException] | tuple[type[BaseException], ...], 68 | callable: Callable[..., object], 69 | *args: Any, 70 | **kwargs: Any, 71 | ) -> None: 72 | pass 73 | 74 | 75 | class TranslatorTestMixin: 76 | def setUp(self: TranslatorTestProtocol) -> None: 77 | self.tr = self.tr_cls() 78 | self.c2x = self.tr.css_to_xpath 79 | 80 | def test_attr_function(self: TranslatorTestProtocol) -> None: 81 | cases = [ 82 | ("::attr(name)", "descendant-or-self::*/@name"), 83 | ("a::attr(href)", "descendant-or-self::a/@href"), 84 | ( 85 | "a ::attr(img)", 86 | "descendant-or-self::a/descendant-or-self::*/@img", 87 | ), 88 | ("a > ::attr(class)", "descendant-or-self::a/*/@class"), 89 | ] 90 | for css, xpath in cases: 91 | self.assertEqual(self.c2x(css), xpath, css) 92 | 93 | def test_attr_function_exception(self: TranslatorTestProtocol) -> None: 94 | cases = [ 95 | ("::attr(12)", ExpressionError), 96 | ("::attr(34test)", ExpressionError), 97 | ("::attr(@href)", SelectorSyntaxError), 98 | ] 99 | for css, exc in cases: 100 | self.assertRaises(exc, self.c2x, css) 101 | 102 | def test_text_pseudo_element(self: TranslatorTestProtocol) -> None: 103 | cases = [ 104 | ("::text", "descendant-or-self::text()"), 105 | ("p::text", "descendant-or-self::p/text()"), 106 | ("p ::text", "descendant-or-self::p/descendant-or-self::text()"), 107 | ("#id::text", "descendant-or-self::*[@id = 'id']/text()"), 108 | ("p#id::text", "descendant-or-self::p[@id = 'id']/text()"), 109 | ( 110 | "p#id ::text", 111 | "descendant-or-self::p[@id = 'id']/descendant-or-self::text()", 112 | ), 113 | ("p#id > ::text", "descendant-or-self::p[@id = 'id']/*/text()"), 114 | ( 115 | "p#id ~ ::text", 116 | "descendant-or-self::p[@id = 'id']/following-sibling::*/text()", 117 | ), 118 | ("a[href]::text", "descendant-or-self::a[@href]/text()"), 119 | ( 120 | "a[href] ::text", 121 | "descendant-or-self::a[@href]/descendant-or-self::text()", 122 | ), 123 | ( 124 | "p::text, a::text", 125 | "descendant-or-self::p/text() | descendant-or-self::a/text()", 126 | ), 127 | ] 128 | for css, xpath in cases: 129 | self.assertEqual(self.c2x(css), xpath, css) 130 | 131 | def test_pseudo_function_exception(self: TranslatorTestProtocol) -> None: 132 | cases = [ 133 | ("::attribute(12)", ExpressionError), 134 | ("::text()", ExpressionError), 135 | ("::attr(@href)", SelectorSyntaxError), 136 | ] 137 | for css, exc in cases: 138 | self.assertRaises(exc, self.c2x, css) 139 | 140 | def test_unknown_pseudo_element(self: TranslatorTestProtocol) -> None: 141 | cases = [ 142 | ("::text-node", ExpressionError), 143 | ] 144 | for css, exc in cases: 145 | self.assertRaises(exc, self.c2x, css) 146 | 147 | def test_unknown_pseudo_class(self: TranslatorTestProtocol) -> None: 148 | cases = [ 149 | (":text", ExpressionError), 150 | (":attribute(name)", ExpressionError), 151 | ] 152 | for css, exc in cases: 153 | self.assertRaises(exc, self.c2x, css) 154 | 155 | 156 | class HTMLTranslatorTest(TranslatorTestMixin, unittest.TestCase): 157 | tr_cls = HTMLTranslator 158 | 159 | 160 | class GenericTranslatorTest(TranslatorTestMixin, 
unittest.TestCase): 161 | tr_cls = GenericTranslator 162 | 163 | 164 | class UtilCss2XPathTest(unittest.TestCase): 165 | def test_css2xpath(self) -> None: 166 | from parsel import css2xpath 167 | 168 | expected_xpath = ( 169 | "descendant-or-self::*[@class and contains(" 170 | "concat(' ', normalize-space(@class), ' '), ' some-class ')]" 171 | ) 172 | self.assertEqual(css2xpath(".some-class"), expected_xpath) 173 | 174 | 175 | class CSSSelectorTest(unittest.TestCase): 176 | sscls = Selector 177 | 178 | def setUp(self) -> None: 179 | self.sel = self.sscls(text=HTMLBODY) 180 | 181 | def x(self, *a: Any, **kw: Any) -> list[str]: 182 | return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()] 183 | 184 | def test_selector_simple(self) -> None: 185 | for x in self.sel.css("input"): 186 | self.assertTrue(isinstance(x, self.sel.__class__), x) 187 | self.assertEqual( 188 | self.sel.css("input").extract(), 189 | [x.extract() for x in self.sel.css("input")], 190 | ) 191 | 192 | def test_text_pseudo_element(self) -> None: 193 | self.assertEqual(self.x("#p-b2"), ['<b id="p-b2">guy</b>']) 194 | self.assertEqual(self.x("#p-b2::text"), ["guy"]) 195 | self.assertEqual(self.x("#p-b2 ::text"), ["guy"]) 196 | self.assertEqual(self.x("#paragraph::text"), ["lorem ipsum text"]) 197 | self.assertEqual( 198 | self.x("#paragraph ::text"), 199 | ["lorem ipsum text", "hi", "there", "guy"], 200 | ) 201 | self.assertEqual(self.x("p::text"), ["lorem ipsum text"]) 202 | self.assertEqual(self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"]) 203 | 204 | def test_attribute_function(self) -> None: 205 | self.assertEqual(self.x("#p-b2::attr(id)"), ["p-b2"]) 206 | self.assertEqual(self.x(".cool-footer::attr(class)"), ["cool-footer"]) 207 | self.assertEqual( 208 | self.x(".cool-footer ::attr(id)"), ["foobar-div", "foobar-span"] 209 | ) 210 | self.assertEqual( 211 | self.x('map[name="dummymap"] ::attr(shape)'), ["circle", "default"] 212 | ) 213 | 214 | def test_nested_selector(self) -> None: 215 | self.assertEqual(self.sel.css("p").css("b::text").extract(), ["hi", "guy"]) 216 | self.assertEqual( 217 | self.sel.css("div").css("area:last-child").extract(), 218 | ['<area shape="default" id="area-nohref">'], 219 | ) 220 | 221 | @pytest.mark.xfail( 222 | Version(cssselect.__version__) < Version("1.2.0"), 223 | reason="Support added in cssselect 1.2.0", 224 | ) 225 | def test_pseudoclass_has(self) -> None: 226 | self.assertEqual(self.x("p:has(b)::text"), ["lorem ipsum text"]) 227 | 228 | 229 | class CSSSelectorTestBytes(CSSSelectorTest): 230 | def setUp(self) -> None: 231 | self.sel = self.sscls(body=bytes(HTMLBODY, encoding="utf-8")) 232 | -------------------------------------------------------------------------------- /tests/test_selector_jmespath.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | 5 | from parsel import Selector 6 | from parsel.selector import _NOT_SET 7 | 8 | 9 | class JMESPathTestCase(unittest.TestCase): 10 | def test_json_has_html(self) -> None: 11 | """Sometimes the information is returned in a json wrapper""" 12 | data = """ 13 | { 14 | "content": [ 15 | { 16 | "name": "A", 17 | "value": "a" 18 | }, 19 | { 20 | "name": { 21 | "age": 18 22 | }, 23 | "value": "b" 24 | }, 25 | { 26 | "name": "C", 27 | "value": "c" 28 | }, 29 | { 30 | "name": "<a>D</a>", 31 | "value": "<div>d</div>" 32 | } 33 | ], 34 | "html": "<div><a>a<br>b</a>c</div><div><a>d</a>e<b>f</b></div>" 35 | } 36 | """ 37 | sel 
= Selector(text=data) 38 | self.assertEqual( 39 | sel.jmespath("html").get(), 40 | "<div><a>a<br>b</a>c</div><div><a>d</a>e<b>f</b></div>", 41 | ) 42 | self.assertEqual( 43 | sel.jmespath("html").xpath("//div/a/text()").getall(), 44 | ["a", "b", "d"], 45 | ) 46 | self.assertEqual(sel.jmespath("html").css("div > b").getall(), ["<b>f</b>"]) 47 | self.assertEqual(sel.jmespath("content").jmespath("name.age").get(), 18) 48 | 49 | def test_html_has_json(self) -> None: 50 | html_text = """ 51 | <div> 52 | <h1>Information</h1> 53 | <content> 54 | { 55 | "user": [ 56 | { 57 | "name": "A", 58 | "age": 18 59 | }, 60 | { 61 | "name": "B", 62 | "age": 32 63 | }, 64 | { 65 | "name": "C", 66 | "age": 22 67 | }, 68 | { 69 | "name": "D", 70 | "age": 25 71 | } 72 | ], 73 | "total": 4, 74 | "status": "ok" 75 | } 76 | </content> 77 | </div> 78 | """ 79 | sel = Selector(text=html_text) 80 | self.assertEqual( 81 | sel.xpath("//div/content/text()").jmespath("user[*].name").getall(), 82 | ["A", "B", "C", "D"], 83 | ) 84 | self.assertEqual( 85 | sel.xpath("//div/content").jmespath("user[*].name").getall(), 86 | ["A", "B", "C", "D"], 87 | ) 88 | self.assertEqual(sel.xpath("//div/content").jmespath("total").get(), 4) 89 | 90 | def test_jmestpath_with_re(self) -> None: 91 | html_text = """ 92 | <div> 93 | <h1>Information</h1> 94 | <content> 95 | { 96 | "user": [ 97 | { 98 | "name": "A", 99 | "age": 18 100 | }, 101 | { 102 | "name": "B", 103 | "age": 32 104 | }, 105 | { 106 | "name": "C", 107 | "age": 22 108 | }, 109 | { 110 | "name": "D", 111 | "age": 25 112 | } 113 | ], 114 | "total": 4, 115 | "status": "ok" 116 | } 117 | </content> 118 | </div> 119 | """ 120 | sel = Selector(text=html_text) 121 | self.assertEqual( 122 | sel.xpath("//div/content/text()").jmespath("user[*].name").re(r"(\w+)"), 123 | ["A", "B", "C", "D"], 124 | ) 125 | self.assertEqual( 126 | sel.xpath("//div/content").jmespath("user[*].name").re(r"(\w+)"), 127 | ["A", "B", "C", "D"], 128 | ) 129 | 130 | with self.assertRaises(TypeError): 131 | sel.xpath("//div/content").jmespath("user[*].age").re(r"(\d+)") 132 | 133 | self.assertEqual( 134 | sel.xpath("//div/content").jmespath("unavailable").re(r"(\d+)"), [] 135 | ) 136 | 137 | self.assertEqual( 138 | sel.xpath("//div/content").jmespath("unavailable").re_first(r"(\d+)"), 139 | None, 140 | ) 141 | 142 | self.assertEqual( 143 | sel.xpath("//div/content") 144 | .jmespath("user[*].age.to_string(@)") 145 | .re(r"(\d+)"), 146 | ["18", "32", "22", "25"], 147 | ) 148 | 149 | def test_json_types(self) -> None: 150 | for text, root in ( 151 | ("{}", {}), 152 | ('{"a": "b"}', {"a": "b"}), 153 | ("[]", []), 154 | ('["a"]', ["a"]), 155 | ('""', ""), 156 | ("0", 0), 157 | ("1", 1), 158 | ("true", True), 159 | ("false", False), 160 | ("null", None), 161 | ): 162 | selector = Selector(text=text, root=_NOT_SET) 163 | self.assertEqual(selector.type, "json") 164 | self.assertEqual(selector._text, text) 165 | self.assertEqual(selector.root, root) 166 | 167 | selector = Selector(text=None, root=root) 168 | self.assertEqual(selector.type, "json") 169 | self.assertEqual(selector._text, None) 170 | self.assertEqual(selector.root, root) 171 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from pytest import mark, raises 6 | 7 | from parsel.utils import extract_regex, shorten 8 | 9 | if TYPE_CHECKING: 10 
| from re import Pattern 11 | 12 | 13 | @mark.parametrize( 14 | "width,expected", 15 | ( 16 | (-1, ValueError), 17 | (0, ""), 18 | (1, "."), 19 | (2, ".."), 20 | (3, "..."), 21 | (4, "f..."), 22 | (5, "fo..."), 23 | (6, "foobar"), 24 | (7, "foobar"), 25 | ), 26 | ) 27 | def test_shorten(width: int, expected: str | type[Exception]) -> None: 28 | if isinstance(expected, str): 29 | assert shorten("foobar", width) == expected 30 | else: 31 | with raises(expected): 32 | shorten("foobar", width) 33 | 34 | 35 | @mark.parametrize( 36 | "regex, text, replace_entities, expected", 37 | ( 38 | [ 39 | r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 40 | "October 25, 2019", 41 | True, 42 | ["October", "25", "2019"], 43 | ], 44 | [ 45 | r"(?P<month>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 46 | "October 25 2019", 47 | True, 48 | ["October", "25", "2019"], 49 | ], 50 | [ 51 | r"(?P<extract>\w+)\s*(?P<day>\d+)\s*\,?\s*(?P<year>\d+)", 52 | "October 25 2019", 53 | True, 54 | ["October"], 55 | ], 56 | [ 57 | r"\w+\s*\d+\s*\,?\s*\d+", 58 | "October 25 2019", 59 | True, 60 | ["October 25 2019"], 61 | ], 62 | [ 63 | r"^.*$", 64 | "&quot;sometext&quot; &amp; &quot;moretext&quot;", 65 | True, 66 | ['"sometext" &amp; "moretext"'], 67 | ], 68 | [ 69 | r"^.*$", 70 | "&quot;sometext&quot; &amp; &quot;moretext&quot;", 71 | False, 72 | ["&quot;sometext&quot; &amp; &quot;moretext&quot;"], 73 | ], 74 | ), 75 | ) 76 | def test_extract_regex( 77 | regex: str | Pattern[str], 78 | text: str, 79 | replace_entities: bool, 80 | expected: list[str], 81 | ) -> None: 82 | assert extract_regex(regex, text, replace_entities) == expected 83 | -------------------------------------------------------------------------------- /tests/test_xml_attacks.py: -------------------------------------------------------------------------------- 1 | """Tests for known XML attacks""" 2 | 3 | from pathlib import Path 4 | from unittest import TestCase 5 | 6 | from psutil import Process 7 | 8 | from parsel import Selector 9 | 10 | MiB_1 = 1024**2 11 | 12 | 13 | def _load(attack: str) -> str: 14 | folder_path = Path(__file__).parent 15 | file_path = folder_path / "xml_attacks" / f"{attack}.xml" 16 | return file_path.read_bytes().decode("utf-8") 17 | 18 | 19 | # List of known attacks: 20 | # https://github.com/tiran/defusedxml#python-xml-libraries 21 | class XMLAttackTestCase(TestCase): 22 | def test_billion_laughs(self) -> None: 23 | process = Process() 24 | memory_usage_before = process.memory_info().rss 25 | selector = Selector(text=_load("billion_laughs")) 26 | lolz = selector.css("lolz::text").get() 27 | memory_usage_after = process.memory_info().rss 28 | memory_change = memory_usage_after - memory_usage_before 29 | assert_message = f"Memory change: {memory_change}B" 30 | assert memory_change <= MiB_1, assert_message 31 | assert lolz == "&lol9;" 32 | -------------------------------------------------------------------------------- /tests/test_xpathfuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unittest 4 | from typing import Any 5 | 6 | from parsel import Selector 7 | from parsel.xpathfuncs import set_xpathfunc 8 | 9 | 10 | class XPathFuncsTestCase(unittest.TestCase): 11 | def test_has_class_simple(self) -> None: 12 | body = """ 13 | <p class="foo bar-baz">First</p> 14 | <p class="foo">Second</p> 15 | <p class="bar">Third</p> 16 | <p>Fourth</p> 17 | """ 18 | sel = Selector(text=body) 19 | self.assertEqual( 20 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 21 | ["First", "Second"], 22 | ) 23 | 
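# A single has-class() argument matches any element whose class attribute contains that class; passing several arguments requires all of them on the same element, which is why ("foo", "bar") below matches nothing while ("foo", "bar-baz") matches only "First".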
self.assertEqual( 24 | [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')], 25 | ["Third"], 26 | ) 27 | self.assertEqual( 28 | [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')], 29 | [], 30 | ) 31 | self.assertEqual( 32 | [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')], 33 | ["First"], 34 | ) 35 | 36 | def test_has_class_error_no_args(self) -> None: 37 | body = """ 38 | <p CLASS="foo">First</p> 39 | """ 40 | sel = Selector(text=body) 41 | self.assertRaisesRegex( 42 | ValueError, 43 | "has-class must have at least 1 argument", 44 | sel.xpath, 45 | "has-class()", 46 | ) 47 | 48 | def test_has_class_error_invalid_arg_type(self) -> None: 49 | body = """ 50 | <p CLASS="foo">First</p> 51 | """ 52 | sel = Selector(text=body) 53 | self.assertRaisesRegex( 54 | ValueError, 55 | "has-class arguments must be strings", 56 | sel.xpath, 57 | "has-class(.)", 58 | ) 59 | 60 | def test_has_class_error_invalid_unicode(self) -> None: 61 | body = """ 62 | <p CLASS="foo">First</p> 63 | """ 64 | sel = Selector(text=body) 65 | self.assertRaisesRegex( 66 | ValueError, 67 | "All strings must be XML compatible", 68 | sel.xpath, 69 | 'has-class("héllö")'.encode(), 70 | ) 71 | 72 | def test_has_class_unicode(self) -> None: 73 | body = """ 74 | <p CLASS="fóó">First</p> 75 | """ 76 | sel = Selector(text=body) 77 | self.assertEqual( 78 | [x.extract() for x in sel.xpath('//p[has-class("fóó")]/text()')], 79 | ["First"], 80 | ) 81 | 82 | def test_has_class_uppercase(self) -> None: 83 | body = """ 84 | <p CLASS="foo">First</p> 85 | """ 86 | sel = Selector(text=body) 87 | self.assertEqual( 88 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 89 | ["First"], 90 | ) 91 | 92 | def test_has_class_newline(self) -> None: 93 | body = """ 94 | <p CLASS="foo 95 | bar">First</p> 96 | """ 97 | sel = Selector(text=body) 98 | self.assertEqual( 99 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 100 | ["First"], 101 | ) 102 | 103 | def test_has_class_tab(self) -> None: 104 | body = """ 105 | <p CLASS="foo\tbar">First</p> 106 | """ 107 | sel = Selector(text=body) 108 | self.assertEqual( 109 | [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], 110 | ["First"], 111 | ) 112 | 113 | def test_set_xpathfunc(self) -> None: 114 | def myfunc(ctx: Any) -> None: 115 | myfunc.call_count += 1 # type: ignore[attr-defined] 116 | 117 | myfunc.call_count = 0 # type: ignore[attr-defined] 118 | 119 | body = """ 120 | <p CLASS="foo">First</p> 121 | """ 122 | sel = Selector(text=body) 123 | self.assertRaisesRegex( 124 | ValueError, 125 | "Unregistered function in myfunc", 126 | sel.xpath, 127 | "myfunc()", 128 | ) 129 | 130 | set_xpathfunc("myfunc", myfunc) 131 | sel.xpath("myfunc()") 132 | self.assertEqual(myfunc.call_count, 1) # type: ignore[attr-defined] 133 | 134 | set_xpathfunc("myfunc", None) 135 | self.assertRaisesRegex( 136 | ValueError, 137 | "Unregistered function in myfunc", 138 | sel.xpath, 139 | "myfunc()", 140 | ) 141 | -------------------------------------------------------------------------------- /tests/typing/selector.py: -------------------------------------------------------------------------------- 1 | # Basic usage of the Selector, strongly typed to test the typing of parsel's API. 
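# These functions are never executed; they only have to satisfy "mypy --strict" (see the typing env in tox.ini). In incorrect() below, each "# type: ignore[...]" also acts as a negative assertion: strict mode enables --warn-unused-ignores, so mypy complains if an ignored line ever stops being a type error.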
2 | from __future__ import annotations 3 | 4 | import re 5 | 6 | from parsel import Selector 7 | 8 | 9 | def correct() -> None: 10 | selector = Selector( 11 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 12 | ) 13 | 14 | li_values: list[str] = selector.css("li").getall() 15 | selector.re_first(re.compile(r"[32]"), "").strip() 16 | xpath_values: list[str] = selector.xpath( 17 | "//somens:a/text()", namespaces={"somens": "http://scrapy.org"} 18 | ).extract() 19 | 20 | class MySelector(Selector): 21 | def my_own_func(self) -> int: 22 | return 3 23 | 24 | my_selector = MySelector() 25 | res: int = my_selector.my_own_func() 26 | sub_res: int = my_selector.xpath("//somens:a/text()")[0].my_own_func() 27 | 28 | 29 | # Negative checks: all the code lines below have typing errors. 30 | # the "# type: ignore" comment makes sure that mypy identifies them as errors. 31 | 32 | 33 | def incorrect() -> None: 34 | selector = Selector( 35 | text="<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>" 36 | ) 37 | 38 | # Wrong query type in css. 39 | selector.css(5).getall() # type: ignore[arg-type] 40 | 41 | # Cannot assign a list of str to an int. 42 | li_values: int = selector.css("li").getall() # type: ignore[assignment] 43 | 44 | # Cannot use a string to define namespaces in xpath. 45 | selector.xpath( 46 | "//somens:a/text()", 47 | namespaces='{"somens": "http://scrapy.org"}', # type: ignore[arg-type] 48 | ).extract() 49 | 50 | # Typo in the extract method name. 51 | selector.css("li").extact() # type: ignore[attr-defined] 52 | 53 | class MySelector(Selector): 54 | def my_own_func(self) -> int: 55 | return 3 56 | 57 | my_selector = MySelector() 58 | res: str = my_selector.my_own_func() # type: ignore[assignment] 59 | sub_res: str = my_selector.xpath("//somens:a/text()")[0].my_own_func() # type: ignore[assignment] 60 | -------------------------------------------------------------------------------- /tests/xml_attacks/billion_laughs.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | <!DOCTYPE lolz [ 3 | <!ENTITY lol "lol"> 4 | <!ELEMENT lolz (#PCDATA)> 5 | <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;"> 6 | <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;"> 7 | <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;"> 8 | <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;"> 9 | <!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;"> 10 | <!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;"> 11 | <!ENTITY lol7 "&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;"> 12 | <!ENTITY lol8 "&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;"> 13 | <!ENTITY lol9 "&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;"> 14 | ]> 15 | <lolz>&lol9;</lolz> 16 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = typing,pylint,docs,twinecheck,pre-commit,py39,py310,py311,py312,py313,pypy3.10 3 | 4 | [testenv] 5 | usedevelop = True 6 | deps = 7 | -r{toxinidir}/tests/requirements.txt 8 | commands = py.test --cov=parsel --cov-report=xml {posargs:docs parsel tests} 9 | 10 | [testenv:typing] 11 | deps = 12 | {[testenv]deps} 13 | types-jmespath==1.0.2.20240106 14 | types-lxml==2025.3.4 15 | types-psutil==7.0.0.20250218 16 
| py==1.11.0 17 | mypy==1.15.0 18 | commands = 19 | mypy {posargs:parsel tests} --strict 20 | 21 | [testenv:pylint] 22 | deps = 23 | {[testenv]deps} 24 | pylint==3.3.6 25 | commands = 26 | pylint docs parsel tests setup.py 27 | 28 | [docs] 29 | changedir = docs 30 | deps = -rdocs/requirements.txt 31 | 32 | [testenv:docs] 33 | changedir = {[docs]changedir} 34 | deps = {[docs]deps} 35 | # No -W in LaTeX, because ReadTheDocs does not use it either, and there are 36 | # image conversion warnings that cannot be addressed in ReadTheDocs 37 | commands = 38 | sphinx-build -W -b html . {envtmpdir}/html 39 | sphinx-build -b latex . {envtmpdir}/latex 40 | sphinx-build -b epub . {envtmpdir}/epub 41 | 42 | [testenv:twinecheck] 43 | basepython = python3 44 | deps = 45 | twine==6.1.0 46 | build==1.2.2.post1 47 | commands = 48 | python -m build --sdist 49 | twine check dist/* 50 | 51 | [testenv:pre-commit] 52 | deps = pre-commit 53 | commands = pre-commit run --all-files --show-diff-on-failure 54 | skip_install = true 55 | --------------------------------------------------------------------------------