├── .editorconfig ├── .git-blame-ignore-revs ├── .github └── workflows │ ├── checks.yml │ ├── publish.yml │ ├── tests-macos.yml │ ├── tests-ubuntu.yml │ └── tests-windows.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── AUTHORS ├── CHANGES ├── LICENSE ├── README.rst ├── cssselect ├── __init__.py ├── parser.py ├── py.typed └── xpath.py ├── docs ├── conf.py ├── conftest.py ├── index.rst └── requirements.txt ├── pyproject.toml ├── tests ├── __init__.py └── test_cssselect.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 4 7 | insert_final_newline = true 8 | end_of_line = lf 9 | 10 | [*.{yml,yaml}] 11 | indent_size = 2 12 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # applying pre-commit hooks to the project 2 | e91101b37f82558db84a6b8ee9a6dba1fd2ae0bb -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | checks: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | include: 11 | - python-version: 3.13 12 | env: 13 | TOXENV: pylint 14 | - python-version: 3.13 # Keep in sync with .readthedocs.yml 15 | env: 16 | TOXENV: docs 17 | - python-version: 3.13 18 | env: 19 | TOXENV: typing 20 | - python-version: 3.13 21 | env: 22 | TOXENV: twinecheck 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | 32 | - name: Run check 33 | env: ${{ matrix.env }} 34 | run: | 35 | pip install 
-U pip 36 | pip install -U tox 37 | tox 38 | 39 | pre-commit: 40 | runs-on: ubuntu-latest 41 | steps: 42 | - uses: actions/checkout@v4 43 | - uses: pre-commit/action@v3.0.1 44 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 3 | push: 4 | tags: 5 | - 'v[0-9]+.[0-9]+.[0-9]+' 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/p/cssselect 14 | 15 | permissions: 16 | id-token: write 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: 3.13 25 | 26 | - name: Build 27 | run: | 28 | python -m pip install --upgrade build 29 | python -m build 30 | 31 | - name: Publish to PyPI 32 | uses: pypa/gh-action-pypi-publish@release/v1 33 | -------------------------------------------------------------------------------- /.github/workflows/tests-macos.yml: -------------------------------------------------------------------------------- 1 | name: macOS 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: macos-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U pip 23 | pip install -U tox 24 | tox -e py 25 | 26 | - name: Upload coverage report 27 | uses: codecov/codecov-action@v5 28 | -------------------------------------------------------------------------------- /.github/workflows/tests-ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: Ubuntu 2 | on: [push, 
pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.10"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U pip 23 | pip install -U tox 24 | tox -e py 25 | 26 | - name: Upload coverage report 27 | uses: codecov/codecov-action@v5 28 | -------------------------------------------------------------------------------- /.github/workflows/tests-windows.yml: -------------------------------------------------------------------------------- 1 | name: Windows 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | tests: 6 | runs-on: windows-latest 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | 20 | - name: Run tests 21 | run: | 22 | pip install -U pip 23 | pip install -U tox 24 | tox -e py 25 | 26 | - name: Upload coverage report 27 | uses: codecov/codecov-action@v5 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | /.tox 4 | /MANIFEST 5 | /dist 6 | /docs/_build 7 | /.coverage 8 | .idea 9 | htmlcov/ 10 | coverage.xml 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.2 4 | hooks: 5 | - id: ruff 6 | 
args: [ --fix ] 7 | - id: ruff-format 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | build: 7 | os: ubuntu-24.04 8 | tools: 9 | # For available versions, see: 10 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 11 | python: "3.13" # Keep in sync with .github/workflows/checks.yml 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | - path: . 16 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Daniel Graña 2 | Ian Bicking 3 | James Salter 4 | Laurence Rowe 5 | Mikhail Korobov 6 | Nik Nyby 7 | Paul Tremberth 8 | Simon Potter 9 | Simon Sapin 10 | Stefan Behnel 11 | Thomas Grainger 12 | Varialus 13 | Arthur Darcet 14 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 1.3.0 5 | ------------- 6 | 7 | Released on 2025-03-10. 8 | 9 | * Dropped support for Python 3.7-3.8, added support for Python 3.12-3.13 and 10 | PyPy 3.10. 11 | 12 | * Removed ``_unicode_safe_getattr()``, deprecated in 1.2.0. 13 | 14 | * Added ``pre-commit`` and formatted the code with ``ruff``. 15 | 16 | * Many CI additions and improvements. 17 | 18 | 19 | Version 1.2.0 20 | ------------- 21 | 22 | Released on 2022-10-27. 23 | 24 | * Drop support for Python 2.7, 3.4-3.6, add support for Python 3.7-3.11. 25 | 26 | * Add type annotations (PEP 484 and PEP 561). 27 | 28 | * More features from the CSS Selectors Level 4: 29 | 30 | * The ``:is()`` pseudo-class. 31 | 32 | * The ``:where()`` pseudo-class. 
33 | 34 | * The ``:has()`` pseudo-class, with some limitations. 35 | 36 | * Fix parsing ``:scope`` after a comma. 37 | 38 | * Add parentheses to fix condition precedence in some cases. 39 | 40 | * Private API changes related to the removal of the Python 2 support: 41 | 42 | * Remove ``_unicode`` and ``_unichr`` aliases from ``cssselect.parser``. 43 | 44 | * Remove ``_basestring`` and ``_unicode`` aliases from ``cssselect.xpath``. 45 | 46 | * Deprecate ``cssselect.xpath._unicode_safe_getattr()`` and change it to just 47 | call ``getattr()``. 48 | 49 | * Include tests in the PyPI tarball. 50 | 51 | * Many CI additions and improvements. 52 | 53 | * Improve the test coverage. 54 | 55 | 56 | Version 1.1.0 57 | ------------- 58 | 59 | Released on 2019-08-09. 60 | 61 | * Support for the ``:scope`` selector, which allows accessing the immediate 62 | children of a selector. 63 | 64 | * Support for the ``|E`` syntax for type selectors without a namespace. 65 | 66 | * A new selector method, ``canonical``, returns the CSS expression of the 67 | selector, as a string. 68 | 69 | 70 | Version 1.0.3 71 | ------------- 72 | 73 | Released on 2017-12-27. 74 | 75 | * Fix artifact uploads to PyPI. 76 | 77 | 78 | Version 1.0.2 79 | ------------- 80 | 81 | Released on 2017-12-26. 82 | 83 | * Drop support for Python 2.6 and Python 3.3. 84 | * Fix deprecation warning in Python 3.6. 85 | * Minor cleanups. 86 | 87 | 88 | Version 1.0.1 89 | ------------- 90 | 91 | Released on 2017-01-10. 92 | 93 | * Add support for Python 3.6. 94 | * Documentation hosted `on Read the Docs <https://cssselect.readthedocs.io/>`_ 95 | 96 | 97 | Version 1.0.0 98 | ------------- 99 | 100 | Released on 2016-10-21. 101 | 102 | * Add code coverage reports. 103 | * Fix ``:nth-*(an+b)`` pseudo-classes selectors. 104 | (except ``*:nth-child()`` which looks untranslatable to XPath 1.0.) 105 | 106 | 107 | Version 0.9.2 108 | ------------- 109 | 110 | Released on 2016-06-15. 111 | 112 | * Distribute as universal wheel.
113 | * Add support for Python 3.3, 3.4 and 3.5. 114 | * Drop support for Python 2.5 as testing is getting difficult. 115 | * Improve tests on pseudo-elements. 116 | 117 | 118 | Version 0.9.1 119 | ------------- 120 | 121 | Released on 2013-10-17. 122 | 123 | * **Backward incompatible change from 0.9**: 124 | :meth:`~GenericTranslator.selector_to_xpath` defaults to 125 | ignoring pseudo-elements, 126 | as it did in 0.8 and previous versions. 127 | (:meth:`~GenericTranslator.css_to_xpath` doesn’t change.) 128 | * Drop official support for Python 2.4 and 3.1, 129 | as testing was becoming difficult. 130 | Nothing will break overnight, 131 | but future releases may or may not work on these versions. 132 | Older releases will remain available on PyPI. 133 | 134 | 135 | Version 0.9 136 | ----------- 137 | 138 | Released on 2013-10-11. 139 | 140 | Add parser support for :attr:`functional 141 | pseudo-elements <Selector.pseudo_element>`. 142 | 143 | *Update:* 144 | This version accidentally introduced a **backward incompatible** change: 145 | :meth:`~GenericTranslator.selector_to_xpath` defaults to 146 | rejecting pseudo-elements instead of ignoring them. 147 | 148 | 149 | Version 0.8 150 | ----------- 151 | 152 | Released on 2013-03-15. 153 | 154 | Improvements: 155 | 156 | * `#22 `_ 157 | Let extended translators override what XPathExpr class is used 158 | * `#19 `_ 159 | Use the built-in ``lang()`` XPath function 160 | for implementing the ``:lang()`` pseudo-class 161 | with XML documents. 162 | This is probably faster than ``ancestor-or-self::``. 163 | 164 | Bug fixes: 165 | 166 | * `#14 `_ 167 | Fix non-ASCII pseudo-classes. (Invalid selector instead of crash.) 168 | * `#20 `_ 169 | As per the spec, elements containing only whitespace are not considered empty 170 | for the ``:empty`` pseudo-class. 171 | 172 | 173 | Version 0.7.1 174 | ------------- 175 | 176 | Released on 2012-06-14. Code name *remember-to-test-with-tox*.
177 | 178 | 0.7 broke the parser in Python 2.4 and 2.5, and the tests in 2.x. 179 | Now all is well again. 180 | 181 | Also, pseudo-elements are now correctly made lower-case. (They are supposed 182 | to be case-insensitive.) 183 | 184 | 185 | Version 0.7 186 | ----------- 187 | 188 | Released on 2012-06-14. 189 | 190 | Bug fix release: see #2, #7 and #10 on GitHub. 191 | 192 | * The tokenizer and parser have been rewritten to be much closer to the 193 | specified grammar. In particular, non-ASCII characters and backslash-escapes 194 | are now handled correctly. 195 | * Special characters are protected in the output so that generated XPath 196 | expressions should always be valid. 197 | * The ``~=``, ``^=`` and ``*=`` attribute operators now correctly never match 198 | when used with an empty string. 199 | 200 | 201 | Version 0.6.1 202 | ------------- 203 | 204 | Released on 2012-04-25. 205 | 206 | Make sure that internal token objects do not "leak" into the public API and 207 | :attr:`Selector.pseudo_element` is a Unicode string. 208 | 209 | 210 | Version 0.6 211 | ----------- 212 | 213 | Released on 2012-04-24. 214 | 215 | * In ``setup.py`` use setuptools/distribute if available, but fall back 216 | on distutils. 217 | * Implement the ``:lang()`` pseudo-class, although it is only based on 218 | ``xml:lang`` or ``lang`` attributes. If the document language is known from 219 | some other meta-data (like a ``Content-Language`` HTTP header or ``<meta http-equiv>`` 220 | element), a workaround is to set a lang attribute on the root element. 221 | 222 | 223 | Version 0.5 224 | ----------- 225 | 226 | Released on 2012-04-20. 227 | 228 | * Fix case sensitivity issues. 229 | * Implement :class:`HTMLTranslator` based on the `HTML5 specification`_ 230 | rather than guessing; add the ``xhtml`` parameter. 231 | * Several bug fixes and better test coverage. 232 | 233 | ..
_HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors 234 | 235 | 236 | Version 0.4 237 | ----------- 238 | 239 | Released on 2012-04-18. 240 | 241 | * Add proper support for pseudo-elements 242 | * Add specificity calculation 243 | * Expose the :func:`parse` function and the parsed :class:`Selector` objects 244 | in the API. 245 | * Add the :meth:`~GenericTranslator.selector_to_xpath` method. 246 | 247 | 248 | Version 0.3 249 | ----------- 250 | 251 | Released on 2012-04-17. 252 | 253 | * Fix many parsing bugs. 254 | * Rename the ``Translator`` class to :class:`GenericTranslator` 255 | * There, implement ``:target``, ``:hover``, ``:focus``, ``:active`` 256 | ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` 257 | as never matching. 258 | * Make a new HTML-specific ``HTMLTranslator`` subclass. There, implement 259 | ``:checked``, ``:enabled``, ``:disabled``, ``:link`` and ``:visited`` 260 | as appropriate for HTML, with all links "not visited". 261 | * Remove the ``css_to_xpath`` function. The translator classes 262 | are the new API. 263 | * Add support for ``:contains()`` back, but case-sensitive. lxml will 264 | override it to be case-insensitive for backward-compatibility. 265 | 266 | Discussion is open if anyone is interested in implementing eg. ``:target`` 267 | or ``:visited`` differently, but they can always do it in a ``Translator`` 268 | subclass. 269 | 270 | 271 | Version 0.2 272 | ----------- 273 | 274 | Released on 2012-04-16. 275 | 276 | * Remove the ``CSSSelector`` class. (The ``css_to_xpath()`` function is now 277 | the main API.) 278 | * Remove support for the ``:contains()`` pseudo-class. 279 | 280 | These changes allow cssselect to be used without lxml. (Hey, this was 281 | the whole point of this project.) The tests still require lxml, though. 282 | The removed parts are expected to stay in lxml for backward-compatibility. 
283 | 284 | ``:contains()`` only existed in an `early draft 285 | `_ 286 | of the Selectors specification, and was removed before Level 3 stabilized. 287 | Internally, it used a custom XPath extension function which can be 288 | difficult to express outside of lxml. 289 | 290 | 291 | * Separate the XPath translation from the parsed objects into a new 292 | ``Translator`` class. 293 | 294 | Subclasses of ``Translator`` can be made to change the way that some selector 295 | (eg. a pseudo-class) is implemented. 296 | 297 | 298 | Version 0.1 299 | ----------- 300 | 301 | Released on 2012-04-13. 302 | 303 | Extract lxml.cssselect from the rest of lxml and make it a stand-alone project. 304 | 305 | Commit ``ea53ceaf7e44ba4fbb5c818ae31370932f47774e`` was taken on 2012-04-11 306 | from the 'master' branch of lxml’s git repository. This is somewhere 307 | between versions 2.3.4 and 2.4. 308 | 309 | The commit history has been rewritten to: 310 | 311 | * Remove lxml files unrelated to cssselect 312 | * Import the early history from the 'html' branch in the old SVN repository 313 | * Fix author names in commits from SVN 314 | 315 | This project has its own import name, tests and documentation. But the 316 | code itself is unchanged and still depends on lxml. 317 | 318 | 319 | Earlier history 320 | --------------- 321 | 322 | Search for *cssselect* in `lxml’s changelog 323 | `_ 324 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2007-2012 Ian Bicking and contributors. See AUTHORS 2 | for more details. 3 | 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in 15 | the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | 3. Neither the name of Ian Bicking nor the names of its contributors may 19 | be used to endorse or promote products derived from this software 20 | without specific prior written permission. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 26 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 30 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | =================================== 3 | cssselect: CSS Selectors for Python 4 | =================================== 5 | 6 | .. image:: https://img.shields.io/pypi/v/cssselect.svg 7 | :target: https://pypi.python.org/pypi/cssselect 8 | :alt: PyPI Version 9 | 10 | .. image:: https://img.shields.io/pypi/pyversions/cssselect.svg 11 | :target: https://pypi.python.org/pypi/cssselect 12 | :alt: Supported Python Versions 13 | 14 | .. 
image:: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml/badge.svg 15 | :target: https://github.com/scrapy/cssselect/actions/workflows/tests-ubuntu.yml 16 | :alt: Tests 17 | 18 | .. image:: https://img.shields.io/codecov/c/github/scrapy/cssselect/master.svg 19 | :target: https://codecov.io/github/scrapy/cssselect?branch=master 20 | :alt: Coverage report 21 | 22 | **cssselect** is a BSD-licensed Python library to parse `CSS3 selectors`_ and 23 | translate them to `XPath 1.0`_ expressions. 24 | 25 | `XPath 1.0`_ expressions can be used in lxml_ or another XPath engine to find 26 | the matching elements in an XML or HTML document. 27 | 28 | Find the cssselect online documentation at https://cssselect.readthedocs.io. 29 | 30 | Quick facts: 31 | 32 | * Source, issues and pull requests `on GitHub 33 | <https://github.com/scrapy/cssselect>`_ 34 | * Releases `on PyPI <https://pypi.python.org/pypi/cssselect>`_ 35 | * Install with ``pip install cssselect`` 36 | 37 | 38 | .. _CSS3 selectors: https://www.w3.org/TR/selectors-3/ 39 | .. _XPath 1.0: https://www.w3.org/TR/xpath/all/ 40 | .. _lxml: https://lxml.de/ 41 | -------------------------------------------------------------------------------- /cssselect/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | CSS Selectors based on XPath 3 | ============================ 4 | 5 | This module supports selecting XML/HTML elements based on CSS selectors. 6 | See the `CSSSelector` class for details. 7 | 8 | 9 | :copyright: (c) 2007-2012 Ian Bicking and contributors. 10 | See AUTHORS for more details. 11 | :license: BSD, see LICENSE for more details.
12 | 13 | """ 14 | 15 | from cssselect.parser import ( 16 | FunctionalPseudoElement, 17 | Selector, 18 | SelectorError, 19 | SelectorSyntaxError, 20 | parse, 21 | ) 22 | from cssselect.xpath import ExpressionError, GenericTranslator, HTMLTranslator 23 | 24 | __all__ = ( 25 | "ExpressionError", 26 | "FunctionalPseudoElement", 27 | "GenericTranslator", 28 | "HTMLTranslator", 29 | "Selector", 30 | "SelectorError", 31 | "SelectorSyntaxError", 32 | "parse", 33 | ) 34 | 35 | VERSION = "1.3.0" 36 | __version__ = VERSION 37 | -------------------------------------------------------------------------------- /cssselect/parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | cssselect.parser 3 | ================ 4 | 5 | Tokenizer, parser and parsed objects for CSS selectors. 6 | 7 | 8 | :copyright: (c) 2007-2012 Ian Bicking and contributors. 9 | See AUTHORS for more details. 10 | :license: BSD, see LICENSE for more details. 11 | 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import operator 17 | import re 18 | import sys 19 | from typing import TYPE_CHECKING, Literal, Optional, Protocol, Union, cast, overload 20 | 21 | if TYPE_CHECKING: 22 | from collections.abc import Iterable, Iterator, Sequence 23 | 24 | # typing.Self requires Python 3.11 25 | from typing_extensions import Self 26 | 27 | 28 | def ascii_lower(string: str) -> str: 29 | """Lower-case, but only in the ASCII range.""" 30 | return string.encode("utf8").lower().decode("utf8") 31 | 32 | 33 | class SelectorError(Exception): 34 | """Common parent for :class:`SelectorSyntaxError` and 35 | :class:`ExpressionError`. 36 | 37 | You can just use ``except SelectorError:`` when calling 38 | :meth:`~GenericTranslator.css_to_xpath` and handle both exceptions types. 
39 | 40 | """ 41 | 42 | 43 | class SelectorSyntaxError(SelectorError, SyntaxError): 44 | """Parsing a selector that does not match the grammar.""" 45 | 46 | 47 | #### Parsed objects 48 | 49 | Tree = Union[ 50 | "Element", 51 | "Hash", 52 | "Class", 53 | "Function", 54 | "Pseudo", 55 | "Attrib", 56 | "Negation", 57 | "Relation", 58 | "Matching", 59 | "SpecificityAdjustment", 60 | "CombinedSelector", 61 | ] 62 | PseudoElement = Union["FunctionalPseudoElement", str] 63 | 64 | 65 | class Selector: 66 | """ 67 | Represents a parsed selector. 68 | 69 | :meth:`~GenericTranslator.selector_to_xpath` accepts this object, 70 | but ignores :attr:`pseudo_element`. It is the user’s responsibility 71 | to account for pseudo-elements and reject selectors with unknown 72 | or unsupported pseudo-elements. 73 | 74 | """ 75 | 76 | def __init__(self, tree: Tree, pseudo_element: PseudoElement | None = None) -> None: 77 | self.parsed_tree = tree 78 | if pseudo_element is not None and not isinstance( 79 | pseudo_element, FunctionalPseudoElement 80 | ): 81 | pseudo_element = ascii_lower(pseudo_element)  # plain-string names are stored ASCII-lower-cased 82 | #: A :class:`FunctionalPseudoElement`, 83 | #: or the identifier for the pseudo-element as a string, 84 | #: or ``None``.
85 | #: 86 | #: +-------------------------+----------------+--------------------------------+ 87 | #: | | Selector | Pseudo-element | 88 | #: +=========================+================+================================+ 89 | #: | CSS3 syntax | ``a::before`` | ``'before'`` | 90 | #: +-------------------------+----------------+--------------------------------+ 91 | #: | Older syntax | ``a:before`` | ``'before'`` | 92 | #: +-------------------------+----------------+--------------------------------+ 93 | #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` | 94 | #: | not in Selectors3 | | | 95 | #: +-------------------------+----------------+--------------------------------+ 96 | #: | Invalid pseudo-class | ``li:marker`` | ``None`` | 97 | #: +-------------------------+----------------+--------------------------------+ 98 | #: | Functional | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | 99 | #: +-------------------------+----------------+--------------------------------+ 100 | #: 101 | #: .. 
_Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement 102 | self.pseudo_element = pseudo_element 103 | 104 | def __repr__(self) -> str: 105 | if isinstance(self.pseudo_element, FunctionalPseudoElement): 106 | pseudo_element = repr(self.pseudo_element) 107 | elif self.pseudo_element: 108 | pseudo_element = f"::{self.pseudo_element}" 109 | else: 110 | pseudo_element = "" 111 | return f"{self.__class__.__name__}[{self.parsed_tree!r}{pseudo_element}]" 112 | 113 | def canonical(self) -> str: 114 | """Return a CSS representation for this selector (a string)""" 115 | if isinstance(self.pseudo_element, FunctionalPseudoElement): 116 | pseudo_element = f"::{self.pseudo_element.canonical()}" 117 | elif self.pseudo_element: 118 | pseudo_element = f"::{self.pseudo_element}" 119 | else: 120 | pseudo_element = "" 121 | res = f"{self.parsed_tree.canonical()}{pseudo_element}" 122 | if len(res) > 1: 123 | res = res.lstrip("*") 124 | return res 125 | 126 | def specificity(self) -> tuple[int, int, int]: 127 | """Return the specificity_ of this selector as a tuple of 3 integers. 128 | 129 | .. 
_specificity: http://www.w3.org/TR/selectors/#specificity 130 | 131 | """ 132 | a, b, c = self.parsed_tree.specificity() 133 | if self.pseudo_element: 134 | c += 1  # a pseudo-element adds one to the last (c) component 135 | return a, b, c 136 | 137 | 138 | class Class: 139 | """ 140 | Represents selector.class_name 141 | """ 142 | 143 | def __init__(self, selector: Tree, class_name: str) -> None: 144 | self.selector = selector 145 | self.class_name = class_name 146 | 147 | def __repr__(self) -> str: 148 | return f"{self.__class__.__name__}[{self.selector!r}.{self.class_name}]" 149 | 150 | def canonical(self) -> str: 151 | return f"{self.selector.canonical()}.{self.class_name}" 152 | 153 | def specificity(self) -> tuple[int, int, int]: 154 | a, b, c = self.selector.specificity() 155 | b += 1  # a class selector adds one to the middle (b) component 156 | return a, b, c 157 | 158 | 159 | class FunctionalPseudoElement: 160 | """ 161 | Represents selector::name(arguments) 162 | 163 | .. attribute:: name 164 | 165 | The name (identifier) of the pseudo-element, as a string. 166 | 167 | .. attribute:: arguments 168 | 169 | The arguments of the pseudo-element, as a list of tokens. 170 | 171 | **Note:** tokens are not part of the public API, 172 | and may change between cssselect versions. 173 | Use at your own risk.
174 | 175 | """ 176 | 177 | def __init__(self, name: str, arguments: Sequence[Token]): 178 | self.name = ascii_lower(name) 179 | self.arguments = arguments 180 | 181 | def __repr__(self) -> str: 182 | token_values = [token.value for token in self.arguments] 183 | return f"{self.__class__.__name__}[::{self.name}({token_values!r})]" 184 | 185 | def argument_types(self) -> list[str]: 186 | return [token.type for token in self.arguments] 187 | 188 | def canonical(self) -> str: 189 | args = "".join(token.css() for token in self.arguments) 190 | return f"{self.name}({args})" 191 | 192 | 193 | class Function: 194 | """ 195 | Represents selector:name(expr) 196 | """ 197 | 198 | def __init__(self, selector: Tree, name: str, arguments: Sequence[Token]) -> None: 199 | self.selector = selector 200 | self.name = ascii_lower(name) 201 | self.arguments = arguments 202 | 203 | def __repr__(self) -> str: 204 | token_values = [token.value for token in self.arguments] 205 | return f"{self.__class__.__name__}[{self.selector!r}:{self.name}({token_values!r})]" 206 | 207 | def argument_types(self) -> list[str]: 208 | return [token.type for token in self.arguments] 209 | 210 | def canonical(self) -> str: 211 | args = "".join(token.css() for token in self.arguments) 212 | return f"{self.selector.canonical()}:{self.name}({args})" 213 | 214 | def specificity(self) -> tuple[int, int, int]: 215 | a, b, c = self.selector.specificity() 216 | b += 1 217 | return a, b, c 218 | 219 | 220 | class Pseudo: 221 | """ 222 | Represents selector:ident 223 | """ 224 | 225 | def __init__(self, selector: Tree, ident: str) -> None: 226 | self.selector = selector 227 | self.ident = ascii_lower(ident) 228 | 229 | def __repr__(self) -> str: 230 | return f"{self.__class__.__name__}[{self.selector!r}:{self.ident}]" 231 | 232 | def canonical(self) -> str: 233 | return f"{self.selector.canonical()}:{self.ident}" 234 | 235 | def specificity(self) -> tuple[int, int, int]: 236 | a, b, c = self.selector.specificity() 
237 | b += 1 238 | return a, b, c 239 | 240 | 241 | class Negation: 242 | """ 243 | Represents selector:not(subselector) 244 | """ 245 | 246 | def __init__(self, selector: Tree, subselector: Tree) -> None: 247 | self.selector = selector 248 | self.subselector = subselector 249 | 250 | def __repr__(self) -> str: 251 | return f"{self.__class__.__name__}[{self.selector!r}:not({self.subselector!r})]" 252 | 253 | def canonical(self) -> str: 254 | subsel = self.subselector.canonical() 255 | if len(subsel) > 1: 256 | subsel = subsel.lstrip("*") 257 | return f"{self.selector.canonical()}:not({subsel})" 258 | 259 | def specificity(self) -> tuple[int, int, int]: 260 | a1, b1, c1 = self.selector.specificity() 261 | a2, b2, c2 = self.subselector.specificity() 262 | return a1 + a2, b1 + b2, c1 + c2 263 | 264 | 265 | class Relation: 266 | """ 267 | Represents selector:has(subselector) 268 | """ 269 | 270 | def __init__(self, selector: Tree, combinator: Token, subselector: Selector): 271 | self.selector = selector 272 | self.combinator = combinator 273 | self.subselector = subselector 274 | 275 | def __repr__(self) -> str: 276 | return f"{self.__class__.__name__}[{self.selector!r}:has({self.subselector!r})]" 277 | 278 | def canonical(self) -> str: 279 | try: 280 | subsel = self.subselector[0].canonical() # type: ignore[index] 281 | except TypeError: 282 | subsel = self.subselector.canonical() 283 | if len(subsel) > 1: 284 | subsel = subsel.lstrip("*") 285 | return f"{self.selector.canonical()}:has({subsel})" 286 | 287 | def specificity(self) -> tuple[int, int, int]: 288 | a1, b1, c1 = self.selector.specificity() 289 | try: 290 | a2, b2, c2 = self.subselector[-1].specificity() # type: ignore[index] 291 | except TypeError: 292 | a2, b2, c2 = self.subselector.specificity() 293 | return a1 + a2, b1 + b2, c1 + c2 294 | 295 | 296 | class Matching: 297 | """ 298 | Represents selector:is(selector_list) 299 | """ 300 | 301 | def __init__(self, selector: Tree, selector_list: 
Iterable[Tree]): 302 | self.selector = selector 303 | self.selector_list = selector_list 304 | 305 | def __repr__(self) -> str: 306 | args_str = ", ".join(repr(s) for s in self.selector_list) 307 | return f"{self.__class__.__name__}[{self.selector!r}:is({args_str})]" 308 | 309 | def canonical(self) -> str: 310 | selector_arguments = [] 311 | for s in self.selector_list: 312 | selarg = s.canonical() 313 | selector_arguments.append(selarg.lstrip("*"))  # NOTE(review): no len(...) > 1 guard as in Negation.canonical, so a bare "*" argument becomes "" - confirm intended 314 | args_str = ", ".join(str(s) for s in selector_arguments) 315 | return f"{self.selector.canonical()}:is({args_str})" 316 | 317 | def specificity(self) -> tuple[int, int, int]: 318 | return max(x.specificity() for x in self.selector_list)  # NOTE(review): returns only the most specific argument; self.selector's own specificity is not added, unlike Negation/Relation - confirm intended 319 | 320 | 321 | class SpecificityAdjustment: 322 | """ 323 | Represents selector:where(selector_list) 324 | Same as selector:is(selector_list), but its specificity is always 0 325 | """ 326 | 327 | def __init__(self, selector: Tree, selector_list: list[Tree]): 328 | self.selector = selector 329 | self.selector_list = selector_list 330 | 331 | def __repr__(self) -> str: 332 | args_str = ", ".join(repr(s) for s in self.selector_list) 333 | return f"{self.__class__.__name__}[{self.selector!r}:where({args_str})]" 334 | 335 | def canonical(self) -> str: 336 | selector_arguments = [] 337 | for s in self.selector_list: 338 | selarg = s.canonical() 339 | selector_arguments.append(selarg.lstrip("*"))  # NOTE(review): same unguarded lstrip as Matching.canonical - a bare "*" argument becomes "" 340 | args_str = ", ".join(str(s) for s in selector_arguments) 341 | return f"{self.selector.canonical()}:where({args_str})" 342 | 343 | def specificity(self) -> tuple[int, int, int]: 344 | return 0, 0, 0  # NOTE(review): also discards self.selector's own specificity, not just that of the :where() arguments - confirm intended 345 | 346 | 347 | class Attrib: 348 | """ 349 | Represents selector[namespace|attrib operator value] 350 | """ 351 | 352 | @overload 353 | def __init__( 354 | self, 355 | selector: Tree, 356 | namespace: str | None, 357 | attrib: str, 358 | operator: Literal["exists"], 359 | value: None, 360 | ) -> None: ...
361 | 362 | @overload 363 | def __init__( 364 | self, 365 | selector: Tree, 366 | namespace: str | None, 367 | attrib: str, 368 | operator: str, 369 | value: Token, 370 | ) -> None: ... 371 | 372 | def __init__( 373 | self, 374 | selector: Tree, 375 | namespace: str | None, 376 | attrib: str, 377 | operator: str, 378 | value: Token | None, 379 | ) -> None: 380 | self.selector = selector 381 | self.namespace = namespace 382 | self.attrib = attrib 383 | self.operator = operator 384 | self.value = value 385 | 386 | def __repr__(self) -> str: 387 | attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib 388 | if self.operator == "exists": 389 | return f"{self.__class__.__name__}[{self.selector!r}[{attrib}]]" 390 | assert self.value is not None 391 | return f"{self.__class__.__name__}[{self.selector!r}[{attrib} {self.operator} {self.value.value!r}]]" 392 | 393 | def canonical(self) -> str: 394 | attrib = f"{self.namespace}|{self.attrib}" if self.namespace else self.attrib 395 | 396 | if self.operator == "exists": 397 | op = attrib 398 | else: 399 | assert self.value is not None 400 | op = f"{attrib}{self.operator}{self.value.css()}" 401 | 402 | return f"{self.selector.canonical()}[{op}]" 403 | 404 | def specificity(self) -> tuple[int, int, int]: 405 | a, b, c = self.selector.specificity() 406 | b += 1 407 | return a, b, c 408 | 409 | 410 | class Element: 411 | """ 412 | Represents namespace|element 413 | 414 | `None` is for the universal selector '*' 415 | 416 | """ 417 | 418 | def __init__( 419 | self, namespace: str | None = None, element: str | None = None 420 | ) -> None: 421 | self.namespace = namespace 422 | self.element = element 423 | 424 | def __repr__(self) -> str: 425 | return f"{self.__class__.__name__}[{self.canonical()}]" 426 | 427 | def canonical(self) -> str: 428 | element = self.element or "*" 429 | if self.namespace: 430 | element = f"{self.namespace}|{element}" 431 | return element 432 | 433 | def specificity(self) -> tuple[int, 
int, int]: 434 | if self.element: 435 | return 0, 0, 1 436 | return 0, 0, 0 437 | 438 | 439 | class Hash: 440 | """ 441 | Represents selector#id 442 | """ 443 | 444 | def __init__(self, selector: Tree, id: str) -> None: 445 | self.selector = selector 446 | self.id = id 447 | 448 | def __repr__(self) -> str: 449 | return f"{self.__class__.__name__}[{self.selector!r}#{self.id}]" 450 | 451 | def canonical(self) -> str: 452 | return f"{self.selector.canonical()}#{self.id}" 453 | 454 | def specificity(self) -> tuple[int, int, int]: 455 | a, b, c = self.selector.specificity() 456 | a += 1 457 | return a, b, c 458 | 459 | 460 | class CombinedSelector: 461 | def __init__(self, selector: Tree, combinator: str, subselector: Tree) -> None: 462 | assert selector is not None 463 | self.selector = selector 464 | self.combinator = combinator 465 | self.subselector = subselector 466 | 467 | def __repr__(self) -> str: 468 | comb = "" if self.combinator == " " else self.combinator 469 | return ( 470 | f"{self.__class__.__name__}[{self.selector!r} {comb} {self.subselector!r}]" 471 | ) 472 | 473 | def canonical(self) -> str: 474 | subsel = self.subselector.canonical() 475 | if len(subsel) > 1: 476 | subsel = subsel.lstrip("*") 477 | return f"{self.selector.canonical()} {self.combinator} {subsel}" 478 | 479 | def specificity(self) -> tuple[int, int, int]: 480 | a1, b1, c1 = self.selector.specificity() 481 | a2, b2, c2 = self.subselector.specificity() 482 | return a1 + a2, b1 + b2, c1 + c2 483 | 484 | 485 | #### Parser 486 | 487 | # foo 488 | _el_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$") 489 | 490 | # foo#bar or #bar 491 | _id_re = re.compile(r"^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$") 492 | 493 | # foo.bar or .bar 494 | _class_re = re.compile( 495 | r"^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$" 496 | ) 497 | 498 | 499 | def parse(css: str) -> list[Selector]: 500 | """Parse a CSS *group of selectors*. 
501 | 502 | If you don't care about pseudo-elements or selector specificity, 503 | you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. 504 | 505 | :param css: 506 | A *group of selectors* as a string. 507 | :raises: 508 | :class:`SelectorSyntaxError` on invalid selectors. 509 | :returns: 510 | A list of parsed :class:`Selector` objects, one for each 511 | selector in the comma-separated group. 512 | 513 | """ 514 | # Fast path for simple cases 515 | match = _el_re.match(css) 516 | if match: 517 | return [Selector(Element(element=match.group(1)))] 518 | match = _id_re.match(css) 519 | if match is not None: 520 | return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] 521 | match = _class_re.match(css) 522 | if match is not None: 523 | return [ 524 | Selector(Class(Element(element=match.group(1) or None), match.group(2))) 525 | ] 526 | 527 | stream = TokenStream(tokenize(css)) 528 | stream.source = css 529 | return list(parse_selector_group(stream)) 530 | 531 | 532 | # except SelectorSyntaxError: 533 | # e = sys.exc_info()[1] 534 | # message = "%s at %s -> %r" % ( 535 | # e, stream.used, stream.peek()) 536 | # e.msg = message 537 | # e.args = tuple([message]) 538 | # raise 539 | 540 | 541 | def parse_selector_group(stream: TokenStream) -> Iterator[Selector]: 542 | stream.skip_whitespace() 543 | while 1: 544 | yield Selector(*parse_selector(stream)) 545 | if stream.peek() == ("DELIM", ","): 546 | stream.next() 547 | stream.skip_whitespace() 548 | else: 549 | break 550 | 551 | 552 | def parse_selector(stream: TokenStream) -> tuple[Tree, PseudoElement | None]: 553 | result, pseudo_element = parse_simple_selector(stream) 554 | while 1: 555 | stream.skip_whitespace() 556 | peek = stream.peek() 557 | if peek in (("EOF", None), ("DELIM", ",")): 558 | break 559 | if pseudo_element: 560 | raise SelectorSyntaxError( 561 | f"Got pseudo-element ::{pseudo_element} not at the end of a selector" 562 | ) 563 | if peek.is_delim("+", ">", "~"): 
564 | # A combinator 565 | combinator = cast("str", stream.next().value) 566 | stream.skip_whitespace() 567 | else: 568 | # By exclusion, the last parse_simple_selector() ended 569 | # at peek == ' ' 570 | combinator = " " 571 | next_selector, pseudo_element = parse_simple_selector(stream) 572 | result = CombinedSelector(result, combinator, next_selector) 573 | return result, pseudo_element 574 | 575 | 576 | def parse_simple_selector( 577 | stream: TokenStream, inside_negation: bool = False 578 | ) -> tuple[Tree, PseudoElement | None]: 579 | stream.skip_whitespace() 580 | selector_start = len(stream.used) 581 | peek = stream.peek() 582 | if peek.type == "IDENT" or peek == ("DELIM", "*"): 583 | if peek.type == "IDENT": 584 | namespace = stream.next().value 585 | else: 586 | stream.next() 587 | namespace = None 588 | if stream.peek() == ("DELIM", "|"): 589 | stream.next() 590 | element = stream.next_ident_or_star() 591 | else: 592 | element = namespace 593 | namespace = None 594 | else: 595 | element = namespace = None 596 | result: Tree = Element(namespace, element) 597 | pseudo_element: PseudoElement | None = None 598 | while 1: 599 | peek = stream.peek() 600 | if ( 601 | peek.type in ("S", "EOF") 602 | or peek.is_delim(",", "+", ">", "~") 603 | or (inside_negation and peek == ("DELIM", ")")) 604 | ): 605 | break 606 | if pseudo_element: 607 | raise SelectorSyntaxError( 608 | f"Got pseudo-element ::{pseudo_element} not at the end of a selector" 609 | ) 610 | if peek.type == "HASH": 611 | result = Hash(result, cast("str", stream.next().value)) 612 | elif peek == ("DELIM", "."): 613 | stream.next() 614 | result = Class(result, stream.next_ident()) 615 | elif peek == ("DELIM", "|"): 616 | stream.next() 617 | result = Element(None, stream.next_ident()) 618 | elif peek == ("DELIM", "["): 619 | stream.next() 620 | result = parse_attrib(result, stream) 621 | elif peek == ("DELIM", ":"): 622 | stream.next() 623 | if stream.peek() == ("DELIM", ":"): 624 | stream.next() 625 
| pseudo_element = stream.next_ident() 626 | if stream.peek() == ("DELIM", "("): 627 | stream.next() 628 | pseudo_element = FunctionalPseudoElement( 629 | pseudo_element, parse_arguments(stream) 630 | ) 631 | continue 632 | ident = stream.next_ident() 633 | if ident.lower() in ("first-line", "first-letter", "before", "after"): 634 | # Special case: CSS 2.1 pseudo-elements can have a single ':' 635 | # Any new pseudo-element must have two. 636 | pseudo_element = str(ident) 637 | continue 638 | if stream.peek() != ("DELIM", "("): 639 | result = Pseudo(result, ident) 640 | if repr(result) == "Pseudo[Element[*]:scope]" and not ( 641 | len(stream.used) == 2 642 | or (len(stream.used) == 3 and stream.used[0].type == "S") 643 | or (len(stream.used) >= 3 and stream.used[-3].is_delim(",")) 644 | or ( 645 | len(stream.used) >= 4 646 | and stream.used[-3].type == "S" 647 | and stream.used[-4].is_delim(",") 648 | ) 649 | ): 650 | raise SelectorSyntaxError( 651 | 'Got immediate child pseudo-element ":scope" ' 652 | "not at the start of a selector" 653 | ) 654 | continue 655 | stream.next() 656 | stream.skip_whitespace() 657 | if ident.lower() == "not": 658 | if inside_negation: 659 | raise SelectorSyntaxError("Got nested :not()") 660 | argument, argument_pseudo_element = parse_simple_selector( 661 | stream, inside_negation=True 662 | ) 663 | next = stream.next() 664 | if argument_pseudo_element: 665 | raise SelectorSyntaxError( 666 | f"Got pseudo-element ::{argument_pseudo_element} inside :not() at {next.pos}" 667 | ) 668 | if next != ("DELIM", ")"): 669 | raise SelectorSyntaxError(f"Expected ')', got {next}") 670 | result = Negation(result, argument) 671 | elif ident.lower() == "has": 672 | combinator, arguments = parse_relative_selector(stream) 673 | result = Relation(result, combinator, arguments) 674 | 675 | elif ident.lower() in ("matches", "is"): 676 | selectors = parse_simple_selector_arguments(stream) 677 | result = Matching(result, selectors) 678 | elif ident.lower() 
== "where": 679 | selectors = parse_simple_selector_arguments(stream) 680 | result = SpecificityAdjustment(result, selectors) 681 | else: 682 | result = Function(result, ident, parse_arguments(stream)) 683 | else: 684 | raise SelectorSyntaxError(f"Expected selector, got {peek}") 685 | if len(stream.used) == selector_start: 686 | raise SelectorSyntaxError(f"Expected selector, got {stream.peek()}") 687 | return result, pseudo_element 688 | 689 | 690 | def parse_arguments(stream: TokenStream) -> list[Token]: 691 | arguments: list[Token] = [] 692 | while 1: # noqa: RET503 693 | stream.skip_whitespace() 694 | next = stream.next() 695 | if next.type in ("IDENT", "STRING", "NUMBER") or next in [ 696 | ("DELIM", "+"), 697 | ("DELIM", "-"), 698 | ]: 699 | arguments.append(next) 700 | elif next == ("DELIM", ")"): 701 | return arguments 702 | else: 703 | raise SelectorSyntaxError(f"Expected an argument, got {next}") 704 | 705 | 706 | def parse_relative_selector(stream: TokenStream) -> tuple[Token, Selector]: 707 | stream.skip_whitespace() 708 | subselector = "" 709 | next = stream.next() 710 | 711 | if next in [("DELIM", "+"), ("DELIM", "-"), ("DELIM", ">"), ("DELIM", "~")]: 712 | combinator = next 713 | stream.skip_whitespace() 714 | next = stream.next() 715 | else: 716 | combinator = Token("DELIM", " ", pos=0) 717 | 718 | while 1: # noqa: RET503 719 | if next.type in ("IDENT", "STRING", "NUMBER") or next in [ 720 | ("DELIM", "."), 721 | ("DELIM", "*"), 722 | ]: 723 | subselector += cast("str", next.value) 724 | elif next == ("DELIM", ")"): 725 | result = parse(subselector) 726 | return combinator, result[0] 727 | else: 728 | raise SelectorSyntaxError(f"Expected an argument, got {next}") 729 | next = stream.next() 730 | 731 | 732 | def parse_simple_selector_arguments(stream: TokenStream) -> list[Tree]: 733 | arguments = [] 734 | while 1: 735 | result, pseudo_element = parse_simple_selector(stream, True) 736 | if pseudo_element: 737 | raise SelectorSyntaxError( 738 | f"Got 
pseudo-element ::{pseudo_element} inside function" 739 | ) 740 | stream.skip_whitespace() 741 | next = stream.next() 742 | if next in (("EOF", None), ("DELIM", ",")): 743 | stream.next() 744 | stream.skip_whitespace() 745 | arguments.append(result) 746 | elif next == ("DELIM", ")"): 747 | arguments.append(result) 748 | break 749 | else: 750 | raise SelectorSyntaxError(f"Expected an argument, got {next}") 751 | return arguments 752 | 753 | 754 | def parse_attrib(selector: Tree, stream: TokenStream) -> Attrib: 755 | stream.skip_whitespace() 756 | attrib = stream.next_ident_or_star() 757 | if attrib is None and stream.peek() != ("DELIM", "|"): 758 | raise SelectorSyntaxError(f"Expected '|', got {stream.peek()}") 759 | namespace: str | None 760 | op: str | None 761 | if stream.peek() == ("DELIM", "|"): 762 | stream.next() 763 | if stream.peek() == ("DELIM", "="): 764 | namespace = None 765 | stream.next() 766 | op = "|=" 767 | else: 768 | namespace = attrib 769 | attrib = stream.next_ident() 770 | op = None 771 | else: 772 | namespace = op = None 773 | if op is None: 774 | stream.skip_whitespace() 775 | next = stream.next() 776 | if next == ("DELIM", "]"): 777 | return Attrib(selector, namespace, cast("str", attrib), "exists", None) 778 | if next == ("DELIM", "="): 779 | op = "=" 780 | elif next.is_delim("^", "$", "*", "~", "|", "!") and ( 781 | stream.peek() == ("DELIM", "=") 782 | ): 783 | op = cast("str", next.value) + "=" 784 | stream.next() 785 | else: 786 | raise SelectorSyntaxError(f"Operator expected, got {next}") 787 | stream.skip_whitespace() 788 | value = stream.next() 789 | if value.type not in ("IDENT", "STRING"): 790 | raise SelectorSyntaxError(f"Expected string or ident, got {value}") 791 | stream.skip_whitespace() 792 | next = stream.next() 793 | if next != ("DELIM", "]"): 794 | raise SelectorSyntaxError(f"Expected ']', got {next}") 795 | return Attrib(selector, namespace, cast("str", attrib), op, value) 796 | 797 | 798 | def parse_series(tokens: 
Iterable[Token]) -> tuple[int, int]: 799 | """ 800 | Parses the arguments for :nth-child() and friends. 801 | 802 | :raises: A list of tokens 803 | :returns: :``(a, b)`` 804 | 805 | """ 806 | for token in tokens: 807 | if token.type == "STRING": 808 | raise ValueError("String tokens not allowed in series.") 809 | s = "".join(cast("str", token.value) for token in tokens).strip() 810 | if s == "odd": 811 | return 2, 1 812 | if s == "even": 813 | return 2, 0 814 | if s == "n": 815 | return 1, 0 816 | if "n" not in s: 817 | # Just b 818 | return 0, int(s) 819 | a, b = s.split("n", 1) 820 | a_as_int: int 821 | if not a: 822 | a_as_int = 1 823 | elif a in {"-", "+"}: 824 | a_as_int = int(a + "1") 825 | else: 826 | a_as_int = int(a) 827 | b_as_int = int(b) if b else 0 828 | return a_as_int, b_as_int 829 | 830 | 831 | #### Token objects 832 | 833 | 834 | class Token(tuple[str, Optional[str]]): # noqa: SLOT001 835 | @overload 836 | def __new__( 837 | cls, 838 | type_: Literal["IDENT", "HASH", "STRING", "S", "DELIM", "NUMBER"], 839 | value: str, 840 | pos: int, 841 | ) -> Self: ... 842 | 843 | @overload 844 | def __new__(cls, type_: Literal["EOF"], value: None, pos: int) -> Self: ... 
845 | 846 | def __new__(cls, type_: str, value: str | None, pos: int) -> Self: 847 | obj = tuple.__new__(cls, (type_, value)) 848 | obj.pos = pos 849 | return obj 850 | 851 | def __repr__(self) -> str: 852 | return f"<{self.type} '{self.value}' at {self.pos}>" 853 | 854 | def is_delim(self, *values: str) -> bool: 855 | return self.type == "DELIM" and self.value in values 856 | 857 | pos: int 858 | 859 | @property 860 | def type(self) -> str: 861 | return self[0] 862 | 863 | @property 864 | def value(self) -> str | None: 865 | return self[1] 866 | 867 | def css(self) -> str: 868 | if self.type == "STRING": 869 | return repr(self.value) 870 | return cast("str", self.value) 871 | 872 | 873 | class EOFToken(Token): 874 | def __new__(cls, pos: int) -> Self: 875 | return Token.__new__(cls, "EOF", None, pos) 876 | 877 | def __repr__(self) -> str: 878 | return f"<{self.type} at {self.pos}>" 879 | 880 | 881 | #### Tokenizer 882 | 883 | 884 | class TokenMacros: 885 | unicode_escape = r"\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?" 886 | escape = unicode_escape + r"|\\[^\n\r\f0-9a-f]" 887 | string_escape = r"\\(?:\n|\r\n|\r|\f)|" + escape 888 | nonascii = r"[^\0-\177]" 889 | nmchar = f"[_a-z0-9-]|{escape}|{nonascii}" 890 | nmstart = f"[_a-z]|{escape}|{nonascii}" 891 | 892 | 893 | class MatchFunc(Protocol): 894 | def __call__( 895 | self, string: str, pos: int = ..., endpos: int = ... 896 | ) -> re.Match[str] | None: ... 
897 | 898 | 899 | def _compile(pattern: str) -> MatchFunc: 900 | return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match 901 | 902 | 903 | _match_whitespace = _compile(r"[ \t\r\n\f]+") 904 | _match_number = _compile(r"[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)") 905 | _match_hash = _compile("#(?:%(nmchar)s)+") 906 | _match_ident = _compile("-?(?:%(nmstart)s)(?:%(nmchar)s)*") 907 | _match_string_by_quote = { 908 | "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), 909 | '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), 910 | } 911 | 912 | _sub_simple_escape = re.compile(r"\\(.)").sub 913 | _sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.IGNORECASE).sub 914 | _sub_newline_escape = re.compile(r"\\(?:\n|\r\n|\r|\f)").sub 915 | 916 | # Same as r'\1', but faster on CPython 917 | _replace_simple = operator.methodcaller("group", 1) 918 | 919 | 920 | def _replace_unicode(match: re.Match[str]) -> str: 921 | codepoint = int(match.group(1), 16) 922 | if codepoint > sys.maxunicode: 923 | codepoint = 0xFFFD 924 | return chr(codepoint) 925 | 926 | 927 | def unescape_ident(value: str) -> str: 928 | value = _sub_unicode_escape(_replace_unicode, value) 929 | return _sub_simple_escape(_replace_simple, value) 930 | 931 | 932 | def tokenize(s: str) -> Iterator[Token]: 933 | pos = 0 934 | len_s = len(s) 935 | while pos < len_s: 936 | match = _match_whitespace(s, pos=pos) 937 | if match: 938 | yield Token("S", " ", pos) 939 | pos = match.end() 940 | continue 941 | 942 | match = _match_ident(s, pos=pos) 943 | if match: 944 | value = _sub_simple_escape( 945 | _replace_simple, _sub_unicode_escape(_replace_unicode, match.group()) 946 | ) 947 | yield Token("IDENT", value, pos) 948 | pos = match.end() 949 | continue 950 | 951 | match = _match_hash(s, pos=pos) 952 | if match: 953 | value = _sub_simple_escape( 954 | _replace_simple, 955 | _sub_unicode_escape(_replace_unicode, match.group()[1:]), 956 | ) 957 | yield Token("HASH", value, pos) 958 | pos = match.end() 959 | 
continue 960 | 961 | quote = s[pos] 962 | if quote in _match_string_by_quote: 963 | match = _match_string_by_quote[quote](s, pos=pos + 1) 964 | assert match, "Should have found at least an empty match" 965 | end_pos = match.end() 966 | if end_pos == len_s: 967 | raise SelectorSyntaxError(f"Unclosed string at {pos}") 968 | if s[end_pos] != quote: 969 | raise SelectorSyntaxError(f"Invalid string at {pos}") 970 | value = _sub_simple_escape( 971 | _replace_simple, 972 | _sub_unicode_escape( 973 | _replace_unicode, _sub_newline_escape("", match.group()) 974 | ), 975 | ) 976 | yield Token("STRING", value, pos) 977 | pos = end_pos + 1 978 | continue 979 | 980 | match = _match_number(s, pos=pos) 981 | if match: 982 | value = match.group() 983 | yield Token("NUMBER", value, pos) 984 | pos = match.end() 985 | continue 986 | 987 | pos2 = pos + 2 988 | if s[pos:pos2] == "/*": 989 | pos = s.find("*/", pos2) 990 | if pos == -1: 991 | pos = len_s 992 | else: 993 | pos += 2 994 | continue 995 | 996 | yield Token("DELIM", s[pos], pos) 997 | pos += 1 998 | 999 | assert pos == len_s 1000 | yield EOFToken(pos) 1001 | 1002 | 1003 | class TokenStream: 1004 | def __init__(self, tokens: Iterable[Token], source: str | None = None) -> None: 1005 | self.used: list[Token] = [] 1006 | self.tokens = iter(tokens) 1007 | self.source = source 1008 | self.peeked: Token | None = None 1009 | self._peeking = False 1010 | self.next_token = self.tokens.__next__ 1011 | 1012 | def next(self) -> Token: 1013 | if self._peeking: 1014 | self._peeking = False 1015 | assert self.peeked is not None 1016 | self.used.append(self.peeked) 1017 | return self.peeked 1018 | next = self.next_token() 1019 | self.used.append(next) 1020 | return next 1021 | 1022 | def peek(self) -> Token: 1023 | if not self._peeking: 1024 | self.peeked = self.next_token() 1025 | self._peeking = True 1026 | assert self.peeked is not None 1027 | return self.peeked 1028 | 1029 | def next_ident(self) -> str: 1030 | next = self.next() 1031 | if 
next.type != "IDENT": 1032 | raise SelectorSyntaxError(f"Expected ident, got {next}") 1033 | return cast("str", next.value) 1034 | 1035 | def next_ident_or_star(self) -> str | None: 1036 | next = self.next() 1037 | if next.type == "IDENT": 1038 | return next.value 1039 | if next == ("DELIM", "*"): 1040 | return None 1041 | raise SelectorSyntaxError(f"Expected ident or '*', got {next}") 1042 | 1043 | def skip_whitespace(self) -> None: 1044 | peek = self.peek() 1045 | if peek.type == "S": 1046 | self.next() 1047 | -------------------------------------------------------------------------------- /cssselect/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/cssselect/b478ce96deddd07bd7bd5311d49fd0b5bbf3f54f/cssselect/py.typed -------------------------------------------------------------------------------- /cssselect/xpath.py: -------------------------------------------------------------------------------- 1 | """ 2 | cssselect.xpath 3 | =============== 4 | 5 | Translation of parsed CSS selectors to XPath expressions. 6 | 7 | 8 | :copyright: (c) 2007-2012 Ian Bicking and contributors. 9 | See AUTHORS for more details. 10 | :license: BSD, see LICENSE for more details. 11 | 12 | """ 13 | 14 | from __future__ import annotations 15 | 16 | import re 17 | from typing import TYPE_CHECKING, cast 18 | 19 | from cssselect.parser import ( 20 | Attrib, 21 | Class, 22 | CombinedSelector, 23 | Element, 24 | Function, 25 | Hash, 26 | Matching, 27 | Negation, 28 | Pseudo, 29 | PseudoElement, 30 | Relation, 31 | Selector, 32 | SelectorError, 33 | SpecificityAdjustment, 34 | Tree, 35 | parse, 36 | parse_series, 37 | ) 38 | 39 | if TYPE_CHECKING: 40 | from collections.abc import Callable 41 | 42 | # typing.Self requires Python 3.11 43 | from typing_extensions import Self 44 | 45 | 46 | class ExpressionError(SelectorError, RuntimeError): 47 | """Unknown or unsupported selector (eg. 
pseudo-class).""" 48 | 49 | 50 | #### XPath Helpers 51 | 52 | 53 | class XPathExpr: 54 | def __init__( 55 | self, 56 | path: str = "", 57 | element: str = "*", 58 | condition: str = "", 59 | star_prefix: bool = False, 60 | ) -> None: 61 | self.path = path 62 | self.element = element 63 | self.condition = condition 64 | 65 | def __str__(self) -> str: 66 | path = str(self.path) + str(self.element) 67 | if self.condition: 68 | path += f"[{self.condition}]" 69 | return path 70 | 71 | def __repr__(self) -> str: 72 | return f"{self.__class__.__name__}[{self}]" 73 | 74 | def add_condition(self, condition: str, conjuction: str = "and") -> Self: 75 | if self.condition: 76 | self.condition = f"({self.condition}) {conjuction} ({condition})" 77 | else: 78 | self.condition = condition 79 | return self 80 | 81 | def add_name_test(self) -> None: 82 | if self.element == "*": 83 | # We weren't doing a test anyway 84 | return 85 | self.add_condition(f"name() = {GenericTranslator.xpath_literal(self.element)}") 86 | self.element = "*" 87 | 88 | def add_star_prefix(self) -> None: 89 | """ 90 | Append '*/' to the path to keep the context constrained 91 | to a single parent. 92 | """ 93 | self.path += "*/" 94 | 95 | def join( 96 | self, 97 | combiner: str, 98 | other: XPathExpr, 99 | closing_combiner: str | None = None, 100 | has_inner_condition: bool = False, 101 | ) -> Self: 102 | path = str(self) + combiner 103 | # Any "star prefix" is redundant when joining. 
104 | if other.path != "*/": 105 | path += other.path 106 | self.path = path 107 | if not has_inner_condition: 108 | self.element = ( 109 | other.element + closing_combiner if closing_combiner else other.element 110 | ) 111 | self.condition = other.condition 112 | else: 113 | self.element = other.element 114 | if other.condition: 115 | self.element += "[" + other.condition + "]" 116 | if closing_combiner: 117 | self.element += closing_combiner 118 | return self 119 | 120 | 121 | split_at_single_quotes = re.compile("('+)").split 122 | 123 | # The spec is actually more permissive than that, but don’t bother. 124 | # This is just for the fast path. 125 | # http://www.w3.org/TR/REC-xml/#NT-NameStartChar 126 | is_safe_name = re.compile("^[a-zA-Z_][a-zA-Z0-9_.-]*$").match 127 | 128 | # Test that the string is not empty and does not contain whitespace 129 | is_non_whitespace = re.compile(r"^[^ \t\r\n\f]+$").match 130 | 131 | 132 | #### Translation 133 | 134 | 135 | class GenericTranslator: 136 | """ 137 | Translator for "generic" XML documents. 138 | 139 | Everything is case-sensitive, no assumption is made on the meaning 140 | of element names and attribute names. 141 | 142 | """ 143 | 144 | #### 145 | #### HERE BE DRAGONS 146 | #### 147 | #### You are welcome to hook into this to change some behavior, 148 | #### but do so at your own risks. 149 | #### Until it has received a lot more work and review, 150 | #### I reserve the right to change this API in backward-incompatible ways 151 | #### with any minor version of cssselect. 152 | #### See https://github.com/scrapy/cssselect/pull/22 153 | #### -- Simon Sapin. 
154 | #### 155 | 156 | combinator_mapping = { 157 | " ": "descendant", 158 | ">": "child", 159 | "+": "direct_adjacent", 160 | "~": "indirect_adjacent", 161 | } 162 | 163 | attribute_operator_mapping = { 164 | "exists": "exists", 165 | "=": "equals", 166 | "~=": "includes", 167 | "|=": "dashmatch", 168 | "^=": "prefixmatch", 169 | "$=": "suffixmatch", 170 | "*=": "substringmatch", 171 | "!=": "different", # XXX Not in Level 3 but meh 172 | } 173 | 174 | #: The attribute used for ID selectors depends on the document language: 175 | #: http://www.w3.org/TR/selectors/#id-selectors 176 | id_attribute = "id" 177 | 178 | #: The attribute used for ``:lang()`` depends on the document language: 179 | #: http://www.w3.org/TR/selectors/#lang-pseudo 180 | lang_attribute = "xml:lang" 181 | 182 | #: The case sensitivity of document language element names, 183 | #: attribute names, and attribute values in selectors depends 184 | #: on the document language. 185 | #: http://www.w3.org/TR/selectors/#casesens 186 | #: 187 | #: When a document language defines one of these as case-insensitive, 188 | #: cssselect assumes that the document parser makes the parsed values 189 | #: lower-case. Making the selector lower-case too makes the comparaison 190 | #: case-insensitive. 191 | #: 192 | #: In HTML, element names and attributes names (but not attribute values) 193 | #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4 194 | #: and HTMLParser make them lower-case in their parse result, so 195 | #: the assumption holds. 196 | lower_case_element_names = False 197 | lower_case_attribute_names = False 198 | lower_case_attribute_values = False 199 | 200 | # class used to represent and xpath expression 201 | xpathexpr_cls = XPathExpr 202 | 203 | def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str: 204 | """Translate a *group of selectors* to XPath. 205 | 206 | Pseudo-elements are not supported here since XPath only knows 207 | about "real" elements. 
208 | 209 | :param css: 210 | A *group of selectors* as a string. 211 | :param prefix: 212 | This string is prepended to the XPath expression for each selector. 213 | The default makes selectors scoped to the context node’s subtree. 214 | :raises: 215 | :class:`~cssselect.SelectorSyntaxError` on invalid selectors, 216 | :class:`ExpressionError` on unknown/unsupported selectors, 217 | including pseudo-elements. 218 | :returns: 219 | The equivalent XPath 1.0 expression as a string. 220 | 221 | """ 222 | return " | ".join( 223 | self.selector_to_xpath(selector, prefix, translate_pseudo_elements=True) 224 | for selector in parse(css) 225 | ) 226 | 227 | def selector_to_xpath( 228 | self, 229 | selector: Selector, 230 | prefix: str = "descendant-or-self::", 231 | translate_pseudo_elements: bool = False, 232 | ) -> str: 233 | """Translate a parsed selector to XPath. 234 | 235 | 236 | :param selector: 237 | A parsed :class:`Selector` object. 238 | :param prefix: 239 | This string is prepended to the resulting XPath expression. 240 | The default makes selectors scoped to the context node’s subtree. 241 | :param translate_pseudo_elements: 242 | Unless this is set to ``True`` (as :meth:`css_to_xpath` does), 243 | the :attr:`~Selector.pseudo_element` attribute of the selector 244 | is ignored. 245 | It is the caller's responsibility to reject selectors 246 | with pseudo-elements, or to account for them somehow. 247 | :raises: 248 | :class:`ExpressionError` on unknown/unsupported selectors. 249 | :returns: 250 | The equivalent XPath 1.0 expression as a string. 
251 | 252 | """ 253 | tree = getattr(selector, "parsed_tree", None) 254 | if not tree: 255 | raise TypeError(f"Expected a parsed selector, got {selector!r}") 256 | xpath = self.xpath(tree) 257 | assert isinstance(xpath, self.xpathexpr_cls) # help debug a missing 'return' 258 | if translate_pseudo_elements and selector.pseudo_element: 259 | xpath = self.xpath_pseudo_element(xpath, selector.pseudo_element) 260 | return (prefix or "") + str(xpath) 261 | 262 | def xpath_pseudo_element( 263 | self, xpath: XPathExpr, pseudo_element: PseudoElement 264 | ) -> XPathExpr: 265 | """Translate a pseudo-element. 266 | 267 | Defaults to not supporting pseudo-elements at all, 268 | but can be overridden by sub-classes. 269 | 270 | """ 271 | raise ExpressionError("Pseudo-elements are not supported.") 272 | 273 | @staticmethod 274 | def xpath_literal(s: str) -> str: 275 | s = str(s) 276 | if "'" not in s: 277 | s = f"'{s}'" 278 | elif '"' not in s: 279 | s = f'"{s}"' 280 | else: 281 | parts_quoted = [ 282 | f'"{part}"' if "'" in part else f"'{part}'" 283 | for part in split_at_single_quotes(s) 284 | if part 285 | ] 286 | s = "concat({})".format(",".join(parts_quoted)) 287 | return s 288 | 289 | def xpath(self, parsed_selector: Tree) -> XPathExpr: 290 | """Translate any parsed selector object.""" 291 | type_name = type(parsed_selector).__name__ 292 | method = cast( 293 | "Callable[[Tree], XPathExpr] | None", 294 | getattr(self, f"xpath_{type_name.lower()}", None), 295 | ) 296 | if method is None: 297 | raise ExpressionError(f"{type_name} is not supported.") 298 | return method(parsed_selector) 299 | 300 | # Dispatched by parsed object type 301 | 302 | def xpath_combinedselector(self, combined: CombinedSelector) -> XPathExpr: 303 | """Translate a combined selector.""" 304 | combinator = self.combinator_mapping[combined.combinator] 305 | method = cast( 306 | "Callable[[XPathExpr, XPathExpr], XPathExpr]", 307 | getattr(self, f"xpath_{combinator}_combinator"), 308 | ) 309 | return 
method(self.xpath(combined.selector), self.xpath(combined.subselector)) 310 | 311 | def xpath_negation(self, negation: Negation) -> XPathExpr: 312 | xpath = self.xpath(negation.selector) 313 | sub_xpath = self.xpath(negation.subselector) 314 | sub_xpath.add_name_test() 315 | if sub_xpath.condition: 316 | return xpath.add_condition(f"not({sub_xpath.condition})") 317 | return xpath.add_condition("0") 318 | 319 | def xpath_relation(self, relation: Relation) -> XPathExpr: 320 | xpath = self.xpath(relation.selector) 321 | combinator = relation.combinator 322 | subselector = relation.subselector 323 | right = self.xpath(subselector.parsed_tree) 324 | method = cast( 325 | "Callable[[XPathExpr, XPathExpr], XPathExpr]", 326 | getattr( 327 | self, 328 | f"xpath_relation_{self.combinator_mapping[cast('str', combinator.value)]}_combinator", 329 | ), 330 | ) 331 | return method(xpath, right) 332 | 333 | def xpath_matching(self, matching: Matching) -> XPathExpr: 334 | xpath = self.xpath(matching.selector) 335 | exprs = [self.xpath(selector) for selector in matching.selector_list] 336 | for e in exprs: 337 | e.add_name_test() 338 | if e.condition: 339 | xpath.add_condition(e.condition, "or") 340 | return xpath 341 | 342 | def xpath_specificityadjustment(self, matching: SpecificityAdjustment) -> XPathExpr: 343 | xpath = self.xpath(matching.selector) 344 | exprs = [self.xpath(selector) for selector in matching.selector_list] 345 | for e in exprs: 346 | e.add_name_test() 347 | if e.condition: 348 | xpath.add_condition(e.condition, "or") 349 | return xpath 350 | 351 | def xpath_function(self, function: Function) -> XPathExpr: 352 | """Translate a functional pseudo-class.""" 353 | method_name = "xpath_{}_function".format(function.name.replace("-", "_")) 354 | method = cast( 355 | "Callable[[XPathExpr, Function], XPathExpr] | None", 356 | getattr(self, method_name, None), 357 | ) 358 | if not method: 359 | raise ExpressionError(f"The pseudo-class :{function.name}() is unknown") 360 | 
return method(self.xpath(function.selector), function) 361 | 362 | def xpath_pseudo(self, pseudo: Pseudo) -> XPathExpr: 363 | """Translate a pseudo-class.""" 364 | method_name = "xpath_{}_pseudo".format(pseudo.ident.replace("-", "_")) 365 | method = cast( 366 | "Callable[[XPathExpr], XPathExpr] | None", 367 | getattr(self, method_name, None), 368 | ) 369 | if not method: 370 | # TODO: better error message for pseudo-elements? 371 | raise ExpressionError(f"The pseudo-class :{pseudo.ident} is unknown") 372 | return method(self.xpath(pseudo.selector)) 373 | 374 | def xpath_attrib(self, selector: Attrib) -> XPathExpr: 375 | """Translate an attribute selector.""" 376 | operator = self.attribute_operator_mapping[selector.operator] 377 | method = cast( 378 | "Callable[[XPathExpr, str, str | None], XPathExpr]", 379 | getattr(self, f"xpath_attrib_{operator}"), 380 | ) 381 | if self.lower_case_attribute_names: 382 | name = selector.attrib.lower() 383 | else: 384 | name = selector.attrib 385 | safe = is_safe_name(name) 386 | if selector.namespace: 387 | name = f"{selector.namespace}:{name}" 388 | safe = safe and is_safe_name(selector.namespace) 389 | if safe: 390 | attrib = "@" + name 391 | else: 392 | attrib = f"attribute::*[name() = {self.xpath_literal(name)}]" 393 | if selector.value is None: 394 | value = None 395 | elif self.lower_case_attribute_values: 396 | value = cast("str", selector.value.value).lower() 397 | else: 398 | value = selector.value.value 399 | return method(self.xpath(selector.selector), attrib, value) 400 | 401 | def xpath_class(self, class_selector: Class) -> XPathExpr: 402 | """Translate a class selector.""" 403 | # .foo is defined as [class~=foo] in the spec. 
404 | xpath = self.xpath(class_selector.selector) 405 | return self.xpath_attrib_includes(xpath, "@class", class_selector.class_name) 406 | 407 | def xpath_hash(self, id_selector: Hash) -> XPathExpr: 408 | """Translate an ID selector.""" 409 | xpath = self.xpath(id_selector.selector) 410 | return self.xpath_attrib_equals(xpath, "@id", id_selector.id) 411 | 412 | def xpath_element(self, selector: Element) -> XPathExpr: 413 | """Translate a type or universal selector.""" 414 | element = selector.element 415 | if not element: 416 | element = "*" 417 | safe = True 418 | else: 419 | safe = bool(is_safe_name(element)) 420 | if self.lower_case_element_names: 421 | element = element.lower() 422 | if selector.namespace: 423 | # Namespace prefixes are case-sensitive. 424 | # http://www.w3.org/TR/css3-namespace/#prefixes 425 | element = f"{selector.namespace}:{element}" 426 | safe = safe and bool(is_safe_name(selector.namespace)) 427 | xpath = self.xpathexpr_cls(element=element) 428 | if not safe: 429 | xpath.add_name_test() 430 | return xpath 431 | 432 | # CombinedSelector: dispatch by combinator 433 | 434 | def xpath_descendant_combinator( 435 | self, left: XPathExpr, right: XPathExpr 436 | ) -> XPathExpr: 437 | """right is a child, grand-child or further descendant of left""" 438 | return left.join("/descendant-or-self::*/", right) 439 | 440 | def xpath_child_combinator(self, left: XPathExpr, right: XPathExpr) -> XPathExpr: 441 | """right is an immediate child of left""" 442 | return left.join("/", right) 443 | 444 | def xpath_direct_adjacent_combinator( 445 | self, left: XPathExpr, right: XPathExpr 446 | ) -> XPathExpr: 447 | """right is a sibling immediately after left""" 448 | xpath = left.join("/following-sibling::", right) 449 | xpath.add_name_test() 450 | return xpath.add_condition("position() = 1") 451 | 452 | def xpath_indirect_adjacent_combinator( 453 | self, left: XPathExpr, right: XPathExpr 454 | ) -> XPathExpr: 455 | """right is a sibling after left, 
immediately or not""" 456 | return left.join("/following-sibling::", right) 457 | 458 | def xpath_relation_descendant_combinator( 459 | self, left: XPathExpr, right: XPathExpr 460 | ) -> XPathExpr: 461 | """right is a child, grand-child or further descendant of left; select left""" 462 | return left.join( 463 | "[descendant::", right, closing_combiner="]", has_inner_condition=True 464 | ) 465 | 466 | def xpath_relation_child_combinator( 467 | self, left: XPathExpr, right: XPathExpr 468 | ) -> XPathExpr: 469 | """right is an immediate child of left; select left""" 470 | return left.join("[./", right, closing_combiner="]") 471 | 472 | def xpath_relation_direct_adjacent_combinator( 473 | self, left: XPathExpr, right: XPathExpr 474 | ) -> XPathExpr: 475 | """right is a sibling immediately after left; select left""" 476 | return left.add_condition( 477 | f"following-sibling::*[(name() = '{right.element}') and (position() = 1)]" 478 | ) 479 | 480 | def xpath_relation_indirect_adjacent_combinator( 481 | self, left: XPathExpr, right: XPathExpr 482 | ) -> XPathExpr: 483 | """right is a sibling after left, immediately or not; select left""" 484 | return left.join("[following-sibling::", right, closing_combiner="]") 485 | 486 | # Function: dispatch by function/pseudo-class name 487 | 488 | def xpath_nth_child_function( 489 | self, 490 | xpath: XPathExpr, 491 | function: Function, 492 | last: bool = False, 493 | add_name_test: bool = True, 494 | ) -> XPathExpr: 495 | try: 496 | a, b = parse_series(function.arguments) 497 | except ValueError as ex: 498 | raise ExpressionError(f"Invalid series: '{function.arguments!r}'") from ex 499 | 500 | # From https://www.w3.org/TR/css3-selectors/#structural-pseudos: 501 | # 502 | # :nth-child(an+b) 503 | # an+b-1 siblings before 504 | # 505 | # :nth-last-child(an+b) 506 | # an+b-1 siblings after 507 | # 508 | # :nth-of-type(an+b) 509 | # an+b-1 siblings with the same expanded element name before 510 | # 511 | # :nth-last-of-type(an+b) 512 | 
# an+b-1 siblings with the same expanded element name after 513 | # 514 | # So, 515 | # for :nth-child and :nth-of-type 516 | # 517 | # count(preceding-sibling::) = an+b-1 518 | # 519 | # for :nth-last-child and :nth-last-of-type 520 | # 521 | # count(following-sibling::) = an+b-1 522 | # 523 | # therefore, 524 | # count(...) - (b-1) ≡ 0 (mod a) 525 | # 526 | # if a == 0: 527 | # ~~~~~~~~~~ 528 | # count(...) = b-1 529 | # 530 | # if a < 0: 531 | # ~~~~~~~~~ 532 | # count(...) - b +1 <= 0 533 | # -> count(...) <= b-1 534 | # 535 | # if a > 0: 536 | # ~~~~~~~~~ 537 | # count(...) - b +1 >= 0 538 | # -> count(...) >= b-1 539 | 540 | # work with b-1 instead 541 | b_min_1 = b - 1 542 | 543 | # early-exit condition 1: 544 | # ~~~~~~~~~~~~~~~~~~~~~~~ 545 | # for a == 1, nth-*(an+b) means n+b-1 siblings before/after, 546 | # and since n ∈ {0, 1, 2, ...}, if b-1<=0, 547 | # there is always an "n" matching any number of siblings (maybe none) 548 | if a == 1 and b_min_1 <= 0: 549 | return xpath 550 | 551 | # early-exit condition 2: 552 | # ~~~~~~~~~~~~~~~~~~~~~~~ 553 | # an+b-1 siblings with a<0 and (b-1)<0 is not possible 554 | if a < 0 and b_min_1 < 0: 555 | return xpath.add_condition("0") 556 | 557 | # `add_name_test` boolean is inverted and somewhat counter-intuitive: 558 | # 559 | # nth_of_type() calls nth_child(add_name_test=False) 560 | nodetest = "*" if add_name_test else f"{xpath.element}" 561 | 562 | # count siblings before or after the element 563 | if not last: 564 | siblings_count = f"count(preceding-sibling::{nodetest})" 565 | else: 566 | siblings_count = f"count(following-sibling::{nodetest})" 567 | 568 | # special case of fixed position: nth-*(0n+b) 569 | # if a == 0: 570 | # ~~~~~~~~~~ 571 | # count(***-sibling::***) = b-1 572 | if a == 0: 573 | return xpath.add_condition(f"{siblings_count} = {b_min_1}") 574 | 575 | expressions = [] 576 | 577 | if a > 0: 578 | # siblings count, an+b-1, is always >= 0, 579 | # so if a>0, and (b-1)<=0, an "n" exists to satisfy 
this, 580 | # therefore, the predicate is only interesting if (b-1)>0 581 | if b_min_1 > 0: 582 | expressions.append(f"{siblings_count} >= {b_min_1}") 583 | else: 584 | # if a<0, and (b-1)<0, no "n" satisfies this, 585 | # this is tested above as an early exist condition 586 | # otherwise, 587 | expressions.append(f"{siblings_count} <= {b_min_1}") 588 | 589 | # operations modulo 1 or -1 are simpler, one only needs to verify: 590 | # 591 | # - either: 592 | # count(***-sibling::***) - (b-1) = n = 0, 1, 2, 3, etc., 593 | # i.e. count(***-sibling::***) >= (b-1) 594 | # 595 | # - or: 596 | # count(***-sibling::***) - (b-1) = -n = 0, -1, -2, -3, etc., 597 | # i.e. count(***-sibling::***) <= (b-1) 598 | # we we just did above. 599 | # 600 | if abs(a) != 1: 601 | # count(***-sibling::***) - (b-1) ≡ 0 (mod a) 602 | left = siblings_count 603 | 604 | # apply "modulo a" on 2nd term, -(b-1), 605 | # to simplify things like "(... +6) % -3", 606 | # and also make it positive with |a| 607 | b_neg = (-b_min_1) % abs(a) 608 | 609 | if b_neg != 0: 610 | left = f"({left} +{b_neg})" 611 | 612 | expressions.append(f"{left} mod {a} = 0") 613 | 614 | template = "(%s)" if len(expressions) > 1 else "%s" 615 | xpath.add_condition( 616 | " and ".join(template % expression for expression in expressions) 617 | ) 618 | return xpath 619 | 620 | def xpath_nth_last_child_function( 621 | self, xpath: XPathExpr, function: Function 622 | ) -> XPathExpr: 623 | return self.xpath_nth_child_function(xpath, function, last=True) 624 | 625 | def xpath_nth_of_type_function( 626 | self, xpath: XPathExpr, function: Function 627 | ) -> XPathExpr: 628 | if xpath.element == "*": 629 | raise ExpressionError("*:nth-of-type() is not implemented") 630 | return self.xpath_nth_child_function(xpath, function, add_name_test=False) 631 | 632 | def xpath_nth_last_of_type_function( 633 | self, xpath: XPathExpr, function: Function 634 | ) -> XPathExpr: 635 | if xpath.element == "*": 636 | raise 
ExpressionError("*:nth-of-type() is not implemented") 637 | return self.xpath_nth_child_function( 638 | xpath, function, last=True, add_name_test=False 639 | ) 640 | 641 | def xpath_contains_function( 642 | self, xpath: XPathExpr, function: Function 643 | ) -> XPathExpr: 644 | # Defined there, removed in later drafts: 645 | # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors 646 | if function.argument_types() not in (["STRING"], ["IDENT"]): 647 | raise ExpressionError( 648 | f"Expected a single string or ident for :contains(), got {function.arguments!r}" 649 | ) 650 | value = cast("str", function.arguments[0].value) 651 | return xpath.add_condition(f"contains(., {self.xpath_literal(value)})") 652 | 653 | def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: 654 | if function.argument_types() not in (["STRING"], ["IDENT"]): 655 | raise ExpressionError( 656 | f"Expected a single string or ident for :lang(), got {function.arguments!r}" 657 | ) 658 | value = cast("str", function.arguments[0].value) 659 | return xpath.add_condition(f"lang({self.xpath_literal(value)})") 660 | 661 | # Pseudo: dispatch by pseudo-class name 662 | 663 | def xpath_root_pseudo(self, xpath: XPathExpr) -> XPathExpr: 664 | return xpath.add_condition("not(parent::*)") 665 | 666 | # CSS immediate children (CSS ":scope > div" to XPath "child::div" or "./div") 667 | # Works only at the start of a selector 668 | # Needed to get immediate children of a processed selector in Scrapy 669 | # for product in response.css('.product'): 670 | # description = product.css(':scope > div::text').get() 671 | def xpath_scope_pseudo(self, xpath: XPathExpr) -> XPathExpr: 672 | return xpath.add_condition("1") 673 | 674 | def xpath_first_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: 675 | return xpath.add_condition("count(preceding-sibling::*) = 0") 676 | 677 | def xpath_last_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: 678 | return 
xpath.add_condition("count(following-sibling::*) = 0") 679 | 680 | def xpath_first_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: 681 | if xpath.element == "*": 682 | raise ExpressionError("*:first-of-type is not implemented") 683 | return xpath.add_condition(f"count(preceding-sibling::{xpath.element}) = 0") 684 | 685 | def xpath_last_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: 686 | if xpath.element == "*": 687 | raise ExpressionError("*:last-of-type is not implemented") 688 | return xpath.add_condition(f"count(following-sibling::{xpath.element}) = 0") 689 | 690 | def xpath_only_child_pseudo(self, xpath: XPathExpr) -> XPathExpr: 691 | return xpath.add_condition("count(parent::*/child::*) = 1") 692 | 693 | def xpath_only_of_type_pseudo(self, xpath: XPathExpr) -> XPathExpr: 694 | if xpath.element == "*": 695 | raise ExpressionError("*:only-of-type is not implemented") 696 | return xpath.add_condition(f"count(parent::*/child::{xpath.element}) = 1") 697 | 698 | def xpath_empty_pseudo(self, xpath: XPathExpr) -> XPathExpr: 699 | return xpath.add_condition("not(*) and not(string-length())") 700 | 701 | def pseudo_never_matches(self, xpath: XPathExpr) -> XPathExpr: 702 | """Common implementation for pseudo-classes that never match.""" 703 | return xpath.add_condition("0") 704 | 705 | xpath_link_pseudo = pseudo_never_matches 706 | xpath_visited_pseudo = pseudo_never_matches 707 | xpath_hover_pseudo = pseudo_never_matches 708 | xpath_active_pseudo = pseudo_never_matches 709 | xpath_focus_pseudo = pseudo_never_matches 710 | xpath_target_pseudo = pseudo_never_matches 711 | xpath_enabled_pseudo = pseudo_never_matches 712 | xpath_disabled_pseudo = pseudo_never_matches 713 | xpath_checked_pseudo = pseudo_never_matches 714 | 715 | # Attrib: dispatch by attribute operator 716 | 717 | def xpath_attrib_exists( 718 | self, xpath: XPathExpr, name: str, value: str | None 719 | ) -> XPathExpr: 720 | assert not value 721 | xpath.add_condition(name) 722 | return xpath 723 | 
724 | def xpath_attrib_equals( 725 | self, xpath: XPathExpr, name: str, value: str | None 726 | ) -> XPathExpr: 727 | assert value is not None 728 | xpath.add_condition(f"{name} = {self.xpath_literal(value)}") 729 | return xpath 730 | 731 | def xpath_attrib_different( 732 | self, xpath: XPathExpr, name: str, value: str | None 733 | ) -> XPathExpr: 734 | assert value is not None 735 | # FIXME: this seems like a weird hack... 736 | if value: 737 | xpath.add_condition(f"not({name}) or {name} != {self.xpath_literal(value)}") 738 | else: 739 | xpath.add_condition(f"{name} != {self.xpath_literal(value)}") 740 | return xpath 741 | 742 | def xpath_attrib_includes( 743 | self, xpath: XPathExpr, name: str, value: str | None 744 | ) -> XPathExpr: 745 | if value and is_non_whitespace(value): 746 | arg = self.xpath_literal(" " + value + " ") 747 | xpath.add_condition( 748 | f"{name} and contains(concat(' ', normalize-space({name}), ' '), {arg})" 749 | ) 750 | else: 751 | xpath.add_condition("0") 752 | return xpath 753 | 754 | def xpath_attrib_dashmatch( 755 | self, xpath: XPathExpr, name: str, value: str | None 756 | ) -> XPathExpr: 757 | assert value is not None 758 | arg = self.xpath_literal(value) 759 | arg_dash = self.xpath_literal(value + "-") 760 | # Weird, but true... 
761 | xpath.add_condition( 762 | f"{name} and ({name} = {arg} or starts-with({name}, {arg_dash}))" 763 | ) 764 | return xpath 765 | 766 | def xpath_attrib_prefixmatch( 767 | self, xpath: XPathExpr, name: str, value: str | None 768 | ) -> XPathExpr: 769 | if value: 770 | xpath.add_condition( 771 | f"{name} and starts-with({name}, {self.xpath_literal(value)})" 772 | ) 773 | else: 774 | xpath.add_condition("0") 775 | return xpath 776 | 777 | def xpath_attrib_suffixmatch( 778 | self, xpath: XPathExpr, name: str, value: str | None 779 | ) -> XPathExpr: 780 | if value: 781 | # Oddly there is a starts-with in XPath 1.0, but not ends-with 782 | xpath.add_condition( 783 | f"{name} and substring({name}, string-length({name})-{len(value) - 1}) = {self.xpath_literal(value)}" 784 | ) 785 | else: 786 | xpath.add_condition("0") 787 | return xpath 788 | 789 | def xpath_attrib_substringmatch( 790 | self, xpath: XPathExpr, name: str, value: str | None 791 | ) -> XPathExpr: 792 | if value: 793 | # Attribute selectors are case sensitive 794 | xpath.add_condition( 795 | f"{name} and contains({name}, {self.xpath_literal(value)})" 796 | ) 797 | else: 798 | xpath.add_condition("0") 799 | return xpath 800 | 801 | 802 | class HTMLTranslator(GenericTranslator): 803 | """ 804 | Translator for (X)HTML documents. 805 | 806 | Has a more useful implementation of some pseudo-classes based on 807 | HTML-specific element names and attribute names, as described in 808 | the `HTML5 specification`_. It assumes no-quirks mode. 809 | The API is the same as :class:`GenericTranslator`. 810 | 811 | .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors 812 | 813 | :param xhtml: 814 | If false (the default), element names and attribute names 815 | are case-insensitive. 816 | 817 | """ 818 | 819 | lang_attribute = "lang" 820 | 821 | def __init__(self, xhtml: bool = False) -> None: 822 | self.xhtml = xhtml # Might be useful for sub-classes? 
823 | if not xhtml: 824 | # See their definition in GenericTranslator. 825 | self.lower_case_element_names = True 826 | self.lower_case_attribute_names = True 827 | 828 | def xpath_checked_pseudo(self, xpath: XPathExpr) -> XPathExpr: # type: ignore[override] 829 | # FIXME: is this really all the elements? 830 | return xpath.add_condition( 831 | "(@selected and name(.) = 'option') or " 832 | "(@checked " 833 | "and (name(.) = 'input' or name(.) = 'command')" 834 | "and (@type = 'checkbox' or @type = 'radio'))" 835 | ) 836 | 837 | def xpath_lang_function(self, xpath: XPathExpr, function: Function) -> XPathExpr: 838 | if function.argument_types() not in (["STRING"], ["IDENT"]): 839 | raise ExpressionError( 840 | f"Expected a single string or ident for :lang(), got {function.arguments!r}" 841 | ) 842 | value = function.arguments[0].value 843 | assert value 844 | arg = self.xpath_literal(value.lower() + "-") 845 | return xpath.add_condition( 846 | "ancestor-or-self::*[@lang][1][starts-with(concat(" 847 | # XPath 1.0 has no lower-case function... 848 | f"translate(@{self.lang_attribute}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', " 849 | "'abcdefghijklmnopqrstuvwxyz'), " 850 | f"'-'), {arg})]" 851 | ) 852 | 853 | def xpath_link_pseudo(self, xpath: XPathExpr) -> XPathExpr: # type: ignore[override] 854 | return xpath.add_condition( 855 | "@href and (name(.) = 'a' or name(.) = 'link' or name(.) = 'area')" 856 | ) 857 | 858 | # Links are never visited, the implementation for :visited is the same 859 | # as in GenericTranslator 860 | 861 | def xpath_disabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # type: ignore[override] 862 | # http://www.w3.org/TR/html5/section-index.html#attributes-1 863 | return xpath.add_condition( 864 | """ 865 | ( 866 | @disabled and 867 | ( 868 | (name(.) = 'input' and @type != 'hidden') or 869 | name(.) = 'button' or 870 | name(.) = 'select' or 871 | name(.) = 'textarea' or 872 | name(.) = 'command' or 873 | name(.) = 'fieldset' or 874 | name(.) 
= 'optgroup' or 875 | name(.) = 'option' 876 | ) 877 | ) or ( 878 | ( 879 | (name(.) = 'input' and @type != 'hidden') or 880 | name(.) = 'button' or 881 | name(.) = 'select' or 882 | name(.) = 'textarea' 883 | ) 884 | and ancestor::fieldset[@disabled] 885 | ) 886 | """ 887 | ) 888 | # FIXME: in the second half, add "and is not a descendant of that 889 | # fieldset element's first legend element child, if any." 890 | 891 | def xpath_enabled_pseudo(self, xpath: XPathExpr) -> XPathExpr: # type: ignore[override] 892 | # http://www.w3.org/TR/html5/section-index.html#attributes-1 893 | return xpath.add_condition( 894 | """ 895 | ( 896 | @href and ( 897 | name(.) = 'a' or 898 | name(.) = 'link' or 899 | name(.) = 'area' 900 | ) 901 | ) or ( 902 | ( 903 | name(.) = 'command' or 904 | name(.) = 'fieldset' or 905 | name(.) = 'optgroup' 906 | ) 907 | and not(@disabled) 908 | ) or ( 909 | ( 910 | (name(.) = 'input' and @type != 'hidden') or 911 | name(.) = 'button' or 912 | name(.) = 'select' or 913 | name(.) = 'textarea' or 914 | name(.) = 'keygen' 915 | ) 916 | and not (@disabled or ancestor::fieldset[@disabled]) 917 | ) or ( 918 | name(.) = 'option' and not( 919 | @disabled or ancestor::optgroup[@disabled] 920 | ) 921 | ) 922 | """ 923 | ) 924 | # FIXME: ... or "li elements that are children of menu elements, 925 | # and that have a child element that defines a command, if the first 926 | # such element's Disabled State facet is false (not disabled)". 927 | # FIXME: after ancestor::fieldset[@disabled], add "and is not a 928 | # descendant of that fieldset element's first legend element child, 929 | # if any." 930 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # cssselect documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Mar 27 14:20:34 2012. 
5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import re 15 | from pathlib import Path 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | # needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ["_templates"] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = ".rst" 36 | 37 | # The encoding of source files. 38 | # source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = "index" 42 | 43 | # General information about the project. 44 | project = "cssselect" 45 | copyright = "2012-2017, Simon Sapin, Scrapy developers" 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The full version, including alpha/beta/rc tags. 
# Read the package source as UTF-8 explicitly so the docs build does not
# depend on the platform's default encoding.
init_py = (Path(__file__).parent.parent / "cssselect" / "__init__.py").read_text(
    encoding="utf-8"
)
_version_match = re.search('VERSION = "([^"]+)"', init_py)
if _version_match is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('Could not find VERSION = "..." in cssselect/__init__.py')
release = _version_match.group(1)
# The short version: the release without a trailing "dev" marker.
# (str.rstrip("dev") would strip any run of the characters d/e/v, not the
# literal suffix, so the suffix is removed explicitly.)
version = release.removesuffix("dev")

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ["_build"]

# The reST default role (used for this markup: `text`) to use for all documents.
# default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"

# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []


# -- Options for HTML output ---------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "sphinx_rtd_theme"

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
101 | # html_theme_options = {} 102 | 103 | # Add any paths that contain custom themes here, relative to this directory. 104 | # html_theme_path = [] 105 | 106 | # The name for this set of Sphinx documents. If None, it defaults to 107 | # "<project> v<release> documentation". 108 | # html_title = None 109 | 110 | # A shorter title for the navigation bar. Default is the same as html_title. 111 | # html_short_title = None 112 | 113 | # The name of an image file (relative to this directory) to place at the top 114 | # of the sidebar. 115 | # html_logo = None 116 | 117 | # The name of an image file (within the static path) to use as favicon of the 118 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 119 | # pixels large. 120 | # html_favicon = None 121 | 122 | # Add any paths that contain custom static files (such as style sheets) here, 123 | # relative to this directory. They are copied after the builtin static files, 124 | # so a file named "default.css" will overwrite the builtin "default.css". 125 | # html_static_path = ['_static'] 126 | 127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 128 | # using the given strftime format. 129 | # html_last_updated_fmt = '%b %d, %Y' 130 | 131 | # If true, SmartyPants will be used to convert quotes and dashes to 132 | # typographically correct entities. 133 | # html_use_smartypants = True 134 | 135 | # Custom sidebar templates, maps document names to template names. 136 | # html_sidebars = {} 137 | 138 | # Additional templates that should be rendered to pages, maps page names to 139 | # template names. 140 | # html_additional_pages = {} 141 | 142 | # If false, no module index is generated. 143 | # html_domain_indices = True 144 | 145 | # If false, no index is generated. 146 | # html_use_index = True 147 | 148 | # If true, the index is split into individual pages for each letter. 149 | # html_split_index = False 150 | 151 | # If true, links to the reST sources are added to the pages. 
152 | # html_show_sourcelink = True 153 | 154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 155 | # html_show_sphinx = True 156 | 157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 158 | # html_show_copyright = True 159 | 160 | # If true, an OpenSearch description file will be output, and all pages will 161 | # contain a tag referring to it. The value of this option must be the 162 | # base URL from which the finished HTML is served. 163 | # html_use_opensearch = '' 164 | 165 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 166 | # html_file_suffix = None 167 | 168 | # Output file base name for HTML help builder. 169 | htmlhelp_basename = "cssselectdoc" 170 | 171 | 172 | # -- Options for LaTeX output -------------------------------------------------- 173 | 174 | latex_elements = { 175 | # The paper size ('letterpaper' or 'a4paper'). 176 | #'papersize': 'letterpaper', 177 | # The font size ('10pt', '11pt' or '12pt'). 178 | #'pointsize': '10pt', 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ("index", "cssselect.tex", "cssselect Documentation", "Simon Sapin", "manual"), 187 | ] 188 | 189 | # The name of an image file (relative to this directory) to place at the top of 190 | # the title page. 191 | # latex_logo = None 192 | 193 | # For "manual" documents, if this is true, then toplevel headings are parts, 194 | # not chapters. 195 | # latex_use_parts = False 196 | 197 | # If true, show page references after internal links. 198 | # latex_show_pagerefs = False 199 | 200 | # If true, show URL addresses after external links. 201 | # latex_show_urls = False 202 | 203 | # Documents to append as an appendix to all manuals. 
204 | # latex_appendices = [] 205 | 206 | # If false, no module index is generated. 207 | # latex_domain_indices = True 208 | 209 | 210 | # -- Options for manual page output -------------------------------------------- 211 | 212 | # One entry per manual page. List of tuples 213 | # (source start file, name, description, authors, manual section). 214 | man_pages = [("index", "cssselect", "cssselect Documentation", ["Simon Sapin"], 1)] 215 | 216 | # If true, show URL addresses after external links. 217 | # man_show_urls = False 218 | 219 | 220 | # -- Options for Texinfo output ------------------------------------------------ 221 | 222 | # Grouping the document tree into Texinfo files. List of tuples 223 | # (source start file, target name, title, author, 224 | # dir menu entry, description, category) 225 | texinfo_documents = [ 226 | ( 227 | "index", 228 | "cssselect", 229 | "cssselect Documentation", 230 | "Simon Sapin", 231 | "cssselect", 232 | "One line description of project.", 233 | "Miscellaneous", 234 | ), 235 | ] 236 | 237 | # Documents to append as an appendix to all manuals. 238 | # texinfo_appendices = [] 239 | 240 | # If false, no module index is generated. 241 | # texinfo_domain_indices = True 242 | 243 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 244 | # texinfo_show_urls = 'footnote' 245 | 246 | 247 | # Example configuration for intersphinx: refer to the Python standard library. 
248 | intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} 249 | 250 | 251 | # --- Nitpicking options ------------------------------------------------------ 252 | 253 | nitpicky = True 254 | nitpick_ignore = [ 255 | # explicitly not a part of the public API 256 | ("py:class", "Token"), 257 | ] 258 | -------------------------------------------------------------------------------- /docs/conftest.py: -------------------------------------------------------------------------------- 1 | from doctest import ELLIPSIS, NORMALIZE_WHITESPACE 2 | 3 | from sybil import Sybil 4 | from sybil.parsers.doctest import DocTestParser 5 | from sybil.parsers.skip import skip 6 | 7 | try: 8 | # sybil 3.0.0+ 9 | from sybil.parsers.codeblock import PythonCodeBlockParser 10 | except ImportError: 11 | from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser 12 | 13 | 14 | pytest_collect_file = Sybil( 15 | parsers=[ 16 | DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), 17 | PythonCodeBlockParser(future_imports=["print_function"]), 18 | skip, 19 | ], 20 | pattern="*.rst", 21 | ).pytest() 22 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. module:: cssselect 2 | 3 | .. include:: ../README.rst 4 | 5 | 6 | .. contents:: Contents 7 | :local: 8 | :depth: 1 9 | 10 | Quickstart 11 | ========== 12 | 13 | Use :class:`HTMLTranslator` for HTML documents, :class:`GenericTranslator` 14 | for "generic" XML documents. (The former has a more useful translation 15 | for some selectors, based on HTML-specific element types or attributes.) 16 | 17 | 18 | .. sourcecode:: pycon 19 | 20 | >>> from cssselect import GenericTranslator, SelectorError 21 | >>> try: 22 | ... expression = GenericTranslator().css_to_xpath('div.content') 23 | ... except SelectorError: 24 | ... print('Invalid selector.') 25 | ... 
26 | >>> print(expression) 27 | descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' content ')] 28 | 29 | The resulting expression can be used with lxml's `XPath engine`_: 30 | 31 | .. _XPath engine: http://lxml.de/xpathxslt.html#xpath 32 | 33 | .. sourcecode:: pycon 34 | 35 | >>> from lxml.etree import fromstring 36 | >>> document = fromstring(''' 37 | ... <div id="outer">
38 | ... <div id="inner" class="content body">text</div>
39 | ... </div>
40 | ... ''') 41 | >>> [e.get('id') for e in document.xpath(expression)] 42 | ['inner'] 43 | 44 | User API 45 | ======== 46 | 47 | In CSS3 Selectors terms, the top-level object is a `group of selectors`_, a 48 | sequence of comma-separated selectors. For example, ``div, h1.title + p`` 49 | is a group of two selectors. 50 | 51 | .. _group of selectors: http://www.w3.org/TR/selectors/#grouping 52 | 53 | .. autofunction:: parse 54 | .. autoclass:: Selector() 55 | :members: 56 | 57 | .. autoclass:: FunctionalPseudoElement 58 | 59 | .. autoclass:: GenericTranslator 60 | :members: css_to_xpath, selector_to_xpath 61 | 62 | .. autoclass:: HTMLTranslator 63 | 64 | Exceptions 65 | ---------- 66 | 67 | .. autoexception:: SelectorError 68 | .. autoexception:: SelectorSyntaxError 69 | .. autoexception:: ExpressionError 70 | 71 | 72 | Supported selectors 73 | =================== 74 | 75 | This library implements CSS3 selectors as described in `the W3C specification 76 | <http://www.w3.org/TR/2011/REC-css3-selectors-20110929/>`_. 77 | In this context, however, there is no interactivity or history of visited links. 78 | Therefore, these pseudo-classes are accepted but never match anything: 79 | 80 | * ``:hover`` 81 | * ``:active`` 82 | * ``:focus`` 83 | * ``:target`` 84 | * ``:visited`` 85 | 86 | Additionally, these depend on document knowledge and only have a useful 87 | implementation in :class:`HTMLTranslator`. In :class:`GenericTranslator`, 88 | they never match: 89 | 90 | * ``:link`` 91 | * ``:enabled`` 92 | * ``:disabled`` 93 | * ``:checked`` 94 | 95 | These applicable pseudo-classes are not yet implemented: 96 | 97 | * ``*:first-of-type``, ``*:last-of-type``, ``*:nth-of-type``, 98 | ``*:nth-last-of-type``, ``*:only-of-type``. All of these work when 99 | you specify an element type, but not with ``*``. 100 | 101 | On the other hand, *cssselect* supports some selectors that are not 102 | in the Level 3 specification.
103 | 104 | These parts of the Level 4 specification are supported (note that a large part 105 | of the Level 4 additions is not applicable to cssselect similarly to ``:hover`` 106 | or not representable in XPath 1.0 so the complete specification is unlikely to 107 | be implemented): 108 | 109 | * The ``:scope`` pseudo-class. Limitation: it can only be used at the start of a 110 | selector. 111 | * The ``:is()``, ``:where()`` and ``:has()`` pseudo-classes. Limitation: 112 | ``:has()`` cannot contain nested ``:has()`` or ``:not()``. 113 | 114 | These are non-standard extensions: 115 | 116 | * The ``:contains(text)`` pseudo-class that existed in `an early draft`_ 117 | but was then removed. 118 | * The ``!=`` attribute operator. ``[foo!=bar]`` is the same as 119 | ``:not([foo=bar])``. 120 | * ``:not()`` accepts a *sequence of simple selectors*, not just a single 121 | *simple selector*. For example, ``:not(a.important[rel])`` is allowed, 122 | even though the negation contains 3 *simple selectors*. 123 | 124 | .. _an early draft: http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors 125 | 126 | .. 127 | The following claim was copied from lxml: 128 | 129 | """ 130 | XPath has underspecified string quoting rules (there seems to be no 131 | string quoting at all), so if you use expressions that contain 132 | characters that requiring quoting you might have problems with the 133 | translation from CSS to XPath. 134 | """ 135 | 136 | It seems "string quoting" meant "quote escaping". There is indeed 137 | no quote escaping, but the xpath_literal method handles this. 138 | It should not be a problem anymore. 139 | 140 | 141 | Customizing the translation 142 | =========================== 143 | 144 | Just like :class:`HTMLTranslator` is a subclass of :class:`GenericTranslator`, 145 | you can make new sub-classes of either of them and override some methods.
146 | This enables you, for example, to customize how some pseudo-class is 147 | implemented without forking or monkey-patching cssselect. 148 | 149 | The "customization API" is the set of methods in translation classes 150 | and their signature. You can look at the `source code`_ to see how it works. 151 | However, be aware that this API is not very stable yet. It might change 152 | and break your sub-class. 153 | 154 | .. _source code: https://github.com/scrapy/cssselect/blob/master/cssselect/xpath.py 155 | 156 | 157 | Namespaces 158 | ========== 159 | 160 | In CSS you can use ``namespace-prefix|element``, similar to 161 | ``namespace-prefix:element`` in an XPath expression. In fact, it maps 162 | one-to-one. How prefixes are mapped to namespace URIs depends on the 163 | XPath implementation. 164 | 165 | .. include:: ../CHANGES 166 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==8.2.3 2 | sphinx-rtd-theme==3.0.2 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "hatchling.build" 3 | requires = ["hatchling>=1.27.0"] 4 | 5 | [project] 6 | name = "cssselect" 7 | license = "BSD-3-Clause" 8 | license-files = ["LICENSE", "AUTHORS"] 9 | description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" 10 | readme = "README.rst" 11 | authors = [{ name = "Ian Bicking", email = "ianb@colorstudy.com" }] 12 | maintainers = [{ name = "Paul Tremberth", email = "paul.tremberth@gmail.com" }] 13 | requires-python = ">=3.9" 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Intended Audience :: Developers", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python 
:: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Programming Language :: Python :: 3.13", 23 | "Programming Language :: Python :: Implementation :: CPython", 24 | "Programming Language :: Python :: Implementation :: PyPy", 25 | ] 26 | dynamic = ["version"] 27 | 28 | [project.urls] 29 | "Homepage" = "https://github.com/scrapy/cssselect" 30 | 31 | [tool.hatch.version] 32 | path = "cssselect/__init__.py" 33 | 34 | [tool.hatch.build.targets.sdist] 35 | include = [ 36 | "/cssselect", 37 | "/docs", 38 | "/tests", 39 | "/CHANGES", 40 | "/README.rst", 41 | "/tox.ini", 42 | ] 43 | exclude = [ 44 | "/docs/_build", 45 | ] 46 | 47 | [tool.hatch.build.targets.wheel] 48 | packages = ["cssselect"] 49 | 50 | [tool.bumpversion] 51 | current_version = "1.3.0" 52 | commit = true 53 | tag = true 54 | 55 | [[tool.bumpversion.files]] 56 | filename = "cssselect/__init__.py" 57 | 58 | [tool.coverage.run] 59 | branch = true 60 | source = ["cssselect"] 61 | 62 | [tool.coverage.report] 63 | exclude_also = [ 64 | "def __repr__", 65 | "if sys.version_info", 66 | "if __name__ == '__main__':", 67 | "if TYPE_CHECKING:", 68 | ] 69 | 70 | [tool.pylint.MASTER] 71 | persistent = "no" 72 | extension-pkg-allow-list = ["lxml"] 73 | 74 | [tool.pylint."MESSAGES CONTROL"] 75 | enable = [ 76 | "useless-suppression", 77 | ] 78 | disable = [ 79 | "consider-using-f-string", 80 | "fixme", 81 | "invalid-name", 82 | "line-too-long", 83 | "missing-class-docstring", 84 | "missing-function-docstring", 85 | "missing-module-docstring", 86 | "no-member", 87 | "not-callable", 88 | "redefined-builtin", 89 | "redefined-outer-name", 90 | "too-few-public-methods", 91 | "too-many-arguments", 92 | "too-many-branches", 93 | "too-many-function-args", 94 | "too-many-lines", 95 | "too-many-locals", 96 | "too-many-positional-arguments", 97 | "too-many-public-methods", 98 | "too-many-statements", 99 | "unused-argument", 100 | ] 101 | 102 | [tool.pytest.ini_options] 103 | 
testpaths = ["tests"] 104 | 105 | [tool.ruff.lint] 106 | extend-select = [ 107 | # flake8-bugbear 108 | "B", 109 | # flake8-comprehensions 110 | "C4", 111 | # pydocstyle 112 | "D", 113 | # flake8-future-annotations 114 | "FA", 115 | # flynt 116 | "FLY", 117 | # refurb 118 | "FURB", 119 | # isort 120 | "I", 121 | # flake8-implicit-str-concat 122 | "ISC", 123 | # flake8-logging 124 | "LOG", 125 | # Perflint 126 | "PERF", 127 | # pygrep-hooks 128 | "PGH", 129 | # flake8-pie 130 | "PIE", 131 | # pylint 132 | "PL", 133 | # flake8-use-pathlib 134 | "PTH", 135 | # flake8-pyi 136 | "PYI", 137 | # flake8-quotes 138 | "Q", 139 | # flake8-return 140 | "RET", 141 | # flake8-raise 142 | "RSE", 143 | # Ruff-specific rules 144 | "RUF", 145 | # flake8-bandit 146 | "S", 147 | # flake8-simplify 148 | "SIM", 149 | # flake8-slots 150 | "SLOT", 151 | # flake8-debugger 152 | "T10", 153 | # flake8-type-checking 154 | "TC", 155 | # pyupgrade 156 | "UP", 157 | # pycodestyle warnings 158 | "W", 159 | # flake8-2020 160 | "YTT", 161 | ] 162 | ignore = [ 163 | # Missing docstring in public module 164 | "D100", 165 | # Missing docstring in public class 166 | "D101", 167 | # Missing docstring in public method 168 | "D102", 169 | # Missing docstring in public function 170 | "D103", 171 | # Missing docstring in public package 172 | "D104", 173 | # Missing docstring in magic method 174 | "D105", 175 | # Missing docstring in public nested class 176 | "D106", 177 | # Missing docstring in __init__ 178 | "D107", 179 | # One-line docstring should fit on one line with quotes 180 | "D200", 181 | # No blank lines allowed after function docstring 182 | "D202", 183 | # 1 blank line required between summary line and description 184 | "D205", 185 | # Multi-line docstring closing quotes should be on a separate line 186 | "D209", 187 | # First line should end with a period 188 | "D400", 189 | # First line should be in imperative mood; try rephrasing 190 | "D401", 191 | # First line should not be the function's 
"signature" 192 | "D402", 193 | # First word of the first line should be properly capitalized 194 | "D403", 195 | # Too many return statements 196 | "PLR0911", 197 | # Too many branches 198 | "PLR0912", 199 | # Too many arguments in function definition 200 | "PLR0913", 201 | # Too many statements 202 | "PLR0915", 203 | # Magic value used in comparison 204 | "PLR2004", 205 | # String contains ambiguous {}. 206 | "RUF001", 207 | # Docstring contains ambiguous {}. 208 | "RUF002", 209 | # Comment contains ambiguous {}. 210 | "RUF003", 211 | # Mutable class attributes should be annotated with `typing.ClassVar` 212 | "RUF012", 213 | # Use of `assert` detected 214 | "S101", 215 | # Using lxml to parse untrusted data is known to be vulnerable to XML attacks 216 | "S320", 217 | ] 218 | 219 | [tool.ruff.lint.pydocstyle] 220 | convention = "pep257" 221 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/cssselect/b478ce96deddd07bd7bd5311d49fd0b5bbf3f54f/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_cssselect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Tests for cssselect 4 | =================== 5 | 6 | These tests can be run either by py.test or by the standard library's 7 | unittest. They use plain ``assert`` statements and do little reporting 8 | themselves in case of failure. 9 | 10 | Use py.test to get fancy error reporting and assert introspection. 11 | 12 | 13 | :copyright: (c) 2007-2012 Ian Bicking and contributors. 14 | See AUTHORS for more details. 15 | :license: BSD, see LICENSE for more details. 
16 | 17 | """ 18 | 19 | from __future__ import annotations 20 | 21 | import sys 22 | import typing 23 | import unittest 24 | from typing import TYPE_CHECKING 25 | 26 | from lxml import etree, html 27 | 28 | from cssselect import ( 29 | ExpressionError, 30 | GenericTranslator, 31 | HTMLTranslator, 32 | SelectorSyntaxError, 33 | parse, 34 | ) 35 | from cssselect.parser import ( 36 | Function, 37 | FunctionalPseudoElement, 38 | PseudoElement, 39 | Token, 40 | parse_series, 41 | tokenize, 42 | ) 43 | from cssselect.xpath import XPathExpr 44 | 45 | if TYPE_CHECKING: 46 | from collections.abc import Sequence 47 | 48 | 49 | class TestCssselect(unittest.TestCase): 50 | def test_tokenizer(self) -> None: 51 | tokens = [ 52 | str(item) 53 | for item in tokenize(r'E\ é > f [a~="y\"x"]:nth(/* fu /]* */-3.7)') 54 | ] 55 | assert tokens == [ 56 | "", 57 | "", 58 | "' at 5>", 59 | "", 60 | # the no-break space is not whitespace in CSS 61 | "", # f\xa0 62 | "", 63 | "", 64 | "", 65 | "", 67 | "", 68 | "", 69 | "", 70 | "", 71 | "", 72 | "", 73 | "", 74 | ] 75 | 76 | def test_parser(self) -> None: 77 | def repr_parse(css: str) -> list[str]: 78 | selectors = parse(css) 79 | for selector in selectors: 80 | assert selector.pseudo_element is None 81 | return [repr(selector.parsed_tree) for selector in selectors] 82 | 83 | def parse_many(first: str, *others: str) -> list[str]: 84 | result = repr_parse(first) 85 | for other in others: 86 | assert repr_parse(other) == result 87 | return result 88 | 89 | assert parse_many("*") == ["Element[*]"] 90 | assert parse_many("*|*") == ["Element[*]"] 91 | assert parse_many("*|foo") == ["Element[foo]"] 92 | assert parse_many("|foo") == ["Element[foo]"] 93 | assert parse_many("foo|*") == ["Element[foo|*]"] 94 | assert parse_many("foo|bar") == ["Element[foo|bar]"] 95 | # This will never match, but it is valid: 96 | assert parse_many("#foo#bar") == ["Hash[Hash[Element[*]#foo]#bar]"] 97 | assert parse_many( 98 | "div>.foo", 99 | "div> .foo", 100 | "div 
>.foo", 101 | "div > .foo", 102 | "div \n> \t \t .foo", 103 | "div\r>\n\n\n.foo", 104 | "div\f>\f.foo", 105 | ) == ["CombinedSelector[Element[div] > Class[Element[*].foo]]"] 106 | assert parse_many( 107 | "td.foo,.bar", "td.foo, .bar", "td.foo\t\r\n\f ,\t\r\n\f .bar" 108 | ) == [ 109 | "Class[Element[td].foo]", 110 | "Class[Element[*].bar]", 111 | ] 112 | assert parse_many("div, td.foo, div.bar span") == [ 113 | "Element[div]", 114 | "Class[Element[td].foo]", 115 | "CombinedSelector[Class[Element[div].bar] Element[span]]", 116 | ] 117 | assert parse_many("div > p") == ["CombinedSelector[Element[div] > Element[p]]"] 118 | assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] 119 | assert parse_many("td:first") == ["Pseudo[Element[td]:first]"] 120 | assert parse_many("td :first") == [ 121 | "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" 122 | ] 123 | assert parse_many("td :first") == [ 124 | "CombinedSelector[Element[td] Pseudo[Element[*]:first]]" 125 | ] 126 | assert parse_many("a[name]", "a[ name\t]") == ["Attrib[Element[a][name]]"] 127 | assert parse_many("a [name]") == [ 128 | "CombinedSelector[Element[a] Attrib[Element[*][name]]]" 129 | ] 130 | assert parse_many('a[rel="include"]', "a[rel = include]") == [ 131 | "Attrib[Element[a][rel = 'include']]" 132 | ] 133 | assert parse_many("a[hreflang |= 'en']", "a[hreflang|=en]") == [ 134 | "Attrib[Element[a][hreflang |= 'en']]" 135 | ] 136 | assert parse_many("div:nth-child(10)") == [ 137 | "Function[Element[div]:nth-child(['10'])]" 138 | ] 139 | assert parse_many(":nth-child(2n+2)") == [ 140 | "Function[Element[*]:nth-child(['2', 'n', '+2'])]" 141 | ] 142 | assert parse_many("div:nth-of-type(10)") == [ 143 | "Function[Element[div]:nth-of-type(['10'])]" 144 | ] 145 | assert parse_many("div div:nth-of-type(10) .aclass") == [ 146 | "CombinedSelector[CombinedSelector[Element[div] " 147 | "Function[Element[div]:nth-of-type(['10'])]] " 148 | " Class[Element[*].aclass]]" 149 | ] 150 | assert 
parse_many("label:only") == ["Pseudo[Element[label]:only]"] 151 | assert parse_many("a:lang(fr)") == ["Function[Element[a]:lang(['fr'])]"] 152 | assert parse_many('div:contains("foo")') == [ 153 | "Function[Element[div]:contains(['foo'])]" 154 | ] 155 | assert parse_many("div#foobar") == ["Hash[Element[div]#foobar]"] 156 | assert parse_many("div:not(div.foo)") == [ 157 | "Negation[Element[div]:not(Class[Element[div].foo])]" 158 | ] 159 | assert parse_many("div:has(div.foo)") == [ 160 | "Relation[Element[div]:has(Selector[Class[Element[div].foo]])]" 161 | ] 162 | assert parse_many("div:is(.foo, #bar)") == [ 163 | "Matching[Element[div]:is(Class[Element[*].foo], Hash[Element[*]#bar])]" 164 | ] 165 | assert parse_many(":is(:hover, :visited)") == [ 166 | "Matching[Element[*]:is(Pseudo[Element[*]:hover], Pseudo[Element[*]:visited])]" 167 | ] 168 | assert parse_many(":where(:hover, :visited)") == [ 169 | "SpecificityAdjustment[Element[*]:where(Pseudo[Element[*]:hover]," 170 | " Pseudo[Element[*]:visited])]" 171 | ] 172 | assert parse_many("td ~ th") == ["CombinedSelector[Element[td] ~ Element[th]]"] 173 | assert parse_many(":scope > foo") == [ 174 | "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" 175 | ] 176 | assert parse_many(" :scope > foo") == [ 177 | "CombinedSelector[Pseudo[Element[*]:scope] > Element[foo]]" 178 | ] 179 | assert parse_many(":scope > foo bar > div") == [ 180 | "CombinedSelector[CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " 181 | "Element[foo]] Element[bar]] > Element[div]]" 182 | ] 183 | assert parse_many(":scope > #foo #bar") == [ 184 | "CombinedSelector[CombinedSelector[Pseudo[Element[*]:scope] > " 185 | "Hash[Element[*]#foo]] Hash[Element[*]#bar]]" 186 | ] 187 | 188 | def test_pseudo_elements(self) -> None: 189 | def parse_pseudo(css: str) -> list[tuple[str, str | None]]: 190 | result: list[tuple[str, str | None]] = [] 191 | for selector in parse(css): 192 | pseudo = selector.pseudo_element 193 | pseudo = str(pseudo) 
if pseudo else pseudo 194 | # No Symbol here 195 | assert pseudo is None or isinstance(pseudo, str) 196 | selector_as_str = repr(selector.parsed_tree) 197 | result.append((selector_as_str, pseudo)) 198 | return result 199 | 200 | def parse_one(css: str) -> tuple[str, str | None]: 201 | result = parse_pseudo(css) 202 | assert len(result) == 1 203 | return result[0] 204 | 205 | def test_pseudo_repr(css: str) -> str: 206 | result = parse(css) 207 | assert len(result) == 1 208 | selector = result[0] 209 | return repr(selector.parsed_tree) 210 | 211 | assert parse_one("foo") == ("Element[foo]", None) 212 | assert parse_one("*") == ("Element[*]", None) 213 | assert parse_one(":empty") == ("Pseudo[Element[*]:empty]", None) 214 | assert parse_one(":scope") == ("Pseudo[Element[*]:scope]", None) 215 | 216 | # Special cases for CSS 2.1 pseudo-elements 217 | assert parse_one(":BEfore") == ("Element[*]", "before") 218 | assert parse_one(":aftER") == ("Element[*]", "after") 219 | assert parse_one(":First-Line") == ("Element[*]", "first-line") 220 | assert parse_one(":First-Letter") == ("Element[*]", "first-letter") 221 | 222 | assert parse_one("::befoRE") == ("Element[*]", "before") 223 | assert parse_one("::AFter") == ("Element[*]", "after") 224 | assert parse_one("::firsT-linE") == ("Element[*]", "first-line") 225 | assert parse_one("::firsT-letteR") == ("Element[*]", "first-letter") 226 | 227 | assert parse_one("::text-content") == ("Element[*]", "text-content") 228 | assert parse_one("::attr(name)") == ( 229 | "Element[*]", 230 | "FunctionalPseudoElement[::attr(['name'])]", 231 | ) 232 | 233 | assert parse_one("::Selection") == ("Element[*]", "selection") 234 | assert parse_one("foo:after") == ("Element[foo]", "after") 235 | assert parse_one("foo::selection") == ("Element[foo]", "selection") 236 | assert parse_one("lorem#ipsum ~ a#b.c[href]:empty::selection") == ( 237 | "CombinedSelector[Hash[Element[lorem]#ipsum] ~ " 238 | 
"Pseudo[Attrib[Class[Hash[Element[a]#b].c][href]]:empty]]", 239 | "selection", 240 | ) 241 | assert parse_pseudo(":scope > div, foo bar") == [ 242 | ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), 243 | ("CombinedSelector[Element[foo] Element[bar]]", None), 244 | ] 245 | assert parse_pseudo("foo bar, :scope > div") == [ 246 | ("CombinedSelector[Element[foo] Element[bar]]", None), 247 | ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), 248 | ] 249 | assert parse_pseudo("foo bar,:scope > div") == [ 250 | ("CombinedSelector[Element[foo] Element[bar]]", None), 251 | ("CombinedSelector[Pseudo[Element[*]:scope] > Element[div]]", None), 252 | ] 253 | assert parse_pseudo("foo:before, bar, baz:after") == [ 254 | ("Element[foo]", "before"), 255 | ("Element[bar]", None), 256 | ("Element[baz]", "after"), 257 | ] 258 | 259 | # Special cases for CSS 2.1 pseudo-elements are ignored by default 260 | for pseudo in ("after", "before", "first-line", "first-letter"): 261 | (selector,) = parse(f"e:{pseudo}") 262 | assert selector.pseudo_element == pseudo 263 | assert GenericTranslator().selector_to_xpath(selector, prefix="") == "e" 264 | 265 | # Pseudo Elements are ignored by default, but if allowed they are not 266 | # supported by GenericTranslator 267 | tr = GenericTranslator() 268 | (selector,) = parse("e::foo") 269 | assert selector.pseudo_element == "foo" 270 | assert tr.selector_to_xpath(selector, prefix="") == "e" 271 | self.assertRaises( 272 | ExpressionError, 273 | tr.selector_to_xpath, 274 | selector, 275 | translate_pseudo_elements=True, 276 | ) 277 | 278 | # Special test for the unicode symbols and ':scope' element if check 279 | # Errors if use repr() instead of __repr__() 280 | assert test_pseudo_repr(":fİrst-child") == "Pseudo[Element[*]:fİrst-child]" 281 | assert test_pseudo_repr(":scope") == "Pseudo[Element[*]:scope]" 282 | 283 | def test_specificity(self) -> None: 284 | def specificity(css: str) -> tuple[int, int, int]: 285 | 
selectors = parse(css) 286 | assert len(selectors) == 1 287 | return selectors[0].specificity() 288 | 289 | assert specificity("*") == (0, 0, 0) 290 | assert specificity(" foo") == (0, 0, 1) 291 | assert specificity(":empty ") == (0, 1, 0) 292 | assert specificity(":before") == (0, 0, 1) 293 | assert specificity("*:before") == (0, 0, 1) 294 | assert specificity(":nth-child(2)") == (0, 1, 0) 295 | assert specificity(".bar") == (0, 1, 0) 296 | assert specificity("[baz]") == (0, 1, 0) 297 | assert specificity('[baz="4"]') == (0, 1, 0) 298 | assert specificity('[baz^="4"]') == (0, 1, 0) 299 | assert specificity("#lipsum") == (1, 0, 0) 300 | assert specificity("::attr(name)") == (0, 0, 1) 301 | 302 | assert specificity(":not(*)") == (0, 0, 0) 303 | assert specificity(":not(foo)") == (0, 0, 1) 304 | assert specificity(":not(.foo)") == (0, 1, 0) 305 | assert specificity(":not([foo])") == (0, 1, 0) 306 | assert specificity(":not(:empty)") == (0, 1, 0) 307 | assert specificity(":not(#foo)") == (1, 0, 0) 308 | 309 | assert specificity(":has(*)") == (0, 0, 0) 310 | assert specificity(":has(foo)") == (0, 0, 1) 311 | assert specificity(":has(.foo)") == (0, 1, 0) 312 | assert specificity(":has(> foo)") == (0, 0, 1) 313 | 314 | assert specificity(":is(.foo, #bar)") == (1, 0, 0) 315 | assert specificity(":is(:hover, :visited)") == (0, 1, 0) 316 | assert specificity(":where(:hover, :visited)") == (0, 0, 0) 317 | 318 | assert specificity("foo:empty") == (0, 1, 1) 319 | assert specificity("foo:before") == (0, 0, 2) 320 | assert specificity("foo::before") == (0, 0, 2) 321 | assert specificity("foo:empty::before") == (0, 1, 2) 322 | 323 | assert specificity("#lorem + foo#ipsum:first-child > bar:first-line") == ( 324 | 2, 325 | 1, 326 | 3, 327 | ) 328 | 329 | def test_css_export(self) -> None: 330 | def css2css(css: str, res: str | None = None) -> None: 331 | selectors = parse(css) 332 | assert len(selectors) == 1 333 | assert selectors[0].canonical() == (res or css) 334 | 335 | 
css2css("*") 336 | css2css(" foo", "foo") 337 | css2css("Foo", "Foo") 338 | css2css(":empty ", ":empty") 339 | css2css(":before", "::before") 340 | css2css(":beFOre", "::before") 341 | css2css("*:before", "::before") 342 | css2css(":nth-child(2)") 343 | css2css(".bar") 344 | css2css("[baz]") 345 | css2css('[baz="4"]', "[baz='4']") 346 | css2css('[baz^="4"]', "[baz^='4']") 347 | css2css("[ns|attr='4']") 348 | css2css("#lipsum") 349 | css2css(":not(*)") 350 | css2css(":not(foo)") 351 | css2css(":not(*.foo)", ":not(.foo)") 352 | css2css(":not(*[foo])", ":not([foo])") 353 | css2css(":not(:empty)") 354 | css2css(":not(#foo)") 355 | css2css(":has(*)") 356 | css2css(":has(foo)") 357 | css2css(":has(*.foo)", ":has(.foo)") 358 | css2css(":is(#bar, .foo)") 359 | css2css(":is(:focused, :visited)") 360 | css2css(":where(:focused, :visited)") 361 | css2css("foo:empty") 362 | css2css("foo::before") 363 | css2css("foo:empty::before") 364 | css2css('::name(arg + "val" - 3)', "::name(arg+'val'-3)") 365 | css2css("#lorem + foo#ipsum:first-child > bar::first-line") 366 | css2css("foo > *") 367 | 368 | def test_parse_errors(self) -> None: 369 | def get_error(css: str) -> str | None: 370 | try: 371 | parse(css) 372 | except SelectorSyntaxError: 373 | return str(sys.exc_info()[1]) 374 | return None 375 | 376 | assert get_error("attributes(href)/html/body/a") == ( 377 | "Expected selector, got " 378 | ) 379 | assert get_error("attributes(href)") == ( 380 | "Expected selector, got " 381 | ) 382 | assert get_error("html/body/a") == ("Expected selector, got ") 383 | assert get_error(" ") == ("Expected selector, got ") 384 | assert get_error("div, ") == ("Expected selector, got ") 385 | assert get_error(" , div") == ("Expected selector, got ") 386 | assert get_error("p, , div") == ("Expected selector, got ") 387 | assert get_error("div > ") == ("Expected selector, got ") 388 | assert get_error(" > div") == ("Expected selector, got ' at 2>") 389 | assert get_error("foo|#bar") == ("Expected 
ident or '*', got ") 390 | assert get_error("#.foo") == ("Expected selector, got ") 391 | assert get_error(".#foo") == ("Expected ident, got ") 392 | assert get_error(":#foo") == ("Expected ident, got ") 393 | assert get_error("[*]") == ("Expected '|', got ") 394 | assert get_error("[foo|]") == ("Expected ident, got ") 395 | assert get_error("[#]") == ("Expected ident or '*', got ") 396 | assert get_error("[foo=#]") == ( 397 | "Expected string or ident, got " 398 | ) 399 | assert get_error("[href]a") == ("Expected selector, got ") 400 | assert get_error("[rel=stylesheet]") is None 401 | assert get_error("[rel:stylesheet]") == ( 402 | "Operator expected, got " 403 | ) 404 | assert get_error("[rel=stylesheet") == ("Expected ']', got ") 405 | assert get_error(":lang(fr)") is None 406 | assert get_error(":lang(fr") == ("Expected an argument, got ") 407 | assert get_error(':contains("foo') == ("Unclosed string at 10") 408 | assert get_error("foo!") == ("Expected selector, got ") 409 | 410 | # Mis-placed pseudo-elements 411 | assert get_error("a:before:empty") == ( 412 | "Got pseudo-element ::before not at the end of a selector" 413 | ) 414 | assert get_error("li:before a") == ( 415 | "Got pseudo-element ::before not at the end of a selector" 416 | ) 417 | assert get_error(":not(:before)") == ( 418 | "Got pseudo-element ::before inside :not() at 12" 419 | ) 420 | assert get_error(":not(:not(a))") == ("Got nested :not()") 421 | assert get_error(":is(:before)") == ( 422 | "Got pseudo-element ::before inside function" 423 | ) 424 | assert get_error(":is(a b)") == ("Expected an argument, got ") 425 | assert get_error(":where(:before)") == ( 426 | "Got pseudo-element ::before inside function" 427 | ) 428 | assert get_error(":where(a b)") == ( 429 | "Expected an argument, got " 430 | ) 431 | assert get_error(":scope > div :scope header") == ( 432 | 'Got immediate child pseudo-element ":scope" not at the start of a selector' 433 | ) 434 | assert get_error("div :scope header") 
== ( 435 | 'Got immediate child pseudo-element ":scope" not at the start of a selector' 436 | ) 437 | assert get_error("> div p") == ("Expected selector, got ' at 0>") 438 | 439 | # Unsupported :has() with several arguments 440 | assert get_error(":has(a, b)") == ("Expected an argument, got ") 441 | assert get_error(":has()") == ("Expected selector, got ") 442 | 443 | def test_translation(self) -> None: 444 | def xpath(css: str) -> str: 445 | return str(GenericTranslator().css_to_xpath(css, prefix="")) 446 | 447 | assert xpath("*") == "*" 448 | assert xpath("e") == "e" 449 | assert xpath("*|e") == "e" 450 | assert xpath("e|f") == "e:f" 451 | assert xpath("e[foo]") == "e[@foo]" 452 | assert xpath("e[foo|bar]") == "e[@foo:bar]" 453 | assert xpath('e[foo="bar"]') == "e[@foo = 'bar']" 454 | assert xpath('e[foo~="bar"]') == ( 455 | "e[@foo and contains(concat(' ', normalize-space(@foo), ' '), ' bar ')]" 456 | ) 457 | assert xpath('e[foo^="bar"]') == ("e[@foo and starts-with(@foo, 'bar')]") 458 | assert xpath('e[foo$="bar"]') == ( 459 | "e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']" 460 | ) 461 | assert xpath('e[foo*="bar"]') == ("e[@foo and contains(@foo, 'bar')]") 462 | assert xpath('e[hreflang|="en"]') == ( 463 | "e[@hreflang and (@hreflang = 'en' or starts-with(@hreflang, 'en-'))]" 464 | ) 465 | 466 | # --- nth-* and nth-last-* ------------------------------------- 467 | assert xpath("e:nth-child(1)") == ("e[count(preceding-sibling::*) = 0]") 468 | 469 | # always true 470 | assert xpath("e:nth-child(n)") == ("e") 471 | assert xpath("e:nth-child(n+1)") == ("e") 472 | # always true too 473 | assert xpath("e:nth-child(n-10)") == ("e") 474 | # b=2 is the limit... 
475 | assert xpath("e:nth-child(n+2)") == ("e[count(preceding-sibling::*) >= 1]") 476 | # always false 477 | assert xpath("e:nth-child(-n)") == ("e[0]") 478 | # equivalent to first child 479 | assert xpath("e:nth-child(-n+1)") == ("e[count(preceding-sibling::*) <= 0]") 480 | 481 | assert xpath("e:nth-child(3n+2)") == ( 482 | "e[(count(preceding-sibling::*) >= 1) and " 483 | "((count(preceding-sibling::*) +2) mod 3 = 0)]" 484 | ) 485 | assert xpath("e:nth-child(3n-2)") == ( 486 | "e[count(preceding-sibling::*) mod 3 = 0]" 487 | ) 488 | assert xpath("e:nth-child(-n+6)") == ("e[count(preceding-sibling::*) <= 5]") 489 | 490 | assert xpath("e:nth-last-child(1)") == ("e[count(following-sibling::*) = 0]") 491 | assert xpath("e:nth-last-child(2n)") == ( 492 | "e[(count(following-sibling::*) +1) mod 2 = 0]" 493 | ) 494 | assert xpath("e:nth-last-child(2n+1)") == ( 495 | "e[count(following-sibling::*) mod 2 = 0]" 496 | ) 497 | assert xpath("e:nth-last-child(2n+2)") == ( 498 | "e[(count(following-sibling::*) >= 1) and " 499 | "((count(following-sibling::*) +1) mod 2 = 0)]" 500 | ) 501 | assert xpath("e:nth-last-child(3n+1)") == ( 502 | "e[count(following-sibling::*) mod 3 = 0]" 503 | ) 504 | # represents the two last e elements 505 | assert xpath("e:nth-last-child(-n+2)") == ( 506 | "e[count(following-sibling::*) <= 1]" 507 | ) 508 | 509 | assert xpath("e:nth-of-type(1)") == ("e[count(preceding-sibling::e) = 0]") 510 | assert xpath("e:nth-last-of-type(1)") == ("e[count(following-sibling::e) = 0]") 511 | assert xpath("div e:nth-last-of-type(1) .aclass") == ( 512 | "div/descendant-or-self::*/e[count(following-sibling::e) = 0]" 513 | "/descendant-or-self::*/*[@class and contains(" 514 | "concat(' ', normalize-space(@class), ' '), ' aclass ')]" 515 | ) 516 | 517 | assert xpath("e:first-child") == ("e[count(preceding-sibling::*) = 0]") 518 | assert xpath("e:last-child") == ("e[count(following-sibling::*) = 0]") 519 | assert xpath("e:first-of-type") == 
("e[count(preceding-sibling::e) = 0]") 520 | assert xpath("e:last-of-type") == ("e[count(following-sibling::e) = 0]") 521 | assert xpath("e:only-child") == ("e[count(parent::*/child::*) = 1]") 522 | assert xpath("e:only-of-type") == ("e[count(parent::*/child::e) = 1]") 523 | assert xpath("e:empty") == ("e[not(*) and not(string-length())]") 524 | assert xpath("e:EmPTY") == ("e[not(*) and not(string-length())]") 525 | assert xpath("e:root") == ("e[not(parent::*)]") 526 | assert xpath("e:hover") == ("e[0]") # never matches 527 | assert ( 528 | xpath("div:has(bar.foo)") == "div[descendant::bar" 529 | "[@class and contains(concat(' ', normalize-space(@class), ' '), ' foo ')]]" 530 | ) 531 | assert xpath("e:has(> f)") == "e[./f]" 532 | assert xpath("e:has(f)") == "e[descendant::f]" 533 | assert xpath("e:has(~ f)") == "e[following-sibling::f]" 534 | assert ( 535 | xpath("e:has(+ f)") 536 | == "e[following-sibling::*[(name() = 'f') and (position() = 1)]]" 537 | ) 538 | assert xpath('e:contains("foo")') == ("e[contains(., 'foo')]") 539 | assert xpath("e:ConTains(foo)") == ("e[contains(., 'foo')]") 540 | assert xpath("e.warning") == ( 541 | "e[@class and contains(" 542 | "concat(' ', normalize-space(@class), ' '), ' warning ')]" 543 | ) 544 | assert xpath("e#myid") == ("e[@id = 'myid']") 545 | assert xpath("e:not(:nth-child(odd))") == ( 546 | "e[not(count(preceding-sibling::*) mod 2 = 0)]" 547 | ) 548 | assert xpath("e:nOT(*)") == ("e[0]") # never matches 549 | assert xpath("e f") == ("e/descendant-or-self::*/f") 550 | assert xpath("e > f") == ("e/f") 551 | assert xpath("e + f") == ( 552 | "e/following-sibling::*[(name() = 'f') and (position() = 1)]" 553 | ) 554 | assert xpath("e ~ f") == ("e/following-sibling::f") 555 | assert xpath("e ~ f:nth-child(3)") == ( 556 | "e/following-sibling::f[count(preceding-sibling::*) = 2]" 557 | ) 558 | assert xpath("div#container p") == ( 559 | "div[@id = 'container']/descendant-or-self::*/p" 560 | ) 561 | assert xpath("e:where(foo)") == 
"e[name() = 'foo']" 562 | assert xpath("e:where(foo, bar)") == "e[(name() = 'foo') or (name() = 'bar')]" 563 | 564 | # Invalid characters in XPath element names 565 | assert xpath(r"di\a0 v") == ("*[name() = 'di v']") # di\xa0v 566 | assert xpath(r"di\[v") == ("*[name() = 'di[v']") 567 | assert xpath(r"[h\a0 ref]") == ("*[attribute::*[name() = 'h ref']]") # h\xa0ref 568 | assert xpath(r"[h\]ref]") == ("*[attribute::*[name() = 'h]ref']]") 569 | 570 | self.assertRaises(ExpressionError, xpath, ":fİrst-child") 571 | self.assertRaises(ExpressionError, xpath, ":first-of-type") 572 | self.assertRaises(ExpressionError, xpath, ":only-of-type") 573 | self.assertRaises(ExpressionError, xpath, ":last-of-type") 574 | self.assertRaises(ExpressionError, xpath, ":nth-of-type(1)") 575 | self.assertRaises(ExpressionError, xpath, ":nth-last-of-type(1)") 576 | self.assertRaises(ExpressionError, xpath, ":nth-child(n-)") 577 | self.assertRaises(ExpressionError, xpath, ":after") 578 | self.assertRaises(ExpressionError, xpath, ":lorem-ipsum") 579 | self.assertRaises(ExpressionError, xpath, ":lorem(ipsum)") 580 | self.assertRaises(ExpressionError, xpath, "::lorem-ipsum") 581 | self.assertRaises(TypeError, GenericTranslator().css_to_xpath, 4) 582 | self.assertRaises(TypeError, GenericTranslator().selector_to_xpath, "foo") 583 | 584 | def test_unicode(self) -> None: 585 | css = ".a\xc1b" 586 | xpath = GenericTranslator().css_to_xpath(css) 587 | assert css[1:] in xpath 588 | xpath = xpath.encode("ascii", "xmlcharrefreplace").decode("ASCII") 589 | assert xpath == ( 590 | "descendant-or-self::*[@class and contains(" 591 | "concat(' ', normalize-space(@class), ' '), ' aÁb ')]" 592 | ) 593 | 594 | def test_quoting(self) -> None: 595 | css_to_xpath = GenericTranslator().css_to_xpath 596 | assert css_to_xpath('*[aval="\'"]') == ( 597 | """descendant-or-self::*[@aval = "'"]""" 598 | ) 599 | assert css_to_xpath("*[aval=\"'''\"]") == ( 600 | """descendant-or-self::*[@aval = "'''"]""" 601 | ) 602 | 
assert css_to_xpath("*[aval='\"']") == ( 603 | """descendant-or-self::*[@aval = '"']""" 604 | ) 605 | assert css_to_xpath('*[aval=\'"""\']') == ( 606 | '''descendant-or-self::*[@aval = '"""']''' 607 | ) 608 | assert css_to_xpath(':scope > div[dataimg=""]') == ( 609 | "descendant-or-self::*[1]/div[@dataimg = '']" 610 | ) 611 | 612 | def test_unicode_escapes(self) -> None: 613 | # \22 == '"' \20 == ' ' 614 | css_to_xpath = GenericTranslator().css_to_xpath 615 | assert css_to_xpath(r'*[aval="\'\22\'"]') == ( 616 | """descendant-or-self::*[@aval = concat("'",'"',"'")]""" 617 | ) 618 | assert css_to_xpath(r'*[aval="\'\22 2\'"]') == ( 619 | """descendant-or-self::*[@aval = concat("'",'"2',"'")]""" 620 | ) 621 | assert css_to_xpath(r'*[aval="\'\20 \'"]') == ( 622 | """descendant-or-self::*[@aval = "' '"]""" 623 | ) 624 | assert css_to_xpath("*[aval=\"'\\20\r\n '\"]") == ( 625 | """descendant-or-self::*[@aval = "' '"]""" 626 | ) 627 | 628 | def test_xpath_pseudo_elements(self) -> None: 629 | class CustomTranslator(GenericTranslator): 630 | def xpath_pseudo_element( 631 | self, xpath: XPathExpr, pseudo_element: PseudoElement 632 | ) -> XPathExpr: 633 | if isinstance(pseudo_element, FunctionalPseudoElement): 634 | method_name = "xpath_{}_functional_pseudo_element".format( 635 | pseudo_element.name.replace("-", "_") 636 | ) 637 | method = getattr(self, method_name, None) 638 | if not method: 639 | raise ExpressionError( 640 | f"The functional pseudo-element ::{pseudo_element.name}() is unknown" 641 | ) 642 | xpath = method(xpath, pseudo_element.arguments) 643 | else: 644 | method_name = "xpath_{}_simple_pseudo_element".format( 645 | pseudo_element.replace("-", "_") 646 | ) 647 | method = getattr(self, method_name, None) 648 | if not method: 649 | raise ExpressionError( 650 | f"The pseudo-element ::{pseudo_element} is unknown" 651 | ) 652 | xpath = method(xpath) 653 | return xpath 654 | 655 | # functional pseudo-class: 656 | # elements that have a certain number of attributes 
657 | def xpath_nb_attr_function( 658 | self, xpath: XPathExpr, function: Function 659 | ) -> XPathExpr: 660 | assert function.arguments[0].value 661 | nb_attributes = int(function.arguments[0].value) 662 | return xpath.add_condition(f"count(@*)={nb_attributes}") 663 | 664 | # pseudo-class: 665 | # elements that have 5 attributes 666 | def xpath_five_attributes_pseudo(self, xpath: XPathExpr) -> XPathExpr: 667 | return xpath.add_condition("count(@*)=5") 668 | 669 | # functional pseudo-element: 670 | # element's attribute by name 671 | def xpath_attr_functional_pseudo_element( 672 | self, xpath: XPathExpr, arguments: Sequence[Token] 673 | ) -> XPathExpr: 674 | attribute_name = arguments[0].value 675 | other = XPathExpr( 676 | f"@{attribute_name}", 677 | "", 678 | ) 679 | return xpath.join("/", other) 680 | 681 | # pseudo-element: 682 | # element's text() nodes 683 | def xpath_text_node_simple_pseudo_element( 684 | self, xpath: XPathExpr 685 | ) -> XPathExpr: 686 | other = XPathExpr( 687 | "text()", 688 | "", 689 | ) 690 | return xpath.join("/", other) 691 | 692 | # pseudo-element: 693 | # element's href attribute 694 | def xpath_attr_href_simple_pseudo_element( 695 | self, xpath: XPathExpr 696 | ) -> XPathExpr: 697 | other = XPathExpr( 698 | "@href", 699 | "", 700 | ) 701 | return xpath.join("/", other) 702 | 703 | # pseudo-class: 704 | # used to demonstrate operator precedence 705 | def xpath_first_or_second_pseudo(self, xpath: XPathExpr) -> XPathExpr: 706 | return xpath.add_condition("@id = 'first' or @id = 'second'") 707 | 708 | def xpath(css: str) -> str: 709 | return str(CustomTranslator().css_to_xpath(css)) 710 | 711 | assert xpath(":five-attributes") == "descendant-or-self::*[count(@*)=5]" 712 | assert xpath(":nb-attr(3)") == "descendant-or-self::*[count(@*)=3]" 713 | assert xpath("::attr(href)") == "descendant-or-self::*/@href" 714 | assert xpath("::text-node") == "descendant-or-self::*/text()" 715 | assert xpath("::attr-href") ==
"descendant-or-self::*/@href" 716 | assert xpath("p img::attr(src)") == ( 717 | "descendant-or-self::p/descendant-or-self::*/img/@src" 718 | ) 719 | assert xpath(":scope") == "descendant-or-self::*[1]" 720 | assert xpath(":first-or-second[href]") == ( 721 | "descendant-or-self::*[(@id = 'first' or @id = 'second') and (@href)]" 722 | ) 723 | 724 | assert str(XPathExpr("", "", condition="@href")) == "[@href]" 725 | 726 | document = etree.fromstring(OPERATOR_PRECEDENCE_IDS) 727 | sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ 728 | 729 | def operator_id(selector: str) -> list[str]: 730 | xpath = CustomTranslator().css_to_xpath(selector) 731 | items = typing.cast("list[etree._Element]", document.xpath(xpath)) 732 | items.sort(key=sort_key) 733 | return [element.get("id", "nil") for element in items] 734 | 735 | assert operator_id(":first-or-second") == ["first", "second"] 736 | assert operator_id(":first-or-second[href]") == ["second"] 737 | assert operator_id("[href]:first-or-second") == ["second"] 738 | 739 | def test_series(self) -> None: 740 | def series(css: str) -> tuple[int, int] | None: 741 | (selector,) = parse(f":nth-child({css})") 742 | args = typing.cast( 743 | "FunctionalPseudoElement", selector.parsed_tree 744 | ).arguments 745 | try: 746 | return parse_series(args) 747 | except ValueError: 748 | return None 749 | 750 | assert series("1n+3") == (1, 3) 751 | assert series("1n +3") == (1, 3) 752 | assert series("1n + 3") == (1, 3) 753 | assert series("1n+ 3") == (1, 3) 754 | assert series("1n-3") == (1, -3) 755 | assert series("1n -3") == (1, -3) 756 | assert series("1n - 3") == (1, -3) 757 | assert series("1n- 3") == (1, -3) 758 | assert series("n-5") == (1, -5) 759 | assert series("odd") == (2, 1) 760 | assert series("even") == (2, 0) 761 | assert series("3n") == (3, 0) 762 | assert series("n") == (1, 0) 763 | assert series("+n") == (1, 0) 764 | assert series("-n") == (-1, 0) 765 | assert series("5") == (0, 5) 766 | assert 
series("foo") is None 767 | assert series("n+") is None 768 | 769 | def test_lang(self) -> None: 770 | document = etree.fromstring(XMLLANG_IDS) 771 | sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ 772 | css_to_xpath = GenericTranslator().css_to_xpath 773 | 774 | def langid(selector: str) -> list[str]: 775 | xpath = css_to_xpath(selector) 776 | items = typing.cast("list[etree._Element]", document.xpath(xpath)) 777 | items.sort(key=sort_key) 778 | return [element.get("id", "nil") for element in items] 779 | 780 | assert langid(':lang("EN")') == ["first", "second", "third", "fourth"] 781 | assert langid(':lang("en-us")') == ["second", "fourth"] 782 | assert langid(":lang(en-nz)") == ["third"] 783 | assert langid(":lang(fr)") == ["fifth"] 784 | assert langid(":lang(ru)") == ["sixth"] 785 | assert langid(":lang('ZH')") == ["eighth"] 786 | assert langid(":lang(de) :lang(zh)") == ["eighth"] 787 | assert langid(":lang(en), :lang(zh)") == [ 788 | "first", 789 | "second", 790 | "third", 791 | "fourth", 792 | "eighth", 793 | ] 794 | assert langid(":lang(es)") == [] 795 | 796 | def test_argument_types(self) -> None: 797 | class CustomTranslator(GenericTranslator): 798 | def __init__(self) -> None: 799 | self.argument_types: list[str] = [] 800 | 801 | def xpath_pseudo_element( 802 | self, xpath: XPathExpr, pseudo_element: PseudoElement 803 | ) -> XPathExpr: 804 | self.argument_types += typing.cast( 805 | "FunctionalPseudoElement", pseudo_element 806 | ).argument_types() 807 | return xpath 808 | 809 | def argument_types(css: str) -> list[str]: 810 | translator = CustomTranslator() 811 | translator.css_to_xpath(css) 812 | return translator.argument_types 813 | 814 | mappings: list[tuple[str, list[str]]] = [ 815 | ("", []), 816 | ("ident", ["IDENT"]), 817 | ('"string"', ["STRING"]), 818 | ("1", ["NUMBER"]), 819 | ] 820 | for argument_string, argument_list in mappings: 821 | css = f"::pseudo_element({argument_string})" 822 | assert 
argument_types(css) == argument_list 823 | 824 | def test_select(self) -> None: 825 | document = etree.fromstring(HTML_IDS) 826 | sort_key = {el: count for count, el in enumerate(document.iter())}.__getitem__ 827 | css_to_xpath = GenericTranslator().css_to_xpath 828 | html_css_to_xpath = HTMLTranslator().css_to_xpath 829 | 830 | def select_ids(selector: str, html_only: bool) -> list[str]: 831 | xpath = css_to_xpath(selector) 832 | items = typing.cast("list[etree._Element]", document.xpath(xpath)) 833 | if html_only: 834 | assert items == [] 835 | xpath = html_css_to_xpath(selector) 836 | items = typing.cast("list[etree._Element]", document.xpath(xpath)) 837 | items.sort(key=sort_key) 838 | return [element.get("id", "nil") for element in items] 839 | 840 | def pcss(main: str, *selectors: str, **kwargs: bool) -> list[str]: 841 | html_only = kwargs.pop("html_only", False) 842 | result = select_ids(main, html_only) 843 | for selector in selectors: 844 | assert select_ids(selector, html_only) == result 845 | return result 846 | 847 | all_ids = pcss("*") 848 | assert all_ids[:6] == [ 849 | "html", 850 | "nil", 851 | "link-href", 852 | "link-nohref", 853 | "nil", 854 | "outer-div", 855 | ] 856 | assert all_ids[-1:] == ["foobar-span"] 857 | assert pcss("div") == ["outer-div", "li-div", "foobar-div"] 858 | assert pcss("DIV", html_only=True) == [ 859 | "outer-div", 860 | "li-div", 861 | "foobar-div", 862 | ] # case-insensitive in HTML 863 | assert pcss("div div") == ["li-div"] 864 | assert pcss("div, div div") == ["outer-div", "li-div", "foobar-div"] 865 | assert pcss("a[name]") == ["name-anchor"] 866 | assert pcss("a[NAme]", html_only=True) == [ 867 | "name-anchor" 868 | ] # case-insensitive in HTML: 869 | assert pcss("a[rel]") == ["tag-anchor", "nofollow-anchor"] 870 | assert pcss('a[rel="tag"]') == ["tag-anchor"] 871 | assert pcss('a[href*="localhost"]') == ["tag-anchor"] 872 | assert pcss('a[href*=""]') == [] 873 | assert pcss('a[href^="http"]') == ["tag-anchor", 
"nofollow-anchor"] 874 | assert pcss('a[href^="http:"]') == ["tag-anchor"] 875 | assert pcss('a[href^=""]') == [] 876 | assert pcss('a[href$="org"]') == ["nofollow-anchor"] 877 | assert pcss('a[href$=""]') == [] 878 | assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == ["foobar-div"] 879 | assert pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]') == [] 880 | assert pcss('div[foobar~="cd"]') == [] 881 | assert pcss('*[lang|="En"]', '[lang|="En-us"]') == ["second-li"] 882 | # Attribute values are case sensitive 883 | assert pcss('*[lang|="en"]', '[lang|="en-US"]') == [] 884 | assert pcss('*[lang|="e"]') == [] 885 | # ... :lang() is not. 886 | assert pcss(':lang("EN")', "*:lang(en-US)", html_only=True) == [ 887 | "second-li", 888 | "li-div", 889 | ] 890 | assert pcss(':lang("e")', html_only=True) == [] 891 | assert pcss(":scope > div") == [] 892 | assert pcss(":scope body") == ["nil"] 893 | assert pcss(":scope body > div") == ["outer-div", "foobar-div"] 894 | assert pcss(":scope head") == ["nil"] 895 | assert pcss(":scope html") == [] 896 | 897 | # --- nth-* and nth-last-* ------------------------------------- 898 | 899 | # select nothing 900 | assert pcss("li:nth-child(-n)") == [] 901 | # select all children 902 | assert pcss("li:nth-child(n)") == [ 903 | "first-li", 904 | "second-li", 905 | "third-li", 906 | "fourth-li", 907 | "fifth-li", 908 | "sixth-li", 909 | "seventh-li", 910 | ] 911 | 912 | assert pcss("li:nth-child(3)", "#first-li ~ :nth-child(3)") == ["third-li"] 913 | assert pcss("li:nth-child(10)") == [] 914 | assert pcss("li:nth-child(2n)", "li:nth-child(even)", "li:nth-child(2n+0)") == [ 915 | "second-li", 916 | "fourth-li", 917 | "sixth-li", 918 | ] 919 | assert pcss("li:nth-child(+2n+1)", "li:nth-child(odd)") == [ 920 | "first-li", 921 | "third-li", 922 | "fifth-li", 923 | "seventh-li", 924 | ] 925 | assert pcss("li:nth-child(2n+4)") == ["fourth-li", "sixth-li"] 926 | assert pcss("li:nth-child(3n+1)") == ["first-li", "fourth-li", 
"seventh-li"] 927 | assert pcss("li:nth-child(-n+3)") == ["first-li", "second-li", "third-li"] 928 | assert pcss("li:nth-child(-2n+4)") == ["second-li", "fourth-li"] 929 | assert pcss("li:nth-last-child(0)") == [] 930 | assert pcss("li:nth-last-child(1)") == ["seventh-li"] 931 | assert pcss("li:nth-last-child(2n)", "li:nth-last-child(even)") == [ 932 | "second-li", 933 | "fourth-li", 934 | "sixth-li", 935 | ] 936 | assert pcss("li:nth-last-child(2n+1)") == [ 937 | "first-li", 938 | "third-li", 939 | "fifth-li", 940 | "seventh-li", 941 | ] 942 | assert pcss("li:nth-last-child(2n+2)") == ["second-li", "fourth-li", "sixth-li"] 943 | assert pcss("li:nth-last-child(3n+1)") == [ 944 | "first-li", 945 | "fourth-li", 946 | "seventh-li", 947 | ] 948 | assert pcss("ol:first-of-type") == ["first-ol"] 949 | assert pcss("ol:nth-child(1)") == [] 950 | assert pcss("ol:nth-of-type(2)") == ["second-ol"] 951 | assert pcss("ol:nth-last-of-type(1)") == ["second-ol"] 952 | 953 | # "+" and "~" tests 954 | assert pcss("ol#first-ol li + li:nth-child(4)") == ["fourth-li"] 955 | assert pcss("li + li:nth-child(1)") == [] 956 | assert pcss("li ~ li:nth-child(2n+1)") == [ 957 | "third-li", 958 | "fifth-li", 959 | "seventh-li", 960 | ] # all but the first 961 | assert pcss("li ~ li:nth-last-child(2n+1)") == [ 962 | "third-li", 963 | "fifth-li", 964 | "seventh-li", 965 | ] # all but the first 966 | 967 | assert pcss("span:only-child") == ["foobar-span"] 968 | assert pcss("li div:only-child") == ["li-div"] 969 | assert pcss("div *:only-child") == ["li-div", "foobar-span"] 970 | self.assertRaises(ExpressionError, pcss, "p *:only-of-type") 971 | assert pcss("p:only-of-type") == ["paragraph"] 972 | assert pcss("a:empty", "a:EMpty") == ["name-anchor"] 973 | assert pcss("li:empty") == ["third-li", "fourth-li", "fifth-li", "sixth-li"] 974 | assert pcss(":root", "html:root") == ["html"] 975 | assert pcss("li:root", "* :root") == [] 976 | assert pcss('*:contains("link")', ':CONtains("link")') == [ 977 | 
"html", 978 | "nil", 979 | "outer-div", 980 | "tag-anchor", 981 | "nofollow-anchor", 982 | ] 983 | assert pcss('*:contains("LInk")') == [] # case sensitive 984 | assert pcss('*:contains("e")') == [ 985 | "html", 986 | "nil", 987 | "outer-div", 988 | "first-ol", 989 | "first-li", 990 | "paragraph", 991 | "p-em", 992 | ] 993 | assert pcss('*:contains("E")') == [] # case-sensitive 994 | assert pcss(".a", ".b", "*.a", "ol.a") == ["first-ol"] 995 | assert pcss(".c", "*.c") == ["first-ol", "third-li", "fourth-li"] 996 | assert pcss("ol *.c", "ol li.c", "li ~ li.c", "ol > li.c") == [ 997 | "third-li", 998 | "fourth-li", 999 | ] 1000 | assert pcss("#first-li", "li#first-li", "*#first-li") == ["first-li"] 1001 | assert pcss("li div", "li > div", "div div") == ["li-div"] 1002 | assert pcss("div > div") == [] 1003 | assert pcss("div>.c", "div > .c") == ["first-ol"] 1004 | assert pcss("div + div") == ["foobar-div"] 1005 | assert pcss("a ~ a") == ["tag-anchor", "nofollow-anchor"] 1006 | assert pcss('a[rel="tag"] ~ a') == ["nofollow-anchor"] 1007 | assert pcss("ol#first-ol li:last-child") == ["seventh-li"] 1008 | assert pcss("ol#first-ol *:last-child") == ["li-div", "seventh-li"] 1009 | assert pcss("#outer-div:first-child") == ["outer-div"] 1010 | assert pcss("#outer-div :first-child") == [ 1011 | "name-anchor", 1012 | "first-li", 1013 | "li-div", 1014 | "p-b", 1015 | "checkbox-fieldset-disabled", 1016 | "area-href", 1017 | ] 1018 | assert pcss("a[href]") == ["tag-anchor", "nofollow-anchor"] 1019 | assert pcss(":not(*)") == [] 1020 | assert pcss("a:not([href])") == ["name-anchor"] 1021 | assert pcss("ol :Not(li[class])") == [ 1022 | "first-li", 1023 | "second-li", 1024 | "li-div", 1025 | "fifth-li", 1026 | "sixth-li", 1027 | "seventh-li", 1028 | ] 1029 | assert pcss("link:has(*)") == [] 1030 | assert pcss("ol:has(div)") == ["first-ol"] 1031 | assert pcss(":is(#first-li, #second-li)") == ["first-li", "second-li"] 1032 | assert pcss("a:is(#name-anchor, #tag-anchor)") == 
["name-anchor", "tag-anchor"] 1033 | assert pcss(":is(.c)") == ["first-ol", "third-li", "fourth-li"] 1034 | assert pcss("ol.a.b.c > li.c:nth-child(3)") == ["third-li"] 1035 | 1036 | # Invalid characters in XPath element names, should not crash 1037 | assert pcss(r"di\a0 v", r"div\[") == [] 1038 | assert pcss(r"[h\a0 ref]", r"[h\]ref]") == [] 1039 | 1040 | # HTML-specific 1041 | assert pcss(":link", html_only=True) == [ 1042 | "link-href", 1043 | "tag-anchor", 1044 | "nofollow-anchor", 1045 | "area-href", 1046 | ] 1047 | assert pcss(":visited", html_only=True) == [] 1048 | assert pcss(":enabled", html_only=True) == [ 1049 | "link-href", 1050 | "tag-anchor", 1051 | "nofollow-anchor", 1052 | "checkbox-unchecked", 1053 | "text-checked", 1054 | "checkbox-checked", 1055 | "area-href", 1056 | ] 1057 | assert pcss(":disabled", html_only=True) == [ 1058 | "checkbox-disabled", 1059 | "checkbox-disabled-checked", 1060 | "fieldset", 1061 | "checkbox-fieldset-disabled", 1062 | ] 1063 | assert pcss(":checked", html_only=True) == [ 1064 | "checkbox-checked", 1065 | "checkbox-disabled-checked", 1066 | ] 1067 | 1068 | def test_select_shakespeare(self) -> None: 1069 | document = html.document_fromstring(HTML_SHAKESPEARE) 1070 | body = typing.cast("list[etree._Element]", document.xpath("//body"))[0] 1071 | css_to_xpath = GenericTranslator().css_to_xpath 1072 | 1073 | basestring_ = (str, bytes) 1074 | 1075 | def count(selector: str) -> int: 1076 | xpath = css_to_xpath(selector) 1077 | results = typing.cast("list[etree._Element]", body.xpath(xpath)) 1078 | assert not isinstance(results, basestring_) 1079 | found = set() 1080 | for item in results: 1081 | assert item not in found 1082 | found.add(item) 1083 | assert not isinstance(item, basestring_) 1084 | return len(results) 1085 | 1086 | # Data borrowed from http://mootools.net/slickspeed/ 1087 | 1088 | ## Changed from original; probably because I'm only 1089 | ## searching the body. 
1090 | # assert count('*') == 252 1091 | assert count("*") == 246 1092 | assert count("div:contains(CELIA)") == 26 1093 | assert count("div:only-child") == 22 # ? 1094 | assert count("div:nth-child(even)") == 106 1095 | assert count("div:nth-child(2n)") == 106 1096 | assert count("div:nth-child(odd)") == 137 1097 | assert count("div:nth-child(2n+1)") == 137 1098 | assert count("div:nth-child(n)") == 243 1099 | assert count("div:last-child") == 53 1100 | assert count("div:first-child") == 51 1101 | assert count("div > div") == 242 1102 | assert count("div + div") == 190 1103 | assert count("div ~ div") == 190 1104 | assert count("body") == 1 1105 | assert count("body div") == 243 1106 | assert count("div") == 243 1107 | assert count("div div") == 242 1108 | assert count("div div div") == 241 1109 | assert count("div, div, div") == 243 1110 | assert count("div, a, span") == 243 1111 | assert count(".dialog") == 51 1112 | assert count("div.dialog") == 51 1113 | assert count("div .dialog") == 51 1114 | assert count("div.character, div.dialog") == 99 1115 | assert count("div.direction.dialog") == 0 1116 | assert count("div.dialog.direction") == 0 1117 | assert count("div.dialog.scene") == 1 1118 | assert count("div.scene.scene") == 1 1119 | assert count("div.scene .scene") == 0 1120 | assert count("div.direction .dialog ") == 0 1121 | assert count("div .dialog .direction") == 4 1122 | assert count("div.dialog .dialog .direction") == 4 1123 | assert count("#speech5") == 1 1124 | assert count("div#speech5") == 1 1125 | assert count("div #speech5") == 1 1126 | assert count("div.scene div.dialog") == 49 1127 | assert count("div#scene1 div.dialog div") == 142 1128 | assert count("#scene1 #speech1") == 1 1129 | assert count("div[class]") == 103 1130 | assert count("div[class=dialog]") == 50 1131 | assert count("div[class^=dia]") == 51 1132 | assert count("div[class$=log]") == 50 1133 | assert count("div[class*=sce]") == 1 1134 | assert count("div[class|=dialog]") == 50 # ? 
Seems right 1135 | assert count("div[class!=madeup]") == 243 # ? Seems right 1136 | assert count("div[class~=dialog]") == 51 # ? Seems right 1137 | assert count(":scope > div") == 1 1138 | assert count(":scope > div > div[class=dialog]") == 1 1139 | assert count(":scope > div div") == 242 1140 | 1141 | 1142 | OPERATOR_PRECEDENCE_IDS = """ 1143 | 1144 | 1145 | 1146 | 1147 | 1148 | """ 1149 | 1150 | XMLLANG_IDS = """ 1151 | 1152 | a 1153 | b 1154 | c 1155 | d 1156 | e 1157 | f 1158 | 1159 | 1160 | 1161 | 1162 | """ 1163 | 1164 | HTML_IDS = """ 1165 | 1166 | 1167 | 1168 | 1169 |
1170 | 1171 | 1172 | 1173 | link 1174 |
    1175 |
  1. content
  2. 1176 |
  3. 1177 |
    1178 |
    1179 |
  4. 1180 |
  5. 1181 |
  6. 1183 |
  7. 1184 |
  8. 1185 |
  9. 1186 |
1187 |

1188 | hi there 1189 | guy 1190 | 1191 | 1192 | 1193 | 1194 | 1195 | 1196 | 1198 |

1199 | 1200 | 1201 |
1202 |

1203 |
    1204 |
1205 | 1206 | 1207 | 1208 | 1209 |
1210 |
1212 | 1213 | """ 1214 | 1215 | 1216 | HTML_SHAKESPEARE = """ 1217 | 1219 | 1220 | 1221 | 1222 | 1223 | 1224 |
1225 |
1226 |

As You Like It

1227 |
1228 | by William Shakespeare 1229 |
1230 |
1231 |

ACT I, SCENE III. A room in the palace.

1232 |
1233 |
Enter CELIA and ROSALIND
1234 |
1235 |
CELIA
1236 |
1237 |
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
1238 |
1239 |
ROSALIND
1240 |
1241 |
Not one to throw at a dog.
1242 |
1243 |
CELIA
1244 |
1245 |
No, thy words are too precious to be cast away upon
1246 |
curs; throw some of them at me; come, lame me with reasons.
1247 |
1248 |
ROSALIND
1249 |
CELIA
1250 |
1251 |
But is all this for your father?
1252 |
1253 |
1254 |
Then there were two cousins laid up; when the one
1255 |
should be lamed with reasons and the other mad
1256 |
without any.
1257 |
1258 |
ROSALIND
1259 |
1260 |
No, some of it is for my child's father. O, how
1261 |
full of briers is this working-day world!
1262 |
1263 |
CELIA
1264 |
1265 |
They are but burs, cousin, thrown upon thee in
1266 |
holiday foolery: if we walk not in the trodden
1267 |
paths our very petticoats will catch them.
1268 |
1269 |
ROSALIND
1270 |
1271 |
I could shake them off my coat: these burs are in my heart.
1272 |
1273 |
CELIA
1274 |
1275 |
Hem them away.
1276 |
1277 |
ROSALIND
1278 |
1279 |
I would try, if I could cry 'hem' and have him.
1280 |
1281 |
CELIA
1282 |
1283 |
Come, come, wrestle with thy affections.
1284 |
1285 |
ROSALIND
1286 |
1287 |
O, they take the part of a better wrestler than myself!
1288 |
1289 |
CELIA
1290 |
1291 |
O, a good wish upon you! you will try in time, in
1292 |
despite of a fall. But, turning these jests out of
1293 |
service, let us talk in good earnest: is it
1294 |
possible, on such a sudden, you should fall into so
1295 |
strong a liking with old Sir Rowland's youngest son?
1296 |
1297 |
ROSALIND
1298 |
1299 |
The duke my father loved his father dearly.
1300 |
1301 |
CELIA
1302 |
1303 |
Doth it therefore ensue that you should love his son
1304 |
dearly? By this kind of chase, I should hate him,
1305 |
for my father hated his father dearly; yet I hate
1306 |
not Orlando.
1307 |
1308 |
ROSALIND
1309 |
1310 |
No, faith, hate him not, for my sake.
1311 |
1312 |
CELIA
1313 |
1314 |
Why should I not? doth he not deserve well?
1315 |
1316 |
ROSALIND
1317 |
1318 |
Let me love him for that, and do you love him
1319 |
because I do. Look, here comes the duke.
1320 |
1321 |
CELIA
1322 |
1323 |
With his eyes full of anger.
1324 |
Enter DUKE FREDERICK, with Lords
1325 |
1326 |
DUKE FREDERICK
1327 |
1328 |
Mistress, dispatch you with your safest haste
1329 |
And get you from our court.
1330 |
1331 |
ROSALIND
1332 |
1333 |
Me, uncle?
1334 |
1335 |
DUKE FREDERICK
1336 |
1337 |
You, cousin
1338 |
Within these ten days if that thou be'st found
1339 |
So near our public court as twenty miles,
1340 |
Thou diest for it.
1341 |
1342 |
ROSALIND
1343 |
1344 |
I do beseech your grace,
1345 |
Let me the knowledge of my fault bear with me:
1346 |
If with myself I hold intelligence
1347 |
Or have acquaintance with mine own desires,
1348 |
If that I do not dream or be not frantic,--
1349 |
As I do trust I am not--then, dear uncle,
1350 |
Never so much as in a thought unborn
1351 |
Did I offend your highness.
1352 |
1353 |
DUKE FREDERICK
1354 |
1355 |
Thus do all traitors:
1356 |
If their purgation did consist in words,
1357 |
They are as innocent as grace itself:
1358 |
Let it suffice thee that I trust thee not.
1359 |
1360 |
ROSALIND
1361 |
1362 |
Yet your mistrust cannot make me a traitor:
1363 |
Tell me whereon the likelihood depends.
1364 |
1365 |
DUKE FREDERICK
1366 |
1367 |
Thou art thy father's daughter; there's enough.
1368 |
1369 |
ROSALIND
1370 |
1371 |
So was I when your highness took his dukedom;
1372 |
So was I when your highness banish'd him:
1373 |
Treason is not inherited, my lord;
1374 |
Or, if we did derive it from our friends,
1375 |
What's that to me? my father was no traitor:
1376 |
Then, good my liege, mistake me not so much
1377 |
To think my poverty is treacherous.
1378 |
1379 |
CELIA
1380 |
1381 |
Dear sovereign, hear me speak.
1382 |
1383 |
DUKE FREDERICK
1384 |
1385 |
Ay, Celia; we stay'd her for your sake,
1386 |
Else had she with her father ranged along.
1387 |
1388 |
CELIA
1389 |
1390 |
I did not then entreat to have her stay;
1391 |
It was your pleasure and your own remorse:
1392 |
I was too young that time to value her;
1393 |
But now I know her: if she be a traitor,
1394 |
Why so am I; we still have slept together,
1395 |
Rose at an instant, learn'd, play'd, eat together,
1396 |
And wheresoever we went, like Juno's swans,
1397 |
Still we went coupled and inseparable.
1398 |
1399 |
DUKE FREDERICK
1400 |
1401 |
She is too subtle for thee; and her smoothness,
1402 |
Her very silence and her patience
1403 |
Speak to the people, and they pity her.
1404 |
Thou art a fool: she robs thee of thy name;
1405 |
And thou wilt show more bright and seem more virtuous
1406 |
When she is gone. Then open not thy lips:
1407 |
Firm and irrevocable is my doom
1408 |
Which I have pass'd upon her; she is banish'd.
1409 |
1410 |
CELIA
1411 |
1412 |
Pronounce that sentence then on me, my liege:
1413 |
I cannot live out of her company.
1414 |
1415 |
DUKE FREDERICK
1416 |
1417 |
You are a fool. You, niece, provide yourself:
1418 |
If you outstay the time, upon mine honour,
1419 |
And in the greatness of my word, you die.
1420 |
Exeunt DUKE FREDERICK and Lords
1421 |
1422 |
CELIA
1423 |
1424 |
O my poor Rosalind, whither wilt thou go?
1425 |
Wilt thou change fathers? I will give thee mine.
1426 |
I charge thee, be not thou more grieved than I am.
1427 |
1428 |
ROSALIND
1429 |
1430 |
I have more cause.
1431 |
1432 |
CELIA
1433 |
1434 |
Thou hast not, cousin;
1435 |
Prithee be cheerful: know'st thou not, the duke
1436 |
Hath banish'd me, his daughter?
1437 |
1438 |
ROSALIND
1439 |
1440 |
That he hath not.
1441 |
1442 |
CELIA
1443 |
1444 |
No, hath not? Rosalind lacks then the love
1445 |
Which teacheth thee that thou and I am one:
1446 |
Shall we be sunder'd? shall we part, sweet girl?
1447 |
No: let my father seek another heir.
1448 |
Therefore devise with me how we may fly,
1449 |
Whither to go and what to bear with us;
1450 |
And do not seek to take your change upon you,
1451 |
To bear your griefs yourself and leave me out;
1452 |
For, by this heaven, now at our sorrows pale,
1453 |
Say what thou canst, I'll go along with thee.
1454 |
1455 |
ROSALIND
1456 |
1457 |
Why, whither shall we go?
1458 |
1459 |
CELIA
1460 |
1461 |
To seek my uncle in the forest of Arden.
1462 |
1463 |
ROSALIND
1464 |
1465 |
Alas, what danger will it be to us,
1466 |
Maids as we are, to travel forth so far!
1467 |
Beauty provoketh thieves sooner than gold.
1468 |
1469 |
CELIA
1470 |
1471 |
I'll put myself in poor and mean attire
1472 |
And with a kind of umber smirch my face;
1473 |
The like do you: so shall we pass along
1474 |
And never stir assailants.
1475 |
1476 |
ROSALIND
1477 |
1478 |
Were it not better,
1479 |
Because that I am more than common tall,
1480 |
That I did suit me all points like a man?
1481 |
A gallant curtle-axe upon my thigh,
1482 |
A boar-spear in my hand; and--in my heart
1483 |
Lie there what hidden woman's fear there will--
1484 |
We'll have a swashing and a martial outside,
1485 |
As many other mannish cowards have
1486 |
That do outface it with their semblances.
1487 |
1488 |
CELIA
1489 |
1490 |
What shall I call thee when thou art a man?
1491 |
1492 |
ROSALIND
1493 |
1494 |
I'll have no worse a name than Jove's own page;
1495 |
And therefore look you call me Ganymede.
1496 |
But what will you be call'd?
1497 |
1498 |
CELIA
1499 |
1500 |
Something that hath a reference to my state
1501 |
No longer Celia, but Aliena.
1502 |
1503 |
ROSALIND
1504 |
1505 |
But, cousin, what if we assay'd to steal
1506 |
The clownish fool out of your father's court?
1507 |
Would he not be a comfort to our travel?
1508 |
1509 |
CELIA
1510 |
1511 |
He'll go along o'er the wide world with me;
1512 |
Leave me alone to woo him. Let's away,
1513 |
And get our jewels and our wealth together,
1514 |
Devise the fittest time and safest way
1515 |
To hide us from pursuit that will be made
1516 |
After my flight. Now go we in content
1517 |
To liberty and not to banishment.
1518 |
Exeunt
1519 |
1520 |
1521 |
1522 |
1523 | 1524 | 1525 | """ 1526 | 1527 | 1528 | if __name__ == "__main__": 1529 | unittest.main() 1530 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pre-commit,pylint,py,docs,typing 3 | 4 | [testenv] 5 | deps = 6 | lxml>=4.4 7 | pytest-cov>=2.8 8 | pytest>=5.4 9 | setuptools 10 | sybil 11 | commands = 12 | pytest --cov=cssselect \ 13 | --cov-report=term-missing --cov-report=html --cov-report=xml \ 14 | --verbose {posargs: cssselect tests docs} 15 | 16 | [testenv:pylint] 17 | deps = 18 | {[testenv]deps} 19 | pylint==3.3.5 20 | commands = 21 | pylint {posargs: cssselect tests docs} 22 | 23 | [testenv:docs] 24 | changedir = docs 25 | deps = 26 | -r docs/requirements.txt 27 | commands = 28 | sphinx-build -W -b html . {envtmpdir}/html 29 | 30 | [testenv:typing] 31 | deps = 32 | {[testenv]deps} 33 | mypy==1.15.0 34 | types-lxml==2025.3.4 35 | commands = 36 | mypy --strict {posargs: cssselect tests} 37 | 38 | [testenv:pre-commit] 39 | deps = pre-commit 40 | commands = pre-commit run --all-files --show-diff-on-failure 41 | skip_install = true 42 | 43 | [testenv:twinecheck] 44 | basepython = python3 45 | deps = 46 | twine==6.1.0 47 | build==1.2.2.post1 48 | commands = 49 | python -m build --sdist 50 | twine check dist/* 51 | --------------------------------------------------------------------------------